From 316670eb35587141e969394ae8537d66b9211e80 Mon Sep 17 00:00:00 2001 From: Apple Date: Fri, 24 Aug 2012 20:25:02 +0000 Subject: [PATCH] xnu-2050.7.9.tar.gz --- .lldbinit | 17 + EXTERNAL_HEADERS/Availability.h | 27 +- EXTERNAL_HEADERS/AvailabilityInternal.h | 1537 ++++- EXTERNAL_HEADERS/AvailabilityMacros.h | 340 +- EXTERNAL_HEADERS/Makefile | 1 - EXTERNAL_HEADERS/corecrypto/cc.h | 69 + EXTERNAL_HEADERS/corecrypto/cc_config.h | 131 + EXTERNAL_HEADERS/corecrypto/cc_priv.h | 362 ++ EXTERNAL_HEADERS/corecrypto/ccaes.h | 83 + EXTERNAL_HEADERS/corecrypto/ccder.h | 263 + EXTERNAL_HEADERS/corecrypto/ccdes.h | 67 + EXTERNAL_HEADERS/corecrypto/ccdigest.h | 129 + EXTERNAL_HEADERS/corecrypto/cchmac.h | 83 + EXTERNAL_HEADERS/corecrypto/ccmd5.h | 27 + EXTERNAL_HEADERS/corecrypto/ccmode.h | 469 ++ EXTERNAL_HEADERS/corecrypto/ccmode_factory.h | 571 ++ EXTERNAL_HEADERS/corecrypto/ccmode_impl.h | 166 + EXTERNAL_HEADERS/corecrypto/ccn.h | 636 ++ EXTERNAL_HEADERS/corecrypto/ccpad.h | 65 + EXTERNAL_HEADERS/corecrypto/ccpbkdf2.h | 43 + EXTERNAL_HEADERS/corecrypto/ccrc4.h | 43 + EXTERNAL_HEADERS/corecrypto/ccrng.h | 26 + EXTERNAL_HEADERS/corecrypto/ccrng_system.h | 22 + EXTERNAL_HEADERS/corecrypto/ccsha1.h | 47 + EXTERNAL_HEADERS/corecrypto/ccsha2.h | 62 + EXTERNAL_HEADERS/mach-o/Makefile | 1 - EXTERNAL_HEADERS/mach-o/kld.h | 104 - EXTERNAL_HEADERS/mach-o/loader.h | 157 +- EXTERNAL_HEADERS/mach-o/nlist.h | 14 +- EXTERNAL_HEADERS/mach-o/reloc.h | 3 +- EXTERNAL_HEADERS/mach-o/x86_64/reloc.h | 17 +- EXTERNAL_HEADERS/stdint.h | 2 + Makefile | 2 + README | 6 +- SETUP/Makefile | 4 +- SETUP/config/config.h | 4 +- SETUP/config/doconf | 3 +- SETUP/config/mkioconf.c | 24 +- SETUP/config/mkmakefile.c | 23 +- SETUP/decomment/Makefile | 31 + SETUP/decomment/decomment.c | 189 + SETUP/kextsymboltool/Makefile | 5 +- SETUP/md/Makefile | 31 + SETUP/md/md.1 | 121 + SETUP/md/md.c | 654 +++ bsd/bsm/audit.h | 22 +- bsd/bsm/audit_errno.h | 3 +- bsd/bsm/audit_kevents.h | 3 +- bsd/conf/MASTER | 54 +- bsd/conf/MASTER.i386 | 10 +- bsd/conf/MASTER.x86_64 | 13 +- bsd/conf/Makefile | 2 + bsd/conf/Makefile.i386 | 5 + bsd/conf/Makefile.template | 160 +- bsd/conf/Makefile.x86_64 | 5 + bsd/conf/files | 61 +- bsd/conf/files.i386 | 10 - bsd/conf/files.x86_64 | 10 - bsd/crypto/Makefile | 16 +- bsd/crypto/aes.h | 33 + bsd/crypto/aes/Assert.c | 34 - bsd/crypto/aes/aes.h | 219 - bsd/crypto/aes/gen/Makefile | 30 - bsd/crypto/aes/gen/aescrypt.c | 411 -- bsd/crypto/aes/gen/aeskey.c | 455 -- bsd/crypto/aes/gen/aesopt.h | 736 --- bsd/crypto/aes/gen/aestab.c | 384 -- bsd/crypto/aes/gen/aestab.h | 175 - bsd/crypto/aes/i386/AES.s | 143 - bsd/crypto/aes/i386/Context.h | 9 - bsd/crypto/aes/i386/Data.mk | 30 - bsd/crypto/aes/i386/Data.s | 5196 ----------------- bsd/crypto/aes/i386/EncryptDecrypt.s | 607 -- bsd/crypto/aes/i386/ExpandKeyForDecryption.s | 1214 ---- bsd/crypto/aes/i386/ExpandKeyForEncryption.s | 801 --- bsd/crypto/aes/i386/MakeData.c | 516 -- bsd/crypto/aes/i386/Makefile | 34 - bsd/crypto/aes/i386/ReadMe.txt | 22 - bsd/crypto/aes/i386/aes_crypt_hw.s | 472 -- bsd/crypto/aes/i386/aes_key_hw.s | 405 -- bsd/crypto/aes/i386/aes_modes_asm.s | 420 -- bsd/crypto/aes/i386/aes_modes_hw.s | 1623 ----- bsd/crypto/aes/i386/aesxts.c | 392 -- bsd/crypto/aes/i386/aesxts.h | 103 - bsd/crypto/aes/i386/aesxts_asm.s | 1305 ----- bsd/crypto/aesxts.h | 27 + bsd/crypto/des.h | 33 + bsd/crypto/des/des.h | 117 - bsd/crypto/des/des_ecb.c | 137 - bsd/crypto/des/des_enc.c | 294 - bsd/crypto/des/des_locl.h | 364 -- bsd/crypto/des/des_setkey.c | 232 - bsd/crypto/des/podd.h | 
67 - bsd/crypto/des/sk.h | 196 - bsd/crypto/des/spr.h | 207 - bsd/crypto/sha2.h | 33 + bsd/crypto/sha2/Makefile | 36 - bsd/crypto/sha2/intel/sha256.s | 617 -- bsd/crypto/sha2/intel/sha256nossse3.s | 649 -- bsd/crypto/sha2/sha2.c | 1083 ---- bsd/crypto/sha2/sha2.h | 141 - bsd/dev/dtrace/dtrace.c | 47 +- bsd/dev/dtrace/dtrace_glue.c | 7 - bsd/dev/dtrace/dtrace_ptss.c | 8 +- bsd/dev/dtrace/dtrace_subr.c | 2 +- bsd/dev/dtrace/lockstat.c | 116 +- bsd/dev/dtrace/profile_prvd.c | 4 +- bsd/dev/dtrace/sdt.c | 14 +- bsd/dev/dtrace/systrace.c | 156 +- bsd/dev/dtrace/systrace.h | 4 +- bsd/dev/i386/conf.c | 13 +- bsd/dev/i386/dtrace_isa.c | 9 + bsd/dev/i386/fbt_x86.c | 10 +- bsd/dev/i386/kern_machdep.c | 4 +- bsd/dev/i386/mem.c | 12 +- bsd/dev/i386/sysctl.c | 42 +- bsd/dev/i386/systemcalls.c | 33 +- bsd/dev/i386/unix_signal.c | 9 +- bsd/dev/memdev.c | 10 +- bsd/dev/random/randomdev.c | 47 +- bsd/dev/unix_startup.c | 5 - bsd/hfs/hfs.h | 15 +- bsd/hfs/hfs_attrlist.c | 2 +- bsd/hfs/hfs_attrlist.h | 17 + bsd/hfs/hfs_btreeio.c | 12 + bsd/hfs/hfs_catalog.c | 128 +- bsd/hfs/hfs_catalog.h | 2 +- bsd/hfs/hfs_chash.c | 18 +- bsd/hfs/hfs_cnode.c | 266 +- bsd/hfs/hfs_cnode.h | 40 +- bsd/hfs/hfs_cprotect.c | 1456 ++++- bsd/hfs/hfs_format.h | 1 - bsd/hfs/hfs_fsctl.h | 14 + bsd/hfs/hfs_hotfiles.c | 2 +- bsd/hfs/hfs_kdebug.h | 4 +- bsd/hfs/hfs_link.c | 2 +- bsd/hfs/hfs_lookup.c | 109 +- bsd/hfs/hfs_notification.c | 33 +- bsd/hfs/hfs_readwrite.c | 387 +- bsd/hfs/hfs_search.c | 15 +- bsd/hfs/hfs_vfsops.c | 707 ++- bsd/hfs/hfs_vfsutils.c | 8 +- bsd/hfs/hfs_vnops.c | 955 +-- bsd/hfs/hfs_xattr.c | 33 +- bsd/hfs/hfscommon/Catalog/CatalogUtilities.c | 53 +- bsd/hfs/hfscommon/Catalog/FileIDsServices.c | 252 +- bsd/hfs/hfscommon/Misc/BTreeWrapper.c | 25 +- bsd/hfs/hfscommon/Misc/FileExtentMapping.c | 101 +- bsd/hfs/hfscommon/Misc/VolumeAllocation.c | 1099 ++-- bsd/hfs/hfscommon/headers/FileMgrInternal.h | 5 +- bsd/kern/bsd_init.c | 64 +- bsd/kern/bsd_stubs.c | 106 +- bsd/kern/decmpfs.c | 132 +- bsd/kern/imageboot.c | 1 + bsd/kern/kdebug.c | 274 +- bsd/kern/kern_authorization.c | 7 +- bsd/kern/kern_callout.c | 232 - bsd/kern/kern_control.c | 22 +- bsd/kern/kern_core.c | 4 +- bsd/kern/kern_credential.c | 544 +- bsd/kern/kern_descrip.c | 246 +- bsd/kern/kern_event.c | 62 +- bsd/kern/kern_exec.c | 580 +- bsd/kern/kern_exit.c | 498 +- bsd/kern/kern_fork.c | 87 +- bsd/kern/kern_lockf.c | 144 +- bsd/kern/kern_malloc.c | 30 +- bsd/kern/kern_memorystatus.c | 2248 ++++--- bsd/kern/kern_mib.c | 10 +- bsd/kern/kern_mman.c | 48 +- bsd/kern/kern_proc.c | 111 +- bsd/kern/kern_resource.c | 281 +- bsd/kern/kern_sig.c | 40 +- bsd/kern/kern_subr.c | 4 +- bsd/kern/kern_symfile.c | 161 +- bsd/kern/kern_synch.c | 1 - bsd/kern/kern_sysctl.c | 554 +- bsd/kern/kpi_mbuf.c | 81 +- bsd/kern/kpi_socket.c | 12 +- bsd/kern/kpi_socketfilter.c | 241 +- bsd/kern/mach_fat.c | 20 +- bsd/kern/mach_loader.c | 222 +- bsd/kern/mach_loader.h | 5 +- bsd/kern/mach_process.c | 21 +- bsd/kern/makesyscalls.sh | 6 +- bsd/kern/mcache.c | 10 +- bsd/kern/netboot.c | 2 +- bsd/kern/policy_check.c | 11 +- bsd/kern/posix_shm.c | 32 +- bsd/kern/proc_info.c | 304 +- bsd/kern/process_policy.c | 199 +- bsd/kern/pthread_support.c | 33 +- bsd/kern/pthread_synch.c | 553 +- bsd/kern/socket_info.c | 19 +- bsd/kern/subr_prf.c | 3 + bsd/kern/subr_prof.c | 24 - bsd/kern/sys_generic.c | 120 +- bsd/kern/sys_pipe.c | 938 +-- bsd/kern/sys_socket.c | 43 +- bsd/kern/syscalls.master | 32 +- bsd/kern/sysv_sem.c | 9 +- bsd/kern/trace.codes | 106 +- bsd/kern/tty.c | 69 +- bsd/kern/tty_tty.c | 69 +- 
bsd/kern/ubc_subr.c | 39 +- bsd/kern/uipc_domain.c | 138 +- bsd/kern/uipc_mbuf.c | 474 +- bsd/kern/uipc_mbuf2.c | 165 +- bsd/kern/uipc_socket.c | 435 +- bsd/kern/uipc_socket2.c | 153 +- bsd/kern/uipc_syscalls.c | 78 +- bsd/kern/uipc_usrreq.c | 97 +- bsd/kern/vm_pressure.c | 356 +- bsd/kern/vm_pressure.h | 9 +- bsd/libkern/libkern.h | 1 + bsd/machine/exec.h | 8 - bsd/machine/setjmp.h | 8 +- bsd/man/man2/getattrlist.2 | 2 +- bsd/man/man2/getaudit.2 | 194 +- bsd/man/man2/getaudit_addr.2 | 215 +- bsd/man/man2/getgroups.2 | 11 +- bsd/man/man2/getrusage.2 | 4 +- bsd/man/man2/getsockopt.2 | 2 +- bsd/man/man2/searchfs.2 | 4 + bsd/man/man2/setaudit.2 | 237 +- bsd/man/man2/setaudit_addr.2 | 254 +- bsd/man/man2/setxattr.2 | 9 + bsd/man/man2/statfs.2 | 6 +- .../man3/posix_spawnattr_setspecialport_np.3 | 4 +- bsd/man/man4/Makefile | 1 - bsd/man/man4/inet6.4 | 22 +- bsd/man/man4/ip6.4 | 8 +- bsd/miscfs/specfs/spec_vnops.c | 925 ++- bsd/miscfs/specfs/specdev.h | 4 +- bsd/net/Makefile | 13 +- bsd/{crypto/aes => net/altq}/Makefile | 33 +- bsd/net/altq/altq.h | 75 + bsd/net/altq/altq_cbq.c | 268 + bsd/net/altq/altq_cbq.h | 86 + bsd/net/altq/altq_fairq.c | 300 + bsd/net/altq/altq_fairq.h | 88 + bsd/net/altq/altq_hfsc.c | 286 + bsd/net/altq/altq_hfsc.h | 84 + bsd/net/altq/altq_priq.c | 266 + bsd/net/altq/altq_priq.h | 80 + bsd/net/altq/altq_qfq.c | 236 + bsd/net/altq/altq_qfq.h | 52 + bsd/net/altq/altq_subr.c | 487 ++ bsd/net/altq/altq_var.h | 95 + bsd/net/altq/if_altq.h | 168 + bsd/net/bpf.c | 572 +- bsd/net/bpf.h | 31 +- bsd/net/bpf_filter.c | 37 +- bsd/net/bpfdesc.h | 5 +- bsd/net/bridgestp.c | 12 +- bsd/net/bridgestp.h | 2 - bsd/{crypto/des => net/classq}/Makefile | 30 +- bsd/net/classq/classq.c | 366 ++ bsd/net/classq/classq.h | 181 + bsd/net/classq/classq_blue.c | 369 ++ bsd/net/classq/classq_blue.h | 127 + bsd/net/classq/classq_red.c | 615 ++ bsd/net/classq/classq_red.h | 168 + bsd/net/classq/classq_rio.c | 528 ++ bsd/net/classq/classq_rio.h | 139 + bsd/net/classq/classq_sfb.c | 1184 ++++ bsd/net/classq/classq_sfb.h | 153 + bsd/net/classq/classq_subr.c | 794 +++ bsd/net/classq/classq_util.c | 304 + bsd/net/classq/if_classq.h | 361 ++ bsd/net/dlil.c | 3024 ++++++++-- bsd/net/dlil.h | 156 +- bsd/net/ether_at_pr_module.c | 15 +- bsd/net/ether_if_module.c | 43 +- bsd/net/ether_inet6_pr_module.c | 30 +- bsd/net/ether_inet_pr_module.c | 82 +- bsd/net/{pf_mtag.h => flowadv.h} | 32 +- bsd/net/flowhash.c | 825 +++ osfmk/vm/vm_print.h => bsd/net/flowhash.h | 69 +- bsd/net/if.c | 900 ++- bsd/net/if.h | 265 +- bsd/net/if_bond.c | 32 +- bsd/net/if_bond_internal.h | 35 + bsd/net/if_bond_var.h | 4 +- bsd/net/if_bridge.c | 2609 +++++---- bsd/net/if_bridgevar.h | 2 + bsd/net/if_dl.h | 4 +- bsd/net/if_ether.h | 7 +- bsd/net/if_gif.c | 4 +- bsd/net/if_llreach.c | 108 +- bsd/net/if_llreach.h | 20 +- bsd/net/if_loop.c | 648 +- bsd/net/if_media.h | 2 +- bsd/net/if_mib.c | 32 +- bsd/net/if_mib.h | 5 +- bsd/net/if_pflog.c | 4 - bsd/net/if_stf.c | 17 +- bsd/net/if_utun.c | 156 +- bsd/net/if_utun.h | 45 +- bsd/net/if_utun_crypto.c | 532 ++ bsd/net/if_utun_crypto.h | 353 ++ bsd/net/if_utun_crypto_ipsec.c | 1088 ++++ bsd/net/if_utun_crypto_ipsec.h | 105 + bsd/net/if_var.h | 235 +- bsd/net/if_vlan.c | 19 +- bsd/net/iptap.c | 437 ++ osfmk/ddb/db_expr.h => bsd/net/iptap.h | 86 +- bsd/net/kpi_interface.c | 1426 +++-- bsd/net/kpi_interface.h | 749 ++- bsd/net/kpi_protocol.c | 388 +- bsd/net/lacp.h | 92 +- bsd/net/ndrv.c | 23 +- bsd/net/ndrv.h | 8 + bsd/net/ndrv_var.h | 9 +- bsd/net/net_str_id.c | 5 +- bsd/net/netsrc.c | 2 + 
bsd/net/netsrc.h | 1 + bsd/net/ntstat.c | 505 +- bsd/net/ntstat.h | 7 +- bsd/net/pf.c | 2193 +++---- bsd/net/pf_if.c | 20 +- bsd/net/pf_ioctl.c | 3032 +++++++--- bsd/net/pf_norm.c | 935 ++- bsd/net/pf_osfp.c | 4 +- bsd/net/pf_ruleset.c | 30 +- bsd/net/pf_table.c | 4 +- bsd/net/pfkeyv2.h | 9 +- bsd/net/pfvar.h | 382 +- bsd/net/pktsched/Makefile | 44 + bsd/net/pktsched/pktsched.c | 267 + bsd/net/pktsched/pktsched.h | 156 + bsd/net/pktsched/pktsched_cbq.c | 699 +++ bsd/net/pktsched/pktsched_cbq.h | 204 + bsd/net/pktsched/pktsched_fairq.c | 1290 ++++ bsd/net/pktsched/pktsched_fairq.h | 231 + bsd/net/pktsched/pktsched_hfsc.c | 2057 +++++++ bsd/net/pktsched/pktsched_hfsc.h | 355 ++ bsd/net/pktsched/pktsched_priq.c | 1275 ++++ bsd/net/pktsched/pktsched_priq.h | 192 + bsd/net/pktsched/pktsched_qfq.c | 2034 +++++++ bsd/net/pktsched/pktsched_qfq.h | 304 + bsd/net/pktsched/pktsched_rmclass.c | 1849 ++++++ bsd/net/pktsched/pktsched_rmclass.h | 317 + bsd/net/pktsched/pktsched_rmclass_debug.h | 140 + bsd/net/pktsched/pktsched_tcq.c | 1215 ++++ bsd/net/pktsched/pktsched_tcq.h | 162 + bsd/net/radix.c | 3 - bsd/net/raw_usrreq.c | 8 +- bsd/net/route.c | 470 +- bsd/net/route.h | 22 +- bsd/net/rtsock.c | 61 +- bsd/netat/drv_dep.c | 10 +- bsd/netinet/Makefile | 5 +- bsd/netinet/icmp6.h | 45 +- bsd/netinet/if_ether.h | 5 +- bsd/netinet/igmp.c | 74 +- bsd/netinet/igmp_var.h | 1 + bsd/netinet/in.c | 687 ++- bsd/netinet/in.h | 17 +- bsd/netinet/in_arp.c | 243 +- bsd/netinet/in_arp.h | 2 - bsd/netinet/in_cksum.c | 6 +- bsd/netinet/in_dhcp.c | 22 +- bsd/netinet/in_gif.c | 14 +- bsd/netinet/in_mcast.c | 53 +- bsd/netinet/in_pcb.c | 532 +- bsd/netinet/in_pcb.h | 139 +- bsd/netinet/in_pcblist.c | 73 +- bsd/netinet/in_proto.c | 10 +- bsd/netinet/in_rmx.c | 9 +- bsd/netinet/in_tclass.c | 857 ++- bsd/netinet/in_var.h | 7 +- bsd/netinet/ip_divert.c | 17 +- bsd/netinet/ip_dummynet.c | 362 +- bsd/netinet/ip_dummynet.h | 77 +- bsd/netinet/ip_encap.c | 10 +- bsd/netinet/ip_flowid.h | 132 + bsd/netinet/ip_fw2.c | 203 +- bsd/netinet/ip_fw2.h | 68 +- bsd/netinet/ip_fw2_compat.c | 6 +- bsd/netinet/ip_icmp.c | 75 +- bsd/netinet/ip_input.c | 358 +- bsd/netinet/ip_output.c | 423 +- bsd/netinet/ip_var.h | 36 +- bsd/netinet/kpi_ipfilter.c | 43 +- bsd/netinet/kpi_ipfilter.h | 4 +- osfmk/ddb/db_cond.h => bsd/netinet/lro_ext.h | 70 +- bsd/netinet/raw_ip.c | 109 +- bsd/netinet/tcp.h | 46 +- bsd/netinet/tcp_cc.h | 7 +- bsd/netinet/tcp_input.c | 912 ++- bsd/netinet/tcp_ledbat.c | 14 +- bsd/netinet/tcp_lro.c | 997 ++++ bsd/netinet/tcp_lro.h | 81 + bsd/netinet/tcp_newreno.c | 90 +- bsd/netinet/tcp_output.c | 711 ++- bsd/netinet/tcp_seq.h | 2 +- bsd/netinet/tcp_subr.c | 322 +- bsd/netinet/tcp_timer.c | 50 +- bsd/netinet/tcp_timer.h | 14 +- bsd/netinet/tcp_usrreq.c | 305 +- bsd/netinet/tcp_var.h | 104 +- bsd/netinet/udp_usrreq.c | 298 +- bsd/netinet/udp_var.h | 15 +- bsd/netinet6/Makefile | 6 +- bsd/netinet6/ah_core.c | 64 +- bsd/netinet6/ah_input.c | 24 +- bsd/netinet6/ah_output.c | 8 +- bsd/netinet6/esp_core.c | 287 +- bsd/netinet6/esp_input.c | 92 +- bsd/netinet6/esp_output.c | 8 +- bsd/netinet6/esp_rijndael.c | 63 +- bsd/netinet6/frag6.c | 6 +- bsd/netinet6/icmp6.c | 164 +- bsd/netinet6/in6.c | 806 ++- bsd/netinet6/in6.h | 15 +- bsd/netinet6/in6_cksum.c | 10 +- bsd/netinet6/in6_gif.c | 10 +- bsd/netinet6/in6_ifattach.c | 158 +- bsd/netinet6/in6_mcast.c | 33 +- bsd/netinet6/in6_pcb.c | 251 +- bsd/netinet6/in6_pcb.h | 10 +- bsd/netinet6/in6_proto.c | 12 +- bsd/netinet6/in6_rmx.c | 2 +- bsd/netinet6/in6_src.c | 306 +- bsd/netinet6/in6_var.h | 
21 +- bsd/netinet6/ip6_forward.c | 113 +- bsd/netinet6/ip6_fw.c | 2 +- bsd/netinet6/ip6_input.c | 310 +- bsd/netinet6/ip6_mroute.c | 56 +- bsd/netinet6/ip6_output.c | 307 +- bsd/netinet6/ip6_var.h | 58 +- bsd/netinet6/ipcomp_input.c | 10 +- bsd/netinet6/ipsec.c | 357 +- bsd/netinet6/ipsec.h | 14 + bsd/netinet6/mld6.c | 50 +- bsd/netinet6/mld6_var.h | 1 + bsd/netinet6/nd6.c | 695 ++- bsd/netinet6/nd6.h | 146 +- bsd/netinet6/nd6_nbr.c | 492 +- bsd/netinet6/nd6_prproxy.c | 1357 +++++ bsd/netinet6/nd6_rtr.c | 512 +- bsd/netinet6/raw_ip6.c | 94 +- bsd/netinet6/route6.c | 11 +- bsd/netinet6/scope6.c | 39 +- bsd/netinet6/scope6_var.h | 2 +- bsd/netinet6/udp6_output.c | 86 +- bsd/netinet6/udp6_usrreq.c | 91 +- bsd/netkey/key.c | 643 +- bsd/netkey/key.h | 31 + bsd/netkey/keydb.h | 8 + bsd/nfs/nfs.h | 84 +- bsd/nfs/nfs4_subs.c | 57 +- bsd/nfs/nfs4_vnops.c | 52 +- bsd/nfs/nfs_bio.c | 10 +- bsd/nfs/nfs_gss.c | 98 +- bsd/nfs/nfs_gss.h | 8 +- bsd/nfs/nfs_gss_crypto.c | 229 +- bsd/nfs/nfs_gss_crypto.h | 13 +- bsd/nfs/nfs_lock.c | 3 +- bsd/nfs/nfs_node.c | 34 + bsd/nfs/nfs_serv.c | 21 +- bsd/nfs/nfs_socket.c | 236 +- bsd/nfs/nfs_srvcache.c | 10 +- bsd/nfs/nfs_subs.c | 25 +- bsd/nfs/nfs_syscalls.c | 47 +- bsd/nfs/nfs_upcall.c | 390 ++ bsd/nfs/nfs_vfsops.c | 9 +- bsd/nfs/nfs_vnops.c | 93 +- bsd/nfs/nfsmount.h | 8 +- bsd/nfs/nfsnode.h | 35 +- bsd/security/audit/audit_bsd.h | 10 +- bsd/security/audit/audit_bsm_errno.c | 3 +- bsd/security/audit/audit_pipe.c | 2 +- bsd/security/audit/audit_session.c | 2 +- bsd/security/audit/audit_syscalls.c | 139 - bsd/security/audit/audit_worker.c | 69 +- bsd/sys/Makefile | 10 +- bsd/sys/attr.h | 2 - bsd/sys/bsdtask_info.h | 2 +- bsd/sys/buf.h | 94 +- bsd/sys/buf_internal.h | 32 +- bsd/sys/cdefs.h | 118 +- bsd/sys/codesign.h | 2 + bsd/sys/conf.h | 9 +- bsd/sys/cprotect.h | 87 +- bsd/sys/decmpfs.h | 41 +- bsd/sys/disk.h | 26 +- bsd/sys/domain.h | 4 +- bsd/sys/dtrace.h | 4 +- bsd/sys/dtrace_impl.h | 1 + bsd/sys/errno.h | 13 +- bsd/sys/event.h | 41 +- bsd/sys/fcntl.h | 38 +- bsd/sys/file.h | 17 - bsd/sys/file_internal.h | 3 - bsd/sys/imgact.h | 7 +- osfmk/ipc/ipc_print.h => bsd/sys/kas_info.h | 48 +- bsd/sys/kauth.h | 12 +- bsd/sys/kdebug.h | 110 +- bsd/sys/kern_callout.h | 68 - bsd/sys/kern_memorystatus.h | 256 +- bsd/sys/kpi_mbuf.h | 128 +- bsd/sys/kpi_socketfilter.h | 6 +- bsd/sys/lockf.h | 1 + bsd/sys/lockstat.h | 5 + bsd/sys/make_posix_availability.sh | 13 +- bsd/sys/make_symbol_aliasing.sh | 19 +- bsd/sys/malloc.h | 6 +- bsd/sys/mbuf.h | 332 +- bsd/sys/mcache.h | 15 +- bsd/sys/mount.h | 6 +- bsd/sys/mount_internal.h | 14 +- bsd/sys/munge.h | 69 + bsd/sys/namei.h | 6 +- bsd/sys/pipe.h | 2 + bsd/sys/priv.h | 11 + bsd/sys/proc.h | 13 +- bsd/sys/proc_info.h | 23 + bsd/sys/proc_internal.h | 21 +- bsd/sys/process_policy.h | 45 +- bsd/sys/pthread_internal.h | 28 +- bsd/sys/reboot.h | 4 - bsd/sys/resource.h | 1 + bsd/sys/sem.h | 10 - bsd/sys/sem_internal.h | 2 +- bsd/sys/signal.h | 3 + bsd/sys/signalvar.h | 4 +- bsd/sys/socket.h | 133 +- bsd/sys/socketvar.h | 105 +- bsd/sys/sockio.h | 15 +- bsd/sys/spawn.h | 6 +- bsd/sys/spawn_internal.h | 53 +- bsd/sys/sysctl.h | 4 +- bsd/sys/sysent.h | 2 +- bsd/sys/syslimits.h | 2 +- bsd/sys/syslog.h | 4 +- bsd/sys/systm.h | 15 +- bsd/sys/tty.h | 1 + bsd/sys/ubc.h | 1 + bsd/sys/un.h | 1 + bsd/sys/unpcb.h | 2 +- bsd/sys/user.h | 25 +- bsd/sys/vnode.h | 49 +- bsd/sys/vnode_internal.h | 18 +- bsd/sys/xattr.h | 1 + bsd/uxkern/ux_exception.c | 17 +- bsd/vfs/kpi_vfs.c | 845 +-- bsd/vfs/vfs_attrlist.c | 19 +- bsd/vfs/vfs_bio.c | 259 +- 
bsd/vfs/vfs_cache.c | 24 +- bsd/vfs/vfs_cluster.c | 429 +- bsd/vfs/vfs_conf.c | 6 +- bsd/vfs/vfs_fsevents.c | 42 +- bsd/vfs/vfs_journal.c | 319 +- bsd/vfs/vfs_journal.h | 42 +- bsd/vfs/vfs_lookup.c | 36 +- bsd/vfs/vfs_subr.c | 494 +- bsd/vfs/vfs_syscalls.c | 290 +- bsd/vfs/vfs_vnops.c | 52 +- bsd/vfs/vfs_xattr.c | 43 +- bsd/vm/dp_backing_file.c | 17 +- bsd/vm/vm_unix.c | 221 +- bsd/vm/vnode_pager.c | 32 +- config/BSDKernel.exports | 1 + config/IOKit.exports | 254 +- config/IOKit.i386.exports | 229 + config/IOKit.x86_64.exports | 229 + config/Libkern.exports | 137 +- config/Libkern.i386.exports | 135 +- config/Libkern.x86_64.exports | 133 +- config/MACFramework.exports | 3 + config/MACFramework.i386.exports | 2 + config/MACFramework.x86_64.exports | 2 + config/Makefile | 34 +- config/MasterVersion | 2 +- config/Private.exports | 49 +- config/Private.i386.exports | 1 + config/Private.x86_64.exports | 1 + config/System6.0.exports | 9 - config/System6.0.i386.exports | 5 + config/System6.0.x86_64.exports | 5 + config/Unsupported.exports | 17 +- config/Unsupported.i386.exports | 22 +- config/Unsupported.x86_64.exports | 16 +- config/newvers.pl | 42 +- iokit/IOKit/IOCatalogue.h | 17 +- iokit/IOKit/IODeviceTreeSupport.h | 4 + iokit/IOKit/IOHibernatePrivate.h | 44 +- iokit/IOKit/IOKitServer.h | 15 +- iokit/IOKit/IOLib.h | 10 +- iokit/IOKit/IOMemoryDescriptor.h | 16 +- iokit/IOKit/IOService.h | 23 +- iokit/IOKit/IOTypes.h | 14 +- iokit/IOKit/Makefile | 5 +- iokit/IOKit/i386/Makefile | 2 - iokit/IOKit/pwr_mgt/IOPM.h | 26 +- iokit/IOKit/pwr_mgt/IOPMPrivate.h | 177 +- iokit/IOKit/pwr_mgt/Makefile | 2 +- iokit/IOKit/pwr_mgt/RootDomain.h | 59 +- iokit/IOKit/x86_64/Makefile | 33 + iokit/Kernel/IOBufferMemoryDescriptor.cpp | 6 + iokit/Kernel/IOCPU.cpp | 39 +- iokit/Kernel/IOCatalogue.cpp | 653 +-- iokit/Kernel/IOCommandGate.cpp | 8 +- iokit/Kernel/IODMACommand.cpp | 13 +- iokit/Kernel/IODataQueue.cpp | 10 +- iokit/Kernel/IODeviceTreeSupport.cpp | 63 +- iokit/Kernel/IOFilterInterruptEventSource.cpp | 8 +- iokit/Kernel/IOHibernateIO.cpp | 271 +- iokit/Kernel/IOHibernateInternal.h | 11 +- iokit/Kernel/IOHibernateRestoreKernel.c | 1 - iokit/Kernel/IOInterruptEventSource.cpp | 8 +- iokit/Kernel/IOKitDebug.cpp | 53 +- iokit/Kernel/IOKitKernelInternal.h | 28 +- iokit/Kernel/IOLib.cpp | 5 +- iokit/Kernel/IOMemoryDescriptor.cpp | 120 +- iokit/Kernel/IONVRAM.cpp | 20 +- iokit/Kernel/IOPMrootDomain.cpp | 944 +-- iokit/Kernel/IOPlatformExpert.cpp | 42 + iokit/Kernel/IORegistryEntry.cpp | 24 +- iokit/Kernel/IOService.cpp | 644 +- iokit/Kernel/IOServicePM.cpp | 343 +- iokit/Kernel/IOServicePMPrivate.h | 8 +- iokit/Kernel/IOServicePrivate.h | 9 +- iokit/Kernel/IOStatistics.cpp | 11 +- iokit/Kernel/IOSubMemoryDescriptor.cpp | 30 +- iokit/Kernel/IOUserClient.cpp | 151 +- iokit/Kernel/IOWorkLoop.cpp | 3 +- iokit/Kernel/RootDomainUserClient.cpp | 2 + iokit/bsddev/IOKitBSDInit.cpp | 290 +- iokit/conf/MASTER | 6 +- iokit/conf/MASTER.i386 | 2 +- iokit/conf/MASTER.x86_64 | 4 +- iokit/conf/Makefile | 2 + iokit/conf/Makefile.i386 | 2 + iokit/conf/Makefile.x86_64 | 2 + iokit/conf/files | 1 - iokit/conf/files.i386 | 4 + iokit/conf/files.x86_64 | 4 + kgmacros | 612 +- libkern/Makefile | 8 +- libkern/OSKextLib.cpp | 8 +- libkern/c++/OSData.cpp | 44 +- libkern/c++/OSDictionary.cpp | 98 +- libkern/c++/OSKext.cpp | 1735 +++--- libkern/c++/OSMetaClass.cpp | 177 +- libkern/c++/OSRuntime.cpp | 1 + libkern/c++/OSSet.cpp | 2 + libkern/c++/OSSymbol.cpp | 30 + .../test1/test1.xcodeproj/project.pbxproj | 4 + libkern/conf/MASTER | 19 +- 
libkern/conf/MASTER.i386 | 8 +- libkern/conf/MASTER.x86_64 | 6 +- libkern/conf/Makefile | 2 + libkern/conf/Makefile.i386 | 4 + libkern/conf/Makefile.template | 11 +- libkern/conf/Makefile.x86_64 | 5 +- libkern/conf/files | 19 +- libkern/conf/files.i386 | 5 +- libkern/conf/files.x86_64 | 5 +- libkern/crypto/corecrypto_aes.c | 116 + libkern/crypto/corecrypto_aesxts.c | 105 + libkern/crypto/corecrypto_des.c | 210 + libkern/crypto/corecrypto_md5.c | 65 + libkern/crypto/corecrypto_sha1.c | 110 + libkern/crypto/corecrypto_sha2.c | 117 + libkern/crypto/intel/sha1edp.s | 16 +- libkern/crypto/md5.c | 364 -- libkern/crypto/register_crypto.c | 45 + libkern/crypto/sha1.c | 515 -- libkern/gen/OSAtomicOperations.c | 42 +- libkern/gen/OSDebug.cpp | 1 - libkern/kernel_mach_header.c | 110 +- libkern/kmod/Makefile | 38 - libkern/kmod/Makefile.kmod | 111 - .../kmod/libkmod.xcodeproj/project.pbxproj | 482 ++ .../kmod/libkmodtest/libkmodtest-Info.plist | 51 + .../kmod/libkmodtest/libkmodtest.cpp | 28 +- libkern/kmod/libkmodtest/libkmodtest.h | 39 + libkern/kxld/Makefile | 25 +- libkern/kxld/WKdmCompress.c | 4 +- libkern/kxld/WKdmDecompress.c | 2 +- libkern/kxld/kxld.c | 11 +- libkern/kxld/kxld_kext.c | 19 +- libkern/kxld/kxld_kext.h | 6 +- libkern/kxld/kxld_object.c | 299 +- libkern/kxld/kxld_object.h | 8 +- libkern/kxld/kxld_reloc.c | 530 +- libkern/kxld/kxld_reloc.h | 23 +- libkern/kxld/kxld_sect.c | 14 +- libkern/kxld/kxld_sect.h | 1 - libkern/kxld/kxld_seg.c | 36 +- libkern/kxld/kxld_seg.h | 13 +- libkern/kxld/kxld_srcversion.c | 93 + libkern/kxld/kxld_srcversion.h | 68 + libkern/kxld/kxld_sym.c | 4 +- libkern/kxld/kxld_sym.h | 1 - libkern/kxld/kxld_symtab.c | 24 +- libkern/kxld/kxld_symtab.h | 1 - libkern/kxld/kxld_util.c | 34 +- libkern/kxld/kxld_util.h | 7 +- libkern/kxld/kxld_uuid.c | 2 +- libkern/kxld/kxld_versionmin.c | 112 + libkern/kxld/kxld_versionmin.h | 75 + libkern/kxld/kxld_vtable.c | 1 + libkern/libkern/Makefile | 9 +- libkern/libkern/OSAtomic.h | 4 - libkern/libkern/OSKextLib.h | 28 + libkern/libkern/OSKextLibPrivate.h | 22 + libkern/libkern/OSTypes.h | 2 +- libkern/libkern/WKdm.h | 6 +- libkern/libkern/c++/Makefile | 2 - libkern/libkern/c++/OSCollection.h | 5 + libkern/libkern/c++/OSData.h | 5 +- libkern/libkern/c++/OSKext.h | 7 +- libkern/libkern/c++/OSMetaClass.h | 75 +- libkern/libkern/c++/OSObject.h | 4 + libkern/libkern/c++/OSSymbol.h | 8 + libkern/libkern/crypto/Makefile | 6 +- libkern/libkern/crypto/aes.h | 99 + libkern/libkern/crypto/aesxts.h | 80 + libkern/libkern/crypto/crypto_internal.h | 45 + libkern/libkern/crypto/des.h | 109 + libkern/libkern/crypto/register_crypto.h | 142 + libkern/libkern/crypto/sha2.h | 77 + libkern/libkern/kernel_mach_header.h | 17 +- libkern/libkern/kext_request_keys.h | 10 - libkern/libkern/kxld_types.h | 23 +- libkern/libkern/machine/Makefile | 2 - .../libkern/stack_protector.h | 17 +- libkern/libkern/tree.h | 2 +- libkern/stack_protector.c | 70 +- libkern/uuid/Makefile | 3 +- libkern/uuid/uuid.c | 27 +- libkern/x86_64/OSAtomic.s | 24 + libkern/zlib/zutil.h | 4 +- libsa/bootstrap.cpp | 210 +- libsa/conf/MASTER | 18 +- libsa/conf/MASTER.i386 | 2 +- libsa/conf/MASTER.x86_64 | 2 +- libsa/conf/Makefile | 2 + libsa/libsa/Makefile | 2 - libsyscall/Libsyscall.xcconfig | 7 +- .../Libsyscall.xcodeproj/project.pbxproj | 22 +- libsyscall/Platforms/MacOSX/i386/syscall.map | 1 + .../Platforms/MacOSX/x86_64/syscall.map | 1 + libsyscall/custom/SYS.h | 33 +- libsyscall/custom/__getpid.s | 8 +- libsyscall/custom/__gettimeofday.s | 2 +- libsyscall/custom/__pipe.s | 2 +- 
libsyscall/custom/__ptrace.s | 4 +- libsyscall/custom/custom.s | 21 +- libsyscall/mach/mach/mach_interface.h | 1 - libsyscall/mach/mach_msg.c | 1 + libsyscall/mach/mach_port.c | 483 ++ libsyscall/mach/mach_vm.c | 142 + libsyscall/mach/string.h | 1 - libsyscall/mach/vm_map.defs | 6 +- libsyscall/wrappers/__get_cpu_capabilities.s | 2 + libsyscall/wrappers/cancelable/fcntl-base.c | 4 +- libsyscall/wrappers/legacy/getaudit.c | 73 + libsyscall/wrappers/memcpy.c | 17 - libsyscall/wrappers/open_dprotected_np.c | 41 + libsyscall/xcodescripts/create-syscalls.pl | 35 +- libsyscall/xcodescripts/mach_install_mig.sh | 53 +- lldbmacros.py | 184 + makedefs/MakeInc.cmd | 51 +- makedefs/MakeInc.def | 231 +- makedefs/MakeInc.dir | 161 +- makedefs/MakeInc.rule | 56 +- osfmk/Makefile | 12 +- osfmk/chud/chud_glue.c | 10 + osfmk/chud/chud_thread.c | 70 + osfmk/chud/chud_xnu.h | 3 + osfmk/chud/i386/chud_osfmk_callback_i386.c | 3 + osfmk/chud/i386/chud_thread_i386.c | 22 +- osfmk/conf/MASTER | 52 +- osfmk/conf/MASTER.i386 | 8 +- osfmk/conf/MASTER.x86_64 | 7 +- osfmk/conf/Makefile | 2 + osfmk/conf/Makefile.i386 | 14 +- osfmk/conf/Makefile.x86_64 | 3 + osfmk/conf/files | 39 +- osfmk/conf/files.i386 | 6 - osfmk/conf/files.x86_64 | 15 +- osfmk/console/i386/serial_console.c | 7 - osfmk/console/serial_general.c | 1 - osfmk/console/video_console.c | 15 +- osfmk/console/video_console.h | 26 +- osfmk/ddb/Makefile | 27 - osfmk/ddb/db_access.c | 152 - osfmk/ddb/db_access.h | 118 - osfmk/ddb/db_aout.c | 961 --- osfmk/ddb/db_aout.h | 91 - osfmk/ddb/db_break.c | 816 --- osfmk/ddb/db_break.h | 244 - osfmk/ddb/db_coff.h | 112 - osfmk/ddb/db_command.c | 930 --- osfmk/ddb/db_command.h | 114 - osfmk/ddb/db_cond.c | 274 - osfmk/ddb/db_examine.c | 747 --- osfmk/ddb/db_examine.h | 112 - osfmk/ddb/db_expr.c | 482 -- osfmk/ddb/db_ext_symtab.c | 93 - osfmk/ddb/db_input.c | 821 --- osfmk/ddb/db_input.h | 67 - osfmk/ddb/db_lex.c | 575 -- osfmk/ddb/db_lex.h | 220 - osfmk/ddb/db_macro.c | 227 - osfmk/ddb/db_macro.h | 77 - osfmk/ddb/db_output.c | 345 -- osfmk/ddb/db_print.c | 931 --- osfmk/ddb/db_print.h | 203 - osfmk/ddb/db_run.c | 541 -- osfmk/ddb/db_run.h | 96 - osfmk/ddb/db_sym.c | 1502 ----- osfmk/ddb/db_sym.h | 354 -- osfmk/ddb/db_task_thread.c | 337 -- osfmk/ddb/db_task_thread.h | 122 - osfmk/ddb/db_trap.c | 145 - osfmk/ddb/db_trap.h | 82 - osfmk/ddb/db_variables.c | 716 --- osfmk/ddb/db_variables.h | 256 - osfmk/ddb/db_watch.c | 366 -- osfmk/ddb/db_watch.h | 160 - osfmk/ddb/db_write_cmd.c | 132 - osfmk/ddb/db_write_cmd.h | 67 - osfmk/ddb/makedis.c | 2386 -------- osfmk/ddb/nlist.h | 141 - osfmk/ddb/orig/db_print.c | 1373 ----- osfmk/ddb/stab.h | 153 - osfmk/default_pager/default_pager.c | 14 +- osfmk/default_pager/default_pager_internal.h | 9 +- osfmk/default_pager/dp_backing_store.c | 75 +- osfmk/default_pager/dp_memory_object.c | 2 +- osfmk/device/device.defs | 18 +- osfmk/device/device_init.c | 15 + osfmk/device/iokit_rpc.c | 77 +- osfmk/device/subrs.c | 7 +- osfmk/gssd/gssd_mach.defs | 19 +- osfmk/gssd/gssd_mach_types.h | 29 +- osfmk/i386/AT386/model_dep.c | 51 +- osfmk/i386/Diagnostics.c | 95 +- osfmk/i386/Diagnostics.h | 14 +- osfmk/i386/asm.h | 19 +- osfmk/i386/asm64.h | 30 + osfmk/i386/bsd_i386.c | 66 +- osfmk/i386/commpage/commpage.c | 139 +- osfmk/i386/commpage/fifo_queues.s | 3 + osfmk/i386/commpage/pthreads.s | 2 + osfmk/i386/cpu_capabilities.h | 48 +- osfmk/i386/cpu_data.h | 18 +- osfmk/i386/cpu_threads.c | 8 +- osfmk/i386/cpuid.c | 106 +- osfmk/i386/cpuid.h | 28 +- osfmk/i386/db_disasm.c | 1826 ------ 
osfmk/i386/db_gcc_aout.c | 687 --- osfmk/i386/db_interface.c | 1027 ---- osfmk/i386/db_machdep.h | 211 - osfmk/i386/db_trace.c | 876 --- osfmk/i386/etimer.c | 22 +- osfmk/i386/fpu.c | 6 +- osfmk/i386/gdt.c | 8 +- osfmk/i386/genassym.c | 17 - osfmk/i386/hibernate_restore.c | 6 +- osfmk/i386/hpet.c | 75 - osfmk/i386/i386_init.c | 212 +- osfmk/i386/i386_lock.s | 76 +- osfmk/i386/i386_lowmem.h | 4 +- osfmk/i386/i386_vm_init.c | 223 +- osfmk/i386/idle_pt.c | 33 +- osfmk/i386/idt.s | 116 +- osfmk/i386/idt64.s | 53 +- osfmk/i386/ktss.c | 40 +- osfmk/i386/lapic.h | 4 +- osfmk/i386/lapic_native.c | 43 +- osfmk/i386/locks.h | 8 +- osfmk/i386/locks_i386.c | 76 - osfmk/i386/locore.s | 15 - osfmk/i386/loose_ends.c | 18 +- osfmk/i386/lowmem_vectors.s | 1 - osfmk/i386/machdep_call.c | 4 +- osfmk/i386/machdep_call.h | 4 +- osfmk/i386/machine_check.c | 126 +- osfmk/i386/machine_routines.c | 86 +- osfmk/i386/machine_routines.h | 5 + osfmk/i386/machine_routines_asm.s | 3 + osfmk/i386/machine_task.c | 25 + osfmk/i386/misc_protos.h | 5 +- osfmk/i386/mp.c | 372 +- osfmk/i386/mp.h | 3 - osfmk/i386/mp_desc.c | 128 +- osfmk/i386/mp_desc.h | 7 +- osfmk/i386/mp_native.c | 12 - osfmk/i386/mtrr.c | 63 +- osfmk/i386/pal_hibernate.h | 2 +- osfmk/i386/pal_routines.c | 17 +- osfmk/i386/pcb.c | 46 +- osfmk/i386/phys.c | 8 +- osfmk/i386/pmCPU.c | 5 +- osfmk/i386/pmap.c | 303 +- osfmk/i386/pmap.h | 159 +- osfmk/i386/pmap_common.c | 2 + osfmk/i386/pmap_internal.h | 162 +- osfmk/i386/pmap_x86_common.c | 138 +- osfmk/i386/postcode.h | 50 +- osfmk/i386/proc_reg.h | 3 + osfmk/i386/rtclock.c | 2 +- osfmk/i386/rtclock_native.c | 1 - osfmk/i386/seg.h | 23 - osfmk/i386/start.s | 38 +- osfmk/i386/start64.s | 45 +- osfmk/i386/startup64.c | 46 + osfmk/i386/trap.c | 250 +- osfmk/i386/trap_native.c | 18 +- osfmk/i386/tsc.c | 49 +- osfmk/i386/vmx/vmx_asm.h | 100 - osfmk/i386/vmx/vmx_cpu.c | 18 +- osfmk/i386/vmx/vmx_cpu.h | 12 + osfmk/ipc/ipc_entry.c | 860 +-- osfmk/ipc/ipc_entry.h | 36 +- osfmk/ipc/ipc_hash.c | 382 +- osfmk/ipc/ipc_hash.h | 31 +- osfmk/ipc/ipc_init.c | 24 +- osfmk/ipc/ipc_init.h | 1 - osfmk/ipc/ipc_kmsg.c | 437 +- osfmk/ipc/ipc_kmsg.h | 8 +- osfmk/ipc/ipc_labelh.c | 1 + osfmk/ipc/ipc_labelh.h | 1 + osfmk/ipc/ipc_mqueue.c | 30 +- osfmk/ipc/ipc_mqueue.h | 12 +- osfmk/ipc/ipc_object.c | 112 +- osfmk/ipc/ipc_object.h | 61 +- osfmk/ipc/ipc_port.c | 869 +-- osfmk/ipc/ipc_port.h | 45 +- osfmk/ipc/ipc_pset.c | 130 +- osfmk/ipc/ipc_pset.h | 23 +- osfmk/ipc/ipc_right.c | 256 +- osfmk/ipc/ipc_right.h | 9 +- osfmk/ipc/ipc_space.c | 120 +- osfmk/ipc/ipc_space.h | 145 +- osfmk/ipc/ipc_splay.c | 950 --- osfmk/ipc/ipc_splay.h | 144 - osfmk/ipc/ipc_table.c | 27 - osfmk/ipc/ipc_table.h | 40 +- osfmk/ipc/ipc_types.h | 6 +- osfmk/ipc/mach_debug.c | 171 +- osfmk/ipc/mach_kernelrpc.c | 244 + osfmk/ipc/mach_msg.c | 88 +- osfmk/ipc/mach_port.c | 150 +- osfmk/kdp/Makefile | 7 +- osfmk/kdp/kdp.c | 75 +- osfmk/kdp/kdp_dyld.h | 24 +- osfmk/kdp/kdp_udp.c | 49 +- osfmk/kdp/ml/i386/kdp_machdep.c | 1 + osfmk/kdp/ml/i386/kdp_vm.c | 315 +- osfmk/kdp/ml/i386/kdp_x86_common.c | 298 +- .../ml/i386/kdp_x86_common.h} | 56 +- osfmk/kdp/ml/x86_64/kdp_machdep.c | 4 + osfmk/kdp/ml/x86_64/kdp_vm.c | 317 +- osfmk/kern/Makefile | 1 + osfmk/kern/affinity.c | 5 +- osfmk/kern/ast.c | 31 +- osfmk/kern/ast.h | 8 +- osfmk/kern/audit_sessionport.c | 10 +- osfmk/kern/bsd_kern.c | 66 +- osfmk/kern/clock.c | 43 +- osfmk/kern/clock.h | 17 +- osfmk/kern/debug.c | 105 +- osfmk/kern/debug.h | 24 +- osfmk/kern/exception.c | 45 +- osfmk/kern/exception.h | 4 +- osfmk/kern/gzalloc.c | 
439 ++ osfmk/kern/host.c | 8 +- osfmk/kern/ipc_kobject.c | 77 +- osfmk/kern/ipc_mig.c | 31 +- osfmk/kern/ipc_mig.h | 27 +- osfmk/kern/ipc_misc.c | 2 +- osfmk/kern/ipc_tt.c | 30 +- osfmk/kern/ipc_tt.h | 12 - osfmk/kern/kalloc.c | 538 +- osfmk/kern/kalloc.h | 9 - osfmk/kern/kext_alloc.c | 71 +- osfmk/kern/ledger.c | 1277 +++- osfmk/kern/ledger.h | 128 +- osfmk/kern/locks.c | 5 +- osfmk/kern/mach_clock.c | 176 - osfmk/kern/mach_param.h | 2 - osfmk/kern/misc_protos.h | 14 +- osfmk/kern/mk_sp.c | 12 +- osfmk/kern/mk_timer.c | 42 +- osfmk/kern/mk_timer.h | 6 +- osfmk/kern/printf.c | 19 - osfmk/kern/priority.c | 94 +- osfmk/kern/processor.c | 28 +- osfmk/kern/queue.h | 4 + osfmk/kern/sched.h | 8 +- osfmk/kern/sched_average.c | 3 +- osfmk/kern/sched_fixedpriority.c | 65 +- osfmk/kern/sched_grrr.c | 1 - osfmk/kern/sched_prim.c | 340 +- osfmk/kern/sched_prim.h | 18 +- osfmk/kern/security.c | 8 +- osfmk/kern/stack.c | 14 +- osfmk/kern/startup.c | 28 +- osfmk/kern/sync_lock.c | 12 +- osfmk/kern/sync_sema.c | 23 +- osfmk/kern/syscall_subr.c | 2 +- osfmk/kern/syscall_sw.c | 53 +- osfmk/kern/syscall_sw.h | 12 +- osfmk/kern/task.c | 602 +- osfmk/kern/task.h | 156 +- osfmk/kern/task_policy.c | 1617 ++++- osfmk/kern/thread.c | 245 +- osfmk/kern/thread.h | 111 +- osfmk/kern/thread_act.c | 72 +- osfmk/kern/thread_call.c | 1150 ++-- osfmk/kern/thread_call.h | 222 +- osfmk/kern/thread_policy.c | 71 +- osfmk/kern/timer.c | 13 +- osfmk/kern/timer.h | 53 +- osfmk/kern/timer_call.c | 40 +- osfmk/kern/wait_queue.c | 287 +- osfmk/kern/wait_queue.h | 20 +- osfmk/kern/xpr.c | 172 - osfmk/kern/zalloc.c | 971 ++- osfmk/kern/zalloc.h | 75 +- osfmk/kperf/Makefile | 33 + osfmk/kperf/action.c | 364 ++ osfmk/kperf/action.h | 68 + osfmk/kperf/ast.h | 30 + osfmk/kperf/buffer.h | 103 + osfmk/kperf/callstack.c | 167 + osfmk/kperf/callstack.h | 57 + osfmk/kperf/context.h | 39 + osfmk/kperf/filter.c | 117 + osfmk/kperf/filter.h | 39 + osfmk/kperf/kperf.c | 194 + osfmk/kperf/kperf.h | 46 + osfmk/kperf/kperf_arch.h | 41 + osfmk/kperf/kperfbsd.c | 342 ++ osfmk/kperf/kperfbsd.h | 29 + osfmk/kperf/pet.c | 331 ++ osfmk/kperf/pet.h | 40 + .../kperf/sample.h | 21 +- osfmk/kperf/threadinfo.c | 227 + osfmk/kperf/threadinfo.h | 56 + osfmk/kperf/timetrigger.c | 351 ++ osfmk/kperf/timetrigger.h | 52 + osfmk/kperf/x86_64/kperf_arch.h | 30 + osfmk/kperf/x86_64/kperf_mp.c | 40 + osfmk/mach/Makefile | 5 - osfmk/mach/Makefile.template | 14 +- osfmk/mach/exception_types.h | 9 + osfmk/mach/host_priv.defs | 13 +- osfmk/mach/host_special_ports.h | 10 +- osfmk/mach/i386/exception.h | 2 +- osfmk/mach/i386/machine_types.defs | 2 + osfmk/mach/i386/sdt_isa.h | 2 +- osfmk/mach/i386/vm_param.h | 64 +- osfmk/mach/i386/vm_types.h | 2 + osfmk/mach/kmod.h | 9 + osfmk/mach/ledger.defs | 33 +- osfmk/mach/mach_host.defs | 25 +- osfmk/mach/mach_interface.h | 3 +- osfmk/mach/mach_port.defs | 21 +- osfmk/mach/mach_traps.h | 174 +- osfmk/mach/mach_types.defs | 7 +- osfmk/mach/mach_types.h | 10 +- osfmk/mach/mach_vm.defs | 62 +- osfmk/mach/machine.h | 6 + osfmk/mach/memory_object_types.h | 3 + osfmk/mach/message.h | 30 +- osfmk/mach/mig.h | 20 +- osfmk/mach/ndr.h | 18 +- osfmk/mach/shared_region.h | 1 + osfmk/mach/syscall_sw.h | 19 + osfmk/mach/task_info.h | 37 +- osfmk/mach/task_special_ports.h | 25 +- osfmk/mach/vm_map.defs | 29 +- osfmk/mach/vm_param.h | 36 +- osfmk/mach/vm_statistics.h | 9 +- osfmk/machine/Makefile | 3 +- osfmk/machine/commpage.h | 2 +- .../machine/{db_machdep.h => machine_cpuid.h} | 16 +- osfmk/pmc/pmc.c | 473 +- osfmk/pmc/pmc.h | 131 +- 
osfmk/profiling/Makefile | 4 - osfmk/vm/bsd_vm.c | 65 +- osfmk/vm/cpm.h | 11 - osfmk/vm/default_freezer.c | 529 +- osfmk/vm/default_freezer.h | 39 +- osfmk/vm/memory_object.c | 70 +- osfmk/vm/pmap.h | 50 +- osfmk/vm/vm_apple_protect.c | 3 + osfmk/vm/vm_fault.c | 475 +- osfmk/vm/vm_fault.h | 1 + osfmk/vm/vm_init.c | 43 +- osfmk/vm/vm_kern.c | 26 +- osfmk/vm/vm_kern.h | 1 + osfmk/vm/vm_map.c | 709 +-- osfmk/vm/vm_map.h | 38 +- osfmk/vm/vm_map_store.c | 1 + osfmk/vm/vm_object.c | 664 +-- osfmk/vm/vm_object.h | 51 +- osfmk/vm/vm_page.h | 130 +- osfmk/vm/vm_pageout.c | 1599 +++-- osfmk/vm/vm_pageout.h | 42 +- osfmk/vm/vm_protos.h | 43 +- osfmk/vm/vm_purgeable.c | 79 +- osfmk/vm/vm_purgeable_internal.h | 1 + osfmk/vm/vm_resident.c | 917 +-- osfmk/vm/vm_shared_region.c | 107 +- osfmk/vm/vm_shared_region.h | 1 + osfmk/vm/vm_swapfile_pager.c | 1 + osfmk/vm/vm_user.c | 37 +- osfmk/x86_64/boot_pt.c | 90 + osfmk/x86_64/idt64.s | 88 +- osfmk/x86_64/idt_table.h | 10 +- osfmk/x86_64/locore.s | 3 - osfmk/x86_64/loose_ends.c | 18 +- osfmk/x86_64/lowglobals.h | 38 +- .../db_output.h => x86_64/lowmem_vectors.c} | 73 +- osfmk/x86_64/lowmem_vectors.s | 104 - osfmk/x86_64/machine_routines_asm.s | 86 +- osfmk/x86_64/pmap.c | 480 +- osfmk/x86_64/start.s | 369 +- pexpert/conf/MASTER | 1 + pexpert/conf/MASTER.x86_64 | 2 +- pexpert/conf/Makefile | 2 + pexpert/conf/files | 1 - pexpert/gen/bootargs.c | 5 +- pexpert/gen/device_tree.c | 7 + pexpert/i386/pe_init.c | 9 + pexpert/pexpert/device_tree.h | 4 +- pexpert/pexpert/i386/boot.h | 7 +- pexpert/pexpert/pexpert.h | 9 +- security/conf/MASTER | 7 + security/conf/MASTER.i386 | 4 +- security/conf/MASTER.x86_64 | 5 +- security/conf/Makefile | 2 + security/mac.h | 3 +- security/mac_base.c | 127 +- security/mac_framework.h | 11 +- security/mac_inet.c | 4 +- security/mac_internal.h | 7 - security/mac_mach_internal.h | 13 + security/mac_policy.h | 83 +- security/mac_posix_shm.c | 4 +- security/mac_process.c | 80 + security/mac_socket.c | 8 +- security/mac_system.c | 13 + security/mac_vfs.c | 2 +- tools/tests/MPMMTest/KQMPMMtest.c | 22 +- tools/tests/MPMMTest/MPMMtest.c | 16 +- tools/tests/execperf/exit.c | 2 + tools/tests/execperf/printexecinfo.c | 1 + tools/tests/execperf/run.c | 1 + tools/tests/libMicro/AppleReadMe | 15 +- tools/tests/libMicro/Makefile | 9 +- tools/tests/libMicro/Makefile.Darwin | 11 +- tools/tests/libMicro/apple/Makefile.Darwin | 10 +- .../tests/libMicro/apple/Makefile.benchmarks | 8 +- tools/tests/libMicro/coreos_bench.sh | 18 +- tools/tests/libMicro/embd_bench.sh | 815 +++ .../tests/xnu_quick_test/32bit_inode_tests.c | 2 +- tools/tests/xnu_quick_test/commpage_tests.c | 9 +- .../xnu_quick_test/content_protection_test.c | 922 +++ .../tests/xnu_quick_test/helpers/data_exec.c | 2 +- tools/tests/xnu_quick_test/helpers/launch.c | 1 + tools/tests/xnu_quick_test/main.c | 36 +- tools/tests/xnu_quick_test/makefile | 39 +- tools/tests/xnu_quick_test/memory_tests.c | 48 +- tools/tests/xnu_quick_test/misc.c | 5 +- tools/tests/xnu_quick_test/pipes_tests.c | 880 +++ tools/tests/xnu_quick_test/socket_tests.c | 4 +- tools/tests/xnu_quick_test/tests.c | 217 +- tools/tests/xnu_quick_test/tests.h | 9 +- 1179 files changed, 105414 insertions(+), 82359 deletions(-) create mode 100644 .lldbinit create mode 100644 EXTERNAL_HEADERS/corecrypto/cc.h create mode 100644 EXTERNAL_HEADERS/corecrypto/cc_config.h create mode 100644 EXTERNAL_HEADERS/corecrypto/cc_priv.h create mode 100644 EXTERNAL_HEADERS/corecrypto/ccaes.h create mode 100644 EXTERNAL_HEADERS/corecrypto/ccder.h create 
mode 100644 EXTERNAL_HEADERS/corecrypto/ccdes.h create mode 100644 EXTERNAL_HEADERS/corecrypto/ccdigest.h create mode 100644 EXTERNAL_HEADERS/corecrypto/cchmac.h create mode 100644 EXTERNAL_HEADERS/corecrypto/ccmd5.h create mode 100644 EXTERNAL_HEADERS/corecrypto/ccmode.h create mode 100644 EXTERNAL_HEADERS/corecrypto/ccmode_factory.h create mode 100644 EXTERNAL_HEADERS/corecrypto/ccmode_impl.h create mode 100644 EXTERNAL_HEADERS/corecrypto/ccn.h create mode 100644 EXTERNAL_HEADERS/corecrypto/ccpad.h create mode 100644 EXTERNAL_HEADERS/corecrypto/ccpbkdf2.h create mode 100644 EXTERNAL_HEADERS/corecrypto/ccrc4.h create mode 100644 EXTERNAL_HEADERS/corecrypto/ccrng.h create mode 100644 EXTERNAL_HEADERS/corecrypto/ccrng_system.h create mode 100644 EXTERNAL_HEADERS/corecrypto/ccsha1.h create mode 100644 EXTERNAL_HEADERS/corecrypto/ccsha2.h delete mode 100644 EXTERNAL_HEADERS/mach-o/kld.h create mode 100644 SETUP/decomment/Makefile create mode 100644 SETUP/decomment/decomment.c create mode 100644 SETUP/md/Makefile create mode 100644 SETUP/md/md.1 create mode 100644 SETUP/md/md.c create mode 100644 bsd/crypto/aes.h delete mode 100644 bsd/crypto/aes/Assert.c delete mode 100755 bsd/crypto/aes/aes.h delete mode 100644 bsd/crypto/aes/gen/Makefile delete mode 100644 bsd/crypto/aes/gen/aescrypt.c delete mode 100644 bsd/crypto/aes/gen/aeskey.c delete mode 100644 bsd/crypto/aes/gen/aesopt.h delete mode 100644 bsd/crypto/aes/gen/aestab.c delete mode 100644 bsd/crypto/aes/gen/aestab.h delete mode 100644 bsd/crypto/aes/i386/AES.s delete mode 100644 bsd/crypto/aes/i386/Context.h delete mode 100644 bsd/crypto/aes/i386/Data.mk delete mode 100644 bsd/crypto/aes/i386/Data.s delete mode 100644 bsd/crypto/aes/i386/EncryptDecrypt.s delete mode 100644 bsd/crypto/aes/i386/ExpandKeyForDecryption.s delete mode 100644 bsd/crypto/aes/i386/ExpandKeyForEncryption.s delete mode 100644 bsd/crypto/aes/i386/MakeData.c delete mode 100644 bsd/crypto/aes/i386/Makefile delete mode 100644 bsd/crypto/aes/i386/ReadMe.txt delete mode 100644 bsd/crypto/aes/i386/aes_crypt_hw.s delete mode 100644 bsd/crypto/aes/i386/aes_key_hw.s delete mode 100644 bsd/crypto/aes/i386/aes_modes_asm.s delete mode 100644 bsd/crypto/aes/i386/aes_modes_hw.s delete mode 100644 bsd/crypto/aes/i386/aesxts.c delete mode 100644 bsd/crypto/aes/i386/aesxts.h delete mode 100644 bsd/crypto/aes/i386/aesxts_asm.s create mode 100644 bsd/crypto/aesxts.h create mode 100644 bsd/crypto/des.h delete mode 100644 bsd/crypto/des/des.h delete mode 100644 bsd/crypto/des/des_ecb.c delete mode 100644 bsd/crypto/des/des_enc.c delete mode 100644 bsd/crypto/des/des_locl.h delete mode 100644 bsd/crypto/des/des_setkey.c delete mode 100644 bsd/crypto/des/podd.h delete mode 100644 bsd/crypto/des/sk.h delete mode 100644 bsd/crypto/des/spr.h create mode 100644 bsd/crypto/sha2.h delete mode 100644 bsd/crypto/sha2/Makefile delete mode 100644 bsd/crypto/sha2/intel/sha256.s delete mode 100644 bsd/crypto/sha2/intel/sha256nossse3.s delete mode 100644 bsd/crypto/sha2/sha2.c delete mode 100644 bsd/crypto/sha2/sha2.h delete mode 100644 bsd/kern/kern_callout.c rename bsd/{crypto/aes => net/altq}/Makefile (52%) create mode 100644 bsd/net/altq/altq.h create mode 100644 bsd/net/altq/altq_cbq.c create mode 100644 bsd/net/altq/altq_cbq.h create mode 100644 bsd/net/altq/altq_fairq.c create mode 100644 bsd/net/altq/altq_fairq.h create mode 100644 bsd/net/altq/altq_hfsc.c create mode 100644 bsd/net/altq/altq_hfsc.h create mode 100644 bsd/net/altq/altq_priq.c create mode 100644 bsd/net/altq/altq_priq.h create 
mode 100644 bsd/net/altq/altq_qfq.c create mode 100644 bsd/net/altq/altq_qfq.h create mode 100644 bsd/net/altq/altq_subr.c create mode 100644 bsd/net/altq/altq_var.h create mode 100644 bsd/net/altq/if_altq.h rename bsd/{crypto/des => net/classq}/Makefile (51%) create mode 100644 bsd/net/classq/classq.c create mode 100644 bsd/net/classq/classq.h create mode 100644 bsd/net/classq/classq_blue.c create mode 100644 bsd/net/classq/classq_blue.h create mode 100644 bsd/net/classq/classq_red.c create mode 100644 bsd/net/classq/classq_red.h create mode 100644 bsd/net/classq/classq_rio.c create mode 100644 bsd/net/classq/classq_rio.h create mode 100644 bsd/net/classq/classq_sfb.c create mode 100644 bsd/net/classq/classq_sfb.h create mode 100644 bsd/net/classq/classq_subr.c create mode 100644 bsd/net/classq/classq_util.c create mode 100644 bsd/net/classq/if_classq.h rename bsd/net/{pf_mtag.h => flowadv.h} (67%) create mode 100644 bsd/net/flowhash.c rename osfmk/vm/vm_print.h => bsd/net/flowhash.h (61%) create mode 100644 bsd/net/if_bond_internal.h create mode 100644 bsd/net/if_utun_crypto.c create mode 100644 bsd/net/if_utun_crypto.h create mode 100644 bsd/net/if_utun_crypto_ipsec.c create mode 100644 bsd/net/if_utun_crypto_ipsec.h create mode 100644 bsd/net/iptap.c rename osfmk/ddb/db_expr.h => bsd/net/iptap.h (55%) create mode 100644 bsd/net/pktsched/Makefile create mode 100644 bsd/net/pktsched/pktsched.c create mode 100644 bsd/net/pktsched/pktsched.h create mode 100644 bsd/net/pktsched/pktsched_cbq.c create mode 100644 bsd/net/pktsched/pktsched_cbq.h create mode 100644 bsd/net/pktsched/pktsched_fairq.c create mode 100644 bsd/net/pktsched/pktsched_fairq.h create mode 100644 bsd/net/pktsched/pktsched_hfsc.c create mode 100644 bsd/net/pktsched/pktsched_hfsc.h create mode 100644 bsd/net/pktsched/pktsched_priq.c create mode 100644 bsd/net/pktsched/pktsched_priq.h create mode 100644 bsd/net/pktsched/pktsched_qfq.c create mode 100644 bsd/net/pktsched/pktsched_qfq.h create mode 100644 bsd/net/pktsched/pktsched_rmclass.c create mode 100644 bsd/net/pktsched/pktsched_rmclass.h create mode 100644 bsd/net/pktsched/pktsched_rmclass_debug.h create mode 100644 bsd/net/pktsched/pktsched_tcq.c create mode 100644 bsd/net/pktsched/pktsched_tcq.h create mode 100644 bsd/netinet/ip_flowid.h rename osfmk/ddb/db_cond.h => bsd/netinet/lro_ext.h (51%) create mode 100644 bsd/netinet/tcp_lro.c create mode 100644 bsd/netinet/tcp_lro.h create mode 100644 bsd/netinet6/nd6_prproxy.c create mode 100644 bsd/nfs/nfs_upcall.c rename osfmk/ipc/ipc_print.h => bsd/sys/kas_info.h (64%) delete mode 100644 bsd/sys/kern_callout.h create mode 100644 bsd/sys/munge.h create mode 100644 iokit/IOKit/x86_64/Makefile create mode 100644 libkern/crypto/corecrypto_aes.c create mode 100644 libkern/crypto/corecrypto_aesxts.c create mode 100644 libkern/crypto/corecrypto_des.c create mode 100644 libkern/crypto/corecrypto_md5.c create mode 100644 libkern/crypto/corecrypto_sha1.c create mode 100644 libkern/crypto/corecrypto_sha2.c delete mode 100644 libkern/crypto/md5.c create mode 100644 libkern/crypto/register_crypto.c delete mode 100644 libkern/crypto/sha1.c delete mode 100644 libkern/kmod/Makefile delete mode 100644 libkern/kmod/Makefile.kmod create mode 100644 libkern/kmod/libkmod.xcodeproj/project.pbxproj create mode 100644 libkern/kmod/libkmodtest/libkmodtest-Info.plist rename bsd/net/dlil_pvt.h => libkern/kmod/libkmodtest/libkmodtest.cpp (71%) create mode 100644 libkern/kmod/libkmodtest/libkmodtest.h create mode 100644 libkern/kxld/kxld_srcversion.c 
create mode 100644 libkern/kxld/kxld_srcversion.h create mode 100644 libkern/kxld/kxld_versionmin.c create mode 100644 libkern/kxld/kxld_versionmin.h create mode 100644 libkern/libkern/crypto/aes.h create mode 100644 libkern/libkern/crypto/aesxts.h create mode 100644 libkern/libkern/crypto/crypto_internal.h create mode 100644 libkern/libkern/crypto/des.h create mode 100644 libkern/libkern/crypto/register_crypto.h create mode 100644 libkern/libkern/crypto/sha2.h rename libsyscall/custom/__psynch_cvwait.s => libkern/libkern/stack_protector.h (80%) create mode 100644 libsyscall/mach/mach_port.c create mode 100644 libsyscall/mach/mach_vm.c create mode 100644 libsyscall/wrappers/legacy/getaudit.c create mode 100644 libsyscall/wrappers/open_dprotected_np.c create mode 100644 lldbmacros.py delete mode 100644 osfmk/ddb/Makefile delete mode 100644 osfmk/ddb/db_access.c delete mode 100644 osfmk/ddb/db_access.h delete mode 100644 osfmk/ddb/db_aout.c delete mode 100644 osfmk/ddb/db_aout.h delete mode 100644 osfmk/ddb/db_break.c delete mode 100644 osfmk/ddb/db_break.h delete mode 100644 osfmk/ddb/db_coff.h delete mode 100644 osfmk/ddb/db_command.c delete mode 100644 osfmk/ddb/db_command.h delete mode 100644 osfmk/ddb/db_cond.c delete mode 100644 osfmk/ddb/db_examine.c delete mode 100644 osfmk/ddb/db_examine.h delete mode 100644 osfmk/ddb/db_expr.c delete mode 100644 osfmk/ddb/db_ext_symtab.c delete mode 100644 osfmk/ddb/db_input.c delete mode 100644 osfmk/ddb/db_input.h delete mode 100644 osfmk/ddb/db_lex.c delete mode 100644 osfmk/ddb/db_lex.h delete mode 100644 osfmk/ddb/db_macro.c delete mode 100644 osfmk/ddb/db_macro.h delete mode 100644 osfmk/ddb/db_output.c delete mode 100644 osfmk/ddb/db_print.c delete mode 100644 osfmk/ddb/db_print.h delete mode 100644 osfmk/ddb/db_run.c delete mode 100644 osfmk/ddb/db_run.h delete mode 100644 osfmk/ddb/db_sym.c delete mode 100644 osfmk/ddb/db_sym.h delete mode 100644 osfmk/ddb/db_task_thread.c delete mode 100644 osfmk/ddb/db_task_thread.h delete mode 100644 osfmk/ddb/db_trap.c delete mode 100644 osfmk/ddb/db_trap.h delete mode 100644 osfmk/ddb/db_variables.c delete mode 100644 osfmk/ddb/db_variables.h delete mode 100644 osfmk/ddb/db_watch.c delete mode 100644 osfmk/ddb/db_watch.h delete mode 100644 osfmk/ddb/db_write_cmd.c delete mode 100644 osfmk/ddb/db_write_cmd.h delete mode 100644 osfmk/ddb/makedis.c delete mode 100644 osfmk/ddb/nlist.h delete mode 100644 osfmk/ddb/orig/db_print.c delete mode 100644 osfmk/ddb/stab.h delete mode 100644 osfmk/i386/db_disasm.c delete mode 100644 osfmk/i386/db_gcc_aout.c delete mode 100644 osfmk/i386/db_interface.c delete mode 100644 osfmk/i386/db_machdep.h delete mode 100644 osfmk/i386/db_trace.c delete mode 100644 osfmk/ipc/ipc_splay.c delete mode 100644 osfmk/ipc/ipc_splay.h create mode 100644 osfmk/ipc/mach_kernelrpc.c rename osfmk/{kern/kern_print.h => kdp/ml/i386/kdp_x86_common.h} (57%) create mode 100644 osfmk/kern/gzalloc.c delete mode 100644 osfmk/kern/mach_clock.c create mode 100644 osfmk/kperf/Makefile create mode 100644 osfmk/kperf/action.c create mode 100644 osfmk/kperf/action.h create mode 100644 osfmk/kperf/ast.h create mode 100644 osfmk/kperf/buffer.h create mode 100644 osfmk/kperf/callstack.c create mode 100644 osfmk/kperf/callstack.h create mode 100644 osfmk/kperf/context.h create mode 100644 osfmk/kperf/filter.c create mode 100644 osfmk/kperf/filter.h create mode 100644 osfmk/kperf/kperf.c create mode 100644 osfmk/kperf/kperf.h create mode 100644 osfmk/kperf/kperf_arch.h create mode 100644 
osfmk/kperf/kperfbsd.c
 create mode 100644 osfmk/kperf/kperfbsd.h
 create mode 100644 osfmk/kperf/pet.c
 create mode 100644 osfmk/kperf/pet.h
 rename libsyscall/custom/__psynch_cvbroad.s => osfmk/kperf/sample.h (79%)
 create mode 100644 osfmk/kperf/threadinfo.c
 create mode 100644 osfmk/kperf/threadinfo.h
 create mode 100644 osfmk/kperf/timetrigger.c
 create mode 100644 osfmk/kperf/timetrigger.h
 create mode 100644 osfmk/kperf/x86_64/kperf_arch.h
 create mode 100644 osfmk/kperf/x86_64/kperf_mp.c
 rename osfmk/machine/{db_machdep.h => machine_cpuid.h} (84%)
 create mode 100644 osfmk/x86_64/boot_pt.c
 rename osfmk/{ddb/db_output.h => x86_64/lowmem_vectors.c} (68%)
 delete mode 100644 osfmk/x86_64/lowmem_vectors.s
 create mode 100644 tools/tests/libMicro/embd_bench.sh
 create mode 100644 tools/tests/xnu_quick_test/content_protection_test.c
 create mode 100644 tools/tests/xnu_quick_test/pipes_tests.c

diff --git a/.lldbinit b/.lldbinit
new file mode 100644
index 000000000..6fd12a4c5
--- /dev/null
+++ b/.lldbinit
@@ -0,0 +1,17 @@
+# Import python macros
+script import lldbmacros
+
+# Basic types
+type summary add --regex --summary-string "${var%s}" "char \[[0-9]*\]"
+type summary add --summary-string "${var[0]%y}${var[1]%y}${var[2]%y}${var[3]%y}-${var[4]%y}${var[5]%y}-${var[6]%y}${var[7]%y}-${var[8]%y}${var[9]%y}-${var[10]%y}${var[11]%y}${var[12]%y}${var[13]%y}${var[14]%y}${var[15]%y}" uuid_t
+
+# Kexts
+type summary add --summary-string "${var->loadTag%u} ${var->address%x} ${var->size%x} ${var->version%u} ${var->name%s}" OSKextLoadedKextSummary
+type summary add -v --python-function lldbmacros.showallkexts_summary OSKextLoadedKextSummaryHeader
+command script add -f lldbmacros.showallkexts_command showallkexts
+
+#KGMacros
+command script add -f lldbmacros.zprint_command zprint
+command script add -f lldbmacros.memstats_command memstats
+command script add -f lldbmacros.showioalloc_command showioalloc
+
diff --git a/EXTERNAL_HEADERS/Availability.h b/EXTERNAL_HEADERS/Availability.h
index e811335c1..5c6ccf781 100644
--- a/EXTERNAL_HEADERS/Availability.h
+++ b/EXTERNAL_HEADERS/Availability.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2010 by Apple Inc.. All rights reserved.
+ * Copyright (c) 2007-2011 by Apple Inc.. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
@@ -55,7 +55,7 @@
     For these macros to function properly, a program must specify the OS version range it
     is targeting.  The min OS version is specified as an option to the compiler:
-    -mmacosx-version-min=10.x when building for Mac OS X, and -miphone-version-min=1.x.x
+    -mmacosx-version-min=10.x when building for Mac OS X, and -miphoneos-version-min=x.x
     when building for the iPhone.  The upper bound for the OS version is rarely needed,
     but it can be set on the command line via: -D__MAC_OS_X_VERSION_MAX_ALLOWED=10xx for
     Mac OS X and __IPHONE_OS_VERSION_MAX_ALLOWED = 1xxx for iPhone.
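
The comment in the hunk above describes the deployment-target model these headers rely on. A minimal sketch of a translation unit that exercises the new version tokens this patch adds; the file name avail_demo.c and the new_api declaration are hypothetical, the macros and flags come from the header itself:

    /* Build with, e.g.:
     *   clang -mmacosx-version-min=10.7 -c avail_demo.c      (Mac OS X)
     *   clang -miphoneos-version-min=5.0 -c avail_demo.c     (iPhone)
     */
    #include <Availability.h>

    /* Declared as introduced in Mac OS X 10.8 / iOS 5.1, the two version
     * tokens this patch adds.  On a Mac OS X build the token pasting in
     * __OSX_AVAILABLE_STARTING selects __AVAILABILITY_INTERNAL__MAC_10_8;
     * on an iPhone build it selects __AVAILABILITY_INTERNAL__IPHONE_5_1,
     * so the compiler can flag uses of a symbol that is newer than the
     * declared deployment target. */
    extern int new_api(void) __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_5_1);
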
@@ -124,6 +124,7 @@
 #define __MAC_10_5 1050
 #define __MAC_10_6 1060
 #define __MAC_10_7 1070
+#define __MAC_10_8 1080
 #define __MAC_NA 9999 /* not available */

 #define __IPHONE_2_0 20000
@@ -132,24 +133,30 @@
 #define __IPHONE_3_0 30000
 #define __IPHONE_3_1 30100
 #define __IPHONE_3_2 30200
+#define __IPHONE_4_0 40000
+#define __IPHONE_4_1 40100
+#define __IPHONE_4_2 40200
+#define __IPHONE_4_3 40300
+#define __IPHONE_5_0 50000
+#define __IPHONE_5_1 50100
 #define __IPHONE_NA 99999 /* not available */

 #include <AvailabilityInternal.h>

 #ifdef __IPHONE_OS_VERSION_MIN_REQUIRED
-    #define __OSX_AVAILABLE_STARTING(_mac, _iphone) __AVAILABILITY_INTERNAL##_iphone
-    #define __OSX_AVAILABLE_BUT_DEPRECATED(_macIntro, _macDep, _iphoneIntro, _iphoneDep) \
-        __AVAILABILITY_INTERNAL##_iphoneIntro##_DEP##_iphoneDep
+    #define __OSX_AVAILABLE_STARTING(_osx, _ios) __AVAILABILITY_INTERNAL##_ios
+    #define __OSX_AVAILABLE_BUT_DEPRECATED(_osxIntro, _osxDep, _iosIntro, _iosDep) \
+        __AVAILABILITY_INTERNAL##_iosIntro##_DEP##_iosDep
 #elif defined(__MAC_OS_X_VERSION_MIN_REQUIRED)
-    #define __OSX_AVAILABLE_STARTING(_mac, _iphone) __AVAILABILITY_INTERNAL##_mac
-    #define __OSX_AVAILABLE_BUT_DEPRECATED(_macIntro, _macDep, _iphoneIntro, _iphoneDep) \
-        __AVAILABILITY_INTERNAL##_macIntro##_DEP##_macDep
+    #define __OSX_AVAILABLE_STARTING(_osx, _ios) __AVAILABILITY_INTERNAL##_osx
+    #define __OSX_AVAILABLE_BUT_DEPRECATED(_osxIntro, _osxDep, _iosIntro, _iosDep) \
+        __AVAILABILITY_INTERNAL##_osxIntro##_DEP##_osxDep
 #else
-    #define __OSX_AVAILABLE_STARTING(_mac, _iphone)
-    #define __OSX_AVAILABLE_BUT_DEPRECATED(_macIntro, _macDep, _iphoneIntro, _iphoneDep)
+    #define __OSX_AVAILABLE_STARTING(_osx, _ios)
+    #define __OSX_AVAILABLE_BUT_DEPRECATED(_osxIntro, _osxDep, _iosIntro, _iosDep)
 #endif

diff --git a/EXTERNAL_HEADERS/AvailabilityInternal.h b/EXTERNAL_HEADERS/AvailabilityInternal.h
index a4524708e..d94e55d08 100644
--- a/EXTERNAL_HEADERS/AvailabilityInternal.h
+++ b/EXTERNAL_HEADERS/AvailabilityInternal.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2010 by Apple Inc.. All rights reserved.
+ * Copyright (c) 2007-2011 by Apple Inc.. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
@@ -55,7 +55,7 @@
 #ifdef __IPHONE_OS_VERSION_MIN_REQUIRED
     /* make sure a default max version is set */
     #ifndef __IPHONE_OS_VERSION_MAX_ALLOWED
-        #define __IPHONE_OS_VERSION_MAX_ALLOWED __IPHONE_3_2
+        #define __IPHONE_OS_VERSION_MAX_ALLOWED __IPHONE_5_1
     #endif
     /* make sure a valid min is set */
     #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_0
@@ -63,331 +63,1236 @@
         #define __IPHONE_OS_VERSION_MIN_REQUIRED __IPHONE_2_0
     #endif
-    /* set up internal macros (up to 2.0) */
-    #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_2_0
-        #define __AVAILABILITY_INTERNAL__IPHONE_2_0 __AVAILABILITY_INTERNAL_UNAVAILABLE
-    #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_0
-        #define __AVAILABILITY_INTERNAL__IPHONE_2_0 __AVAILABILITY_INTERNAL_WEAK_IMPORT
-    #else
-        #define __AVAILABILITY_INTERNAL__IPHONE_2_0 __AVAILABILITY_INTERNAL_REGULAR
+    #ifdef __has_attribute
+        #if __has_attribute(availability)
+            /* use better attributes if possible */
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_0 __attribute__((availability(ios,introduced=2.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_0 __attribute__((availability(ios,introduced=2.0,deprecated=2.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_1 __attribute__((availability(ios,introduced=2.0,deprecated=2.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_2 __attribute__((availability(ios,introduced=2.0,deprecated=2.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __attribute__((availability(ios,introduced=2.0,deprecated=3.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __attribute__((availability(ios,introduced=2.0,deprecated=3.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __attribute__((availability(ios,introduced=2.0,deprecated=3.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_0 __attribute__((availability(ios,introduced=2.0,deprecated=4.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_1 __attribute__((availability(ios,introduced=2.0,deprecated=4.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_2 __attribute__((availability(ios,introduced=2.0,deprecated=4.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_3 __attribute__((availability(ios,introduced=2.0,deprecated=4.3)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_0 __attribute__((availability(ios,introduced=2.0,deprecated=5.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_1 __attribute__((availability(ios,introduced=2.0,deprecated=5.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_NA __attribute__((availability(ios,introduced=2.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_1 __attribute__((availability(ios,introduced=2.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_1 __attribute__((availability(ios,introduced=2.1,deprecated=2.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_2 __attribute__((availability(ios,introduced=2.1,deprecated=2.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __attribute__((availability(ios,introduced=2.1,deprecated=3.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __attribute__((availability(ios,introduced=2.1,deprecated=3.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __attribute__((availability(ios,introduced=2.1,deprecated=3.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_0 __attribute__((availability(ios,introduced=2.1,deprecated=4.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_1 __attribute__((availability(ios,introduced=2.1,deprecated=4.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_2 __attribute__((availability(ios,introduced=2.1,deprecated=4.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_3 __attribute__((availability(ios,introduced=2.1,deprecated=4.3)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_0 __attribute__((availability(ios,introduced=2.1,deprecated=5.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_1 __attribute__((availability(ios,introduced=2.1,deprecated=5.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_NA __attribute__((availability(ios,introduced=2.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_2 __attribute__((availability(ios,introduced=2.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_2_2 __attribute__((availability(ios,introduced=2.2,deprecated=2.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __attribute__((availability(ios,introduced=2.2,deprecated=3.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __attribute__((availability(ios,introduced=2.2,deprecated=3.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __attribute__((availability(ios,introduced=2.2,deprecated=3.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_0 __attribute__((availability(ios,introduced=2.2,deprecated=4.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_1 __attribute__((availability(ios,introduced=2.2,deprecated=4.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_2 __attribute__((availability(ios,introduced=2.2,deprecated=4.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_3 __attribute__((availability(ios,introduced=2.2,deprecated=4.3)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_0 __attribute__((availability(ios,introduced=2.2,deprecated=5.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_1 __attribute__((availability(ios,introduced=2.2,deprecated=5.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_NA __attribute__((availability(ios,introduced=2.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_0 __attribute__((availability(ios,introduced=3.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __attribute__((availability(ios,introduced=3.0,deprecated=3.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __attribute__((availability(ios,introduced=3.0,deprecated=3.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __attribute__((availability(ios,introduced=3.0,deprecated=3.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_0 __attribute__((availability(ios,introduced=3.0,deprecated=4.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_1 __attribute__((availability(ios,introduced=3.0,deprecated=4.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_2 __attribute__((availability(ios,introduced=3.0,deprecated=4.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_3 __attribute__((availability(ios,introduced=3.0,deprecated=4.3)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_0 __attribute__((availability(ios,introduced=3.0,deprecated=5.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_1 __attribute__((availability(ios,introduced=3.0,deprecated=5.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_NA __attribute__((availability(ios,introduced=3.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_1 __attribute__((availability(ios,introduced=3.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __attribute__((availability(ios,introduced=3.1,deprecated=3.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __attribute__((availability(ios,introduced=3.1,deprecated=3.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_0 __attribute__((availability(ios,introduced=3.1,deprecated=4.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_1 __attribute__((availability(ios,introduced=3.1,deprecated=4.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_2 __attribute__((availability(ios,introduced=3.1,deprecated=4.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_3 __attribute__((availability(ios,introduced=3.1,deprecated=4.3)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_0 __attribute__((availability(ios,introduced=3.1,deprecated=5.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_1 __attribute__((availability(ios,introduced=3.1,deprecated=5.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_NA __attribute__((availability(ios,introduced=3.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_2 __attribute__((availability(ios,introduced=3.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __attribute__((availability(ios,introduced=3.2,deprecated=3.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_0 __attribute__((availability(ios,introduced=3.2,deprecated=4.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_1 __attribute__((availability(ios,introduced=3.2,deprecated=4.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_2 __attribute__((availability(ios,introduced=3.2,deprecated=4.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_3 __attribute__((availability(ios,introduced=3.2,deprecated=4.3)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_0 __attribute__((availability(ios,introduced=3.2,deprecated=5.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_1 __attribute__((availability(ios,introduced=3.2,deprecated=5.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_NA __attribute__((availability(ios,introduced=3.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_0 __attribute__((availability(ios,introduced=4.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_0 __attribute__((availability(ios,introduced=4.0,deprecated=4.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_1 __attribute__((availability(ios,introduced=4.0,deprecated=4.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_2 __attribute__((availability(ios,introduced=4.0,deprecated=4.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_3 __attribute__((availability(ios,introduced=4.0,deprecated=4.3)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_0 __attribute__((availability(ios,introduced=4.0,deprecated=5.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_1 __attribute__((availability(ios,introduced=4.0,deprecated=5.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_NA __attribute__((availability(ios,introduced=4.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_1 __attribute__((availability(ios,introduced=4.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_1 __attribute__((availability(ios,introduced=4.1,deprecated=4.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_2 __attribute__((availability(ios,introduced=4.1,deprecated=4.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_3 __attribute__((availability(ios,introduced=4.1,deprecated=4.3)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_0 __attribute__((availability(ios,introduced=4.1,deprecated=5.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_1 __attribute__((availability(ios,introduced=4.1,deprecated=5.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_NA __attribute__((availability(ios,introduced=4.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_2 __attribute__((availability(ios,introduced=4.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_2 __attribute__((availability(ios,introduced=4.2,deprecated=4.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_3 __attribute__((availability(ios,introduced=4.2,deprecated=4.3)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_0 __attribute__((availability(ios,introduced=4.2,deprecated=5.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_1 __attribute__((availability(ios,introduced=4.2,deprecated=5.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_NA __attribute__((availability(ios,introduced=4.2)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_3 __attribute__((availability(ios,introduced=4.3)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_4_3 __attribute__((availability(ios,introduced=4.3,deprecated=4.3)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_0 __attribute__((availability(ios,introduced=4.3,deprecated=5.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_1 __attribute__((availability(ios,introduced=4.3,deprecated=5.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_NA __attribute__((availability(ios,introduced=4.3)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_5_0 __attribute__((availability(ios,introduced=5.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_0 __attribute__((availability(ios,introduced=5.0,deprecated=5.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_1 __attribute__((availability(ios,introduced=5.0,deprecated=5.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_NA __attribute__((availability(ios,introduced=5.0)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_5_1 __attribute__((availability(ios,introduced=5.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_5_1 __attribute__((availability(ios,introduced=5.1,deprecated=5.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_NA __attribute__((availability(ios,introduced=5.1)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_NA __attribute__((availability(ios,unavailable)))
+            #define __AVAILABILITY_INTERNAL__IPHONE_NA_DEP__IPHONE_NA __attribute__((availability(ios,unavailable)))
+        #endif
     #endif
-    #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_2_0
-    #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_0 __AVAILABILITY_INTERNAL_DEPRECATED
-    /* set up internal macros (up to 2.1) */
-    #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_2_1
-        #define __AVAILABILITY_INTERNAL__IPHONE_2_1 __AVAILABILITY_INTERNAL_UNAVAILABLE
-    #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1
-        #define __AVAILABILITY_INTERNAL__IPHONE_2_1 __AVAILABILITY_INTERNAL_WEAK_IMPORT
-    #else
-        #define
__AVAILABILITY_INTERNAL__IPHONE_2_1 __AVAILABILITY_INTERNAL_REGULAR - #endif - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_2_1 - #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 - #else - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL_DEPRECATED - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL_DEPRECATED - #endif - /* set up internal macros (up to 2.2) */ - #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_2_2 - #define __AVAILABILITY_INTERNAL__IPHONE_2_2 __AVAILABILITY_INTERNAL_UNAVAILABLE - #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 - #define __AVAILABILITY_INTERNAL__IPHONE_2_2 __AVAILABILITY_INTERNAL_WEAK_IMPORT - #else - #define __AVAILABILITY_INTERNAL__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR - #endif - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_2_2 - #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 - #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 - #else - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_DEPRECATED - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_DEPRECATED - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_DEPRECATED - #endif - /* set up internal macros (up to 3.0) */ - #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_3_0 - #define __AVAILABILITY_INTERNAL__IPHONE_3_0 __AVAILABILITY_INTERNAL_UNAVAILABLE - #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 - #define __AVAILABILITY_INTERNAL__IPHONE_3_0 __AVAILABILITY_INTERNAL_WEAK_IMPORT - #else - #define __AVAILABILITY_INTERNAL__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR - #endif - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_3_0 - #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 - #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 - #elif 
__IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 - #else - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED - #endif - /* set up internal macros (up to 3.1) */ - #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_3_1 - #define __AVAILABILITY_INTERNAL__IPHONE_3_1 __AVAILABILITY_INTERNAL_UNAVAILABLE - #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 - #define __AVAILABILITY_INTERNAL__IPHONE_3_1 __AVAILABILITY_INTERNAL_WEAK_IMPORT - #else - #define __AVAILABILITY_INTERNAL__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR - #endif - #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_3_1 - #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 - #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 - #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 - #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 - #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 - #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 - #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 - #else - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 
__AVAILABILITY_INTERNAL_DEPRECATED - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED - #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED - #endif - /* set up internal macros (up to 3.2) */ - #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_3_2 - #define __AVAILABILITY_INTERNAL__IPHONE_3_2 __AVAILABILITY_INTERNAL_UNAVAILABLE - #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 - #define __AVAILABILITY_INTERNAL__IPHONE_3_2 __AVAILABILITY_INTERNAL_WEAK_IMPORT - #else - #define __AVAILABILITY_INTERNAL__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #endif - #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_3_2 - #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 - #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 - #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 - #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 - #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 - #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 - #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 - #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 - #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 - #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 - #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 - #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 - #define 
__AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR - #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 - #else - #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED - #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED - #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED - #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED - #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED - #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + + #ifndef __AVAILABILITY_INTERNAL__IPHONE_2_0 + /* set up old style internal macros (up to 2.0) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_2_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_2_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_0 __AVAILABILITY_INTERNAL_DEPRECATED + /* set up old style internal macros (up to 2.1) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_1 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_1 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_1 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_2_1 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up old style internal macros (up to 2.2) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_2_2 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + 
#define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up old style internal macros (up to 3.0) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_3_0 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up old style internal macros (up to 3.1) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_3_1 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < 
__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up old style internal macros (up to 3.2) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_3_2 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 
__AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up old style internal macros (up to 4.0) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif 
__IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_4_0 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_0 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_0 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up old style internal macros (up to 4.1) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_4_1 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_1 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_1 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up old style internal macros (up to 4.2) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_4_2 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_2 __AVAILABILITY_INTERNAL_DEPRECATED + #endif 
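[Editor's aside, not part of the patch: the same min/max cascade repeats once more below for 4.3. For a concrete picture of how these tables are consumed, consider the minimal sketch that follows. The function name my_func is hypothetical, and it assumes an iOS build (__IPHONE_OS_VERSION_MIN_REQUIRED defined), so the wrapper in Availability.h pastes its iOS arguments onto __AVAILABILITY_INTERNAL.

    /* Illustrative sketch only; my_func is a hypothetical declaration. */
    #include <Availability.h>

    /* Targeting iOS, Availability.h expands the wrapper by token pasting:
     *   __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_8,
     *                                  __IPHONE_2_0, __IPHONE_4_0)
     *     -> __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_0
     * On a compiler where __has_attribute(availability) is true, the table
     * added earlier in this patch defines that name as
     *   __attribute__((availability(ios,introduced=2.0,deprecated=4.0)))
     * so the compiler itself can warn at call sites.
     */
    extern int my_func(void)
        __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_8,
                                       __IPHONE_2_0, __IPHONE_4_0);

Without the attribute, the same name instead resolves through the old style grid being rebuilt here, degrading to __AVAILABILITY_INTERNAL_REGULAR, _WEAK_IMPORT, _DEPRECATED, or _UNAVAILABLE depending on how __IPHONE_OS_VERSION_MIN_REQUIRED and __IPHONE_OS_VERSION_MAX_ALLOWED bracket the introduced and deprecated versions.]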
+ /* set up old style internal macros (up to 4.3) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_4_3 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_3 
__AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_4_3 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_4_3 
__AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up old style internal macros (up to 5.0) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_5_0 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + 
#define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_5_0 + 
#define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL__IPHONE_5_0 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_0 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up old style internal macros (up to 5.1) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_5_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_5_1 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_1 
__AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_4_3 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_5_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR + #define 
__AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_5_0
+ #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_5_0
+ #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_5_1
+ #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_REGULAR
+ #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL__IPHONE_5_1
+ #else
+ #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__IPHONE_4_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__IPHONE_4_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__IPHONE_4_2_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__IPHONE_4_3_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__IPHONE_5_0_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__IPHONE_5_1_DEP__IPHONE_5_1 __AVAILABILITY_INTERNAL_DEPRECATED
+ #endif
+ /* set up internal macros (n/a) */
+ #define __AVAILABILITY_INTERNAL__IPHONE_NA __AVAILABILITY_INTERNAL_UNAVAILABLE
+ #define __AVAILABILITY_INTERNAL__IPHONE_NA_DEP__IPHONE_NA __AVAILABILITY_INTERNAL_UNAVAILABLE
 #endif
- /* set up internal macros (n/a) */
- #define __AVAILABILITY_INTERNAL__IPHONE_NA __AVAILABILITY_INTERNAL_UNAVAILABLE
- #define __AVAILABILITY_INTERNAL__IPHONE_NA_DEP__IPHONE_NA __AVAILABILITY_INTERNAL_UNAVAILABLE
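Before the Mac OS X branch, a minimal sketch (hypothetical, not part of the patch) of what the __IPHONE_NA definitions just added are for: Availability.h uses them to mark Mac-only API unavailable when the iPhone branch is taken.

    /* hypothetical declaration, not from the patch */
    #include <Availability.h>
    void mac_only_api(void) __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_NA);
    /* on iOS this expands through __AVAILABILITY_INTERNAL__IPHONE_NA, i.e.
     * __AVAILABILITY_INTERNAL_UNAVAILABLE, so any use of mac_only_api() is an
     * error; on Mac OS X it expands through __AVAILABILITY_INTERNAL__MAC_10_6. */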
 #elif defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__)
 /* compiler for Mac OS X sets __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ */
 #define __MAC_OS_X_VERSION_MIN_REQUIRED __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__
 /* make sure a default max version is set */
 #ifndef __MAC_OS_X_VERSION_MAX_ALLOWED
- #define __MAC_OS_X_VERSION_MAX_ALLOWED __MAC_10_7
- #endif
- /* set up internal macros */
- #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_7
- #define __AVAILABILITY_INTERNAL__MAC_10_7 __AVAILABILITY_INTERNAL_UNAVAILABLE
- #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_7
- #define __AVAILABILITY_INTERNAL__MAC_10_7 __AVAILABILITY_INTERNAL_WEAK_IMPORT
- #else
- #define __AVAILABILITY_INTERNAL__MAC_10_7 __AVAILABILITY_INTERNAL_REGULAR
- #endif
- #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_6
- #define __AVAILABILITY_INTERNAL__MAC_10_6 __AVAILABILITY_INTERNAL_UNAVAILABLE
- #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_6
- #define __AVAILABILITY_INTERNAL__MAC_10_6 __AVAILABILITY_INTERNAL_WEAK_IMPORT
- #else
- #define __AVAILABILITY_INTERNAL__MAC_10_6 __AVAILABILITY_INTERNAL_REGULAR
- #endif
- #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_5
- #define __AVAILABILITY_INTERNAL__MAC_10_5 __AVAILABILITY_INTERNAL_UNAVAILABLE
- #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_5
- #define __AVAILABILITY_INTERNAL__MAC_10_5 __AVAILABILITY_INTERNAL_WEAK_IMPORT
- #else
- #define __AVAILABILITY_INTERNAL__MAC_10_5 __AVAILABILITY_INTERNAL_REGULAR
+ #define __MAC_OS_X_VERSION_MAX_ALLOWED __MAC_10_8
 #endif
- #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_4
- #define __AVAILABILITY_INTERNAL__MAC_10_4 __AVAILABILITY_INTERNAL_UNAVAILABLE
- #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_4
- #define __AVAILABILITY_INTERNAL__MAC_10_4 __AVAILABILITY_INTERNAL_WEAK_IMPORT
- #else
- #define __AVAILABILITY_INTERNAL__MAC_10_4 __AVAILABILITY_INTERNAL_REGULAR
- #endif
- #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_3
- #define __AVAILABILITY_INTERNAL__MAC_10_3 __AVAILABILITY_INTERNAL_UNAVAILABLE
- #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_3
- #define __AVAILABILITY_INTERNAL__MAC_10_3 __AVAILABILITY_INTERNAL_WEAK_IMPORT
- #else
- #define __AVAILABILITY_INTERNAL__MAC_10_3 __AVAILABILITY_INTERNAL_REGULAR
- #endif
- #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_2
- #define __AVAILABILITY_INTERNAL__MAC_10_2 __AVAILABILITY_INTERNAL_UNAVAILABLE
- #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_2
- #define __AVAILABILITY_INTERNAL__MAC_10_2 __AVAILABILITY_INTERNAL_WEAK_IMPORT
- #else
- #define __AVAILABILITY_INTERNAL__MAC_10_2 __AVAILABILITY_INTERNAL_REGULAR
- #endif
- #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_1
- #define __AVAILABILITY_INTERNAL__MAC_10_1 __AVAILABILITY_INTERNAL_UNAVAILABLE
- #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_1
- #define __AVAILABILITY_INTERNAL__MAC_10_1 __AVAILABILITY_INTERNAL_WEAK_IMPORT
- #else
- #define __AVAILABILITY_INTERNAL__MAC_10_1 __AVAILABILITY_INTERNAL_REGULAR
- #endif
- #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_0
- #define __AVAILABILITY_INTERNAL__MAC_10_0 __AVAILABILITY_INTERNAL_UNAVAILABLE
- #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_0
- #define __AVAILABILITY_INTERNAL__MAC_10_0 __AVAILABILITY_INTERNAL_WEAK_IMPORT
- #else
- #define __AVAILABILITY_INTERNAL__MAC_10_0 __AVAILABILITY_INTERNAL_REGULAR
- #endif
- #define __AVAILABILITY_INTERNAL__MAC_NA __AVAILABILITY_INTERNAL_UNAVAILABLE
- #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_1
- #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_1 __AVAILABILITY_INTERNAL_DEPRECATED
- #else
- #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_1 __AVAILABILITY_INTERNAL__MAC_10_0
- #endif
- #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_2
- #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_2 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_2 __AVAILABILITY_INTERNAL_DEPRECATED
- #else
- #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_2 __AVAILABILITY_INTERNAL__MAC_10_0
- #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_2 __AVAILABILITY_INTERNAL__MAC_10_1
- #endif
- #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_3
- #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_3 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_3 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_3 __AVAILABILITY_INTERNAL_DEPRECATED
- #else
- #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_3 __AVAILABILITY_INTERNAL__MAC_10_0
- #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_3 __AVAILABILITY_INTERNAL__MAC_10_1
- #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_3 __AVAILABILITY_INTERNAL__MAC_10_2
- #endif
- #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_4
- #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED
- #else
- #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_0
- #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_1
- #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_2
- #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_3
- #endif
- #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_5
- #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED
- #else
- #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_0
- #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_1
- #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_2
- #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_3
- #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_4
- #endif
- #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_6
- #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED
- #else
- #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_0
- #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_1
- #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_2
- #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_3
- #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_4
- #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_5
+
+ #ifdef __has_attribute
+ #if __has_attribute(availability)
+ /* use better attributes if possible */
+ #define __AVAILABILITY_INTERNAL__MAC_10_0 __attribute__((availability(macosx,introduced=10.0)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_0 __attribute__((availability(macosx,introduced=10.0,deprecated=10.0)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_1 __attribute__((availability(macosx,introduced=10.0,deprecated=10.1)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_2 __attribute__((availability(macosx,introduced=10.0,deprecated=10.2)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_3 __attribute__((availability(macosx,introduced=10.0,deprecated=10.3)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_4 __attribute__((availability(macosx,introduced=10.0,deprecated=10.4)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_5 __attribute__((availability(macosx,introduced=10.0,deprecated=10.5)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_6 __attribute__((availability(macosx,introduced=10.0,deprecated=10.6)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_7 __attribute__((availability(macosx,introduced=10.0,deprecated=10.7)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_8 __attribute__((availability(macosx,introduced=10.0,deprecated=10.8)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.0)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_1 __attribute__((availability(macosx,introduced=10.1)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_1 __attribute__((availability(macosx,introduced=10.1,deprecated=10.1)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_2 __attribute__((availability(macosx,introduced=10.1,deprecated=10.2)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_3 __attribute__((availability(macosx,introduced=10.1,deprecated=10.3)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_4 __attribute__((availability(macosx,introduced=10.1,deprecated=10.4)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_5 __attribute__((availability(macosx,introduced=10.1,deprecated=10.5)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_6 __attribute__((availability(macosx,introduced=10.1,deprecated=10.6)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_7 __attribute__((availability(macosx,introduced=10.1,deprecated=10.7)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_8 __attribute__((availability(macosx,introduced=10.1,deprecated=10.8)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.1)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_2 __attribute__((availability(macosx,introduced=10.2)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_2 __attribute__((availability(macosx,introduced=10.2,deprecated=10.2)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_3 __attribute__((availability(macosx,introduced=10.2,deprecated=10.3)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_4 __attribute__((availability(macosx,introduced=10.2,deprecated=10.4)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_5 __attribute__((availability(macosx,introduced=10.2,deprecated=10.5)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_6 __attribute__((availability(macosx,introduced=10.2,deprecated=10.6)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_7 __attribute__((availability(macosx,introduced=10.2,deprecated=10.7)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_8 __attribute__((availability(macosx,introduced=10.2,deprecated=10.8)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.2)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_3 __attribute__((availability(macosx,introduced=10.3)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_3 __attribute__((availability(macosx,introduced=10.3,deprecated=10.3)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_4 __attribute__((availability(macosx,introduced=10.3,deprecated=10.4)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_5 __attribute__((availability(macosx,introduced=10.3,deprecated=10.5)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_6 __attribute__((availability(macosx,introduced=10.3,deprecated=10.6)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_7 __attribute__((availability(macosx,introduced=10.3,deprecated=10.7)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_8 __attribute__((availability(macosx,introduced=10.3,deprecated=10.8)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.3)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_4 __attribute__((availability(macosx,introduced=10.4)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_4 __attribute__((availability(macosx,introduced=10.4,deprecated=10.4)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_5 __attribute__((availability(macosx,introduced=10.4,deprecated=10.5)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_6 __attribute__((availability(macosx,introduced=10.4,deprecated=10.6)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_7 __attribute__((availability(macosx,introduced=10.4,deprecated=10.7)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_8 __attribute__((availability(macosx,introduced=10.4,deprecated=10.8)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.4)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_5 __attribute__((availability(macosx,introduced=10.5)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_5 __attribute__((availability(macosx,introduced=10.5,deprecated=10.5)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_6 __attribute__((availability(macosx,introduced=10.5,deprecated=10.6)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_7 __attribute__((availability(macosx,introduced=10.5,deprecated=10.7)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_8 __attribute__((availability(macosx,introduced=10.5,deprecated=10.8)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.5)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_6 __attribute__((availability(macosx,introduced=10.6)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_6 __attribute__((availability(macosx,introduced=10.6,deprecated=10.6)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_7 __attribute__((availability(macosx,introduced=10.6,deprecated=10.7)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_8 __attribute__((availability(macosx,introduced=10.6,deprecated=10.8)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.6)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_7 __attribute__((availability(macosx,introduced=10.7)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_10_7 __attribute__((availability(macosx,introduced=10.7,deprecated=10.7)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_10_8 __attribute__((availability(macosx,introduced=10.7,deprecated=10.8)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.7)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_8 __attribute__((availability(macosx,introduced=10.8)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_8_DEP__MAC_10_8 __attribute__((availability(macosx,introduced=10.8,deprecated=10.8)))
+ #define __AVAILABILITY_INTERNAL__MAC_10_8_DEP__MAC_NA __attribute__((availability(macosx,introduced=10.8)))
+ #define __AVAILABILITY_INTERNAL__MAC_NA __attribute__((availability(macosx,unavailable)))
+ #define __AVAILABILITY_INTERNAL__MAC_NA_DEP__MAC_NA __attribute__((availability(macosx,unavailable)))
+ #endif
 #endif
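The added block above prefers the compiler's availability attribute; the block that follows keeps the older weak-import/deprecated attributes for compilers without it. A minimal sketch of the same probe-and-fallback pattern (hypothetical macro name, not part of the patch):

    /* hypothetical header fragment, not from the patch */
    #ifdef __has_attribute
        #if __has_attribute(availability)
            #define MY_API_10_8 __attribute__((availability(macosx,introduced=10.8)))
        #endif
    #endif
    #ifndef MY_API_10_8
        #define MY_API_10_8 /* older compiler: no annotation */
    #endif
    extern int my_call(void) MY_API_10_8;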
- #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_7
- #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED
- #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED
- #else
- #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_0
- #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_1
- #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_2
- #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_3
- #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_4
- #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_5
- #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_6
+
+ #ifndef __AVAILABILITY_INTERNAL__MAC_10_0
+ /* use old style attributes */
+ #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_8
+ #define __AVAILABILITY_INTERNAL__MAC_10_8 __AVAILABILITY_INTERNAL_UNAVAILABLE
+ #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_8
+ #define __AVAILABILITY_INTERNAL__MAC_10_8 __AVAILABILITY_INTERNAL_WEAK_IMPORT
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_8 __AVAILABILITY_INTERNAL_REGULAR
+ #endif
+ #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_7
+ #define __AVAILABILITY_INTERNAL__MAC_10_7 __AVAILABILITY_INTERNAL_UNAVAILABLE
+ #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_7
+ #define __AVAILABILITY_INTERNAL__MAC_10_7 __AVAILABILITY_INTERNAL_WEAK_IMPORT
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_7 __AVAILABILITY_INTERNAL_REGULAR
+ #endif
+ #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_6
+ #define __AVAILABILITY_INTERNAL__MAC_10_6 __AVAILABILITY_INTERNAL_UNAVAILABLE
+ #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_6
+ #define __AVAILABILITY_INTERNAL__MAC_10_6 __AVAILABILITY_INTERNAL_WEAK_IMPORT
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_6 __AVAILABILITY_INTERNAL_REGULAR
+ #endif
+ #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_5
+ #define __AVAILABILITY_INTERNAL__MAC_10_5 __AVAILABILITY_INTERNAL_UNAVAILABLE
+ #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_5
+ #define __AVAILABILITY_INTERNAL__MAC_10_5 __AVAILABILITY_INTERNAL_WEAK_IMPORT
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_5 __AVAILABILITY_INTERNAL_REGULAR
+ #endif
+ #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_4
+ #define __AVAILABILITY_INTERNAL__MAC_10_4 __AVAILABILITY_INTERNAL_UNAVAILABLE
+ #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_4
+ #define __AVAILABILITY_INTERNAL__MAC_10_4 __AVAILABILITY_INTERNAL_WEAK_IMPORT
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_4 __AVAILABILITY_INTERNAL_REGULAR
+ #endif
+ #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_3
+ #define __AVAILABILITY_INTERNAL__MAC_10_3 __AVAILABILITY_INTERNAL_UNAVAILABLE
+ #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_3
+ #define __AVAILABILITY_INTERNAL__MAC_10_3 __AVAILABILITY_INTERNAL_WEAK_IMPORT
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_3 __AVAILABILITY_INTERNAL_REGULAR
+ #endif
+ #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_2
+ #define __AVAILABILITY_INTERNAL__MAC_10_2 __AVAILABILITY_INTERNAL_UNAVAILABLE
+ #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_2
+ #define __AVAILABILITY_INTERNAL__MAC_10_2 __AVAILABILITY_INTERNAL_WEAK_IMPORT
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_2 __AVAILABILITY_INTERNAL_REGULAR
+ #endif
+ #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_1
+ #define __AVAILABILITY_INTERNAL__MAC_10_1 __AVAILABILITY_INTERNAL_UNAVAILABLE
+ #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_1
+ #define __AVAILABILITY_INTERNAL__MAC_10_1 __AVAILABILITY_INTERNAL_WEAK_IMPORT
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_1 __AVAILABILITY_INTERNAL_REGULAR
+ #endif
+ #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_0
+ #define __AVAILABILITY_INTERNAL__MAC_10_0 __AVAILABILITY_INTERNAL_UNAVAILABLE
+ #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_0
+ #define __AVAILABILITY_INTERNAL__MAC_10_0 __AVAILABILITY_INTERNAL_WEAK_IMPORT
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_0 __AVAILABILITY_INTERNAL_REGULAR
+ #endif
+ #define __AVAILABILITY_INTERNAL__MAC_NA __AVAILABILITY_INTERNAL_UNAVAILABLE
+ #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_1
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_1 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_1 __AVAILABILITY_INTERNAL_DEPRECATED
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_1 __AVAILABILITY_INTERNAL__MAC_10_0
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_1 __AVAILABILITY_INTERNAL__MAC_10_1
+ #endif
+ #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_2
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_2 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_2 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_2 __AVAILABILITY_INTERNAL_DEPRECATED
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_2 __AVAILABILITY_INTERNAL__MAC_10_0
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_2 __AVAILABILITY_INTERNAL__MAC_10_1
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_2 __AVAILABILITY_INTERNAL__MAC_10_2
+ #endif
+ #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_3
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_3 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_3 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_3 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_3 __AVAILABILITY_INTERNAL_DEPRECATED
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_3 __AVAILABILITY_INTERNAL__MAC_10_0
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_3 __AVAILABILITY_INTERNAL__MAC_10_1
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_3 __AVAILABILITY_INTERNAL__MAC_10_2
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_3 __AVAILABILITY_INTERNAL__MAC_10_3
+ #endif
+ #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_4
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_0
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_1
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_2
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_3
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_4
+ #endif
+ #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_5
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_0
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_1
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_2
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_3
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_4
+ #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_5
+ #endif
+ #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_6
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_0
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_1
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_2
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_3
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_4
+ #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_5
+ #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_6
+ #endif
+ #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_7
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_0
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_1
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_2
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_3
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_4
+ #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_5
+ #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_6
+ #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_7
+ #endif
+ #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_8
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_8 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_8 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_8 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_8 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_8 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_8 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_8 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_10_8 __AVAILABILITY_INTERNAL_DEPRECATED
+ #define __AVAILABILITY_INTERNAL__MAC_10_8_DEP__MAC_10_8 __AVAILABILITY_INTERNAL_DEPRECATED
+ #else
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_8 __AVAILABILITY_INTERNAL__MAC_10_0
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_8 __AVAILABILITY_INTERNAL__MAC_10_1
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_8 __AVAILABILITY_INTERNAL__MAC_10_2
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_8 __AVAILABILITY_INTERNAL__MAC_10_3
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_8 __AVAILABILITY_INTERNAL__MAC_10_4
+ #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_8 __AVAILABILITY_INTERNAL__MAC_10_5
+ #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_8 __AVAILABILITY_INTERNAL__MAC_10_6
+ #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_10_8 __AVAILABILITY_INTERNAL__MAC_10_7
+ #define __AVAILABILITY_INTERNAL__MAC_10_8_DEP__MAC_10_8 __AVAILABILITY_INTERNAL__MAC_10_8
+ #endif
+ #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_0
+ #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_1
+ #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_2
+ #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_3
+ #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_4
+ #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_5
+ #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_6
+ #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_7
+ #define __AVAILABILITY_INTERNAL__MAC_10_8_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_8
+ #define __AVAILABILITY_INTERNAL__MAC_NA_DEP__MAC_NA __AVAILABILITY_INTERNAL_UNAVAILABLE
 #endif
- #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_0
- #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_1
- #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_2
- #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_3
- #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_4
- #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_5
- #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_6
- #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_7
- #define __AVAILABILITY_INTERNAL__MAC_NA_DEP__MAC_NA __AVAILABILITY_INTERNAL_UNAVAILABLE
 #endif
 #endif /* __AVAILABILITY_INTERNAL__ */
diff --git a/EXTERNAL_HEADERS/AvailabilityMacros.h b/EXTERNAL_HEADERS/AvailabilityMacros.h
index 02981bd13..25587d849 100644
--- a/EXTERNAL_HEADERS/AvailabilityMacros.h
+++ b/EXTERNAL_HEADERS/AvailabilityMacros.h
@@ -97,7 +97,7 @@
 #define MAC_OS_X_VERSION_10_5 1050
 #define MAC_OS_X_VERSION_10_6 1060
 #define MAC_OS_X_VERSION_10_7 1070
-
+#define MAC_OS_X_VERSION_10_8 1080
 /*
  * If min OS not specified, assume 10.1 for ppc and 10.4 for all others
@@ -121,13 +121,13 @@
 #endif
 /*
- * if max OS not specified, assume largerof(10.6, min)
+ * if max OS not specified, assume larger of (10.8, min)
 */
 #ifndef MAC_OS_X_VERSION_MAX_ALLOWED
- #if MAC_OS_X_VERSION_MIN_REQUIRED > MAC_OS_X_VERSION_10_7
+ #if MAC_OS_X_VERSION_MIN_REQUIRED > MAC_OS_X_VERSION_10_8
 #define MAC_OS_X_VERSION_MAX_ALLOWED MAC_OS_X_VERSION_MIN_REQUIRED
 #else
- #define MAC_OS_X_VERSION_MAX_ALLOWED MAC_OS_X_VERSION_10_7
+ #define MAC_OS_X_VERSION_MAX_ALLOWED MAC_OS_X_VERSION_10_8
 #endif
 #endif
@@ -171,7 +171,6 @@
 #endif
-
 /*
  * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER
 *
@@ -194,11 +193,6 @@
 */
 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_0_AND_LATER DEPRECATED_ATTRIBUTE
-
-
-
-
-
 /*
 * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER
* @@ -214,8 +208,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED - * - * Used on declarations introduced in Mac OS X 10.1, + * + * Used on declarations introduced in Mac OS X 10.1, * and deprecated in Mac OS X 10.1 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_1 @@ -226,8 +220,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_1 - * - * Used on declarations introduced in Mac OS X 10.0, + * + * Used on declarations introduced in Mac OS X 10.0, * but later deprecated in Mac OS X 10.1 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_1 @@ -238,8 +232,8 @@ /* * DEPRECATED_IN_MAC_OS_X_VERSION_10_1_AND_LATER - * - * Used on types deprecated in Mac OS X 10.1 + * + * Used on types deprecated in Mac OS X 10.1 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_1 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_1_AND_LATER DEPRECATED_ATTRIBUTE @@ -248,11 +242,6 @@ #endif - - - - - /* * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER * @@ -268,8 +257,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED - * - * Used on declarations introduced in Mac OS X 10.2, + * + * Used on declarations introduced in Mac OS X 10.2, * and deprecated in Mac OS X 10.2 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_2 @@ -280,8 +269,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 - * - * Used on declarations introduced in Mac OS X 10.0, + * + * Used on declarations introduced in Mac OS X 10.0, * but later deprecated in Mac OS X 10.2 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_2 @@ -292,8 +281,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 - * - * Used on declarations introduced in Mac OS X 10.1, + * + * Used on declarations introduced in Mac OS X 10.1, * but later deprecated in Mac OS X 10.2 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_2 @@ -304,8 +293,8 @@ /* * DEPRECATED_IN_MAC_OS_X_VERSION_10_2_AND_LATER - * - * Used on types deprecated in Mac OS X 10.2 + * + * Used on types deprecated in Mac OS X 10.2 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_2 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_2_AND_LATER DEPRECATED_ATTRIBUTE @@ -314,9 +303,6 @@ #endif - - - /* * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER * @@ -332,8 +318,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED - * - * Used on declarations introduced in Mac OS X 10.3, + * + * Used on declarations introduced in Mac OS X 10.3, * and deprecated in Mac OS X 10.3 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 @@ -344,8 +330,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 - * - * Used on declarations introduced in Mac OS X 10.0, + * + * Used on declarations introduced in Mac OS X 10.0, * but later deprecated in Mac OS X 10.3 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 @@ -356,8 +342,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 - * - * Used on declarations introduced in Mac OS X 10.1, + * + * Used on declarations introduced in Mac OS X 10.1, * but later deprecated in Mac OS X 10.3 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 @@ -368,8 +354,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 - * - * Used on declarations introduced in Mac OS X 10.2, + * + * Used on declarations introduced in Mac OS X 10.2, * but later deprecated in Mac OS X 
10.3 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 @@ -380,8 +366,8 @@ /* * DEPRECATED_IN_MAC_OS_X_VERSION_10_3_AND_LATER - * - * Used on types deprecated in Mac OS X 10.3 + * + * Used on types deprecated in Mac OS X 10.3 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_3_AND_LATER DEPRECATED_ATTRIBUTE @@ -390,10 +376,6 @@ #endif - - - - /* * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER * @@ -409,8 +391,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED - * - * Used on declarations introduced in Mac OS X 10.4, + * + * Used on declarations introduced in Mac OS X 10.4, * and deprecated in Mac OS X 10.4 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 @@ -421,8 +403,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 - * - * Used on declarations introduced in Mac OS X 10.0, + * + * Used on declarations introduced in Mac OS X 10.0, * but later deprecated in Mac OS X 10.4 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 @@ -433,8 +415,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 - * - * Used on declarations introduced in Mac OS X 10.1, + * + * Used on declarations introduced in Mac OS X 10.1, * but later deprecated in Mac OS X 10.4 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 @@ -445,8 +427,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 - * - * Used on declarations introduced in Mac OS X 10.2, + * + * Used on declarations introduced in Mac OS X 10.2, * but later deprecated in Mac OS X 10.4 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 @@ -457,8 +439,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 - * - * Used on declarations introduced in Mac OS X 10.3, + * + * Used on declarations introduced in Mac OS X 10.3, * but later deprecated in Mac OS X 10.4 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 @@ -469,8 +451,8 @@ /* * DEPRECATED_IN_MAC_OS_X_VERSION_10_4_AND_LATER - * - * Used on types deprecated in Mac OS X 10.4 + * + * Used on types deprecated in Mac OS X 10.4 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_4_AND_LATER DEPRECATED_ATTRIBUTE @@ -479,9 +461,6 @@ #endif - - - /* * AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER * @@ -497,8 +476,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED - * - * Used on declarations introduced in Mac OS X 10.5, + * + * Used on declarations introduced in Mac OS X 10.5, * and deprecated in Mac OS X 10.5 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 @@ -509,8 +488,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 - * - * Used on declarations introduced in Mac OS X 10.0, + * + * Used on declarations introduced in Mac OS X 10.0, * but later deprecated in Mac OS X 10.5 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 @@ -521,8 +500,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 - * - * Used on declarations introduced in Mac OS X 10.1, + * + * Used on declarations introduced in Mac OS X 10.1, * but later deprecated in Mac OS X 10.5 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 @@ -533,8 +512,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 - * - * Used on declarations introduced in Mac OS X 10.2, + 
* + * Used on declarations introduced in Mac OS X 10.2, * but later deprecated in Mac OS X 10.5 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 @@ -545,8 +524,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 - * - * Used on declarations introduced in Mac OS X 10.3, + * + * Used on declarations introduced in Mac OS X 10.3, * but later deprecated in Mac OS X 10.5 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 @@ -557,8 +536,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 - * - * Used on declarations introduced in Mac OS X 10.4, + * + * Used on declarations introduced in Mac OS X 10.4, * but later deprecated in Mac OS X 10.5 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 @@ -569,8 +548,8 @@ /* * DEPRECATED_IN_MAC_OS_X_VERSION_10_5_AND_LATER - * - * Used on types deprecated in Mac OS X 10.5 + * + * Used on types deprecated in Mac OS X 10.5 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_5_AND_LATER DEPRECATED_ATTRIBUTE @@ -579,10 +558,6 @@ #endif - - - - /* * AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER * @@ -598,8 +573,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED - * - * Used on declarations introduced in Mac OS X 10.6, + * + * Used on declarations introduced in Mac OS X 10.6, * and deprecated in Mac OS X 10.6 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 @@ -610,8 +585,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 - * - * Used on declarations introduced in Mac OS X 10.0, + * + * Used on declarations introduced in Mac OS X 10.0, * but later deprecated in Mac OS X 10.6 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 @@ -622,8 +597,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 - * - * Used on declarations introduced in Mac OS X 10.1, + * + * Used on declarations introduced in Mac OS X 10.1, * but later deprecated in Mac OS X 10.6 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 @@ -634,8 +609,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 - * - * Used on declarations introduced in Mac OS X 10.2, + * + * Used on declarations introduced in Mac OS X 10.2, * but later deprecated in Mac OS X 10.6 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 @@ -646,8 +621,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 - * - * Used on declarations introduced in Mac OS X 10.3, + * + * Used on declarations introduced in Mac OS X 10.3, * but later deprecated in Mac OS X 10.6 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 @@ -658,8 +633,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 - * - * Used on declarations introduced in Mac OS X 10.4, + * + * Used on declarations introduced in Mac OS X 10.4, * but later deprecated in Mac OS X 10.6 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 @@ -670,8 +645,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 - * - * Used on declarations introduced in Mac OS X 10.5, + * + * Used on declarations introduced in Mac OS X 10.5, * but later deprecated in Mac OS X 10.6 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 @@ -682,8 +657,8 @@ /* * DEPRECATED_IN_MAC_OS_X_VERSION_10_6_AND_LATER - * - * Used on types deprecated in 
Mac OS X 10.6 + * + * Used on types deprecated in Mac OS X 10.6 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_6_AND_LATER DEPRECATED_ATTRIBUTE @@ -692,9 +667,6 @@ #endif - - - /* * AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER * @@ -710,8 +682,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED - * - * Used on declarations introduced in Mac OS X 10.7, + * + * Used on declarations introduced in Mac OS X 10.7, * and deprecated in Mac OS X 10.7 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 @@ -722,8 +694,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 - * - * Used on declarations introduced in Mac OS X 10.0, + * + * Used on declarations introduced in Mac OS X 10.0, * but later deprecated in Mac OS X 10.7 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 @@ -734,8 +706,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 - * - * Used on declarations introduced in Mac OS X 10.1, + * + * Used on declarations introduced in Mac OS X 10.1, * but later deprecated in Mac OS X 10.7 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 @@ -746,8 +718,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 - * - * Used on declarations introduced in Mac OS X 10.2, + * + * Used on declarations introduced in Mac OS X 10.2, * but later deprecated in Mac OS X 10.7 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 @@ -758,8 +730,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 - * - * Used on declarations introduced in Mac OS X 10.3, + * + * Used on declarations introduced in Mac OS X 10.3, * but later deprecated in Mac OS X 10.7 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 @@ -770,8 +742,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 - * - * Used on declarations introduced in Mac OS X 10.4, + * + * Used on declarations introduced in Mac OS X 10.4, * but later deprecated in Mac OS X 10.7 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 @@ -782,8 +754,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 - * - * Used on declarations introduced in Mac OS X 10.5, + * + * Used on declarations introduced in Mac OS X 10.5, * but later deprecated in Mac OS X 10.7 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 @@ -794,8 +766,8 @@ /* * AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 - * - * Used on declarations introduced in Mac OS X 10.6, + * + * Used on declarations introduced in Mac OS X 10.6, * but later deprecated in Mac OS X 10.7 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 @@ -806,8 +778,8 @@ /* * DEPRECATED_IN_MAC_OS_X_VERSION_10_7_AND_LATER - * - * Used on types deprecated in Mac OS X 10.7 + * + * Used on types deprecated in Mac OS X 10.7 */ #if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 #define DEPRECATED_IN_MAC_OS_X_VERSION_10_7_AND_LATER DEPRECATED_ATTRIBUTE @@ -815,6 +787,142 @@ #define DEPRECATED_IN_MAC_OS_X_VERSION_10_7_AND_LATER #endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + * + * Used on declarations introduced in Mac OS X 10.8 + */ +#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_8 + #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER UNAVAILABLE_ATTRIBUTE +#elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_8 + 
#define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.8, + * and deprecated in Mac OS X 10.8 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 + #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER_BUT_DEPRECATED AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + * + * Used on declarations introduced in Mac OS X 10.0, + * but later deprecated in Mac OS X 10.8 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + * + * Used on declarations introduced in Mac OS X 10.1, + * but later deprecated in Mac OS X 10.8 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + * + * Used on declarations introduced in Mac OS X 10.2, + * but later deprecated in Mac OS X 10.8 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + * + * Used on declarations introduced in Mac OS X 10.3, + * but later deprecated in Mac OS X 10.8 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + * + * Used on declarations introduced in Mac OS X 10.4, + * but later deprecated in Mac OS X 10.8 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + * + * Used on declarations introduced in Mac OS X 10.5, + * but later deprecated in Mac OS X 10.8 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE +#else + 
#define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + * + * Used on declarations introduced in Mac OS X 10.6, + * but later deprecated in Mac OS X 10.8 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + * + * Used on declarations introduced in Mac OS X 10.7, + * but later deprecated in Mac OS X 10.8 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER +#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_8_AND_LATER + * + * Used on types deprecated in Mac OS X 10.8 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_8 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_8_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_8_AND_LATER +#endif + + + + #endif /* __AVAILABILITYMACROS__ */ diff --git a/EXTERNAL_HEADERS/Makefile b/EXTERNAL_HEADERS/Makefile index 46ee40f90..61680a394 100644 --- a/EXTERNAL_HEADERS/Makefile +++ b/EXTERNAL_HEADERS/Makefile @@ -22,7 +22,6 @@ INSTINC_SUBDIRS_ARM = \ EXPORT_FILES = \ - AppleSecureBootEpoch.h \ Availability.h \ AvailabilityInternal.h \ AvailabilityMacros.h \ diff --git a/EXTERNAL_HEADERS/corecrypto/cc.h b/EXTERNAL_HEADERS/corecrypto/cc.h new file mode 100644 index 000000000..ecf053182 --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/cc.h @@ -0,0 +1,69 @@ +/* + * cc.h + * corecrypto + * + * Created by Michael Brouwer on 12/16/10. + * Copyright 2010,2011 Apple Inc. All rights reserved. + * + */ + +#ifndef _CORECRYPTO_CC_H_ +#define _CORECRYPTO_CC_H_ + +#include +#include +#include + +#if KERNEL +#include +#else +#include +#endif + +/* Declare a struct element with a guaranteed alignment of _alignment_. The resulting struct can be used to create arrays that are aligned by a certain amount. 
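As a hedged sketch of how the 10.8 availability macros defined above are consumed, consider two hypothetical SDK declarations (the function names are illustrative, not from this patch):

    /* Hypothetical declarations in an SDK header: */
    extern int new_api(void) AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER;
    extern int old_api(void) AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8;

    /* Building with MAC_OS_X_VERSION_MIN_REQUIRED = 1060 and the default
     * MAC_OS_X_VERSION_MAX_ALLOWED = 1080:
     *   new_api -> WEAK_IMPORT_ATTRIBUTE (the symbol is NULL when run on 10.6/10.7)
     *   old_api -> plain declaration (not yet deprecated for a 10.6 deployment target)
     * Building with MIN_REQUIRED = 1080:
     *   old_api -> DEPRECATED_ATTRIBUTE (any use draws a compiler warning)
     */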
*/ +#define cc_aligned_struct(_alignment_) \ + typedef struct { \ + uint8_t b[_alignment_]; \ + } __attribute__((aligned(_alignment_))) + +/* number of array elements used in a cc_ctx_decl */ +#define cc_ctx_n(_type_, _size_) ((_size_ + sizeof(_type_) - 1) / sizeof(_type_)) + +/* sizeof of a context declared with cc_ctx_decl */ +#define cc_ctx_sizeof(_type_, _size_) sizeof(_type_[cc_ctx_n(_type_, _size_)]) + +#define cc_ctx_decl(_type_, _size_, _name_) \ + _type_ _name_[cc_ctx_n(_type_, _size_)] + +#define cc_zero(_size_,_data_) bzero((_data_), (_size_)) + +#define cc_copy(_size_, _dst_, _src_) memcpy(_dst_, _src_, _size_) + +#define cc_ctx_clear(_type_, _size_, _name_) \ + cc_zero((_size_ + sizeof(_type_) - 1) / sizeof(_type_), _name_) + +CC_INLINE CC_NONNULL2 CC_NONNULL3 CC_NONNULL4 +void cc_xor(size_t size, void *r, const void *s, const void *t) { + uint8_t *_r=(uint8_t *)r; + const uint8_t *_s=(uint8_t *)s; + const uint8_t *_t=(uint8_t *)t; + while (size--) { + _r[size] = _s[size] ^ _t[size]; + } +} + +/* Exchange S and T of any type. NOTE: Both S and T are evaluated + multiple times and MUST NOT be expressions. */ +#define CC_SWAP(S,T) do { \ + __typeof__(S) _cc_swap_tmp = S; S = T; T = _cc_swap_tmp; \ +} while(0) + +/* Return the maximum value between S and T. */ +#define CC_MAX(S, T) ({__typeof__(S) _cc_max_s = S; __typeof__(T) _cc_max_t = T; _cc_max_s > _cc_max_t ? _cc_max_s : _cc_max_t;}) + +/* Return the minimum value between S and T. */ +#define CC_MIN(S, T) ({__typeof__(S) _cc_min_s = S; __typeof__(T) _cc_min_t = T; _cc_min_s <= _cc_min_t ? _cc_min_s : _cc_min_t;}) + +#endif /* _CORECRYPTO_CC_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/cc_config.h b/EXTERNAL_HEADERS/corecrypto/cc_config.h new file mode 100644 index 000000000..7b0f2ed78 --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/cc_config.h @@ -0,0 +1,131 @@ +/* + * cc_config.h + * corecrypto + * + * Created by Michael Brouwer on 10/18/10. + * Copyright 2010,2011 Apple Inc. All rights reserved. + * + */ +#ifndef _CORECRYPTO_CC_CONFIG_H_ +#define _CORECRYPTO_CC_CONFIG_H_ + +#if !defined(CCN_UNIT_SIZE) +#if defined(__x86_64__) +#define CCN_UNIT_SIZE 8 +#elif defined(__arm__) || defined(__i386__) +#define CCN_UNIT_SIZE 4 +#else +#define CCN_UNIT_SIZE 2 +#endif +#endif /* !defined(CCN_UNIT_SIZE) */ + +/* No dynamic linking allowed in L4, e.g. avoid nonlazy symbols */ +/* For corecrypto kext, CC_STATIC should be 0 */ + +#if defined(__x86_64__) || defined(__i386__) + +/* These assembly routines only work for a single CCN_UNIT_SIZE. 
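A minimal usage sketch for the cc.h context helpers above; the 16-byte alignment and the 128-byte context size are illustrative assumptions, not values taken from this patch:

    #include <stddef.h>
    #include <corecrypto/cc.h>

    cc_aligned_struct(16) cc16;            /* typedefs a 16-byte, 16-byte-aligned unit */

    static void cc_helpers_demo(void)
    {
        cc_ctx_decl(cc16, 128, ctx);       /* expands to: cc16 ctx[8]; 128 aligned bytes */
        cc_zero(sizeof(ctx), ctx);         /* scrub the whole context before use */

        int a = 1, b = 2;
        CC_SWAP(a, b);                     /* plain lvalues only: S and T evaluate twice */
        size_t lo = CC_MIN(sizeof(ctx), (size_t)64);  /* statement expression, single evaluation */
        (void)lo;
    }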
*/ +#if (defined(__x86_64__) && CCN_UNIT_SIZE == 8) || (defined(__i386__) && CCN_UNIT_SIZE == 4) +#define CCN_ADD_ASM 1 +#define CCN_SUB_ASM 1 +#define CCN_MUL_ASM 1 +#else +#define CCN_ADD_ASM 0 +#define CCN_SUB_ASM 0 +#define CCN_MUL_ASM 0 +#endif + +#define CCN_ADDMUL1_ASM 0 +#define CCN_MUL1_ASM 0 +#define CCN_CMP_ASM 0 +#define CCN_ADD1_ASM 0 +#define CCN_SUB1_ASM 0 +#define CCN_N_ASM 0 +#define CCN_SET_ASM 0 +#define CCAES_ARM 0 +#define CCAES_INTEL 1 +#define CCN_USE_BUILTIN_CLZ 0 +#define CCSHA1_VNG_INTEL 1 +#define CCSHA2_VNG_INTEL 1 +#define CCSHA1_VNG_ARMV7NEON 0 +#define CCSHA2_VNG_ARMV7NEON 0 + +#else + +#define CCN_ADD_ASM 0 +#define CCN_SUB_ASM 0 +#define CCN_MUL_ASM 0 +#define CCN_ADDMUL1_ASM 0 +#define CCN_MUL1_ASM 0 +#define CCN_CMP_ASM 0 +#define CCN_ADD1_ASM 0 +#define CCN_SUB1_ASM 0 +#define CCN_N_ASM 0 +#define CCN_SET_ASM 0 +#define CCAES_ARM 0 +#define CCAES_INTEL 0 +#define CCN_USE_BUILTIN_CLZ 0 +#define CCSHA1_VNG_INTEL 0 +#define CCSHA2_VNG_INTEL 0 +#define CCSHA1_VNG_ARMV7NEON 0 +#define CCSHA2_VNG_ARMV7NEON 0 + +#endif /* !defined(__i386__) */ + +#define CCN_N_INLINE 0 +#define CCN_CMP_INLINE 0 + +#define CC_INLINE static inline + +#ifdef __GNUC__ +#define CC_NORETURN __attribute__((__noreturn__)) +#define CC_NOTHROW __attribute__((__nothrow__)) +#define CC_NONNULL(N) __attribute__((__nonnull__ N)) +#define CC_NONNULL1 __attribute__((__nonnull__(1))) +#define CC_NONNULL2 __attribute__((__nonnull__(2))) +#define CC_NONNULL3 __attribute__((__nonnull__(3))) +#define CC_NONNULL4 __attribute__((__nonnull__(4))) +#define CC_NONNULL5 __attribute__((__nonnull__(5))) +#define CC_NONNULL6 __attribute__((__nonnull__(6))) +#define CC_NONNULL7 __attribute__((__nonnull__(7))) +#define CC_NONNULL_ALL __attribute__((__nonnull__)) +#define CC_SENTINEL __attribute__((__sentinel__)) +#define CC_CONST __attribute__((__const__)) +#define CC_PURE __attribute__((__pure__)) +#define CC_WARN_RESULT __attribute__((__warn_unused_result__)) +#define CC_MALLOC __attribute__((__malloc__)) +#define CC_UNUSED __attribute__((unused)) +#else /* !__GNUC__ */ +/*! @parseOnly */ +#define CC_NORETURN +/*! @parseOnly */ +#define CC_NOTHROW +/*! @parseOnly */ +#define CC_NONNULL1 +/*! @parseOnly */ +#define CC_NONNULL2 +/*! @parseOnly */ +#define CC_NONNULL3 +/*! @parseOnly */ +#define CC_NONNULL4 +/*! @parseOnly */ +#define CC_NONNULL5 +/*! @parseOnly */ +#define CC_NONNULL6 +/*! @parseOnly */ +#define CC_NONNULL7 +/*! @parseOnly */ +#define CC_NONNULL_ALL +/*! @parseOnly */ +#define CC_SENTINEL +/*! @parseOnly */ +#define CC_CONST +/*! @parseOnly */ +#define CC_PURE +/*! @parseOnly */ +#define CC_WARN_RESULT +/*! @parseOnly */ +#define CC_MALLOC +#endif /* !__GNUC__ */ + +#endif /* _CORECRYPTO_CC_CONFIG_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/cc_priv.h b/EXTERNAL_HEADERS/corecrypto/cc_priv.h new file mode 100644 index 000000000..db962d461 --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/cc_priv.h @@ -0,0 +1,362 @@ +/* + * cc_priv.h + * corecrypto + * + * Created by Michael Brouwer on 12/1/10. + * Copyright 2010,2011 Apple Inc. All rights reserved. + * + */ + +#ifndef _CORECRYPTO_CC_PRIV_H_ +#define _CORECRYPTO_CC_PRIV_H_ + +#include +#include + +/* defines the following macros : + + CC_MEMCPY : optimized memcpy. + CC_MEMMOVE : optimized memmove. + CC_MEMSET : optimized memset. + CC_BZERO : optimized bzero. + + CC_STORE32_BE : store 32 bit value in big endian in unaligned buffer. + CC_STORE32_LE : store 32 bit value in little endian in unaligned buffer. 
+ CC_STORE64_BE : store 64 bit value in big endian in unaligned buffer. + CC_STORE64_LE : store 64 bit value in little endian in unaligned buffer. + + CC_LOAD32_BE : load 32 bit value in big endian from unaligned buffer. + CC_LOAD32_LE : load 32 bit value in little endian from unaligned buffer. + CC_LOAD64_BE : load 64 bit value in big endian from unaligned buffer. + CC_LOAD64_LE : load 64 bit value in little endian from unaligned buffer. + + CC_ROR : Rotate Right 32 bits. Rotate count can be a variable. + CC_ROL : Rotate Left 32 bits. Rotate count can be a variable. + CC_RORc : Rotate Right 32 bits. Rotate count must be a constant. + CC_ROLc : Rotate Left 32 bits. Rotate count must be a constant. + + CC_ROR64 : Rotate Right 64 bits. Rotate count can be a variable. + CC_ROL64 : Rotate Left 64 bits. Rotate count can be a variable. + CC_ROR64c : Rotate Right 64 bits. Rotate count must be a constant. + CC_ROL64c : Rotate Left 64 bits. Rotate count must be a constant. + + CC_BSWAP : byte swap a 32 bits variable. + + CC_H2BE32 : convert a 32 bits value between host and big endian order. + CC_H2LE32 : convert a 32 bits value between host and little endian order. + +The following are not defined yet... define them if needed. + + CC_BSWAPc : byte swap a 32 bits constant + + CC_BSWAP64 : byte swap a 64 bits variable + CC_BSWAP64c : byte swap a 64 bits constant + + CC_READ_LE32 : read a 32 bits little endian value + CC_READ_LE64 : read a 64 bits little endian value + CC_READ_BE32 : read a 32 bits big endian value + CC_READ_BE64 : read a 64 bits big endian value + + CC_WRITE_LE32 : write a 32 bits little endian value + CC_WRITE_LE64 : write a 64 bits little endian value + CC_WRITE_BE32 : write a 32 bits big endian value + CC_WRITE_BE64 : write a 64 bits big endian value + + CC_H2BE64 : convert a 64 bits value between host and big endian order + CC_H2LE64 : convert a 64 bits value between host and little endian order + +*/ + +/* TODO: optimized versions */ +#define CC_MEMCPY(D,S,L) memcpy((D),(S),(L)) +#define CC_MEMMOVE(D,S,L) memmove((D),(S),(L)) +#define CC_MEMSET(D,V,L) memset((D),(V),(L)) +#define CC_BZERO(D,L) memset((D),0,(L)) + + +#pragma mark - Loads and Store + +#pragma mark -- 32 bits - little endian + +#pragma mark --- Default version + +#define CC_STORE32_LE(x, y) do { \ + ((unsigned char *)(y))[3] = (unsigned char)(((x)>>24)&255); \ + ((unsigned char *)(y))[2] = (unsigned char)(((x)>>16)&255); \ + ((unsigned char *)(y))[1] = (unsigned char)(((x)>>8)&255); \ + ((unsigned char *)(y))[0] = (unsigned char)((x)&255); \ +} while(0) + +#define CC_LOAD32_LE(x, y) do { \ +x = ((uint32_t)(((unsigned char *)(y))[3] & 255)<<24) | \ + ((uint32_t)(((unsigned char *)(y))[2] & 255)<<16) | \ + ((uint32_t)(((unsigned char *)(y))[1] & 255)<<8) | \ + ((uint32_t)(((unsigned char *)(y))[0] & 255)); \ +} while(0) + +#pragma mark -- 64 bits - little endian + +#define CC_STORE64_LE(x, y) do { \ + ((unsigned char *)(y))[7] = (unsigned char)(((x)>>56)&255); \ + ((unsigned char *)(y))[6] = (unsigned char)(((x)>>48)&255); \ + ((unsigned char *)(y))[5] = (unsigned char)(((x)>>40)&255); \ + ((unsigned char *)(y))[4] = (unsigned char)(((x)>>32)&255); \ + ((unsigned char *)(y))[3] = (unsigned char)(((x)>>24)&255); \ + ((unsigned char *)(y))[2] = (unsigned char)(((x)>>16)&255); \ + ((unsigned char *)(y))[1] = (unsigned char)(((x)>>8)&255); \ + ((unsigned char *)(y))[0] = (unsigned char)((x)&255); \ +} while(0) + +#define CC_LOAD64_LE(x, y) do { \ +x = (((uint64_t)(((unsigned char *)(y))[7] & 255))<<56) | \ + 
(((uint64_t)(((unsigned char *)(y))[6] & 255))<<48) | \ + (((uint64_t)(((unsigned char *)(y))[5] & 255))<<40) | \ + (((uint64_t)(((unsigned char *)(y))[4] & 255))<<32) | \ + (((uint64_t)(((unsigned char *)(y))[3] & 255))<<24) | \ + (((uint64_t)(((unsigned char *)(y))[2] & 255))<<16) | \ + (((uint64_t)(((unsigned char *)(y))[1] & 255))<<8) | \ + (((uint64_t)(((unsigned char *)(y))[0] & 255))); \ +} while(0) + +#pragma mark -- 32 bits - big endian +#pragma mark --- intel version + +#if (defined(__i386__) || defined(__x86_64__)) + +#define CC_STORE32_BE(x, y) \ + __asm__ __volatile__ ( \ + "bswapl %0 \n\t" \ + "movl %0,(%1)\n\t" \ + "bswapl %0 \n\t" \ + ::"r"(x), "r"(y)) + +#define CC_LOAD32_BE(x, y) \ + __asm__ __volatile__ ( \ + "movl (%1),%0\n\t" \ + "bswapl %0\n\t" \ + :"=r"(x): "r"(y)) + +#else +#pragma mark --- default version +#define CC_STORE32_BE(x, y) do { \ + ((unsigned char *)(y))[0] = (unsigned char)(((x)>>24)&255); \ + ((unsigned char *)(y))[1] = (unsigned char)(((x)>>16)&255); \ + ((unsigned char *)(y))[2] = (unsigned char)(((x)>>8)&255); \ + ((unsigned char *)(y))[3] = (unsigned char)((x)&255); \ +} while(0) + +#define CC_LOAD32_BE(x, y) do { \ +x = ((uint32_t)(((unsigned char *)(y))[0] & 255)<<24) | \ + ((uint32_t)(((unsigned char *)(y))[1] & 255)<<16) | \ + ((uint32_t)(((unsigned char *)(y))[2] & 255)<<8) | \ + ((uint32_t)(((unsigned char *)(y))[3] & 255)); \ +} while(0) + +#endif + +#pragma mark -- 64 bits - big endian + +#pragma mark --- intel 64 bits version + +#if defined(__x86_64__) + +#define CC_STORE64_BE(x, y) \ +__asm__ __volatile__ ( \ +"bswapq %0 \n\t" \ +"movq %0,(%1)\n\t" \ +"bswapq %0 \n\t" \ +::"r"(x), "r"(y)) + +#define CC_LOAD64_BE(x, y) \ +__asm__ __volatile__ ( \ +"movq (%1),%0\n\t" \ +"bswapq %0\n\t" \ +:"=r"(x): "r"(y)) + +#else + +#pragma mark --- default version + +#define CC_STORE64_BE(x, y) do { \ + ((unsigned char *)(y))[0] = (unsigned char)(((x)>>56)&255); \ + ((unsigned char *)(y))[1] = (unsigned char)(((x)>>48)&255); \ + ((unsigned char *)(y))[2] = (unsigned char)(((x)>>40)&255); \ + ((unsigned char *)(y))[3] = (unsigned char)(((x)>>32)&255); \ + ((unsigned char *)(y))[4] = (unsigned char)(((x)>>24)&255); \ + ((unsigned char *)(y))[5] = (unsigned char)(((x)>>16)&255); \ + ((unsigned char *)(y))[6] = (unsigned char)(((x)>>8)&255); \ + ((unsigned char *)(y))[7] = (unsigned char)((x)&255); \ +} while(0) + +#define CC_LOAD64_BE(x, y) do { \ +x = (((uint64_t)(((unsigned char *)(y))[0] & 255))<<56) | \ + (((uint64_t)(((unsigned char *)(y))[1] & 255))<<48) | \ + (((uint64_t)(((unsigned char *)(y))[2] & 255))<<40) | \ + (((uint64_t)(((unsigned char *)(y))[3] & 255))<<32) | \ + (((uint64_t)(((unsigned char *)(y))[4] & 255))<<24) | \ + (((uint64_t)(((unsigned char *)(y))[5] & 255))<<16) | \ + (((uint64_t)(((unsigned char *)(y))[6] & 255))<<8) | \ + (((uint64_t)(((unsigned char *)(y))[7] & 255))); \ +} while(0) + +#endif + +#pragma mark - 32-bit Rotates + +#if defined(_MSC_VER) +#pragma mark -- MSVC version + +#include +#pragma intrinsic(_lrotr,_lrotl) +#define CC_ROR(x,n) _lrotr(x,n) +#define CC_ROL(x,n) _lrotl(x,n) +#define CC_RORc(x,n) _lrotr(x,n) +#define CC_ROLc(x,n) _lrotl(x,n) + +#elif (defined(__i386__) || defined(__x86_64__)) +#pragma mark -- intel asm version + +static inline uint32_t CC_ROL(uint32_t word, int i) +{ + __asm__ ("roll %%cl,%0" + :"=r" (word) + :"0" (word),"c" (i)); + return word; +} + +static inline uint32_t CC_ROR(uint32_t word, int i) +{ + __asm__ ("rorl %%cl,%0" + :"=r" (word) + :"0" (word),"c" (i)); + return word; +} + +/* Need 
to be a macro here, because 'i' is an immediate (constant) */ +#define CC_ROLc(word, i) \ +({ uint32_t _word=(word); \ + __asm__ __volatile__ ("roll %2,%0" \ + :"=r" (_word) \ + :"0" (_word),"I" (i)); \ + _word; \ +}) + + +#define CC_RORc(word, i) \ +({ uint32_t _word=(word); \ + __asm__ __volatile__ ("rorl %2,%0" \ + :"=r" (_word) \ + :"0" (_word),"I" (i)); \ + _word; \ +}) + +#else + +#pragma mark -- default version + +static inline uint32_t CC_ROL(uint32_t word, int i) +{ + return ( (word<<(i&31)) | (word>>(32-(i&31))) ); +} + +static inline uint32_t CC_ROR(uint32_t word, int i) +{ + return ( (word>>(i&31)) | (word<<(32-(i&31))) ); +} + +#define CC_ROLc(x, y) CC_ROL(x, y) +#define CC_RORc(x, y) CC_ROR(x, y) + +#endif + +#pragma mark - 64 bits rotates + +#if defined(__x86_64__) +#pragma mark -- intel 64 asm version + +static inline uint64_t CC_ROL64(uint64_t word, int i) +{ + __asm__("rolq %%cl,%0" + :"=r" (word) + :"0" (word),"c" (i)); + return word; +} + +static inline uint64_t CC_ROR64(uint64_t word, int i) +{ + __asm__("rorq %%cl,%0" + :"=r" (word) + :"0" (word),"c" (i)); + return word; +} + +/* Need to be a macro here, because 'i' is an immediate (constant) */ +#define CC_ROL64c(word, i) \ +({ \ + uint64_t _word=(word); \ + __asm__("rolq %2,%0" \ + :"=r" (_word) \ + :"0" (_word),"J" (i)); \ + _word; \ +}) + +#define CC_ROR64c(word, i) \ +({ \ + uint64_t _word=(word); \ + __asm__("rorq %2,%0" \ + :"=r" (_word) \ + :"0" (_word),"J" (i)); \ + _word; \ +}) + + +#else /* Not x86_64 */ + +#pragma mark -- default C version + +static inline uint64_t CC_ROL64(uint64_t word, int i) +{ + return ( (word<<(i&63)) | (word>>(64-(i&63))) ); +} + +static inline uint64_t CC_ROR64(uint64_t word, int i) +{ + return ( (word>>(i&63)) | (word<<(64-(i&63))) ); +} + +#define CC_ROL64c(x, y) CC_ROL64(x, y) +#define CC_ROR64c(x, y) CC_ROR64(x, y) + +#endif + + +#pragma mark - Byte Swaps + +static inline uint32_t CC_BSWAP(uint32_t x) +{ + return ( + ((x>>24)&0x000000FF) | + ((x<<24)&0xFF000000) | + ((x>>8) &0x0000FF00) | + ((x<<8) &0x00FF0000) + ); +} + +#ifdef __LITTLE_ENDIAN__ +#define CC_H2BE32(x) CC_BSWAP(x) +#define CC_H2LE32(x) (x) +#else +#error not good. +#define CC_H2BE32(x) (x) +#define CC_H2LE32(x) CC_BSWAP(x) +#endif + + +/* extract a byte portably */ +#ifdef _MSC_VER +#define cc_byte(x, n) ((unsigned char)((x) >> (8 * (n)))) +#else +#define cc_byte(x, n) (((x) >> (8 * (n))) & 255) +#endif + +#endif /* _CORECRYPTO_CC_PRIV_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccaes.h b/EXTERNAL_HEADERS/corecrypto/ccaes.h new file mode 100644 index 000000000..9dca39bd6 --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/ccaes.h @@ -0,0 +1,83 @@ +/* + * ccaes.h + * corecrypto + * + * Created by Michael Brouwer on 12/10/10. + * Copyright 2010,2011 Apple Inc. All rights reserved. 
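As a concrete sketch of the cc_priv.h portable helpers above; the values in the comments are what the default C versions produce, and the include paths are assumed:

    #include <stdint.h>
    #include <corecrypto/cc_priv.h>

    static void endian_demo(void)
    {
        unsigned char buf[4];
        uint32_t x = 0x11223344, y;

        CC_STORE32_BE(x, buf);        /* buf = 11 22 33 44 on any host endianness */
        CC_LOAD32_LE(y, buf);         /* y = 0x44332211: same bytes, little-endian view */

        uint32_t r = CC_ROL(x, 8);    /* 0x22334411; rotate count may be a runtime variable */
        uint32_t s = CC_RORc(x, 8);   /* 0x44112233; count must be a compile-time constant */
        (void)y; (void)r; (void)s;
    }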
+ * + */ + +#ifndef _CORECRYPTO_CCAES_H_ +#define _CORECRYPTO_CCAES_H_ + +#include +#include + +#define CCAES_BLOCK_SIZE 16 +#define CCAES_KEY_SIZE_128 16 +#define CCAES_KEY_SIZE_192 24 +#define CCAES_KEY_SIZE_256 32 + +extern const struct ccmode_ecb ccaes_ltc_ecb_decrypt_mode; +extern const struct ccmode_ecb ccaes_ltc_ecb_encrypt_mode; + +extern const struct ccmode_cbc ccaes_gladman_cbc_encrypt_mode; +extern const struct ccmode_cbc ccaes_gladman_cbc_decrypt_mode; + +#if CCAES_ARM +extern const struct ccmode_ecb ccaes_arm_ecb_encrypt_mode; +extern const struct ccmode_ecb ccaes_arm_ecb_decrypt_mode; + +extern const struct ccmode_cbc ccaes_arm_cbc_encrypt_mode; +extern const struct ccmode_cbc ccaes_arm_cbc_decrypt_mode; +#endif + +#if CCAES_INTEL +//extern const struct ccmode_ecb ccaes_intel_ecb_encrypt_mode; +//extern const struct ccmode_ecb ccaes_intel_ecb_decrypt_mode; + +extern const struct ccmode_ecb ccaes_intel_ecb_encrypt_opt_mode; +extern const struct ccmode_ecb ccaes_intel_ecb_encrypt_aesni_mode; + +extern const struct ccmode_ecb ccaes_intel_ecb_decrypt_opt_mode; +extern const struct ccmode_ecb ccaes_intel_ecb_decrypt_aesni_mode; + +//extern const struct ccmode_cbc ccaes_intel_cbc_encrypt_mode; +//extern const struct ccmode_cbc ccaes_intel_cbc_decrypt_mode; + +extern const struct ccmode_cbc ccaes_intel_cbc_encrypt_opt_mode; +extern const struct ccmode_cbc ccaes_intel_cbc_encrypt_aesni_mode; + +extern const struct ccmode_cbc ccaes_intel_cbc_decrypt_opt_mode; +extern const struct ccmode_cbc ccaes_intel_cbc_decrypt_aesni_mode; + +//extern const struct ccmode_xts ccaes_intel_xts_encrypt_mode; +//extern const struct ccmode_xts ccaes_intel_xts_decrypt_mode; + +extern const struct ccmode_xts ccaes_intel_xts_encrypt_opt_mode; +extern const struct ccmode_xts ccaes_intel_xts_encrypt_aesni_mode; + +extern const struct ccmode_xts ccaes_intel_xts_decrypt_opt_mode; +extern const struct ccmode_xts ccaes_intel_xts_decrypt_aesni_mode; +#endif + + +/* Implementation Selectors: */ +const struct ccmode_ecb *ccaes_ecb_encrypt_mode(void); +const struct ccmode_cbc *ccaes_cbc_encrypt_mode(void); +const struct ccmode_cfb *ccaes_cfb_encrypt_mode(void); +const struct ccmode_cfb8 *ccaes_cfb8_encrypt_mode(void); +const struct ccmode_xts *ccaes_xts_encrypt_mode(void); +const struct ccmode_gcm *ccaes_gcm_encrypt_mode(void); + +const struct ccmode_ecb *ccaes_ecb_decrypt_mode(void); +const struct ccmode_cbc *ccaes_cbc_decrypt_mode(void); +const struct ccmode_cfb *ccaes_cfb_decrypt_mode(void); +const struct ccmode_cfb8 *ccaes_cfb8_decrypt_mode(void); +const struct ccmode_xts *ccaes_xts_decrypt_mode(void); +const struct ccmode_gcm *ccaes_gcm_decrypt_mode(void); + +const struct ccmode_ctr *ccaes_ctr_crypt_mode(void); +const struct ccmode_ofb *ccaes_ofb_crypt_mode(void); + +#endif /* _CORECRYPTO_CCAES_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccder.h b/EXTERNAL_HEADERS/corecrypto/ccder.h new file mode 100644 index 000000000..756afd295 --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/ccder.h @@ -0,0 +1,263 @@ +/* + * ccder.h + * corecrypto + * + * Created by Michael Brouwer on 2/28/12. + * Copyright 2012 Apple Inc. All rights reserved. + * + */ + +#ifndef _CORECRYPTO_CCDER_H_ +#define _CORECRYPTO_CCDER_H_ + +#include +#include + +#define CCDER_MULTIBYTE_TAGS 1 + +#ifdef CCDER_MULTIBYTE_TAGS +typedef unsigned long ccder_tag; +#else +typedef uint8_t ccder_tag; +#endif + +/* DER types to be used with ccder_decode and ccder_encode functions. 
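The ccaes selector functions above let callers ask for the best available implementation of a mode rather than naming one. A hedged sketch; how the returned mode struct is driven is defined by ccmode.h, which appears later in this patch:

    #include <corecrypto/ccaes.h>

    static const struct ccmode_cbc *pick_aes_cbc(void)
    {
        /* On x86 this can return the AES-NI variant when the CPU supports it,
           falling back to the optimized C/asm implementation otherwise. */
        return ccaes_cbc_encrypt_mode();
    }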
*/ +enum { + CCDER_EOL = CCASN1_EOL, + CCDER_BOOLEAN = CCASN1_BOOLEAN, + CCDER_INTEGER = CCASN1_INTEGER, + CCDER_BIT_STRING = CCASN1_BIT_STRING, + CCDER_OCTET_STRING = CCASN1_OCTET_STRING, + CCDER_NULL = CCASN1_NULL, + CCDER_OBJECT_IDENTIFIER = CCASN1_OBJECT_IDENTIFIER, + CCDER_OBJECT_DESCRIPTOR = CCASN1_OBJECT_DESCRIPTOR, + /* External or instance-of 0x08 */ + CCDER_REAL = CCASN1_REAL, + CCDER_ENUMERATED = CCASN1_ENUMERATED, + CCDER_EMBEDDED_PDV = CCASN1_EMBEDDED_PDV, + CCDER_UTF8_STRING = CCASN1_UTF8_STRING, + /* 0x0d */ + /* 0x0e */ + /* 0x0f */ + CCDER_SEQUENCE = CCASN1_SEQUENCE, + CCDER_SET = CCASN1_SET, + CCDER_NUMERIC_STRING = CCASN1_NUMERIC_STRING, + CCDER_PRINTABLE_STRING = CCASN1_PRINTABLE_STRING, + CCDER_T61_STRING = CCASN1_T61_STRING, + CCDER_VIDEOTEX_STRING = CCASN1_VIDEOTEX_STRING, + CCDER_IA5_STRING = CCASN1_IA5_STRING, + CCDER_UTC_TIME = CCASN1_UTC_TIME, + CCDER_GENERALIZED_TIME = CCASN1_GENERALIZED_TIME, + CCDER_GRAPHIC_STRING = CCASN1_GRAPHIC_STRING, + CCDER_VISIBLE_STRING = CCASN1_VISIBLE_STRING, + CCDER_GENERAL_STRING = CCASN1_GENERAL_STRING, + CCDER_UNIVERSAL_STRING = CCASN1_UNIVERSAL_STRING, + /* 0x1d */ + CCDER_BMP_STRING = CCASN1_BMP_STRING, + CCDER_HIGH_TAG_NUMBER = CCASN1_HIGH_TAG_NUMBER, + CCDER_TELETEX_STRING = CCDER_T61_STRING, + +#ifdef CCDER_MULTIBYTE_TAGS + CCDER_TAG_MASK = ((ccder_tag)~0), + CCDER_TAGNUM_MASK = ((ccder_tag)~((ccder_tag)7 << (sizeof(ccder_tag) * 8 - 3))), + + CCDER_METHOD_MASK = ((ccder_tag)1 << (sizeof(ccder_tag) * 8 - 3)), + CCDER_PRIMITIVE = ((ccder_tag)0 << (sizeof(ccder_tag) * 8 - 3)), + CCDER_CONSTRUCTED = ((ccder_tag)1 << (sizeof(ccder_tag) * 8 - 3)), + + CCDER_CLASS_MASK = ((ccder_tag)3 << (sizeof(ccder_tag) * 8 - 2)), + CCDER_UNIVERSAL = ((ccder_tag)0 << (sizeof(ccder_tag) * 8 - 2)), + CCDER_APPLICATION = ((ccder_tag)1 << (sizeof(ccder_tag) * 8 - 2)), + CCDER_CONTEXT_SPECIFIC = ((ccder_tag)2 << (sizeof(ccder_tag) * 8 - 2)), + CCDER_PRIVATE = ((ccder_tag)3 << (sizeof(ccder_tag) * 8 - 2)), +#else + CCDER_TAG_MASK = CCASN1_TAG_MASK, + CCDER_TAGNUM_MASK = CCASN1_TAGNUM_MASK, + + CCDER_METHOD_MASK = CCASN1_METHOD_MASK, + CCDER_PRIMITIVE = CCASN1_PRIMITIVE, + CCDER_CONSTRUCTED = CCASN1_CONSTRUCTED, + + CCDER_CLASS_MASK = CCASN1_CLASS_MASK, + CCDER_UNIVERSAL = CCASN1_UNIVERSAL, + CCDER_APPLICATION = CCASN1_APPLICATION, + CCDER_CONTEXT_SPECIFIC = CCASN1_CONTEXT_SPECIFIC, + CCDER_PRIVATE = CCASN1_PRIVATE, +#endif + CCDER_CONSTRUCTED_SET = CCDER_SET | CCDER_CONSTRUCTED, + CCDER_CONSTRUCTED_SEQUENCE = CCDER_SEQUENCE | CCDER_CONSTRUCTED, +}; + + +#pragma mark ccder_sizeof_ functions + +inline CC_CONST +size_t ccder_sizeof_tag(ccder_tag tag); + +inline CC_CONST +size_t ccder_sizeof_len(size_t len); + +/* Returns the size of an asn1 encoded item of length l in bytes, + assuming a 1 byte tag. */ +inline CC_CONST +size_t ccder_sizeof(ccder_tag tag, size_t len); + +inline CC_CONST +size_t ccder_sizeof_oid(ccoid_t oid); + +#pragma mark ccder_encode_ functions. + +/* Encode a tag backwards, der_end should point to one byte past the end of + destination for the tag, returns a pointer to the first byte of the tag. + Returns NULL if there is an encoding error. */ +inline CC_NONNULL2 +uint8_t *ccder_encode_tag(ccder_tag tag, const uint8_t *der, uint8_t *der_end); + +/* Returns a pointer to the start of the len field. returns NULL if there + is an encoding error. */ +inline CC_NONNULL2 +uint8_t * +ccder_encode_len(size_t len, const uint8_t *der, uint8_t *der_end); + +/* der_end should point to the first byte of the content of this der item. 
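Because the multibyte-tag layout above keeps the class and constructed bits in the top three bits of ccder_tag, tags compose by OR-ing; for example:

    #include <corecrypto/ccder.h>

    /* [0] context-specific constructed tag, e.g. for an EXPLICIT wrapper: */
    static const ccder_tag tag0 = CCDER_CONTEXT_SPECIFIC | CCDER_CONSTRUCTED | 0;

    /* A DER SEQUENCE is the SEQUENCE type with the constructed bit set;
       the enum above pre-composes this as CCDER_CONSTRUCTED_SEQUENCE. */
    static const ccder_tag seq = CCDER_SEQUENCE | CCDER_CONSTRUCTED;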
*/ +inline CC_NONNULL3 +uint8_t * +ccder_encode_tl(ccder_tag tag, size_t len, const uint8_t *der, uint8_t *der_end); + +inline CC_PURE CC_NONNULL2 +uint8_t * +ccder_encode_body_nocopy(size_t size, const uint8_t *der, uint8_t *der_end); + +/* Encode the tag and length of a constructed object. der is the lower + bound, der_end is one byte past where we want to write the length and + body_end is one byte past the end of the body of the der object we are + encoding the tag and length of. */ +inline CC_NONNULL((2,3)) +uint8_t * +ccder_encode_constructed_tl(ccder_tag tag, const uint8_t *body_end, + const uint8_t *der, uint8_t *der_end); + +/* Encodes oid into der and returns + der + ccder_sizeof_oid(oid). */ +inline CC_NONNULL1 CC_NONNULL2 +uint8_t *ccder_encode_oid(ccoid_t oid, const uint8_t *der, uint8_t *der_end); + +inline CC_NONNULL((3,4)) +uint8_t *ccder_encode_implicit_integer(ccder_tag implicit_tag, + cc_size n, const cc_unit *s, + const uint8_t *der, uint8_t *der_end); + +inline CC_NONNULL((2,3)) +uint8_t *ccder_encode_integer(cc_size n, const cc_unit *s, + const uint8_t *der, uint8_t *der_end); + +inline CC_NONNULL3 +uint8_t *ccder_encode_implicit_uint64(ccder_tag implicit_tag, + uint64_t value, + const uint8_t *der, uint8_t *der_end); + +inline CC_NONNULL3 +uint8_t *ccder_encode_uint64(uint64_t value, + const uint8_t *der, uint8_t *der_end); + +inline CC_NONNULL((3,4)) +uint8_t *ccder_encode_implicit_octet_string(ccder_tag implicit_tag, + cc_size n, const cc_unit *s, + const uint8_t *der, + uint8_t *der_end); + +inline CC_NONNULL((2,3)) +uint8_t *ccder_encode_octet_string(cc_size n, const cc_unit *s, + const uint8_t *der, uint8_t *der_end); + +inline CC_NONNULL((3,4)) +uint8_t *ccder_encode_implicit_raw_octet_string(ccder_tag implicit_tag, + size_t s_size, const uint8_t *s, + const uint8_t *der, + uint8_t *der_end); + +inline CC_NONNULL((2,3)) +uint8_t *ccder_encode_raw_octet_string(size_t s_size, const uint8_t *s, + const uint8_t *der, uint8_t *der_end); + +/* ccder_encode_body COPIES the body into the der. + It's inefficient – especially when you already have to convert to get to + the form for the body. + see encode integer for the right way to unify conversion and insertion */ +inline CC_NONNULL3 +uint8_t * +ccder_encode_body(size_t size, const uint8_t* body, + const uint8_t *der, uint8_t *der_end); + +#pragma mark ccder_decode_ functions. + +/* Returns a pointer to the start of the length field, and returns the decoded tag in tag. + returns NULL if there is a decoding error. */ +inline CC_NONNULL((1,3)) +const uint8_t *ccder_decode_tag(ccder_tag *tagp, const uint8_t *der, const uint8_t *der_end); + +inline CC_NONNULL((1,3)) +const uint8_t *ccder_decode_len(size_t *lenp, const uint8_t *der, const uint8_t *der_end); + +/* Returns a pointer to the start of the der object, and returns the length in len. + returns NULL if there is a decoding error. 
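The encoders above run back to front: size the encoding first, point der_end one past the end of the output, and emit the innermost content first. A hedged sketch encoding SEQUENCE { INTEGER 42 }, with error checks elided and includes assumed:

    #include <stddef.h>
    #include <stdint.h>
    #include <corecrypto/ccder.h>

    static size_t encode_seq_int42(uint8_t *buf /* at least 5 bytes */)
    {
        size_t body  = ccder_sizeof(CCDER_INTEGER, 1);                  /* 02 01 2a -> 3 bytes */
        size_t total = ccder_sizeof(CCDER_CONSTRUCTED_SEQUENCE, body);  /* 30 03 ... -> 5 bytes */
        uint8_t *der_end = buf + total;
        uint8_t *p = ccder_encode_uint64(42, buf, der_end);             /* innermost item first */
        p = ccder_encode_constructed_tl(CCDER_CONSTRUCTED_SEQUENCE, der_end, buf, p);
        return (p == buf) ? total : 0;  /* on success p lands on the first encoded byte */
    }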
*/ +inline CC_NONNULL((2,4)) +const uint8_t *ccder_decode_tl(ccder_tag expected_tag, size_t *lenp, + const uint8_t *der, const uint8_t *der_end); + +inline CC_NONNULL((2,3)) +const uint8_t * +ccder_decode_constructed_tl(ccder_tag expected_tag, const uint8_t **body_end, + const uint8_t *der, const uint8_t *der_end); + +inline CC_NONNULL((1,3)) +const uint8_t * +ccder_decode_sequence_tl(const uint8_t **body_end, + const uint8_t *der, const uint8_t *der_end); + +inline CC_NONNULL((2,4)) +const uint8_t *ccder_decode_uint(cc_size n, cc_unit *r, + const uint8_t *der, const uint8_t *der_end); + +inline CC_NONNULL((1,3)) +const uint8_t *ccder_decode_uint64(uint64_t* r, + const uint8_t *der, const uint8_t *der_end); + +/* Decode SEQUENCE { r, s -- (unsigned)integer } in der into r and s. + Returns NULL on decode errors, returns pointer just past the end of the + sequence of integers otherwise. */ +inline CC_NONNULL((2,3,5)) +const uint8_t *ccder_decode_seqii(cc_size n, cc_unit *r, cc_unit *s, + const uint8_t *der, const uint8_t *der_end); +inline CC_NONNULL_ALL +const uint8_t *ccder_decode_oid(ccoid_t *oidp, + const uint8_t *der, const uint8_t *der_end); + +#ifndef CCDER_MULTIBYTE_TAGS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#endif /* _CORECRYPTO_CCDER_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccdes.h b/EXTERNAL_HEADERS/corecrypto/ccdes.h new file mode 100644 index 000000000..aff622bfb --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/ccdes.h @@ -0,0 +1,67 @@ +/* + * ccdes.h + * corecrypto + * + * Created by Fabrice Gautier on 12/20/10. + * Copyright 2010 Apple, Inc. All rights reserved. 
+ * + */ + + +#ifndef _CORECRYPTO_CCDES_H_ +#define _CORECRYPTO_CCDES_H_ + +#include + +#define CCDES_BLOCK_SIZE 8 +#define CCDES_KEY_SIZE 8 + +extern const struct ccmode_ecb ccdes_ltc_ecb_decrypt_mode; +extern const struct ccmode_ecb ccdes_ltc_ecb_encrypt_mode; + +extern const struct ccmode_ecb ccdes3_ltc_ecb_decrypt_mode; +extern const struct ccmode_ecb ccdes3_ltc_ecb_encrypt_mode; +extern const struct ccmode_ecb ccdes168_ltc_ecb_encrypt_mode; + +const struct ccmode_ecb *ccdes_ecb_decrypt_mode(void); +const struct ccmode_ecb *ccdes_ecb_encrypt_mode(void); + +const struct ccmode_cbc *ccdes_cbc_decrypt_mode(void); +const struct ccmode_cbc *ccdes_cbc_encrypt_mode(void); + +const struct ccmode_cfb *ccdes_cfb_decrypt_mode(void); +const struct ccmode_cfb *ccdes_cfb_encrypt_mode(void); + +const struct ccmode_cfb8 *ccdes_cfb8_decrypt_mode(void); +const struct ccmode_cfb8 *ccdes_cfb8_encrypt_mode(void); + +const struct ccmode_ctr *ccdes_ctr_crypt_mode(void); + +const struct ccmode_ofb *ccdes_ofb_crypt_mode(void); + + +const struct ccmode_ecb *ccdes3_ecb_decrypt_mode(void); +const struct ccmode_ecb *ccdes3_ecb_encrypt_mode(void); + +const struct ccmode_cbc *ccdes3_cbc_decrypt_mode(void); +const struct ccmode_cbc *ccdes3_cbc_encrypt_mode(void); + +const struct ccmode_cfb *ccdes3_cfb_decrypt_mode(void); +const struct ccmode_cfb *ccdes3_cfb_encrypt_mode(void); + +const struct ccmode_cfb8 *ccdes3_cfb8_decrypt_mode(void); +const struct ccmode_cfb8 *ccdes3_cfb8_encrypt_mode(void); + +const struct ccmode_ctr *ccdes3_ctr_crypt_mode(void); + +const struct ccmode_ofb *ccdes3_ofb_crypt_mode(void); + +int ccdes_key_is_weak( void *key, unsigned long length); +void ccdes_key_set_odd_parity(void *key, unsigned long length); + +uint32_t +ccdes_cbc_cksum(void *in, void *out, unsigned long length, + void *key, unsigned long keylen, void *ivec); + + +#endif /* _CORECRYPTO_CCDES_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccdigest.h b/EXTERNAL_HEADERS/corecrypto/ccdigest.h new file mode 100644 index 000000000..7aa8ada33 --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/ccdigest.h @@ -0,0 +1,129 @@ +/* + * ccdigest.h + * corecrypto + * + * Created by Michael Brouwer on 11/30/10. + * Copyright 2010,2011 Apple Inc. All rights reserved. + * + */ + +#ifndef _CORECRYPTO_CCDIGEST_H_ +#define _CORECRYPTO_CCDIGEST_H_ + +#include +#include + +/* To malloc a digest context for a given di, use malloc(ccdigest_di_size(di)) + and assign the result to a pointer to a struct ccdigest_ctx. */ +struct ccdigest_ctx { + union { + uint8_t u8; + uint32_t u32; + uint64_t u64; + cc_unit ccn; + } state; +} __attribute((aligned(8))); + +typedef union { + struct ccdigest_ctx *hdr; +} ccdigest_ctx_t __attribute__((transparent_union)); + +struct ccdigest_state { + union { + uint8_t u8; + uint32_t u32; + uint64_t u64; + cc_unit ccn; + } state; +} __attribute((aligned(8))); + +typedef union { + struct ccdigest_state *hdr; + struct ccdigest_ctx *_ctx; + ccdigest_ctx_t _ctxt; +} ccdigest_state_t __attribute__((transparent_union)); + +struct ccdigest_info { + unsigned long output_size; + unsigned long state_size; + unsigned long block_size; + unsigned long oid_size; + unsigned char *oid; + const void *initial_state; + void(*compress)(ccdigest_state_t state, unsigned long nblocks, + const void *data); + void(*final)(const struct ccdigest_info *di, ccdigest_ctx_t ctx, + unsigned char *digest); +}; + +/* Return sizeof a ccdigest_ctx for a given size_t _state_size_ and + size_t _block_size_. 
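A short sketch of the DES key-hygiene helpers declared above; the key bytes are illustrative:

    #include <stdint.h>
    #include <corecrypto/ccdes.h>

    static int des_key_ok(uint8_t key[CCDES_KEY_SIZE])
    {
        ccdes_key_set_odd_parity(key, CCDES_KEY_SIZE);    /* force odd parity in every key byte */
        return !ccdes_key_is_weak(key, CCDES_KEY_SIZE);   /* reject weak/semi-weak keys before
                                                             building a key schedule */
    }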
*/ +#define ccdigest_ctx_size(_state_size_, _block_size_) ((_state_size_) + sizeof(uint64_t) + (_block_size_) + sizeof(unsigned int)) +/* Return sizeof a ccdigest_ctx for a given struct ccdigest_info *_di_. */ +#define ccdigest_di_size(_di_) (ccdigest_ctx_size((_di_)->state_size, (_di_)->block_size)) + +/* Declare a ccdigest_ctx for a given size_t _state_size_ and + size_t _block_size_, named _name_. Can be used in structs or on the + stack. */ +#define ccdigest_ctx_decl(_state_size_, _block_size_, _name_) cc_ctx_decl(struct ccdigest_ctx, ccdigest_ctx_size(_state_size_, _block_size_), _name_) +#define ccdigest_ctx_clear(_state_size_, _block_size_, _name_) cc_ctx_clear(struct ccdigest_ctx, ccdigest_ctx_size(_state_size_, _block_size_), _name_) +/* Declare a ccdigest_ctx for a given size_t _state_size_ and + size_t _block_size_, named _name_. Can be used on the stack. */ +#define ccdigest_di_decl(_di_, _name_) cc_ctx_decl(struct ccdigest_ctx, ccdigest_di_size(_di_), _name_) +#define ccdigest_di_clear(_di_, _name_) cc_ctx_clear(struct ccdigest_ctx, ccdigest_di_size(_di_), _name_) + +/* Digest context field accessors. Consider the implementation private. */ +#define ccdigest_state(_di_, _ctx_) ((ccdigest_state_t)(_ctx_)) +#define ccdigest_state_u8(_di_, _ctx_) (&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8) +#define ccdigest_state_u32(_di_, _ctx_) (&((ccdigest_ctx_t)(_ctx_)).hdr->state.u32) +#define ccdigest_state_u64(_di_, _ctx_) (&((ccdigest_ctx_t)(_ctx_)).hdr->state.u64) +#define ccdigest_state_ccn(_di_, _ctx_) (&((ccdigest_ctx_t)(_ctx_)).hdr->state.ccn) +#define ccdigest_nbits(_di_, _ctx_) (((uint64_t *)(&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8 + (_di_)->state_size))[0]) +#define ccdigest_data(_di_, _ctx_) (&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8 + (_di_)->state_size + sizeof(uint64_t)) +#define ccdigest_num(_di_, _ctx_) (((unsigned int *)(&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8 + (_di_)->state_size + sizeof(uint64_t) + (_di_)->block_size))[0]) + +/* Digest state field accessors. Consider the implementation private. */ +#define ccdigest_u8(_state_) (&((ccdigest_state_t)(_state_)).hdr->state.u8) +#define ccdigest_u32(_state_) (&((ccdigest_state_t)(_state_)).hdr->state.u32) +#define ccdigest_u64(_state_) (&((ccdigest_state_t)(_state_)).hdr->state.u64) +#define ccdigest_ccn(_state_) (&((ccdigest_state_t)(_state_)).hdr->state.ccn) + +/* We could just use memcpy instead of this special macro, but this allows us + to use the optimized ccn_set() assembly routine if we have one, which for + 32 bit arm is about 200% quicker than generic memcpy(). 
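+
+   As a worked sketch of the layout these macros assume: for a hypothetical
+   digest with a 32-byte state and a 64-byte block,
+
+     ccdigest_ctx_size(32, 64)  // 32 (state) + 8 (64-bit length counter)
+                                //   + 64 (block buffer) + sizeof(unsigned int)
+
+   which is exactly the region that ccdigest_state(), ccdigest_nbits(),
+   ccdigest_data() and ccdigest_num() index into.
+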
 */
+#if CCN_SET_ASM && CCN_UNIT_SIZE <= 4
+#define ccdigest_copy_state(_di_, _dst_, _src_) ccn_set((_di_)->state_size / CCN_UNIT_SIZE, _dst_, _src_)
+#else
+#define ccdigest_copy_state(_di_, _dst_, _src_) CC_MEMCPY(_dst_, _src_, (_di_)->state_size)
+#endif
+
+void ccdigest_init(const struct ccdigest_info *di, ccdigest_ctx_t ctx);
+void ccdigest_update(const struct ccdigest_info *di, ccdigest_ctx_t ctx,
+                     unsigned long len, const void *data);
+
+CC_INLINE
+void ccdigest_final(const struct ccdigest_info *di, ccdigest_ctx_t ctx, unsigned char *digest)
+{
+    di->final(di,ctx,digest);
+}
+
+void ccdigest(const struct ccdigest_info *di, unsigned long len,
+              const void *data, void *digest);
+
+/* test functions */
+int ccdigest_test(const struct ccdigest_info *di, unsigned long len,
+                  const void *data, const void *digest);
+
+int ccdigest_test_chunk(const struct ccdigest_info *di, unsigned long len,
+                        const void *data, const void *digest, unsigned long chunk);
+
+struct ccdigest_vector {
+    unsigned long len;
+    const void *message;
+    const void *digest;
+};
+
+int ccdigest_test_vector(const struct ccdigest_info *di, const struct ccdigest_vector *v);
+int ccdigest_test_chunk_vector(const struct ccdigest_info *di, const struct ccdigest_vector *v, unsigned long chunk);
+
+#endif /* _CORECRYPTO_CCDIGEST_H_ */
diff --git a/EXTERNAL_HEADERS/corecrypto/cchmac.h b/EXTERNAL_HEADERS/corecrypto/cchmac.h
new file mode 100644
index 000000000..b6fd0dcae
--- /dev/null
+++ b/EXTERNAL_HEADERS/corecrypto/cchmac.h
@@ -0,0 +1,83 @@
+/*
+ *  cchmac.h
+ *  corecrypto
+ *
+ *  Created by Michael Brouwer on 12/7/10.
+ *  Copyright 2010,2011 Apple Inc. All rights reserved.
+ *
+ */
+
+#ifndef _CORECRYPTO_CCHMAC_H_
+#define _CORECRYPTO_CCHMAC_H_
+
+#include <corecrypto/cc.h>
+#include <corecrypto/ccdigest.h>
+
+/* An hmac_ctx_t is normally allocated as an array of these. */
+struct cchmac_ctx {
+    uint8_t b[8];
+} __attribute__((aligned(8)));
+
+typedef union {
+    struct cchmac_ctx *hdr;
+    ccdigest_ctx_t digest;
+} cchmac_ctx_t __attribute__((transparent_union));
+
+#define cchmac_ctx_size(STATE_SIZE, BLOCK_SIZE) (ccdigest_ctx_size(STATE_SIZE, BLOCK_SIZE) + (STATE_SIZE))
+#define cchmac_di_size(_di_) (cchmac_ctx_size((_di_)->state_size, (_di_)->block_size))
+
+#define cchmac_ctx_n(STATE_SIZE, BLOCK_SIZE) ccn_nof_size(cchmac_ctx_size((STATE_SIZE), (BLOCK_SIZE)))
+
+#define cchmac_ctx_decl(STATE_SIZE, BLOCK_SIZE, _name_) cc_ctx_decl(struct cchmac_ctx, cchmac_ctx_size(STATE_SIZE, BLOCK_SIZE), _name_)
+#define cchmac_ctx_clear(STATE_SIZE, BLOCK_SIZE, _name_) cc_ctx_clear(struct cchmac_ctx, cchmac_ctx_size(STATE_SIZE, BLOCK_SIZE), _name_)
+#define cchmac_di_decl(_di_, _name_) cchmac_ctx_decl((_di_)->state_size, (_di_)->block_size, _name_)
+#define cchmac_di_clear(_di_, _name_) cchmac_ctx_clear((_di_)->state_size, (_di_)->block_size, _name_)
+
+/* Return a ccdigest_ctx_t which can be accessed with the macros in ccdigest.h */
+#define cchmac_digest_ctx(_di_, HC) (((cchmac_ctx_t)(HC)).digest)
+
+/* Accessors for ostate fields; this is all cchmac_ctx_t adds to the ccdigest_ctx_t. */
+#define cchmac_ostate(_di_, HC) ((struct ccdigest_state *)(((cchmac_ctx_t)(HC)).hdr->b + ccdigest_di_size(_di_)))
+#define cchmac_ostate8(_di_, HC)    (ccdigest_u8(cchmac_ostate(_di_, HC)))
+#define cchmac_ostate32(_di_, HC)   (ccdigest_u32(cchmac_ostate(_di_, HC)))
+#define cchmac_ostate64(_di_, HC)   (ccdigest_u64(cchmac_ostate(_di_, HC)))
+#define cchmac_ostateccn(_di_, HC)  (ccdigest_ccn(cchmac_ostate(_di_, HC)))
+
+/* Convenience accessors for ccdigest_ctx_t fields.
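+
+   For example, a minimal streaming sketch (di is any struct ccdigest_info,
+   e.g. from a selector such as ccmd5_di() declared below; buffer names are
+   hypothetical):
+
+     unsigned char md[16];                  // di->output_size bytes
+     ccdigest_di_decl(di, ctx);
+     ccdigest_init(di, ctx);
+     ccdigest_update(di, ctx, len1, part1);
+     ccdigest_update(di, ctx, len2, part2);
+     ccdigest_final(di, ctx, md);
+     ccdigest_di_clear(di, ctx);
+
+   or equivalently in one call: ccdigest(di, len, data, md);
+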
*/ +#define cchmac_istate(_di_, HC) ((ccdigest_state_t)(((cchmac_ctx_t)(HC)).digest)) +#define cchmac_istate8(_di_, HC) (ccdigest_u8(cchmac_istate(_di_, HC))) +#define cchmac_istate32(_di_, HC) (ccdigest_u32(cchmac_istate(_di_, HC))) +#define cchmac_istate64(_di_, HC) (ccdigest_u64(cchmac_istate(_di_, HC))) +#define cchmac_istateccn(_di_, HC) (ccdigest_ccn(cchmac_istate(_di_, HC))) +#define cchmac_data(_di_, HC) ccdigest_data(_di_, ((cchmac_ctx_t)(HC)).digest) +#define cchmac_num(_di_, HC) ccdigest_num(_di_, ((cchmac_ctx_t)(HC)).digest) +#define cchmac_nbits(_di_, HC) ccdigest_nbits(_di_, ((cchmac_ctx_t)(HC)).digest) + +void cchmac_init(const struct ccdigest_info *di, cchmac_ctx_t ctx, + unsigned long key_len, const void *key); +void cchmac_update(const struct ccdigest_info *di, cchmac_ctx_t ctx, + unsigned long data_len, const void *data); +void cchmac_final(const struct ccdigest_info *di, cchmac_ctx_t ctx, + unsigned char *mac); + +void cchmac(const struct ccdigest_info *di, unsigned long key_len, + const void *key, unsigned long data_len, const void *data, + unsigned char *mac); + +/* Test functions */ + +struct cchmac_test_input { + const struct ccdigest_info *di; + unsigned long key_len; + const void *key; + unsigned long data_len; + const void *data; + unsigned long mac_len; + const void *expected_mac; +}; + +int cchmac_test(const struct cchmac_test_input *input); +int cchmac_test_chunks(const struct cchmac_test_input *input, unsigned long chunk_size); + + +#endif /* _CORECRYPTO_CCHMAC_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccmd5.h b/EXTERNAL_HEADERS/corecrypto/ccmd5.h new file mode 100644 index 000000000..128522500 --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/ccmd5.h @@ -0,0 +1,27 @@ +/* + * ccmd5.h + * corecrypto + * + * Created by Fabrice Gautier on 12/3/10. + * Copyright 2010,2011 Apple Inc. All rights reserved. + * + */ + +#ifndef _CORECRYPTO_CCMD5_H_ +#define _CORECRYPTO_CCMD5_H_ + +#include + +#define CCMD5_BLOCK_SIZE 64 +#define CCMD5_OUTPUT_SIZE 16 +#define CCMD5_STATE_SIZE 16 + +extern const uint32_t ccmd5_initial_state[4]; + +/* Selector */ +const struct ccdigest_info *ccmd5_di(void); + +/* Implementations */ +extern const struct ccdigest_info ccmd5_ltc_di; + +#endif /* _CORECRYPTO_CCMD5_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccmode.h b/EXTERNAL_HEADERS/corecrypto/ccmode.h new file mode 100644 index 000000000..3224069e7 --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/ccmode.h @@ -0,0 +1,469 @@ +/* + * ccmode.h + * corecrypto + * + * Created by Michael Brouwer on 12/6/10. + * Copyright 2010,2011 Apple Inc. All rights reserved. + * + */ + +#ifndef _CORECRYPTO_CCMODE_H_ +#define _CORECRYPTO_CCMODE_H_ + +#include +#include + +/* ECB mode. */ + +/* Declare a ecb key named _name_. Pass the size field of a struct ccmode_ecb + for _size_. 
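+
+   Before the mode helpers, a one-shot HMAC sketch using the cchmac.h API
+   above (key/data/mac names are hypothetical):
+
+     unsigned char mac[16];                 // di->output_size bytes
+     cchmac(di, key_len, key, data_len, data, mac);
+
+   The incremental cchmac_init / cchmac_update / cchmac_final calls do the
+   same using a context declared with cchmac_di_decl(di, ctx).
+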
 */
+#define ccecb_ctx_decl(_size_, _name_) cc_ctx_decl(ccecb_ctx, _size_, _name_)
+#define ccecb_ctx_clear(_size_, _name_) cc_ctx_clear(ccecb_ctx, _size_, _name_)
+
+CC_INLINE size_t ccecb_context_size(const struct ccmode_ecb *mode)
+{
+    return mode->size;
+}
+
+CC_INLINE unsigned long ccecb_block_size(const struct ccmode_ecb *mode)
+{
+    return mode->block_size;
+}
+
+CC_INLINE void ccecb_init(const struct ccmode_ecb *mode, ccecb_ctx *ctx,
+                          unsigned long key_len, const void *key)
+{
+    mode->init(mode, ctx, key_len, key);
+}
+
+CC_INLINE void ccecb_update(const struct ccmode_ecb *mode, const ccecb_ctx *ctx,
+                            unsigned long in_len, const void *in, void *out)
+{
+    unsigned long numBlocks = (in_len / mode->block_size);
+    mode->ecb(ctx, numBlocks, in, out);
+}
+
+CC_INLINE void ccecb_one_shot(const struct ccmode_ecb *mode,
+    unsigned long key_len, const void *key, unsigned long in_len,
+    const void *in, void *out)
+{
+    unsigned long numBlocks = (in_len / mode->block_size);
+    ccecb_ctx_decl(mode->size, ctx);
+    mode->init(mode, ctx, key_len, key);
+    mode->ecb(ctx, numBlocks, in, out);
+    ccecb_ctx_clear(mode->size, ctx);
+}
+
+/* CBC mode. */
+
+/* The CBC interface changed due to rdar://11468135. This macro indicates to
+   clients which CBC API is implemented. Clients can use it to support older
+   versions of corecrypto at build time.
+ */
+#define __CC_HAS_FIX_FOR_11468135__ 1
+
+/* Declare a cbc key named _name_. Pass the size field of a struct ccmode_cbc
+   for _size_. */
+#define cccbc_ctx_decl(_size_, _name_) cc_ctx_decl(cccbc_ctx, _size_, _name_)
+#define cccbc_ctx_clear(_size_, _name_) cc_ctx_clear(cccbc_ctx, _size_, _name_)
+
+/* Declare a cbc iv tweak named _name_. Pass the block_size field of a struct
+   ccmode_cbc for _size_. */
+#define cccbc_iv_decl(_size_, _name_) cc_ctx_decl(cccbc_iv, _size_, _name_)
+#define cccbc_iv_clear(_size_, _name_) cc_ctx_clear(cccbc_iv, _size_, _name_)
+
+/* Actual symmetric algorithm implementation can provide you one of these.
+
+   Alternatively you can create a ccmode_cbc instance from any ccmode_ecb
+   cipher. To do so, statically initialize a struct ccmode_cbc using the
+   CCMODE_FACTORY_CBC_DECRYPT or CCMODE_FACTORY_CBC_ENCRYPT macros, or
+   dynamically initialize one with ccmode_factory_cbc_decrypt()
+   or ccmode_factory_cbc_encrypt().
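+
+   For example, a one-shot CBC sketch (assuming a 16-byte-block cipher mode
+   object cbc and whole-block input; names are hypothetical):
+
+     uint8_t key[16], iv[16], in[32], out[32];
+     cccbc_one_shot(cbc, sizeof(key), key, iv, 2, in, out); // 2 blocks
+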
*/ + +CC_INLINE size_t cccbc_context_size(const struct ccmode_cbc *mode) +{ + return mode->size; +} + +CC_INLINE unsigned long cccbc_block_size(const struct ccmode_cbc *mode) +{ + return mode->block_size; +} + +CC_INLINE void cccbc_init(const struct ccmode_cbc *mode, cccbc_ctx *ctx, + unsigned long key_len, const void *key) +{ + mode->init(mode, ctx, key_len, key); +} + +CC_INLINE void cccbc_set_iv(const struct ccmode_cbc *mode, cccbc_iv *iv_ctx, const void *iv) +{ + if(iv) + cc_copy(mode->block_size, iv_ctx, iv); + else + cc_zero(mode->block_size, iv_ctx); +} + +CC_INLINE void cccbc_update(const struct ccmode_cbc *mode, cccbc_ctx *ctx, cccbc_iv *iv, + unsigned long nblocks, const void *in, void *out) +{ + mode->cbc(ctx, iv, nblocks, in, out); +} + +CC_INLINE void cccbc_one_shot(const struct ccmode_cbc *mode, + unsigned long key_len, const void *key, const void *iv, unsigned long nblocks, + const void *in, void *out) +{ + cccbc_ctx_decl(mode->size, ctx); + cccbc_iv_decl(mode->block_size, iv_ctx); + mode->init(mode, ctx, key_len, key); + if(iv) + cccbc_set_iv (mode, iv_ctx, iv); + else + cc_zero(mode->block_size, iv_ctx); + mode->cbc(ctx, iv_ctx, nblocks, in, out); + cccbc_ctx_clear(mode->size, ctx); +} + +/* CFB mode. */ + +/* Declare a cfb key named _name_. Pass the size field of a struct ccmode_cfb + for _size_. */ +#define cccfb_ctx_decl(_size_, _name_) cc_ctx_decl(cccfb_ctx, _size_, _name_) +#define cccfb_ctx_clear(_size_, _name_) cc_ctx_clear(cccfb_ctx, _size_, _name_) + +CC_INLINE size_t cccfb_context_size(const struct ccmode_cfb *mode) +{ + return mode->size; +} + +CC_INLINE unsigned long cccfb_block_size(const struct ccmode_cfb *mode) +{ + return mode->block_size; +} + +CC_INLINE void cccfb_init(const struct ccmode_cfb *mode, cccfb_ctx *ctx, + unsigned long key_len, const void *key, const void *iv) +{ + mode->init(mode, ctx, key_len, key, iv); +} + +CC_INLINE void cccfb_update(const struct ccmode_cfb *mode, cccfb_ctx *ctx, + unsigned long in_len, const void *in, void *out) +{ + mode->cfb(ctx, in_len, in, out); +} + +CC_INLINE void cccfb_one_shot(const struct ccmode_cfb *mode, + unsigned long key_len, const void *key, const void *iv, + unsigned long in_len, const void *in, void *out) +{ + cccfb_ctx_decl(mode->size, ctx); + mode->init(mode, ctx, key_len, key, iv); + mode->cfb(ctx, in_len, in, out); + cccfb_ctx_clear(mode->size, ctx); +} + +/* CFB8 mode. */ + +/* Declare a cfb8 key named _name_. Pass the size field of a struct ccmode_cfb8 + for _size_. 
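+
+   The one-shot above can also be split across calls; the key schedule is
+   reused while the iv context carries the chaining state (a sketch under
+   the same assumptions as before):
+
+     cccbc_ctx_decl(cccbc_context_size(cbc), ctx);
+     cccbc_iv_decl(cccbc_block_size(cbc), iv_ctx);
+     cccbc_init(cbc, ctx, sizeof(key), key);
+     cccbc_set_iv(cbc, iv_ctx, iv);
+     cccbc_update(cbc, ctx, iv_ctx, 1, in, out);            // first block
+     cccbc_update(cbc, ctx, iv_ctx, 1, in + 16, out + 16);  // chains on
+     cccbc_ctx_clear(cccbc_context_size(cbc), ctx);
+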
*/ +#define cccfb8_ctx_decl(_size_, _name_) cc_ctx_decl(cccfb8_ctx, _size_, _name_) +#define cccfb8_ctx_clear(_size_, _name_) cc_ctx_clear(cccfb8_ctx, _size_, _name_) + +CC_INLINE size_t cccfb8_context_size(const struct ccmode_cfb8 *mode) +{ + return mode->size; +} + +CC_INLINE unsigned long cccfb8_block_size(const struct ccmode_cfb8 *mode) +{ + return mode->block_size; +} + +CC_INLINE void cccfb8_init(const struct ccmode_cfb8 *mode, cccfb8_ctx *ctx, + unsigned long key_len, const void *key, const void *iv) +{ + mode->init(mode, ctx, key_len, key, iv); +} + +CC_INLINE void cccfb8_update(const struct ccmode_cfb8 *mode, cccfb8_ctx *ctx, + unsigned long in_len, const void *in, void *out) +{ + mode->cfb8(ctx, in_len, in, out); +} + +CC_INLINE void cccfb8_one_shot(const struct ccmode_cfb8 *mode, + unsigned long key_len, const void *key, const void *iv, + unsigned long in_len, const void *in, void *out) +{ + cccfb8_ctx_decl(mode->size, ctx); + mode->init(mode, ctx, key_len, key, iv); + mode->cfb8(ctx, in_len, in, out); + cccfb8_ctx_clear(mode->size, ctx); +} + +/* CTR mode. */ + +/* Declare a ctr key named _name_. Pass the size field of a struct ccmode_ctr + for _size_. */ +#define ccctr_ctx_decl(_size_, _name_) cc_ctx_decl(ccctr_ctx, _size_, _name_) +#define ccctr_ctx_clear(_size_, _name_) cc_ctx_clear(ccctr_ctx, _size_, _name_) + +/* This is Integer Counter Mode: The IV is the initial value of the counter + that is incremented by 1 for each new block. Use the mode flags to select + if the IV/Counter is stored in big or little endian. */ + +CC_INLINE size_t ccctr_context_size(const struct ccmode_ctr *mode) +{ + return mode->size; +} + +CC_INLINE unsigned long ccctr_block_size(const struct ccmode_ctr *mode) +{ + return mode->block_size; +} + +CC_INLINE void ccctr_init(const struct ccmode_ctr *mode, ccctr_ctx *ctx, + unsigned long key_len, const void *key, const void *iv) +{ + mode->init(mode, ctx, key_len, key, iv); +} + +CC_INLINE void ccctr_update(const struct ccmode_ctr *mode, ccctr_ctx *ctx, + unsigned long in_len, const void *in, void *out) +{ + unsigned long numBlocks = (in_len / mode->block_size); + mode->ctr(ctx, numBlocks, in, out); +} + +CC_INLINE void ccctr_one_shot(const struct ccmode_ctr *mode, + unsigned long key_len, const void *key, const void *iv, + unsigned long in_len, const void *in, void *out) +{ + unsigned long numBlocks = (in_len / mode->block_size); + ccctr_ctx_decl(mode->size, ctx); + mode->init(mode, ctx, key_len, key, iv); + mode->ctr(ctx, numBlocks, in, out); + ccctr_ctx_clear(mode->size, ctx); +} + + +/* OFB mode. */ + +/* Declare a ofb key named _name_. Pass the size field of a struct ccmode_ofb + for _size_. 
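+
+   Note that ccctr_update and ccctr_one_shot derive the block count as
+   in_len / block_size, so any trailing partial block is silently dropped;
+   a sketch (hypothetical names, key and iv sized for the underlying
+   cipher):
+
+     ccctr_one_shot(ctr, sizeof(key), key, counter_iv, 64, in, out);
+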
*/ +#define ccofb_ctx_decl(_size_, _name_) cc_ctx_decl(ccofb_ctx, _size_, _name_) +#define ccofb_ctx_clear(_size_, _name_) cc_ctx_clear(ccofb_ctx, _size_, _name_) + +CC_INLINE size_t ccofb_context_size(const struct ccmode_ofb *mode) +{ + return mode->size; +} + +CC_INLINE unsigned long ccofb_block_size(const struct ccmode_ofb *mode) +{ + return mode->block_size; +} + +CC_INLINE void ccofb_init(const struct ccmode_ofb *mode, ccofb_ctx *ctx, + unsigned long key_len, const void *key, const void *iv) +{ + mode->init(mode, ctx, key_len, key, iv); +} + +CC_INLINE void ccofb_update(const struct ccmode_ofb *mode, ccofb_ctx *ctx, + unsigned long in_len, const void *in, void *out) +{ + mode->ofb(ctx, in_len, in, out); +} + +CC_INLINE void ccofb_one_shot(const struct ccmode_ofb *mode, + unsigned long key_len, const void *key, const void *iv, + unsigned long in_len, const void *in, void *out) +{ + ccofb_ctx_decl(mode->size, ctx); + mode->init(mode, ctx, key_len, key, iv); + mode->ofb(ctx, in_len, in, out); + ccofb_ctx_clear(mode->size, ctx); +} + +/* Authenticated cipher modes. */ + +/* XTS mode. */ + +/* Declare a xts key named _name_. Pass the size field of a struct ccmode_xts + for _size_. */ +#define ccxts_ctx_decl(_size_, _name_) cc_ctx_decl(ccxts_ctx, _size_, _name_) +#define ccxts_ctx_clear(_size_, _name_) cc_ctx_clear(ccxts_ctx, _size_, _name_) + +/* Declare a xts tweak named _name_. Pass the tweak_size field of a struct ccmode_xts + for _size_. */ +#define ccxts_tweak_decl(_size_, _name_) cc_ctx_decl(ccxts_tweak, _size_, _name_) +#define ccxts_tweak_clear(_size_, _name_) cc_ctx_clear(ccxts_tweak, _size_, _name_) + +/* Actual symmetric algorithm implementation can provide you one of these. + + Alternatively you can create a ccmode_xts instance from any ccmode_ecb + cipher. To do so, statically initialize a struct ccmode_xts using the + CCMODE_FACTORY_XTS_DECRYPT or CCMODE_FACTORY_XTS_ENCRYPT macros. Alternatively + you can dynamically initialize a struct ccmode_xts ccmode_factory_xts_decrypt() + or ccmode_factory_xts_encrypt(). */ + +/* NOTE that xts mode does not do cts padding. It's really an xex mode. + If you need cts padding use the ccpad_xts_encrypt and ccpad_xts_decrypt + functions. Also note that xts only works for ecb modes with a block_size + of 16. 
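+
+   For example, a per-sector sketch (assuming 512-byte sectors, i.e. 32
+   blocks of 16 bytes, with the sector number as the tweak; names are
+   hypothetical):
+
+     ccxts_ctx_decl(ccxts_context_size(xts), ctx);
+     ccxts_tweak_decl(xts->tweak_size, tweak);
+     ccxts_init(xts, ctx, sizeof(key), key, tweak_key);
+     ccxts_set_tweak(xts, ctx, tweak, sector_iv);
+     ccxts_update(xts, ctx, tweak, 32, sector_in, sector_out); // 32 blocks
+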
*/ + +CC_INLINE size_t ccxts_context_size(const struct ccmode_xts *mode) +{ + return mode->size; +} + +CC_INLINE unsigned long ccxts_block_size(const struct ccmode_xts *mode) +{ + return mode->block_size; +} + +CC_INLINE void ccxts_init(const struct ccmode_xts *mode, ccxts_ctx *ctx, + unsigned long key_len, const void *key, const void *tweak_key) +{ + mode->init(mode, ctx, key_len, key, tweak_key); +} + +CC_INLINE void ccxts_set_tweak(const struct ccmode_xts *mode, ccxts_ctx *ctx, ccxts_tweak *tweak, const void *iv) +{ + mode->set_tweak(ctx, tweak, iv); +} + +CC_INLINE void *ccxts_update(const struct ccmode_xts *mode, ccxts_ctx *ctx, + ccxts_tweak *tweak, unsigned long in_len, const void *in, void *out) +{ + return mode->xts(ctx, tweak, in_len, in, out); +} + +CC_INLINE void ccxts_one_shot(const struct ccmode_xts *mode, + unsigned long key_len, const void *key, const void *tweak_key, + const void* iv, + unsigned long in_len, const void *in, void *out) +{ + ccxts_ctx_decl(mode->size, ctx); + ccxts_tweak_decl(mode->tweak_size, tweak); + mode->init(mode, ctx, key_len, key, tweak_key); + mode->set_tweak(ctx, tweak, iv); + mode->xts(ctx, tweak, in_len, in, out); + ccxts_ctx_clear(mode->size, ctx); + ccxts_tweak_clear(mode->tweak_size, tweak); +} + +/* GCM mode. */ + +/* Declare a gcm key named _name_. Pass the size field of a struct ccmode_gcm + for _size_. */ +#define ccgcm_ctx_decl(_size_, _name_) cc_ctx_decl(ccgcm_ctx, _size_, _name_) +#define ccgcm_ctx_clear(_size_, _name_) cc_ctx_clear(ccgcm_ctx, _size_, _name_) + +CC_INLINE size_t ccgcm_context_size(const struct ccmode_gcm *mode) +{ + return mode->size; +} + +CC_INLINE unsigned long ccgcm_block_size(const struct ccmode_gcm *mode) +{ + return mode->block_size; +} + +CC_INLINE void ccgcm_init(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, + unsigned long key_len, const void *key) +{ + mode->init(mode, ctx, key_len, key); +} + +CC_INLINE void ccgcm_set_iv(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, size_t iv_size, const void *iv) +{ + mode->set_iv(ctx, iv_size, iv); +} + +CC_INLINE void ccgcm_gmac(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, + unsigned long nbytes, const void *in) +{ + mode->gmac(ctx, nbytes, in); +} + +CC_INLINE void ccgcm_update(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, + unsigned long nbytes, const void *in, void *out) +{ + mode->gcm(ctx, nbytes, in, out); +} + +CC_INLINE void ccgcm_finalize(const struct ccmode_gcm *mode, ccgcm_ctx *ctx, + size_t tag_size, void *tag) +{ + mode->finalize(ctx, tag_size, tag); +} + +CC_INLINE void ccgcm_reset(const struct ccmode_gcm *mode, ccgcm_ctx *ctx) +{ + mode->reset(ctx); +} + + +CC_INLINE void ccgcm_one_shot(const struct ccmode_gcm *mode, + unsigned long key_len, const void *key, + unsigned long iv_len, const void *iv, + unsigned long nbytes, const void *in, void *out, + unsigned long adata_len, const void* adata, + size_t tag_len, void *tag) +{ + ccgcm_ctx_decl(mode->size, ctx); + mode->init(mode, ctx, key_len, key); + mode->set_iv(ctx, iv_len, iv); + mode->gmac(ctx, adata_len, adata); + mode->gcm(ctx, nbytes, in, out); + mode->finalize(ctx, tag_len, tag); + ccgcm_ctx_clear(mode->size, ctx); +} + +/* OMAC mode. */ + + +/* Declare a omac key named _name_. Pass the size field of a struct ccmode_omac + for _size_. 
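+
+   For example, a one-shot AEAD sketch with the GCM helpers above (sizes
+   and names are the caller's choice / hypothetical):
+
+     uint8_t tag[16];
+     ccgcm_one_shot(gcm, sizeof(key), key, sizeof(nonce), nonce,
+                    pt_len, pt, ct, aad_len, aad, sizeof(tag), tag);
+
+   Decryption uses a decrypt-mode object with the same call shape; the
+   caller compares the computed tag against the received one.
+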
 */
+#define ccomac_ctx_decl(_size_, _name_) cc_ctx_decl(ccomac_ctx, _size_, _name_)
+#define ccomac_ctx_clear(_size_, _name_) cc_ctx_clear(ccomac_ctx, _size_, _name_)
+
+CC_INLINE size_t ccomac_context_size(const struct ccmode_omac *mode)
+{
+    return mode->size;
+}
+
+CC_INLINE unsigned long ccomac_block_size(const struct ccmode_omac *mode)
+{
+    return mode->block_size;
+}
+
+CC_INLINE void ccomac_init(const struct ccmode_omac *mode, ccomac_ctx *ctx,
+                           unsigned long tweak_len, unsigned long key_len, const void *key)
+{
+    return mode->init(mode, ctx, tweak_len, key_len, key);
+}
+
+CC_INLINE int ccomac_update(const struct ccmode_omac *mode, ccomac_ctx *ctx,
+                            unsigned long in_len, const void *tweak, const void *in, void *out)
+{
+    return mode->omac(ctx, in_len, tweak, in, out);
+}
+
+CC_INLINE int ccomac_one_shot(const struct ccmode_omac *mode,
+    unsigned long tweak_len, unsigned long key_len, const void *key,
+    const void *tweak, unsigned long in_len, const void *in, void *out)
+{
+    ccomac_ctx_decl(mode->size, ctx);
+    mode->init(mode, ctx, tweak_len, key_len, key);
+    int result = mode->omac(ctx, in_len, tweak, in, out);
+    ccomac_ctx_clear(mode->size, ctx);
+    return result;
+}
+
+
+#endif /* _CORECRYPTO_CCMODE_H_ */
diff --git a/EXTERNAL_HEADERS/corecrypto/ccmode_factory.h b/EXTERNAL_HEADERS/corecrypto/ccmode_factory.h
new file mode 100644
index 000000000..3233c9916
--- /dev/null
+++ b/EXTERNAL_HEADERS/corecrypto/ccmode_factory.h
@@ -0,0 +1,571 @@
+/*
+ *  ccmode_factory.h
+ *  corecrypto
+ *
+ *  Created by Fabrice Gautier on 1/21/11.
+ *  Copyright 2011 Apple, Inc. All rights reserved.
+ *
+ */
+
+#ifndef _CORECRYPTO_CCMODE_FACTORY_H_
+#define _CORECRYPTO_CCMODE_FACTORY_H_
+
+#include <corecrypto/ccmode.h> /* TODO: Remove dependency on this header. */
+#include <corecrypto/ccmode_impl.h>
+
+/* For CBC, direction of underlying ecb is the same as the cbc direction */
+#define CCMODE_CBC_FACTORY(_cipher_, _dir_)                                  \
+static struct ccmode_cbc cbc_##_cipher_##_##_dir_;                           \
+                                                                             \
+const struct ccmode_cbc *cc##_cipher_##_cbc_##_dir_##_mode(void)             \
+{                                                                            \
+    const struct ccmode_ecb *ecb=cc##_cipher_##_ecb_##_dir_##_mode();        \
+    ccmode_factory_cbc_##_dir_(&cbc_##_cipher_##_##_dir_, ecb);              \
+    return &cbc_##_cipher_##_##_dir_;                                        \
+}
+
+/* For CTR, only one direction, underlying ecb is always encrypt */
+#define CCMODE_CTR_FACTORY(_cipher_)                                         \
+static struct ccmode_ctr ctr_##_cipher_;                                     \
+                                                                             \
+const struct ccmode_ctr *cc##_cipher_##_ctr_crypt_mode(void)                 \
+{                                                                            \
+    const struct ccmode_ecb *ecb=cc##_cipher_##_ecb_encrypt_mode();          \
+    ccmode_factory_ctr_crypt(&ctr_##_cipher_, ecb);                          \
+    return &ctr_##_cipher_;                                                  \
+}
+
+/* OFB, same as CTR */
+#define CCMODE_OFB_FACTORY(_cipher_)                                         \
+static struct ccmode_ofb ofb_##_cipher_;                                     \
+                                                                             \
+const struct ccmode_ofb *cc##_cipher_##_ofb_crypt_mode(void)                 \
+{                                                                            \
+    const struct ccmode_ecb *ecb=cc##_cipher_##_ecb_encrypt_mode();          \
+    ccmode_factory_ofb_crypt(&ofb_##_cipher_, ecb);                          \
+    return &ofb_##_cipher_;                                                  \
+}
+
+
+/* For CFB, the underlying ecb operation is encrypt for both directions */
+#define CCMODE_CFB_FACTORY(_cipher_, _mode_, _dir_)                          \
+static struct ccmode_##_mode_ _mode_##_##_cipher_##_##_dir_;                 \
+                                                                             \
+const struct ccmode_##_mode_ *cc##_cipher_##_##_mode_##_##_dir_##_mode(void) \
+{                                                                            \
+    const struct ccmode_ecb *ecb=cc##_cipher_##_ecb_encrypt_mode();          \
+    ccmode_factory_##_mode_##_##_dir_(&_mode_##_##_cipher_##_##_dir_, ecb);  \
+    return &_mode_##_##_cipher_##_##_dir_;                                   \
+}
+
+/* For GCM, same as CFB */
+#define CCMODE_GCM_FACTORY(_cipher_, _dir_) CCMODE_CFB_FACTORY(_cipher_, gcm, _dir_)
+
+
+/* For XTS, you always need an ecb encrypt
*/ +#define CCMODE_XTS_FACTORY(_cipher_ , _dir_) \ +static struct ccmode_xts xts##_cipher_##_##_dir_; \ + \ +const struct ccmode_xts *cc##_cipher_##_xts_##_dir_##_mode(void) \ +{ \ + const struct ccmode_ecb *ecb=cc##_cipher_##_ecb_##_dir_##_mode(); \ + const struct ccmode_ecb *ecb_enc=cc##_cipher_##_ecb_encrypt_mode(); \ + \ + ccmode_factory_xts_##_dir_(&xts##_cipher_##_##_dir_, ecb, ecb_enc); \ + return &xts##_cipher_##_##_dir_; \ +} + +#if 0 + +/* example of how to make the selection function thread safe */ + +struct ccmode_cbc cc3des_cbc_mode_encrypt; +dispatch_once_t cc3des_mode_encrypt_init_once; + +void cc3des_mode_encrypt_init(void *ctx) { + struct ccmode_ecb *ecb = cc3des_ecb_encrypt_mode(); + ccmode_factory_cbc_encrypt(&cc3des_mode_encrypt, ecb); +} + +const struct ccmode_cbc *cc3des_cbc_encrypt_mode(void) { + dispatch_once_f(&cc3des_mode_encrypt_init_once, NULL, cc3des_mode_encrypt_init); + return &cc3des_mode_encrypt; +} + +struct ccmode_cbc cc3des_cbc_mode_encrypt = { + .n = CC3DES_LTC_ECB_ENCRYPT_N, + .init = ccmode_cbc_init, + .cbc = ccmode_cbc_encrypt, + .custom = &cc3des_ltc_ecb_encrypt +}; + +const struct ccmode_cbc *cc3des_cbc_encrypt_mode(void) { + return &cc3des_mode_encrypt; +} + +#endif + + + +void *ccmode_cbc_init(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, + unsigned long rawkey_len, const void *rawkey, + const void *iv); +void *ccmode_cbc_decrypt(cccbc_ctx *ctx, unsigned long nblocks, + const void *in, void *out); +void *ccmode_cbc_encrypt(cccbc_ctx *ctx, unsigned long nblocks, + const void *in, void *out); + +struct _ccmode_cbc_key { + const struct ccmode_ecb *ecb; + cc_unit u[]; +}; + +/* Use this to statically initialize a ccmode_cbc object for decryption. */ +#define CCMODE_FACTORY_CBC_DECRYPT(ECB) { \ +.size = ccn_sizeof_size(sizeof(struct _ccmode_cbc_key)) + ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \ +.block_size = (ECB)->block_size, \ +.init = ccmode_cbc_init, \ +.cbc = ccmode_cbc_decrypt, \ +.custom = (ECB) \ +} + +/* Use this to statically initialize a ccmode_cbc object for encryption. */ +#define CCMODE_FACTORY_CBC_ENCRYPT(ECB) { \ +.size = ccn_sizeof_size(sizeof(struct _ccmode_cbc_key)) + ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \ +.block_size = (ECB)->block_size, \ +.init = ccmode_cbc_init, \ +.cbc = ccmode_cbc_encrypt, \ +.custom = (ECB) \ +} + +/* Use these function to runtime initialize a ccmode_cbc decrypt object (for + example if it's part of a larger structure). Normally you would pass a + ecb decrypt mode implementation of some underlying algorithm as the ecb + parameter. */ +CC_INLINE +void ccmode_factory_cbc_decrypt(struct ccmode_cbc *cbc, + const struct ccmode_ecb *ecb) { + struct ccmode_cbc cbc_decrypt = CCMODE_FACTORY_CBC_DECRYPT(ecb); + *cbc = cbc_decrypt; +} + +/* Use these function to runtime initialize a ccmode_cbc encrypt object (for + example if it's part of a larger structure). Normally you would pass a + ecb encrypt mode implementation of some underlying algorithm as the ecb + parameter. 
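+
+   For example (a sketch; the ecb provider name is hypothetical):
+
+     static struct ccmode_cbc my_cbc;
+     ccmode_factory_cbc_encrypt(&my_cbc, my_aes_ecb_encrypt_mode());
+     // my_cbc can now be passed to the cccbc_* helpers in ccmode.h
+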
*/ +CC_INLINE +void ccmode_factory_cbc_encrypt(struct ccmode_cbc *cbc, + const struct ccmode_ecb *ecb) { + struct ccmode_cbc cbc_encrypt = CCMODE_FACTORY_CBC_ENCRYPT(ecb); + *cbc = cbc_encrypt; +} + + +void ccmode_cfb_init(const struct ccmode_cfb *cfb, cccfb_ctx *ctx, + unsigned long rawkey_len, const void *rawkey, + const void *iv); +void ccmode_cfb_decrypt(cccfb_ctx *ctx, unsigned long nblocks, + const void *in, void *out); +void ccmode_cfb_encrypt(cccfb_ctx *ctx, unsigned long nblocks, + const void *in, void *out); + +struct _ccmode_cfb_key { + const struct ccmode_ecb *ecb; + size_t pad_len; + cc_unit u[]; +}; + +/* Use this to statically initialize a ccmode_cfb object for decryption. */ +#define CCMODE_FACTORY_CFB_DECRYPT(ECB) { \ +.size = ccn_sizeof_size(sizeof(struct _ccmode_cfb_key)) + 2 * ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \ +.block_size = 1, \ +.init = ccmode_cfb_init, \ +.cfb = ccmode_cfb_decrypt, \ +.custom = (ECB) \ +} + +/* Use this to statically initialize a ccmode_cfb object for encryption. */ +#define CCMODE_FACTORY_CFB_ENCRYPT(ECB) { \ +.size = ccn_sizeof_size(sizeof(struct _ccmode_cfb_key)) + 2 * ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \ +.block_size = 1, \ +.init = ccmode_cfb_init, \ +.cfb = ccmode_cfb_encrypt, \ +.custom = (ECB) \ +} + +/* Use these function to runtime initialize a ccmode_cfb decrypt object (for + example if it's part of a larger structure). Normally you would pass a + ecb encrypt mode implementation of some underlying algorithm as the ecb + parameter. */ +CC_INLINE +void ccmode_factory_cfb_decrypt(struct ccmode_cfb *cfb, + const struct ccmode_ecb *ecb) { + struct ccmode_cfb cfb_decrypt = CCMODE_FACTORY_CFB_DECRYPT(ecb); + *cfb = cfb_decrypt; +} + +/* Use these function to runtime initialize a ccmode_cfb encrypt object (for + example if it's part of a larger structure). Normally you would pass a + ecb encrypt mode implementation of some underlying algorithm as the ecb + parameter. */ +CC_INLINE +void ccmode_factory_cfb_encrypt(struct ccmode_cfb *cfb, + const struct ccmode_ecb *ecb) { + struct ccmode_cfb cfb_encrypt = CCMODE_FACTORY_CFB_ENCRYPT(ecb); + *cfb = cfb_encrypt; +} + + +void ccmode_cfb8_init(const struct ccmode_cfb8 *cfb8, cccfb8_ctx *ctx, + unsigned long rawkey_len, const void *rawkey, + const void *iv); +void ccmode_cfb8_decrypt(cccfb8_ctx *ctx, unsigned long nbytes, + const void *in, void *out); +void ccmode_cfb8_encrypt(cccfb8_ctx *ctx, unsigned long nbytes, + const void *in, void *out); + +struct _ccmode_cfb8_key { + const struct ccmode_ecb *ecb; + cc_unit u[]; +}; + +/* Use this to statically initialize a ccmode_cfb8 object for decryption. */ +#define CCMODE_FACTORY_CFB8_DECRYPT(ECB) { \ +.size = ccn_sizeof_size(sizeof(struct _ccmode_cfb8_key)) + 2 * ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \ +.block_size = 1, \ +.init = ccmode_cfb8_init, \ +.cfb8 = ccmode_cfb8_decrypt, \ +.custom = (ECB) \ +} + +/* Use this to statically initialize a ccmode_cfb8 object for encryption. */ +#define CCMODE_FACTORY_CFB8_ENCRYPT(ECB) { \ +.size = ccn_sizeof_size(sizeof(struct _ccmode_cfb8_key)) + 2 * ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \ +.block_size = 1, \ +.init = ccmode_cfb8_init, \ +.cfb8 = ccmode_cfb8_encrypt, \ +.custom = (ECB) \ +} + +/* Use these function to runtime initialize a ccmode_cfb8 decrypt object (for + example if it's part of a larger structure). 
Normally you would pass a + ecb decrypt mode implementation of some underlying algorithm as the ecb + parameter. */ +CC_INLINE +void ccmode_factory_cfb8_decrypt(struct ccmode_cfb8 *cfb8, + const struct ccmode_ecb *ecb) { + struct ccmode_cfb8 cfb8_decrypt = CCMODE_FACTORY_CFB8_DECRYPT(ecb); + *cfb8 = cfb8_decrypt; +} + +/* Use these function to runtime initialize a ccmode_cfb8 encrypt object (for + example if it's part of a larger structure). Normally you would pass a + ecb encrypt mode implementation of some underlying algorithm as the ecb + parameter. */ +CC_INLINE +void ccmode_factory_cfb8_encrypt(struct ccmode_cfb8 *cfb8, + const struct ccmode_ecb *ecb) { + struct ccmode_cfb8 cfb8_encrypt = CCMODE_FACTORY_CFB8_ENCRYPT(ecb); + *cfb8 = cfb8_encrypt; +} + +void ccmode_ctr_init(const struct ccmode_ctr *ctr, ccctr_ctx *ctx, + unsigned long rawkey_len, const void *rawkey, + const void *iv); +void ccmode_ctr_crypt(ccctr_ctx *ctx, unsigned long nblocks, + const void *in, void *out); + +struct _ccmode_ctr_key { + const struct ccmode_ecb *ecb; + size_t pad_len; + cc_unit u[]; +}; + +/* Use this to statically initialize a ccmode_ctr object for decryption. */ +#define CCMODE_FACTORY_CTR_CRYPT(ECB_ENCRYPT) { \ +.size = ccn_sizeof_size(sizeof(struct _ccmode_ctr_key)) + 2 * ccn_sizeof_size((ECB_ENCRYPT)->block_size) + ccn_sizeof_size((ECB_ENCRYPT)->size), \ +.block_size = 1, \ +.init = ccmode_ctr_init, \ +.ctr = ccmode_ctr_crypt, \ +.custom = (ECB_ENCRYPT) \ +} + +/* Use these function to runtime initialize a ccmode_ctr decrypt object (for + example if it's part of a larger structure). Normally you would pass a + ecb encrypt mode implementation of some underlying algorithm as the ecb + parameter. */ +CC_INLINE +void ccmode_factory_ctr_crypt(struct ccmode_ctr *ctr, + const struct ccmode_ecb *ecb) { + struct ccmode_ctr ctr_crypt = CCMODE_FACTORY_CTR_CRYPT(ecb); + *ctr = ctr_crypt; +} + +/* GCM FEATURES. */ +//#define CCMODE_GCM_TABLES 1 +#define CCMODE_GCM_FAST 1 + +#ifdef CCMODE_GCM_FAST +#define CCMODE_GCM_FAST_TYPE cc_unit +#endif + +#ifdef CCMODE_GCM_TABLES + +//#define CCMODE_GCM_TABLES_SSE2 1 + +extern const unsigned char gcm_shift_table[256*2]; +#endif + +/* Create a gcm key from a gcm mode object. + key must point to at least sizeof(CCMODE_GCM_KEY(ecb)) bytes of free + storage. */ +void ccmode_gcm_init(const struct ccmode_gcm *gcm, ccgcm_ctx *ctx, + unsigned long rawkey_len, const void *rawkey); +void ccmode_gcm_set_iv(ccgcm_ctx *ctx, size_t iv_size, const void *iv); +void ccmode_gcm_gmac(ccgcm_ctx *ctx, unsigned long nbytes, const void *in); +void ccmode_gcm_decrypt(ccgcm_ctx *ctx, unsigned long nbytes, const void *in, + void *out); +void ccmode_gcm_encrypt(ccgcm_ctx *ctx, unsigned long nbytes, const void *in, + void *out); +void ccmode_gcm_finalize(ccgcm_ctx *key, size_t tag_size, void *tag); +void ccmode_gcm_reset(ccgcm_ctx *key); + +struct _ccmode_gcm_key { + // 5 blocks of temp space. + unsigned char H[16]; /* multiplier */ + unsigned char X[16]; /* accumulator */ + unsigned char Y[16]; /* counter */ + unsigned char Y_0[16]; /* initial counter */ + unsigned char buf[16]; /* buffer for stuff */ + + const struct ccmode_ecb *ecb; + uint32_t ivmode; /* Which mode is the IV in? */ + uint32_t mode; /* mode the GCM code is in */ + uint32_t buflen; /* length of data in buf */ + + uint64_t totlen; /* 64-bit counter used for IV and AAD */ + uint64_t pttotlen; /* 64-bit counter for the PT */ + +#ifdef CCMODE_GCM_TABLES + /* TODO: Make table based gcm a separate mode object. 
*/ + unsigned char PC[16][256][16] /* 16 tables of 8x128 */ +#ifdef CCMODE_GCM_TABLES_SSE2 + __attribute__ ((aligned (16))) +#endif /* CCMODE_GCM_TABLES_SSE2 */ + ; +#endif /* CCMODE_GCM_TABLES */ + + cc_unit u[]; +}; + +/* Use this to statically initialize a ccmode_gcm object for decryption. */ +#define CCMODE_FACTORY_GCM_DECRYPT(ECB_ENCRYPT) { \ +.size = ccn_sizeof_size(sizeof(struct _ccmode_gcm_key)) + 5 * ccn_sizeof_size((ECB_ENCRYPT)->block_size) + ccn_sizeof_size((ECB_ENCRYPT)->size), \ +.block_size = 1, \ +.init = ccmode_gcm_init, \ +.set_iv = ccmode_gcm_set_iv, \ +.gmac = ccmode_gcm_gmac, \ +.gcm = ccmode_gcm_decrypt, \ +.finalize = ccmode_gcm_finalize, \ +.reset = ccmode_gcm_reset, \ +.custom = (ECB_ENCRYPT) \ +} + +/* Use this to statically initialize a ccmode_gcm object for encryption. */ +#define CCMODE_FACTORY_GCM_ENCRYPT(ECB_ENCRYPT) { \ +.size = ccn_sizeof_size(sizeof(struct _ccmode_gcm_key)) + 5 * ccn_sizeof_size((ECB_ENCRYPT)->block_size) + ccn_sizeof_size((ECB_ENCRYPT)->size), \ +.block_size = 1, \ +.init = ccmode_gcm_init, \ +.set_iv = ccmode_gcm_set_iv, \ +.gmac = ccmode_gcm_gmac, \ +.gcm = ccmode_gcm_encrypt, \ +.finalize = ccmode_gcm_finalize, \ +.reset = ccmode_gcm_reset, \ +.custom = (ECB_ENCRYPT) \ +} + +/* Use these function to runtime initialize a ccmode_gcm decrypt object (for + example if it's part of a larger structure). For GCM you always pass a + ecb encrypt mode implementation of some underlying algorithm as the ecb + parameter. */ +CC_INLINE +void ccmode_factory_gcm_decrypt(struct ccmode_gcm *gcm, + const struct ccmode_ecb *ecb_encrypt) { + struct ccmode_gcm gcm_decrypt = CCMODE_FACTORY_GCM_DECRYPT(ecb_encrypt); + *gcm = gcm_decrypt; +} + +/* Use these function to runtime initialize a ccmode_gcm encrypt object (for + example if it's part of a larger structure). For GCM you always pass a + ecb encrypt mode implementation of some underlying algorithm as the ecb + parameter. */ +CC_INLINE +void ccmode_factory_gcm_encrypt(struct ccmode_gcm *gcm, + const struct ccmode_ecb *ecb_encrypt) { + struct ccmode_gcm gcm_encrypt = CCMODE_FACTORY_GCM_ENCRYPT(ecb_encrypt); + *gcm = gcm_encrypt; +} + + +void ccmode_ofb_init(const struct ccmode_ofb *ofb, ccofb_ctx *ctx, + unsigned long rawkey_len, const void *rawkey, + const void *iv); +void ccmode_ofb_crypt(ccofb_ctx *ctx, unsigned long nblocks, + const void *in, void *out); + +struct _ccmode_ofb_key { + const struct ccmode_ecb *ecb; + size_t pad_len; + cc_unit u[]; +}; + +/* Use this to statically initialize a ccmode_ofb object. */ +#define CCMODE_FACTORY_OFB_CRYPT(ECB) { \ +.size = ccn_sizeof_size(sizeof(struct _ccmode_ofb_key)) + ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \ +.block_size = 1, \ +.init = ccmode_ofb_init, \ +.ofb = ccmode_ofb_crypt, \ +.custom = (ECB) \ +} + +/* Use these function to runtime initialize a ccmode_ofb encrypt object (for + example if it's part of a larger structure). Normally you would pass a + ecb encrypt mode implementation of some underlying algorithm as the ecb + parameter. */ +CC_INLINE +void ccmode_factory_ofb_crypt(struct ccmode_ofb *ofb, + const struct ccmode_ecb *ecb) { + struct ccmode_ofb ofb_crypt = CCMODE_FACTORY_OFB_CRYPT(ecb); + *ofb = ofb_crypt; +} + + +int ccmode_omac_decrypt(ccomac_ctx *ctx, unsigned long nblocks, + const void *tweak, const void *in, void *out); +int ccmode_omac_encrypt(ccomac_ctx *ctx, unsigned long nblocks, + const void *tweak, const void *in, void *out); + +/* Create a omac key from a omac mode object. 
The tweak_len here
+   determines how long the tweak is in bytes, for each subsequent call to
+   ccmode_omac->omac().
+   key must point to at least sizeof(CCMODE_OMAC_KEY(ecb)) bytes of free
+   storage. */
+void ccmode_omac_init(const struct ccmode_omac *omac, ccomac_ctx *ctx,
+                      cc_size tweak_len, unsigned long rawkey_len,
+                      const void *rawkey);
+
+struct _ccmode_omac_key {
+    const struct ccmode_ecb *ecb;
+    size_t tweak_len;
+    cc_unit u[];
+};
+
+/* Use this to statically initialize a ccmode_omac object for decryption. */
+#define CCMODE_FACTORY_OMAC_DECRYPT(ECB) { \
+.size = ccn_sizeof_size(sizeof(struct _ccmode_omac_key)) + 2 * ccn_sizeof_size((ECB)->size), \
+.block_size = (ECB)->block_size, \
+.init = ccmode_omac_init, \
+.omac = ccmode_omac_decrypt, \
+.custom = (ECB) \
+}
+
+/* Use this to statically initialize a ccmode_omac object for encryption. */
+#define CCMODE_FACTORY_OMAC_ENCRYPT(ECB) { \
+.size = ccn_sizeof_size(sizeof(struct _ccmode_omac_key)) + 2 * ccn_sizeof_size((ECB)->size), \
+.block_size = (ECB)->block_size, \
+.init = ccmode_omac_init, \
+.omac = ccmode_omac_encrypt, \
+.custom = (ECB) \
+}
+
+/* Use this function to runtime initialize a ccmode_omac decrypt object (for
+   example if it's part of a larger structure). Normally you would pass an
+   ecb decrypt mode implementation of some underlying algorithm as the ecb
+   parameter. */
+CC_INLINE
+void ccmode_factory_omac_decrypt(struct ccmode_omac *omac,
+                                 const struct ccmode_ecb *ecb) {
+    struct ccmode_omac omac_decrypt = CCMODE_FACTORY_OMAC_DECRYPT(ecb);
+    *omac = omac_decrypt;
+}
+
+/* Use this function to runtime initialize a ccmode_omac encrypt object (for
+   example if it's part of a larger structure). Normally you would pass an
+   ecb encrypt mode implementation of some underlying algorithm as the ecb
+   parameter. */
+CC_INLINE
+void ccmode_factory_omac_encrypt(struct ccmode_omac *omac,
+                                 const struct ccmode_ecb *ecb) {
+    struct ccmode_omac omac_encrypt = CCMODE_FACTORY_OMAC_ENCRYPT(ecb);
+    *omac = omac_encrypt;
+}
+
+
+/* Function prototypes used by the macros below, do not call directly. */
+void ccmode_xts_init(const struct ccmode_xts *xts, ccxts_ctx *ctx,
+                     unsigned long key_len, const void *data_key,
+                     const void *tweak_key);
+void *ccmode_xts_crypt(ccxts_ctx *ctx, unsigned long nblocks,
+                       const void *in, void *out);
+void ccmode_xts_set_tweak(ccxts_ctx *ctx, const void *tweak);
+
+
+struct _ccmode_xts_key {
+    const struct ccmode_ecb *ecb;
+    const struct ccmode_ecb *ecb_encrypt;
+    // FIPS requires that no more than 2^20 AES blocks be processed for any
+    // given key, tweak key, and tweak combination, so the blocks_processed
+    // field in the context accumulates the number of blocks processed and
+    // the encrypt/decrypt will fail if that limit is exceeded. The counter
+    // is reset to 0 when set_tweak is called.
+    unsigned long blocks_processed;
+    cc_unit u[];
+};
+
+/* Use this to statically initialize a ccmode_xts object for decryption. */
+#define CCMODE_FACTORY_XTS_DECRYPT(ECB, ECB_ENCRYPT) { \
+.size = ccn_sizeof_size(sizeof(struct _ccmode_xts_key)) + 2 * ccn_sizeof_size((ECB)->size) + ccn_sizeof_size(16), \
+.block_size = 16, \
+.init = ccmode_xts_init, \
+.set_tweak = ccmode_xts_set_tweak, \
+.xts = ccmode_xts_crypt, \
+.custom = (ECB), \
+.custom1 = (ECB_ENCRYPT) \
+}
+
+/* Use this to statically initialize a ccmode_xts object for encryption.
*/ +#define CCMODE_FACTORY_XTS_ENCRYPT(ECB, ECB_ENCRYPT) { \ +.size = ccn_sizeof_size(sizeof(struct _ccmode_xts_key)) + 2 * ccn_sizeof_size((ECB)->size) + ccn_sizeof_size(16), \ +.block_size = 16, \ +.init = ccmode_xts_init, \ +.set_tweak = ccmode_xts_set_tweak, \ +.xts = ccmode_xts_crypt, \ +.custom = (ECB), \ +.custom1 = (ECB_ENCRYPT) \ +} + +/* Use these function to runtime initialize a ccmode_xts decrypt object (for + example if it's part of a larger structure). Normally you would pass a + ecb decrypt mode implementation of some underlying algorithm as the ecb + parameter. */ +CC_INLINE +void ccmode_factory_xts_decrypt(struct ccmode_xts *xts, + const struct ccmode_ecb *ecb, + const struct ccmode_ecb *ecb_encrypt) { + struct ccmode_xts xts_decrypt = CCMODE_FACTORY_XTS_DECRYPT(ecb, ecb_encrypt); + *xts = xts_decrypt; +} + +/* Use these function to runtime initialize a ccmode_xts encrypt object (for + example if it's part of a larger structure). Normally you would pass a + ecb encrypt mode implementation of some underlying algorithm as the ecb + parameter. */ +CC_INLINE +void ccmode_factory_xts_encrypt(struct ccmode_xts *xts, + const struct ccmode_ecb *ecb, + const struct ccmode_ecb *ecb_encrypt) { + struct ccmode_xts xts_encrypt = CCMODE_FACTORY_XTS_ENCRYPT(ecb, ecb_encrypt); + *xts = xts_encrypt; +} + +#endif /* _CORECRYPTO_CCMODE_FACTORY_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccmode_impl.h b/EXTERNAL_HEADERS/corecrypto/ccmode_impl.h new file mode 100644 index 000000000..3e35f548e --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/ccmode_impl.h @@ -0,0 +1,166 @@ +/* + * ccmode_impl.h + * corecrypto + * + * Created by James Murphy on 12/9/11. + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + */ + +#ifndef _CORECRYPTO_CCMODE_IMPL_H_ +#define _CORECRYPTO_CCMODE_IMPL_H_ + +#include + +/* ECB mode. */ +cc_aligned_struct(16) ccecb_ctx; + + +/* Actual symmetric algorithm implementation should provide you one of these. */ +struct ccmode_ecb { + size_t size; /* first argument to ccecb_ctx_decl(). */ + unsigned long block_size; + void (*init)(const struct ccmode_ecb *ecb, ccecb_ctx *ctx, + unsigned long key_len, const void *key); + void (*ecb)(const ccecb_ctx *ctx, unsigned long nblocks, const void *in, + void *out); +}; + +/* CBC mode. */ +cc_aligned_struct(16) cccbc_ctx; +cc_aligned_struct(16) cccbc_iv; + +struct ccmode_cbc { + size_t size; /* first argument to cccbc_ctx_decl(). */ + unsigned long block_size; + void (*init)(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, + unsigned long key_len, const void *key); + /* cbc encrypt or decrypt nblocks from in to out, iv will be used and updated. */ + void (*cbc)(const cccbc_ctx *ctx, cccbc_iv *iv, unsigned long nblocks, + const void *in, void *out); + const void *custom; +}; + +/* CFB mode. */ +cc_aligned_struct(16) cccfb_ctx; + +struct ccmode_cfb { + size_t size; /* first argument to cccfb_ctx_decl(). */ + unsigned long block_size; + void (*init)(const struct ccmode_cfb *cfb, cccfb_ctx *ctx, + unsigned long key_len, const void *key, + const void *iv); + void (*cfb)(cccfb_ctx *ctx, unsigned long nblocks, + const void *in, void *out); + const void *custom; +}; + +/* CFB8 mode. */ + +cc_aligned_struct(16) cccfb8_ctx; + +struct ccmode_cfb8 { + size_t size; /* first argument to cccfb8_ctx_decl(). 
*/ + unsigned long block_size; + void (*init)(const struct ccmode_cfb8 *cfb8, cccfb8_ctx *ctx, + unsigned long key_len, const void *key, + const void *iv); + void (*cfb8)(cccfb8_ctx *ctx, unsigned long nbytes, + const void *in, void *out); + const void *custom; +}; + +/* CTR mode. */ + +cc_aligned_struct(16) ccctr_ctx; + +struct ccmode_ctr { + size_t size; /* first argument to ccctr_ctx_decl(). */ + unsigned long block_size; + void (*init)(const struct ccmode_ctr *ctr, ccctr_ctx *ctx, + unsigned long key_len, const void *key, + const void *iv); + void (*ctr)(ccctr_ctx *ctx, unsigned long nblocks, + const void *in, void *out); + const void *custom; +}; + +/* OFB mode. */ + +cc_aligned_struct(16) ccofb_ctx; + +struct ccmode_ofb { + size_t size; /* first argument to ccofb_ctx_decl(). */ + unsigned long block_size; + void (*init)(const struct ccmode_ofb *ofb, ccofb_ctx *ctx, + unsigned long key_len, const void *key, + const void *iv); + void (*ofb)(ccofb_ctx *ctx, unsigned long nblocks, + const void *in, void *out); + const void *custom; +}; + +/* XTS mode. */ + +cc_aligned_struct(16) ccxts_ctx; +cc_aligned_struct(16) ccxts_tweak; + +struct ccmode_xts { + size_t size; /* first argument to ccxts_ctx_decl(). */ + size_t tweak_size; /* first argument to ccxts_tweak_decl(). */ + unsigned long block_size; + + /* Create a xts key from a xts mode object. The tweak_len here + determines how long the tweak is in bytes, for each subsequent call to + ccmode_xts->xts(). + key must point to at least 'size' cc_units of free storage. + tweak_key must point to at least 'tweak_size' cc_units of free storage. */ + void (*init)(const struct ccmode_xts *xts, ccxts_ctx *ctx, + unsigned long key_len, const void *key, + const void *tweak_key); + + /* Set the tweak (sector number), the block within the sector zero. */ + void (*set_tweak)(const ccxts_ctx *ctx, ccxts_tweak *tweak, const void *iv); + + /* Encrypt blocks for a sector, clients must call set_tweak before calling + this function. Return a pointer to the tweak buffer */ + void *(*xts)(const ccxts_ctx *ctx, ccxts_tweak *tweak, unsigned long nblocks, + const void *in, void *out); + + const void *custom; + const void *custom1; +}; + +/* GCM mode. */ + +cc_aligned_struct(16) ccgcm_ctx; + +struct ccmode_gcm { + size_t size; /* first argument to ccgcm_ctx_decl(). */ + unsigned long block_size; + void (*init)(const struct ccmode_gcm *gcm, ccgcm_ctx *ctx, + unsigned long key_len, const void *key); + void (*set_iv)(ccgcm_ctx *ctx, size_t iv_size, const void *iv); + void (*gmac)(ccgcm_ctx *ctx, unsigned long nbytes, const void *in); // could just be gcm with NULL out + void (*gcm)(ccgcm_ctx *ctx, unsigned long nbytes, const void *in, void *out); + void (*finalize)(ccgcm_ctx *key, size_t tag_size, void *tag); + void (*reset)(ccgcm_ctx *ctx); + const void *custom; +}; + +/* OMAC mode. */ + +cc_aligned_struct(16) ccomac_ctx; + +struct ccmode_omac { + size_t size; /* first argument to ccomac_ctx_decl(). 
*/ + unsigned long block_size; + void (*init)(const struct ccmode_omac *omac, ccomac_ctx *ctx, + unsigned long tweak_len, unsigned long key_len, + const void *key); + int (*omac)(ccomac_ctx *ctx, unsigned long nblocks, + const void *tweak, const void *in, void *out); + const void *custom; +}; + +#endif /* _CORECRYPTO_CCMODE_IMPL_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccn.h b/EXTERNAL_HEADERS/corecrypto/ccn.h new file mode 100644 index 000000000..dd10e97de --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/ccn.h @@ -0,0 +1,636 @@ +/* + * ccn.h + * corecrypto + * + * Created by Michael Brouwer on 7/25/10. + * Copyright 2010,2011 Apple Inc. All rights reserved. + * + */ + +#ifndef _CORECRYPTO_CCN_H_ +#define _CORECRYPTO_CCN_H_ + +#include +#include /* TODO: Get rid of this include in this header. */ +#include + +typedef uint8_t cc_byte; +typedef size_t cc_size; + +#if CCN_UNIT_SIZE == 8 +typedef uint64_t cc_unit; // 64 bit unit +//typedef uint128_t cc_dunit; // 128 bit double width unit +#define CCN_LOG2_BITS_PER_UNIT 6 // 2^6 = 64 bits +#define CC_UNIT_C(x) UINT64_C(x) +#elif CCN_UNIT_SIZE == 4 +typedef uint32_t cc_unit; // 32 bit unit +typedef uint64_t cc_dunit; // 64 bit double width unit +#define CCN_LOG2_BITS_PER_UNIT 5 // 2^5 = 32 bits +#define CC_UNIT_C(x) UINT32_C(x) +#elif CCN_UNIT_SIZE == 2 +typedef uint16_t cc_unit; // 16 bit unit +typedef uint32_t cc_dunit; // 32 bit double width unit +#define CCN_LOG2_BITS_PER_UNIT 4 // 2^4 = 16 bits +#define CC_UNIT_C(x) UINT16_C(x) +#elif CCN_UNIT_SIZE == 1 +typedef uint8_t cc_unit; // 8 bit unit +typedef uint16_t cc_dunit; // 16 bit double width unit +#define CCN_LOG2_BITS_PER_UNIT 3 // 2^3 = 8 bits +#define CC_UNIT_C(x) UINT8_C(x) +#else +#error invalid CCN_UNIT_SIZE +#endif + +// All mp types have units in little endian unit order. +typedef cc_unit *ccn_t; // n unit long mp +typedef cc_unit *ccnp1_t; // n + 1 unit long mp +typedef cc_unit *cc2n_t; // 2 * n unit long mp +typedef cc_unit *cc2np2_t; // 2 * n + 2 unit long mp +typedef const cc_unit *ccn_in_t; // n unit long mp +typedef const cc_unit *ccnp1_in_t; // n + 1 unit long mp +typedef const cc_unit *cc2n_in_t; // 2 * n unit long mp +typedef const cc_unit *cc2np2_in_t; // 2 * n + 2 unit long mp + +#define CCN_UNIT_BITS (sizeof(cc_unit) * 8) +#define CCN_UNIT_MASK ((cc_unit)~0) + + +/* Conversions between n sizeof and bits */ + +/* Returns the sizeof a ccn vector of length _n_ units. */ +#define ccn_sizeof_n(_n_) (sizeof(cc_unit) * (_n_)) + +/* Returns the count (n) of a ccn vector that can represent _bits_. */ +#define ccn_nof(_bits_) (((_bits_) + CCN_UNIT_BITS - 1) / CCN_UNIT_BITS) + +/* Returns the sizeof a ccn vector that can represent _bits_. */ +#define ccn_sizeof(_bits_) (ccn_sizeof_n(ccn_nof(_bits_))) + +/* Returns the count (n) of a ccn vector that can represent _size_ bytes. */ +#define ccn_nof_size(_size_) (((_size_) + CCN_UNIT_SIZE - 1) / CCN_UNIT_SIZE) + +/* Return the max number of bits a ccn vector of _n_ units can hold. */ +#define ccn_bitsof_n(_n_) ((_n_) * CCN_UNIT_BITS) + +/* Return the max number of bits a ccn vector of _size_ bytes can hold. */ +#define ccn_bitsof_size(_size_) ((_size_) * 8) + +/* Return the size of a ccn of size bytes in bytes. */ +#define ccn_sizeof_size(_size_) ccn_sizeof_n(ccn_nof_size(_size_)) + +/* Returns the value of bit _k_ of _ccn_, both are only evaluated once. 
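+
+   A few worked values for the sizing macros, taking CCN_UNIT_SIZE == 8:
+
+     ccn_nof(521)     == 9    // ceil(521 / 64) units
+     ccn_sizeof(521)  == 72   // 9 units of 8 bytes
+     ccn_nof_size(20) == 3    // ceil(20 / 8) units
+     ccn_bitsof_n(9)  == 576
+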
*/ +#define ccn_bit(_ccn_, _k_) ({__typeof__ (_k_) __k = (_k_); \ + 1 & ((_ccn_)[__k / CCN_UNIT_BITS] >> (__k & (CCN_UNIT_BITS - 1)));}) + +#define ccn_set_bit(_ccn_, _k_, _v_) ({__typeof__ (_k_) __k = (_k_); \ + if (_v_) \ + (_ccn_)[__k/CCN_UNIT_BITS] |= CC_UNIT_C(1) << (__k & (CCN_UNIT_BITS - 1)); \ + else \ + (_ccn_)[__k/CCN_UNIT_BITS] &= ~(CC_UNIT_C(1) << (__k & (CCN_UNIT_BITS - 1))); \ + }) + +/* Macros for making ccn constants. You must use list of CCN64_C() instances + separated by commas, with an optional smaller sized CCN32_C, CCN16_C, or + CCN8_C() instance at the end of the list, when making macros to declare + larger sized constants. */ +#define CCN8_C(a0) CC_UNIT_C(0x##a0) + +#if CCN_UNIT_SIZE >= 2 +#define CCN16_C(a1,a0) CC_UNIT_C(0x##a1##a0) +#define ccn16_v(a0) (a0) +#elif CCN_UNIT_SIZE == 1 +#define CCN16_C(a1,a0) CCN8_C(a0),CCN8_C(a1) +#define ccn16_v(a0) (a0 & UINT8_C(0xff)),(a0 >> 8) +#endif + +#if CCN_UNIT_SIZE >= 4 +#define CCN32_C(a3,a2,a1,a0) CC_UNIT_C(0x##a3##a2##a1##a0) +#define ccn32_v(a0) (a0) +#else +#define CCN32_C(a3,a2,a1,a0) CCN16_C(a1,a0),CCN16_C(a3,a2) +#define ccn32_v(a0) ccn16_v(a0 & UINT16_C(0xffff)),ccn16_v(a0 >> 16) +#endif + +#if CCN_UNIT_SIZE == 8 +#define CCN64_C(a7,a6,a5,a4,a3,a2,a1,a0) CC_UNIT_C(0x##a7##a6##a5##a4##a3##a2##a1##a0) +#define CCN40_C(a4,a3,a2,a1,a0) CC_UNIT_C(0x##a4##a3##a2##a1##a0) +#define ccn64_v(a0) (a0) +//#define ccn64_32(a1,a0) ((a1 << 32) | a0) +//#define ccn_uint64(a,i) (a[i]) +#else +#define CCN64_C(a7,a6,a5,a4,a3,a2,a1,a0) CCN32_C(a3,a2,a1,a0),CCN32_C(a7,a6,a5,a4) +#define CCN40_C(a4,a3,a2,a1,a0) CCN32_C(a3,a2,a1,a0),CCN8_C(a4) +#define ccn64_v(a0) ccn32_v((uint64_t)a0 & UINT32_C(0xffffffff)),ccn32_v((uint64_t)a0 >> 32) +//#define ccn64_32(a1,a0) ccn32_v(a0),ccn32_v(a1) +//#define ccn_uint64(a,i) ((uint64_t)ccn_uint32(a, i << 1 + 1) << 32 | (uint64_t)ccn_uint32(a, i << 1)) +#endif + +/* Macro's for reading uint32_t and uint64_t from ccns, the index is in 32 or + 64 bit units respectively. */ +#if CCN_UNIT_SIZE == 8 +//#define ccn_uint16(a,i) ((i & 3) == 3 ? ((uint16_t)(a[i >> 2] >> 48)) : \ +// (i & 3) == 2 ? ((uint16_t)(a[i >> 2] >> 32) & UINT16_C(0xffff)) : \ +// (i & 3) == 1 ? ((uint16_t)(a[i >> 2] >> 16) & UINT16_C(0xffff)) : \ +// ((uint16_t)(a[i >> 1] & UINT16_C(0xffff)))) +//#define ccn_uint32(a,i) (i & 1 ? ((uint32_t)(a[i >> 1] >> 32)) : ((uint32_t)(a[i >> 1] & UINT32_C(0xffffffff)))) +#elif CCN_UNIT_SIZE == 4 +//#define ccn16_v(a0) (a0) +//#define ccn32_v(a0) (a0) +//#define ccn_uint16(a,i) (i & 1 ? ((uint16_t)(a[i >> 1] >> 16)) : ((uint16_t)(a[i >> 1] & UINT16_C(0xffff)))) +//#define ccn_uint32(a,i) (a[i]) +#elif CCN_UNIT_SIZE == 2 +//#define ccn16_v(a0) (a0) +//#define ccn32_v(a0,a1) (a1,a0) +//#define ccn_uint16(a,i) (a[i]) +//#define ccn_uint32(a,i) (((uint32_t)a[i << 1 + 1]) << 16 | (uint32_t)a[i << 1])) +#elif CCN_UNIT_SIZE == 1 +//#define ccn16_v(a0) (a0 & UINT8_C(0xff)),(a0 >> 8) +//#define ccn_uint16(a,i) ((uint16_t)((a[i << 1 + 1] << 8) | a[i << 1])) +//#define ccn_uint32(a,i) ((uint32_t)ccn_uint16(a, i << 1 + 1) << 16 | (uint32_t)ccn_uint16(a, i << 1)) +#endif + +/* Macro's for reading uint32_t and uint64_t from ccns, the index is in 32 or + 64 bit units respectively. 
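+
+   For example (a sketch with a made-up value): a 192-bit constant is
+   written one byte per argument, most significant first, and expands to
+   the right number of cc_units for the configured CCN_UNIT_SIZE:
+
+     static const cc_unit k[CCN192_N] = {
+         CCN192_C(01,23,45,67,89,ab,cd,ef,
+                  01,23,45,67,89,ab,cd,ef,
+                  01,23,45,67,89,ab,cd,ef)
+     };
+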
*/ +#if CCN_UNIT_SIZE == 8 + +#define ccn64_32(a1,a0) (((cc_unit)a1) << 32 | ((cc_unit)a0)) +#define ccn32_32(a0) a0 +#if __LITTLE_ENDIAN__ +#define ccn32_32_parse(p,i) (((uint32_t *)p)[i]) +#else +#define ccn32_32_parse(p,i) (((uint32_t *)p)[i^1]) +#endif +#define ccn32_32_null 0 + +#define ccn64_64(a0) a0 +#define ccn64_64_parse(p,i) p[i] +#define ccn64_64_null 0 + +#elif CCN_UNIT_SIZE == 4 + +#define ccn32_32(a0) a0 +#define ccn32_32_parse(p,i) p[i] +#define ccn32_32_null 0 +#define ccn64_32(a1,a0) ccn32_32(a0),ccn32_32(a1) + +#define ccn64_64(a1,a0) a0,a1 +#define ccn64_64_parse(p,i) p[1+(i<<1)],p[i<<1] +#define ccn64_64_null 0,0 + +#elif CCN_UNIT_SIZE == 2 + +#define ccn32_32(a1,a0) a0,a1 +#define ccn32_32_parse(p,i) p[1+(i<<1)],p[i<<1] +#define ccn32_32_null 0,0 +#define ccn64_32(a3,a2,a1,a0) ccn32_32(a1,a0),ccn32_32(a3,a2) + +#define ccn64_64(a3,a2,a1,a0) a0,a1,a2,a3 +#define ccn64_64_parse(p,i) p[3+(i<<2)],p[2+(i<<2)],p[1+(i<<2)],p[i<<2] +#define ccn64_64_null 0,0,0,0 + +#elif CCN_UNIT_SIZE == 1 + +#define ccn32_32(a3,a2,a1,a0) a0,a1,a2,a3 +#define ccn32_32_parse(p,i) p[3+(i<<2)],p[2+(i<<2)],p[1+(i<<2)],p[i<<2] +#define ccn32_32_null 0,0,0,0 +#define ccn64_32(a7,a6,a5,a4,a3,a2,a1,a0) ccn32_32(a3,a2,a1,a0),ccn32_32(a7,a6,a5,a4) + +#define ccn64_64(a7,a6,a5,a4,a3,a2,a1,a0) a0,a1,a2,a3,a4,a5,a6,a7 +#define ccn64_64_parse(p,i) p[7+(i<<3)],p[6+(i<<3)],p[5+(i<<3)],p[4+(i<<3)],p[3+(i<<3)],p[2+(i<<3)],p[1+(i<<3)],p[i<<3] +#define ccn64_64_null 0,0,0,0,0,0,0,0 + +#endif + + +/* Macros to construct fixed size ccn arrays from 64 or 32 bit quantities. */ +#define ccn192_64(a2,a1,a0) ccn64_64(a0),ccn64_64(a1),ccn64_64(a2) +#define ccn224_32(a6,a5,a4,a3,a2,a1,a0) ccn64_32(a1,a0),ccn64_32(a3,a2),ccn64_32(a5,a4),ccn32_32(a6) +#define ccn256_32(a7,a6,a5,a4,a3,a2,a1,a0) ccn64_32(a1,a0),ccn64_32(a3,a2),ccn64_32(a5,a4),ccn64_32(a7,a6) +#define ccn384_32(a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0) ccn64_32(a1,a0),ccn64_32(a3,a2),ccn64_32(a5,a4),ccn64_32(a7,a6),ccn64_32(a9,a8),ccn64_32(a11,a10) + + +#define CCN192_C(c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0) \ + CCN64_C(a7,a6,a5,a4,a3,a2,a1,a0),\ + CCN64_C(b7,b6,b5,b4,b3,b2,b1,b0),\ + CCN64_C(c7,c6,c5,c4,c3,c2,c1,c0) + +#define CCN200_C(d0,c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0) \ + CCN192_C(c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0),\ + CCN8_C(d0) + +#define CCN224_C(d3,d2,d1,d0,c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0) \ + CCN192_C(c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0),\ + CCN32_C(d3,d2,d1,d0) + +#define CCN232_C(d4,d3,d2,d1,d0,c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0) \ + CCN192_C(c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0),\ + CCN40_C(d4,d3,d2,d1,d0) + +#define CCN256_C(d7,d6,d5,d4,d3,d2,d1,d0,c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0) \ + CCN192_C(c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0),\ + CCN64_C(d7,d6,d5,d4,d3,d2,d1,d0) + +#define CCN264_C(e0,d7,d6,d5,d4,d3,d2,d1,d0,c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0) \ + CCN256_C(d7,d6,d5,d4,d3,d2,d1,d0,c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0),\ + CCN8_C(e0) + +#define CCN384_C(f7,f6,f5,f4,f3,f2,f1,f0,e7,e6,e5,e4,e3,e2,e1,e0,d7,d6,d5,d4,d3,d2,d1,d0,c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0) \ + 
CCN256_C(d7,d6,d5,d4,d3,d2,d1,d0,c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0),\ + CCN64_C(e7,e6,e5,e4,e3,e2,e1,e0),\ + CCN64_C(f7,f6,f5,f4,f3,f2,f1,f0) + +#define CCN392_C(g0,f7,f6,f5,f4,f3,f2,f1,f0,e7,e6,e5,e4,e3,e2,e1,e0,d7,d6,d5,d4,d3,d2,d1,d0,c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0) \ + CCN384_C(f7,f6,f5,f4,f3,f2,f1,f0,e7,e6,e5,e4,e3,e2,e1,e0,d7,d6,d5,d4,d3,d2,d1,d0,c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0),\ + CCN8_C(g0) + +#define CCN528_C(i1,i0,h7,h6,h5,h4,h3,h2,h1,h0,g7,g6,g5,g4,g3,g2,g1,g0,f7,f6,f5,f4,f3,f2,f1,f0,e7,e6,e5,e4,e3,e2,e1,e0,d7,d6,d5,d4,d3,d2,d1,d0,c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0) \ + CCN256_C(d7,d6,d5,d4,d3,d2,d1,d0,c7,c6,c5,c4,c3,c2,c1,c0,b7,b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0),\ + CCN256_C(h7,h6,h5,h4,h3,h2,h1,h0,g7,g6,g5,g4,g3,g2,g1,g0,f7,f6,f5,f4,f3,f2,f1,f0,e7,e6,e5,e4,e3,e2,e1,e0),\ + CCN16_C(i1,i0) + +#define CCN192_N ccn_nof(192) +#define CCN224_N ccn_nof(224) +#define CCN256_N ccn_nof(256) +#define CCN384_N ccn_nof(384) +#define CCN521_N ccn_nof(521) + +#if defined(_ARM_ARCH_6) || defined(_ARM_ARCH_7) +#if CCN_USE_BUILTIN_CLZ +CC_INLINE CC_CONST +cc_unit cc_clz(cc_unit data) +{ + return __builtin_clzl(data); +} +#else +CC_INLINE CC_CONST +cc_unit cc_clz(cc_unit data) +{ + __asm__ ("clz %0, %1\n" : "=l" (data) : "l" (data)); + return data; +} +#endif /* CCN_USE_BUILTIN_CLZ */ +#endif /* !defined(_ARM_ARCH_6) && !defined(_ARM_ARCH_7) */ + + +#if CCN_N_INLINE +/* Return the number of used units after stripping leading 0 units. */ +CC_INLINE CC_PURE CC_NONNULL2 +cc_size ccn_n(cc_size n, const cc_unit *s) { +#if 1 + while (n-- && s[n] == 0) {} + return n + 1; +#elif 0 + while (n && s[n - 1] == 0) { + n -= 1; + } + return n; +#else + if (n & 1) { + if (s[n - 1]) + return n; + n &= ~1; + } + if (n & 2) { + cc_unit a[2] = { s[n - 1], s[n - 2] }; + if (a[0]) + return n - 1; + if (a[1]) + return n - 2; + n &= ~2; + } + while (n) { + cc_unit a[4] = { s[n - 1], s[n - 2], s[n - 3], s[n - 4] }; + if (a[0]) + return n - 1; + if (a[1]) + return n - 2; + if (a[2]) + return n - 3; + if (a[3]) + return n - 4; + n -= 4; + } + return n; +#endif +} +#else +/* Return the number of used units after stripping leading 0 units. */ +CC_PURE CC_NONNULL2 +cc_size ccn_n(cc_size n, const cc_unit *s); +#endif + +/* s >> k -> r return bits shifted out of least significant word in bits [0, n> + { N bit, scalar -> N bit } N = n * sizeof(cc_unit) * 8 + the _multi version doesn't return the shifted bits, but does support multiple + word shifts. */ +CC_NONNULL((2,3)) +cc_unit ccn_shift_right(cc_size n, cc_unit *r, const cc_unit *s, size_t k); +CC_NONNULL((2,3)) +void ccn_shift_right_multi(cc_size n, cc_unit *r,const cc_unit *s, size_t k); + +/* s << k -> r return bits shifted out of most significant word in bits [0, n> + { N bit, scalar -> N bit } N = n * sizeof(cc_unit) * 8 + the _multi version doesn't return the shifted bits, but does support multiple + word shifts */ +CC_NONNULL((2,3)) +cc_unit ccn_shift_left(cc_size n, cc_unit *r, const cc_unit *s, size_t k); +CC_NONNULL((2,3)) +void ccn_shift_left_multi(cc_size n, cc_unit *r, const cc_unit *s, size_t k); + +/* s == 0 -> return 0 | s > 0 -> return index (starting at 1) of most + significant bit that is 1. 
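+ For example (an illustrative case, not from the original header): a
+ one-unit ccn holding the value 0x10 has a bit length of 5, and a ccn
+ that is zero has a bit length of 0.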
+ { N bit } N = n * sizeof(cc_unit) * 8 */ +CC_NONNULL2 +size_t ccn_bitlen(cc_size n, const cc_unit *s); + +/* Returns the number of bits which are zero before the first one bit + counting from least to most significant bit. */ +size_t ccn_trailing_zeros(cc_size n, const cc_unit *s); + +/* s == 0 -> return true | s != 0 -> return false + { N bit } N = n * sizeof(cc_unit) * 8 */ +#define ccn_is_zero(_n_, _s_) (!ccn_n(_n_, _s_)) + +/* s == 1 -> return true | s != 1 -> return false + { N bit } N = n * sizeof(cc_unit) * 8 */ +#define ccn_is_one(_n_, _s_) (ccn_n(_n_, _s_) == 1 && _s_[0] == 1) + +#if CCN_CMP_INLINE +CC_INLINE CC_PURE CC_NONNULL((2,3)) +int ccn_cmp(cc_size n, const cc_unit *s, const cc_unit *t) { + while (n) { + n--; + cc_unit si = s[n]; + cc_unit ti = t[n]; + if (si != ti) + return si > ti ? 1 : -1; + } + return n; +} +#else +/* s < t -> return - 1 | s == t -> return 0 | s > t -> return 1 + { N bit, N bit -> int } N = n * sizeof(cc_unit) * 8 */ +CC_PURE CC_NONNULL((2,3)) +int ccn_cmp(cc_size n, const cc_unit *s, const cc_unit *t); +#endif + +/* s < t -> return - 1 | s == t -> return 0 | s > t -> return 1 + { N bit, M bit -> int } N = ns * sizeof(cc_unit) * 8 M = nt * sizeof(cc_unit) * 8 */ +CC_INLINE +int ccn_cmpn(cc_size ns, const cc_unit *s, + cc_size nt, const cc_unit *t) { + if (ns > nt) { + return 1; + } else if (ns < nt) { + return -1; + } + return ccn_cmp(ns, s, t); +} + +/* s - t -> r return 1 iff t > s + { N bit, N bit -> N bit } N = n * sizeof(cc_unit) * 8 */ +CC_NONNULL((2,3,4)) +cc_unit ccn_sub(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t); + +/* s - v -> r return 1 iff v > s return 0 otherwise. + { N bit, sizeof(cc_unit) * 8 bit -> N bit } N = n * sizeof(cc_unit) * 8 */ +CC_NONNULL((2,3)) +cc_unit ccn_sub1(cc_size n, cc_unit *r, const cc_unit *s, cc_unit v); + +/* s - t -> r return 1 iff t > s + { N bit, NT bit -> N bit NT <= N} N = n * sizeof(cc_unit) * 8 */ +CC_INLINE +CC_NONNULL((2,3,5)) +cc_unit ccn_subn(cc_size n, cc_unit *r,const cc_unit *s, + cc_size nt, const cc_unit *t) { + return ccn_sub1(n - nt, r + nt, s + nt, ccn_sub(nt, r, s, t)); +} + + +/* s + t -> r return carry if result doesn't fit in n bits. + { N bit, N bit -> N bit } N = n * sizeof(cc_unit) * 8 */ +CC_NONNULL((2,3,4)) +cc_unit ccn_add(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t); + +/* s + v -> r return carry if result doesn't fit in n bits. 
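+ For example (illustrative, assuming 64-bit cc_units): with n == 1,
+ s[0] == 0xffffffffffffffff and v == 1, r[0] becomes 0 and the
+ returned carry is 1.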
+ { N bit, sizeof(cc_unit) * 8 bit -> N bit } N = n * sizeof(cc_unit) * 8 */
+CC_NONNULL((2,3))
+cc_unit ccn_add1(cc_size n, cc_unit *r, const cc_unit *s, cc_unit v);
+
+/* s + t -> r return carry if result doesn't fit in n bits
+ { N bit, NT bit -> N bit NT <= N} N = n * sizeof(cc_unit) * 8 */
+CC_INLINE
+CC_NONNULL((2,3,5))
+cc_unit ccn_addn(cc_size n, cc_unit *r, const cc_unit *s,
+ cc_size nt, const cc_unit *t) {
+ return ccn_add1(n - nt, r + nt, s + nt, ccn_add(nt, r, s, t));
+}
+
+CC_NONNULL((4,5))
+void ccn_divmod(cc_size n, cc_unit *q, cc_unit *r, const cc_unit *s, const cc_unit *t);
+
+
+CC_NONNULL((2,3,4))
+void ccn_lcm(cc_size n, cc_unit *r2n, const cc_unit *s, const cc_unit *t);
+
+
+/* s * t -> r
+ { n bit, n bit -> 2 * n bit } n = count * sizeof(cc_unit) * 8 */
+CC_NONNULL((2,3,4))
+void ccn_mul(cc_size n, cc_unit *r_2n, const cc_unit *s, const cc_unit *t);
+
+CC_NONNULL((2,3))
+cc_unit ccn_mul1(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit v);
+CC_NONNULL((2,3))
+cc_unit ccn_addmul1(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit v);
+
+#if 0
+/* a % d -> n
+ {2 * n bit, n bit -> n bit } n = count * sizeof(cc_unit) * 8 */
+CC_NONNULL((2,3,4))
+void ccn_mod(cc_size n, cc_unit *r, const cc_unit *a_2n, const cc_unit *d);
+#endif
+
+/* r = gcd(s, t).
+ N bit, N bit -> N bit */
+CC_NONNULL((2,3,4))
+void ccn_gcd(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t);
+
+/* r = gcd(s, t).
+ N bit, N bit -> O bit */
+CC_NONNULL((2,4,6))
+void ccn_gcdn(cc_size rn, cc_unit *r, cc_size sn, const cc_unit *s, cc_size tn, const cc_unit *t);
+
+/* r = (data, len) treated as a big endian byte array, return -1 if data
+ doesn't fit in r, return 0 otherwise. */
+CC_NONNULL((2,4))
+int ccn_read_uint(cc_size n, cc_unit *r, size_t data_size, const uint8_t *data);
+
+/* r = (data, len) treated as a big endian byte array, return -1 if data
+ doesn't fit in r, return 0 otherwise.
+ ccn_read_uint strips leading zeroes and doesn't care about sign. */
+#define ccn_read_int(n, r, data_size, data) ccn_read_uint(n, r, data_size, data)
+
+/* Return actual size in bytes needed to serialize s. */
+CC_PURE CC_NONNULL2
+size_t ccn_write_uint_size(cc_size n, const cc_unit *s);
+
+/* Serialize s to out.
+ First byte of byte stream is the m.s. byte of s,
+ regardless of the size of cc_unit.
+
+ No assumption is made about the alignment of out.
+
+ The out_size argument should be the value returned from ccn_write_uint_size,
+ and is also the exact number of bytes this function will write to out.
+ If out_size is less than the value returned by ccn_write_uint_size, only the
+ first out_size non-zero most significant octets of s will be written. */
+CC_NONNULL((2,4))
+void ccn_write_uint(cc_size n, const cc_unit *s, size_t out_size, void *out);
+
+
+CC_INLINE CC_NONNULL((2,4))
+cc_size ccn_write_uint_padded(cc_size n, const cc_unit* s, size_t out_size, uint8_t* to)
+{
+ size_t bytesInKey = ccn_write_uint_size(n, s);
+ cc_size offset = (out_size > bytesInKey) ? out_size - bytesInKey : 0;
+
+ cc_zero(offset, to);
+ ccn_write_uint(n, s, out_size - offset, to + offset);
+
+ return offset;
+}
+
+
+/* Return actual size in bytes needed to serialize s as int
+ (adding leading zero if high bit is set). */
+CC_PURE CC_NONNULL2
+size_t ccn_write_int_size(cc_size n, const cc_unit *s);
+
+/* Serialize s to out.
+ First byte of byte stream is the m.s. byte of s,
+ regardless of the size of cc_unit.
+
+ No assumption is made about the alignment of out.
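+ For example (an illustrative worked case, not from the original header):
+ a ccn holding the value 0x80 is written by ccn_write_int as the two
+ bytes 00 80, because ccn_write_int_size reserves a leading zero byte
+ whenever the top bit of the leading octet is set.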
+
+ The out_size argument should be the value returned from ccn_write_int_size,
+ and is also the exact number of bytes this function will write to out.
+ If out_size is less than the value returned by ccn_write_int_size, only the
+ first out_size non-zero most significant octets of s will be written. */
+CC_NONNULL((2,4))
+void ccn_write_int(cc_size n, const cc_unit *s, size_t out_size, void *out);
+
+
+/* s^2 -> r
+ { n bit -> 2 * n bit } */
+CC_INLINE CC_NONNULL((2,3))
+void ccn_sqr(cc_size n, cc_unit *r, const cc_unit *s) {
+ ccn_mul(n, r, s, s);
+}
+
+/* s -> r
+ { n bit -> n bit } */
+CC_NONNULL((2,3))
+void ccn_set(cc_size n, cc_unit *r, const cc_unit *s);
+
+CC_INLINE CC_NONNULL2
+void ccn_zero(cc_size n, cc_unit *r) {
+ CC_BZERO(r, ccn_sizeof_n(n));
+}
+
+/* Burn (zero fill or otherwise overwrite) n cc_units of stack space. */
+void ccn_burn_stack(cc_size n);
+
+CC_INLINE CC_NONNULL2
+void ccn_seti(cc_size n, cc_unit *r, cc_unit v) {
+ /* assert(n > 0); */
+ r[0] = v;
+ ccn_zero(n - 1, r + 1);
+}
+
+CC_INLINE CC_NONNULL((2,4))
+void ccn_setn(cc_size n, cc_unit *r, CC_UNUSED const cc_size s_size, const cc_unit *s) {
+ /* FIXME: assert not available in kernel.
+ assert(n > 0);
+ assert(s_size > 0);
+ assert(s_size <= n);
+ */
+ ccn_set(s_size, r, s);
+ ccn_zero(n - s_size, r + s_size);
+}
+
+#define CC_SWAP_HOST_BIG_64(x) \
+ ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
+ (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
+ (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
+ (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \
+ (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \
+ (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
+ (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
+ (((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
+#define CC_SWAP_HOST_BIG_32(x) \
+ ((((x) & 0xff000000) >> 24) | \
+ (((x) & 0x00ff0000) >> 8) | \
+ (((x) & 0x0000ff00) << 8) | \
+ (((x) & 0x000000ff) << 24) )
+#define CC_SWAP_HOST_BIG_16(x) \
+ ((((x) & 0xff00) >> 8) | \
+ (((x) & 0x00ff) << 8))
+
+/* This should probably move if we move ccn_swap out of line. */
+#if CCN_UNIT_SIZE == 8
+#define CC_UNIT_TO_BIG(x) CC_SWAP_HOST_BIG_64(x)
+#elif CCN_UNIT_SIZE == 4
+#define CC_UNIT_TO_BIG(x) CC_SWAP_HOST_BIG_32(x)
+#elif CCN_UNIT_SIZE == 2
+#define CC_UNIT_TO_BIG(x) CC_SWAP_HOST_BIG_16(x)
+#elif CCN_UNIT_SIZE == 1
+#define CC_UNIT_TO_BIG(x) (x)
+#else
+#error unsupported CCN_UNIT_SIZE
+#endif
+
+/* Swap units in r in place from cc_unit vector byte order to big endian byte order (or back). */
+CC_INLINE CC_NONNULL2
+void ccn_swap(cc_size n, cc_unit *r) {
+ cc_unit *e;
+ for (e = r + n - 1; r < e; ++r, --e) {
+ cc_unit t = CC_UNIT_TO_BIG(*r);
+ *r = CC_UNIT_TO_BIG(*e);
+ *e = t;
+ }
+ if (n & 1)
+ *r = CC_UNIT_TO_BIG(*r);
+}
+
+CC_INLINE CC_NONNULL2 CC_NONNULL3 CC_NONNULL4
+void ccn_xor(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t) {
+ while (n--) {
+ r[n] = s[n] ^ t[n];
+ }
+}
+
+/* Debugging */
+CC_NONNULL2
+void ccn_print(cc_size n, const cc_unit *s);
+CC_NONNULL3
+void ccn_lprint(cc_size n, const char *label, const cc_unit *s);
+
+/* Forward declaration so we don't depend on ccrng.h.
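+ A typical call sequence for ccn_random below might look like this
+ (an illustrative sketch only; ccrng_system_init is declared in
+ ccrng_system.h and is assumed here to return 0 on success):
+
+ struct ccrng_system_state rng;
+ cc_unit r[CCN256_N];
+ if (ccrng_system_init(&rng) == 0)
+ ccn_random(CCN256_N, r, (struct ccrng_state *)&rng);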
*/
+struct ccrng_state;
+
+#if 0
+CC_INLINE CC_NONNULL((2,3))
+int ccn_random(cc_size n, cc_unit *r, struct ccrng_state *rng) {
+ return rng->generate(rng, ccn_sizeof_n(n), (unsigned char *)r);
+}
+#else
+#define ccn_random(_n_,_r_,_ccrng_ctx_) \
+ ccrng_generate(_ccrng_ctx_, ccn_sizeof_n(_n_), (unsigned char *)_r_);
+#endif
+
+/* Make a ccn of size ccn_nof(nbits) units with up to nbits sized random value. */
+CC_NONNULL((2,3))
+int ccn_random_bits(cc_size nbits, cc_unit *r, struct ccrng_state *rng);
+
+#endif /* _CORECRYPTO_CCN_H_ */
diff --git a/EXTERNAL_HEADERS/corecrypto/ccpad.h b/EXTERNAL_HEADERS/corecrypto/ccpad.h
new file mode 100644
index 000000000..71789e0db
--- /dev/null
+++ b/EXTERNAL_HEADERS/corecrypto/ccpad.h
@@ -0,0 +1,65 @@
+/*
+ * ccpad.h
+ * corecrypto
+ *
+ * Created by Michael Brouwer on 12/6/10.
+ * Copyright 2010,2011 Apple Inc. All rights reserved.
+ *
+ */
+
+#ifndef _CORECRYPTO_CCPAD_H_
+#define _CORECRYPTO_CCPAD_H_
+
+#include <corecrypto/ccmode.h>
+
+/* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long and out is nbytes long. */
+void ccpad_cts_decrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx,
+ unsigned long nbytes, const void *in, void *out);
+
+/* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long and out is nbytes long. */
+void ccpad_cts_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx,
+ unsigned long nbytes, const void *in, void *out);
+
+/* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long and out is nbytes long. */
+void ccpad_cts1_decrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx,
+ unsigned long nbytes, const void *in, void *out);
+
+/* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long and out is nbytes long. */
+void ccpad_cts1_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx,
+ unsigned long nbytes, const void *in, void *out);
+/* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long and out is nbytes long. */
+void ccpad_cts2_decrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx,
+ unsigned long nbytes, const void *in, void *out);
+
+/* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long and out is nbytes long. */
+void ccpad_cts2_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx,
+ unsigned long nbytes, const void *in, void *out);
+/* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long and out is nbytes long. */
+void ccpad_cts3_decrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx,
+ unsigned long nbytes, const void *in, void *out);
+
+/* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long and out is nbytes long. */
+void ccpad_cts3_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx,
+ unsigned long nbytes, const void *in, void *out);
+
+/* Contract is nbytes is non-zero and a multiple of block_size. Furthermore in is nbytes long and out is nbytes long. Returns number of bytes written to out (technically we always write nbytes to out but the returned value is the number of bytes decrypted after removal of padding).
+
+ To be safe we remove the entire offending block if the pkcs7 padding checks failed. However we purposely don't report the failure to decode the padding since any use of this error leads to potential security exploits. So currently there is no way to distinguish between a full block of padding and bad padding.
+ */
+unsigned long ccpad_pkcs7_decrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx,
+ unsigned long nbytes, const void *in,
+ void *out);
+
+/* Contract is that in is nbytes long.
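+ For illustration (assuming a 16-byte block size): 20 bytes of input
+ produce 32 bytes of padded output, per the rule that follows.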
+ Writes (nbytes / block_size) + 1 times block_size to out. In other words, out
+ must be nbytes rounded down to the closest multiple of block_size plus block_size bytes. */
+void ccpad_pkcs7_encrypt(const struct ccmode_cbc *cbc, cccbc_ctx *ctx,
+ unsigned long nbytes, const void *in, void *out);
+
+/* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long and out is nbytes long. */
+void ccpad_xts_decrypt(const struct ccmode_xts *xts, ccxts_ctx *ctx,
+ unsigned long nbytes, const void *in, void *out);
+
+/* Contract is nbytes is at least 1 block + 1 byte. Also in is nbytes long and out is nbytes long. */
+void ccpad_xts_encrypt(const struct ccmode_xts *xts, ccxts_ctx *ctx,
+ unsigned long nbytes, const void *in, void *out);
+
+#endif /* _CORECRYPTO_CCPAD_H_ */
diff --git a/EXTERNAL_HEADERS/corecrypto/ccpbkdf2.h b/EXTERNAL_HEADERS/corecrypto/ccpbkdf2.h
new file mode 100644
index 000000000..15b94da15
--- /dev/null
+++ b/EXTERNAL_HEADERS/corecrypto/ccpbkdf2.h
@@ -0,0 +1,43 @@
+/*
+ * ccpbkdf2.h
+ * corecrypto
+ *
+ * Copyright 1999-2001, 2010 Apple Inc. All rights reserved.
+ *
+ * Derived from pbkdf2.h by Mitch Adler on 09-12-2010.
+ *
+ */
+
+#ifndef _CORECRYPTO_CCPBKDF2_H_
+#define _CORECRYPTO_CCPBKDF2_H_
+
+
+#include <corecrypto/ccdigest.h>
+
+/*! @function ccpbkdf2_hmac
+ @abstract perform a pbkdf2 using HMAC(di) for the PRF (see PKCS#5 for specification)
+ @discussion This performs a standard PBKDF2 transformation of password and salt through
+an HMAC PRF of the caller's selection (any Digest, typically SHA-1) returning dkLen bytes
+containing the entropy.
+
+Considerations:
+The salt used should be at least 8 bytes long. Each session should use its own salt.
+We use the password as the key for the HMAC and the running data as the text for the HMAC to make a PRF.
+SHA-1 is a good hash to use for the core of the HMAC PRF.
+ @param di digest info defining the digest type to use in the PRF.
+ @param passwordLen amount of data to be fed in
+ @param password data to be fed into the PBKDF
+ @param saltLen length of the salt
+ @param salt salt to be used in pbkdf
+ @param iterations iterations to perform
+ @param dkLen length of the results
+ @param dk buffer for the results of the PBKDF transformation, must be dkLen bytes long
+
+ */
+int ccpbkdf2_hmac(const struct ccdigest_info *di,
+ unsigned long passwordLen, const void *password,
+ unsigned long saltLen, const void *salt,
+ unsigned long iterations,
+ unsigned long dkLen, void *dk);
+
+#endif /* _CORECRYPTO_CCPBKDF2_H_ */
diff --git a/EXTERNAL_HEADERS/corecrypto/ccrc4.h b/EXTERNAL_HEADERS/corecrypto/ccrc4.h
new file mode 100644
index 000000000..a177f86e3
--- /dev/null
+++ b/EXTERNAL_HEADERS/corecrypto/ccrc4.h
@@ -0,0 +1,43 @@
+/*
+ * ccrc4.h
+ * corecrypto
+ *
+ * Created by Fabrice Gautier on 12/22/10.
+ * Copyright 2010,2011 Apple, Inc. All rights reserved.
+ *
+ */
+
+#ifndef _CORECRYPTO_CCRC4_H_
+#define _CORECRYPTO_CCRC4_H_
+
+#include <corecrypto/cc.h>
+
+cc_aligned_struct(16) ccrc4_ctx;
+
+/* Declare an rc4 context named _name_. Pass the size field of a struct ccrc4_info
+ for _size_. */
+#define ccrc4_ctx_decl(_size_, _name_) cc_ctx_decl(ccrc4_ctx, _size_, _name_)
+#define ccrc4_ctx_clear(_size_, _name_) cc_ctx_clear(ccrc4_ctx, _size_, _name_)
+
+struct ccrc4_info {
+ size_t size; /* first argument to ccrc4_ctx_decl().
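+ E.g. (illustrative): ccrc4_ctx_decl(ccrc4()->size, ctx);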
*/ + void (*init)(ccrc4_ctx *ctx, unsigned long key_len, const void *key); + void (*crypt)(ccrc4_ctx *ctx, unsigned long nbytes, const void *in, void *out); +}; + + +const struct ccrc4_info *ccrc4(void); + +extern const struct ccrc4_info ccrc4_eay; + +struct ccrc4_vector { + unsigned long keylen; + const void *key; + unsigned long datalen; + const void *pt; + const void *ct; +}; + +int ccrc4_test(const struct ccrc4_info *rc4, const struct ccrc4_vector *v); + +#endif /* _CORECRYPTO_CCRC4_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccrng.h b/EXTERNAL_HEADERS/corecrypto/ccrng.h new file mode 100644 index 000000000..8a31d5ac5 --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/ccrng.h @@ -0,0 +1,26 @@ +/* + * ccrng.h + * corecrypto + * + * Created by Fabrice Gautier on 12/13/10. + * Copyright 2010 Apple, Inc. All rights reserved. + * + */ + + +#ifndef _CORECRYPTO_CCRNG_H_ +#define _CORECRYPTO_CCRNG_H_ + +#include + +#define CCRNG_STATE_COMMON \ + int (*generate)(struct ccrng_state *rng, unsigned long outlen, void *out); + +/* default state structure - do not instantiate, instead use the specific one you need */ +struct ccrng_state { + CCRNG_STATE_COMMON +}; + +#define ccrng_generate(ctx, outlen, out) ((ctx)->generate((ctx), (outlen), (out))) + +#endif /* _CORECRYPTO_CCRNG_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccrng_system.h b/EXTERNAL_HEADERS/corecrypto/ccrng_system.h new file mode 100644 index 000000000..049970d19 --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/ccrng_system.h @@ -0,0 +1,22 @@ +/* + * ccrng_system.h + * corecrypto + * + * Created by Fabrice Gautier on 12/13/10. + * Copyright 2010 Apple, Inc. All rights reserved. + * + */ + +#ifndef _CORECRYPTO_CCRNG_SYSTEM_H_ +#define _CORECRYPTO_CCRNG_SYSTEM_H_ + +#include + +struct ccrng_system_state { + CCRNG_STATE_COMMON + int fd; +}; + +int ccrng_system_init(struct ccrng_system_state *rng); + +#endif /* _CORECRYPTO_CCRNG_SYSTEM_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccsha1.h b/EXTERNAL_HEADERS/corecrypto/ccsha1.h new file mode 100644 index 000000000..fbb258f39 --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/ccsha1.h @@ -0,0 +1,47 @@ +/* + * ccsha1.h + * corecrypto + * + * Created by Michael Brouwer on 12/1/10. + * Copyright 2010,2011 Apple Inc. All rights reserved. + * + */ + +#ifndef _CORECRYPTO_CCSHA1_H_ +#define _CORECRYPTO_CCSHA1_H_ + +#include +#include + +#define CCSHA1_BLOCK_SIZE 64 +#define CCSHA1_OUTPUT_SIZE 20 +#define CCSHA1_STATE_SIZE 20 + +/* sha1 selector */ +const struct ccdigest_info *ccsha1_di(void); + +extern const uint32_t ccsha1_initial_state[5]; + +/* shared between several implementations */ +void ccsha1_final(const struct ccdigest_info *di, ccdigest_ctx_t, + unsigned char *digest); + + +/* Implementations */ +extern const struct ccdigest_info ccsha1_ltc_di; +extern const struct ccdigest_info ccsha1_eay_di; + +#if CCSHA1_VNG_INTEL +extern const struct ccdigest_info ccsha1_vng_intel_SSE3_di; +extern const struct ccdigest_info ccsha1_vng_intel_NOSSE3_di; +#endif + +#if CCSHA1_VNG_ARMV7NEON +extern const struct ccdigest_info ccsha1_vng_armv7neon_di; +#endif + +/* TODO: Placeholders */ +#define ccoid_sha1 ((unsigned char *)"\x06\x05\x2b\x0e\x03\x02\x1a") +#define ccoid_sha1_len 7 + +#endif /* _CORECRYPTO_CCSHA1_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccsha2.h b/EXTERNAL_HEADERS/corecrypto/ccsha2.h new file mode 100644 index 000000000..4385b895e --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/ccsha2.h @@ -0,0 +1,62 @@ +/* + * ccsha2.h + * corecrypto + * + * Created by Fabrice Gautier on 12/3/10. 
+ * Copyright 2010,2011 Apple Inc. All rights reserved. + * + */ + +#ifndef _CORECRYPTO_CCSHA2_H_ +#define _CORECRYPTO_CCSHA2_H_ + +#include + +/* sha2 selectors */ +const struct ccdigest_info *ccsha224_di(void); +const struct ccdigest_info *ccsha256_di(void); +const struct ccdigest_info *ccsha384_di(void); +const struct ccdigest_info *ccsha512_di(void); + +/* TODO: Placeholders */ +#define ccoid_sha224 ((unsigned char *)"\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x04") +#define ccoid_sha224_len 11 + +#define ccoid_sha256 ((unsigned char *)"\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x01") +#define ccoid_sha256_len 11 + +#define ccoid_sha384 ((unsigned char *)"\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x02") +#define ccoid_sha384_len 11 + +#define ccoid_sha512 ((unsigned char *)"\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x03") +#define ccoid_sha512_len 11 + + +/* SHA256 */ +#define CCSHA256_BLOCK_SIZE 64 +#define CCSHA256_OUTPUT_SIZE 32 +#define CCSHA256_STATE_SIZE 32 +extern const struct ccdigest_info ccsha256_ltc_di; +extern const struct ccdigest_info ccsha256_vng_intel_SSE3_di; +extern const struct ccdigest_info ccsha256_vng_intel_NOSSE3_di; +extern const struct ccdigest_info ccsha256_vng_armv7neon_di; +extern const uint32_t ccsha256_K[64]; + +/* SHA224 */ +#define CCSHA224_OUTPUT_SIZE 28 +extern const struct ccdigest_info ccsha224_ltc_di; +extern const struct ccdigest_info ccsha224_vng_intel_SSE3_di; +extern const struct ccdigest_info ccsha224_vng_intel_NOSSE3_di; +extern const struct ccdigest_info ccsha224_vng_armv7neon_di; + +/* SHA512 */ +#define CCSHA512_BLOCK_SIZE 128 +#define CCSHA512_OUTPUT_SIZE 64 +#define CCSHA512_STATE_SIZE 64 +extern const struct ccdigest_info ccsha512_ltc_di; + +/* SHA384 */ +#define CCSHA384_OUTPUT_SIZE 48 +extern const struct ccdigest_info ccsha384_ltc_di; + +#endif /* _CORECRYPTO_CCSHA2_H_ */ diff --git a/EXTERNAL_HEADERS/mach-o/Makefile b/EXTERNAL_HEADERS/mach-o/Makefile index 1ce373da0..e54b58c6d 100644 --- a/EXTERNAL_HEADERS/mach-o/Makefile +++ b/EXTERNAL_HEADERS/mach-o/Makefile @@ -11,7 +11,6 @@ INSTINC_SUBDIRS = EXPORT_FILES = \ fat.h \ - kld.h \ loader.h \ nlist.h \ reloc.h diff --git a/EXTERNAL_HEADERS/mach-o/kld.h b/EXTERNAL_HEADERS/mach-o/kld.h deleted file mode 100644 index 6b15999d3..000000000 --- a/EXTERNAL_HEADERS/mach-o/kld.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ - -#ifndef _MACHO_KLD_H_ -#define _MACHO_KLD_H_ - -#include -#include - -/* - * These API's are in libkld. 
Both kextload(8) and /mach_kernel should - * link with -lkld and then ld(1) will expand -lkld to libkld.dylib or - * libkld.a depending on if -dynamic or -static is in effect. - * - * Note: we are using the __DYNAMIC__ flag to indicate user space kernel - * linking and __STATIC__ as a synonym of KERNEL. - */ - -/* - * Note that you must supply the following function for error reporting when - * using any of the functions listed here. - */ -extern void kld_error_vprintf(const char *format, va_list ap); - -/* - * These two are only in libkld.dylib for use by kextload(8) (user code compiled - * with the default -dynamic). - */ -#ifdef __DYNAMIC__ -extern long kld_load_basefile( - const char *base_filename); - -/* Note: this takes only one object file name */ -extern long kld_load( - struct mach_header **header_addr, - const char *object_filename, - const char *output_filename); - -extern long kld_load_from_memory( - struct mach_header **header_addr, - const char *object_name, - char *object_addr, - long object_size, - const char *output_filename); -#endif /* __DYNAMIC__ */ - -/* - * This one is only in libkld.a use by /mach_kernel (kernel code compiled with - * -static). - */ -#ifdef __STATIC__ -/* Note: this api does not write an output file */ -extern long kld_load_from_memory( - struct mach_header **header_addr, - const char *object_name, - char *object_addr, - long object_size); -#endif /* __STATIC__ */ - -extern long kld_load_basefile_from_memory( - const char *base_filename, - char *base_addr, - long base_size); - -extern long kld_unload_all( - long deallocate_sets); - -extern long kld_lookup( - const char *symbol_name, - unsigned long *value); - -extern long kld_forget_symbol( - const char *symbol_name); - -extern void kld_address_func( - unsigned long (*func)(unsigned long size, unsigned long headers_size)); - -#define KLD_STRIP_ALL 0x00000000 -#define KLD_STRIP_NONE 0x00000001 - -extern void kld_set_link_options( - unsigned long link_options); - -#endif /* _MACHO_KLD_H_ */ diff --git a/EXTERNAL_HEADERS/mach-o/loader.h b/EXTERNAL_HEADERS/mach-o/loader.h index 9fecf2b4a..f41664e54 100644 --- a/EXTERNAL_HEADERS/mach-o/loader.h +++ b/EXTERNAL_HEADERS/mach-o/loader.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2008 Apple Inc. All Rights Reserved. + * Copyright (c) 1999-2010 Apple Inc. All Rights Reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -174,13 +174,6 @@ struct mach_header_64 { in the task will be given stack execution privilege. Only used in MH_EXECUTE filetypes. */ -#define MH_DEAD_STRIPPABLE_DYLIB 0x400000 /* Only for use on dylibs. When - linking against a dylib that - has this bit set, the static linker - will automatically not create a - LC_LOAD_DYLIB load command to the - dylib if no symbols are being - referenced from the dylib. */ #define MH_ROOT_SAFE 0x40000 /* When this bit is set, the binary declares it is safe for use in processes with uid zero */ @@ -197,6 +190,16 @@ struct mach_header_64 { load the main executable at a random address. Only used in MH_EXECUTE filetypes. */ +#define MH_DEAD_STRIPPABLE_DYLIB 0x400000 /* Only for use on dylibs. When + linking against a dylib that + has this bit set, the static linker + will automatically not create a + LC_LOAD_DYLIB load command to the + dylib if no symbols are being + referenced from the dylib. 
*/ +#define MH_HAS_TLV_DESCRIPTORS 0x800000 /* Contains a section of type + S_THREAD_LOCAL_VARIABLES */ + #define MH_NO_HEAP_EXECUTION 0x1000000 /* When this bit is set, the OS will run the main executable with a non-executable heap even on @@ -281,6 +284,17 @@ struct load_command { #define LC_ENCRYPTION_INFO 0x21 /* encrypted segment information */ #define LC_DYLD_INFO 0x22 /* compressed dyld information */ #define LC_DYLD_INFO_ONLY (0x22|LC_REQ_DYLD) /* compressed dyld information only */ +#define LC_LOAD_UPWARD_DYLIB (0x23 | LC_REQ_DYLD) /* load upward dylib */ +#define LC_VERSION_MIN_MACOSX 0x24 /* build for MacOSX min OS version */ +#define LC_VERSION_MIN_IPHONEOS 0x25 /* build for iPhoneOS min OS version */ +#define LC_FUNCTION_STARTS 0x26 /* compressed table of function start addresses */ +#define LC_DYLD_ENVIRONMENT 0x27 /* string for dyld to treat + like environment variable */ +#define LC_MAIN (0x28|LC_REQ_DYLD) /* replacement for LC_UNIXTHREAD */ +#define LC_DATA_IN_CODE 0x29 /* table of non-instructions in __text */ +#define LC_SOURCE_VERSION 0x2A /* source version used to build binary */ +#define LC_DYLIB_CODE_SIGN_DRS 0x2B /* Code signing DRs copied from linked dylibs */ + /* * A variable length string in a load command is represented by an lc_str @@ -470,6 +484,20 @@ struct section_64 { /* for 64-bit architectures */ #define S_LAZY_DYLIB_SYMBOL_POINTERS 0x10 /* section with only lazy symbol pointers to lazy loaded dylibs */ +/* + * Section types to support thread local variables + */ +#define S_THREAD_LOCAL_REGULAR 0x11 /* template of initial + values for TLVs */ +#define S_THREAD_LOCAL_ZEROFILL 0x12 /* template of initial + values for TLVs */ +#define S_THREAD_LOCAL_VARIABLES 0x13 /* TLV descriptors */ +#define S_THREAD_LOCAL_VARIABLE_POINTERS 0x14 /* pointers to TLV + descriptors */ +#define S_THREAD_LOCAL_INIT_FUNCTION_POINTERS 0x15 /* functions to call + to initialize TLV + values */ + /* * Constants for the section attributes part of the flags field of a section * structure. @@ -716,9 +744,12 @@ struct prebound_dylib_command { * the name of the dynamic linker (LC_LOAD_DYLINKER). And a dynamic linker * contains a dylinker_command to identify the dynamic linker (LC_ID_DYLINKER). * A file can have at most one of these. + * This struct is also used for the LC_DYLD_ENVIRONMENT load command and + * contains string for dyld to treat like environment variable. */ struct dylinker_command { - uint32_t cmd; /* LC_ID_DYLINKER or LC_LOAD_DYLINKER */ + uint32_t cmd; /* LC_ID_DYLINKER, LC_LOAD_DYLINKER or + LC_DYLD_ENVIRONMENT */ uint32_t cmdsize; /* includes pathname string */ union lc_str name; /* dynamic linker's path name */ }; @@ -1122,7 +1153,9 @@ struct rpath_command { * of data in the __LINKEDIT segment. */ struct linkedit_data_command { - uint32_t cmd; /* LC_CODE_SIGNATURE or LC_SEGMENT_SPLIT_INFO */ + uint32_t cmd; /* LC_CODE_SIGNATURE, LC_SEGMENT_SPLIT_INFO, + LC_FUNCTION_STARTS, LC_DATA_IN_CODE, + or LC_DYLIB_CODE_SIGN_DRS */ uint32_t cmdsize; /* sizeof(struct linkedit_data_command) */ uint32_t dataoff; /* file offset of data in __LINKEDIT segment */ uint32_t datasize; /* file size of data in __LINKEDIT segment */ @@ -1141,6 +1174,18 @@ struct encryption_info_command { 0 means not-encrypted yet */ }; +/* + * The version_min_command contains the min OS version on which this + * binary was built to run. 
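+ * For example (illustrative): a version field describing 10.8.2 would
+ * be encoded as 0x000a0802.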
+ */
+struct version_min_command {
+ uint32_t cmd; /* LC_VERSION_MIN_MACOSX or
+ LC_VERSION_MIN_IPHONEOS */
+ uint32_t cmdsize; /* sizeof(struct version_min_command) */
+ uint32_t version; /* X.Y.Z is encoded in nibbles xxxx.yy.zz */
+ uint32_t sdk; /* X.Y.Z is encoded in nibbles xxxx.yy.zz */
+};
+
 /*
  * The dyld_info_command contains the file offsets and sizes of
  * the new compressed form of the information dyld needs to
@@ -1170,7 +1215,7 @@ struct dyld_info_command {
 /*
  * Dyld binds an image during the loading process, if the image
  * requires any pointers to be initialized to symbols in other images.
- * The rebase information is a stream of byte sized
+ * The bind information is a stream of byte sized
  * opcodes whose symbolic names start with BIND_OPCODE_.
  * Conceptually the bind information is a table of tuples:
  *
@@ -1223,19 +1268,27 @@ struct dyld_info_command {
  * The export area is a stream of nodes. The first node sequentially
  * is the start node for the trie.
  *
- * Nodes for a symbol start with a byte that is the length of
+ * Nodes for a symbol start with a uleb128 that is the length of
  * the exported symbol information for the string so far.
- * If there is no exported symbol, the byte is zero. If there
- * is exported info, it follows the length byte. The exported
- * info normally consists of a flags and offset both encoded
- * in uleb128. The offset is location of the content named
- * by the symbol. It is the offset from the mach_header for
- * the image.
+ * If there is no exported symbol, the node starts with a zero byte.
+ * If there is exported info, it follows the length.
+ *
+ * First is a uleb128 containing flags. Normally, it is followed by
+ * a uleb128 encoded offset which is the location of the content named
+ * by the symbol from the mach_header for the image. If the flags
+ * is EXPORT_SYMBOL_FLAGS_REEXPORT, then following the flags is
+ * a uleb128 encoded library ordinal, then a zero terminated
+ * UTF8 string. If the string is zero length, then the symbol
+ * is re-exported from the specified dylib with the same name.
+ * If the flags is EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER, then following
+ * the flags are two uleb128s: the stub offset and the resolver offset.
+ * The stub is used by non-lazy pointers. The resolver is used
+ * by lazy pointers and must be called to get the actual address to use.
 *
- * After the initial byte and optional exported symbol information
- * is a byte of how many edges (0-255) that this node has leaving
- * it, followed by each edge.
- * Each edge is a zero terminated cstring of the addition chars
+ * After the optional exported symbol information is a byte of
+ * how many edges (0-255) that this node has leaving it,
+ * followed by each edge.
+ * Each edge is a zero terminated UTF8 string of the additional chars
 * in the symbol, followed by a uleb128 offset for the node that
 * edge points to.
 *
@@ -1303,8 +1356,8 @@ struct dyld_info_command {
 #define EXPORT_SYMBOL_FLAGS_KIND_REGULAR 0x00
 #define EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL 0x01
 #define EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION 0x04
-#define EXPORT_SYMBOL_FLAGS_INDIRECT_DEFINITION 0x08
-#define EXPORT_SYMBOL_FLAGS_HAS_SPECIALIZATIONS 0x10
+#define EXPORT_SYMBOL_FLAGS_REEXPORT 0x08
+#define EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER 0x10
 
 /*
  * The symseg_command contains the offset and size of the GNU style
@@ -1346,4 +1399,60 @@ struct fvmfile_command {
 uint32_t header_addr; /* files virtual address */
 };
+
+/*
+ * The entry_point_command is a replacement for thread_command.
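+ * (That is, LC_MAIN supersedes the LC_UNIXTHREAD command, which embedded
+ * a full thread state.)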
+ * It is used for main executables to specify the location (file offset)
+ * of main(). If -stack_size was used at link time, the stacksize
+ * field will contain the stack size needed for the main thread.
+ */
+struct entry_point_command {
+ uint32_t cmd; /* LC_MAIN only used in MH_EXECUTE filetypes */
+ uint32_t cmdsize; /* 24 */
+ uint64_t entryoff; /* file (__TEXT) offset of main() */
+ uint64_t stacksize;/* if not zero, initial stack size */
+};
+
+
+/*
+ * The source_version_command is an optional load command containing
+ * the version of the sources used to build the binary.
+ */
+struct source_version_command {
+ uint32_t cmd; /* LC_SOURCE_VERSION */
+ uint32_t cmdsize; /* 16 */
+ uint64_t version; /* A.B.C.D.E packed as a24.b10.c10.d10.e10 */
+};
+
+
+/*
+ * The LC_DATA_IN_CODE load command uses a linkedit_data_command
+ * to point to an array of data_in_code_entry entries. Each entry
+ * describes a range of data in a code section. This load command
+ * is only used in final linked images.
+ */
+struct data_in_code_entry {
+ uint32_t offset; /* from mach_header to start of data range*/
+ uint16_t length; /* number of bytes in data range */
+ uint16_t kind; /* a DICE_KIND_* value */
+};
+#define DICE_KIND_DATA 0x0001 /* L$start$data$... label */
+#define DICE_KIND_JUMP_TABLE8 0x0002 /* L$start$jt8$... label */
+#define DICE_KIND_JUMP_TABLE16 0x0003 /* L$start$jt16$... label */
+#define DICE_KIND_JUMP_TABLE32 0x0004 /* L$start$jt32$... label */
+#define DICE_KIND_ABS_JUMP_TABLE32 0x0005 /* L$start$jta32$... label */
+
+
+
+/*
+ * Sections of type S_THREAD_LOCAL_VARIABLES contain an array
+ * of tlv_descriptor structures.
+ */
+struct tlv_descriptor
+{
+ void* (*thunk)(struct tlv_descriptor*);
+ unsigned long key;
+ unsigned long offset;
+};
+
 #endif /* _MACHO_LOADER_H_ */
diff --git a/EXTERNAL_HEADERS/mach-o/nlist.h b/EXTERNAL_HEADERS/mach-o/nlist.h
index 868ec2046..1c1941012 100644
--- a/EXTERNAL_HEADERS/mach-o/nlist.h
+++ b/EXTERNAL_HEADERS/mach-o/nlist.h
@@ -214,8 +214,10 @@ struct nlist_64 {
 * determined by the static link editor. Which library an undefined symbol is
 * bound to is recorded by the static linker in the high 8 bits of the n_desc
 * field using the SET_LIBRARY_ORDINAL macro below. The ordinal recorded
- * references the libraries listed in the Mach-O's LC_LOAD_DYLIB load commands
- * in the order they appear in the headers. The library ordinals start from 1.
+ * references the libraries listed in the Mach-O's LC_LOAD_DYLIB,
+ * LC_LOAD_WEAK_DYLIB, LC_REEXPORT_DYLIB, LC_LOAD_UPWARD_DYLIB, and
+ * LC_LAZY_LOAD_DYLIB, etc. load commands in the order they appear in the
+ * headers. The library ordinals start from 1.
 * For a dynamic library that is built as a two-level namespace image the
 * undefined references from module defined in another use the same nlist struct
 * an in that case SELF_LIBRARY_ORDINAL is used as the library ordinal. For
@@ -286,6 +288,14 @@ struct nlist_64 {
 */
 #define N_ARM_THUMB_DEF 0x0008 /* symbol is a Thumb function (ARM) */
 
+/*
+ * The N_SYMBOL_RESOLVER bit of the n_desc field indicates that
+ * the function is actually a resolver function and should
+ * be called to get the address of the real function to use.
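+ * For example (an illustrative sketch of the mechanism): the resolver
+ * is invoked once when the symbol is bound, and the address it returns
+ * is what callers are bound to from then on.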
+ * This bit is only available in .o files (MH_OBJECT filetype) + */ +#define N_SYMBOL_RESOLVER 0x0100 + #ifndef __STRICT_BSD__ #if __cplusplus extern "C" { diff --git a/EXTERNAL_HEADERS/mach-o/reloc.h b/EXTERNAL_HEADERS/mach-o/reloc.h index e36f4f734..d5741efa3 100644 --- a/EXTERNAL_HEADERS/mach-o/reloc.h +++ b/EXTERNAL_HEADERS/mach-o/reloc.h @@ -196,7 +196,8 @@ enum reloc_type_generic GENERIC_RELOC_PAIR, /* Only follows a GENERIC_RELOC_SECTDIFF */ GENERIC_RELOC_SECTDIFF, GENERIC_RELOC_PB_LA_PTR, /* prebound lazy pointer */ - GENERIC_RELOC_LOCAL_SECTDIFF + GENERIC_RELOC_LOCAL_SECTDIFF, + GENERIC_RELOC_TLV /* thread local variables */ }; #endif /* _MACHO_RELOC_H_ */ diff --git a/EXTERNAL_HEADERS/mach-o/x86_64/reloc.h b/EXTERNAL_HEADERS/mach-o/x86_64/reloc.h index 74edf082c..d3466d8fe 100644 --- a/EXTERNAL_HEADERS/mach-o/x86_64/reloc.h +++ b/EXTERNAL_HEADERS/mach-o/x86_64/reloc.h @@ -114,9 +114,19 @@ * lea L0(%rip), %rax * r_type=X86_64_RELOC_SIGNED, r_length=2, r_extern=0, r_pcrel=1, r_symbolnum=3 * 48 8d 05 56 00 00 00 - * // assumes L0 is in third section, has an address of 0x00000056 in .o - * // file, and there is no previous non-local label - * + * // assumes L0 is in third section and there is no previous non-local label. + * // The rip-relative-offset of 0x00000056 is L0-address_of_next_instruction. + * // address_of_next_instruction is the address of the relocation + 4. + * + * add $6,L0(%rip) + * r_type=X86_64_RELOC_SIGNED_1, r_length=2, r_extern=0, r_pcrel=1, r_symbolnum=3 + * 83 05 18 00 00 00 06 + * // assumes L0 is in third section and there is no previous non-local label. + * // The rip-relative-offset of 0x00000018 is L0-address_of_next_instruction. + * // address_of_next_instruction is the address of the relocation + 4 + 1. + * // The +1 comes from SIGNED_1. This is used because the relocation is not + * // at the end of the instruction. + * * .quad L1 * r_type=X86_64_RELOC_UNSIGNED, r_length=3, r_extern=1, r_pcrel=0, r_symbolnum=_prev * 12 00 00 00 00 00 00 00 @@ -171,4 +181,5 @@ enum reloc_type_x86_64 X86_64_RELOC_SIGNED_1, // for signed 32-bit displacement with a -1 addend X86_64_RELOC_SIGNED_2, // for signed 32-bit displacement with a -2 addend X86_64_RELOC_SIGNED_4, // for signed 32-bit displacement with a -4 addend + X86_64_RELOC_TLV, // for thread local variables }; diff --git a/EXTERNAL_HEADERS/stdint.h b/EXTERNAL_HEADERS/stdint.h index 90164c0af..ca048597a 100644 --- a/EXTERNAL_HEADERS/stdint.h +++ b/EXTERNAL_HEADERS/stdint.h @@ -162,7 +162,9 @@ typedef unsigned long long uintmax_t; #define SIZE_MAX UINT32_MAX +#ifndef WCHAR_MAX #define WCHAR_MAX INT32_MAX +#endif /* We have no wint_t yet, so no WINT_{MIN,MAX}. Should end up being {U}INT32_{MIN,MAX}, depending. */ diff --git a/Makefile b/Makefile index acd493419..3e9a72e7e 100644 --- a/Makefile +++ b/Makefile @@ -65,6 +65,8 @@ INSTALL_KERNEL_FILE = mach_kernel INSTALL_KERNEL_DIR = / +INSTALL_KERNEL_SYM_DIR = $(INSTALL_KERNEL_DIR)/System/Library/Extensions/KDK/ + INSTMAN_SUBDIRS = \ bsd diff --git a/README b/README index b9e102527..b71d70f72 100644 --- a/README +++ b/README @@ -131,6 +131,8 @@ A. How to build XNU: -project libkxld # user-space version of kernel linker + -project libkmod # static library automatically linked into kexts + -project Libsyscall # automatically generate BSD syscall stubs @@ -154,7 +156,9 @@ A. How to build XNU: $ make -w # trace recursive make invocations. 
Useful in combination with VERBOSE=YES - $ make BUILD_LTO=1 # built with LLVM Link Time Optimization (experimental) + $ make BUILD_LTO=1 # build with LLVM Link Time Optimization (experimental) + + $ make BUILD_INTEGRATED_ASSEMBLER=1 # build with LLVM integrated assembler (experimental) ============================================= B. How to install a new header file from XNU diff --git a/SETUP/Makefile b/SETUP/Makefile index 7a0e5c5b4..97c11fe61 100644 --- a/SETUP/Makefile +++ b/SETUP/Makefile @@ -10,7 +10,9 @@ include $(MakeInc_def) SETUP_SUBDIRS = \ config \ kextsymboltool \ - setsegname + setsegname \ + decomment \ + md include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/SETUP/config/config.h b/SETUP/config/config.h index 54219e1db..bcb0d3eeb 100644 --- a/SETUP/config/config.h +++ b/SETUP/config/config.h @@ -250,10 +250,10 @@ extern int maxusers; #ifdef mips #define DEV_MASK 0xf #define DEV_SHIFT 4 -#else mips +#else /* mips */ #define DEV_MASK 0x7 #define DEV_SHIFT 3 -#endif mips +#endif /* mips */ /* External function references */ char *get_rest(FILE *fp); diff --git a/SETUP/config/doconf b/SETUP/config/doconf index 2d4e952e9..33612c023 100755 --- a/SETUP/config/doconf +++ b/SETUP/config/doconf @@ -300,8 +300,7 @@ part != 0 {\ # kernel binaries are put). # echo 'builddir "."' >> $SYSCONF.new - set OBJRELDIR=`$RELPATH $OBJROOT $OBJDIR` - echo 'objectdir "'$OBJROOT'/'$OBJRELDIR'"' >> $SYSCONF.new + echo 'objectdir "'$OBJDIR'"' >> $SYSCONF.new set SRCDIR=`dirname $SOURCE` echo 'sourcedir "'$SRCROOT'"' >> $SYSCONF.new if (-f $SYSCONF) then diff --git a/SETUP/config/mkioconf.c b/SETUP/config/mkioconf.c index 90b6c2f97..2b4ff4a65 100644 --- a/SETUP/config/mkioconf.c +++ b/SETUP/config/mkioconf.c @@ -554,7 +554,7 @@ pseudo_inits(FILE *fp) continue; fprintf(fp, "extern int %s(int);\n", dp->d_init); } -#endif notdef +#endif /* notdef */ fprintf(fp, "struct pseudo_init {\n"); fprintf(fp, "\tint\tps_count;\n\tint\t(*ps_func)();\n"); fprintf(fp, "} pseudo_inits[] = {\n"); @@ -567,7 +567,7 @@ pseudo_inits(FILE *fp) count = 1; fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); } -#endif notdef +#endif /* notdef */ fprintf(fp, "\t{0,\t0},\n};\n"); } #endif @@ -731,7 +731,7 @@ romp_ioconf(void) (void) fclose(fp); } -#endif MACHINE_ROMP +#endif /* MACHINE_ROMP */ #if MACHINE_MMAX void @@ -875,7 +875,7 @@ mmax_ioconf(void) (void) fclose(fp); } -#endif MACHINE_MMAX +#endif /* MACHINE_MMAX */ #if MACHINE_SQT @@ -1207,7 +1207,7 @@ sqt_ioconf(void) (void) fclose(fp); } -#endif MACHINE_SQT +#endif /* MACHINE_SQT */ #if MACHINE_I386 void i386_ioconf(void) @@ -1228,7 +1228,7 @@ i386_ioconf(void) i386_pseudo_inits (fp); (void) fclose(fp); } -#endif MACHINE_I386 +#endif /* MACHINE_I386 */ #if MACHINE_MIPSY || MACHINE_MIPS @@ -1513,7 +1513,7 @@ is_declared(const char *cp) } return(0); } -#endif MACHINE_MIPSY || MACHINE_MIPS +#endif /* MACHINE_MIPSY || MACHINE_MIPS */ #if MACHINE_M68K char *m68k_dn(const char *name); @@ -1709,7 +1709,7 @@ m68k_dn(const char *name) { sprintf(errbuf, "&%sdriver", name); return ns(errbuf); } -#endif MACHINE_M68K +#endif /* MACHINE_M68K */ #if MACHINE_M88K || MACHINE_M98K char *nrw_dn(char *name); @@ -1800,7 +1800,7 @@ m98k_dn(char *name) } -#endif MACHINE_M88K || MACHINE_M98K +#endif /* MACHINE_M88K || MACHINE_M98K */ #ifdef MACHINE_HPPA char *hppa_dn(char *name); @@ -1855,7 +1855,7 @@ hppa_dn(char *name) return (errbuf); } -#endif MACHINE_HPPA +#endif /* MACHINE_HPPA */ #ifdef MACHINE_SPARC char *sparc_dn(char *name); @@ -1909,7 +1909,7 @@ sparc_dn(char *name) return 
(errbuf); } -#endif MACHINE_SPARC +#endif /* MACHINE_SPARC */ #ifdef MACHINE_PPC char *ppc_dn(char *name); @@ -1964,7 +1964,7 @@ ppc_dn(name) return (errbuf); } -#endif MACHINE_PPC +#endif /* MACHINE_PPC */ #ifdef MACHINE_ARM void arm_pseudo_inits(FILE *fp); diff --git a/SETUP/config/mkmakefile.c b/SETUP/config/mkmakefile.c index 6ac9aa099..4bf5602fd 100644 --- a/SETUP/config/mkmakefile.c +++ b/SETUP/config/mkmakefile.c @@ -659,7 +659,7 @@ do_objs(FILE *fp, const char *msg, int ext) #if DO_SWAPFILE register struct file_list *fl; char swapname[32]; -#endif DO_SWAPFILE +#endif /* DO_SWAPFILE */ fprintf(fp, "%s", msg); lpos = strlen(msg); @@ -693,7 +693,7 @@ do_objs(FILE *fp, const char *msg, int ext) if (eq(sp, swapname)) goto cont; } -#endif DO_SWAPFILE +#endif /* DO_SWAPFILE */ cp = (char *)sp + (len = strlen(sp)) - 1; och = *cp; *cp = 'o'; @@ -707,7 +707,7 @@ do_objs(FILE *fp, const char *msg, int ext) #if DO_SWAPFILE cont: ; -#endif DO_SWAPFILE +#endif /* DO_SWAPFILE */ } if (lpos != 8) putc('\n', fp); @@ -955,10 +955,11 @@ do_rules(FILE *f) } break; default: - fprintf(f, "\t${S_RULE_1A}%s%.*s${S_RULE_1B}%s\n", - source_dir, (int)(tp-np), np, nl); - fprintf(f, "\t${S_RULE_2}%s\n", nl); - fprintf(f, "\t${S_RULE_3}\n\n"); + fprintf(f, "\t${S_RULE_1A}%s%.*s${S_RULE_1B}%s\n", + source_dir, (int)(tp-np), np, nl); + fprintf(f, "\t${S_RULE_2}%s\n", nl); + fprintf(f, "\t${S_RULE_3}\n\n"); + break; } continue; } @@ -989,7 +990,7 @@ do_rules(FILE *f) source_dir, extras, np); } break; - #endif 0 + #endif /* 0 */ default: goto common; } @@ -1091,9 +1092,9 @@ do_load(FILE *f) fprintf(f, " %s", fl->f_needs); #ifdef multimax fprintf(f, "\n\nall .ORDER: includelinks ${LOAD}\n"); -#else multimax +#else /* multimax */ fprintf(f, "\n\nall: includelinks ${LOAD}\n"); -#endif multimax +#endif /* multimax */ fprintf(f, "\n"); } @@ -1138,7 +1139,7 @@ do_swapspec(__unused FILE *f, __unused const char *name, __unused char *sysname) fprintf(f, "\t${C_RULE_3}\n"); fprintf(f, "\t${C_RULE_4}\n\n"); } -#endif DO_SWAPFILE +#endif /* DO_SWAPFILE */ } char * diff --git a/SETUP/decomment/Makefile b/SETUP/decomment/Makefile new file mode 100644 index 000000000..05cf5b833 --- /dev/null +++ b/SETUP/decomment/Makefile @@ -0,0 +1,31 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +OBJS = decomment.o + +CFLAGS = -isysroot $(HOST_SDKROOT) -g -O0 -I$(SOURCE) -I. + +WARNFLAGS = -Wall + +LDFLAGS = -isysroot $(HOST_SDKROOT) + +decomment: $(OBJS) + $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ + @echo HOST_LD $@ + $(_v)$(HOST_CODESIGN) -s - $@ + @echo HOST_CODESIGN $@ + +.c.o: + $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< + @echo HOST_CC $@ + +do_build_setup: decomment + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/SETUP/decomment/decomment.c b/SETUP/decomment/decomment.c new file mode 100644 index 000000000..f95bdb69d --- /dev/null +++ b/SETUP/decomment/decomment.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). 
You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * decomment.c + * + * Removes all comments and (optionally) whitespace from an input file. + * Writes result on stdout. + */ + +#include +#include /* for isspace */ +#include + +/* + * State of input scanner. + */ +typedef enum { + IS_NORMAL, + IS_SLASH, // encountered opening '/' + IS_IN_COMMENT, // within / * * / comment + IS_STAR, // encountered closing '*' + IS_IN_END_COMMENT // within / / comment +} input_state_t; + +static void usage(char **argv); + +int main(int argc, char **argv) +{ + FILE *fp; + char bufchar; + input_state_t input_state = IS_NORMAL; + int exit_code = 0; + int remove_whitespace = 0; + int arg; + + if(argc < 2) + usage(argv); + for(arg=2; arg +#include +#include +#include +#include + +#define LINESIZE 65536 // NeXT_MOD + +#define OUTLINELEN 79 +#define IObuffer 50000 +#define SALUTATION "# Dependencies for File:" +#define SALUTATIONLEN (sizeof SALUTATION - 1) +#define OLDSALUTATION "# DO NOT DELETE THIS LINE" +#define OLDSALUTATIONLEN (sizeof OLDSALUTATION - 1) + +char file_array[IObuffer]; /* read file and store crunched names */ +char dep_line[LINESIZE]; /* line being processed */ +char dot_o[LINESIZE]; /* : prefix */ +char *path_component[100]; /* stores components for a path while being + crunched */ + +struct dep { /* stores paths that a file depends on */ + int len; + char *str; +} dep_files[1000]; +int dep_file_index; + +qsort_strcmp(a, b) +struct dep *a, *b; +{ +extern int strcmp(); + return strcmp(a->str, b->str); +} + +char *outfile = (char *) 0; /* generate dependency file */ +FILE *out; + +char *makefile = (char *) 0; /* user supplied makefile name */ +char *real_mak_name; /* actual makefile name (if not supplied) */ +char shadow_mak_name[LINESIZE]; /* changes done here then renamed */ +FILE *mak; /* for reading makefile */ +FILE *makout; /* for writing shadow */ +char makbuf[LINESIZE]; /* one line buffer for makefile */ +struct stat makstat; /* stat of makefile for time comparisons */ +int mak_eof = 0; /* eof seen on makefile */ +FILE *find_mak(), *temp_mak(); + +int delete = 0; /* -d delete dependency file */ +int debug = 0; +int D_contents = 0; /* print file contents */ +int D_depend = 0; /* print dependency processing info */ +int D_make = 0; /* print makefile processing info */ +int D_open = 0; /* print after succesful open */ +int D_time = 0; /* print time comparison info */ +int force = 1; /* always update dependency info */ +int update = 0; /* it's ok if the -m file does not exist */ +int verbose = 0; /* tell me something */ +int expunge = 0; /* first flush dependency stuff from makefile */ + + +char *name; + +static void scan_mak(FILE *, FILE *, char *); +static void finish_mak(FILE *, FILE *); + +main(argc,argv) +register char **argv; +{ +int size; + + name = *argv; + {register char *cp =name; + while (*cp) if (*cp++ == '/') name = cp; + } + + for ( argv++ ; --argc 
; argv++ ) { register char *token = *argv; + if (*token++ != '-' || !*token) + break; + else { register int flag; + for ( ; flag = *token++ ; ) { + switch (flag) { + case 'd': + delete++; + break; + case 'f': + force++; + break; + case 'u': + update++; + case 'm': + makefile = *++argv; + if (--argc < 0) goto usage; + break; + case 'o': + outfile = *++argv; + if (--argc < 0) goto usage; + break; + case 'v': + verbose++; + break; + case 'x': + expunge++; + break; + case 'D': + for ( ; flag = *token++ ; ) + switch (flag) { + case 'c': + D_contents++; + break; + case 'd': + D_depend++; + break; + case 'm': + D_make++; + break; + case 'o': + D_open++; + break; + case 't': + D_time++; + break; + case 'D': + debug++; + break; + default: + goto letters; + } + goto newtoken; + default: + goto usage; + } +letters: ; + } + } +newtoken: ; + } + + if (!expunge && argc < 1) goto usage; + if ((int) outfile && (int) makefile) /* not both */ + goto usage; + + if ((int) outfile) { + /* + * NeXT_MOD, For SGS stuff, in case still linked to master version + */ + unlink(outfile); + + if ((out = fopen(outfile, "w")) == NULL) { + fprintf(stderr, "%s: outfile = \"%s\" ", name, outfile); + perror("fopen"); + fflush(stdout), fflush(stderr); + exit(1); + } else if (D_open) + printf("%s: opened outfile \"%s\"\n", name, outfile); + } else if (mak = find_mak(makefile)) { + makout = temp_mak(); + out = makout; + if (expunge) + expunge_mak(mak, makout); + else + skip_mak(mak, makout); + } else if (mak_eof && /* non existent file == mt file */ + (int)(makout = temp_mak())) { /* but we need to be able */ + out = makout; /* to write here */ + } else if (makefile) { + fprintf(stderr, "%s: makefile \"%s\" can not be opened or stat'ed\n", + name, makefile); + exit(2); + } + + for (; argc--; argv++) { + dep_file_index = 0; + + if (size = read_dep(*argv)) { + + save_dot_o(); + if (D_depend) printf("%s: dot_o = \"%s\"\n", name, dot_o); + + parse_dep(); + if (mak) scan_mak(mak, makout, dot_o); + if (out) output_dep(out); + + if (delete) + unlink(*argv); + } + } + + if (mak) finish_mak(mak, makout); + rename(shadow_mak_name, real_mak_name); + exit(0); +usage: + fprintf(stderr, "usage: md -f -Dcdmot -m makefile -o outputfile -v ... 
\n"); + exit(1); +} + + +read_dep(file) +register char *file; +{ +register int fd; +register int size; +struct stat statbuf; + + if ((fd = open(file, 0)) < 0) { + fprintf(stderr, "%s: file = \"%s\" ", name, file); + perror("open"); + fflush(stdout), fflush(stderr); + return 0; + } + if (D_open) + printf("%s: opened dependency file \"%s\"\n", name, file); + + if (fstat(fd, &statbuf) < 0) { + fprintf(stderr, "%s: file = \"%s\" ", name, file); + perror("stat"); + fflush(stdout), fflush(stderr); + goto out; + } + switch(statbuf.st_mode & S_IFMT) { + case S_IFREG: + if (D_time) + printf("%s: file time = %d\n", name, statbuf.st_mtime); + + if (statbuf.st_size > IObuffer) { + fprintf(stderr, "%s: file \"%s\" tooo big for IObuffer\n", + name, file); + goto out; + } else if (force) + break; + else if ((int) mak && statbuf.st_mtime < makstat.st_mtime) { + if (verbose || D_time) + fprintf(stderr, "%s: skipping \"%s\" %d < %d \"%s\"\n", + name, file, statbuf.st_mtime, makstat.st_mtime, + real_mak_name); + goto out; + } else /* >= =>ok */ + break; + case S_IFDIR: + case S_IFLNK: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + default: + fprintf(stderr, "%s: bad mode: 0%o on \"%s\"\n", + name, statbuf.st_mode, file); + fflush(stdout), fflush(stderr); + goto out; + } + + if ((size = read(fd, file_array, sizeof (file_array))) < 0) { + fprintf(stderr, "%s: file = \"%s\" ", name, file); + perror("read"); + fflush(stdout), fflush(stderr); + goto out; + } + file_array[size] = 0; + + if (close(fd) < 0) { + fprintf(stderr, "%s: file = \"%s\" ", name, file); + perror("close"); + fflush(stdout), fflush(stderr); + return 0; + } + + if (D_depend && D_contents) + printf("file_array: \"%s\"\n", file_array); + return size; +out: ; + close(fd); + return 0; +} + +save_dot_o() +{ +register char *cp = file_array; +register char *svp = dot_o; +register int c; + + while ((*svp++ = (c = *cp++)) && c != ':'); + *svp = 0; +} + +parse_dep() +{ +register char *lp = file_array; +register int c; + + while (*lp) {register char *tlp = lp; + register char *cp = dep_line; + register int i = 0; + int abspath = 0; + char oldc; + char *oldcp; + + /* get a line to process */ + while ((c = *lp++) && c != '\n') + { + if (c == '\\') + lp++; /* skip backslash newline */ + else + *cp++ = c; + } + if (!c) + break; + *cp = 0; + cp = dep_line; + lp[-1] = 0; + /* skip .o file name */ + while ((c = *cp++) && c != ':'); if (!c) continue; +next_filename: + i = 0; + abspath = 0; + while ((c = *cp) && (c == ' ' || c == '\t')) cp++; if (!c) continue; + + /* canonicalization processing */ + + /* initial / is remembered */ + if (c == '/') + abspath++; + + while (c && c != ' ' && c != '\t') { + if (D_depend) printf("i = %d going \"%s\"\n", i, cp); + /* kill \'s */ + while ((c = *cp) && c == '/') cp++; if (!c) break; + path_component[i] = cp; + /* swallow chars till next / or null */ + while ((c = *cp++) && c != '/' && c != ' ' && c != '\t'); + if (c) cp[-1]=0;/* end component C style */ + + /* ignore . */; + if (!strcmp(path_component[i], ".")) + ; /* if "component" != .. */ + else /* don't reduce /component/.. to nothing */ + i++; /* there could be symbolic links! 
+
+ /* process makefile */
+FILE *
+find_mak(file)
+char *file;
+{
+FILE *mak;
+
+	if ((int) file) {
+		if ((mak = fopen(file, "r")) != NULL) {
+			real_mak_name = file;
+		} else if (update) {
+			mak_eof = 1;
+			real_mak_name = file;
+			return NULL;
+		} else {
+			fprintf(stderr, "%s: file = \"%s\" ", name, file);
+			perror("fopen");
+			fflush(stdout), fflush(stderr);
+			return NULL;
+		}
+	} else {
+		if ((mak = fopen("makefile", "r")) != NULL) {
+			real_mak_name = "makefile";
+		} else if ((mak = fopen("Makefile", "r")) != NULL) {
+			real_mak_name = "Makefile";
+		} else return NULL;
+	}
+
+	if (fstat(fileno(mak), &makstat) < 0) {
+		fprintf(stderr, "%s: file = \"%s\" ", name, real_mak_name);
+		perror("stat");
+		fflush(stdout), fflush(stderr);
+		return NULL;
+	}
+	if (D_open)
+		printf("%s: opened makefile \"%s\"\n", name, real_mak_name);
+	if (D_time)
+		printf("%s: makefile time = %d\n", name, makstat.st_mtime);
+
+	return mak;
+}
+
+FILE *
+temp_mak()
+{
+FILE *mak;
+
+	strcpy(shadow_mak_name, real_mak_name);
+	strcat(shadow_mak_name, ".md");
+
+	/*
+	 * For SGS stuff, in case still linked to master version
+	 */
+	unlink(shadow_mak_name);
+	if ((mak = fopen(shadow_mak_name, "w")) == NULL) {
+		fprintf(stderr, "%s: file = \"%s\" ", name, shadow_mak_name);
+		perror("fopen");
+		fflush(stdout), fflush(stderr);
+		return NULL;
+	}
+	if (D_open)
+		printf("%s: opened makefile.md \"%s\"\n", name, shadow_mak_name);
+
+	return mak;
+}
+
+skip_mak(makin, makout)
+register FILE *makin, *makout;
+{
+register int len = SALUTATIONLEN;
+
+	if (D_make)
+		printf("skipping in \"%s\" ", real_mak_name);
+
+	while (fgets(makbuf, LINESIZE, makin) != NULL) {
+		if (D_make && D_contents)
+			printf("%s: \"%s\"\n", real_mak_name, makbuf);
+		if (strncmp(makbuf, SALUTATION, len)) {
+			fputs(makbuf, makout);
+		} else
+			break;
+	}
+	mak_eof = feof(makin);
+	if (mak_eof)
+		fclose(makin);
+	if (D_make)
+		printf("eof = %d str = \"%s\"", mak_eof, makbuf);
+}
+
+expunge_mak(makin, makout)
+register FILE *makin, *makout;
+{
+register int len = SALUTATIONLEN;
+register int oldlen = OLDSALUTATIONLEN;
+
+	if (D_make)
+		printf("expunging in \"%s\" ", real_mak_name);
+
+	while (fgets(makbuf, LINESIZE, makin) != NULL) {
+		if (D_make && D_contents)
+			printf("%s: \"%s\"\n", real_mak_name, makbuf);
+		if (! strncmp(makbuf, SALUTATION, len) ||
+		    ! strncmp(makbuf, OLDSALUTATION, oldlen))
+			break;
+		else
+			fputs(makbuf, makout);
+	}
+	mak_eof = 1;
+	if (mak_eof)
+		fclose(makin);
+	if (D_make)
+		printf("eof = %d str = \"%s\"", mak_eof, makbuf);
+}
+
+static void
+scan_mak(FILE *makin, FILE *makout, char *file)
+{
+register char *cp = &makbuf[SALUTATIONLEN+1];
+register int len = strlen(file);
+register int ret;
+
+	if (D_make)
+		printf("scanning in \"%s\" for \"%s\"\n", real_mak_name, file);
+
+	do {
+		if (mak_eof)		/* don't scan any more */
+			return;
+
+		ret = strncmp(cp, file, len);
+		if (D_make)
+			printf("saw \"%s\" ret = %d\n", cp, ret);
+
+		if (ret < 0) {		/* skip forward till match or greater */
+			fputs(makbuf, makout);	/* line we're looking at */
+			while (fgets(makbuf, LINESIZE, makin) != NULL) {
+				if (strncmp(makbuf, SALUTATION, SALUTATIONLEN)) {
+					fputs(makbuf, makout);
+				} else
+					break;
+			}
+			mak_eof = feof(makin);
+			if (mak_eof)
+				fclose(makin);
+			continue;
+		} else if (ret == 0) {	/* flush match */
+			while (fgets(makbuf, LINESIZE, makin) != NULL) {
+				if (strncmp(makbuf, SALUTATION, SALUTATIONLEN)) {
+					;	/* flush old stuff */
+				} else
+					break;
+			}
+			mak_eof = feof(makin);
+			if (mak_eof)
+				fclose(makin);
+			break;
+		} else {		/* no luck this time */
+			break;
+		}
+	} while (1);
+}
+
+static void
+finish_mak(FILE *makin, FILE *makout)
+{
+	if (mak_eof)		/* don't scan any more */
+		return;
+
+	if (D_make)
+		printf("finishing in \"%s\"\n", real_mak_name);
+
+	fputs(makbuf, makout);	/* line we're looking at */
+	while (fgets(makbuf, LINESIZE, makin) != NULL) {
+		fputs(makbuf, makout);
+	}
+}
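The audit.h hunk that follows deprecates the fixed-size getaudit(2)/setaudit(2) in favor of the extended calls, which take an explicit structure length. A minimal user-space sketch of the replacement call; the error handling and printed fields are chosen for illustration:

    #include <sys/types.h>
    #include <bsm/audit.h>
    #include <stdio.h>

    int
    main(void)
    {
        struct auditinfo_addr aia;

        /* getaudit_addr() supersedes the deprecated getaudit(); the
         * length argument lets the kernel validate the caller's idea
         * of the structure size. */
        if (getaudit_addr(&aia, sizeof(aia)) == -1) {
            perror("getaudit_addr");
            return 1;
        }
        printf("audit uid %d, session %d\n",
            (int)aia.ai_auid, (int)aia.ai_asid);
        return 0;
    }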
diff --git a/bsd/bsm/audit.h b/bsd/bsm/audit.h
index a24cc88d7..d3bc41fa6 100644
--- a/bsd/bsm/audit.h
+++ b/bsd/bsm/audit.h
@@ -311,11 +311,29 @@
 int auditon(int, void *, int);
 int auditctl(const char *);
 int getauid(au_id_t *);
 int setauid(const au_id_t *);
-int getaudit(struct auditinfo *);
-int setaudit(const struct auditinfo *);
 int getaudit_addr(struct auditinfo_addr *, int);
 int setaudit_addr(const struct auditinfo_addr *, int);
+#if defined(__APPLE__)
+#include <Availability.h>
+
+/*
+ * getaudit()/setaudit() are deprecated and have been replaced with
+ * wrappers to the getaudit_addr()/setaudit_addr() syscalls above.
+ */
+
+int getaudit(struct auditinfo *)
+	__OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_8,
+	    __IPHONE_2_0, __IPHONE_NA);
+int setaudit(const struct auditinfo *)
+	__OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_8,
+	    __IPHONE_2_0, __IPHONE_NA);
+#else
+
+int getaudit(struct auditinfo *);
+int setaudit(const struct auditinfo *);
+#endif /* !__APPLE__ */
+
 #ifdef __APPLE_API_PRIVATE
 #include <mach/port.h>
 mach_port_name_t audit_session_self(void);
diff --git a/bsd/bsm/audit_errno.h b/bsd/bsm/audit_errno.h
index f7dec8d89..c6f058017 100644
--- a/bsd/bsm/audit_errno.h
+++ b/bsd/bsm/audit_errno.h
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2008 Apple Inc.
+ * Copyright (c) 2008-2011 Apple Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -163,6 +163,7 @@
 #define BSM_ERRNO_EALREADY 149
 #define BSM_ERRNO_EINPROGRESS 150
 #define BSM_ERRNO_ESTALE 151
+#define BSM_ERRNO_EQFULL 152
 
 /*
  * OpenBSM constants for error numbers not defined in Solaris. In the event
diff --git a/bsd/bsm/audit_kevents.h b/bsd/bsm/audit_kevents.h
index 25e3eb829..d5e4bac01 100644
--- a/bsd/bsm/audit_kevents.h
+++ b/bsd/bsm/audit_kevents.h
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2005-2009 Apple Inc.
+ * Copyright (c) 2005-2010 Apple Inc.
  * All rights reserved.
* * Redistribution and use in source and binary forms, with or without @@ -732,6 +732,7 @@ #define AUE_INITGROUPS AUE_NULL #define AUE_IOPOLICYSYS AUE_NULL #define AUE_ISSETUGID AUE_NULL +#define AUE_LEDGER AUE_NULL #define AUE_LIOLISTIO AUE_NULL #define AUE_LISTXATTR AUE_NULL #define AUE_LSTATV AUE_NULL diff --git a/bsd/conf/MASTER b/bsd/conf/MASTER index bb57c6dae..2cbd0b9e8 100644 --- a/bsd/conf/MASTER +++ b/bsd/conf/MASTER @@ -114,6 +114,8 @@ options NORMA_ETHER # NORMA across ethernet # options SIMPLE_CLOCK # don't assume fixed tick # options XPR_DEBUG # kernel tracing # options KDEBUG # kernel tracing # +options IST_KDEBUG # limited kernel tracing # +options NO_KDEBUG # no kernel tracing # options DDM_DEBUG # driverkit-style tracing # options MACH_OLD_VM_COPY # Old vm_copy technology # options NO_DIRECT_RPC # for untyped mig servers # @@ -127,8 +129,15 @@ options ROUTING # routing # options VLAN # # options BOND # # options PF # Packet Filter # -options PF_PKTHDR # PF tag inside mbuf pkthdr # +options PF_ALTQ # PF ALTQ (Alternate Queueing) # options PFLOG # PF log interface # +options PKTSCHED_CBQ # CBQ packet scheduler # +options PKTSCHED_HFSC # H-FSC packet scheduler # +options PKTSCHED_PRIQ # PRIQ packet scheduler # +options PKTSCHED_FAIRQ # FAIRQ packet scheduler # +options CLASSQ_BLUE # BLUE queueing algorithm # +options CLASSQ_RED # RED queueing algorithm # +options CLASSQ_RIO # RIO queueing algorithm # options IPDIVERT # Divert sockets (for NAT) # options IPFIREWALL # IP Firewalling (used by NAT) # options IPFIREWALL_FORWARD #Transparent proxy # @@ -182,7 +191,10 @@ options FDESC # fdesc_fs support # options DEVFS # devfs support # options JOURNALING # journaling support # options HFS_COMPRESSION # hfs compression # -options CONFIG_HFS_TRIM # HFS trims unused blocks # +options CONFIG_HFS_STD # hfs standard support # +options CONFIG_HFS_TRIM # hfs trims unused blocks # +options CONFIG_HFS_MOUNT_UNMAP #hfs trims blocks at mount # + # # file system features @@ -193,6 +205,9 @@ options NAMEDSTREAMS # named stream vnop support # options CONFIG_VOLFS # volfs path support (legacy) # options CONFIG_IMGSRC_ACCESS # source of imageboot dmg # options CONFIG_TRIGGERS # trigger vnodes # +options CONFIG_VFS_FUNNEL # thread unsafe vfs's # +options CONFIG_EXT_RESOLVER # e.g. memberd # +options CONFIG_SEARCHFS # searchfs syscall support # # # NFS support @@ -234,7 +249,6 @@ options "IPV6FIREWALL_DEFAULT_TO_ACCEPT" #IPv6 Firewall Feature # pseudo-device dummy 2 # -pseudo-device faith 1 # pseudo-device stf 1 # options crypto # @@ -336,18 +350,18 @@ options CONFIG_NMBCLUSTERS="((1024 * 1024) / MCLBYTES)" # options CONFIG_NMBCLUSTERS="((1024 * 512) / MCLBYTES)" # options CONFIG_NMBCLUSTERS="((1024 * 256) / MCLBYTES)" # -# -# set maximum space used for packet buffers -# -options CONFIG_USESOCKTHRESHOLD=1 # -options CONFIG_USESOCKTHRESHOLD=0 # - # # Configure size of TCP hash table # options CONFIG_TCBHASHSIZE=4096 # options CONFIG_TCBHASHSIZE=128 # +# +# Configure bandwidth limiting sysctl +# +options CONFIG_ICMP_BANDLIM=250 # +options CONFIG_ICMP_BANDLIM=50 # + # # configurable async IO options # CONFIG_AIO_MAX - system wide limit of async IO requests. 
@@ -435,18 +449,31 @@ options CONFIG_CODE_DECRYPTION # # # User Content Protection, used on embedded # - options CONFIG_PROTECT # # -# freeze - support app hibernation, used on embedded -# CONFIG_FREEZE_SUSPENDED_MIN is the minimum number of suspended -# processes to be left unhibernated +# enable per-process memory priority tracking +# +options CONFIG_MEMORYSTATUS # + +# +# enable jetsam - used on embedded +# +options CONFIG_JETSAM # + +# +# enable freezing of suspended processes - used on embedded # options CONFIG_FREEZE # options CHECK_CS_VALIDATION_BITMAP # +# +# memory pressure event support +# must be set in both bsd/conf and osfmk/conf MASTER files +# +options VM_PRESSURE_EVENTS # + # # Ethernet (ARP) # @@ -507,4 +534,3 @@ pseudo-device sdt 1 init sdt_init # pseudo-device systrace 1 init systrace_init # pseudo-device fbt 1 init fbt_init # pseudo-device profile_prvd 1 init profile_init # - diff --git a/bsd/conf/MASTER.i386 b/bsd/conf/MASTER.i386 index 594f0fb51..c2cae3eba 100644 --- a/bsd/conf/MASTER.i386 +++ b/bsd/conf/MASTER.i386 @@ -44,11 +44,14 @@ # # Standard Apple Research Configurations: # -------- ----- -------- --------------- -# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch zleaks ] -# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo config_volfs config_hfs_trim hfs_compression config_hfs_alloc_rbtree config_imgsrc_access config_triggers ] -# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo if_bridge pf pflog pf_pkthdr ] +# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch zleaks memorystatus vm_pressure_events ] +# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo config_volfs hfs_compression config_hfs_std config_hfs_alloc_rbtree config_hfs_trim config_imgsrc_access config_triggers config_vfs_funnel config_ext_resolver config_searchfs] +# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo if_bridge PF ] # NFS = [ nfsclient nfsserver ] # VPN = [ ipsec ] +# PF = [ pf pflog ] +# PKTSCHED = [ pktsched_cbq pktsched_fairq pktsched_hfsc pktsched_priq ] +# CLASSQ = [ classq_blue classq_red classq_rio ] # RELEASE = [ BASE NETWORKING NFS VPN FILESYS libdriver ] # PROFILE = [ RELEASE profile ] # DEBUG = [ BASE NETWORKING NFS VPN FILESYS libdriver_g debug xpr_debug mach_assert ] @@ -115,4 +118,3 @@ pseudo-device nfsmeas # # Removable Volume support # pseudo-device vol # - diff --git a/bsd/conf/MASTER.x86_64 b/bsd/conf/MASTER.x86_64 index 4bf42910b..a1be0eb1f 100644 --- a/bsd/conf/MASTER.x86_64 +++ b/bsd/conf/MASTER.x86_64 @@ -44,11 +44,14 @@ # # Standard Apple Research Configurations: # -------- ----- -------- --------------- -# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch zleaks ] -# 
FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo config_volfs config_hfs_trim hfs_compression config_hfs_alloc_rbtree config_imgsrc_access config_triggers ] -# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo if_bridge pf pflog pf_pkthdr ] +# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch zleaks memorystatus vm_pressure_events ] +# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo config_volfs hfs_compression config_hfs_std config_hfs_alloc_rbtree config_hfs_trim config_imgsrc_access config_triggers config_ext_resolver config_searchfs ] +# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo if_bridge PF ] # NFS = [ nfsclient nfsserver ] # VPN = [ ipsec ] +# PF = [ pf pflog ] +# PKTSCHED = [ pktsched_cbq pktsched_fairq pktsched_hfsc pktsched_priq ] +# CLASSQ = [ classq_blue classq_red classq_rio ] # RELEASE = [ BASE NETWORKING NFS VPN FILESYS libdriver ] # PROFILE = [ RELEASE profile ] # DEBUG = [ BASE NETWORKING NFS VPN FILESYS libdriver_g debug xpr_debug mach_assert ] @@ -95,6 +98,9 @@ options CONFIG_AUDIT # Kernel auditing # app-profiling i.e. pre-heating - off? options CONFIG_APP_PROFILE=0 +# kernel performance tracing +#options KPERF # + # # code decryption... used on i386 for DSMOS # must be set in all the bsd/conf and osfmk/conf MASTER files @@ -115,4 +121,3 @@ pseudo-device nfsmeas # # Removable Volume support # pseudo-device vol # - diff --git a/bsd/conf/Makefile b/bsd/conf/Makefile index afaf3eb89..610e6d6c5 100644 --- a/bsd/conf/Makefile +++ b/bsd/conf/Makefile @@ -41,9 +41,11 @@ $(COMPOBJROOT)/$(BSD_KERNEL_CONFIG)/Makefile : $(SOURCE)/MASTER \ do_all: $(COMPOBJROOT)/$(BSD_KERNEL_CONFIG)/Makefile $(_v)next_source=$(subst conf/,,$(SOURCE)); \ + next_relsource=$(subst conf/,,$(RELATIVE_SOURCE_PATH)); \ ${MAKE} -C $(COMPOBJROOT)/$(BSD_KERNEL_CONFIG) \ MAKEFILES=$(TARGET)/$(BSD_KERNEL_CONFIG)/Makefile \ SOURCE=$${next_source} \ + RELATIVE_SOURCE_PATH=$${next_relsource} \ TARGET=$(TARGET) \ INCL_MAKEDEP=FALSE \ KERNEL_CONFIG=$(BSD_KERNEL_CONFIG) \ diff --git a/bsd/conf/Makefile.i386 b/bsd/conf/Makefile.i386 index a46354589..59554731c 100644 --- a/bsd/conf/Makefile.i386 +++ b/bsd/conf/Makefile.i386 @@ -9,6 +9,11 @@ fbt_x86.o_CFLAGS_ADD += -Wno-cast-qual # sha256 Files to build with -DSHA256_USE_ASSEMBLY=1 sha2.o_CFLAGS_ADD += -DSHA256_USE_ASSEMBLY=1 +# Inline assembly doesn't interact well with LTO +fbt_x86.o_CFLAGS_ADD += $(CFLAGS_NOLTO_FLAG) +# Taking the address of labels doesn't work with LTO (9524055) +dtrace.o_CFLAGS_ADD += $(CFLAGS_NOLTO_FLAG) + ###################################################################### #END Machine dependent Makefile fragment for i386 ###################################################################### diff --git a/bsd/conf/Makefile.template b/bsd/conf/Makefile.template index 8691ce705..61a088bd8 100644 --- a/bsd/conf/Makefile.template +++ b/bsd/conf/Makefile.template @@ -1,5 +1,5 @@ # -# Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. +# Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
# # @APPLE_LICENSE_HEADER_START@ # @@ -102,6 +102,7 @@ OBJS_NO_SIGN_COMPARE = \ mld6.o \ nd6.o \ nd6_nbr.o \ + nd6_prproxy.o \ nd6_rtr.o \ raw_ip6.o \ route6.o \ @@ -118,6 +119,163 @@ OBJS_NO_SIGN_COMPARE = \ $(foreach file,$(OBJS_NO_SIGN_COMPARE),$(eval $(call add_perfile_cflags,$(file),-Wno-sign-compare))) +# Objects that want -Wcast-align warning treated as error +OBJS_ERROR_CAST_ALIGN = \ + kern_control.o \ + mcache.o \ + sys_socket.o \ + uipc_mbuf.o \ + uipc_mbuf2.o \ + uipc_socket.o \ + uipc_socket2.o \ + uipc_syscalls.o \ + bpf.o \ + bpf_filter.o \ + bridgestp.o \ + bsd_comp.o \ + devtimer.o \ + dlil.o \ + ether_if_module.o \ + ether_inet_pr_module.o \ + ether_inet6_pr_module.o \ + flowhash.o \ + if.o \ + if_bridge.o \ + if_gif.o \ + if_llreach.o \ + if_loop.o \ + if_media.o \ + if_mib.o \ + if_pflog.o \ + if_stf.o \ + if_utun.o \ + if_utun_crypto.o \ + if_utun_crypto_ipsec.o \ + if_vlan.o \ + init.o \ + iptap.o \ + kext_net.o \ + kpi_interface.o \ + kpi_interfacefilter.o \ + kpi_protocol.o \ + kpi_protocol.o \ + ndrv.o \ + net_osdep.o \ + net_str_id.o \ + netsrc.o \ + ntstat.o \ + pf.o \ + pf_if.o \ + pf_ioctl.o \ + pf_norm.o \ + pf_osfp.o \ + pf_ruleset.o \ + pf_table.o \ + ppp_deflate.o \ + radix.o \ + raw_cb.o \ + raw_usrreq.o \ + route.o \ + rtsock.o \ + dhcp_options.o \ + altq_cbq.o \ + altq_fairq.o \ + altq_hfsc.o \ + altq_priq.o \ + altq_qfq.o \ + altq_subr.o \ + pktsched.o \ + pktsched_cbq.o \ + pktsched_fairq.o \ + pktsched_hfsc.o \ + pktsched_priq.o \ + pktsched_qfq.o \ + pktsched_rmclass.o \ + pktsched_tcq.o \ + classq.o \ + classq_blue.o \ + classq_red.o \ + classq_rio.o \ + classq_sfb.o \ + classq_subr.o \ + classq_util.o \ + igmp.o \ + in.o \ + in_arp.o \ + in_cksum.o \ + in_dhcp.o \ + in_gif.o \ + in_mcast.o \ + in_pcb.o \ + in_pcblist.o \ + in_proto.o \ + in_rmx.o \ + in_tclass.o \ + ip_divert.o \ + ip_ecn.o \ + ip_encap.o \ + ip_icmp.o \ + ip_id.o \ + ip_input.o \ + ip_mroute.o \ + ip_output.o \ + kpi_ipfilter.o \ + raw_ip.o \ + tcp_debug.o \ + tcp_input.o \ + tcp_ledbat.o \ + tcp_newreno.o \ + tcp_output.o \ + tcp_sack.o \ + tcp_subr.o \ + tcp_timer.o \ + tcp_usrreq.o \ + udp_usrreq.o \ + ah_core.o \ + ah_input.o \ + ah_output.o \ + dest6.o \ + esp_core.o \ + esp_input.o \ + esp_output.o \ + esp_rijndael.o \ + frag6.o \ + icmp6.o \ + in6.o \ + in6_cksum.o \ + in6_gif.o \ + in6_ifattach.o \ + in6_mcast.o \ + in6_pcb.o \ + in6_prefix.o \ + in6_proto.o \ + in6_rmx.o \ + in6_src.o \ + ip6_forward.o \ + ip6_id.o \ + ip6_input.o \ + ip6_mroute.o \ + ip6_output.o \ + ipcomp_core.o \ + ipcomp_input.o \ + ipcomp_output.o \ + ipsec.o \ + mld6.o \ + nd6.o \ + nd6_nbr.o \ + nd6_rtr.o \ + raw_ip6.o \ + route6.o \ + scope6.o \ + udp6_output.o \ + udp6_usrreq.o \ + key.o \ + key_debug.o \ + keydb.o \ + keysock.o + +$(foreach file,$(OBJS_ERROR_CAST_ALIGN),$(eval $(call add_perfile_cflags,$(file),-Werror=cast-align))) + # # Directories for mig generated files # diff --git a/bsd/conf/Makefile.x86_64 b/bsd/conf/Makefile.x86_64 index 29811299a..e45baf159 100644 --- a/bsd/conf/Makefile.x86_64 +++ b/bsd/conf/Makefile.x86_64 @@ -9,6 +9,11 @@ fbt_x86.o_CFLAGS_ADD += -Wno-cast-qual # sha256 Files to build with -DSHA256_USE_ASSEMBLY=1 sha2.o_CFLAGS_ADD += -DSHA256_USE_ASSEMBLY=1 +# Inline assembly doesn't interact well with LTO +fbt_x86.o_CFLAGS_ADD += $(CFLAGS_NOLTO_FLAG) +# Taking the address of labels doesn't work with LTO (9524055) +dtrace.o_CFLAGS_ADD += $(CFLAGS_NOLTO_FLAG) + ###################################################################### #END Machine dependent Makefile 
fragment for x86_64 ###################################################################### diff --git a/bsd/conf/files b/bsd/conf/files index b3a7b10c4..dd8075219 100644 --- a/bsd/conf/files +++ b/bsd/conf/files @@ -26,7 +26,6 @@ OPTIONS/mach_host optional mach_host OPTIONS/mach_ipc_compat optional mach_ipc_compat OPTIONS/mach_ipc_debug optional mach_ipc_debug OPTIONS/mach_ipc_test optional mach_ipc_test -OPTIONS/mach_kdb optional mach_kdb OPTIONS/mach_ldebug optional mach_ldebug OPTIONS/mach_load optional mach_load OPTIONS/mach_machine_routines optional mach_machine_routines @@ -108,13 +107,21 @@ OPTIONS/ipfirewall optional ipfirewall OPTIONS/ipv6firewall optional ipv6firewall OPTIONS/tcpdebug optional tcpdebug OPTIONS/if_bridge optional if_bridge -OPTIONS/faith optional faith +OPTIONS/bridgestp optional bridgestp if_bridge OPTIONS/gif optional gif OPTIONS/netat optional netat OPTIONS/sendfile optional sendfile OPTIONS/randomipid optional randomipid OPTIONS/pf optional pf - +OPTIONS/pflog optional pflog pf +OPTIONS/pf_altq optional pf_altq pf +OPTIONS/classq_blue optional classq_blue +OPTIONS/classq_red optional classq_red +OPTIONS/classq_rio optional classq_rio +OPTIONS/pktsched_cbq optional pktsched_cbq +OPTIONS/pktsched_fairq optional pktsched_fairq +OPTIONS/pktsched_hfsc optional pktsched_hfsc +OPTIONS/pktsched_priq optional pktsched_priq OPTIONS/zlib optional zlib # @@ -193,7 +200,7 @@ bsd/kern/decmpfs.c standard bsd/net/bpf.c optional bpfilter bsd/net/bpf_filter.c optional bpfilter bsd/net/if_bridge.c optional if_bridge -bsd/net/bridgestp.c optional if_bridge +bsd/net/bridgestp.c optional bridgestp bsd/net/bsd_comp.c optional ppp_bsdcomp bsd/net/if.c optional networking bsd/net/init.c optional sockets @@ -229,7 +236,9 @@ bsd/net/kpi_protocol.c optional networking bsd/net/kpi_interfacefilter.c optional networking bsd/net/net_str_id.c optional networking bsd/net/if_utun.c optional networking -bsd/net/if_pflog.c optional pflog pf +bsd/net/if_utun_crypto.c optional networking +bsd/net/if_utun_crypto_ipsec.c optional networking +bsd/net/if_pflog.c optional pflog bsd/net/pf.c optional pf bsd/net/pf_if.c optional pf bsd/net/pf_ioctl.c optional pf @@ -237,7 +246,33 @@ bsd/net/pf_norm.c optional pf bsd/net/pf_osfp.c optional pf bsd/net/pf_ruleset.c optional pf bsd/net/pf_table.c optional pf +bsd/net/iptap.c optional networking bsd/net/if_llreach.c optional networking +bsd/net/flowhash.c optional networking + +bsd/net/classq/classq.c optional networking +bsd/net/classq/classq_blue.c optional classq_blue +bsd/net/classq/classq_red.c optional classq_red +bsd/net/classq/classq_rio.c optional classq_rio +bsd/net/classq/classq_sfb.c optional networking +bsd/net/classq/classq_subr.c optional networking +bsd/net/classq/classq_util.c optional networking + +bsd/net/pktsched/pktsched.c optional networking +bsd/net/pktsched/pktsched_cbq.c optional pktsched_cbq +bsd/net/pktsched/pktsched_fairq.c optional pktsched_fairq +bsd/net/pktsched/pktsched_hfsc.c optional pktsched_hfsc +bsd/net/pktsched/pktsched_priq.c optional pktsched_priq +bsd/net/pktsched/pktsched_qfq.c optional networking +bsd/net/pktsched/pktsched_rmclass.c optional pktsched_cbq +bsd/net/pktsched/pktsched_tcq.c optional networking + +bsd/net/altq/altq_cbq.c optional pktsched_cbq pf_altq +bsd/net/altq/altq_fairq.c optional pktsched_fairq pf_altq +bsd/net/altq/altq_hfsc.c optional pktsched_hfsc pf_altq +bsd/net/altq/altq_priq.c optional pktsched_priq pf_altq +bsd/net/altq/altq_qfq.c optional pf_altq +bsd/net/altq/altq_subr.c optional pf_altq 
bsd/netinet/igmp.c optional inet bsd/netinet/in.c optional inet @@ -268,6 +303,7 @@ bsd/netinet/tcp_subr.c optional inet bsd/netinet/tcp_timer.c optional inet bsd/netinet/tcp_usrreq.c optional inet bsd/netinet/tcp_newreno.c optional inet +bsd/netinet/tcp_lro.c optional inet bsd/netinet/tcp_ledbat.c optional inet bsd/netinet/udp_usrreq.c optional inet bsd/netinet/in_gif.c optional gif inet @@ -305,6 +341,7 @@ bsd/netinet6/in6_rmx.c optional inet6 bsd/netinet6/mld6.c optional inet6 bsd/netinet6/nd6.c optional inet6 bsd/netinet6/nd6_nbr.c optional inet6 +bsd/netinet6/nd6_prproxy.c optional inet6 bsd/netinet6/nd6_rtr.c optional inet6 bsd/netinet6/raw_ip6.c optional inet6 bsd/netinet6/route6.c optional inet6 @@ -318,10 +355,6 @@ bsd/netkey/key_debug.c optional ipsec bsd/netkey/keysock.c optional ipsec bsd/netkey/keydb.c optional ipsec -bsd/crypto/sha2/sha2.c optional crypto allcrypto -bsd/crypto/des/des_ecb.c optional crypto -bsd/crypto/des/des_enc.c optional crypto -bsd/crypto/des/des_setkey.c optional crypto bsd/crypto/blowfish/bf_enc.c optional crypto allcrypto bsd/crypto/blowfish/bf_skey.c optional crypto allcrypto bsd/crypto/cast128/cast128.c optional crypto allcrypto @@ -419,6 +452,7 @@ bsd/nfs/nfs_vfsops.c optional nfsclient bsd/nfs/nfs_vnops.c optional nfsclient bsd/nfs/nfs4_subs.c optional nfsclient bsd/nfs/nfs4_vnops.c optional nfsclient +bsd/nfs/nfs_upcall.c optional nfsserver bsd/kern/netboot.c optional nfsclient @@ -489,7 +523,6 @@ bsd/kern/kern_symfile.c standard bsd/kern/kern_descrip.c standard bsd/kern/kern_event.c standard bsd/kern/kern_control.c optional networking -bsd/kern/kern_callout.c standard bsd/kern/kern_exec.c standard bsd/kern/kern_exit.c standard bsd/kern/kern_lockf.c standard @@ -508,7 +541,7 @@ bsd/kern/kern_subr.c standard bsd/kern/kern_synch.c standard bsd/kern/kern_sysctl.c standard bsd/kern/kern_newsysctl.c standard -bsd/kern/kern_memorystatus.c optional config_embedded +bsd/kern/kern_memorystatus.c optional config_memorystatus bsd/kern/kern_mib.c standard bsd/kern/kpi_mbuf.c optional sockets bsd/kern/kern_time.c standard @@ -558,7 +591,7 @@ bsd/kern/pthread_support.c optional psynch bsd/kern/pthread_synch.c standard bsd/kern/proc_info.c standard bsd/kern/process_policy.c standard -bsd/kern/vm_pressure.c standard +bsd/kern/vm_pressure.c optional vm_pressure_events bsd/kern/socket_info.c optional sockets bsd/vm/vnode_pager.c standard @@ -573,6 +606,8 @@ bsd/conf/param.c standard bsd/dev/chud/chud_bsd_callback.c standard bsd/dev/chud/chud_process.c standard + + bsd/dev/dtrace/dtrace.c optional config_dtrace bsd/dev/dtrace/lockstat.c optional config_dtrace bsd/dev/dtrace/dtrace_ptss.c optional config_dtrace @@ -589,3 +624,5 @@ bsd/dev/dtrace/fasttrap.c optional config_dtrace bsd/kern/imageboot.c optional config_imageboot +osfmk/kperf/kperfbsd.c optional kperf + diff --git a/bsd/conf/files.i386 b/bsd/conf/files.i386 index 331f7202d..5c8852f6c 100644 --- a/bsd/conf/files.i386 +++ b/bsd/conf/files.i386 @@ -15,16 +15,6 @@ bsd/dev/i386/sysctl.c standard bsd/dev/i386/unix_signal.c standard bsd/dev/i386/munge.s standard -bsd/crypto/aes/i386/AES.s optional crypto -bsd/crypto/aes/i386/aes_modes_asm.s optional crypto -bsd/crypto/aes/i386/aes_modes_hw.s optional crypto -bsd/crypto/aes/i386/aes_key_hw.s optional crypto -bsd/crypto/aes/i386/aes_crypt_hw.s optional crypto -bsd/crypto/aes/i386/aesxts_asm.s optional crypto -bsd/crypto/aes/i386/aesxts.c optional crypto - -bsd/crypto/sha2/intel/sha256.s optional crypto -bsd/crypto/sha2/intel/sha256nossse3.s optional crypto # 
Lightly ifdef'd to support K64 DTrace bsd/dev/i386/dtrace_isa.c optional config_dtrace diff --git a/bsd/conf/files.x86_64 b/bsd/conf/files.x86_64 index fcb3be604..ed63a4a2f 100644 --- a/bsd/conf/files.x86_64 +++ b/bsd/conf/files.x86_64 @@ -15,16 +15,6 @@ bsd/dev/i386/sysctl.c standard bsd/dev/i386/unix_signal.c standard bsd/dev/x86_64/munge.s standard -bsd/crypto/aes/i386/AES.s optional crypto -bsd/crypto/aes/i386/aes_modes_asm.s optional crypto -bsd/crypto/aes/i386/aes_modes_hw.s optional crypto -bsd/crypto/aes/i386/aes_key_hw.s optional crypto -bsd/crypto/aes/i386/aes_crypt_hw.s optional crypto -bsd/crypto/aes/i386/aesxts_asm.s optional crypto -bsd/crypto/aes/i386/aesxts.c optional crypto - -bsd/crypto/sha2/intel/sha256.s optional crypto -bsd/crypto/sha2/intel/sha256nossse3.s optional crypto # Lightly ifdef'd to support K64 DTrace bsd/dev/i386/dtrace_isa.c optional config_dtrace diff --git a/bsd/crypto/Makefile b/bsd/crypto/Makefile index ab0c4b986..109c4c4cb 100644 --- a/bsd/crypto/Makefile +++ b/bsd/crypto/Makefile @@ -10,17 +10,11 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ blowfish \ cast128 \ - des \ rc4 \ - aes \ - sha2 - INSTINC_SUBDIRS_I386 = \ - aes INSTINC_SUBDIRS_X86_64 = \ - aes INSTINC_SUBDIRS_ARM = \ @@ -33,7 +27,10 @@ EXPINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS_ARM = \ PRIVATE_DATAFILES = \ - sha1.h + sha1.h \ + sha2.h \ + des.h \ + aes.h aesxts.h \ INSTALL_MI_DIR = crypto @@ -41,6 +38,11 @@ EXPORT_MI_DIR = ${INSTALL_MI_DIR} INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} +# We use this to install aesxts.h in Kernel.framework/PrivateHeaders +# in addition to Kernel.framework/PrivateHeaders/crypto +# This should be removed once all clients are switched to include libkern/crypto/aesxts.h +INSTALL_KF_MD_LCL_LIST = aesxts.h + include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/crypto/aes.h b/bsd/crypto/aes.h new file mode 100644 index 000000000..9fd55fdfa --- /dev/null +++ b/bsd/crypto/aes.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2012 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * This header file is kept for legacy reasons and may be removed in + * future; the interface resides in . 
+ */
+#include <libkern/crypto/aes.h>
diff --git a/bsd/crypto/aes/Assert.c b/bsd/crypto/aes/Assert.c
deleted file mode 100644
index 5ba9c4472..000000000
--- a/bsd/crypto/aes/Assert.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*	This module exists solely to check compile-time assertions.  It should be
-	compiled when building the project, and building should be terminated if
-	errors are encountered.  However, any object it produces need not be
-	included in the build.
-*/
-
-
-#include <stddef.h>
-
-#include "crypto/aes.h"
-#include "Context.h"
-
-/*	Declare CheckAssertion so that if any of the declarations below differ
-	from it, the compiler will report an error.
-*/
-extern char CheckAssertion[1];
-
-/*	Ensure that ContextKey is the offset of the ks member of the AES context
-	structures.
-*/
-extern char CheckAssertion[ContextKey == offsetof(aes_encrypt_ctx, ks)];
-extern char CheckAssertion[ContextKey == offsetof(aes_decrypt_ctx, ks)];
-	/*	If these assertions fail, change the definition of ContextKey in
-		Context.h to match the offset of the ks field.
-	*/
-
-/*	Ensure that ContextKeyLength is the offset of the inf member of the AES
-	context structures.
-*/
-extern char CheckAssertion[ContextKeyLength == offsetof(aes_encrypt_ctx, inf)];
-extern char CheckAssertion[ContextKeyLength == offsetof(aes_decrypt_ctx, inf)];
-	/*	If these assertions fail, change the definition of ContextKeyLength in
-		Context.h to match the offset of the inf field.
-	*/
diff --git a/bsd/crypto/aes/aes.h b/bsd/crypto/aes/aes.h
deleted file mode 100755
index 49c845da6..000000000
--- a/bsd/crypto/aes/aes.h
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- ---------------------------------------------------------------------------
- Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software in both source and binary
- form is allowed (with or without changes) provided that:
-
-   1. distributions of this source code include the above copyright
-      notice, this list of conditions and the following disclaimer;
-
-   2. distributions in binary form include the above copyright
-      notice, this list of conditions and the following disclaimer
-      in the documentation and/or other associated materials;
-
-   3. the copyright holder's name is not used to endorse products
-      built using this software without specific written permission.
-
- ALTERNATIVELY, provided that this notice is retained in full, this product
- may be distributed under the terms of the GNU General Public License (GPL),
- in which case the provisions of the GPL apply INSTEAD OF those given above.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
- Issue 31/01/2006
-
- This file contains the definitions required to use AES in C. See aesopt.h
- for optimisation details.
-*/ - -#ifndef _AES_H -#define _AES_H - - -#if defined(__cplusplus) -extern "C" -{ -#endif - -#define AES_128 /* define if AES with 128 bit keys is needed */ -#define AES_192 /* define if AES with 192 bit keys is needed */ -#define AES_256 /* define if AES with 256 bit keys is needed */ -#define AES_VAR /* define if a variable key size is needed */ -#define AES_MODES /* define if support is needed for modes */ - -/* The following must also be set in assembler files if being used */ - -#define AES_ENCRYPT /* if support for encryption is needed */ -#define AES_DECRYPT /* if support for decryption is needed */ -#define AES_ERR_CHK /* for parameter checks & error return codes */ -#define AES_REV_DKS /* define to reverse decryption key schedule */ - -#define AES_BLOCK_SIZE 16 /* the AES block size in bytes */ -#define N_COLS 4 /* the number of columns in the state */ - -typedef unsigned int uint_32t; -typedef unsigned char uint_8t; -typedef unsigned short uint_16t; -typedef unsigned char aes_08t; -typedef unsigned int aes_32t; - -#define void_ret void -#define int_ret int - -/* The key schedule length is 11, 13 or 15 16-byte blocks for 128, */ -/* 192 or 256-bit keys respectively. That is 176, 208 or 240 bytes */ -/* or 44, 52 or 60 32-bit words. */ - -#if defined( AES_VAR ) || defined( AES_256 ) -#define KS_LENGTH 60 -#elif defined( AES_192 ) -#define KS_LENGTH 52 -#else -#define KS_LENGTH 44 -#endif - - -#if 0 // defined (__i386__) || defined (__x86_64__) - -/* - looks like no other code for (i386/x86_64) is using the following definitions any more. - I comment this out, so the C code in the directory gen/ can be used to compile for test/development purpose. - Note : this is not going to change anything in the i386/x86_64 kernel. - (source code in i386/, mostly in assembly, does not reference to this header file.) - - cclee 10-20-2010 -*/ - -/* the character array 'inf' in the following structures is used */ -/* to hold AES context information. This AES code uses cx->inf.b[0] */ -/* to hold the number of rounds multiplied by 16. 
The other three */ -/* elements can be used by code that implements additional modes */ - -#if defined( AES_ERR_CHK ) -#define aes_rval int_ret -#else -#define aes_rval void_ret -#endif - -typedef union -{ uint_32t l; - uint_8t b[4]; -} aes_inf; - -typedef struct -{ uint_32t ks[KS_LENGTH]; - aes_inf inf; -} aes_encrypt_ctx; - -typedef struct -{ uint_32t ks[KS_LENGTH]; - aes_inf inf; -} aes_decrypt_ctx; - -#else - -#if defined( AES_ERR_CHK ) -#define aes_ret int -#define aes_good 0 -#define aes_error -1 -#else -#define aes_ret void -#endif - -#define aes_rval aes_ret - -typedef struct -{ aes_32t ks[KS_LENGTH]; - aes_32t rn; -} aes_encrypt_ctx; - -typedef struct -{ aes_32t ks[KS_LENGTH]; - aes_32t rn; -} aes_decrypt_ctx; - -#endif - -typedef struct -{ - aes_decrypt_ctx decrypt; - aes_encrypt_ctx encrypt; -} aes_ctx; - - -/* implemented in case of wrong call for fixed tables */ - -void gen_tabs(void); - - -/* Key lengths in the range 16 <= key_len <= 32 are given in bytes, */ -/* those in the range 128 <= key_len <= 256 are given in bits */ - -#if defined( AES_ENCRYPT ) - -#if defined(AES_128) || defined(AES_VAR) -aes_rval aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]); -#endif - -#if defined(AES_192) || defined(AES_VAR) -aes_rval aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]); -#endif - -#if defined(AES_256) || defined(AES_VAR) -aes_rval aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]); -#endif - -#if defined(AES_VAR) -aes_rval aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1]); -#endif - -#if defined (__i386__) || defined (__x86_64__) -aes_rval aes_encrypt(const unsigned char *in, unsigned char *out, const aes_encrypt_ctx cx[1]); -#endif - -aes_rval aes_encrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigned int num_blk, - unsigned char *out_blk, const aes_encrypt_ctx cx[1]); - -#endif - -#if defined( AES_DECRYPT ) - -#if defined(AES_128) || defined(AES_VAR) -aes_rval aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]); -#endif - -#if defined(AES_192) || defined(AES_VAR) -aes_rval aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]); -#endif - -#if defined(AES_256) || defined(AES_VAR) -aes_rval aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]); -#endif - -#if defined(AES_VAR) -aes_rval aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1]); -#endif - -#if defined (__i386__) || defined (__x86_64__) -aes_rval aes_decrypt(const unsigned char *in, unsigned char *out, const aes_decrypt_ctx cx[1]); -#endif - -aes_rval aes_decrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigned int num_blk, - unsigned char *out_blk, const aes_decrypt_ctx cx[1]); - - -#endif - - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/bsd/crypto/aes/gen/Makefile b/bsd/crypto/aes/gen/Makefile deleted file mode 100644 index d32c71c39..000000000 --- a/bsd/crypto/aes/gen/Makefile +++ /dev/null @@ -1,30 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -INSTINC_SUBDIRS = \ - -INSTINC_SUBDIRS_I386 = \ - -EXPINC_SUBDIRS = \ - -EXPINC_SUBDIRS_I386 = \ - -PRIVATE_DATAFILES = \ - aestab.h aesopt.h - -INSTALL_MI_DIR = crypto - -EXPORT_MI_DIR = ${INSTALL_MI_DIR} - -INSTALL_KF_MI_LIST = - 
-INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} - -include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/bsd/crypto/aes/gen/aescrypt.c b/bsd/crypto/aes/gen/aescrypt.c deleted file mode 100644 index 31d4c81af..000000000 --- a/bsd/crypto/aes/gen/aescrypt.c +++ /dev/null @@ -1,411 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 28/01/2004 - - This file contains the code for implementing encryption and decryption - for AES (Rijndael) for block and key sizes of 16, 24 and 32 bytes. It - can optionally be replaced by code written in assembler using NASM. For - further details see the file aesopt.h -*/ - -#include "aesopt.h" -#include "aestab.h" - -#if defined(__cplusplus) -extern "C" -{ -#endif - -#define ki(y,x,k,c) (s(y,c) = s(x, c) ^ (k)[c]) -#define xo(y,x,c) (s(y,c) ^= s(x, c)) -#define si(y,x,c) (s(y,c) = word_in(x, c)) -#define so(y,x,c) word_out(y, c, s(x,c)) - -#if defined(ARRAYS) -#define locals(y,x) x[4],y[4] -#else -#define locals(y,x) x##0,x##1,x##2,x##3,y##0,y##1,y##2,y##3 -#endif - -#define dtables(tab) const aes_32t *tab##0, *tab##1, *tab##2, *tab##3 -#define itables(tab) tab##0 = tab[0]; tab##1 = tab[1]; tab##2 = tab[2]; tab##3 = tab[3] - -#define l_copy(y, x) s(y,0) = s(x,0); s(y,1) = s(x,1); \ - s(y,2) = s(x,2); s(y,3) = s(x,3); - -#define key_in(y,x,k) ki(y,x,k,0); ki(y,x,k,1); ki(y,x,k,2); ki(y,x,k,3) -#define cbc(y,x) xo(y,x,0); xo(y,x,1); xo(y,x,2); xo(y,x,3) -#define state_in(y,x) si(y,x,0); si(y,x,1); si(y,x,2); si(y,x,3) -#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3) -#define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); rm(y,x,k,3) - -#if defined(ENCRYPTION) && !defined(AES_ASM) - -/* Visual C++ .Net v7.1 provides the fastest encryption code when using - Pentium optimiation with small code but this is poor for decryption - so we need to control this with the following VC++ pragmas -*/ - -#if defined(_MSC_VER) -#pragma optimize( "s", on ) -#endif - -/* Given the column (c) of the output state variable, the following - macros give the input state variables which are needed in its - computation for each row (r) of the state. All the alternative - macros give the same end values but expand into different ways - of calculating these values. 
In particular the complex macro - used for dynamically variable block sizes is designed to expand - to a compile time constant whenever possible but will expand to - conditional clauses on some branches (I am grateful to Frank - Yellin for this construction) -*/ - -#define fwd_var(x,r,c)\ - ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\ - : r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\ - : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\ - : ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2))) - -#if defined(FT4_SET) -#undef dec_fmvars -# if defined(ENC_ROUND_CACHE_TABLES) -#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_fn,fwd_var,rf1,c)) -# else -#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_fn,fwd_var,rf1,c)) -# endif -#elif defined(FT1_SET) -#undef dec_fmvars -#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,upr,t_fn,fwd_var,rf1,c)) -#else -#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ fwd_mcol(no_table(x,t_sbox,fwd_var,rf1,c))) -#endif - -#if defined(FL4_SET) -# if defined(LAST_ENC_ROUND_CACHE_TABLES) -#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_fl,fwd_var,rf1,c)) -# else -#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_fl,fwd_var,rf1,c)) -# endif -#elif defined(FL1_SET) -#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,ups,t_fl,fwd_var,rf1,c)) -#else -#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ no_table(x,t_sbox,fwd_var,rf1,c)) -#endif - -aes_rval aes_encrypt_cbc(const unsigned char *in, const unsigned char *in_iv, unsigned int num_blk, - unsigned char *out, const aes_encrypt_ctx cx[1]) -{ aes_32t locals(b0, b1); - const aes_32t *kp; - const aes_32t *kptr = cx->ks; -#if defined(ENC_ROUND_CACHE_TABLES) - dtables(t_fn); -#endif -#if defined(LAST_ENC_ROUND_CACHE_TABLES) - dtables(t_fl); -#endif - -#if defined( dec_fmvars ) - dec_fmvars; /* declare variables for fwd_mcol() if needed */ -#endif - -#if defined( AES_ERR_CHK ) - if( cx->rn != 10 && cx->rn != 12 && cx->rn != 14 ) - return aes_error; -#endif - - // Load IV into b0. - state_in(b0, in_iv); - - for (;num_blk; in += AES_BLOCK_SIZE, out += AES_BLOCK_SIZE, --num_blk) - { - kp = kptr; -#if 0 - // Read the plaintext into b1 - state_in(b1, in); - // Do the CBC with b0 which is either the iv or the ciphertext of the previous block. - cbc(b1, b0); - - // Xor b1 with the key schedule to get things started. 
- key_in(b0, b1, kp); -#else - // Since xor is associative we mess with the ordering here to get the loads started early - key_in(b1, b0, kp); // Xor b0(IV) with the key schedule and assign to b1 - state_in(b0, in); // Load block into b0 - cbc(b0, b1); // Xor b0 with b1 and store in b0 -#endif - -#if defined(ENC_ROUND_CACHE_TABLES) - itables(t_fn); -#endif - -#if (ENC_UNROLL == FULL) - - switch(cx->rn) - { - case 14: - round(fwd_rnd, b1, b0, kp + 1 * N_COLS); - round(fwd_rnd, b0, b1, kp + 2 * N_COLS); - kp += 2 * N_COLS; - case 12: - round(fwd_rnd, b1, b0, kp + 1 * N_COLS); - round(fwd_rnd, b0, b1, kp + 2 * N_COLS); - kp += 2 * N_COLS; - case 10: - default: - round(fwd_rnd, b1, b0, kp + 1 * N_COLS); - round(fwd_rnd, b0, b1, kp + 2 * N_COLS); - round(fwd_rnd, b1, b0, kp + 3 * N_COLS); - round(fwd_rnd, b0, b1, kp + 4 * N_COLS); - round(fwd_rnd, b1, b0, kp + 5 * N_COLS); - round(fwd_rnd, b0, b1, kp + 6 * N_COLS); - round(fwd_rnd, b1, b0, kp + 7 * N_COLS); - round(fwd_rnd, b0, b1, kp + 8 * N_COLS); - round(fwd_rnd, b1, b0, kp + 9 * N_COLS); -#if defined(LAST_ENC_ROUND_CACHE_TABLES) - itables(t_fl); -#endif - round(fwd_lrnd, b0, b1, kp +10 * N_COLS); - } - -#else - - { aes_32t rnd; -#if (ENC_UNROLL == PARTIAL) - for(rnd = 0; rnd < (cx->rn >> 1) - 1; ++rnd) - { - kp += N_COLS; - round(fwd_rnd, b1, b0, kp); - kp += N_COLS; - round(fwd_rnd, b0, b1, kp); - } - kp += N_COLS; - round(fwd_rnd, b1, b0, kp); -#else - for(rnd = 0; rnd < cx->rn - 1; ++rnd) - { - kp += N_COLS; - round(fwd_rnd, b1, b0, kp); - l_copy(b0, b1); - } -#endif -#if defined(LAST_ENC_ROUND_CACHE_TABLES) - itables(t_fl); -#endif - kp += N_COLS; - round(fwd_lrnd, b0, b1, kp); - } -#endif - - state_out(out, b0); - } - -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(DECRYPTION) && !defined(AES_ASM) - -/* Visual C++ .Net v7.1 provides the fastest encryption code when using - Pentium optimiation with small code but this is poor for decryption - so we need to control this with the following VC++ pragmas -*/ - -#if defined(_MSC_VER) -#pragma optimize( "t", on ) -#endif - -/* Given the column (c) of the output state variable, the following - macros give the input state variables which are needed in its - computation for each row (r) of the state. All the alternative - macros give the same end values but expand into different ways - of calculating these values. In particular the complex macro - used for dynamically variable block sizes is designed to expand - to a compile time constant whenever possible but will expand to - conditional clauses on some branches (I am grateful to Frank - Yellin for this construction) -*/ - -#define inv_var(x,r,c)\ - ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\ - : r == 1 ? ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2))\ - : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\ - : ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? 
s(x,3) : s(x,0))) - -#if defined(IT4_SET) -#undef dec_imvars -# if defined(DEC_ROUND_CACHE_TABLES) -#define inv_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_in,inv_var,rf1,c)) -# else -#define inv_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_in,inv_var,rf1,c)) -# endif -#elif defined(IT1_SET) -#undef dec_imvars -#define inv_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,upr,t_in,inv_var,rf1,c)) -#else -#define inv_rnd(y,x,k,c) (s(y,c) = inv_mcol((k)[c] ^ no_table(x,t_ibox,inv_var,rf1,c))) -#endif - -#if defined(IL4_SET) -# if defined(LAST_DEC_ROUND_CACHE_TABLES) -#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_il,inv_var,rf1,c)) -# else -#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_il,inv_var,rf1,c)) -# endif -#elif defined(IL1_SET) -#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,ups,t_il,inv_var,rf1,c)) -#else -#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ no_table(x,t_ibox,inv_var,rf1,c)) -#endif - -aes_rval aes_decrypt_cbc(const unsigned char *in, const unsigned char *in_iv, unsigned int num_blk, - unsigned char *out, const aes_decrypt_ctx cx[1]) -{ aes_32t locals(b0, b1); - const aes_32t *kptr = cx->ks + cx->rn * N_COLS; - const aes_32t *kp; -#if defined(DEC_ROUND_CACHE_TABLES) - dtables(t_in); -#endif -#if defined(LAST_DEC_ROUND_CACHE_TABLES) - dtables(t_il); -#endif - -#if defined( dec_imvars ) - dec_imvars; /* declare variables for inv_mcol() if needed */ -#endif - -#if defined( AES_ERR_CHK ) - if( cx->rn != 10 && cx->rn != 12 && cx->rn != 14 ) - return aes_error; -#endif - -#if defined(DEC_ROUND_CACHE_TABLES) - itables(t_in); -#endif - - in += AES_BLOCK_SIZE * (num_blk - 1); - out += AES_BLOCK_SIZE * (num_blk - 1); - // Load the last block's ciphertext into b1 - state_in(b1, in); - - for (;num_blk; out -= AES_BLOCK_SIZE, --num_blk) - { - kp = kptr; - // Do the xor part of state_in, where b1 is the previous block's ciphertext. 
- key_in(b0, b1, kp); - -#if (DEC_UNROLL == FULL) - - switch(cx->rn) - { - case 14: - round(inv_rnd, b1, b0, kp - 1 * N_COLS); - round(inv_rnd, b0, b1, kp - 2 * N_COLS); - kp -= 2 * N_COLS; - case 12: - round(inv_rnd, b1, b0, kp - 1 * N_COLS); - round(inv_rnd, b0, b1, kp - 2 * N_COLS); - kp -= 2 * N_COLS; - case 10: - default: - round(inv_rnd, b1, b0, kp - 1 * N_COLS); - round(inv_rnd, b0, b1, kp - 2 * N_COLS); - round(inv_rnd, b1, b0, kp - 3 * N_COLS); - round(inv_rnd, b0, b1, kp - 4 * N_COLS); - round(inv_rnd, b1, b0, kp - 5 * N_COLS); - round(inv_rnd, b0, b1, kp - 6 * N_COLS); - round(inv_rnd, b1, b0, kp - 7 * N_COLS); - round(inv_rnd, b0, b1, kp - 8 * N_COLS); - round(inv_rnd, b1, b0, kp - 9 * N_COLS); -#if defined(LAST_DEC_ROUND_CACHE_TABLES) - itables(t_il); -#endif - round(inv_lrnd, b0, b1, kp - 10 * N_COLS); - } - -#else - - { aes_32t rnd; -#if (DEC_UNROLL == PARTIAL) - for(rnd = 0; rnd < (cx->rn >> 1) - 1; ++rnd) - { - kp -= N_COLS; - round(inv_rnd, b1, b0, kp); - kp -= N_COLS; - round(inv_rnd, b0, b1, kp); - } - kp -= N_COLS; - round(inv_rnd, b1, b0, kp); -#else - for(rnd = 0; rnd < cx->rn - 1; ++rnd) - { - kp -= N_COLS; - round(inv_rnd, b1, b0, kp); - l_copy(b0, b1); - } -#endif -#if defined(LAST_DEC_ROUND_CACHE_TABLES) - itables(t_il); -#endif - kp -= N_COLS; - round(inv_lrnd, b0, b1, kp); - } -#endif - - if (num_blk == 1) - { - // We are doing the first block so we need the IV rather than the previous - // block for CBC (there is no previous block) - state_in(b1, in_iv); - } - else - { - in -= AES_BLOCK_SIZE; - state_in(b1, in); - } - - // Do the CBC with b1 which is either the IV or the ciphertext of the previous block. - cbc(b0, b1); - - state_out(out, b0); - } -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(__cplusplus) -} -#endif diff --git a/bsd/crypto/aes/gen/aeskey.c b/bsd/crypto/aes/gen/aeskey.c deleted file mode 100644 index 5e0a6453c..000000000 --- a/bsd/crypto/aes/gen/aeskey.c +++ /dev/null @@ -1,455 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue Date: 26/08/2003 - - This file contains the code for implementing the key schedule for AES - (Rijndael) for block and key sizes of 16, 24, and 32 bytes. See aesopt.h - for further details including optimisation. 
-*/ - -#include "aesopt.h" -#include "aestab.h" - -#if defined(__cplusplus) -extern "C" -{ -#endif - -/* Initialise the key schedule from the user supplied key. The key - length can be specified in bytes, with legal values of 16, 24 - and 32, or in bits, with legal values of 128, 192 and 256. These - values correspond with Nk values of 4, 6 and 8 respectively. - - The following macros implement a single cycle in the key - schedule generation process. The number of cycles needed - for each cx->n_col and nk value is: - - nk = 4 5 6 7 8 - ------------------------------ - cx->n_col = 4 10 9 8 7 7 - cx->n_col = 5 14 11 10 9 9 - cx->n_col = 6 19 15 12 11 11 - cx->n_col = 7 21 19 16 13 14 - cx->n_col = 8 29 23 19 17 14 -*/ - -#define ke4(k,i) \ -{ k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+5] = ss[1] ^= ss[0]; \ - k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \ -} -#define kel4(k,i) \ -{ k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+5] = ss[1] ^= ss[0]; \ - k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \ -} - -#define ke6(k,i) \ -{ k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 7] = ss[1] ^= ss[0]; \ - k[6*(i)+ 8] = ss[2] ^= ss[1]; k[6*(i)+ 9] = ss[3] ^= ss[2]; \ - k[6*(i)+10] = ss[4] ^= ss[3]; k[6*(i)+11] = ss[5] ^= ss[4]; \ -} -#define kel6(k,i) \ -{ k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 7] = ss[1] ^= ss[0]; \ - k[6*(i)+ 8] = ss[2] ^= ss[1]; k[6*(i)+ 9] = ss[3] ^= ss[2]; \ -} - -#define ke8(k,i) \ -{ k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 9] = ss[1] ^= ss[0]; \ - k[8*(i)+10] = ss[2] ^= ss[1]; k[8*(i)+11] = ss[3] ^= ss[2]; \ - k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0); k[8*(i)+13] = ss[5] ^= ss[4]; \ - k[8*(i)+14] = ss[6] ^= ss[5]; k[8*(i)+15] = ss[7] ^= ss[6]; \ -} -#define kel8(k,i) \ -{ k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 9] = ss[1] ^= ss[0]; \ - k[8*(i)+10] = ss[2] ^= ss[1]; k[8*(i)+11] = ss[3] ^= ss[2]; \ -} - -#if defined(ENCRYPTION_KEY_SCHEDULE) - -#if defined(AES_128) || defined(AES_VAR) - -aes_rval aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]) -{ aes_32t ss[4]; - - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - -#if ENC_UNROLL == NONE - { aes_32t i; - - for(i = 0; i < ((11 * N_COLS - 5) / 4); ++i) - ke4(cx->ks, i); - } -#else - ke4(cx->ks, 0); ke4(cx->ks, 1); - ke4(cx->ks, 2); ke4(cx->ks, 3); - ke4(cx->ks, 4); ke4(cx->ks, 5); - ke4(cx->ks, 6); ke4(cx->ks, 7); - ke4(cx->ks, 8); -#endif - kel4(cx->ks, 9); - cx->rn = 10; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_192) || defined(AES_VAR) - -aes_rval aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]) -{ aes_32t ss[6]; - - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - cx->ks[4] = ss[4] = word_in(key, 4); - cx->ks[5] = ss[5] = word_in(key, 5); - -#if ENC_UNROLL == NONE - { aes_32t i; - - for(i = 0; i < (13 * N_COLS - 7) / 6; ++i) - ke6(cx->ks, i); - } -#else - ke6(cx->ks, 0); ke6(cx->ks, 1); - ke6(cx->ks, 2); ke6(cx->ks, 3); - ke6(cx->ks, 4); ke6(cx->ks, 5); - ke6(cx->ks, 6); -#endif - kel6(cx->ks, 7); - cx->rn = 12; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_256) || defined(AES_VAR) - -aes_rval aes_encrypt_key256(const unsigned char *key, 
aes_encrypt_ctx cx[1]) -{ aes_32t ss[8]; - - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - cx->ks[4] = ss[4] = word_in(key, 4); - cx->ks[5] = ss[5] = word_in(key, 5); - cx->ks[6] = ss[6] = word_in(key, 6); - cx->ks[7] = ss[7] = word_in(key, 7); - -#if ENC_UNROLL == NONE - { aes_32t i; - - for(i = 0; i < (15 * N_COLS - 9) / 8; ++i) - ke8(cx->ks, i); - } -#else - ke8(cx->ks, 0); ke8(cx->ks, 1); - ke8(cx->ks, 2); ke8(cx->ks, 3); - ke8(cx->ks, 4); ke8(cx->ks, 5); -#endif - kel8(cx->ks, 6); - cx->rn = 14; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_VAR) - -aes_rval aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1]) -{ - switch(key_len) - { -#if defined( AES_ERR_CHK ) - case 16: case 128: return aes_encrypt_key128(key, cx); - case 24: case 192: return aes_encrypt_key192(key, cx); - case 32: case 256: return aes_encrypt_key256(key, cx); - default: return aes_error; -#else - case 16: case 128: aes_encrypt_key128(key, cx); return; - case 24: case 192: aes_encrypt_key192(key, cx); return; - case 32: case 256: aes_encrypt_key256(key, cx); return; -#endif - } -} - -#endif - -#endif - -#if defined(DECRYPTION_KEY_SCHEDULE) - -#if DEC_ROUND == NO_TABLES -#define ff(x) (x) -#else -#define ff(x) inv_mcol(x) -#if defined( dec_imvars ) -#define d_vars dec_imvars -#endif -#endif - -#if 1 -#define kdf4(k,i) \ -{ ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; ss[1] = ss[1] ^ ss[3]; ss[2] = ss[2] ^ ss[3]; ss[3] = ss[3]; \ - ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; ss[i % 4] ^= ss[4]; \ - ss[4] ^= k[4*(i)]; k[4*(i)+4] = ff(ss[4]); ss[4] ^= k[4*(i)+1]; k[4*(i)+5] = ff(ss[4]); \ - ss[4] ^= k[4*(i)+2]; k[4*(i)+6] = ff(ss[4]); ss[4] ^= k[4*(i)+3]; k[4*(i)+7] = ff(ss[4]); \ -} -#define kd4(k,i) \ -{ ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; ss[i % 4] ^= ss[4]; ss[4] = ff(ss[4]); \ - k[4*(i)+4] = ss[4] ^= k[4*(i)]; k[4*(i)+5] = ss[4] ^= k[4*(i)+1]; \ - k[4*(i)+6] = ss[4] ^= k[4*(i)+2]; k[4*(i)+7] = ss[4] ^= k[4*(i)+3]; \ -} -#define kdl4(k,i) \ -{ ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; ss[i % 4] ^= ss[4]; \ - k[4*(i)+4] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; k[4*(i)+5] = ss[1] ^ ss[3]; \ - k[4*(i)+6] = ss[0]; k[4*(i)+7] = ss[1]; \ -} -#else -#define kdf4(k,i) \ -{ ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+ 4] = ff(ss[0]); ss[1] ^= ss[0]; k[4*(i)+ 5] = ff(ss[1]); \ - ss[2] ^= ss[1]; k[4*(i)+ 6] = ff(ss[2]); ss[3] ^= ss[2]; k[4*(i)+ 7] = ff(ss[3]); \ -} -#define kd4(k,i) \ -{ ss[4] = ls_box(ss[3],3) ^ t_use(r,c)[i]; \ - ss[0] ^= ss[4]; ss[4] = ff(ss[4]); k[4*(i)+ 4] = ss[4] ^= k[4*(i)]; \ - ss[1] ^= ss[0]; k[4*(i)+ 5] = ss[4] ^= k[4*(i)+ 1]; \ - ss[2] ^= ss[1]; k[4*(i)+ 6] = ss[4] ^= k[4*(i)+ 2]; \ - ss[3] ^= ss[2]; k[4*(i)+ 7] = ss[4] ^= k[4*(i)+ 3]; \ -} -#define kdl4(k,i) \ -{ ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+ 4] = ss[0]; ss[1] ^= ss[0]; k[4*(i)+ 5] = ss[1]; \ - ss[2] ^= ss[1]; k[4*(i)+ 6] = ss[2]; ss[3] ^= ss[2]; k[4*(i)+ 7] = ss[3]; \ -} -#endif - -#define kdf6(k,i) \ -{ ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 6] = ff(ss[0]); ss[1] ^= ss[0]; k[6*(i)+ 7] = ff(ss[1]); \ - ss[2] ^= ss[1]; k[6*(i)+ 8] = ff(ss[2]); ss[3] ^= ss[2]; k[6*(i)+ 9] = ff(ss[3]); \ - ss[4] ^= ss[3]; k[6*(i)+10] = ff(ss[4]); ss[5] ^= ss[4]; k[6*(i)+11] = ff(ss[5]); \ -} -#define kd6(k,i) \ -{ ss[6] = ls_box(ss[5],3) ^ t_use(r,c)[i]; \ - ss[0] ^= ss[6]; ss[6] = ff(ss[6]); k[6*(i)+ 6] = ss[6] ^= k[6*(i)]; \ - ss[1] ^= 
ss[0]; k[6*(i)+ 7] = ss[6] ^= k[6*(i)+ 1]; \ - ss[2] ^= ss[1]; k[6*(i)+ 8] = ss[6] ^= k[6*(i)+ 2]; \ - ss[3] ^= ss[2]; k[6*(i)+ 9] = ss[6] ^= k[6*(i)+ 3]; \ - ss[4] ^= ss[3]; k[6*(i)+10] = ss[6] ^= k[6*(i)+ 4]; \ - ss[5] ^= ss[4]; k[6*(i)+11] = ss[6] ^= k[6*(i)+ 5]; \ -} -#define kdl6(k,i) \ -{ ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 6] = ss[0]; ss[1] ^= ss[0]; k[6*(i)+ 7] = ss[1]; \ - ss[2] ^= ss[1]; k[6*(i)+ 8] = ss[2]; ss[3] ^= ss[2]; k[6*(i)+ 9] = ss[3]; \ -} - -#define kdf8(k,i) \ -{ ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 8] = ff(ss[0]); ss[1] ^= ss[0]; k[8*(i)+ 9] = ff(ss[1]); \ - ss[2] ^= ss[1]; k[8*(i)+10] = ff(ss[2]); ss[3] ^= ss[2]; k[8*(i)+11] = ff(ss[3]); \ - ss[4] ^= ls_box(ss[3],0); k[8*(i)+12] = ff(ss[4]); ss[5] ^= ss[4]; k[8*(i)+13] = ff(ss[5]); \ - ss[6] ^= ss[5]; k[8*(i)+14] = ff(ss[6]); ss[7] ^= ss[6]; k[8*(i)+15] = ff(ss[7]); \ -} -#define kd8(k,i) \ -{ aes_32t g = ls_box(ss[7],3) ^ t_use(r,c)[i]; \ - ss[0] ^= g; g = ff(g); k[8*(i)+ 8] = g ^= k[8*(i)]; \ - ss[1] ^= ss[0]; k[8*(i)+ 9] = g ^= k[8*(i)+ 1]; \ - ss[2] ^= ss[1]; k[8*(i)+10] = g ^= k[8*(i)+ 2]; \ - ss[3] ^= ss[2]; k[8*(i)+11] = g ^= k[8*(i)+ 3]; \ - g = ls_box(ss[3],0); \ - ss[4] ^= g; g = ff(g); k[8*(i)+12] = g ^= k[8*(i)+ 4]; \ - ss[5] ^= ss[4]; k[8*(i)+13] = g ^= k[8*(i)+ 5]; \ - ss[6] ^= ss[5]; k[8*(i)+14] = g ^= k[8*(i)+ 6]; \ - ss[7] ^= ss[6]; k[8*(i)+15] = g ^= k[8*(i)+ 7]; \ -} -#define kdl8(k,i) \ -{ ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 8] = ss[0]; ss[1] ^= ss[0]; k[8*(i)+ 9] = ss[1]; \ - ss[2] ^= ss[1]; k[8*(i)+10] = ss[2]; ss[3] ^= ss[2]; k[8*(i)+11] = ss[3]; \ -} - -#if defined(AES_128) || defined(AES_VAR) - -aes_rval aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]) -{ aes_32t ss[5]; -#if defined( d_vars ) - d_vars; -#endif - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - -#if DEC_UNROLL == NONE - { aes_32t i; - - for(i = 0; i < (11 * N_COLS - 5) / 4; ++i) - ke4(cx->ks, i); - kel4(cx->ks, 9); -#if !(DEC_ROUND == NO_TABLES) - for(i = N_COLS; i < 10 * N_COLS; ++i) - cx->ks[i] = inv_mcol(cx->ks[i]); -#endif - } -#else - kdf4(cx->ks, 0); kd4(cx->ks, 1); - kd4(cx->ks, 2); kd4(cx->ks, 3); - kd4(cx->ks, 4); kd4(cx->ks, 5); - kd4(cx->ks, 6); kd4(cx->ks, 7); - kd4(cx->ks, 8); kdl4(cx->ks, 9); -#endif - cx->rn = 10; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_192) || defined(AES_VAR) - -aes_rval aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]) -{ aes_32t ss[7]; -#if defined( d_vars ) - d_vars; -#endif - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - -#if DEC_UNROLL == NONE - cx->ks[4] = ss[4] = word_in(key, 4); - cx->ks[5] = ss[5] = word_in(key, 5); - { aes_32t i; - - for(i = 0; i < (13 * N_COLS - 7) / 6; ++i) - ke6(cx->ks, i); - kel6(cx->ks, 7); -#if !(DEC_ROUND == NO_TABLES) - for(i = N_COLS; i < 12 * N_COLS; ++i) - cx->ks[i] = inv_mcol(cx->ks[i]); -#endif - } -#else - cx->ks[4] = ff(ss[4] = word_in(key, 4)); - cx->ks[5] = ff(ss[5] = word_in(key, 5)); - kdf6(cx->ks, 0); kd6(cx->ks, 1); - kd6(cx->ks, 2); kd6(cx->ks, 3); - kd6(cx->ks, 4); kd6(cx->ks, 5); - kd6(cx->ks, 6); kdl6(cx->ks, 7); -#endif - cx->rn = 12; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_256) || defined(AES_VAR) - -aes_rval aes_decrypt_key256(const unsigned char *key, 
aes_decrypt_ctx cx[1]) -{ aes_32t ss[8]; -#if defined( d_vars ) - d_vars; -#endif - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - -#if DEC_UNROLL == NONE - cx->ks[4] = ss[4] = word_in(key, 4); - cx->ks[5] = ss[5] = word_in(key, 5); - cx->ks[6] = ss[6] = word_in(key, 6); - cx->ks[7] = ss[7] = word_in(key, 7); - { aes_32t i; - - for(i = 0; i < (15 * N_COLS - 9) / 8; ++i) - ke8(cx->ks, i); - kel8(cx->ks, i); -#if !(DEC_ROUND == NO_TABLES) - for(i = N_COLS; i < 14 * N_COLS; ++i) - cx->ks[i] = inv_mcol(cx->ks[i]); - -#endif - } -#else - cx->ks[4] = ff(ss[4] = word_in(key, 4)); - cx->ks[5] = ff(ss[5] = word_in(key, 5)); - cx->ks[6] = ff(ss[6] = word_in(key, 6)); - cx->ks[7] = ff(ss[7] = word_in(key, 7)); - kdf8(cx->ks, 0); kd8(cx->ks, 1); - kd8(cx->ks, 2); kd8(cx->ks, 3); - kd8(cx->ks, 4); kd8(cx->ks, 5); - kdl8(cx->ks, 6); -#endif - cx->rn = 14; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_VAR) - -aes_rval aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1]) -{ - switch(key_len) - { -#if defined( AES_ERR_CHK ) - case 16: case 128: return aes_decrypt_key128(key, cx); - case 24: case 192: return aes_decrypt_key192(key, cx); - case 32: case 256: return aes_decrypt_key256(key, cx); - default: return aes_error; -#else - case 16: case 128: aes_decrypt_key128(key, cx); return; - case 24: case 192: aes_decrypt_key192(key, cx); return; - case 32: case 256: aes_decrypt_key256(key, cx); return; -#endif - } -} - -#endif - -#endif - -#if defined(__cplusplus) -} -#endif diff --git a/bsd/crypto/aes/gen/aesopt.h b/bsd/crypto/aes/gen/aesopt.h deleted file mode 100644 index a00794865..000000000 --- a/bsd/crypto/aes/gen/aesopt.h +++ /dev/null @@ -1,736 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 28/01/2004 - - My thanks go to Dag Arne Osvik for devising the schemes used here for key - length derivation from the form of the key schedule - - This file contains the compilation options for AES (Rijndael) and code - that is common across encryption, key scheduling and table generation. - - OPERATION - - These source code files implement the AES algorithm Rijndael designed by - Joan Daemen and Vincent Rijmen. 
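[Editor's aside — stepping back to the key-setup dispatchers a few lines up: aes_encrypt_key() and aes_decrypt_key() accept their key_len argument in either bytes or bits. A hedged sketch of that normalisation on its own, with an illustrative name:]

#include <stddef.h>

/* Map the six accepted spellings (16/24/32 bytes or 128/192/256 bits)
   to a byte count; 0 mirrors the aes_error path for anything else. */
static size_t aes_key_bytes(int key_len)
{
    switch (key_len) {
    case 16: case 128: return 16;
    case 24: case 192: return 24;
    case 32: case 256: return 32;
    default:           return 0;
    }
}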
This version is designed for the standard - block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24 - and 32 bytes). - - This version is designed for flexibility and speed using operations on - 32-bit words rather than operations on bytes. It can be compiled with - either big or little endian internal byte order but is faster when the - native byte order for the processor is used. - - THE CIPHER INTERFACE - - The cipher interface is implemented as an array of bytes in which lower - AES bit sequence indexes map to higher numeric significance within bytes. - - aes_08t (an unsigned 8-bit type) - aes_32t (an unsigned 32-bit type) - struct aes_encrypt_ctx (structure for the cipher encryption context) - struct aes_decrypt_ctx (structure for the cipher decryption context) - aes_rval the function return type - - C subroutine calls: - - aes_rval aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]); - aes_rval aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]); - aes_rval aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]); - aes_rval aes_encrypt(const unsigned char *in, unsigned char *out, - const aes_encrypt_ctx cx[1]); - - aes_rval aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]); - aes_rval aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]); - aes_rval aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]); - aes_rval aes_decrypt(const unsigned char *in, unsigned char *out, - const aes_decrypt_ctx cx[1]); - - IMPORTANT NOTE: If you are using this C interface with dynamic tables make sure that - you call genTabs() before AES is used so that the tables are initialised. - - C++ aes class subroutines: - - Class AESencrypt for encryption - - Construtors: - AESencrypt(void) - AESencrypt(const unsigned char *key) - 128 bit key - Members: - aes_rval key128(const unsigned char *key) - aes_rval key192(const unsigned char *key) - aes_rval key256(const unsigned char *key) - aes_rval encrypt(const unsigned char *in, unsigned char *out) const - - Class AESdecrypt for encryption - Construtors: - AESdecrypt(void) - AESdecrypt(const unsigned char *key) - 128 bit key - Members: - aes_rval key128(const unsigned char *key) - aes_rval key192(const unsigned char *key) - aes_rval key256(const unsigned char *key) - aes_rval decrypt(const unsigned char *in, unsigned char *out) const - - COMPILATION - - The files used to provide AES (Rijndael) are - - a. aes.h for the definitions needed for use in C. - b. aescpp.h for the definitions needed for use in C++. - c. aesopt.h for setting compilation options (also includes common code). - d. aescrypt.c for encryption and decrytpion, or - e. aeskey.c for key scheduling. - f. aestab.c for table loading or generation. - g. aescrypt.asm for encryption and decryption using assembler code. - h. aescrypt.mmx.asm for encryption and decryption using MMX assembler. - - To compile AES (Rijndael) for use in C code use aes.h and set the - defines here for the facilities you need (key lengths, encryption - and/or decryption). Do not define AES_DLL or AES_CPP. Set the options - for optimisations and table sizes here. - - To compile AES (Rijndael) for use in in C++ code use aescpp.h but do - not define AES_DLL - - To compile AES (Rijndael) in C as a Dynamic Link Library DLL) use - aes.h and include the AES_DLL define. - - CONFIGURATION OPTIONS (here and in aes.h) - - a. set AES_DLL in aes.h if AES (Rijndael) is to be compiled as a DLL - b. 
You may need to set PLATFORM_BYTE_ORDER to define the byte order. - c. If you want the code to run in a specific internal byte order, then - ALGORITHM_BYTE_ORDER must be set accordingly. - d. set other configuration options decribed below. -*/ - -#if !defined( _AESOPT_H ) -#define _AESOPT_H - -#include - -/* CONFIGURATION - USE OF DEFINES - - Later in this section there are a number of defines that control the - operation of the code. In each section, the purpose of each define is - explained so that the relevant form can be included or excluded by - setting either 1's or 0's respectively on the branches of the related - #if clauses. - - PLATFORM SPECIFIC INCLUDES AND BYTE ORDER IN 32-BIT WORDS - - To obtain the highest speed on processors with 32-bit words, this code - needs to determine the byte order of the target machine. The following - block of code is an attempt to capture the most obvious ways in which - various environemnts define byte order. It may well fail, in which case - the definitions will need to be set by editing at the points marked - **** EDIT HERE IF NECESSARY **** below. My thanks go to Peter Gutmann - for his assistance with this endian detection nightmare. -*/ - -#define BRG_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ -#define BRG_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ - -#if defined(__GNUC__) || defined(__GNU_LIBRARY__) -# if defined(__FreeBSD__) || defined(__OpenBSD__) -# include -# elif defined( BSD ) && BSD >= 199103 -# include -# elif defined(__APPLE__) -# if defined(__BIG_ENDIAN__) && !defined( BIG_ENDIAN ) -# define BIG_ENDIAN -# elif defined(__LITTLE_ENDIAN__) && !defined( LITTLE_ENDIAN ) -# define LITTLE_ENDIAN -# endif -# else -# include -# if defined(__BEOS__) -# include -# endif -# endif -#endif - -#if !defined(PLATFORM_BYTE_ORDER) -# if defined(LITTLE_ENDIAN) || defined(BIG_ENDIAN) -# if defined(LITTLE_ENDIAN) && !defined(BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif !defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# elif defined(BYTE_ORDER) && (BYTE_ORDER == LITTLE_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif defined(BYTE_ORDER) && (BYTE_ORDER == BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# endif -# elif defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN) -# if defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif !defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# elif defined(_BYTE_ORDER) && (_BYTE_ORDER == _LITTLE_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif defined(_BYTE_ORDER) && (_BYTE_ORDER == _BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# endif -# elif defined(__LITTLE_ENDIAN__) || defined(__BIG_ENDIAN__) -# if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif !defined(__LITTLE_ENDIAN__) && defined(__BIG_ENDIAN__) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __LITTLE_ENDIAN__) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __BIG_ENDIAN__) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# endif -# endif -#endif - -/* if the platform is still unknown, try to find its byte order */ -/* from commonly used machine defines */ - -#if !defined(PLATFORM_BYTE_ORDER) - -#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) 
|| \ - defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ - defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ - defined( vax ) || defined( vms ) || defined( VMS ) || \ - defined( __VMS ) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN - -#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ - defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ - defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ - defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ - defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ - defined( __TANDEM ) || defined( THINK_C ) || defined( __VMCMS__ ) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN - -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -#else -# error Please edit aesopt.h (line 234 or 236) to set the platform byte order -#endif - -#endif - -/* SOME LOCAL DEFINITIONS */ - -#define NO_TABLES 0 -#define ONE_TABLE 1 -#define FOUR_TABLES 4 -#define NONE 0 -#define PARTIAL 1 -#define FULL 2 - -#if defined(bswap32) -#define aes_sw32 bswap32 -#elif defined(bswap_32) -#define aes_sw32 bswap_32 -#else -#define brot(x,n) (((aes_32t)(x) << n) | ((aes_32t)(x) >> (32 - n))) -#define aes_sw32(x) ((brot((x),8) & 0x00ff00ff) | (brot((x),24) & 0xff00ff00)) -#endif - -/* 1. FUNCTIONS REQUIRED - - This implementation provides subroutines for encryption, decryption - and for setting the three key lengths (separately) for encryption - and decryption. When the assembler code is not being used the following - definition blocks allow the selection of the routines that are to be - included in the compilation. -*/ -#if defined( AES_ENCRYPT ) -#define ENCRYPTION -#define ENCRYPTION_KEY_SCHEDULE -#endif - -#if defined( AES_DECRYPT ) -#define DECRYPTION -#define DECRYPTION_KEY_SCHEDULE -#endif - -/* 2. ASSEMBLER SUPPORT - - This define (which can be on the command line) enables the use of the - assembler code routines for encryption and decryption with the C code - only providing key scheduling -*/ - -/* 3. BYTE ORDER WITHIN 32 BIT WORDS - - The fundamental data processing units in Rijndael are 8-bit bytes. The - input, output and key input are all enumerated arrays of bytes in which - bytes are numbered starting at zero and increasing to one less than the - number of bytes in the array in question. This enumeration is only used - for naming bytes and does not imply any adjacency or order relationship - from one byte to another. When these inputs and outputs are considered - as bit sequences, bits 8*n to 8*n+7 of the bit sequence are mapped to - byte[n] with bit 8n+i in the sequence mapped to bit 7-i within the byte. - In this implementation bits are numbered from 0 to 7 starting at the - numerically least significant end of each byte (bit n represents 2^n). - - However, Rijndael can be implemented more efficiently using 32-bit - words by packing bytes into words so that bytes 4*n to 4*n+3 are placed - into word[n]. While in principle these bytes can be assembled into words - in any positions, this implementation only supports the two formats in - which bytes in adjacent positions within words also have adjacent byte - numbers. This order is called big-endian if the lowest numbered bytes - in words have the highest numeric significance and little-endian if the - opposite applies. 
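[Editor's aside — section 3 above describes packing state bytes 4n..4n+3 into word n in either byte order. A small self-contained illustration of the two packings; the macro names are mine, not the originals:]

#include <stdint.h>
#include <stdio.h>

#define B2W_LE(b0,b1,b2,b3) /* byte 0 least significant, as on i386 */ \
    (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | \
     ((uint32_t)(b1) <<  8) |  (uint32_t)(b0))

#define B2W_BE(b0,b1,b2,b3) /* byte 0 most significant */ \
    (((uint32_t)(b0) << 24) | ((uint32_t)(b1) << 16) | \
     ((uint32_t)(b2) <<  8) |  (uint32_t)(b3))

int main(void)
{
    /* The same four state bytes give different 32-bit words. */
    printf("little-endian packing: %08x\n", B2W_LE(1, 2, 3, 4)); /* 04030201 */
    printf("big-endian packing:    %08x\n", B2W_BE(1, 2, 3, 4)); /* 01020304 */
    return 0;
}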
- - This code can work in either order irrespective of the order used by the - machine on which it runs. Normally the internal byte order will be set - to the order of the processor on which the code is to be run but this - define can be used to reverse this in special situations - - NOTE: Assembler code versions rely on PLATFORM_BYTE_ORDER being set -*/ -#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER - -/* 4. FAST INPUT/OUTPUT OPERATIONS. - - On some machines it is possible to improve speed by transferring the - bytes in the input and output arrays to and from the internal 32-bit - variables by addressing these arrays as if they are arrays of 32-bit - words. On some machines this will always be possible but there may - be a large performance penalty if the byte arrays are not aligned on - the normal word boundaries. On other machines this technique will - lead to memory access errors when such 32-bit word accesses are not - properly aligned. The option SAFE_IO avoids such problems but will - often be slower on those machines that support misaligned access - (especially so if care is taken to align the input and output byte - arrays on 32-bit word boundaries). If SAFE_IO is not defined it is - assumed that access to byte arrays as if they are arrays of 32-bit - words will not cause problems when such accesses are misaligned. -*/ - -/* 5. LOOP UNROLLING - - The code for encryption and decrytpion cycles through a number of rounds - that can be implemented either in a loop or by expanding the code into a - long sequence of instructions, the latter producing a larger program but - one that will often be much faster. The latter is called loop unrolling. - There are also potential speed advantages in expanding two iterations in - a loop with half the number of iterations, which is called partial loop - unrolling. The following options allow partial or full loop unrolling - to be set independently for encryption and decryption -*/ -#if 1 -#define ENC_UNROLL FULL -#elif 0 -#define ENC_UNROLL PARTIAL -#else -#define ENC_UNROLL NONE -#endif - -#if 1 -#define DEC_UNROLL FULL -#elif 0 -#define DEC_UNROLL PARTIAL -#else -#define DEC_UNROLL NONE -#endif - -/* 6. FAST FINITE FIELD OPERATIONS - - If this section is included, tables are used to provide faster finite - field arithmetic (this has no effect if FIXED_TABLES is defined). -*/ -#if 1 -#define FF_TABLES -#endif - -/* 7. INTERNAL STATE VARIABLE FORMAT - - The internal state of Rijndael is stored in a number of local 32-bit - word varaibles which can be defined either as an array or as individual - names variables. Include this section if you want to store these local - varaibles in arrays. Otherwise individual local variables will be used. -*/ -#if 0 -#define ARRAYS -#endif - -/* In this implementation the columns of the state array are each held in - 32-bit words. The state array can be held in various ways: in an array - of words, in a number of individual word variables or in a number of - processor registers. The following define maps a variable name x and - a column number c to the way the state array variable is to be held. - The first define below maps the state into an array x[c] whereas the - second form maps the state into a number of individual variables x0, - x1, etc. Another form could map individual state colums to machine - register names. -*/ - -#if defined(ARRAYS) -#define s(x,c) x[c] -#else -#define s(x,c) x##c -#endif - -/* 8. 
FIXED OR DYNAMIC TABLES - - When this section is included the tables used by the code are compiled - statically into the binary file. Otherwise the subroutine gen_tabs() - must be called to compute them before the code is first used. -*/ -#if 1 -#define FIXED_TABLES -#endif - -/* 9. TABLE ALIGNMENT - - On some sytsems speed will be improved by aligning the AES large lookup - tables on particular boundaries. This define should be set to a power of - two giving the desired alignment. It can be left undefined if alignment - is not needed. This option is specific to the Microsft VC++ compiler - - it seems to sometimes cause trouble for the VC++ version 6 compiler. -*/ - - -/* 10. INTERNAL TABLE CONFIGURATION - - This cipher proceeds by repeating in a number of cycles known as 'rounds' - which are implemented by a round function which can optionally be speeded - up using tables. The basic tables are each 256 32-bit words, with either - one or four tables being required for each round function depending on - how much speed is required. The encryption and decryption round functions - are different and the last encryption and decrytpion round functions are - different again making four different round functions in all. - - This means that: - 1. Normal encryption and decryption rounds can each use either 0, 1 - or 4 tables and table spaces of 0, 1024 or 4096 bytes each. - 2. The last encryption and decryption rounds can also use either 0, 1 - or 4 tables and table spaces of 0, 1024 or 4096 bytes each. - - Include or exclude the appropriate definitions below to set the number - of tables used by this implementation. -*/ - -#if 1 /* set tables for the normal encryption round */ -#define ENC_ROUND FOUR_TABLES -#elif 0 -#define ENC_ROUND ONE_TABLE -#else -#define ENC_ROUND NO_TABLES -#endif - -#if 1 /* set tables for the last encryption round */ -#define LAST_ENC_ROUND FOUR_TABLES -#elif 0 -#define LAST_ENC_ROUND ONE_TABLE -#else -#define LAST_ENC_ROUND NO_TABLES -#endif - -#if 1 /* set tables for the normal decryption round */ -#define DEC_ROUND FOUR_TABLES -#elif 0 -#define DEC_ROUND ONE_TABLE -#else -#define DEC_ROUND NO_TABLES -#endif - -#if 1 /* set tables for the last decryption round */ -#define LAST_DEC_ROUND FOUR_TABLES -#elif 0 -#define LAST_DEC_ROUND ONE_TABLE -#else -#define LAST_DEC_ROUND NO_TABLES -#endif - -/* The decryption key schedule can be speeded up with tables in the same - way that the round functions can. Include or exclude the following - defines to set this requirement. -*/ -#if 1 -#define KEY_SCHED FOUR_TABLES -#elif 0 -#define KEY_SCHED ONE_TABLE -#else -#define KEY_SCHED NO_TABLES -#endif - -/* 11. TABLE POINTER CACHING - - Normally tables are referenced directly, Enable this option if you wish to - cache pointers to the tables in the encrypt/decrypt code. Note that this - only works if you are using FOUR_TABLES for the ROUND you enable this for. 
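[Editor's aside — "table pointer caching" in section 11 above means loading the address of each 256-entry round table into a local once, rather than re-forming the global's address inside every round. A minimal sketch of the pattern under that reading; the table and function names are mine:]

#include <stdint.h>

extern const uint32_t round_tab[4][256];   /* assumed: a FOUR_TABLES set */

static uint32_t fold(const uint8_t b[4])
{
    /* Cache the four table pointers in locals; a register-rich target can
       then keep them in registers across the whole round sequence. */
    const uint32_t *t0 = round_tab[0], *t1 = round_tab[1];
    const uint32_t *t2 = round_tab[2], *t3 = round_tab[3];

    return t0[b[0]] ^ t1[b[1]] ^ t2[b[2]] ^ t3[b[3]];
}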
-*/ -#if 1 -#define ENC_ROUND_CACHE_TABLES -#endif -#if 1 -#define LAST_ENC_ROUND_CACHE_TABLES -#endif -#if 1 -#define DEC_ROUND_CACHE_TABLES -#endif -#if 1 -#define LAST_DEC_ROUND_CACHE_TABLES -#endif - - -/* END OF CONFIGURATION OPTIONS */ - -#define RC_LENGTH (5 * (AES_BLOCK_SIZE / 4 - 2)) - -/* Disable or report errors on some combinations of options */ - -#if ENC_ROUND == NO_TABLES && LAST_ENC_ROUND != NO_TABLES -#undef LAST_ENC_ROUND -#define LAST_ENC_ROUND NO_TABLES -#elif ENC_ROUND == ONE_TABLE && LAST_ENC_ROUND == FOUR_TABLES -#undef LAST_ENC_ROUND -#define LAST_ENC_ROUND ONE_TABLE -#endif - -#if ENC_ROUND == NO_TABLES && ENC_UNROLL != NONE -#undef ENC_UNROLL -#define ENC_UNROLL NONE -#endif - -#if DEC_ROUND == NO_TABLES && LAST_DEC_ROUND != NO_TABLES -#undef LAST_DEC_ROUND -#define LAST_DEC_ROUND NO_TABLES -#elif DEC_ROUND == ONE_TABLE && LAST_DEC_ROUND == FOUR_TABLES -#undef LAST_DEC_ROUND -#define LAST_DEC_ROUND ONE_TABLE -#endif - -#if DEC_ROUND == NO_TABLES && DEC_UNROLL != NONE -#undef DEC_UNROLL -#define DEC_UNROLL NONE -#endif - -/* upr(x,n): rotates bytes within words by n positions, moving bytes to - higher index positions with wrap around into low positions - ups(x,n): moves bytes by n positions to higher index positions in - words but without wrap around - bval(x,n): extracts a byte from a word - - NOTE: The definitions given here are intended only for use with - unsigned variables and with shift counts that are compile - time constants -*/ - -#if (ALGORITHM_BYTE_ORDER == BRG_LITTLE_ENDIAN) -#define upr(x,n) (((aes_32t)(x) << (8 * (n))) | ((aes_32t)(x) >> (32 - 8 * (n)))) -#define ups(x,n) ((aes_32t) (x) << (8 * (n))) -#define bval(x,n) ((aes_08t)((x) >> (8 * (n)))) -#define bytes2word(b0, b1, b2, b3) \ - (((aes_32t)(b3) << 24) | ((aes_32t)(b2) << 16) | ((aes_32t)(b1) << 8) | (b0)) -#endif - -#if (ALGORITHM_BYTE_ORDER == BRG_BIG_ENDIAN) -#define upr(x,n) (((aes_32t)(x) >> (8 * (n))) | ((aes_32t)(x) << (32 - 8 * (n)))) -#define ups(x,n) ((aes_32t) (x) >> (8 * (n)))) -#define bval(x,n) ((aes_08t)((x) >> (24 - 8 * (n)))) -#define bytes2word(b0, b1, b2, b3) \ - (((aes_32t)(b0) << 24) | ((aes_32t)(b1) << 16) | ((aes_32t)(b2) << 8) | (b3)) -#endif - -#if defined(SAFE_IO) - -#define word_in(x,c) bytes2word(((aes_08t*)(x)+4*c)[0], ((aes_08t*)(x)+4*c)[1], \ - ((aes_08t*)(x)+4*c)[2], ((aes_08t*)(x)+4*c)[3]) -#define word_out(x,c,v) { ((aes_08t*)(x)+4*c)[0] = bval(v,0); ((aes_08t*)(x)+4*c)[1] = bval(v,1); \ - ((aes_08t*)(x)+4*c)[2] = bval(v,2); ((aes_08t*)(x)+4*c)[3] = bval(v,3); } - -#elif (ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER) - -#define word_in(x,c) (*((const aes_32t*)(x)+(c))) -#define word_out(x,c,v) (*((aes_32t*)(x)+(c)) = (v)) - -#else - -#define word_in(x,c) aes_sw32(*((const aes_32t*)(x)+(c))) -#define word_out(x,c,v) (*((aes_32t*)(x)+(c)) = aes_sw32(v)) - -#endif - -/* the finite field modular polynomial and elements */ - -#define WPOLY 0x011b -#define BPOLY 0x1b - -/* multiply four bytes in GF(2^8) by 'x' {02} in parallel */ - -#define m1 0x80808080 -#define m2 0x7f7f7f7f -#define gf_mulx(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY)) - -/* The following defines provide alternative definitions of gf_mulx that might - give improved performance if a fast 32-bit multiply is not available. Note - that a temporary variable u needs to be defined where gf_mulx is used. 
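[Editor's aside — gf_mulx() above multiplies all four bytes of a packed word by {02} in GF(2^8) at once: each byte shifts left one bit, and BPOLY (0x1b) is folded in wherever a high bit fell off. A byte-at-a-time reference, useful for checking the packed form:]

#include <stdint.h>

#define BPOLY 0x1b

/* Multiply one field element by x, i.e. {02} in GF(2^8). */
static uint8_t xtime(uint8_t a)
{
    return (uint8_t)((a << 1) ^ ((a & 0x80) ? BPOLY : 0));
}

/* The packed form from aesopt.h: the 0x7f mask keeps each byte's shift
   out of its neighbour, the 0x80 mask finds the bits that overflowed. */
static uint32_t gf_mulx_ref(uint32_t x)
{
    return ((x & 0x7f7f7f7f) << 1) ^ (((x & 0x80808080) >> 7) * BPOLY);
}

/* For every byte b: (gf_mulx_ref(b) & 0xff) == xtime(b). */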
- -#define gf_mulx(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ ((u >> 3) | (u >> 6)) -#define m4 (0x01010101 * BPOLY) -#define gf_mulx(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) & m4) -*/ - -/* Work out which tables are needed for the different options */ - -#if defined( AES_ASM ) -#if defined( ENC_ROUND ) -#undef ENC_ROUND -#endif -#define ENC_ROUND FOUR_TABLES -#if defined( LAST_ENC_ROUND ) -#undef LAST_ENC_ROUND -#endif -#define LAST_ENC_ROUND FOUR_TABLES -#if defined( DEC_ROUND ) -#undef DEC_ROUND -#endif -#define DEC_ROUND FOUR_TABLES -#if defined( LAST_DEC_ROUND ) -#undef LAST_DEC_ROUND -#endif -#define LAST_DEC_ROUND FOUR_TABLES -#if defined( KEY_SCHED ) -#undef KEY_SCHED -#define KEY_SCHED FOUR_TABLES -#endif -#endif - -#if defined(ENCRYPTION) || defined(AES_ASM) -#if ENC_ROUND == ONE_TABLE -#define FT1_SET -#elif ENC_ROUND == FOUR_TABLES -#define FT4_SET -#else -#define SBX_SET -#endif -#if LAST_ENC_ROUND == ONE_TABLE -#define FL1_SET -#elif LAST_ENC_ROUND == FOUR_TABLES -#define FL4_SET -#elif !defined(SBX_SET) -#define SBX_SET -#endif -#endif - -#if defined(DECRYPTION) || defined(AES_ASM) -#if DEC_ROUND == ONE_TABLE -#define IT1_SET -#elif DEC_ROUND == FOUR_TABLES -#define IT4_SET -#else -#define ISB_SET -#endif -#if LAST_DEC_ROUND == ONE_TABLE -#define IL1_SET -#elif LAST_DEC_ROUND == FOUR_TABLES -#define IL4_SET -#elif !defined(ISB_SET) -#define ISB_SET -#endif -#endif - -#if defined(ENCRYPTION_KEY_SCHEDULE) || defined(DECRYPTION_KEY_SCHEDULE) -#if KEY_SCHED == ONE_TABLE -#define LS1_SET -#define IM1_SET -#elif KEY_SCHED == FOUR_TABLES -#define LS4_SET -#define IM4_SET -#elif !defined(SBX_SET) -#define SBX_SET -#endif -#endif - -/* generic definitions of Rijndael macros that use tables */ - -#define no_table(x,box,vf,rf,c) bytes2word( \ - box[bval(vf(x,0,c),rf(0,c))], \ - box[bval(vf(x,1,c),rf(1,c))], \ - box[bval(vf(x,2,c),rf(2,c))], \ - box[bval(vf(x,3,c),rf(3,c))]) - -#define one_table(x,op,tab,vf,rf,c) \ - ( tab[bval(vf(x,0,c),rf(0,c))] \ - ^ op(tab[bval(vf(x,1,c),rf(1,c))],1) \ - ^ op(tab[bval(vf(x,2,c),rf(2,c))],2) \ - ^ op(tab[bval(vf(x,3,c),rf(3,c))],3)) - -#define four_tables(x,tab,vf,rf,c) \ - ( tab[0][bval(vf(x,0,c),rf(0,c))] \ - ^ tab[1][bval(vf(x,1,c),rf(1,c))] \ - ^ tab[2][bval(vf(x,2,c),rf(2,c))] \ - ^ tab[3][bval(vf(x,3,c),rf(3,c))]) - -#define four_cached_tables(x,tab,vf,rf,c) \ -( tab##0[bval(vf(x,0,c),rf(0,c))] \ - ^ tab##1[bval(vf(x,1,c),rf(1,c))] \ - ^ tab##2[bval(vf(x,2,c),rf(2,c))] \ - ^ tab##3[bval(vf(x,3,c),rf(3,c))]) - -#define vf1(x,r,c) (x) -#define rf1(r,c) (r) -#define rf2(r,c) ((8+r-c)&3) - -/* perform forward and inverse column mix operation on four bytes in long word x in */ -/* parallel. NOTE: x must be a simple variable, NOT an expression in these macros. 
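[Editor's aside — the "x must be a simple variable" warning above exists because fwd_mcol and inv_mcol expand their argument several times; an argument with side effects would run those side effects more than once. A toy illustration of the hazard:]

#define DOUBLE_USE(x) ((x) ^ ((x) << 1))   /* expands its argument twice */

/* Wrong: the pointer advances twice per use.
 *     w = DOUBLE_USE(*p++);
 * Right: copy into a simple variable first.
 *     t = *p++;
 *     w = DOUBLE_USE(t);
 */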
*/ - -#if defined(FM4_SET) /* not currently used */ -#define fwd_mcol(x) four_tables(x,t_use(f,m),vf1,rf1,0) -#elif defined(FM1_SET) /* not currently used */ -#define fwd_mcol(x) one_table(x,upr,t_use(f,m),vf1,rf1,0) -#else -#define dec_fmvars aes_32t g2 -#define fwd_mcol(x) (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ upr((x), 2) ^ upr((x), 1)) -#endif - -#if defined(IM4_SET) -#define inv_mcol(x) four_tables(x,t_use(i,m),vf1,rf1,0) -#elif defined(IM1_SET) -#define inv_mcol(x) one_table(x,upr,t_use(i,m),vf1,rf1,0) -#else -#define dec_imvars aes_32t g2, g4, g9 -#define inv_mcol(x) (g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = (x) ^ gf_mulx(g4), g4 ^= g9, \ - (x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ upr(g4, 2) ^ upr(g9, 1)) -#endif - -#if defined(FL4_SET) -#define ls_box(x,c) four_tables(x,t_use(f,l),vf1,rf2,c) -#elif defined(LS4_SET) -#define ls_box(x,c) four_tables(x,t_use(l,s),vf1,rf2,c) -#elif defined(FL1_SET) -#define ls_box(x,c) one_table(x,upr,t_use(f,l),vf1,rf2,c) -#elif defined(LS1_SET) -#define ls_box(x,c) one_table(x,upr,t_use(l,s),vf1,rf2,c) -#else -#define ls_box(x,c) no_table(x,t_use(s,box),vf1,rf2,c) -#endif - -#endif diff --git a/bsd/crypto/aes/gen/aestab.c b/bsd/crypto/aes/gen/aestab.c deleted file mode 100644 index dfd2ee969..000000000 --- a/bsd/crypto/aes/gen/aestab.c +++ /dev/null @@ -1,384 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. 
- --------------------------------------------------------------------------- - Issue 28/01/2004 - -*/ - -#if defined(__cplusplus) -extern "C" -{ -#endif - -#define DO_TABLES - -#include "aesopt.h" - -#if defined(FIXED_TABLES) - -#define sb_data(w) {\ - w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\ - w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\ - w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\ - w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\ - w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\ - w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\ - w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\ - w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\ - w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\ - w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\ - w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\ - w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\ - w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\ - w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\ - w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\ - w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\ - w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\ - w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\ - w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\ - w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\ - w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\ - w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\ - w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\ - w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\ - w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\ - w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\ - w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\ - w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\ - w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\ - w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\ - w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\ - w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) } - -#define isb_data(w) {\ - w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), w(0x38),\ - w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), w(0xd7), w(0xfb),\ - w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), w(0x2f), w(0xff), w(0x87),\ - w(0x34), w(0x8e), w(0x43), w(0x44), w(0xc4), w(0xde), w(0xe9), w(0xcb),\ - w(0x54), w(0x7b), w(0x94), w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d),\ - w(0xee), w(0x4c), w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e),\ - w(0x08), w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2),\ - w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), w(0x25),\ - w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), w(0x98), w(0x16),\ - w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), w(0x65), w(0xb6), w(0x92),\ - w(0x6c), w(0x70), w(0x48), w(0x50), w(0xfd), w(0xed), w(0xb9), w(0xda),\ - w(0x5e), w(0x15), w(0x46), w(0x57), w(0xa7), 
w(0x8d), w(0x9d), w(0x84),\ - w(0x90), w(0xd8), w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a),\ - w(0xf7), w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06),\ - w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), w(0x02),\ - w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), w(0x8a), w(0x6b),\ - w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), w(0x67), w(0xdc), w(0xea),\ - w(0x97), w(0xf2), w(0xcf), w(0xce), w(0xf0), w(0xb4), w(0xe6), w(0x73),\ - w(0x96), w(0xac), w(0x74), w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85),\ - w(0xe2), w(0xf9), w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e),\ - w(0x47), w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89),\ - w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), w(0x1b),\ - w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), w(0x79), w(0x20),\ - w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), w(0xcd), w(0x5a), w(0xf4),\ - w(0x1f), w(0xdd), w(0xa8), w(0x33), w(0x88), w(0x07), w(0xc7), w(0x31),\ - w(0xb1), w(0x12), w(0x10), w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f),\ - w(0x60), w(0x51), w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d),\ - w(0x2d), w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef),\ - w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), w(0xb0),\ - w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), w(0x99), w(0x61),\ - w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), w(0x77), w(0xd6), w(0x26),\ - w(0xe1), w(0x69), w(0x14), w(0x63), w(0x55), w(0x21), w(0x0c), w(0x7d) } - -#define mm_data(w) {\ - w(0x00), w(0x01), w(0x02), w(0x03), w(0x04), w(0x05), w(0x06), w(0x07),\ - w(0x08), w(0x09), w(0x0a), w(0x0b), w(0x0c), w(0x0d), w(0x0e), w(0x0f),\ - w(0x10), w(0x11), w(0x12), w(0x13), w(0x14), w(0x15), w(0x16), w(0x17),\ - w(0x18), w(0x19), w(0x1a), w(0x1b), w(0x1c), w(0x1d), w(0x1e), w(0x1f),\ - w(0x20), w(0x21), w(0x22), w(0x23), w(0x24), w(0x25), w(0x26), w(0x27),\ - w(0x28), w(0x29), w(0x2a), w(0x2b), w(0x2c), w(0x2d), w(0x2e), w(0x2f),\ - w(0x30), w(0x31), w(0x32), w(0x33), w(0x34), w(0x35), w(0x36), w(0x37),\ - w(0x38), w(0x39), w(0x3a), w(0x3b), w(0x3c), w(0x3d), w(0x3e), w(0x3f),\ - w(0x40), w(0x41), w(0x42), w(0x43), w(0x44), w(0x45), w(0x46), w(0x47),\ - w(0x48), w(0x49), w(0x4a), w(0x4b), w(0x4c), w(0x4d), w(0x4e), w(0x4f),\ - w(0x50), w(0x51), w(0x52), w(0x53), w(0x54), w(0x55), w(0x56), w(0x57),\ - w(0x58), w(0x59), w(0x5a), w(0x5b), w(0x5c), w(0x5d), w(0x5e), w(0x5f),\ - w(0x60), w(0x61), w(0x62), w(0x63), w(0x64), w(0x65), w(0x66), w(0x67),\ - w(0x68), w(0x69), w(0x6a), w(0x6b), w(0x6c), w(0x6d), w(0x6e), w(0x6f),\ - w(0x70), w(0x71), w(0x72), w(0x73), w(0x74), w(0x75), w(0x76), w(0x77),\ - w(0x78), w(0x79), w(0x7a), w(0x7b), w(0x7c), w(0x7d), w(0x7e), w(0x7f),\ - w(0x80), w(0x81), w(0x82), w(0x83), w(0x84), w(0x85), w(0x86), w(0x87),\ - w(0x88), w(0x89), w(0x8a), w(0x8b), w(0x8c), w(0x8d), w(0x8e), w(0x8f),\ - w(0x90), w(0x91), w(0x92), w(0x93), w(0x94), w(0x95), w(0x96), w(0x97),\ - w(0x98), w(0x99), w(0x9a), w(0x9b), w(0x9c), w(0x9d), w(0x9e), w(0x9f),\ - w(0xa0), w(0xa1), w(0xa2), w(0xa3), w(0xa4), w(0xa5), w(0xa6), w(0xa7),\ - w(0xa8), w(0xa9), w(0xaa), w(0xab), w(0xac), w(0xad), w(0xae), w(0xaf),\ - w(0xb0), w(0xb1), w(0xb2), w(0xb3), w(0xb4), w(0xb5), w(0xb6), w(0xb7),\ - w(0xb8), w(0xb9), w(0xba), w(0xbb), w(0xbc), w(0xbd), w(0xbe), w(0xbf),\ - w(0xc0), w(0xc1), w(0xc2), w(0xc3), w(0xc4), w(0xc5), w(0xc6), w(0xc7),\ - w(0xc8), w(0xc9), w(0xca), w(0xcb), w(0xcc), w(0xcd), w(0xce), w(0xcf),\ - w(0xd0), w(0xd1), w(0xd2), w(0xd3), w(0xd4), 
w(0xd5), w(0xd6), w(0xd7),\ - w(0xd8), w(0xd9), w(0xda), w(0xdb), w(0xdc), w(0xdd), w(0xde), w(0xdf),\ - w(0xe0), w(0xe1), w(0xe2), w(0xe3), w(0xe4), w(0xe5), w(0xe6), w(0xe7),\ - w(0xe8), w(0xe9), w(0xea), w(0xeb), w(0xec), w(0xed), w(0xee), w(0xef),\ - w(0xf0), w(0xf1), w(0xf2), w(0xf3), w(0xf4), w(0xf5), w(0xf6), w(0xf7),\ - w(0xf8), w(0xf9), w(0xfa), w(0xfb), w(0xfc), w(0xfd), w(0xfe), w(0xff) } - -#define rc_data(w) {\ - w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\ - w(0x1b), w(0x36) } - -#define h0(x) (x) - -#define w0(p) bytes2word(p, 0, 0, 0) -#define w1(p) bytes2word(0, p, 0, 0) -#define w2(p) bytes2word(0, 0, p, 0) -#define w3(p) bytes2word(0, 0, 0, p) - -#define u0(p) bytes2word(f2(p), p, p, f3(p)) -#define u1(p) bytes2word(f3(p), f2(p), p, p) -#define u2(p) bytes2word(p, f3(p), f2(p), p) -#define u3(p) bytes2word(p, p, f3(p), f2(p)) - -#define v0(p) bytes2word(fe(p), f9(p), fd(p), fb(p)) -#define v1(p) bytes2word(fb(p), fe(p), f9(p), fd(p)) -#define v2(p) bytes2word(fd(p), fb(p), fe(p), f9(p)) -#define v3(p) bytes2word(f9(p), fd(p), fb(p), fe(p)) - -#endif - -#if defined(FIXED_TABLES) || !defined(FF_TABLES) - -#define f2(x) ((x<<1) ^ (((x>>7) & 1) * WPOLY)) -#define f4(x) ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY)) -#define f8(x) ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) \ - ^ (((x>>5) & 4) * WPOLY)) -#define f3(x) (f2(x) ^ x) -#define f9(x) (f8(x) ^ x) -#define fb(x) (f8(x) ^ f2(x) ^ x) -#define fd(x) (f8(x) ^ f4(x) ^ x) -#define fe(x) (f8(x) ^ f4(x) ^ f2(x)) - -#else - -#define f2(x) ((x) ? pow[log[x] + 0x19] : 0) -#define f3(x) ((x) ? pow[log[x] + 0x01] : 0) -#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0) -#define fb(x) ((x) ? pow[log[x] + 0x68] : 0) -#define fd(x) ((x) ? pow[log[x] + 0xee] : 0) -#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0) -#define fi(x) ((x) ? pow[ 255 - log[x]] : 0) - -#endif - -#include "aestab.h" - -#if defined(FIXED_TABLES) - -/* implemented in case of wrong call for fixed tables */ - -void gen_tabs(void) -{ -} - -#else /* dynamic table generation */ - -#if !defined(FF_TABLES) - -/* Generate the tables for the dynamic table option - - It will generally be sensible to use tables to compute finite - field multiplies and inverses but where memory is scarse this - code might sometimes be better. But it only has effect during - initialisation so its pretty unimportant in overall terms. -*/ - -/* return 2 ^ (n - 1) where n is the bit number of the highest bit - set in x with x in the range 1 < x < 0x00000200. 
This form is - used so that locals within fi can be bytes rather than words -*/ - -static aes_08t hibit(const aes_32t x) -{ aes_08t r = (aes_08t)((x >> 1) | (x >> 2)); - - r |= (r >> 2); - r |= (r >> 4); - return (r + 1) >> 1; -} - -/* return the inverse of the finite field element x */ - -static aes_08t fi(const aes_08t x) -{ aes_08t p1 = x, p2 = BPOLY, n1 = hibit(x), n2 = 0x80, v1 = 1, v2 = 0; - - if(x < 2) return x; - - for(;;) - { - if(!n1) return v1; - - while(n2 >= n1) - { - n2 /= n1; p2 ^= p1 * n2; v2 ^= v1 * n2; n2 = hibit(p2); - } - - if(!n2) return v2; - - while(n1 >= n2) - { - n1 /= n2; p1 ^= p2 * n1; v1 ^= v2 * n1; n1 = hibit(p1); - } - } -} - -#endif - -/* The forward and inverse affine transformations used in the S-box */ - -#define fwd_affine(x) \ - (w = (aes_32t)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(aes_08t)(w^(w>>8))) - -#define inv_affine(x) \ - (w = (aes_32t)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(aes_08t)(w^(w>>8))) - -static int init = 0; - -void gen_tabs(void) -{ aes_32t i, w; - -#if defined(FF_TABLES) - - aes_08t pow[512], log[256]; - - if(init) return; - /* log and power tables for GF(2^8) finite field with - WPOLY as modular polynomial - the simplest primitive - root is 0x03, used here to generate the tables - */ - - i = 0; w = 1; - do - { - pow[i] = (aes_08t)w; - pow[i + 255] = (aes_08t)w; - log[w] = (aes_08t)i++; - w ^= (w << 1) ^ (w & 0x80 ? WPOLY : 0); - } - while (w != 1); - -#else - if(init) return; -#endif - - for(i = 0, w = 1; i < RC_LENGTH; ++i) - { - t_set(r,c)[i] = bytes2word(w, 0, 0, 0); - w = f2(w); - } - - for(i = 0; i < 256; ++i) - { aes_08t b; - - b = fwd_affine(fi((aes_08t)i)); - w = bytes2word(f2(b), b, b, f3(b)); - -#if defined( SBX_SET ) - t_set(s,box)[i] = b; -#endif - -#if defined( FT1_SET ) /* tables for a normal encryption round */ - t_set(f,n)[i] = w; -#endif -#if defined( FT4_SET ) - t_set(f,n)[0][i] = w; - t_set(f,n)[1][i] = upr(w,1); - t_set(f,n)[2][i] = upr(w,2); - t_set(f,n)[3][i] = upr(w,3); -#endif - w = bytes2word(b, 0, 0, 0); - -#if defined( FL1_SET ) /* tables for last encryption round (may also */ - t_set(f,l)[i] = w; /* be used in the key schedule) */ -#endif -#if defined( FL4_SET ) - t_set(f,l)[0][i] = w; - t_set(f,l)[1][i] = upr(w,1); - t_set(f,l)[2][i] = upr(w,2); - t_set(f,l)[3][i] = upr(w,3); -#endif - -#if defined( LS1_SET ) /* table for key schedule if t_set(f,l) above is */ - t_set(l,s)[i] = w; /* not of the required form */ -#endif -#if defined( LS4_SET ) - t_set(l,s)[0][i] = w; - t_set(l,s)[1][i] = upr(w,1); - t_set(l,s)[2][i] = upr(w,2); - t_set(l,s)[3][i] = upr(w,3); -#endif - - b = fi(inv_affine((aes_08t)i)); - w = bytes2word(fe(b), f9(b), fd(b), fb(b)); - -#if defined( IM1_SET ) /* tables for the inverse mix column operation */ - t_set(i,m)[b] = w; -#endif -#if defined( IM4_SET ) - t_set(i,m)[0][b] = w; - t_set(i,m)[1][b] = upr(w,1); - t_set(i,m)[2][b] = upr(w,2); - t_set(i,m)[3][b] = upr(w,3); -#endif - -#if defined( ISB_SET ) - t_set(i,box)[i] = b; -#endif -#if defined( IT1_SET ) /* tables for a normal decryption round */ - t_set(i,n)[i] = w; -#endif -#if defined( IT4_SET ) - t_set(i,n)[0][i] = w; - t_set(i,n)[1][i] = upr(w,1); - t_set(i,n)[2][i] = upr(w,2); - t_set(i,n)[3][i] = upr(w,3); -#endif - w = bytes2word(b, 0, 0, 0); -#if defined( IL1_SET ) /* tables for last decryption round */ - t_set(i,l)[i] = w; -#endif -#if defined( IL4_SET ) - t_set(i,l)[0][i] = w; - t_set(i,l)[1][i] = upr(w,1); - t_set(i,l)[2][i] = upr(w,2); - t_set(i,l)[3][i] = upr(w,3); -#endif - } - init = 1; -} - -#endif - -#if 
defined(__cplusplus) -} -#endif - diff --git a/bsd/crypto/aes/gen/aestab.h b/bsd/crypto/aes/gen/aestab.h deleted file mode 100644 index 004ef9e74..000000000 --- a/bsd/crypto/aes/gen/aestab.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 28/01/2004 - - This file contains the code for declaring the tables needed to implement - AES. The file aesopt.h is assumed to be included before this header file. - If there are no global variables, the definitions here can be used to put - the AES tables in a structure so that a pointer can then be added to the - AES context to pass them to the AES routines that need them. If this - facility is used, the calling program has to ensure that this pointer is - managed appropriately. In particular, the value of the t_dec(in,it) item - in the table structure must be set to zero in order to ensure that the - tables are initialised. In practice the three code sequences in aeskey.c - that control the calls to gen_tabs() and the gen_tabs() routine itself will - have to be changed for a specific implementation. If global variables are - available it will generally be preferable to use them with the precomputed - FIXED_TABLES option that uses static global tables. 
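[Editor's aside — gen_tabs() above seeds log/antilog tables for GF(2^8) from the primitive element 0x03 and then derives every other table from them. A standalone sketch of just that construction, using the same WPOLY and update step; the table names are mine:]

#include <stdint.h>

#define WPOLY 0x011b

static uint8_t pow_tab[512];   /* doubled so lookups need no reduction mod 255 */
static uint8_t log_tab[256];

static void gen_gf_tables(void)
{
    uint32_t i = 0, w = 1;

    do {
        pow_tab[i]       = (uint8_t)w;
        pow_tab[i + 255] = (uint8_t)w;
        log_tab[w] = (uint8_t)i++;
        /* w *= 0x03 in GF(2^8): w ^ (w * x), reduced by WPOLY */
        w ^= (w << 1) ^ ((w & 0x80) ? WPOLY : 0);
    } while (w != 1);
}

/* With the tables built, a product of non-zero elements is
   pow_tab[log_tab[a] + log_tab[b]]. */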
- - The following defines can be used to control the way the tables - are defined, initialised and used in embedded environments that - require special features for these purposes - - the 't_dec' construction is used to declare fixed table arrays - the 't_set' construction is used to set fixed table values - the 't_use' construction is used to access fixed table values - - 256 byte tables: - - t_xxx(s,box) => forward S box - t_xxx(i,box) => inverse S box - - 256 32-bit word OR 4 x 256 32-bit word tables: - - t_xxx(f,n) => forward normal round - t_xxx(f,l) => forward last round - t_xxx(i,n) => inverse normal round - t_xxx(i,l) => inverse last round - t_xxx(l,s) => key schedule table - t_xxx(i,m) => key schedule table - - Other variables and tables: - - t_xxx(r,c) => the rcon table -*/ - -#if !defined( _AESTAB_H ) -#define _AESTAB_H - -#define t_dec(m,n) t_##m##n -#define t_set(m,n) t_##m##n -#define t_use(m,n) t_##m##n - -#if defined(FIXED_TABLES) -#define Const const -#else -#define Const -#endif - -#if defined(DO_TABLES) -#define Extern -#else -#define Extern extern -#endif - -#if defined(_MSC_VER) && defined(TABLE_ALIGN) -#define Align __declspec(align(TABLE_ALIGN)) -#else -#define Align -#endif - -#if defined(__cplusplus) -extern "C" -{ -#endif - -#if defined(DO_TABLES) && defined(FIXED_TABLES) -#define d_1(t,n,b,e) Align Const t n[256] = b(e) -#define d_4(t,n,b,e,f,g,h) Align Const t n[4][256] = { b(e), b(f), b(g), b(h) } -Extern Align Const aes_32t t_dec(r,c)[RC_LENGTH] = rc_data(w0); -#else -#define d_1(t,n,b,e) Extern Align Const t n[256] -#define d_4(t,n,b,e,f,g,h) Extern Align Const t n[4][256] -Extern Align Const aes_32t t_dec(r,c)[RC_LENGTH]; -#endif - -#if defined( SBX_SET ) - d_1(aes_08t, t_dec(s,box), sb_data, h0); -#endif -#if defined( ISB_SET ) - d_1(aes_08t, t_dec(i,box), isb_data, h0); -#endif - -#if defined( FT1_SET ) - d_1(aes_32t, t_dec(f,n), sb_data, u0); -#endif -#if defined( FT4_SET ) - d_4(aes_32t, t_dec(f,n), sb_data, u0, u1, u2, u3); -#endif - -#if defined( FL1_SET ) - d_1(aes_32t, t_dec(f,l), sb_data, w0); -#endif -#if defined( FL4_SET ) - d_4(aes_32t, t_dec(f,l), sb_data, w0, w1, w2, w3); -#endif - -#if defined( IT1_SET ) - d_1(aes_32t, t_dec(i,n), isb_data, v0); -#endif -#if defined( IT4_SET ) - d_4(aes_32t, t_dec(i,n), isb_data, v0, v1, v2, v3); -#endif - -#if defined( IL1_SET ) - d_1(aes_32t, t_dec(i,l), isb_data, w0); -#endif -#if defined( IL4_SET ) - d_4(aes_32t, t_dec(i,l), isb_data, w0, w1, w2, w3); -#endif - -#if defined( LS1_SET ) -#if defined( FL1_SET ) -#undef LS1_SET -#else - d_1(aes_32t, t_dec(l,s), sb_data, w0); -#endif -#endif - -#if defined( LS4_SET ) -#if defined( FL4_SET ) -#undef LS4_SET -#else - d_4(aes_32t, t_dec(l,s), sb_data, w0, w1, w2, w3); -#endif -#endif - -#if defined( IM1_SET ) - d_1(aes_32t, t_dec(i,m), mm_data, v0); -#endif -#if defined( IM4_SET ) - d_4(aes_32t, t_dec(i,m), mm_data, v0, v1, v2, v3); -#endif - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/bsd/crypto/aes/i386/AES.s b/bsd/crypto/aes/i386/AES.s deleted file mode 100644 index 9bf440a68..000000000 --- a/bsd/crypto/aes/i386/AES.s +++ /dev/null @@ -1,143 +0,0 @@ -/* AES.s -- Core AES routines for Intel processors. - - Written by Eric Postpischil, January 30, 2008. -*/ - - -/* We build these AES routines as a single module because the routines refer - to labels in Data.s and it is easier and faster to refer to them as local - labels. In my implementations of AES for CommonCrypto, both i386 and - x86_64 use position-independent code. 
For this in-kernel implementation, - i386 has been converted to absolute addressing, but x86_64 still uses PIC. - - A local label can be referred to with position-independent assembler - expressions such as "label-base(register)", where is a local label - whose address has been loaded into . (On i386, this is typically - done with the idiom of a call to the next instruction and a pop of that - return address into a register.) Without local labels, the references must - be done using spaces for addresses of "lazy symbols" that are filled in by - the dynamic loader and loaded by the code that wants the address. - - So the various routines in other files are assembled here via #include - directives. -*/ -#include "Data.s" - - -#define TableSize (256*4) - /* Each of the arrays defined in Data.s except for the round constants - in _AESRcon is composed of four tables of 256 entries of four bytes - each. TableSize is the number of bytes in one of those four tables. - */ - - -// Include constants describing the AES context structures. -#include "Context.h" - - -/* Define a macro to select a value based on architecture. This reduces - some of the architecture conditionalization later in the source. -*/ -#if defined __i386__ - #define Arch(i386, x86_64) i386 -#elif defined __x86_64__ - #define Arch(i386, x86_64) x86_64 -#endif - - -// Define an instruction for moving pointers. -#define movp Arch(movd, movd) - // Latter argument should be "movq", but the assembler uses "movd". - - -/* Rename the general registers. This makes it easier to keep track of them - and provides names for the "whole register" that are uniform between i386 - and x86_64. -*/ -#if defined __i386__ - #define r0 %eax // Available for any use. - #define r1 %ecx // Available for any use, some special purposes (loop). - #define r2 %edx // Available for any use. - #define r3 %ebx // Must be preserved by called routine. - #define r4 %esp // Stack pointer. - #define r5 %ebp // Frame pointer, must preserve, no bare indirect. - #define r6 %esi // Must be preserved by called routine. - #define r7 %edi // Must be preserved by called routine. -#elif defined __x86_64__ - #define r0 %rax // Available for any use. - #define r1 %rcx // Available for any use. - #define r2 %rdx // Available for any use. - #define r3 %rbx // Must be preserved by called routine. - #define r4 %rsp // Stack pointer. - #define r5 %rbp // Frame pointer. Must be preserved by called routine. - #define r6 %rsi // Available for any use. - #define r7 %rdi // Available for any use. - #define r8 %r8 // Available for any use. - #define r9 %r9 // Available for any use. - #define r10 %r10 // Available for any use. - #define r11 %r11 // Available for any use. - #define r12 %r12 // Must be preserved by called routine. - #define r13 %r13 // Must be preserved by called routine. - #define r14 %r14 // Must be preserved by called routine. - #define r15 %r15 // Must be preserved by called routine. -#else - #error "Unknown architecture." -#endif - -// Define names for parts of registers. - -#define r0d %eax // Low 32 bits of r0. -#define r1d %ecx // Low 32 bits of r1. -#define r2d %edx // Low 32 bits of r2. -#define r3d %ebx // Low 32 bits of r3. -#define r5d %ebp // Low 32 bits of r5. -#define r6d %esi // Low 32 bits of r6. -#define r7d %edi // Low 32 bits of r7. -#define r8d %r8d // Low 32 bits of r8. -#define r9d %r9d // Low 32 bits of r9. -#define r11d %r11d // Low 32 bits of r11. - -#define r0l %al // Low byte of r0. -#define r1l %cl // Low byte of r1. 
-#define r2l %dl // Low byte of r2. -#define r3l %bl // Low byte of r3. -#define r5l %bpl // Low byte of r5. - -#define r0h %ah // Second lowest byte of r0. -#define r1h %ch // Second lowest byte of r1. -#define r2h %dh // Second lowest byte of r2. -#define r3h %bh // Second lowest byte of r3. - - - .text - - -// Define encryption routine, _AESEncryptWithExpandedKey -#define Select 0 -#include "EncryptDecrypt.s" -#undef Select - - -// Define decryption routine, _AESDecryptWithExpandedKey -#define Select 1 -#include "EncryptDecrypt.s" -#undef Select - -// Define encryption routine, _AESEncryptWithExpandedKey -#define Select 2 -#include "EncryptDecrypt.s" -#undef Select - - -// Define decryption routine, _AESDecryptWithExpandedKey -#define Select 3 -#include "EncryptDecrypt.s" -#undef Select - - -// Define key expansion routine for encryption, _AESExpandKeyForEncryption. -#include "ExpandKeyForEncryption.s" - - -// Define key expansion for decryption routine, _AESExpandKeyForDecryption. -#include "ExpandKeyForDecryption.s" diff --git a/bsd/crypto/aes/i386/Context.h b/bsd/crypto/aes/i386/Context.h deleted file mode 100644 index f53cb9514..000000000 --- a/bsd/crypto/aes/i386/Context.h +++ /dev/null @@ -1,9 +0,0 @@ -// Define byte offset of key within context structure. -#define ContextKey 0 - -/* Define byte offset of key length within context structure. The number - stored there is the number of bytes from the start of the first round key - to the start of the last round key. That is 16 less than the number of - bytes in the entire key. -*/ -#define ContextKeyLength 240 diff --git a/bsd/crypto/aes/i386/Data.mk b/bsd/crypto/aes/i386/Data.mk deleted file mode 100644 index 4b55d630f..000000000 --- a/bsd/crypto/aes/i386/Data.mk +++ /dev/null @@ -1,30 +0,0 @@ -default: - @echo "This makefile builds Data.s, which contains constant data for the" - @echo "AES implementation. This file does not normally need to be rebuilt," - @echo "so it is checked into the source code repository. It should be" - @echo "changed only when the implementation changes and needs data in a" - @echo "different format. (This file can also build a C version, Data.c," - @echo "but that is not currently in use.)" - @echo "" - @echo "To rebuild the file(s), execute \"make -f Data.mk all\"." - -.PHONY: all clean -Targets = Data.s -all: $(Targets) - -CFLAGS += -O3 -std=c99 -Wmost -Werror - -.INTERMEDIATE: MakeData -MakeData: MakeData.c - -# Do not leave bad output files if the build fails. -.DELETE_ON_ERROR: $(Targets) - -Data.c: MakeData - ./$< >$@ C - -Data.s: MakeData - ./$< >$@ Intel - -clean: - -rm $(Targets) diff --git a/bsd/crypto/aes/i386/Data.s b/bsd/crypto/aes/i386/Data.s deleted file mode 100644 index d330462d0..000000000 --- a/bsd/crypto/aes/i386/Data.s +++ /dev/null @@ -1,5196 +0,0 @@ -// This file was generated by MakeData.c. - - - .const - - -// Round constants. - .globl _AESRcon - .private_extern _AESRcon -_AESRcon: - .byte 0 // Not used, included for indexing simplicity. - .byte 0x01 - .byte 0x02 - .byte 0x04 - .byte 0x08 - .byte 0x10 - .byte 0x20 - .byte 0x40 - .byte 0x80 - .byte 0x1b - .byte 0x36 - - -// Tables for InvMixColumn. - .globl _AESInvMixColumnTable - .private_extern _AESInvMixColumnTable - .align 2 -_AESInvMixColumnTable: - // Table 0. 
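The generated words that follow are not arbitrary: for each byte index x, Table 0 packs the four InvMixColumn products 0e·x, 09·x, 0d·x, and 0b·x (GF(2^8), AES polynomial 0x11b) into one word, low byte first, and Tables 1 through 3 hold the same words rotated left by 8, 16, and 24 bits. A short C sketch that reproduces the first few entries below (illustrative only; gf_mul is ordinary shift-and-xor multiplication in the AES field):

    /* Reconstruct the leading _AESInvMixColumnTable Table 0 entries. */
    #include <stdio.h>
    #include <stdint.h>

    static uint8_t gf_mul(uint8_t a, uint8_t b)
    {
        uint8_t p = 0;
        while (b) {
            if (b & 1)
                p ^= a;
            a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
            b >>= 1;
        }
        return p;
    }

    int main(void)
    {
        for (unsigned x = 0; x < 4; x++) {
            /* 0e·x in the low byte; 09·x, 0d·x, 0b·x in the higher bytes. */
            uint32_t w = (uint32_t)gf_mul(x, 0x0e)
                       | (uint32_t)gf_mul(x, 0x09) << 8
                       | (uint32_t)gf_mul(x, 0x0d) << 16
                       | (uint32_t)gf_mul(x, 0x0b) << 24;
            printf("0x%08x\n", w); /* 0x00000000, 0x0b0d090e, 0x161a121c, 0x1d171b12 */
        }
        return 0;
    }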
- .long 0x00000000 - .long 0x0b0d090e - .long 0x161a121c - .long 0x1d171b12 - .long 0x2c342438 - .long 0x27392d36 - .long 0x3a2e3624 - .long 0x31233f2a - .long 0x58684870 - .long 0x5365417e - .long 0x4e725a6c - .long 0x457f5362 - .long 0x745c6c48 - .long 0x7f516546 - .long 0x62467e54 - .long 0x694b775a - .long 0xb0d090e0 - .long 0xbbdd99ee - .long 0xa6ca82fc - .long 0xadc78bf2 - .long 0x9ce4b4d8 - .long 0x97e9bdd6 - .long 0x8afea6c4 - .long 0x81f3afca - .long 0xe8b8d890 - .long 0xe3b5d19e - .long 0xfea2ca8c - .long 0xf5afc382 - .long 0xc48cfca8 - .long 0xcf81f5a6 - .long 0xd296eeb4 - .long 0xd99be7ba - .long 0x7bbb3bdb - .long 0x70b632d5 - .long 0x6da129c7 - .long 0x66ac20c9 - .long 0x578f1fe3 - .long 0x5c8216ed - .long 0x41950dff - .long 0x4a9804f1 - .long 0x23d373ab - .long 0x28de7aa5 - .long 0x35c961b7 - .long 0x3ec468b9 - .long 0x0fe75793 - .long 0x04ea5e9d - .long 0x19fd458f - .long 0x12f04c81 - .long 0xcb6bab3b - .long 0xc066a235 - .long 0xdd71b927 - .long 0xd67cb029 - .long 0xe75f8f03 - .long 0xec52860d - .long 0xf1459d1f - .long 0xfa489411 - .long 0x9303e34b - .long 0x980eea45 - .long 0x8519f157 - .long 0x8e14f859 - .long 0xbf37c773 - .long 0xb43ace7d - .long 0xa92dd56f - .long 0xa220dc61 - .long 0xf66d76ad - .long 0xfd607fa3 - .long 0xe07764b1 - .long 0xeb7a6dbf - .long 0xda595295 - .long 0xd1545b9b - .long 0xcc434089 - .long 0xc74e4987 - .long 0xae053edd - .long 0xa50837d3 - .long 0xb81f2cc1 - .long 0xb31225cf - .long 0x82311ae5 - .long 0x893c13eb - .long 0x942b08f9 - .long 0x9f2601f7 - .long 0x46bde64d - .long 0x4db0ef43 - .long 0x50a7f451 - .long 0x5baafd5f - .long 0x6a89c275 - .long 0x6184cb7b - .long 0x7c93d069 - .long 0x779ed967 - .long 0x1ed5ae3d - .long 0x15d8a733 - .long 0x08cfbc21 - .long 0x03c2b52f - .long 0x32e18a05 - .long 0x39ec830b - .long 0x24fb9819 - .long 0x2ff69117 - .long 0x8dd64d76 - .long 0x86db4478 - .long 0x9bcc5f6a - .long 0x90c15664 - .long 0xa1e2694e - .long 0xaaef6040 - .long 0xb7f87b52 - .long 0xbcf5725c - .long 0xd5be0506 - .long 0xdeb30c08 - .long 0xc3a4171a - .long 0xc8a91e14 - .long 0xf98a213e - .long 0xf2872830 - .long 0xef903322 - .long 0xe49d3a2c - .long 0x3d06dd96 - .long 0x360bd498 - .long 0x2b1ccf8a - .long 0x2011c684 - .long 0x1132f9ae - .long 0x1a3ff0a0 - .long 0x0728ebb2 - .long 0x0c25e2bc - .long 0x656e95e6 - .long 0x6e639ce8 - .long 0x737487fa - .long 0x78798ef4 - .long 0x495ab1de - .long 0x4257b8d0 - .long 0x5f40a3c2 - .long 0x544daacc - .long 0xf7daec41 - .long 0xfcd7e54f - .long 0xe1c0fe5d - .long 0xeacdf753 - .long 0xdbeec879 - .long 0xd0e3c177 - .long 0xcdf4da65 - .long 0xc6f9d36b - .long 0xafb2a431 - .long 0xa4bfad3f - .long 0xb9a8b62d - .long 0xb2a5bf23 - .long 0x83868009 - .long 0x888b8907 - .long 0x959c9215 - .long 0x9e919b1b - .long 0x470a7ca1 - .long 0x4c0775af - .long 0x51106ebd - .long 0x5a1d67b3 - .long 0x6b3e5899 - .long 0x60335197 - .long 0x7d244a85 - .long 0x7629438b - .long 0x1f6234d1 - .long 0x146f3ddf - .long 0x097826cd - .long 0x02752fc3 - .long 0x335610e9 - .long 0x385b19e7 - .long 0x254c02f5 - .long 0x2e410bfb - .long 0x8c61d79a - .long 0x876cde94 - .long 0x9a7bc586 - .long 0x9176cc88 - .long 0xa055f3a2 - .long 0xab58faac - .long 0xb64fe1be - .long 0xbd42e8b0 - .long 0xd4099fea - .long 0xdf0496e4 - .long 0xc2138df6 - .long 0xc91e84f8 - .long 0xf83dbbd2 - .long 0xf330b2dc - .long 0xee27a9ce - .long 0xe52aa0c0 - .long 0x3cb1477a - .long 0x37bc4e74 - .long 0x2aab5566 - .long 0x21a65c68 - .long 0x10856342 - .long 0x1b886a4c - .long 0x069f715e - .long 0x0d927850 - .long 0x64d90f0a - .long 0x6fd40604 - .long 0x72c31d16 - 
.long 0x79ce1418 - .long 0x48ed2b32 - .long 0x43e0223c - .long 0x5ef7392e - .long 0x55fa3020 - .long 0x01b79aec - .long 0x0aba93e2 - .long 0x17ad88f0 - .long 0x1ca081fe - .long 0x2d83bed4 - .long 0x268eb7da - .long 0x3b99acc8 - .long 0x3094a5c6 - .long 0x59dfd29c - .long 0x52d2db92 - .long 0x4fc5c080 - .long 0x44c8c98e - .long 0x75ebf6a4 - .long 0x7ee6ffaa - .long 0x63f1e4b8 - .long 0x68fcedb6 - .long 0xb1670a0c - .long 0xba6a0302 - .long 0xa77d1810 - .long 0xac70111e - .long 0x9d532e34 - .long 0x965e273a - .long 0x8b493c28 - .long 0x80443526 - .long 0xe90f427c - .long 0xe2024b72 - .long 0xff155060 - .long 0xf418596e - .long 0xc53b6644 - .long 0xce366f4a - .long 0xd3217458 - .long 0xd82c7d56 - .long 0x7a0ca137 - .long 0x7101a839 - .long 0x6c16b32b - .long 0x671bba25 - .long 0x5638850f - .long 0x5d358c01 - .long 0x40229713 - .long 0x4b2f9e1d - .long 0x2264e947 - .long 0x2969e049 - .long 0x347efb5b - .long 0x3f73f255 - .long 0x0e50cd7f - .long 0x055dc471 - .long 0x184adf63 - .long 0x1347d66d - .long 0xcadc31d7 - .long 0xc1d138d9 - .long 0xdcc623cb - .long 0xd7cb2ac5 - .long 0xe6e815ef - .long 0xede51ce1 - .long 0xf0f207f3 - .long 0xfbff0efd - .long 0x92b479a7 - .long 0x99b970a9 - .long 0x84ae6bbb - .long 0x8fa362b5 - .long 0xbe805d9f - .long 0xb58d5491 - .long 0xa89a4f83 - .long 0xa397468d - // Table 1. - .long 0x00000000 - .long 0x0d090e0b - .long 0x1a121c16 - .long 0x171b121d - .long 0x3424382c - .long 0x392d3627 - .long 0x2e36243a - .long 0x233f2a31 - .long 0x68487058 - .long 0x65417e53 - .long 0x725a6c4e - .long 0x7f536245 - .long 0x5c6c4874 - .long 0x5165467f - .long 0x467e5462 - .long 0x4b775a69 - .long 0xd090e0b0 - .long 0xdd99eebb - .long 0xca82fca6 - .long 0xc78bf2ad - .long 0xe4b4d89c - .long 0xe9bdd697 - .long 0xfea6c48a - .long 0xf3afca81 - .long 0xb8d890e8 - .long 0xb5d19ee3 - .long 0xa2ca8cfe - .long 0xafc382f5 - .long 0x8cfca8c4 - .long 0x81f5a6cf - .long 0x96eeb4d2 - .long 0x9be7bad9 - .long 0xbb3bdb7b - .long 0xb632d570 - .long 0xa129c76d - .long 0xac20c966 - .long 0x8f1fe357 - .long 0x8216ed5c - .long 0x950dff41 - .long 0x9804f14a - .long 0xd373ab23 - .long 0xde7aa528 - .long 0xc961b735 - .long 0xc468b93e - .long 0xe757930f - .long 0xea5e9d04 - .long 0xfd458f19 - .long 0xf04c8112 - .long 0x6bab3bcb - .long 0x66a235c0 - .long 0x71b927dd - .long 0x7cb029d6 - .long 0x5f8f03e7 - .long 0x52860dec - .long 0x459d1ff1 - .long 0x489411fa - .long 0x03e34b93 - .long 0x0eea4598 - .long 0x19f15785 - .long 0x14f8598e - .long 0x37c773bf - .long 0x3ace7db4 - .long 0x2dd56fa9 - .long 0x20dc61a2 - .long 0x6d76adf6 - .long 0x607fa3fd - .long 0x7764b1e0 - .long 0x7a6dbfeb - .long 0x595295da - .long 0x545b9bd1 - .long 0x434089cc - .long 0x4e4987c7 - .long 0x053eddae - .long 0x0837d3a5 - .long 0x1f2cc1b8 - .long 0x1225cfb3 - .long 0x311ae582 - .long 0x3c13eb89 - .long 0x2b08f994 - .long 0x2601f79f - .long 0xbde64d46 - .long 0xb0ef434d - .long 0xa7f45150 - .long 0xaafd5f5b - .long 0x89c2756a - .long 0x84cb7b61 - .long 0x93d0697c - .long 0x9ed96777 - .long 0xd5ae3d1e - .long 0xd8a73315 - .long 0xcfbc2108 - .long 0xc2b52f03 - .long 0xe18a0532 - .long 0xec830b39 - .long 0xfb981924 - .long 0xf691172f - .long 0xd64d768d - .long 0xdb447886 - .long 0xcc5f6a9b - .long 0xc1566490 - .long 0xe2694ea1 - .long 0xef6040aa - .long 0xf87b52b7 - .long 0xf5725cbc - .long 0xbe0506d5 - .long 0xb30c08de - .long 0xa4171ac3 - .long 0xa91e14c8 - .long 0x8a213ef9 - .long 0x872830f2 - .long 0x903322ef - .long 0x9d3a2ce4 - .long 0x06dd963d - .long 0x0bd49836 - .long 0x1ccf8a2b - .long 0x11c68420 - .long 0x32f9ae11 - .long 
0x3ff0a01a - .long 0x28ebb207 - .long 0x25e2bc0c - .long 0x6e95e665 - .long 0x639ce86e - .long 0x7487fa73 - .long 0x798ef478 - .long 0x5ab1de49 - .long 0x57b8d042 - .long 0x40a3c25f - .long 0x4daacc54 - .long 0xdaec41f7 - .long 0xd7e54ffc - .long 0xc0fe5de1 - .long 0xcdf753ea - .long 0xeec879db - .long 0xe3c177d0 - .long 0xf4da65cd - .long 0xf9d36bc6 - .long 0xb2a431af - .long 0xbfad3fa4 - .long 0xa8b62db9 - .long 0xa5bf23b2 - .long 0x86800983 - .long 0x8b890788 - .long 0x9c921595 - .long 0x919b1b9e - .long 0x0a7ca147 - .long 0x0775af4c - .long 0x106ebd51 - .long 0x1d67b35a - .long 0x3e58996b - .long 0x33519760 - .long 0x244a857d - .long 0x29438b76 - .long 0x6234d11f - .long 0x6f3ddf14 - .long 0x7826cd09 - .long 0x752fc302 - .long 0x5610e933 - .long 0x5b19e738 - .long 0x4c02f525 - .long 0x410bfb2e - .long 0x61d79a8c - .long 0x6cde9487 - .long 0x7bc5869a - .long 0x76cc8891 - .long 0x55f3a2a0 - .long 0x58faacab - .long 0x4fe1beb6 - .long 0x42e8b0bd - .long 0x099fead4 - .long 0x0496e4df - .long 0x138df6c2 - .long 0x1e84f8c9 - .long 0x3dbbd2f8 - .long 0x30b2dcf3 - .long 0x27a9ceee - .long 0x2aa0c0e5 - .long 0xb1477a3c - .long 0xbc4e7437 - .long 0xab55662a - .long 0xa65c6821 - .long 0x85634210 - .long 0x886a4c1b - .long 0x9f715e06 - .long 0x9278500d - .long 0xd90f0a64 - .long 0xd406046f - .long 0xc31d1672 - .long 0xce141879 - .long 0xed2b3248 - .long 0xe0223c43 - .long 0xf7392e5e - .long 0xfa302055 - .long 0xb79aec01 - .long 0xba93e20a - .long 0xad88f017 - .long 0xa081fe1c - .long 0x83bed42d - .long 0x8eb7da26 - .long 0x99acc83b - .long 0x94a5c630 - .long 0xdfd29c59 - .long 0xd2db9252 - .long 0xc5c0804f - .long 0xc8c98e44 - .long 0xebf6a475 - .long 0xe6ffaa7e - .long 0xf1e4b863 - .long 0xfcedb668 - .long 0x670a0cb1 - .long 0x6a0302ba - .long 0x7d1810a7 - .long 0x70111eac - .long 0x532e349d - .long 0x5e273a96 - .long 0x493c288b - .long 0x44352680 - .long 0x0f427ce9 - .long 0x024b72e2 - .long 0x155060ff - .long 0x18596ef4 - .long 0x3b6644c5 - .long 0x366f4ace - .long 0x217458d3 - .long 0x2c7d56d8 - .long 0x0ca1377a - .long 0x01a83971 - .long 0x16b32b6c - .long 0x1bba2567 - .long 0x38850f56 - .long 0x358c015d - .long 0x22971340 - .long 0x2f9e1d4b - .long 0x64e94722 - .long 0x69e04929 - .long 0x7efb5b34 - .long 0x73f2553f - .long 0x50cd7f0e - .long 0x5dc47105 - .long 0x4adf6318 - .long 0x47d66d13 - .long 0xdc31d7ca - .long 0xd138d9c1 - .long 0xc623cbdc - .long 0xcb2ac5d7 - .long 0xe815efe6 - .long 0xe51ce1ed - .long 0xf207f3f0 - .long 0xff0efdfb - .long 0xb479a792 - .long 0xb970a999 - .long 0xae6bbb84 - .long 0xa362b58f - .long 0x805d9fbe - .long 0x8d5491b5 - .long 0x9a4f83a8 - .long 0x97468da3 - // Table 2. 
- .long 0x00000000 - .long 0x090e0b0d - .long 0x121c161a - .long 0x1b121d17 - .long 0x24382c34 - .long 0x2d362739 - .long 0x36243a2e - .long 0x3f2a3123 - .long 0x48705868 - .long 0x417e5365 - .long 0x5a6c4e72 - .long 0x5362457f - .long 0x6c48745c - .long 0x65467f51 - .long 0x7e546246 - .long 0x775a694b - .long 0x90e0b0d0 - .long 0x99eebbdd - .long 0x82fca6ca - .long 0x8bf2adc7 - .long 0xb4d89ce4 - .long 0xbdd697e9 - .long 0xa6c48afe - .long 0xafca81f3 - .long 0xd890e8b8 - .long 0xd19ee3b5 - .long 0xca8cfea2 - .long 0xc382f5af - .long 0xfca8c48c - .long 0xf5a6cf81 - .long 0xeeb4d296 - .long 0xe7bad99b - .long 0x3bdb7bbb - .long 0x32d570b6 - .long 0x29c76da1 - .long 0x20c966ac - .long 0x1fe3578f - .long 0x16ed5c82 - .long 0x0dff4195 - .long 0x04f14a98 - .long 0x73ab23d3 - .long 0x7aa528de - .long 0x61b735c9 - .long 0x68b93ec4 - .long 0x57930fe7 - .long 0x5e9d04ea - .long 0x458f19fd - .long 0x4c8112f0 - .long 0xab3bcb6b - .long 0xa235c066 - .long 0xb927dd71 - .long 0xb029d67c - .long 0x8f03e75f - .long 0x860dec52 - .long 0x9d1ff145 - .long 0x9411fa48 - .long 0xe34b9303 - .long 0xea45980e - .long 0xf1578519 - .long 0xf8598e14 - .long 0xc773bf37 - .long 0xce7db43a - .long 0xd56fa92d - .long 0xdc61a220 - .long 0x76adf66d - .long 0x7fa3fd60 - .long 0x64b1e077 - .long 0x6dbfeb7a - .long 0x5295da59 - .long 0x5b9bd154 - .long 0x4089cc43 - .long 0x4987c74e - .long 0x3eddae05 - .long 0x37d3a508 - .long 0x2cc1b81f - .long 0x25cfb312 - .long 0x1ae58231 - .long 0x13eb893c - .long 0x08f9942b - .long 0x01f79f26 - .long 0xe64d46bd - .long 0xef434db0 - .long 0xf45150a7 - .long 0xfd5f5baa - .long 0xc2756a89 - .long 0xcb7b6184 - .long 0xd0697c93 - .long 0xd967779e - .long 0xae3d1ed5 - .long 0xa73315d8 - .long 0xbc2108cf - .long 0xb52f03c2 - .long 0x8a0532e1 - .long 0x830b39ec - .long 0x981924fb - .long 0x91172ff6 - .long 0x4d768dd6 - .long 0x447886db - .long 0x5f6a9bcc - .long 0x566490c1 - .long 0x694ea1e2 - .long 0x6040aaef - .long 0x7b52b7f8 - .long 0x725cbcf5 - .long 0x0506d5be - .long 0x0c08deb3 - .long 0x171ac3a4 - .long 0x1e14c8a9 - .long 0x213ef98a - .long 0x2830f287 - .long 0x3322ef90 - .long 0x3a2ce49d - .long 0xdd963d06 - .long 0xd498360b - .long 0xcf8a2b1c - .long 0xc6842011 - .long 0xf9ae1132 - .long 0xf0a01a3f - .long 0xebb20728 - .long 0xe2bc0c25 - .long 0x95e6656e - .long 0x9ce86e63 - .long 0x87fa7374 - .long 0x8ef47879 - .long 0xb1de495a - .long 0xb8d04257 - .long 0xa3c25f40 - .long 0xaacc544d - .long 0xec41f7da - .long 0xe54ffcd7 - .long 0xfe5de1c0 - .long 0xf753eacd - .long 0xc879dbee - .long 0xc177d0e3 - .long 0xda65cdf4 - .long 0xd36bc6f9 - .long 0xa431afb2 - .long 0xad3fa4bf - .long 0xb62db9a8 - .long 0xbf23b2a5 - .long 0x80098386 - .long 0x8907888b - .long 0x9215959c - .long 0x9b1b9e91 - .long 0x7ca1470a - .long 0x75af4c07 - .long 0x6ebd5110 - .long 0x67b35a1d - .long 0x58996b3e - .long 0x51976033 - .long 0x4a857d24 - .long 0x438b7629 - .long 0x34d11f62 - .long 0x3ddf146f - .long 0x26cd0978 - .long 0x2fc30275 - .long 0x10e93356 - .long 0x19e7385b - .long 0x02f5254c - .long 0x0bfb2e41 - .long 0xd79a8c61 - .long 0xde94876c - .long 0xc5869a7b - .long 0xcc889176 - .long 0xf3a2a055 - .long 0xfaacab58 - .long 0xe1beb64f - .long 0xe8b0bd42 - .long 0x9fead409 - .long 0x96e4df04 - .long 0x8df6c213 - .long 0x84f8c91e - .long 0xbbd2f83d - .long 0xb2dcf330 - .long 0xa9ceee27 - .long 0xa0c0e52a - .long 0x477a3cb1 - .long 0x4e7437bc - .long 0x55662aab - .long 0x5c6821a6 - .long 0x63421085 - .long 0x6a4c1b88 - .long 0x715e069f - .long 0x78500d92 - .long 0x0f0a64d9 - .long 0x06046fd4 - .long 0x1d1672c3 - 
.long 0x141879ce - .long 0x2b3248ed - .long 0x223c43e0 - .long 0x392e5ef7 - .long 0x302055fa - .long 0x9aec01b7 - .long 0x93e20aba - .long 0x88f017ad - .long 0x81fe1ca0 - .long 0xbed42d83 - .long 0xb7da268e - .long 0xacc83b99 - .long 0xa5c63094 - .long 0xd29c59df - .long 0xdb9252d2 - .long 0xc0804fc5 - .long 0xc98e44c8 - .long 0xf6a475eb - .long 0xffaa7ee6 - .long 0xe4b863f1 - .long 0xedb668fc - .long 0x0a0cb167 - .long 0x0302ba6a - .long 0x1810a77d - .long 0x111eac70 - .long 0x2e349d53 - .long 0x273a965e - .long 0x3c288b49 - .long 0x35268044 - .long 0x427ce90f - .long 0x4b72e202 - .long 0x5060ff15 - .long 0x596ef418 - .long 0x6644c53b - .long 0x6f4ace36 - .long 0x7458d321 - .long 0x7d56d82c - .long 0xa1377a0c - .long 0xa8397101 - .long 0xb32b6c16 - .long 0xba25671b - .long 0x850f5638 - .long 0x8c015d35 - .long 0x97134022 - .long 0x9e1d4b2f - .long 0xe9472264 - .long 0xe0492969 - .long 0xfb5b347e - .long 0xf2553f73 - .long 0xcd7f0e50 - .long 0xc471055d - .long 0xdf63184a - .long 0xd66d1347 - .long 0x31d7cadc - .long 0x38d9c1d1 - .long 0x23cbdcc6 - .long 0x2ac5d7cb - .long 0x15efe6e8 - .long 0x1ce1ede5 - .long 0x07f3f0f2 - .long 0x0efdfbff - .long 0x79a792b4 - .long 0x70a999b9 - .long 0x6bbb84ae - .long 0x62b58fa3 - .long 0x5d9fbe80 - .long 0x5491b58d - .long 0x4f83a89a - .long 0x468da397 - // Table 3. - .long 0x00000000 - .long 0x0e0b0d09 - .long 0x1c161a12 - .long 0x121d171b - .long 0x382c3424 - .long 0x3627392d - .long 0x243a2e36 - .long 0x2a31233f - .long 0x70586848 - .long 0x7e536541 - .long 0x6c4e725a - .long 0x62457f53 - .long 0x48745c6c - .long 0x467f5165 - .long 0x5462467e - .long 0x5a694b77 - .long 0xe0b0d090 - .long 0xeebbdd99 - .long 0xfca6ca82 - .long 0xf2adc78b - .long 0xd89ce4b4 - .long 0xd697e9bd - .long 0xc48afea6 - .long 0xca81f3af - .long 0x90e8b8d8 - .long 0x9ee3b5d1 - .long 0x8cfea2ca - .long 0x82f5afc3 - .long 0xa8c48cfc - .long 0xa6cf81f5 - .long 0xb4d296ee - .long 0xbad99be7 - .long 0xdb7bbb3b - .long 0xd570b632 - .long 0xc76da129 - .long 0xc966ac20 - .long 0xe3578f1f - .long 0xed5c8216 - .long 0xff41950d - .long 0xf14a9804 - .long 0xab23d373 - .long 0xa528de7a - .long 0xb735c961 - .long 0xb93ec468 - .long 0x930fe757 - .long 0x9d04ea5e - .long 0x8f19fd45 - .long 0x8112f04c - .long 0x3bcb6bab - .long 0x35c066a2 - .long 0x27dd71b9 - .long 0x29d67cb0 - .long 0x03e75f8f - .long 0x0dec5286 - .long 0x1ff1459d - .long 0x11fa4894 - .long 0x4b9303e3 - .long 0x45980eea - .long 0x578519f1 - .long 0x598e14f8 - .long 0x73bf37c7 - .long 0x7db43ace - .long 0x6fa92dd5 - .long 0x61a220dc - .long 0xadf66d76 - .long 0xa3fd607f - .long 0xb1e07764 - .long 0xbfeb7a6d - .long 0x95da5952 - .long 0x9bd1545b - .long 0x89cc4340 - .long 0x87c74e49 - .long 0xddae053e - .long 0xd3a50837 - .long 0xc1b81f2c - .long 0xcfb31225 - .long 0xe582311a - .long 0xeb893c13 - .long 0xf9942b08 - .long 0xf79f2601 - .long 0x4d46bde6 - .long 0x434db0ef - .long 0x5150a7f4 - .long 0x5f5baafd - .long 0x756a89c2 - .long 0x7b6184cb - .long 0x697c93d0 - .long 0x67779ed9 - .long 0x3d1ed5ae - .long 0x3315d8a7 - .long 0x2108cfbc - .long 0x2f03c2b5 - .long 0x0532e18a - .long 0x0b39ec83 - .long 0x1924fb98 - .long 0x172ff691 - .long 0x768dd64d - .long 0x7886db44 - .long 0x6a9bcc5f - .long 0x6490c156 - .long 0x4ea1e269 - .long 0x40aaef60 - .long 0x52b7f87b - .long 0x5cbcf572 - .long 0x06d5be05 - .long 0x08deb30c - .long 0x1ac3a417 - .long 0x14c8a91e - .long 0x3ef98a21 - .long 0x30f28728 - .long 0x22ef9033 - .long 0x2ce49d3a - .long 0x963d06dd - .long 0x98360bd4 - .long 0x8a2b1ccf - .long 0x842011c6 - .long 0xae1132f9 - .long 
0xa01a3ff0 - .long 0xb20728eb - .long 0xbc0c25e2 - .long 0xe6656e95 - .long 0xe86e639c - .long 0xfa737487 - .long 0xf478798e - .long 0xde495ab1 - .long 0xd04257b8 - .long 0xc25f40a3 - .long 0xcc544daa - .long 0x41f7daec - .long 0x4ffcd7e5 - .long 0x5de1c0fe - .long 0x53eacdf7 - .long 0x79dbeec8 - .long 0x77d0e3c1 - .long 0x65cdf4da - .long 0x6bc6f9d3 - .long 0x31afb2a4 - .long 0x3fa4bfad - .long 0x2db9a8b6 - .long 0x23b2a5bf - .long 0x09838680 - .long 0x07888b89 - .long 0x15959c92 - .long 0x1b9e919b - .long 0xa1470a7c - .long 0xaf4c0775 - .long 0xbd51106e - .long 0xb35a1d67 - .long 0x996b3e58 - .long 0x97603351 - .long 0x857d244a - .long 0x8b762943 - .long 0xd11f6234 - .long 0xdf146f3d - .long 0xcd097826 - .long 0xc302752f - .long 0xe9335610 - .long 0xe7385b19 - .long 0xf5254c02 - .long 0xfb2e410b - .long 0x9a8c61d7 - .long 0x94876cde - .long 0x869a7bc5 - .long 0x889176cc - .long 0xa2a055f3 - .long 0xacab58fa - .long 0xbeb64fe1 - .long 0xb0bd42e8 - .long 0xead4099f - .long 0xe4df0496 - .long 0xf6c2138d - .long 0xf8c91e84 - .long 0xd2f83dbb - .long 0xdcf330b2 - .long 0xceee27a9 - .long 0xc0e52aa0 - .long 0x7a3cb147 - .long 0x7437bc4e - .long 0x662aab55 - .long 0x6821a65c - .long 0x42108563 - .long 0x4c1b886a - .long 0x5e069f71 - .long 0x500d9278 - .long 0x0a64d90f - .long 0x046fd406 - .long 0x1672c31d - .long 0x1879ce14 - .long 0x3248ed2b - .long 0x3c43e022 - .long 0x2e5ef739 - .long 0x2055fa30 - .long 0xec01b79a - .long 0xe20aba93 - .long 0xf017ad88 - .long 0xfe1ca081 - .long 0xd42d83be - .long 0xda268eb7 - .long 0xc83b99ac - .long 0xc63094a5 - .long 0x9c59dfd2 - .long 0x9252d2db - .long 0x804fc5c0 - .long 0x8e44c8c9 - .long 0xa475ebf6 - .long 0xaa7ee6ff - .long 0xb863f1e4 - .long 0xb668fced - .long 0x0cb1670a - .long 0x02ba6a03 - .long 0x10a77d18 - .long 0x1eac7011 - .long 0x349d532e - .long 0x3a965e27 - .long 0x288b493c - .long 0x26804435 - .long 0x7ce90f42 - .long 0x72e2024b - .long 0x60ff1550 - .long 0x6ef41859 - .long 0x44c53b66 - .long 0x4ace366f - .long 0x58d32174 - .long 0x56d82c7d - .long 0x377a0ca1 - .long 0x397101a8 - .long 0x2b6c16b3 - .long 0x25671bba - .long 0x0f563885 - .long 0x015d358c - .long 0x13402297 - .long 0x1d4b2f9e - .long 0x472264e9 - .long 0x492969e0 - .long 0x5b347efb - .long 0x553f73f2 - .long 0x7f0e50cd - .long 0x71055dc4 - .long 0x63184adf - .long 0x6d1347d6 - .long 0xd7cadc31 - .long 0xd9c1d138 - .long 0xcbdcc623 - .long 0xc5d7cb2a - .long 0xefe6e815 - .long 0xe1ede51c - .long 0xf3f0f207 - .long 0xfdfbff0e - .long 0xa792b479 - .long 0xa999b970 - .long 0xbb84ae6b - .long 0xb58fa362 - .long 0x9fbe805d - .long 0x91b58d54 - .long 0x83a89a4f - .long 0x8da39746 - - -// Tables for main encryption iterations. - .globl _AESEncryptTable - .private_extern _AESEncryptTable - .align 2 -_AESEncryptTable: - // Table 0. 
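Each word below folds SubBytes and MixColumns into a single lookup: for index x with s = SBox[x], Table 0 stores 02·s in the low byte, then s, s, and 03·s, so four table fetches and three xors yield a whole output column per round. A short C sketch reproducing the first entries (illustrative; the four S-box bytes are taken from the SubBytes tables later in this hunk):

    /* Reconstruct the leading _AESEncryptTable Table 0 entries. */
    #include <stdio.h>
    #include <stdint.h>

    static const uint8_t sbox[4] = { 0x63, 0x7c, 0x77, 0x7b };

    static uint8_t xtime(uint8_t a) /* multiply by 2 in GF(2^8) */
    {
        return (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
    }

    int main(void)
    {
        for (unsigned x = 0; x < 4; x++) {
            uint8_t s = sbox[x];
            uint32_t w = (uint32_t)xtime(s)              /* 02·s, low byte   */
                       | (uint32_t)s << 8                /* 01·s             */
                       | (uint32_t)s << 16               /* 01·s             */
                       | (uint32_t)(xtime(s) ^ s) << 24; /* 03·s = 02·s ^ s  */
            printf("0x%08x\n", w); /* 0xa56363c6, 0x847c7cf8, 0x997777ee, ... */
        }
        return 0;
    }

The _AESDecryptTable further down appears to be built the same way from the inverse S-box and the {0e,09,0d,0b} column: its first word 0x50a7f451 is exactly those four products of InvSBox[0] = 0x52.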
- .long 0xa56363c6 - .long 0x847c7cf8 - .long 0x997777ee - .long 0x8d7b7bf6 - .long 0x0df2f2ff - .long 0xbd6b6bd6 - .long 0xb16f6fde - .long 0x54c5c591 - .long 0x50303060 - .long 0x03010102 - .long 0xa96767ce - .long 0x7d2b2b56 - .long 0x19fefee7 - .long 0x62d7d7b5 - .long 0xe6abab4d - .long 0x9a7676ec - .long 0x45caca8f - .long 0x9d82821f - .long 0x40c9c989 - .long 0x877d7dfa - .long 0x15fafaef - .long 0xeb5959b2 - .long 0xc947478e - .long 0x0bf0f0fb - .long 0xecadad41 - .long 0x67d4d4b3 - .long 0xfda2a25f - .long 0xeaafaf45 - .long 0xbf9c9c23 - .long 0xf7a4a453 - .long 0x967272e4 - .long 0x5bc0c09b - .long 0xc2b7b775 - .long 0x1cfdfde1 - .long 0xae93933d - .long 0x6a26264c - .long 0x5a36366c - .long 0x413f3f7e - .long 0x02f7f7f5 - .long 0x4fcccc83 - .long 0x5c343468 - .long 0xf4a5a551 - .long 0x34e5e5d1 - .long 0x08f1f1f9 - .long 0x937171e2 - .long 0x73d8d8ab - .long 0x53313162 - .long 0x3f15152a - .long 0x0c040408 - .long 0x52c7c795 - .long 0x65232346 - .long 0x5ec3c39d - .long 0x28181830 - .long 0xa1969637 - .long 0x0f05050a - .long 0xb59a9a2f - .long 0x0907070e - .long 0x36121224 - .long 0x9b80801b - .long 0x3de2e2df - .long 0x26ebebcd - .long 0x6927274e - .long 0xcdb2b27f - .long 0x9f7575ea - .long 0x1b090912 - .long 0x9e83831d - .long 0x742c2c58 - .long 0x2e1a1a34 - .long 0x2d1b1b36 - .long 0xb26e6edc - .long 0xee5a5ab4 - .long 0xfba0a05b - .long 0xf65252a4 - .long 0x4d3b3b76 - .long 0x61d6d6b7 - .long 0xceb3b37d - .long 0x7b292952 - .long 0x3ee3e3dd - .long 0x712f2f5e - .long 0x97848413 - .long 0xf55353a6 - .long 0x68d1d1b9 - .long 0x00000000 - .long 0x2cededc1 - .long 0x60202040 - .long 0x1ffcfce3 - .long 0xc8b1b179 - .long 0xed5b5bb6 - .long 0xbe6a6ad4 - .long 0x46cbcb8d - .long 0xd9bebe67 - .long 0x4b393972 - .long 0xde4a4a94 - .long 0xd44c4c98 - .long 0xe85858b0 - .long 0x4acfcf85 - .long 0x6bd0d0bb - .long 0x2aefefc5 - .long 0xe5aaaa4f - .long 0x16fbfbed - .long 0xc5434386 - .long 0xd74d4d9a - .long 0x55333366 - .long 0x94858511 - .long 0xcf45458a - .long 0x10f9f9e9 - .long 0x06020204 - .long 0x817f7ffe - .long 0xf05050a0 - .long 0x443c3c78 - .long 0xba9f9f25 - .long 0xe3a8a84b - .long 0xf35151a2 - .long 0xfea3a35d - .long 0xc0404080 - .long 0x8a8f8f05 - .long 0xad92923f - .long 0xbc9d9d21 - .long 0x48383870 - .long 0x04f5f5f1 - .long 0xdfbcbc63 - .long 0xc1b6b677 - .long 0x75dadaaf - .long 0x63212142 - .long 0x30101020 - .long 0x1affffe5 - .long 0x0ef3f3fd - .long 0x6dd2d2bf - .long 0x4ccdcd81 - .long 0x140c0c18 - .long 0x35131326 - .long 0x2fececc3 - .long 0xe15f5fbe - .long 0xa2979735 - .long 0xcc444488 - .long 0x3917172e - .long 0x57c4c493 - .long 0xf2a7a755 - .long 0x827e7efc - .long 0x473d3d7a - .long 0xac6464c8 - .long 0xe75d5dba - .long 0x2b191932 - .long 0x957373e6 - .long 0xa06060c0 - .long 0x98818119 - .long 0xd14f4f9e - .long 0x7fdcdca3 - .long 0x66222244 - .long 0x7e2a2a54 - .long 0xab90903b - .long 0x8388880b - .long 0xca46468c - .long 0x29eeeec7 - .long 0xd3b8b86b - .long 0x3c141428 - .long 0x79dedea7 - .long 0xe25e5ebc - .long 0x1d0b0b16 - .long 0x76dbdbad - .long 0x3be0e0db - .long 0x56323264 - .long 0x4e3a3a74 - .long 0x1e0a0a14 - .long 0xdb494992 - .long 0x0a06060c - .long 0x6c242448 - .long 0xe45c5cb8 - .long 0x5dc2c29f - .long 0x6ed3d3bd - .long 0xefacac43 - .long 0xa66262c4 - .long 0xa8919139 - .long 0xa4959531 - .long 0x37e4e4d3 - .long 0x8b7979f2 - .long 0x32e7e7d5 - .long 0x43c8c88b - .long 0x5937376e - .long 0xb76d6dda - .long 0x8c8d8d01 - .long 0x64d5d5b1 - .long 0xd24e4e9c - .long 0xe0a9a949 - .long 0xb46c6cd8 - .long 0xfa5656ac - .long 0x07f4f4f3 - 
.long 0x25eaeacf - .long 0xaf6565ca - .long 0x8e7a7af4 - .long 0xe9aeae47 - .long 0x18080810 - .long 0xd5baba6f - .long 0x887878f0 - .long 0x6f25254a - .long 0x722e2e5c - .long 0x241c1c38 - .long 0xf1a6a657 - .long 0xc7b4b473 - .long 0x51c6c697 - .long 0x23e8e8cb - .long 0x7cdddda1 - .long 0x9c7474e8 - .long 0x211f1f3e - .long 0xdd4b4b96 - .long 0xdcbdbd61 - .long 0x868b8b0d - .long 0x858a8a0f - .long 0x907070e0 - .long 0x423e3e7c - .long 0xc4b5b571 - .long 0xaa6666cc - .long 0xd8484890 - .long 0x05030306 - .long 0x01f6f6f7 - .long 0x120e0e1c - .long 0xa36161c2 - .long 0x5f35356a - .long 0xf95757ae - .long 0xd0b9b969 - .long 0x91868617 - .long 0x58c1c199 - .long 0x271d1d3a - .long 0xb99e9e27 - .long 0x38e1e1d9 - .long 0x13f8f8eb - .long 0xb398982b - .long 0x33111122 - .long 0xbb6969d2 - .long 0x70d9d9a9 - .long 0x898e8e07 - .long 0xa7949433 - .long 0xb69b9b2d - .long 0x221e1e3c - .long 0x92878715 - .long 0x20e9e9c9 - .long 0x49cece87 - .long 0xff5555aa - .long 0x78282850 - .long 0x7adfdfa5 - .long 0x8f8c8c03 - .long 0xf8a1a159 - .long 0x80898909 - .long 0x170d0d1a - .long 0xdabfbf65 - .long 0x31e6e6d7 - .long 0xc6424284 - .long 0xb86868d0 - .long 0xc3414182 - .long 0xb0999929 - .long 0x772d2d5a - .long 0x110f0f1e - .long 0xcbb0b07b - .long 0xfc5454a8 - .long 0xd6bbbb6d - .long 0x3a16162c - // Table 1. - .long 0x6363c6a5 - .long 0x7c7cf884 - .long 0x7777ee99 - .long 0x7b7bf68d - .long 0xf2f2ff0d - .long 0x6b6bd6bd - .long 0x6f6fdeb1 - .long 0xc5c59154 - .long 0x30306050 - .long 0x01010203 - .long 0x6767cea9 - .long 0x2b2b567d - .long 0xfefee719 - .long 0xd7d7b562 - .long 0xabab4de6 - .long 0x7676ec9a - .long 0xcaca8f45 - .long 0x82821f9d - .long 0xc9c98940 - .long 0x7d7dfa87 - .long 0xfafaef15 - .long 0x5959b2eb - .long 0x47478ec9 - .long 0xf0f0fb0b - .long 0xadad41ec - .long 0xd4d4b367 - .long 0xa2a25ffd - .long 0xafaf45ea - .long 0x9c9c23bf - .long 0xa4a453f7 - .long 0x7272e496 - .long 0xc0c09b5b - .long 0xb7b775c2 - .long 0xfdfde11c - .long 0x93933dae - .long 0x26264c6a - .long 0x36366c5a - .long 0x3f3f7e41 - .long 0xf7f7f502 - .long 0xcccc834f - .long 0x3434685c - .long 0xa5a551f4 - .long 0xe5e5d134 - .long 0xf1f1f908 - .long 0x7171e293 - .long 0xd8d8ab73 - .long 0x31316253 - .long 0x15152a3f - .long 0x0404080c - .long 0xc7c79552 - .long 0x23234665 - .long 0xc3c39d5e - .long 0x18183028 - .long 0x969637a1 - .long 0x05050a0f - .long 0x9a9a2fb5 - .long 0x07070e09 - .long 0x12122436 - .long 0x80801b9b - .long 0xe2e2df3d - .long 0xebebcd26 - .long 0x27274e69 - .long 0xb2b27fcd - .long 0x7575ea9f - .long 0x0909121b - .long 0x83831d9e - .long 0x2c2c5874 - .long 0x1a1a342e - .long 0x1b1b362d - .long 0x6e6edcb2 - .long 0x5a5ab4ee - .long 0xa0a05bfb - .long 0x5252a4f6 - .long 0x3b3b764d - .long 0xd6d6b761 - .long 0xb3b37dce - .long 0x2929527b - .long 0xe3e3dd3e - .long 0x2f2f5e71 - .long 0x84841397 - .long 0x5353a6f5 - .long 0xd1d1b968 - .long 0x00000000 - .long 0xededc12c - .long 0x20204060 - .long 0xfcfce31f - .long 0xb1b179c8 - .long 0x5b5bb6ed - .long 0x6a6ad4be - .long 0xcbcb8d46 - .long 0xbebe67d9 - .long 0x3939724b - .long 0x4a4a94de - .long 0x4c4c98d4 - .long 0x5858b0e8 - .long 0xcfcf854a - .long 0xd0d0bb6b - .long 0xefefc52a - .long 0xaaaa4fe5 - .long 0xfbfbed16 - .long 0x434386c5 - .long 0x4d4d9ad7 - .long 0x33336655 - .long 0x85851194 - .long 0x45458acf - .long 0xf9f9e910 - .long 0x02020406 - .long 0x7f7ffe81 - .long 0x5050a0f0 - .long 0x3c3c7844 - .long 0x9f9f25ba - .long 0xa8a84be3 - .long 0x5151a2f3 - .long 0xa3a35dfe - .long 0x404080c0 - .long 0x8f8f058a - .long 0x92923fad - .long 
0x9d9d21bc - .long 0x38387048 - .long 0xf5f5f104 - .long 0xbcbc63df - .long 0xb6b677c1 - .long 0xdadaaf75 - .long 0x21214263 - .long 0x10102030 - .long 0xffffe51a - .long 0xf3f3fd0e - .long 0xd2d2bf6d - .long 0xcdcd814c - .long 0x0c0c1814 - .long 0x13132635 - .long 0xececc32f - .long 0x5f5fbee1 - .long 0x979735a2 - .long 0x444488cc - .long 0x17172e39 - .long 0xc4c49357 - .long 0xa7a755f2 - .long 0x7e7efc82 - .long 0x3d3d7a47 - .long 0x6464c8ac - .long 0x5d5dbae7 - .long 0x1919322b - .long 0x7373e695 - .long 0x6060c0a0 - .long 0x81811998 - .long 0x4f4f9ed1 - .long 0xdcdca37f - .long 0x22224466 - .long 0x2a2a547e - .long 0x90903bab - .long 0x88880b83 - .long 0x46468cca - .long 0xeeeec729 - .long 0xb8b86bd3 - .long 0x1414283c - .long 0xdedea779 - .long 0x5e5ebce2 - .long 0x0b0b161d - .long 0xdbdbad76 - .long 0xe0e0db3b - .long 0x32326456 - .long 0x3a3a744e - .long 0x0a0a141e - .long 0x494992db - .long 0x06060c0a - .long 0x2424486c - .long 0x5c5cb8e4 - .long 0xc2c29f5d - .long 0xd3d3bd6e - .long 0xacac43ef - .long 0x6262c4a6 - .long 0x919139a8 - .long 0x959531a4 - .long 0xe4e4d337 - .long 0x7979f28b - .long 0xe7e7d532 - .long 0xc8c88b43 - .long 0x37376e59 - .long 0x6d6ddab7 - .long 0x8d8d018c - .long 0xd5d5b164 - .long 0x4e4e9cd2 - .long 0xa9a949e0 - .long 0x6c6cd8b4 - .long 0x5656acfa - .long 0xf4f4f307 - .long 0xeaeacf25 - .long 0x6565caaf - .long 0x7a7af48e - .long 0xaeae47e9 - .long 0x08081018 - .long 0xbaba6fd5 - .long 0x7878f088 - .long 0x25254a6f - .long 0x2e2e5c72 - .long 0x1c1c3824 - .long 0xa6a657f1 - .long 0xb4b473c7 - .long 0xc6c69751 - .long 0xe8e8cb23 - .long 0xdddda17c - .long 0x7474e89c - .long 0x1f1f3e21 - .long 0x4b4b96dd - .long 0xbdbd61dc - .long 0x8b8b0d86 - .long 0x8a8a0f85 - .long 0x7070e090 - .long 0x3e3e7c42 - .long 0xb5b571c4 - .long 0x6666ccaa - .long 0x484890d8 - .long 0x03030605 - .long 0xf6f6f701 - .long 0x0e0e1c12 - .long 0x6161c2a3 - .long 0x35356a5f - .long 0x5757aef9 - .long 0xb9b969d0 - .long 0x86861791 - .long 0xc1c19958 - .long 0x1d1d3a27 - .long 0x9e9e27b9 - .long 0xe1e1d938 - .long 0xf8f8eb13 - .long 0x98982bb3 - .long 0x11112233 - .long 0x6969d2bb - .long 0xd9d9a970 - .long 0x8e8e0789 - .long 0x949433a7 - .long 0x9b9b2db6 - .long 0x1e1e3c22 - .long 0x87871592 - .long 0xe9e9c920 - .long 0xcece8749 - .long 0x5555aaff - .long 0x28285078 - .long 0xdfdfa57a - .long 0x8c8c038f - .long 0xa1a159f8 - .long 0x89890980 - .long 0x0d0d1a17 - .long 0xbfbf65da - .long 0xe6e6d731 - .long 0x424284c6 - .long 0x6868d0b8 - .long 0x414182c3 - .long 0x999929b0 - .long 0x2d2d5a77 - .long 0x0f0f1e11 - .long 0xb0b07bcb - .long 0x5454a8fc - .long 0xbbbb6dd6 - .long 0x16162c3a - // Table 2. 
- .long 0x63c6a563 - .long 0x7cf8847c - .long 0x77ee9977 - .long 0x7bf68d7b - .long 0xf2ff0df2 - .long 0x6bd6bd6b - .long 0x6fdeb16f - .long 0xc59154c5 - .long 0x30605030 - .long 0x01020301 - .long 0x67cea967 - .long 0x2b567d2b - .long 0xfee719fe - .long 0xd7b562d7 - .long 0xab4de6ab - .long 0x76ec9a76 - .long 0xca8f45ca - .long 0x821f9d82 - .long 0xc98940c9 - .long 0x7dfa877d - .long 0xfaef15fa - .long 0x59b2eb59 - .long 0x478ec947 - .long 0xf0fb0bf0 - .long 0xad41ecad - .long 0xd4b367d4 - .long 0xa25ffda2 - .long 0xaf45eaaf - .long 0x9c23bf9c - .long 0xa453f7a4 - .long 0x72e49672 - .long 0xc09b5bc0 - .long 0xb775c2b7 - .long 0xfde11cfd - .long 0x933dae93 - .long 0x264c6a26 - .long 0x366c5a36 - .long 0x3f7e413f - .long 0xf7f502f7 - .long 0xcc834fcc - .long 0x34685c34 - .long 0xa551f4a5 - .long 0xe5d134e5 - .long 0xf1f908f1 - .long 0x71e29371 - .long 0xd8ab73d8 - .long 0x31625331 - .long 0x152a3f15 - .long 0x04080c04 - .long 0xc79552c7 - .long 0x23466523 - .long 0xc39d5ec3 - .long 0x18302818 - .long 0x9637a196 - .long 0x050a0f05 - .long 0x9a2fb59a - .long 0x070e0907 - .long 0x12243612 - .long 0x801b9b80 - .long 0xe2df3de2 - .long 0xebcd26eb - .long 0x274e6927 - .long 0xb27fcdb2 - .long 0x75ea9f75 - .long 0x09121b09 - .long 0x831d9e83 - .long 0x2c58742c - .long 0x1a342e1a - .long 0x1b362d1b - .long 0x6edcb26e - .long 0x5ab4ee5a - .long 0xa05bfba0 - .long 0x52a4f652 - .long 0x3b764d3b - .long 0xd6b761d6 - .long 0xb37dceb3 - .long 0x29527b29 - .long 0xe3dd3ee3 - .long 0x2f5e712f - .long 0x84139784 - .long 0x53a6f553 - .long 0xd1b968d1 - .long 0x00000000 - .long 0xedc12ced - .long 0x20406020 - .long 0xfce31ffc - .long 0xb179c8b1 - .long 0x5bb6ed5b - .long 0x6ad4be6a - .long 0xcb8d46cb - .long 0xbe67d9be - .long 0x39724b39 - .long 0x4a94de4a - .long 0x4c98d44c - .long 0x58b0e858 - .long 0xcf854acf - .long 0xd0bb6bd0 - .long 0xefc52aef - .long 0xaa4fe5aa - .long 0xfbed16fb - .long 0x4386c543 - .long 0x4d9ad74d - .long 0x33665533 - .long 0x85119485 - .long 0x458acf45 - .long 0xf9e910f9 - .long 0x02040602 - .long 0x7ffe817f - .long 0x50a0f050 - .long 0x3c78443c - .long 0x9f25ba9f - .long 0xa84be3a8 - .long 0x51a2f351 - .long 0xa35dfea3 - .long 0x4080c040 - .long 0x8f058a8f - .long 0x923fad92 - .long 0x9d21bc9d - .long 0x38704838 - .long 0xf5f104f5 - .long 0xbc63dfbc - .long 0xb677c1b6 - .long 0xdaaf75da - .long 0x21426321 - .long 0x10203010 - .long 0xffe51aff - .long 0xf3fd0ef3 - .long 0xd2bf6dd2 - .long 0xcd814ccd - .long 0x0c18140c - .long 0x13263513 - .long 0xecc32fec - .long 0x5fbee15f - .long 0x9735a297 - .long 0x4488cc44 - .long 0x172e3917 - .long 0xc49357c4 - .long 0xa755f2a7 - .long 0x7efc827e - .long 0x3d7a473d - .long 0x64c8ac64 - .long 0x5dbae75d - .long 0x19322b19 - .long 0x73e69573 - .long 0x60c0a060 - .long 0x81199881 - .long 0x4f9ed14f - .long 0xdca37fdc - .long 0x22446622 - .long 0x2a547e2a - .long 0x903bab90 - .long 0x880b8388 - .long 0x468cca46 - .long 0xeec729ee - .long 0xb86bd3b8 - .long 0x14283c14 - .long 0xdea779de - .long 0x5ebce25e - .long 0x0b161d0b - .long 0xdbad76db - .long 0xe0db3be0 - .long 0x32645632 - .long 0x3a744e3a - .long 0x0a141e0a - .long 0x4992db49 - .long 0x060c0a06 - .long 0x24486c24 - .long 0x5cb8e45c - .long 0xc29f5dc2 - .long 0xd3bd6ed3 - .long 0xac43efac - .long 0x62c4a662 - .long 0x9139a891 - .long 0x9531a495 - .long 0xe4d337e4 - .long 0x79f28b79 - .long 0xe7d532e7 - .long 0xc88b43c8 - .long 0x376e5937 - .long 0x6ddab76d - .long 0x8d018c8d - .long 0xd5b164d5 - .long 0x4e9cd24e - .long 0xa949e0a9 - .long 0x6cd8b46c - .long 0x56acfa56 - .long 0xf4f307f4 - 
.long 0xeacf25ea - .long 0x65caaf65 - .long 0x7af48e7a - .long 0xae47e9ae - .long 0x08101808 - .long 0xba6fd5ba - .long 0x78f08878 - .long 0x254a6f25 - .long 0x2e5c722e - .long 0x1c38241c - .long 0xa657f1a6 - .long 0xb473c7b4 - .long 0xc69751c6 - .long 0xe8cb23e8 - .long 0xdda17cdd - .long 0x74e89c74 - .long 0x1f3e211f - .long 0x4b96dd4b - .long 0xbd61dcbd - .long 0x8b0d868b - .long 0x8a0f858a - .long 0x70e09070 - .long 0x3e7c423e - .long 0xb571c4b5 - .long 0x66ccaa66 - .long 0x4890d848 - .long 0x03060503 - .long 0xf6f701f6 - .long 0x0e1c120e - .long 0x61c2a361 - .long 0x356a5f35 - .long 0x57aef957 - .long 0xb969d0b9 - .long 0x86179186 - .long 0xc19958c1 - .long 0x1d3a271d - .long 0x9e27b99e - .long 0xe1d938e1 - .long 0xf8eb13f8 - .long 0x982bb398 - .long 0x11223311 - .long 0x69d2bb69 - .long 0xd9a970d9 - .long 0x8e07898e - .long 0x9433a794 - .long 0x9b2db69b - .long 0x1e3c221e - .long 0x87159287 - .long 0xe9c920e9 - .long 0xce8749ce - .long 0x55aaff55 - .long 0x28507828 - .long 0xdfa57adf - .long 0x8c038f8c - .long 0xa159f8a1 - .long 0x89098089 - .long 0x0d1a170d - .long 0xbf65dabf - .long 0xe6d731e6 - .long 0x4284c642 - .long 0x68d0b868 - .long 0x4182c341 - .long 0x9929b099 - .long 0x2d5a772d - .long 0x0f1e110f - .long 0xb07bcbb0 - .long 0x54a8fc54 - .long 0xbb6dd6bb - .long 0x162c3a16 - // Table 3. - .long 0xc6a56363 - .long 0xf8847c7c - .long 0xee997777 - .long 0xf68d7b7b - .long 0xff0df2f2 - .long 0xd6bd6b6b - .long 0xdeb16f6f - .long 0x9154c5c5 - .long 0x60503030 - .long 0x02030101 - .long 0xcea96767 - .long 0x567d2b2b - .long 0xe719fefe - .long 0xb562d7d7 - .long 0x4de6abab - .long 0xec9a7676 - .long 0x8f45caca - .long 0x1f9d8282 - .long 0x8940c9c9 - .long 0xfa877d7d - .long 0xef15fafa - .long 0xb2eb5959 - .long 0x8ec94747 - .long 0xfb0bf0f0 - .long 0x41ecadad - .long 0xb367d4d4 - .long 0x5ffda2a2 - .long 0x45eaafaf - .long 0x23bf9c9c - .long 0x53f7a4a4 - .long 0xe4967272 - .long 0x9b5bc0c0 - .long 0x75c2b7b7 - .long 0xe11cfdfd - .long 0x3dae9393 - .long 0x4c6a2626 - .long 0x6c5a3636 - .long 0x7e413f3f - .long 0xf502f7f7 - .long 0x834fcccc - .long 0x685c3434 - .long 0x51f4a5a5 - .long 0xd134e5e5 - .long 0xf908f1f1 - .long 0xe2937171 - .long 0xab73d8d8 - .long 0x62533131 - .long 0x2a3f1515 - .long 0x080c0404 - .long 0x9552c7c7 - .long 0x46652323 - .long 0x9d5ec3c3 - .long 0x30281818 - .long 0x37a19696 - .long 0x0a0f0505 - .long 0x2fb59a9a - .long 0x0e090707 - .long 0x24361212 - .long 0x1b9b8080 - .long 0xdf3de2e2 - .long 0xcd26ebeb - .long 0x4e692727 - .long 0x7fcdb2b2 - .long 0xea9f7575 - .long 0x121b0909 - .long 0x1d9e8383 - .long 0x58742c2c - .long 0x342e1a1a - .long 0x362d1b1b - .long 0xdcb26e6e - .long 0xb4ee5a5a - .long 0x5bfba0a0 - .long 0xa4f65252 - .long 0x764d3b3b - .long 0xb761d6d6 - .long 0x7dceb3b3 - .long 0x527b2929 - .long 0xdd3ee3e3 - .long 0x5e712f2f - .long 0x13978484 - .long 0xa6f55353 - .long 0xb968d1d1 - .long 0x00000000 - .long 0xc12ceded - .long 0x40602020 - .long 0xe31ffcfc - .long 0x79c8b1b1 - .long 0xb6ed5b5b - .long 0xd4be6a6a - .long 0x8d46cbcb - .long 0x67d9bebe - .long 0x724b3939 - .long 0x94de4a4a - .long 0x98d44c4c - .long 0xb0e85858 - .long 0x854acfcf - .long 0xbb6bd0d0 - .long 0xc52aefef - .long 0x4fe5aaaa - .long 0xed16fbfb - .long 0x86c54343 - .long 0x9ad74d4d - .long 0x66553333 - .long 0x11948585 - .long 0x8acf4545 - .long 0xe910f9f9 - .long 0x04060202 - .long 0xfe817f7f - .long 0xa0f05050 - .long 0x78443c3c - .long 0x25ba9f9f - .long 0x4be3a8a8 - .long 0xa2f35151 - .long 0x5dfea3a3 - .long 0x80c04040 - .long 0x058a8f8f - .long 0x3fad9292 - .long 
0x21bc9d9d - .long 0x70483838 - .long 0xf104f5f5 - .long 0x63dfbcbc - .long 0x77c1b6b6 - .long 0xaf75dada - .long 0x42632121 - .long 0x20301010 - .long 0xe51affff - .long 0xfd0ef3f3 - .long 0xbf6dd2d2 - .long 0x814ccdcd - .long 0x18140c0c - .long 0x26351313 - .long 0xc32fecec - .long 0xbee15f5f - .long 0x35a29797 - .long 0x88cc4444 - .long 0x2e391717 - .long 0x9357c4c4 - .long 0x55f2a7a7 - .long 0xfc827e7e - .long 0x7a473d3d - .long 0xc8ac6464 - .long 0xbae75d5d - .long 0x322b1919 - .long 0xe6957373 - .long 0xc0a06060 - .long 0x19988181 - .long 0x9ed14f4f - .long 0xa37fdcdc - .long 0x44662222 - .long 0x547e2a2a - .long 0x3bab9090 - .long 0x0b838888 - .long 0x8cca4646 - .long 0xc729eeee - .long 0x6bd3b8b8 - .long 0x283c1414 - .long 0xa779dede - .long 0xbce25e5e - .long 0x161d0b0b - .long 0xad76dbdb - .long 0xdb3be0e0 - .long 0x64563232 - .long 0x744e3a3a - .long 0x141e0a0a - .long 0x92db4949 - .long 0x0c0a0606 - .long 0x486c2424 - .long 0xb8e45c5c - .long 0x9f5dc2c2 - .long 0xbd6ed3d3 - .long 0x43efacac - .long 0xc4a66262 - .long 0x39a89191 - .long 0x31a49595 - .long 0xd337e4e4 - .long 0xf28b7979 - .long 0xd532e7e7 - .long 0x8b43c8c8 - .long 0x6e593737 - .long 0xdab76d6d - .long 0x018c8d8d - .long 0xb164d5d5 - .long 0x9cd24e4e - .long 0x49e0a9a9 - .long 0xd8b46c6c - .long 0xacfa5656 - .long 0xf307f4f4 - .long 0xcf25eaea - .long 0xcaaf6565 - .long 0xf48e7a7a - .long 0x47e9aeae - .long 0x10180808 - .long 0x6fd5baba - .long 0xf0887878 - .long 0x4a6f2525 - .long 0x5c722e2e - .long 0x38241c1c - .long 0x57f1a6a6 - .long 0x73c7b4b4 - .long 0x9751c6c6 - .long 0xcb23e8e8 - .long 0xa17cdddd - .long 0xe89c7474 - .long 0x3e211f1f - .long 0x96dd4b4b - .long 0x61dcbdbd - .long 0x0d868b8b - .long 0x0f858a8a - .long 0xe0907070 - .long 0x7c423e3e - .long 0x71c4b5b5 - .long 0xccaa6666 - .long 0x90d84848 - .long 0x06050303 - .long 0xf701f6f6 - .long 0x1c120e0e - .long 0xc2a36161 - .long 0x6a5f3535 - .long 0xaef95757 - .long 0x69d0b9b9 - .long 0x17918686 - .long 0x9958c1c1 - .long 0x3a271d1d - .long 0x27b99e9e - .long 0xd938e1e1 - .long 0xeb13f8f8 - .long 0x2bb39898 - .long 0x22331111 - .long 0xd2bb6969 - .long 0xa970d9d9 - .long 0x07898e8e - .long 0x33a79494 - .long 0x2db69b9b - .long 0x3c221e1e - .long 0x15928787 - .long 0xc920e9e9 - .long 0x8749cece - .long 0xaaff5555 - .long 0x50782828 - .long 0xa57adfdf - .long 0x038f8c8c - .long 0x59f8a1a1 - .long 0x09808989 - .long 0x1a170d0d - .long 0x65dabfbf - .long 0xd731e6e6 - .long 0x84c64242 - .long 0xd0b86868 - .long 0x82c34141 - .long 0x29b09999 - .long 0x5a772d2d - .long 0x1e110f0f - .long 0x7bcbb0b0 - .long 0xa8fc5454 - .long 0x6dd6bbbb - .long 0x2c3a1616 - - -// Tables for main decryption iterations. - .globl _AESDecryptTable - .private_extern _AESDecryptTable - .align 2 -_AESDecryptTable: - // Table 0. 
- .long 0x50a7f451 - .long 0x5365417e - .long 0xc3a4171a - .long 0x965e273a - .long 0xcb6bab3b - .long 0xf1459d1f - .long 0xab58faac - .long 0x9303e34b - .long 0x55fa3020 - .long 0xf66d76ad - .long 0x9176cc88 - .long 0x254c02f5 - .long 0xfcd7e54f - .long 0xd7cb2ac5 - .long 0x80443526 - .long 0x8fa362b5 - .long 0x495ab1de - .long 0x671bba25 - .long 0x980eea45 - .long 0xe1c0fe5d - .long 0x02752fc3 - .long 0x12f04c81 - .long 0xa397468d - .long 0xc6f9d36b - .long 0xe75f8f03 - .long 0x959c9215 - .long 0xeb7a6dbf - .long 0xda595295 - .long 0x2d83bed4 - .long 0xd3217458 - .long 0x2969e049 - .long 0x44c8c98e - .long 0x6a89c275 - .long 0x78798ef4 - .long 0x6b3e5899 - .long 0xdd71b927 - .long 0xb64fe1be - .long 0x17ad88f0 - .long 0x66ac20c9 - .long 0xb43ace7d - .long 0x184adf63 - .long 0x82311ae5 - .long 0x60335197 - .long 0x457f5362 - .long 0xe07764b1 - .long 0x84ae6bbb - .long 0x1ca081fe - .long 0x942b08f9 - .long 0x58684870 - .long 0x19fd458f - .long 0x876cde94 - .long 0xb7f87b52 - .long 0x23d373ab - .long 0xe2024b72 - .long 0x578f1fe3 - .long 0x2aab5566 - .long 0x0728ebb2 - .long 0x03c2b52f - .long 0x9a7bc586 - .long 0xa50837d3 - .long 0xf2872830 - .long 0xb2a5bf23 - .long 0xba6a0302 - .long 0x5c8216ed - .long 0x2b1ccf8a - .long 0x92b479a7 - .long 0xf0f207f3 - .long 0xa1e2694e - .long 0xcdf4da65 - .long 0xd5be0506 - .long 0x1f6234d1 - .long 0x8afea6c4 - .long 0x9d532e34 - .long 0xa055f3a2 - .long 0x32e18a05 - .long 0x75ebf6a4 - .long 0x39ec830b - .long 0xaaef6040 - .long 0x069f715e - .long 0x51106ebd - .long 0xf98a213e - .long 0x3d06dd96 - .long 0xae053edd - .long 0x46bde64d - .long 0xb58d5491 - .long 0x055dc471 - .long 0x6fd40604 - .long 0xff155060 - .long 0x24fb9819 - .long 0x97e9bdd6 - .long 0xcc434089 - .long 0x779ed967 - .long 0xbd42e8b0 - .long 0x888b8907 - .long 0x385b19e7 - .long 0xdbeec879 - .long 0x470a7ca1 - .long 0xe90f427c - .long 0xc91e84f8 - .long 0x00000000 - .long 0x83868009 - .long 0x48ed2b32 - .long 0xac70111e - .long 0x4e725a6c - .long 0xfbff0efd - .long 0x5638850f - .long 0x1ed5ae3d - .long 0x27392d36 - .long 0x64d90f0a - .long 0x21a65c68 - .long 0xd1545b9b - .long 0x3a2e3624 - .long 0xb1670a0c - .long 0x0fe75793 - .long 0xd296eeb4 - .long 0x9e919b1b - .long 0x4fc5c080 - .long 0xa220dc61 - .long 0x694b775a - .long 0x161a121c - .long 0x0aba93e2 - .long 0xe52aa0c0 - .long 0x43e0223c - .long 0x1d171b12 - .long 0x0b0d090e - .long 0xadc78bf2 - .long 0xb9a8b62d - .long 0xc8a91e14 - .long 0x8519f157 - .long 0x4c0775af - .long 0xbbdd99ee - .long 0xfd607fa3 - .long 0x9f2601f7 - .long 0xbcf5725c - .long 0xc53b6644 - .long 0x347efb5b - .long 0x7629438b - .long 0xdcc623cb - .long 0x68fcedb6 - .long 0x63f1e4b8 - .long 0xcadc31d7 - .long 0x10856342 - .long 0x40229713 - .long 0x2011c684 - .long 0x7d244a85 - .long 0xf83dbbd2 - .long 0x1132f9ae - .long 0x6da129c7 - .long 0x4b2f9e1d - .long 0xf330b2dc - .long 0xec52860d - .long 0xd0e3c177 - .long 0x6c16b32b - .long 0x99b970a9 - .long 0xfa489411 - .long 0x2264e947 - .long 0xc48cfca8 - .long 0x1a3ff0a0 - .long 0xd82c7d56 - .long 0xef903322 - .long 0xc74e4987 - .long 0xc1d138d9 - .long 0xfea2ca8c - .long 0x360bd498 - .long 0xcf81f5a6 - .long 0x28de7aa5 - .long 0x268eb7da - .long 0xa4bfad3f - .long 0xe49d3a2c - .long 0x0d927850 - .long 0x9bcc5f6a - .long 0x62467e54 - .long 0xc2138df6 - .long 0xe8b8d890 - .long 0x5ef7392e - .long 0xf5afc382 - .long 0xbe805d9f - .long 0x7c93d069 - .long 0xa92dd56f - .long 0xb31225cf - .long 0x3b99acc8 - .long 0xa77d1810 - .long 0x6e639ce8 - .long 0x7bbb3bdb - .long 0x097826cd - .long 0xf418596e - .long 0x01b79aec - 
.long 0xa89a4f83 - .long 0x656e95e6 - .long 0x7ee6ffaa - .long 0x08cfbc21 - .long 0xe6e815ef - .long 0xd99be7ba - .long 0xce366f4a - .long 0xd4099fea - .long 0xd67cb029 - .long 0xafb2a431 - .long 0x31233f2a - .long 0x3094a5c6 - .long 0xc066a235 - .long 0x37bc4e74 - .long 0xa6ca82fc - .long 0xb0d090e0 - .long 0x15d8a733 - .long 0x4a9804f1 - .long 0xf7daec41 - .long 0x0e50cd7f - .long 0x2ff69117 - .long 0x8dd64d76 - .long 0x4db0ef43 - .long 0x544daacc - .long 0xdf0496e4 - .long 0xe3b5d19e - .long 0x1b886a4c - .long 0xb81f2cc1 - .long 0x7f516546 - .long 0x04ea5e9d - .long 0x5d358c01 - .long 0x737487fa - .long 0x2e410bfb - .long 0x5a1d67b3 - .long 0x52d2db92 - .long 0x335610e9 - .long 0x1347d66d - .long 0x8c61d79a - .long 0x7a0ca137 - .long 0x8e14f859 - .long 0x893c13eb - .long 0xee27a9ce - .long 0x35c961b7 - .long 0xede51ce1 - .long 0x3cb1477a - .long 0x59dfd29c - .long 0x3f73f255 - .long 0x79ce1418 - .long 0xbf37c773 - .long 0xeacdf753 - .long 0x5baafd5f - .long 0x146f3ddf - .long 0x86db4478 - .long 0x81f3afca - .long 0x3ec468b9 - .long 0x2c342438 - .long 0x5f40a3c2 - .long 0x72c31d16 - .long 0x0c25e2bc - .long 0x8b493c28 - .long 0x41950dff - .long 0x7101a839 - .long 0xdeb30c08 - .long 0x9ce4b4d8 - .long 0x90c15664 - .long 0x6184cb7b - .long 0x70b632d5 - .long 0x745c6c48 - .long 0x4257b8d0 - // Table 1. - .long 0xa7f45150 - .long 0x65417e53 - .long 0xa4171ac3 - .long 0x5e273a96 - .long 0x6bab3bcb - .long 0x459d1ff1 - .long 0x58faacab - .long 0x03e34b93 - .long 0xfa302055 - .long 0x6d76adf6 - .long 0x76cc8891 - .long 0x4c02f525 - .long 0xd7e54ffc - .long 0xcb2ac5d7 - .long 0x44352680 - .long 0xa362b58f - .long 0x5ab1de49 - .long 0x1bba2567 - .long 0x0eea4598 - .long 0xc0fe5de1 - .long 0x752fc302 - .long 0xf04c8112 - .long 0x97468da3 - .long 0xf9d36bc6 - .long 0x5f8f03e7 - .long 0x9c921595 - .long 0x7a6dbfeb - .long 0x595295da - .long 0x83bed42d - .long 0x217458d3 - .long 0x69e04929 - .long 0xc8c98e44 - .long 0x89c2756a - .long 0x798ef478 - .long 0x3e58996b - .long 0x71b927dd - .long 0x4fe1beb6 - .long 0xad88f017 - .long 0xac20c966 - .long 0x3ace7db4 - .long 0x4adf6318 - .long 0x311ae582 - .long 0x33519760 - .long 0x7f536245 - .long 0x7764b1e0 - .long 0xae6bbb84 - .long 0xa081fe1c - .long 0x2b08f994 - .long 0x68487058 - .long 0xfd458f19 - .long 0x6cde9487 - .long 0xf87b52b7 - .long 0xd373ab23 - .long 0x024b72e2 - .long 0x8f1fe357 - .long 0xab55662a - .long 0x28ebb207 - .long 0xc2b52f03 - .long 0x7bc5869a - .long 0x0837d3a5 - .long 0x872830f2 - .long 0xa5bf23b2 - .long 0x6a0302ba - .long 0x8216ed5c - .long 0x1ccf8a2b - .long 0xb479a792 - .long 0xf207f3f0 - .long 0xe2694ea1 - .long 0xf4da65cd - .long 0xbe0506d5 - .long 0x6234d11f - .long 0xfea6c48a - .long 0x532e349d - .long 0x55f3a2a0 - .long 0xe18a0532 - .long 0xebf6a475 - .long 0xec830b39 - .long 0xef6040aa - .long 0x9f715e06 - .long 0x106ebd51 - .long 0x8a213ef9 - .long 0x06dd963d - .long 0x053eddae - .long 0xbde64d46 - .long 0x8d5491b5 - .long 0x5dc47105 - .long 0xd406046f - .long 0x155060ff - .long 0xfb981924 - .long 0xe9bdd697 - .long 0x434089cc - .long 0x9ed96777 - .long 0x42e8b0bd - .long 0x8b890788 - .long 0x5b19e738 - .long 0xeec879db - .long 0x0a7ca147 - .long 0x0f427ce9 - .long 0x1e84f8c9 - .long 0x00000000 - .long 0x86800983 - .long 0xed2b3248 - .long 0x70111eac - .long 0x725a6c4e - .long 0xff0efdfb - .long 0x38850f56 - .long 0xd5ae3d1e - .long 0x392d3627 - .long 0xd90f0a64 - .long 0xa65c6821 - .long 0x545b9bd1 - .long 0x2e36243a - .long 0x670a0cb1 - .long 0xe757930f - .long 0x96eeb4d2 - .long 0x919b1b9e - .long 0xc5c0804f - .long 
0x20dc61a2 - .long 0x4b775a69 - .long 0x1a121c16 - .long 0xba93e20a - .long 0x2aa0c0e5 - .long 0xe0223c43 - .long 0x171b121d - .long 0x0d090e0b - .long 0xc78bf2ad - .long 0xa8b62db9 - .long 0xa91e14c8 - .long 0x19f15785 - .long 0x0775af4c - .long 0xdd99eebb - .long 0x607fa3fd - .long 0x2601f79f - .long 0xf5725cbc - .long 0x3b6644c5 - .long 0x7efb5b34 - .long 0x29438b76 - .long 0xc623cbdc - .long 0xfcedb668 - .long 0xf1e4b863 - .long 0xdc31d7ca - .long 0x85634210 - .long 0x22971340 - .long 0x11c68420 - .long 0x244a857d - .long 0x3dbbd2f8 - .long 0x32f9ae11 - .long 0xa129c76d - .long 0x2f9e1d4b - .long 0x30b2dcf3 - .long 0x52860dec - .long 0xe3c177d0 - .long 0x16b32b6c - .long 0xb970a999 - .long 0x489411fa - .long 0x64e94722 - .long 0x8cfca8c4 - .long 0x3ff0a01a - .long 0x2c7d56d8 - .long 0x903322ef - .long 0x4e4987c7 - .long 0xd138d9c1 - .long 0xa2ca8cfe - .long 0x0bd49836 - .long 0x81f5a6cf - .long 0xde7aa528 - .long 0x8eb7da26 - .long 0xbfad3fa4 - .long 0x9d3a2ce4 - .long 0x9278500d - .long 0xcc5f6a9b - .long 0x467e5462 - .long 0x138df6c2 - .long 0xb8d890e8 - .long 0xf7392e5e - .long 0xafc382f5 - .long 0x805d9fbe - .long 0x93d0697c - .long 0x2dd56fa9 - .long 0x1225cfb3 - .long 0x99acc83b - .long 0x7d1810a7 - .long 0x639ce86e - .long 0xbb3bdb7b - .long 0x7826cd09 - .long 0x18596ef4 - .long 0xb79aec01 - .long 0x9a4f83a8 - .long 0x6e95e665 - .long 0xe6ffaa7e - .long 0xcfbc2108 - .long 0xe815efe6 - .long 0x9be7bad9 - .long 0x366f4ace - .long 0x099fead4 - .long 0x7cb029d6 - .long 0xb2a431af - .long 0x233f2a31 - .long 0x94a5c630 - .long 0x66a235c0 - .long 0xbc4e7437 - .long 0xca82fca6 - .long 0xd090e0b0 - .long 0xd8a73315 - .long 0x9804f14a - .long 0xdaec41f7 - .long 0x50cd7f0e - .long 0xf691172f - .long 0xd64d768d - .long 0xb0ef434d - .long 0x4daacc54 - .long 0x0496e4df - .long 0xb5d19ee3 - .long 0x886a4c1b - .long 0x1f2cc1b8 - .long 0x5165467f - .long 0xea5e9d04 - .long 0x358c015d - .long 0x7487fa73 - .long 0x410bfb2e - .long 0x1d67b35a - .long 0xd2db9252 - .long 0x5610e933 - .long 0x47d66d13 - .long 0x61d79a8c - .long 0x0ca1377a - .long 0x14f8598e - .long 0x3c13eb89 - .long 0x27a9ceee - .long 0xc961b735 - .long 0xe51ce1ed - .long 0xb1477a3c - .long 0xdfd29c59 - .long 0x73f2553f - .long 0xce141879 - .long 0x37c773bf - .long 0xcdf753ea - .long 0xaafd5f5b - .long 0x6f3ddf14 - .long 0xdb447886 - .long 0xf3afca81 - .long 0xc468b93e - .long 0x3424382c - .long 0x40a3c25f - .long 0xc31d1672 - .long 0x25e2bc0c - .long 0x493c288b - .long 0x950dff41 - .long 0x01a83971 - .long 0xb30c08de - .long 0xe4b4d89c - .long 0xc1566490 - .long 0x84cb7b61 - .long 0xb632d570 - .long 0x5c6c4874 - .long 0x57b8d042 - // Table 2. 
- .long 0xf45150a7 - .long 0x417e5365 - .long 0x171ac3a4 - .long 0x273a965e - .long 0xab3bcb6b - .long 0x9d1ff145 - .long 0xfaacab58 - .long 0xe34b9303 - .long 0x302055fa - .long 0x76adf66d - .long 0xcc889176 - .long 0x02f5254c - .long 0xe54ffcd7 - .long 0x2ac5d7cb - .long 0x35268044 - .long 0x62b58fa3 - .long 0xb1de495a - .long 0xba25671b - .long 0xea45980e - .long 0xfe5de1c0 - .long 0x2fc30275 - .long 0x4c8112f0 - .long 0x468da397 - .long 0xd36bc6f9 - .long 0x8f03e75f - .long 0x9215959c - .long 0x6dbfeb7a - .long 0x5295da59 - .long 0xbed42d83 - .long 0x7458d321 - .long 0xe0492969 - .long 0xc98e44c8 - .long 0xc2756a89 - .long 0x8ef47879 - .long 0x58996b3e - .long 0xb927dd71 - .long 0xe1beb64f - .long 0x88f017ad - .long 0x20c966ac - .long 0xce7db43a - .long 0xdf63184a - .long 0x1ae58231 - .long 0x51976033 - .long 0x5362457f - .long 0x64b1e077 - .long 0x6bbb84ae - .long 0x81fe1ca0 - .long 0x08f9942b - .long 0x48705868 - .long 0x458f19fd - .long 0xde94876c - .long 0x7b52b7f8 - .long 0x73ab23d3 - .long 0x4b72e202 - .long 0x1fe3578f - .long 0x55662aab - .long 0xebb20728 - .long 0xb52f03c2 - .long 0xc5869a7b - .long 0x37d3a508 - .long 0x2830f287 - .long 0xbf23b2a5 - .long 0x0302ba6a - .long 0x16ed5c82 - .long 0xcf8a2b1c - .long 0x79a792b4 - .long 0x07f3f0f2 - .long 0x694ea1e2 - .long 0xda65cdf4 - .long 0x0506d5be - .long 0x34d11f62 - .long 0xa6c48afe - .long 0x2e349d53 - .long 0xf3a2a055 - .long 0x8a0532e1 - .long 0xf6a475eb - .long 0x830b39ec - .long 0x6040aaef - .long 0x715e069f - .long 0x6ebd5110 - .long 0x213ef98a - .long 0xdd963d06 - .long 0x3eddae05 - .long 0xe64d46bd - .long 0x5491b58d - .long 0xc471055d - .long 0x06046fd4 - .long 0x5060ff15 - .long 0x981924fb - .long 0xbdd697e9 - .long 0x4089cc43 - .long 0xd967779e - .long 0xe8b0bd42 - .long 0x8907888b - .long 0x19e7385b - .long 0xc879dbee - .long 0x7ca1470a - .long 0x427ce90f - .long 0x84f8c91e - .long 0x00000000 - .long 0x80098386 - .long 0x2b3248ed - .long 0x111eac70 - .long 0x5a6c4e72 - .long 0x0efdfbff - .long 0x850f5638 - .long 0xae3d1ed5 - .long 0x2d362739 - .long 0x0f0a64d9 - .long 0x5c6821a6 - .long 0x5b9bd154 - .long 0x36243a2e - .long 0x0a0cb167 - .long 0x57930fe7 - .long 0xeeb4d296 - .long 0x9b1b9e91 - .long 0xc0804fc5 - .long 0xdc61a220 - .long 0x775a694b - .long 0x121c161a - .long 0x93e20aba - .long 0xa0c0e52a - .long 0x223c43e0 - .long 0x1b121d17 - .long 0x090e0b0d - .long 0x8bf2adc7 - .long 0xb62db9a8 - .long 0x1e14c8a9 - .long 0xf1578519 - .long 0x75af4c07 - .long 0x99eebbdd - .long 0x7fa3fd60 - .long 0x01f79f26 - .long 0x725cbcf5 - .long 0x6644c53b - .long 0xfb5b347e - .long 0x438b7629 - .long 0x23cbdcc6 - .long 0xedb668fc - .long 0xe4b863f1 - .long 0x31d7cadc - .long 0x63421085 - .long 0x97134022 - .long 0xc6842011 - .long 0x4a857d24 - .long 0xbbd2f83d - .long 0xf9ae1132 - .long 0x29c76da1 - .long 0x9e1d4b2f - .long 0xb2dcf330 - .long 0x860dec52 - .long 0xc177d0e3 - .long 0xb32b6c16 - .long 0x70a999b9 - .long 0x9411fa48 - .long 0xe9472264 - .long 0xfca8c48c - .long 0xf0a01a3f - .long 0x7d56d82c - .long 0x3322ef90 - .long 0x4987c74e - .long 0x38d9c1d1 - .long 0xca8cfea2 - .long 0xd498360b - .long 0xf5a6cf81 - .long 0x7aa528de - .long 0xb7da268e - .long 0xad3fa4bf - .long 0x3a2ce49d - .long 0x78500d92 - .long 0x5f6a9bcc - .long 0x7e546246 - .long 0x8df6c213 - .long 0xd890e8b8 - .long 0x392e5ef7 - .long 0xc382f5af - .long 0x5d9fbe80 - .long 0xd0697c93 - .long 0xd56fa92d - .long 0x25cfb312 - .long 0xacc83b99 - .long 0x1810a77d - .long 0x9ce86e63 - .long 0x3bdb7bbb - .long 0x26cd0978 - .long 0x596ef418 - .long 0x9aec01b7 - 
.long 0x4f83a89a - .long 0x95e6656e - .long 0xffaa7ee6 - .long 0xbc2108cf - .long 0x15efe6e8 - .long 0xe7bad99b - .long 0x6f4ace36 - .long 0x9fead409 - .long 0xb029d67c - .long 0xa431afb2 - .long 0x3f2a3123 - .long 0xa5c63094 - .long 0xa235c066 - .long 0x4e7437bc - .long 0x82fca6ca - .long 0x90e0b0d0 - .long 0xa73315d8 - .long 0x04f14a98 - .long 0xec41f7da - .long 0xcd7f0e50 - .long 0x91172ff6 - .long 0x4d768dd6 - .long 0xef434db0 - .long 0xaacc544d - .long 0x96e4df04 - .long 0xd19ee3b5 - .long 0x6a4c1b88 - .long 0x2cc1b81f - .long 0x65467f51 - .long 0x5e9d04ea - .long 0x8c015d35 - .long 0x87fa7374 - .long 0x0bfb2e41 - .long 0x67b35a1d - .long 0xdb9252d2 - .long 0x10e93356 - .long 0xd66d1347 - .long 0xd79a8c61 - .long 0xa1377a0c - .long 0xf8598e14 - .long 0x13eb893c - .long 0xa9ceee27 - .long 0x61b735c9 - .long 0x1ce1ede5 - .long 0x477a3cb1 - .long 0xd29c59df - .long 0xf2553f73 - .long 0x141879ce - .long 0xc773bf37 - .long 0xf753eacd - .long 0xfd5f5baa - .long 0x3ddf146f - .long 0x447886db - .long 0xafca81f3 - .long 0x68b93ec4 - .long 0x24382c34 - .long 0xa3c25f40 - .long 0x1d1672c3 - .long 0xe2bc0c25 - .long 0x3c288b49 - .long 0x0dff4195 - .long 0xa8397101 - .long 0x0c08deb3 - .long 0xb4d89ce4 - .long 0x566490c1 - .long 0xcb7b6184 - .long 0x32d570b6 - .long 0x6c48745c - .long 0xb8d04257 - // Table 3. - .long 0x5150a7f4 - .long 0x7e536541 - .long 0x1ac3a417 - .long 0x3a965e27 - .long 0x3bcb6bab - .long 0x1ff1459d - .long 0xacab58fa - .long 0x4b9303e3 - .long 0x2055fa30 - .long 0xadf66d76 - .long 0x889176cc - .long 0xf5254c02 - .long 0x4ffcd7e5 - .long 0xc5d7cb2a - .long 0x26804435 - .long 0xb58fa362 - .long 0xde495ab1 - .long 0x25671bba - .long 0x45980eea - .long 0x5de1c0fe - .long 0xc302752f - .long 0x8112f04c - .long 0x8da39746 - .long 0x6bc6f9d3 - .long 0x03e75f8f - .long 0x15959c92 - .long 0xbfeb7a6d - .long 0x95da5952 - .long 0xd42d83be - .long 0x58d32174 - .long 0x492969e0 - .long 0x8e44c8c9 - .long 0x756a89c2 - .long 0xf478798e - .long 0x996b3e58 - .long 0x27dd71b9 - .long 0xbeb64fe1 - .long 0xf017ad88 - .long 0xc966ac20 - .long 0x7db43ace - .long 0x63184adf - .long 0xe582311a - .long 0x97603351 - .long 0x62457f53 - .long 0xb1e07764 - .long 0xbb84ae6b - .long 0xfe1ca081 - .long 0xf9942b08 - .long 0x70586848 - .long 0x8f19fd45 - .long 0x94876cde - .long 0x52b7f87b - .long 0xab23d373 - .long 0x72e2024b - .long 0xe3578f1f - .long 0x662aab55 - .long 0xb20728eb - .long 0x2f03c2b5 - .long 0x869a7bc5 - .long 0xd3a50837 - .long 0x30f28728 - .long 0x23b2a5bf - .long 0x02ba6a03 - .long 0xed5c8216 - .long 0x8a2b1ccf - .long 0xa792b479 - .long 0xf3f0f207 - .long 0x4ea1e269 - .long 0x65cdf4da - .long 0x06d5be05 - .long 0xd11f6234 - .long 0xc48afea6 - .long 0x349d532e - .long 0xa2a055f3 - .long 0x0532e18a - .long 0xa475ebf6 - .long 0x0b39ec83 - .long 0x40aaef60 - .long 0x5e069f71 - .long 0xbd51106e - .long 0x3ef98a21 - .long 0x963d06dd - .long 0xddae053e - .long 0x4d46bde6 - .long 0x91b58d54 - .long 0x71055dc4 - .long 0x046fd406 - .long 0x60ff1550 - .long 0x1924fb98 - .long 0xd697e9bd - .long 0x89cc4340 - .long 0x67779ed9 - .long 0xb0bd42e8 - .long 0x07888b89 - .long 0xe7385b19 - .long 0x79dbeec8 - .long 0xa1470a7c - .long 0x7ce90f42 - .long 0xf8c91e84 - .long 0x00000000 - .long 0x09838680 - .long 0x3248ed2b - .long 0x1eac7011 - .long 0x6c4e725a - .long 0xfdfbff0e - .long 0x0f563885 - .long 0x3d1ed5ae - .long 0x3627392d - .long 0x0a64d90f - .long 0x6821a65c - .long 0x9bd1545b - .long 0x243a2e36 - .long 0x0cb1670a - .long 0x930fe757 - .long 0xb4d296ee - .long 0x1b9e919b - .long 0x804fc5c0 - .long 
0x61a220dc - .long 0x5a694b77 - .long 0x1c161a12 - .long 0xe20aba93 - .long 0xc0e52aa0 - .long 0x3c43e022 - .long 0x121d171b - .long 0x0e0b0d09 - .long 0xf2adc78b - .long 0x2db9a8b6 - .long 0x14c8a91e - .long 0x578519f1 - .long 0xaf4c0775 - .long 0xeebbdd99 - .long 0xa3fd607f - .long 0xf79f2601 - .long 0x5cbcf572 - .long 0x44c53b66 - .long 0x5b347efb - .long 0x8b762943 - .long 0xcbdcc623 - .long 0xb668fced - .long 0xb863f1e4 - .long 0xd7cadc31 - .long 0x42108563 - .long 0x13402297 - .long 0x842011c6 - .long 0x857d244a - .long 0xd2f83dbb - .long 0xae1132f9 - .long 0xc76da129 - .long 0x1d4b2f9e - .long 0xdcf330b2 - .long 0x0dec5286 - .long 0x77d0e3c1 - .long 0x2b6c16b3 - .long 0xa999b970 - .long 0x11fa4894 - .long 0x472264e9 - .long 0xa8c48cfc - .long 0xa01a3ff0 - .long 0x56d82c7d - .long 0x22ef9033 - .long 0x87c74e49 - .long 0xd9c1d138 - .long 0x8cfea2ca - .long 0x98360bd4 - .long 0xa6cf81f5 - .long 0xa528de7a - .long 0xda268eb7 - .long 0x3fa4bfad - .long 0x2ce49d3a - .long 0x500d9278 - .long 0x6a9bcc5f - .long 0x5462467e - .long 0xf6c2138d - .long 0x90e8b8d8 - .long 0x2e5ef739 - .long 0x82f5afc3 - .long 0x9fbe805d - .long 0x697c93d0 - .long 0x6fa92dd5 - .long 0xcfb31225 - .long 0xc83b99ac - .long 0x10a77d18 - .long 0xe86e639c - .long 0xdb7bbb3b - .long 0xcd097826 - .long 0x6ef41859 - .long 0xec01b79a - .long 0x83a89a4f - .long 0xe6656e95 - .long 0xaa7ee6ff - .long 0x2108cfbc - .long 0xefe6e815 - .long 0xbad99be7 - .long 0x4ace366f - .long 0xead4099f - .long 0x29d67cb0 - .long 0x31afb2a4 - .long 0x2a31233f - .long 0xc63094a5 - .long 0x35c066a2 - .long 0x7437bc4e - .long 0xfca6ca82 - .long 0xe0b0d090 - .long 0x3315d8a7 - .long 0xf14a9804 - .long 0x41f7daec - .long 0x7f0e50cd - .long 0x172ff691 - .long 0x768dd64d - .long 0x434db0ef - .long 0xcc544daa - .long 0xe4df0496 - .long 0x9ee3b5d1 - .long 0x4c1b886a - .long 0xc1b81f2c - .long 0x467f5165 - .long 0x9d04ea5e - .long 0x015d358c - .long 0xfa737487 - .long 0xfb2e410b - .long 0xb35a1d67 - .long 0x9252d2db - .long 0xe9335610 - .long 0x6d1347d6 - .long 0x9a8c61d7 - .long 0x377a0ca1 - .long 0x598e14f8 - .long 0xeb893c13 - .long 0xceee27a9 - .long 0xb735c961 - .long 0xe1ede51c - .long 0x7a3cb147 - .long 0x9c59dfd2 - .long 0x553f73f2 - .long 0x1879ce14 - .long 0x73bf37c7 - .long 0x53eacdf7 - .long 0x5f5baafd - .long 0xdf146f3d - .long 0x7886db44 - .long 0xca81f3af - .long 0xb93ec468 - .long 0x382c3424 - .long 0xc25f40a3 - .long 0x1672c31d - .long 0xbc0c25e2 - .long 0x288b493c - .long 0xff41950d - .long 0x397101a8 - .long 0x08deb30c - .long 0xd89ce4b4 - .long 0x6490c156 - .long 0x7b6184cb - .long 0xd570b632 - .long 0x48745c6c - .long 0xd04257b8 - - -// SubBytes embedded in words tables. - .globl _AESSubBytesWordTable - .private_extern _AESSubBytesWordTable - .align 2 -_AESSubBytesWordTable: - // Table 0. 
- .long 0x00000063 - .long 0x0000007c - .long 0x00000077 - .long 0x0000007b - .long 0x000000f2 - .long 0x0000006b - .long 0x0000006f - .long 0x000000c5 - .long 0x00000030 - .long 0x00000001 - .long 0x00000067 - .long 0x0000002b - .long 0x000000fe - .long 0x000000d7 - .long 0x000000ab - .long 0x00000076 - .long 0x000000ca - .long 0x00000082 - .long 0x000000c9 - .long 0x0000007d - .long 0x000000fa - .long 0x00000059 - .long 0x00000047 - .long 0x000000f0 - .long 0x000000ad - .long 0x000000d4 - .long 0x000000a2 - .long 0x000000af - .long 0x0000009c - .long 0x000000a4 - .long 0x00000072 - .long 0x000000c0 - .long 0x000000b7 - .long 0x000000fd - .long 0x00000093 - .long 0x00000026 - .long 0x00000036 - .long 0x0000003f - .long 0x000000f7 - .long 0x000000cc - .long 0x00000034 - .long 0x000000a5 - .long 0x000000e5 - .long 0x000000f1 - .long 0x00000071 - .long 0x000000d8 - .long 0x00000031 - .long 0x00000015 - .long 0x00000004 - .long 0x000000c7 - .long 0x00000023 - .long 0x000000c3 - .long 0x00000018 - .long 0x00000096 - .long 0x00000005 - .long 0x0000009a - .long 0x00000007 - .long 0x00000012 - .long 0x00000080 - .long 0x000000e2 - .long 0x000000eb - .long 0x00000027 - .long 0x000000b2 - .long 0x00000075 - .long 0x00000009 - .long 0x00000083 - .long 0x0000002c - .long 0x0000001a - .long 0x0000001b - .long 0x0000006e - .long 0x0000005a - .long 0x000000a0 - .long 0x00000052 - .long 0x0000003b - .long 0x000000d6 - .long 0x000000b3 - .long 0x00000029 - .long 0x000000e3 - .long 0x0000002f - .long 0x00000084 - .long 0x00000053 - .long 0x000000d1 - .long 0x00000000 - .long 0x000000ed - .long 0x00000020 - .long 0x000000fc - .long 0x000000b1 - .long 0x0000005b - .long 0x0000006a - .long 0x000000cb - .long 0x000000be - .long 0x00000039 - .long 0x0000004a - .long 0x0000004c - .long 0x00000058 - .long 0x000000cf - .long 0x000000d0 - .long 0x000000ef - .long 0x000000aa - .long 0x000000fb - .long 0x00000043 - .long 0x0000004d - .long 0x00000033 - .long 0x00000085 - .long 0x00000045 - .long 0x000000f9 - .long 0x00000002 - .long 0x0000007f - .long 0x00000050 - .long 0x0000003c - .long 0x0000009f - .long 0x000000a8 - .long 0x00000051 - .long 0x000000a3 - .long 0x00000040 - .long 0x0000008f - .long 0x00000092 - .long 0x0000009d - .long 0x00000038 - .long 0x000000f5 - .long 0x000000bc - .long 0x000000b6 - .long 0x000000da - .long 0x00000021 - .long 0x00000010 - .long 0x000000ff - .long 0x000000f3 - .long 0x000000d2 - .long 0x000000cd - .long 0x0000000c - .long 0x00000013 - .long 0x000000ec - .long 0x0000005f - .long 0x00000097 - .long 0x00000044 - .long 0x00000017 - .long 0x000000c4 - .long 0x000000a7 - .long 0x0000007e - .long 0x0000003d - .long 0x00000064 - .long 0x0000005d - .long 0x00000019 - .long 0x00000073 - .long 0x00000060 - .long 0x00000081 - .long 0x0000004f - .long 0x000000dc - .long 0x00000022 - .long 0x0000002a - .long 0x00000090 - .long 0x00000088 - .long 0x00000046 - .long 0x000000ee - .long 0x000000b8 - .long 0x00000014 - .long 0x000000de - .long 0x0000005e - .long 0x0000000b - .long 0x000000db - .long 0x000000e0 - .long 0x00000032 - .long 0x0000003a - .long 0x0000000a - .long 0x00000049 - .long 0x00000006 - .long 0x00000024 - .long 0x0000005c - .long 0x000000c2 - .long 0x000000d3 - .long 0x000000ac - .long 0x00000062 - .long 0x00000091 - .long 0x00000095 - .long 0x000000e4 - .long 0x00000079 - .long 0x000000e7 - .long 0x000000c8 - .long 0x00000037 - .long 0x0000006d - .long 0x0000008d - .long 0x000000d5 - .long 0x0000004e - .long 0x000000a9 - .long 0x0000006c - .long 0x00000056 - .long 0x000000f4 - 
.long 0x000000ea - .long 0x00000065 - .long 0x0000007a - .long 0x000000ae - .long 0x00000008 - .long 0x000000ba - .long 0x00000078 - .long 0x00000025 - .long 0x0000002e - .long 0x0000001c - .long 0x000000a6 - .long 0x000000b4 - .long 0x000000c6 - .long 0x000000e8 - .long 0x000000dd - .long 0x00000074 - .long 0x0000001f - .long 0x0000004b - .long 0x000000bd - .long 0x0000008b - .long 0x0000008a - .long 0x00000070 - .long 0x0000003e - .long 0x000000b5 - .long 0x00000066 - .long 0x00000048 - .long 0x00000003 - .long 0x000000f6 - .long 0x0000000e - .long 0x00000061 - .long 0x00000035 - .long 0x00000057 - .long 0x000000b9 - .long 0x00000086 - .long 0x000000c1 - .long 0x0000001d - .long 0x0000009e - .long 0x000000e1 - .long 0x000000f8 - .long 0x00000098 - .long 0x00000011 - .long 0x00000069 - .long 0x000000d9 - .long 0x0000008e - .long 0x00000094 - .long 0x0000009b - .long 0x0000001e - .long 0x00000087 - .long 0x000000e9 - .long 0x000000ce - .long 0x00000055 - .long 0x00000028 - .long 0x000000df - .long 0x0000008c - .long 0x000000a1 - .long 0x00000089 - .long 0x0000000d - .long 0x000000bf - .long 0x000000e6 - .long 0x00000042 - .long 0x00000068 - .long 0x00000041 - .long 0x00000099 - .long 0x0000002d - .long 0x0000000f - .long 0x000000b0 - .long 0x00000054 - .long 0x000000bb - .long 0x00000016 - // Table 1. - .long 0x00006300 - .long 0x00007c00 - .long 0x00007700 - .long 0x00007b00 - .long 0x0000f200 - .long 0x00006b00 - .long 0x00006f00 - .long 0x0000c500 - .long 0x00003000 - .long 0x00000100 - .long 0x00006700 - .long 0x00002b00 - .long 0x0000fe00 - .long 0x0000d700 - .long 0x0000ab00 - .long 0x00007600 - .long 0x0000ca00 - .long 0x00008200 - .long 0x0000c900 - .long 0x00007d00 - .long 0x0000fa00 - .long 0x00005900 - .long 0x00004700 - .long 0x0000f000 - .long 0x0000ad00 - .long 0x0000d400 - .long 0x0000a200 - .long 0x0000af00 - .long 0x00009c00 - .long 0x0000a400 - .long 0x00007200 - .long 0x0000c000 - .long 0x0000b700 - .long 0x0000fd00 - .long 0x00009300 - .long 0x00002600 - .long 0x00003600 - .long 0x00003f00 - .long 0x0000f700 - .long 0x0000cc00 - .long 0x00003400 - .long 0x0000a500 - .long 0x0000e500 - .long 0x0000f100 - .long 0x00007100 - .long 0x0000d800 - .long 0x00003100 - .long 0x00001500 - .long 0x00000400 - .long 0x0000c700 - .long 0x00002300 - .long 0x0000c300 - .long 0x00001800 - .long 0x00009600 - .long 0x00000500 - .long 0x00009a00 - .long 0x00000700 - .long 0x00001200 - .long 0x00008000 - .long 0x0000e200 - .long 0x0000eb00 - .long 0x00002700 - .long 0x0000b200 - .long 0x00007500 - .long 0x00000900 - .long 0x00008300 - .long 0x00002c00 - .long 0x00001a00 - .long 0x00001b00 - .long 0x00006e00 - .long 0x00005a00 - .long 0x0000a000 - .long 0x00005200 - .long 0x00003b00 - .long 0x0000d600 - .long 0x0000b300 - .long 0x00002900 - .long 0x0000e300 - .long 0x00002f00 - .long 0x00008400 - .long 0x00005300 - .long 0x0000d100 - .long 0x00000000 - .long 0x0000ed00 - .long 0x00002000 - .long 0x0000fc00 - .long 0x0000b100 - .long 0x00005b00 - .long 0x00006a00 - .long 0x0000cb00 - .long 0x0000be00 - .long 0x00003900 - .long 0x00004a00 - .long 0x00004c00 - .long 0x00005800 - .long 0x0000cf00 - .long 0x0000d000 - .long 0x0000ef00 - .long 0x0000aa00 - .long 0x0000fb00 - .long 0x00004300 - .long 0x00004d00 - .long 0x00003300 - .long 0x00008500 - .long 0x00004500 - .long 0x0000f900 - .long 0x00000200 - .long 0x00007f00 - .long 0x00005000 - .long 0x00003c00 - .long 0x00009f00 - .long 0x0000a800 - .long 0x00005100 - .long 0x0000a300 - .long 0x00004000 - .long 0x00008f00 - .long 0x00009200 - .long 
0x00009d00 - .long 0x00003800 - .long 0x0000f500 - .long 0x0000bc00 - .long 0x0000b600 - .long 0x0000da00 - .long 0x00002100 - .long 0x00001000 - .long 0x0000ff00 - .long 0x0000f300 - .long 0x0000d200 - .long 0x0000cd00 - .long 0x00000c00 - .long 0x00001300 - .long 0x0000ec00 - .long 0x00005f00 - .long 0x00009700 - .long 0x00004400 - .long 0x00001700 - .long 0x0000c400 - .long 0x0000a700 - .long 0x00007e00 - .long 0x00003d00 - .long 0x00006400 - .long 0x00005d00 - .long 0x00001900 - .long 0x00007300 - .long 0x00006000 - .long 0x00008100 - .long 0x00004f00 - .long 0x0000dc00 - .long 0x00002200 - .long 0x00002a00 - .long 0x00009000 - .long 0x00008800 - .long 0x00004600 - .long 0x0000ee00 - .long 0x0000b800 - .long 0x00001400 - .long 0x0000de00 - .long 0x00005e00 - .long 0x00000b00 - .long 0x0000db00 - .long 0x0000e000 - .long 0x00003200 - .long 0x00003a00 - .long 0x00000a00 - .long 0x00004900 - .long 0x00000600 - .long 0x00002400 - .long 0x00005c00 - .long 0x0000c200 - .long 0x0000d300 - .long 0x0000ac00 - .long 0x00006200 - .long 0x00009100 - .long 0x00009500 - .long 0x0000e400 - .long 0x00007900 - .long 0x0000e700 - .long 0x0000c800 - .long 0x00003700 - .long 0x00006d00 - .long 0x00008d00 - .long 0x0000d500 - .long 0x00004e00 - .long 0x0000a900 - .long 0x00006c00 - .long 0x00005600 - .long 0x0000f400 - .long 0x0000ea00 - .long 0x00006500 - .long 0x00007a00 - .long 0x0000ae00 - .long 0x00000800 - .long 0x0000ba00 - .long 0x00007800 - .long 0x00002500 - .long 0x00002e00 - .long 0x00001c00 - .long 0x0000a600 - .long 0x0000b400 - .long 0x0000c600 - .long 0x0000e800 - .long 0x0000dd00 - .long 0x00007400 - .long 0x00001f00 - .long 0x00004b00 - .long 0x0000bd00 - .long 0x00008b00 - .long 0x00008a00 - .long 0x00007000 - .long 0x00003e00 - .long 0x0000b500 - .long 0x00006600 - .long 0x00004800 - .long 0x00000300 - .long 0x0000f600 - .long 0x00000e00 - .long 0x00006100 - .long 0x00003500 - .long 0x00005700 - .long 0x0000b900 - .long 0x00008600 - .long 0x0000c100 - .long 0x00001d00 - .long 0x00009e00 - .long 0x0000e100 - .long 0x0000f800 - .long 0x00009800 - .long 0x00001100 - .long 0x00006900 - .long 0x0000d900 - .long 0x00008e00 - .long 0x00009400 - .long 0x00009b00 - .long 0x00001e00 - .long 0x00008700 - .long 0x0000e900 - .long 0x0000ce00 - .long 0x00005500 - .long 0x00002800 - .long 0x0000df00 - .long 0x00008c00 - .long 0x0000a100 - .long 0x00008900 - .long 0x00000d00 - .long 0x0000bf00 - .long 0x0000e600 - .long 0x00004200 - .long 0x00006800 - .long 0x00004100 - .long 0x00009900 - .long 0x00002d00 - .long 0x00000f00 - .long 0x0000b000 - .long 0x00005400 - .long 0x0000bb00 - .long 0x00001600 - // Table 2. 
- .long 0x00630000 - .long 0x007c0000 - .long 0x00770000 - .long 0x007b0000 - .long 0x00f20000 - .long 0x006b0000 - .long 0x006f0000 - .long 0x00c50000 - .long 0x00300000 - .long 0x00010000 - .long 0x00670000 - .long 0x002b0000 - .long 0x00fe0000 - .long 0x00d70000 - .long 0x00ab0000 - .long 0x00760000 - .long 0x00ca0000 - .long 0x00820000 - .long 0x00c90000 - .long 0x007d0000 - .long 0x00fa0000 - .long 0x00590000 - .long 0x00470000 - .long 0x00f00000 - .long 0x00ad0000 - .long 0x00d40000 - .long 0x00a20000 - .long 0x00af0000 - .long 0x009c0000 - .long 0x00a40000 - .long 0x00720000 - .long 0x00c00000 - .long 0x00b70000 - .long 0x00fd0000 - .long 0x00930000 - .long 0x00260000 - .long 0x00360000 - .long 0x003f0000 - .long 0x00f70000 - .long 0x00cc0000 - .long 0x00340000 - .long 0x00a50000 - .long 0x00e50000 - .long 0x00f10000 - .long 0x00710000 - .long 0x00d80000 - .long 0x00310000 - .long 0x00150000 - .long 0x00040000 - .long 0x00c70000 - .long 0x00230000 - .long 0x00c30000 - .long 0x00180000 - .long 0x00960000 - .long 0x00050000 - .long 0x009a0000 - .long 0x00070000 - .long 0x00120000 - .long 0x00800000 - .long 0x00e20000 - .long 0x00eb0000 - .long 0x00270000 - .long 0x00b20000 - .long 0x00750000 - .long 0x00090000 - .long 0x00830000 - .long 0x002c0000 - .long 0x001a0000 - .long 0x001b0000 - .long 0x006e0000 - .long 0x005a0000 - .long 0x00a00000 - .long 0x00520000 - .long 0x003b0000 - .long 0x00d60000 - .long 0x00b30000 - .long 0x00290000 - .long 0x00e30000 - .long 0x002f0000 - .long 0x00840000 - .long 0x00530000 - .long 0x00d10000 - .long 0x00000000 - .long 0x00ed0000 - .long 0x00200000 - .long 0x00fc0000 - .long 0x00b10000 - .long 0x005b0000 - .long 0x006a0000 - .long 0x00cb0000 - .long 0x00be0000 - .long 0x00390000 - .long 0x004a0000 - .long 0x004c0000 - .long 0x00580000 - .long 0x00cf0000 - .long 0x00d00000 - .long 0x00ef0000 - .long 0x00aa0000 - .long 0x00fb0000 - .long 0x00430000 - .long 0x004d0000 - .long 0x00330000 - .long 0x00850000 - .long 0x00450000 - .long 0x00f90000 - .long 0x00020000 - .long 0x007f0000 - .long 0x00500000 - .long 0x003c0000 - .long 0x009f0000 - .long 0x00a80000 - .long 0x00510000 - .long 0x00a30000 - .long 0x00400000 - .long 0x008f0000 - .long 0x00920000 - .long 0x009d0000 - .long 0x00380000 - .long 0x00f50000 - .long 0x00bc0000 - .long 0x00b60000 - .long 0x00da0000 - .long 0x00210000 - .long 0x00100000 - .long 0x00ff0000 - .long 0x00f30000 - .long 0x00d20000 - .long 0x00cd0000 - .long 0x000c0000 - .long 0x00130000 - .long 0x00ec0000 - .long 0x005f0000 - .long 0x00970000 - .long 0x00440000 - .long 0x00170000 - .long 0x00c40000 - .long 0x00a70000 - .long 0x007e0000 - .long 0x003d0000 - .long 0x00640000 - .long 0x005d0000 - .long 0x00190000 - .long 0x00730000 - .long 0x00600000 - .long 0x00810000 - .long 0x004f0000 - .long 0x00dc0000 - .long 0x00220000 - .long 0x002a0000 - .long 0x00900000 - .long 0x00880000 - .long 0x00460000 - .long 0x00ee0000 - .long 0x00b80000 - .long 0x00140000 - .long 0x00de0000 - .long 0x005e0000 - .long 0x000b0000 - .long 0x00db0000 - .long 0x00e00000 - .long 0x00320000 - .long 0x003a0000 - .long 0x000a0000 - .long 0x00490000 - .long 0x00060000 - .long 0x00240000 - .long 0x005c0000 - .long 0x00c20000 - .long 0x00d30000 - .long 0x00ac0000 - .long 0x00620000 - .long 0x00910000 - .long 0x00950000 - .long 0x00e40000 - .long 0x00790000 - .long 0x00e70000 - .long 0x00c80000 - .long 0x00370000 - .long 0x006d0000 - .long 0x008d0000 - .long 0x00d50000 - .long 0x004e0000 - .long 0x00a90000 - .long 0x006c0000 - .long 0x00560000 - .long 0x00f40000 - 
.long 0x00ea0000 - .long 0x00650000 - .long 0x007a0000 - .long 0x00ae0000 - .long 0x00080000 - .long 0x00ba0000 - .long 0x00780000 - .long 0x00250000 - .long 0x002e0000 - .long 0x001c0000 - .long 0x00a60000 - .long 0x00b40000 - .long 0x00c60000 - .long 0x00e80000 - .long 0x00dd0000 - .long 0x00740000 - .long 0x001f0000 - .long 0x004b0000 - .long 0x00bd0000 - .long 0x008b0000 - .long 0x008a0000 - .long 0x00700000 - .long 0x003e0000 - .long 0x00b50000 - .long 0x00660000 - .long 0x00480000 - .long 0x00030000 - .long 0x00f60000 - .long 0x000e0000 - .long 0x00610000 - .long 0x00350000 - .long 0x00570000 - .long 0x00b90000 - .long 0x00860000 - .long 0x00c10000 - .long 0x001d0000 - .long 0x009e0000 - .long 0x00e10000 - .long 0x00f80000 - .long 0x00980000 - .long 0x00110000 - .long 0x00690000 - .long 0x00d90000 - .long 0x008e0000 - .long 0x00940000 - .long 0x009b0000 - .long 0x001e0000 - .long 0x00870000 - .long 0x00e90000 - .long 0x00ce0000 - .long 0x00550000 - .long 0x00280000 - .long 0x00df0000 - .long 0x008c0000 - .long 0x00a10000 - .long 0x00890000 - .long 0x000d0000 - .long 0x00bf0000 - .long 0x00e60000 - .long 0x00420000 - .long 0x00680000 - .long 0x00410000 - .long 0x00990000 - .long 0x002d0000 - .long 0x000f0000 - .long 0x00b00000 - .long 0x00540000 - .long 0x00bb0000 - .long 0x00160000 - // Table 3. - .long 0x63000000 - .long 0x7c000000 - .long 0x77000000 - .long 0x7b000000 - .long 0xf2000000 - .long 0x6b000000 - .long 0x6f000000 - .long 0xc5000000 - .long 0x30000000 - .long 0x01000000 - .long 0x67000000 - .long 0x2b000000 - .long 0xfe000000 - .long 0xd7000000 - .long 0xab000000 - .long 0x76000000 - .long 0xca000000 - .long 0x82000000 - .long 0xc9000000 - .long 0x7d000000 - .long 0xfa000000 - .long 0x59000000 - .long 0x47000000 - .long 0xf0000000 - .long 0xad000000 - .long 0xd4000000 - .long 0xa2000000 - .long 0xaf000000 - .long 0x9c000000 - .long 0xa4000000 - .long 0x72000000 - .long 0xc0000000 - .long 0xb7000000 - .long 0xfd000000 - .long 0x93000000 - .long 0x26000000 - .long 0x36000000 - .long 0x3f000000 - .long 0xf7000000 - .long 0xcc000000 - .long 0x34000000 - .long 0xa5000000 - .long 0xe5000000 - .long 0xf1000000 - .long 0x71000000 - .long 0xd8000000 - .long 0x31000000 - .long 0x15000000 - .long 0x04000000 - .long 0xc7000000 - .long 0x23000000 - .long 0xc3000000 - .long 0x18000000 - .long 0x96000000 - .long 0x05000000 - .long 0x9a000000 - .long 0x07000000 - .long 0x12000000 - .long 0x80000000 - .long 0xe2000000 - .long 0xeb000000 - .long 0x27000000 - .long 0xb2000000 - .long 0x75000000 - .long 0x09000000 - .long 0x83000000 - .long 0x2c000000 - .long 0x1a000000 - .long 0x1b000000 - .long 0x6e000000 - .long 0x5a000000 - .long 0xa0000000 - .long 0x52000000 - .long 0x3b000000 - .long 0xd6000000 - .long 0xb3000000 - .long 0x29000000 - .long 0xe3000000 - .long 0x2f000000 - .long 0x84000000 - .long 0x53000000 - .long 0xd1000000 - .long 0x00000000 - .long 0xed000000 - .long 0x20000000 - .long 0xfc000000 - .long 0xb1000000 - .long 0x5b000000 - .long 0x6a000000 - .long 0xcb000000 - .long 0xbe000000 - .long 0x39000000 - .long 0x4a000000 - .long 0x4c000000 - .long 0x58000000 - .long 0xcf000000 - .long 0xd0000000 - .long 0xef000000 - .long 0xaa000000 - .long 0xfb000000 - .long 0x43000000 - .long 0x4d000000 - .long 0x33000000 - .long 0x85000000 - .long 0x45000000 - .long 0xf9000000 - .long 0x02000000 - .long 0x7f000000 - .long 0x50000000 - .long 0x3c000000 - .long 0x9f000000 - .long 0xa8000000 - .long 0x51000000 - .long 0xa3000000 - .long 0x40000000 - .long 0x8f000000 - .long 0x92000000 - .long 
0x9d000000 - .long 0x38000000 - .long 0xf5000000 - .long 0xbc000000 - .long 0xb6000000 - .long 0xda000000 - .long 0x21000000 - .long 0x10000000 - .long 0xff000000 - .long 0xf3000000 - .long 0xd2000000 - .long 0xcd000000 - .long 0x0c000000 - .long 0x13000000 - .long 0xec000000 - .long 0x5f000000 - .long 0x97000000 - .long 0x44000000 - .long 0x17000000 - .long 0xc4000000 - .long 0xa7000000 - .long 0x7e000000 - .long 0x3d000000 - .long 0x64000000 - .long 0x5d000000 - .long 0x19000000 - .long 0x73000000 - .long 0x60000000 - .long 0x81000000 - .long 0x4f000000 - .long 0xdc000000 - .long 0x22000000 - .long 0x2a000000 - .long 0x90000000 - .long 0x88000000 - .long 0x46000000 - .long 0xee000000 - .long 0xb8000000 - .long 0x14000000 - .long 0xde000000 - .long 0x5e000000 - .long 0x0b000000 - .long 0xdb000000 - .long 0xe0000000 - .long 0x32000000 - .long 0x3a000000 - .long 0x0a000000 - .long 0x49000000 - .long 0x06000000 - .long 0x24000000 - .long 0x5c000000 - .long 0xc2000000 - .long 0xd3000000 - .long 0xac000000 - .long 0x62000000 - .long 0x91000000 - .long 0x95000000 - .long 0xe4000000 - .long 0x79000000 - .long 0xe7000000 - .long 0xc8000000 - .long 0x37000000 - .long 0x6d000000 - .long 0x8d000000 - .long 0xd5000000 - .long 0x4e000000 - .long 0xa9000000 - .long 0x6c000000 - .long 0x56000000 - .long 0xf4000000 - .long 0xea000000 - .long 0x65000000 - .long 0x7a000000 - .long 0xae000000 - .long 0x08000000 - .long 0xba000000 - .long 0x78000000 - .long 0x25000000 - .long 0x2e000000 - .long 0x1c000000 - .long 0xa6000000 - .long 0xb4000000 - .long 0xc6000000 - .long 0xe8000000 - .long 0xdd000000 - .long 0x74000000 - .long 0x1f000000 - .long 0x4b000000 - .long 0xbd000000 - .long 0x8b000000 - .long 0x8a000000 - .long 0x70000000 - .long 0x3e000000 - .long 0xb5000000 - .long 0x66000000 - .long 0x48000000 - .long 0x03000000 - .long 0xf6000000 - .long 0x0e000000 - .long 0x61000000 - .long 0x35000000 - .long 0x57000000 - .long 0xb9000000 - .long 0x86000000 - .long 0xc1000000 - .long 0x1d000000 - .long 0x9e000000 - .long 0xe1000000 - .long 0xf8000000 - .long 0x98000000 - .long 0x11000000 - .long 0x69000000 - .long 0xd9000000 - .long 0x8e000000 - .long 0x94000000 - .long 0x9b000000 - .long 0x1e000000 - .long 0x87000000 - .long 0xe9000000 - .long 0xce000000 - .long 0x55000000 - .long 0x28000000 - .long 0xdf000000 - .long 0x8c000000 - .long 0xa1000000 - .long 0x89000000 - .long 0x0d000000 - .long 0xbf000000 - .long 0xe6000000 - .long 0x42000000 - .long 0x68000000 - .long 0x41000000 - .long 0x99000000 - .long 0x2d000000 - .long 0x0f000000 - .long 0xb0000000 - .long 0x54000000 - .long 0xbb000000 - .long 0x16000000 - - -// InvSubBytes embedded in words tables. - .globl _AESInvSubBytesWordTable - .private_extern _AESInvSubBytesWordTable - .align 2 -_AESInvSubBytesWordTable: - // Table 0. 
- .long 0x00000052 - .long 0x00000009 - .long 0x0000006a - .long 0x000000d5 - .long 0x00000030 - .long 0x00000036 - .long 0x000000a5 - .long 0x00000038 - .long 0x000000bf - .long 0x00000040 - .long 0x000000a3 - .long 0x0000009e - .long 0x00000081 - .long 0x000000f3 - .long 0x000000d7 - .long 0x000000fb - .long 0x0000007c - .long 0x000000e3 - .long 0x00000039 - .long 0x00000082 - .long 0x0000009b - .long 0x0000002f - .long 0x000000ff - .long 0x00000087 - .long 0x00000034 - .long 0x0000008e - .long 0x00000043 - .long 0x00000044 - .long 0x000000c4 - .long 0x000000de - .long 0x000000e9 - .long 0x000000cb - .long 0x00000054 - .long 0x0000007b - .long 0x00000094 - .long 0x00000032 - .long 0x000000a6 - .long 0x000000c2 - .long 0x00000023 - .long 0x0000003d - .long 0x000000ee - .long 0x0000004c - .long 0x00000095 - .long 0x0000000b - .long 0x00000042 - .long 0x000000fa - .long 0x000000c3 - .long 0x0000004e - .long 0x00000008 - .long 0x0000002e - .long 0x000000a1 - .long 0x00000066 - .long 0x00000028 - .long 0x000000d9 - .long 0x00000024 - .long 0x000000b2 - .long 0x00000076 - .long 0x0000005b - .long 0x000000a2 - .long 0x00000049 - .long 0x0000006d - .long 0x0000008b - .long 0x000000d1 - .long 0x00000025 - .long 0x00000072 - .long 0x000000f8 - .long 0x000000f6 - .long 0x00000064 - .long 0x00000086 - .long 0x00000068 - .long 0x00000098 - .long 0x00000016 - .long 0x000000d4 - .long 0x000000a4 - .long 0x0000005c - .long 0x000000cc - .long 0x0000005d - .long 0x00000065 - .long 0x000000b6 - .long 0x00000092 - .long 0x0000006c - .long 0x00000070 - .long 0x00000048 - .long 0x00000050 - .long 0x000000fd - .long 0x000000ed - .long 0x000000b9 - .long 0x000000da - .long 0x0000005e - .long 0x00000015 - .long 0x00000046 - .long 0x00000057 - .long 0x000000a7 - .long 0x0000008d - .long 0x0000009d - .long 0x00000084 - .long 0x00000090 - .long 0x000000d8 - .long 0x000000ab - .long 0x00000000 - .long 0x0000008c - .long 0x000000bc - .long 0x000000d3 - .long 0x0000000a - .long 0x000000f7 - .long 0x000000e4 - .long 0x00000058 - .long 0x00000005 - .long 0x000000b8 - .long 0x000000b3 - .long 0x00000045 - .long 0x00000006 - .long 0x000000d0 - .long 0x0000002c - .long 0x0000001e - .long 0x0000008f - .long 0x000000ca - .long 0x0000003f - .long 0x0000000f - .long 0x00000002 - .long 0x000000c1 - .long 0x000000af - .long 0x000000bd - .long 0x00000003 - .long 0x00000001 - .long 0x00000013 - .long 0x0000008a - .long 0x0000006b - .long 0x0000003a - .long 0x00000091 - .long 0x00000011 - .long 0x00000041 - .long 0x0000004f - .long 0x00000067 - .long 0x000000dc - .long 0x000000ea - .long 0x00000097 - .long 0x000000f2 - .long 0x000000cf - .long 0x000000ce - .long 0x000000f0 - .long 0x000000b4 - .long 0x000000e6 - .long 0x00000073 - .long 0x00000096 - .long 0x000000ac - .long 0x00000074 - .long 0x00000022 - .long 0x000000e7 - .long 0x000000ad - .long 0x00000035 - .long 0x00000085 - .long 0x000000e2 - .long 0x000000f9 - .long 0x00000037 - .long 0x000000e8 - .long 0x0000001c - .long 0x00000075 - .long 0x000000df - .long 0x0000006e - .long 0x00000047 - .long 0x000000f1 - .long 0x0000001a - .long 0x00000071 - .long 0x0000001d - .long 0x00000029 - .long 0x000000c5 - .long 0x00000089 - .long 0x0000006f - .long 0x000000b7 - .long 0x00000062 - .long 0x0000000e - .long 0x000000aa - .long 0x00000018 - .long 0x000000be - .long 0x0000001b - .long 0x000000fc - .long 0x00000056 - .long 0x0000003e - .long 0x0000004b - .long 0x000000c6 - .long 0x000000d2 - .long 0x00000079 - .long 0x00000020 - .long 0x0000009a - .long 0x000000db - .long 0x000000c0 - 
.long 0x000000fe - .long 0x00000078 - .long 0x000000cd - .long 0x0000005a - .long 0x000000f4 - .long 0x0000001f - .long 0x000000dd - .long 0x000000a8 - .long 0x00000033 - .long 0x00000088 - .long 0x00000007 - .long 0x000000c7 - .long 0x00000031 - .long 0x000000b1 - .long 0x00000012 - .long 0x00000010 - .long 0x00000059 - .long 0x00000027 - .long 0x00000080 - .long 0x000000ec - .long 0x0000005f - .long 0x00000060 - .long 0x00000051 - .long 0x0000007f - .long 0x000000a9 - .long 0x00000019 - .long 0x000000b5 - .long 0x0000004a - .long 0x0000000d - .long 0x0000002d - .long 0x000000e5 - .long 0x0000007a - .long 0x0000009f - .long 0x00000093 - .long 0x000000c9 - .long 0x0000009c - .long 0x000000ef - .long 0x000000a0 - .long 0x000000e0 - .long 0x0000003b - .long 0x0000004d - .long 0x000000ae - .long 0x0000002a - .long 0x000000f5 - .long 0x000000b0 - .long 0x000000c8 - .long 0x000000eb - .long 0x000000bb - .long 0x0000003c - .long 0x00000083 - .long 0x00000053 - .long 0x00000099 - .long 0x00000061 - .long 0x00000017 - .long 0x0000002b - .long 0x00000004 - .long 0x0000007e - .long 0x000000ba - .long 0x00000077 - .long 0x000000d6 - .long 0x00000026 - .long 0x000000e1 - .long 0x00000069 - .long 0x00000014 - .long 0x00000063 - .long 0x00000055 - .long 0x00000021 - .long 0x0000000c - .long 0x0000007d - // Table 1. - .long 0x00005200 - .long 0x00000900 - .long 0x00006a00 - .long 0x0000d500 - .long 0x00003000 - .long 0x00003600 - .long 0x0000a500 - .long 0x00003800 - .long 0x0000bf00 - .long 0x00004000 - .long 0x0000a300 - .long 0x00009e00 - .long 0x00008100 - .long 0x0000f300 - .long 0x0000d700 - .long 0x0000fb00 - .long 0x00007c00 - .long 0x0000e300 - .long 0x00003900 - .long 0x00008200 - .long 0x00009b00 - .long 0x00002f00 - .long 0x0000ff00 - .long 0x00008700 - .long 0x00003400 - .long 0x00008e00 - .long 0x00004300 - .long 0x00004400 - .long 0x0000c400 - .long 0x0000de00 - .long 0x0000e900 - .long 0x0000cb00 - .long 0x00005400 - .long 0x00007b00 - .long 0x00009400 - .long 0x00003200 - .long 0x0000a600 - .long 0x0000c200 - .long 0x00002300 - .long 0x00003d00 - .long 0x0000ee00 - .long 0x00004c00 - .long 0x00009500 - .long 0x00000b00 - .long 0x00004200 - .long 0x0000fa00 - .long 0x0000c300 - .long 0x00004e00 - .long 0x00000800 - .long 0x00002e00 - .long 0x0000a100 - .long 0x00006600 - .long 0x00002800 - .long 0x0000d900 - .long 0x00002400 - .long 0x0000b200 - .long 0x00007600 - .long 0x00005b00 - .long 0x0000a200 - .long 0x00004900 - .long 0x00006d00 - .long 0x00008b00 - .long 0x0000d100 - .long 0x00002500 - .long 0x00007200 - .long 0x0000f800 - .long 0x0000f600 - .long 0x00006400 - .long 0x00008600 - .long 0x00006800 - .long 0x00009800 - .long 0x00001600 - .long 0x0000d400 - .long 0x0000a400 - .long 0x00005c00 - .long 0x0000cc00 - .long 0x00005d00 - .long 0x00006500 - .long 0x0000b600 - .long 0x00009200 - .long 0x00006c00 - .long 0x00007000 - .long 0x00004800 - .long 0x00005000 - .long 0x0000fd00 - .long 0x0000ed00 - .long 0x0000b900 - .long 0x0000da00 - .long 0x00005e00 - .long 0x00001500 - .long 0x00004600 - .long 0x00005700 - .long 0x0000a700 - .long 0x00008d00 - .long 0x00009d00 - .long 0x00008400 - .long 0x00009000 - .long 0x0000d800 - .long 0x0000ab00 - .long 0x00000000 - .long 0x00008c00 - .long 0x0000bc00 - .long 0x0000d300 - .long 0x00000a00 - .long 0x0000f700 - .long 0x0000e400 - .long 0x00005800 - .long 0x00000500 - .long 0x0000b800 - .long 0x0000b300 - .long 0x00004500 - .long 0x00000600 - .long 0x0000d000 - .long 0x00002c00 - .long 0x00001e00 - .long 0x00008f00 - .long 0x0000ca00 - .long 
0x00003f00 - .long 0x00000f00 - .long 0x00000200 - .long 0x0000c100 - .long 0x0000af00 - .long 0x0000bd00 - .long 0x00000300 - .long 0x00000100 - .long 0x00001300 - .long 0x00008a00 - .long 0x00006b00 - .long 0x00003a00 - .long 0x00009100 - .long 0x00001100 - .long 0x00004100 - .long 0x00004f00 - .long 0x00006700 - .long 0x0000dc00 - .long 0x0000ea00 - .long 0x00009700 - .long 0x0000f200 - .long 0x0000cf00 - .long 0x0000ce00 - .long 0x0000f000 - .long 0x0000b400 - .long 0x0000e600 - .long 0x00007300 - .long 0x00009600 - .long 0x0000ac00 - .long 0x00007400 - .long 0x00002200 - .long 0x0000e700 - .long 0x0000ad00 - .long 0x00003500 - .long 0x00008500 - .long 0x0000e200 - .long 0x0000f900 - .long 0x00003700 - .long 0x0000e800 - .long 0x00001c00 - .long 0x00007500 - .long 0x0000df00 - .long 0x00006e00 - .long 0x00004700 - .long 0x0000f100 - .long 0x00001a00 - .long 0x00007100 - .long 0x00001d00 - .long 0x00002900 - .long 0x0000c500 - .long 0x00008900 - .long 0x00006f00 - .long 0x0000b700 - .long 0x00006200 - .long 0x00000e00 - .long 0x0000aa00 - .long 0x00001800 - .long 0x0000be00 - .long 0x00001b00 - .long 0x0000fc00 - .long 0x00005600 - .long 0x00003e00 - .long 0x00004b00 - .long 0x0000c600 - .long 0x0000d200 - .long 0x00007900 - .long 0x00002000 - .long 0x00009a00 - .long 0x0000db00 - .long 0x0000c000 - .long 0x0000fe00 - .long 0x00007800 - .long 0x0000cd00 - .long 0x00005a00 - .long 0x0000f400 - .long 0x00001f00 - .long 0x0000dd00 - .long 0x0000a800 - .long 0x00003300 - .long 0x00008800 - .long 0x00000700 - .long 0x0000c700 - .long 0x00003100 - .long 0x0000b100 - .long 0x00001200 - .long 0x00001000 - .long 0x00005900 - .long 0x00002700 - .long 0x00008000 - .long 0x0000ec00 - .long 0x00005f00 - .long 0x00006000 - .long 0x00005100 - .long 0x00007f00 - .long 0x0000a900 - .long 0x00001900 - .long 0x0000b500 - .long 0x00004a00 - .long 0x00000d00 - .long 0x00002d00 - .long 0x0000e500 - .long 0x00007a00 - .long 0x00009f00 - .long 0x00009300 - .long 0x0000c900 - .long 0x00009c00 - .long 0x0000ef00 - .long 0x0000a000 - .long 0x0000e000 - .long 0x00003b00 - .long 0x00004d00 - .long 0x0000ae00 - .long 0x00002a00 - .long 0x0000f500 - .long 0x0000b000 - .long 0x0000c800 - .long 0x0000eb00 - .long 0x0000bb00 - .long 0x00003c00 - .long 0x00008300 - .long 0x00005300 - .long 0x00009900 - .long 0x00006100 - .long 0x00001700 - .long 0x00002b00 - .long 0x00000400 - .long 0x00007e00 - .long 0x0000ba00 - .long 0x00007700 - .long 0x0000d600 - .long 0x00002600 - .long 0x0000e100 - .long 0x00006900 - .long 0x00001400 - .long 0x00006300 - .long 0x00005500 - .long 0x00002100 - .long 0x00000c00 - .long 0x00007d00 - // Table 2. 
- .long 0x00520000 - .long 0x00090000 - .long 0x006a0000 - .long 0x00d50000 - .long 0x00300000 - .long 0x00360000 - .long 0x00a50000 - .long 0x00380000 - .long 0x00bf0000 - .long 0x00400000 - .long 0x00a30000 - .long 0x009e0000 - .long 0x00810000 - .long 0x00f30000 - .long 0x00d70000 - .long 0x00fb0000 - .long 0x007c0000 - .long 0x00e30000 - .long 0x00390000 - .long 0x00820000 - .long 0x009b0000 - .long 0x002f0000 - .long 0x00ff0000 - .long 0x00870000 - .long 0x00340000 - .long 0x008e0000 - .long 0x00430000 - .long 0x00440000 - .long 0x00c40000 - .long 0x00de0000 - .long 0x00e90000 - .long 0x00cb0000 - .long 0x00540000 - .long 0x007b0000 - .long 0x00940000 - .long 0x00320000 - .long 0x00a60000 - .long 0x00c20000 - .long 0x00230000 - .long 0x003d0000 - .long 0x00ee0000 - .long 0x004c0000 - .long 0x00950000 - .long 0x000b0000 - .long 0x00420000 - .long 0x00fa0000 - .long 0x00c30000 - .long 0x004e0000 - .long 0x00080000 - .long 0x002e0000 - .long 0x00a10000 - .long 0x00660000 - .long 0x00280000 - .long 0x00d90000 - .long 0x00240000 - .long 0x00b20000 - .long 0x00760000 - .long 0x005b0000 - .long 0x00a20000 - .long 0x00490000 - .long 0x006d0000 - .long 0x008b0000 - .long 0x00d10000 - .long 0x00250000 - .long 0x00720000 - .long 0x00f80000 - .long 0x00f60000 - .long 0x00640000 - .long 0x00860000 - .long 0x00680000 - .long 0x00980000 - .long 0x00160000 - .long 0x00d40000 - .long 0x00a40000 - .long 0x005c0000 - .long 0x00cc0000 - .long 0x005d0000 - .long 0x00650000 - .long 0x00b60000 - .long 0x00920000 - .long 0x006c0000 - .long 0x00700000 - .long 0x00480000 - .long 0x00500000 - .long 0x00fd0000 - .long 0x00ed0000 - .long 0x00b90000 - .long 0x00da0000 - .long 0x005e0000 - .long 0x00150000 - .long 0x00460000 - .long 0x00570000 - .long 0x00a70000 - .long 0x008d0000 - .long 0x009d0000 - .long 0x00840000 - .long 0x00900000 - .long 0x00d80000 - .long 0x00ab0000 - .long 0x00000000 - .long 0x008c0000 - .long 0x00bc0000 - .long 0x00d30000 - .long 0x000a0000 - .long 0x00f70000 - .long 0x00e40000 - .long 0x00580000 - .long 0x00050000 - .long 0x00b80000 - .long 0x00b30000 - .long 0x00450000 - .long 0x00060000 - .long 0x00d00000 - .long 0x002c0000 - .long 0x001e0000 - .long 0x008f0000 - .long 0x00ca0000 - .long 0x003f0000 - .long 0x000f0000 - .long 0x00020000 - .long 0x00c10000 - .long 0x00af0000 - .long 0x00bd0000 - .long 0x00030000 - .long 0x00010000 - .long 0x00130000 - .long 0x008a0000 - .long 0x006b0000 - .long 0x003a0000 - .long 0x00910000 - .long 0x00110000 - .long 0x00410000 - .long 0x004f0000 - .long 0x00670000 - .long 0x00dc0000 - .long 0x00ea0000 - .long 0x00970000 - .long 0x00f20000 - .long 0x00cf0000 - .long 0x00ce0000 - .long 0x00f00000 - .long 0x00b40000 - .long 0x00e60000 - .long 0x00730000 - .long 0x00960000 - .long 0x00ac0000 - .long 0x00740000 - .long 0x00220000 - .long 0x00e70000 - .long 0x00ad0000 - .long 0x00350000 - .long 0x00850000 - .long 0x00e20000 - .long 0x00f90000 - .long 0x00370000 - .long 0x00e80000 - .long 0x001c0000 - .long 0x00750000 - .long 0x00df0000 - .long 0x006e0000 - .long 0x00470000 - .long 0x00f10000 - .long 0x001a0000 - .long 0x00710000 - .long 0x001d0000 - .long 0x00290000 - .long 0x00c50000 - .long 0x00890000 - .long 0x006f0000 - .long 0x00b70000 - .long 0x00620000 - .long 0x000e0000 - .long 0x00aa0000 - .long 0x00180000 - .long 0x00be0000 - .long 0x001b0000 - .long 0x00fc0000 - .long 0x00560000 - .long 0x003e0000 - .long 0x004b0000 - .long 0x00c60000 - .long 0x00d20000 - .long 0x00790000 - .long 0x00200000 - .long 0x009a0000 - .long 0x00db0000 - .long 0x00c00000 - 
.long 0x00fe0000 - .long 0x00780000 - .long 0x00cd0000 - .long 0x005a0000 - .long 0x00f40000 - .long 0x001f0000 - .long 0x00dd0000 - .long 0x00a80000 - .long 0x00330000 - .long 0x00880000 - .long 0x00070000 - .long 0x00c70000 - .long 0x00310000 - .long 0x00b10000 - .long 0x00120000 - .long 0x00100000 - .long 0x00590000 - .long 0x00270000 - .long 0x00800000 - .long 0x00ec0000 - .long 0x005f0000 - .long 0x00600000 - .long 0x00510000 - .long 0x007f0000 - .long 0x00a90000 - .long 0x00190000 - .long 0x00b50000 - .long 0x004a0000 - .long 0x000d0000 - .long 0x002d0000 - .long 0x00e50000 - .long 0x007a0000 - .long 0x009f0000 - .long 0x00930000 - .long 0x00c90000 - .long 0x009c0000 - .long 0x00ef0000 - .long 0x00a00000 - .long 0x00e00000 - .long 0x003b0000 - .long 0x004d0000 - .long 0x00ae0000 - .long 0x002a0000 - .long 0x00f50000 - .long 0x00b00000 - .long 0x00c80000 - .long 0x00eb0000 - .long 0x00bb0000 - .long 0x003c0000 - .long 0x00830000 - .long 0x00530000 - .long 0x00990000 - .long 0x00610000 - .long 0x00170000 - .long 0x002b0000 - .long 0x00040000 - .long 0x007e0000 - .long 0x00ba0000 - .long 0x00770000 - .long 0x00d60000 - .long 0x00260000 - .long 0x00e10000 - .long 0x00690000 - .long 0x00140000 - .long 0x00630000 - .long 0x00550000 - .long 0x00210000 - .long 0x000c0000 - .long 0x007d0000 - // Table 3. - .long 0x52000000 - .long 0x09000000 - .long 0x6a000000 - .long 0xd5000000 - .long 0x30000000 - .long 0x36000000 - .long 0xa5000000 - .long 0x38000000 - .long 0xbf000000 - .long 0x40000000 - .long 0xa3000000 - .long 0x9e000000 - .long 0x81000000 - .long 0xf3000000 - .long 0xd7000000 - .long 0xfb000000 - .long 0x7c000000 - .long 0xe3000000 - .long 0x39000000 - .long 0x82000000 - .long 0x9b000000 - .long 0x2f000000 - .long 0xff000000 - .long 0x87000000 - .long 0x34000000 - .long 0x8e000000 - .long 0x43000000 - .long 0x44000000 - .long 0xc4000000 - .long 0xde000000 - .long 0xe9000000 - .long 0xcb000000 - .long 0x54000000 - .long 0x7b000000 - .long 0x94000000 - .long 0x32000000 - .long 0xa6000000 - .long 0xc2000000 - .long 0x23000000 - .long 0x3d000000 - .long 0xee000000 - .long 0x4c000000 - .long 0x95000000 - .long 0x0b000000 - .long 0x42000000 - .long 0xfa000000 - .long 0xc3000000 - .long 0x4e000000 - .long 0x08000000 - .long 0x2e000000 - .long 0xa1000000 - .long 0x66000000 - .long 0x28000000 - .long 0xd9000000 - .long 0x24000000 - .long 0xb2000000 - .long 0x76000000 - .long 0x5b000000 - .long 0xa2000000 - .long 0x49000000 - .long 0x6d000000 - .long 0x8b000000 - .long 0xd1000000 - .long 0x25000000 - .long 0x72000000 - .long 0xf8000000 - .long 0xf6000000 - .long 0x64000000 - .long 0x86000000 - .long 0x68000000 - .long 0x98000000 - .long 0x16000000 - .long 0xd4000000 - .long 0xa4000000 - .long 0x5c000000 - .long 0xcc000000 - .long 0x5d000000 - .long 0x65000000 - .long 0xb6000000 - .long 0x92000000 - .long 0x6c000000 - .long 0x70000000 - .long 0x48000000 - .long 0x50000000 - .long 0xfd000000 - .long 0xed000000 - .long 0xb9000000 - .long 0xda000000 - .long 0x5e000000 - .long 0x15000000 - .long 0x46000000 - .long 0x57000000 - .long 0xa7000000 - .long 0x8d000000 - .long 0x9d000000 - .long 0x84000000 - .long 0x90000000 - .long 0xd8000000 - .long 0xab000000 - .long 0x00000000 - .long 0x8c000000 - .long 0xbc000000 - .long 0xd3000000 - .long 0x0a000000 - .long 0xf7000000 - .long 0xe4000000 - .long 0x58000000 - .long 0x05000000 - .long 0xb8000000 - .long 0xb3000000 - .long 0x45000000 - .long 0x06000000 - .long 0xd0000000 - .long 0x2c000000 - .long 0x1e000000 - .long 0x8f000000 - .long 0xca000000 - .long 
0x3f000000 - .long 0x0f000000 - .long 0x02000000 - .long 0xc1000000 - .long 0xaf000000 - .long 0xbd000000 - .long 0x03000000 - .long 0x01000000 - .long 0x13000000 - .long 0x8a000000 - .long 0x6b000000 - .long 0x3a000000 - .long 0x91000000 - .long 0x11000000 - .long 0x41000000 - .long 0x4f000000 - .long 0x67000000 - .long 0xdc000000 - .long 0xea000000 - .long 0x97000000 - .long 0xf2000000 - .long 0xcf000000 - .long 0xce000000 - .long 0xf0000000 - .long 0xb4000000 - .long 0xe6000000 - .long 0x73000000 - .long 0x96000000 - .long 0xac000000 - .long 0x74000000 - .long 0x22000000 - .long 0xe7000000 - .long 0xad000000 - .long 0x35000000 - .long 0x85000000 - .long 0xe2000000 - .long 0xf9000000 - .long 0x37000000 - .long 0xe8000000 - .long 0x1c000000 - .long 0x75000000 - .long 0xdf000000 - .long 0x6e000000 - .long 0x47000000 - .long 0xf1000000 - .long 0x1a000000 - .long 0x71000000 - .long 0x1d000000 - .long 0x29000000 - .long 0xc5000000 - .long 0x89000000 - .long 0x6f000000 - .long 0xb7000000 - .long 0x62000000 - .long 0x0e000000 - .long 0xaa000000 - .long 0x18000000 - .long 0xbe000000 - .long 0x1b000000 - .long 0xfc000000 - .long 0x56000000 - .long 0x3e000000 - .long 0x4b000000 - .long 0xc6000000 - .long 0xd2000000 - .long 0x79000000 - .long 0x20000000 - .long 0x9a000000 - .long 0xdb000000 - .long 0xc0000000 - .long 0xfe000000 - .long 0x78000000 - .long 0xcd000000 - .long 0x5a000000 - .long 0xf4000000 - .long 0x1f000000 - .long 0xdd000000 - .long 0xa8000000 - .long 0x33000000 - .long 0x88000000 - .long 0x07000000 - .long 0xc7000000 - .long 0x31000000 - .long 0xb1000000 - .long 0x12000000 - .long 0x10000000 - .long 0x59000000 - .long 0x27000000 - .long 0x80000000 - .long 0xec000000 - .long 0x5f000000 - .long 0x60000000 - .long 0x51000000 - .long 0x7f000000 - .long 0xa9000000 - .long 0x19000000 - .long 0xb5000000 - .long 0x4a000000 - .long 0x0d000000 - .long 0x2d000000 - .long 0xe5000000 - .long 0x7a000000 - .long 0x9f000000 - .long 0x93000000 - .long 0xc9000000 - .long 0x9c000000 - .long 0xef000000 - .long 0xa0000000 - .long 0xe0000000 - .long 0x3b000000 - .long 0x4d000000 - .long 0xae000000 - .long 0x2a000000 - .long 0xf5000000 - .long 0xb0000000 - .long 0xc8000000 - .long 0xeb000000 - .long 0xbb000000 - .long 0x3c000000 - .long 0x83000000 - .long 0x53000000 - .long 0x99000000 - .long 0x61000000 - .long 0x17000000 - .long 0x2b000000 - .long 0x04000000 - .long 0x7e000000 - .long 0xba000000 - .long 0x77000000 - .long 0xd6000000 - .long 0x26000000 - .long 0xe1000000 - .long 0x69000000 - .long 0x14000000 - .long 0x63000000 - .long 0x55000000 - .long 0x21000000 - .long 0x0c000000 - .long 0x7d000000 diff --git a/bsd/crypto/aes/i386/EncryptDecrypt.s b/bsd/crypto/aes/i386/EncryptDecrypt.s deleted file mode 100644 index 6a6147a11..000000000 --- a/bsd/crypto/aes/i386/EncryptDecrypt.s +++ /dev/null @@ -1,607 +0,0 @@ -/* This file defines _aes_encrypt or _aes_decrypt, according to the value of - the Select preprocessor symbol. This file is designed to be included in - another assembly file using the preprocessor #include directive, to benefit - from some assembly-time calculations. - - These two routines are nearly identical. They differ only in the tables - they use, the direction they iterate through the key, and the permutation - performed on part of the state. - - Written by Eric Postpischil, January 2008. -*/ - -/* add AES HW detection and HW-specific program branch cclee 3-12-10 */ -#ifdef KERNEL -#include -#else -#include -#endif - -#if Select == 0 - #define Name _aes_encrypt // Routine name. 
- #define MTable _AESEncryptTable // Main table. - #define FTable _AESSubBytesWordTable // Final table. - #define P0 S0 // State permutation. - #define P1 S1 - #define P2 S2 - #define P3 S3 - #define Increment +16 // ExpandedKey increment. -#elif Select == 1 - #define Name _aes_decrypt // Routine name. - #define MTable _AESDecryptTable // Main table. - #define FTable _AESInvSubBytesWordTable // Final table. - #define P0 S2 // State permutation. - #define P1 S3 - #define P2 S0 - #define P3 S1 - #define Increment -16 // ExpandedKey increment. -#elif Select == 2 - #define Name _aes_encrypt_xmm_no_save // Routine name. - #define MTable _AESEncryptTable // Main table. - #define FTable _AESSubBytesWordTable // Final table. - #define P0 S0 // State permutation. - #define P1 S1 - #define P2 S2 - #define P3 S3 - #define Increment +16 // ExpandedKey increment. -#elif Select == 3 - #define Name _aes_decrypt_xmm_no_save // Routine name. - #define MTable _AESDecryptTable // Main table. - #define FTable _AESInvSubBytesWordTable // Final table. - #define P0 S2 // State permutation. - #define P1 S3 - #define P2 S0 - #define P3 S1 - #define Increment -16 // ExpandedKey increment. -#endif // Select - - -/* Routine: - - _aes_encrypt (if Select is 0) or _aes_decrypt (if Select is 1); - _aes_encrypt_xmm_no_save and _aes_decrypt_xmm_no_save (if Select - is 2 or 3). - - Function: - - Perform the AES cipher or its inverse as defined in Federal Information - Processing Standards Publication 197 (FIPS-197), November 26, 2001. - - The inverse cipher here is the "Equivalent Inverse Cipher" in FIPS-197. - - Input: - - Constant data: - - The following names must be locally defined so the assembler - can calculate certain offsets. - - For encryption: - - static const Word _AESEncryptTable[4][256]. - - _AESEncryptTable[i] contains the tables T[i] defined in AES - Proposal: Rijndael, version 2, 03/09/99, by Joan Daemen and - Vincent Rijmen, section 5.2.1, page 18. These tables - combine the SubBytes and MixColumns operations. - - static const Word _AESSubBytesWordTable[4][256]. - - _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where - SubBytes is defined in FIPS-197. _AESSubBytesWordTable - differs from _AESEncryptTable in that it does not include - the MixColumn operation. It is used in performing the last - round, which differs from the previous rounds in that it - does not include the MixColumn operation. - - For decryption: - - static const Word _AESDecryptTable[4][256]. - - The analog of _AESEncryptTable for decryption. - - static const Word _AESInvSubBytesWordTable[4][256]. - - _AESInvSubBytesWordTable[i][j] = InvSubBytes(j) << 8*i, - where InvSubBytes is defined in FIPS-197. - _AESInvSubBytesWordTable differs from _AESDecryptTable in - that it does not include the InvMixColumn operation. It is - used in performing the last round, which differs from the - previous rounds in that it does not include the - InvMixColumn operation. - - Arguments: - - const Byte *InputText. - - Address of input, 16 bytes. Best if four-byte aligned. - - Byte *OutputText. - - Address of output, 16 bytes. Best if four-byte aligned. - - aes_encrypt_ctx *Context or aes_decrypt_ctx *Context - - aes_encrypt_ctx and aes_decrypt_ctx are identical except the - former is used for encryption and the latter for decryption. - - Each is a structure containing the expanded key beginning at - offset ContextKey and a four-byte "key length" beginning at - offset ContextKeyLength. The "key length" is the number of - bytes from the start of the first round key to the start of the - last round key.
That is 16 less than the number of bytes in - the entire key. - - Output: - - Encrypted or decrypted data is written to *OutputText. - - Return: - - aes_rval // -1 if "key length" is invalid. 0 otherwise. -*/ - - .text - .globl Name -Name: - - // detect AES HW, cclee 3-13-10 -#if Select < 2 // only for aes_encrypt/aes_decrypt -#if defined __x86_64__ - movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities - mov (%rax), %eax // %eax = __cpu_capabilities -#else -#if defined KERNEL - leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities - mov (%eax), %eax // %eax = __cpu_capabilities -#else - mov _COMM_PAGE_CPU_CAPABILITIES, %eax -#endif -#endif - test $(kHasAES), %eax // __cpu_capabilities & kHasAES -#if Select == 0 - jne _aes_encrypt_hw // if AES HW detected, branch to HW specific code -#else - jne _aes_decrypt_hw // if AES HW detected, branch to HW specific code -#endif -#endif // Select - - // Push new stack frame. - push r5 - - /* Save registers and set SaveSize to the number of bytes pushed onto the - stack so far, including the caller's return address. - */ - push r3 - #if defined __i386__ - push r6 - push r7 - #define SaveSize (5*4) - #else - #define SaveSize (3*8) - #endif - - /* Number of bytes used for local variables: - - 4 (i386) or 0 (x86_64) bytes for ExpandedKeyEnd. - - 5 (i386) or 3 (x86_64) 16-byte spaces to save XMM registers. - */ - #define LocalsSize (Arch(4, 0) + Arch(5, 3)*16) - - #if 0 < LocalsSize - // Padding to position stack pointer at a multiple of 16 bytes. - #define Padding (15 & -(SaveSize + LocalsSize)) - sub $Padding + LocalsSize, r4 // Allocate space on stack. - #else - #define Padding 0 - #endif - -#ifdef KERNEL -#if Select < 2 - // Save XMM registers. - movaps %xmm0, 0*16(r4) - movaps %xmm1, 1*16(r4) - movaps %xmm2, 2*16(r4) -#if defined __i386__ - movaps %xmm3, 3*16(r4) - movaps %xmm4, 4*16(r4) -#endif -#endif // Select -#endif // KERNEL - -#if defined __i386__ - - // Number of bytes from caller's stack pointer to ours. - #define StackFrame (SaveSize + Padding + LocalsSize) - - // Define location of argument i (presuming 4-byte arguments). - #define Argument(i) StackFrame+4*(i)(%esp) - - #define ArgInputText Argument(0) - #define ArgOutputText Argument(1) - #define ArgContext Argument(2) - -#elif defined __x86_64__ - - // Arguments. - #define InputText r7 // Used early then overwritten for other use. - #define OutputText r6 // Needed near end of routine. - #define ArgContext r2 - /* The argument passed in r2 overlaps registers we need for other - work, so it must be moved early in the routine. - */ - -#endif - -#define BaseP Arch(r6, r9) // Base pointer for addressing global data. -#define ExpandedKey Arch(t0, r10) // Address of expanded key. - -/* The Work registers defined below are used to hold parts of the AES state - while we dissect or assemble it. They must be assigned to the A, B, C, and - D registers so that we can access the bytes in %al, %ah, and so on. -*/ -#define Work0d r0d -#define Work0l r0l -#define Work0h r0h -#define Work1d r3d -#define Work1l r3l -#define Work1h r3h -#define Work2d r1d -#define Work2l r1l -#define Work2h r1h -#define Work3d r2d -#define Work3l r2l -#define Work3h r2h - -#define t0 r5 -#define t0d r5d // Low 32 bits of t0. -#define t0l r5l // Low byte of t0. - -#define t1 r7 - -/* S0, S1, S2, and S3 are where we assemble the new AES state when computing - a regular round. S1, S2, and S3 are assigned to the Work registers, but - S0 needs to go somewhere else because Work0 holds part of the old state. -*/
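For illustration, here is a minimal C sketch of the table-lookup round that the code below performs with these registers. The names are hypothetical, not from this file: MTable stands for _AESEncryptTable or _AESDecryptTable, RK for one round key. The (c + i) & 3 rotation is the encryption ShiftRows; the decryption path gets its opposite rotation from the P0..P3 permutation selected above.

    #include <stdint.h>
    #include <string.h>

    typedef uint32_t Word;

    // One inner round: table i is indexed by byte i of a state word,
    // matching the %al/%ah dissection done in the assembly.
    static void TableRound(Word S[4], const Word MTable[4][256], const Word RK[4])
    {
        Word N[4];
        for (int c = 0; c < 4; c++)
            N[c] = RK[c]
                 ^ MTable[0][ S[(c + 0) & 3]        & 0xff]
                 ^ MTable[1][(S[(c + 1) & 3] >>  8) & 0xff]
                 ^ MTable[2][(S[(c + 2) & 3] >> 16) & 0xff]
                 ^ MTable[3][(S[(c + 3) & 3] >> 24) & 0xff];
        memcpy(S, N, sizeof N);
    }

The last round follows the same pattern with FTable, whose entries omit the MixColumns contribution.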
-#define S0 Arch(t1, r8d) -#define S1 Work1d -#define S2 Work2d -#define S3 Work3d - -/* These XMM registers are used as holding space, because it is faster to - spill to these registers than to the stack. (On x86_64, we do not need - to spill, because there are additional general registers available. - However, using more general registers requires saving them to the stack - and restoring them. I timed it, and no time was saved.) -*/ -#define vS1 %xmm0 -#define vS2 %xmm1 -#define vS3 %xmm2 -#if defined __i386__ - #define vExpandedKey %xmm3 - #define vIncrement %xmm4 -#endif - - // Get address of expanded key. - mov ArgContext, ExpandedKey - #if 0 != ContextKey - add $ContextKey, ExpandedKey - #endif - -/* Store sentinel value of ExpandedKey on the stack on i386, in a register on - x86_64. -*/ -#define ExpandedKeyEnd Arch(5*16(r4), r11) - - // Get and check "key length". - movzx ContextKeyLength(ExpandedKey), r0 - cmp $160, r0 - je 2f - cmp $192, r0 - je 2f - cmp $224, r0 - je 2f - mov $-1, r0 // Return error. - jmp 9f -2: - - #if (Select == 0 || Select == 2) - // For encryption, prepare to iterate forward through expanded key. - add ExpandedKey, r0 - mov r0, ExpandedKeyEnd - #else - // For decryption, prepare to iterate backward through expanded key. - mov ExpandedKey, ExpandedKeyEnd - add r0, ExpandedKey - #endif - - // Initialize State from input text. - #if defined __i386__ - mov ArgInputText, BaseP - #define InputText BaseP - #endif - mov 0*4(InputText), Work0d - mov 1*4(InputText), S1 - mov 2*4(InputText), S2 - mov 3*4(InputText), S3 -#undef InputText // Register is reused after this for other purposes. - - // Add round key and save results. - xor 0*4(ExpandedKey), Work0d // S0 is in dissection register. - xor 1*4(ExpandedKey), S1 - movd S1, vS1 // Save S1 to S3 in vector registers. - xor 2*4(ExpandedKey), S2 - movd S2, vS2 - xor 3*4(ExpandedKey), S3 - movd S3, vS3 - - add $Increment, ExpandedKey // Advance to next round key. - - #if defined __i386__ - // Save expanded key address and increment in vector registers. - mov $Increment, t1 - movp ExpandedKey, vExpandedKey - movp t1, vIncrement - #endif - - // Set up relative addressing. - #if defined __i386__ - - // Get address of 0 in BaseP. - call 0f // Push program counter onto stack. - 0: - pop BaseP // Get program counter. - - // Define macros to help address data. -#define LookupM(table, index) MTable-0b+(table)*TableSize(BaseP, index, 4) -#define LookupF(table, index) FTable-0b+(table)*TableSize(BaseP, index, 4) - - #elif defined __x86_64__ - - lea MTable(%rip), BaseP - - // Define macros to help address data. - #define LookupM(table, index) (table)*TableSize(BaseP, index, 4) - #define LookupF(table, index) (table)*TableSize(BaseP, index, 4) - -/* With these definitions of LookupM and LookupF, BaseP must be loaded with - the address of the table at the point where it is used. So we need an - instruction to change BaseP after we are done with MTable and before we - start using FTable. I would prefer to use something like: - - .set FMinusM, FTable - MTable - #define LookupF(table, index) \ - FMinusM+(table)*TableSize(BaseP, index, 4) - - Then BaseP would not need to change. However, this fails due to an - assembler/linker bug, Radar 5683882. -*/ - - #endif - - // Get round key.
- mov 0*4(ExpandedKey), S0 - mov 1*4(ExpandedKey), S1 - mov 2*4(ExpandedKey), S2 - mov 3*4(ExpandedKey), S3 - -1: - /* Word 0 of the current state must be in Work0 now, and the next round - key must be in S0 to S3. - */ - - // Process previous S0. - movzx Work0l, t0 - xor LookupM(0, t0), S0 - movzx Work0h, t0d - xor LookupM(1, t0), P3 - shr $16, Work0d - movzx Work0l, t0d - xor LookupM(2, t0), S2 - movzx Work0h, t0d - xor LookupM(3, t0), P1 - - // Process previous S1. - movd vS1, Work0d - movzx Work0l, t0d - xor LookupM(0, t0), S1 - movzx Work0h, t0d - xor LookupM(1, t0), P0 - shr $16, Work0d - movzx Work0l, t0d - xor LookupM(2, t0), S3 - movzx Work0h, t0d - xor LookupM(3, t0), P2 - - // Process previous S2. - movd vS2, Work0d - movzx Work0l, t0d - xor LookupM(0, t0), S2 - movzx Work0h, t0d - xor LookupM(1, t0), P1 - shr $16, Work0d - movzx Work0l, t0d - xor LookupM(2, t0), S0 - movzx Work0h, t0d - xor LookupM(3, t0), P3 - - // Process previous S3. - movd vS3, Work0d - movzx Work0l, t0d - xor LookupM(0, t0), S3 - movzx Work0h, t0d - xor LookupM(1, t0), P2 - shr $16, Work0d - movzx Work0l, t0d - xor LookupM(2, t0), S1 - movzx Work0h, t0d - xor LookupM(3, t0), P0 - - #if defined __i386__ - paddd vIncrement, vExpandedKey - movp vExpandedKey, ExpandedKey - #else - add $Increment, ExpandedKey - #endif - - // Save state for next iteration and load next round key. - mov S0, Work0d - mov 0*4(ExpandedKey), S0 - movd S1, vS1 - mov 1*4(ExpandedKey), S1 - movd S2, vS2 - mov 2*4(ExpandedKey), S2 - movd S3, vS3 - mov 3*4(ExpandedKey), S3 - - cmp ExpandedKeyEnd, ExpandedKey - jne 1b - - /* Word 0 of the current state must be in Work0 now, and the next round - key must be in S0 to S3. - */ - - // Work around assembler bug. See comments above about Radar 5683882. - #if defined __x86_64__ - lea FTable(%rip), BaseP - #endif - - // Process previous S0. - movzx Work0l, t0 - xor LookupF(0, t0), S0 - movzx Work0h, t0d - xor LookupF(1, t0), P3 - shr $16, Work0d - movzx Work0l, t0d - xor LookupF(2, t0), S2 - movzx Work0h, t0d - xor LookupF(3, t0), P1 - - // Process previous S1. - movd vS1, Work0d - movzx Work0l, t0d - xor LookupF(0, t0), S1 - movzx Work0h, t0d - xor LookupF(1, t0), P0 - shr $16, Work0d - movzx Work0l, t0d - xor LookupF(2, t0), S3 - movzx Work0h, t0d - xor LookupF(3, t0), P2 - - // Process previous S2. - movd vS2, Work0d - movzx Work0l, t0d - xor LookupF(0, t0), S2 - movzx Work0h, t0d - xor LookupF(1, t0), P1 - shr $16, Work0d - movzx Work0l, t0d - xor LookupF(2, t0), S0 - movzx Work0h, t0d - xor LookupF(3, t0), P3 - - // Process previous S3. - movd vS3, Work0d - movzx Work0l, t0d - xor LookupF(0, t0), S3 - movzx Work0h, t0d - xor LookupF(1, t0), P2 - shr $16, Work0d - movzx Work0l, t0d - xor LookupF(2, t0), S1 - movzx Work0h, t0d - xor LookupF(3, t0), P0 - - #if defined __i386__ // Architecture. - // Get OutputText address. - #define OutputText BaseP - mov ArgOutputText, OutputText - #endif // Architecture. - - // Write output. - mov S0, 0*4(OutputText) - mov S1, 1*4(OutputText) - mov S2, 2*4(OutputText) - mov S3, 3*4(OutputText) - - xor r0, r0 // Return success. - -9: - // Pop stack and restore registers. 
#ifdef KERNEL -#if Select < 2 -#if defined __i386__ - movaps 4*16(r4), %xmm4 - movaps 3*16(r4), %xmm3 -#endif - movaps 2*16(r4), %xmm2 - movaps 1*16(r4), %xmm1 - movaps 0*16(r4), %xmm0 -#endif // Select -#endif // KERNEL - #if 0 < LocalsSize - add $Padding + LocalsSize, r4 - #endif - #if defined __i386__ - pop r7 - pop r6 - #elif defined __x86_64__ - #endif - pop r3 - pop r5 - - ret - - -#undef ArgExpandedKey -#undef ArgInputText -#undef ArgNr -#undef ArgOutputText -#undef Argument -#undef BaseP -#undef ExpandedKey -#undef ExpandedKeyEnd -#undef FTable -#undef InputText -#undef LocalsSize -#undef LookupM -#undef LookupF -#undef MTable -#undef OutputText -#undef Padding -#undef SaveSize -#undef S0 -#undef S1 -#undef S2 -#undef S3 -#undef StackFrame -#undef Work0d -#undef Work0h -#undef Work0l -#undef Work1d -#undef Work1h -#undef Work1l -#undef Work2d -#undef Work2h -#undef Work2l -#undef Work3d -#undef Work3h -#undef Work3l -#undef t0 -#undef t0d -#undef t0l -#undef t1 -#undef vExpandedKey -#undef vS1 -#undef vS2 -#undef vS3 - -#undef Name -#undef MTable -#undef FTable -#undef P0 -#undef P1 -#undef P2 -#undef P3 -#undef Increment diff --git a/bsd/crypto/aes/i386/ExpandKeyForDecryption.s b/bsd/crypto/aes/i386/ExpandKeyForDecryption.s deleted file mode 100644 index 457508a9a..000000000 --- a/bsd/crypto/aes/i386/ExpandKeyForDecryption.s +++ /dev/null @@ -1,1214 +0,0 @@ -/* This file defines _aes_decrypt_key, _aes_decrypt_key128, - _aes_decrypt_key192, and _aes_decrypt_key256. It is designed to be - included in another assembly file with the preprocessor #include directive, - to benefit from some assembly-time calculations. - - Written by Eric Postpischil, January 2008. - - The comments here do not say much about the algorithm; the code just - follows the FIPS-197 specification. I recommend reading the specification - before working with this code or examining the C code in the parent - directory that illustrates key expansion. - - One complication is that this routine both expands the key and applies - InvMixColumn to most of the words in the expanded key. This modifies the - key for use with the Equivalent Inverse Cipher. - - During key expansion, there are sequences of four or six words that are - produced like this: - - E[i+0] = E[i+0-Nk] ^ f(E[i-1]), where f is some function. - E[i+1] = E[i+1-Nk] ^ E[i+0]. - E[i+2] = E[i+2-Nk] ^ E[i+1]. - E[i+3] = E[i+3-Nk] ^ E[i+2]. - - When Nk is four or eight, the sequence stops there. When it is six, it - goes on for two more words. Let I be the InvMixColumn function. For the - Equivalent Inverse Cipher, we want to store I(E[i+0]), I(E[i+1]), - I(E[i+2]), I(E[i+3]) (and two more when Nk is six). However, we do not - need to calculate I four times. In AES' finite field, I is a linear - combination of the four bytes of its input. The ^ operation on the bits - that represent field elements is an addition in the Galois field. So - I(a ^ b) = I(a) ^ I(b). Then we have: - - I(E[i+0]) = I(E[i+0-Nk] ^ f(E[i-1])) = I(E[i+0-Nk]) ^ I(f(E[i-1])). - I(E[i+1]) = I(E[i+1-Nk]) ^ I(E[i+0]). - I(E[i+2]) = I(E[i+2-Nk]) ^ I(E[i+1]). - I(E[i+3]) = I(E[i+3-Nk]) ^ I(E[i+2]). - - To compute this, we compute I(f(E[i-1])) and XOR it with the previously - stored I(E[i+0-Nk]) to get I(E[i+0]). Then we XOR that with the previously - stored I(E[i+1-Nk]) to get I(E[i+1]), and so on. - - Note that to compute I(f(E[i-1])), we need to have E[i-1]. So we have to - compute the pre-InvMixColumn words of the expanded key; it is not - sufficient to have the post-InvMixColumn words.
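   As a concrete sketch of these recurrences (illustrative C under FIPS-197
   names, not the original implementation; it covers Nk = 4 or 6, assumes
   SubWord, RotWord, Rcon, and InvMixColumn as defined in FIPS-197, and
   glosses over the fact that the Equivalent Inverse Cipher uses the first
   and last round keys untransformed):

       static void ExpandDecryptKey(const Word UserKey[], int Nk, int Nr,
                                    Word E[], Word D[])
       {
           for (int i = 0; i < Nk; i++) {
               E[i] = UserKey[i];
               D[i] = InvMixColumn(E[i]);            // D[i] is I(E[i]) throughout
           }
           for (int i = Nk; i < 4 * (Nr + 1); i++) {
               if (i % Nk == 0) {
                   Word t = SubWord(RotWord(E[i-1])) ^ Rcon[i / Nk];
                   E[i] = E[i-Nk] ^ t;               // plain words, needed to form t
                   D[i] = D[i-Nk] ^ InvMixColumn(t); // only one I() per group
               } else {
                   E[i] = E[i-Nk] ^ E[i-1];
                   D[i] = D[i-Nk] ^ D[i-1];          // linearity: I(a ^ b) = I(a) ^ I(b)
               }
           }
       }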
-*/ - - -/* Routine: - - _aes_decrypt_key. - - _aes_decrypt_key128, _aes_decrypt_key192, and _aes_decrypt_key256. - - Function: - - Expand the user's cipher key into the key schedule, as defined in - Federal Information Processing Standards Publication 197 (FIPS-197), - November 26, 2001. - - For decryption, the key is modified as shown in Figure 15 in FIPS-197, - to support the Equivalent Inverse Cipher. - - Input: - - Constant data: - - The following names must be locally defined so the assembler - can calculate certain offsets. - - static const Word _AESSubBytesWordTable[4][256]. - - _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where - SubBytes is defined in FIPS-197. _AESSubBytesWordTable - differs from _AESEncryptTable in that it does not include - the MixColumn operation. It is used in performing the last - round, which differs from the previous rounds in that it - does not include the MixColumn operation. - - static const Word _AESInvMixColumnTable[4][256]. - - _AESInvMixColumnTable[i][j] contains the contribution of byte - j to element i of the InvMixColumn operation. - - The four bytes of the word _AESInvMixColumnTable[0][j] are: - - {0xe}*{j}, {0x9}*{j}, {0xd}*{j}, {0xb}*{j}, - - listed in increasing address order, where multiplication is - performed in the Galois field. {j} designates the element of - the Galois field represented by j. _AESInvMixColumnTable[i][j] has - the same bytes, rotated right in the order shown above. - - static const Byte _AESRcon[]. - - Round constants, beginning with AESRcon[1] for the first round - (AESRcon[0] is padding.) - - Arguments: - - const uint8_t *Key - - Address of user's cipher key. - - int Length - - Number of bytes (16, 24, or 32) or bits (128, 192, or 256) in - user's cipher key. - - This argument is used with _aes_decrypt_key. It is not - present for the other routines. In those routines, Context - is the second argument. - - aes_decrypt_ctx *Context - - Structure to contain the expanded key beginning at offset - ContextKey and a four-byte "key length" beginning at offset - ContextKeyLength. The "key length" is the number of bytes from - the start of the first round key to the start of the last round - key. That is 16 less than the number of bytes in the entire - key. - - Output: - - The expanded key and the "key length" are written to *Context. - - Return: - - aes_rval // -1 if "key length" is invalid. 0 otherwise. -*/ -/* add AES HW detection and program branch if AES HW is detected cclee 3-12-10 */ - -#ifdef KERNEL -#include <i386/cpu_capabilities.h> -#else -#include <System/i386/cpu_capabilities.h> -#endif - -#define dr r0d // Dissection register. -#define drl r0l // Low 8 bits of dissection register. -#define drh r0h // Second-lowest 8 bits of dissection register. - -#define t0 r1 -#define t0d r1d // Low 32 bits of t0. - -#define STable r2 // Address of SubBytes table. Overlaps Nk. -#define ITable r3 // Address of InvMixColumn table. -#define offset Arch(r5, r11) // Address offset and loop sentinel. - -#define R r7 // Address of round constant. -#define K r7 // User key pointer. - // R and K overlap. - -#define E r6 // Expanded key pointer. - -#define ve0 %xmm0 -#define ve1 %xmm1 -#define ve2 %xmm2 -#define ve3 %xmm3 -#define ve4 %xmm4 -#define ve5 %xmm5 -#define vt1 %xmm6 -#define vt0 %xmm7 - -#define LookupS(table, index) (table)*TableSize(STable, index, 4) -#define LookupI(table, index) (table)*TableSize(ITable, index, 4) - - -/* InvMixColumn puts InvMixColumn(dr) into vt0. This is a non-standard - subroutine. It does not conform to the ABI.
It is an integral part of - _ExpandKeyForDecryption and shares register use with it. -*/ -InvMixColumn: - movzx drl, t0 - movd LookupI(0, t0), vt0 // Look up byte 0 in table 0. - movzx drh, t0d - movd LookupI(1, t0), vt1 // Look up byte 1 in table 1. - pxor vt1, vt0 - shr $16, dr - movzx drl, t0d - movd LookupI(2, t0), vt1 // Look up byte 2 in table 2. - pxor vt1, vt0 - movzx drh, t0d - movd LookupI(3, t0), vt1 // Look up byte 3 in table 3. - pxor vt1, vt0 - ret - - - // SubWordRotWord adds (XORs) SubWord(RotWord(dr)) to vt0. - .macro SubWordRotWord - movzx drl, t0 - movd LookupS(3, t0), vt1 // Look up byte 0 in table 3. - pxor vt1, vt0 - movzx drh, t0d - movd LookupS(0, t0), vt1 // Look up byte 1 in table 0. - pxor vt1, vt0 - shr $$16, dr - movzx drl, t0d - movd LookupS(1, t0), vt1 // Look up byte 2 in table 1. - pxor vt1, vt0 - movzx drh, t0d - movd LookupS(2, t0), vt1 // Look up byte 3 in table 2. - pxor vt1, vt0 - .endmacro - - - // SubWord puts SubWord(dr) into vt0. - .macro SubWord - movzx drl, t0 - movd LookupS(0, t0), vt0 // Look up byte 0 in table 0. - movzx drh, t0d - movd LookupS(1, t0), vt1 // Look up byte 1 in table 1. - pxor vt1,vt0 - shr $$16, dr - movzx drl, t0d - movd LookupS(2, t0), vt1 // Look up byte 2 in table 2. - pxor vt1,vt0 - movzx drh, t0d - movd LookupS(3, t0), vt1 // Look up byte 3 in table 3. - pxor vt1,vt0 - .endmacro - - .text - .globl _aes_decrypt_key -// .private_extern _aes_decrypt_key -_aes_decrypt_key: - - // detect AES HW, cclee 3-13-10 -#if defined __x86_64__ - movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities - mov (%rax), %eax // %eax = __cpu_capabilities -#else -#if defined KERNEL - leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities - mov (%eax), %eax // %eax = __cpu_capabilities -#else - mov _COMM_PAGE_CPU_CAPABILITIES, %eax -#endif - -#endif - test $(kHasAES), %eax // __cpu_capabilities & kHasAES - jne _aes_decrypt_key_hw // if AES HW detected, branch to _aes_decrypt_key_hw - /* Save registers and set SaveSize to the number of bytes pushed onto the - stack so far, including the caller's return address. - */ - push r3 - #if defined __i386__ - push r5 - push r6 - push r7 - #define SaveSize (5*4) - #else - #define SaveSize (2*8) - #endif - - /* Number of bytes used for local variables: - - 8 16-byte spaces to save XMM registers. - - 8 four-byte spaces for work. - */ - #define LocalsSize (8*16 + 8*4) - - // Define stack offset to storage space for local data. - #define Local (8*16) - - #if 0 < LocalsSize - // Padding to position stack pointer at a multiple of 16 bytes. - #define Padding (15 & -(SaveSize + LocalsSize)) - sub $Padding + LocalsSize, r4 // Allocate space on stack. - #else - #define Padding 0 - #endif - - /* StackFrame is the number of bytes in our stack frame, from caller's - stack pointer to ours (so it includes the return address). - */ - #define StackFrame (SaveSize + Padding + LocalsSize) - - // Save xmm registers. - movaps %xmm0, 0*16(r4) - movaps %xmm1, 1*16(r4) - movaps %xmm2, 2*16(r4) - movaps %xmm3, 3*16(r4) - movaps %xmm4, 4*16(r4) - movaps %xmm5, 5*16(r4) - movaps %xmm6, 6*16(r4) - movaps %xmm7, 7*16(r4) - -#if defined __i386__ - - // Define location of argument i. - #define Argument(i) StackFrame+4*(i)(r4) - - #define Nk t0d - - // Load arguments. - mov Argument(2), E - mov Argument(1), Nk - mov Argument(0), K - -#elif defined __x86_64__ - - #define Nk r9d // Number of words in key. - mov r6d, Nk // Move Nk argument out of way. - mov r2, E // Move E argument to common register. 
- -#endif - - // Dispatch on key length. - cmp $128, Nk - jge 2f - shl $3, Nk // Convert from bytes to bits. - cmp $128, Nk -2: - je DKeyHas4Words - cmp $192, Nk - je DKeyHas6Words - cmp $256, Nk - je DKeyHas8Words - mov $-1, r0 // Return error. - jmp 9f - - - .globl _aes_decrypt_key128 -// .private_extern _aes_decrypt_key128 -_aes_decrypt_key128: - - /* Save registers and set SaveSize to the number of bytes pushed onto the - stack so far, including the caller's return address. - */ - push r3 - #if defined __i386__ - push r5 - push r6 - push r7 - #define SaveSize (5*4) - #else - #define SaveSize (2*8) - #endif - - /* Number of bytes used for local variables: - - 8 16-byte spaces to save XMM registers. - - 8 four-byte spaces for work. - */ - #define LocalsSize (8*16 + 8*4) - - // Define stack offset to storage space for local data. - #define Local (8*16) - - #if 0 < LocalsSize - // Padding to position stack pointer at a multiple of 16 bytes. - #define Padding (15 & -(SaveSize + LocalsSize)) - sub $Padding + LocalsSize, r4 // Allocate space on stack. - #else - #define Padding 0 - #endif - - /* StackFrame is the number of bytes in our stack frame, from caller's - stack pointer to ours (so it includes the return address). - */ - #define StackFrame (SaveSize + Padding + LocalsSize) - - // Save xmm registers. - movaps %xmm0, 0*16(r4) - movaps %xmm1, 1*16(r4) - movaps %xmm2, 2*16(r4) - movaps %xmm3, 3*16(r4) - movaps %xmm4, 4*16(r4) - movaps %xmm5, 5*16(r4) - movaps %xmm6, 6*16(r4) - movaps %xmm7, 7*16(r4) - -#if defined __i386__ - - // Load arguments. - #define Argument(i) StackFrame+4*(i)(r4) - mov Argument(1), E - mov Argument(0), K - -#endif - -// Merge point for _aes_decrypt_key and _aes_decrypt_key128. -DKeyHas4Words: - - // First words of expanded key are copied from user key. - movd 0*4(K), ve0 - movd 1*4(K), ve1 - movd 2*4(K), ve2 - movd 3*4(K), ve3 - - movl $10*16, ContextKeyLength(E) // Set "key length." - - #if 0 != ContextKey - add $ContextKey, E - #endif - - // K cannot be used after we write to R, since they use the same register. - - #if defined __i386__ - - lea _AESRcon, R - lea _AESInvMixColumnTable, ITable - lea _AESSubBytesWordTable, STable - - #elif defined __x86_64__ - - lea _AESRcon(%rip), R - lea _AESInvMixColumnTable(%rip), ITable - lea _AESSubBytesWordTable(%rip), STable - - #endif - - /* With a four-word key, there are ten rounds (eleven 16-byte key blocks), - nine of which have InvMixColumn applied. - */ - mov $-9*4*4, offset - sub offset, E - - // Store initial words of expanded key, which are copies of user's key. - movd ve0, 0*4(E, offset) - movd ve1, 1*4(E, offset) - movd ve2, 2*4(E, offset) - movd ve3, 3*4(E, offset) - -/* Here is the first iteration of the key expansion. It is separate from the - main loop below because we need to apply InvMixColumn to each of the - outputs, in ve0 through ve3. In the main loop, the technique described at - the top of this file is used to compute the proper outputs while using - InvMixColumn only once. -*/ - add $1, R // Advance pointer. - movd ve3, dr // Put previous word into work register. - movzx (R), t0d // Get round constant. - movd t0d, vt0 - - SubWordRotWord - pxor vt0, ve0 - - // Chain to successive words. - pxor ve0, ve1 - pxor ve1, ve2 - pxor ve2, ve3 - - add $4*4, offset - - /* Apply InvMixColumn to each word. The transformed values are stored in - the expanded key. The original values are retained in registers for - further computation. 
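In C, the lookup the InvMixColumn subroutine performs on each word amounts to the following sketch (the _AESInvMixColumnTable documented above; byte k of the word indexes table k, and the four results are XORed):

#include <stdint.h>

extern const uint32_t AESInvMixColumnTable[4][256];

static uint32_t InvMixColumnWord(uint32_t w)
{
    return AESInvMixColumnTable[0][w       & 0xff]   /* byte 0, table 0 */
         ^ AESInvMixColumnTable[1][w >>  8 & 0xff]   /* byte 1, table 1 */
         ^ AESInvMixColumnTable[2][w >> 16 & 0xff]   /* byte 2, table 2 */
         ^ AESInvMixColumnTable[3][w >> 24 & 0xff];  /* byte 3, table 3 */
}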
- */ - movd ve0, dr - call InvMixColumn - movd vt0, 0*4(E, offset) - - movd ve1, dr - call InvMixColumn - movd vt0, 1*4(E, offset) - - movd ve2, dr - call InvMixColumn - movd vt0, 2*4(E, offset) - - movd ve3, dr - call InvMixColumn - movd vt0, 3*4(E, offset) - -// Here is the main loop. -1: - add $1, R // Advance pointer. - movd ve3, dr // Put previous word into work register. - movzx (R), t0d // Get round constant. - movd t0d, vt0 - - SubWordRotWord - pxor vt0, ve0 - - // Chain to successive words. - pxor ve0, ve1 - pxor ve1, ve2 - pxor ve2, ve3 - /* Dr. Brian Gladman uses a technique with a single XOR here instead - of the previous four. There is some periodic behavior in the key - expansion, and Gladman maintains E[4*i+3] for the latest four - values of i. XORing the value in vt0 with one of these yields its - replacement. However, using this technique requires additional - instructions before the loop (to initialize the values) and after - it (to extract the final values to be stored) and either some way - to rotate or index four values in the loop or a four-fold unrolling - of the loop to provide the indexing. Experiment suggests the - former is not worthwhile. Unrolling the loop might give a small - gain, at the cost of increased use of instruction cache, increased - instruction loads the first time the routine is executed, and - increased code complexity, so I decided against it. - */ - - // Apply InvMixColumn to the difference. - movd vt0, dr - call InvMixColumn - - add $4*4, offset - - // Chain the transformed difference to previously transformed outputs. - movd (0-4)*4(E, offset), vt1 - pxor vt1, vt0 - movd vt0, 0*4(E, offset) - - movd (1-4)*4(E, offset), vt1 - pxor vt1, vt0 - movd vt0, 1*4(E, offset) - - movd (2-4)*4(E, offset), vt1 - pxor vt1, vt0 - movd vt0, 2*4(E, offset) - - movd (3-4)*4(E, offset), vt1 - pxor vt1, vt0 - movd vt0, 3*4(E, offset) - - jl 1b - -// Here is the final iteration, which does not perform InvMixColumn. - - movd ve3, dr // Put previous word into work register. - movzx 1(R), t0d // Get round constant. - movd t0d, vt0 - - SubWordRotWord - pxor vt0, ve0 - - // Chain to successive words. - movd ve0, 4*4(E, offset) - pxor ve0, ve1 - movd ve1, 5*4(E, offset) - pxor ve1, ve2 - movd ve2, 6*4(E, offset) - pxor ve2, ve3 - movd ve3, 7*4(E, offset) - - xor r0, r0 // Return success. - -9: - // Pop stack and restore registers. - movaps 7*16(r4), %xmm7 - movaps 6*16(r4), %xmm6 - movaps 5*16(r4), %xmm5 - movaps 4*16(r4), %xmm4 - movaps 3*16(r4), %xmm3 - movaps 2*16(r4), %xmm2 - movaps 1*16(r4), %xmm1 - movaps 0*16(r4), %xmm0 - #if 0 < LocalsSize - add $Padding + LocalsSize, r4 - #endif - #if defined __i386__ - pop r7 - pop r6 - pop r5 - #endif - pop r3 - - ret - - - .globl _aes_decrypt_key192 -// .private_extern _aes_decrypt_key192 -_aes_decrypt_key192: - - /* Save registers and set SaveSize to the number of bytes pushed onto the - stack so far, including the caller's return address. - */ - push r3 - #if defined __i386__ - push r5 - push r6 - push r7 - #define SaveSize (5*4) - #else - #define SaveSize (2*8) - #endif - - /* Number of bytes used for local variables: - - 8 16-byte spaces to save XMM registers. - - 8 four-byte spaces for work. - */ - #define LocalsSize (8*16 + 8*4) - - // Define stack offset to storage space for local data. - #define Local (8*16) - - #if 0 < LocalsSize - // Padding to position stack pointer at a multiple of 16 bytes. - #define Padding (15 & -(SaveSize + LocalsSize)) - sub $Padding + LocalsSize, r4 // Allocate space on stack.
- #else - #define Padding 0 - #endif - - /* StackFrame is the number of bytes in our stack frame, from caller's - stack pointer to ours (so it includes the return address). - */ - #define StackFrame (SaveSize + Padding + LocalsSize) - - // Save xmm registers. - movaps %xmm0, 0*16(r4) - movaps %xmm1, 1*16(r4) - movaps %xmm2, 2*16(r4) - movaps %xmm3, 3*16(r4) - movaps %xmm4, 4*16(r4) - movaps %xmm5, 5*16(r4) - movaps %xmm6, 6*16(r4) - movaps %xmm7, 7*16(r4) - -#if defined __i386__ - - // Load arguments. - #define Argument(i) StackFrame+4*(i)(r4) - mov Argument(1), E - mov Argument(0), K - -#endif - -// Merge point for _aes_decrypt_key and _aes_decrypt_key192. -DKeyHas6Words: - - // First words of expanded key are copied from user key. - movd 0*4(K), ve0 - movd 1*4(K), ve1 - movd 2*4(K), ve2 - movd 3*4(K), ve3 - - movl $12*16, ContextKeyLength(E) // Set "key length." - - #if 0 != ContextKey - add $ContextKey, E - #endif - - movd 4*4(K), ve4 - movd 5*4(K), ve5 - - // K cannot be used after we write to R, since they use the same register. - - #if defined __i386__ - - lea _AESRcon, R - lea _AESInvMixColumnTable, ITable - lea _AESSubBytesWordTable, STable - - #elif defined __x86_64__ - - lea _AESRcon(%rip), R - lea _AESInvMixColumnTable(%rip), ITable - lea _AESSubBytesWordTable(%rip), STable - - #endif - - /* With a six-word key, there are twelve rounds (thirteen 16-byte key - blocks), eleven of which have InvMixColumn applied. The key expansion - proceeds in iterations of six four-byte words, so the termination - condition is a bit complicated. We set offset to the negative of 10 - blocks of four four-byte words, and the loop branch does another - iteration if - offset is less than or equal to zero, meaning the number of iterations - performed so far is less than or equal to 10. Thus, after ten - iterations, it branches again. After the eleventh iteration, it - stops. Code after the end of the loop computes the twelfth key block, - which does not have InvMixColumn applied. - */ - mov $-10*4*4, offset - sub offset, E - - // Store initial words of expanded key, which are copies of user's key. - movd ve0, 0*4(E, offset) - movd ve1, 1*4(E, offset) - movd ve2, 2*4(E, offset) - movd ve3, 3*4(E, offset) - - /* The first four words are stored untransformed. After that, words in - the expanded key are transformed by InvMixColumn. - */ - movd ve4, dr - call InvMixColumn - movd vt0, 4*4(E, offset) - - movd ve5, dr - call InvMixColumn - movd vt0, 5*4(E, offset) - -/* Here is the first iteration of the key expansion. It is separate from the - main loop below because we need to apply InvMixColumn to each of the - outputs, in ve0 through ve5. In the main loop, the technique described at - the top of this file is used to compute the proper outputs while using - InvMixColumn only once. -*/ - add $1, R // Advance pointer. - movd ve5, dr // Put previous word into work register. - movzx (R), t0d // Get round constant. - movd t0d, vt0 - - SubWordRotWord - pxor vt0, ve0 - - // Chain to successive words. - pxor ve0, ve1 - pxor ve1, ve2 - pxor ve2, ve3 - pxor ve3, ve4 - pxor ve4, ve5 - - add $6*4, offset - - /* Apply InvMixColumn to each word. The transformed values are stored in - the expanded key. The original values are retained in registers for - further computation.
- */ - movd ve0, dr - call InvMixColumn - movd vt0, 0*4(E, offset) - - movd ve1, dr - call InvMixColumn - movd vt0, 1*4(E, offset) - - movd ve2, dr - call InvMixColumn - movd vt0, 2*4(E, offset) - - movd ve3, dr - call InvMixColumn - movd vt0, 3*4(E, offset) - - movd (4-6)*4(E, offset), vt1 - pxor vt1, vt0 - movd vt0, 4*4(E, offset) - - movd (5-6)*4(E, offset), vt1 - pxor vt1, vt0 - movd vt0, 5*4(E, offset) - -// Here is the main loop. -1: - add $1, R // Advance pointer. - movd ve5, dr // Put previous word into work register. - movzx (R), t0d // Get round constant. - movd t0d, vt0 - - SubWordRotWord - pxor vt0, ve0 - - // Chain to successive words. - pxor ve0, ve1 - pxor ve1, ve2 - pxor ve2, ve3 - pxor ve3, ve4 - pxor ve4, ve5 - - // Apply InvMixColumn to the difference. - movd vt0, dr - call InvMixColumn - - add $6*4, offset - - // Chain the transformed difference to previously transformed outputs. - movd (0-6)*4(E, offset), vt1 - pxor vt1, vt0 - movd vt0, 0*4(E, offset) - - movd (1-6)*4(E, offset), vt1 - pxor vt1, vt0 - movd vt0, 1*4(E, offset) - - movd (2-6)*4(E, offset), vt1 - pxor vt1, vt0 - movd vt0, 2*4(E, offset) - - movd (3-6)*4(E, offset), vt1 - pxor vt1, vt0 - movd vt0, 3*4(E, offset) - - movd (4-6)*4(E, offset), vt1 - pxor vt1, vt0 - movd vt0, 4*4(E, offset) - - movd (5-6)*4(E, offset), vt1 - pxor vt1, vt0 - movd vt0, 5*4(E, offset) - - jle 1b - -// Here is the final iteration, which does not perform InvMixColumn. - - movd ve5, dr // Put previous word into work register. - movzx 1(R), t0d // Get round constant. - movd t0d, vt0 - - SubWordRotWord - pxor vt0, ve0 - - // Chain to successive words. - movd ve0, 6*4(E, offset) - pxor ve0, ve1 - movd ve1, 7*4(E, offset) - pxor ve1, ve2 - movd ve2, 8*4(E, offset) - pxor ve2, ve3 - movd ve3, 9*4(E, offset) - - xor r0, r0 // Return success. - - // Pop stack and restore registers. - movaps 7*16(r4), %xmm7 - movaps 6*16(r4), %xmm6 - movaps 5*16(r4), %xmm5 - movaps 4*16(r4), %xmm4 - movaps 3*16(r4), %xmm3 - movaps 2*16(r4), %xmm2 - movaps 1*16(r4), %xmm1 - movaps 0*16(r4), %xmm0 - #if 0 < LocalsSize - add $Padding + LocalsSize, r4 - #endif - #if defined __i386__ - pop r7 - pop r6 - pop r5 - #endif - pop r3 - - ret - - - .globl _aes_decrypt_key256 -// .private_extern _aes_decrypt_key256 -_aes_decrypt_key256: - - /* Save registers and set SaveSize to the number of bytes pushed onto the - stack so far, including the caller's return address. - */ - push r3 - #if defined __i386__ - push r5 - push r6 - push r7 - #define SaveSize (5*4) - #else - #define SaveSize (2*8) - #endif - - /* Number of bytes used for local variables: - - 8 16-byte spaces to save XMM registers. - - 8 four-byte spaces for work. - */ - #define LocalsSize (8*16 + 8*4) - - // Define stack offset to storage space for local data. - #define Local (8*16) - - #if 0 < LocalsSize - // Padding to position stack pointer at a multiple of 16 bytes. - #define Padding (15 & -(SaveSize + LocalsSize)) - sub $Padding + LocalsSize, r4 // Allocate space on stack. - #else - #define Padding 0 - #endif - - /* StackFrame is the number of bytes in our stack frame, from caller's - stack pointer to ours (so it includes the return address). - */ - #define StackFrame (SaveSize + Padding + LocalsSize) - - // Save xmm registers. - movaps %xmm0, 0*16(r4) - movaps %xmm1, 1*16(r4) - movaps %xmm2, 2*16(r4) - movaps %xmm3, 3*16(r4) - movaps %xmm4, 4*16(r4) - movaps %xmm5, 5*16(r4) - movaps %xmm6, 6*16(r4) - movaps %xmm7, 7*16(r4) - -#if defined __i386__ - - // Load arguments. 
- #define Argument(i) StackFrame+4*(i)(r4) - mov Argument(1), E - mov Argument(0), K - -#endif - -// Merge point for _aes_decrypt_key and _aes_decrypt_key256. -DKeyHas8Words: - - // First words of expanded key are copied from user key. - movd 0*4(K), ve0 - movd 1*4(K), ve1 - movd 2*4(K), ve2 - movd 3*4(K), ve3 - - movl $14*16, ContextKeyLength(E) // Set "key length." - - #if 0 != ContextKey - add $ContextKey, E - #endif - - // Store initial words of expanded key, which are copies of user's key. - movd ve0, 0*4(E) - movd ve1, 1*4(E) - movd ve2, 2*4(E) - movd ve3, 3*4(E) - movd 4*4(K), ve0 - movd 5*4(K), ve1 - movd 6*4(K), ve2 - movd 7*4(K), ve3 - - // K cannot be used after we write to R, since they use the same register. - - #if defined __i386__ - - lea _AESRcon, R - lea _AESInvMixColumnTable, ITable - lea _AESSubBytesWordTable, STable - - #elif defined __x86_64__ - - lea _AESRcon(%rip), R - lea _AESInvMixColumnTable(%rip), ITable - lea _AESSubBytesWordTable(%rip), STable - - #endif - - /* With an eight-word key, there are fourteen rounds (fifteen 16-byte key - blocks), thirteen of which have InvMixColumn applied. - */ - mov $-12*4*4, offset - sub offset, E - - // Save untransformed values in stack area. - movd ve0, 4*4+Local(r4) - movd ve1, 5*4+Local(r4) - movd ve2, 6*4+Local(r4) - movd ve3, 7*4+Local(r4) - - /* Apply InvMixColumn to words 4 through 7. The transformed values are - stored in the expanded key. The original values are saved in the stack - area for further computation. - */ - movd ve0, dr - call InvMixColumn - movd vt0, 4*4(E, offset) - - movd ve1, dr - call InvMixColumn - movd vt0, 5*4(E, offset) - - movd ve2, dr - call InvMixColumn - movd vt0, 6*4(E, offset) - - movd ve3, dr - call InvMixColumn - movd vt0, 7*4(E, offset) - -/* Here is the first iteration of the key expansion. It is separate from the - main loop below because we need to apply InvMixColumn to each of the - outputs, in ve0 through ve3. In the main loop, the technique described at - the top of this file is used to compute the proper outputs while using - InvMixColumn only once. -*/ - add $1, R // Advance pointer. - movd ve3, dr // Put previous word into work register. - movzx (R), t0d // Get round constant. - movd t0d, vt0 - - SubWordRotWord - - add $8*4, offset - - movd (0-8)*4(E, offset), ve0 // Get old word. - pxor vt0, ve0 - movd ve0, 0*4+Local(r4) // Save on stack. - movd ve0, dr - call InvMixColumn - movd vt0, 0*4(E, offset) // Write to expanded key. - - /* Chain to successive words and apply InvMixColumn to each word. The - transformed values are stored in the expanded key. The original - values are retained in local data for further computation. - */ - movd (1-8)*4(E, offset), ve1 // Get old word. - pxor ve0, ve1 // Chain. - movd ve1, 1*4+Local(r4) // Save on stack. - movd ve1, dr - call InvMixColumn - movd vt0, 1*4(E, offset) // Write to expanded key. - - movd (2-8)*4(E, offset), ve2 // Get old word. - pxor ve1, ve2 // Chain. - movd ve2, 2*4+Local(r4) // Save on stack. - movd ve2, dr - call InvMixColumn - movd vt0, 2*4(E, offset) // Write to expanded key. - - movd (3-8)*4(E, offset), ve3 // Get old word. - pxor ve2, ve3 // Chain. - movd ve3, 3*4+Local(r4) // Save on stack. - movd ve3, dr - call InvMixColumn - movd vt0, 3*4(E, offset) // Write to expanded key. - - movd ve3, dr // Put previous word into work register. - SubWord - - movd 4*4+Local(r4), ve0 // Get old word. - pxor vt0, ve0 // Chain. - movd ve0, 4*4+Local(r4) // Save on stack. - - movd 5*4+Local(r4), ve1 // Get old word. 
- pxor ve0, ve1 // Chain. - movd ve1, 5*4+Local(r4) // Save on stack. - - movd 6*4+Local(r4), ve2 // Get old word. - pxor ve1, ve2 // Chain. - movd ve2, 6*4+Local(r4) // Save on stack. - - movd 7*4+Local(r4), ve3 // Get old word. - pxor ve2, ve3 // Chain. - movd ve3, 7*4+Local(r4) // Save on stack. - - movd vt0, dr // Move change to work register. - call InvMixColumn - - movd (4-8)*4(E, offset), vt1 // Get old word. - pxor vt1, vt0 // Chain. - movd vt0, 4*4(E, offset) // Write new word to expanded key. - - movd (5-8)*4(E, offset), vt1 // Get old word. - pxor vt1, vt0 // Chain. - movd vt0, 5*4(E, offset) // Write new word to expanded key. - - movd (6-8)*4(E, offset), vt1 // Get old word. - pxor vt1, vt0 // Chain. - movd vt0, 6*4(E, offset) // Write new word to expanded key. - - movd (7-8)*4(E, offset), vt1 // Get old word. - pxor vt1, vt0 // Chain. - movd vt0, 7*4(E, offset) // Write new word to expanded key. - -// Here is the main loop. -1: - add $1, R // Advance pointer. - movd ve3, dr // Put previous word into work register. - movzx (R), t0d // Get round constant. - movd t0d, vt0 - - SubWordRotWord - - movd 0*4+Local(r4), ve0 // Get old word. - pxor vt0, ve0 - movd ve0, 0*4+Local(r4) // Save on stack. - - // Chain to successive words. - movd 1*4+Local(r4), ve1 // Get old word. - pxor ve0, ve1 // Chain. - movd ve1, 1*4+Local(r4) // Save on stack. - - movd 2*4+Local(r4), ve2 // Get old word. - pxor ve1, ve2 // Chain. - movd ve2, 2*4+Local(r4) // Save on stack. - - movd 3*4+Local(r4), ve3 // Get old word. - pxor ve2, ve3 // Chain. - movd ve3, 3*4+Local(r4) // Save on stack. - - movd vt0, dr // Move change to work register. - call InvMixColumn - - movd 0*4(E, offset), vt1 // Get old word. - pxor vt1, vt0 // Chain. - movd vt0, (0+8)*4(E, offset) // Write new word to expanded key. - - movd 1*4(E, offset), vt1 // Get old word. - pxor vt1, vt0 // Chain. - movd vt0, (1+8)*4(E, offset) // Write new word to expanded key. - - movd 2*4(E, offset), vt1 // Get old word. - pxor vt1, vt0 // Chain. - movd vt0, (2+8)*4(E, offset) // Write new word to expanded key. - - movd 3*4(E, offset), vt1 // Get old word. - pxor vt1, vt0 // Chain. - movd vt0, (3+8)*4(E, offset) // Write new word to expanded key. - - movd ve3, dr // Put previous word into work register. - SubWord - - movd 4*4+Local(r4), ve0 // Get old word. - pxor vt0, ve0 // Chain. - movd ve0, 4*4+Local(r4) // Save on stack. - - movd 5*4+Local(r4), ve1 // Get old word. - pxor ve0, ve1 // Chain. - movd ve1, 5*4+Local(r4) // Save on stack. - - movd 6*4+Local(r4), ve2 // Get old word. - pxor ve1, ve2 // Chain. - movd ve2, 6*4+Local(r4) // Save on stack. - - movd 7*4+Local(r4), ve3 // Get old word. - pxor ve2, ve3 // Chain. - movd ve3, 7*4+Local(r4) // Save on stack. - - movd vt0, dr // Move change to work register. - call InvMixColumn - - movd 4*4(E, offset), vt1 // Get old word. - pxor vt1, vt0 // Chain. - movd vt0, (4+8)*4(E, offset) // Write new word to expanded key. - - movd 5*4(E, offset), vt1 // Get old word. - pxor vt1, vt0 // Chain. - movd vt0, (5+8)*4(E, offset) // Write new word to expanded key. - - movd 6*4(E, offset), vt1 // Get old word. - pxor vt1, vt0 // Chain. - movd vt0, (6+8)*4(E, offset) // Write new word to expanded key. - - movd 7*4(E, offset), vt1 // Get old word. - pxor vt1, vt0 // Chain. - movd vt0, (7+8)*4(E, offset) // Write new word to expanded key. - - add $8*4, offset - - jl 1b - - movd ve3, dr // Put previous word into work register. - movzx 1(R), t0d // Get round constant. 
- movd t0d, vt0 - - SubWordRotWord - - movd 0*4+Local(r4), ve0 // Get old word. - pxor vt0, ve0 // Chain. - movd ve0, (0+8)*4(E, offset) - - // Chain to successive words. - movd 1*4+Local(r4), ve1 // Get old word. - pxor ve0, ve1 // Chain. - movd ve1, (1+8)*4(E, offset) - - movd 2*4+Local(r4), ve2 // Get old word. - pxor ve1, ve2 // Chain. - movd ve2, (2+8)*4(E, offset) - - movd 3*4+Local(r4), ve3 // Get old word. - pxor ve2, ve3 // Chain. - movd ve3, (3+8)*4(E, offset) - - xor r0, r0 // Return success. - - // Pop stack and restore registers. - movaps 7*16(r4), %xmm7 - movaps 6*16(r4), %xmm6 - movaps 5*16(r4), %xmm5 - movaps 4*16(r4), %xmm4 - movaps 3*16(r4), %xmm3 - movaps 2*16(r4), %xmm2 - movaps 1*16(r4), %xmm1 - movaps 0*16(r4), %xmm0 - #if 0 < LocalsSize - add $Padding + LocalsSize, r4 - #endif - #if defined __i386__ - pop r7 - pop r6 - pop r5 - #endif - pop r3 - - ret - - -#undef Address -#undef Argument -#undef E -#undef ITable -#undef K -#undef Local -#undef LocalsSize -#undef LookupI -#undef LookupS -#undef Nk -#undef Padding -#undef R -#undef SaveSize -#undef STable -#undef StackFrame -#undef dr -#undef drh -#undef drl -#undef offset -#undef t0 -#undef t0d -#undef ve0 -#undef ve1 -#undef ve2 -#undef ve3 -#undef ve4 -#undef ve5 -#undef vt0 -#undef vt1 diff --git a/bsd/crypto/aes/i386/ExpandKeyForEncryption.s b/bsd/crypto/aes/i386/ExpandKeyForEncryption.s deleted file mode 100644 index 1ce3c9553..000000000 --- a/bsd/crypto/aes/i386/ExpandKeyForEncryption.s +++ /dev/null @@ -1,801 +0,0 @@ -/* This file defines _aes_encrypt_key, _aes_encrypt_key128, - _aes_encrypt_key192, and _aes_encrypt_key256. It is designed to be - included in another assembly file with the preprocessor #include directive, - to benefit from some assembly-time calculations. - - Written by Eric Postpischil, January 2008. - - The comments here do not say much about the algorithm; the code just - follows the FIPS-197 specification. I recommend reading the specification - before working with this code or examining the C code in the parent - directory that illustrates key expansion. -*/ - - -/* Routines: - - _aes_encrypt_key. - - _aes_encrypt_key128, _aes_encrypt_key192, and _aes_encrypt_key256. - - Function: - - Expand the user's cipher key into the key schedule, as defined in - Federal Information Processing Standards Publication 197 (FIPS-197), - November 26, 2001. - - Input: - - Constant data: - - The following names must be locally defined so the assembler - can calculate certain offsets. - - static const Word _AESSubBytesWordTable[4][256]. - - _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where - SubBytes is defined in FIPS-197. _AESSubBytesWordTable - differs from _AESEncryptTable in that it does not include - the MixColumn operation. It is used in performing the last - round, which differs from the previous rounds in that it - does not include the MixColumn operation. - - static const Byte _AESRcon[]. - - Round constants, beginning with AESRcon[1] for the first round - (AESRcon[0] is padding.) - - Arguments: - - const uint8_t *Key - - Address of user's cipher key. - - int Length - - Number of bytes (16, 24, or 32) or bits (128, 192, or 256) in - user's cipher key. - - This argument is used with _aes_encrypt_key. It is not - present for the other routines. In those routines, Context - is the second argument. - - aes_encrypt_ctx *Context - - Structure to contain the expanded key beginning at offset - ContextKey and a four-byte "key length" beginning at offset - ContextKeyLength.
The "key length" is the number of bytes from - the start of the first round key to the start of the last round - key. That is 16 less than the number of bytes in the entire - key. - - Output: - - The expanded key and the "key length" are written to *Context. - - Return: - - aes_rval // -1 if "key length" is invalid. 0 otherwise. -*/ - -/* add AES HW detection and program branch if AES HW is detected cclee 3-12-10 */ -#ifdef KERNEL -#include -#else -#include -#endif - - .text - .globl _aes_encrypt_key -// .private_extern _aes_encrypt_key -_aes_encrypt_key: - - // detect AES HW, cclee-3-13-10 -#if defined __x86_64__ - movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities - mov (%rax), %eax // %eax = __cpu_capabilities -#else -#if defined KERNEL - leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities - mov (%eax), %eax // %eax = __cpu_capabilities -#else - mov _COMM_PAGE_CPU_CAPABILITIES, %eax -#endif -#endif - test $(kHasAES), %eax // __cpu_capabilities & kHasAES - jne _aes_encrypt_key_hw // if AES HW detected, branch to _aes_encrypt_key_hw - -#define dr r0d // Dissection register. -#define drl r0l // Low 8 bits of dissection register. -#define drh r0h // Second-lowest 8 bits of dissection register. - -#define t0 r1 -#define t0d r1d // Low 32 bits of t0. - -#define offset Arch(r5, r11) // Address offset and loop sentinel. - -#define R r7 // Address of round constant. -#define K r7 // User key pointer. - // R and K overlap. - -#define E r6 // Expanded key pointer. - -#define ve0 %xmm0 -#define ve1 %xmm1 -#define ve2 %xmm2 -#define ve3 %xmm3 -#define vt3 %xmm4 -#define vt2 %xmm5 -#define vt1 %xmm6 -#define vt0 %xmm7 - -#if defined __i386__ - #define LookupS(table, index) \ - _AESSubBytesWordTable+(table)*TableSize(, index, 4) -#elif defined __x86_64__ - #define LookupS(table, index) (table)*TableSize(STable, index, 4) -#endif - - /* Save registers and set SaveSize to the number of bytes pushed onto the - stack so far, including the caller's return address. - */ - push r3 - #if defined __i386__ - push r5 - push r6 - push r7 - #define SaveSize (5*4) - #else - #define SaveSize (2*8) - #endif - - /* Number of bytes used for local variables: - - 8 16-byte spaces to save XMM registers. - */ - #define LocalsSize (8*16) - - #if 0 < LocalsSize - // Padding to position stack pointer at a multiple of 16 bytes. - #define Padding (15 & -(SaveSize + LocalsSize)) - sub $Padding + LocalsSize, r4 // Allocate space on stack. - #else - #define Padding 0 - #endif - - /* StackFrame is the number of bytes in our stack frame, from caller's - stack pointer to ours (so it includes the return address). - */ - #define StackFrame (SaveSize + Padding + LocalsSize) - - // Save xmm registers. - movaps %xmm0, 0*16(r4) - movaps %xmm1, 1*16(r4) - movaps %xmm2, 2*16(r4) - movaps %xmm3, 3*16(r4) - movaps %xmm4, 4*16(r4) - movaps %xmm5, 5*16(r4) - movaps %xmm6, 6*16(r4) - movaps %xmm7, 7*16(r4) - -#if defined __i386__ - - // Define location of argument i. - #define Argument(i) StackFrame+4*(i)(r4) - - #define Nk t0d - - // Load arguments. - mov Argument(2), E - mov Argument(1), Nk - mov Argument(0), K - -#elif defined __x86_64__ - - #define Nk r9d // Number of words in key. - mov r6d, Nk // Move Nk argument out of way. - mov r2, E // Move E argument to common register. - -#endif - - // Dispatch on key length. - cmp $128, Nk - jge 2f - shl $3, Nk // Convert from bytes to bits. 
- cmp $128, Nk -2: - je EKeyHas4Words - cmp $192, Nk - je EKeyHas6Words - cmp $256, Nk - je EKeyHas8Words - mov $-1, r0 // Return error. - jmp 9f - -// Stop using Nk. -#undef Nk - - .globl _aes_encrypt_key128 -// .private_extern _aes_encrypt_key128 -_aes_encrypt_key128: - - /* Save registers and set SaveSize to the number of bytes pushed onto the - stack so far, including the caller's return address. - */ - push r3 - #if defined __i386__ - push r5 - push r6 - push r7 - #define SaveSize (5*4) - #else - #define SaveSize (2*8) - #endif - - /* Number of bytes used for local variables: - - 8 16-byte spaces to save XMM registers. - */ - #define LocalsSize (8*16) - - #if 0 < LocalsSize - // Padding to position stack pointer at a multiple of 16 bytes. - #define Padding (15 & -(SaveSize + LocalsSize)) - sub $Padding + LocalsSize, r4 // Allocate space on stack. - #else - #define Padding 0 - #endif - - /* StackFrame is the number of bytes in our stack frame, from caller's - stack pointer to ours (so it includes the return address). - */ - #define StackFrame (SaveSize + Padding + LocalsSize) - - // Save xmm registers. - movaps %xmm0, 0*16(r4) - movaps %xmm1, 1*16(r4) - movaps %xmm2, 2*16(r4) - movaps %xmm3, 3*16(r4) - movaps %xmm4, 4*16(r4) - movaps %xmm5, 5*16(r4) - movaps %xmm6, 6*16(r4) - movaps %xmm7, 7*16(r4) - - #if defined __i386__ - - // Load arguments. - #define Argument(i) StackFrame+4*(i)(r4) - mov Argument(1), E - mov Argument(0), K - - #endif - -// Merge point for _aes_encrypt_key and _aes_encrypt_key128. -EKeyHas4Words: - -#define e0 r2d -#define e1 r3d -#define e2 Arch(r5d, r11d) -#define e3 r7d - - // First words of expanded key are copied from user key. - mov 0*4(K), e0 - mov 1*4(K), e1 - mov 2*4(K), e2 - mov 3*4(K), e3 - - movl $10*16, ContextKeyLength(E) // Set "key length." - - #if 0 != ContextKey - add $ContextKey, E - #endif - - // K cannot be used after we write to R, since they use the same register. - - // Cache round constants in output buffer. The last is a sentinel. - movb $0x01, 1*16(E) - movb $0x02, 2*16(E) - movb $0x04, 3*16(E) - movb $0x08, 4*16(E) - movb $0x10, 5*16(E) - movb $0x20, 6*16(E) - movb $0x40, 7*16(E) - movb $0x80, 8*16(E) - movb $0x1b, 9*16(E) - movb $0x36, 10*16(E) - - #if defined __x86_64__ - - #define STable r8 - lea _AESSubBytesWordTable(%rip), STable - - #endif - - // Store initial words of expanded key, which are copies of user's key. - mov e0, 0*4(E) - mov e1, 1*4(E) - mov e2, 2*4(E) - mov e3, 3*4(E) - -1: - mov e3, dr // Put previous word into dissection register. - - // Perform SubWord(RotWord(dr)). - movzx drl, t0 - xor LookupS(3, t0), e0 // Look up byte 0 in table 3. - movzx drh, t0d - xor LookupS(0, t0), e0 // Look up byte 1 in table 0. - shr $16, dr - movzx drl, t0d - xor LookupS(1, t0), e0 // Look up byte 2 in table 1. - movzx drh, t0d - xor LookupS(2, t0), e0 // Look up byte 3 in table 2. - - add $4*4, E - - movzx (E), t0d // Get cached round constant. - xor t0d, e0 // XOR with word from four words back. - - // Chain to successive words. - mov e0, 0*4(E) - xor e0, e1 - mov e1, 1*4(E) - xor e1, e2 - mov e2, 2*4(E) - xor e2, e3 - mov e3, 3*4(E) - - cmp $0x36, t0d // Was this the last round constant? - - jne 1b - - xor r0, r0 // Return success. - -9: - // Pop stack and restore registers. 
- movaps 7*16(r4), %xmm7 - movaps 6*16(r4), %xmm6 - movaps 5*16(r4), %xmm5 - movaps 4*16(r4), %xmm4 - movaps 3*16(r4), %xmm3 - movaps 2*16(r4), %xmm2 - movaps 1*16(r4), %xmm1 - movaps 0*16(r4), %xmm0 - #if 0 < LocalsSize - add $Padding + LocalsSize, r4 - #endif - #if defined __i386__ - pop r7 - pop r6 - pop r5 - #endif - pop r3 - - ret - - -// Reset definitions for next case. -#undef e0 -#undef e1 -#undef e2 -#undef e3 - -#undef vt3 -#undef vt2 -#define ve4 %xmm4 -#define ve5 %xmm5 - - - .globl _aes_encrypt_key192 -// .private_extern _aes_encrypt_key192 -_aes_encrypt_key192: - - /* Save registers and set SaveSize to the number of bytes pushed onto the - stack so far, including the caller's return address. - */ - push r3 - #if defined __i386__ - push r5 - push r6 - push r7 - #define SaveSize (5*4) - #else - #define SaveSize (2*8) - #endif - - /* Number of bytes used for local variables: - - 8 16-byte spaces to save XMM registers. - */ - #define LocalsSize (8*16) - - #if 0 < LocalsSize - // Padding to position stack pointer at a multiple of 16 bytes. - #define Padding (15 & -(SaveSize + LocalsSize)) - sub $Padding + LocalsSize, r4 // Allocate space on stack. - #else - #define Padding 0 - #endif - - /* StackFrame is the number of bytes in our stack frame, from caller's - stack pointer to ours (so it includes the return address). - */ - #define StackFrame (SaveSize + Padding + LocalsSize) - - // Save xmm registers. - movaps %xmm0, 0*16(r4) - movaps %xmm1, 1*16(r4) - movaps %xmm2, 2*16(r4) - movaps %xmm3, 3*16(r4) - movaps %xmm4, 4*16(r4) - movaps %xmm5, 5*16(r4) - movaps %xmm6, 6*16(r4) - movaps %xmm7, 7*16(r4) - - #if defined __i386__ - - // Load arguments. - #define Argument(i) StackFrame+4*(i)(r4) - mov Argument(1), E - mov Argument(0), K - - #endif - -// Merge point for _aes_encrypt_key and _aes_encrypt_key192. -EKeyHas6Words: - - // First words of expanded key are copied from user key. - movd 0*4(K), ve0 - movd 1*4(K), ve1 - movd 2*4(K), ve2 - movd 3*4(K), ve3 - - movl $12*16, ContextKeyLength(E) // Set "key length." - - #if 0 != ContextKey - add $ContextKey, E - #endif - - movd 4*4(K), ve4 - movd 5*4(K), ve5 - - // K cannot be used after we write to R, since they use the same register. - - #if defined __i386__ - - lea _AESRcon, R - - #elif defined __x86_64__ - - lea _AESRcon(%rip), R - lea _AESSubBytesWordTable(%rip), STable - - #endif - - /* With a six-word key, there are twelve rounds (thirteen 16-byte key - blocks). - */ - mov $-12*4*4, offset - sub offset, E - - // Store initial words of expanded key, which are copies of user's key. - movd ve0, 0*4(E, offset) - movd ve1, 1*4(E, offset) - movd ve2, 2*4(E, offset) - movd ve3, 3*4(E, offset) - movd ve4, 4*4(E, offset) - movd ve5, 5*4(E, offset) - -/* Jump into loop body. The key expansion processes six four-byte words per - iteration. 52 are needed in the key. So only four are needed in the last - iteration. -*/ - jmp 2f -1: - // Continue chaining to successive words. - pxor ve3, ve4 - movd ve4, 4*4(E, offset) - pxor ve4, ve5 - movd ve5, 5*4(E, offset) -2: - add $1, R // Advance pointer. - movd ve5, dr // Put previous word into dissection register. - movzx (R), t0 // Get round constant. - movd t0d, vt1 - pxor vt1, ve0 // XOR with word from six words back. - - // Perform SubWord(RotWord(dr)). - movzx drl, t0d - movd LookupS(3, t0), vt0 // Look up byte 0 in table 3. - movzx drh, t0d - movd LookupS(0, t0), vt1 // Look up byte 1 in table 0. 
- shr $16, dr - movzx drl, t0d - pxor vt1, vt0 - pxor vt0, ve0 - movd LookupS(1, t0), vt0 // Look up byte 2 in table 1. - movzx drh, t0d - movd LookupS(2, t0), vt1 // Look up byte 3 in table 2. - pxor vt1, vt0 - pxor vt0, ve0 - - add $6*4, offset - - // Chain to successive words. - movd ve0, 0*4(E, offset) - pxor ve0, ve1 - movd ve1, 1*4(E, offset) - pxor ve1, ve2 - movd ve2, 2*4(E, offset) - pxor ve2, ve3 - movd ve3, 3*4(E, offset) - - jne 1b - - xor r0, r0 // Return success. - - // Pop stack and restore registers. - movaps 7*16(r4), %xmm7 - movaps 6*16(r4), %xmm6 - movaps 5*16(r4), %xmm5 - movaps 4*16(r4), %xmm4 - movaps 3*16(r4), %xmm3 - movaps 2*16(r4), %xmm2 - movaps 1*16(r4), %xmm1 - movaps 0*16(r4), %xmm0 - #if 0 < LocalsSize - add $Padding + LocalsSize, r4 - #endif - #if defined __i386__ - pop r7 - pop r6 - pop r5 - #endif - pop r3 - - ret - - -// Reset definitions for next case. -#undef ve4 -#undef ve5 -#define vt3 %xmm4 -#define vt2 %xmm5 - - - .globl _aes_encrypt_key256 -// .private_extern _aes_encrypt_key256 -_aes_encrypt_key256: - - /* Save registers and set SaveSize to the number of bytes pushed onto the - stack so far, including the caller's return address. - */ - push r3 - #if defined __i386__ - push r5 - push r6 - push r7 - #define SaveSize (5*4) - #else - #define SaveSize (2*8) - #endif - - /* Number of bytes used for local variables: - - 8 16-byte spaces to save XMM registers. - */ - #define LocalsSize (8*16) - - #if 0 < LocalsSize - // Padding to position stack pointer at a multiple of 16 bytes. - #define Padding (15 & -(SaveSize + LocalsSize)) - sub $Padding + LocalsSize, r4 // Allocate space on stack. - #else - #define Padding 0 - #endif - - /* StackFrame is the number of bytes in our stack frame, from caller's - stack pointer to ours (so it includes the return address). - */ - #define StackFrame (SaveSize + Padding + LocalsSize) - - // Save xmm registers. - movaps %xmm0, 0*16(r4) - movaps %xmm1, 1*16(r4) - movaps %xmm2, 2*16(r4) - movaps %xmm3, 3*16(r4) - movaps %xmm4, 4*16(r4) - movaps %xmm5, 5*16(r4) - movaps %xmm6, 6*16(r4) - movaps %xmm7, 7*16(r4) - - #if defined __i386__ - - // Load arguments. - #define Argument(i) StackFrame+4*(i)(r4) - mov Argument(1), E - mov Argument(0), K - - #endif - -// Merge point for _aes_encrypt_key and _aes_encrypt_key256. -EKeyHas8Words: - - // First words of expanded key are copied from user key. - movd 0*4(K), ve0 - movd 1*4(K), ve1 - movd 2*4(K), ve2 - movd 3*4(K), ve3 - - movl $14*16, ContextKeyLength(E) // Set "key length." - - #if 0 != ContextKey - add $ContextKey, E - #endif - - // Store initial words of expanded key, which are copies of user's key. - movd ve0, 0*4(E) - movd ve1, 1*4(E) - movd ve2, 2*4(E) - movd ve3, 3*4(E) - movd 4*4(K), ve0 - movd 5*4(K), ve1 - movd 6*4(K), ve2 - movd 7*4(K), ve3 - - // K cannot be used after we write to R, since they use the same register. - - #if defined __i386__ - - lea _AESRcon, R - - #elif defined __x86_64__ - - lea _AESRcon(%rip), R - lea _AESSubBytesWordTable(%rip), STable - - #endif - - /* With an eight-word key, there are fourteen rounds (fifteen 16-byte key - blocks). - */ - mov $-14*4*4, offset - sub offset, E - - // Store initial words of expanded key, which are copies of user's key. - movd ve0, 4*4(E, offset) - movd ve1, 5*4(E, offset) - movd ve2, 6*4(E, offset) - movd ve3, 7*4(E, offset) - -/* Jump into loop body. The key expansion processes eight four-byte words per - iteration. 60 are needed in the key. So only four are needed in the last - iteration. 
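For reference, a hedged C sketch of the FIPS-197 recurrence that this unrolled eight-word iteration implements (SubWord, RotWord, and Rcon[] are hypothetical helpers here; words follow this code's little-endian byte order, so the Rcon byte lands in byte 0):

#include <stdint.h>

extern uint32_t SubWord(uint32_t w);   /* hypothetical: SubBytes on each byte  */
extern uint32_t RotWord(uint32_t w);   /* hypothetical: rotate word by a byte  */
extern const uint8_t Rcon[];           /* round constants; Rcon[0] is padding  */

static void ExpandKey256(const uint32_t key[8], uint32_t W[60])
{
    for (int i = 0; i < 8; ++i)
        W[i] = key[i];
    for (int i = 8; i < 60; ++i) {
        uint32_t t = W[i-1];
        if (i % 8 == 0)
            t = SubWord(RotWord(t)) ^ Rcon[i/8];   /* full step every 8 words   */
        else if (i % 8 == 4)
            t = SubWord(t);                        /* extra step unique to Nk=8 */
        W[i] = W[i-8] ^ t;
    }
}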
-*/ - jmp 2f -1: - movd ve3, dr // Put previous word into dissection register. - - /* Get word from eight words back (it is four words back from where E - currently points, and we use it to prepare the value to be stored - four words beyond where E currently points). - */ - movd -4*4(E, offset), ve0 - - // Perform SubWord(dr). - movzx drl, t0 - movd LookupS(0, t0), vt0 // Look up byte 0 in table 0. - movzx drh, t0d - movd LookupS(1, t0), vt1 // Look up byte 1 in table 1. - shr $16, dr - movzx drl, t0d - movd LookupS(2, t0), vt2 // Look up byte 2 in table 2. - movzx drh, t0d - movd LookupS(3, t0), vt3 // Look up byte 3 in table 3. - pxor vt1, vt0 - pxor vt3, vt2 - pxor vt0, ve0 - pxor vt2, ve0 - - movd -3*4(E, offset), ve1 // Get words from eight words back. - movd -2*4(E, offset), ve2 - movd -1*4(E, offset), ve3 - - // Chain to successive words. - movd ve0, 4*4(E, offset) - pxor ve0, ve1 - movd ve1, 5*4(E, offset) - pxor ve1, ve2 - movd ve2, 6*4(E, offset) - pxor ve2, ve3 - movd ve3, 7*4(E, offset) - -2: - add $1, R // Advance pointer. - movd ve3, dr // Put previous word into dissection register. - movzx (R), t0d // Get round constant. - movd t0d, vt1 - movd 0*4(E, offset), ve0 // Get word from eight words back. - pxor vt1, ve0 - - // Perform SubWord(RotWord(dr)). - movzx drl, t0 - movd LookupS(3, t0), vt0 // Look up byte 0 in table 3. - movzx drh, t0d - movd LookupS(0, t0), vt1 // Look up byte 1 in table 0. - shr $16, dr - movzx drl, t0d - movd LookupS(1, t0), vt2 // Look up byte 2 in table 1. - movzx drh, t0d - movd LookupS(2, t0), vt3 // Look up byte 3 in table 2. - pxor vt1, vt0 - pxor vt3, vt2 - pxor vt0, ve0 - pxor vt2, ve0 - - movd 1*4(E, offset), ve1 - movd 2*4(E, offset), ve2 - movd 3*4(E, offset), ve3 - - add $8*4, offset - - // Chain to successive words. - movd ve0, 0*4(E, offset) - pxor ve0, ve1 - movd ve1, 1*4(E, offset) - pxor ve1, ve2 - movd ve2, 2*4(E, offset) - pxor ve2, ve3 - movd ve3, 3*4(E, offset) - - jne 1b - - xor r0, r0 // Return success. - - // Pop stack and restore registers. - movaps 7*16(r4), %xmm7 - movaps 6*16(r4), %xmm6 - movaps 5*16(r4), %xmm5 - movaps 4*16(r4), %xmm4 - movaps 3*16(r4), %xmm3 - movaps 2*16(r4), %xmm2 - movaps 1*16(r4), %xmm1 - movaps 0*16(r4), %xmm0 - #if 0 < LocalsSize - add $Padding + LocalsSize, r4 - #endif - #if defined __i386__ - pop r7 - pop r6 - pop r5 - #endif - pop r3 - - ret - - -#undef Address -#undef Argument -#undef E -#undef K -#undef LocalsSize -#undef LookupS -#undef Padding -#undef R -#undef SaveSize -#undef STable -#undef StackFrame -#undef dr -#undef drh -#undef drl -#undef offset -#undef t0 -#undef t0d -#undef ve0 -#undef ve1 -#undef ve2 -#undef ve3 -#undef vt0 -#undef vt1 -#undef vt2 -#undef vt3 diff --git a/bsd/crypto/aes/i386/MakeData.c b/bsd/crypto/aes/i386/MakeData.c deleted file mode 100644 index 262dc5996..000000000 --- a/bsd/crypto/aes/i386/MakeData.c +++ /dev/null @@ -1,516 +0,0 @@ -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#define MaxRcon 11 - -typedef uint8_t Byte; -typedef uint32_t Word; - - -/* In comments below, {n} designates the Galois field element represented by - the byte n. See notes about Galois field multiplication in ReadMe.txt. - - So 3+5 is addition of ordinary integers, and 3+5 == 8, while {3}+{5} is - addition in the field, and {3} + {5} = {3 XOR 5} = {6}. -*/ - - -// Define constants for languages. -typedef enum { C, IntelAssembly } Language; - - -/* LogBase3[i] will contain the base-three logarithm of i in the 256-element - Galois field defined by AES. That is, {3}**LogBase3[i] == {i}.
-*/ -static Byte LogBase3[256]; - -/* AntilogBase3[i] will contain {3}**i in the 256-element Galois field defined - by AES. It contains extra elements so that the antilog of a+b can be found - by looking up a+b directly, without having to reduce modulo the period, for - 0 <= a, b < 255. - - (254 is the greatest value we encounter. Each a or b we use is the - base-three logarithm of some element. As a primitive root, the powers of - three cycle through all non-zero elements of the field, of which there are - 255, so the exponents cover 0 to 254 before the powers repeat.) -*/ -static Byte AntilogBase3[254+254+1]; - - -static void InitializeLogTables(void) -{ - // log({1}) is zero, so start {p} (power) at {1} and l (logarithm) at 0. - Byte p = 1; - int l = 0; - do - { - // Record table entries. - LogBase3[p] = l; - AntilogBase3[l] = p; - - /* Observe that {2}*{p} is {p << 1 ^ (p & 0x80 ? 0x1b : 0)}, per notes - in ReadMe.txt. We produce {3}*{p}: - - {3}*{p} - = {1}*{p} + {2}*{p} - = {1}*{p} + {p << 1 ^ (p & 0x80 ? 0x1b : 0)} - = {p ^ p << 1 ^ (p & 0x80 ? 0x1b : 0)}. - */ - p ^= p << 1 ^ (p & 0x80 ? 0x1b : 0); - ++l; - - } while (p != 1); // Stop when we have gone around completely. - - /* The antilogarithms are periodic with a period of 255, and we want to - look up elements as high as 254+254 (the largest that a sum of two - logarithms could be), so we replicate the table beyond the first - period. - */ - for (l = 255; l <= 254+254; ++l) - AntilogBase3[l] = AntilogBase3[l-255]; -} - - -/* MultiplyByte(Byte b, Byte c) returns {b}*{c}. It requires tables that must - be initialized before this routine is used. -*/ -static Byte MultiplyByte(Byte b, Byte c) -{ - // Calculate product by adding logarithms, but avoid logarithms of zero. - return b == 0 || c == 0 ? 0 : AntilogBase3[LogBase3[b] + LogBase3[c]]; -} - - -// Return {0} if {b} is {0} and the multiplicative inverse of {b} otherwise. -static Byte InverseByte(Byte b) -{ - return b == 0 ? 0 : AntilogBase3[255 - LogBase3[b]]; -} - - -// Perform AES' SubBytes operation on a single byte. -static Byte SubByte(Byte b) -{ - unsigned int r = InverseByte(b); - - // Duplicate r as a proxy for a rotate operation. - r = r | r<<8; - - // Apply the standard's affine transformation. - return r ^ r>>4 ^ r>>5 ^ r>>6 ^ r>>7 ^ 0x63; -} - - -// Define and populate tables for the SubBytes and InvSubBytes operations. -static Byte SubBytesTable[256]; -static Byte InvSubBytesTable[256]; - - -static void InitializeSubBytesTable(void) -{ - for (int i = 0; i < 256; ++i) - SubBytesTable[i] = SubByte((Byte) i); -} - - -static void InitializeInvSubBytesTable(void) -{ - for (int i = 0; i < 256; ++i) - InvSubBytesTable[SubByte((Byte) i)] = i; -} - - -/* Print tables for SubBytes function providing the output byte embedded in - various places in a word, so that the table entries can be used with - fewer byte manipulations.
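As a sanity check on the helpers above, the worked examples in FIPS-197 can be replayed against them; this is a sketch for verification, not part of the generator:

#include <assert.h>

static void SelfTest(void)
{
    InitializeLogTables();
    assert(MultiplyByte(0x57, 0x83) == 0xc1);   /* FIPS-197 sec. 4.2 example */
    assert(SubByte(0x53) == 0xed);              /* S-box entry for 0x53      */
}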
-*/ -static void PrintSubBytesWordTable(Language language) -{ - switch (language) - { - case C: - printf("\n\n" - "// SubBytes embedded in words tables.\n" - "const Word AESSubBytesWordTable[4][256] =\n" - "{\n"); - for (int j = 0; j < 4; ++j) - { - printf("\t{\n"); - for (int i = 0; i < 256; ++i) - printf("\t\t0x%08x,\n", SubBytesTable[i] << j*8); - printf("\t},\n"); - } - printf("};\n"); - break; - - case IntelAssembly: - printf("\n\n" - "// SubBytes embedded in words tables.\n" - "\t.globl\t_AESSubBytesWordTable\n" - "\t.private_extern\t_AESSubBytesWordTable\n" - "\t.align\t2\n" - "_AESSubBytesWordTable:\n"); - for (int j = 0; j < 4; ++j) - { - printf("\t// Table %d.\n", j); - for (int i = 0; i < 256; ++i) - printf("\t.long\t0x%08x\n", SubBytesTable[i] << j*8); - } - break; - } -} - - -/* Print tables for InvSubBytes function providing the output byte embedded in - various places in a word, so that the table entries can be used with - fewer byte manipulations. -*/ -static void PrintInvSubBytesWordTable(Language language) -{ - switch (language) - { - case C: - printf("\n\n" - "// InvSubBytes embedded in words tables.\n" - "const Word AESInvSubBytesWordTable[4][256] =\n" - "{\n"); - for (int j = 0; j < 4; ++j) - { - printf("\t{\n"); - for (int i = 0; i < 256; ++i) - printf("\t\t0x%08x,\n", InvSubBytesTable[i] << j*8); - printf("\t},\n"); - } - printf("};\n"); - break; - - case IntelAssembly: - printf("\n\n" - "// InvSubBytes embedded in words tables.\n" - "\t.globl\t_AESInvSubBytesWordTable\n" - "\t.private_extern\t_AESInvSubBytesWordTable\n" - "\t.align\t2\n" - "_AESInvSubBytesWordTable:\n"); - for (int j = 0; j < 4; ++j) - { - printf("\t// Table %d.\n", j); - for (int i = 0; i < 256; ++i) - printf("\t.long\t0x%08x\n", InvSubBytesTable[i] << j*8); - } - break; - } -} - - -// Print the round constants. -static void PrintRcon(Language language) -{ - union { Byte c[4]; Word w; } t = { { 1, 0, 0, 0 } }; - - switch (language) - { - case C: - printf("\n\n" - "// Round constants.\n" - "const Byte AESRcon[] =\n" - "{\n" - "\t0,\t// Not used, included for indexing simplicity.\n"); - for (int i = 1; i < MaxRcon; ++i) - { - printf("\t0x%02x,\n", t.w); - t.c[0] = MultiplyByte(0x2, t.c[0]); - } - printf("};\n"); - break; - - case IntelAssembly: - printf("\n\n" - "// Round constants.\n" - "\t.globl\t_AESRcon\n" - "\t.private_extern\t_AESRcon\n" - "_AESRcon:\n" - "\t.byte\t0\t// Not used, included for indexing simplicity.\n"); - for (int i = 1; i < MaxRcon; ++i) - { - printf("\t.byte\t0x%02x\n", t.w); - t.c[0] = MultiplyByte(0x2, t.c[0]); - } - break; - } -} - - -// Print tables for the InvMixColumn operation. 
-static void PrintInvMixColumnTable(Language language) -{ - Word T[4][256]; - - for (int i = 0; i < 256; ++i) - { - union { Byte b[4]; Word w; } c; - - Byte s9 = MultiplyByte(0x9, i); - Byte sb = MultiplyByte(0xb, i); - Byte sd = MultiplyByte(0xd, i); - Byte se = MultiplyByte(0xe, i); - - c.b[0] = se; - c.b[1] = s9; - c.b[2] = sd; - c.b[3] = sb; - T[0][i] = c.w; - - c.b[0] = sb; - c.b[1] = se; - c.b[2] = s9; - c.b[3] = sd; - T[1][i] = c.w; - - c.b[0] = sd; - c.b[1] = sb; - c.b[2] = se; - c.b[3] = s9; - T[2][i] = c.w; - - c.b[0] = s9; - c.b[1] = sd; - c.b[2] = sb; - c.b[3] = se; - T[3][i] = c.w; - } - - switch (language) - { - case C: - printf("\n\n" - "// Tables for InvMixColumn.\n" - "const Word AESInvMixColumnTable[4][256] =\n" - "{\n"); - for (int i = 0; i < 4; ++i) - { - printf("\t{\n"); - for (int j = 0; j < 256; ++j) - printf("\t\t0x%08x,\n", T[i][j]); - printf("\t},\n"); - } - printf("};\n"); - break; - - case IntelAssembly: - printf("\n\n" - "// Tables for InvMixColumn.\n" - "\t.globl\t_AESInvMixColumnTable\n" - "\t.private_extern\t_AESInvMixColumnTable\n" - "\t.align\t2\n" - "_AESInvMixColumnTable:\n"); - for (int i = 0; i < 4; ++i) - { - printf("\t// Table %d.\n", i); - for (int j = 0; j < 256; ++j) - printf("\t.long\t0x%08x\n", T[i][j]); - } - break; - } -} - - -/* Print the tables defined in AES Proposal: Rijndael, amended, 9/04/2003, - section 5.2.1. These combine the MixColumn and SubBytes operations. -*/ -static void PrintEncryptTable(Language language) -{ - Word T[4][256]; - - for (int i = 0; i < 256; ++i) - { - union { Byte b[4]; Word w; } c; - - Byte s1 = SubBytesTable[i]; - Byte s2 = MultiplyByte(0x2, s1); - Byte s3 = s1 ^ s2; - - c.b[0] = s2; - c.b[1] = s1; - c.b[2] = s1; - c.b[3] = s3; - T[0][i] = c.w; - - c.b[0] = s3; - c.b[1] = s2; - //c.b[2] = s1; - c.b[3] = s1; - T[1][i] = c.w; - - c.b[0] = s1; - c.b[1] = s3; - c.b[2] = s2; - //c.b[3] = s1; - T[2][i] = c.w; - - //c.b[0] = s1; - c.b[1] = s1; - c.b[2] = s3; - c.b[3] = s2; - T[3][i] = c.w; - } - - switch (language) - { - case C: - printf("\n\n" - "// Tables for main encryption iterations.\n" - "const Word AESEncryptTable[4][256] =\n" - "{\n"); - for (int i = 0; i < 4; ++i) - { - printf("\t{\n"); - for (int j = 0; j < 256; ++j) - printf("\t\t0x%08x,\n", T[i][j]); - printf("\t},\n"); - } - printf("};\n"); - break; - - case IntelAssembly: - printf("\n\n" - "// Tables for main encryption iterations.\n" - "\t.globl\t_AESEncryptTable\n" - "\t.private_extern\t_AESEncryptTable\n" - "\t.align\t2\n" - "_AESEncryptTable:\n"); - for (int i = 0; i < 4; ++i) - { - printf("\t// Table %d.\n", i); - for (int j = 0; j < 256; ++j) - printf("\t.long\t0x%08x\n", T[i][j]); - } - break; - } -} - - -/* Print the inverse tables. These correspond to the tables above, but for - decryption. These combine the InvSubBytes and InvMixColumn operations.
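How the generated decryption tables are consumed is outside this generator, but the intended pattern is the usual one: each of the four bytes feeding a column indexes its own table, and the results are XORed together with the round-key word. A hedged sketch (b0..b3 are the state bytes already selected by InvShiftRows; the real round code lives in EncryptDecrypt.s):

#include <stdint.h>

extern const uint32_t AESDecryptTable[4][256];

static uint32_t InvRoundColumn(uint8_t b0, uint8_t b1, uint8_t b2,
                               uint8_t b3, uint32_t rk)
{
    return AESDecryptTable[0][b0]   /* InvSubBytes and InvMixColumn combined */
         ^ AESDecryptTable[1][b1]
         ^ AESDecryptTable[2][b2]
         ^ AESDecryptTable[3][b3]
         ^ rk;                      /* AddRoundKey folded into the same XOR  */
}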
-*/ -static void PrintDecryptTable(Language language) -{ - Word T[4][256]; - - for (int i = 0; i < 256; ++i) - { - union { Byte b[4]; Word w; } c; - - Byte si = InvSubBytesTable[i]; - - Byte s9 = MultiplyByte(0x9, si); - Byte sb = MultiplyByte(0xb, si); - Byte sd = MultiplyByte(0xd, si); - Byte se = MultiplyByte(0xe, si); - - c.b[0] = se; - c.b[1] = s9; - c.b[2] = sd; - c.b[3] = sb; - T[0][i] = c.w; - - c.b[0] = sb; - c.b[1] = se; - c.b[2] = s9; - c.b[3] = sd; - T[1][i] = c.w; - - c.b[0] = sd; - c.b[1] = sb; - c.b[2] = se; - c.b[3] = s9; - T[2][i] = c.w; - - c.b[0] = s9; - c.b[1] = sd; - c.b[2] = sb; - c.b[3] = se; - T[3][i] = c.w; - } - - switch (language) - { - case C: - printf("\n\n" - "// Tables for main decryption iterations.\n" - "const Word AESDecryptTable[4][256] =\n" - "{\n"); - for (int i = 0; i < 4; ++i) - { - printf("\t{\n"); - for (int j = 0; j < 256; ++j) - printf("\t\t0x%08x,\n", T[i][j]); - printf("\t},\n"); - } - printf("};\n"); - break; - - case IntelAssembly: - printf("\n\n" - "// Tables for main decryption iterations.\n" - "\t.globl\t_AESDecryptTable\n" - "\t.private_extern\t_AESDecryptTable\n" - "\t.align\t2\n" - "_AESDecryptTable:\n"); - for (int i = 0; i < 4; ++i) - { - printf("\t// Table %d.\n", i); - for (int j = 0; j < 256; ++j) - printf("\t.long\t0x%08x\n", T[i][j]); - } - break; - } -} - - -static void Usage(const char *ProgramName) -{ - fprintf(stderr, - "%s: This program must have exactly one argument, \"C\" to generate\n" - "C or \"Intel\" to generate GCC i386/x86_64 assembly.\n", ProgramName); - exit(EXIT_FAILURE); -} - - -int main(int argc, char *argv[]) -{ - if (argc != 2) - Usage(argv[0]); - - Language language; - - // Figure out which language to generate, C or Intel assembly. - if (0 == strcmp(argv[1], "C")) - language = C; - else if (0 == strcmp(argv[1], "Intel")) - language = IntelAssembly; - else - Usage(argv[0]); - - printf("// This file was generated by " __FILE__ ".\n"); - - if (language == C) - printf("\n\n#include \"AES.h\"\n"); - - if (language == IntelAssembly) - printf("\n\n\t.const\n"); - - InitializeLogTables(); - InitializeSubBytesTable(); - InitializeInvSubBytesTable(); - - PrintRcon(language); - PrintInvMixColumnTable(language); - PrintEncryptTable(language); - PrintDecryptTable(language); - PrintSubBytesWordTable(language); - PrintInvSubBytesWordTable(language); - - return 0; -} diff --git a/bsd/crypto/aes/i386/Makefile b/bsd/crypto/aes/i386/Makefile deleted file mode 100644 index 851f7b2ac..000000000 --- a/bsd/crypto/aes/i386/Makefile +++ /dev/null @@ -1,34 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -include $(MakeInc_cmd) -include $(MakeInc_def) - -INSTINC_SUBDIRS = \ - -INSTINC_SUBDIRS_I386 = \ - -EXPINC_SUBDIRS = \ - -EXPINC_SUBDIRS_I386 = \ - -INSTALL_MI_DIR = crypto - -EXPORT_MI_DIR = ${INSTALL_MI_DIR} - -PRIVATE_DATAFILES = \ - aesxts.h - -# /System/Library/Frameworks/Kernel.framework/PrivateHeaders -INSTALL_KF_MD_LCL_LIST = ${PRIVATE_DATAFILES} - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/bsd/crypto/aes/i386/ReadMe.txt b/bsd/crypto/aes/i386/ReadMe.txt deleted file mode 100644 index 7ac833117..000000000 --- a/bsd/crypto/aes/i386/ReadMe.txt +++ /dev/null @@ -1,22 +0,0 @@ -This directory contains a hybrid AES implementation. 
The core AES routines -(the actual encryption, decryption, and key expansion) are in: - - AES.s - Data.mk - Data.s - EncryptDecrypt.s - ExpandKeyForDecryption.s - ExpandKeyForEncryption.s - MakeData.c - -Although the above files do not explicitly include aes.h, they conform to -certain things defined in it, notably the aes_rval type and the layout of the -aes_encrypt_ctx and aes_decrypt_ctx structures. These must be kept -compatible; the definitions of ContextKey and ContextKeyLength in AES.s must -match the offsets of the key ("ks") and key_length ("inf") members of -aes_encrypt_ctx and aes_decrypt_ctx. (For some reason, aes_inf is a union that -is written as a 32-bit integer and read as an 8-bit integer. I do not know -why but have reproduced that behavior in the new implementation.) - -aes_modes.c extends the API, most notably by implementing CBC mode using the -basic AES block encryption. It uses aesopt.h and edefs.h. diff --git a/bsd/crypto/aes/i386/aes_crypt_hw.s b/bsd/crypto/aes/i386/aes_crypt_hw.s deleted file mode 100644 index 2edc3e2fd..000000000 --- a/bsd/crypto/aes/i386/aes_crypt_hw.s +++ /dev/null @@ -1,472 +0,0 @@ -/* This file defines _aes_encrypt_hw and _aes_decrypt_hw --- Intel Westmere HW AES-based implementation - of _aes_encrypt and _aes_decrypt. - - These 2 functions SHOULD BE entered ONLY after the AES HW is verified to be available. - They SHOULD NOT be called without AES HW detection; doing so might cause xnu to crash. - - The AES HW is detected 1st thing in - _aes_encrypt (EncryptDecrypt.s) - _aes_decrypt (EncryptDecrypt.s) - and, if AES HW is detected, branch without link (i.e., jump) to the functions here. - - The implementation here follows the examples in an Intel White Paper - "Intel Advanced Encryption Standard (AES) Instruction Set" Rev.2 01 - - Note: Rev. 03 Final 2010 01 26 is available.
Looks like some code change from Rev.2 01 - - cclee 3-13-10 -*/ - - .text - .align 4,0x90 -.globl _aes_encrypt_hw -_aes_encrypt_hw: - -#if defined __i386__ - movl 4(%esp), %eax // in - movl 12(%esp), %edx // ctx - movl 8(%esp), %ecx // out - - #define LOCAL_SIZE (12+16+16) // 16-byte align (-4 for return address) + 16 (xmm0) + 16 (xmm1) - #define in %eax - #define ctx %edx - #define out %ecx - #define r13 %esp - -#else // x86_64 - - #define LOCAL_SIZE (8+16+16) // 16-byte align (-8 for return address) + 16 (xmm0) + 16 (xmm1) - #define in %rdi - #define ctx %rdx - #define out %rsi - #define r13 %rsp - -#endif // i386 or x86_64 - -#ifdef KERNEL - sub $LOCAL_SIZE, r13 - movaps %xmm0, (r13) -#endif - movups (in), %xmm0 - - // key length identification - movl 240(ctx), %eax // key length - cmp $160, %eax - je L_AES_128 - cmp $192, %eax - je L_AES_192 - cmp $224, %eax - je L_AES_256 - mov $-1, %eax // return ERROR -#ifdef KERNEL - movaps (r13), %xmm0 - add $LOCAL_SIZE, r13 -#endif - ret - -L_AES_128: - testb $15, %dl // check whether expanded key is 16-byte aligned - jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work - pxor (ctx), %xmm0 - aesenc 16(ctx), %xmm0 - aesenc 32(ctx), %xmm0 - aesenc 48(ctx), %xmm0 - aesenc 64(ctx), %xmm0 - aesenc 80(ctx), %xmm0 - aesenc 96(ctx), %xmm0 - aesenc 112(ctx), %xmm0 - aesenc 128(ctx), %xmm0 - aesenc 144(ctx), %xmm0 - aesenclast 160(ctx), %xmm0 - xorl %eax, %eax - movups %xmm0, (out) -#ifdef KERNEL - movaps (r13), %xmm0 - add $LOCAL_SIZE, r13 -#endif - ret -0: // special case expanded key is not 16-byte aligned -#ifdef KERNEL - movaps %xmm1, 16(r13) // save xmm1 into stack -#endif - movups (ctx), %xmm1 - pxor %xmm1, %xmm0 - movups 16(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 32(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 48(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 64(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 80(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 96(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 112(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 128(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 144(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 160(ctx), %xmm1 - aesenclast %xmm1, %xmm0 - xorl %eax, %eax - movups %xmm0, (out) -#ifdef KERNEL - movaps (r13), %xmm0 - movaps 16(r13), %xmm1 - add $LOCAL_SIZE, r13 -#endif - ret - -L_AES_192: - testb $15, %dl // check whether expanded key is 16-byte aligned - jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work - pxor (ctx), %xmm0 - aesenc 16(ctx), %xmm0 - aesenc 32(ctx), %xmm0 - aesenc 48(ctx), %xmm0 - aesenc 64(ctx), %xmm0 - aesenc 80(ctx), %xmm0 - aesenc 96(ctx), %xmm0 - aesenc 112(ctx), %xmm0 - aesenc 128(ctx), %xmm0 - aesenc 144(ctx), %xmm0 - aesenc 160(ctx), %xmm0 - aesenc 176(ctx), %xmm0 - aesenclast 192(ctx), %xmm0 - xorl %eax, %eax - movups %xmm0, (out) -#ifdef KERNEL - movaps (r13), %xmm0 - add $LOCAL_SIZE, r13 -#endif - ret -0: // special case expanded key is not 16-byte aligned -#ifdef KERNEL - movaps %xmm1, 16(r13) // save xmm1 into stack -#endif - movups (ctx), %xmm1 - pxor %xmm1, %xmm0 - movups 16(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 32(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 48(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 64(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 80(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 96(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 112(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 128(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 144(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 160(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 176(ctx), %xmm1 - aesenc %xmm1, %xmm0 - 
movups 192(ctx), %xmm1 - aesenclast %xmm1, %xmm0 - xorl %eax, %eax - movups %xmm0, (out) -#ifdef KERNEL - movaps (r13), %xmm0 - movaps 16(r13), %xmm1 - add $LOCAL_SIZE, r13 -#endif - ret - -L_AES_256: - testb $15, %dl // check whether expanded key is 16-byte aligned - jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work - pxor (ctx), %xmm0 - aesenc 16(ctx), %xmm0 - aesenc 32(ctx), %xmm0 - aesenc 48(ctx), %xmm0 - aesenc 64(ctx), %xmm0 - aesenc 80(ctx), %xmm0 - aesenc 96(ctx), %xmm0 - aesenc 112(ctx), %xmm0 - aesenc 128(ctx), %xmm0 - aesenc 144(ctx), %xmm0 - aesenc 160(ctx), %xmm0 - aesenc 176(ctx), %xmm0 - aesenc 192(ctx), %xmm0 - aesenc 208(ctx), %xmm0 - aesenclast 224(ctx), %xmm0 - xorl %eax, %eax - movups %xmm0, (out) -#ifdef KERNEL - movaps (r13), %xmm0 - add $LOCAL_SIZE, r13 -#endif - ret -0: // special case expanded key is not 16-byte aligned -#ifdef KERNEL - movaps %xmm1, 16(r13) // save xmm1 into stack -#endif - movups (ctx), %xmm1 - pxor %xmm1, %xmm0 - movups 16(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 32(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 48(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 64(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 80(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 96(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 112(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 128(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 144(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 160(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 176(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 192(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 208(ctx), %xmm1 - aesenc %xmm1, %xmm0 - movups 224(ctx), %xmm1 - aesenclast %xmm1, %xmm0 - xorl %eax, %eax - movups %xmm0, (out) -#ifdef KERNEL - movaps (r13), %xmm0 - movaps 16(r13), %xmm1 - add $LOCAL_SIZE, r13 -#endif - ret - - - .text - .align 4,0x90 -.globl _aes_decrypt_hw -_aes_decrypt_hw: - -#if defined __i386__ - movl 4(%esp), %eax // in - movl 12(%esp), %edx // ctx - movl 8(%esp), %ecx // out - -#endif - -#ifdef KERNEL - sub $LOCAL_SIZE, r13 - movaps %xmm0, (r13) -#endif - movups (in), %xmm0 - - // key length identification - movl 240(ctx), %eax // key length - cmp $160, %eax - je 0f // AES-128 - cmp $192, %eax - je 1f // AES-192 - cmp $224, %eax - je 2f // AES-256 - mov $-1, %eax // return ERROR -#ifdef KERNEL - movaps (r13), %xmm0 - add $LOCAL_SIZE, r13 -#endif - ret - -0: // AES-128 - testb $15, %dl // check whether expanded key is 16-byte aligned - jne 9f // if not 16-byte aligned, aesenc xmm, m128 won't work - pxor 160(ctx), %xmm0 - aesdec 144(ctx), %xmm0 - aesdec 128(ctx), %xmm0 - aesdec 112(ctx), %xmm0 - aesdec 96(ctx), %xmm0 - aesdec 80(ctx), %xmm0 - aesdec 64(ctx), %xmm0 - aesdec 48(ctx), %xmm0 - aesdec 32(ctx), %xmm0 - aesdec 16(ctx), %xmm0 - aesdeclast (ctx), %xmm0 - xorl %eax, %eax - movups %xmm0, (out) -#ifdef KERNEL - movaps (r13), %xmm0 - add $LOCAL_SIZE, r13 -#endif - ret -9: // AES-128 Decrypt : special case expanded key is not 16-byte aligned -#ifdef KERNEL - movaps %xmm1, 16(r13) // save xmm1 into stack -#endif - movups 160(ctx), %xmm1 - pxor %xmm1, %xmm0 - movups 144(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 128(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 112(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 96(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 80(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 64(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 48(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 32(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 16(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups (ctx), %xmm1 - aesdeclast %xmm1, %xmm0 - xorl %eax, %eax - movups 
%xmm0, (out) -#ifdef KERNEL - movaps (r13), %xmm0 - movaps 16(r13), %xmm1 - add $LOCAL_SIZE, r13 -#endif - ret - -1: // AES-192 - testb $15, %dl // check whether expanded key is 16-byte aligned - jne 9f // if not 16-byte aligned, aesenc xmm, m128 won't work - pxor 192(ctx), %xmm0 - aesdec 176(ctx), %xmm0 - aesdec 160(ctx), %xmm0 - aesdec 144(ctx), %xmm0 - aesdec 128(ctx), %xmm0 - aesdec 112(ctx), %xmm0 - aesdec 96(ctx), %xmm0 - aesdec 80(ctx), %xmm0 - aesdec 64(ctx), %xmm0 - aesdec 48(ctx), %xmm0 - aesdec 32(ctx), %xmm0 - aesdec 16(ctx), %xmm0 - aesdeclast (ctx), %xmm0 - xorl %eax, %eax - movups %xmm0, (out) -#ifdef KERNEL - movaps (r13), %xmm0 - add $LOCAL_SIZE, r13 -#endif - ret -9: // AES-192 Decrypt : special case expanded key is not 16-byte aligned -#ifdef KERNEL - movaps %xmm1, 16(r13) // save xmm1 into stack -#endif - movups 192(ctx), %xmm1 - pxor %xmm1, %xmm0 - movups 176(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 160(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 144(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 128(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 112(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 96(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 80(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 64(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 48(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 32(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 16(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups (ctx), %xmm1 - aesdeclast %xmm1, %xmm0 - xorl %eax, %eax - movups %xmm0, (out) -#ifdef KERNEL - movaps (r13), %xmm0 - movaps 16(r13), %xmm1 - add $LOCAL_SIZE, r13 -#endif - ret - -2: // AES-256 - testb $15, %dl // check whether expanded key is 16-byte aligned - jne 9f // if not 16-byte aligned, aesenc xmm, m128 won't work - pxor 224(ctx), %xmm0 - aesdec 208(ctx), %xmm0 - aesdec 192(ctx), %xmm0 - aesdec 176(ctx), %xmm0 - aesdec 160(ctx), %xmm0 - aesdec 144(ctx), %xmm0 - aesdec 128(ctx), %xmm0 - aesdec 112(ctx), %xmm0 - aesdec 96(ctx), %xmm0 - aesdec 80(ctx), %xmm0 - aesdec 64(ctx), %xmm0 - aesdec 48(ctx), %xmm0 - aesdec 32(ctx), %xmm0 - aesdec 16(ctx), %xmm0 - aesdeclast (ctx), %xmm0 - xorl %eax, %eax - movups %xmm0, (out) -#ifdef KERNEL - movaps (r13), %xmm0 - add $LOCAL_SIZE, r13 -#endif - ret -9: // AES-256 Decrypt : special case expanded key is not 16-byte aligned -#ifdef KERNEL - movaps %xmm1, 16(r13) // save xmm1 into stack -#endif - movups 224(ctx), %xmm1 - pxor %xmm1, %xmm0 - movups 208(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 192(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 176(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 160(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 144(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 128(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 112(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 96(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 80(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 64(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 48(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 32(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups 16(ctx), %xmm1 - aesdec %xmm1, %xmm0 - movups (ctx), %xmm1 - aesdeclast %xmm1, %xmm0 - xorl %eax, %eax - movups %xmm0, (out) -#ifdef KERNEL - movaps (r13), %xmm0 - movaps 16(r13), %xmm1 - add $LOCAL_SIZE, r13 -#endif - ret - diff --git a/bsd/crypto/aes/i386/aes_key_hw.s b/bsd/crypto/aes/i386/aes_key_hw.s deleted file mode 100644 index 434fa553c..000000000 --- a/bsd/crypto/aes/i386/aes_key_hw.s +++ /dev/null @@ -1,405 +0,0 @@ -/* This files defines _aes_encrypt_key_hw and _aes_decrypt_key_hw --- Intel Westmere HW AES-based implementation - of _aes_encrypt_key and _aes_decrypt_key. 
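   (For orientation, the expanded-key context that both routines fill can be
   sketched in C roughly as follows; the struct name is illustrative, and the
   real aes_encrypt_ctx/aes_decrypt_ctx definitions live in aes.h:

       #include <stdint.h>

       typedef struct {
           uint8_t  ks[240];     // round keys, 16 bytes apiece at offsets 0, 16, ..., 16*Nr
           uint32_t key_length;  // at offset 240: 16*Nr, i.e. 160/192/224 for
                                 // AES-128/192/256 (10/12/14 rounds)
       } aes_hw_ctx_sketch;

   This is why the code below stores $160/$192/$224 to 240(ctx), and why
   _aes_encrypt_hw earlier dispatches by comparing 240(ctx) against the same
   constants.)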
- - These 2 functions SHOULD BE entered ONLY after the AES HW is verified to be available. - They SHOULD NOT be called without AES HW detection; doing so might cause xnu to crash. - - The AES HW is detected 1st thing in - _aes_encrypt_key (ExpandKeyForEncryption.s) - _aes_decrypt_key (ExpandKeyForDecryption.s) - and, if AES HW is detected, branch without link (i.e., jump) to the functions here. - - The implementation here follows the examples in an Intel White Paper - "Intel Advanced Encryption Standard (AES) Instruction Set" Rev.2 01 - - Note: Rev. 03 Final 2010 01 26 is available. Looks like some code change from Rev.2 01 - - cclee 3-13-10 -*/ - - .text - .align 4,0x90 - - // hw_aes_encrypt_key(key, klen, hwectx); - // klen = 16, 24, or 32, or (128/192/256) - - .globl _aes_encrypt_key_hw -_aes_encrypt_key_hw: - -#ifdef __i386__ - push %ebp - mov %esp, %ebp - push %ebx - push %edi - mov 8(%ebp), %eax // pointer to key - mov 12(%ebp), %ebx // klen - mov 16(%ebp), %edi // ctx - #define pkey %eax - #define klen %ebx - #define ctx %edi - #define sp %esp - #define cx %ecx -#else - #define pkey %rdi - #define klen %rsi - #define ctx %rdx - #define sp %rsp - #define cx %rcx - push %rbp - mov %rsp, %rbp -#endif - -#ifdef KERNEL - // for xmm registers save and restore - sub $(16*4), sp -#endif - - cmp $32, klen - jg 0f // klen>32 - shl $3, klen // convert 16/24/32 to 128/192/256 -0: - - cmp $128, klen // AES-128 ? - je L_AES_128_Encrypt_Key - cmp $192, klen // AES-192 ? - je L_AES_192_Encrypt_Key - cmp $256, klen // AES-256 ? - je L_AES_256_Encrypt_Key - mov $1, %eax // return error for wrong klen -L_Encrypt_Key_2_return: -#ifdef KERNEL - add $(16*4), sp -#endif -#ifdef __i386__ - pop %edi - pop %ebx -#endif - leave - ret - -L_AES_128_Encrypt_Key: -#ifdef KERNEL - // save xmm registers - movaps %xmm1, (sp) - movaps %xmm2, 16(sp) - movaps %xmm3, 32(sp) -#endif // KERNEL - - movl $160, 240(ctx) // write expanded key length to ctx - xor cx, cx - - movups (pkey), %xmm1 - movups %xmm1, (ctx) - aeskeygenassist $1, %xmm1, %xmm2 - call L_key_expansion_128 - aeskeygenassist $2, %xmm1, %xmm2 - call L_key_expansion_128 - aeskeygenassist $4, %xmm1, %xmm2 - call L_key_expansion_128 - aeskeygenassist $8, %xmm1, %xmm2 - call L_key_expansion_128 - aeskeygenassist $0x10, %xmm1, %xmm2 - call L_key_expansion_128 - aeskeygenassist $0x20, %xmm1, %xmm2 - call L_key_expansion_128 - aeskeygenassist $0x40, %xmm1, %xmm2 - call L_key_expansion_128 - aeskeygenassist $0x80, %xmm1, %xmm2 - call L_key_expansion_128 - aeskeygenassist $0x1b, %xmm1, %xmm2 - call L_key_expansion_128 - aeskeygenassist $0x36, %xmm1, %xmm2 - call L_key_expansion_128 - -#ifdef KERNEL - // restore xmm registers - movaps (sp), %xmm1 - movaps 16(sp), %xmm2 - movaps 32(sp), %xmm3 -#endif // KERNEL - xor %eax, %eax // return 0 for success - jmp L_Encrypt_Key_2_return - - .align 4, 0x90 -L_key_expansion_128: - pshufd $0xff, %xmm2, %xmm2 - movaps %xmm1, %xmm3 - pslldq $4, %xmm3 - pxor %xmm3, %xmm1 - movaps %xmm1, %xmm3 - pslldq $4, %xmm3 - pxor %xmm3, %xmm1 - movaps %xmm1, %xmm3 - pslldq $4, %xmm3 - pxor %xmm3, %xmm1 - pxor %xmm2, %xmm1 - add $16, cx - movups %xmm1, (ctx, cx) - ret - -L_AES_192_Encrypt_Key: -#ifdef KERNEL - // save xmm registers - movaps %xmm1, (sp) - movaps %xmm2, 16(sp) - movaps %xmm3, 32(sp) - movaps %xmm4, 48(sp) -#endif // KERNEL - movl $192, 240(ctx) // write expanded key length to ctx - - movups (pkey), %xmm1 - movq 16(pkey), %xmm3 - - movups %xmm1, (ctx) - movq %xmm3, 16(ctx) - - lea 24(ctx), cx - - aeskeygenassist $1, %xmm3, %xmm2 - call
L_key_expansion_192 - aeskeygenassist $2, %xmm3, %xmm2 - call L_key_expansion_192 - aeskeygenassist $4, %xmm3, %xmm2 - call L_key_expansion_192 - aeskeygenassist $8, %xmm3, %xmm2 - call L_key_expansion_192 - aeskeygenassist $0x10, %xmm3, %xmm2 - call L_key_expansion_192 - aeskeygenassist $0x20, %xmm3, %xmm2 - call L_key_expansion_192 - aeskeygenassist $0x40, %xmm3, %xmm2 - call L_key_expansion_192 - aeskeygenassist $0x80, %xmm3, %xmm2 - call L_key_expansion_192 - -#ifdef KERNEL - // restore xmm registers - movaps (sp), %xmm1 - movaps 16(sp), %xmm2 - movaps 32(sp), %xmm3 - movaps 48(sp), %xmm4 -#endif // KERNEL - xor %eax, %eax // return 0 for success - jmp L_Encrypt_Key_2_return - - .align 4, 0x90 -L_key_expansion_192: - pshufd $0x55, %xmm2, %xmm2 - - movaps %xmm1, %xmm4 - pslldq $4, %xmm4 - - pxor %xmm4, %xmm1 - pslldq $4, %xmm4 - - pxor %xmm4, %xmm1 - pslldq $4, %xmm4 - - pxor %xmm4, %xmm1 - pxor %xmm2, %xmm1 - - pshufd $0xff, %xmm1, %xmm2 - - movaps %xmm3, %xmm4 - pslldq $4, %xmm4 - - pxor %xmm4, %xmm3 - pxor %xmm2, %xmm3 - - movups %xmm1, (cx) - movq %xmm3, 16(cx) - - add $24, cx - ret - -L_AES_256_Encrypt_Key: -#ifdef KERNEL - // save xmm registers - movaps %xmm1, (sp) - movaps %xmm2, 16(sp) - movaps %xmm3, 32(sp) - movaps %xmm4, 48(sp) -#endif // KERNEL - movl $224, 240(ctx) // write expanded key length to ctx - - movups (pkey), %xmm1 - movups 16(pkey), %xmm3 - movups %xmm1, (ctx) - movups %xmm3, 16(ctx) - - lea 32(ctx), cx - - aeskeygenassist $1, %xmm3, %xmm2 - call L_key_expansion_256 - aeskeygenassist $2, %xmm3, %xmm2 - call L_key_expansion_256 - aeskeygenassist $4, %xmm3, %xmm2 - call L_key_expansion_256 - aeskeygenassist $8, %xmm3, %xmm2 - call L_key_expansion_256 - aeskeygenassist $0x10, %xmm3, %xmm2 - call L_key_expansion_256 - aeskeygenassist $0x20, %xmm3, %xmm2 - call L_key_expansion_256 - aeskeygenassist $0x40, %xmm3, %xmm2 - call L_key_expansion_256_final - -#ifdef KERNEL - // restore xmm registers - movaps (sp), %xmm1 - movaps 16(sp), %xmm2 - movaps 32(sp), %xmm3 - movaps 48(sp), %xmm4 -#endif // KERNEL - xor %eax, %eax // return 0 for success - jmp L_Encrypt_Key_2_return - - .align 4, 0x90 -L_key_expansion_256: - - pshufd $0xff, %xmm2, %xmm2 - - movaps %xmm1, %xmm4 - pslldq $4, %xmm4 - - pxor %xmm4, %xmm1 - pslldq $4, %xmm4 - - pxor %xmm4, %xmm1 - pslldq $4, %xmm4 - - pxor %xmm4, %xmm1 - pxor %xmm2, %xmm1 - - movups %xmm1, (cx) - - aeskeygenassist $0, %xmm1, %xmm4 - - pshufd $0xaa, %xmm4, %xmm2 - - movaps %xmm3, %xmm4 - pslldq $4, %xmm4 - - pxor %xmm4, %xmm3 - pslldq $4, %xmm4 - - pxor %xmm4, %xmm3 - pslldq $4, %xmm4 - - pxor %xmm4, %xmm3 - pxor %xmm2, %xmm3 - - movups %xmm3, 16(cx) - - add $32, cx - ret - - .align 4, 0x90 -L_key_expansion_256_final: - - pshufd $0xff, %xmm2, %xmm2 - - movaps %xmm1, %xmm4 - pslldq $4, %xmm4 - - pxor %xmm4, %xmm1 - pslldq $4, %xmm4 - - pxor %xmm4, %xmm1 - pslldq $4, %xmm4 - - pxor %xmm4, %xmm1 - pxor %xmm2, %xmm1 - - movups %xmm1, (cx) - ret - -// _aes_decrypt_key_hw is implemented as -// 1. call _aes_encrypt_key_hw -// 2. 
use aesimc to convert the expanded round keys (except the 1st and last round keys) - - .text - .align 4, 0x90 - .globl _aes_decrypt_key_hw -_aes_decrypt_key_hw: - -#ifdef __i386__ - - push %ebp - mov %esp, %ebp - sub $(8+16), %esp - - // copy input arguments for calling aes_decrypt_key_hw - - mov 8(%ebp), %eax - mov %eax, (%esp) - mov 12(%ebp), %eax - mov %eax, 4(%esp) - mov 16(%ebp), %eax - mov %eax, 8(%esp) - -#else - - push %rbp - mov %rsp, %rbp - sub $16, %rsp - - // calling arguments %rdi/%rsi/%rdx will be used for encrypt_key - // %rdx (ctx) will return unchanged - // %rsi (klen) will (<<3) if <= 32 - -#endif - call _aes_encrypt_key_hw - cmp $0, %eax - je L_decrypt_inv -L_decrypt_almost_done: -#ifdef __i386__ - add $(8+16), %esp -#else - add $16, %rsp -#endif - leave - ret - -L_decrypt_inv: -#ifdef KERNEL - movaps %xmm0, (sp) -#endif - -#ifdef __i386__ - #undef klen - #undef ctx - mov 12(%ebp), %eax // klen - mov 16(%ebp), %edx // ctx - #define klen %eax - #define ctx %edx - cmp $32, klen - jg 0f // klen>32 - shl $3, klen // convert 16/24/32 to 128/192/256 -0: -#endif - - mov $9, cx // default is AES-128 - cmp $128, klen - je L_Decrypt_Key - add $2, cx - cmp $192, klen - je L_Decrypt_Key - add $2, cx - -L_Decrypt_Key: - add $16, ctx - movups (ctx), %xmm0 - aesimc %xmm0, %xmm0 - movups %xmm0, (ctx) - sub $1, cx - jg L_Decrypt_Key - -#ifdef KERNEL - movaps (sp), %xmm0 -#endif -#ifdef __i386__ - xor %eax, %eax -#endif - jmp L_decrypt_almost_done - diff --git a/bsd/crypto/aes/i386/aes_modes_asm.s b/bsd/crypto/aes/i386/aes_modes_asm.s deleted file mode 100644 index 3b0f29aa1..000000000 --- a/bsd/crypto/aes/i386/aes_modes_asm.s +++ /dev/null @@ -1,420 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 31/01/2006 - - These subroutines implement multiple block AES modes for ECB, CBC, CFB, - OFB and CTR encryption, The code provides support for the VIA Advanced - Cryptography Engine (ACE). 
- - NOTE: In the following subroutines, the AES contexts (ctx) must be - 16 byte aligned if VIA ACE is being used -*/ - -/* modified 3/5/10 cclee */ -/* Clean up those related to VIA ACE and hand optimize aes_cbc_encrypt and aes_cbc_decrypt */ -/* move the xmm registers save/restore originally inside the callee functions into these 2 caller functions */ - -/* add code comments/description and HW AES detection and execution branch cclee 3-13-10 */ - -#ifdef KERNEL -#include <i386/cpu_capabilities.h> // to use __cpu_capabilities&kHasAES to detect Intel Westmere AES HW -#else -#include <System/i386/cpu_capabilities.h> // to use __cpu_capabilities&kHasAES to detect Intel Westmere AES HW -#endif - -#if 0 - -// TODO: -// aes_ecb_encrypt and aes_ecb_decrypt are not present in gen/aescrypt.c -// would add the implementation if needed -// they are now compiled from aes_modes.c - -aes_rval aes_ecb_encrypt(const unsigned char *ibuf, unsigned char *obuf, - int len, const aes_encrypt_ctx ctx[1]) -{ int nb = len >> 4; - - if(len & (AES_BLOCK_SIZE - 1)) return 1; - while(nb--) { - aes_encrypt(ibuf, obuf, ctx); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - } - return 0; -} - -aes_rval aes_ecb_decrypt(const unsigned char *ibuf, unsigned char *obuf, - int len, const aes_decrypt_ctx ctx[1]) -{ int nb = len >> 4; - - if(len & (AES_BLOCK_SIZE - 1)) return 1; - while(nb--) { - aes_decrypt(ibuf, obuf, ctx); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - } - return 0; -} -#endif - -#if 0 -aes_rval aes_encrypt_cbc(const unsigned char *ibuf, const unsigned char *in_iv, unsigned int num_blk, - unsigned char *obuf, const aes_encrypt_ctx ctx[1]) -{ - unsigned char iv[16]; - int i; - - for (i = 0; i < 16; i++) iv[i] = *(in_iv + i); - - while (num_blk--) { - iv ^= ibuf; // 128-bit XOR, shown schematically (iv and ibuf are byte arrays) - aes_encrypt(iv, iv, ctx); - memcpy(obuf, iv, AES_BLOCK_SIZE); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - - } - - return 0; -} -#endif - - .text - .align 4,0x90 - .globl _aes_encrypt_cbc -_aes_encrypt_cbc: - - // detect AES HW - // if AES HW detected, branch to AES-HW-specific function _aes_encrypt_cbc_hw (aes_modes_hw.s) - // o.w., fall through to the original AES-SW function - -#if defined __x86_64__ - movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities - mov (%rax), %eax // %eax = __cpu_capabilities -#else -#ifdef KERNEL - leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities - mov (%eax), %eax // %eax = __cpu_capabilities -#else - mov _COMM_PAGE_CPU_CAPABILITIES, %eax -#endif -#endif - test $(kHasAES), %eax // kHasAES & __cpu_capabilities - jne _aes_encrypt_cbc_hw // if AES HW detected, branch to HW-specific code - - // save registers and allocate stack memory for xmm registers and calling arguments (i386 only) -#if defined __i386__ - push %ebp - mov %esp, %ebp - push %ebx // to be used as ibuf - push %edi // to be used as obuf - sub $(16+16+7*16), %esp // 12 (calling arguments) + 4 (%esi) + 16 (iv) + 7*16 (xmm) - mov %esi, 12(%esp) // save %esi in the unused 4 bytes, to be used as num_blk - - #define sp %esp -#else // __x86_64__ - push %rbp - mov %rsp, %rbp - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 - sub $(8+16+5*16+16), %rsp // 8 (align) + 16 (dummy iv) + 5*16 (xmm) + 16 (for i386-x86_64 consistency) - - #define sp %rsp -#endif - - // save xmm registers for kernel use - // xmm6-xmm7 will be used locally - // xmm0-xmm2 (x86_64) or xmm0-xmm4 (i386) will be used inside _aes_encrypt_xmm_no_save (non-restored) - // there is a hole not used for xmm, which is 48(sp).
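 // For reference, the resulting i386 frame, per the movaps sequence just
 // below (offsets from the post-sub stack pointer):
 //   0(sp)-11(sp)    calling arguments for aes_encrypt
 //   12(sp)          saved %esi
 //   16(sp), 32(sp)  saved xmm7, xmm6
 //   48(sp)          16-byte hole (iv)
 //   64(sp)-128(sp)  saved xmm0-xmm4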
- // it has been used to store iv (16-bytes) in i386 code - // for consistency between i386 and x86_64, this hole is dummied in x86_64 code - // also the 1st 16 bytes (sp) is dummied in x86_64 code - -#ifdef KERNEL - movaps %xmm7, 16(sp) - movaps %xmm6, 32(sp) - movaps %xmm0, 64(sp) - movaps %xmm1, 80(sp) - movaps %xmm2, 96(sp) -#if defined __i386__ - movaps %xmm3, 112(sp) - movaps %xmm4, 128(sp) -#endif -#endif - - // set up registers from calling arguments - -#if defined __i386__ - - mov 12(%ebp), %eax // in_iv - mov 24(%ebp), %edx // ctx - movups (%eax), %xmm7 // in_iv - lea 48(%esp), %eax // &iv[0] - mov %eax, (%esp) // 1st iv for aes_encrypt - mov %eax, 4(%esp) // 2nd iv for aes_encrypt - mov %edx, 8(%esp) // ctx for aes_encrypt - mov 8(%ebp), %ebx // ibuf - mov 16(%ebp), %esi // num_blk - mov 20(%ebp), %edi // obuf - - #define ibuf %ebx - #define obuf %edi - #define num_blk %esi - -#else // __x86_64__, calling arguments order : rdi/rsi/rdx/rcx/r8 - - mov %rdi, %rbx // ibuf - lea 48(sp), %r12 // &iv - movups (%rsi), %xmm7 // in_iv - mov %rdx, %r13 // num_blk - mov %rcx, %r14 // obuf - mov %r8, %r15 // ctx - - #define ibuf %rbx - #define iv %r12 - #define num_blk %r13d - #define obuf %r14 - #define ctx %r15 - -#endif - - cmp $1, num_blk // num_blk vs 1 - jl 9f // if num_blk < 1, branch to bypass the main loop -0: - movups (ibuf), %xmm6 // ibuf -#if defined __i386__ - lea 48(sp), %eax // &iv[0] - pxor %xmm6, %xmm7 // iv ^= ibuf - movups %xmm7, (%eax) // save iv -#else - pxor %xmm6, %xmm7 // iv ^= ibuf - movups %xmm7, (iv) // save iv - mov iv, %rdi // 1st calling argument for aes_encrypt - mov iv, %rsi // 2nd calling argument for aes_encrypt - mov ctx, %rdx // 3rd calling argument for aes_encrypt -#endif - call _aes_encrypt_xmm_no_save // aes_encrypt(iv, iv, ctx) -#if defined __i386__ - leal 48(%esp), %eax // &iv[0] - movups (%eax), %xmm7 // read iv -#else - movups (iv), %xmm7 // read iv -#endif - movups %xmm7, (obuf) // memcpy(obuf, iv, AES_BLOCK_SIZE); - add $16, ibuf // ibuf += AES_BLOCK_SIZE; - add $16, obuf // obuf += AES_BLOCK_SIZE; - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop -9: - -L_crypt_cbc_done: - - // restore xmm registers due to kernel use -#ifdef KERNEL - movaps 16(sp), %xmm7 - movaps 32(sp), %xmm6 - movaps 64(sp), %xmm0 - movaps 80(sp), %xmm1 - movaps 96(sp), %xmm2 -#if defined __i386__ - movaps 112(sp), %xmm3 - movaps 128(sp), %xmm4 -#endif -#endif - - xor %eax, %eax // to return 0 for SUCCESS - -#if defined __i386__ - mov 12(%esp), %esi // restore %esi - add $(16+16+7*16), %esp // 12 (calling arguments) + 4 (%esi) + 16 (iv) + 7*16 (xmm) - pop %edi - pop %ebx -#else - add $(8+16+5*16+16), %rsp // 8 (align) + 16 (dummy iv) + 5*16 (xmm) + 16 (for i386-x86_64 consistency) - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx -#endif - leave - ret - -#if 0 -aes_rval aes_decrypt_cbc(const unsigned char *ibuf, const unsigned char *in_iv, unsigned int num_blk, - unsigned char *obuf, const aes_decrypt_ctx cx[1]) -{ - unsigned char iv[16], tmp[16]; - int i; - - for (i = 0; i < 16; i++) iv[i] = *(in_iv + i); - - while (num_blk--) { - - memcpy(tmp, ibuf, AES_BLOCK_SIZE); - aes_decrypt(ibuf, obuf, ctx); - obuf ^= iv; - memcpy(iv, tmp, AES_BLOCK_SIZE); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - } - - return 0; -} -#endif - - .text - .align 4,0x90 - .globl _aes_decrypt_cbc -_aes_decrypt_cbc: - - // detect AES HW - // if AES HW detected, branch to AES-HW-specific function _aes_decrypt_cbc_hw (aes_modes_hw.s) - // o.w., fall through to the 
original AES-SW function - -#if defined __x86_64__ - movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capability - mov (%rax), %eax // %eax = __cpu_capabilities -#else -#ifdef KERNEL - leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities - mov (%eax), %eax // %eax = __cpu_capabilities -#else - mov _COMM_PAGE_CPU_CAPABILITIES, %eax -#endif -#endif - test $(kHasAES), %eax // kHasAES & __cpu_capabilities - jne _aes_decrypt_cbc_hw - - // save registers and allocate stack memory for xmm registers and calling arguments (i386 only) -#if defined __i386__ - push %ebp - mov %esp, %ebp - push %ebx // to be used as ibuf - push %edi // to be used as obuf - sub $(16+16+7*16), %esp // 12 (calling arguments) + 4 (%esi) + 16 (iv) + 7*16 (xmm) - mov %esi, 12(%esp) // save %esp in the unused 4-bytes, to be used as num_blk - - #define sp %esp -#else // __x86_64__ - push %rbp - mov %rsp, %rbp - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 - sub $(8+16+5*16+16), %rsp // 8 (align) + 16 (dummy iv) + 5*16 (xmm) + 16 (for i386-x86_64 consistency) - - #define sp %rsp -#endif - - // save xmm registers for kernel use - // xmm6-xmm7 will be used locally - // xmm0-xmm2 (x86_64) or xmm0-/xmm4 (i386) will be used inside _aes_encrypt_xmm_no_save (non-restored) - // there is a hole not used for xmm, which is 48(sp). - // it has been used to store iv (16-bytes) in i386 code - // for consistency between i386 and x86_64, this hole is dummied in x86_64 code - // also the 1st 16 bytes (sp) is dummied in x86_64 code - -#ifdef KERNEL - movaps %xmm7, 16(sp) - movaps %xmm6, 32(sp) - movaps %xmm0, 64(sp) - movaps %xmm1, 80(sp) - movaps %xmm2, 96(sp) -#if defined __i386__ - movaps %xmm3, 112(sp) - movaps %xmm4, 128(sp) -#endif -#endif - - // set up registers from calling arguments - -#if defined __i386__ - mov 12(%ebp), %eax // in_iv - mov 24(%ebp), %edx // ctx - movups (%eax), %xmm7 // in_iv - mov %edx, 8(%esp) // ctx for aes_encrypt - mov 8(%ebp), %ebx // ibuf - mov 16(%ebp), %esi // num_blk - mov 20(%ebp), %edi // obuf - - #define ibuf %ebx - #define obuf %edi - #define num_blk %esi -#else // __x86_64__, rdi/rsi/rdx/rcx/r8 - mov %rdi, %rbx // ibuf - movups (%rsi), %xmm7 // in_iv - mov %rdx, %r13 // num_blk - mov %rcx, %r14 // obuf - mov %r8, %r15 // ctx - - #define ibuf %rbx - #define num_blk %r13d - #define obuf %r14 - #define ctx %r15 - -#endif - // memcpy(tmp, ibuf, AES_BLOCK_SIZE); - // aes_decrypt(ibuf, obuf, ctx); - // obuf ^= iv; - // memcpy(iv, tmp, AES_BLOCK_SIZE); - // ibuf += AES_BLOCK_SIZE; - // obuf += AES_BLOCK_SIZE; - - cmp $1, num_blk // num_blk vs 1 - jl L_crypt_cbc_done // if num_blk < 1, bypass the main loop, jump to finishing code -0: - movups (ibuf), %xmm6 // tmp -#if defined __i386__ - mov ibuf, (sp) // ibuf - mov obuf, 4(sp) // obuf -#else - mov ibuf, %rdi // ibuf - mov obuf, %rsi // obuf - mov ctx, %rdx // ctx -#endif - call _aes_decrypt_xmm_no_save // aes_decrypt(ibuf, obuf, ctx) - movups (obuf), %xmm0 // obuf - pxor %xmm7, %xmm0 // obuf ^= iv; - movaps %xmm6, %xmm7 // memcpy(iv, tmp, AES_BLOCK_SIZE); - movups %xmm0, (obuf) // update obuf - add $16, ibuf // ibuf += AES_BLOCK_SIZE; - add $16, obuf // obuf += AES_BLOCK_SIZE; - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop -9: - - // we are done here, the finishing code is identical to that in aes_encrypt_cbc, so just jump to there - jmp L_crypt_cbc_done - diff --git a/bsd/crypto/aes/i386/aes_modes_hw.s b/bsd/crypto/aes/i386/aes_modes_hw.s deleted file mode 100644 index b9e35085c..000000000 --- 
a/bsd/crypto/aes/i386/aes_modes_hw.s +++ /dev/null @@ -1,1623 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 31/01/2006 - - These subroutines implement multiple block AES modes for ECB, CBC, CFB, - OFB and CTR encryption, The code provides support for the VIA Advanced - Cryptography Engine (ACE). - - NOTE: In the following subroutines, the AES contexts (ctx) must be - 16 byte aligned if VIA ACE is being used -*/ - - -/* ---------------------------------------------------------------------------------------------------------------- - - aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s): - - For simplicity, I am assuming all variables are of a 128-bit data type. - - aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx) - { - while(num_blk--) { - *iv ^= *ibuf++; - aes_encrypt(iv, iv, ctx); - *obuf++ = *iv; - } - return 0; - } - - The following is an implementation of this function using Intel AESNI. - This function _aes_encrypt_cbc_hw SHOULD NOT be called directly. - Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch - to this AESNI-based function should it detect that AESNI is available. - Blindly calling this function will SURELY cause a CRASH on systems with no AESNI support. - - Note that each block starts with *iv, which is the output of the previous block. Therefore, the CBC blocks - are serially chained. This prevents us from arranging several blocks for encryption in parallel.
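   In intrinsics form, the chained loop implemented below looks roughly like the
   following sketch (AES-128 path only; rk[] stands for the 11 round keys the
   context carries at 16-byte strides, and the function name is illustrative):

       #include <wmmintrin.h>  // AESNI intrinsics; build with -maes

       static void cbc_encrypt128_sketch(const __m128i *ibuf, __m128i iv,
                                         int num_blk, __m128i *obuf,
                                         const __m128i rk[11])
       {
           while (num_blk--) {
               iv = _mm_xor_si128(iv, _mm_loadu_si128(ibuf++)); // *iv ^= *ibuf++
               iv = _mm_xor_si128(iv, rk[0]);                   // whitening key
               for (int r = 1; r <= 9; ++r)
                   iv = _mm_aesenc_si128(iv, rk[r]);            // rounds 1-9
               iv = _mm_aesenclast_si128(iv, rk[10]);           // final round
               _mm_storeu_si128(obuf++, iv);                    // *obuf++ = *iv
           }
       }

   The hand-scheduled x86_64 code below instead keeps rk[0..10] resident in
   %xmm2-%xmm12 (the movups run at L_encrypt_128), which is much of the point of
   writing it in assembly.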
- - ----------------------------------------------------------------------------------------------------------------*/ - - .text - .align 4,0x90 - .globl _aes_encrypt_cbc_hw -_aes_encrypt_cbc_hw: - - // push/save registers for local use -#if defined __i386__ - - push %ebp - movl %esp, %ebp - push %ebx - push %edi - - #define sp %esp - -#else // __x86_64__ - - push %rbp - mov %rsp, %rbp - push %rbx - push %r13 - push %r14 - push %r15 - - #define sp %rsp - -#endif - - // if this is kernel code, need to save used xmm registers -#ifdef KERNEL - -#if defined __i386__ - sub $(8*16), %esp // for possible xmm0-xmm7 save/restore -#else - sub $(16*16), %rsp // xmm0-xmm15 save/restore -#endif - - movaps %xmm0, (sp) - movaps %xmm1, 16(sp) - movaps %xmm2, 32(sp) - movaps %xmm3, 48(sp) - movaps %xmm4, 64(sp) - movaps %xmm5, 80(sp) - movaps %xmm6, 96(sp) - movaps %xmm7, 112(sp) -#if defined __x86_64__ - movaps %xmm8, 16*8(sp) - movaps %xmm9, 16*9(sp) - movaps %xmm10, 16*10(sp) - movaps %xmm11, 16*11(sp) - movaps %xmm12, 16*12(sp) - movaps %xmm13, 16*13(sp) - movaps %xmm14, 16*14(sp) - movaps %xmm15, 16*15(sp) -#endif // __x86_64__ - -#endif // KERNEL - - #define iv %xmm0 - -#ifdef __i386__ - - mov 12(%ebp), %eax // in_iv - mov 24(%ebp), %edx // ctx - movups (%eax), iv // iv = in_iv - mov 8(%ebp), %ebx // ibuf - mov 16(%ebp), %ecx // num_blk - mov 20(%ebp), %edi // obuf - - #define ibuf %ebx - #define obuf %edi - #define num_blk %ecx - #define ctx %edx - -#else - - mov %rdi, %rbx // ibuf - movups (%rsi), iv // iv = in_iv - mov %rdx, %r13 // num_blk - mov %rcx, %r14 // obuf - mov %r8, %r15 // ctx - - #define ibuf %rbx - #define num_blk %r13d - #define obuf %r14 - #define ctx %r15 - -#endif - - mov 240(ctx), %eax // aes length - cmp $160, %eax // aes-128 encrypt ? - je L_encrypt_128 - cmp $192, %eax // aes-192 encrypt ? - je L_encrypt_192 - cmp $224, %eax // aes-256 encrypt ? 
- je L_encrypt_256 - mov $-1, %eax // return error - jmp L_error - - // - // aes-128 encrypt_cbc operation, up to L_HW_cbc_done - // - -L_encrypt_128: - - cmp $1, num_blk // check number of block - jl L_HW_cbc_done // should it be less than 1, nothing to do - - movups (ctx), %xmm2 // key0 - movups 16(ctx), %xmm3 // key1 - movups 32(ctx), %xmm4 // key2 - movups 48(ctx), %xmm5 // key3 - movups 64(ctx), %xmm6 // key4 - movups 80(ctx), %xmm7 // key5 -#if defined __x86_64__ - movups 96(ctx), %xmm8 // key6 - movups 112(ctx), %xmm9 // key7 - movups 128(ctx), %xmm10 // key8 - movups 144(ctx), %xmm11 // key9 - movups 160(ctx), %xmm12 // keyA -#endif - - // while (num_blk--) { - // *iv ^= *ibuf++; - // aes_encrypt(iv, iv, ctx); - // *obuf++ = *iv; - // } -0: - movups (ibuf), %xmm1 // *ibuf - pxor %xmm2, iv // 1st instruction inside aes_encrypt - pxor %xmm1, iv // *iv ^= *ibuf - - // finishing up the rest of aes_encrypt - aesenc %xmm3, iv - aesenc %xmm4, iv - aesenc %xmm5, iv - aesenc %xmm6, iv - aesenc %xmm7, iv -#if defined __x86_64__ - aesenc %xmm8, iv - aesenc %xmm9, iv - aesenc %xmm10, iv - aesenc %xmm11, iv - aesenclast %xmm12, iv -#else - movups 96(ctx), %xmm1 // key6 - aesenc %xmm1, iv - movups 112(ctx), %xmm1 // key7 - aesenc %xmm1, iv - movups 128(ctx), %xmm1 // key8 - aesenc %xmm1, iv - movups 144(ctx), %xmm1 // key9 - aesenc %xmm1, iv - movups 160(ctx), %xmm1 // keyA - aesenclast %xmm1, iv -#endif - - movups iv, (obuf) // *obuf = *iv; - add $16, obuf // obuf++; - add $16, ibuf // ibuf++; - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - // the following will be branched to from all other cases (encrypt/decrypt 128/192/256) - -L_HW_cbc_done: - - xor %eax, %eax // to return CRYPT_OK - -L_error: - - // if kernel, restore xmm registers -#ifdef KERNEL - movaps 0(sp), %xmm0 - movaps 16(sp), %xmm1 - movaps 32(sp), %xmm2 - movaps 48(sp), %xmm3 - movaps 64(sp), %xmm4 - movaps 80(sp), %xmm5 - movaps 96(sp), %xmm6 - movaps 112(sp), %xmm7 -#if defined __x86_64__ - movaps 16*8(sp), %xmm8 - movaps 16*9(sp), %xmm9 - movaps 16*10(sp), %xmm10 - movaps 16*11(sp), %xmm11 - movaps 16*12(sp), %xmm12 - movaps 16*13(sp), %xmm13 - movaps 16*14(sp), %xmm14 - movaps 16*15(sp), %xmm15 -#endif // __x86_64__ -#endif // KERNEL - - // release used stack memory, restore used callee-saved registers, and return -#if defined __i386__ -#ifdef KERNEL - add $(8*16), %esp -#endif - pop %edi - pop %ebx -#else -#ifdef KERNEL - add $(16*16), %rsp -#endif - pop %r15 - pop %r14 - pop %r13 - pop %rbx -#endif - leave - ret - - // - // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_encrypt_192: - - cmp $1, num_blk // check number of block - jl L_HW_cbc_done // should it be less than 1, nothing to do - - movups (ctx), %xmm2 // key0 - movups 16(ctx), %xmm3 // key1 - movups 32(ctx), %xmm4 // key2 - movups 48(ctx), %xmm5 // key3 - movups 64(ctx), %xmm6 // key4 - movups 80(ctx), %xmm7 // key5 -#if defined __x86_64__ - movups 96(ctx), %xmm8 // key6 - movups 112(ctx), %xmm9 // key7 - movups 128(ctx), %xmm10 // key8 - movups 144(ctx), %xmm11 // key9 - movups 160(ctx), %xmm12 // keyA - movups 176(ctx), %xmm13 // keyB - movups 192(ctx), %xmm14 // keyC -#endif - - // while (num_blk--) { - // *iv ^= *ibuf++; - // aes_encrypt(iv, iv, ctx); - // *obuf++ = *iv; - // } -0: - movups (ibuf), %xmm1 // *ibuf - pxor %xmm1, iv // *iv ^= ibuf - - // aes_encrypt(iv, iv, ctx); - - pxor %xmm2, iv - aesenc %xmm3, iv - aesenc %xmm4, iv - aesenc %xmm5, iv - aesenc %xmm6, iv - aesenc %xmm7, iv -#if 
defined __x86_64__ - aesenc %xmm8, iv - aesenc %xmm9, iv - aesenc %xmm10, iv - aesenc %xmm11, iv - aesenc %xmm12, iv - aesenc %xmm13, iv - aesenclast %xmm14, iv -#else - movups 96(ctx), %xmm1 - aesenc %xmm1, iv - movups 112(ctx), %xmm1 - aesenc %xmm1, iv - movups 128(ctx), %xmm1 - aesenc %xmm1, iv - movups 144(ctx), %xmm1 - aesenc %xmm1, iv - movups 160(ctx), %xmm1 - aesenc %xmm1, iv - movups 176(ctx), %xmm1 - aesenc %xmm1, iv - movups 192(ctx), %xmm1 - aesenclast %xmm1, iv -#endif - - movups iv, (obuf) // *obuf = *iv; - add $16, ibuf // ibuf++ - add $16, obuf // obuf++ - - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done // share with the common exit code - - // - // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_encrypt_256: - - cmp $1, num_blk // check number of block - jl L_HW_cbc_done // should it be less than 1, nothing to do - - movups (ctx), %xmm2 // key0 - movups 16(ctx), %xmm3 // key1 - movups 32(ctx), %xmm4 // key2 - movups 48(ctx), %xmm5 // key3 - movups 64(ctx), %xmm6 // key4 - movups 80(ctx), %xmm7 // key5 -#if defined __x86_64__ - movups 96(ctx), %xmm8 // key6 - movups 112(ctx), %xmm9 // key7 - movups 128(ctx), %xmm10 // key8 - movups 144(ctx), %xmm11 // key9 - movups 160(ctx), %xmm12 // keyA - movups 176(ctx), %xmm13 // keyB - movups 192(ctx), %xmm14 // keyC - movups 208(ctx), %xmm15 // keyD - // movups 224(ctx), %xmm1 // keyE -#endif - - // while (num_blk--) { - // *iv ^= *ibuf++; - // aes_encrypt(iv, iv, ctx); - // *obuf++ = *iv; - // } -0: - movups (ibuf), %xmm1 // *ibuf - pxor %xmm1, iv // *iv ^= ibuf - - // aes_encrypt(iv, iv, ctx); - pxor %xmm2, iv - aesenc %xmm3, iv - aesenc %xmm4, iv - aesenc %xmm5, iv - aesenc %xmm6, iv - aesenc %xmm7, iv -#if defined __x86_64__ - movups 224(ctx), %xmm1 // keyE - aesenc %xmm8, iv - aesenc %xmm9, iv - aesenc %xmm10, iv - aesenc %xmm11, iv - aesenc %xmm12, iv - aesenc %xmm13, iv - aesenc %xmm14, iv - aesenc %xmm15, iv - aesenclast %xmm1, iv -#else - movups 96(ctx), %xmm1 // key6 - aesenc %xmm1, iv - movups 112(ctx), %xmm1 // key7 - aesenc %xmm1, iv - movups 128(ctx), %xmm1 // key8 - aesenc %xmm1, iv - movups 144(ctx), %xmm1 // key9 - aesenc %xmm1, iv - movups 160(ctx), %xmm1 // keyA - aesenc %xmm1, iv - movups 176(ctx), %xmm1 // keyB - aesenc %xmm1, iv - movups 192(ctx), %xmm1 // keyC - aesenc %xmm1, iv - movups 208(ctx), %xmm1 // keyD - aesenc %xmm1, iv - movups 224(ctx), %xmm1 // keyE - aesenclast %xmm1, iv -#endif - - movups iv, (obuf) // *obuf = *iv; - add $16, ibuf // ibuf++ - add $16, obuf // obuf++ - - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done // share with the common exit code - - - - // - // --------- END of aes_encrypt_cbc_hw ------------------- - // - - -/* ---------------------------------------------------------------------------------------------------------------- - - aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) : - - For simplicity, I am assuming all variables are in 128-bit data type. - - aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx) - { - while(num_blk--) { - aes_decrypt(ibuf, obuf, ctx); - *obuf++ ^= *iv; - *iv = *ibuf++; - } - return 0; - } - - The following is an implementation of this function using Intel AESNI. - This function _aes_decrypt_cbc_hw SHOULD NOT be called directly. 
- Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch - to this AESNI-based function should it detect that AESNI is available. - Blindly calling this function will SURELY cause a CRASH on systems with no AESNI support. - - Note that the decryption operations are not chained across blocks. - This gives us the opportunity to arrange aes_decrypt operations in parallel to speed up the code. - This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0, pages 53-55). - The following assembly code exploits this idea to achieve a ~1.4x speedup in aes_decrypt_cbc. - - Example C code for packing 4 blocks in an iteration is shown as follows: - - while ((num_blk-=4)>=0) { - - // the following 4 functions can be interleaved to exploit parallelism - aes_decrypt(ibuf, obuf, ctx); - aes_decrypt(ibuf+1, obuf+1, ctx); - aes_decrypt(ibuf+2, obuf+2, ctx); - aes_decrypt(ibuf+3, obuf+3, ctx); - - obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; - *iv = ibuf[3]; ibuf += 4; obuf += 4; - } - num_blk+=4; - - ----------------------------------------------------------------------------------------------------------------*/ - - .text - .align 4,0x90 - .globl _aes_decrypt_cbc_hw -_aes_decrypt_cbc_hw: - - // push/save registers for local use -#if defined __i386__ - - push %ebp - movl %esp, %ebp - push %ebx // ibuf - push %edi // obuf - - #define sp %esp - -#else // __x86_64__ - - push %rbp - mov %rsp, %rbp - push %rbx - push %r13 - push %r14 - push %r15 - - #define sp %rsp - -#endif - - - // if kernel, allocate stack space to save xmm registers -#ifdef KERNEL -#if defined __i386__ - sub $(8*16), %esp -#else - sub $(16*16), %rsp -#endif - movaps %xmm0, (sp) - movaps %xmm1, 16(sp) - movaps %xmm2, 32(sp) - movaps %xmm3, 48(sp) - movaps %xmm4, 64(sp) - movaps %xmm5, 80(sp) - movaps %xmm6, 96(sp) - movaps %xmm7, 112(sp) -#if defined __x86_64__ - movaps %xmm8, 16*8(sp) - movaps %xmm9, 16*9(sp) - movaps %xmm10, 16*10(sp) - movaps %xmm11, 16*11(sp) - movaps %xmm12, 16*12(sp) - movaps %xmm13, 16*13(sp) - movaps %xmm14, 16*14(sp) - movaps %xmm15, 16*15(sp) -#endif // __x86_64__ -#endif - - #undef iv - #define iv %xmm0 - -#if defined __i386__ - mov 12(%ebp), %eax // in_iv - mov 24(%ebp), %edx // ctx - movups (%eax), iv // iv = in_iv - mov 8(%ebp), %ebx // ibuf - mov 16(%ebp), %ecx // num_blk - mov 20(%ebp), %edi // obuf - - #define ibuf %ebx - #define obuf %edi - #define num_blk %ecx - #define ctx %edx - -#else // __x86_64__, rdi/rsi/rdx/rcx/r8 - - mov %rdi, %rbx // ibuf - movups (%rsi), iv // iv = in_iv - mov %rdx, %r13 // num_blk - mov %rcx, %r14 // obuf - mov %r8, %r15 // ctx - - #define ibuf %rbx - #define num_blk %r13d - #define obuf %r14 - #define ctx %r15 - -#endif - - mov 240(ctx), %eax // aes length - cmp $160, %eax // aes-128 decrypt - je L_decrypt_128 - cmp $192, %eax // aes-192 decrypt - je L_decrypt_192 - cmp $224, %eax // aes-256 decrypt - je L_decrypt_256 - - mov $-1, %eax // wrong aes length, to return -1 - jmp L_error // early exit due to wrong aes length - - - // - // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_decrypt_128: - - cmp $1, num_blk - jl L_HW_cbc_done // if num_blk < 1, early return - - // aes-128 decrypt expanded keys - movups 160(ctx), %xmm3 - movups 144(ctx), %xmm4 - movups 128(ctx), %xmm5 - movups 112(ctx), %xmm6 - movups 96(ctx), %xmm7 -#if defined __x86_64__ - movups 80(ctx), %xmm8 - movups 64(ctx), %xmm9 - movups 48(ctx), %xmm10 - movups 32(ctx),
%xmm11 - movups 16(ctx), %xmm12 - movups 0(ctx), %xmm13 -#endif - - // performs 4 block decryption in an iteration to exploit decrypt in parallel - - // while ((num_blk-=4)>=0) { - // aes_decrypt(ibuf, obuf, ctx); - // aes_decrypt(ibuf+1, obuf+1, ctx); - // aes_decrypt(ibuf+2, obuf+2, ctx); - // aes_decrypt(ibuf+3, obuf+3, ctx); - // obuf[0] ^= *iv; obuf[1] ^= ibuf[1]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; - // *iv = ibuf[3]; ibuf += 4; obuf += 4; - // } - - sub $4, num_blk // pre decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-4-blocks processing code - -0: - - -#if defined __x86_64__ - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm14 // tmp = 3rd ibuf - movups 48(ibuf), %xmm15 // tmp = 4th ibuf - - // for x86_64, the expanded keys are already stored in xmm3-xmm13 - - // aes-128 decrypt round 0 per 4 blocks - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm14 - pxor %xmm3, %xmm15 - - // aes-128 decrypt round 1 per 4 blocks - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm4, %xmm14 - aesdec %xmm4, %xmm15 - - // aes-128 decrypt round 2 per 4 blocks - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm5, %xmm14 - aesdec %xmm5, %xmm15 - - // aes-128 decrypt round 3 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm14 - aesdec %xmm6, %xmm15 - - // aes-128 decrypt round 4 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm14 - aesdec %xmm7, %xmm15 - - // aes-128 decrypt round 5 per 4 blocks - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm14 - aesdec %xmm8, %xmm15 - - // aes-128 decrypt round 6 per 4 blocks - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm14 - aesdec %xmm9, %xmm15 - - // aes-128 decrypt round 7 per 4 blocks - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm14 - aesdec %xmm10, %xmm15 - - // aes-128 decrypt round 8 per 4 blocks - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm14 - aesdec %xmm11, %xmm15 - - // aes-128 decrypt round 9 per 4 blocks - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - - // aes-128 decrypt round 10 (last) per 4 blocks - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 - aesdeclast %xmm13, %xmm14 - aesdeclast %xmm13, %xmm15 - - pxor iv, %xmm1 // obuf[0] ^= *iv; - movups (ibuf), iv // ibuf[0] - pxor iv, %xmm2 // obuf[1] ^= ibuf[0]; - movups 16(ibuf), iv // ibuf[1] - pxor iv, %xmm14 // obuf[2] ^= ibuf[1]; - movups 32(ibuf), iv // ibuf[2] - pxor iv, %xmm15 // obuf[3] ^= obuf[2]; - movups 48(ibuf), iv // *iv = ibuf[3] - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm14, 32(obuf) // write 3rd obuf - movups %xmm15, 48(obuf) // write 4th obuf - - -#else - - // aes_decrypt_cbc per 4 blocks using aes-128 for i386 - // xmm1/xmm2/xmm4/xmm5 used for obuf per block - // xmm3 = key0 - // xmm0 = iv - // xmm6/xmm7 dynamically load with other expanded keys - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm4 // tmp = 3rd ibuf - movups 48(ibuf), %xmm5 // tmp = 4th ibuf - - // aes_decrypt - // for i386, sequentially load expanded keys into xmm6/xmm7 - - movups 144(ctx), %xmm6 // key1 - - // aes-128 decrypt round 0 per 4 blocks - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm4 - pxor %xmm3, %xmm5 - - movups 128(ctx), %xmm7 // key2 - - // aes-128 decrypt round 1 per 4 blocks - aesdec %xmm6, %xmm1 - 
aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 112(ctx), %xmm6 // key3 - - // aes-128 decrypt round 2 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 96(ctx), %xmm7 // key4 - - // aes-128 decrypt round 3 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 80(ctx), %xmm6 // key5 - - // aes-128 decrypt round 4 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 64(ctx), %xmm7 // key6 - - // aes-128 decrypt round 5 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 48(ctx), %xmm6 // key7 - - // aes-128 decrypt round 6 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 32(ctx), %xmm7 // key8 - - // aes-128 decrypt round 7 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 16(ctx), %xmm6 // key9 - - // aes-128 decrypt round 8 per 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 0(ctx), %xmm7 // keyA - - // aes-128 decrypt round 9 per 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - // aes-128 decrypt round 10 (last) per 4 blocks - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - aesdeclast %xmm7, %xmm4 - aesdeclast %xmm7, %xmm5 - - pxor iv, %xmm1 // 1st obuf ^= iv; - movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // 2nd obuf ^= iv; - movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm4 // 3rd obuf ^= iv; - movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm5 // 4th obuf ^= iv; - movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm4, 32(obuf) // write 3rd obuf - movups %xmm5, 48(obuf) // write 4th obuf -#endif - - add $64, ibuf // ibuf += 4; - add $64, obuf // obuf += 4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk > 0, repeat the loop - -9: add $4, num_blk // post incremtn num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for forthur processing code - -#if defined __i386__ - // updated as they might be needed as expanded keys in the remaining - movups 144(ctx), %xmm4 - movups 128(ctx), %xmm5 - movups 112(ctx), %xmm6 - movups 96(ctx), %xmm7 -#endif - - test $2, num_blk // check whether num_blk has 2 blocks - je 9f // if num_blk & 2 == 0, skip the per-pair processing code - - // do the remaining 2 blocks together - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - - // aes_decrypt - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 -#if defined __x86_64__ - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 -#else - movups 80(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - movups 64(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - movups 48(ctx), %xmm6 - 
aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - movups 32(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - movups 16(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - movups 0(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - movups 112(ctx), %xmm6 - movups 96(ctx), %xmm7 -#endif - - pxor iv, %xmm1 // obuf[0] ^= *iv; - movups (ibuf), iv // ibuf[0] - pxor iv, %xmm2 // obuf[1] ^= ibuf[0] - movups 16(ibuf), iv // *iv = ibuf[1] - - movups %xmm1, (obuf) // write obuf[0] - movups %xmm2, 16(obuf) // write obuf[1] - - add $32, ibuf // ibuf += 2 - add $32, obuf // obuf += 2 - -9: - test $1, num_blk // check whether num_blk has residual 1 block - je L_HW_cbc_done // if num_blk == 0, no need for residual processing code - - movups (ibuf), %xmm2 // tmp = ibuf - // aes_decrypt - pxor %xmm3, %xmm2 - aesdec %xmm4, %xmm2 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm2 - aesdec %xmm7, %xmm2 -#if defined __x86_64__ - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm2 - aesdeclast %xmm13, %xmm2 -#else - movups 80(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 64(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 48(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 32(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 16(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups (ctx), %xmm1 - aesdeclast %xmm1, %xmm2 -#endif - - pxor iv, %xmm2 // *obuf ^= *iv; - movups (ibuf), iv // *iv = *ibuf; - movups %xmm2, (obuf) // write *obuf - - jmp L_HW_cbc_done - - // - // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_decrypt_192: - - cmp $1, num_blk - jl L_HW_cbc_done // if num_blk < 1, early return - - // aes-192 decrypt expanded keys - movups 192(ctx), %xmm3 - movups 176(ctx), %xmm4 - movups 160(ctx), %xmm5 - movups 144(ctx), %xmm6 - movups 128(ctx), %xmm7 -#if defined __x86_64__ - movups 112(ctx), %xmm8 - movups 96(ctx), %xmm9 - movups 80(ctx), %xmm10 - movups 64(ctx), %xmm11 - movups 48(ctx), %xmm12 - movups 32(ctx), %xmm13 - movups 16(ctx), %xmm14 - movups (ctx), %xmm15 -#endif - - // performs 4-block decryption per iteration to exploit parallelism - - // while ((num_blk-=4)>=0) { - // aes_decrypt(ibuf, obuf, ctx); - // aes_decrypt(ibuf+1, obuf+1, ctx); - // aes_decrypt(ibuf+2, obuf+2, ctx); - // aes_decrypt(ibuf+3, obuf+3, ctx); - // obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2]; - // *iv = ibuf[3]; ibuf += 4; obuf += 4; - // } - - sub $4, num_blk // pre-decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-4-blocks processing code -0: - -#if defined __x86_64__ - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm14 // tmp = 3rd ibuf - movups 48(ibuf), %xmm15 // tmp = 4th ibuf - - // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 - // use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards - - // round 0 for 4 blocks - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm14 - pxor %xmm3, %xmm15 - - // round 1 for 4 blocks - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm4, %xmm14 - aesdec %xmm4, %xmm15 - - // round 2 for 4 blocks - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm5, %xmm14 - aesdec %xmm5, %xmm15 - - // round 3 for 4 blocks - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm14 - aesdec %xmm6, %xmm15 - - // round 4 for 4 blocks - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm14 - aesdec
%xmm7, %xmm15 - - // round 5 for 4 blocks - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm14 - aesdec %xmm8, %xmm15 - - // round 6 for 4 blocks - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm14 - aesdec %xmm9, %xmm15 - - // round 7 for 4 blocks - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm14 - aesdec %xmm10, %xmm15 - - // round 8 for 4 blocks - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm14 - aesdec %xmm11, %xmm15 - - // round 9 for 4 blocks - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - - movups 16(ctx), %xmm12 - - // round A for 4 blocks - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm14 - aesdec %xmm13, %xmm15 - - movups (ctx), %xmm13 - - // round B for 4 blocks - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - - movups 48(ctx), %xmm12 // restore %xmm12 to its original key - - // round C (last) for 4 blocks - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 - aesdeclast %xmm13, %xmm14 - aesdeclast %xmm13, %xmm15 - - movups 32(ctx), %xmm13 // restore %xmm13 to its original key - - pxor iv, %xmm1 // obuf[0] ^= *iv; - movups (ibuf), iv // ibuf[0] - pxor iv, %xmm2 // obuf[1] ^= ibuf[0] - movups 16(ibuf), iv // ibuf[1] - pxor iv, %xmm14 // obuf[2] ^= ibuf[1] - movups 32(ibuf), iv // ibuf[2] - pxor iv, %xmm15 // obuf[3] ^= ibuf[2] - movups 48(ibuf), iv // *iv = ibuf[3] - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm14, 32(obuf) // write 3rd obuf - movups %xmm15, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += 4; - add $64, obuf // obuf += 4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk >= 0, repeat the loop - -9: add $4, num_blk // post-increment num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, prepare to return - - movups 16(ctx), %xmm14 // restore %xmm14 to its key - movups (ctx), %xmm15 // restore %xmm15 to its key - -#else - - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm4 // tmp = 3rd ibuf - movups 48(ibuf), %xmm5 // tmp = 4th ibuf - - // aes_decrypt - // for i386, sequentially load expanded keys into xmm6/xmm7 - movups 176(ctx), %xmm6 - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm4 - pxor %xmm3, %xmm5 - - movups 160(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 144(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 128(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 112(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 96(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 80(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 64(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 48(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 32(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 16(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 0(ctx), %xmm7 -
aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - aesdeclast %xmm7, %xmm4 - aesdeclast %xmm7, %xmm5 - - pxor iv, %xmm1 // 1st obuf ^= iv; - movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // 2nd obuf ^= iv; - movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm4 // 3rd obuf ^= iv; - movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm5 // 4th obuf ^= iv; - movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm4, 32(obuf) // write 3rd obuf - movups %xmm5, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; - add $64, obuf // obuf += AES_BLOCK_SIZE * 4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk >= 0, repeat the loop - - -9: add $4, num_blk // post-increment num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for further processing - - movups 176(ctx), %xmm4 - movups 160(ctx), %xmm5 - movups 144(ctx), %xmm6 - movups 128(ctx), %xmm7 - -#endif - - // per-block aes_decrypt_cbc loop - -0: - movups (ibuf), %xmm2 // tmp = ibuf - - // aes_decrypt - pxor %xmm3, %xmm2 - aesdec %xmm4, %xmm2 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm2 - aesdec %xmm7, %xmm2 -#if defined __x86_64__ - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm2 - aesdec %xmm13, %xmm2 - aesdec %xmm14, %xmm2 - aesdeclast %xmm15, %xmm2 -#else - movups 112(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 96(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 80(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 64(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 48(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 32(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 16(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups (ctx), %xmm1 - aesdeclast %xmm1, %xmm2 -#endif - - pxor iv, %xmm2 // obuf ^= iv; - movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm2, (obuf) // write obuf - - add $16, ibuf // ibuf += AES_BLOCK_SIZE; - add $16, obuf // obuf += AES_BLOCK_SIZE; - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done - - // - // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done - // - -L_decrypt_256: - - cmp $1, num_blk - jl L_HW_cbc_done - - movups 224(ctx), %xmm3 - movups 208(ctx), %xmm4 - movups 192(ctx), %xmm5 - movups 176(ctx), %xmm6 - movups 160(ctx), %xmm7 -#if defined __x86_64__ - movups 144(ctx), %xmm8 - movups 128(ctx), %xmm9 - movups 112(ctx), %xmm10 - movups 96(ctx), %xmm11 - movups 80(ctx), %xmm12 - movups 64(ctx), %xmm13 - movups 48(ctx), %xmm14 - movups 32(ctx), %xmm15 -// movups 16(ctx), %xmm14 -// movups (ctx), %xmm15 -#endif - -#if defined __x86_64__ - - sub $4, num_blk // pre-decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-4-blocks processing code -0: - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm14 // tmp = 3rd ibuf - movups 48(ibuf), %xmm15 // tmp = 4th ibuf - - // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13 - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm14 - pxor %xmm3, %xmm15 - - aesdec %xmm4, %xmm1 - aesdec %xmm4, %xmm2 - aesdec %xmm4, %xmm14 - aesdec %xmm4, %xmm15 - - aesdec %xmm5, %xmm1 - aesdec %xmm5, %xmm2 - aesdec %xmm5, %xmm14 - aesdec %xmm5, %xmm15 - - aesdec %xmm6, %xmm1 - aesdec %xmm6,
%xmm2 - aesdec %xmm6, %xmm14 - aesdec %xmm6, %xmm15 - - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm14 - aesdec %xmm7, %xmm15 - - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm14 - aesdec %xmm8, %xmm15 - - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm14 - aesdec %xmm9, %xmm15 - - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm14 - aesdec %xmm10, %xmm15 - - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm14 - aesdec %xmm11, %xmm15 - - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - movups 48(ctx), %xmm12 - - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm14 - aesdec %xmm13, %xmm15 - movups 32(ctx), %xmm13 - - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - movups 16(ctx), %xmm12 - - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm14 - aesdec %xmm13, %xmm15 - movups (ctx), %xmm13 - - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm14 - aesdec %xmm12, %xmm15 - movups 80(ctx), %xmm12 - - aesdeclast %xmm13, %xmm1 - aesdeclast %xmm13, %xmm2 - aesdeclast %xmm13, %xmm14 - aesdeclast %xmm13, %xmm15 - movups 64(ctx), %xmm13 - - pxor iv, %xmm1 // obuf ^= iv; - movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // obuf ^= iv; - movups 16(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm14 // obuf ^= iv; - movups 32(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm15 // obuf ^= iv; - movups 48(ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm14, 32(obuf) // write 3rd obuf - movups %xmm15, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += AES_BLOCK_SIZE*4; - add $64, obuf // obuf += AES_BLOCK_SIZE*4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk >= 0, repeat the loop - -9: add $4, num_blk // post-increment num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for further processing - - movups 48(ctx), %xmm14 - movups 32(ctx), %xmm15 - -#else - - sub $4, num_blk // pre-decrement num_blk by 4 - jl 9f // if num_blk < 4, skip the per-4-blocks processing code -0: - movups (ibuf), %xmm1 // tmp = 1st ibuf - movups 16(ibuf), %xmm2 // tmp = 2nd ibuf - movups 32(ibuf), %xmm4 // tmp = 3rd ibuf - movups 48(ibuf), %xmm5 // tmp = 4th ibuf - - // aes_decrypt - // for i386, sequentially load expanded keys into xmm6/xmm7 - movups 208(ctx), %xmm6 - pxor %xmm3, %xmm1 - pxor %xmm3, %xmm2 - pxor %xmm3, %xmm4 - pxor %xmm3, %xmm5 - - movups 192(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 176(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 160(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 144(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 128(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 112(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 96(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 80(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - -
movups 64(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 48(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 32(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - movups 16(ctx), %xmm6 - aesdec %xmm7, %xmm1 - aesdec %xmm7, %xmm2 - aesdec %xmm7, %xmm4 - aesdec %xmm7, %xmm5 - - movups 0(ctx), %xmm7 - aesdec %xmm6, %xmm1 - aesdec %xmm6, %xmm2 - aesdec %xmm6, %xmm4 - aesdec %xmm6, %xmm5 - - aesdeclast %xmm7, %xmm1 - aesdeclast %xmm7, %xmm2 - aesdeclast %xmm7, %xmm4 - aesdeclast %xmm7, %xmm5 - - pxor iv, %xmm1 // 1st obuf ^= iv; - movups (ibuf), iv // 1st memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm2 // 2nd obuf ^= iv; - movups 16(ibuf), iv // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm4 // 3rd obuf ^= iv; - movups 32(ibuf), iv // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE); - pxor iv, %xmm5 // 4th obuf ^= iv; - movups 48(ibuf), iv // 4th memcpy(iv, tmp, AES_BLOCK_SIZE); - movups %xmm1, (obuf) // write 1st obuf - movups %xmm2, 16(obuf) // write 2nd obuf - movups %xmm4, 32(obuf) // write 3rd obuf - movups %xmm5, 48(obuf) // write 4th obuf - - add $64, ibuf // ibuf += AES_BLOCK_SIZE * 4; - add $64, obuf // obuf += AES_BLOCK_SIZE * 4; - - sub $4, num_blk // num_blk -= 4 - jge 0b // if num_blk >= 0, repeat the loop - - -9: add $4, num_blk // post-increment num_blk by 4 - je L_HW_cbc_done // if num_blk == 0, no need for further processing - - movups 208(ctx), %xmm4 - movups 192(ctx), %xmm5 - movups 176(ctx), %xmm6 - movups 160(ctx), %xmm7 - -#endif - -0: - movups (ibuf), %xmm2 // tmp = ibuf - - // aes_decrypt - pxor %xmm3, %xmm2 - aesdec %xmm4, %xmm2 - aesdec %xmm5, %xmm2 - aesdec %xmm6, %xmm2 - aesdec %xmm7, %xmm2 -#if defined __x86_64__ - aesdec %xmm8, %xmm2 - aesdec %xmm9, %xmm2 - aesdec %xmm10, %xmm2 - aesdec %xmm11, %xmm2 - aesdec %xmm12, %xmm2 - aesdec %xmm13, %xmm2 - aesdec %xmm14, %xmm2 - aesdec %xmm15, %xmm2 -#else - movups 144(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 128(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 112(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 96(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 80(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 64(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 48(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups 32(ctx), %xmm1 - aesdec %xmm1, %xmm2 -#endif - movups 16(ctx), %xmm1 - aesdec %xmm1, %xmm2 - movups (ctx), %xmm1 - aesdeclast %xmm1, %xmm2 - - pxor iv, %xmm2 // obuf ^= iv; - movups (ibuf), iv // memcpy(iv, tmp, AES_BLOCK_SIZE); - - movups %xmm2, (obuf) // write obuf - - add $16, ibuf // ibuf += AES_BLOCK_SIZE; - add $16, obuf // obuf += AES_BLOCK_SIZE; - sub $1, num_blk // num_blk -- - jg 0b // if num_blk > 0, repeat the loop - - jmp L_HW_cbc_done - - // - // --------- END of aes_decrypt_cbc_hw ------------------- - // diff --git a/bsd/crypto/aes/i386/aesxts.c b/bsd/crypto/aes/i386/aesxts.c deleted file mode 100644 index c0eaaa609..000000000 --- a/bsd/crypto/aes/i386/aesxts.c +++ /dev/null @@ -1,392 +0,0 @@ -/* - * Copyright (c) 2010 Apple Inc. All Rights Reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License.
Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ - -#include "aesxts.h" -#include -#include -#include - -int -aes_encrypt_key(const uint8_t *key, int key_len, aesedp_encrypt_ctx cx[1]); - -int -aes_decrypt_key(const uint8_t *key, int key_len, aesedp_decrypt_ctx cx[1]); - -int -aes_encrypt(const uint8_t *Plaintext, uint8_t *Ciphertext, aesedp_encrypt_ctx *ctx); - -int -aes_decrypt(const uint8_t *Ciphertext, uint8_t *Plaintext, aesedp_decrypt_ctx *ctx); - - -/* error codes [will be expanded in future releases] */ -enum { - CRYPT_OK=0, /* Result OK */ - CRYPT_ERROR=1, /* Generic Error */ - CRYPT_INVALID_KEYSIZE=3, /* Invalid key size given */ - CRYPT_INVALID_ARG=16, /* Generic invalid argument */ -}; - -static int -aesedp_keysize(int *keysize) -{ - switch (*keysize) { - case 16: - case 24: - case 32: - return CRYPT_OK; - default: - return CRYPT_INVALID_KEYSIZE; - } -} - -static int -aesedp_setup(const uint8_t *key, int keylen, int num_rounds __unused, aesedp_ctx *skey) -{ - aesedp_ctx *ctx = (aesedp_ctx *) skey; - int retval; - - if((retval = aesedp_keysize(&keylen)) != CRYPT_OK) return retval; - if((retval = aes_encrypt_key(key, keylen, &ctx->encrypt)) != CRYPT_OK) return CRYPT_ERROR; - if((retval = aes_decrypt_key(key, keylen, &ctx->decrypt)) != CRYPT_OK) return CRYPT_ERROR; - return CRYPT_OK; -} - -#ifdef ZZZNEVER -static int -aesedp_ecb_encrypt(const uint8_t *pt, uint8_t *ct, aesedp_ctx *skey) -{ - aesedp_ctx *ctx = (aesedp_ctx *) skey; - return aes_encrypt(pt, ct, &ctx->encrypt); -} - - - -static int -aesedp_ecb_decrypt(const uint8_t *ct, uint8_t *pt, aesedp_ctx *skey) -{ - return aes_decrypt(ct, pt, &skey->decrypt); -} -#endif - - -static void -aesedp_done(aesedp_ctx *skey __unused) -{ -} - -/** Start XTS mode - @param cipher The index of the cipher to use - @param key1 The encrypt key - @param key2 The tweak encrypt key - @param keylen The length of the keys (each) in octets - @param num_rounds The number of rounds for the cipher (0 == default) - @param xts [out] XTS structure - Returns CRYPT_OK upon success. 
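 A minimal usage sketch (illustrative only; it assumes key1/key2 are 32-byte AES-256 keys and uses a zeroed tweak, none of which is part of this file): schedule both keys once, then process whole blocks:

     symmetric_xts xts;
     uint8_t in[512], out[512];
     uint8_t tweak[16] = { 0 };      /* e.g. the sector number, little-endian */

     if (xts_start(0, NULL, key1, 32, key2, 32, 0, 0, &xts) == CRYPT_OK) {
         xts_encrypt(in, sizeof(in), out, tweak, &xts);  /* ptlen must cover at least one full 16-byte block */
         xts_done(&xts);
     }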
-*/ - -uint32_t -xts_start(uint32_t cipher, // ignored - we're doing this for xts-aes only - const uint8_t *IV __unused, // ignored - const uint8_t *key1, int keylen, - const uint8_t *key2, int tweaklen __unused, // both keys are the same size for xts - uint32_t num_rounds, // ignored - uint32_t options __unused, // ignored - symmetric_xts *xts) -{ - uint32_t err; - - /* check inputs */ - if((key1 == NULL)|| (key2 == NULL) || (xts == NULL)) return CRYPT_INVALID_ARG; - - /* schedule the two ciphers */ - if ((err = aesedp_setup(key1, keylen, num_rounds, &xts->key1)) != 0) { - return err; - } - if ((err = aesedp_setup(key2, keylen, num_rounds, &xts->key2)) != 0) { - return err; - } - xts->cipher = cipher; - - return err; -} - - - - -/** multiply by x - @param I The value to multiply by x (LFSR shift) -*/ -#if defined __x86_64__ || defined __i386__ -extern void xts_mult_x(uint8_t *I); -#else -static void xts_mult_x(uint8_t *I) -{ - uint32_t x; - uint8_t t, tt; - - for (x = t = 0; x < 16; x++) { - tt = I[x] >> 7; - I[x] = ((I[x] << 1) | t) & 0xFF; - t = tt; - } - if (tt) { - I[0] ^= 0x87; - } -} -#endif - -#if defined __x86_64__ || defined __i386__ -extern int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx); -extern int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim); -#else -static int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx) -{ - uint32_t x; - uint32_t err; - - /* tweak encrypt block i */ - for (x = 0; x < 16; x += sizeof(uint64_t)) { - *((uint64_t*)&C[x]) = *((uint64_t*)&P[x]) ^ *((uint64_t*)&T[x]); - } - - if ((err = aes_encrypt(C, C, ctx)) != CRYPT_OK) { - return CRYPT_INVALID_KEYSIZE; - } - - for (x = 0; x < 16; x += sizeof(uint64_t)) { - *((uint64_t*)&C[x]) ^= *((uint64_t*)&T[x]); - } - - /* LFSR the tweak */ - xts_mult_x(T); - - return CRYPT_OK; -} -#endif - -/** XTS Encryption - @param pt [in] Plaintext - @param ptlen Length of plaintext (and ciphertext) - @param ct [out] Ciphertext - @param tweak [in] The 128--bit encryption tweak (e.g. 
sector number) - @param xts The XTS structure - Returns CRYPT_OK upon success -*/ -int xts_encrypt( - const uint8_t *pt, unsigned long ptlen, - uint8_t *ct, - const uint8_t *tweak, - symmetric_xts *xts) -{ - aesedp_encrypt_ctx *encrypt_ctx = &xts->key1.encrypt; - uint8_t PP[16], CC[16], T[16]; - uint32_t i, m, mo, lim; - uint32_t err; - - /* check inputs */ - if((pt == NULL) || (ct == NULL)|| (tweak == NULL) || (xts == NULL)) return 1; - - /* get number of blocks */ - m = ptlen >> 4; - mo = ptlen & 15; - - /* must have at least one full block */ - if (m == 0) { - return CRYPT_INVALID_ARG; - } - - /* encrypt the tweak */ - if ((err = aes_encrypt(tweak, T, &xts->key2.encrypt)) != 0) { - return CRYPT_INVALID_KEYSIZE; - } - - /* for i = 0 to m-2 do */ - if (mo == 0) { - lim = m; - } else { - lim = m - 1; - } - -#if defined __x86_64__ || defined __i386__ - if (lim>0) { - err = tweak_crypt_group(pt, ct, T, encrypt_ctx, lim); - ct += (lim<<4); - pt += (lim<<4); - } -#else - for (i = 0; i < lim; i++) { - err = tweak_crypt(pt, ct, T, encrypt_ctx); - ct += 16; - pt += 16; - } -#endif - - /* if ptlen not divide 16 then */ - if (mo > 0) { - /* CC = tweak encrypt block m-1 */ - if ((err = tweak_crypt(pt, CC, T, encrypt_ctx)) != 0) { - return err; - } - - /* Cm = first ptlen % 16 bytes of CC */ - for (i = 0; i < mo; i++) { - PP[i] = pt[16+i]; - ct[16+i] = CC[i]; - } - - for (; i < 16; i++) { - PP[i] = CC[i]; - } - - /* Cm-1 = Tweak encrypt PP */ - if ((err = tweak_crypt(PP, ct, T, encrypt_ctx)) != 0) { - return err; - } - } - - return err; -} - -#if defined __x86_64__ || defined __i386__ -extern int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx); -extern int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim); -#else -static int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx) -{ - uint32_t x; - uint32_t err; - - /* tweak encrypt block i */ - for (x = 0; x < 16; x += sizeof(uint64_t)) { - *((uint64_t*)&P[x]) = *((uint64_t*)&C[x]) ^ *((uint64_t*)&T[x]); - } - - err = aes_decrypt(P, P, ctx); - - for (x = 0; x < 16; x += sizeof(uint64_t)) { - *((uint64_t*)&P[x]) ^= *((uint64_t*)&T[x]); - } - - /* LFSR the tweak */ - xts_mult_x(T); - - return err; -} -#endif - -/** XTS Decryption - @param ct [in] Ciphertext - @param ptlen Length of plaintext (and ciphertext) - @param pt [out] Plaintext - @param tweak [in] The 128--bit encryption tweak (e.g. 
sector number) - @param xts The XTS structure - Returns CRYPT_OK upon success -*/ - -int xts_decrypt( - const uint8_t *ct, unsigned long ptlen, - uint8_t *pt, - const uint8_t *tweak, - symmetric_xts *xts) -{ - aesedp_decrypt_ctx *decrypt_ctx = &xts->key1.decrypt; - uint8_t PP[16], CC[16], T[16]; - uint32_t i, m, mo, lim; - uint32_t err; - - /* check inputs */ - if((pt == NULL) || (ct == NULL)|| (tweak == NULL) || (xts == NULL)) return 1; - - /* get number of blocks */ - m = ptlen >> 4; - mo = ptlen & 15; - - /* must have at least one full block */ - if (m == 0) { - return CRYPT_INVALID_ARG; - } - - /* encrypt the tweak , yes - encrypt */ - if ((err = aes_encrypt(tweak, T, &xts->key2.encrypt)) != 0) { - return CRYPT_INVALID_KEYSIZE; - } - - /* for i = 0 to m-2 do */ - if (mo == 0) { - lim = m; - } else { - lim = m - 1; - } - -#if defined __x86_64__ || defined __i386__ - if (lim>0) { - err = tweak_uncrypt_group(ct, pt, T, decrypt_ctx, lim); - ct += (lim<<4); - pt += (lim<<4); - } -#else - for (i = 0; i < lim; i++) { - err = tweak_uncrypt(ct, pt, T, decrypt_ctx); - ct += 16; - pt += 16; - } -#endif - - /* if ptlen not divide 16 then */ - if (mo > 0) { - memcpy(CC, T, 16); - xts_mult_x(CC); - - /* PP = tweak decrypt block m-1 */ - if ((err = tweak_uncrypt(ct, PP, CC, decrypt_ctx)) != CRYPT_OK) { - return err; - } - - /* Pm = first ptlen % 16 bytes of PP */ - for (i = 0; i < mo; i++) { - CC[i] = ct[16+i]; - pt[16+i] = PP[i]; - } - for (; i < 16; i++) { - CC[i] = PP[i]; - } - - /* Pm-1 = Tweak uncrypt CC */ - if ((err = tweak_uncrypt(CC, pt, T, decrypt_ctx)) != CRYPT_OK) { - return err; - } - } - - return CRYPT_OK; -} - - - -void xts_done(symmetric_xts *xts) -{ - if(xts == NULL) return; - aesedp_done(&xts->key1); - aesedp_done(&xts->key2); -} - diff --git a/bsd/crypto/aes/i386/aesxts.h b/bsd/crypto/aes/i386/aesxts.h deleted file mode 100644 index fe7618066..000000000 --- a/bsd/crypto/aes/i386/aesxts.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2010 Apple Inc. All Rights Reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_LICENSE_HEADER_END@ - */ - -/* - * aesxts.h - * - * - */ - -#include "stdint.h" - - -#ifndef _AESXTS_H -#define _AESXTS_H - -#if defined(__cplusplus) -extern "C" -{ -#endif - -/* - * The context for XTS-AES - */ - - -#define KS_LENGTH 60 - -typedef struct { - uint32_t ks[KS_LENGTH]; - uint32_t rn; -} aesedp_encrypt_ctx; - -typedef struct { - uint32_t ks[KS_LENGTH]; - uint32_t rn; -} aesedp_decrypt_ctx; - -typedef struct { - aesedp_decrypt_ctx decrypt; - aesedp_encrypt_ctx encrypt; -} aesedp_ctx; - -// xts mode context - -typedef struct { - aesedp_ctx key1, key2; - uint32_t cipher; // ignore - this is to fit with the library, but in this case we're only using aes -} symmetric_xts; - - -/* - * These are the interfaces required for XTS-AES support - */ - -uint32_t -xts_start(uint32_t cipher, // ignored - we're doing this for xts-aes only - const uint8_t *IV, // ignored - const uint8_t *key1, int keylen, - const uint8_t *key2, int tweaklen, // both keys are the same size for xts - uint32_t num_rounds, // ignored - uint32_t options, // ignored - symmetric_xts *xts); - -int xts_encrypt( - const uint8_t *pt, unsigned long ptlen, - uint8_t *ct, - const uint8_t *tweak, // this can be considered the sector IV for this use - symmetric_xts *xts); - -int xts_decrypt( - const uint8_t *ct, unsigned long ptlen, - uint8_t *pt, - const uint8_t *tweak, // this can be considered the sector IV for this use - symmetric_xts *xts); - - -void xts_done(symmetric_xts *xts); - -#if defined(__cplusplus) -} -#endif - -#endif /* _AESXTS_H */ \ No newline at end of file diff --git a/bsd/crypto/aes/i386/aesxts_asm.s b/bsd/crypto/aes/i386/aesxts_asm.s deleted file mode 100644 index ec6b924b7..000000000 --- a/bsd/crypto/aes/i386/aesxts_asm.s +++ /dev/null @@ -1,1305 +0,0 @@ -/* - This file "aesxts.s" provides x86_64 / i386 optimization of the following functions - - 0. xts_mult_x_on_xmm7 : a code macro that is used throughout all other functions - 1. void xts_mult_x(uint8_t *I); - 2. int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx); - 3. int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim); - 4. int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx); - 5. int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim); - - This file should be compiled together with xtsClearC.c - - functions 1,2,4 are supposed to replace the C functions in xtsClearC.c for x86_64/i386 architectures - functions 3,5 are only given here, no C code is available, they are called in xts_encrypt/xts_decrypt (xtsClearC.c) - - we can possibly add C code for functions 3 and 5 for future porting to other architectures - - cclee 4-29-10 - -*/ - -#ifdef KERNEL -#include -#else -#include -#endif -#define CRYPT_OK 0 // can not include "crypt.h" in which CRYPT_OK is from enum - -/* - The following macro is used throughout the functions in this file. - It is the core function within the function xts_mult_x defined in (xtsClearC.c) - - upon entry, %xmm7 = the input tweak (128-bit), - on return, %xmm7 = the updated tweak (128-bit) - the macro uses %xmm1/%xmm2/%ecx in the computation - the operation can be described as follows : - 0. let x = %xmm7; // 128-bit little-endian input - 1. x = rotate_left(x,1); // rotate left by 1 -bit - 2. if (x&1) x ^= 0x0000...0086; // if least significant bit = 1, least significant byte ^= 0x86; - 3. 
return x; - - It's a pity that SSE does not support shifting of the whole 128-bit xmm registers. - The workaround is - 1. using parallel dual quad (8-byte) shifting, 1 for the 2 bottom 63-bits, 1 for the 2 leading bits - 2. manipulating the shifted quad words to form the 128-bit shifted result. - - Input : %xmm7 - Output : %xmm7 - Used : %xmm1/%xmm2/%ecx - - The macro is good for both x86_64 and i386. - -*/ - - .macro xts_mult_x_on_xmm7 // input : x = %xmm7, MS = most significant, LS = least significant - movaps %xmm7, %xmm1 // %xmm1 = a copy of x - movaps %xmm7, %xmm2 // %xmm2 = a copy of x - psllq $$1, %xmm7 // 1-bit left shift of 2 quad words (x1<<1, x0<<1), zero-filled - psrlq $$63, %xmm1 // 2 leading bits, each in the least significant bit of a quad word - psrad $$31, %xmm2 // the MS 32-bit will be either 0 or -1, depending on the MS bit of x - pshufd $$0xc6, %xmm1, %xmm1 // switch the positions of the 2 leading bits - pshufd $$0x03, %xmm2, %xmm2 // the LS 32-bit will be either 0 or -1, depending on the MS bit of x - por %xmm1, %xmm7 // we finally have %xmm7 = rotate_left(x,1); - movl $$0x86, %ecx // a potential byte to xor the bottom byte - movd %ecx, %xmm1 // copy it to %xmm1, the other is 0 - pand %xmm2, %xmm1 // %xmm1 = 0 or 0x86, depending on the MS bit of x - pxor %xmm1, %xmm7 // rotate_left(x,1) ^= 0 or 0x86 depending on the MS bit of x - .endm - -/* - function : void xts_mult_x(uint8_t *I); - - 1. load (__m128*) (I) into xmm7 - 2. macro xts_mult_x_on_xmm7 (i/o @ xmm7, used xmm1/xmm2/ecx) - 3. save output (%xmm7) to memory pointed by I - - input : 16-byte memory pointed by I - output : same 16-byte memory pointed by I - - if kernel code, xmm1/xmm2/xmm7 saved and restored - other used registers : eax/ecx - - */ - .text - .align 4,0x90 - .globl _xts_mult_x -_xts_mult_x: - -#if defined __x86_64__ - #define I %rdi // 1st argument at %rdi for x86_64 - #define sp %rsp -#else - mov 4(%esp), %eax // 1st argument at stack, offset 4 for ret_addr for i386 - #define I %eax - #define sp %esp -#endif - - // if KERNEL code, allocate memory and save xmm1/xmm2/xmm7 -#ifdef KERNEL -#if defined __x86_64__ - sub $0x38, sp // 8-bytes alignment + 3 * 16 bytes -#else - sub $0x3c, sp // 12-bytes alignment + 3 * 16 bytes -#endif - movaps %xmm1, (sp) - movaps %xmm2, 16(sp) - movaps %xmm7, 32(sp) -#endif - - // load, compute, and save - movups (I), %xmm7 // load input tweak 128-bit into %xmm7 - xts_mult_x_on_xmm7 // the macro (also used elsewhere) will update %xmm7 as the output - movups %xmm7, (I) // save the xts_mult_x output - - // if KERNEL code, restore xmm1/xmm2/xmm7 and deallocate stack memory -#ifdef KERNEL - movaps (sp), %xmm1 - movaps 16(sp), %xmm2 - movaps 32(sp), %xmm7 -#if defined __x86_64__ - add $0x38, sp // 8-bytes alignment + 3 * 16 bytes -#else - add $0x3c, sp // 12-bytes alignment + 3 * 16 bytes -#endif -#endif - - ret // return - - #undef I - #undef sp - -/* - The following is x86_64/i386 assembly implementation of - - int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx); - - Its C code implementation is given in xtsClearC.c - - all pointers P/C/T point to a block of 16 bytes. In the following description, P/C/T represent 128-bit data. - - The operation of tweak_crypt - - 1. C = P ^ T - 2. err = aes_encrypt(C, C, ctx); if (err != CRYPT_OK) return err; - 3. C = C ^ T - 4. xts_mult_x(T) - 5. return CRYPT_OK; - - The following is the assembly implementation flow - - 1. save used xmm registers (xmm1/xmm7) if kernel code - 2. load xmm1 = P, xmm7 = T - 3.
xmm1 = C = P ^ T - 4. write xmm1 to C - 5. call aes_encrypt(C,C,ctx); note that it will use aesni if available, also xmm will return intact - 6. load xmm1 = C - 7. xmm1 = C = C^T = xmm1 ^ xmm7 - 8. write xmm1 to C - 9. update T (in xmm7) via xts_mult_x macro - a. restore xmm registers (xmm1/xmm7) if kernel code - b. return CRYPT_OK (in eax) - - Note: used xmm registers : xmm1/xmm2/xmm7, xmm2 in xts_mult_x macro - -*/ - - .text - .align 4,0x90 - .globl _tweak_crypt -_tweak_crypt: -#if defined __i386__ - - // push into stack for local use - push %ebp - mov %esp, %ebp - push %ebx - push %edi - push %esi - - // allocate stack memory for local use - sub $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments) - - // load with called arguments - mov 8(%ebp), %eax // P, we need this only briefly, so eax is fine - mov 12(%ebp), %edi // C - mov 16(%ebp), %ebx // T - mov 20(%ebp), %esi // ctx - - #define P %eax - #define C %edi - #define T %ebx - #define ctx %esi - #define sp %esp - -#else - // x86_64 calling argument order : rdi/rsi/rdx/rcx/r8 - - // push into stack for local use - push %rbp - mov %rsp, %rbp - push %r12 - push %r13 - push %r14 - push %r15 - - // allocate stack memory for local use, if kernel code, need to save/restore xmm registers -#ifdef KERNEL - sub $4*16, %rsp // only need 3*16, add 16 extra to make save/restore xmm common to i386 -#endif - - // load with called arguments, release rdi/rsi/rdx/rcx/r8, as need to call aes_encrypt - mov %rsi, %r13 - mov %rdx, %r14 - mov %rcx, %r15 - - #define P %rdi - #define C %r13 - #define T %r14 - #define ctx %r15 - #define sp %rsp - -#endif - - // if kernel, save used xmm registers -#ifdef KERNEL - movaps %xmm1, 16(sp) - movaps %xmm2, 32(sp) - movaps %xmm7, 48(sp) -#endif - - movups (P), %xmm1 // P - movups (T), %xmm7 // T - - // setup calling arguments for aes_encrypt -#if defined __i386__ - mov C, (%esp) // C - mov C, 4(%esp) // C - mov ctx, 8(%esp) // ctx -#else - mov C, %rdi // C - mov C, %rsi // C - mov ctx, %rdx // ctx -#endif - - pxor %xmm7, %xmm1 // C = P ^ T - movups %xmm1, (C) // save C into memory - - call _aes_encrypt // err = aes_encrypt(C,C,ctx); - - cmp $CRYPT_OK, %eax // check err == CRYPT_OK - jne 9f // if err != CRYPT_OK, exit - - movups (C), %xmm1 // load xmm1 = C - pxor %xmm7, %xmm1 // C ^= T - movups %xmm1, (C) // write C with xmm1, xmm1 is freed now, will be changed in the following macro - - xts_mult_x_on_xmm7 // update T (on xmm7) - - movups %xmm7, (T) // write xmm7 to T -9: - - // restore used xmm registers if this is for kernel -#ifdef KERNEL - movaps 16(sp), %xmm1 - movaps 32(sp), %xmm2 - movaps 48(sp), %xmm7 -#endif - - // free stack memory and restore callee registers -#if defined __i386__ - add $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments) - pop %esi - pop %edi - pop %ebx -#else -#ifdef KERNEL - add $4*16, %rsp // only need 3*16, add 16 extra to make save/restore xmm common to i386 -#endif - pop %r15 - pop %r14 - pop %r13 - pop %r12 -#endif - - // return, eax/rax already has the return val - leave - ret - - #undef P - #undef C - #undef T - #undef ctx - #undef sp - -/* - The following is x86_64/i386 assembly implementation of - - int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim); - - TODO : Its C code implementation is YET to be provided in xtsClearC.c (for the benefit of porting to other ISAs) - This function is a grouped version of the above function tweak_crypt(), so xmm
registers save/restore only need - to happen once for all grouped blocks. - - The implementation here probes __cpu_capabilities to detect whether aesni (or hw-aes instruction) is available. - If aesni is available, the code branch to optimized code that uses aesni. - - The optimized aesni code operates as follows: - - while (more than 4 consecutive blocks available) { - - do xts_mult_x macro 4 times and write the 4 tweaks on stack (16-byte aligned) - - perform 4 C = P ^ T; // T is on 16-byte aligned stack - - perform 4 aes_encrypt (all aes_encrypt instruction interleaved to achieve better throughtput) - - perform 4 C = C ^ T // T is on 16-byte aligned stack - - } - - The code then falls through to the scalar code, that sequentially performs what tweak_crypt does - - 1. C = P ^ T - 2. err = aes_encryp(C, C, ctx); if (err != CRYPT_OK) return err; - 3. C = C ^ T - 4. xts_mult_x(T) - - Note: used xmm registers : - xmm0-xmm5, xmm7 if aesni is available - xmm0-xmm4, xmm7 if aesni is not available. - -*/ - - .text - .align 4,0x90 - .globl _tweak_crypt_group -_tweak_crypt_group: - -#if defined __i386__ - - // push callee-saved registers for local use - push %ebp - mov %esp, %ebp - push %ebx - push %edi - push %esi - - // allocate stack memory for local use and/or xmm register save for kernel code - sub $(12+8*16+16*4), %esp // 12 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) aesni - // 12 (alignment) + 8*16 (xmm) + 4*16 (only 12 used for aes_encrypt) no aesni - // transfer calling arguments - mov 20(%ebp), %eax // ctx - mov 12(%ebp), %edi // C - mov 16(%ebp), %ebx // T - mov 8(%ebp), %esi // P - mov %eax, 8(%esp) // ctx as the 3rd parameter to aes_decrypt - - #define P %esi - #define C %edi - #define T %ebx - #define lim 24(%ebp) - #define sp %esp - -#else - - // push callee-saved registers for local use - push %rbp - mov %rsp, %rbp - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 - - // allocate stack memory for local use and/or xmm register save for kernel code - sub $(8+8*16+16*5), %rsp // 8 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) + 16 (common to i386) - - // rdi/rsi/rdx/rcx/r8 - // transfer calling arguments - mov %rdi, %r12 - mov %rsi, %r13 - mov %rdx, %r14 - mov %rcx, %r15 - mov %r8, %rbx - - #define P %r12 - #define C %r13 - #define T %r14 - #define ctx %r15 - #define lim %ebx - #define sp %rsp -#endif - -#ifdef KERNEL - movaps %xmm0, 0x50(sp) - movaps %xmm1, 0x60(sp) - movaps %xmm2, 0x70(sp) - movaps %xmm3, 0x80(sp) - movaps %xmm4, 0x90(sp) - movaps %xmm7, 0xa0(sp) -#endif - - // probe __cpu_capabilities to detect aesni -#if defined __x86_64__ - movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities - mov (%rax), %eax // %eax = __cpu_capabilities -#else // i386 -#if defined KERNEL - leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities - mov (%eax), %eax // %eax = __cpu_capabilities -#else - movl _COMM_PAGE_CPU_CAPABILITIES, %eax -#endif -#endif - test $(kHasAES), %eax - je L_crypt_group_sw // if aesni not available, jump to sw-based implementation - - // aesni-based implementation - - sub $4, lim // pre-decrement lim by 4 - jl 9f // if lim < 4, skip the following code - - movups (T), %xmm7 // xmm7 is the tweak before encrypting every 4 blocks -#ifdef KERNEL - movaps %xmm5, 0xb0(sp) // hw-aes-based uses extra xmm5 -#endif - -0: - // derive 4 tweaks using xts_mult_x macro, and save on aligned stack space - // xmm7 will be the tweak for next 4-blocks iteration - - #define tweak1 16(sp) - #define tweak2 32(sp) - #define tweak3 48(sp) 
- #define tweak4 64(sp) - - movaps %xmm7, tweak1 // save 1st tweak on stack - xts_mult_x_on_xmm7 // compute 2nd tweak - movaps %xmm7, tweak2 // save 2nd tweak on stack - xts_mult_x_on_xmm7 // compute 3rd tweak - movaps %xmm7, tweak3 // save 3rd tweak on stack - xts_mult_x_on_xmm7 // compute 4th tweak - movaps %xmm7, tweak4 // save 4th tweak on stack - xts_mult_x_on_xmm7 // compute 1st tweak for next iteration - - // read 4 Ps - movups (P), %xmm0 - movups 16(P), %xmm1 - movups 32(P), %xmm2 - movups 48(P), %xmm3 - - // 4 C = P ^ T - pxor tweak1, %xmm0 - pxor tweak2, %xmm1 - pxor tweak3, %xmm2 - pxor tweak4, %xmm3 - - // 4 interleaved aes_encrypt - -#if defined __i386__ - mov 8(sp), %ecx // ctx - #undef ctx - #define ctx %ecx -#endif - - mov 240(ctx), %eax // aes length - - cmp $160, %eax // AES-128 ? - je 160f - cmp $192, %eax // AES-192 ? - je 192f - cmp $224, %eax // AES-256 ? - je 224f - mov $-1, %eax // error : non-supported aes length -#ifdef KERNEL - movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5 -#endif - jmp L_error_crypt - - // definitions, macros, and constructs for 4 blocks hw-aes-encrypt - - // the following key definitions will also be used in tweak_uncrypt_group - #define key0 0(ctx) - #define key1 16(ctx) - #define key2 32(ctx) - #define key3 48(ctx) - #define key4 64(ctx) - #define key5 80(ctx) - #define key6 96(ctx) - #define key7 112(ctx) - #define key8 128(ctx) - #define key9 144(ctx) - #define keyA 160(ctx) - #define keyB 176(ctx) - #define keyC 192(ctx) - #define keyD 208(ctx) - #define keyE 224(ctx) - - #define aes aesenc - #define aeslast aesenclast - - // all aes encrypt operations start with the following sequence - .macro aes_common_part - movups key0, %xmm4 - movups key1, %xmm5 - pxor %xmm4, %xmm0 - pxor %xmm4, %xmm1 - pxor %xmm4, %xmm2 - pxor %xmm4, %xmm3 - movups key2, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - movups key3, %xmm5 - aes %xmm4, %xmm0 - aes %xmm4, %xmm1 - aes %xmm4, %xmm2 - aes %xmm4, %xmm3 - movups key4, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - movups key5, %xmm5 - aes %xmm4, %xmm0 - aes %xmm4, %xmm1 - aes %xmm4, %xmm2 - aes %xmm4, %xmm3 - movups key6, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - movups key7, %xmm5 - aes %xmm4, %xmm0 - aes %xmm4, %xmm1 - aes %xmm4, %xmm2 - aes %xmm4, %xmm3 - movups key8, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - movups key9, %xmm5 - aes %xmm4, %xmm0 - aes %xmm4, %xmm1 - aes %xmm4, %xmm2 - aes %xmm4, %xmm3 - movups keyA, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - .endm - - // all aes encypt operations end with the following 4 instructions - .macro aes_last - aeslast %xmm4, %xmm0 - aeslast %xmm4, %xmm1 - aeslast %xmm4, %xmm2 - aeslast %xmm4, %xmm3 - .endm - - .macro aes_128 - aes_common_part // encrypt common part - aes_last // encrypt ending part - .endm - - .macro aes_192 - aes_common_part // encrypt common part - - // 10 extra instructions in between common and ending - movups keyB, %xmm5 - aes %xmm4, %xmm0 - aes %xmm4, %xmm1 - aes %xmm4, %xmm2 - aes %xmm4, %xmm3 - movups keyC, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - - aes_last // encrypt ending part - .endm - - .macro aes_256 - aes_common_part // encrypt common part - - // 20 extra instructions in between common and ending - movups keyB, %xmm5 - aes %xmm4, %xmm0 - aes %xmm4, %xmm1 - aes %xmm4, %xmm2 - aes 
%xmm4, %xmm3 - movups keyC, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - movups keyD, %xmm5 - aes %xmm4, %xmm0 - aes %xmm4, %xmm1 - aes %xmm4, %xmm2 - aes %xmm4, %xmm3 - movups keyE, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - - aes_last // encrypt ending part - .endm - -160: // AES-128 encrypt - aes_128 - jmp 8f - -192: // AES-192 encrypt - aes_192 - jmp 8f - -224: // AES-256 encrypt - aes_256 - -8: - - // 4 C = C ^ T - pxor tweak1, %xmm0 - pxor tweak2, %xmm1 - pxor tweak3, %xmm2 - pxor tweak4, %xmm3 - - // write 4 Cs - movups %xmm0, (C) - movups %xmm1, 16(C) - movups %xmm2, 32(C) - movups %xmm3, 48(C) - - add $64, P - add $64, C - - sub $4, lim - jge 0b - -#ifdef KERNEL - movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5 -#endif - movups %xmm7, (T) - -9: - xor %eax, %eax // to return CRYPT_OK - add $4, lim // post-increment lim by 4 - je 9f // if lim==0, branch to prepare to return - -L_crypt_group_sw: - - movups (T), %xmm7 // T, xmm7 will be used as T (128-bit) throughout the loop - - sub $1, lim // pre-decrement lim by 1 - jl 1f // if lim < 1, branch to prepare to return -0: - movups (P), %xmm0 // P - - // prepare for calling aes_encrypt -#if defined __i386__ - mov C, (%esp) // C - mov C, 4(%esp) // C - // ctx was prepared previously in preamble -#else - mov C, %rdi // C - mov C, %rsi // C - mov ctx, %rdx // ctx -#endif - - pxor %xmm7, %xmm0 // C = P ^ T - movups %xmm0, (C) // save C into memory - - call _aes_encrypt_xmm_no_save // err = aes_encrypt(C,C,ctx); - - cmp $CRYPT_OK, %eax // err == CRYPT_OK ? - jne 9f // if err != CRYPT_OK, branch to exit with error - - movups (C), %xmm0 // load xmm0 with C - pxor %xmm7, %xmm0 // C ^= T - movups %xmm0, (C) // save output C - - xts_mult_x_on_xmm7 - - add $16, C // next C - add $16, P // next P - sub $1, lim // lim-- - jge 0b // if (lim>=0) repeat the scalar loop - -1: movups %xmm7, (T) // save final tweak -L_error_crypt: -9: - // if kernel, restore used xmm registers -#ifdef KERNEL - movaps 0x50(sp), %xmm0 - movaps 0x60(sp), %xmm1 - movaps 0x70(sp), %xmm2 - movaps 0x80(sp), %xmm3 - movaps 0x90(sp), %xmm4 - movaps 0xa0(sp), %xmm7 -#endif - -#if defined __i386__ - add $(12+16*8+16*4), %esp - pop %esi - pop %edi - pop %ebx -#else - add $(8+16*8+16*5), %rsp - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx -#endif - leave - ret - - #undef P - #undef C - #undef T - #undef ctx - #undef sp - -/* - The following is x86_64/i386 assembly implementation of - - int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx); - - Its C code implementation is given in xtsClearC.c - - all pointers C/P/T point to a block of 16 bytes. In the following description, C/P/T represent 128-bit data. - - The operation of tweak_uncrypt - - 1. P = C ^ T - 2. err = aes_decrypt(P, P, ctx); if (err != CRYPT_OK) return err; - 3. P = P ^ T - 4. xts_mult_x(T) - 5. return CRYPT_OK; - - The following is the assembly implementation flow - - 1. save used xmm registers (xmm1/xmm7) if kernel code - 2. load xmm1 = C, xmm7 = T - 3. xmm1 = P = C ^ T - 4. write xmm1 to P - 5. call aes_decrypt(P,P,ctx); note that it will use aesni if available, also xmm will return intact - 6. load xmm1 = P - 7. xmm1 = P = P^T = xmm1 ^ xmm7 - 8. write xmm1 to P - 9. update T (in xmm7) via xts_mult_x macro - a. restore xmm registers (xmm1/xmm7) if kernel code - b.
return CRYPT_OK (in eax) - - Note: used xmm registers : xmm1/xmm2/xmm7, xmm2 in xts_mult_x macro - -*/ - - .text - .align 4,0x90 - .globl _tweak_uncrypt -_tweak_uncrypt: -#if defined __i386__ - - // push into stack for local use - push %ebp - mov %esp, %ebp - push %ebx - push %edi - push %esi - - // allocate stack memory for local use - sub $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments) - - // load with called arguments - mov 8(%ebp), %eax // C, we need this only briefly, so eax is fine - mov 12(%ebp), %edi // P - mov 16(%ebp), %ebx // T - mov 20(%ebp), %esi // ctx - - #define C %eax - #define P %edi - #define T %ebx - #define ctx %esi - #define sp %esp - -#else - // x86_64 calling argument order : rdi/rsi/rdx/rcx/r8 - - // push into stack for local use - push %rbp - mov %rsp, %rbp - push %r12 - push %r13 - push %r14 - push %r15 - - // allocate stack memory for local use, if kernel code, need to save/restore xmm registers -#ifdef KERNEL - sub $4*16, %rsp // only need 3*16, add 16 extra to make save/restore xmm common to i386 -#endif - - // load with called arguments, release rdi/rsi/rdx/rcx/r8, as need to call aes_decrypt - mov %rsi, %r13 - mov %rdx, %r14 - mov %rcx, %r15 - - #define C %rdi - #define P %r13 - #define T %r14 - #define ctx %r15 - #define sp %rsp - -#endif - - // if kernel, save used xmm registers -#ifdef KERNEL - movaps %xmm1, 16(sp) - movaps %xmm2, 32(sp) - movaps %xmm7, 48(sp) -#endif - - movups (C), %xmm1 // C - movups (T), %xmm7 // T - - // setup calling arguments for aes_decrypt -#if defined __i386__ - mov P, (%esp) // P - mov P, 4(%esp) // P - mov ctx, 8(%esp) // ctx -#else - mov P, %rdi // P - mov P, %rsi // P - mov ctx, %rdx // ctx -#endif - - pxor %xmm7, %xmm1 // P = C ^ T - movups %xmm1, (P) // save P into memory - - call _aes_decrypt // err = aes_decrypt(P,P,ctx); - - cmp $CRYPT_OK, %eax // check err == CRYPT_OK - jne 9f // if err != CRYPT_OK, exit - - movups (P), %xmm1 // load xmm1 = P - pxor %xmm7, %xmm1 // P ^= T - movups %xmm1, (P) // write P with xmm1, xmm1 is freed now, will be changed in the following macro - - xts_mult_x_on_xmm7 // update T (on xmm7) - - movups %xmm7, (T) // write xmm7 to T -9: - - // restore used xmm registers if this is for kernel -#ifdef KERNEL - movaps 16(sp), %xmm1 - movaps 32(sp), %xmm2 - movaps 48(sp), %xmm7 -#endif - - // free stack memory and restore callee registers -#if defined __i386__ - add $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments) - pop %esi - pop %edi - pop %ebx -#else -#ifdef KERNEL - add $4*16, %rsp // only need 3*16, add 16 extra to make save/restore xmm common to i386 -#endif - pop %r15 - pop %r14 - pop %r13 - pop %r12 -#endif - - // return, eax/rax already has the return val - leave - ret - - #undef P - #undef C - #undef T - #undef ctx - #undef sp - -/* - The following is x86_64/i386 assembly implementation of - - int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim); - - TODO : Its C code implementation is YET to be provided in xtsClearC.c (for the benefit of porting to other ISAs) - This function is a grouped version of the above function tweak_uncrypt(), so xmm registers save/restore only need - to happen once for all grouped blocks. - - The implementation here probes __cpu_capabilities to detect whether aesni (or hw-aes instruction) is available. - If aesni is available, the code branches to optimized code that uses aesni.
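 For the TODO above, a portable C sketch of the grouped operation (illustrative only: a plain loop over tweak_uncrypt with none of the xmm save/restore batching or aesni dispatch done below; the encrypt-side tweak_crypt_group is analogous, looping over tweak_crypt):

     static int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T,
                                    aesedp_decrypt_ctx *ctx, uint32_t lim)
     {
         uint32_t err;
         while (lim--) {
             if ((err = tweak_uncrypt(C, P, T, ctx)) != CRYPT_OK)
                 return err;
             C += 16;    /* advance one 16-byte block */
             P += 16;    /* T is updated in place by tweak_uncrypt */
         }
         return CRYPT_OK;
     }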
- - The optimized aesni code operates as follows: - - while (more than 4 consecutive blocks available) { - - do xts_mult_x macro 4 times and write the 4 tweaks on stack (16-byte aligned) - - perform 4 P = C ^ T; // T is on 16-byte aligned stack - - perform 4 aes_decrypt (all aes_decrypt instruction interleaved to achieve better throughtput) - - perform 4 P = P ^ T // T is on 16-byte aligned stack - - } - - The code then falls through to the scalar code, that sequentially performs what tweak_crypt does - - 1. P = C ^ T - 2. err = aes_decryp(P, P, ctx); if (err != CRYPT_OK) return err; - 3. P = P ^ T - 4. xts_mult_x(T) - - Note: used xmm registers : - xmm0-xmm5, xmm7 if aesni is available - xmm0-xmm4, xmm7 if aesni is not available. - -*/ - - .text - .align 4,0x90 - .globl _tweak_uncrypt_group -_tweak_uncrypt_group: - -#if defined __i386__ - - // push callee-saved registers for local use - push %ebp - mov %esp, %ebp - push %ebx - push %edi - push %esi - - // allocate stack memory for local use and/or xmm register save for kernel code - sub $(12+8*16+16*4), %esp // 12 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) aesni - // 12 (alignment) + 8*16 (xmm) + 4*16 (only 12 used for aes_decrypt) no aesni - // transfer calling arguments - mov 20(%ebp), %eax // ctx - mov 12(%ebp), %edi // P - mov 16(%ebp), %ebx // T - mov 8(%ebp), %esi // C - mov %eax, 8(%esp) // ctx as the 3rd parameter to aes_decrypt - - #define C %esi - #define P %edi - #define T %ebx - #define lim 24(%ebp) - #define sp %esp - -#else - - // push callee-saved registers for local use - push %rbp - mov %rsp, %rbp - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 - - // allocate stack memory for local use and/or xmm register save for kernel code - sub $(8+8*16+16*5), %rsp // 8 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) + 16 (common to i386) - - // rdi/rsi/rdx/rcx/r8 - // transfer calling arguments - mov %rdi, %r12 - mov %rsi, %r13 - mov %rdx, %r14 - mov %rcx, %r15 - mov %r8, %rbx - - #define C %r12 - #define P %r13 - #define T %r14 - #define ctx %r15 - #define lim %ebx - #define sp %rsp -#endif - -#ifdef KERNEL - movaps %xmm0, 0x50(sp) - movaps %xmm1, 0x60(sp) - movaps %xmm2, 0x70(sp) - movaps %xmm3, 0x80(sp) - movaps %xmm4, 0x90(sp) - movaps %xmm7, 0xa0(sp) -#endif - - // probe __cpu_capabilities to detect aesni -#if defined __x86_64__ - movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities - mov (%rax), %eax // %eax = __cpu_capabilities -#else // i386 -#if defined KERNEL - leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities - mov (%eax), %eax // %eax = __cpu_capabilities -#else - movl _COMM_PAGE_CPU_CAPABILITIES, %eax -#endif -#endif - test $(kHasAES), %eax - je L_uncrypt_group_sw // if aesni not available, jump to sw-based implementation - - // aesni-based implementation - - sub $4, lim // pre-decrement lim by 4 - jl 9f // if lim < 4, skip the following code - - movups (T), %xmm7 // xmm7 is the tweak before decrypting every 4 blocks -#ifdef KERNEL - movaps %xmm5, 0xb0(sp) // hw-aes-based uses extra xmm5 -#endif - -0: - // derive 4 tweaks using xts_mult_x macro, and save on aligned stack space - // xmm7 will be the tweak for next 4-blocks iteration - - #define tweak1 16(sp) - #define tweak2 32(sp) - #define tweak3 48(sp) - #define tweak4 64(sp) - - movaps %xmm7, tweak1 // save 1st tweak on stack - xts_mult_x_on_xmm7 // compute 2nd tweak - movaps %xmm7, tweak2 // save 2nd tweak on stack - xts_mult_x_on_xmm7 // compute 3rd tweak - movaps %xmm7, tweak3 // save 3rd tweak on stack 
- xts_mult_x_on_xmm7 // compute 4th tweak - movaps %xmm7, tweak4 // save 4th tweak on stack - xts_mult_x_on_xmm7 // compute 1st tweak for next iteration - - // read 4 Cs - movups (C), %xmm0 - movups 16(C), %xmm1 - movups 32(C), %xmm2 - movups 48(C), %xmm3 - - // 4 P = C ^ T - pxor tweak1, %xmm0 - pxor tweak2, %xmm1 - pxor tweak3, %xmm2 - pxor tweak4, %xmm3 - - // 4 interleaved aes_decrypt - -#if defined __i386__ - mov 8(sp), %ecx // ctx - #undef ctx - #define ctx %ecx -#endif - - mov 240(ctx), %eax // aes length - - cmp $160, %eax // AES-128 ? - je 160f - cmp $192, %eax // AES-192 ? - je 192f - cmp $224, %eax // AES-256 ? - je 224f - mov $-1, %eax // error : non-supported aes length -#ifdef KERNEL - movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5 -#endif - jmp L_error_uncrypt - - // definitions, macros to construc hw-aes-decrypt - // will reuse previously defined key0 = (ctx), key1 = 16(ctx), .... - #undef aes - #undef aeslast - #define aes aesdec - #define aeslast aesdeclast - - .macro aes_decrypt_common - movups key8, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - movups key7, %xmm5 - aes %xmm4, %xmm0 - aes %xmm4, %xmm1 - aes %xmm4, %xmm2 - aes %xmm4, %xmm3 - movups key6, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - movups key5, %xmm5 - aes %xmm4, %xmm0 - aes %xmm4, %xmm1 - aes %xmm4, %xmm2 - aes %xmm4, %xmm3 - movups key4, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - movups key3, %xmm5 - aes %xmm4, %xmm0 - aes %xmm4, %xmm1 - aes %xmm4, %xmm2 - aes %xmm4, %xmm3 - movups key2, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - movups key1, %xmm5 - aes %xmm4, %xmm0 - aes %xmm4, %xmm1 - aes %xmm4, %xmm2 - aes %xmm4, %xmm3 - movups key0, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - aeslast %xmm4, %xmm0 - aeslast %xmm4, %xmm1 - aeslast %xmm4, %xmm2 - aeslast %xmm4, %xmm3 - .endm - - .macro aes_dec_128 - movups keyA, %xmm4 - movups key9, %xmm5 - pxor %xmm4, %xmm0 - pxor %xmm4, %xmm1 - pxor %xmm4, %xmm2 - pxor %xmm4, %xmm3 - aes_decrypt_common - .endm - - .macro aes_dec_192 - movups keyC, %xmm4 - movups keyB, %xmm5 - pxor %xmm4, %xmm0 - pxor %xmm4, %xmm1 - pxor %xmm4, %xmm2 - pxor %xmm4, %xmm3 - movups keyA, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - movups key9, %xmm5 - aes %xmm4, %xmm0 - aes %xmm4, %xmm1 - aes %xmm4, %xmm2 - aes %xmm4, %xmm3 - aes_decrypt_common - .endm - - .macro aes_dec_256 - movups keyE, %xmm4 - movups keyD, %xmm5 - pxor %xmm4, %xmm0 - pxor %xmm4, %xmm1 - pxor %xmm4, %xmm2 - pxor %xmm4, %xmm3 - movups keyC, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - movups keyB, %xmm5 - aes %xmm4, %xmm0 - aes %xmm4, %xmm1 - aes %xmm4, %xmm2 - aes %xmm4, %xmm3 - movups keyA, %xmm4 - aes %xmm5, %xmm0 - aes %xmm5, %xmm1 - aes %xmm5, %xmm2 - aes %xmm5, %xmm3 - movups key9, %xmm5 - aes %xmm4, %xmm0 - aes %xmm4, %xmm1 - aes %xmm4, %xmm2 - aes %xmm4, %xmm3 - aes_decrypt_common - .endm - -160: // AES-128 decrypt - aes_dec_128 - jmp 8f - -192: // AES-192 decrypt - aes_dec_192 - jmp 8f - -224: // AES-256 decrypt - aes_dec_256 - -8: - - // 4 P = P ^ T - pxor tweak1, %xmm0 - pxor tweak2, %xmm1 - pxor tweak3, %xmm2 - pxor tweak4, %xmm3 - - // write 4 Ps - movups %xmm0, (P) - movups %xmm1, 16(P) - movups %xmm2, 32(P) - movups %xmm3, 48(P) - - add $64, C - add $64, P - - sub $4, lim - jge 0b - -#ifdef KERNEL - movaps 0xb0(sp), %xmm5 
// hw-aes-based uses extra xmm5 -#endif - movups %xmm7, (T) - -9: - xor %eax, %eax // to return CRYPT_OK - add $4, lim // post-increment lim by 4 - je 9f // if lim==0, branch to prepare to return - -L_uncrypt_group_sw: - - movups (T), %xmm7 // T, xmm7 will be used as T (128-bit) throughout the loop - - sub $1, lim // pre-decrement lim by 1 - jl 1f // if lim < 1, branch to prepare to return -0: - movups (C), %xmm0 // C - - // prepare for calling aes_decrypt -#if defined __i386__ - mov P, (%esp) // P - mov P, 4(%esp) // P - // ctx was prepared previously in preamble -#else - mov P, %rdi // P - mov P, %rsi // P - mov ctx, %rdx // ctx -#endif - - pxor %xmm7, %xmm0 // P = C ^ T - movups %xmm0, (P) // save P into memory - - call _aes_decrypt_xmm_no_save // err = aes_decrypt(P,P,ctx); - - cmp $CRYPT_OK, %eax // err == CRYPT_OK ? - jne 9f // if err != CRYPT_OK, branch to exit with error - - movups (P), %xmm0 // load xmm0 with P - pxor %xmm7, %xmm0 // P ^= T - movups %xmm0, (P) // save output P - - xts_mult_x_on_xmm7 - - add $16, C // next C - add $16, P // next P - sub $1, lim // lim-- - jge 0b // if (lim>0) repeat the scalar loop - -1: movups %xmm7, (T) // save final tweak -L_error_uncrypt: -9: - // if kernel, restore used xmm registers -#ifdef KERNEL - movaps 0x50(sp), %xmm0 - movaps 0x60(sp), %xmm1 - movaps 0x70(sp), %xmm2 - movaps 0x80(sp), %xmm3 - movaps 0x90(sp), %xmm4 - movaps 0xa0(sp), %xmm7 -#endif - -#if defined __i386__ - add $(12+16*8+16*4), %esp - pop %esi - pop %edi - pop %ebx -#else - add $(8+16*8+16*5), %rsp - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx -#endif - leave - ret diff --git a/bsd/crypto/aesxts.h b/bsd/crypto/aesxts.h new file mode 100644 index 000000000..574ed199c --- /dev/null +++ b/bsd/crypto/aesxts.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2012 Apple Inc. All Rights Reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * This header file is kept for legacy reasons and may be removed in + * future; the interface resides in <corecrypto/ccmode.h>. + */ +#include <corecrypto/ccmode.h> diff --git a/bsd/crypto/des.h b/bsd/crypto/des.h new file mode 100644 index 000000000..5347f7b0e --- /dev/null +++ b/bsd/crypto/des.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2012 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License.
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * This header file is kept for legacy reasons and may be removed in + * future; the supported interface resides in <corecrypto/ccdes.h>. + */ +#include <corecrypto/ccdes.h> diff --git a/bsd/crypto/des/des.h b/bsd/crypto/des/des.h deleted file mode 100644 index 9f232b185..000000000 --- a/bsd/crypto/des/des.h +++ /dev/null @@ -1,117 +0,0 @@ -/* $FreeBSD: src/sys/crypto/des/des.h,v 1.1.2.3 2002/03/26 10:12:24 ume Exp $ */ -/* $KAME: des.h,v 1.8 2001/09/10 04:03:57 itojun Exp $ */ - -/* lib/des/des.h */ -/* Copyright (C) 1995-1996 Eric Young (eay@mincom.oz.au) - * All rights reserved. - * - * This file is part of an SSL implementation written - * by Eric Young (eay@mincom.oz.au). - * The implementation was written so as to conform with Netscapes SSL - * specification. This library and applications are - * FREE FOR COMMERCIAL AND NON-COMMERCIAL USE - * as long as the following conditions are aheared to. - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. If this code is used in a product, - * Eric Young should be given attribution as the author of the parts used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Eric Young (eay@mincom.oz.au) - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED.
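A note on the XTS-AES code removed above: aesxts_asm.s decrypts in the standard XTS pattern, P = AES-decrypt(C xor T) xor T, advancing the tweak T between blocks by a multiply-by-x in GF(2^128) (the xts_mult_x_on_xmm7 macro), and it processes four blocks at a time on the AESNI aesdec/aesdeclast path before falling back to a one-block scalar loop. A minimal C sketch of the per-block flow, assuming a hypothetical one-block aes_decrypt primitive (the name is a stand-in, not the kernel's exact interface):

    #include <stdint.h>

    /* Stand-in for a one-block AES decryption primitive (assumption). */
    extern void aes_decrypt(const uint8_t in[16], uint8_t out[16], const void *ctx);

    /* Multiply the tweak by x in GF(2^128): shift left one bit and
     * reduce with x^128 + x^7 + x^2 + x + 1 (the 0x87 constant). */
    static void xts_mult_x(uint8_t t[16])
    {
        unsigned carry = 0;
        for (int i = 0; i < 16; i++) {
            unsigned msb = t[i] >> 7;
            t[i] = (uint8_t)((t[i] << 1) | carry);
            carry = msb;
        }
        if (carry)
            t[0] ^= 0x87;
    }

    /* One XTS block: P = D_K(C ^ T) ^ T, then advance T for the next block. */
    static void xts_decrypt_block(const uint8_t c[16], uint8_t p[16],
                                  uint8_t tweak[16], const void *ctx)
    {
        uint8_t buf[16];
        for (int i = 0; i < 16; i++)
            buf[i] = c[i] ^ tweak[i];
        aes_decrypt(buf, buf, ctx);
        for (int i = 0; i < 16; i++)
            p[i] = buf[i] ^ tweak[i];
        xts_mult_x(tweak);
    }

The four-way interleaving in the assembly is the same computation unrolled so that four independent aesdec chains keep the AES unit busy.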
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -#ifndef HEADER_DES_H -#define HEADER_DES_H - -#ifdef __cplusplus -extern "C" { -#endif - -/* must be 32bit quantity */ -#define DES_LONG u_int32_t - -typedef unsigned char des_cblock[8]; -typedef struct des_ks_struct - { - union { - des_cblock cblock; - /* make sure things are correct size on machines with - * 8 byte longs */ - DES_LONG deslong[2]; - } ks; - int weak_key; -} des_key_schedule[16]; - -#define DES_KEY_SZ (sizeof(des_cblock)) -#define DES_SCHEDULE_SZ (sizeof(des_key_schedule)) - -#define DES_ENCRYPT 1 -#define DES_DECRYPT 0 - -#define DES_CBC_MODE 0 -#define DES_PCBC_MODE 1 - -extern int des_check_key; /* defaults to false */ - -char *des_options(void); -void des_ecb_encrypt(des_cblock *, des_cblock *, des_key_schedule, int); - -void des_encrypt1(DES_LONG *, des_key_schedule, int); -void des_encrypt2(DES_LONG *, des_key_schedule, int); -void des_encrypt3(DES_LONG *, des_key_schedule, des_key_schedule, - des_key_schedule); -void des_decrypt3(DES_LONG *, des_key_schedule, des_key_schedule, - des_key_schedule); - -void des_ecb3_encrypt(des_cblock *, des_cblock *, des_key_schedule, - des_key_schedule, des_key_schedule, int); - -void des_ncbc_encrypt(const unsigned char *, unsigned char *, long, - des_key_schedule, des_cblock *, int); - -void des_ede3_cbc_encrypt(const unsigned char *, unsigned char *, long, - des_key_schedule, des_key_schedule, - des_key_schedule, des_cblock *, int); - -void des_set_odd_parity(des_cblock *); -void des_fixup_key_parity(des_cblock *); -int des_is_weak_key(des_cblock *); -int des_set_key(des_cblock *, des_key_schedule); -int des_key_sched(des_cblock *, des_key_schedule); -int des_set_key_checked(des_cblock *, des_key_schedule); -void des_set_key_unchecked(des_cblock *, des_key_schedule); -int des_check_key_parity(des_cblock *); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/bsd/crypto/des/des_ecb.c b/bsd/crypto/des/des_ecb.c deleted file mode 100644 index 4a3ea9959..000000000 --- a/bsd/crypto/des/des_ecb.c +++ /dev/null @@ -1,137 +0,0 @@ -/* $FreeBSD: src/sys/crypto/des/des_ecb.c,v 1.1.2.3 2002/03/26 10:12:24 ume Exp $ */ -/* $KAME: des_ecb.c,v 1.6 2001/09/10 04:03:58 itojun Exp $ */ - -/* crypto/des/ecb_enc.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@mincom.oz.au) - * All rights reserved. - * - * This file is part of an SSL implementation written - * by Eric Young (eay@mincom.oz.au). - * The implementation was written so as to conform with Netscapes SSL - * specification. This library and applications are - * FREE FOR COMMERCIAL AND NON-COMMERCIAL USE - * as long as the following conditions are aheared to. - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. 
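For reference, the prototypes above are the whole of the legacy in-kernel DES interface this patch retires in favor of <corecrypto/ccdes.h>. A hypothetical caller of the old interface looked like the following sketch (key and data bytes are illustrative only):

    #include <crypto/des/des.h>    /* the legacy header deleted by this patch */

    static void des_roundtrip_example(void)
    {
        des_cblock key = {0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef};
        des_cblock in  = {'p','l','a','i','n','t','x','t'};
        des_cblock out;
        des_key_schedule ks;

        des_set_odd_parity(&key);       /* force odd parity on each key byte */
        des_set_key(&key, ks);          /* expand into 16 round keys */
        des_ecb_encrypt(&in, &out, ks, DES_ENCRYPT);
        des_ecb_encrypt(&out, &in, ks, DES_DECRYPT);  /* restores the plaintext */
    }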
If this code is used in a product, - * Eric Young should be given attribution as the author of the parts used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Eric Young (eay@mincom.oz.au) - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
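In des_ecb_encrypt below, each 8-byte block is packed into two 32-bit halves with the c2l macro and unpacked with l2c (both defined in des_locl.h, deleted later in this patch); the bytes go in least-significant first, independent of host endianness. In portable C the pair amounts to (function names are illustrative):

    #include <stdint.h>

    /* Equivalent of the c2l macro: consume 4 bytes, least significant first. */
    static uint32_t c2l_func(const unsigned char **c)
    {
        const unsigned char *p = *c;
        uint32_t l = (uint32_t)p[0] | ((uint32_t)p[1] << 8)
                   | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
        *c += 4;
        return l;
    }

    /* Equivalent of the l2c macro: emit 4 bytes, least significant first. */
    static void l2c_func(uint32_t l, unsigned char **c)
    {
        unsigned char *p = *c;
        p[0] = (unsigned char)(l & 0xff);
        p[1] = (unsigned char)((l >> 8) & 0xff);
        p[2] = (unsigned char)((l >> 16) & 0xff);
        p[3] = (unsigned char)((l >> 24) & 0xff);
        *c += 4;
    }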
- */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <crypto/des/des_locl.h> -#include <crypto/des/spr.h> - -/* char *libdes_version="libdes v 3.24 - 20-Apr-1996 - eay"; */ /* wrong */ -/* char *DES_version="DES part of SSLeay 0.6.4 30-Aug-1996"; */ - -char *des_options(void) - { - static int init=1; - static char buf[32]; - - if (init) - { - const char *ptr,*unroll,*risc,*size; - -#ifdef DES_PTR - ptr="ptr"; -#else - ptr="idx"; -#endif -#if defined(DES_RISC1) || defined(DES_RISC2) -#ifdef DES_RISC1 - risc="risc1"; -#endif -#ifdef DES_RISC2 - risc="risc2"; -#endif -#else - risc="cisc"; -#endif -#ifdef DES_UNROLL - unroll="16"; -#else - unroll="4"; -#endif - if (sizeof(DES_LONG) != sizeof(long)) - size="int"; - else - size="long"; - snprintf(buf, sizeof(buf), "des(%s,%s,%s,%s)", - ptr, risc, unroll, size); - init=0; - } - return(buf); -} -void des_ecb_encrypt(des_cblock *input, des_cblock *output, - des_key_schedule ks, int enc) -{ - register DES_LONG l; - DES_LONG ll[2]; - const unsigned char *in=&(*input)[0]; - unsigned char *out = &(*output)[0]; - - c2l(in,l); ll[0]=l; - c2l(in,l); ll[1]=l; - des_encrypt1(ll,ks,enc); - l=ll[0]; l2c(l,out); - l=ll[1]; l2c(l,out); - l=ll[0]=ll[1]=0; -} - -void des_ecb3_encrypt(des_cblock *input, des_cblock *output, - des_key_schedule ks1, des_key_schedule ks2, des_key_schedule ks3, - int enc) -{ - register DES_LONG l0,l1; - DES_LONG ll[2]; - const unsigned char *in = &(*input)[0]; - unsigned char *out = &(*output)[0]; - - c2l(in,l0); - c2l(in,l1); - ll[0]=l0; - ll[1]=l1; - - if (enc) - des_encrypt3(ll,ks1,ks2,ks3); - else - des_decrypt3(ll,ks1,ks2,ks3); - - l0=ll[0]; - l1=ll[1]; - l2c(l0,out); - l2c(l1,out); -} diff --git a/bsd/crypto/des/des_enc.c b/bsd/crypto/des/des_enc.c deleted file mode 100644 index f5e269eaf..000000000 --- a/bsd/crypto/des/des_enc.c +++ /dev/null @@ -1,294 +0,0 @@ -/* $KAME: kame/kame/sys/crypto/des/des_enc.c,v 1.1 2001/09/10 04:03:58 itojun Exp $ */ -/* $FreeBSD: src/sys/crypto/des/des_enc.c,v 1.1.2.1 2002/03/26 10:12:24 ume Exp $ */ - -/* crypto/des/des_enc.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2.
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -#include <sys/param.h> -#include <crypto/des/des_locl.h> - -extern const DES_LONG des_SPtrans[8][64]; - -void des_encrypt1(DES_LONG *data, des_key_schedule ks, int enc) -{ - register DES_LONG l,r,t,u; -#ifdef DES_PTR - register const unsigned char *des_SP=(const unsigned char *)des_SPtrans; -#endif -#ifndef DES_UNROLL - register int i; -#endif - register DES_LONG *s; - - r=data[0]; - l=data[1]; - - IP(r,l); - /* Things have been modified so that the initial rotate is - * done outside the loop. This required the - * des_SPtrans values in sp.h to be rotated 1 bit to the right. - * One perl script later and things have a 5% speed up on a sparc2. - * Thanks to Richard Outerbridge <71755.204@CompuServe.COM> - * for pointing this out.
*/ - /* clear the top bits on machines with 8byte longs */ - /* shift left by 2 */ - r=ROTATE(r,29)&0xffffffffL; - l=ROTATE(l,29)&0xffffffffL; - - s=ks->ks.deslong; - /* I don't know if it is worth the effort of loop unrolling the - * inner loop */ - if (enc) - { -#ifdef DES_UNROLL - D_ENCRYPT(l,r, 0); /* 1 */ - D_ENCRYPT(r,l, 2); /* 2 */ - D_ENCRYPT(l,r, 4); /* 3 */ - D_ENCRYPT(r,l, 6); /* 4 */ - D_ENCRYPT(l,r, 8); /* 5 */ - D_ENCRYPT(r,l,10); /* 6 */ - D_ENCRYPT(l,r,12); /* 7 */ - D_ENCRYPT(r,l,14); /* 8 */ - D_ENCRYPT(l,r,16); /* 9 */ - D_ENCRYPT(r,l,18); /* 10 */ - D_ENCRYPT(l,r,20); /* 11 */ - D_ENCRYPT(r,l,22); /* 12 */ - D_ENCRYPT(l,r,24); /* 13 */ - D_ENCRYPT(r,l,26); /* 14 */ - D_ENCRYPT(l,r,28); /* 15 */ - D_ENCRYPT(r,l,30); /* 16 */ -#else - for (i=0; i<32; i+=8) - { - D_ENCRYPT(l,r,i+0); /* 1 */ - D_ENCRYPT(r,l,i+2); /* 2 */ - D_ENCRYPT(l,r,i+4); /* 3 */ - D_ENCRYPT(r,l,i+6); /* 4 */ - } -#endif - } - else - { -#ifdef DES_UNROLL - D_ENCRYPT(l,r,30); /* 16 */ - D_ENCRYPT(r,l,28); /* 15 */ - D_ENCRYPT(l,r,26); /* 14 */ - D_ENCRYPT(r,l,24); /* 13 */ - D_ENCRYPT(l,r,22); /* 12 */ - D_ENCRYPT(r,l,20); /* 11 */ - D_ENCRYPT(l,r,18); /* 10 */ - D_ENCRYPT(r,l,16); /* 9 */ - D_ENCRYPT(l,r,14); /* 8 */ - D_ENCRYPT(r,l,12); /* 7 */ - D_ENCRYPT(l,r,10); /* 6 */ - D_ENCRYPT(r,l, 8); /* 5 */ - D_ENCRYPT(l,r, 6); /* 4 */ - D_ENCRYPT(r,l, 4); /* 3 */ - D_ENCRYPT(l,r, 2); /* 2 */ - D_ENCRYPT(r,l, 0); /* 1 */ -#else - for (i=30; i>0; i-=8) - { - D_ENCRYPT(l,r,i-0); /* 16 */ - D_ENCRYPT(r,l,i-2); /* 15 */ - D_ENCRYPT(l,r,i-4); /* 14 */ - D_ENCRYPT(r,l,i-6); /* 13 */ - } -#endif - } - - /* rotate and clear the top bits on machines with 8byte longs */ - l=ROTATE(l,3)&0xffffffffL; - r=ROTATE(r,3)&0xffffffffL; - - FP(r,l); - data[0]=l; - data[1]=r; - l=r=t=u=0; -} - -void des_encrypt2(DES_LONG *data, des_key_schedule ks, int enc) -{ - register DES_LONG l,r,t,u; -#ifdef DES_PTR - register const unsigned char *des_SP=(const unsigned char *)des_SPtrans; -#endif -#ifndef DES_UNROLL - register int i; -#endif - register DES_LONG *s; - - r=data[0]; - l=data[1]; - - /* Things have been modified so that the initial rotate is - * done outside the loop. This required the - * des_SPtrans values in sp.h to be rotated 1 bit to the right. - * One perl script later and things have a 5% speed up on a sparc2. - * Thanks to Richard Outerbridge <71755.204@CompuServe.COM> - * for pointing this out. 
*/ - /* clear the top bits on machines with 8byte longs */ - r=ROTATE(r,29)&0xffffffffL; - l=ROTATE(l,29)&0xffffffffL; - - s=ks->ks.deslong; - /* I don't know if it is worth the effort of loop unrolling the - * inner loop */ - if (enc) - { -#ifdef DES_UNROLL - D_ENCRYPT(l,r, 0); /* 1 */ - D_ENCRYPT(r,l, 2); /* 2 */ - D_ENCRYPT(l,r, 4); /* 3 */ - D_ENCRYPT(r,l, 6); /* 4 */ - D_ENCRYPT(l,r, 8); /* 5 */ - D_ENCRYPT(r,l,10); /* 6 */ - D_ENCRYPT(l,r,12); /* 7 */ - D_ENCRYPT(r,l,14); /* 8 */ - D_ENCRYPT(l,r,16); /* 9 */ - D_ENCRYPT(r,l,18); /* 10 */ - D_ENCRYPT(l,r,20); /* 11 */ - D_ENCRYPT(r,l,22); /* 12 */ - D_ENCRYPT(l,r,24); /* 13 */ - D_ENCRYPT(r,l,26); /* 14 */ - D_ENCRYPT(l,r,28); /* 15 */ - D_ENCRYPT(r,l,30); /* 16 */ -#else - for (i=0; i<32; i+=8) - { - D_ENCRYPT(l,r,i+0); /* 1 */ - D_ENCRYPT(r,l,i+2); /* 2 */ - D_ENCRYPT(l,r,i+4); /* 3 */ - D_ENCRYPT(r,l,i+6); /* 4 */ - } -#endif - } - else - { -#ifdef DES_UNROLL - D_ENCRYPT(l,r,30); /* 16 */ - D_ENCRYPT(r,l,28); /* 15 */ - D_ENCRYPT(l,r,26); /* 14 */ - D_ENCRYPT(r,l,24); /* 13 */ - D_ENCRYPT(l,r,22); /* 12 */ - D_ENCRYPT(r,l,20); /* 11 */ - D_ENCRYPT(l,r,18); /* 10 */ - D_ENCRYPT(r,l,16); /* 9 */ - D_ENCRYPT(l,r,14); /* 8 */ - D_ENCRYPT(r,l,12); /* 7 */ - D_ENCRYPT(l,r,10); /* 6 */ - D_ENCRYPT(r,l, 8); /* 5 */ - D_ENCRYPT(l,r, 6); /* 4 */ - D_ENCRYPT(r,l, 4); /* 3 */ - D_ENCRYPT(l,r, 2); /* 2 */ - D_ENCRYPT(r,l, 0); /* 1 */ -#else - for (i=30; i>0; i-=8) - { - D_ENCRYPT(l,r,i-0); /* 16 */ - D_ENCRYPT(r,l,i-2); /* 15 */ - D_ENCRYPT(l,r,i-4); /* 14 */ - D_ENCRYPT(r,l,i-6); /* 13 */ - } -#endif - } - /* rotate and clear the top bits on machines with 8byte longs */ - data[0]=ROTATE(l,3)&0xffffffffL; - data[1]=ROTATE(r,3)&0xffffffffL; - l=r=t=u=0; -} - -void des_encrypt3(DES_LONG *data, des_key_schedule ks1, des_key_schedule ks2, - des_key_schedule ks3) -{ - register DES_LONG l,r; - - l=data[0]; - r=data[1]; - IP(l,r); - data[0]=l; - data[1]=r; - des_encrypt2((DES_LONG *)data,ks1,DES_ENCRYPT); - des_encrypt2((DES_LONG *)data,ks2,DES_DECRYPT); - des_encrypt2((DES_LONG *)data,ks3,DES_ENCRYPT); - l=data[0]; - r=data[1]; - FP(r,l); - data[0]=l; - data[1]=r; -} - -void des_decrypt3(DES_LONG *data, des_key_schedule ks1, des_key_schedule ks2, - des_key_schedule ks3) -{ - register DES_LONG l,r; - - l=data[0]; - r=data[1]; - IP(l,r); - data[0]=l; - data[1]=r; - des_encrypt2((DES_LONG *)data,ks3,DES_DECRYPT); - des_encrypt2((DES_LONG *)data,ks2,DES_ENCRYPT); - des_encrypt2((DES_LONG *)data,ks1,DES_DECRYPT); - l=data[0]; - r=data[1]; - FP(r,l); - data[0]=l; - data[1]=r; -} diff --git a/bsd/crypto/des/des_locl.h b/bsd/crypto/des/des_locl.h deleted file mode 100644 index e894cb2f5..000000000 --- a/bsd/crypto/des/des_locl.h +++ /dev/null @@ -1,364 +0,0 @@ -/* $FreeBSD: src/sys/crypto/des/des_locl.h,v 1.2.2.3 2002/03/26 10:12:25 ume Exp $ */ -/* $KAME: des_locl.h,v 1.7 2001/09/10 04:03:58 itojun Exp $ */ - -/* crypto/des/des_locl.h */ -/* Copyright (C) 1995-1997 Eric Young (eay@mincom.oz.au) - * All rights reserved. - * - * This file is part of an SSL implementation written - * by Eric Young (eay@mincom.oz.au). - * The implementation was written so as to conform with Netscapes SSL - * specification. This library and applications are - * FREE FOR COMMERCIAL AND NON-COMMERCIAL USE - * as long as the following conditions are aheared to. - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. If this code is used in a product, - * Eric Young should be given attribution as the author of the parts used. 
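des_encrypt3/des_decrypt3 above run triple DES in EDE order with one structural trick: IP and FP are applied once around the whole operation, and the inner des_encrypt2 passes skip them (they also expect the pre-rotated data format noted in the comments). A consequence worth remembering is that with all three schedules equal, the middle decryption cancels the first encryption and EDE collapses to single DES; that is the classic 3DES compatibility property and makes a quick self-test. Sketch, assuming the legacy des.h interface deleted above:

    #include <string.h>

    static int ede3_collapses_to_des(des_cblock *in, des_key_schedule ks)
    {
        des_cblock one, three;

        des_ecb_encrypt(in, &one, ks, DES_ENCRYPT);            /* single DES */
        des_ecb3_encrypt(in, &three, ks, ks, ks, DES_ENCRYPT); /* EDE, k1=k2=k3 */
        return memcmp(one, three, sizeof(des_cblock)) == 0;    /* expect 1 */
    }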
- * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Eric Young (eay@mincom.oz.au) - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -#ifndef HEADER_DES_LOCL_H -#define HEADER_DES_LOCL_H - -#include <crypto/des/des.h> - -#undef DES_PTR - -#ifdef __STDC__ -#undef NOPROTO -#endif - -#define ITERATIONS 16 -#define HALF_ITERATIONS 8 - -/* used in des_read and des_write */ -#define MAXWRITE (1024*16) -#define BSIZE (MAXWRITE+4) - -#define c2l(c,l) (l =((DES_LONG)(*((c)++))) , \ - l|=((DES_LONG)(*((c)++)))<< 8L, \ - l|=((DES_LONG)(*((c)++)))<<16L, \ - l|=((DES_LONG)(*((c)++)))<<24L) - -/* NOTE - c is not incremented as per c2l */ -#define c2ln(c,l1,l2,n) { \ - c+=n; \ - l1=l2=0; \ - switch (n) { \ - case 8: l2 =((DES_LONG)(*(--(c))))<<24L; \ - case 7: l2|=((DES_LONG)(*(--(c))))<<16L; \ - case 6: l2|=((DES_LONG)(*(--(c))))<< 8L; \ - case 5: l2|=((DES_LONG)(*(--(c)))); \ - case 4: l1 =((DES_LONG)(*(--(c))))<<24L; \ - case 3: l1|=((DES_LONG)(*(--(c))))<<16L; \ - case 2: l1|=((DES_LONG)(*(--(c))))<< 8L; \ - case 1: l1|=((DES_LONG)(*(--(c)))); \ - } \ - } - -#define l2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ - *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \ - *((c)++)=(unsigned char)(((l)>>16L)&0xff), \ - *((c)++)=(unsigned char)(((l)>>24L)&0xff)) - -/* replacements for htonl and ntohl since I have no idea what to do - * when faced with machines with 8 byte longs.
*/ -#define HDRSIZE 4 - -#define n2l(c,l) (l =((DES_LONG)(*((c)++)))<<24L, \ - l|=((DES_LONG)(*((c)++)))<<16L, \ - l|=((DES_LONG)(*((c)++)))<< 8L, \ - l|=((DES_LONG)(*((c)++)))) - -#define l2n(l,c) (*((c)++)=(unsigned char)(((l)>>24L)&0xff), \ - *((c)++)=(unsigned char)(((l)>>16L)&0xff), \ - *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \ - *((c)++)=(unsigned char)(((l) )&0xff)) - -/* NOTE - c is not incremented as per l2c */ -#define l2cn(l1,l2,c,n) { \ - c+=n; \ - switch (n) { \ - case 8: *(--(c))=(unsigned char)(((l2)>>24L)&0xff); \ - case 7: *(--(c))=(unsigned char)(((l2)>>16L)&0xff); \ - case 6: *(--(c))=(unsigned char)(((l2)>> 8L)&0xff); \ - case 5: *(--(c))=(unsigned char)(((l2) )&0xff); \ - case 4: *(--(c))=(unsigned char)(((l1)>>24L)&0xff); \ - case 3: *(--(c))=(unsigned char)(((l1)>>16L)&0xff); \ - case 2: *(--(c))=(unsigned char)(((l1)>> 8L)&0xff); \ - case 1: *(--(c))=(unsigned char)(((l1) )&0xff); \ - } \ - } - -#define ROTATE(a,n) (((a)>>(n))+((a)<<(32-(n)))) - -#define LOAD_DATA_tmp(a,b,c,d,e,f) LOAD_DATA(a,b,c,d,e,f,g) -#define LOAD_DATA(R,S,u,t,E0,E1,tmp) \ - u=R^s[S ]; \ - t=R^s[S+1] - -/* The changes to this macro may help or hinder, depending on the - * compiler and the architecture. gcc2 always seems to do well :-). - * Inspired by Dana How <how@isl.stanford.edu> - * DO NOT use the alternative version on machines with 8 byte longs. - * It does not seem to work on the Alpha, even when DES_LONG is 4 - * bytes, probably an issue of accessing non-word aligned objects :-( */ -#ifdef DES_PTR - -/* It recently occurred to me that 0^0^0^0^0^0^0 == 0, so there - * is no reason to not xor all the sub items together. This potentially - * saves a register since things can be xored directly into L */ - -#if defined(DES_RISC1) || defined(DES_RISC2) -#ifdef DES_RISC1 -#define D_ENCRYPT(LL,R,S) { \ - unsigned int u1,u2,u3; \ - LOAD_DATA(R,S,u,t,E0,E1,u1); \ - u2=(int)u>>8L; \ - u1=(int)u&0xfc; \ - u2&=0xfc; \ - t=ROTATE(t,4); \ - u>>=16L; \ - LL^= *(const DES_LONG *)(des_SP +u1); \ - LL^= *(const DES_LONG *)(des_SP+0x200+u2); \ - u3=(int)(u>>8L); \ - u1=(int)u&0xfc; \ - u3&=0xfc; \ - LL^= *(const DES_LONG *)(des_SP+0x400+u1); \ - LL^= *(const DES_LONG *)(des_SP+0x600+u3); \ - u2=(int)t>>8L; \ - u1=(int)t&0xfc; \ - u2&=0xfc; \ - t>>=16L; \ - LL^= *(const DES_LONG *)(des_SP+0x100+u1); \ - LL^= *(const DES_LONG *)(des_SP+0x300+u2); \ - u3=(int)t>>8L; \ - u1=(int)t&0xfc; \ - u3&=0xfc; \ - LL^= *(const DES_LONG *)(des_SP+0x500+u1); \ - LL^= *(const DES_LONG *)(des_SP+0x700+u3); } -#endif /* DES_RISC1 */ -#ifdef DES_RISC2 -#define D_ENCRYPT(LL,R,S) { \ - unsigned int u1,u2,s1,s2; \ - LOAD_DATA(R,S,u,t,E0,E1,u1); \ - u2=(int)u>>8L; \ - u1=(int)u&0xfc; \ - u2&=0xfc; \ - t=ROTATE(t,4); \ - LL^= *(const DES_LONG *)(des_SP +u1); \ - LL^= *(const DES_LONG *)(des_SP+0x200+u2); \ - s1=(int)(u>>16L); \ - s2=(int)(u>>24L); \ - s1&=0xfc; \ - s2&=0xfc; \ - LL^= *(const DES_LONG *)(des_SP+0x400+s1); \ - LL^= *(const DES_LONG *)(des_SP+0x600+s2); \ - u2=(int)t>>8L; \ - u1=(int)t&0xfc; \ - u2&=0xfc; \ - LL^= *(const DES_LONG *)(des_SP+0x100+u1); \ - LL^= *(const DES_LONG *)(des_SP+0x300+u2); \ - s1=(int)(t>>16L); \ - s2=(int)(t>>24L); \ - s1&=0xfc; \ - s2&=0xfc; \ - LL^= *(const DES_LONG *)(des_SP+0x400+s1); \ - LL^= *(const DES_LONG *)(des_SP+0x600+s2); \ - u2=(int)t>>8L; \ - u1=(int)t&0xfc; \ - u2&=0xfc; \ - LL^= *(const DES_LONG *)(des_SP+0x100+u1); \ - LL^= *(const DES_LONG *)(des_SP+0x300+u2); \ - s1=(int)(t>>16L); \ - s2=(int)(t>>24L); \ - s1&=0xfc; \ - s2&=0xfc; \ - LL^= *(const DES_LONG *)(des_SP+0x500+s1); \ - LL^= *(const
DES_LONG *)(des_SP+0x700+s2); } -#endif /* DES_RISC2 */ -#else /* DES_RISC1 || DES_RISC2 */ -#define D_ENCRYPT(LL,R,S) { \ - LOAD_DATA_tmp(R,S,u,t,E0,E1); \ - t=ROTATE(t,4); \ - LL^= \ - *(const DES_LONG *)(des_SP +((u )&0xfc))^ \ - *(const DES_LONG *)(des_SP+0x200+((u>> 8L)&0xfc))^ \ - *(const DES_LONG *)(des_SP+0x400+((u>>16L)&0xfc))^ \ - *(const DES_LONG *)(des_SP+0x600+((u>>24L)&0xfc))^ \ - *(const DES_LONG *)(des_SP+0x100+((t )&0xfc))^ \ - *(const DES_LONG *)(des_SP+0x300+((t>> 8L)&0xfc))^ \ - *(const DES_LONG *)(des_SP+0x500+((t>>16L)&0xfc))^ \ - *(const DES_LONG *)(des_SP+0x700+((t>>24L)&0xfc)); } -#endif /* DES_RISC1 || DES_RISC2 */ -#else /* original version */ - -#if defined(DES_RISC1) || defined(DES_RISC2) -#ifdef DES_RISC1 -#define D_ENCRYPT(LL,R,S) {\ - unsigned int u1,u2,u3; \ - LOAD_DATA(R,S,u,t,E0,E1,u1); \ - u>>=2L; \ - t=ROTATE(t,6); \ - u2=(int)u>>8L; \ - u1=(int)u&0x3f; \ - u2&=0x3f; \ - u>>=16L; \ - LL^=des_SPtrans[0][u1]; \ - LL^=des_SPtrans[2][u2]; \ - u3=(int)u>>8L; \ - u1=(int)u&0x3f; \ - u3&=0x3f; \ - LL^=des_SPtrans[4][u1]; \ - LL^=des_SPtrans[6][u3]; \ - u2=(int)t>>8L; \ - u1=(int)t&0x3f; \ - u2&=0x3f; \ - t>>=16L; \ - LL^=des_SPtrans[1][u1]; \ - LL^=des_SPtrans[3][u2]; \ - u3=(int)t>>8L; \ - u1=(int)t&0x3f; \ - u3&=0x3f; \ - LL^=des_SPtrans[5][u1]; \ - LL^=des_SPtrans[7][u3]; } -#endif /* DES_RISC1 */ -#ifdef DES_RISC2 -#define D_ENCRYPT(LL,R,S) {\ - unsigned int u1,u2,s1,s2; \ - LOAD_DATA(R,S,u,t,E0,E1,u1); \ - u>>=2L; \ - t=ROTATE(t,6); \ - u2=(int)u>>8L; \ - u1=(int)u&0x3f; \ - u2&=0x3f; \ - LL^=des_SPtrans[0][u1]; \ - LL^=des_SPtrans[2][u2]; \ - s1=(int)u>>16L; \ - s2=(int)u>>24L; \ - s1&=0x3f; \ - s2&=0x3f; \ - LL^=des_SPtrans[4][s1]; \ - LL^=des_SPtrans[6][s2]; \ - u2=(int)t>>8L; \ - u1=(int)t&0x3f; \ - u2&=0x3f; \ - LL^=des_SPtrans[1][u1]; \ - LL^=des_SPtrans[3][u2]; \ - s1=(int)t>>16; \ - s2=(int)t>>24L; \ - s1&=0x3f; \ - s2&=0x3f; \ - LL^=des_SPtrans[5][s1]; \ - LL^=des_SPtrans[7][s2]; } -#endif /* DES_RISC2 */ - -#else /* DES_RISC1 || DES_RISC2 */ - -#define D_ENCRYPT(LL,R,S) {\ - LOAD_DATA_tmp(R,S,u,t,E0,E1); \ - t=ROTATE(t,4); \ - LL^=\ - des_SPtrans[0][(u>> 2L)&0x3f]^ \ - des_SPtrans[2][(u>>10L)&0x3f]^ \ - des_SPtrans[4][(u>>18L)&0x3f]^ \ - des_SPtrans[6][(u>>26L)&0x3f]^ \ - des_SPtrans[1][(t>> 2L)&0x3f]^ \ - des_SPtrans[3][(t>>10L)&0x3f]^ \ - des_SPtrans[5][(t>>18L)&0x3f]^ \ - des_SPtrans[7][(t>>26L)&0x3f]; } -#endif /* DES_RISC1 || DES_RISC2 */ -#endif /* DES_PTR */ - - /* IP and FP - * The problem is more of a geometric problem than random bit fiddling. - 0 1 2 3 4 5 6 7 62 54 46 38 30 22 14 6 - 8 9 10 11 12 13 14 15 60 52 44 36 28 20 12 4 - 16 17 18 19 20 21 22 23 58 50 42 34 26 18 10 2 - 24 25 26 27 28 29 30 31 to 56 48 40 32 24 16 8 0 - - 32 33 34 35 36 37 38 39 63 55 47 39 31 23 15 7 - 40 41 42 43 44 45 46 47 61 53 45 37 29 21 13 5 - 48 49 50 51 52 53 54 55 59 51 43 35 27 19 11 3 - 56 57 58 59 60 61 62 63 57 49 41 33 25 17 9 1 - - The output has been subject to swaps of the form - 0 1 -> 3 1 but the odd and even bits have been put into - 2 3 2 0 - different words. The main trick is to remember that
The main trick is to remember that - t=((l>>size)^r)&(mask); - r^=t; - l^=(t<>(n))^(b))&(m)),\ - (b)^=(t),\ - (a)^=((t)<<(n))) - -#define IP(l,r) \ - { \ - register DES_LONG tt; \ - PERM_OP(r,l,tt, 4,0x0f0f0f0fL); \ - PERM_OP(l,r,tt,16,0x0000ffffL); \ - PERM_OP(r,l,tt, 2,0x33333333L); \ - PERM_OP(l,r,tt, 8,0x00ff00ffL); \ - PERM_OP(r,l,tt, 1,0x55555555L); \ - } - -#define FP(l,r) \ - { \ - register DES_LONG tt; \ - PERM_OP(l,r,tt, 1,0x55555555L); \ - PERM_OP(r,l,tt, 8,0x00ff00ffL); \ - PERM_OP(l,r,tt, 2,0x33333333L); \ - PERM_OP(r,l,tt,16,0x0000ffffL); \ - PERM_OP(l,r,tt, 4,0x0f0f0f0fL); \ - } -#endif diff --git a/bsd/crypto/des/des_setkey.c b/bsd/crypto/des/des_setkey.c deleted file mode 100644 index 5b7f5dec2..000000000 --- a/bsd/crypto/des/des_setkey.c +++ /dev/null @@ -1,232 +0,0 @@ -/* $FreeBSD: src/sys/crypto/des/des_setkey.c,v 1.1.2.4 2002/03/26 10:12:25 ume Exp $ */ -/* $KAME: des_setkey.c,v 1.7 2001/09/10 04:03:58 itojun Exp $ */ - -/* crypto/des/set_key.c */ -/* Copyright (C) 1995-1996 Eric Young (eay@mincom.oz.au) - * All rights reserved. - * - * This file is part of an SSL implementation written - * by Eric Young (eay@mincom.oz.au). - * The implementation was written so as to conform with Netscapes SSL - * specification. This library and applications are - * FREE FOR COMMERCIAL AND NON-COMMERCIAL USE - * as long as the following conditions are aheared to. - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. If this code is used in a product, - * Eric Young should be given attribution as the author of the parts used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Eric Young (eay@mincom.oz.au) - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ - -/* set_key.c v 1.4 eay 24/9/91 - * 1.4 Speed up by 400% :-) - * 1.3 added register declarations. - * 1.2 unrolled make_key_sched a bit more - * 1.1 added norm_expand_bits - * 1.0 First working version - */ -#include <sys/param.h> -#include <sys/systm.h> -#include <crypto/des/des_locl.h> -#include <crypto/des/podd.h> -#include <crypto/des/sk.h> - -int des_check_key=0; - -void des_set_odd_parity(des_cblock *key) -{ - int i; - - for (i=0; i<DES_KEY_SZ; i++) - (*key)[i]=odd_parity[(*key)[i]]; -} - -int des_check_key_parity(des_cblock *key) -{ - int i; - - for (i=0; i<DES_KEY_SZ; i++) - { - if ((*key)[i] != odd_parity[(*key)[i]]) - return(0); - } - return(1); -} - -#define NUM_WEAK_KEY 16 -static des_cblock weak_keys[NUM_WEAK_KEY]={ - /* weak keys */ - {0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01}, - {0xFE,0xFE,0xFE,0xFE,0xFE,0xFE,0xFE,0xFE}, - {0x1F,0x1F,0x1F,0x1F,0x0E,0x0E,0x0E,0x0E}, - {0xE0,0xE0,0xE0,0xE0,0xF1,0xF1,0xF1,0xF1}, - /* semi-weak keys */ - {0x01,0xFE,0x01,0xFE,0x01,0xFE,0x01,0xFE}, - {0xFE,0x01,0xFE,0x01,0xFE,0x01,0xFE,0x01}, - {0x1F,0xE0,0x1F,0xE0,0x0E,0xF1,0x0E,0xF1}, - {0xE0,0x1F,0xE0,0x1F,0xF1,0x0E,0xF1,0x0E}, - {0x01,0xE0,0x01,0xE0,0x01,0xF1,0x01,0xF1}, - {0xE0,0x01,0xE0,0x01,0xF1,0x01,0xF1,0x01}, - {0x1F,0xFE,0x1F,0xFE,0x0E,0xFE,0x0E,0xFE}, - {0xFE,0x1F,0xFE,0x1F,0xFE,0x0E,0xFE,0x0E}, - {0x01,0x1F,0x01,0x1F,0x01,0x0E,0x01,0x0E}, - {0x1F,0x01,0x1F,0x01,0x0E,0x01,0x0E,0x01}, - {0xE0,0xFE,0xE0,0xFE,0xF1,0xFE,0xF1,0xFE}, - {0xFE,0xE0,0xFE,0xE0,0xFE,0xF1,0xFE,0xF1}}; - -int des_is_weak_key(des_cblock *key) -{ - int i; - - for (i=0; i<NUM_WEAK_KEY; i++) - if (memcmp(weak_keys[i],key,sizeof(des_cblock)) == 0) return(1); - return(0); -} - -/* - * #define PERM_OP(a,b,t,n,m) ((t)=((((a)>>(n))^(b))&(m)),\ - * (b)^=(t),\ - * (a)=((a)^((t)<<(n)))) - */ - -#define HPERM_OP(a,t,n,m) ((t)=((((a)<<(16-(n)))^(a))&(m)),\ - (a)=(a)^(t)^(t>>(16-(n)))) - -int des_set_key(des_cblock *key, des_key_schedule schedule) -{ - if (des_check_key) - { - return des_set_key_checked(key, schedule); - } - else - { - des_set_key_unchecked(key, schedule); - return 0; - } -} - -/* return 0 if key parity is odd (correct), - * return -1 if key parity error, - * return -2 if illegal weak key. - */ -int des_set_key_checked(des_cblock *key, des_key_schedule schedule) -{ - if (!des_check_key_parity(key)) - return(-1); - if (des_is_weak_key(key)) - return(-2); - des_set_key_unchecked(key, schedule); - return 0; -} - -void des_set_key_unchecked(des_cblock *key, des_key_schedule schedule) -{ - static int shifts2[16]={0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0}; - register DES_LONG c,d,t,s,t2; - register const unsigned char *in; - register DES_LONG *k; - register int i; - - k = &schedule->ks.deslong[0]; - in = &(*key)[0]; - - c2l(in,c); - c2l(in,d); - - /* do PC1 in 47 simple operations :-) - * Thanks to John Fletcher (john_fletcher@lccmail.ocf.llnl.gov) - * for the inspiration. :-) */ - PERM_OP (d,c,t,4,0x0f0f0f0fL); - HPERM_OP(c,t,-2,0xcccc0000L); - HPERM_OP(d,t,-2,0xcccc0000L); - PERM_OP (d,c,t,1,0x55555555L); - PERM_OP (c,d,t,8,0x00ff00ffL); - PERM_OP (d,c,t,1,0x55555555L); - d= (((d&0x000000ffL)<<16L)| (d&0x0000ff00L) | - ((d&0x00ff0000L)>>16L)|((c&0xf0000000L)>>4L)); - c&=0x0fffffffL; - - for (i=0; i<ITERATIONS; i++) - { - if (shifts2[i]) - { c=((c>>2L)|(c<<26L)); d=((d>>2L)|(d<<26L)); } - else - { c=((c>>1L)|(c<<27L)); d=((d>>1L)|(d<<27L)); } - c&=0x0fffffffL; - d&=0x0fffffffL; - /* could be a few less shifts but I am too lazy at this - * point in time to investigate */ - s= des_skb[0][ (c )&0x3f ]| - des_skb[1][((c>> 6L)&0x03)|((c>> 7L)&0x3c)]| - des_skb[2][((c>>13L)&0x0f)|((c>>14L)&0x30)]| - des_skb[3][((c>>20L)&0x01)|((c>>21L)&0x06) | - ((c>>22L)&0x38)]; - t= des_skb[4][ (d )&0x3f ]| - des_skb[5][((d>> 7L)&0x03)|((d>> 8L)&0x3c)]| - des_skb[6][ (d>>15L)&0x3f ]| - des_skb[7][((d>>21L)&0x0f)|((d>>22L)&0x30)]; - - /* table contained 0213 4657 */ - t2=((t<<16L)|(s&0x0000ffffL))&0xffffffffL; - *(k++)=ROTATE(t2,30)&0xffffffffL; - - t2=((s>>16L)|(t&0xffff0000L)); - *(k++)=ROTATE(t2,26)&0xffffffffL; - } -} - -int des_key_sched(des_cblock *key, des_key_schedule schedule) -{ - return(des_set_key(key,schedule)); -} - -void des_fixup_key_parity(des_cblock *key) -{ - des_set_odd_parity(key); -} diff --git a/bsd/crypto/des/podd.h b/bsd/crypto/des/podd.h deleted file mode 100644 index 61646cc4b..000000000 --- a/bsd/crypto/des/podd.h +++ /dev/null @@ -1,67 +0,0 @@ -/* $FreeBSD: src/sys/crypto/des/podd.h,v 1.1.2.1 2000/07/15 07:14:21 kris Exp $ */ -/* $KAME: podd.h,v 1.3 2000/03/27 04:36:34 sumikawa Exp $ */ - -/* crypto/des/podd.h */ -/* Copyright (C) 1995-1996 Eric Young (eay@mincom.oz.au) - * All rights reserved. - * - * This file is part of an SSL implementation written - * by Eric Young (eay@mincom.oz.au). - * The implementation was written so as to conform with Netscapes SSL - * specification.
This library and applications are - * FREE FOR COMMERCIAL AND NON-COMMERCIAL USE - * as long as the following conditions are aheared to. - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. If this code is used in a product, - * Eric Young should be given attribution as the author of the parts used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Eric Young (eay@mincom.oz.au) - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
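The odd_parity table in podd.h, just below, maps every byte to the value with the same 7 high bits and the low bit forced so that the total number of set bits is odd, which is how DES key bytes carry a parity check. The table can be regenerated, and so sanity-checked, with a few lines of C (__builtin_popcount is a GCC/clang builtin; any bit-count works):

    #include <stdio.h>

    int main(void)
    {
        for (int i = 0; i < 256; i++) {
            int v = i & 0xfe;                   /* keep the 7 key bits */
            if (__builtin_popcount(v) % 2 == 0)
                v |= 1;                         /* set parity bit for odd total */
            printf("%3d,%c", v, (i % 16 == 15) ? '\n' : ' ');
        }
        return 0;
    }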
- */ - -static const unsigned char odd_parity[256]={ - 1, 1, 2, 2, 4, 4, 7, 7, 8, 8, 11, 11, 13, 13, 14, 14, - 16, 16, 19, 19, 21, 21, 22, 22, 25, 25, 26, 26, 28, 28, 31, 31, - 32, 32, 35, 35, 37, 37, 38, 38, 41, 41, 42, 42, 44, 44, 47, 47, - 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 59, 59, 61, 61, 62, 62, - 64, 64, 67, 67, 69, 69, 70, 70, 73, 73, 74, 74, 76, 76, 79, 79, - 81, 81, 82, 82, 84, 84, 87, 87, 88, 88, 91, 91, 93, 93, 94, 94, - 97, 97, 98, 98,100,100,103,103,104,104,107,107,109,109,110,110, -112,112,115,115,117,117,118,118,121,121,122,122,124,124,127,127, -128,128,131,131,133,133,134,134,137,137,138,138,140,140,143,143, -145,145,146,146,148,148,151,151,152,152,155,155,157,157,158,158, -161,161,162,162,164,164,167,167,168,168,171,171,173,173,174,174, -176,176,179,179,181,181,182,182,185,185,186,186,188,188,191,191, -193,193,194,194,196,196,199,199,200,200,203,203,205,205,206,206, -208,208,211,211,213,213,214,214,217,217,218,218,220,220,223,223, -224,224,227,227,229,229,230,230,233,233,234,234,236,236,239,239, -241,241,242,242,244,244,247,247,248,248,251,251,253,253,254,254}; diff --git a/bsd/crypto/des/sk.h b/bsd/crypto/des/sk.h deleted file mode 100644 index 6009c114a..000000000 --- a/bsd/crypto/des/sk.h +++ /dev/null @@ -1,196 +0,0 @@ -/* $FreeBSD: src/sys/crypto/des/sk.h,v 1.1.2.1 2000/07/15 07:14:21 kris Exp $ */ -/* $KAME: sk.h,v 1.3 2000/03/27 04:36:34 sumikawa Exp $ */ - -/* crypto/des/sk.h */ -/* Copyright (C) 1995-1996 Eric Young (eay@mincom.oz.au) - * All rights reserved. - * - * This file is part of an SSL implementation written - * by Eric Young (eay@mincom.oz.au). - * The implementation was written so as to conform with Netscapes SSL - * specification. This library and applications are - * FREE FOR COMMERCIAL AND NON-COMMERCIAL USE - * as long as the following conditions are aheared to. - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. If this code is used in a product, - * Eric Young should be given attribution as the author of the parts used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Eric Young (eay@mincom.oz.au) - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
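The des_skb tables that follow in sk.h are the compression permutation PC-2 of the key schedule, split into eight 6-bit table lookups so that des_set_key_unchecked (deleted above in des_setkey.c) can assemble each round's two subkey words with ORs instead of bit-by-bit extraction. The per-round rotation feeding those lookups keeps each 28-bit key half in the low bits of a 32-bit word; in isolation it is just (a sketch, with an illustrative name):

    #include <stdint.h>

    /* Rotate a 28-bit half (held in the low 28 bits) by 1 or 2 positions,
     * mirroring c=((c>>2L)|(c<<26L)) / c=((c>>1L)|(c<<27L)) above. */
    static uint32_t rot28(uint32_t h, int by)
    {
        return ((h >> by) | (h << (28 - by))) & 0x0fffffffUL;
    }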
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -static const DES_LONG des_skb[8][64]={ -{ -/* for C bits (numbered as per FIPS 46) 1 2 3 4 5 6 */ -0x00000000L,0x00000010L,0x20000000L,0x20000010L, -0x00010000L,0x00010010L,0x20010000L,0x20010010L, -0x00000800L,0x00000810L,0x20000800L,0x20000810L, -0x00010800L,0x00010810L,0x20010800L,0x20010810L, -0x00000020L,0x00000030L,0x20000020L,0x20000030L, -0x00010020L,0x00010030L,0x20010020L,0x20010030L, -0x00000820L,0x00000830L,0x20000820L,0x20000830L, -0x00010820L,0x00010830L,0x20010820L,0x20010830L, -0x00080000L,0x00080010L,0x20080000L,0x20080010L, -0x00090000L,0x00090010L,0x20090000L,0x20090010L, -0x00080800L,0x00080810L,0x20080800L,0x20080810L, -0x00090800L,0x00090810L,0x20090800L,0x20090810L, -0x00080020L,0x00080030L,0x20080020L,0x20080030L, -0x00090020L,0x00090030L,0x20090020L,0x20090030L, -0x00080820L,0x00080830L,0x20080820L,0x20080830L, -0x00090820L,0x00090830L,0x20090820L,0x20090830L, -},{ -/* for C bits (numbered as per FIPS 46) 7 8 10 11 12 13 */ -0x00000000L,0x02000000L,0x00002000L,0x02002000L, -0x00200000L,0x02200000L,0x00202000L,0x02202000L, -0x00000004L,0x02000004L,0x00002004L,0x02002004L, -0x00200004L,0x02200004L,0x00202004L,0x02202004L, -0x00000400L,0x02000400L,0x00002400L,0x02002400L, -0x00200400L,0x02200400L,0x00202400L,0x02202400L, -0x00000404L,0x02000404L,0x00002404L,0x02002404L, -0x00200404L,0x02200404L,0x00202404L,0x02202404L, -0x10000000L,0x12000000L,0x10002000L,0x12002000L, -0x10200000L,0x12200000L,0x10202000L,0x12202000L, -0x10000004L,0x12000004L,0x10002004L,0x12002004L, -0x10200004L,0x12200004L,0x10202004L,0x12202004L, -0x10000400L,0x12000400L,0x10002400L,0x12002400L, -0x10200400L,0x12200400L,0x10202400L,0x12202400L, -0x10000404L,0x12000404L,0x10002404L,0x12002404L, -0x10200404L,0x12200404L,0x10202404L,0x12202404L, -},{ -/* for C bits (numbered as per FIPS 46) 14 15 16 17 19 20 */ -0x00000000L,0x00000001L,0x00040000L,0x00040001L, -0x01000000L,0x01000001L,0x01040000L,0x01040001L, -0x00000002L,0x00000003L,0x00040002L,0x00040003L, -0x01000002L,0x01000003L,0x01040002L,0x01040003L, -0x00000200L,0x00000201L,0x00040200L,0x00040201L, -0x01000200L,0x01000201L,0x01040200L,0x01040201L, -0x00000202L,0x00000203L,0x00040202L,0x00040203L, -0x01000202L,0x01000203L,0x01040202L,0x01040203L, -0x08000000L,0x08000001L,0x08040000L,0x08040001L, -0x09000000L,0x09000001L,0x09040000L,0x09040001L, -0x08000002L,0x08000003L,0x08040002L,0x08040003L, -0x09000002L,0x09000003L,0x09040002L,0x09040003L, -0x08000200L,0x08000201L,0x08040200L,0x08040201L, -0x09000200L,0x09000201L,0x09040200L,0x09040201L, -0x08000202L,0x08000203L,0x08040202L,0x08040203L, -0x09000202L,0x09000203L,0x09040202L,0x09040203L, -},{ -/* for C bits (numbered as per FIPS 46) 21 23 24 26 27 28 */ -0x00000000L,0x00100000L,0x00000100L,0x00100100L, 
-0x00000008L,0x00100008L,0x00000108L,0x00100108L, -0x00001000L,0x00101000L,0x00001100L,0x00101100L, -0x00001008L,0x00101008L,0x00001108L,0x00101108L, -0x04000000L,0x04100000L,0x04000100L,0x04100100L, -0x04000008L,0x04100008L,0x04000108L,0x04100108L, -0x04001000L,0x04101000L,0x04001100L,0x04101100L, -0x04001008L,0x04101008L,0x04001108L,0x04101108L, -0x00020000L,0x00120000L,0x00020100L,0x00120100L, -0x00020008L,0x00120008L,0x00020108L,0x00120108L, -0x00021000L,0x00121000L,0x00021100L,0x00121100L, -0x00021008L,0x00121008L,0x00021108L,0x00121108L, -0x04020000L,0x04120000L,0x04020100L,0x04120100L, -0x04020008L,0x04120008L,0x04020108L,0x04120108L, -0x04021000L,0x04121000L,0x04021100L,0x04121100L, -0x04021008L,0x04121008L,0x04021108L,0x04121108L, -},{ -/* for D bits (numbered as per FIPS 46) 1 2 3 4 5 6 */ -0x00000000L,0x10000000L,0x00010000L,0x10010000L, -0x00000004L,0x10000004L,0x00010004L,0x10010004L, -0x20000000L,0x30000000L,0x20010000L,0x30010000L, -0x20000004L,0x30000004L,0x20010004L,0x30010004L, -0x00100000L,0x10100000L,0x00110000L,0x10110000L, -0x00100004L,0x10100004L,0x00110004L,0x10110004L, -0x20100000L,0x30100000L,0x20110000L,0x30110000L, -0x20100004L,0x30100004L,0x20110004L,0x30110004L, -0x00001000L,0x10001000L,0x00011000L,0x10011000L, -0x00001004L,0x10001004L,0x00011004L,0x10011004L, -0x20001000L,0x30001000L,0x20011000L,0x30011000L, -0x20001004L,0x30001004L,0x20011004L,0x30011004L, -0x00101000L,0x10101000L,0x00111000L,0x10111000L, -0x00101004L,0x10101004L,0x00111004L,0x10111004L, -0x20101000L,0x30101000L,0x20111000L,0x30111000L, -0x20101004L,0x30101004L,0x20111004L,0x30111004L, -},{ -/* for D bits (numbered as per FIPS 46) 8 9 11 12 13 14 */ -0x00000000L,0x08000000L,0x00000008L,0x08000008L, -0x00000400L,0x08000400L,0x00000408L,0x08000408L, -0x00020000L,0x08020000L,0x00020008L,0x08020008L, -0x00020400L,0x08020400L,0x00020408L,0x08020408L, -0x00000001L,0x08000001L,0x00000009L,0x08000009L, -0x00000401L,0x08000401L,0x00000409L,0x08000409L, -0x00020001L,0x08020001L,0x00020009L,0x08020009L, -0x00020401L,0x08020401L,0x00020409L,0x08020409L, -0x02000000L,0x0A000000L,0x02000008L,0x0A000008L, -0x02000400L,0x0A000400L,0x02000408L,0x0A000408L, -0x02020000L,0x0A020000L,0x02020008L,0x0A020008L, -0x02020400L,0x0A020400L,0x02020408L,0x0A020408L, -0x02000001L,0x0A000001L,0x02000009L,0x0A000009L, -0x02000401L,0x0A000401L,0x02000409L,0x0A000409L, -0x02020001L,0x0A020001L,0x02020009L,0x0A020009L, -0x02020401L,0x0A020401L,0x02020409L,0x0A020409L, -},{ -/* for D bits (numbered as per FIPS 46) 16 17 18 19 20 21 */ -0x00000000L,0x00000100L,0x00080000L,0x00080100L, -0x01000000L,0x01000100L,0x01080000L,0x01080100L, -0x00000010L,0x00000110L,0x00080010L,0x00080110L, -0x01000010L,0x01000110L,0x01080010L,0x01080110L, -0x00200000L,0x00200100L,0x00280000L,0x00280100L, -0x01200000L,0x01200100L,0x01280000L,0x01280100L, -0x00200010L,0x00200110L,0x00280010L,0x00280110L, -0x01200010L,0x01200110L,0x01280010L,0x01280110L, -0x00000200L,0x00000300L,0x00080200L,0x00080300L, -0x01000200L,0x01000300L,0x01080200L,0x01080300L, -0x00000210L,0x00000310L,0x00080210L,0x00080310L, -0x01000210L,0x01000310L,0x01080210L,0x01080310L, -0x00200200L,0x00200300L,0x00280200L,0x00280300L, -0x01200200L,0x01200300L,0x01280200L,0x01280300L, -0x00200210L,0x00200310L,0x00280210L,0x00280310L, -0x01200210L,0x01200310L,0x01280210L,0x01280310L, -},{ -/* for D bits (numbered as per FIPS 46) 22 23 24 25 27 28 */ -0x00000000L,0x04000000L,0x00040000L,0x04040000L, -0x00000002L,0x04000002L,0x00040002L,0x04040002L, 
-0x00002000L,0x04002000L,0x00042000L,0x04042000L, -0x00002002L,0x04002002L,0x00042002L,0x04042002L, -0x00000020L,0x04000020L,0x00040020L,0x04040020L, -0x00000022L,0x04000022L,0x00040022L,0x04040022L, -0x00002020L,0x04002020L,0x00042020L,0x04042020L, -0x00002022L,0x04002022L,0x00042022L,0x04042022L, -0x00000800L,0x04000800L,0x00040800L,0x04040800L, -0x00000802L,0x04000802L,0x00040802L,0x04040802L, -0x00002800L,0x04002800L,0x00042800L,0x04042800L, -0x00002802L,0x04002802L,0x00042802L,0x04042802L, -0x00000820L,0x04000820L,0x00040820L,0x04040820L, -0x00000822L,0x04000822L,0x00040822L,0x04040822L, -0x00002820L,0x04002820L,0x00042820L,0x04042820L, -0x00002822L,0x04002822L,0x00042822L,0x04042822L, -}}; diff --git a/bsd/crypto/des/spr.h b/bsd/crypto/des/spr.h deleted file mode 100644 index e7d8626dc..000000000 --- a/bsd/crypto/des/spr.h +++ /dev/null @@ -1,207 +0,0 @@ -/* $FreeBSD: src/sys/crypto/des/spr.h,v 1.1.2.2 2002/03/26 10:12:25 ume Exp $ */ -/* $KAME: spr.h,v 1.4 2001/09/10 04:03:58 itojun Exp $ */ - -/* crypto/des/spr.h */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -const DES_LONG des_SPtrans[8][64]={ -{ -/* nibble 0 */ -0x02080800L, 0x00080000L, 0x02000002L, 0x02080802L, -0x02000000L, 0x00080802L, 0x00080002L, 0x02000002L, -0x00080802L, 0x02080800L, 0x02080000L, 0x00000802L, -0x02000802L, 0x02000000L, 0x00000000L, 0x00080002L, -0x00080000L, 0x00000002L, 0x02000800L, 0x00080800L, -0x02080802L, 0x02080000L, 0x00000802L, 0x02000800L, -0x00000002L, 0x00000800L, 0x00080800L, 0x02080002L, -0x00000800L, 0x02000802L, 0x02080002L, 0x00000000L, -0x00000000L, 0x02080802L, 0x02000800L, 0x00080002L, -0x02080800L, 0x00080000L, 0x00000802L, 0x02000800L, -0x02080002L, 0x00000800L, 0x00080800L, 0x02000002L, -0x00080802L, 0x00000002L, 0x02000002L, 0x02080000L, -0x02080802L, 0x00080800L, 0x02080000L, 0x02000802L, -0x02000000L, 0x00000802L, 0x00080002L, 0x00000000L, -0x00080000L, 0x02000000L, 0x02000802L, 0x02080800L, -0x00000002L, 0x02080002L, 0x00000800L, 0x00080802L, -},{ -/* nibble 1 */ -0x40108010L, 0x00000000L, 0x00108000L, 0x40100000L, -0x40000010L, 0x00008010L, 0x40008000L, 0x00108000L, -0x00008000L, 0x40100010L, 0x00000010L, 0x40008000L, -0x00100010L, 0x40108000L, 0x40100000L, 0x00000010L, -0x00100000L, 0x40008010L, 0x40100010L, 0x00008000L, -0x00108010L, 0x40000000L, 0x00000000L, 0x00100010L, -0x40008010L, 0x00108010L, 0x40108000L, 0x40000010L, -0x40000000L, 0x00100000L, 0x00008010L, 0x40108010L, -0x00100010L, 0x40108000L, 0x40008000L, 0x00108010L, -0x40108010L, 0x00100010L, 0x40000010L, 0x00000000L, -0x40000000L, 0x00008010L, 0x00100000L, 0x40100010L, -0x00008000L, 0x40000000L, 0x00108010L, 0x40008010L, -0x40108000L, 0x00008000L, 0x00000000L, 0x40000010L, -0x00000010L, 0x40108010L, 0x00108000L, 0x40100000L, -0x40100010L, 0x00100000L, 0x00008010L, 0x40008000L, -0x40008010L, 0x00000010L, 0x40100000L, 0x00108000L, -},{ -/* nibble 2 */ -0x04000001L, 0x04040100L, 0x00000100L, 0x04000101L, -0x00040001L, 0x04000000L, 0x04000101L, 0x00040100L, -0x04000100L, 0x00040000L, 0x04040000L, 0x00000001L, -0x04040101L, 0x00000101L, 0x00000001L, 0x04040001L, -0x00000000L, 0x00040001L, 0x04040100L, 0x00000100L, -0x00000101L, 0x04040101L, 0x00040000L, 0x04000001L, -0x04040001L, 0x04000100L, 0x00040101L, 0x04040000L, -0x00040100L, 0x00000000L, 0x04000000L, 0x00040101L, -0x04040100L, 0x00000100L, 0x00000001L, 0x00040000L, -0x00000101L, 0x00040001L, 0x04040000L, 0x04000101L, -0x00000000L, 0x04040100L, 0x00040100L, 0x04040001L, -0x00040001L, 0x04000000L, 0x04040101L, 0x00000001L, -0x00040101L, 0x04000001L, 0x04000000L, 0x04040101L, -0x00040000L, 0x04000100L, 0x04000101L, 0x00040100L, -0x04000100L, 0x00000000L, 0x04040001L, 0x00000101L, -0x04000001L, 0x00040101L, 0x00000100L, 0x04040000L, -},{ -/* nibble 3 */ -0x00401008L, 0x10001000L, 0x00000008L, 0x10401008L, -0x00000000L, 0x10400000L, 0x10001008L, 
0x00400008L, -0x10401000L, 0x10000008L, 0x10000000L, 0x00001008L, -0x10000008L, 0x00401008L, 0x00400000L, 0x10000000L, -0x10400008L, 0x00401000L, 0x00001000L, 0x00000008L, -0x00401000L, 0x10001008L, 0x10400000L, 0x00001000L, -0x00001008L, 0x00000000L, 0x00400008L, 0x10401000L, -0x10001000L, 0x10400008L, 0x10401008L, 0x00400000L, -0x10400008L, 0x00001008L, 0x00400000L, 0x10000008L, -0x00401000L, 0x10001000L, 0x00000008L, 0x10400000L, -0x10001008L, 0x00000000L, 0x00001000L, 0x00400008L, -0x00000000L, 0x10400008L, 0x10401000L, 0x00001000L, -0x10000000L, 0x10401008L, 0x00401008L, 0x00400000L, -0x10401008L, 0x00000008L, 0x10001000L, 0x00401008L, -0x00400008L, 0x00401000L, 0x10400000L, 0x10001008L, -0x00001008L, 0x10000000L, 0x10000008L, 0x10401000L, -},{ -/* nibble 4 */ -0x08000000L, 0x00010000L, 0x00000400L, 0x08010420L, -0x08010020L, 0x08000400L, 0x00010420L, 0x08010000L, -0x00010000L, 0x00000020L, 0x08000020L, 0x00010400L, -0x08000420L, 0x08010020L, 0x08010400L, 0x00000000L, -0x00010400L, 0x08000000L, 0x00010020L, 0x00000420L, -0x08000400L, 0x00010420L, 0x00000000L, 0x08000020L, -0x00000020L, 0x08000420L, 0x08010420L, 0x00010020L, -0x08010000L, 0x00000400L, 0x00000420L, 0x08010400L, -0x08010400L, 0x08000420L, 0x00010020L, 0x08010000L, -0x00010000L, 0x00000020L, 0x08000020L, 0x08000400L, -0x08000000L, 0x00010400L, 0x08010420L, 0x00000000L, -0x00010420L, 0x08000000L, 0x00000400L, 0x00010020L, -0x08000420L, 0x00000400L, 0x00000000L, 0x08010420L, -0x08010020L, 0x08010400L, 0x00000420L, 0x00010000L, -0x00010400L, 0x08010020L, 0x08000400L, 0x00000420L, -0x00000020L, 0x00010420L, 0x08010000L, 0x08000020L, -},{ -/* nibble 5 */ -0x80000040L, 0x00200040L, 0x00000000L, 0x80202000L, -0x00200040L, 0x00002000L, 0x80002040L, 0x00200000L, -0x00002040L, 0x80202040L, 0x00202000L, 0x80000000L, -0x80002000L, 0x80000040L, 0x80200000L, 0x00202040L, -0x00200000L, 0x80002040L, 0x80200040L, 0x00000000L, -0x00002000L, 0x00000040L, 0x80202000L, 0x80200040L, -0x80202040L, 0x80200000L, 0x80000000L, 0x00002040L, -0x00000040L, 0x00202000L, 0x00202040L, 0x80002000L, -0x00002040L, 0x80000000L, 0x80002000L, 0x00202040L, -0x80202000L, 0x00200040L, 0x00000000L, 0x80002000L, -0x80000000L, 0x00002000L, 0x80200040L, 0x00200000L, -0x00200040L, 0x80202040L, 0x00202000L, 0x00000040L, -0x80202040L, 0x00202000L, 0x00200000L, 0x80002040L, -0x80000040L, 0x80200000L, 0x00202040L, 0x00000000L, -0x00002000L, 0x80000040L, 0x80002040L, 0x80202000L, -0x80200000L, 0x00002040L, 0x00000040L, 0x80200040L, -},{ -/* nibble 6 */ -0x00004000L, 0x00000200L, 0x01000200L, 0x01000004L, -0x01004204L, 0x00004004L, 0x00004200L, 0x00000000L, -0x01000000L, 0x01000204L, 0x00000204L, 0x01004000L, -0x00000004L, 0x01004200L, 0x01004000L, 0x00000204L, -0x01000204L, 0x00004000L, 0x00004004L, 0x01004204L, -0x00000000L, 0x01000200L, 0x01000004L, 0x00004200L, -0x01004004L, 0x00004204L, 0x01004200L, 0x00000004L, -0x00004204L, 0x01004004L, 0x00000200L, 0x01000000L, -0x00004204L, 0x01004000L, 0x01004004L, 0x00000204L, -0x00004000L, 0x00000200L, 0x01000000L, 0x01004004L, -0x01000204L, 0x00004204L, 0x00004200L, 0x00000000L, -0x00000200L, 0x01000004L, 0x00000004L, 0x01000200L, -0x00000000L, 0x01000204L, 0x01000200L, 0x00004200L, -0x00000204L, 0x00004000L, 0x01004204L, 0x01000000L, -0x01004200L, 0x00000004L, 0x00004004L, 0x01004204L, -0x01000004L, 0x01004200L, 0x01004000L, 0x00004004L, -},{ -/* nibble 7 */ -0x20800080L, 0x20820000L, 0x00020080L, 0x00000000L, -0x20020000L, 0x00800080L, 0x20800000L, 0x20820080L, -0x00000080L, 0x20000000L, 0x00820000L, 0x00020080L, 
-0x00820080L, 0x20020080L, 0x20000080L, 0x20800000L, -0x00020000L, 0x00820080L, 0x00800080L, 0x20020000L, -0x20820080L, 0x20000080L, 0x00000000L, 0x00820000L, -0x20000000L, 0x00800000L, 0x20020080L, 0x20800080L, -0x00800000L, 0x00020000L, 0x20820000L, 0x00000080L, -0x00800000L, 0x00020000L, 0x20000080L, 0x20820080L, -0x00020080L, 0x20000000L, 0x00000000L, 0x00820000L, -0x20800080L, 0x20020080L, 0x20020000L, 0x00800080L, -0x20820000L, 0x00000080L, 0x00800080L, 0x20020000L, -0x20820080L, 0x00800000L, 0x20800000L, 0x20000080L, -0x00820000L, 0x00020080L, 0x20020080L, 0x20800000L, -0x00000080L, 0x20820000L, 0x00820080L, 0x00000000L, -0x20000000L, 0x20800080L, 0x00020000L, 0x00820080L, -}}; diff --git a/bsd/crypto/sha2.h b/bsd/crypto/sha2.h new file mode 100644 index 000000000..7e1dea80c --- /dev/null +++ b/bsd/crypto/sha2.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2012 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * This header file is kept for legacy reasons and may be removed in + * future; the interface resides in <libkern/crypto/sha2.h>. + */ +#include <libkern/crypto/sha2.h>
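With the compatibility shim above in place, existing call sites keep building while the implementation moves under libkern/corecrypto. For orientation, a minimal sketch of what a kernel caller looks like after this change, assuming the libkern wrapper keeps the traditional one-shot names and shapes (SHA256_CTX, SHA256_Init/SHA256_Update/SHA256_Final, SHA256_DIGEST_LENGTH); treat the exact signatures as assumptions, since the diff does not show them:

#include <libkern/crypto/sha2.h>

/* Hash an in-kernel buffer; names and signatures assumed as noted above. */
static void
example_sha256(const void *buf, size_t len,
               u_int8_t digest[SHA256_DIGEST_LENGTH])
{
	SHA256_CTX ctx;

	SHA256_Init(&ctx);
	SHA256_Update(&ctx, buf, len);	/* may be called repeatedly to stream input */
	SHA256_Final(digest, &ctx);
}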
diff --git a/bsd/crypto/sha2/Makefile b/bsd/crypto/sha2/Makefile deleted file mode 100644 index 4cc93fb76..000000000 --- a/bsd/crypto/sha2/Makefile +++ /dev/null @@ -1,36 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -INSTINC_SUBDIRS = \ - -INSTINC_SUBDIRS_I386 = \ - -INSTINC_SUBDIRS_X86_64 = \ - -INSTINC_SUBDIRS_ARM = \ - -EXPINC_SUBDIRS = \ - -EXPINC_SUBDIRS_I386 = \ - -EXPINC_SUBDIRS_X86_64 = \ - -EXPINC_SUBDIRS_ARM = \ - -PRIVATE_DATAFILES = \ - sha2.h - -INSTALL_MI_DIR = crypto - -EXPORT_MI_DIR = ${INSTALL_MI_DIR} - -INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} - -include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/bsd/crypto/sha2/intel/sha256.s b/bsd/crypto/sha2/intel/sha256.s deleted file mode 100644 index 59353ff4b..000000000 --- a/bsd/crypto/sha2/intel/sha256.s +++ /dev/null @@ -1,617 +0,0 @@ -/* - This file provides x86_64/i386 hand implementation of the following function - - void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks); - - which is a C function in sha2.c (from xnu). - - The code first probes cpu_capabilities to detect whether ssse3 is supported. If not, it branches to - SHA256_Transform_nossse3 (in a separate source file sha256nossse3.s) that was cloned from this file - with all ssse3 instructions replaced with sse3 or below instructions. - - sha256 algorithm per block description: - - 1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte) - 2. load 8 digests a-h from ctx->state - 3. for r = 0:15 - T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r]; - d += T1; - h = T1 + Sigma0(a) + Maj(a,b,c) - permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g - 4.
for r = 16:63 - W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]); - T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r]; - d += T1; - h = T1 + Sigma0(a) + Maj(a,b,c) - permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g - - In the assembly implementation: - - a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3 - - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer - - the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386) - - the implementation per block looks like - - ---------------------------------------------------------------------------- - - load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 - pre_calculate and store W+K(0:15) in stack - - load digests a-h from ctx->state; - - for (r=0;r<48;r+=4) { - digests a-h update and permute round r:r+3 - update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration - } - - for (r=48;r<64;r+=4) { - digests a-h update and permute round r:r+3 - } - - ctx->states += digests a-h; - - ---------------------------------------------------------------------------- - - our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block - into the last 16 rounds of its previous block: - - ---------------------------------------------------------------------------- - - load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 - pre_calculate and store W+K(0:15) in stack - -L_loop: - - load digests a-h from ctx->state; - - for (r=0;r<48;r+=4) { - digests a-h update and permute round r:r+3 - update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration - } - - num_block--; - if (num_block==0) jmp L_last_block; - - for (r=48;r<64;r+=4) { - digests a-h update and permute round r:r+3 - load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3 - pre_calculate and store W+K([r:r+3]%16) in stack - } - - ctx->states += digests a-h; - - jmp L_loop; - -L_last_block: - - for (r=48;r<64;r+=4) { - digests a-h update and permute round r:r+3 - } - - ctx->states += digests a-h; - - ------------------------------------------------------------------------ - - Apple CoreOS vector & numerics - cclee 8-3-10 -*/ - -#if defined KERNEL -#include <i386/cpu_capabilities.h> -#else -#include <System/i386/cpu_capabilities.h> -#endif - - // associate variables with registers or memory - -#if defined (__x86_64__) - #define sp %rsp - #define ctx %rdi - #define data %rsi - #define num_blocks %rdx - - #define a %r8d - #define b %r9d - #define c %r10d - #define d %r11d - #define e %r12d - #define f %r13d - #define g %r14d - #define h %r15d - - #define K %rbx - #define stack_size (8+16*8+16+64) // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15) - - #define L_aligned_bswap 64(sp) // bswap : big-endian loading of 4-byte words - #define xmm_save 80(sp) // starting address for xmm save/restore -#else - #define sp %esp - #define stack_size (12+16*8+16+16+64) // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15) - #define ctx_addr 20+stack_size(sp) // ret_addr + 4 registers = 20, 1st caller argument - #define data_addr 24+stack_size(sp) // 2nd caller argument - #define num_blocks 28+stack_size(sp) // 3rd caller argument - - #define a %ebx - #define b %edx - #define c 64(sp) - #define d %ebp - #define e %esi - #define f 68(sp) - #define g %edi - #define h 72(sp) - - #define K 76(sp) // pointer to K256[] table - #define L_aligned_bswap 80(sp) // bswap : big-endian loading of 4-byte words - #define xmm_save 96(sp) // starting address for xmm save/restore -#endif - - // 2 local variables - #define
t %eax - #define s %ecx - - // a window (16 words) of message scheule - #define W0 %xmm0 - #define W1 %xmm1 - #define W2 %xmm2 - #define W3 %xmm3 - - // circular buffer for WK[(r:r+15)%16] - #define WK(x) (x&15)*4(sp) - -// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) - - .macro Ch - mov $0, t // x - mov $0, s // x - not t // ~x - and $1, s // x & y - and $2, t // ~x & z - xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z)); - .endm - -// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) - - .macro Maj - mov $0, t // x - mov $1, s // y - and s, t // x&y - and $2, s // y&z - xor s, t // (x&y) ^ (y&z) - mov $2, s // z - and $0, s // (x&z) - xor s, t // t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) - .endm - -/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */ -// #define R(b,x) ((x) >> (b)) -/* 32-bit Rotate-right (used in SHA-256): */ -// #define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b)))) - -// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x))) - - // performs sigma0_256 on 4 words on an xmm registers - // use xmm6/xmm7 as intermediate registers - .macro sigma0 - movdqa $0, %xmm6 - movdqa $0, %xmm7 - psrld $$3, $0 // SHR3(x) - psrld $$7, %xmm6 // part of ROTR7 - pslld $$14, %xmm7 // part of ROTR18 - pxor %xmm6, $0 - pxor %xmm7, $0 - psrld $$11, %xmm6 // part of ROTR18 - pslld $$11, %xmm7 // part of ROTR7 - pxor %xmm6, $0 - pxor %xmm7, $0 - .endm - -// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x))) - - // performs sigma1_256 on 4 words on an xmm registers - // use xmm6/xmm7 as intermediate registers - .macro sigma1 - movdqa $0, %xmm6 - movdqa $0, %xmm7 - psrld $$10, $0 // SHR10(x) - psrld $$17, %xmm6 // part of ROTR17 - pxor %xmm6, $0 - pslld $$13, %xmm7 // part of ROTR19 - pxor %xmm7, $0 - psrld $$2, %xmm6 // part of ROTR19 - pxor %xmm6, $0 - pslld $$2, %xmm7 // part of ROTR17 - pxor %xmm7, $0 - .endm - -// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) - - .macro Sigma0 - mov $0, t // x - mov $0, s // x - ror $$2, t // S32(2, (x)) - ror $$13, s // S32(13, (x)) - xor s, t // S32(2, (x)) ^ S32(13, (x)) - ror $$9, s // S32(22, (x)) - xor s, t // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) - .endm - -// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x))) - - .macro Sigma1 - mov $0, s // x - ror $$6, s // S32(6, (x)) - mov s, t // S32(6, (x)) - ror $$5, s // S32(11, (x)) - xor s, t // S32(6, (x)) ^ S32(11, (x)) - ror $$14, s // S32(25, (x)) - xor s, t // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x))) - .endm - - // per round digests update - .macro round - Sigma1 $4 // t = T1 - add t, $7 // use h to store h+Sigma1(e) - Ch $4, $5, $6 // t = Ch (e, f, g); - add $7, t // t = h+Sigma1(e)+Ch(e,f,g); - add WK($8), t // h = T1 - add t, $3 // d += T1; - mov t, $7 // h = T1 - Sigma0 $0 // t = Sigma0(a); - add t, $7 // h = T1 + Sigma0(a); - Maj $0, $1, $2 // t = Maj(a,b,c) - add t, $7 // h = T1 + Sigma0(a) + Maj(a,b,c); - .endm - - // per 4 rounds digests update and permutation - // permutation is absorbed by rotating the roles of digests a-h - .macro rounds - round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8 - round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8 - round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8 - round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8 - .endm - - // update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future - .macro message_schedule - - // 4 32-bit K256 words in xmm5 -#if defined (__x86_64__) - movdqu (K), %xmm5 -#else - mov K, t - movdqu (t), %xmm5 -#endif - add $$16, K // K points to next K256 
word for next iteration - movdqa $1, %xmm4 // W7:W4 - palignr $$4, $0, %xmm4 // W4:W1 - sigma0 %xmm4 // sigma0(W4:W1) - movdqa $3, %xmm6 // W15:W12 - paddd %xmm4, $0 // $0 = W3:W0 + sigma0(W4:W1) - palignr $$4, $2, %xmm6 // W12:W9 - paddd %xmm6, $0 // $0 = W12:W9 + sigma0(W4:W1) + W3:W0 - movdqa $3, %xmm4 // W15:W12 - psrldq $$8, %xmm4 // 0,0,W15,W14 - sigma1 %xmm4 // sigma1(0,0,W15,W14) - paddd %xmm4, $0 // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0 - movdqa $0, %xmm4 // W19-sigma1(W17), W18-sigma1(W16), W17, W16 - pslldq $$8, %xmm4 // W17, W16, 0, 0 - sigma1 %xmm4 // sigma1(W17,W16,0,0) - paddd %xmm4, $0 // W19:W16 - paddd $0, %xmm5 // WK - movdqa %xmm5, WK($4) - .endm - - // this macro is used in the last 16 rounds of a current block - // it reads the next message (16 4-byte words), load it into 4 words W[r:r+3], computes WK[r:r+3] - // and save into stack to prepare for next block - - .macro update_W_WK -#if defined (__x86_64__) - movdqu $0*16(data), $1 // read 4 4-byte words - pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3] - movdqu $0*16(K), %xmm4 // K[r:r+3] -#else - mov data_addr, t - movdqu $0*16(t), $1 // read 4 4-byte words - pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3] - mov K, t - movdqu $0*16(t), %xmm4 // K[r:r+3] -#endif - paddd $1, %xmm4 // WK[r:r+3] - movdqa %xmm4, WK($0*4) // save WK[r:r+3] into stack circular buffer - .endm - - .text - -#if defined (__x86_64__) || defined (__i386__) - - .globl _SHA256_Transform - -_SHA256_Transform: - - - // detect SSSE3 and dispatch appropriate code branch - #if defined __x86_64__ - movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities - mov (%rax), %eax // %eax = __cpu_capabilities - #else // i386 - #if defined KERNEL - leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities - mov (%eax), %eax // %eax = __cpu_capabilities - #else - mov _COMM_PAGE_CPU_CAPABILITIES, %eax - #endif - #endif - test $(kHasSupplementalSSE3), %eax - je _SHA256_Transform_nossse3 // branch to no-ssse3 code - - // push callee-saved registers -#if defined (__x86_64__) - push %rbp - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 -#else - push %ebp - push %ebx - push %esi - push %edi -#endif - - // allocate stack space - sub $stack_size, sp - - // if kernel code, save used xmm registers -#if KERNEL - movdqa %xmm0, 0*16+xmm_save - movdqa %xmm1, 1*16+xmm_save - movdqa %xmm2, 2*16+xmm_save - movdqa %xmm3, 3*16+xmm_save - movdqa %xmm4, 4*16+xmm_save - movdqa %xmm5, 5*16+xmm_save - movdqa %xmm6, 6*16+xmm_save - movdqa %xmm7, 7*16+xmm_save -#endif - - // set up bswap parameters in the aligned stack space and pointer to table K256[] -#if defined (__x86_64__) - lea _K256(%rip), K - lea L_bswap(%rip), %rax - movdqa (%rax), %xmm0 -#else - lea _K256, t - mov t, K - lea L_bswap, %eax - movdqa (%eax), %xmm0 -#endif - movdqa %xmm0, L_aligned_bswap - - // load W[0:15] into xmm0-xmm3 -#if defined (__x86_64__) - movdqu 0*16(data), W0 - movdqu 1*16(data), W1 - movdqu 2*16(data), W2 - movdqu 3*16(data), W3 - add $64, data -#else - mov data_addr, t - movdqu 0*16(t), W0 - movdqu 1*16(t), W1 - movdqu 2*16(t), W2 - movdqu 3*16(t), W3 - add $64, data_addr -#endif - pshufb L_aligned_bswap, W0 - pshufb L_aligned_bswap, W1 - pshufb L_aligned_bswap, W2 - pshufb L_aligned_bswap, W3 - - // compute WK[0:15] and save in stack -#if defined (__x86_64__) - movdqu 0*16(K), %xmm4 - movdqu 1*16(K), %xmm5 - movdqu 2*16(K), %xmm6 - movdqu 3*16(K), %xmm7 -#else - mov K, t - movdqu 0*16(t), %xmm4 - movdqu 1*16(t), 
%xmm5 - movdqu 2*16(t), %xmm6 - movdqu 3*16(t), %xmm7 -#endif - add $64, K - paddd %xmm0, %xmm4 - paddd %xmm1, %xmm5 - paddd %xmm2, %xmm6 - paddd %xmm3, %xmm7 - movdqa %xmm4, WK(0) - movdqa %xmm5, WK(4) - movdqa %xmm6, WK(8) - movdqa %xmm7, WK(12) - -L_loop: - - // digests a-h = ctx->states; -#if defined (__x86_64__) - mov 0*4(ctx), a - mov 1*4(ctx), b - mov 2*4(ctx), c - mov 3*4(ctx), d - mov 4*4(ctx), e - mov 5*4(ctx), f - mov 6*4(ctx), g - mov 7*4(ctx), h -#else - mov ctx_addr, t - mov 0*4(t), a - mov 1*4(t), b - mov 2*4(t), s - mov s, c - mov 3*4(t), d - mov 4*4(t), e - mov 5*4(t), s - mov s, f - mov 6*4(t), g - mov 7*4(t), s - mov s, h -#endif - - // rounds 0:47 interleaved with W/WK update for rounds 16:63 - rounds a, b, c, d, e, f, g, h, 0 - message_schedule W0,W1,W2,W3,16 - rounds e, f, g, h, a, b, c, d, 4 - message_schedule W1,W2,W3,W0,20 - rounds a, b, c, d, e, f, g, h, 8 - message_schedule W2,W3,W0,W1,24 - rounds e, f, g, h, a, b, c, d, 12 - message_schedule W3,W0,W1,W2,28 - rounds a, b, c, d, e, f, g, h, 16 - message_schedule W0,W1,W2,W3,32 - rounds e, f, g, h, a, b, c, d, 20 - message_schedule W1,W2,W3,W0,36 - rounds a, b, c, d, e, f, g, h, 24 - message_schedule W2,W3,W0,W1,40 - rounds e, f, g, h, a, b, c, d, 28 - message_schedule W3,W0,W1,W2,44 - rounds a, b, c, d, e, f, g, h, 32 - message_schedule W0,W1,W2,W3,48 - rounds e, f, g, h, a, b, c, d, 36 - message_schedule W1,W2,W3,W0,52 - rounds a, b, c, d, e, f, g, h, 40 - message_schedule W2,W3,W0,W1,56 - rounds e, f, g, h, a, b, c, d, 44 - message_schedule W3,W0,W1,W2,60 - - // revert K to the beginning of K256[] -#if defined __x86_64__ - sub $256, K -#else - subl $256, K -#endif - - sub $1, num_blocks // num_blocks-- - je L_final_block // if final block, wrap up final rounds - - // rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15 - rounds a, b, c, d, e, f, g, h, 48 - update_W_WK 0, W0 - rounds e, f, g, h, a, b, c, d, 52 - update_W_WK 1, W1 - rounds a, b, c, d, e, f, g, h, 56 - update_W_WK 2, W2 - rounds e, f, g, h, a, b, c, d, 60 - update_W_WK 3, W3 - - add $64, K -#if defined (__x86_64__) - add $64, data -#else - add $64, data_addr -#endif - - // ctx->states += digests a-h -#if defined (__x86_64__) - add a, 0*4(ctx) - add b, 1*4(ctx) - add c, 2*4(ctx) - add d, 3*4(ctx) - add e, 4*4(ctx) - add f, 5*4(ctx) - add g, 6*4(ctx) - add h, 7*4(ctx) -#else - mov ctx_addr, t - add a, 0*4(t) - add b, 1*4(t) - mov c, s - add s, 2*4(t) - add d, 3*4(t) - add e, 4*4(t) - mov f, s - add s, 5*4(t) - add g, 6*4(t) - mov h, s - add s, 7*4(t) -#endif - - jmp L_loop // branch for next block - - // wrap up digest update round 48:63 for final block -L_final_block: - rounds a, b, c, d, e, f, g, h, 48 - rounds e, f, g, h, a, b, c, d, 52 - rounds a, b, c, d, e, f, g, h, 56 - rounds e, f, g, h, a, b, c, d, 60 - - // ctx->states += digests a-h -#if defined (__x86_64__) - add a, 0*4(ctx) - add b, 1*4(ctx) - add c, 2*4(ctx) - add d, 3*4(ctx) - add e, 4*4(ctx) - add f, 5*4(ctx) - add g, 6*4(ctx) - add h, 7*4(ctx) -#else - mov ctx_addr, t - add a, 0*4(t) - add b, 1*4(t) - mov c, s - add s, 2*4(t) - add d, 3*4(t) - add e, 4*4(t) - mov f, s - add s, 5*4(t) - add g, 6*4(t) - mov h, s - add s, 7*4(t) -#endif - - // if kernel, restore xmm0-xmm7 -#if KERNEL - movdqa 0*16+xmm_save, %xmm0 - movdqa 1*16+xmm_save, %xmm1 - movdqa 2*16+xmm_save, %xmm2 - movdqa 3*16+xmm_save, %xmm3 - movdqa 4*16+xmm_save, %xmm4 - movdqa 5*16+xmm_save, %xmm5 - movdqa 6*16+xmm_save, %xmm6 - movdqa 7*16+xmm_save, %xmm7 -#endif - - // free allocated stack memory - 
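Before the no-SSSE3 clone of this file below, it helps to pin down the arithmetic the vector code above implements. Here is a plain-C model of one 64-byte block, written directly from the formulas quoted in this file's header comment; the function and macro names are illustrative, and the real code differs in form: it keeps W(r:r+15) as a rotating 16-word window in xmm0-xmm3 and stores W+K in a stack circular buffer rather than materializing all 64 schedule words.

#include <stdint.h>

#define ROTR(n, x)    (((x) >> (n)) | ((x) << (32 - (n))))
#define Ch(x, y, z)   (((x) & (y)) ^ (~(x) & (z)))
#define Maj(x, y, z)  (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#define Sigma0(x)     (ROTR(2, (x)) ^ ROTR(13, (x)) ^ ROTR(22, (x)))
#define Sigma1(x)     (ROTR(6, (x)) ^ ROTR(11, (x)) ^ ROTR(25, (x)))
#define sigma0(x)     (ROTR(7, (x)) ^ ROTR(18, (x)) ^ ((x) >> 3))
#define sigma1(x)     (ROTR(17, (x)) ^ ROTR(19, (x)) ^ ((x) >> 10))

/* one block: schedule recurrence plus the 64 digest-update rounds */
static void
sha256_block_model(uint32_t state[8], const uint32_t M[16], const uint32_t K[64])
{
	uint32_t W[64], s[8], T1, T2;
	int r;

	for (r = 0; r < 16; r++)
		W[r] = M[r];		/* already byte-swapped; the asm uses pshufb for this */
	for (r = 16; r < 64; r++)
		W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);

	for (r = 0; r < 8; r++)
		s[r] = state[r];	/* s[0..7] = digests a..h */
	for (r = 0; r < 64; r++) {
		T1 = s[7] + Sigma1(s[4]) + Ch(s[4], s[5], s[6]) + K[r] + W[r];
		T2 = Sigma0(s[0]) + Maj(s[0], s[1], s[2]);
		s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + T1;	/* d += T1 */
		s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = T1 + T2;	/* h = T1 + Sigma0(a) + Maj(a,b,c) */
	}
	for (r = 0; r < 8; r++)
		state[r] += s[r];	/* ctx->states += digests a-h */
}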
add $stack_size, sp - - // restore callee-saved registers -#if defined (__x86_64__) - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - pop %rbp -#else - pop %edi - pop %esi - pop %ebx - pop %ebp -#endif - - // return - ret - - - .const - .align 4, 0x90 - -L_bswap: - .long 0x00010203 - .long 0x04050607 - .long 0x08090a0b - .long 0x0c0d0e0f - -#endif // x86_64/i386 - diff --git a/bsd/crypto/sha2/intel/sha256nossse3.s b/bsd/crypto/sha2/intel/sha256nossse3.s deleted file mode 100644 index b4dd0a035..000000000 --- a/bsd/crypto/sha2/intel/sha256nossse3.s +++ /dev/null @@ -1,649 +0,0 @@ -/* - This file provides x86_64/i386 hand implementation of the following function - - void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks); - - which is a C function in sha2.c (from xnu). - - The code SHA256_Transform_nossse3 is a clone of SHA256_Transform - with all ssse3 instructions replaced with sse3 or below instructions. - - For performance reason, this function should not be called directly. This file should be working - together with the one that implements SHA256_Transform. There, cpu_capabilities is probed to detect - ssse3. If ssse3 is not supported, the execution will be branched to this no-ssse3-specific function. - - sha256 algorithm per block description: - - 1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte) - 2. load 8 digests a-h from ctx->state - 3. for r = 0:15 - T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r]; - d += T1; - h = T1 + Sigma0(a) + Maj(a,b,c) - permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g - 4. for r = 16:63 - W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]); - T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r]; - d += T1; - h = T1 + Sigma0(a) + Maj(a,b,c) - permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g - - In the assembly implementation: - - a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3 - - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer - - the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386) - - the implementation per block looks like - - ---------------------------------------------------------------------------- - - load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 - pre_calculate and store W+K(0:15) in stack - - load digests a-h from ctx->state; - - for (r=0;r<48;r+=4) { - digests a-h update and permute round r:r+3 - update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration - } - - for (r=48;r<64;r+=4) { - digests a-h update and permute round r:r+3 - } - - ctx->states += digests a-h; - - ---------------------------------------------------------------------------- - - our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block - into the last 16 rounds of its previous block: - - ---------------------------------------------------------------------------- - - load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 - pre_calculate and store W+K(0:15) in stack - -L_loop: - - load digests a-h from ctx->state; - - for (r=0;r<48;r+=4) { - digests a-h update and permute round r:r+3 - update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration - } - - num_block--; - if (num_block==0) jmp L_last_block; - - for (r=48;r<64;r+=4) { - digests a-h update and permute round r:r+3 - load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3 - pre_calculate and store W+K([r:r+3]%16) in stack - } - - ctx->states += digests a-h; - - jmp L_loop; - -L_last_block: - - for 
(r=48;r<64;r+=4) { - digests a-h update and permute round r:r+3 - } - - ctx->states += digests a-h; - - ------------------------------------------------------------------------ - - Apple CoreOS vector & numerics - cclee 8-3-10 -*/ - -#if defined KERNEL -#include <i386/cpu_capabilities.h> -#else -#include <System/i386/cpu_capabilities.h> -#endif - - // associate variables with registers or memory - -#if defined (__x86_64__) - #define sp %rsp - #define ctx %rdi - #define data %rsi - #define num_blocks %rdx - - #define a %r8d - #define b %r9d - #define c %r10d - #define d %r11d - #define e %r12d - #define f %r13d - #define g %r14d - #define h %r15d - - #define K %rbx - #define stack_size (8+16*8+16+64) // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15) - - #define xmm_save 80(sp) // starting address for xmm save/restore -#else - #define sp %esp - #define stack_size (12+16*8+16+16+64) // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15) - #define ctx_addr 20+stack_size(sp) // ret_addr + 4 registers = 20, 1st caller argument - #define data_addr 24+stack_size(sp) // 2nd caller argument - #define num_blocks 28+stack_size(sp) // 3rd caller argument - - #define a %ebx - #define b %edx - #define c 64(sp) - #define d %ebp - #define e %esi - #define f 68(sp) - #define g %edi - #define h 72(sp) - - #define K 76(sp) // pointer to K256[] table - #define xmm_save 96(sp) // starting address for xmm save/restore -#endif - - // 2 local variables - #define t %eax - #define s %ecx - - // a window (16 words) of message schedule - #define W0 %xmm0 - #define W1 %xmm1 - #define W2 %xmm2 - #define W3 %xmm3 - - // circular buffer for WK[(r:r+15)%16] - #define WK(x) (x&15)*4(sp) - -// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) - - .macro Ch - mov $0, t // x - mov $0, s // x - not t // ~x - and $1, s // x & y - and $2, t // ~x & z - xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z)); - .endm - -// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) - - .macro Maj - mov $0, t // x - mov $1, s // y - and s, t // x&y - and $2, s // y&z - xor s, t // (x&y) ^ (y&z) - mov $2, s // z - and $0, s // (x&z) - xor s, t // t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) - .endm - -/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */ -// #define R(b,x) ((x) >> (b)) -/* 32-bit Rotate-right (used in SHA-256): */ -// #define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b)))) - -// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x))) - - // performs sigma0_256 on 4 words on an xmm registers - // use xmm6/xmm7 as intermediate registers - .macro sigma0 - movdqa $0, %xmm6 - movdqa $0, %xmm7 - psrld $$3, $0 // SHR3(x) - psrld $$7, %xmm6 // part of ROTR7 - pslld $$14, %xmm7 // part of ROTR18 - pxor %xmm6, $0 - pxor %xmm7, $0 - psrld $$11, %xmm6 // part of ROTR18 - pslld $$11, %xmm7 // part of ROTR7 - pxor %xmm6, $0 - pxor %xmm7, $0 - .endm - -// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x))) - - // performs sigma1_256 on 4 words on an xmm registers - // use xmm6/xmm7 as intermediate registers - .macro sigma1 - movdqa $0, %xmm6 - movdqa $0, %xmm7 - psrld $$10, $0 // SHR10(x) - psrld $$17, %xmm6 // part of ROTR17 - pxor %xmm6, $0 - pslld $$13, %xmm7 // part of ROTR19 - pxor %xmm7, $0 - psrld $$2, %xmm6 // part of ROTR19 - pxor %xmm6, $0 - pslld $$2, %xmm7 // part of ROTR17 - pxor %xmm7, $0 - .endm - -// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) - - .macro Sigma0 - mov $0, t // x - mov $0, s // x - ror $$2, t // S32(2, (x)) - ror $$13, s // S32(13, (x)) - xor s, t // S32(2, (x)) ^ S32(13, (x)) - ror $$9, s // S32(22, (x))
- xor s, t // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) - .endm - -// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x))) - - .macro Sigma1 - mov $0, s // x - ror $$6, s // S32(6, (x)) - mov s, t // S32(6, (x)) - ror $$5, s // S32(11, (x)) - xor s, t // S32(6, (x)) ^ S32(11, (x)) - ror $$14, s // S32(25, (x)) - xor s, t // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x))) - .endm - - // per round digests update - .macro round - Sigma1 $4 // t = T1 - add t, $7 // use h to store h+Sigma1(e) - Ch $4, $5, $6 // t = Ch (e, f, g); - add $7, t // t = h+Sigma1(e)+Ch(e,f,g); - add WK($8), t // h = T1 - add t, $3 // d += T1; - mov t, $7 // h = T1 - Sigma0 $0 // t = Sigma0(a); - add t, $7 // h = T1 + Sigma0(a); - Maj $0, $1, $2 // t = Maj(a,b,c) - add t, $7 // h = T1 + Sigma0(a) + Maj(a,b,c); - .endm - - // per 4 rounds digests update and permutation - // permutation is absorbed by rotating the roles of digests a-h - .macro rounds - round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8 - round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8 - round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8 - round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8 - .endm - - // update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future - .macro message_schedule - - // 4 32-bit K256 words in xmm5 -#if defined (__x86_64__) - movdqu (K), %xmm5 -#else - mov K, t - movdqu (t), %xmm5 -#endif - add $$16, K // K points to next K256 word for next iteration - movdqa $1, %xmm4 // W7:W4 -#if 0 - palignr $$4, $0, %xmm4 // W4:W1 -#else // no-ssse3 implementation of palignr - movdqa $0, %xmm7 - pslldq $$12, %xmm4 - psrldq $$4, %xmm7 - por %xmm7, %xmm4 -#endif - sigma0 %xmm4 // sigma0(W4:W1) - movdqa $3, %xmm6 // W15:W12 - paddd %xmm4, $0 // $0 = W3:W0 + sigma0(W4:W1) -#if 0 - palignr $$4, $2, %xmm6 // W12:W9 -#else // no-ssse3 implementation of palignr - movdqa $2, %xmm7 - pslldq $$12, %xmm6 - psrldq $$4, %xmm7 - por %xmm7, %xmm6 -#endif - paddd %xmm6, $0 // $0 = W12:W9 + sigma0(W4:W1) + W3:W0 - movdqa $3, %xmm4 // W15:W12 - psrldq $$8, %xmm4 // 0,0,W15,W14 - sigma1 %xmm4 // sigma1(0,0,W15,W14) - paddd %xmm4, $0 // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0 - movdqa $0, %xmm4 // W19-sigma1(W17), W18-sigma1(W16), W17, W16 - pslldq $$8, %xmm4 // W17, W16, 0, 0 - sigma1 %xmm4 // sigma1(W17,W16,0,0) - paddd %xmm4, $0 // W19:W16 - paddd $0, %xmm5 // WK - movdqa %xmm5, WK($4) - .endm - - // this macro is used in the last 16 rounds of a current block - // it reads the next message (16 4-byte words), load it into 4 words W[r:r+3], computes WK[r:r+3] - // and save into stack to prepare for next block - - .macro update_W_WK -#if defined (__x86_64__) -#if 0 - movdqu $0*16(data), $1 // read 4 4-byte words - pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3] -#else // no-ssse3 implementation - mov 0+$0*16(data), s - bswap s - mov s, 0+WK($0*4) - mov 4+$0*16(data), s - bswap s - mov s, 4+WK($0*4) - mov 8+$0*16(data), s - bswap s - mov s, 8+WK($0*4) - mov 12+$0*16(data), s - bswap s - mov s, 12+WK($0*4) - movdqa WK($0*4), $1 -#endif - movdqu $0*16(K), %xmm4 // K[r:r+3] -#else - mov data_addr, t -#if 0 - movdqu $0*16(t), $1 // read 4 4-byte words - pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3] -#else // no-ssse3 implementation - mov 0+$0*16(t), s - bswap s - mov s, 0+WK($0*4) - mov 4+$0*16(t), s - bswap s - mov s, 4+WK($0*4) - mov 8+$0*16(t), s - bswap s - mov s, 8+WK($0*4) - mov 12+$0*16(t), s - bswap s - mov s, 12+WK($0*4) - movdqa WK($0*4), $1 -#endif - mov K, t - movdqu $0*16(t), %xmm4 // 
K[r:r+3] -#endif - paddd $1, %xmm4 // WK[r:r+3] - movdqa %xmm4, WK($0*4) // save WK[r:r+3] into stack circular buffer - .endm - - .text - -#if defined (__x86_64__) || defined (__i386__) - - .globl _SHA256_Transform_nossse3 - -_SHA256_Transform_nossse3: - - // push callee-saved registers -#if defined (__x86_64__) - push %rbp - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 -#else - push %ebp - push %ebx - push %esi - push %edi -#endif - - // allocate stack space - sub $stack_size, sp - - // if kernel code, save used xmm registers -#if KERNEL - movdqa %xmm0, 0*16+xmm_save - movdqa %xmm1, 1*16+xmm_save - movdqa %xmm2, 2*16+xmm_save - movdqa %xmm3, 3*16+xmm_save - movdqa %xmm4, 4*16+xmm_save - movdqa %xmm5, 5*16+xmm_save - movdqa %xmm6, 6*16+xmm_save - movdqa %xmm7, 7*16+xmm_save -#endif - - // set up pointer to table K256[] -#if defined (__x86_64__) - lea _K256(%rip), K -#else - lea _K256, t - mov t, K -#endif - - // load W[0:15] into xmm0-xmm3 - .macro mybswap - movl 0+$0*16($1), a - movl 4+$0*16($1), b - movl 8+$0*16($1), e - movl 12+$0*16($1), d - bswap a - bswap b - bswap e - bswap d - movl a, $0*16(sp) - movl b, 4+$0*16(sp) - movl e, 8+$0*16(sp) - movl d, 12+$0*16(sp) - .endm - -#if defined (__x86_64__) - mybswap 0, data - mybswap 1, data - mybswap 2, data - mybswap 3, data - add $64, data -#else - mov data_addr, t - mybswap 0, t - mybswap 1, t - mybswap 2, t - mybswap 3, t - add $64, data_addr -#endif - movdqa 0*16(sp), W0 - movdqa 1*16(sp), W1 - movdqa 2*16(sp), W2 - movdqa 3*16(sp), W3 - - // compute WK[0:15] and save in stack -#if defined (__x86_64__) - movdqu 0*16(K), %xmm4 - movdqu 1*16(K), %xmm5 - movdqu 2*16(K), %xmm6 - movdqu 3*16(K), %xmm7 -#else - mov K, t - movdqu 0*16(t), %xmm4 - movdqu 1*16(t), %xmm5 - movdqu 2*16(t), %xmm6 - movdqu 3*16(t), %xmm7 -#endif - add $64, K - paddd %xmm0, %xmm4 - paddd %xmm1, %xmm5 - paddd %xmm2, %xmm6 - paddd %xmm3, %xmm7 - movdqa %xmm4, WK(0) - movdqa %xmm5, WK(4) - movdqa %xmm6, WK(8) - movdqa %xmm7, WK(12) - -L_loop: - - // digests a-h = ctx->states; -#if defined (__x86_64__) - mov 0*4(ctx), a - mov 1*4(ctx), b - mov 2*4(ctx), c - mov 3*4(ctx), d - mov 4*4(ctx), e - mov 5*4(ctx), f - mov 6*4(ctx), g - mov 7*4(ctx), h -#else - mov ctx_addr, t - mov 0*4(t), a - mov 1*4(t), b - mov 2*4(t), s - mov s, c - mov 3*4(t), d - mov 4*4(t), e - mov 5*4(t), s - mov s, f - mov 6*4(t), g - mov 7*4(t), s - mov s, h -#endif - - // rounds 0:47 interleaved with W/WK update for rounds 16:63 - rounds a, b, c, d, e, f, g, h, 0 - message_schedule W0,W1,W2,W3,16 - rounds e, f, g, h, a, b, c, d, 4 - message_schedule W1,W2,W3,W0,20 - rounds a, b, c, d, e, f, g, h, 8 - message_schedule W2,W3,W0,W1,24 - rounds e, f, g, h, a, b, c, d, 12 - message_schedule W3,W0,W1,W2,28 - rounds a, b, c, d, e, f, g, h, 16 - message_schedule W0,W1,W2,W3,32 - rounds e, f, g, h, a, b, c, d, 20 - message_schedule W1,W2,W3,W0,36 - rounds a, b, c, d, e, f, g, h, 24 - message_schedule W2,W3,W0,W1,40 - rounds e, f, g, h, a, b, c, d, 28 - message_schedule W3,W0,W1,W2,44 - rounds a, b, c, d, e, f, g, h, 32 - message_schedule W0,W1,W2,W3,48 - rounds e, f, g, h, a, b, c, d, 36 - message_schedule W1,W2,W3,W0,52 - rounds a, b, c, d, e, f, g, h, 40 - message_schedule W2,W3,W0,W1,56 - rounds e, f, g, h, a, b, c, d, 44 - message_schedule W3,W0,W1,W2,60 - - // revert K to the beginning of K256[] -#if defined __x86_64__ - sub $256, K -#else - subl $256, K -#endif - - sub $1, num_blocks // num_blocks-- - je L_final_block // if final block, wrap up final rounds - - // rounds 48:63 interleaved 
with W/WK initialization for next block rounds 0:15 - rounds a, b, c, d, e, f, g, h, 48 - update_W_WK 0, W0 - rounds e, f, g, h, a, b, c, d, 52 - update_W_WK 1, W1 - rounds a, b, c, d, e, f, g, h, 56 - update_W_WK 2, W2 - rounds e, f, g, h, a, b, c, d, 60 - update_W_WK 3, W3 - - add $64, K -#if defined (__x86_64__) - add $64, data -#else - add $64, data_addr -#endif - - // ctx->states += digests a-h -#if defined (__x86_64__) - add a, 0*4(ctx) - add b, 1*4(ctx) - add c, 2*4(ctx) - add d, 3*4(ctx) - add e, 4*4(ctx) - add f, 5*4(ctx) - add g, 6*4(ctx) - add h, 7*4(ctx) -#else - mov ctx_addr, t - add a, 0*4(t) - add b, 1*4(t) - mov c, s - add s, 2*4(t) - add d, 3*4(t) - add e, 4*4(t) - mov f, s - add s, 5*4(t) - add g, 6*4(t) - mov h, s - add s, 7*4(t) -#endif - - jmp L_loop // branch for next block - - // wrap up digest update round 48:63 for final block -L_final_block: - rounds a, b, c, d, e, f, g, h, 48 - rounds e, f, g, h, a, b, c, d, 52 - rounds a, b, c, d, e, f, g, h, 56 - rounds e, f, g, h, a, b, c, d, 60 - - // ctx->states += digests a-h -#if defined (__x86_64__) - add a, 0*4(ctx) - add b, 1*4(ctx) - add c, 2*4(ctx) - add d, 3*4(ctx) - add e, 4*4(ctx) - add f, 5*4(ctx) - add g, 6*4(ctx) - add h, 7*4(ctx) -#else - mov ctx_addr, t - add a, 0*4(t) - add b, 1*4(t) - mov c, s - add s, 2*4(t) - add d, 3*4(t) - add e, 4*4(t) - mov f, s - add s, 5*4(t) - add g, 6*4(t) - mov h, s - add s, 7*4(t) -#endif - - // if kernel, restore xmm0-xmm7 -#if KERNEL - movdqa 0*16+xmm_save, %xmm0 - movdqa 1*16+xmm_save, %xmm1 - movdqa 2*16+xmm_save, %xmm2 - movdqa 3*16+xmm_save, %xmm3 - movdqa 4*16+xmm_save, %xmm4 - movdqa 5*16+xmm_save, %xmm5 - movdqa 6*16+xmm_save, %xmm6 - movdqa 7*16+xmm_save, %xmm7 -#endif - - // free allocated stack memory - add $stack_size, sp - - // restore callee-saved registers -#if defined (__x86_64__) - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - pop %rbp -#else - pop %edi - pop %esi - pop %ebx - pop %ebp -#endif - - // return - ret - - -#endif // x86_64/i386 - diff --git a/bsd/crypto/sha2/sha2.c b/bsd/crypto/sha2/sha2.c deleted file mode 100644 index 603d32834..000000000 --- a/bsd/crypto/sha2/sha2.c +++ /dev/null @@ -1,1083 +0,0 @@ -/* $FreeBSD: src/sys/crypto/sha2/sha2.c,v 1.2.2.2 2002/03/05 08:36:47 ume Exp $ */ -/* $KAME: sha2.c,v 1.8 2001/11/08 01:07:52 itojun Exp $ */ - -/* - * sha2.c - * - * Version 1.0.0beta1 - * - * Written by Aaron D. Gifford - * - * Copyright 2000 Aaron D. Gifford. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the copyright holder nor the names of contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) AND CONTRIBUTOR(S) ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR(S) OR CONTRIBUTOR(S) BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - */ - - -#include -#include -#include -#include -#include - -/* - * ASSERT NOTE: - * Some sanity checking code is included using assert(). On my FreeBSD - * system, this additional code can be removed by compiling with NDEBUG - * defined. Check your own system's manpage on assert() to see how to - * compile WITHOUT the sanity checking code on your system. - * - * UNROLLED TRANSFORM LOOP NOTE: - * You can define SHA2_UNROLL_TRANSFORM to use the unrolled transform - * loop version for the hash transform rounds (defined using macros - * later in this file). Either define on the command line, for example: - * - * cc -DSHA2_UNROLL_TRANSFORM -o sha2 sha2.c sha2prog.c - * - * or define below: - * - * #define SHA2_UNROLL_TRANSFORM - * - */ - -#ifndef assert -#define assert(x) do {} while(0) -#endif - -/*** SHA-256/384/512 Machine Architecture Definitions *****************/ -/* - * BYTE_ORDER NOTE: - * - * Please make sure that your system defines BYTE_ORDER. If your - * architecture is little-endian, make sure it also defines - * LITTLE_ENDIAN and that the two (BYTE_ORDER and LITTLE_ENDIAN) are - * equivalent. - * - * If your system does not define the above, then you can do so by - * hand like this: - * - * #define LITTLE_ENDIAN 1234 - * #define BIG_ENDIAN 4321 - * - * And for little-endian machines, add: - * - * #define BYTE_ORDER LITTLE_ENDIAN - * - * Or for big-endian machines: - * - * #define BYTE_ORDER BIG_ENDIAN - * - * The FreeBSD machine this was written on defines BYTE_ORDER - * appropriately by including <sys/types.h> (which in turn includes - * <machine/endian.h> where the appropriate definitions are actually - * made). - */ -#if !defined(BYTE_ORDER) || (BYTE_ORDER != LITTLE_ENDIAN && BYTE_ORDER != BIG_ENDIAN) -#error Define BYTE_ORDER to be equal to either LITTLE_ENDIAN or BIG_ENDIAN -#endif - -/* - * Define the following sha2_* types to types of the correct length on - * the native architecture. Most BSD systems and Linux define u_intXX_t - * types. Machines with very recent ANSI C headers can use the - * uintXX_t definitions from inttypes.h by defining SHA2_USE_INTTYPES_H - * during compile or in the sha.h header file. - * - * Machines that support neither u_intXX_t nor inttypes.h's uintXX_t - * will need to define these three typedefs below (and the appropriate - * ones in sha.h too) by hand according to their system architecture. - * - * Thank you, Jun-ichiro itojun Hagino, for suggesting using u_intXX_t - * types and pointing out recent ANSI C support for uintXX_t in inttypes.h.
- */ -#if 0 /*def SHA2_USE_INTTYPES_H*/ - -typedef uint8_t sha2_byte; /* Exactly 1 byte */ -typedef uint32_t sha2_word32; /* Exactly 4 bytes */ -typedef uint64_t sha2_word64; /* Exactly 8 bytes */ - -#else /* SHA2_USE_INTTYPES_H */ - -typedef u_int8_t sha2_byte; /* Exactly 1 byte */ -typedef u_int32_t sha2_word32; /* Exactly 4 bytes */ -typedef u_int64_t sha2_word64; /* Exactly 8 bytes */ - -#endif /* SHA2_USE_INTTYPES_H */ - - -/*** SHA-256/384/512 Various Length Definitions ***********************/ -/* NOTE: Most of these are in sha2.h */ -#define SHA256_SHORT_BLOCK_LENGTH (SHA256_BLOCK_LENGTH - 8) -#define SHA384_SHORT_BLOCK_LENGTH (SHA384_BLOCK_LENGTH - 16) -#define SHA512_SHORT_BLOCK_LENGTH (SHA512_BLOCK_LENGTH - 16) - - -/*** ENDIAN REVERSAL MACROS *******************************************/ -#if BYTE_ORDER == LITTLE_ENDIAN -#define REVERSE32(w,x) { \ - sha2_word32 tmp = (w); \ - tmp = (tmp >> 16) | (tmp << 16); \ - (x) = ((tmp & 0xff00ff00UL) >> 8) | ((tmp & 0x00ff00ffUL) << 8); \ -} -#define REVERSE64(w,x) { \ - sha2_word64 tmp = (w); \ - tmp = (tmp >> 32) | (tmp << 32); \ - tmp = ((tmp & 0xff00ff00ff00ff00ULL) >> 8) | \ - ((tmp & 0x00ff00ff00ff00ffULL) << 8); \ - (x) = ((tmp & 0xffff0000ffff0000ULL) >> 16) | \ - ((tmp & 0x0000ffff0000ffffULL) << 16); \ -} -#endif /* BYTE_ORDER == LITTLE_ENDIAN */ - -/* - * Macro for incrementally adding the unsigned 64-bit integer n to the - * unsigned 128-bit integer (represented using a two-element array of - * 64-bit words): - */ -#define ADDINC128(w,n) { \ - (w)[0] += (sha2_word64)(n); \ - if ((w)[0] < (n)) { \ - (w)[1]++; \ - } \ -} - -/*** THE SIX LOGICAL FUNCTIONS ****************************************/ -/* - * Bit shifting and rotation (used by the six SHA-XYZ logical functions: - * - * NOTE: The naming of R and S appears backwards here (R is a SHIFT and - * S is a ROTATION) because the SHA-256/384/512 description document - * (see http://csrc.nist.gov/cryptval/shs/sha256-384-512.pdf) uses this - * same "backwards" definition. - */ -/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */ -#define R(b,x) ((x) >> (b)) -/* 32-bit Rotate-right (used in SHA-256): */ -#define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b)))) -/* 64-bit Rotate-right (used in SHA-384 and SHA-512): */ -#define S64(b,x) (((x) >> (b)) | ((x) << (64 - (b)))) - -/* Two of six logical functions used in SHA-256, SHA-384, and SHA-512: */ -#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) -#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) - -/* Four of six logical functions used in SHA-256: */ -#define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) -#define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x))) -#define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x))) -#define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x))) - -/* Four of six logical functions used in SHA-384 and SHA-512: */ -#define Sigma0_512(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x))) -#define Sigma1_512(x) (S64(14, (x)) ^ S64(18, (x)) ^ S64(41, (x))) -#define sigma0_512(x) (S64( 1, (x)) ^ S64( 8, (x)) ^ R( 7, (x))) -#define sigma1_512(x) (S64(19, (x)) ^ S64(61, (x)) ^ R( 6, (x))) - -/*** INTERNAL FUNCTION PROTOTYPES *************************************/ -/* NOTE: These should not be accessed directly from outside this - * library -- they are intended for private internal visibility/use - * only. 
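A small self-checking example of the two helper macros defined just above, since their intent is easy to misread in macro form (the values are arbitrary, and the demo assumes the surrounding file's sha2_word32/sha2_word64 typedefs):

/* REVERSE32 byte-reverses a word in two steps: a 16-bit rotate, then a
 * byte swap within each halfword. ADDINC128 maintains the 128-bit bit
 * count as a two-element array of 64-bit words with a manual carry. */
static void
sha2_macro_demo(void)
{
	sha2_word32 w = 0x11223344UL, x;
	sha2_word64 total[2] = { 0, 0 };

	REVERSE32(w, x);	/* x == 0x44332211 */
	ADDINC128(total, 512);	/* one 64-byte block: total[0] == 512,
				   carrying into total[1] on wraparound */
}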
- */ -void SHA512_Last(SHA512_CTX*); -#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) -void SHA256_Transform(SHA256_CTX*, const sha2_word32*, unsigned int num_blocks); -#else -void SHA256_Transform(SHA256_CTX*, const sha2_word32*); -#endif -void SHA512_Transform(SHA512_CTX*, const sha2_word64*); - - -/*** SHA-XYZ INITIAL HASH VALUES AND CONSTANTS ************************/ -/* Hash constant words K for SHA-256: */ -#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) -const sha2_word32 K256[64] = { // assembly code will need to read this table -#else -static const sha2_word32 K256[64] = { -#endif - 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, - 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, - 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, - 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, - 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, - 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, - 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, - 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, - 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, - 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, - 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, - 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, - 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, - 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, - 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, - 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL -}; - -/* Initial hash value H for SHA-256: */ -static const sha2_word32 sha256_initial_hash_value[8] = { - 0x6a09e667UL, - 0xbb67ae85UL, - 0x3c6ef372UL, - 0xa54ff53aUL, - 0x510e527fUL, - 0x9b05688cUL, - 0x1f83d9abUL, - 0x5be0cd19UL -}; - -/* Hash constant words K for SHA-384 and SHA-512: */ -static const sha2_word64 K512[80] = { - 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, - 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, - 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, - 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, - 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, - 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, - 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, - 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, - 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, - 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, - 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, - 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, - 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, - 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, - 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, - 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, - 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, - 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, - 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, - 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, - 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, - 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, - 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, - 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, - 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, - 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, - 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, - 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, - 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, - 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, - 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, - 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, - 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, - 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, - 
0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, - 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, - 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, - 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, - 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, - 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL -}; - -/* Initial hash value H for SHA-384 */ -static const sha2_word64 sha384_initial_hash_value[8] = { - 0xcbbb9d5dc1059ed8ULL, - 0x629a292a367cd507ULL, - 0x9159015a3070dd17ULL, - 0x152fecd8f70e5939ULL, - 0x67332667ffc00b31ULL, - 0x8eb44a8768581511ULL, - 0xdb0c2e0d64f98fa7ULL, - 0x47b5481dbefa4fa4ULL -}; - -/* Initial hash value H for SHA-512 */ -static const sha2_word64 sha512_initial_hash_value[8] = { - 0x6a09e667f3bcc908ULL, - 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, - 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, - 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, - 0x5be0cd19137e2179ULL -}; - -/* - * Constant used by SHA256/384/512_End() functions for converting the - * digest to a readable hexadecimal character string: - */ -static const char *sha2_hex_digits = "0123456789abcdef"; - - -/*** SHA-256: *********************************************************/ -void SHA256_Init(SHA256_CTX* context) { - if (context == (SHA256_CTX*)0) { - return; - } - bcopy(sha256_initial_hash_value, context->state, SHA256_DIGEST_LENGTH); - bzero(context->buffer, SHA256_BLOCK_LENGTH); - context->bitcount = 0; -} - -#if !(defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__))) - -#ifdef SHA2_UNROLL_TRANSFORM - -/* Unrolled SHA-256 round macros: */ - -#if BYTE_ORDER == LITTLE_ENDIAN - -#define ROUND256_0_TO_15(a,b,c,d,e,f,g,h) \ - REVERSE32(*data++, W256[j]); \ - T1 = (h) + Sigma1_256(e) + Ch((e), (f), (g)) + \ - K256[j] + W256[j]; \ - (d) += T1; \ - (h) = T1 + Sigma0_256(a) + Maj((a), (b), (c)); \ - j++ - - -#else /* BYTE_ORDER == LITTLE_ENDIAN */ - -#define ROUND256_0_TO_15(a,b,c,d,e,f,g,h) \ - T1 = (h) + Sigma1_256(e) + Ch((e), (f), (g)) + \ - K256[j] + (W256[j] = *data++); \ - (d) += T1; \ - (h) = T1 + Sigma0_256(a) + Maj((a), (b), (c)); \ - j++ - -#endif /* BYTE_ORDER == LITTLE_ENDIAN */ - -#define ROUND256(a,b,c,d,e,f,g,h) \ - s0 = W256[(j+1)&0x0f]; \ - s0 = sigma0_256(s0); \ - s1 = W256[(j+14)&0x0f]; \ - s1 = sigma1_256(s1); \ - T1 = (h) + Sigma1_256(e) + Ch((e), (f), (g)) + K256[j] + \ - (W256[j&0x0f] += s1 + W256[(j+9)&0x0f] + s0); \ - (d) += T1; \ - (h) = T1 + Sigma0_256(a) + Maj((a), (b), (c)); \ - j++ - -void SHA256_Transform(SHA256_CTX* context, const sha2_word32* data) { - sha2_word32 a, b, c, d, e, f, g, h, s0, s1; - sha2_word32 T1, *W256; - int j; - - W256 = (sha2_word32*)context->buffer; - - /* Initialize registers with the prev. 
intermediate value */ - a = context->state[0]; - b = context->state[1]; - c = context->state[2]; - d = context->state[3]; - e = context->state[4]; - f = context->state[5]; - g = context->state[6]; - h = context->state[7]; - - j = 0; - do { - /* Rounds 0 to 15 (unrolled): */ - ROUND256_0_TO_15(a,b,c,d,e,f,g,h); - ROUND256_0_TO_15(h,a,b,c,d,e,f,g); - ROUND256_0_TO_15(g,h,a,b,c,d,e,f); - ROUND256_0_TO_15(f,g,h,a,b,c,d,e); - ROUND256_0_TO_15(e,f,g,h,a,b,c,d); - ROUND256_0_TO_15(d,e,f,g,h,a,b,c); - ROUND256_0_TO_15(c,d,e,f,g,h,a,b); - ROUND256_0_TO_15(b,c,d,e,f,g,h,a); - } while (j < 16); - - /* Now for the remaining rounds to 64: */ - do { - ROUND256(a,b,c,d,e,f,g,h); - ROUND256(h,a,b,c,d,e,f,g); - ROUND256(g,h,a,b,c,d,e,f); - ROUND256(f,g,h,a,b,c,d,e); - ROUND256(e,f,g,h,a,b,c,d); - ROUND256(d,e,f,g,h,a,b,c); - ROUND256(c,d,e,f,g,h,a,b); - ROUND256(b,c,d,e,f,g,h,a); - } while (j < 64); - - /* Compute the current intermediate hash value */ - context->state[0] += a; - context->state[1] += b; - context->state[2] += c; - context->state[3] += d; - context->state[4] += e; - context->state[5] += f; - context->state[6] += g; - context->state[7] += h; - - /* Clean up */ - a = b = c = d = e = f = g = h = T1 = 0; -} - -#else /* SHA2_UNROLL_TRANSFORM */ - -void SHA256_Transform(SHA256_CTX* context, const sha2_word32* data) { - sha2_word32 a, b, c, d, e, f, g, h, s0, s1; - sha2_word32 T1, T2, *W256; - int j; - - W256 = (sha2_word32*)context->buffer; - - /* Initialize registers with the prev. intermediate value */ - a = context->state[0]; - b = context->state[1]; - c = context->state[2]; - d = context->state[3]; - e = context->state[4]; - f = context->state[5]; - g = context->state[6]; - h = context->state[7]; - - j = 0; - do { -#if BYTE_ORDER == LITTLE_ENDIAN - /* Copy data while converting to host byte order */ - REVERSE32(*data++,W256[j]); - /* Apply the SHA-256 compression function to update a..h */ - T1 = h + Sigma1_256(e) + Ch(e, f, g) + K256[j] + W256[j]; -#else /* BYTE_ORDER == LITTLE_ENDIAN */ - /* Apply the SHA-256 compression function to update a..h with copy */ - T1 = h + Sigma1_256(e) + Ch(e, f, g) + K256[j] + (W256[j] = *data++); -#endif /* BYTE_ORDER == LITTLE_ENDIAN */ - T2 = Sigma0_256(a) + Maj(a, b, c); - h = g; - g = f; - f = e; - e = d + T1; - d = c; - c = b; - b = a; - a = T1 + T2; - - j++; - } while (j < 16); - - do { - /* Part of the message block expansion: */ - s0 = W256[(j+1)&0x0f]; - s0 = sigma0_256(s0); - s1 = W256[(j+14)&0x0f]; - s1 = sigma1_256(s1); - - /* Apply the SHA-256 compression function to update a..h */ - T1 = h + Sigma1_256(e) + Ch(e, f, g) + K256[j] + - (W256[j&0x0f] += s1 + W256[(j+9)&0x0f] + s0); - T2 = Sigma0_256(a) + Maj(a, b, c); - h = g; - g = f; - f = e; - e = d + T1; - d = c; - c = b; - b = a; - a = T1 + T2; - - j++; - } while (j < 64); - - /* Compute the current intermediate hash value */ - context->state[0] += a; - context->state[1] += b; - context->state[2] += c; - context->state[3] += d; - context->state[4] += e; - context->state[5] += f; - context->state[6] += g; - context->state[7] += h; - - /* Clean up */ - a = b = c = d = e = f = g = h = T1 = T2 = 0; -} - -#endif /* SHA2_UNROLL_TRANSFORM */ - -#endif // defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) - -void SHA256_Update(SHA256_CTX* context, const sha2_byte *data, size_t len) { - unsigned int freespace, usedspace; - - if (len == 0) { - /* Calling with no data is valid - we do nothing */ - return; - } - - /* Sanity check: */ - assert(context != (SHA256_CTX*)0 && data != 
(sha2_byte*)0); - - usedspace = (context->bitcount >> 3) % SHA256_BLOCK_LENGTH; - if (usedspace > 0) { - /* Calculate how much free space is available in the buffer */ - freespace = SHA256_BLOCK_LENGTH - usedspace; - - if (len >= freespace) { - /* Fill the buffer completely and process it */ - bcopy(data, &context->buffer[usedspace], freespace); - context->bitcount += freespace << 3; - len -= freespace; - data += freespace; -#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) - SHA256_Transform(context, (sha2_word32*)context->buffer, 1); -#else - SHA256_Transform(context, (sha2_word32*)context->buffer); -#endif - } else { - /* The buffer is not yet full */ - bcopy(data, &context->buffer[usedspace], len); - context->bitcount += len << 3; - /* Clean up: */ - usedspace = freespace = 0; - return; - } - } -#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) - { - unsigned int kk = len/SHA256_BLOCK_LENGTH; - if (kk>0) { - SHA256_Transform(context, (const sha2_word32*)data, kk); - context->bitcount += (SHA256_BLOCK_LENGTH << 3)*kk; - len -= SHA256_BLOCK_LENGTH*kk; - data += SHA256_BLOCK_LENGTH*kk; - } - } -#else - while (len >= SHA256_BLOCK_LENGTH) { - /* Process as many complete blocks as we can */ - SHA256_Transform(context, (const sha2_word32*)data); - context->bitcount += SHA256_BLOCK_LENGTH << 3; - len -= SHA256_BLOCK_LENGTH; - data += SHA256_BLOCK_LENGTH; - } -#endif - if (len > 0) { - /* There's left-overs, so save 'em */ - bcopy(data, context->buffer, len); - context->bitcount += len << 3; - } - /* Clean up: */ - usedspace = freespace = 0; -} - -void SHA256_Final(sha2_byte digest[], SHA256_CTX* context) { - sha2_word32 *d = (sha2_word32*)digest; - unsigned int usedspace; - - /* Sanity check: */ - assert(context != (SHA256_CTX*)0); - - /* If no digest buffer is passed, we don't bother doing this: */ - if (digest != (sha2_byte*)0) { - usedspace = (context->bitcount >> 3) % SHA256_BLOCK_LENGTH; -#if BYTE_ORDER == LITTLE_ENDIAN - /* Convert FROM host byte order */ - REVERSE64(context->bitcount,context->bitcount); -#endif - if (usedspace > 0) { - /* Begin padding with a 1 bit: */ - context->buffer[usedspace++] = 0x80; - - if (usedspace <= SHA256_SHORT_BLOCK_LENGTH) { - /* Set-up for the last transform: */ - bzero(&context->buffer[usedspace], SHA256_SHORT_BLOCK_LENGTH - usedspace); - } else { - if (usedspace < SHA256_BLOCK_LENGTH) { - bzero(&context->buffer[usedspace], SHA256_BLOCK_LENGTH - usedspace); - } - /* Do second-to-last transform: */ -#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) - SHA256_Transform(context, (sha2_word32*)context->buffer, 1); -#else - SHA256_Transform(context, (sha2_word32*)context->buffer); -#endif - - /* And set-up for the last transform: */ - bzero(context->buffer, SHA256_SHORT_BLOCK_LENGTH); - } - } else { - /* Set-up for the last transform: */ - bzero(context->buffer, SHA256_SHORT_BLOCK_LENGTH); - - /* Begin padding with a 1 bit: */ - *context->buffer = 0x80; - } - /* Set the bit count: */ - *(sha2_word64*)&context->buffer[SHA256_SHORT_BLOCK_LENGTH] = context->bitcount; - - /* Final transform: */ -#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) - SHA256_Transform(context, (sha2_word32*)context->buffer, 1); -#else - SHA256_Transform(context, (sha2_word32*)context->buffer); -#endif - -#if BYTE_ORDER == LITTLE_ENDIAN - { - /* Convert TO host byte order */ - int j; - for (j = 0; j < 8; j++) { - REVERSE32(context->state[j],context->state[j]); - *d++ = 
context->state[j]; - } - } -#else - bcopy(context->state, d, SHA256_DIGEST_LENGTH); -#endif - } - - /* Clean up state data: */ - bzero(context, sizeof(context)); - usedspace = 0; -} - -char *SHA256_End(SHA256_CTX* context, char buffer[]) { - sha2_byte digest[SHA256_DIGEST_LENGTH], *d = digest; - int i; - - /* Sanity check: */ - assert(context != (SHA256_CTX*)0); - - if (buffer != (char*)0) { - SHA256_Final(digest, context); - - for (i = 0; i < SHA256_DIGEST_LENGTH; i++) { - *buffer++ = sha2_hex_digits[(*d & 0xf0) >> 4]; - *buffer++ = sha2_hex_digits[*d & 0x0f]; - d++; - } - *buffer = (char)0; - } else { - bzero(context, sizeof(context)); - } - bzero(digest, SHA256_DIGEST_LENGTH); - return buffer; -} - -char* SHA256_Data(const sha2_byte* data, size_t len, char digest[SHA256_DIGEST_STRING_LENGTH]) { - SHA256_CTX context; - - SHA256_Init(&context); - SHA256_Update(&context, data, len); - return SHA256_End(&context, digest); -} - - -/*** SHA-512: *********************************************************/ -void SHA512_Init(SHA512_CTX* context) { - if (context == (SHA512_CTX*)0) { - return; - } - bcopy(sha512_initial_hash_value, context->state, SHA512_DIGEST_LENGTH); - bzero(context->buffer, SHA512_BLOCK_LENGTH); - context->bitcount[0] = context->bitcount[1] = 0; -} - -#ifdef SHA2_UNROLL_TRANSFORM - -/* Unrolled SHA-512 round macros: */ -#if BYTE_ORDER == LITTLE_ENDIAN - -#define ROUND512_0_TO_15(a,b,c,d,e,f,g,h) \ - REVERSE64(*data++, W512[j]); \ - T1 = (h) + Sigma1_512(e) + Ch((e), (f), (g)) + \ - K512[j] + W512[j]; \ - (d) += T1, \ - (h) = T1 + Sigma0_512(a) + Maj((a), (b), (c)), \ - j++ - - -#else /* BYTE_ORDER == LITTLE_ENDIAN */ - -#define ROUND512_0_TO_15(a,b,c,d,e,f,g,h) \ - T1 = (h) + Sigma1_512(e) + Ch((e), (f), (g)) + \ - K512[j] + (W512[j] = *data++); \ - (d) += T1; \ - (h) = T1 + Sigma0_512(a) + Maj((a), (b), (c)); \ - j++ - -#endif /* BYTE_ORDER == LITTLE_ENDIAN */ - -#define ROUND512(a,b,c,d,e,f,g,h) \ - s0 = W512[(j+1)&0x0f]; \ - s0 = sigma0_512(s0); \ - s1 = W512[(j+14)&0x0f]; \ - s1 = sigma1_512(s1); \ - T1 = (h) + Sigma1_512(e) + Ch((e), (f), (g)) + K512[j] + \ - (W512[j&0x0f] += s1 + W512[(j+9)&0x0f] + s0); \ - (d) += T1; \ - (h) = T1 + Sigma0_512(a) + Maj((a), (b), (c)); \ - j++ - -void SHA512_Transform(SHA512_CTX* context, const sha2_word64* data) { - sha2_word64 a, b, c, d, e, f, g, h, s0, s1; - sha2_word64 T1, *W512 = (sha2_word64*)context->buffer; - int j; - - /* Initialize registers with the prev. 
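Two observations on the SHA256_End code just shown. First, because buffer is advanced as the hex digits are written, the function returns a pointer to the terminating NUL rather than to the start of the string, and SHA256_Data propagates that pointer. Second, bzero(context, sizeof(context)) zeroes only pointer-size bytes (4 or 8), not the whole context; sizeof(*context) was presumably intended. A stand-alone sketch of the same hex conversion that returns the start of the string instead; digest_to_hex is a local name:

#include <stddef.h>

static const char hex[] = "0123456789abcdef";

/* out must hold 2*len + 1 bytes, matching the *_DIGEST_STRING_LENGTH
 * definitions (two hex digits per byte plus a NUL). */
static char *digest_to_hex(const unsigned char *d, size_t len, char *out)
{
    char *p = out;
    for (size_t i = 0; i < len; i++) {
        *p++ = hex[d[i] >> 4];
        *p++ = hex[d[i] & 0x0f];
    }
    *p = '\0';
    return out;      /* start of the string, unlike SHA*_End above */
}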
intermediate value */ - a = context->state[0]; - b = context->state[1]; - c = context->state[2]; - d = context->state[3]; - e = context->state[4]; - f = context->state[5]; - g = context->state[6]; - h = context->state[7]; - - j = 0; - do { - ROUND512_0_TO_15(a,b,c,d,e,f,g,h); - ROUND512_0_TO_15(h,a,b,c,d,e,f,g); - ROUND512_0_TO_15(g,h,a,b,c,d,e,f); - ROUND512_0_TO_15(f,g,h,a,b,c,d,e); - ROUND512_0_TO_15(e,f,g,h,a,b,c,d); - ROUND512_0_TO_15(d,e,f,g,h,a,b,c); - ROUND512_0_TO_15(c,d,e,f,g,h,a,b); - ROUND512_0_TO_15(b,c,d,e,f,g,h,a); - } while (j < 16); - - /* Now for the remaining rounds up to 79: */ - do { - ROUND512(a,b,c,d,e,f,g,h); - ROUND512(h,a,b,c,d,e,f,g); - ROUND512(g,h,a,b,c,d,e,f); - ROUND512(f,g,h,a,b,c,d,e); - ROUND512(e,f,g,h,a,b,c,d); - ROUND512(d,e,f,g,h,a,b,c); - ROUND512(c,d,e,f,g,h,a,b); - ROUND512(b,c,d,e,f,g,h,a); - } while (j < 80); - - /* Compute the current intermediate hash value */ - context->state[0] += a; - context->state[1] += b; - context->state[2] += c; - context->state[3] += d; - context->state[4] += e; - context->state[5] += f; - context->state[6] += g; - context->state[7] += h; - - /* Clean up */ - a = b = c = d = e = f = g = h = T1 = 0; -} - -#else /* SHA2_UNROLL_TRANSFORM */ - -void SHA512_Transform(SHA512_CTX* context, const sha2_word64* data) { - sha2_word64 a, b, c, d, e, f, g, h, s0, s1; - sha2_word64 T1, T2, *W512 = (sha2_word64*)context->buffer; - int j; - - /* Initialize registers with the prev. intermediate value */ - a = context->state[0]; - b = context->state[1]; - c = context->state[2]; - d = context->state[3]; - e = context->state[4]; - f = context->state[5]; - g = context->state[6]; - h = context->state[7]; - - j = 0; - do { -#if BYTE_ORDER == LITTLE_ENDIAN - /* Convert TO host byte order */ - REVERSE64(*data++, W512[j]); - /* Apply the SHA-512 compression function to update a..h */ - T1 = h + Sigma1_512(e) + Ch(e, f, g) + K512[j] + W512[j]; -#else /* BYTE_ORDER == LITTLE_ENDIAN */ - /* Apply the SHA-512 compression function to update a..h with copy */ - T1 = h + Sigma1_512(e) + Ch(e, f, g) + K512[j] + (W512[j] = *data++); -#endif /* BYTE_ORDER == LITTLE_ENDIAN */ - T2 = Sigma0_512(a) + Maj(a, b, c); - h = g; - g = f; - f = e; - e = d + T1; - d = c; - c = b; - b = a; - a = T1 + T2; - - j++; - } while (j < 16); - - do { - /* Part of the message block expansion: */ - s0 = W512[(j+1)&0x0f]; - s0 = sigma0_512(s0); - s1 = W512[(j+14)&0x0f]; - s1 = sigma1_512(s1); - - /* Apply the SHA-512 compression function to update a..h */ - T1 = h + Sigma1_512(e) + Ch(e, f, g) + K512[j] + - (W512[j&0x0f] += s1 + W512[(j+9)&0x0f] + s0); - T2 = Sigma0_512(a) + Maj(a, b, c); - h = g; - g = f; - f = e; - e = d + T1; - d = c; - c = b; - b = a; - a = T1 + T2; - - j++; - } while (j < 80); - - /* Compute the current intermediate hash value */ - context->state[0] += a; - context->state[1] += b; - context->state[2] += c; - context->state[3] += d; - context->state[4] += e; - context->state[5] += f; - context->state[6] += g; - context->state[7] += h; - - /* Clean up */ - a = b = c = d = e = f = g = h = T1 = T2 = 0; -} - -#endif /* SHA2_UNROLL_TRANSFORM */ - -void SHA512_Update(SHA512_CTX* context, const sha2_byte *data, size_t len) { - unsigned int freespace, usedspace; - - if (len == 0) { - /* Calling with no data is valid - we do nothing */ - return; - } - - /* Sanity check: */ - assert(context != (SHA512_CTX*)0 && data != (sha2_byte*)0); - - usedspace = (context->bitcount[0] >> 3) % SHA512_BLOCK_LENGTH; - if (usedspace > 0) { - /* Calculate how much free space is 
available in the buffer */ - freespace = SHA512_BLOCK_LENGTH - usedspace; - - if (len >= freespace) { - /* Fill the buffer completely and process it */ - bcopy(data, &context->buffer[usedspace], freespace); - ADDINC128(context->bitcount, freespace << 3); - len -= freespace; - data += freespace; - SHA512_Transform(context, (sha2_word64*)context->buffer); - } else { - /* The buffer is not yet full */ - bcopy(data, &context->buffer[usedspace], len); - ADDINC128(context->bitcount, len << 3); - /* Clean up: */ - usedspace = freespace = 0; - return; - } - } - while (len >= SHA512_BLOCK_LENGTH) { - /* Process as many complete blocks as we can */ - SHA512_Transform(context, (const sha2_word64*)data); - ADDINC128(context->bitcount, SHA512_BLOCK_LENGTH << 3); - len -= SHA512_BLOCK_LENGTH; - data += SHA512_BLOCK_LENGTH; - } - if (len > 0) { - /* There's left-overs, so save 'em */ - bcopy(data, context->buffer, len); - ADDINC128(context->bitcount, len << 3); - } - /* Clean up: */ - usedspace = freespace = 0; -} - -void SHA512_Last(SHA512_CTX* context) { - unsigned int usedspace; - - usedspace = (context->bitcount[0] >> 3) % SHA512_BLOCK_LENGTH; -#if BYTE_ORDER == LITTLE_ENDIAN - /* Convert FROM host byte order */ - REVERSE64(context->bitcount[0],context->bitcount[0]); - REVERSE64(context->bitcount[1],context->bitcount[1]); -#endif - if (usedspace > 0) { - /* Begin padding with a 1 bit: */ - context->buffer[usedspace++] = 0x80; - - if (usedspace <= SHA512_SHORT_BLOCK_LENGTH) { - /* Set-up for the last transform: */ - bzero(&context->buffer[usedspace], SHA512_SHORT_BLOCK_LENGTH - usedspace); - } else { - if (usedspace < SHA512_BLOCK_LENGTH) { - bzero(&context->buffer[usedspace], SHA512_BLOCK_LENGTH - usedspace); - } - /* Do second-to-last transform: */ - SHA512_Transform(context, (sha2_word64*)context->buffer); - - /* And set-up for the last transform: */ - bzero(context->buffer, SHA512_BLOCK_LENGTH - 2); - } - } else { - /* Prepare for final transform: */ - bzero(context->buffer, SHA512_SHORT_BLOCK_LENGTH); - - /* Begin padding with a 1 bit: */ - *context->buffer = 0x80; - } - /* Store the length of input data (in bits): */ - *(sha2_word64*)&context->buffer[SHA512_SHORT_BLOCK_LENGTH] = context->bitcount[1]; - *(sha2_word64*)&context->buffer[SHA512_SHORT_BLOCK_LENGTH+8] = context->bitcount[0]; - - /* Final transform: */ - SHA512_Transform(context, (sha2_word64*)context->buffer); -} - -void SHA512_Final(sha2_byte digest[], SHA512_CTX* context) { - sha2_word64 *d = (sha2_word64*)digest; - - /* Sanity check: */ - assert(context != (SHA512_CTX*)0); - - /* If no digest buffer is passed, we don't bother doing this: */ - if (digest != (sha2_byte*)0) { - SHA512_Last(context); - - /* Save the hash data for output: */ -#if BYTE_ORDER == LITTLE_ENDIAN - { - /* Convert TO host byte order */ - int j; - for (j = 0; j < 8; j++) { - REVERSE64(context->state[j],context->state[j]); - *d++ = context->state[j]; - } - } -#else - bcopy(context->state, d, SHA512_DIGEST_LENGTH); -#endif - } - - /* Zero out state data */ - bzero(context, sizeof(context)); -} - -char *SHA512_End(SHA512_CTX* context, char buffer[]) { - sha2_byte digest[SHA512_DIGEST_LENGTH], *d = digest; - int i; - - /* Sanity check: */ - assert(context != (SHA512_CTX*)0); - - if (buffer != (char*)0) { - SHA512_Final(digest, context); - - for (i = 0; i < SHA512_DIGEST_LENGTH; i++) { - *buffer++ = sha2_hex_digits[(*d & 0xf0) >> 4]; - *buffer++ = sha2_hex_digits[*d & 0x0f]; - d++; - } - *buffer = (char)0; - } else { - bzero(context, sizeof(context)); - } - 
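The SHA512_Last routine above implements the standard SHA-2 finalization: append a 0x80 byte, zero-fill up to the last 16 bytes of a 128-byte block (spilling into a second block when necessary), then store the 128-bit message length in bits, big-endian, high word first; ADDINC128 maintains that two-word counter during updates. A hypothetical stand-alone sketch of the padding arithmetic:

#include <stdint.h>
#include <string.h>

#define BLK 128                 /* SHA512_BLOCK_LENGTH */
#define SHORT_BLK (BLK - 16)    /* SHA512_SHORT_BLOCK_LENGTH */

static void store_be64(unsigned char *p, uint64_t v)
{
    for (int i = 7; i >= 0; i--) { p[i] = (unsigned char)v; v >>= 8; }
}

/* Pads buf (first `used` bytes valid, buf sized for two blocks) and
 * returns how many blocks must still be compressed: 1 if the length
 * field fit after the 0x80 byte, 2 if padding spilled over. */
static int pad_blocks(unsigned char buf[2 * BLK], size_t used,
                      uint64_t bitcount_hi, uint64_t bitcount_lo)
{
    size_t n = used;
    buf[n++] = 0x80;                          /* the mandatory 1 bit */
    int blocks = (n <= SHORT_BLK) ? 1 : 2;
    size_t end = (size_t)blocks * BLK - 16;
    memset(buf + n, 0, end - n);              /* zero fill */
    store_be64(buf + end, bitcount_hi);       /* high 64 bits first */
    store_be64(buf + end + 8, bitcount_lo);
    return blocks;
}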
bzero(digest, SHA512_DIGEST_LENGTH); - return buffer; -} - -char* SHA512_Data(const sha2_byte* data, size_t len, char digest[SHA512_DIGEST_STRING_LENGTH]) { - SHA512_CTX context; - - SHA512_Init(&context); - SHA512_Update(&context, data, len); - return SHA512_End(&context, digest); -} - - -/*** SHA-384: *********************************************************/ -void SHA384_Init(SHA384_CTX* context) { - if (context == (SHA384_CTX*)0) { - return; - } - bcopy(sha384_initial_hash_value, context->state, SHA512_DIGEST_LENGTH); - bzero(context->buffer, SHA384_BLOCK_LENGTH); - context->bitcount[0] = context->bitcount[1] = 0; -} - -void SHA384_Update(SHA384_CTX* context, const sha2_byte* data, size_t len) { - SHA512_Update((SHA512_CTX*)context, data, len); -} - -void SHA384_Final(sha2_byte digest[], SHA384_CTX* context) { - sha2_word64 *d = (sha2_word64*)digest; - - /* Sanity check: */ - assert(context != (SHA384_CTX*)0); - - /* If no digest buffer is passed, we don't bother doing this: */ - if (digest != (sha2_byte*)0) { - SHA512_Last((SHA512_CTX*)context); - - /* Save the hash data for output: */ -#if BYTE_ORDER == LITTLE_ENDIAN - { - /* Convert TO host byte order */ - int j; - for (j = 0; j < 6; j++) { - REVERSE64(context->state[j],context->state[j]); - *d++ = context->state[j]; - } - } -#else - bcopy(context->state, d, SHA384_DIGEST_LENGTH); -#endif - } - - /* Zero out state data */ - bzero(context, sizeof(context)); -} - -char *SHA384_End(SHA384_CTX* context, char buffer[]) { - sha2_byte digest[SHA384_DIGEST_LENGTH], *d = digest; - int i; - - /* Sanity check: */ - assert(context != (SHA384_CTX*)0); - - if (buffer != (char*)0) { - SHA384_Final(digest, context); - - for (i = 0; i < SHA384_DIGEST_LENGTH; i++) { - *buffer++ = sha2_hex_digits[(*d & 0xf0) >> 4]; - *buffer++ = sha2_hex_digits[*d & 0x0f]; - d++; - } - *buffer = (char)0; - } else { - bzero(context, sizeof(context)); - } - bzero(digest, SHA384_DIGEST_LENGTH); - return buffer; -} - -char* SHA384_Data(const sha2_byte* data, size_t len, char digest[SHA384_DIGEST_STRING_LENGTH]) { - SHA384_CTX context; - - SHA384_Init(&context); - SHA384_Update(&context, data, len); - return SHA384_End(&context, digest); -} - diff --git a/bsd/crypto/sha2/sha2.h b/bsd/crypto/sha2/sha2.h deleted file mode 100644 index 3997e63f1..000000000 --- a/bsd/crypto/sha2/sha2.h +++ /dev/null @@ -1,141 +0,0 @@ -/* $FreeBSD: src/sys/crypto/sha2/sha2.h,v 1.1.2.1 2001/07/03 11:01:36 ume Exp $ */ -/* $KAME: sha2.h,v 1.3 2001/03/12 08:27:48 itojun Exp $ */ - -/* - * sha2.h - * - * Version 1.0.0beta1 - * - * Written by Aaron D. Gifford - * - * Copyright 2000 Aaron D. Gifford. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the copyright holder nor the names of contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) AND CONTRIBUTOR(S) ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR(S) OR CONTRIBUTOR(S) BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - */ - -#ifndef __SHA2_H__ -#define __SHA2_H__ - -#ifdef __cplusplus -extern "C" { -#endif - - -/*** SHA-256/384/512 Various Length Definitions ***********************/ -#define SHA256_BLOCK_LENGTH 64 -#define SHA256_DIGEST_LENGTH 32 -#define SHA256_DIGEST_STRING_LENGTH (SHA256_DIGEST_LENGTH * 2 + 1) -#define SHA384_BLOCK_LENGTH 128 -#define SHA384_DIGEST_LENGTH 48 -#define SHA384_DIGEST_STRING_LENGTH (SHA384_DIGEST_LENGTH * 2 + 1) -#define SHA512_BLOCK_LENGTH 128 -#define SHA512_DIGEST_LENGTH 64 -#define SHA512_DIGEST_STRING_LENGTH (SHA512_DIGEST_LENGTH * 2 + 1) - - -/*** SHA-256/384/512 Context Structures *******************************/ -/* NOTE: If your architecture does not define either u_intXX_t types or - * uintXX_t (from inttypes.h), you may need to define things by hand - * for your system: - */ -#if 0 -typedef unsigned char u_int8_t; /* 1-byte (8-bits) */ -typedef unsigned int u_int32_t; /* 4-bytes (32-bits) */ -typedef unsigned long long u_int64_t; /* 8-bytes (64-bits) */ -#endif -/* - * Most BSD systems already define u_intXX_t types, as does Linux. - * Some systems, however, like Compaq's Tru64 Unix instead can use - * uintXX_t types defined by very recent ANSI C standards and included - * in the file: - * - * #include - * - * If you choose to use then please define: - * - * #define SHA2_USE_INTTYPES_H - * - * Or on the command line during compile: - * - * cc -DSHA2_USE_INTTYPES_H ... 
- */ -#if 0 /*def SHA2_USE_INTTYPES_H*/ - -typedef struct _SHA256_CTX { - uint32_t state[8]; - uint64_t bitcount; - uint8_t buffer[SHA256_BLOCK_LENGTH]; -} SHA256_CTX; -typedef struct _SHA512_CTX { - uint64_t state[8]; - uint64_t bitcount[2]; - uint8_t buffer[SHA512_BLOCK_LENGTH]; -} SHA512_CTX; - -#else /* SHA2_USE_INTTYPES_H */ - -typedef struct _SHA256_CTX { - u_int32_t state[8]; - u_int64_t bitcount; - u_int8_t buffer[SHA256_BLOCK_LENGTH]; -} SHA256_CTX; -typedef struct _SHA512_CTX { - u_int64_t state[8]; - u_int64_t bitcount[2]; - u_int8_t buffer[SHA512_BLOCK_LENGTH]; -} SHA512_CTX; - -#endif /* SHA2_USE_INTTYPES_H */ - -typedef SHA512_CTX SHA384_CTX; - - -/*** SHA-256/384/512 Function Prototypes ******************************/ - -void SHA256_Init(SHA256_CTX *); -void SHA256_Update(SHA256_CTX*, const u_int8_t*, size_t); -void SHA256_Final(u_int8_t[SHA256_DIGEST_LENGTH], SHA256_CTX*); -char* SHA256_End(SHA256_CTX*, char[SHA256_DIGEST_STRING_LENGTH]); -char* SHA256_Data(const u_int8_t*, size_t, char[SHA256_DIGEST_STRING_LENGTH]); - -void SHA384_Init(SHA384_CTX*); -void SHA384_Update(SHA384_CTX*, const u_int8_t*, size_t); -void SHA384_Final(u_int8_t[SHA384_DIGEST_LENGTH], SHA384_CTX*); -char* SHA384_End(SHA384_CTX*, char[SHA384_DIGEST_STRING_LENGTH]); -char* SHA384_Data(const u_int8_t*, size_t, char[SHA384_DIGEST_STRING_LENGTH]); - -void SHA512_Init(SHA512_CTX*); -void SHA512_Update(SHA512_CTX*, const u_int8_t*, size_t); -void SHA512_Final(u_int8_t[SHA512_DIGEST_LENGTH], SHA512_CTX*); -char* SHA512_End(SHA512_CTX*, char[SHA512_DIGEST_STRING_LENGTH]); -char* SHA512_Data(const u_int8_t*, size_t, char[SHA512_DIGEST_STRING_LENGTH]); - -#ifdef __cplusplus -} -#endif /* __cplusplus */ - -#endif /* __SHA2_H__ */ - diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c index 745a0fa01..1cf6fff33 100644 --- a/bsd/dev/dtrace/dtrace.c +++ b/bsd/dev/dtrace/dtrace.c @@ -737,20 +737,11 @@ static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *); * for these functions, there will be a comment above the function reading * "Note: not called from probe context." */ -void -dtrace_panic(const char *format, ...) -{ - va_list alist; - - va_start(alist, format); - dtrace_vpanic(format, alist); - va_end(alist); -} int dtrace_assfail(const char *a, const char *f, int l) { - dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l); + panic("dtrace: assertion failed: %s, file: %s, line: %d", a, f, l); /* * We just need something here that even the most clever compiler @@ -6168,7 +6159,7 @@ dtrace_action_panic(dtrace_ecb_t *ecb) * thread calls panic() from dtrace_probe(), and that panic() is * called exactly once.) 
*/ - dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)", + panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)", probe->dtpr_provider->dtpv_name, probe->dtpr_mod, probe->dtpr_func, probe->dtpr_name, (void *)ecb); @@ -6231,7 +6222,6 @@ dtrace_action_stop(void) uthread->t_dtrace_stop = 1; act_set_astbsd(current_thread()); } - #endif /* __APPLE__ */ } @@ -6246,7 +6236,6 @@ dtrace_action_pidresume(uint64_t pid) DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); return; } - uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); /* @@ -16289,7 +16278,7 @@ static void dtrace_module_loaded(struct modctl *ctl) #else static int -dtrace_module_loaded(struct kmod_info *kmod) +dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag) #endif /* __APPLE__ */ { dtrace_provider_t *prv; @@ -16378,16 +16367,36 @@ dtrace_module_loaded(struct kmod_info *kmod) lck_mtx_lock(&dtrace_lock); /* - * If the module does not have a valid UUID, we will not be able to find symbols for it from - * userspace. Go ahead and instrument it now. + * DTrace must decide if it will instrument modules lazily via + * userspace symbols (default mode), or instrument immediately via + * kernel symbols (non-default mode) + * + * When in default/lazy mode, DTrace will only support modules + * built with a valid UUID. + * + * Overriding the default can be done explicitly in one of + * the following two ways. + * + * A module can force symbols from kernel space using the plist key, + * OSBundleForceDTraceInit (see kmod.h). If this per kext state is set, + * we fall through and instrument this module now. + * + * Or, the boot-arg, dtrace_kernel_symbol_mode, can be set to force symbols + * from kernel space (see dtrace_impl.h). If this system state is set + * to a non-userspace mode, we fall through and instrument the module now. */ - if (MOD_HAS_UUID(ctl) && (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE)) { + + if ((dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) && + (!(flag & KMOD_DTRACE_FORCE_INIT))) + { + /* We will instrument the module lazily -- this is the default */ lck_mtx_unlock(&dtrace_lock); lck_mtx_unlock(&mod_lock); lck_mtx_unlock(&dtrace_provider_lock); return 0; } + /* We will instrument the module immediately using kernel symbols */ ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS; lck_mtx_unlock(&dtrace_lock); @@ -19713,6 +19722,8 @@ dtrace_init( void ) (void)dtrace_abs_to_nano(0LL); /* Force once only call to clock_timebase_info (which can take a lock) */ + dtrace_isa_init(); + /* * See dtrace_impl.h for a description of dof modes. * The default is lazy dof. @@ -19781,7 +19792,7 @@ dtrace_postinit(void) fake_kernel_kmod.address = g_kernel_kmod_info.address; fake_kernel_kmod.size = g_kernel_kmod_info.size; - if (dtrace_module_loaded(&fake_kernel_kmod) != 0) { + if (dtrace_module_loaded(&fake_kernel_kmod, 0) != 0) { printf("dtrace_postinit: Could not register mach_kernel modctl\n"); } diff --git a/bsd/dev/dtrace/dtrace_glue.c b/bsd/dev/dtrace/dtrace_glue.c index a046e3eac..db3c5766c 100644 --- a/bsd/dev/dtrace/dtrace_glue.c +++ b/bsd/dev/dtrace/dtrace_glue.c @@ -1480,13 +1480,6 @@ dtrace_tally_fault(user_addr_t uaddr) return( DTRACE_CPUFLAG_ISSET(CPU_DTRACE_NOFAULT) ? 
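The new comment block in dtrace_module_loaded above boils down to a single predicate: stay lazy (userspace symbols) unless the kext forced immediate init via its plist or the boot-arg moved the whole system off userspace symbol mode. A sketch of that decision; DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE and KMOD_DTRACE_FORCE_INIT are the names used in the diff, but the constant values below are illustrative only:

#include <stdbool.h>
#include <stdint.h>

enum { DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE = 1 };   /* illustrative value */
#define KMOD_DTRACE_FORCE_INIT 0x01                  /* illustrative value */

static bool instrument_now(int kernel_symbol_mode, uint32_t kmod_flags)
{
    /* Lazy (userspace-symbol) mode is the default; instrument immediately
     * only if the module forced init, or the system-wide symbol mode is
     * no longer "from userspace". */
    if (kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE &&
        !(kmod_flags & KMOD_DTRACE_FORCE_INIT))
        return false;    /* defer: instrument lazily from userspace later */
    return true;         /* instrument now, with kernel symbols */
}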
TRUE : FALSE ); } -void -dtrace_vpanic(const char *format, va_list alist) -{ - vuprintf( format, alist ); - panic("dtrace_vpanic"); -} - #define TOTTY 0x02 extern int prf(const char *, va_list, int, struct tty *); /* bsd/kern/subr_prf.h */ diff --git a/bsd/dev/dtrace/dtrace_ptss.c b/bsd/dev/dtrace/dtrace_ptss.c index 9c75c3f1f..1027f0d01 100644 --- a/bsd/dev/dtrace/dtrace_ptss.c +++ b/bsd/dev/dtrace/dtrace_ptss.c @@ -127,10 +127,14 @@ dtrace_ptss_claim_entry(struct proc* p) { /* * This function does not require any locks to be held on entry. + * + * (PR-11138709) A NULL p->p_dtrace_ptss_pages means the entry can + * no longer be referenced safely. When found in this state, the chore + * of releasing an entry to the free list is ignored. */ void dtrace_ptss_release_entry(struct proc* p, struct dtrace_ptss_page_entry* e) { - if (p && e) { + if (p && p->p_dtrace_ptss_pages && e) { do { e->next = p->p_dtrace_ptss_free_list; } while (!OSCompareAndSwapPtr((void *)e->next, (void *)e, (void * volatile *)&p->p_dtrace_ptss_free_list)); @@ -164,7 +168,7 @@ dtrace_ptss_allocate_page(struct proc* p) #if CONFIG_EMBEDDED /* The embedded OS has extra permissions for writable and executable pages. We can't pass in the flags * we need for the correct permissions from mach_vm_allocate, so need to call mach_vm_map directly. */ - vm_map_offset_t map_addr = 0; + mach_vm_offset_t map_addr = 0; kern_return_t kr = mach_vm_map(map, &map_addr, size, 0, VM_FLAGS_ANYWHERE, IPC_PORT_NULL, 0, FALSE, VM_PROT_READ|VM_PROT_EXECUTE, VM_PROT_READ|VM_PROT_EXECUTE, VM_INHERIT_DEFAULT); if (kr != KERN_SUCCESS) { goto err; diff --git a/bsd/dev/dtrace/dtrace_subr.c b/bsd/dev/dtrace/dtrace_subr.c index c3a69c48f..c609fddec 100644 --- a/bsd/dev/dtrace/dtrace_subr.c +++ b/bsd/dev/dtrace/dtrace_subr.c @@ -53,7 +53,7 @@ void (*dtrace_cpu_init)(processorid_t); void (*dtrace_modload)(struct modctl *); void (*dtrace_modunload)(struct modctl *); #else -int (*dtrace_modload)(struct kmod_info *); +int (*dtrace_modload)(struct kmod_info *, uint32_t); int (*dtrace_modunload)(struct kmod_info *); void (*dtrace_helpers_cleanup)(proc_t *); #endif /*__APPLE__*/ diff --git a/bsd/dev/dtrace/lockstat.c b/bsd/dev/dtrace/lockstat.c index a9f003e65..68a3a91d0 100644 --- a/bsd/dev/dtrace/lockstat.c +++ b/bsd/dev/dtrace/lockstat.c @@ -63,7 +63,6 @@ #error "not ported to this architecture" #endif - typedef struct lockstat_probe { const char *lsp_func; const char *lsp_name; @@ -74,21 +73,19 @@ typedef struct lockstat_probe { lockstat_probe_t lockstat_probes[] = { #if defined(__i386__) || defined(__x86_64__) - /* Not implemented yet on PPC... 
*/ + /* Only provide implemented probes for each architecture */ { LS_LCK_MTX_LOCK, LSA_ACQUIRE, LS_LCK_MTX_LOCK_ACQUIRE, DTRACE_IDNONE }, { LS_LCK_MTX_LOCK, LSA_SPIN, LS_LCK_MTX_LOCK_SPIN, DTRACE_IDNONE }, + { LS_LCK_MTX_LOCK, LSA_BLOCK, LS_LCK_MTX_LOCK_BLOCK, DTRACE_IDNONE }, { LS_LCK_MTX_TRY_LOCK, LSA_ACQUIRE, LS_LCK_MTX_TRY_LOCK_ACQUIRE, DTRACE_IDNONE }, { LS_LCK_MTX_TRY_SPIN_LOCK, LSA_ACQUIRE, LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, DTRACE_IDNONE }, { LS_LCK_MTX_UNLOCK, LSA_RELEASE, LS_LCK_MTX_UNLOCK_RELEASE, DTRACE_IDNONE }, { LS_LCK_MTX_EXT_LOCK, LSA_ACQUIRE, LS_LCK_MTX_EXT_LOCK_ACQUIRE, DTRACE_IDNONE }, { LS_LCK_MTX_EXT_LOCK, LSA_SPIN, LS_LCK_MTX_EXT_LOCK_SPIN, DTRACE_IDNONE }, - { LS_LCK_MTX_EXT_TRY_LOCK, LSA_ACQUIRE, LS_LCK_MTX_TRY_EXT_LOCK_ACQUIRE, DTRACE_IDNONE }, - { LS_LCK_MTX_UNLOCK, LSA_RELEASE, LS_LCK_MTX_EXT_UNLOCK_RELEASE, DTRACE_IDNONE }, - { LS_LCK_MTX_LOCK_SPIN_LOCK, LSA_ACQUIRE, LS_LCK_MTX_LOCK_SPIN_ACQUIRE, DTRACE_IDNONE }, -#endif - { LS_LCK_MTX_LOCK, LSA_BLOCK, LS_LCK_MTX_LOCK_BLOCK, DTRACE_IDNONE }, { LS_LCK_MTX_EXT_LOCK, LSA_BLOCK, LS_LCK_MTX_EXT_LOCK_BLOCK, DTRACE_IDNONE }, - +// { LS_LCK_MTX_EXT_TRY_LOCK, LSA_ACQUIRE, LS_LCK_MTX_TRY_EXT_LOCK_ACQUIRE, DTRACE_IDNONE }, + { LS_LCK_MTX_EXT_UNLOCK, LSA_RELEASE, LS_LCK_MTX_EXT_UNLOCK_RELEASE, DTRACE_IDNONE }, + { LS_LCK_MTX_LOCK_SPIN_LOCK, LSA_ACQUIRE, LS_LCK_MTX_LOCK_SPIN_ACQUIRE, DTRACE_IDNONE }, { LS_LCK_RW_LOCK_SHARED, LSR_ACQUIRE, LS_LCK_RW_LOCK_SHARED_ACQUIRE, DTRACE_IDNONE }, { LS_LCK_RW_LOCK_SHARED, LSR_BLOCK, LS_LCK_RW_LOCK_SHARED_BLOCK, DTRACE_IDNONE }, { LS_LCK_RW_LOCK_SHARED, LSR_SPIN, LS_LCK_RW_LOCK_SHARED_SPIN, DTRACE_IDNONE }, @@ -99,11 +96,10 @@ lockstat_probe_t lockstat_probes[] = { LS_LCK_RW_TRY_LOCK_SHARED, LSR_ACQUIRE, LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, DTRACE_IDNONE }, { LS_LCK_RW_TRY_LOCK_EXCL, LSR_ACQUIRE, LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, DTRACE_IDNONE }, { LS_LCK_RW_LOCK_SHARED_TO_EXCL, LSR_UPGRADE, LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, DTRACE_IDNONE }, - { LS_LCK_RW_LOCK_SHARED_TO_EXCL, LSR_BLOCK, LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, DTRACE_IDNONE }, { LS_LCK_RW_LOCK_SHARED_TO_EXCL, LSR_SPIN, LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, DTRACE_IDNONE }, + { LS_LCK_RW_LOCK_SHARED_TO_EXCL, LSR_BLOCK, LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, DTRACE_IDNONE }, { LS_LCK_RW_LOCK_EXCL_TO_SHARED, LSR_DOWNGRADE, LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, DTRACE_IDNONE }, - - +#endif #ifdef LATER /* Interlock and spinlock measurements would be nice, but later */ { LS_LCK_SPIN_LOCK, LSS_ACQUIRE, LS_LCK_SPIN_LOCK_ACQUIRE, DTRACE_IDNONE }, @@ -130,6 +126,8 @@ extern void lck_mtx_unlock_lockstat_patch_point(void); extern void lck_mtx_lock_ext_lockstat_patch_point(void); extern void lck_mtx_ext_unlock_lockstat_patch_point(void); +extern void lck_rw_done_release1_lockstat_patch_point(void); +extern void lck_rw_done_release2_lockstat_patch_point(void); extern void lck_rw_lock_shared_lockstat_patch_point(void); extern void lck_rw_lock_exclusive_lockstat_patch_point(void); extern void lck_rw_lock_shared_to_exclusive_lockstat_patch_point(void); @@ -138,61 +136,89 @@ extern void lck_rw_try_lock_exclusive_lockstat_patch_point(void); extern void lck_mtx_lock_spin_lockstat_patch_point(void); #endif /* CONFIG_DTRACE */ -vm_offset_t *assembly_probes[] = { +typedef struct lockstat_assembly_probe { + int lsap_probe; + vm_offset_t * lsap_patch_point; +} lockstat_assembly_probe_t; + + + lockstat_assembly_probe_t assembly_probes[] = + { #if CONFIG_DTRACE #if defined(__i386__) || defined(__x86_64__) - /* - * On x86 these points are better done 
via hot patches, which ensure - * there is zero overhead when not in use. On x86 these patch points - * are swapped between the return instruction and a no-op, with the - * Dtrace call following the return. - */ - (vm_offset_t *) lck_mtx_lock_lockstat_patch_point, - (vm_offset_t *) lck_mtx_try_lock_lockstat_patch_point, - (vm_offset_t *) lck_mtx_try_lock_spin_lockstat_patch_point, - (vm_offset_t *) lck_mtx_unlock_lockstat_patch_point, - (vm_offset_t *) lck_mtx_lock_ext_lockstat_patch_point, - (vm_offset_t *) lck_mtx_ext_unlock_lockstat_patch_point, - (vm_offset_t *) lck_rw_lock_shared_lockstat_patch_point, - (vm_offset_t *) lck_rw_lock_exclusive_lockstat_patch_point, - (vm_offset_t *) lck_rw_lock_shared_to_exclusive_lockstat_patch_point, - (vm_offset_t *) lck_rw_try_lock_shared_lockstat_patch_point, - (vm_offset_t *) lck_rw_try_lock_exclusive_lockstat_patch_point, - (vm_offset_t *) lck_mtx_lock_spin_lockstat_patch_point, -#else - (vm_offset_t *) lck_mtx_unlock_lockstat_patch_point, + /* + * On x86 these points are better done via hot patches, which ensure + * there is zero overhead when not in use. On x86 these patch points + * are swapped between the return instruction and a no-op, with the + * Dtrace call following the return. + */ + { LS_LCK_MTX_LOCK_ACQUIRE, (vm_offset_t *) lck_mtx_lock_lockstat_patch_point }, + { LS_LCK_MTX_TRY_LOCK_ACQUIRE, (vm_offset_t *) lck_mtx_try_lock_lockstat_patch_point }, + { LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, (vm_offset_t *) lck_mtx_try_lock_spin_lockstat_patch_point }, + { LS_LCK_MTX_UNLOCK_RELEASE, (vm_offset_t *) lck_mtx_unlock_lockstat_patch_point }, + { LS_LCK_MTX_EXT_LOCK_ACQUIRE, (vm_offset_t *) lck_mtx_lock_ext_lockstat_patch_point }, + { LS_LCK_MTX_EXT_UNLOCK_RELEASE, (vm_offset_t *) lck_mtx_ext_unlock_lockstat_patch_point }, + { LS_LCK_RW_LOCK_SHARED_ACQUIRE, (vm_offset_t *) lck_rw_lock_shared_lockstat_patch_point }, + { LS_LCK_RW_LOCK_EXCL_ACQUIRE, (vm_offset_t *) lck_rw_lock_exclusive_lockstat_patch_point }, + { LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE,(vm_offset_t *) lck_rw_lock_shared_to_exclusive_lockstat_patch_point }, + { LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, (vm_offset_t *) lck_rw_try_lock_shared_lockstat_patch_point }, + { LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, (vm_offset_t *) lck_rw_try_lock_exclusive_lockstat_patch_point }, + { LS_LCK_MTX_LOCK_SPIN_ACQUIRE, (vm_offset_t *) lck_mtx_lock_spin_lockstat_patch_point }, #endif #endif /* CONFIG_DTRACE */ - NULL + { LS_LCK_INVALID, NULL } }; /* * Hot patch switches back and forth the probe points between NOP and RET. - * The argument indicates whether the probe point is on or off. + * The active argument indicates whether the probe point will turn on or off. + * on == plant a NOP and thus fall through to the probe call + * off == plant a RET and thus avoid the probe call completely + * The lsap_probe identifies which probe we will patch. */ #if defined(__APPLE__) static -#endif /* __APPLE__ */ -void lockstat_hot_patch(boolean_t active) +void lockstat_hot_patch(boolean_t active, int ls_probe) { #pragma unused(active) int i; - - for (i = 0; assembly_probes[i]; i++) { + /* + * Loop through entire table, in case there are + * multiple patch points per probe. + */ + for (i = 0; assembly_probes[i].lsap_patch_point; i++) { + if (ls_probe == assembly_probes[i].lsap_probe) #if defined(__i386__) || defined(__x86_64__) - uint8_t instr; - instr = (active ? NOP : RET ); - (void) ml_nofault_copy( (vm_offset_t)&instr, *(assembly_probes[i]), + { + uint8_t instr; + instr = (active ? 
NOP : RET ); + (void) ml_nofault_copy( (vm_offset_t)&instr, *(assembly_probes[i].lsap_patch_point), sizeof(instr)); + } #endif - } + } /* for */ } - +#endif /* __APPLE__*/ void (*lockstat_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); +#if defined(__APPLE__) +/* This wrapper is used by arm assembler hot patched probes */ +void +lockstat_probe_wrapper(int probe, uintptr_t lp, int rwflag) +{ + dtrace_id_t id; + id = lockstat_probemap[probe]; + if (id != 0) + { + (*lockstat_probe)(id, (uintptr_t)lp, (uint64_t)rwflag, 0,0,0); + } +} +#endif /* __APPLE__ */ + + static dev_info_t *lockstat_devi; /* saved in xxattach() for xxinfo() */ static dtrace_provider_id_t lockstat_id; @@ -209,7 +235,7 @@ lockstat_enable(void *arg, dtrace_id_t id, void *parg) lockstat_probemap[probe->lsp_probe] = id; membar_producer(); - lockstat_hot_patch(TRUE); + lockstat_hot_patch(TRUE, probe->lsp_probe); membar_producer(); return(0); @@ -227,7 +253,7 @@ lockstat_disable(void *arg, dtrace_id_t id, void *parg) ASSERT(lockstat_probemap[probe->lsp_probe]); lockstat_probemap[probe->lsp_probe] = 0; - lockstat_hot_patch(FALSE); + lockstat_hot_patch(FALSE, probe->lsp_probe); membar_producer(); /* diff --git a/bsd/dev/dtrace/profile_prvd.c b/bsd/dev/dtrace/profile_prvd.c index 69f3aadd5..36e213ce1 100644 --- a/bsd/dev/dtrace/profile_prvd.c +++ b/bsd/dev/dtrace/profile_prvd.c @@ -227,7 +227,7 @@ profile_fire(void *arg) x86_saved_state32_t *regs = saved_state32(tagged_regs); dtrace_probe(prof->prof_id, 0x0, regs->eip, 0, 0, 0); - } + } } #else #error Unknown architecture @@ -273,7 +273,7 @@ profile_tick(void *arg) x86_saved_state32_t *regs = saved_state32(tagged_regs); dtrace_probe(prof->prof_id, 0x0, regs->eip, 0, 0, 0); - } + } } #else #error Unknown architecture diff --git a/bsd/dev/dtrace/sdt.c b/bsd/dev/dtrace/sdt.c index bca167f01..89ac8ef2b 100644 --- a/bsd/dev/dtrace/sdt.c +++ b/bsd/dev/dtrace/sdt.c @@ -63,11 +63,7 @@ extern kern_return_t fbt_perfCallback(int, struct savearea_t *, int, int); #define SDT_PROBETAB_SIZE 0x1000 /* 4k entries -- 16K total */ -#if defined(__x86_64__) -#define DTRACE_PROBE_PREFIX "_dtrace_probeDOLLAR" -#else #define DTRACE_PROBE_PREFIX "_dtrace_probe$" -#endif static dev_info_t *sdt_devi; static int sdt_verbose = 0; @@ -508,14 +504,6 @@ static struct module g_sdt_mach_module; #include #include -#if defined(__LP64__) -#define KERNEL_MAGIC MH_MAGIC_64 -typedef struct nlist_64 kernel_nlist_t; -#else -#define KERNEL_MAGIC MH_MAGIC -typedef struct nlist kernel_nlist_t; -#endif - void sdt_init( void ) { if (0 == gSDTInited) @@ -528,7 +516,7 @@ void sdt_init( void ) return; } - if (KERNEL_MAGIC != _mh_execute_header.magic) { + if (MH_MAGIC_KERNEL != _mh_execute_header.magic) { g_sdt_kernctl.mod_address = (vm_address_t)NULL; g_sdt_kernctl.mod_size = 0; } else { diff --git a/bsd/dev/dtrace/systrace.c b/bsd/dev/dtrace/systrace.c index 271b2a0e1..6761beec9 100644 --- a/bsd/dev/dtrace/systrace.c +++ b/bsd/dev/dtrace/systrace.c @@ -95,15 +95,20 @@ extern const char *syscallnames[]; #define LOADABLE_SYSCALL(a) 0 /* Not pertinent to Darwin. */ #define LOADED_SYSCALL(a) 1 /* Not pertinent to Darwin. 
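The reworked lockstat_hot_patch above replaces a flat array of patch points with a probe-indexed table, so a single probe can now own several patch points and enabling one probe no longer flips all of them. A minimal sketch of the technique, assuming the same x86 single-byte NOP/RET encoding the diff relies on; write_text_byte stands in for ml_nofault_copy:

#include <stdint.h>
#include <stddef.h>

#define NOP 0x90
#define RET 0xc3

typedef struct {
    int probe;             /* lockstat probe id */
    uint8_t *patch_point;  /* first byte of the patchable instruction */
} patch_entry_t;

/* the kernel performs this store with ml_nofault_copy() into kernel text */
static void write_text_byte(uint8_t *where, uint8_t byte) { *where = byte; }

static void hot_patch(const patch_entry_t *tab, int probe, int active)
{
    /* walk the whole table: there may be multiple patch points per probe */
    for (size_t i = 0; tab[i].patch_point != NULL; i++) {
        if (tab[i].probe == probe)
            /* active: plant a NOP and fall through into the probe call;
             * inactive: plant a RET and skip the probe call entirely */
            write_text_byte(tab[i].patch_point, active ? NOP : RET);
    }
}

When disabled, the patched function returns before the probe call is ever reached, giving the zero-overhead property the original comment describes.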
*/ +extern lck_attr_t* dtrace_lck_attr; +extern lck_grp_t* dtrace_lck_grp; +static lck_mtx_t dtrace_systrace_lock; /* probe state lock */ + systrace_sysent_t *systrace_sysent = NULL; -void (*systrace_probe)(dtrace_id_t, uint64_t, uint64_t, - uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); +void (*systrace_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); + +static uint64_t systrace_getarg(void *, dtrace_id_t, void *, int, int); void systrace_stub(dtrace_id_t id, uint64_t arg0, uint64_t arg1, - uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, uint64_t arg6, uint64_t arg7) + uint64_t arg2, uint64_t arg3, uint64_t arg4) { -#pragma unused(id,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7) +#pragma unused(id,arg0,arg1,arg2,arg3,arg4) } int32_t @@ -153,10 +158,17 @@ dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv) sy = (code >= NUM_SYSENT) ? &systrace_sysent[63] : &systrace_sysent[code]; if ((id = sy->stsy_entry) != DTRACE_IDNONE) { + uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); + if (uthread) + uthread->t_dtrace_syscall_args = (void *)ip; + if (ip) - (*systrace_probe)(id, *ip, *(ip+1), *(ip+2), *(ip+3), *(ip+4), *(ip+5), *(ip+6), *(ip+7)); + (*systrace_probe)(id, *ip, *(ip+1), *(ip+2), *(ip+3), *(ip+4)); else - (*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0); + (*systrace_probe)(id, 0, 0, 0, 0, 0); + + if (uthread) + uthread->t_dtrace_syscall_args = (void *)0; } #if 0 /* XXX */ @@ -244,7 +256,7 @@ dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv) * * This change was made 4/23/2003 according to the DTrace project's putback log." */ - (*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0, 0, 0, 0); + (*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0); } return (rval); @@ -312,7 +324,7 @@ dtrace_systrace_syscall_return(unsigned short code, int rval, int *rv) munged_rv1 = 0LL; } - (*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0, 0, 0, 0); + (*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0); } } #endif /* __APPLE__ */ @@ -393,6 +405,7 @@ systrace_init(struct sysent *actual, systrace_sysent_t **interposed) s->stsy_underlying = a->sy_callc; s->stsy_return_type = a->sy_return_type; } + lck_mtx_init(&dtrace_systrace_lock, dtrace_lck_grp, dtrace_lck_attr); } #endif /* __APPLE__ */ @@ -491,15 +504,18 @@ systrace_enable(void *arg, dtrace_id_t id, void *parg) ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall); return(0); } - - (void) casptr(&sysent[sysnum].sy_callc, - (void *)systrace_sysent[sysnum].stsy_underlying, - (void *)dtrace_systrace_syscall); #ifdef _SYSCALL32_IMPL (void) casptr(&sysent32[sysnum].sy_callc, (void *)systrace_sysent32[sysnum].stsy_underlying, (void *)dtrace_systrace_syscall32); #endif + + lck_mtx_lock(&dtrace_systrace_lock); + if (sysent[sysnum].sy_callc == systrace_sysent[sysnum].stsy_underlying) { + vm_offset_t dss = (vm_offset_t)&dtrace_systrace_syscall; + ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(vm_offset_t)); + } + lck_mtx_unlock(&dtrace_systrace_lock); return (0); } @@ -514,9 +530,10 @@ systrace_disable(void *arg, dtrace_id_t id, void *parg) systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE); if (disable) { - (void) casptr(&sysent[sysnum].sy_callc, - (void *)dtrace_systrace_syscall, - (void *)systrace_sysent[sysnum].stsy_underlying); + lck_mtx_lock(&dtrace_systrace_lock); + if (sysent[sysnum].sy_callc == dtrace_systrace_syscall) + 
ml_nofault_copy((vm_offset_t)&systrace_sysent[sysnum].stsy_underlying, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(systrace_sysent[sysnum].stsy_underlying)); + lck_mtx_unlock(&dtrace_systrace_lock); #ifdef _SYSCALL32_IMPL (void) casptr(&sysent32[sysnum].sy_callc, @@ -554,7 +571,7 @@ static dtrace_pops_t systrace_pops = { NULL, NULL, NULL, - NULL, + systrace_getarg, NULL, systrace_destroy }; @@ -723,14 +740,14 @@ typedef kern_return_t (*mach_call_t)(void *); typedef void mach_munge_t(const void *, void *); typedef struct { - int mach_trap_arg_count; - int (*mach_trap_function)(void); + int mach_trap_arg_count; + kern_return_t (*mach_trap_function)(void *); #if 0 /* no active architectures use mungers for mach traps */ - mach_munge_t *mach_trap_arg_munge32; /* system call arguments for 32-bit */ - mach_munge_t *mach_trap_arg_munge64; /* system call arguments for 64-bit */ + mach_munge_t *mach_trap_arg_munge32; /* system call arguments for 32-bit */ + mach_munge_t *mach_trap_arg_munge64; /* system call arguments for 64-bit */ #endif -#if MACH_ASSERT - const char* mach_trap_name; +#if MACH_ASSERT + const char* mach_trap_name; #endif /* MACH_ASSERT */ } mach_trap_t; @@ -759,13 +776,20 @@ struct mach_call_args { #error 1 << SYSTRACE_SHIFT must exceed number of Mach traps #endif -typedef systrace_sysent_t machtrace_sysent_t; +typedef struct machtrace_sysent { + dtrace_id_t stsy_entry; + dtrace_id_t stsy_return; + kern_return_t (*stsy_underlying)(void *); + int32_t stsy_return_type; +} machtrace_sysent_t; static machtrace_sysent_t *machtrace_sysent = NULL; void (*machtrace_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); +static uint64_t machtrace_getarg(void *, dtrace_id_t, void *, int, int); + static dev_info_t *machtrace_devi; static dtrace_provider_id_t machtrace_id; @@ -802,8 +826,17 @@ dtrace_machtrace_syscall(struct mach_call_args *args) sy = &machtrace_sysent[code]; - if ((id = sy->stsy_entry) != DTRACE_IDNONE) + if ((id = sy->stsy_entry) != DTRACE_IDNONE) { + uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); + + if (uthread) + uthread->t_dtrace_syscall_args = (void *)ip; + (*machtrace_probe)(id, *ip, *(ip+1), *(ip+2), *(ip+3), *(ip+4)); + + if (uthread) + uthread->t_dtrace_syscall_args = (void *)0; + } #if 0 /* XXX */ /* @@ -846,10 +879,10 @@ machtrace_init(mach_trap_t *actual, machtrace_sysent_t **interposed) if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a)) continue; - if ((mach_call_t)(a->mach_trap_function) == (mach_call_t)(dtrace_machtrace_syscall)) + if (a->mach_trap_function == (mach_call_t)(dtrace_machtrace_syscall)) continue; - s->stsy_underlying = (sy_call_t *)a->mach_trap_function; + s->stsy_underlying = a->mach_trap_function; } } @@ -924,13 +957,19 @@ machtrace_enable(void *arg, dtrace_id_t id, void *parg) } if (enabled) { - ASSERT(sysent[sysnum].sy_callc == (void *)dtrace_machtrace_syscall); + ASSERT(mach_trap_table[sysnum].mach_trap_function == (void *)dtrace_machtrace_syscall); return(0); } - (void) casptr(&mach_trap_table[sysnum].mach_trap_function, - (void *)machtrace_sysent[sysnum].stsy_underlying, - (void *)dtrace_machtrace_syscall); + lck_mtx_lock(&dtrace_systrace_lock); + + if (mach_trap_table[sysnum].mach_trap_function == machtrace_sysent[sysnum].stsy_underlying) { + vm_offset_t dss = (vm_offset_t)&dtrace_machtrace_syscall; + ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t)); + } + + lck_mtx_unlock(&dtrace_systrace_lock); + return(0); } @@ -945,10 +984,13 @@ 
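Both the systrace and machtrace hunks replace the bare casptr() compare-and-swap with the same idiom: take a mutex, verify the table slot still holds the expected handler, and write the replacement pointer with a fault-tolerant copy into what may be protected text. A sketch of that interpose pattern, where lock/unlock and nofault_copy are stand-ins for lck_mtx_lock/lck_mtx_unlock and ml_nofault_copy:

#include <string.h>

typedef int (*sy_call_fn)(void *);

static void lock(void) { /* lck_mtx_lock(&dtrace_systrace_lock) */ }
static void unlock(void) { /* lck_mtx_unlock(&dtrace_systrace_lock) */ }
static void nofault_copy(const void *src, void *dst, size_t len)
{
    memcpy(dst, src, len);   /* the kernel uses ml_nofault_copy() here */
}

static void interpose(sy_call_fn *slot, sy_call_fn expected, sy_call_fn repl)
{
    lock();
    if (*slot == expected)                    /* don't clobber another patch */
        nofault_copy(&repl, slot, sizeof(repl));
    unlock();
}

The mutex serializes enable/disable against each other; the equality check preserves the old casptr semantics of only swapping when the slot is in the expected state.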
machtrace_disable(void *arg, dtrace_id_t id, void *parg) machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE); if (disable) { - (void) casptr(&mach_trap_table[sysnum].mach_trap_function, - (void *)dtrace_machtrace_syscall, - (void *)machtrace_sysent[sysnum].stsy_underlying); + lck_mtx_lock(&dtrace_systrace_lock); + + if (mach_trap_table[sysnum].mach_trap_function == (mach_call_t)dtrace_machtrace_syscall) { + ml_nofault_copy((vm_offset_t)&machtrace_sysent[sysnum].stsy_underlying, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t)); + } + lck_mtx_unlock(&dtrace_systrace_lock); } if (SYSTRACE_ISENTRY((uintptr_t)parg)) { @@ -974,7 +1016,7 @@ static dtrace_pops_t machtrace_pops = { NULL, NULL, NULL, - NULL, + machtrace_getarg, NULL, machtrace_destroy }; @@ -1076,3 +1118,49 @@ void systrace_init( void ) } #undef SYSTRACE_MAJOR #endif /* __APPLE__ */ + +static uint64_t +systrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes) +{ +#pragma unused(arg,id,parg,aframes) /* __APPLE__ */ + uint64_t val = 0; + syscall_arg_t *stack = (syscall_arg_t *)NULL; + + uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); + + if (uthread) + stack = (syscall_arg_t *)uthread->t_dtrace_syscall_args; + + if (!stack) + return(0); + + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + /* dtrace_probe arguments arg0 .. arg4 are 64bits wide */ + val = (uint64_t)*(stack+argno); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + return (val); +} + + +static uint64_t +machtrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes) +{ +#pragma unused(arg,id,parg,aframes) /* __APPLE__ */ + uint64_t val = 0; + syscall_arg_t *stack = (syscall_arg_t *)NULL; + + uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); + + if (uthread) + stack = (syscall_arg_t *)uthread->t_dtrace_syscall_args; + + if (!stack) + return(0); + + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + /* dtrace_probe arguments arg0 .. arg4 are 64bits wide */ + val = (uint64_t)*(stack+argno); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + return (val); +} + diff --git a/bsd/dev/dtrace/systrace.h b/bsd/dev/dtrace/systrace.h index 915ed2561..aeab1de4e 100644 --- a/bsd/dev/dtrace/systrace.h +++ b/bsd/dev/dtrace/systrace.h @@ -71,9 +71,9 @@ extern int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); #else extern void (*systrace_probe)(dtrace_id_t, uint64_t, uint64_t, - uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); + uint64_t, uint64_t, uint64_t); extern void systrace_stub(dtrace_id_t, uint64_t, uint64_t, - uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); + uint64_t, uint64_t, uint64_t); extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *); diff --git a/bsd/dev/i386/conf.c b/bsd/dev/i386/conf.c index b7de69df7..bff6f7232 100644 --- a/bsd/dev/i386/conf.c +++ b/bsd/dev/i386/conf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1997-2012 Apple Computer, Inc. All rights reserved. 
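The systrace_getarg/machtrace_getarg pair above depends on the entry path parking a pointer to the syscall argument block in per-thread state, so a D script asking for higher-numbered arguments can fetch them later without re-walking the stack from probe context. A sketch of that pattern with illustrative names (cur_thread_args stands in for uthread->t_dtrace_syscall_args):

#include <stdint.h>

typedef uint64_t syscall_arg_t;

static _Thread_local syscall_arg_t *cur_thread_args;  /* per-thread stash */

static void syscall_entry_probe(syscall_arg_t *args)
{
    cur_thread_args = args;   /* publish for getarg */
    /* ... fire the entry probe with the first five args ... */
    cur_thread_args = 0;      /* args are only valid during the probe */
}

static uint64_t getarg(int argno)
{
    syscall_arg_t *stack = cur_thread_args;
    if (!stack)
        return 0;
    /* the kernel brackets this load with CPU_DTRACE_NOFAULT so a bad
     * pointer becomes a flagged no-op instead of a panic */
    return (uint64_t)stack[argno];
}

This also explains why systrace_probe shrank from nine arguments to six: only the first five are passed eagerly, and the rest are fetched on demand through getarg.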
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -121,6 +121,7 @@ extern d_ioctl_t volioctl; #endif extern d_open_t cttyopen; +extern d_close_t cttyclose; extern d_read_t cttyread; extern d_write_t cttywrite; extern d_ioctl_t cttyioctl; @@ -201,9 +202,9 @@ struct cdevsw cdevsw[] = }, NO_CDEVICE, /* 1*/ { - cttyopen, nullclose, cttyread, cttywrite, /* 2*/ + cttyopen, cttyclose, cttyread, cttywrite, /* 2*/ cttyioctl, nullstop, nullreset, 0, cttyselect, - eno_mmap, eno_strat, eno_getc, eno_putc, D_TTY + eno_mmap, eno_strat, eno_getc, eno_putc, D_TTY | D_TRACKCLOSE }, { nullopen, nullclose, mmread, mmwrite, /* 3*/ @@ -307,7 +308,7 @@ isdisk(dev_t dev, int type) } /* FALL THROUGH */ case VBLK: - if (bdevsw[maj].d_type == D_DISK) { + if ((D_TYPEMASK & bdevsw[maj].d_type) == D_DISK) { return (1); } break; @@ -324,7 +325,7 @@ static int chrtoblktab[] = { /* 8 */ NODEV, /* 9 */ NODEV, /* 10 */ NODEV, /* 11 */ NODEV, /* 12 */ NODEV, /* 13 */ NODEV, - /* 14 */ 6, /* 15 */ NODEV, + /* 14 */ NODEV, /* 15 */ NODEV, /* 16 */ NODEV, /* 17 */ NODEV, /* 18 */ NODEV, /* 19 */ NODEV, /* 20 */ NODEV, /* 21 */ NODEV, @@ -337,7 +338,7 @@ static int chrtoblktab[] = { /* 34 */ NODEV, /* 35 */ NODEV, /* 36 */ NODEV, /* 37 */ NODEV, /* 38 */ NODEV, /* 39 */ NODEV, - /* 40 */ NODEV, /* 41 */ 1, + /* 40 */ NODEV, /* 41 */ NODEV, /* 42 */ NODEV, /* 43 */ NODEV, /* 44 */ NODEV, }; diff --git a/bsd/dev/i386/dtrace_isa.c b/bsd/dev/i386/dtrace_isa.c index 88e789fce..bdb177028 100644 --- a/bsd/dev/i386/dtrace_isa.c +++ b/bsd/dev/i386/dtrace_isa.c @@ -165,6 +165,15 @@ dtrace_xcall(processorid_t cpu, dtrace_xcall_t f, void *arg) } } +/* + * Initialization + */ +void +dtrace_isa_init(void) +{ + return; +} + /* * Runtime and ABI */ diff --git a/bsd/dev/i386/fbt_x86.c b/bsd/dev/i386/fbt_x86.c index baec24f83..ef45ffacd 100644 --- a/bsd/dev/i386/fbt_x86.c +++ b/bsd/dev/i386/fbt_x86.c @@ -1533,7 +1533,13 @@ __user_syms_provide_module(void *arg, struct modctl *ctl) dtrace_module_symbols_t* module_symbols = ctl->mod_user_symbols; if (module_symbols) { for (i=0; idtmodsyms_count; i++) { + + /* + * symbol->dtsym_addr (the symbol address) passed in from + * user space, is already slid for both kexts and kernel. + */ dtrace_symbol_t* symbol = &module_symbols->dtmodsyms_symbols[i]; + char* name = symbol->dtsym_name; /* Lop off omnipresent leading underscore. */ @@ -1543,8 +1549,8 @@ __user_syms_provide_module(void *arg, struct modctl *ctl) /* * We're only blacklisting functions in the kernel for now. */ - if (MOD_IS_MACH_KERNEL(ctl) && !is_symbol_valid(name)) - continue; + if (MOD_IS_MACH_KERNEL(ctl) && !is_symbol_valid(name)) + continue; __provide_probe_64(ctl, (uintptr_t)symbol->dtsym_addr, (uintptr_t)(symbol->dtsym_addr + symbol->dtsym_size), modname, name, (machine_inst_t*)(uintptr_t)symbol->dtsym_addr); } diff --git a/bsd/dev/i386/kern_machdep.c b/bsd/dev/i386/kern_machdep.c index c77af1d8b..79bd890e2 100644 --- a/bsd/dev/i386/kern_machdep.c +++ b/bsd/dev/i386/kern_machdep.c @@ -40,8 +40,6 @@ #include #include -extern int bootarg_no64exec; /* bsd_init.c */ - /********************************************************************** * Routine: grade_binary() * @@ -58,7 +56,7 @@ grade_binary(cpu_type_t exectype, __unused cpu_subtype_t execsubtype) case CPU_TYPE_POWERPC: /* via translator */ return 1; case CPU_TYPE_X86_64: /* native 64-bit */ - return ((ml_is64bit() && !bootarg_no64exec) ? 2 : 0); + return (ml_is64bit() ? 
2 : 0); default: /* all other binary types */ return 0; } diff --git a/bsd/dev/i386/mem.c b/bsd/dev/i386/mem.c index 4b4589295..61f0d6929 100644 --- a/bsd/dev/i386/mem.c +++ b/bsd/dev/i386/mem.c @@ -119,11 +119,15 @@ mmioctl(dev_t dev, u_long cmd, __unused caddr_t data, { int minnum = minor(dev); - if ((minnum == 0) || (minnum == 1)) -#if !defined(SECURE_KERNEL) - if (setup_kmem == 0) - return(EINVAL); + if (0 == minnum || 1 == minnum) { + /* /dev/mem and /dev/kmem */ +#if defined(SECURE_KERNEL) + return (ENODEV); +#else + if (0 == setup_kmem) + return (EINVAL); #endif + } switch (cmd) { case FIONBIO: diff --git a/bsd/dev/i386/sysctl.c b/bsd/dev/i386/sysctl.c index 3e001de2b..c2d7de7ff 100644 --- a/bsd/dev/i386/sysctl.c +++ b/bsd/dev/i386/sysctl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -36,6 +36,7 @@ #include #include #include +#include static int _i386_cpu_info SYSCTL_HANDLER_ARGS @@ -119,6 +120,7 @@ cpu_xsave SYSCTL_HANDLER_ARGS return _i386_cpu_info(oidp, ptr, arg2, req); } + static int cpu_features SYSCTL_HANDLER_ARGS { @@ -280,6 +282,26 @@ misc_interrupt_latency_max(__unused struct sysctl_oid *oidp, __unused void *arg1 return error; } +/* + * Triggers a machine-check exception - for a suitably configured kernel only. + */ +extern void mca_exception_panic(void); +static int +misc_machine_check_panic(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int changed = 0, error; + char buf[128]; + buf[0] = '\0'; + + error = sysctl_io_string(req, buf, sizeof(buf), 0, &changed); + + if (error == 0 && changed) { + mca_exception_panic(); + } + return error; +} + + SYSCTL_NODE(_machdep, OID_AUTO, cpu, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "CPU info"); @@ -660,6 +682,15 @@ SYSCTL_PROC(_machdep_cpu, OID_AUTO, ucupdate, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED, 0, 0, cpu_ucode_update, "S", "Microcode update interface"); +static const uint32_t apic_timer_vector = (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_TIMER_INTERRUPT); +static const uint32_t apic_IPI_vector = (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_INTERPROCESSOR_INTERRUPT); + +SYSCTL_NODE(_machdep, OID_AUTO, vectors, CTLFLAG_RD | CTLFLAG_LOCKED, 0, + "Interrupt vector assignments"); + +SYSCTL_UINT (_machdep_vectors, OID_AUTO, timer, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (uint32_t *)&apic_timer_vector, 0, ""); +SYSCTL_UINT (_machdep_vectors, OID_AUTO, IPI, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (uint32_t *)&apic_IPI_vector, 0, ""); + uint64_t pmap_pv_hashlist_walks; uint64_t pmap_pv_hashlist_cnts; uint32_t pmap_pv_hashlist_max; @@ -709,6 +740,13 @@ SYSCTL_PROC(_machdep_misc, OID_AUTO, panic_restart_timeout, 0, 0, panic_set_restart_timeout, "I", "Panic restart timeout in seconds"); -SYSCTL_PROC(_machdep_misc, OID_AUTO, interrupt_latency_max, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_PROC(_machdep_misc, OID_AUTO, interrupt_latency_max, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, misc_interrupt_latency_max, "A", "Maximum Interrupt latency"); + +SYSCTL_PROC(_machdep_misc, OID_AUTO, machine_check_panic, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, + misc_machine_check_panic, "A", "Machine-check exception test"); + diff --git a/bsd/dev/i386/systemcalls.c b/bsd/dev/i386/systemcalls.c index 7a849ca31..e8494ca4e 100644 --- a/bsd/dev/i386/systemcalls.c +++ b/bsd/dev/i386/systemcalls.c @@ -167,8 +167,9 @@ 
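The misc_machine_check_panic handler added above follows a common xnu pattern: a string sysctl whose only purpose is to trigger an action when written. A hedged stand-alone sketch of that shape; sysctl_io_string's kernel behavior is paraphrased and do_action stands in for mca_exception_panic:

#include <string.h>
#include <stdio.h>

static void do_action(void) { printf("triggered\n"); }   /* mca_exception_panic() stand-in */

/* paraphrase of sysctl_io_string(): copy in any new value and report
 * whether one was supplied; newval == NULL models a read-only query */
static int io_string(const char *newval, char *buf, size_t len, int *changed)
{
    *changed = 0;
    if (newval != NULL) {
        strncpy(buf, newval, len - 1);
        buf[len - 1] = '\0';
        *changed = 1;
    }
    return 0;
}

static int machine_check_handler(const char *newval)
{
    char buf[128] = "";
    int changed = 0, error;

    error = io_string(newval, buf, sizeof(buf), &changed);
    if (error == 0 && changed)
        do_action();          /* any successful write fires the action */
    return error;
}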
unix_syscall(x86_saved_state_t *state) if (__probable(code != 180)) { int *ip = (int *)vt; - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - *ip, *(ip+1), *(ip+2), *(ip+3), 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, + *ip, *(ip+1), *(ip+2), *(ip+3), 0); } mungerp = callp->sy_arg_munge32; @@ -182,7 +183,8 @@ unix_syscall(x86_saved_state_t *state) if (mungerp != NULL) (*mungerp)(NULL, vt); } else - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, 0, 0, 0, 0, 0); /* @@ -256,8 +258,9 @@ unix_syscall(x86_saved_state_t *state) throttle_lowpri_io(TRUE); } if (__probable(code != 180)) - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, + error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) { pal_execve_return(thread); @@ -328,8 +331,9 @@ unix_syscall64(x86_saved_state_t *state) if (code != 180) { uint64_t *ip = (uint64_t *)uargp; - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - (int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, + (int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0); } assert(callp->sy_narg <= 8); @@ -354,8 +358,9 @@ unix_syscall64(x86_saved_state_t *state) goto unsafe; } } else - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - 0, 0, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, + 0, 0, 0, 0, 0); unsafe: /* @@ -453,8 +458,9 @@ unsafe: throttle_lowpri_io(TRUE); } if (__probable(code != 180)) - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, + error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); thread_exception_return(); /* NOTREACHED */ @@ -599,8 +605,9 @@ unix_syscall_return(int error) throttle_lowpri_io(TRUE); } if (code != 180) - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, + error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); thread_exception_return(); /* NOTREACHED */ diff --git a/bsd/dev/i386/unix_signal.c b/bsd/dev/i386/unix_signal.c index 4292d6515..4f31f83e5 100644 --- a/bsd/dev/i386/unix_signal.c +++ b/bsd/dev/i386/unix_signal.c @@ -102,9 +102,10 @@ struct sigframe32 { /* * NOTE: Source and target may *NOT* overlap! 
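The systemcalls.c hunks keep the existing bracketing structure while switching to the KERNEL_DEBUG_CONSTANT_IST wrapper: one event at syscall entry carrying the first four arguments, one at exit carrying the error and return values, with syscall 180 (kdebug_trace in this syscall table) excluded so the tracer never traces itself. An illustrative sketch of that bracket; emit_event stands in for the kdebug macro and the debugid construction is not the real BSDDBG_CODE encoding:

#include <stdint.h>
#include <stdio.h>

#define DBG_FUNC_START 1
#define DBG_FUNC_END   2

static void emit_event(uint32_t debugid, uintptr_t a, uintptr_t b,
                       uintptr_t c, uintptr_t d)
{
    printf("kdebug 0x%08x: %lx %lx %lx %lx\n", debugid, (unsigned long)a,
           (unsigned long)b, (unsigned long)c, (unsigned long)d);
}

static int traced_syscall(int code, uintptr_t args[4], int (*fn)(uintptr_t *))
{
    if (code != 180)    /* 180 == kdebug_trace: never trace the tracer */
        emit_event(((uint32_t)code << 2) | DBG_FUNC_START,
                   args[0], args[1], args[2], args[3]);

    int error = fn(args);

    if (code != 180)
        emit_event(((uint32_t)code << 2) | DBG_FUNC_END,
                   (uintptr_t)error, 0, 0, 0);
    return error;
}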
+ * XXX: Unify with bsd/kern/kern_exit.c */ static void -siginfo_user_to_user32(user_siginfo_t *in, user32_siginfo_t *out) +siginfo_user_to_user32_x86(user_siginfo_t *in, user32_siginfo_t *out) { out->si_signo = in->si_signo; out->si_errno = in->si_errno; @@ -120,7 +121,7 @@ siginfo_user_to_user32(user_siginfo_t *in, user32_siginfo_t *out) } static void -siginfo_user_to_user64(user_siginfo_t *in, user64_siginfo_t *out) +siginfo_user_to_user64_x86(user_siginfo_t *in, user64_siginfo_t *out) { out->si_signo = in->si_signo; out->si_errno = in->si_errno; @@ -521,7 +522,7 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint bzero((caddr_t)&sinfo64_user64, sizeof(sinfo64_user64)); - siginfo_user_to_user64(&sinfo64,&sinfo64_user64); + siginfo_user_to_user64_x86(&sinfo64,&sinfo64_user64); #if CONFIG_DTRACE bzero((caddr_t)&(ut->t_dtrace_siginfo), sizeof(ut->t_dtrace_siginfo)); @@ -560,7 +561,7 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint bzero((caddr_t)&sinfo32, sizeof(sinfo32)); - siginfo_user_to_user32(&sinfo64,&sinfo32); + siginfo_user_to_user32_x86(&sinfo64,&sinfo32); #if CONFIG_DTRACE bzero((caddr_t)&(ut->t_dtrace_siginfo), sizeof(ut->t_dtrace_siginfo)); diff --git a/bsd/dev/memdev.c b/bsd/dev/memdev.c index c425c7e08..58fecce01 100644 --- a/bsd/dev/memdev.c +++ b/bsd/dev/memdev.c @@ -113,7 +113,7 @@ static int mdevrw(dev_t dev, struct uio *uio, int ioflag); static char * nonspace(char *pos, char *end); static char * getspace(char *pos, char *end); -static char * cvtnum(char *pos, char *end, unsigned int *num); +static char * cvtnum(char *pos, char *end, uint64_t *num); #endif /* CONFIG_MEMDEV_INSECURE */ @@ -436,8 +436,8 @@ void mdevinit(__unused int the_cnt) { #ifdef CONFIG_MEMDEV_INSECURE int devid, phys; - ppnum_t base; - unsigned int size; + uint64_t base; + uint64_t size; char *ba, *lp; dev_t dev; @@ -476,7 +476,7 @@ void mdevinit(__unused int the_cnt) { if((ba[0] != ' ') && (ba[0] != 0)) continue; /* End must be null or space */ } - dev = mdevadd(devid, base >> 12, size >> 12, phys); /* Go add the device */ + dev = mdevadd(devid, base >> 12, (unsigned)size >> 12, phys); /* Go add the device */ } #endif /* CONFIG_MEMDEV_INSECURE */ @@ -509,7 +509,7 @@ char *getspace(char *pos, char *end) { /* Find next non-space in string */ } } -char *cvtnum(char *pos, char *end, unsigned int *num) { /* Convert to a number */ +char *cvtnum(char *pos, char *end, uint64_t *num) { /* Convert to a number */ int rad, dig; diff --git a/bsd/dev/random/randomdev.c b/bsd/dev/random/randomdev.c index c29e9f877..c081bd209 100644 --- a/bsd/dev/random/randomdev.c +++ b/bsd/dev/random/randomdev.c @@ -56,7 +56,6 @@ #include #include -#include #include #include @@ -102,14 +101,13 @@ static struct cdevsw random_cdevsw = /* Used to detect whether we've already been initialized */ -static UInt8 gRandomInstalled = 0; +static int gRandomInstalled = 0; static PrngRef gPrngRef; static int gRandomError = 1; static lck_grp_t *gYarrowGrp; static lck_attr_t *gYarrowAttr; static lck_grp_attr_t *gYarrowGrpAttr; static lck_mtx_t *gYarrowMutex = 0; -static UInt8 gYarrowInitializationLock = 0; #define RESEED_TICKS 50 /* how long a reseed operation can take */ @@ -309,27 +307,6 @@ PreliminarySetup(void) { prng_error_status perr; - /* Multiple threads can enter this as a result of an earlier - * check of gYarrowMutex. We make sure that only one of them - * can enter at a time. 
If one of them enters and discovers - * that gYarrowMutex is no longer NULL, we know that another - * thread has initialized the Yarrow state and we can exit. - */ - - /* The first thread that enters this function will find - * gYarrowInitializationLock set to 0. It will atomically - * set the value to 1 and, seeing that it was zero, drop - * out of the loop. Other threads will see that the value is - * 1 and continue to loop until we are initialized. - */ - - while (OSTestAndSet(0, &gYarrowInitializationLock)); /* serialize access to this function */ - - if (gYarrowMutex) { - /* we've already been initialized, clear and get out */ - goto function_exit; - } - /* create a Yarrow object */ perr = prngInitialize(&gPrngRef); if (perr != 0) { @@ -344,8 +321,6 @@ PreliminarySetup(void) char buffer [16]; /* get a little non-deterministic data as an initial seed. */ - /* On OSX, securityd will add much more entropy as soon as it */ - /* comes up. On iOS, entropy is added with each system interrupt. */ microtime(&tt); /* @@ -359,7 +334,7 @@ PreliminarySetup(void) if (perr != 0) { /* an error, complain */ printf ("Couldn't seed Yarrow.\n"); - goto function_exit; + return; } /* turn the data around */ @@ -375,10 +350,6 @@ PreliminarySetup(void) gYarrowMutex = lck_mtx_alloc_init(gYarrowGrp, gYarrowAttr); fips_initialize (); - -function_exit: - /* allow other threads to figure out whether or not we have been initialized. */ - gYarrowInitializationLock = 0; } const Block kKnownAnswer = {0x92, 0xb4, 0x04, 0xe5, 0x56, 0x58, 0x8c, 0xed, 0x6c, 0x1a, 0xcd, 0x4e, 0xbf, 0x05, 0x3f, 0x68, 0x09, 0xf7, 0x3a, 0x93}; @@ -413,11 +384,14 @@ random_init(void) { int ret; - if (OSTestAndSet(0, &gRandomInstalled)) { - /* do this atomically so that it works correctly with - multiple threads */ + if (gRandomInstalled) return; - } + + /* install us in the file system */ + gRandomInstalled = 1; + + /* setup yarrow and the mutex */ + PreliminarySetup(); ret = cdevsw_add(RANDOM_MAJOR, &random_cdevsw); if (ret < 0) { @@ -435,9 +409,6 @@ random_init(void) */ devfs_make_node(makedev (ret, 1), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666, "urandom", 0); - - /* setup yarrow and the mutex if needed*/ - PreliminarySetup(); } int diff --git a/bsd/dev/unix_startup.c b/bsd/dev/unix_startup.c index f167a1752..a1d8f5200 100644 --- a/bsd/dev/unix_startup.c +++ b/bsd/dev/unix_startup.c @@ -157,11 +157,7 @@ bsd_startupearly(void) #if SOCKETS { -#if CONFIG_USESOCKTHRESHOLD - static const unsigned int maxspace = 64 * 1024; -#else static const unsigned int maxspace = 128 * 1024; -#endif int scale; nmbclusters = bsd_mbuf_cluster_reserve(NULL) / MCLBYTES; @@ -303,7 +299,6 @@ done: #if defined(__LP64__) extern int tcp_tcbhashsize; extern int max_cached_sock_count; -void IOSleep(int); #endif diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h index 114fcecc7..ec8db3864 100644 --- a/bsd/hfs/hfs.h +++ b/bsd/hfs/hfs.h @@ -72,6 +72,11 @@ #include #endif +#if CONFIG_PROTECT +/* Forward declare the cprotect struct */ +struct cprotect; +#endif + /* * Just reported via MIG interface. 
*/ @@ -144,6 +149,7 @@ typedef struct hfsmount { /* Physical Description */ u_int32_t hfs_logical_block_size; /* Logical block size of the disk as reported by ioctl(DKIOCGETBLOCKSIZE), always a multiple of 512 */ daddr64_t hfs_logical_block_count; /* Number of logical blocks on the disk */ + u_int64_t hfs_logical_bytes; /* Number of bytes on the disk device this HFS is mounted on (blockcount * blocksize) */ daddr64_t hfs_alt_id_sector; /* location of alternate VH/MDB */ u_int32_t hfs_physical_block_size; /* Physical block size of the disk as reported by ioctl(DKIOCGETPHYSICALBLOCKSIZE) */ u_int32_t hfs_log_per_phys; /* Number of logical blocks per physical block size */ @@ -320,6 +326,11 @@ typedef struct hfsmount { u_int32_t hfs_resize_blocksmoved; u_int32_t hfs_resize_totalblocks; u_int32_t hfs_resize_progress; +#if CONFIG_PROTECT + struct cprotect *hfs_resize_cpentry; + u_int16_t hfs_running_cp_major_vers; +#endif + /* Per mount cnode hash variables: */ lck_mtx_t hfs_chash_mutex; /* protects access to cnode hash table */ @@ -438,9 +449,11 @@ enum privdirtype {FILE_HARDLINKS, DIR_HARDLINKS}; */ #define HFS_RDONLY_DOWNGRADE 0x80000 #define HFS_DID_CONTIG_SCAN 0x100000 +#define HFS_UNMAP 0x200000 #define HFS_SSD 0x400000 + /* Macro to update next allocation block in the HFS mount structure. If * the HFS_SKIP_UPDATE_NEXT_ALLOCATION is set, do not update * nextAllocation block. @@ -586,10 +599,10 @@ int hfs_vnop_readdirattr(struct vnop_readdirattr_args *); /* in hfs_attrlist.c int hfs_vnop_inactive(struct vnop_inactive_args *); /* in hfs_cnode.c */ int hfs_vnop_reclaim(struct vnop_reclaim_args *); /* in hfs_cnode.c */ + int hfs_set_backingstore (struct vnode *vp, int val); /* in hfs_cnode.c */ int hfs_is_backingstore (struct vnode *vp, int *val); /* in hfs_cnode.c */ - int hfs_vnop_link(struct vnop_link_args *); /* in hfs_link.c */ int hfs_vnop_lookup(struct vnop_lookup_args *); /* in hfs_lookup.c */ diff --git a/bsd/hfs/hfs_attrlist.c b/bsd/hfs/hfs_attrlist.c index 3f1e6da64..2cb73b6b5 100644 --- a/bsd/hfs/hfs_attrlist.c +++ b/bsd/hfs/hfs_attrlist.c @@ -139,7 +139,7 @@ hfs_vnop_readdirattr(ap) return (EINVAL); } - if (VTOC(dvp)->c_flags & UF_COMPRESSED) { + if (VTOC(dvp)->c_bsdflags & UF_COMPRESSED) { int compressed = hfs_file_is_compressed(VTOC(dvp), 0); /* 0 == take the cnode lock */ if (!compressed) { diff --git a/bsd/hfs/hfs_attrlist.h b/bsd/hfs/hfs_attrlist.h index c40ba1e56..cb72bce1e 100644 --- a/bsd/hfs/hfs_attrlist.h +++ b/bsd/hfs/hfs_attrlist.h @@ -65,9 +65,23 @@ struct attrblock { ATTR_CMN_FLAGS | ATTR_CMN_USERACCESS | \ ATTR_CMN_FILEID | ATTR_CMN_PARENTID ) +#define HFS_ATTR_CMN_SEARCH_VALID \ + (ATTR_CMN_NAME | ATTR_CMN_OBJID | \ + ATTR_CMN_PAROBJID | ATTR_CMN_CRTIME | \ + ATTR_CMN_MODTIME | ATTR_CMN_CHGTIME | \ + ATTR_CMN_ACCTIME | ATTR_CMN_BKUPTIME | \ + ATTR_CMN_FNDRINFO | ATTR_CMN_OWNERID | \ + ATTR_CMN_GRPID | ATTR_CMN_ACCESSMASK | \ + ATTR_CMN_FILEID | ATTR_CMN_PARENTID ) + + + #define HFS_ATTR_DIR_VALID \ (ATTR_DIR_LINKCOUNT | ATTR_DIR_ENTRYCOUNT | ATTR_DIR_MOUNTSTATUS) +#define HFS_ATTR_DIR_SEARCH_VALID \ + (ATTR_DIR_ENTRYCOUNT) + #define HFS_ATTR_FILE_VALID \ (ATTR_FILE_LINKCOUNT |ATTR_FILE_TOTALSIZE | \ ATTR_FILE_ALLOCSIZE | ATTR_FILE_IOBLOCKSIZE | \ @@ -75,6 +89,9 @@ struct attrblock { ATTR_FILE_DATALENGTH | ATTR_FILE_DATAALLOCSIZE | \ ATTR_FILE_RSRCLENGTH | ATTR_FILE_RSRCALLOCSIZE) +#define HFS_ATTR_FILE_SEARCH_VALID \ + (ATTR_FILE_DATALENGTH | ATTR_FILE_DATAALLOCSIZE | \ + ATTR_FILE_RSRCLENGTH | ATTR_FILE_RSRCALLOCSIZE ) extern int hfs_attrblksize(struct attrlist 
*attrlist); diff --git a/bsd/hfs/hfs_btreeio.c b/bsd/hfs/hfs_btreeio.c index f69c780c0..7e5182cd9 100644 --- a/bsd/hfs/hfs_btreeio.c +++ b/bsd/hfs/hfs_btreeio.c @@ -865,6 +865,18 @@ again: lck_mtx_unlock(&hfsmp->hfs_mutex); (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); + + if (intrans) { + hfs_end_transaction(hfsmp); + intrans = 0; + } + + /* Initialize the vnode for virtual attribute data file */ + result = init_attrdata_vnode(hfsmp); + if (result) { + printf("hfs_create_attr_btree: init_attrdata_vnode() error=%d\n", result); + } + exit: if (vp) { hfs_unlock(VTOC(vp)); diff --git a/bsd/hfs/hfs_catalog.c b/bsd/hfs/hfs_catalog.c index 673adbb91..bc2e5959e 100644 --- a/bsd/hfs/hfs_catalog.c +++ b/bsd/hfs/hfs_catalog.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -575,7 +575,10 @@ cat_lookupmangled(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, cnid_t fileID; u_int32_t prefixlen; int result; - int extlen1, extlen2; + u_int8_t utf8[NAME_MAX + 1]; + u_int32_t utf8len; + u_int16_t unicode[kHFSPlusMaxFileNameChars + 1]; + size_t unicodelen; if (wantrsrc) return (ENOENT); @@ -598,19 +601,26 @@ cat_lookupmangled(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, if (descp->cd_parentcnid != outdescp->cd_parentcnid) goto falsematch; - if (((u_int16_t)outdescp->cd_namelen < prefixlen) || - bcmp(outdescp->cd_nameptr, descp->cd_nameptr, prefixlen-6) != 0) - goto falsematch; - - extlen1 = CountFilenameExtensionChars(descp->cd_nameptr, descp->cd_namelen); - extlen2 = CountFilenameExtensionChars(outdescp->cd_nameptr, outdescp->cd_namelen); - if (extlen1 != extlen2) + /* + * Compare the mangled version of file name looked up from the + * disk with the mangled name provided by the user. Note that + * this comparison is case-sensitive, which should be fine + * since we're trying to prevent user space from constructing + * a mangled name that differs from the one they'd get from the + * file system. + */ + result = utf8_decodestr(outdescp->cd_nameptr, outdescp->cd_namelen, + unicode, &unicodelen, sizeof(unicode), ':', 0); + if (result) { goto falsematch; - - if (bcmp(outdescp->cd_nameptr + (outdescp->cd_namelen - extlen2), - descp->cd_nameptr + (descp->cd_namelen - extlen1), - extlen1) != 0) + } + result = ConvertUnicodeToUTF8Mangled(unicodelen, unicode, + sizeof(utf8), &utf8len, utf8, fileID); + if ((result != 0) || + ((u_int16_t)descp->cd_namelen != utf8len) || + (bcmp(descp->cd_nameptr, utf8, utf8len) != 0)) { goto falsematch; + } return (0); @@ -657,6 +667,7 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int flags, u_int32_t h if (!std_hfs) { parentid = keyp->hfsPlus.parentID; } + encoding = getencoding(recp); hint = iterator->hint.nodeNum; @@ -707,7 +718,7 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int flags, u_int32_t h /* Update the inode number for this hard link */ attrp->ca_linkref = ilink; } - + /* * Set kHFSHasLinkChainBit for hard links, and reset it for all * other items. Also set linkCount to 1 for regular files. @@ -728,12 +739,10 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int flags, u_int32_t h } } else { /* Make sure that this non-hard link (regular) record is not - * an inode record or a valid hard link being that is not - * resolved for volume resize purposes. We do not want to - * reset the hard link bit or reset link count on these records. 
+ * an inode record that was looked up and we do not end up + * resetting the hard link bit on it. */ - if (!(flags & HFS_LOOKUP_HARDLINK) && - (parentid != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && + if ((parentid != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && (parentid != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid)) { /* This is not a hard link or inode and the link count bit was set */ if (attrp->ca_recflags & kHFSHasLinkChainMask) { @@ -1111,10 +1120,11 @@ cat_rename ( * When moving a directory, make sure it's a valid move. */ if (directory && (from_cdp->cd_parentcnid != to_cdp->cd_parentcnid)) { - struct BTreeIterator iterator; + struct BTreeIterator *dir_iterator = NULL; + cnid_t cnid = from_cdp->cd_cnid; cnid_t pathcnid = todir_cdp->cd_parentcnid; - + /* First check the obvious ones */ if (cnid == fsRtDirID || cnid == to_cdp->cd_parentcnid || @@ -1122,25 +1132,33 @@ cat_rename ( result = EINVAL; goto exit; } - bzero(&iterator, sizeof(iterator)); + /* now allocate the dir_iterator */ + MALLOC (dir_iterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); + if (dir_iterator == NULL) { + return ENOMEM; + } + bzero(dir_iterator, sizeof(*dir_iterator)); + /* * Traverse destination path all the way back to the root * making sure that source directory is not encountered. * */ while (pathcnid > fsRtDirID) { - buildthreadkey(pathcnid, std_hfs, - (CatalogKey *)&iterator.key); - result = BTSearchRecord(fcb, &iterator, &btdata, - &datasize, NULL); - if (result) goto exit; - + buildthreadkey(pathcnid, std_hfs, (CatalogKey *)&dir_iterator->key); + result = BTSearchRecord(fcb, dir_iterator, &btdata, &datasize, NULL); + if (result) { + FREE(dir_iterator, M_TEMP); + goto exit; + } pathcnid = getparentcnid(recp); if (pathcnid == cnid || pathcnid == 0) { result = EINVAL; + FREE(dir_iterator, M_TEMP); goto exit; } } + FREE(dir_iterator, M_TEMP); } /* @@ -1783,7 +1801,7 @@ catrec_update(const CatalogKey *ckp, CatalogRecord *crp, struct update_state *st * a hardlink. In this case, update the linkcount from the cat_attr passed in.
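The cat_rename hunk above moves its struct BTreeIterator off the kernel stack and onto the heap; BTreeIterator is a large structure, and the hfs_cnode_teardown hunk later in this patch gives its cat_lookup_buffer the same treatment. Reduced to a sketch (MALLOC/FREE with M_TEMP exactly as the hunk uses them; the BTreeIterator type and its header path are assumptions from this tree, and the real code must also FREE on every early goto):

    #include <sys/errno.h>
    #include <sys/malloc.h>
    #include <sys/systm.h>    /* bzero */
    /* struct BTreeIterator: hfscommon/headers/BTreesInternal.h in this tree */

    static int
    btree_probe_with_heap_iterator(void)
    {
        struct BTreeIterator *it = NULL;
        int result = 0;

        /* allocate and zero the iterator instead of declaring it on the stack */
        MALLOC(it, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK);
        if (it == NULL) {
            return ENOMEM;
        }
        bzero(it, sizeof(*it));

        /* ... buildthreadkey() into it->key, BTSearchRecord() against it ... */

        FREE(it, M_TEMP);
        return result;
    }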
*/ if ((descp->cd_cnid != attrp->ca_fileid) || (attrp->ca_linkcount > 1 ) || - (file->hl_linkCount > 1)) { + (file->hl_linkCount > 1)) { file->hl_linkCount = attrp->ca_linkcount; } } @@ -1966,7 +1984,7 @@ cat_update_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t prevli /* Create an iterator for use by us temporarily */ MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); bzero(iterator, sizeof(*iterator)); - + result = getkey(hfsmp, linkfileid, (CatalogKey *)&iterator->key); if (result == 0) { result = BTUpdateRecord(fcb, iterator, (IterateCallBackProcPtr)update_siblinglinks_callback, &state); @@ -1974,7 +1992,7 @@ cat_update_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t prevli } else { printf("hfs: cat_update_siblinglinks: couldn't resolve cnid %d\n", linkfileid); } - + FREE (iterator, M_TEMP); return MacToVFSError(result); } @@ -2041,16 +2059,13 @@ cat_lookup_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevl /* Create an iterator for use by us temporarily */ MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); bzero(iterator, sizeof(*iterator)); - - + if ((result = getkey(hfsmp, linkfileid, (CatalogKey *)&iterator->key))) { - printf("hfs: cat_lookup_siblinglinks: getkey for %d failed %d\n", linkfileid, result); goto exit; } BDINIT(btdata, &file); if ((result = BTSearchRecord(fcb, iterator, &btdata, NULL, NULL))) { - printf("hfs: cat_lookup_siblinglinks: cannot find %d\n", linkfileid); goto exit; } /* The prev/next chain is only valid when kHFSHasLinkChainMask is set. */ @@ -2737,7 +2752,7 @@ typedef struct linkinfo linkinfo_t; /* State information for the getdirentries_callback function. */ struct packdirentry_state { - int cbs_extended; + int cbs_flags; /* VNODE_READDIR_* flags */ u_int32_t cbs_parentID; u_int32_t cbs_index; uio_t cbs_uio; @@ -2814,7 +2829,7 @@ getdirentries_callback(const CatalogKey *ckp, const CatalogRecord *crp, * especially since it's closer to the return of this function. */ - if (state->cbs_extended) { + if (state->cbs_flags & VNODE_READDIR_EXTENDED) { /* The last record has not been returned yet, so we * want to stop after packing the last item */ @@ -2832,16 +2847,26 @@ getdirentries_callback(const CatalogKey *ckp, const CatalogRecord *crp, } } - if (state->cbs_extended) { + if (state->cbs_flags & VNODE_READDIR_EXTENDED) { entry = state->cbs_direntry; nameptr = (u_int8_t *)&entry->d_name[0]; - maxnamelen = NAME_MAX; + if (state->cbs_flags & VNODE_READDIR_NAMEMAX) { + /* + * The NFS server sometimes needs to make filenames fit in + * NAME_MAX bytes (since its client may not be able to + * handle a longer name). In that case, NFS will ask us + * to mangle the name to keep it short enough. + */ + maxnamelen = NAME_MAX; + } else { + maxnamelen = sizeof(entry->d_name); + } } else { nameptr = (u_int8_t *)&catent.d_name[0]; - maxnamelen = NAME_MAX; + maxnamelen = sizeof(catent.d_name); } - if (state->cbs_extended && stop_after_pack) { + if ((state->cbs_flags & VNODE_READDIR_EXTENDED) && stop_after_pack) { /* The last item returns a non-zero invalid cookie */ cnid = INT_MAX; } else { @@ -2951,7 +2976,7 @@ encodestr: } } - if (state->cbs_extended) { + if (state->cbs_flags & VNODE_READDIR_EXTENDED) { /* * The index is 1 relative and includes "." and ".." 
* @@ -2983,7 +3008,7 @@ encodestr: return (0); /* stop */ } - if (!state->cbs_extended || state->cbs_hasprevdirentry) { + if (!(state->cbs_flags & VNODE_READDIR_EXTENDED) || state->cbs_hasprevdirentry) { state->cbs_result = uiomove(uioaddr, uiosize, state->cbs_uio); if (state->cbs_result == 0) { ++state->cbs_index; @@ -3047,7 +3072,7 @@ encodestr: } /* Fill the direntry to be used the next time */ - if (state->cbs_extended) { + if (state->cbs_flags & VNODE_READDIR_EXTENDED) { if (stop_after_pack) { state->cbs_eof = true; return (0); /* stop */ @@ -3167,8 +3192,8 @@ getdirentries_std_callback(const CatalogKey *ckp, const CatalogRecord *crp, * Pack a uio buffer with directory entries from the catalog */ int -cat_getdirentries(struct hfsmount *hfsmp, int entrycnt, directoryhint_t *dirhint, - uio_t uio, int extended, int * items, int * eofflag) +cat_getdirentries(struct hfsmount *hfsmp, u_int32_t entrycnt, directoryhint_t *dirhint, + uio_t uio, int flags, int * items, int * eofflag) { FCB* fcb; BTreeIterator * iterator; @@ -3180,7 +3205,10 @@ cat_getdirentries(struct hfsmount *hfsmp, int entrycnt, directoryhint_t *dirhint int result; int index; int have_key; - + int extended; + + extended = flags & VNODE_READDIR_EXTENDED; + if (extended && (hfsmp->hfs_flags & HFS_STANDARD)) { return (ENOTSUP); } @@ -3189,7 +3217,7 @@ cat_getdirentries(struct hfsmount *hfsmp, int entrycnt, directoryhint_t *dirhint /* * Get a buffer for link info array, btree iterator and a direntry: */ - maxlinks = MIN(entrycnt, uio_resid(uio) / SMALL_DIRENTRY_SIZE); + maxlinks = MIN(entrycnt, (u_int32_t)(uio_resid(uio) / SMALL_DIRENTRY_SIZE)); bufsize = MAXPATHLEN + (maxlinks * sizeof(linkinfo_t)) + sizeof(*iterator); if (extended) { bufsize += 2*sizeof(struct direntry); @@ -3197,7 +3225,7 @@ cat_getdirentries(struct hfsmount *hfsmp, int entrycnt, directoryhint_t *dirhint MALLOC(buffer, void *, bufsize, M_TEMP, M_WAITOK); bzero(buffer, bufsize); - state.cbs_extended = extended; + state.cbs_flags = flags; state.cbs_hasprevdirentry = false; state.cbs_previlinkref = 0; state.cbs_nlinks = 0; @@ -3323,7 +3351,7 @@ cat_getdirentries(struct hfsmount *hfsmp, int entrycnt, directoryhint_t *dirhint * dummy values to copy the last directory entry stored in * packdirentry_state */ - if (state.cbs_extended && (result == fsBTRecordNotFoundErr)) { + if (extended && (result == fsBTRecordNotFoundErr)) { CatalogKey ckp; CatalogRecord crp; diff --git a/bsd/hfs/hfs_catalog.h b/bsd/hfs/hfs_catalog.h index e8574e17d..13aa86513 100644 --- a/bsd/hfs/hfs_catalog.h +++ b/bsd/hfs/hfs_catalog.h @@ -323,7 +323,7 @@ extern int cat_update ( struct hfsmount *hfsmp, extern int cat_getdirentries( struct hfsmount *hfsmp, - int entrycnt, + u_int32_t entrycnt, directoryhint_t *dirhint, uio_t uio, int extended, diff --git a/bsd/hfs/hfs_chash.c b/bsd/hfs/hfs_chash.c index 13c58bf5b..2910c5612 100644 --- a/bsd/hfs/hfs_chash.c +++ b/bsd/hfs/hfs_chash.c @@ -202,12 +202,12 @@ loop: */ if (!allow_deleted) { if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { - if (!skiplock) - hfs_unlock(cp); + if (!skiplock) { + hfs_unlock(cp); + } vnode_put(vp); - return (NULL); - } + } } return (vp); } @@ -342,12 +342,12 @@ loop_with_lock: goto loop; } if (ncp) { - /* + /* * someone else won the race to create * this cnode and add it to the hash * just dump our allocation */ - FREE_ZONE(ncp, sizeof(struct cnode), M_HFSNODE); + FREE_ZONE(ncp, sizeof(struct cnode), M_HFSNODE); ncp = NULL; } @@ -376,9 +376,8 @@ loop_with_lock: vnode_put(vp); } else { hfs_chash_lock_spin(hfsmp); - 
CLR(cp->c_hflag, H_ATTACH); + CLR(cp->c_hflag, H_ATTACH); *hflags &= ~H_ATTACH; - if (ISSET(cp->c_hflag, H_WAITING)) { CLR(cp->c_hflag, H_WAITING); wakeup((caddr_t)cp); @@ -403,7 +402,8 @@ loop_with_lock: if (ncp == NULL) { hfs_chash_unlock(hfsmp); - MALLOC_ZONE(ncp, struct cnode *, sizeof(struct cnode), M_HFSNODE, M_WAITOK); + + MALLOC_ZONE(ncp, struct cnode *, sizeof(struct cnode), M_HFSNODE, M_WAITOK); /* * since we dropped the chash lock, * we need to go back and re-verify diff --git a/bsd/hfs/hfs_cnode.c b/bsd/hfs/hfs_cnode.c index 016df24e0..970f2648e 100644 --- a/bsd/hfs/hfs_cnode.c +++ b/bsd/hfs/hfs_cnode.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2008 Apple Inc. All rights reserved. + * Copyright (c) 2002-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -59,11 +60,12 @@ static void hfs_reclaim_cnode(struct cnode *); static int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim); static int hfs_isordered(struct cnode *, struct cnode *); +extern int hfs_removefile_callback(struct buf *bp, void *hfsmp); + __inline__ int hfs_checkdeleted (struct cnode *cp) { return ((cp->c_flag & (C_DELETED | C_NOEXISTS)) ? ENOENT : 0); } - /* * Function used by a special fcntl() that decorates a cnode/vnode that * indicates it is backing another filesystem, like a disk image. @@ -240,8 +242,35 @@ int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim) { (!ISSET(cp->c_flag, C_NOEXISTS)) && (VTOF(vp)->ff_blocks) && (reclaim == 0)) { + /* + * Note that if content protection is enabled, then this is where we will + * attempt to issue IOs for all dirty regions of this file. + * + * If we're called from hfs_vnop_inactive, all this means is at the time + * the logic for deciding to call this function, there were not any lingering + * mmap/fd references for this file. However, there is nothing preventing the system + * from creating a new reference in between the time that logic was checked + * and we entered hfs_vnop_inactive. As a result, the only time we can guarantee + * that there aren't any references is during vnop_reclaim. + */ hfs_filedone(vp, ctx); } + + /* + * We're holding the cnode lock now. Stall behind any shadow BPs that may + * be involved with this vnode if it is a symlink. We don't want to allow + * the blocks that we're about to release to be put back into the pool if there + * is pending I/O to them. + */ + if (v_type == VLNK) { + /* + * This will block if the asynchronous journal flush is in progress. + * If this symlink is not being renamed over and doesn't have any open FDs, + * then we'll remove it from the journal's bufs below in kill_block. + */ + buf_wait_for_shadow_io (vp, 0); + } + /* * Remove any directory hints or cached origins */ @@ -282,9 +311,44 @@ int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim) { if ((v_type == VREG || v_type == VLNK) && (cp->c_flag & C_DELETED) && ((forkcount == 1) || (!VNODE_IS_RSRC(vp)))) { - + + /* Start a transaction here. We're about to change file sizes */ + if (started_tr == 0) { + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + else { + started_tr = 1; + } + } + /* Truncate away our own fork data. (Case A, B, C above) */ if (VTOF(vp)->ff_blocks != 0) { + + /* + * At this point, we have decided that this cnode is + * suitable for full removal. We are about to deallocate + * its blocks and remove its entry from the catalog. 
+ * If it was a symlink, then it's possible that the operation + * which created it is still in the current transaction group + * due to coalescing. Take action here to kill the data blocks + * of the symlink out of the journal before moving to + * deallocate the blocks. We need to be in the middle of + * a transaction before calling buf_iterate like this. + * + * Note: we have to kill any potential symlink buffers out of + * the journal prior to deallocating their blocks. This is so + * that we don't race with another thread that may be doing + * an allocation concurrently and pick up these blocks. It could + * generate I/O against them which could go out ahead of our journal + * transaction. + */ + + if (hfsmp->jnl && vnode_islnk(vp)) { + buf_iterate(vp, hfs_removefile_callback, BUF_SKIP_NONLOCKED, (void *)hfsmp); + } + /* * Since we're already inside a transaction, * tell hfs_truncate to skip the ubc_setsize. @@ -303,46 +367,85 @@ int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim) { /* * Truncate away the resource fork, if we represent the data fork and * it is the last fork. That means, by definition, the rsrc fork is not in - * core. So we bring it into core, and then truncate it away. + * core. To avoid bringing a vnode into core for the sole purpose of deleting the + * data in the resource fork, we call cat_lookup directly, then hfs_release_storage + * to get rid of the resource fork's data. * * This is invoked via case A above only. */ if ((cp->c_blocks > 0) && (forkcount == 1) && (vp != cp->c_rsrc_vp)) { - struct vnode *rvp = NULLVP; + struct cat_lookup_buffer *lookup_rsrc = NULL; + struct cat_desc *desc_ptr = NULL; + lockflags = 0; + + MALLOC(lookup_rsrc, struct cat_lookup_buffer*, sizeof (struct cat_lookup_buffer), M_TEMP, M_WAITOK); + if (lookup_rsrc == NULL) { + printf("hfs_cnode_teardown: ENOMEM from MALLOC\n"); + error = ENOMEM; + goto out; + } + else { + bzero (lookup_rsrc, sizeof (struct cat_lookup_buffer)); + } + + if (cp->c_desc.cd_namelen == 0) { + /* Initialize the rsrc descriptor for lookup if necessary*/ + MAKE_DELETED_NAME (lookup_rsrc->lookup_name, HFS_TEMPLOOKUP_NAMELEN, cp->c_fileid); + + lookup_rsrc->lookup_desc.cd_nameptr = (const uint8_t*) lookup_rsrc->lookup_name; + lookup_rsrc->lookup_desc.cd_namelen = strlen (lookup_rsrc->lookup_name); + lookup_rsrc->lookup_desc.cd_parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; + lookup_rsrc->lookup_desc.cd_cnid = cp->c_cnid; + + desc_ptr = &lookup_rsrc->lookup_desc; + } + else { + desc_ptr = &cp->c_desc; + } + + lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + error = cat_lookup (hfsmp, desc_ptr, 1, (struct cat_desc *) NULL, + (struct cat_attr*) NULL, &lookup_rsrc->lookup_fork.ff_data, NULL); + + hfs_systemfile_unlock (hfsmp, lockflags); - /* - * It is safe for us to pass FALSE to the argument can_drop_lock - * on this call to hfs_vgetrsrc. We know that the resource fork does not - * exist in core, so we'll have to go to the catalog to retrieve its - * information. That will attach the resource fork vnode to our cnode. - */ - error = hfs_vgetrsrc(hfsmp, vp, &rvp, FALSE, FALSE); if (error) { + FREE (lookup_rsrc, M_TEMP); goto out; } + /* - * Defer the vnode_put and ubc_setsize on rvp until hfs_unlock(). - * - * By bringing the vnode into core above, we may force hfs_vnop_reclaim - * to only partially finish if that's what called us.
Bringing the - * resource fork into core results in a new rsrc vnode that will get - * immediately marked for termination below. It will get recycled/reclaimed - * as soon as possible, but that could cause another round of inactive and reclaim. + * Make the filefork in our temporary struct look like a real + * filefork. Fill in the cp, sysfileinfo and rangelist fields.. + */ + rl_init (&lookup_rsrc->lookup_fork.ff_invalidranges); + lookup_rsrc->lookup_fork.ff_cp = cp; + + /* + * If there were no errors, then we have the catalog's fork information + * for the resource fork in question. Go ahead and delete the data in it now. */ - cp->c_flag |= C_NEED_RVNODE_PUT | C_NEED_RSRC_SETSIZE; - error = hfs_truncate(rvp, (off_t)0, IO_NDELAY, 1, 0, ctx); + + error = hfs_release_storage (hfsmp, NULL, &lookup_rsrc->lookup_fork, cp->c_fileid); + FREE(lookup_rsrc, M_TEMP); + if (error) { goto out; } - - /* - * Note that the following call to vnode_recycle is safe from within the - * context of hfs_vnop_inactive or hfs_vnop_reclaim. It is being invoked - * on the RSRC fork vp (which is not our current vnode) As such, we hold - * an iocount on it and vnode_recycle will just add the MARKTERM bit at this - * point. + + /* + * This fileid's resource fork extents have now been fully deleted on-disk + * and this CNID is no longer valid. At this point, we should be able to + * zero out cp->c_blocks to indicate there is no data left in this file. */ - vnode_recycle(rvp); /* all done with this vnode */ + cp->c_blocks = 0; + } + + /* End the transaction from the start of the file truncation segment */ + if (started_tr) { + hfs_end_transaction(hfsmp); + started_tr = 0; } } @@ -485,13 +588,44 @@ out: hfs_end_transaction(hfsmp); started_tr = 0; } - + +#if 0 +#if CONFIG_PROTECT + /* + * cnode truncate lock and cnode lock are both held exclusive here. + * + * Go ahead and flush the keys out if this cnode is the last fork + * and it is not class F. Class F keys should not be purged because they only + * exist in memory and have no persistent keys. Only do this + * if we haven't already done it yet (maybe a vnode skipped inactive + * and went straight to reclaim). This function gets called from both reclaim and + * inactive, so it will happen first in inactive if possible. + * + * We need to be mindful that all pending IO for this file has already been + * issued and completed before we bzero out the key. This is because + * if it isn't, tossing the key here could result in garbage IO being + * written (by using the bzero'd key) if the writes are happening asynchronously. + * + * In addition, class A files may have already been purged due to the + * lock event occurring. 
+ */ + if (forkcount == 1) { + struct cprotect *entry = cp->c_cpentry; + if ((entry) && (entry->cp_pclass != PROTECTION_CLASS_F)) { + if ((cp->c_cpentry->cp_flags & CP_KEY_FLUSHED) == 0) { + cp->c_cpentry->cp_flags |= CP_KEY_FLUSHED; + bzero (cp->c_cpentry->cp_cache_key, cp->c_cpentry->cp_cache_key_len); + bzero (cp->c_cpentry->cp_cache_iv_ctx, sizeof(aes_encrypt_ctx)); + } + } + } +#endif +#endif return error; } - /* * hfs_vnop_inactive * @@ -600,6 +734,7 @@ hfs_filedone(struct vnode *vp, vfs_context_t context) struct rl_entry *invalid_range; off_t leof; u_int32_t blks, blocksize; + /* flags for zero-filling sparse ranges */ int cluster_flags = IO_CLOSE; int cluster_zero_flags = IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE; @@ -611,6 +746,25 @@ hfs_filedone(struct vnode *vp, vfs_context_t context) if ((hfsmp->hfs_flags & HFS_READ_ONLY) || (fp->ff_blocks == 0)) return (0); +#if CONFIG_PROTECT + /* + * Figure out if we need to do synchronous IO. + * + * If the file represents a content-protected file, we may need + * to issue synchronous IO when we dispatch to the cluster layer. + * If we didn't, then the IO would go out to the disk asynchronously. + * If the vnode hits the end of inactive before getting reclaimed, the + * content protection keys would be wiped/bzeroed out, and we'd end up + * trying to issue the IO with an invalid key. This will lead to file + * corruption. IO_SYNC will force the cluster_push to wait until all IOs + * have completed (though they may be in the track cache). + */ + if (cp_fs_protected(VTOVFS(vp))) { + cluster_flags |= IO_SYNC; + cluster_zero_flags |= IO_SYNC; + } +#endif + /* * If we are being invoked from F_SWAPDATAEXTENTS, then we * need to issue synchronous IO; Unless we are sure that all @@ -654,8 +808,10 @@ hfs_filedone(struct vnode *vp, vfs_context_t context) /* * Shrink the peof to the smallest size neccessary to contain the leof. */ - if (blks < fp->ff_blocks) + if (blks < fp->ff_blocks) { (void) hfs_truncate(vp, leof, IO_NDELAY, 0, 0, context); + } + hfs_unlock(cp); (void) cluster_push(vp, cluster_flags); hfs_lock(cp, HFS_FORCE_LOCK); @@ -871,7 +1027,7 @@ hfs_getnewvnode( if (cp == NULL) { return (ENOENT); } - + /* * If we get a cnode/vnode pair out of hfs_chash_getcnode, then update the * descriptor in the cnode as needed if the cnode represents a hardlink. @@ -913,7 +1069,7 @@ hfs_getnewvnode( vnode_put (*vpp); *vpp = NULL; } - + /* * If we raced with VNOP_RECLAIM for this vnode, the hash code could * have observed it after the c_vp or c_rsrc_vp fields had been torn down; @@ -925,18 +1081,23 @@ hfs_getnewvnode( if (hflags) { hfs_chashwakeup(hfsmp, cp, hflags); } - + *out_flags = GNV_CAT_ATTRCHANGED; return ERECYCLE; } else { - /* Otherwise, CNID != fileid. Go ahead and copy in the new descriptor */ + /* + * Otherwise, CNID != fileid. Go ahead and copy in the new descriptor. + * + * Replacing the descriptor here is fine because we looked up the item without + * a vnode in hand before. If a vnode existed, its identity must be attached to this + * item. We are not susceptible to the lookup fastpath issue at this point. + */ replace_desc(cp, descp); } } } - /* Check if we found a matching vnode */ if (*vpp != NULL) { return (0); @@ -1210,8 +1371,10 @@ hfs_getnewvnode( } #if CONFIG_PROTECT - if (!issystemfile && (*out_flags & GNV_NEW_CNODE)) + /* Initialize the cp data structures. The key should be in place now. 
*/ + if (!issystemfile && (*out_flags & GNV_NEW_CNODE)) { cp_entry_init(cp, mp); + } #endif *vpp = vp; @@ -1264,7 +1427,7 @@ hfs_reclaim_cnode(struct cnode *cp) } #endif #if CONFIG_PROTECT - cp_entry_destroy(cp); + cp_entry_destroy(&cp->c_cpentry); #endif @@ -1303,7 +1466,6 @@ hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname if (dvp && cnp) { int lookup = 0; struct cat_fork fork; - bzero(&cndesc, sizeof(cndesc)); cndesc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; cndesc.cd_namelen = cnp->cn_namelen; @@ -1327,6 +1489,7 @@ hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname */ lookup = cat_lookup (hfsmp, &cndesc, 0, NULL, &attr, &fork, NULL); + if ((lookup == 0) && (cnid == attr.ca_fileid)) { stillvalid = 1; *error = 0; @@ -1394,6 +1557,7 @@ notvalid: return (stillvalid); } + /* * Per HI and Finder requirements, HFS should add in the * date/time that a particular directory entry was added @@ -1408,11 +1572,11 @@ notvalid: */ void hfs_write_dateadded (struct cat_attr *attrp, u_int32_t dateadded) { u_int8_t *finfo = NULL; - + /* overlay the FinderInfo to the correct pointer, and advance */ finfo = (u_int8_t*)attrp->ca_finderinfo; finfo = finfo + 16; - + /* * Make sure to write it out as big endian, since that's how * finder info is defined. @@ -1427,27 +1591,27 @@ void hfs_write_dateadded (struct cat_attr *attrp, u_int32_t dateadded) { else if (S_ISDIR(attrp->ca_mode)) { struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; extinfo->date_added = OSSwapHostToBigInt32(dateadded); - attrp->ca_recflags |= kHFSHasDateAddedMask; + attrp->ca_recflags |= kHFSHasDateAddedMask; } - /* If it were neither directory/file, then we'd bail out */ return; } + u_int32_t hfs_get_dateadded (struct cnode *cp) { u_int8_t *finfo = NULL; u_int32_t dateadded = 0; - + if ((cp->c_attr.ca_recflags & kHFSHasDateAddedMask) == 0) { /* Date added was never set. Return 0. */ return dateadded; } - - + + /* overlay the FinderInfo to the correct pointer, and advance */ finfo = (u_int8_t*)cp->c_finderinfo; finfo = finfo + 16; - + /* * FinderInfo is written out in big endian... make sure to convert it to host * native before we use it. @@ -1460,12 +1624,10 @@ u_int32_t hfs_get_dateadded (struct cnode *cp) { struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; dateadded = OSSwapBigToHostInt32 (extinfo->date_added); } - + return dateadded; } - - /* * Touch cnode times based on c_touch_xxx flags * @@ -1546,7 +1708,7 @@ hfs_touchtimes(struct hfsmount *hfsmp, struct cnode* cp) cp->c_flag |= C_MODIFIED; touchvol = 1; } - + if (cp->c_flag & C_NEEDS_DATEADDED) { hfs_write_dateadded (&(cp->c_attr), tv.tv_sec); cp->c_flag |= C_MODIFIED; diff --git a/bsd/hfs/hfs_cnode.h b/bsd/hfs/hfs_cnode.h index 03878ef7f..082fbb858 100644 --- a/bsd/hfs/hfs_cnode.h +++ b/bsd/hfs/hfs_cnode.h @@ -65,6 +65,30 @@ struct filefork { }; typedef struct filefork filefork_t; + +#define HFS_TEMPLOOKUP_NAMELEN 32 + +/* + * Catalog Lookup struct (runtime) + * + * This is used so that when we need to malloc a container for a catalog + * lookup operation, we can acquire memory for everything in one fell swoop + * as opposed to putting many of these objects on the stack. The cat_fork + * data structure can take up 100+bytes easily, and that can add to stack + * overhead. + * + * As a result, we use this to easily pass around the memory needed for a + * lookup operation. 
+ */ +struct cat_lookup_buffer { + struct cat_desc lookup_desc; + struct cat_attr lookup_attr; + struct filefork lookup_fork; + struct componentname lookup_cn; + char lookup_name[HFS_TEMPLOOKUP_NAMELEN]; /* for open-unlinked paths only */ +}; + + /* Aliases for common fields */ #define ff_size ff_data.cf_size #define ff_new_size ff_data.cf_new_size @@ -161,7 +185,7 @@ typedef struct cnode cnode_t; #define c_ctime c_attr.ca_ctime #define c_itime c_attr.ca_itime #define c_btime c_attr.ca_btime -#define c_flags c_attr.ca_flags +#define c_bsdflags c_attr.ca_flags #define c_finderinfo c_attr.ca_finderinfo #define c_blocks c_attr.ca_union2.cau_blocks #define c_entries c_attr.ca_union2.cau_entries @@ -192,7 +216,12 @@ typedef struct cnode cnode_t; #define C_FORCEUPDATE 0x00100 /* force the catalog entry update */ #define C_HASXATTRS 0x00200 /* cnode has extended attributes */ #define C_NEG_ENTRIES 0x00400 /* directory has negative name entries */ -#define C_SWAPINPROGRESS 0x00800 /* cnode's data is about to be swapped. Issue synchronous cluster io */ +/* + * For C_SSD_STATIC: SSDs may want to deal with the file payload data in a + * different manner knowing that the content is not likely to be modified. This is + * purely advisory at the HFS level, and is not maintained after the cnode goes out of core. + */ +#define C_SSD_STATIC 0x00800 /* Assume future writes contain static content */ #define C_NEED_DATA_SETSIZE 0x01000 /* Do a ubc_setsize(0) on c_rsrc_vp after the unlock */ #define C_NEED_RSRC_SETSIZE 0x02000 /* Do a ubc_setsize(0) on c_vp after the unlock */ @@ -202,6 +231,9 @@ typedef struct cnode cnode_t; #define C_RENAMED 0x10000 /* cnode was deleted as part of rename; C_DELETED should also be set */ #define C_NEEDS_DATEADDED 0x20000 /* cnode needs date-added written to the finderinfo bit */ #define C_BACKINGSTORE 0x40000 /* cnode is a backing store for an existing or currently-mounting filesystem */ +#define C_SWAPINPROGRESS 0x80000 /* cnode's data is about to be swapped. Issue synchronous cluster io */ + + #define ZFTIMELIMIT (5 * 60) /* @@ -318,8 +350,8 @@ extern void hfs_chash_rehash(struct hfsmount *hfsmp, struct cnode *cp1, struct extern void hfs_chashwakeup(struct hfsmount *hfsmp, struct cnode *cp, int flags); extern void hfs_chash_mark_in_transit(struct hfsmount *hfsmp, struct cnode *cp); -extern struct vnode * hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, - int skiplock, int allow_deleted); +extern struct vnode * hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, + int skiplock, int allow_deleted); extern struct cnode * hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, int wantrsrc, int skiplock, int *out_flags, int *hflags); extern int hfs_chash_snoop(struct hfsmount *, ino_t, int, int (*)(const struct cat_desc *, diff --git a/bsd/hfs/hfs_cprotect.c b/bsd/hfs/hfs_cprotect.c index 0345e4d9e..4a88d0c52 100644 --- a/bsd/hfs/hfs_cprotect.c +++ b/bsd/hfs/hfs_cprotect.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -34,14 +34,15 @@ #include #include #include +#include #include #include "hfs.h" #include "hfs_cnode.h" -#ifdef CONFIG_PROTECT +#if CONFIG_PROTECT static struct cp_wrap_func g_cp_wrap_func = {NULL, NULL}; -static struct cp_global_state g_cp_state = {0, 0}; +static struct cp_global_state g_cp_state = {0, 0, 0}; extern int (**hfs_vnodeop_p) (void *); @@ -49,24 +50,26 @@ extern int (**hfs_vnodeop_p) (void *); * CP private functions */ static int cp_is_valid_class(int); -static int cp_getxattr(cnode_t *, struct cp_xattr *); -static int cp_setxattr(cnode_t *, struct cp_xattr *, int); -static struct cprotect *cp_entry_alloc(void); -static int cp_make_keys (struct cprotect *); -static int cp_restore_keys(struct cprotect *); +static int cp_root_major_vers(mount_t mp); +static int cp_getxattr(cnode_t *, struct hfsmount *hfsmp, struct cprotect **); +static struct cprotect *cp_entry_alloc(size_t); +static void cp_entry_dealloc(struct cprotect *entry); +static int cp_setup_aes_ctx(struct cprotect *); +static int cp_make_keys (struct cprotect **, struct hfsmount *hfsmp, cnid_t, int); +static int cp_restore_keys(struct cprotect *, struct hfsmount *hfsmp); static int cp_lock_vfs_callback(mount_t, void *); static int cp_lock_vnode_callback(vnode_t, void *); static int cp_vnode_is_eligible (vnode_t); static int cp_check_access (cnode_t *, int); -static int cp_wrap(int, void *, void *); -static int cp_unwrap(int, void *, void *); +static int cp_wrap(int, struct hfsmount *hfsmp, cnid_t, struct cprotect**); +static int cp_unwrap(int, struct cprotect *); #if DEVELOPMENT || DEBUG #define CP_ASSERT(x) \ if ((x) == 0) { \ - panic("CP: failed assertion in %s", __FUNCTION__); \ + panic("Content Protection: failed assertion in %s", __FUNCTION__); \ } #else #define CP_ASSERT(x) @@ -76,10 +79,17 @@ int cp_key_store_action(int action) { g_cp_state.lock_state = action; - if (action == CP_LOCKED_STATE) - return vfs_iterate(0, cp_lock_vfs_callback, (void *)action); - else - return 0; + if (action == CP_LOCKED_STATE) { + /* + * Note that because we are using the void* arg to pass the key store + * value into the vfs cp iteration, we need to pass around the int as a ptr. + * This may silence 32-64 truncation warnings. + */ + return vfs_iterate(0, cp_lock_vfs_callback, (void*)((uintptr_t)action)); + } + + return 0; + } @@ -94,87 +104,248 @@ cp_register_wraps(cp_wrap_func_t key_store_func) return 0; } +#if 0 +/* + * If necessary, this function can be used to + * query the device's lock state. + */ +int +cp_isdevice_locked (void) { + if (g_cp_state.lock_state == CP_UNLOCKED_STATE) { + return 0; + } + return 1; +} +#endif + /* * Allocate and initialize a cprotect blob for a new cnode. - * Called from hfs_getnewcnode: cnode is locked exclusive. + * Called from hfs_getnewvnode: cnode is locked exclusive. * Read xattr data off the cnode. Then, if conditions permit, * unwrap the file key and cache it in the cprotect blob.
*/ int -cp_entry_init(cnode_t *cnode, struct mount *mp) +cp_entry_init(struct cnode *cp, struct mount *mp) { - struct cprotect *entry; - struct cp_xattr xattr; + struct cprotect *entry = NULL; int error = 0; - + struct hfsmount *hfsmp = VFSTOHFS(mp); + if (!cp_fs_protected (mp)) { - cnode->c_cpentry = NULL; + cp->c_cpentry = NULL; return 0; } - if (!S_ISREG(cnode->c_mode)) { - cnode->c_cpentry = NULL; + if (!S_ISREG(cp->c_mode) && !S_ISDIR(cp->c_mode)) { + cp->c_cpentry = NULL; return 0; } - + if (!g_cp_state.wrap_functions_set) { printf("hfs: cp_update_entry: wrap functions not yet set\n"); return ENXIO; } - CP_ASSERT (cnode->c_cpentry == NULL); - - entry = cp_entry_alloc(); - if (!entry) - return ENOMEM; - - entry->cp_flags |= CP_KEY_FLUSHED; - cnode->c_cpentry = entry; + if (hfsmp->hfs_running_cp_major_vers == 0) { + cp_root_major_vers(mp); + } - error = cp_getxattr(cnode, &xattr); + CP_ASSERT (cp->c_cpentry == NULL); + + error = cp_getxattr(cp, hfsmp, &entry); + + /* + * Normally, we should always have a CP EA for a file or directory that + * we are initializing here. However, there are some extenuating circumstances, + * such as the root directory immediately following a newfs_hfs. + * + * As a result, we leave code here to deal with an ENOATTR which will always + * default to a 'D' key, though we don't expect to use it much. + */ if (error == ENOATTR) { + int sub_error; + + sub_error = cp_entry_create_keys (&entry, NULL, hfsmp, PROTECTION_CLASS_D, cp->c_fileid, cp->c_mode); + + /* Now we have keys. Write them out. */ + if (sub_error == 0) { + sub_error = cp_setxattr (cp, entry, hfsmp, cp->c_fileid, XATTR_CREATE); + } + error = sub_error; + } + else if (error == 0) { + if (S_ISREG(cp->c_mode)) { + entry->cp_flags |= CP_KEY_FLUSHED; + } + } + /* + * For errors other than ENOATTR, we don't do anything. + * cp_entry_destroy can deal with a NULL argument if cp_getxattr + * failed malloc or there was a B-Tree error. + */ + + cp->c_cpentry = entry; + + if (error) { + cp_entry_destroy(&cp->c_cpentry); + } + + return error; +} + +/* + * Set up initial key/class pair on cnode. The cnode does not yet exist, + * so we must take a pointer to the cprotect struct. + * + * NOTE: + * We call this function in two places: + * 1) hfs_makenode *prior* to taking the journal/b-tree locks. + * A successful return value from this function is a pre-requisite for continuing on + * with file creation, as a wrap failure should immediately preclude the creation of + * the file. + * + * 2) cp_entry_init if we are trying to establish keys for a file/directory that did not + * have them already. (newfs_hfs may create entries in the namespace). + * + * At this point, we hold the directory cnode lock exclusive if it is available. + */ +int +cp_entry_create_keys(struct cprotect **entry_ptr, struct cnode *dcp, struct hfsmount *hfsmp, + uint32_t input_class, cnid_t fileid, mode_t cmode) +{ + int error = 0; + struct cprotect *entry = NULL; + size_t keylen; + + /* Default to class D */ + uint32_t target_class = PROTECTION_CLASS_D; + + /* Decide the target class. Input argument takes priority. */ + if (cp_is_valid_class (input_class)) { + target_class = input_class; /* - * Can't tell if the file is new, or was previously created but never - * written to or set-classed. In either case, it'll need a fresh - * per-file key. + * One exception, F is never valid for a directory + * because its children may inherit and userland will be + * unable to read/write to the files. 
*/ - entry->cp_flags |= CP_NEEDS_KEYS; - error = 0; - } else { - if (xattr.xattr_major_version != CP_CURRENT_MAJOR_VERS) { - printf("hfs: cp_entry_init: bad xattr version\n"); - error = EINVAL; - goto out; + if (S_ISDIR(cmode)) { + if (target_class == PROTECTION_CLASS_F) { + return EINVAL; + } + } + } + else { + /* If no valid class was supplied, then inherit from parent if possible */ + if ((dcp) && (dcp->c_cpentry)) { + uint32_t parentclass = dcp->c_cpentry->cp_pclass; + /* If the parent class is not valid, default back to D */ + if (cp_is_valid_class(parentclass)) { + /* Parent class was good. use it. */ + target_class = parentclass; + } + /* Otherwise, we already defaulted to 'D' */ } + } - /* set up entry with information from xattr */ - entry->cp_pclass = xattr.persistent_class; - bcopy(&xattr.persistent_key, &entry->cp_persistent_key, CP_WRAPPEDKEYSIZE); + keylen = S_ISDIR(cmode) ? 0 : CP_INITIAL_WRAPPEDKEYSIZE; + entry = cp_entry_alloc (keylen); + if (!entry) { + *entry_ptr = NULL; + return ENOMEM; } -out: + if (S_ISREG(cmode)) { + entry->cp_pclass = target_class; + entry->cp_flags |= CP_NEEDS_KEYS; + /* + * The 'fileid' argument to this function will either be + * a valid fileid for an existing file/dir, or it will be 0. + * If it is 0, then that is an indicator to the layer below + * that the file does not yet exist and we need to bypass the + * cp_wrap work to the keybag. + * + * If we are being invoked on behalf of a file/dir that does + * not yet have a key, then it will be a valid key and we + * need to behave like a setclass. + */ + error = cp_make_keys(&entry, hfsmp, fileid, entry->cp_pclass); + } + else if (S_ISDIR(cmode)) { + /* Directories just get their cp_pclass set */ + entry->cp_pclass = target_class; + } + else { + /* Unsupported for non-dir and non-file. */ + error = EINVAL; + } + + /* + * We only initialize and create the keys here; we cannot + * write out the EA until the journal lock and EA b-tree locks + * are acquired. + */ + if (error) { - cp_entry_destroy (cnode); + /* destroy the CP blob */ + cp_entry_destroy (&entry); + *entry_ptr = NULL; + } + else { + /* otherwise, emit the cprotect entry */ + *entry_ptr = entry; } + return error; } /* - * Set up initial key/class pair on cnode. The cnode is locked exclusive. + * Set up an initial key/class pair for a disassociated cprotect entry. + * This function is used to generate transient keys that will never be + * written to disk. We use class F for this since it provides the exact + * semantics that are needed here. Because we never attach this blob to + * a cnode directly, we take a pointer to the cprotect struct. + * + * This function is primarily used in the HFS FS truncation codepath + * where we may rely on AES symmetry to relocate encrypted data from + * one spot in the disk to another. 
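cp_entry_gentempkeys, whose definition begins just below, is the entry point that hands out these transient class-F keys. Its caller is not part of these hunks, so the following consumer is hypothetical, sketched only from the prototypes visible in this patch (both cp_entry_gentempkeys and the reworked cp_entry_destroy take struct cprotect ** precisely so a cnode-less caller can manage the blob):

    #include <sys/cprotect.h>    /* header path assumed */

    /* Hypothetical relocation helper; everything except the two cp_* calls is invented. */
    static int
    relocate_with_temp_keys(struct hfsmount *hfsmp)
    {
        struct cprotect *tmp_entry = NULL;
        int error;

        /* generate a transient class-F blob that is never written to disk */
        error = cp_entry_gentempkeys(&tmp_entry, hfsmp);
        if (error) {
            return error;
        }

        /* ... encrypt/decrypt the relocation I/O using tmp_entry's cached key ... */

        /* Class F keys live only in memory; destroying the blob ends their life. */
        cp_entry_destroy(&tmp_entry);
        return 0;
    }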
*/ -int -cp_entry_create_keys(cnode_t *cnode) -{ - struct cprotect *entry = cnode->c_cpentry; +int cp_entry_gentempkeys(struct cprotect **entry_ptr, struct hfsmount *hfsmp) { + int error = 0; + struct cprotect *entry = NULL; + size_t keylen; + + /* Default to class F */ + uint32_t target_class = PROTECTION_CLASS_F; + /* + * This should only be used for files, so we default to the + * initial wrapped key size + */ + keylen = CP_INITIAL_WRAPPEDKEYSIZE; + entry = cp_entry_alloc (keylen); if (!entry) { - //unprotected file: continue - return 0; + *entry_ptr = NULL; + return ENOMEM; } - CP_ASSERT((entry->cp_flags & CP_NEEDS_KEYS)); + error = cp_make_keys (&entry, hfsmp, 0, target_class); + + /* + * We only initialize the keys here; we don't write anything out + */ + + if (error) { + /* destroy the CP blob */ + cp_entry_destroy (&entry); + *entry_ptr = NULL; + } + else { + /* otherwise, emit the cprotect entry */ + *entry_ptr = entry; + } + + return error; - return cp_make_keys(entry); } /* @@ -182,18 +353,17 @@ cp_entry_create_keys(cnode_t *cnode) * Called at hfs_reclaim_cnode: cnode is locked exclusive. */ void -cp_entry_destroy(cnode_t *cnode) -{ - struct cprotect *entry = cnode->c_cpentry; +cp_entry_destroy(struct cprotect **entry_ptr) { + struct cprotect *entry = *entry_ptr; if (!entry) { /* nothing to clean up */ return; } - cnode->c_cpentry = NULL; - bzero(entry, sizeof(*entry)); - FREE(entry, M_TEMP); + *entry_ptr = NULL; + cp_entry_dealloc(entry); } + int cp_fs_protected (mount_t mnt) { return (vfs_flags(mnt) & MNT_CPROTECT); @@ -204,8 +374,8 @@ cp_fs_protected (mount_t mnt) { * Return a pointer to underlying cnode if there is one for this vnode. * Done without taking cnode lock, inspecting only vnode state. */ -cnode_t * -cp_get_protected_cnode(vnode_t vp) +struct cnode * +cp_get_protected_cnode(struct vnode *vp) { if (!cp_vnode_is_eligible(vp)) { return NULL; @@ -216,7 +386,7 @@ cp_get_protected_cnode(vnode_t vp) return NULL; } - return (cnode_t *) vp->v_data; + return (struct cnode*) vp->v_data; } @@ -225,206 +395,467 @@ cp_get_protected_cnode(vnode_t vp) * or returns error. */ int -cp_vnode_getclass(vnode_t vp, int *class) +cp_vnode_getclass(struct vnode *vp, int *class) { - struct cp_xattr xattr; + struct cprotect *entry; int error = 0; - struct cnode *cnode; - + struct cnode *cp; + int took_truncate_lock = 0; + struct hfsmount *hfsmp = NULL; + + /* Is this an interesting vp? */ if (!cp_vnode_is_eligible (vp)) { return EBADF; } - - cnode = VTOC(vp); - hfs_lock(cnode, HFS_SHARED_LOCK); + /* Is the mount point formatted for content protection? */ + if (!cp_fs_protected(VTOVFS(vp))) { + return EPERM; + } + + cp = VTOC(vp); + hfsmp = VTOHFS(vp); + + /* + * Take the truncate lock up-front in shared mode because we may need + * to manipulate the CP blob. Pend lock events until we're done here. + */ + hfs_lock_truncate (cp, HFS_SHARED_LOCK); + took_truncate_lock = 1; - if (cp_fs_protected(VTOVFS(vp))) { - /* pull the class from the live entry */ - struct cprotect *entry = cnode->c_cpentry; - if (!entry) { - panic("Content Protection: uninitialized cnode %p", cnode); - } + /* + * We take only the shared cnode lock up-front. If it turns out that + * we need to manipulate the CP blob to write a key out, drop the + * shared cnode lock and acquire an exclusive lock. 
+ */ + error = hfs_lock(cp, HFS_SHARED_LOCK); + if (error) { + hfs_unlock_truncate(cp, 0); + return error; + } + + /* pull the class from the live entry */ + entry = cp->c_cpentry; + + if (!entry) { + panic("Content Protection: uninitialized cnode %p", cp); + } + + /* + * Any vnode on a content protected filesystem must have keys + * created by the time the vnode is vended out. If we generate + * a vnode that does not have keys, something bad happened. + */ + if ((entry->cp_flags & CP_NEEDS_KEYS)) { + panic ("cp_vnode_getclass: cp %p has no keys!", cp); + } - if ((entry->cp_flags & CP_NEEDS_KEYS)) { - error = cp_make_keys(entry); - } + if (error == 0) { *class = entry->cp_pclass; - - } else { - /* - * Mount point is not formatted for content protection. If a class - * has been specified anyway, report it. Otherwise, report D. - */ - error = cp_getxattr(cnode, &xattr); - if (error == ENOATTR) { - *class = PROTECTION_CLASS_D; - error = 0; - } else if (error == 0) { - *class = xattr.persistent_class; - } } - hfs_unlock(cnode); + if (took_truncate_lock) { + hfs_unlock_truncate(cp, 0); + } + + hfs_unlock(cp); return error; } /* - * Sets persistent class for this file. + * Sets persistent class for this file or directory. * If vnode cannot be protected (system file, non-regular file, non-hfs), EBADF. * If the new class can't be accessed now, EPERM. * Otherwise, record class and re-wrap key if the mount point is content-protected. */ int -cp_vnode_setclass(vnode_t vp, uint32_t newclass) +cp_vnode_setclass(struct vnode *vp, uint32_t newclass) { - struct cnode *cnode; - struct cp_xattr xattr; + struct cnode *cp; struct cprotect *entry = 0; int error = 0; + int took_truncate_lock = 0; + u_int32_t keylen = 0; + struct hfsmount *hfsmp = NULL; if (!cp_is_valid_class(newclass)) { printf("hfs: CP: cp_setclass called with invalid class %d\n", newclass); return EINVAL; } - /* is this an interesting file? */ + if (vnode_isdir(vp)) { + if (newclass == PROTECTION_CLASS_F) { + /* + * Directories are not allowed to set to class F, since the + * children may inherit it and then userland will not be able + * to read/write to the file. + */ + return EINVAL; + } + } + + /* Is this an interesting vp? */ if (!cp_vnode_is_eligible(vp)) { return EBADF; } - cnode = VTOC(vp); + /* Is the mount point formatted for content protection? */ + if (!cp_fs_protected(VTOVFS(vp))) { + return EPERM; + } + + cp = VTOC(vp); + hfsmp = VTOHFS(vp); - if (hfs_lock(cnode, HFS_EXCLUSIVE_LOCK)) { + /* + * Take the cnode truncate lock exclusive because we want to manipulate the + * CP blob. The lock-event handling code is doing the same. This also forces + * all pending IOs to drain before we can re-write the persistent and cache keys. + */ + hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK); + took_truncate_lock = 1; + + if (hfs_lock(cp, HFS_EXCLUSIVE_LOCK)) { return EINVAL; } - /* is the volume formatted for content protection? */ - if (cp_fs_protected(VTOVFS(vp))) { - entry = cnode->c_cpentry; - if (entry == NULL) { - error = EINVAL; - goto out; - } + entry = cp->c_cpentry; + if (entry == NULL) { + error = EINVAL; + goto out; + } - if ((entry->cp_flags & CP_NEEDS_KEYS)) { - if ((error = cp_make_keys(entry)) != 0) { - goto out; - } - } + if ((entry->cp_flags & CP_NEEDS_KEYS)) { + /* + * We should have created this vnode and its keys atomically during + * file/directory creation. If we get here and it doesn't have keys yet, + * something bad happened. 
+ */ + panic ("cp_vnode_setclass: cp %p has no keys!\n", cp); + } - if (entry->cp_flags & CP_KEY_FLUSHED) { - error = cp_restore_keys(entry); - if (error) - goto out; - } + if (entry->cp_flags & CP_KEY_FLUSHED) { + error = cp_restore_keys(entry, hfsmp); + if (error) + goto out; + } - /* re-wrap per-file key with new class */ - error = cp_wrap(newclass, - &entry->cp_cache_key[0], - &entry->cp_persistent_key[0]); + /* re-wrap per-file key with new class */ + if (vnode_isreg(vp)) { + error = cp_wrap(newclass, hfsmp, cp->c_fileid, &cp->c_cpentry); if (error) { /* we didn't have perms to set this class. leave file as-is and error out */ goto out; } + } - entry->cp_pclass = newclass; + /* cp_wrap() potentially updates c_cpentry because we passed in its ptr */ + entry = cp->c_cpentry; + + entry->cp_pclass = newclass; - /* prepare to write the xattr out */ - bcopy(&entry->cp_persistent_key, &xattr.persistent_key, CP_WRAPPEDKEYSIZE); - } else { - /* no live keys for this file. just remember intended class */ - bzero(&xattr.persistent_key, CP_WRAPPEDKEYSIZE); + /* prepare to write the xattr out */ + keylen = entry->cp_persistent_key_len; + + error = cp_setxattr(cp, entry, VTOHFS(vp), 0,XATTR_REPLACE); + if (error == ENOATTR) + error = cp_setxattr(cp, entry, VTOHFS(vp), 0, XATTR_CREATE); + +out: + + if (took_truncate_lock) { + hfs_unlock_truncate (cp, 0); + } + hfs_unlock(cp); + return error; +} + + +int cp_vnode_transcode(vnode_t vp) +{ + struct cnode *cp; + struct cprotect *entry = 0; + int error = 0; + int took_truncate_lock = 0; + struct hfsmount *hfsmp = NULL; + + /* Is this an interesting vp? */ + if (!cp_vnode_is_eligible(vp)) { + return EBADF; + } + + /* Is the mount point formatted for content protection? */ + if (!cp_fs_protected(VTOVFS(vp))) { + return EPERM; } - xattr.xattr_major_version = CP_CURRENT_MAJOR_VERS; - xattr.xattr_minor_version = CP_CURRENT_MINOR_VERS; - xattr.key_size = CP_WRAPPEDKEYSIZE; - xattr.flags = 0; - xattr.persistent_class = newclass; - error = cp_setxattr(cnode, &xattr, XATTR_REPLACE); + cp = VTOC(vp); + hfsmp = VTOHFS(vp); + + /* + * Take the cnode truncate lock exclusive because we want to manipulate the + * CP blob. The lock-event handling code is doing the same. This also forces + * all pending IOs to drain before we can re-write the persistent and cache keys. + */ + hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK); + took_truncate_lock = 1; - if (error == ENOATTR) { - error = cp_setxattr (cnode, &xattr, XATTR_CREATE); + if (hfs_lock(cp, HFS_EXCLUSIVE_LOCK)) { + return EINVAL; + } + + entry = cp->c_cpentry; + if (entry == NULL) { + error = EINVAL; + goto out; + } + + if ((entry->cp_flags & CP_NEEDS_KEYS)) { + /* + * If we are transcoding keys for AKB, then we should have already established + * a set of keys for this vnode. IF we don't have keys yet, then something bad + * happened. + */ + panic ("cp_vnode_transcode: cp %p has no keys!", cp); + } + + if (entry->cp_flags & CP_KEY_FLUSHED) { + error = cp_restore_keys(entry, hfsmp); + + if (error) { + goto out; + } + } + + /* Send the per-file key for re-wrap with the current class information + * Send NULLs in the output parameters of the wrapper() and AKS will do the rest. + * Don't need to process any outputs, so just clear the locks and pass along the error. */ + if (vnode_isreg(vp)) { + + /* Picked up the following from cp_wrap(). + * If needed, more comments available there. 
*/ + + if (entry->cp_pclass == PROTECTION_CLASS_F) { + error = EINVAL; + goto out; + } + + error = g_cp_wrap_func.wrapper(entry->cp_pclass, + cp->c_fileid, + entry->cp_cache_key, + entry->cp_cache_key_len, + NULL, + NULL); + + if(error) + error = EPERM; } out: - hfs_unlock(cnode); + if (took_truncate_lock) { + hfs_unlock_truncate (cp, 0); + } + hfs_unlock(cp); return error; } + /* - * Check permission for the given operation (read, write, page in) on this node. + * Check permission for the given operation (read, write) on this node. * Additionally, if the node needs work, do it: * - create a new key for the file if one hasn't been set before * - write out the xattr if it hasn't already been saved * - unwrap the key if needed * * Takes cnode lock, and upgrades to exclusive if modifying cprotect. + * + * Note that this function does *NOT* take the cnode truncate lock. This is because + * the thread calling us may already have the truncate lock. It is not necessary + * because either we successfully finish this function before the keys are tossed + * and the IO will fail, or the keys are tossed and then this function will fail. + * Either way, the cnode lock still ultimately guards the keys. We only rely on the + * truncate lock to protect us against tossing the keys as a cluster call is in-flight. */ - int -cp_handle_vnop(cnode_t *cnode, int vnop) +int +cp_handle_vnop(struct vnode *vp, int vnop, int ioflag) { struct cprotect *entry; int error = 0; - struct cp_xattr xattr; + struct hfsmount *hfsmp = NULL; + struct cnode *cp = NULL; - if ((error = hfs_lock(cnode, HFS_SHARED_LOCK)) != KERN_SUCCESS) { + /* + * First, do validation against the vnode before proceeding any further: + * Is this vnode originating from a valid content-protected filesystem ? + */ + if (cp_vnode_is_eligible(vp) == 0) { + /* + * It is either not HFS or not a file/dir. Just return success. This is a valid + * case if servicing i/o against another filesystem type from VFS + */ + return 0; + } + + if (cp_fs_protected (VTOVFS(vp)) == 0) { + /* + * The underlying filesystem does not support content protection. This is also + * a valid case. Simply return success. + */ + return 0; + } + + /* + * At this point, we know we have a HFS vnode that backs a file or directory on a + * filesystem that supports content protection + */ + cp = VTOC(vp); + + if ((error = hfs_lock(cp, HFS_SHARED_LOCK))) { return error; } - entry = cnode->c_cpentry; - if (!entry) - goto out; + entry = cp->c_cpentry; + + if (!entry) { + /* + * If this cnode is not content protected, simply return success. + * Note that this function is called by all I/O-based call sites + * when CONFIG_PROTECT is enabled during XNU building. 
+ */ - if ((error = cp_check_access(cnode, vnop)) != KERN_SUCCESS) { goto out; } + vp = CTOV(cp, 0); + if (vp == NULL) { + /* is it a rsrc */ + vp = CTOV(cp,1); + if (vp == NULL) { + error = EINVAL; + goto out; + } + } + hfsmp = VTOHFS(vp); + + if ((error = cp_check_access(cp, vnop))) { + /* check for raw encrypted access before bailing out */ + if ((vnop == CP_READ_ACCESS) && (ioflag & IO_ENCRYPTED)) { + /* + * read access only + asking for the raw encrypted bytes + * is legitimate, so reset the error value to 0 + */ + error = 0; + } + else { + goto out; + } + } + if (entry->cp_flags == 0) { /* no more work to do */ goto out; } /* upgrade to exclusive lock */ - if (lck_rw_lock_shared_to_exclusive(&cnode->c_rwlock) == FALSE) { - if ((error = hfs_lock(cnode, HFS_EXCLUSIVE_LOCK)) != KERN_SUCCESS) { + if (lck_rw_lock_shared_to_exclusive(&cp->c_rwlock) == FALSE) { + if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { return error; } } else { - cnode->c_lockowner = current_thread(); + cp->c_lockowner = current_thread(); } - + /* generate new keys if none have ever been saved */ if ((entry->cp_flags & CP_NEEDS_KEYS)) { - if ((error = cp_make_keys(entry)) != 0) { - goto out; + /* + * By the time we're trying to initiate I/O against a content + * protected vnode, we should have already created keys for this + * file/dir. If we don't have keys, something bad happened. + */ + panic ("cp_handle_vnop: cp %p has no keys!", cp); + } + + /* unwrap keys if needed */ + if (entry->cp_flags & CP_KEY_FLUSHED) { + if ((vnop == CP_READ_ACCESS) && (ioflag & IO_ENCRYPTED)) { + /* no need to try to restore keys; they are not going to be used */ + error = 0; + } + else { + error = cp_restore_keys(entry, hfsmp); + + if (error) { + goto out; + } } } - /* unwrap keys if needed */ - if (entry->cp_flags & CP_KEY_FLUSHED) { - error = cp_restore_keys(entry); - if (error) - goto out; + /* write out the xattr if it's new */ + if (entry->cp_flags & CP_NO_XATTR) + error = cp_setxattr(cp, entry, VTOHFS(cp->c_vp), 0, XATTR_CREATE); + +out: + + hfs_unlock(cp); + return error; +} + + +int +cp_handle_open(struct vnode *vp, int mode) +{ + struct cnode *cp = NULL ; + struct cprotect *entry = NULL; + int error = 0; + + /* If vnode not eligible, just return success */ + if (!cp_vnode_is_eligible(vp)) { + return 0; + } + + /* If mount point not properly set up, then also return success */ + if (!cp_fs_protected(VTOVFS(vp))) { + return 0; + } + + /* We know the vnode is in a valid state. 
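[Editor's note: the shared-to-exclusive upgrade above has a subtle contract: when lck_rw_lock_shared_to_exclusive() fails it has already dropped the shared hold, so the fallback path re-locks exclusive and must assume anything it read under the shared lock may have changed. POSIX rwlocks cannot upgrade in place at all, which makes them a convenient user-space model of exactly that fallback:]

#include <pthread.h>

static pthread_rwlock_t rw = PTHREAD_RWLOCK_INITIALIZER;
static int generation;           /* bumped by every writer */

void upgrade_by_relock(void)
{
	pthread_rwlock_rdlock(&rw);
	int seen = generation;       /* state observed under the shared lock */
	pthread_rwlock_unlock(&rw);  /* "upgrade failed": shared hold is gone */

	pthread_rwlock_wrlock(&rw);
	if (generation != seen) {
		/* a writer ran in the window: revalidate entry/keys here */
	}
	generation++;                /* now safe to modify */
	pthread_rwlock_unlock(&rw);
}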
acquire cnode and validate */ + cp = VTOC(vp); + + if ((error = hfs_lock(cp, HFS_SHARED_LOCK))) { + return error; } - /* write out the xattr if it's new */ - if (entry->cp_flags & CP_NO_XATTR) { - bcopy(&entry->cp_persistent_key[0], &xattr.persistent_key, CP_WRAPPEDKEYSIZE); - xattr.xattr_major_version = CP_CURRENT_MAJOR_VERS; - xattr.xattr_minor_version = CP_CURRENT_MINOR_VERS; - xattr.key_size = CP_WRAPPEDKEYSIZE; - xattr.persistent_class = entry->cp_pclass; - error = cp_setxattr(cnode, &xattr, XATTR_CREATE); + entry = cp->c_cpentry; + if (!entry) + goto out; + + if (!S_ISREG(cp->c_mode)) + goto out; + + switch (entry->cp_pclass) { + case PROTECTION_CLASS_B: + /* Class B always allows creation */ + if (mode & O_CREAT) + goto out; + case PROTECTION_CLASS_A: + error = g_cp_wrap_func.unwrapper(entry->cp_pclass, + entry->cp_persistent_key, + entry->cp_persistent_key_len, + NULL, NULL); + if (error) + error = EPERM; + break; + default: + break; } out: - hfs_unlock(cnode); + hfs_unlock(cp); return error; } + /* * During hfs resize operations, we have slightly different constraints than during * normal VNOPS that read/write data to files. Specifically, we already have the cnode @@ -433,7 +864,8 @@ out: * vs. lock), and don't worry about non-existing keys. If the file exists on-disk with valid * payload, then it must have keys set up already by definition. */ -int cp_handle_relocate (cnode_t *cp) { +int +cp_handle_relocate (struct cnode *cp, struct hfsmount *hfsmp) { struct cprotect *entry; int error = -1; @@ -446,7 +878,7 @@ int cp_handle_relocate (cnode_t *cp) { * Still need to validate whether to permit access to the file or not * based on lock status */ - if ((error = cp_check_access(cp, CP_READ_ACCESS | CP_WRITE_ACCESS)) != KERN_SUCCESS) { + if ((error = cp_check_access(cp, CP_READ_ACCESS | CP_WRITE_ACCESS))) { goto out; } @@ -460,18 +892,19 @@ int cp_handle_relocate (cnode_t *cp) { /* unwrap keys if needed */ if (entry->cp_flags & CP_KEY_FLUSHED) { - error = cp_restore_keys(entry); + error = cp_restore_keys(entry, hfsmp); } - /* don't need to write out the EA since the file is extant */ + /* + * Don't need to write out the EA since if the file has actual extents, + * it must have an EA + */ out: /* return the cp still locked */ return error; } - - /* * cp_getrootxattr: * Gets the EA we set on the root folder (fileid 1) to get information about the @@ -479,8 +912,8 @@ out: * Note that all multi-byte fields are written to disk little endian so they must be * converted to native endian-ness as needed. 
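[Editor's note: the root xattr routines around here store all multi-byte fields little-endian on disk and convert on read with the OSSwapLittleToHost*() family. A portable model of that conversion, reading byte-by-byte to avoid alignment and host-endianness assumptions; the struct is a hypothetical host-order mirror, not the on-disk layout:]

#include <stdint.h>

static uint16_t le16_to_host(const uint8_t *p)
{
	return (uint16_t)(p[0] | (p[1] << 8));
}
static uint64_t le64_to_host(const uint8_t *p)
{
	uint64_t v = 0;
	for (int i = 7; i >= 0; i--)
		v = (v << 8) | p[i];
	return v;
}

struct root_xattr_host {        /* host-order mirror (illustrative) */
	uint16_t major_version;
	uint16_t minor_version;
	uint64_t flags;
};

void decode_root_xattr(const uint8_t buf[12], struct root_xattr_host *out)
{
	out->major_version = le16_to_host(buf + 0);
	out->minor_version = le16_to_host(buf + 2);
	out->flags         = le64_to_host(buf + 4);
}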
*/ - -int cp_getrootxattr(struct hfsmount* hfsmp, struct cp_root_xattr *outxattr) { +int +cp_getrootxattr(struct hfsmount* hfsmp, struct cp_root_xattr *outxattr) { uio_t auio; char uio_buf[UIO_SIZEOF(1)]; size_t attrsize = sizeof(struct cp_root_xattr); @@ -488,7 +921,7 @@ int cp_getrootxattr(struct hfsmount* hfsmp, struct cp_root_xattr *outxattr) { struct vnop_getxattr_args args; if (!outxattr) { - panic("cp_xattr called with xattr == NULL"); + panic("Content Protection: cp_xattr called with xattr == NULL"); } auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); @@ -509,7 +942,7 @@ int cp_getrootxattr(struct hfsmount* hfsmp, struct cp_root_xattr *outxattr) { outxattr->minor_version = OSSwapLittleToHostInt16(outxattr->minor_version); outxattr->flags = OSSwapLittleToHostInt64(outxattr->flags); - if (error != KERN_SUCCESS) { + if (error != 0) { goto out; } @@ -528,7 +961,7 @@ out: * This will be written to the disk when it detects the EA is not there, or when we need * to make a modification to the on-disk version that can be done in-place. */ - int +int cp_setrootxattr(struct hfsmount *hfsmp, struct cp_root_xattr *newxattr) { int error = 0; @@ -552,18 +985,193 @@ cp_setrootxattr(struct hfsmount *hfsmp, struct cp_root_xattr *newxattr) } +/* + * Stores new xattr data on the cnode. + * cnode lock held exclusive (if available). + * + * This function is also invoked during file creation. + */ +int cp_setxattr(struct cnode *cp, struct cprotect *entry, struct hfsmount *hfsmp, uint32_t fileid, int options) +{ + int error = 0; + size_t attrsize; + struct vnop_setxattr_args args; + uint32_t target_fileid; + struct cnode *arg_cp = NULL; + uint32_t tempflags = 0; + + args.a_desc = NULL; + if (cp) { + args.a_vp = cp->c_vp; + target_fileid = 0; + arg_cp = cp; + } + else { + /* + * When we set the EA in the same txn as the file creation, + * we do not have a vnode/cnode yet. Use the specified fileid. + */ + args.a_vp = NULL; + target_fileid = fileid; + } + args.a_name = CONTENT_PROTECTION_XATTR_NAME; + args.a_uio = NULL; //pass data ptr instead + args.a_options = options; + args.a_context = vfs_context_current(); + + /* Add asserts for the CP flags in the CP blob. */ + if (entry->cp_flags & CP_NEEDS_KEYS) { + panic ("cp_setxattr: cp %p , cpentry %p still needs keys!", cp, entry); + } + + /* Disable flags that will be invalid as we're writing the EA out at this point. */ + tempflags = entry->cp_flags; + tempflags &= ~CP_NO_XATTR; + + switch(hfsmp->hfs_running_cp_major_vers) { + case CP_NEW_MAJOR_VERS: { + struct cp_xattr_v4 *newxattr = NULL; // 70+ bytes; don't alloc on stack. + MALLOC (newxattr, struct cp_xattr_v4*, sizeof(struct cp_xattr_v4), M_TEMP, M_WAITOK); + if (newxattr == NULL) { + error = ENOMEM; + break; + } + bzero (newxattr, sizeof(struct cp_xattr_v4)); + + attrsize = sizeof(*newxattr) - CP_MAX_WRAPPEDKEYSIZE + entry->cp_persistent_key_len; + + /* Endian swap the multi-byte fields into L.E from host. 
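[Editor's note: the v4 attrsize computation above is the key trick in cp_setxattr -- the in-memory struct reserves space for the largest possible wrapped key, but only header + actual key bytes are written out, so the EA stays as small as the key allows. A sketch of that sizing, with a stand-in constant for CP_MAX_WRAPPEDKEYSIZE:]

#include <stdint.h>
#include <stddef.h>

#define MAX_WRAPPEDKEYSIZE 128   /* stand-in for CP_MAX_WRAPPEDKEYSIZE */

struct xattr_v4_model {
	uint16_t major, minor;
	uint32_t key_size;
	uint32_t flags;
	uint32_t persistent_class;
	uint8_t  persistent_key[MAX_WRAPPEDKEYSIZE];  /* reserved capacity */
};

/* Bytes actually written to disk for a key of keylen bytes:
 * full struct, minus the reserved key space, plus the bytes in use. */
size_t xattr_write_size(uint32_t keylen)
{
	return sizeof(struct xattr_v4_model) - MAX_WRAPPEDKEYSIZE + keylen;
}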
*/ + newxattr->xattr_major_version = OSSwapHostToLittleInt16 (hfsmp->hfs_running_cp_major_vers); + newxattr->xattr_minor_version = OSSwapHostToLittleInt16(CP_MINOR_VERS); + newxattr->key_size = OSSwapHostToLittleInt32(entry->cp_persistent_key_len); + newxattr->flags = OSSwapHostToLittleInt32(tempflags); + newxattr->persistent_class = OSSwapHostToLittleInt32(entry->cp_pclass); + bcopy(entry->cp_persistent_key, newxattr->persistent_key, entry->cp_persistent_key_len); + + error = hfs_setxattr_internal(arg_cp, (caddr_t)newxattr, attrsize, &args, hfsmp, target_fileid); + + FREE(newxattr, M_TEMP); + break; + } + case CP_PREV_MAJOR_VERS: { + struct cp_xattr_v2 *newxattr = NULL; + MALLOC (newxattr, struct cp_xattr_v2*, sizeof(struct cp_xattr_v2), M_TEMP, M_WAITOK); + if (newxattr == NULL) { + error = ENOMEM; + break; + } + bzero (newxattr, sizeof(struct cp_xattr_v2)); + + attrsize = sizeof(*newxattr); + + /* Endian swap the multi-byte fields into L.E from host. */ + newxattr->xattr_major_version = OSSwapHostToLittleInt16(hfsmp->hfs_running_cp_major_vers); + newxattr->xattr_minor_version = OSSwapHostToLittleInt16(CP_MINOR_VERS); + newxattr->key_size = OSSwapHostToLittleInt32(entry->cp_persistent_key_len); + newxattr->flags = OSSwapHostToLittleInt32(tempflags); + newxattr->persistent_class = OSSwapHostToLittleInt32(entry->cp_pclass); + bcopy(entry->cp_persistent_key, newxattr->persistent_key, entry->cp_persistent_key_len); + + error = hfs_setxattr_internal(arg_cp, (caddr_t)newxattr, attrsize, &args, hfsmp, target_fileid); + + FREE (newxattr, M_TEMP); + break; + } + } + + if (error == 0 ) { + entry->cp_flags &= ~CP_NO_XATTR; + } + + return error; + + +} + +/* + * This function takes a cprotect struct with the cache keys and re-wraps them for + * MKB's sake so that it can update its own data structures. It is useful when + * there may not be a cnode in existence yet (for example, after creating + * a file). + */ +int +cp_update_mkb (struct cprotect *entry, uint32_t fileid) { + + int error = 0; + + /* We already validated this pclass earlier */ + if (entry->cp_pclass != PROTECTION_CLASS_F ) { + error = g_cp_wrap_func.wrapper (entry->cp_pclass, fileid, entry->cp_cache_key, + entry->cp_cache_key_len, NULL, NULL); + } + + if (error) { + error = EPERM; + } + + return error; +} + +/* + * Used by an fcntl to query the underlying FS for its content protection version # + */ + +int +cp_get_root_major_vers(vnode_t vp, uint32_t *level) { + int err = 0; + struct hfsmount *hfsmp = NULL; + struct mount *mp = NULL; + + mp = VTOVFS(vp); + + /* check if it supports content protection */ + if (cp_fs_protected(mp) == 0) { + return EINVAL; + } + + hfsmp = VFSTOHFS(mp); + /* figure out the level */ + + err = cp_root_major_vers(mp); + + if (err == 0) { + *level = hfsmp->hfs_running_cp_major_vers; + } + /* in error case, cp_root_major_vers will just return EINVAL. 
Use that */ + return err; +} /******************** * Private Functions *******************/ static int -cp_vnode_is_eligible(vnode_t vp) +cp_root_major_vers(mount_t mp) +{ + int err = 0; + struct cp_root_xattr xattr; + struct hfsmount *hfsmp = NULL; + + hfsmp = vfs_fsprivate(mp); + err = cp_getrootxattr (hfsmp, &xattr); + + if (err == 0) { + hfsmp->hfs_running_cp_major_vers = xattr.major_version; + } + else { + return EINVAL; + } + + return 0; +} + +static int +cp_vnode_is_eligible(struct vnode *vp) { return ((vp->v_op == hfs_vnodeop_p) && (!vnode_issystem(vp)) && - (vnode_isreg(vp))); + (vnode_isreg(vp) || vnode_isdir(vp))); } @@ -577,101 +1185,214 @@ cp_is_valid_class(int class) static struct cprotect * -cp_entry_alloc(void) +cp_entry_alloc(size_t keylen) { struct cprotect *cp_entry; + + if (keylen > CP_MAX_WRAPPEDKEYSIZE) + return (NULL); - MALLOC(cp_entry, struct cprotect *, sizeof(struct cprotect), + MALLOC(cp_entry, struct cprotect *, sizeof(struct cprotect) + keylen, M_TEMP, M_WAITOK); if (cp_entry == NULL) return (NULL); - - bzero(cp_entry, sizeof(*cp_entry)); + + bzero(cp_entry, sizeof(*cp_entry) + keylen); + cp_entry->cp_persistent_key_len = keylen; return (cp_entry); } +static void +cp_entry_dealloc(struct cprotect *entry) +{ + uint32_t keylen = entry->cp_persistent_key_len; + bzero(entry, (sizeof(*entry) + keylen)); + FREE(entry, M_TEMP); +} + /* - * Reads xattr data off the cnode and into provided xattr. + * Initializes a new cprotect entry with xattr data from the cnode. * cnode lock held shared */ static int -cp_getxattr(cnode_t *cnode, struct cp_xattr *outxattr) +cp_getxattr(struct cnode *cp, struct hfsmount *hfsmp, struct cprotect **outentry) { - uio_t auio; - char uio_buf[UIO_SIZEOF(1)]; - size_t attrsize = sizeof(struct cp_xattr); int error = 0; + uio_t auio; + size_t attrsize; + char uio_buf[UIO_SIZEOF(1)]; struct vnop_getxattr_args args; - + struct cprotect *entry = NULL; + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); - uio_addiov(auio, CAST_USER_ADDR_T(outxattr), attrsize); - args.a_desc = NULL; // unused - args.a_vp = cnode->c_vp; + args.a_vp = cp->c_vp; args.a_name = CONTENT_PROTECTION_XATTR_NAME; args.a_uio = auio; - args.a_size = &attrsize; args.a_options = XATTR_REPLACE; args.a_context = vfs_context_current(); // unused - error = hfs_getxattr_internal(cnode, &args, VTOHFS(cnode->c_vp), 0); - if (error != KERN_SUCCESS) { - goto out; - } - /* Endian swap the multi-byte fields into host endianness from L.E. */ - outxattr->xattr_major_version = OSSwapLittleToHostInt16(outxattr->xattr_major_version); - outxattr->xattr_minor_version = OSSwapLittleToHostInt16(outxattr->xattr_minor_version); - outxattr->key_size = OSSwapLittleToHostInt32(outxattr->key_size); - outxattr->flags = OSSwapLittleToHostInt32(outxattr->flags); - outxattr->persistent_class = OSSwapLittleToHostInt32(outxattr->persistent_class); + switch (hfsmp->hfs_running_cp_major_vers) { + case CP_NEW_MAJOR_VERS: { + struct cp_xattr_v4 *xattr = NULL; + MALLOC (xattr, struct cp_xattr_v4*, sizeof(struct cp_xattr_v4), M_TEMP, M_WAITOK); + if (xattr == NULL) { + error = ENOMEM; + break; + } + bzero(xattr, sizeof (struct cp_xattr_v4)); + attrsize = sizeof(*xattr); + + uio_addiov(auio, CAST_USER_ADDR_T(xattr), attrsize); + args.a_size = &attrsize; + + error = hfs_getxattr_internal(cp, &args, VTOHFS(cp->c_vp), 0); + if (error != 0) { + FREE (xattr, M_TEMP); + goto out; + } + + /* Endian swap the multi-byte fields into host endianness from L.E. 
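[Editor's note: cp_entry_alloc()/cp_entry_dealloc() below size the entry to the wrapped key at allocation time and zero the whole entry before freeing it, so key material never lingers in freed memory. A user-space model; the flexible array member is a modeling choice (the kernel allocates sizeof(struct cprotect) + keylen with MALLOC/FREE), and in user space explicit_bzero/memset_s would keep the scrub from being optimized away:]

#include <stdlib.h>
#include <string.h>
#include <stdint.h>

struct entry_model {
	uint32_t persistent_key_len;
	uint8_t  persistent_key[];   /* flexible tail, sized at alloc time */
};

struct entry_model *entry_alloc(size_t keylen, size_t max_keylen)
{
	if (keylen > max_keylen)     /* mirrors the CP_MAX_WRAPPEDKEYSIZE gate */
		return NULL;
	struct entry_model *e = calloc(1, sizeof(*e) + keylen);
	if (e)
		e->persistent_key_len = (uint32_t)keylen;
	return e;
}

void entry_dealloc(struct entry_model *e)
{
	/* scrub key bytes before returning the memory to the allocator */
	memset(e, 0, sizeof(*e) + e->persistent_key_len);
	free(e);
}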
*/ + xattr->xattr_major_version = OSSwapLittleToHostInt16(xattr->xattr_major_version); + xattr->xattr_minor_version = OSSwapLittleToHostInt16(xattr->xattr_minor_version); + xattr->key_size = OSSwapLittleToHostInt32(xattr->key_size); + xattr->flags = OSSwapLittleToHostInt32(xattr->flags); + xattr->persistent_class = OSSwapLittleToHostInt32(xattr->persistent_class); + + if (xattr->xattr_major_version != hfsmp->hfs_running_cp_major_vers ) { + printf("hfs: cp_getxattr: bad xattr version %d expecting %d\n", + xattr->xattr_major_version, hfsmp->hfs_running_cp_major_vers); + error = EINVAL; + FREE (xattr, M_TEMP); + + goto out; + } + /* + * Prevent a buffer overflow, and validate the key length obtained from the + * EA. If it's too big, then bail out, because the EA can't be trusted at this + * point. + */ + if (xattr->key_size > CP_MAX_WRAPPEDKEYSIZE) { + error = EINVAL; + FREE (xattr, M_TEMP); + + goto out; + } + + /* set up entry with information from xattr */ + entry = cp_entry_alloc(xattr->key_size); + if (!entry) { + FREE (xattr, M_TEMP); + + return ENOMEM; + } + + entry->cp_pclass = xattr->persistent_class; + if (xattr->xattr_major_version >= CP_NEW_MAJOR_VERS) { + entry->cp_flags |= CP_OFF_IV_ENABLED; + } + bcopy(xattr->persistent_key, entry->cp_persistent_key, xattr->key_size); + + FREE (xattr, M_TEMP); + + break; + } + case CP_PREV_MAJOR_VERS: { + struct cp_xattr_v2 *xattr = NULL; + MALLOC (xattr, struct cp_xattr_v2*, sizeof(struct cp_xattr_v2), M_TEMP, M_WAITOK); + if (xattr == NULL) { + error = ENOMEM; + break; + } + bzero (xattr, sizeof (struct cp_xattr_v2)); + attrsize = sizeof(*xattr); + + uio_addiov(auio, CAST_USER_ADDR_T(xattr), attrsize); + args.a_size = &attrsize; + + error = hfs_getxattr_internal(cp, &args, VTOHFS(cp->c_vp), 0); + if (error != 0) { + FREE (xattr, M_TEMP); + goto out; + } + + /* Endian swap the multi-byte fields into host endianness from L.E. */ + xattr->xattr_major_version = OSSwapLittleToHostInt16(xattr->xattr_major_version); + xattr->xattr_minor_version = OSSwapLittleToHostInt16(xattr->xattr_minor_version); + xattr->key_size = OSSwapLittleToHostInt32(xattr->key_size); + xattr->flags = OSSwapLittleToHostInt32(xattr->flags); + xattr->persistent_class = OSSwapLittleToHostInt32(xattr->persistent_class); + + if (xattr->xattr_major_version != hfsmp->hfs_running_cp_major_vers) { + printf("hfs: cp_getxattr: bad xattr version %d expecting %d\n", + xattr->xattr_major_version, hfsmp->hfs_running_cp_major_vers); + error = EINVAL; + FREE (xattr, M_TEMP); + goto out; + } + + /* + * Prevent a buffer overflow, and validate the key length obtained from the + * EA. If it's too big, then bail out, because the EA can't be trusted at this + * point. + */ + if (xattr->key_size > CP_V2_WRAPPEDKEYSIZE) { + error = EINVAL; + FREE (xattr, M_TEMP); + goto out; + } + /* set up entry with information from xattr */ + entry = cp_entry_alloc(xattr->key_size); + if (!entry) { + FREE (xattr, M_TEMP); + return ENOMEM; + } + + entry->cp_pclass = xattr->persistent_class; + bcopy(xattr->persistent_key, entry->cp_persistent_key, xattr->key_size); + FREE (xattr, M_TEMP); + break; + } + } out: uio_free(auio); + + *outentry = entry; return error; } -/* - * Stores new xattr data on the cnode. 
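[Editor's note: both version branches above treat key_size as untrusted because it comes off disk, rejecting it before the bcopy into the fixed-capacity entry -- the patch comments call this out explicitly as an overflow guard. The check reduces to this minimal form:]

#include <stdint.h>
#include <string.h>
#include <errno.h>

int copy_wrapped_key(uint8_t *dst, size_t dst_cap,
                     const uint8_t *src, uint32_t disk_key_size)
{
	if (disk_key_size > dst_cap)   /* EA can't be trusted; bail out */
		return EINVAL;
	memcpy(dst, src, disk_key_size);
	return 0;
}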
- * cnode lock held exclusive - */ + +/* Setup AES context */ static int -cp_setxattr(cnode_t *cnode, struct cp_xattr *newxattr, int options) +cp_setup_aes_ctx(struct cprotect *entry) { - int error = 0; - struct vnop_setxattr_args args; + SHA1_CTX sha1ctxt; + uint8_t cp_cache_iv_key[CP_IV_KEYSIZE]; /* Kiv */ - args.a_desc = NULL; - args.a_vp = cnode->c_vp; - args.a_name = CONTENT_PROTECTION_XATTR_NAME; - args.a_uio = NULL; //pass data ptr instead - args.a_options = options; - args.a_context = vfs_context_current(); - - /* Endian swap the multi-byte fields into L.E from host. */ - newxattr->xattr_major_version = OSSwapHostToLittleInt16(newxattr->xattr_major_version); - newxattr->xattr_minor_version = OSSwapHostToLittleInt16(newxattr->xattr_minor_version); - newxattr->key_size = OSSwapHostToLittleInt32(newxattr->key_size); - newxattr->flags = OSSwapHostToLittleInt32(newxattr->flags); - newxattr->persistent_class = OSSwapHostToLittleInt32(newxattr->persistent_class); - - error = hfs_setxattr_internal(cnode, (caddr_t)newxattr, - sizeof(struct cp_xattr), &args, VTOHFS(cnode->c_vp), 0); - - if ((error == KERN_SUCCESS) && (cnode->c_cpentry)) { - cnode->c_cpentry->cp_flags &= ~CP_NO_XATTR; - } + /* First init the cp_cache_iv_key[] */ + SHA1Init(&sha1ctxt); + SHA1Update(&sha1ctxt, &entry->cp_cache_key[0], CP_MAX_KEYSIZE); + SHA1Final(&cp_cache_iv_key[0], &sha1ctxt); + + aes_encrypt_key128(&cp_cache_iv_key[0], &entry->cp_cache_iv_ctx); - return error; + return 0; } /* * Make a new random per-file key and wrap it. + * Normally this will get default_pclass as PROTECTION_CLASS_D. + * + * But when the directory's class is set, we use that as the default. */ static int -cp_make_keys(struct cprotect *entry) +cp_make_keys(struct cprotect **entry_arg, struct hfsmount *hfsmp, cnid_t fileid, int default_pclass) { + struct cprotect *entry = *entry_arg; + int target_pclass = 0; int error = 0; if (g_cp_state.wrap_functions_set != 1) { @@ -680,94 +1401,122 @@ cp_make_keys(struct cprotect *entry) } /* create new cp data: key and class */ - read_random(&entry->cp_cache_key[0], CP_KEYSIZE); - entry->cp_pclass = PROTECTION_CLASS_D; + entry->cp_cache_key_len = CP_MAX_KEYSIZE; + read_random(&entry->cp_cache_key[0], entry->cp_cache_key_len); + + if (cp_is_valid_class(default_pclass) == 0) { + target_pclass = PROTECTION_CLASS_D; + } else { + target_pclass = default_pclass; + } + + /* + * Attempt to wrap the new key in the class key specified by target_pclass + * Note that because we may be inheriting a protection level specified + * by the containing directory, this can fail; we could be trying to + * wrap this cache key in the class 'A' key while the device is locked. + * As such, emit an error if we fail to wrap the key here, instead of + * panicking. 
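[Editor's note: cp_setup_aes_ctx() below derives Kiv rather than storing it: SHA-1 over the cache key, truncated to a 128-bit AES key (the call to aes_encrypt_key128 implies the 16-byte size), expanded into an encryption schedule for IV generation. A user-space sketch using OpenSSL's legacy SHA1()/AES_set_encrypt_key() as stand-ins for the kernel primitives; the size constants are assumptions:]

#include <stdint.h>
#include <openssl/sha.h>
#include <openssl/aes.h>

#define CACHE_KEYSIZE 32   /* stand-in for CP_MAX_KEYSIZE */
#define IV_KEYSIZE    16   /* stand-in for CP_IV_KEYSIZE: AES-128 */

void setup_iv_ctx(const uint8_t cache_key[CACHE_KEYSIZE], AES_KEY *iv_ctx)
{
	uint8_t digest[SHA_DIGEST_LENGTH];            /* 20 bytes */

	SHA1(cache_key, CACHE_KEYSIZE, digest);
	/* only the first IV_KEYSIZE bytes of the digest become the key */
	AES_set_encrypt_key(digest, 8 * IV_KEYSIZE, iv_ctx);
}

[Because Kiv is a pure function of the cache key, flushing the cache key (and the expanded context) is sufficient to destroy it.]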
+ */ + + error = cp_wrap(target_pclass, hfsmp, fileid, entry_arg); - /* wrap the new key in the class key */ - error = cp_wrap(PROTECTION_CLASS_D, - &entry->cp_cache_key[0], - &entry->cp_persistent_key[0]); - if (error) { - panic("could not wrap new key in class D\n"); + goto out; + } + /* cp_wrap() potentially updates c_cpentry */ + entry = *entry_arg; + + /* set the pclass to the target since the wrap was successful */ + entry->cp_pclass = target_pclass; + + /* No need to go here for older EAs */ + if (hfsmp->hfs_running_cp_major_vers == CP_NEW_MAJOR_VERS) { + cp_setup_aes_ctx(entry); + entry->cp_flags |= CP_OFF_IV_ENABLED; } /* ready for business */ entry->cp_flags &= ~CP_NEEDS_KEYS; entry->cp_flags |= CP_NO_XATTR; +out: return error; } /* * If permitted, restore entry's unwrapped key from the persistent key. - * If not, clear key and set CP_ENTRY_FLUSHED. + * If not, clear key and set CP_KEY_FLUSHED. * cnode lock held exclusive */ static int -cp_restore_keys(struct cprotect *entry) +cp_restore_keys(struct cprotect *entry, struct hfsmount *hfsmp) { int error = 0; - error = cp_unwrap(entry->cp_pclass, - &entry->cp_persistent_key[0], - &entry->cp_cache_key[0]); - + error = cp_unwrap(entry->cp_pclass, entry); if (error) { entry->cp_flags |= CP_KEY_FLUSHED; - bzero(entry->cp_cache_key, CP_KEYSIZE); + bzero(entry->cp_cache_key, entry->cp_cache_key_len); error = EPERM; } else { + /* No need to go here for older EAs */ + if (hfsmp->hfs_running_cp_major_vers == CP_NEW_MAJOR_VERS) { + cp_setup_aes_ctx(entry); + entry->cp_flags |= CP_OFF_IV_ENABLED; + } + + /* ready for business */ entry->cp_flags &= ~CP_KEY_FLUSHED; + } return error; } static int -cp_lock_vfs_callback(mount_t mp, void *arg) -{ - if (!cp_fs_protected(mp)) { - /* not interested in this mount point */ - return 0; - } - - return vnode_iterate(mp, 0, cp_lock_vnode_callback, arg); +cp_lock_vfs_callback(mount_t mp, void *arg) { + + /* + * When iterating the various mount points that may + * be present on a content-protected device, we need to skip + * those that do not have it enabled. + */ + if (!cp_fs_protected(mp)) { + return 0; + } + + return vnode_iterate(mp, 0, cp_lock_vnode_callback, arg); } /* * Deny access to protected files if keys have been locked. - * - * cnode lock is taken shared. */ - static int -cp_check_access(cnode_t *cnode, int vnop) +static int +cp_check_access(struct cnode *cp, int vnop __unused) { int error = 0; if (g_cp_state.lock_state == CP_UNLOCKED_STATE) { - return KERN_SUCCESS; + return 0; } - if (!cnode->c_cpentry) { + if (!cp->c_cpentry) { /* unprotected node */ - return KERN_SUCCESS; + return 0; + } + + if (!S_ISREG(cp->c_mode)) { + return 0; } - /* Deny all access for class A files, and read access for class B */ - switch (cnode->c_cpentry->cp_pclass) { + /* Deny all access for class A files */ + switch (cp->c_cpentry->cp_pclass) { case PROTECTION_CLASS_A: { error = EPERM; break; } - case PROTECTION_CLASS_B: { - if (vnop & CP_READ_ACCESS) - error = EPERM; - else - error = 0; - break; - } default: error = 0; break; @@ -776,21 +1525,20 @@ cp_check_access(cnode_t *cnode, int vnop) return error; } - - /* * Respond to a lock or unlock event. * On lock: clear out keys from memory, then flush file contents. * On unlock: nothing (function not called). 
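[Editor's note: after this patch, cp_check_access() only refuses class A regular files while the device is locked -- the old class-B read denial is removed (the lock-callback comment notes B is kept in memory for writing), and directories always pass. The policy reduces to a few lines; class constants here are illustrative:]

#include <errno.h>
#include <stdbool.h>

enum { CLASS_A = 1, CLASS_B, CLASS_C, CLASS_D, CLASS_F };

int check_access_model(bool device_locked, bool is_reg, int pclass)
{
	if (!device_locked || !is_reg)
		return 0;
	return (pclass == CLASS_A) ? EPERM : 0;
}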
*/ static int -cp_lock_vnode_callback(vnode_t vp, void *arg) +cp_lock_vnode_callback(struct vnode *vp, void *arg) { cnode_t *cp = NULL; struct cprotect *entry = NULL; int error = 0; int locked = 1; int action = 0; + int took_truncate_lock = 0; error = vnode_getwithref (vp); if (error) { @@ -798,6 +1546,18 @@ cp_lock_vnode_callback(vnode_t vp, void *arg) } cp = VTOC(vp); + + /* + * When cleaning cnodes due to a lock event, we must + * take the truncate lock AND the cnode lock. By taking + * the truncate lock here, we force (nearly) all pending IOs + * to drain before we can acquire the truncate lock. All HFS cluster + * io calls except for swapfile IO need to acquire the truncate lock + * prior to calling into the cluster layer. + */ + hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK); + took_truncate_lock = 1; + hfs_lock(cp, HFS_FORCE_LOCK); entry = cp->c_cpentry; @@ -810,12 +1570,26 @@ cp_lock_vnode_callback(vnode_t vp, void *arg) switch (action) { case CP_LOCKED_STATE: { vfs_context_t ctx; - if (entry->cp_pclass != PROTECTION_CLASS_A) { - /* no change at lock for other classes */ + if (entry->cp_pclass != PROTECTION_CLASS_A || + vnode_isdir(vp)) { + /* + * There is no change at lock for other classes than A. + * B is kept in memory for writing, and class F (for VM) does + * not have a wrapped key, so there is no work needed for + * wrapping/unwrapping. + * + * Note that 'class F' is relevant here because if + * hfs_vnop_strategy does not take the cnode lock + * to protect the cp blob across IO operations, we rely + * implicitly on the truncate lock to be held when doing IO. + * The only case where the truncate lock is not held is during + * swapfile IO because HFS just funnels the VNOP_PAGEOUT + * directly to cluster_pageout. + */ goto out; } - /* Before doing anything else, zero-fille sparse ranges as needed */ + /* Before doing anything else, zero-fill sparse ranges as needed */ ctx = vfs_context_current(); (void) hfs_filedone (vp, ctx); @@ -823,10 +1597,20 @@ cp_lock_vnode_callback(vnode_t vp, void *arg) hfs_unlock (cp); ubc_msync (vp, 0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC); hfs_lock (cp, HFS_FORCE_LOCK); - - /* flush keys */ + + /* flush keys: + * There was a concern here(9206856) about flushing keys before nand layer is done using them. + * But since we are using ubc_msync with UBC_SYNC, it blocks until all IO is completed. + * Once IOFS caches or is done with these keys, it calls the completion routine in IOSF. + * Which in turn calls buf_biodone() and eventually unblocks ubc_msync() + * Also verified that the cached data in IOFS is overwritten by other data, and there + * is no key leakage in that layer. + */ + entry->cp_flags |= CP_KEY_FLUSHED; - bzero(&entry->cp_cache_key, CP_KEYSIZE); + bzero(&entry->cp_cache_key, entry->cp_cache_key_len); + bzero(&entry->cp_cache_iv_ctx, sizeof(aes_encrypt_ctx)); + /* some write may have arrived in the mean time. 
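[Editor's note: the lock callback below enforces a strict order for class A files: zero-fill sparse ranges (hfs_filedone), drop the cnode lock, push and invalidate cached pages with a blocking ubc_msync, re-lock, and only then scrub the keys. The 9206856 comment explains why the scrub is safe: UBC_SYNC blocks until all I/O has completed. The tail of that sequence, modeled with a callback standing in for ubc_msync:]

#include <string.h>
#include <stdint.h>

struct keys {
	uint8_t  cache_key[32];
	uint8_t  iv_ctx[256];   /* stand-in for the expanded aes_encrypt_ctx */
	unsigned flushed:1;
};

void flush_keys_on_lock(struct keys *k, void (*sync_cb)(void))
{
	sync_cb();                  /* blocking push + invalidate of pages  */
	k->flushed = 1;             /* CP_KEY_FLUSHED: keys are now gone    */
	memset(k->cache_key, 0, sizeof(k->cache_key));  /* scrub raw key    */
	memset(k->iv_ctx, 0, sizeof(k->iv_ctx));        /* scrub derived ctx */
}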
dump those pages */ hfs_unlock(cp); locked = 0; @@ -839,56 +1623,120 @@ cp_lock_vnode_callback(vnode_t vp, void *arg) break; } default: - panic("unknown lock action %d\n", action); + panic("Content Protection: unknown lock action %d\n", action); } out: - if (locked) + if (locked) { hfs_unlock(cp); + } + + if (took_truncate_lock) { + hfs_unlock_truncate (cp, 0); + } + vnode_put (vp); return error; } static int -cp_wrap(int class, void *inkey, void *outkey) +cp_wrap(int class, struct hfsmount *hfsmp, cnid_t fileid, struct cprotect **entry_ptr) { - int error = 0; - size_t keyln = CP_WRAPPEDKEYSIZE; + struct cprotect *entry = *entry_ptr; + uint8_t newkey[CP_MAX_WRAPPEDKEYSIZE]; + size_t keylen = CP_MAX_WRAPPEDKEYSIZE; + int error = 0; + + /* + * PROTECTION_CLASS_F is in-use by VM swapfile; it represents a transient + * key that is only good as long as the file is open. There is no + * wrapped key, so there isn't anything to wrap. + */ if (class == PROTECTION_CLASS_F) { - bzero(outkey, CP_WRAPPEDKEYSIZE); + bzero(entry->cp_persistent_key, entry->cp_persistent_key_len); + entry->cp_persistent_key_len = 0; return 0; } - + + /* + * inode is passed here to find the backup bag wrapped blob + * from userspace. This lookup will occur shortly after creation + * and only if the file still exists. Beyond this lookup the + * inode is not used. Technically there is a race, we practically + * don't lose. + */ error = g_cp_wrap_func.wrapper(class, - inkey, - CP_KEYSIZE, - outkey, - &keyln); - + fileid, + entry->cp_cache_key, + entry->cp_cache_key_len, + newkey, + &keylen); + + if (!error) { + /* + * v2 EA's don't support the larger class B keys + */ + if ((keylen != CP_V2_WRAPPEDKEYSIZE) && + (hfsmp->hfs_running_cp_major_vers == CP_PREV_MAJOR_VERS)) { + return EINVAL; + } + + /* + * Reallocate the entry if the new persistent key changed length + */ + if (entry->cp_persistent_key_len != keylen) { + struct cprotect *oldentry = entry; + + entry = cp_entry_alloc(keylen); + if (entry == NULL) + return ENOMEM; + + bcopy(oldentry, entry, sizeof(struct cprotect)); + entry->cp_persistent_key_len = keylen; + + cp_entry_destroy (&oldentry); + + *entry_ptr = entry; + } + + bcopy(newkey, entry->cp_persistent_key, keylen); + } + else { + error = EPERM; + } + return error; } static int -cp_unwrap(int class, void *inkey, void *outkey) +cp_unwrap(int class, struct cprotect *entry) { int error = 0; - size_t keyln = CP_KEYSIZE; - + size_t keylen = CP_MAX_KEYSIZE; + + /* + * PROTECTION_CLASS_F is in-use by VM swapfile; it represents a transient + * key that is only good as long as the file is open. There is no + * wrapped key, so there isn't anything to unwrap. + */ if (class == PROTECTION_CLASS_F) { - /* we didn't save a wrapped key, so nothing to unwrap */ return EPERM; } - + error = g_cp_wrap_func.unwrapper(class, - inkey, - CP_WRAPPEDKEYSIZE, - outkey, - &keyln); + entry->cp_persistent_key, + entry->cp_persistent_key_len, + entry->cp_cache_key, + &keylen); + if (!error) { + entry->cp_cache_key_len = keylen; + } else { + error = EPERM; + } return error; - } diff --git a/bsd/hfs/hfs_format.h b/bsd/hfs/hfs_format.h index ae1039a3e..2cf6a0756 100644 --- a/bsd/hfs/hfs_format.h +++ b/bsd/hfs/hfs_format.h @@ -373,7 +373,6 @@ enum { kHFSHasDateAddedBit = 0x0007, /* File/Folder has the date-added stored in the finder info. 
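[Editor's note: cp_wrap() below reallocates the entry when the keystore hands back a wrapped key of a different length, which is why it takes struct cprotect ** and why every caller reloads its entry pointer afterwards. The reallocate-scrub-swap pattern, sketched with a simplified entry type:]

#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <errno.h>

struct entry { uint32_t keylen; uint8_t key[]; };

int store_rewrapped(struct entry **entry_ptr,
                    const uint8_t *newkey, size_t keylen)
{
	struct entry *e = *entry_ptr;

	if (e->keylen != keylen) {
		struct entry *resized = calloc(1, sizeof(*resized) + keylen);
		if (resized == NULL)
			return ENOMEM;
		resized->keylen = (uint32_t)keylen;
		memset(e, 0, sizeof(*e) + e->keylen);  /* scrub + retire old */
		free(e);
		*entry_ptr = e = resized;              /* caller sees new ptr */
	}
	memcpy(e->key, newkey, keylen);
	return 0;
}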
*/ kHFSHasDateAddedMask = 0x0080 - }; diff --git a/bsd/hfs/hfs_fsctl.h b/bsd/hfs/hfs_fsctl.h index 7bebee3fb..d19d8e7d4 100644 --- a/bsd/hfs/hfs_fsctl.h +++ b/bsd/hfs/hfs_fsctl.h @@ -125,6 +125,20 @@ struct hfs_journal_info { #define HFSIOC_DISABLE_METAZONE _IO('h', 25) #define HFS_DISABLE_METAZONE IOCBASECMD(HFSIOC_DISABLE_METAZONE) +/* Change the next CNID value */ +#define HFSIOC_CHANGE_NEXTCNID _IOWR('h', 26, u_int32_t) +#define HFS_CHANGE_NEXTCNID IOCBASECMD(HFSIOC_CHANGE_NEXTCNID) + +/* Get the low disk space values */ +#define HFSIOC_GET_VERY_LOW_DISK _IOR('h', 27, u_int32_t) +#define HFS_FSCTL_GET_VERY_LOW_DISK IOCBASECMD(HFSIOC_GET_VERY_LOW_DISK) + +#define HFSIOC_GET_LOW_DISK _IOR('h', 28, u_int32_t) +#define HFS_FSCTL_GET_LOW_DISK IOCBASECMD(HFSIOC_GET_LOW_DISK) + +#define HFSIOC_GET_DESIRED_DISK _IOR('h', 29, u_int32_t) +#define HFS_FSCTL_GET_DESIRED_DISK IOCBASECMD(HFSIOC_GET_DESIRED_DISK) + #endif /* __APPLE_API_UNSTABLE */ #endif /* ! _HFS_FSCTL_H_ */ diff --git a/bsd/hfs/hfs_hotfiles.c b/bsd/hfs/hfs_hotfiles.c index 66e273b5d..7ebb82bc6 100644 --- a/bsd/hfs/hfs_hotfiles.c +++ b/bsd/hfs/hfs_hotfiles.c @@ -810,7 +810,7 @@ hfs_addhotfile_internal(struct vnode *vp) (ffp->ff_size == 0) || (ffp->ff_blocks > hotdata->maxblocks) || (cp->c_flag & (C_DELETED | C_NOEXISTS)) || - (cp->c_flags & UF_NODUMP) || + (cp->c_bsdflags & UF_NODUMP) || (cp->c_atime < hfsmp->hfc_timebase)) { return (0); } diff --git a/bsd/hfs/hfs_kdebug.h b/bsd/hfs/hfs_kdebug.h index 5dd5d6a9c..d3202bca4 100644 --- a/bsd/hfs/hfs_kdebug.h +++ b/bsd/hfs/hfs_kdebug.h @@ -37,8 +37,8 @@ enum { HFSDBG_UNMAP_CALLBACK 0, extentCount, 0, 0 ... 0, 0, 0, 0 HFSDBG_UNMAP_FREE startBlock, blockCount, 0, 0 ... err, 0, 0, 0 HFSDBG_UNMAP_ALLOC startBlock, blockCount, 0, 0 ... err, 0, 0, 0 - HFSDBG_REMOVE_EXTENT_CACHE startBlock, blockCount, 0, 0 ... 0, 0, 0, 0 - HFSDBG_ADD_EXTENT_CACHE startBlock, blockCount, 0, 0 ... err, 0, 0, 0 + HFSDBG_REMOVE_EXTENT_CACHE startBlock, blockCount, vcbFreeExtCnt, 0 ... 0, 0, vcbFreeExtCnt, extentsRemoved + HFSDBG_ADD_EXTENT_CACHE startBlock, blockCount, vcbFreeExtCnt, 0 ... 0, 0, vcbFreeExtCnt, retval HFSDBG_MARK_ALLOC_BITMAP startBlock, blockCount, 0, 0 ... err, 0, 0, 0 HFSDBG_MARK_FREE_BITMAP startBlock, blockCount, valid, 0 ... err, 0, 0, 0 HFSDBG_BLOCK_DEALLOCATE startBlock, blockCount, flags, 0 ... err, 0, 0, 0 diff --git a/bsd/hfs/hfs_link.c b/bsd/hfs/hfs_link.c index d24a92011..b1b9359af 100644 --- a/bsd/hfs/hfs_link.c +++ b/bsd/hfs/hfs_link.c @@ -503,7 +503,7 @@ hfs_vnop_link(struct vnop_link_args *ap) error = EMLINK; goto out; } - if (cp->c_flags & (IMMUTABLE | APPEND)) { + if (cp->c_bsdflags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } diff --git a/bsd/hfs/hfs_lookup.c b/bsd/hfs/hfs_lookup.c index 13cb1aa48..2200fe1de 100644 --- a/bsd/hfs/hfs_lookup.c +++ b/bsd/hfs/hfs_lookup.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2008 Apple Inc. All rights reserved. + * Copyright (c) 1999-2012 Apple Inc. All rights reserved. 
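[Editor's note: the new HFSIOC_GET_* getters above pair with the existing setters so userland can read back the low-disk thresholds. A hypothetical user-space query via fsctl(2); this is a private, unstable interface, the request code is re-derived from the _IOR('h', 28, u_int32_t) definition above rather than taken from a public header, and the returned value is in allocation blocks:]

#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/fsctl.h>

#define HFSIOC_GET_LOW_DISK _IOR('h', 28, uint32_t)  /* mirrors the patch */

int main(int argc, char **argv)
{
	uint32_t warninglimit = 0;

	if (argc < 2)
		return 1;
	if (fsctl(argv[1], HFSIOC_GET_LOW_DISK, &warninglimit, 0) != 0) {
		perror("fsctl");
		return 1;
	}
	printf("low-disk warning limit: %u allocation blocks\n", warninglimit);
	return 0;
}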
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -80,6 +80,7 @@ #include #include #include +#include #include "hfs.h" #include "hfs_catalog.h" @@ -498,12 +499,14 @@ hfs_vnop_lookup(struct vnop_lookup_args *ap) */ if ((flags & ISLASTCN) && (cp->c_flag & C_HARDLINK)) { - hfs_lock(cp, HFS_FORCE_LOCK); + int stale_link = 0; + + hfs_lock(cp, HFS_FORCE_LOCK); if ((cp->c_parentcnid != dcp->c_cnid) || (bcmp(cnp->cn_nameptr, cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen) != 0)) { struct cat_desc desc; + struct cat_attr lookup_attr; int lockflags; - /* * Get an updated descriptor */ @@ -514,28 +517,84 @@ hfs_vnop_lookup(struct vnop_lookup_args *ap) desc.cd_encoding = 0; desc.cd_cnid = 0; desc.cd_flags = S_ISDIR(cp->c_mode) ? CD_ISDIR : 0; - + + /* + * Because lookups call replace_desc to put a new descriptor in + * the cnode we are modifying it is possible that this cnode's + * descriptor is out of date for the parent ID / name that + * we are trying to look up. (It may point to a different hardlink). + * + * We need to be cautious that when re-supplying the + * descriptor below that the results of the catalog lookup + * still point to the same raw inode for the hardlink. This would + * not be the case if we found something in the cache above but + * the vnode it returned no longer has a valid hardlink for the + * parent ID/filename combo we are requesting. (This is because + * hfs_unlink does not directly trigger namecache removal). + * + * As a result, before vending out the vnode (and replacing + * its descriptor) verify that the fileID is the same by comparing + * the in-cnode attributes vs. the one returned from the lookup call + * below. If they do not match, treat this lookup as if we never hit + * in the cache at all. + */ lockflags = hfs_systemfile_lock(VTOHFS(dvp), SFL_CATALOG, HFS_SHARED_LOCK); - if (cat_lookup(VTOHFS(vp), &desc, 0, &desc, NULL, NULL, NULL) == 0) - replace_desc(cp, &desc); + + error = cat_lookup(VTOHFS(vp), &desc, 0, &desc, &lookup_attr, NULL, NULL); + hfs_systemfile_unlock(VTOHFS(dvp), lockflags); /* - * Save the origin info for file and directory hardlinks. Directory hardlinks - * need the origin for '..' lookups, and file hardlinks need it to ensure that - * competing lookups do not cause us to vend different hardlinks than the ones requested. - * We want to restrict saving the cache entries to LOOKUP namei operations, since - * we're really doing this to protect getattr. + * Note that cat_lookup may fail to find something with the name provided in the + * stack-based descriptor above. In that case, an ENOENT is a legitimate errno + * to be placed in error, which will get returned in the fastpath below. */ - if (cnp->cn_nameiop == LOOKUP) { - hfs_savelinkorigin(cp, dcp->c_fileid); + if (error == 0) { + if (lookup_attr.ca_fileid == cp->c_attr.ca_fileid) { + /* It still points to the right raw inode. Replacing the descriptor is fine */ + replace_desc (cp, &desc); + + /* + * Save the origin info for file and directory hardlinks. Directory hardlinks + * need the origin for '..' lookups, and file hardlinks need it to ensure that + * competing lookups do not cause us to vend different hardlinks than the ones requested. + * We want to restrict saving the cache entries to LOOKUP namei operations, since + * we're really doing this to protect getattr. + */ + if (cnp->cn_nameiop == LOOKUP) { + hfs_savelinkorigin(cp, dcp->c_fileid); + } + } + else { + /* If the fileID does not match then do NOT replace the descriptor! 
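[Editor's note: the long comment above boils down to one comparison -- a name-cache hit on a hardlink is only trusted if a fresh catalog lookup for (parent, name) still resolves to the same raw inode; otherwise the hit is discarded and the slow path re-runs (the goto lookup in the next hunk). The guard itself:]

#include <stdint.h>
#include <stdbool.h>

bool cache_hit_is_current(uint32_t cached_fileid, uint32_t catalog_fileid)
{
	/* equal: safe to replace_desc() and vend the cached vnode;
	 * unequal: stale link -- drop the iocount and retry via the catalog */
	return cached_fileid == catalog_fileid;
}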
*/ + stale_link = 1; + } } } - hfs_unlock(cp); - } + hfs_unlock (cp); + + if (stale_link) { + /* + * If we had a stale_link, then we need to pretend as though + * we never found this vnode and force a lookup through the + * traditional path. Drop the iocount acquired through + * cache_lookup above and force a cat lookup / getnewvnode + */ + vnode_put(vp); + goto lookup; + } + + if (error) { + /* + * If the cat_lookup failed then the caller will not expect + * a vnode with an iocount on it. + */ + vnode_put(vp); + } - return (error); + } + goto exit; lookup: /* @@ -550,6 +609,24 @@ lookup: if (cnode_locked) hfs_unlock(VTOC(*vpp)); exit: + { + uthread_t ut = (struct uthread *)get_bsdthread_info(current_thread()); + + /* + * check to see if we issued any I/O while completing this lookup and + * this thread/task is throttleable... if so, throttle now + * + * this allows us to throttle in between multiple meta data reads that + * might result due to looking up a long pathname (since we'll have to + * re-enter hfs_vnop_lookup for each component of the pathnam not in + * the VFS cache), instead of waiting until the entire path lookup has + * completed and throttling at the systemcall return + */ + if (__improbable(ut->uu_lowpri_window)) { + throttle_lowpri_io(TRUE); + } + } + return (error); } diff --git a/bsd/hfs/hfs_notification.c b/bsd/hfs/hfs_notification.c index 227e744b2..2423db07a 100644 --- a/bsd/hfs/hfs_notification.c +++ b/bsd/hfs/hfs_notification.c @@ -58,7 +58,14 @@ void hfs_generate_volume_notifications(struct hfsmount *hfsmp) { fsid_t fsid; u_int32_t freeblks, state=999; - + char *volname = NULL; + + if (hfsmp->vcbVN) { + if (strlen((char*)hfsmp->vcbVN) < 256) { + volname = (char*) hfsmp->vcbVN; + } + } + fsid.val[0] = (long)hfsmp->hfs_raw_dev; fsid.val[1] = (long)vfs_typenum(HFSTOVFS(hfsmp)); @@ -74,14 +81,25 @@ void hfs_generate_volume_notifications(struct hfsmount *hfsmp) if (state == 2 && !(hfsmp->hfs_notification_conditions & VQ_VERYLOWDISK)) { /* Dump some logging to track down intermittent issues */ - printf("HFS: Very Low Disk: freeblks: %d, dangerlimit: %d\n", freeblks, hfsmp->hfs_freespace_notify_dangerlimit); + if (volname) { + printf("HFS: Vol: %s Very Low Disk: freeblks: %d, dangerlimit: %d\n", volname, freeblks, hfsmp->hfs_freespace_notify_dangerlimit); + } + else { + printf("HFS: Very Low Disk: freeblks: %d, dangerlimit: %d\n", freeblks, hfsmp->hfs_freespace_notify_dangerlimit); + } + #if HFS_SPARSE_DEV if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { if (hfsmp->hfs_backingfs_rootvp) { struct mount *mp = vnode_mount (hfsmp->hfs_backingfs_rootvp); /* If we're a sparse device, dump some info about the backing store... 
*/ - if (mp) { - printf("HFS: Very Low Disk: backingstore b_avail %lld, tag %d\n", mp->mnt_vfsstat.f_bavail, hfsmp->hfs_backingfs_rootvp->v_tag); + if (mp) { + if (volname) { + printf("HFS: Vol: %s Very Low Disk: backingstore b_avail %lld, tag %d\n", volname, mp->mnt_vfsstat.f_bavail, hfsmp->hfs_backingfs_rootvp->v_tag); + } + else { + printf("HFS: Very Low Disk: backingstore b_avail %lld, tag %d\n", mp->mnt_vfsstat.f_bavail, hfsmp->hfs_backingfs_rootvp->v_tag); + } } } } @@ -90,7 +108,12 @@ void hfs_generate_volume_notifications(struct hfsmount *hfsmp) vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); } else if (state == 1) { if (!(hfsmp->hfs_notification_conditions & VQ_LOWDISK)) { - printf("HFS: Low Disk: freeblks: %d, warninglimit: %d\n", freeblks, hfsmp->hfs_freespace_notify_warninglimit); + if (volname) { + printf("HFS: Low Disk: Vol: %s freeblks: %d, warninglimit: %d\n", volname, freeblks, hfsmp->hfs_freespace_notify_warninglimit); + } + else { + printf("HFS: Low Disk: freeblks: %d, warninglimit: %d\n", freeblks, hfsmp->hfs_freespace_notify_warninglimit); + } hfsmp->hfs_notification_conditions |= VQ_LOWDISK; vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); } else if (hfsmp->hfs_notification_conditions & VQ_VERYLOWDISK) { diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c index 63acbac05..f0b91b94a 100644 --- a/bsd/hfs/hfs_readwrite.c +++ b/bsd/hfs/hfs_readwrite.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,7 @@ #include #include #include +#include #include @@ -98,6 +100,16 @@ SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, int hfs_vnop_read(struct vnop_read_args *ap) { + /* + struct vnop_read_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct uio *a_uio; + int a_ioflag; + vfs_context_t a_context; + }; + */ + uio_t uio = ap->a_uio; struct vnode *vp = ap->a_vp; struct cnode *cp; @@ -109,6 +121,7 @@ hfs_vnop_read(struct vnop_read_args *ap) off_t offset = uio_offset(uio); int retval = 0; int took_truncate_lock = 0; + int io_throttle = 0; /* Preflight checks */ if (!vnode_isreg(vp)) { @@ -147,7 +160,7 @@ hfs_vnop_read(struct vnop_read_args *ap) } /* otherwise the file was converted back to a regular file while we were reading it */ retval = 0; - } else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) { + } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) { int error; error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); @@ -164,11 +177,24 @@ hfs_vnop_read(struct vnop_read_args *ap) hfsmp = VTOHFS(vp); #if CONFIG_PROTECT - if ((retval = cp_handle_vnop (cp, CP_READ_ACCESS)) != 0) { + if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) { goto exit; } #endif + /* + * If this read request originated from a syscall (as opposed to + * an in-kernel page fault or something), then set it up for + * throttle checks. For example, large EAs may cause a VNOP_READ + * to occur, and we wouldn't want to throttle I/O while holding the + * EA B-Tree lock. + */ + if (ap->a_ioflag & IO_SYSCALL_DISPATCH) { + io_throttle = IO_RETURN_ON_THROTTLE; + } + +read_again: + /* Protect against a size change. 
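[Editor's note: the read/write paths in this patch adopt a common throttle handshake: pass IO_RETURN_ON_THROTTLE down to the cluster layer so it returns EAGAIN instead of sleeping while locks are held, then drop the locks, sleep in throttle_lowpri_io(), and retry (the read_again/again labels). The loop shape, with stand-in callbacks for the I/O and the throttle sleep:]

#include <errno.h>

int throttled_io_loop(int (*do_io)(void), void (*throttle_wait)(void))
{
	int error;

	for (;;) {
		/* locks taken here */
		error = do_io();        /* invoked with IO_RETURN_ON_THROTTLE */
		/* locks dropped here */

		if (error != EAGAIN)
			return error;
		throttle_wait();        /* block while unlocked, then retry */
	}
}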
*/ hfs_lock_truncate(cp, HFS_SHARED_LOCK); took_truncate_lock = 1; @@ -186,7 +212,7 @@ hfs_vnop_read(struct vnop_read_args *ap) KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START, (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0); - retval = cluster_read(vp, uio, filesize, ap->a_ioflag); + retval = cluster_read(vp, uio, filesize, ap->a_ioflag | (io_throttle)); cp->c_touch_acctime = TRUE; @@ -227,7 +253,12 @@ exit: if (took_truncate_lock) { hfs_unlock_truncate(cp, 0); } + if (retval == EAGAIN) { + throttle_lowpri_io(1); + retval = 0; + goto read_again; + } return (retval); } @@ -259,6 +290,7 @@ hfs_vnop_write(struct vnop_write_args *ap) int do_snapshot = 1; time_t orig_ctime=VTOC(vp)->c_ctime; int took_truncate_lock = 0; + int io_return_on_throttle = 0; struct rl_entry *invalid_range; #if HFS_COMPRESSION @@ -277,7 +309,7 @@ hfs_vnop_write(struct vnop_write_args *ap) printf("invalid state %d for compressed file\n", state); /* fall through */ } - } else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) { + } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) { int error; error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP); @@ -308,7 +340,7 @@ hfs_vnop_write(struct vnop_write_args *ap) hfsmp = VTOHFS(vp); #if CONFIG_PROTECT - if ((retval = cp_handle_vnop (cp, CP_WRITE_ACCESS)) != 0) { + if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) { goto exit; } #endif @@ -327,6 +359,10 @@ hfs_vnop_write(struct vnop_write_args *ap) } #endif /* HFS_SPARSE_DEV */ + if ((ioflag & (IO_SINGLE_WRITER | IO_RETURN_ON_THROTTLE)) == + (IO_SINGLE_WRITER | IO_RETURN_ON_THROTTLE)) { + io_return_on_throttle = IO_RETURN_ON_THROTTLE; + } again: /* Protect against a size change. */ /* @@ -349,7 +385,7 @@ again: uio_setoffset(uio, fp->ff_size); offset = fp->ff_size; } - if ((cp->c_flags & APPEND) && offset != fp->ff_size) { + if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) { retval = EPERM; goto exit; } @@ -647,9 +683,39 @@ sizeok: ubc_setsize(vp, filesize); } retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off, - tail_off, lflag | IO_NOZERODIRTY); + tail_off, lflag | IO_NOZERODIRTY | io_return_on_throttle); if (retval) { fp->ff_new_size = 0; /* no longer extending; use ff_size */ + + if (retval == EAGAIN) { + /* + * EAGAIN indicates that we still have I/O to do, but + * that we now need to be throttled + */ + if (resid != uio_resid(uio)) { + /* + * did manage to do some I/O before returning EAGAIN + */ + resid = uio_resid(uio); + offset = uio_offset(uio); + + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + } + if (filesize > fp->ff_size) { + /* + * we called ubc_setsize before the call to + * cluster_write... 
since we only partially + * completed the I/O, we need to + * re-adjust our idea of the filesize based + * on our interim EOF + */ + ubc_setsize(vp, offset); + + fp->ff_size = offset; + } + goto exit; + } if (filesize > origFileSize) { ubc_setsize(vp, origFileSize); } @@ -732,6 +798,12 @@ exit: if (took_truncate_lock) { hfs_unlock_truncate(cp, 0); } + if (retval == EAGAIN) { + throttle_lowpri_io(1); + + retval = 0; + goto again; + } return (retval); } @@ -1000,8 +1072,11 @@ do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid, } else { int lockflags; + if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp))) + throttle_lowpri_io(1); + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - + /* lookup this cnid in the catalog */ error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp); @@ -1177,7 +1252,7 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp, boolean_t is64bit; /* - * NOTE: on entry, the vnode is locked. Incase this vnode + * NOTE: on entry, the vnode has an io_ref. In case this vnode * happens to be in our list of file_ids, we'll note it * avoid calling hfs_chashget_nowait() on that id as that * will cause a "locking against myself" panic. @@ -1422,7 +1497,7 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp, access[i] = 0; continue; } - + myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID, skip_cp, p, cred, context,bitmap, map_size, parents, num_parents); @@ -1532,7 +1607,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { #if CONFIG_PROTECT { int error = 0; - if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) { + if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) { return error; } } @@ -1811,6 +1886,50 @@ fail_change_next_allocation: } #endif /* HFS_SPARSE_DEV */ + /* Change the next CNID stored in the VH */ + case HFS_CHANGE_NEXTCNID: { + int error = 0; /* Assume success */ + u_int32_t fileid; + int wraparound = 0; + int lockflags = 0; + + if (vnode_vfsisrdonly(vp)) { + return (EROFS); + } + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* must be owner of file system */ + } + + fileid = *(u_int32_t *)ap->a_data; + + /* Must have catalog lock excl. to advance the CNID pointer */ + lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK); + + HFS_MOUNT_LOCK(hfsmp, TRUE); + + /* If it is less than the current next CNID, force the wraparound bit to be set */ + if (fileid < hfsmp->vcbNxtCNID) { + wraparound=1; + } + + /* Return previous value. */ + *(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID; + + hfsmp->vcbNxtCNID = fileid; + + if (wraparound) { + hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask; + } + + MarkVCBDirty(hfsmp); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + hfs_systemfile_unlock (hfsmp, lockflags); + + return (error); + } + case F_FREEZE_FS: { struct mount *mp; @@ -1942,6 +2061,73 @@ fail_change_next_allocation: return (EINVAL); } + case F_SETSTATICCONTENT: { + int error; + int enable_static = 0; + struct cnode *cp = NULL; + /* + * lock the cnode, decorate the cnode flag, and bail out. + * VFS should have already authenticated the caller for us. + */ + + if (ap->a_data) { + /* + * Note that even though ap->a_data is of type caddr_t, + * the fcntl layer at the syscall handler will pass in NULL + * or 1 depending on what the argument supplied to the fcntl + * was. 
So it is in fact correct to check the ap->a_data + * argument for zero or non-zero value when deciding whether or not + * to enable the static bit in the cnode. + */ + enable_static = 1; + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; + } + cp = VTOC(vp); + + error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK); + if (error == 0) { + if (enable_static) { + cp->c_flag |= C_SSD_STATIC; + } + else { + cp->c_flag &= ~C_SSD_STATIC; + } + hfs_unlock (cp); + } + return error; + } + + case F_SETBACKINGSTORE: { + + int error = 0; + + /* + * See comment in F_SETSTATICCONTENT re: using + * a null check for a_data + */ + if (ap->a_data) { + error = hfs_set_backingstore (vp, 1); + } + else { + error = hfs_set_backingstore (vp, 0); + } + + return error; + } + + case F_GETPATH_MTMINFO: { + int error = 0; + + int *data = (int*) ap->a_data; + + /* Ask if this is a backingstore vnode */ + error = hfs_is_backingstore (vp, data); + + return error; + } + case F_FULLFSYNC: { int error; @@ -2013,10 +2199,6 @@ fail_change_next_allocation: return (error); } - case F_READBOOTSTRAP: - case F_WRITEBOOTSTRAP: - return 0; - case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */ { if (is64bit) { @@ -2036,6 +2218,10 @@ fail_change_next_allocation: *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime; break; + case HFS_FSCTL_GET_VERY_LOW_DISK: + *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit; + break; + case HFS_FSCTL_SET_VERY_LOW_DISK: if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) { return EINVAL; @@ -2044,6 +2230,10 @@ fail_change_next_allocation: hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data; break; + case HFS_FSCTL_GET_LOW_DISK: + *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit; + break; + case HFS_FSCTL_SET_LOW_DISK: if ( *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) { @@ -2054,6 +2244,10 @@ fail_change_next_allocation: hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data; break; + case HFS_FSCTL_GET_DESIRED_DISK: + *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel; + break; + case HFS_FSCTL_SET_DESIRED_DISK: if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) { return EINVAL; @@ -2464,44 +2658,43 @@ retry: } /* Validate if the start offset is within logical file size */ - if (ap->a_foffset > fp->ff_size) { + if (ap->a_foffset >= fp->ff_size) { goto exit; } - /* Searching file extents has failed for read operation, therefore - * search rangelist for any uncommitted holes in the file. + /* + * At this point, we have encountered a failure during + * MapFileBlockC that resulted in ERANGE, and we are not servicing + * a write, and there are borrowed blocks. + * + * However, the cluster layer will not call blockmap for + * blocks that are borrowed and in-cache. We have to assume that + * because we observed ERANGE being emitted from MapFileBlockC, this + * extent range is not valid on-disk. So we treat this as a + * mapping that needs to be zero-filled prior to reading. + * + * Note that under certain circumstances (such as non-contiguous + * userland VM mappings in the calling process), cluster_io + * may be forced to split a large I/O driven by hfs_vnop_write + * into multiple sub-I/Os that necessitate a RMW cycle. 
If this is + * the case here, then we have already removed the invalid range list + * mapping prior to getting to this blockmap call, so we should not + * search the invalid rangelist for this byte range. */ - overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset, - ap->a_foffset + (off_t)(ap->a_size - 1), - &invalid_range); - switch(overlaptype) { - case RL_OVERLAPISCONTAINED: - /* start_offset <= rl_start, end_offset >= rl_end */ - if (ap->a_foffset != invalid_range->rl_start) { - break; - } - case RL_MATCHINGOVERLAP: - /* start_offset = rl_start, end_offset = rl_end */ - case RL_OVERLAPCONTAINSRANGE: - /* start_offset >= rl_start, end_offset <= rl_end */ - case RL_OVERLAPSTARTSBEFORE: - /* start_offset > rl_start, end_offset >= rl_start */ - if ((off_t)fp->ff_size > (invalid_range->rl_end + 1)) { - bytesContAvail = (invalid_range->rl_end + 1) - ap->a_foffset; - } else { - bytesContAvail = fp->ff_size - ap->a_foffset; - } - if (bytesContAvail > ap->a_size) { - bytesContAvail = ap->a_size; - } - *ap->a_bpn = (daddr64_t)-1; - retval = 0; - break; - case RL_OVERLAPENDSAFTER: - /* start_offset < rl_start, end_offset < rl_end */ - case RL_NOOVERLAP: - break; + + bytesContAvail = fp->ff_size - ap->a_foffset; + /* + * Clip the contiguous available bytes to, at most, the allowable + * maximum or the amount requested. + */ + + if (bytesContAvail > ap->a_size) { + bytesContAvail = ap->a_size; } + + *ap->a_bpn = (daddr64_t) -1; + retval = 0; + goto exit; } @@ -2566,7 +2759,6 @@ exit: return (MacToVFSError(retval)); } - /* * prepare and issue the I/O * buf_strategy knows how to deal @@ -2580,28 +2772,53 @@ hfs_vnop_strategy(struct vnop_strategy_args *ap) vnode_t vp = buf_vnode(bp); int error = 0; + /* Mark buffer as containing static data if cnode flag set */ + if (VTOC(vp)->c_flag & C_SSD_STATIC) { + buf_markstatic(bp); + } + #if CONFIG_PROTECT cnode_t *cp = NULL; if ((cp = cp_get_protected_cnode(vp)) != NULL) { - /* - * Some paths to hfs_vnop_strategy will take the cnode lock, - * and some won't. But since content protection is only enabled - * for files that (a) aren't system files and (b) are regular - * files, any valid cnode here will be unlocked. + /* + * We rely upon the truncate lock to protect the + * CP cache key from getting tossed prior to our IO finishing here. + * Nearly all cluster io calls to manipulate file payload from HFS + * take the truncate lock before calling into the cluster + * layer to ensure the file size does not change, or that they + * have exclusive right to change the EOF of the file. + * That same guarantee protects us here since the code that + * deals with CP lock events must now take the truncate lock + * before doing anything. + * + * There is 1 exception here: + * 1) One exception should be the VM swapfile IO, because HFS will + * funnel the VNOP_PAGEOUT directly into a cluster_pageout call for the + * swapfile code only without holding the truncate lock. This is because + * individual swapfiles are maintained at fixed-length sizes by the VM code. + * In non-swapfile IO we use PAGEOUT_V2 semantics which allow us to + * create our own UPL and thus take the truncate lock before calling + * into the cluster layer. In that case, however, we are not concerned + * with the CP blob being wiped out in the middle of the IO + * because there isn't anything to toss; the VM swapfile key stays + * in-core as long as the file is open. 
+ * + * NB: + * For filesystem resize, we may not have access to the underlying + * file's cache key for whatever reason (device may be locked). However, + * we do not need it since we are going to use the temporary HFS-wide resize key + * which is generated once we start relocating file content. If this file's I/O + * should be done using the resize key, it will have been supplied already, so + * do not attach the file's cp blob to the buffer. */ - hfs_lock(cp, HFS_SHARED_LOCK); - buf_setcpaddr(bp, cp->c_cpentry); + if ((cp->c_cpentry->cp_flags & CP_RELOCATION_INFLIGHT) == 0) { + buf_setcpaddr(bp, cp->c_cpentry); + } } #endif /* CONFIG_PROTECT */ error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap); - -#if CONFIG_PROTECT - if (cp) { - hfs_unlock(cp); - } -#endif return error; } @@ -2938,7 +3155,9 @@ hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) { struct filefork *fp = VTOF(vp); struct cnode *cp = VTOC(vp); +#if QUOTA int retval = 0; +#endif /* QUOTA */ /* Cannot truncate an HFS directory! */ if (vnode_isdir(vp)) { @@ -3001,6 +3220,12 @@ hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) { * for use when deleting a file. The simplification here is that we know * that we are releasing all blocks. * + * Note that this function may be called when there is no vnode backing + * the file fork in question. We may call this from hfs_vnop_inactive + * to clear out resource fork data (and may not want to clear out the data + * fork yet). As a result, we pointer-check both sets of inputs before + * doing anything with them. + * * The caller is responsible for saving off a copy of the filefork(s) * embedded within the cnode prior to calling this function. The pointers * supplied as arguments must be valid even if the cnode is no longer valid. @@ -3019,7 +3244,7 @@ hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, blksize = hfsmp->blockSize; /* Data Fork */ - if (datafork->ff_blocks > 0) { + if ((datafork != NULL) && (datafork->ff_blocks > 0)) { fileblocks = datafork->ff_blocks; filebytes = (off_t)fileblocks * (off_t)blksize; @@ -3477,7 +3702,7 @@ hfs_vnop_pagein(struct vnop_pagein_args *ap) fp = VTOF(vp); #if CONFIG_PROTECT - if ((error = cp_handle_vnop(cp, CP_READ_ACCESS | CP_WRITE_ACCESS)) != 0) { + if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) { return error; } #endif /* CONFIG_PROTECT */ @@ -3554,6 +3779,8 @@ retry_pagein: error = EINVAL; goto pagein_done; } + ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1); + isize = ap->a_size; /* @@ -3813,10 +4040,19 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) a_pl_offset = 0; /* - * take truncate lock (shared) to guard against - * zero-fill thru fsync interfering, but only for v2 + * For V2 semantics, we want to take the cnode truncate lock + * shared to guard against the file size changing via zero-filling. + * + * However, we have to be careful because we may be invoked + * via the ubc_msync path to write out dirty mmap'd pages + * in response to a lock event on a content-protected + * filesystem (e.g. to write out class A files). + * As a result, we want to take the truncate lock 'SHARED' with + * the mini-recursion locktype so that we don't deadlock/panic + * because we may be already holding the truncate lock exclusive to force any other + * IOs to have blocked behind us. 
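+ *
+ * An illustrative call sequence (a sketch only; the lockdown entry
+ * point named here is hypothetical, while HFS_RECURSE_TRUNCLOCK and
+ * hfs_vnop_pageout come from this code):
+ *
+ *   cp_lockdown_handler():
+ *       hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
+ *       ubc_msync(vp, 0, ubc_getsize(vp), NULL, UBC_PUSHDIRTY);
+ *           -> hfs_vnop_pageout():
+ *                  hfs_lock_truncate(cp, HFS_RECURSE_TRUNCLOCK);
+ *                  // same thread re-enters instead of deadlocking on
+ *                  // the exclusive truncate lock it already holds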
*/
- hfs_lock_truncate(cp, HFS_SHARED_LOCK);
+ hfs_lock_truncate(cp, HFS_RECURSE_TRUNCLOCK);
if (a_flags & UPL_MSYNC) { request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
@@ -4035,8 +4271,13 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) pageout_done: if (is_pageoutv2) {
- /* release truncate lock (shared) */
- hfs_unlock_truncate(cp, 0);
+ /*
+ * Release the truncate lock. Note that because
+ * we may have taken the lock recursively by
+ * being invoked via ubc_msync due to lockdown,
+ * we should release it recursively, too.
+ */
+ hfs_unlock_truncate(cp, 1);
} return (retval); }
@@ -4173,12 +4414,12 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, return EINVAL; } #endif
- /* If it's an SSD, also disable HFS relocation */ if (hfsmp->hfs_flags & HFS_SSD) { return EINVAL; }
+
blksize = hfsmp->blockSize; if (blockHint == 0) blockHint = hfsmp->nextAllocation;
@@ -4452,7 +4693,7 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize) hfs_unlock(VTOC(vp)); #if CONFIG_PROTECT
- if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) {
+ if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
hfs_lock(VTOC(vp), HFS_FORCE_LOCK); return (error); }
@@ -4477,7 +4718,7 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize) break; } if (uio_resid(auio) != 0) {
- printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", uio_resid(auio));
+ printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
error = EIO; break; }
diff --git a/bsd/hfs/hfs_search.c b/bsd/hfs/hfs_search.c index 878c70dc5..b73ec8339 100644 --- a/bsd/hfs/hfs_search.c +++ b/bsd/hfs/hfs_search.c
@@ -1,5 +1,5 @@ /*
- * Copyright (c) 1997-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 1997-2012 Apple Inc. All rights reserved.
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ *
@@ -66,6 +66,8 @@ #include "hfscommon/headers/BTreeScanner.h" #include "hfscommon/headers/CatalogPrivate.h"
+#if CONFIG_SEARCHFS
+
/* Search criteria. */ struct directoryInfoSpec {
@@ -206,7 +208,9 @@ hfs_vnop_search(ap) return (EINVAL); /*
- * Reject requests for unsupported attributes.
+ * Fail requests for attributes that HFS does not support for the
+ * items that match the search criteria. Note that these checks
+ * are for the OUTBOUND attributes to be returned (not search criteria).
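+ *
+ * For example (illustrative values; only HFS_ATTR_CMN_VALID and the
+ * EINVAL result come from this code):
+ *
+ *   struct attrlist returnattrs;
+ *   returnattrs.commonattr = ATTR_CMN_NAME | ATTR_CMN_OBJTYPE;
+ *       // a subset of HFS_ATTR_CMN_VALID -- accepted
+ *   returnattrs.volattr = ATTR_VOL_SIZE;
+ *       // any outbound volume attribute -- rejected with EINVAL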
*/
if ((ap->a_returnattrs->commonattr & ~HFS_ATTR_CMN_VALID) || (ap->a_returnattrs->volattr != 0) ||
@@ -280,6 +284,7 @@ hfs_vnop_search(ap) err = ENOMEM; goto ExitThisRoutine; }
+ bzero(attributesBuffer, eachReturnBufferSize);
variableBuffer = (void*)((char*) attributesBuffer + fixedBlockSize); // XXXdbg - have to lock the user's buffer so we don't fault
@@ -422,7 +427,7 @@ ExitThisRoutine: if (attributesBuffer) FREE(attributesBuffer, M_TEMP);
- if (hfsmp->jnl && user_start) {
+ if (user_start) {
vsunlock(user_start, user_len, TRUE); }
@@ -1075,7 +1080,7 @@ InsertMatch(struct hfsmount *hfsmp, uio_t a_uio, CatalogRecord *rec, *((u_int32_t *)attributesBuffer) = packedBufferSize; /* Store length of fixed + var block */
- err = uiomove( (caddr_t)attributesBuffer, packedBufferSize, a_uio ); /* XXX should be packedBufferSize */
+ err = uiomove( (caddr_t)attributesBuffer, packedBufferSize, a_uio );
exit: cat_releasedesc(&c_desc);
@@ -1291,4 +1296,4 @@ UnpackSearchAttributeBlock( struct hfsmount *hfsmp, struct attrlist *alist, return (0); } -
+#endif /* CONFIG_SEARCHFS */
diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c index 4e5b76b14..26bafe770 100644 --- a/bsd/hfs/hfs_vfsops.c +++ b/bsd/hfs/hfs_vfsops.c
@@ -88,6 +88,7 @@ #include #include #include +#include #include
@@ -154,9 +155,11 @@ static int hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, vfs_con static int hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, struct HFSPlusCatalogFile *filerec); static int hfs_journal_replay(vnode_t devvp, vfs_context_t context); static int hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context);
+static int hfs_extend_journal(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count, vfs_context_t context);
void hfs_initialize_allocator (struct hfsmount *hfsmp); int hfs_teardown_allocator (struct hfsmount *hfsmp);
+void hfs_unmap_blocks (struct hfsmount *hfsmp);
int hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context); int hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, int journal_replay_only, vfs_context_t context);
@@ -292,7 +295,39 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); hfsmp->hfs_flags |= HFS_READ_ONLY;
- /* also get the volume bitmap blocks */
+ /*
+ * Close down the journal.
+ *
+ * NOTE: It is critically important to close down the journal
+ * and have it issue all pending I/O prior to calling VNOP_FSYNC below.
+ * In a journaled environment it is expected that the journal be
+ * the only actor permitted to issue I/O for metadata blocks in HFS.
+ * If we were to call VNOP_FSYNC prior to closing down the journal,
+ * we would inadvertently issue (and wait for) the I/O we just
+ * initiated above as part of the flushvolumeheader call.
+ *
+ * To avoid this, we follow the same order of operations as in
+ * unmount and issue the journal_close prior to calling VNOP_FSYNC.
+ */
+
+ if (hfsmp->jnl) {
+ hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
+
+ journal_close(hfsmp->jnl);
+ hfsmp->jnl = NULL;
+
+ // Note: we explicitly don't want to shutdown
+ // access to the jvp because we may need
+ // it later if we go back to being read-write.
+
+ hfs_unlock_global (hfsmp);
+ }
+
+
+ /*
+ * Write out any pending I/O still outstanding against the device node
+ * now that the journal has been closed.
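+ *
+ * Condensed, the downgrade path now performs (a sketch of the
+ * surrounding code, with no additional logic):
+ *
+ *   retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);   // 1. flush the VH
+ *   journal_close(hfsmp->jnl);                            // 2. quiesce journal I/O
+ *   hfsmp->jnl = NULL;                                    //    (jvp stays open)
+ *   retval = hfs_fsync(hfsmp->hfs_devvp, MNT_WAIT, 0, p); // 3. then fsync devvp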
+ */ if (!retval) { if (vnode_mount(hfsmp->hfs_devvp) == mp) { retval = hfs_fsync(hfsmp->hfs_devvp, MNT_WAIT, 0, p); @@ -302,6 +337,7 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte vnode_put(hfsmp->hfs_devvp); } } + if (retval) { if (HFS_MOUNT_DEBUG) { printf("hfs_mount: FSYNC on devvp returned %d for fs %s\n", retval, hfsmp->vcbVN); @@ -311,19 +347,7 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte hfsmp->hfs_flags &= ~HFS_READ_ONLY; goto out; } - if (hfsmp->jnl) { - hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); - - journal_close(hfsmp->jnl); - hfsmp->jnl = NULL; - - // Note: we explicitly don't want to shutdown - // access to the jvp because we may need - // it later if we go back to being read-write. - - hfs_unlock_global (hfsmp); - } - + #if CONFIG_HFS_ALLOC_RBTREE (void) hfs_teardown_allocator(hfsmp); #endif @@ -439,7 +463,7 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte * Allow hot file clustering if conditions allow. */ if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) && - ((hfsmp->hfs_flags & HFS_SSD) == 0)) { + ((hfsmp->hfs_mp->mnt_kern_flag & MNTK_SSD) == 0)) { (void) hfs_recording_init(hfsmp); } /* Force ACLs on HFS+ file systems. */ @@ -465,6 +489,9 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte * Only do this if we're operating on a read-write mount (we wouldn't care for read-only), * which has not previously encountered a bad error on the red-black tree code. Also, don't * try to re-build a tree that already exists. + * + * When this is enabled, we must re-integrate the above function into our bitmap iteration + * so that we accurately send TRIMs down to the underlying disk device as needed. */ if (hfsmp->extent_tree_flags == 0) { @@ -504,32 +531,49 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte */ if ((retval == 0) && (cp_fs_protected (mp))) { int err = 0; - struct cp_root_xattr xattr; - bzero (&xattr, sizeof(struct cp_root_xattr)); + + struct cp_root_xattr *xattr = NULL; + MALLOC (xattr, struct cp_root_xattr*, sizeof(struct cp_root_xattr), M_TEMP, M_WAITOK); + if (xattr == NULL) { + err = ENOMEM; + goto badalloc; + } + bzero (xattr, sizeof(struct cp_root_xattr)); hfsmp = vfs_fsprivate(mp); /* go get the EA to get the version information */ - err = cp_getrootxattr (hfsmp, &xattr); - /* If there was no EA there, then write one out. */ + err = cp_getrootxattr (hfsmp, xattr); + /* + * If there was no EA there, then write one out. + * Assuming EA is not present on the root means + * this is an erase install or a very old FS + */ if (err == ENOATTR) { - bzero(&xattr, sizeof(struct cp_root_xattr)); - xattr.major_version = CP_CURRENT_MAJOR_VERS; - xattr.minor_version = CP_CURRENT_MINOR_VERS; - xattr.flags = 0; + printf("No root EA set, creating new EA with new version: %d\n", CP_NEW_MAJOR_VERS); + bzero(xattr, sizeof(struct cp_root_xattr)); + xattr->major_version = CP_NEW_MAJOR_VERS; + xattr->minor_version = CP_MINOR_VERS; + xattr->flags = 0; + + err = cp_setrootxattr (hfsmp, xattr); + } - err = cp_setrootxattr (hfsmp, &xattr); - } /* * For any other error, including having an out of date CP version in the * EA, or for an error out of cp_setrootxattr, deny the mount * and do not proceed further. */ - if (err || xattr.major_version != CP_CURRENT_MAJOR_VERS) { + if (err || (xattr->major_version != CP_NEW_MAJOR_VERS && xattr->major_version != CP_PREV_MAJOR_VERS)) { /* Deny the mount and tear down. 
*/ retval = EPERM; (void) hfs_unmount (mp, MNT_FORCE, context); - } - } + } + printf("Running with CP root xattr: %d.%d\n", xattr->major_version, xattr->minor_version); +badalloc: + if(xattr) { + FREE(xattr, M_TEMP); + } + } #endif } out: @@ -1155,6 +1199,22 @@ bailout: #endif } +void hfs_unmap_blocks (struct hfsmount *hfsmp) { + /* + * Take the allocation file lock. Journal transactions will block until + * we're done here. + */ + int flags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + /* + * UnmapBlocks assumes that the bitmap lock is held when you call the function. + * We don't care if there were any error issuing unmaps yet. + */ + (void) UnmapBlocks(hfsmp); + + hfs_systemfile_unlock(hfsmp, flags); +} + /* * Teardown code for the Red-Black Tree allocator. @@ -1205,7 +1265,6 @@ hfs_teardown_allocator (struct hfsmount *hfsmp) { } - static int hfs_root_unmounted_cleanly = 0; SYSCTL_DECL(_vfs_generic); @@ -1239,6 +1298,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, daddr64_t mdb_offset; int isvirtual = 0; int isroot = 0; + u_int32_t device_features = 0; int isssd; #if CONFIG_HFS_ALLOC_RBTREE thread_t allocator_thread; @@ -1405,9 +1465,21 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, bzero(hfsmp, sizeof(struct hfsmount)); hfs_chashinit_finish(hfsmp); - + /* - * See if the disk is a solid state device. We need this to decide what to do about + * See if the disk supports unmap (trim). + * + * NOTE: vfs_init_io_attributes has not been called yet, so we can't use the io_flags field + * returned by vfs_ioattr. We need to call VNOP_IOCTL ourselves. + */ + if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&device_features, 0, context) == 0) { + if (device_features & DK_FEATURE_UNMAP) { + hfsmp->hfs_flags |= HFS_UNMAP; + } + } + + /* + * See if the disk is a solid state device, too. We need this to decide what to do about * hotfiles. */ if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, context) == 0) { @@ -1434,6 +1506,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, vnode_ref(devvp); /* Hold a ref on the device, dropped when hfsmp is freed. 
*/ hfsmp->hfs_logical_block_size = log_blksize; hfsmp->hfs_logical_block_count = log_blkcnt; + hfsmp->hfs_logical_bytes = (uint64_t) log_blksize * (uint64_t) log_blkcnt; hfsmp->hfs_physical_block_size = phys_blksize; hfsmp->hfs_log_per_phys = (phys_blksize / log_blksize); hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA; @@ -1493,7 +1566,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, /* Mount a standard HFS disk */ if ((SWAP_BE16(mdbp->drSigWord) == kHFSSigWord) && (mntwrapper || (SWAP_BE16(mdbp->drEmbedSigWord) != kHFSPlusSigWord))) { - +#if CONFIG_HFS_STD /* On 10.6 and beyond, non read-only mounts for HFS standard vols get rejected */ if (vfs_isrdwr(mp)) { retval = EROFS; @@ -1529,6 +1602,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, } hfsmp->hfs_logical_block_size = log_blksize; hfsmp->hfs_logical_block_count = log_blkcnt; + hfsmp->hfs_logical_bytes = (uint64_t) log_blksize * (uint64_t) log_blkcnt; hfsmp->hfs_physical_block_size = log_blksize; hfsmp->hfs_log_per_phys = 1; } @@ -1548,6 +1622,11 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, retval = hfs_MountHFSVolume(hfsmp, mdbp, p); if (retval) (void) hfs_relconverter(hfsmp->hfs_encoding); +#else + /* On platforms where HFS Standard is not supported, deny the mount altogether */ + retval = EINVAL; + goto error_exit; +#endif } else /* Mount an HFS Plus disk */ { HFSPlusVolumeHeader *vhp; @@ -1595,6 +1674,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, /* Update logical /physical block size */ hfsmp->hfs_logical_block_size = log_blksize; hfsmp->hfs_physical_block_size = log_blksize; + phys_blksize = log_blksize; hfsmp->hfs_log_per_phys = 1; } @@ -1604,6 +1684,8 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, hfsmp->hfs_logical_block_count = disksize / log_blksize; + hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size; + mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize)); retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), phys_blksize, cred, &bp); @@ -1624,7 +1706,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, } if (isroot) { - hfs_root_unmounted_cleanly = (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) != 0; + hfs_root_unmounted_cleanly = ((SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) != 0); } /* @@ -1758,7 +1840,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, */ if ((retval == ENXIO) && (log_blksize > 512) && (log_blksize != minblksize)) { printf("hfs_mountfs: could not use physical block size " - "(%d) switching to 512\n", log_blksize); + "(%d) switching to 512\n", log_blksize); log_blksize = 512; if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { if (HFS_MOUNT_DEBUG) { @@ -1776,10 +1858,12 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, } devvp->v_specsize = log_blksize; /* Note: relative block count adjustment (in case this is an embedded volume). 
*/
- hfsmp->hfs_logical_block_count *= hfsmp->hfs_logical_block_size / log_blksize;
- hfsmp->hfs_logical_block_size = log_blksize;
- hfsmp->hfs_log_per_phys = hfsmp->hfs_physical_block_size / log_blksize;
-
+ hfsmp->hfs_logical_block_count *= hfsmp->hfs_logical_block_size / log_blksize;
+ hfsmp->hfs_logical_block_size = log_blksize;
+ hfsmp->hfs_log_per_phys = hfsmp->hfs_physical_block_size / log_blksize;
+
+ hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size;
+
if (hfsmp->jnl && hfsmp->jvp == devvp) { // close and re-open this with the new block size journal_close(hfsmp->jnl);
@@ -1931,7 +2015,20 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, } } } -
+
+#if CONFIG_HFS_MOUNT_UNMAP /* Enable UNMAPs for embedded SSDs only for now */ /* * TODO: Should we enable this for CoreStorage volumes, too? */
+ if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) {
+ if (hfsmp->hfs_flags & HFS_UNMAP) {
+ hfs_unmap_blocks(hfsmp);
+ }
+ }
+#endif
+
+
#if CONFIG_HFS_ALLOC_RBTREE /* * We spawn a thread to create the pair of red-black trees for this volume.
@@ -3456,11 +3553,11 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) HFSMasterDirectoryBlock *mdb; struct buf *bp = NULL; int retval;
- int sectorsize;
+ int sector_size;
ByteCount namelen;
- sectorsize = hfsmp->hfs_logical_block_size;
- retval = (int)buf_bread(hfsmp->hfs_devvp, (daddr64_t)HFS_PRI_SECTOR(sectorsize), sectorsize, NOCRED, &bp);
+ sector_size = hfsmp->hfs_logical_block_size;
+ retval = (int)buf_bread(hfsmp->hfs_devvp, (daddr64_t)HFS_PRI_SECTOR(sector_size), sector_size, NOCRED, &bp);
if (retval) { if (bp) buf_brelse(bp);
@@ -3469,7 +3566,7 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) lck_mtx_lock(&hfsmp->hfs_mutex);
- mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp) + HFS_PRI_OFFSET(sectorsize));
+ mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp) + HFS_PRI_OFFSET(sector_size));
mdb->drCrDate = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->hfs_itime))); mdb->drLsMod = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbLsMod)));
@@ -3524,8 +3621,8 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) if (altflush) { struct buf *alt_bp = NULL;
- if (buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_alt_id_sector, sectorsize, NOCRED, &alt_bp) == 0) {
- bcopy(mdb, (char *)buf_dataptr(alt_bp) + HFS_ALT_OFFSET(sectorsize), kMDBSize);
+ if (buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_alt_id_sector, sector_size, NOCRED, &alt_bp) == 0) {
+ bcopy(mdb, (char *)buf_dataptr(alt_bp) + HFS_ALT_OFFSET(sector_size), kMDBSize);
(void) VNOP_BWRITE(alt_bp); } else if (alt_bp)
@@ -3870,9 +3967,10 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) u_int64_t newblkcnt; u_int64_t prev_phys_block_count; u_int32_t addblks;
- u_int64_t sectorcnt;
- u_int32_t sectorsize;
- u_int32_t phys_sectorsize;
+ u_int64_t sector_count;
+ u_int32_t sector_size;
+ u_int32_t phys_sector_size;
+ u_int32_t overage_blocks;
daddr64_t prev_alt_sector; daddr_t bitmapblks; int lockflags = 0;
@@ -3916,33 +4014,33 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) if (error) return (error); }
- if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&sectorsize, 0, context)) {
+ if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&sector_size, 0, context)) {
return (ENXIO); }
- if (sectorsize != hfsmp->hfs_logical_block_size) {
+ if (sector_size != hfsmp->hfs_logical_block_size) {
return (ENXIO); }
- if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&sectorcnt, 0, context)) {
+ if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&sector_count, 0, context)) {
return (ENXIO); }
- if ((sectorsize * sectorcnt) < newsize) {
+ if ((sector_size * sector_count) < newsize) {
printf("hfs_extendfs: not enough space on device\n"); return (ENOSPC); }
- error = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_sectorsize, 0, context);
+ error = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_sector_size, 0, context);
if (error) { if ((error != ENOTSUP) && (error != ENOTTY)) { return (ENXIO); } /* If ioctl is not supported, force physical and logical sector size to be same */
- phys_sectorsize = sectorsize;
+ phys_sector_size = sector_size;
} oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; /* * Validate new size. */
- if ((newsize <= oldsize) || (newsize % sectorsize) || (newsize % phys_sectorsize)) {
+ if ((newsize <= oldsize) || (newsize % sector_size) || (newsize % phys_sector_size)) {
printf("hfs_extendfs: invalid size\n"); return (EINVAL); }
@@ -3966,7 +4064,7 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) } hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS; HFS_MOUNT_UNLOCK(hfsmp, TRUE); -
+
/* Start with a clean journal. */ hfs_journal_flush(hfsmp, TRUE);
@@ -3979,6 +4077,21 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) } transaction_begun = 1;
+
+ /* Update the hfsmp fields for the physical information about the device */
+ prev_phys_block_count = hfsmp->hfs_logical_block_count;
+ prev_alt_sector = hfsmp->hfs_alt_id_sector;
+
+ hfsmp->hfs_logical_block_count = sector_count;
+ /*
+ * Note that the new AltVH location must be based on the device's EOF rather than the new
+ * filesystem's EOF, so we use logical_block_count here rather than newsize.
+ */
+ hfsmp->hfs_alt_id_sector = (hfsmp->hfsPlusIOPosOffset / sector_size) + + HFS_ALT_SECTOR(sector_size, hfsmp->hfs_logical_block_count);
+ hfsmp->hfs_logical_bytes = (uint64_t) sector_count * (uint64_t) sector_size;
+
+
/* * Note: we take the attributes lock in case we have an attribute data vnode * which needs to change size.
@@ -4005,9 +4118,19 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) * After extending the file system, those bits can represent valid * allocation blocks, so we mark all the bits from the end of current * volume to end of allocation bitmap as "free".
+ *
+ * Figure out the number of overage blocks before proceeding though,
+ * so we don't add more bytes to our I/O than necessary.
+ * First figure out the total number of blocks representable by the
+ * end of the bitmap file vs. the total number of blocks in the new FS.
+ * Then subtract away the number of blocks in the current FS. This is how much
+ * we can mark as free right now without having to grow the bitmap file.
*/
- BlockMarkFreeUnused(vcb, vcb->totalBlocks, - (fp->ff_blocks * vcb->blockSize * 8) - vcb->totalBlocks);
+ overage_blocks = fp->ff_blocks * vcb->blockSize * 8;
+ overage_blocks = MIN (overage_blocks, newblkcnt);
+ overage_blocks -= vcb->totalBlocks;
+
+ BlockMarkFreeUnused(vcb, vcb->totalBlocks, overage_blocks);
if (bitmapblks > 0) { daddr64_t blkno;
@@ -4125,14 +4248,8 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) /* * Adjust file system variables for new space.
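 *
 * A worked example of the overage computation above (illustrative
 * numbers only): with a 4 KB allocation block size, a bitmap file of
 * 100 blocks describes 100 * 4096 * 8 = 3,276,800 allocation blocks.
 * Growing a 3,000,000-block volume to newblkcnt = 3,200,000 therefore
 * needs no bitmap growth, and
 *
 *   overage_blocks = MIN(3,276,800, 3,200,000) - 3,000,000 = 200,000
 *
 * blocks can be marked free immediately.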
*/
- prev_phys_block_count = hfsmp->hfs_logical_block_count;
- prev_alt_sector = hfsmp->hfs_alt_id_sector;
-
vcb->totalBlocks += addblks; vcb->freeBlocks += addblks;
- hfsmp->hfs_logical_block_count = newsize / sectorsize;
- hfsmp->hfs_alt_id_sector = (hfsmp->hfsPlusIOPosOffset / sectorsize) + - HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_logical_block_count);
MarkVCBDirty(vcb); error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); if (error) {
@@ -4223,11 +4340,30 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) * * We only update hfsmp->allocLimit if totalBlocks actually increased. */ -
if (error == 0) { UpdateAllocLimit(hfsmp, hfsmp->totalBlocks); } -
+
+ /* Release all locks and sync up journal content before
+ * checking and extending, if required, the journal
+ */
+ if (lockflags) {
+ hfs_systemfile_unlock(hfsmp, lockflags);
+ lockflags = 0;
+ }
+ if (transaction_begun) {
+ hfs_end_transaction(hfsmp);
+ hfs_journal_flush(hfsmp, TRUE);
+ transaction_begun = 0;
+ }
+
+ /* Increase the journal size, if required. */
+ error = hfs_extend_journal(hfsmp, sector_size, sector_count, context);
+ if (error) {
+ printf ("hfs_extendfs: Could not extend journal size\n");
+ goto out_noalloc;
+ }
+
/* Log successful extending */ printf("hfs_extendfs: extended \"%s\" to %d blocks (was %d blocks)\n", hfsmp->vcbVN, hfsmp->totalBlocks, (u_int32_t)(oldsize/hfsmp->blockSize));
@@ -4239,7 +4375,8 @@ out: VTOC(vp)->c_blocks = fp->ff_blocks; } -
+
+out_noalloc:
HFS_MOUNT_LOCK(hfsmp, TRUE); hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS; HFS_MOUNT_UNLOCK(hfsmp, TRUE);
@@ -4530,6 +4667,14 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) */ hfsmp->totalBlocks = newblkcnt; hfsmp->hfs_logical_block_count = newsize / hfsmp->hfs_logical_block_size;
+ hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size;
+
+ /*
+ * Note that although the logical block size is updated here, it is only done for
+ * the benefit of the partition management software. The logical block count change
+ * has not actually been propagated to the disk device yet.
+ */
+
hfsmp->hfs_alt_id_sector = HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, hfsmp->hfs_logical_block_count); MarkVCBDirty(hfsmp); error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
@@ -4641,6 +4786,9 @@ hfs_invalidate_sectors(struct vnode *vp, daddr64_t sectorStart, daddr64_t sector * physical block number of any buffer cache block in the copied extent * (so that if the block is written, it will go through VNOP_BLOCKMAP to * determine the new physical block number).
+ *
+ * At this point, for regular files, we hold the truncate lock exclusive
+ * and the cnode lock exclusive.
*/ static int hfs_copy_extent(
@@ -4677,24 +4825,45 @@ hfs_copy_extent( panic("hfs_copy_extent: vp=%p (cp=%p) not owned?\n", vp, cp); #if CONFIG_PROTECT
- /* Prepare the CP blob and get it ready for use */
- if (!vnode_issystem (vp) && vnode_isreg(vp) && - cp_fs_protected (hfsmp->hfs_mp)) {
+ /*
+ * Prepare the CP blob and get it ready for use, if necessary.
+ *
+ * Note that we specifically *exclude* system vnodes (catalog, bitmap, extents, EAs),
+ * because they are implicitly protected via the media key on iOS. As such, they
+ * must not be relocated except with the media key. So it is OK to not pass down
+ * a special cpentry to the IOMedia/LwVM code for handling.
+ */ + if (!vnode_issystem (vp) && vnode_isreg(vp) && cp_fs_protected (hfsmp->hfs_mp)) { int cp_err = 0; - cp_err = cp_handle_relocate (cp); + /* + * Ideally, the file whose extents we are about to manipulate is using the + * newer offset-based IVs so that we can manipulate it regardless of the + * current lock state. However, we must maintain support for older-style + * EAs. + * + * For the older EA case, the IV was tied to the device LBA for file content. + * This means that encrypted data cannot be moved from one location to another + * in the filesystem without garbling the IV data. As a result, we need to + * access the file's plaintext because we cannot do our AES-symmetry trick + * here. This requires that we attempt a key-unwrap here (via cp_handle_relocate) + * to make forward progress. If the keys are unavailable then we will + * simply stop the resize in its tracks here since we cannot move + * this extent at this time. + */ + if ((cp->c_cpentry->cp_flags & CP_OFF_IV_ENABLED) == 0) { + cp_err = cp_handle_relocate(cp, hfsmp); + } + if (cp_err) { - /* - * can't copy the file because we couldn't set up keys. - * bail out - */ + printf ("hfs_copy_extent: cp_handle_relocate failed (%d) \n", cp_err); return cp_err; } - else { - cpenabled = 1; - } + + cpenabled = 1; } #endif + /* * Determine the I/O size to use * @@ -4725,10 +4894,31 @@ hfs_copy_extent( buf_setblkno(bp, srcSector); buf_setlblkno(bp, srcSector); - /* Attach the CP to the buffer */ + /* + * Note that because this is an I/O to the device vp + * it is correct to have lblkno and blkno both point to the + * start sector being read from. If it were being issued against the + * underlying file then that would be different. + */ + + /* Attach the new CP blob to the buffer if needed */ #if CONFIG_PROTECT if (cpenabled) { - buf_setcpaddr (bp, cp->c_cpentry); + if (cp->c_cpentry->cp_flags & CP_OFF_IV_ENABLED) { + /* attach the RELOCATION_INFLIGHT flag for the underlying call to VNOP_STRATEGY */ + cp->c_cpentry->cp_flags |= CP_RELOCATION_INFLIGHT; + buf_setcpaddr(bp, hfsmp->hfs_resize_cpentry); + } + else { + /* + * Use the cnode's cp key. This file is tied to the + * LBAs of the physical blocks that it occupies. + */ + buf_setcpaddr (bp, cp->c_cpentry); + } + + /* Initialize the content protection file offset to start at 0 */ + buf_setcpoff (bp, 0); } #endif @@ -4737,6 +4927,12 @@ hfs_copy_extent( if (!err) err = buf_biowait(bp); if (err) { +#if CONFIG_PROTECT + /* Turn the flag off in error cases. */ + if (cpenabled) { + cp->c_cpentry->cp_flags &= ~CP_RELOCATION_INFLIGHT; + } +#endif printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (read)\n", err); break; } @@ -4751,17 +4947,39 @@ hfs_copy_extent( buf_markfua(bp); #if CONFIG_PROTECT - /* Attach the CP to the buffer */ + /* Attach the CP to the buffer if needed */ if (cpenabled) { - buf_setcpaddr (bp, cp->c_cpentry); - } + if (cp->c_cpentry->cp_flags & CP_OFF_IV_ENABLED) { + buf_setcpaddr(bp, hfsmp->hfs_resize_cpentry); + } + else { + /* + * Use the cnode's CP key. This file is still tied + * to the LBAs of the physical blocks that it occupies. + */ + buf_setcpaddr (bp, cp->c_cpentry); + } + /* + * The last STRATEGY call may have updated the cp file offset behind our + * back, so we cannot trust it. Re-initialize the content protection + * file offset back to 0 before initiating the write portion of this I/O. 
+ */ + buf_setcpoff (bp, 0); + } #endif /* Do the write */ vnode_startwrite(hfsmp->hfs_devvp); err = VNOP_STRATEGY(bp); - if (!err) + if (!err) { err = buf_biowait(bp); + } +#if CONFIG_PROTECT + /* Turn the flag off regardless once the strategy call finishes. */ + if (cpenabled) { + cp->c_cpentry->cp_flags &= ~CP_RELOCATION_INFLIGHT; + } +#endif if (err) { printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (write)\n", err); break; @@ -5564,7 +5782,7 @@ out: static void hfs_truncatefs_progress(struct hfsmount *hfsmp) { - u_int32_t cur_progress; + u_int32_t cur_progress = 0; hfs_resize_progress(hfsmp, &cur_progress); if (cur_progress > (hfsmp->hfs_resize_progress + 9)) { @@ -5630,6 +5848,17 @@ hfs_reclaim_file(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID, cp = VTOC(vp); + if (hfs_resize_debug) { + const char *filename = (const char *) cp->c_desc.cd_nameptr; + int namelen = cp->c_desc.cd_namelen; + + if (filename == NULL) { + filename = ""; + namelen = 0; + } + printf("hfs_reclaim_file: reclaiming '%.*s'\n", namelen, filename); + } + MALLOC(extent_info, struct hfs_reclaim_extent_info *, sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK); if (extent_info == NULL) { @@ -5895,6 +6124,7 @@ struct hfs_journal_relocate_args { struct hfsmount *hfsmp; vfs_context_t context; u_int32_t newStartBlock; + u_int32_t newBlockCount; }; static errno_t @@ -5910,7 +6140,7 @@ hfs_journal_relocate_callback(void *_args) hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size), hfsmp->blockSize, vfs_context_ucred(args->context), &bp); if (error) { - printf("hfs_reclaim_journal_file: failed to read JIB (%d)\n", error); + printf("hfs_journal_relocate_callback: failed to read JIB (%d)\n", error); if (bp) { buf_brelse(bp); } @@ -5918,18 +6148,18 @@ hfs_journal_relocate_callback(void *_args) } jibp = (JournalInfoBlock*) buf_dataptr(bp); jibp->offset = SWAP_BE64((u_int64_t)args->newStartBlock * hfsmp->blockSize); - jibp->size = SWAP_BE64(hfsmp->jnl_size); + jibp->size = SWAP_BE64((u_int64_t)args->newBlockCount * hfsmp->blockSize); if (journal_uses_fua(hfsmp->jnl)) buf_markfua(bp); error = buf_bwrite(bp); if (error) { - printf("hfs_reclaim_journal_file: failed to write JIB (%d)\n", error); + printf("hfs_journal_relocate_callback: failed to write JIB (%d)\n", error); return error; } if (!journal_uses_fua(hfsmp->jnl)) { error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, args->context); if (error) { - printf("hfs_reclaim_journal_file: DKIOCSYNCHRONIZECACHE failed (%d)\n", error); + printf("hfs_journal_relocate_callback: DKIOCSYNCHRONIZECACHE failed (%d)\n", error); error = 0; /* Don't fail the operation. */ } } @@ -5938,8 +6168,21 @@ hfs_journal_relocate_callback(void *_args) } +/* Type of resize operation in progress */ +#define HFS_RESIZE_TRUNCATE 1 +#define HFS_RESIZE_EXTEND 2 + +/* + * Core function to relocate the journal file. This function takes the + * journal size of the newly relocated journal --- the caller can + * provide a new journal size if they want to change the size of + * the journal. The function takes care of updating the journal info + * block and all other data structures correctly. + * + * Note: This function starts a transaction and grabs the btree locks. 
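+ *
+ * A hypothetical caller (sketch only; the function, the flag, and the
+ * hfsmp fields are from this file, the size is made up):
+ *
+ *   u_int32_t new_jnl_size = 16 * 1024 * 1024;
+ *   error = hfs_relocate_journal_file(hfsmp, new_jnl_size,
+ *                                     HFS_RESIZE_EXTEND, context);
+ *   if (error == 0) {
+ *       // for a volume-internal journal, hfsmp->jnl_start and
+ *       // hfsmp->jnl_size now describe the new on-disk extent
+ *   }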
+ */ static int -hfs_reclaim_journal_file(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context) +hfs_relocate_journal_file(struct hfsmount *hfsmp, u_int32_t jnl_size, int resize_type, vfs_context_t context) { int error; int journal_err; @@ -5948,51 +6191,70 @@ hfs_reclaim_journal_file(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_conte u_int32_t newStartBlock; u_int32_t oldBlockCount; u_int32_t newBlockCount; + u_int32_t jnlBlockCount; + u_int32_t alloc_skipfreeblks; struct cat_desc journal_desc; struct cat_attr journal_attr; struct cat_fork journal_fork; struct hfs_journal_relocate_args callback_args; - if (hfsmp->jnl_start + (hfsmp->jnl_size / hfsmp->blockSize) <= allocLimit) { - /* The journal does not require relocation */ - return 0; + /* Calculate the number of allocation blocks required for the journal */ + jnlBlockCount = howmany(jnl_size, hfsmp->blockSize); + + /* + * During truncatefs(), the volume free block count is updated + * before relocating data and reflects the total number of free + * blocks that will exist on volume after the resize is successful. + * This means that the allocation blocks required for relocation + * have already been reserved and accounted for in the free block + * count. Therefore, block allocation and deallocation routines + * can skip the free block check by passing HFS_ALLOC_SKIPFREEBLKS + * flag. + * + * This special handling is not required when the file system + * is being extended as we want all the allocated and deallocated + * blocks to be accounted for correctly. + */ + if (resize_type == HFS_RESIZE_TRUNCATE) { + alloc_skipfreeblks = HFS_ALLOC_SKIPFREEBLKS; + } else { + alloc_skipfreeblks = 0; } error = hfs_start_transaction(hfsmp); if (error) { - printf("hfs_reclaim_journal_file: hfs_start_transaction returned %d\n", error); + printf("hfs_relocate_journal_file: hfs_start_transaction returned %d\n", error); return error; } lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - oldBlockCount = hfsmp->jnl_size / hfsmp->blockSize; - - /* TODO: Allow the journal to change size based on the new volume size. 
*/ - error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, - HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS, + error = BlockAllocate(hfsmp, 1, jnlBlockCount, jnlBlockCount, + HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | alloc_skipfreeblks, &newStartBlock, &newBlockCount); if (error) { - printf("hfs_reclaim_journal_file: BlockAllocate returned %d\n", error); + printf("hfs_relocate_journal_file: BlockAllocate returned %d\n", error); goto fail; } - if (newBlockCount != oldBlockCount) { - printf("hfs_reclaim_journal_file: newBlockCount != oldBlockCount (%u, %u)\n", newBlockCount, oldBlockCount); + if (newBlockCount != jnlBlockCount) { + printf("hfs_relocate_journal_file: newBlockCount != jnlBlockCount (%u, %u)\n", newBlockCount, jnlBlockCount); goto free_fail; } - error = BlockDeallocate(hfsmp, hfsmp->jnl_start, oldBlockCount, HFS_ALLOC_SKIPFREEBLKS); + error = cat_idlookup(hfsmp, hfsmp->hfs_jnlfileid, 1, &journal_desc, &journal_attr, &journal_fork); if (error) { - printf("hfs_reclaim_journal_file: BlockDeallocate returned %d\n", error); + printf("hfs_relocate_journal_file: cat_idlookup returned %d\n", error); goto free_fail; } - /* Update the catalog record for .journal */ - error = cat_idlookup(hfsmp, hfsmp->hfs_jnlfileid, 1, &journal_desc, &journal_attr, &journal_fork); + oldStartBlock = journal_fork.cf_extents[0].startBlock; + oldBlockCount = journal_fork.cf_extents[0].blockCount; + error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, alloc_skipfreeblks); if (error) { - printf("hfs_reclaim_journal_file: cat_idlookup returned %d\n", error); + printf("hfs_relocate_journal_file: BlockDeallocate returned %d\n", error); goto free_fail; } - oldStartBlock = journal_fork.cf_extents[0].startBlock; + + /* Update the catalog record for .journal */ journal_fork.cf_size = newBlockCount * hfsmp->blockSize; journal_fork.cf_extents[0].startBlock = newStartBlock; journal_fork.cf_extents[0].blockCount = newBlockCount; @@ -6000,54 +6262,117 @@ hfs_reclaim_journal_file(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_conte error = cat_update(hfsmp, &journal_desc, &journal_attr, &journal_fork, NULL); cat_releasedesc(&journal_desc); /* all done with cat descriptor */ if (error) { - printf("hfs_reclaim_journal_file: cat_update returned %d\n", error); + printf("hfs_relocate_journal_file: cat_update returned %d\n", error); goto free_fail; } - callback_args.hfsmp = hfsmp; - callback_args.context = context; - callback_args.newStartBlock = newStartBlock; - error = journal_relocate(hfsmp->jnl, (off_t)newStartBlock*hfsmp->blockSize, - (off_t)newBlockCount*hfsmp->blockSize, 0, - hfs_journal_relocate_callback, &callback_args); - if (error) { - /* NOTE: journal_relocate will mark the journal invalid. */ - printf("hfs_reclaim_journal_file: journal_relocate returned %d\n", error); - goto fail; + /* + * If the journal is part of the file system, then tell the journal + * code about the new location. If the journal is on an external + * device, then just keep using it as-is. + */ + if (hfsmp->jvp == hfsmp->hfs_devvp) { + callback_args.hfsmp = hfsmp; + callback_args.context = context; + callback_args.newStartBlock = newStartBlock; + callback_args.newBlockCount = newBlockCount; + + error = journal_relocate(hfsmp->jnl, (off_t)newStartBlock*hfsmp->blockSize, + (off_t)newBlockCount*hfsmp->blockSize, 0, + hfs_journal_relocate_callback, &callback_args); + if (error) { + /* NOTE: journal_relocate will mark the journal invalid. 
*/ + printf("hfs_relocate_journal_file: journal_relocate returned %d\n", error); + goto fail; + } + if (hfs_resize_debug) { + printf ("hfs_relocate_journal_file: Successfully relocated journal from (%u,%u) to (%u,%u)\n", oldStartBlock, oldBlockCount, newStartBlock, newBlockCount); + } + hfsmp->jnl_start = newStartBlock; + hfsmp->jnl_size = (off_t)newBlockCount * hfsmp->blockSize; } - hfsmp->jnl_start = newStartBlock; - hfsmp->jnl_size = (off_t)newBlockCount * hfsmp->blockSize; hfs_systemfile_unlock(hfsmp, lockflags); error = hfs_end_transaction(hfsmp); if (error) { - printf("hfs_reclaim_journal_file: hfs_end_transaction returned %d\n", error); - } - - /* Account for the blocks relocated and print progress */ - hfsmp->hfs_resize_blocksmoved += oldBlockCount; - hfs_truncatefs_progress(hfsmp); - if (!error) { - printf ("hfs_reclaim_journal_file: Relocated %u blocks from journal on \"%s\"\n", - oldBlockCount, hfsmp->vcbVN); - if (hfs_resize_debug) { - printf ("hfs_reclaim_journal_file: Successfully relocated journal from (%u,%u) to (%u,%u)\n", oldStartBlock, oldBlockCount, newStartBlock, newBlockCount); - } + printf("hfs_relocate_journal_file: hfs_end_transaction returned %d\n", error); } + return error; free_fail: journal_err = BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS); if (journal_err) { - printf("hfs_reclaim_journal_file: BlockDeallocate returned %d\n", error); + printf("hfs_relocate_journal_file: BlockDeallocate returned %d\n", error); hfs_mark_volume_inconsistent(hfsmp); } fail: hfs_systemfile_unlock(hfsmp, lockflags); (void) hfs_end_transaction(hfsmp); if (hfs_resize_debug) { - printf ("hfs_reclaim_journal_file: Error relocating journal file (error=%d)\n", error); + printf ("hfs_relocate_journal_file: Error relocating journal file (error=%d)\n", error); + } + return error; +} + + +/* + * Relocate the journal file when the file system is being truncated. + * We do not down-size the journal when the file system size is + * reduced, so we always provide the current journal size to the + * relocate code. + */ +static int +hfs_reclaim_journal_file(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context) +{ + int error = 0; + u_int32_t startBlock; + u_int32_t blockCount = hfsmp->jnl_size / hfsmp->blockSize; + + /* + * Figure out the location of the .journal file. When the journal + * is on an external device, we need to look up the .journal file. + */ + if (hfsmp->jvp == hfsmp->hfs_devvp) { + startBlock = hfsmp->jnl_start; + blockCount = hfsmp->jnl_size / hfsmp->blockSize; + } else { + u_int32_t fileid; + u_int32_t old_jnlfileid; + struct cat_attr attr; + struct cat_fork fork; + + /* + * The cat_lookup inside GetFileInfo will fail because hfs_jnlfileid + * is set, and it is trying to hide the .journal file. So temporarily + * unset the field while calling GetFileInfo. 
+ */ + old_jnlfileid = hfsmp->hfs_jnlfileid; + hfsmp->hfs_jnlfileid = 0; + fileid = GetFileInfo(hfsmp, kHFSRootFolderID, ".journal", &attr, &fork); + hfsmp->hfs_jnlfileid = old_jnlfileid; + if (fileid != old_jnlfileid) { + printf("hfs_reclaim_journal_file: cannot find .journal file!\n"); + return EIO; + } + + startBlock = fork.cf_extents[0].startBlock; + blockCount = fork.cf_extents[0].blockCount; } + + if (startBlock + blockCount <= allocLimit) { + /* The journal file does not require relocation */ + return 0; + } + + error = hfs_relocate_journal_file(hfsmp, blockCount * hfsmp->blockSize, HFS_RESIZE_TRUNCATE, context); + if (error == 0) { + hfsmp->hfs_resize_blocksmoved += blockCount; + hfs_truncatefs_progress(hfsmp); + printf ("hfs_reclaim_journal_file: Relocated %u blocks from journal on \"%s\"\n", + blockCount, hfsmp->vcbVN); + } + return error; } @@ -6134,7 +6459,7 @@ hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs /* Update the catalog record for .journal_info_block */ error = cat_idlookup(hfsmp, hfsmp->hfs_jnlinfoblkid, 1, &jib_desc, &jib_attr, &jib_fork); if (error) { - printf("hfs_reclaim_journal_file: cat_idlookup returned %d\n", error); + printf("hfs_reclaim_journal_info_block: cat_idlookup returned %d\n", error); goto fail; } oldBlock = jib_fork.cf_extents[0].startBlock; @@ -6195,6 +6520,72 @@ fail: } +static u_int64_t +calculate_journal_size(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count) +{ + u_int64_t journal_size; + u_int32_t journal_scale; + +#define DEFAULT_JOURNAL_SIZE (8*1024*1024) +#define MAX_JOURNAL_SIZE (512*1024*1024) + + /* Calculate the journal size for this volume. We want + * at least 8 MB of journal for each 100 GB of disk space. + * We cap the size at 512 MB, unless the allocation block + * size is larger, in which case, we use one allocation + * block. + */ + journal_scale = (sector_size * sector_count) / ((u_int64_t)100 * 1024 * 1024 * 1024); + journal_size = DEFAULT_JOURNAL_SIZE * (journal_scale + 1); + if (journal_size > MAX_JOURNAL_SIZE) { + journal_size = MAX_JOURNAL_SIZE; + } + if (journal_size < hfsmp->blockSize) { + journal_size = hfsmp->blockSize; + } + return journal_size; +} + + +/* + * Calculate the expected journal size based on current partition size. + * If the size of the current journal is less than the calculated size, + * force journal relocation with the new journal size. 
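+ *
+ * A worked example of the sizing policy in calculate_journal_size()
+ * (illustrative numbers only): for a device of 2,000,000,000 sectors
+ * of 512 bytes (roughly 1 TB),
+ *
+ *   journal_scale = (512 * 2,000,000,000) / (100 * 1024^3) = 9
+ *   journal_size  = 8 MB * (9 + 1)                         = 80 MB
+ *
+ * so the journal grows to 80 MB, within the 512 MB cap and the
+ * one-allocation-block floor.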
+ */ +static int +hfs_extend_journal(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count, vfs_context_t context) +{ + int error = 0; + u_int64_t calc_journal_size; + + if (hfsmp->jvp != hfsmp->hfs_devvp) { + if (hfs_resize_debug) { + printf("hfs_extend_journal: not resizing the journal because it is on an external device.\n"); + } + return 0; + } + + calc_journal_size = calculate_journal_size(hfsmp, sector_size, sector_count); + if (calc_journal_size <= hfsmp->jnl_size) { + /* The journal size requires no modification */ + goto out; + } + + if (hfs_resize_debug) { + printf ("hfs_extend_journal: journal old=%u, new=%qd\n", hfsmp->jnl_size, calc_journal_size); + } + + /* Extend the journal to the new calculated size */ + error = hfs_relocate_journal_file(hfsmp, calc_journal_size, HFS_RESIZE_EXTEND, context); + if (error == 0) { + printf ("hfs_extend_journal: Extended journal size to %u bytes on \"%s\"\n", + hfsmp->jnl_size, hfsmp->vcbVN); + } +out: + return error; +} + + /* * This function traverses through all extended attribute records for a given * fileID, and calls function that reclaims data blocks that exist in the @@ -6524,8 +6915,28 @@ hfs_reclaim_filespace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_ prev_blocksmoved = hfsmp->hfs_resize_blocksmoved; if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) { - return ENOMEM; + error = ENOMEM; + goto reclaim_filespace_done; + } + +#if CONFIG_PROTECT + int keys_generated = 0; + /* + * For content-protected filesystems, we may need to relocate files that + * are encrypted. If they use the new-style offset-based IVs, then + * we can move them regardless of the lock state. We create a temporary + * key here that we use to read/write the data, then we discard it at the + * end of the function. 
+ */ + if (cp_fs_protected (hfsmp->hfs_mp)) { + error = cp_entry_gentempkeys(&hfsmp->hfs_resize_cpentry, hfsmp); + if (error) { + printf("hfs_reclaimspace: Error generating temporary keys for resize (%d)\n", error); + goto reclaim_filespace_done; + } } +#endif + bzero(iterator, sizeof(*iterator)); btdata.bufferAddress = &filerec; @@ -6556,6 +6967,9 @@ hfs_reclaim_filespace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_ /* We want to allow open-unlinked files to be moved, so allow_deleted == 1 */ if (hfs_vget(hfsmp, filerec.fileID, &vp, 0, 1) != 0) { + if (hfs_resize_debug) { + printf("hfs_reclaim_filespace: hfs_vget(%u) failed.\n", filerec.fileID); + } continue; } @@ -6615,7 +7029,16 @@ hfs_reclaim_filespace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_ files_moved, hfsmp->vcbVN); } - kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); +reclaim_filespace_done: + if (iterator) { + kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); + } + +#if CONFIG_PROTECT + if (keys_generated) { + cp_entry_destroy(&hfsmp->hfs_resize_cpentry); + } +#endif return error; } @@ -7153,7 +7576,7 @@ hfs_rename_volume(struct vnode *vp, const char *name, proc_t p) if (!error) { strlcpy((char *)vcb->vcbVN, name, sizeof(vcb->vcbVN)); volname_length = strlen ((const char*)vcb->vcbVN); -#define DKIOCCSSETLVNAME _IOW('d', 198, char[1024]) +#define DKIOCCSSETLVNAME _IOW('d', 198, char[256]) /* Send the volume name down to CoreStorage if necessary */ error = utf8_normalizestr(vcb->vcbVN, volname_length, (u_int8_t*)converted_volname, &conv_volname_length, 256, UTF_PRECOMPOSED); if (error == 0) { @@ -7256,6 +7679,7 @@ void hfs_mark_volume_inconsistent(struct hfsmount *hfsmp) static int hfs_journal_replay(vnode_t devvp, vfs_context_t context) { int retval = 0; + int error = 0; struct mount *mp = NULL; struct hfs_mount_args *args = NULL; @@ -7285,7 +7709,10 @@ static int hfs_journal_replay(vnode_t devvp, vfs_context_t context) buf_flushdirtyblks(devvp, TRUE, 0, "hfs_journal_replay"); /* FSYNC the devnode to be sure all data has been flushed */ - retval = VNOP_FSYNC(devvp, MNT_WAIT, context); + error = VNOP_FSYNC(devvp, MNT_WAIT, context); + if (error) { + retval = error; + } out: if (mp) { diff --git a/bsd/hfs/hfs_vfsutils.c b/bsd/hfs/hfs_vfsutils.c index 103232431..84a81a948 100644 --- a/bsd/hfs/hfs_vfsutils.c +++ b/bsd/hfs/hfs_vfsutils.c @@ -676,7 +676,7 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, volname_length = strlen ((const char*)vcb->vcbVN); cat_releasedesc(&cndesc); -#define DKIOCCSSETLVNAME _IOW('d', 198, char[1024]) +#define DKIOCCSSETLVNAME _IOW('d', 198, char[256]) /* Send the volume name down to CoreStorage if necessary */ @@ -3092,7 +3092,7 @@ check_for_tracked_file(struct vnode *vp, time_t ctime, uint64_t op_type, void *a return 0; } - if (VTOC(vp)->c_flags & UF_TRACKED) { + if (VTOC(vp)->c_bsdflags & UF_TRACKED) { // the file has the tracked bit set, so send an event to the tracked-file handler int error; @@ -3137,7 +3137,7 @@ check_for_dataless_file(struct vnode *vp, uint64_t op_type) { int error; - if (vp == NULL || (VTOC(vp)->c_flags & UF_COMPRESSED) == 0 || VTOCMP(vp) == NULL || VTOCMP(vp)->cmp_type != DATALESS_CMPFS_TYPE) { + if (vp == NULL || (VTOC(vp)->c_bsdflags & UF_COMPRESSED) == 0 || VTOCMP(vp) == NULL || VTOCMP(vp)->cmp_type != DATALESS_CMPFS_TYPE) { // there's nothing to do, it's not dataless return 0; } @@ -3155,7 +3155,7 @@ check_for_dataless_file(struct vnode *vp, uint64_t op_type) // printf("hfs: 
dataless: got a signal while waiting for namespace handler...\n"); return EINTR; } - } else if (VTOC(vp)->c_flags & UF_COMPRESSED) { + } else if (VTOC(vp)->c_bsdflags & UF_COMPRESSED) { // // if we're here, the dataless bit is still set on the file // which means it didn't get handled. we return an error diff --git a/bsd/hfs/hfs_vnops.c b/bsd/hfs/hfs_vnops.c index 623f61d18..e48966c3c 100644 --- a/bsd/hfs/hfs_vnops.c +++ b/bsd/hfs/hfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -47,7 +48,7 @@ #include #include #include - +#include #include #include @@ -89,11 +90,13 @@ int hfs_removedir(struct vnode *, struct vnode *, struct componentname *, int hfs_removefile(struct vnode *, struct vnode *, struct componentname *, int, int, int, struct vnode *, int); +/* Used here and in cnode teardown -- for symlinks */ +int hfs_removefile_callback(struct buf *bp, void *hfsmp); + int hfs_movedata (struct vnode *, struct vnode*); static int hfs_move_fork (struct filefork *srcfork, struct cnode *src, struct filefork *dstfork, struct cnode *dst); - #if FIFO static int hfsfifo_read(struct vnop_read_args *); static int hfsfifo_write(struct vnop_write_args *); @@ -136,6 +139,27 @@ int hfsspec_close(struct vnop_close_args *); * *****************************************************************************/ +/* + * Is the given cnode either the .journal or .journal_info_block file on + * a volume with an active journal? Many VNOPs use this to deny access + * to those files. + * + * Note: the .journal file on a volume with an external journal still + * returns true here, even though it does not actually hold the contents + * of the volume's journal. + */ +static _Bool +hfs_is_journal_file(struct hfsmount *hfsmp, struct cnode *cp) +{ + if (hfsmp->jnl != NULL && + (cp->c_fileid == hfsmp->hfs_jnlinfoblkid || + cp->c_fileid == hfsmp->hfs_jnlfileid)) { + return true; + } else { + return false; + } +} + /* * Create a regular file. */ @@ -307,7 +331,7 @@ hfs_file_is_compressed(struct cnode *cp, int skiplock) int ret = 0; /* fast check to see if file is compressed. If flag is clear, just answer no */ - if (!(cp->c_flags & UF_COMPRESSED)) { + if (!(cp->c_bsdflags & UF_COMPRESSED)) { return 0; } @@ -471,15 +495,15 @@ hfs_vnop_open(struct vnop_open_args *ap) /* * Files marked append-only must be opened for appending. */ - if ((cp->c_flags & APPEND) && !vnode_isdir(vp) && + if ((cp->c_bsdflags & APPEND) && !vnode_isdir(vp) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); if (vnode_isreg(vp) && !UBCINFOEXISTS(vp)) return (EBUSY); /* file is in use by the kernel */ - /* Don't allow journal file to be opened externally. */ - if (cp->c_fileid == hfsmp->hfs_jnlfileid) + /* Don't allow journal to be opened externally. */ + if (hfs_is_journal_file(hfsmp, cp)) return (EPERM); if ((hfsmp->hfs_flags & HFS_READ_ONLY) || @@ -706,7 +730,7 @@ hfs_vnop_getattr(struct vnop_getattr_args *ap) vap->va_uid = cp->c_uid; vap->va_gid = cp->c_gid; vap->va_mode = cp->c_mode; - vap->va_flags = cp->c_flags; + vap->va_flags = cp->c_bsdflags; vap->va_supported |= VNODE_ATTR_AUTH & ~VNODE_ATTR_va_acl; if ((cp->c_attr.ca_recflags & kHFSHasSecurityMask) == 0) { @@ -926,13 +950,12 @@ hfs_vnop_getattr(struct vnop_getattr_args *ap) } } - /* XXX is this really a good 'optimal I/O size'? 
*/ vap->va_iosize = hfsmp->hfs_logBlockSize; vap->va_uid = cp->c_uid; vap->va_gid = cp->c_gid; vap->va_mode = cp->c_mode; - vap->va_flags = cp->c_flags; + vap->va_flags = cp->c_bsdflags; /* * Exporting file IDs from HFS Plus: @@ -1122,15 +1145,15 @@ hfs_vnop_setattr(ap) #if CONFIG_PROTECT - if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) { + if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) { return (error); } #endif /* CONFIG_PROTECT */ hfsmp = VTOHFS(vp); - /* Don't allow modification of the journal file. */ - if (hfsmp->hfs_jnlfileid == VTOC(vp)->c_fileid) { + /* Don't allow modification of the journal. */ + if (hfs_is_journal_file(hfsmp, VTOC(vp))) { return (EPERM); } @@ -1248,7 +1271,7 @@ hfs_vnop_setattr(ap) u_int16_t *fdFlags; #if HFS_COMPRESSION - if ((cp->c_flags ^ vap->va_flags) & UF_COMPRESSED) { + if ((cp->c_bsdflags ^ vap->va_flags) & UF_COMPRESSED) { /* * the UF_COMPRESSED was toggled, so reset our cached compressed state * but we don't want to actually do the update until we've released the cnode lock down below @@ -1259,7 +1282,7 @@ hfs_vnop_setattr(ap) } #endif - cp->c_flags = vap->va_flags; + cp->c_bsdflags = vap->va_flags; cp->c_touch_chgtime = TRUE; /* @@ -1376,14 +1399,9 @@ hfs_chmod(struct vnode *vp, int mode, __unused kauth_cred_t cred, __unused struc if (VTOVCB(vp)->vcbSigWord != kHFSPlusSigWord) return (0); - // XXXdbg - don't allow modification of the journal or journal_info_block - if (VTOHFS(vp)->jnl && cp && cp->c_datafork) { - struct HFSPlusExtentDescriptor *extd; - - extd = &cp->c_datafork->ff_extents[0]; - if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) { - return EPERM; - } + // Don't allow modification of the journal or journal_info_block + if (hfs_is_journal_file(VTOHFS(vp), cp)) { + return EPERM; } #if OVERRIDE_UNKNOWN_PERMISSIONS @@ -1422,7 +1440,7 @@ hfs_write_access(struct vnode *vp, kauth_cred_t cred, struct proc *p, Boolean co } /* If immutable bit set, nobody gets to write it. */ - if (considerFlags && (cp->c_flags & IMMUTABLE)) + if (considerFlags && (cp->c_bsdflags & IMMUTABLE)) return (EPERM); /* Otherwise, user id 0 always gets access. */ @@ -1618,6 +1636,18 @@ hfs_vnop_exchange(ap) orig_from_ctime = VTOC(from_vp)->c_ctime; orig_to_ctime = VTOC(to_vp)->c_ctime; + +#if CONFIG_PROTECT + /* + * Do not allow exchangedata/F_MOVEDATAEXTENTS on data-protected filesystems + * because the EAs will not be swapped. As a result, the persistent keys would not + * match and the files will be garbage. + */ + if (cp_fs_protected (vnode_mount(from_vp))) { + return EINVAL; + } +#endif + #if HFS_COMPRESSION if ( hfs_file_is_compressed(VTOC(from_vp), 0) ) { if ( 0 != ( error = decmpfs_decompress_file(from_vp, VTOCMP(from_vp), -1, 0, 1) ) ) { @@ -1639,8 +1669,7 @@ hfs_vnop_exchange(ap) if ((ap->a_options & FSOPT_EXCHANGE_DATA_ONLY) == 0) { check_for_tracked_file(from_vp, orig_from_ctime, NAMESPACE_HANDLER_WRITE_OP, NULL); check_for_tracked_file(to_vp, orig_to_ctime, NAMESPACE_HANDLER_WRITE_OP, NULL); - } - else { + } else { /* * We're doing a data-swap. * Take the truncate lock/cnode lock, then verify there are no mmap references. 
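/*
 * For reference, the comments here name the two ways this VNOP is
 * reached: the exchangedata(2) syscall and the F_MOVEDATAEXTENTS
 * fcntl (the FSOPT_EXCHANGE_DATA_ONLY case above). A minimal
 * userspace sketch of the former (illustrative only, not part of
 * this source):
 *
 *   #include <unistd.h>
 *
 *   // Atomically swap the contents of two files on the same volume;
 *   // returns 0, or -1 with errno set (e.g. EINVAL when the volume
 *   // is content-protected, per the check above).
 *   int
 *   swap_files(const char *a, const char *b)
 *   {
 *       return exchangedata(a, b, 0);
 *   }
 */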
@@ -1675,7 +1704,6 @@ hfs_vnop_exchange(ap) } } - if ((error = hfs_lockpair(VTOC(from_vp), VTOC(to_vp), HFS_EXCLUSIVE_LOCK))) return (error); @@ -1690,27 +1718,13 @@ hfs_vnop_exchange(ap) goto exit; } - // XXXdbg - don't allow modification of the journal or journal_info_block - if (hfsmp->jnl) { - struct HFSPlusExtentDescriptor *extd; - - if (from_cp->c_datafork) { - extd = &from_cp->c_datafork->ff_extents[0]; - if (extd->startBlock == VTOVCB(from_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { - error = EPERM; - goto exit; - } - } - - if (to_cp->c_datafork) { - extd = &to_cp->c_datafork->ff_extents[0]; - if (extd->startBlock == VTOVCB(to_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { - error = EPERM; - goto exit; - } - } + // Don't allow modification of the journal or journal_info_block + if (hfs_is_journal_file(hfsmp, from_cp) || + hfs_is_journal_file(hfsmp, to_cp)) { + error = EPERM; + goto exit; } - + /* * Ok, now that all of the pre-flighting is done, call the underlying * function if needed. @@ -1720,7 +1734,7 @@ hfs_vnop_exchange(ap) goto exit; } - + if ((error = hfs_start_transaction(hfsmp)) != 0) { goto exit; } @@ -1802,7 +1816,7 @@ hfs_vnop_exchange(ap) from_cp->c_ctime = to_cp->c_ctime; from_cp->c_gid = to_cp->c_gid; from_cp->c_uid = to_cp->c_uid; - from_cp->c_flags = to_cp->c_flags; + from_cp->c_bsdflags = to_cp->c_bsdflags; from_cp->c_mode = to_cp->c_mode; from_cp->c_linkcount = to_cp->c_linkcount; from_cp->c_flag = to_cp->c_flag & (C_HARDLINK | C_HASXATTRS); @@ -1818,7 +1832,7 @@ hfs_vnop_exchange(ap) to_cp->c_ctime = tempattr.ca_ctime; to_cp->c_gid = tempattr.ca_gid; to_cp->c_uid = tempattr.ca_uid; - to_cp->c_flags = tempattr.ca_flags; + to_cp->c_bsdflags = tempattr.ca_flags; to_cp->c_mode = tempattr.ca_mode; to_cp->c_linkcount = tempattr.ca_linkcount; to_cp->c_flag = tempflag; @@ -1832,14 +1846,14 @@ hfs_vnop_exchange(ap) * When a file moves out of "Cleanup At Startup" * we can drop its NODUMP status. 
*/ - if ((from_cp->c_flags & UF_NODUMP) && + if ((from_cp->c_bsdflags & UF_NODUMP) && (from_cp->c_parentcnid != to_cp->c_parentcnid)) { - from_cp->c_flags &= ~UF_NODUMP; + from_cp->c_bsdflags &= ~UF_NODUMP; from_cp->c_touch_chgtime = TRUE; } - if ((to_cp->c_flags & UF_NODUMP) && + if ((to_cp->c_bsdflags & UF_NODUMP) && (to_cp->c_parentcnid != from_cp->c_parentcnid)) { - to_cp->c_flags &= ~UF_NODUMP; + to_cp->c_bsdflags &= ~UF_NODUMP; to_cp->c_touch_chgtime = TRUE; } @@ -1867,7 +1881,7 @@ hfs_vnop_mmap(struct vnop_mmap_args *ap) int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ time_t orig_ctime = VTOC(vp)->c_ctime; - if (!compressed && (VTOC(vp)->c_flags & UF_COMPRESSED)) { + if (!compressed && (VTOC(vp)->c_bsdflags & UF_COMPRESSED)) { error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); if (error != 0) { return error; @@ -1919,7 +1933,7 @@ hfs_vnop_mmap(struct vnop_mmap_args *ap) * */ int hfs_movedata (struct vnode *from_vp, struct vnode *to_vp) { - + struct cnode *from_cp; struct cnode *to_cp; struct hfsmount *hfsmp = NULL; @@ -1928,13 +1942,13 @@ int hfs_movedata (struct vnode *from_vp, struct vnode *to_vp) { int lockflags = 0; int overflow_blocks; int rsrc = 0; - - + + /* Get the HFS pointers */ from_cp = VTOC(from_vp); to_cp = VTOC(to_vp); hfsmp = VTOHFS(from_vp); - + /* Verify that neither source/dest file is open-unlinked */ if (from_cp->c_flag & (C_DELETED | C_NOEXISTS)) { error = EBUSY; @@ -1966,7 +1980,7 @@ int hfs_movedata (struct vnode *from_vp, struct vnode *to_vp) { if (from_cp->c_rsrc_vp == from_vp) { rsrc = 1; } - + /* * We assume that the destination file is already empty. * Verify that it is. @@ -1983,7 +1997,7 @@ int hfs_movedata (struct vnode *from_vp, struct vnode *to_vp) { goto movedata_exit; } } - + /* If the source has the rsrc open, make sure the destination is also the rsrc */ if (rsrc) { if (to_vp != to_cp->c_rsrc_vp) { @@ -1996,9 +2010,9 @@ int hfs_movedata (struct vnode *from_vp, struct vnode *to_vp) { if (to_vp != to_cp->c_vp) { error = EINVAL; goto movedata_exit; - } + } } - + /* * See if the source file has overflow extents. If it doesn't, we don't * need to call into MoveData, and the catalog will be enough. @@ -2009,15 +2023,15 @@ int hfs_movedata (struct vnode *from_vp, struct vnode *to_vp) { else { overflow_blocks = overflow_extents(from_cp->c_datafork); } - + if ((error = hfs_start_transaction (hfsmp)) != 0) { goto movedata_exit; } started_tr = 1; - + /* Lock the system files: catalog, extents, attributes */ lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - + /* Copy over any catalog allocation data into the new spot. */ if (rsrc) { if ((error = hfs_move_fork (from_cp->c_rsrcfork, from_cp, to_cp->c_rsrcfork, to_cp))){ @@ -2031,7 +2045,7 @@ int hfs_movedata (struct vnode *from_vp, struct vnode *to_vp) { goto movedata_exit; } } - + /* * Note that because all we're doing is moving the extents around, we can * probably do this in a single transaction: Each extent record (group of 8) @@ -2052,7 +2066,7 @@ int hfs_movedata (struct vnode *from_vp, struct vnode *to_vp) { error = MoveData (hfsmp, from_cp->c_cnid, to_cp->c_cnid, 0); } } - + if (error) { /* Reverse the operation. 
Copy the fork data back into the source */ if (rsrc) { @@ -2067,52 +2081,52 @@ int hfs_movedata (struct vnode *from_vp, struct vnode *to_vp) { struct cat_fork *src_rsrc = NULL; struct cat_fork *dst_data = NULL; struct cat_fork *dst_rsrc = NULL; - + /* Touch the times*/ to_cp->c_touch_acctime = TRUE; to_cp->c_touch_chgtime = TRUE; to_cp->c_touch_modtime = TRUE; - + from_cp->c_touch_acctime = TRUE; from_cp->c_touch_chgtime = TRUE; from_cp->c_touch_modtime = TRUE; - + hfs_touchtimes(hfsmp, to_cp); hfs_touchtimes(hfsmp, from_cp); - + if (from_cp->c_datafork) { src_data = &from_cp->c_datafork->ff_data; } if (from_cp->c_rsrcfork) { src_rsrc = &from_cp->c_rsrcfork->ff_data; } - + if (to_cp->c_datafork) { dst_data = &to_cp->c_datafork->ff_data; } if (to_cp->c_rsrcfork) { dst_rsrc = &to_cp->c_rsrcfork->ff_data; } - + /* Update the catalog nodes */ (void) cat_update(hfsmp, &from_cp->c_desc, &from_cp->c_attr, - src_data, src_rsrc); - + src_data, src_rsrc); + (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, - dst_data, dst_rsrc); - + dst_data, dst_rsrc); + } /* unlock the system files */ hfs_systemfile_unlock(hfsmp, lockflags); - - + + movedata_exit: if (started_tr) { hfs_end_transaction(hfsmp); } - + return error; - + } /* @@ -2123,31 +2137,31 @@ movedata_exit: * non overflow-extent extents into the destination here. */ static int hfs_move_fork (struct filefork *srcfork, struct cnode *src_cp, - struct filefork *dstfork, struct cnode *dst_cp) { + struct filefork *dstfork, struct cnode *dst_cp) { struct rl_entry *invalid_range; int size = sizeof(struct HFSPlusExtentDescriptor); size = size * kHFSPlusExtentDensity; - + /* If the dstfork has any invalid ranges, bail out */ invalid_range = TAILQ_FIRST(&dstfork->ff_invalidranges); if (invalid_range != NULL) { return EFBIG; } - + if (dstfork->ff_data.cf_size != 0 || dstfork->ff_data.cf_new_size != 0) { return EFBIG; } - + /* First copy the invalid ranges */ while ((invalid_range = TAILQ_FIRST(&srcfork->ff_invalidranges))) { off_t start = invalid_range->rl_start; off_t end = invalid_range->rl_end; - + /* Remove it from the srcfork and add it to dstfork */ rl_remove(start, end, &srcfork->ff_invalidranges); rl_add(start, end, &dstfork->ff_invalidranges); } - + /* * Ignore the ff_union. We don't move symlinks or system files. * Now copy the in-catalog extent information @@ -2156,20 +2170,20 @@ static int hfs_move_fork (struct filefork *srcfork, struct cnode *src_cp, dstfork->ff_data.cf_new_size = srcfork->ff_data.cf_new_size; dstfork->ff_data.cf_vblocks = srcfork->ff_data.cf_vblocks; dstfork->ff_data.cf_blocks = srcfork->ff_data.cf_blocks; - + /* just memcpy the whole array of extents to the new location. */ memcpy (dstfork->ff_data.cf_extents, srcfork->ff_data.cf_extents, size); - + /* * Copy the cnode attribute data. * */ src_cp->c_blocks -= srcfork->ff_data.cf_vblocks; src_cp->c_blocks -= srcfork->ff_data.cf_blocks; - + dst_cp->c_blocks += srcfork->ff_data.cf_vblocks; dst_cp->c_blocks += srcfork->ff_data.cf_blocks; - + /* Now delete the entries in the source fork */ srcfork->ff_data.cf_size = 0; srcfork->ff_data.cf_new_size = 0; @@ -2181,8 +2195,7 @@ static int hfs_move_fork (struct filefork *srcfork, struct cnode *src_cp, bzero (srcfork->ff_data.cf_extents, size); return 0; } - - + /* * cnode must be locked @@ -2200,6 +2213,7 @@ hfs_fsync(struct vnode *vp, int waitfor, int fullsync, struct proc *p) int wait; /* all other attributes (e.g. atime, etc.) 
*/ int lockflag; int took_trunc_lock = 0; + int locked_buffers = 0; /* * Applications which only care about data integrity rather than full @@ -2251,7 +2265,7 @@ hfs_fsync(struct vnode *vp, int waitfor, int fullsync, struct proc *p) */ if (fp && (((cp->c_flag & C_ALWAYS_ZEROFILL) && !TAILQ_EMPTY(&fp->ff_invalidranges)) || ((wait || (cp->c_flag & C_ZFWANTSYNC)) && - ((cp->c_flags & UF_NODUMP) == 0) && + ((cp->c_bsdflags & UF_NODUMP) == 0) && UBCINFOEXISTS(vp) && (vnode_issystem(vp) ==0) && cp->c_zftimeout != 0))) { @@ -2318,8 +2332,32 @@ datasync: /* * Flush all dirty buffers associated with a vnode. + * Record how many of them were dirty AND locked (if necessary). */ - buf_flushdirtyblks(vp, waitdata, lockflag, "hfs_fsync"); + locked_buffers = buf_flushdirtyblks_skipinfo(vp, waitdata, lockflag, "hfs_fsync"); + if ((lockflag & BUF_SKIP_LOCKED) && (locked_buffers) && (vnode_vtype(vp) == VLNK)) { + /* + * If there are dirty symlink buffers, then we may need to take action + * to prevent issues later on if we are journaled. If we're fsyncing a + * symlink vnode then we are in one of three cases: + * + * 1) automatic sync has fired. In this case, we don't want the behavior to change. + * + * 2) Someone has opened the FD for the symlink (not what it points to) + * and has issued an fsync against it. This should be rare, and we don't + * want the behavior to change. + * + * 3) We are being called by a vclean which is trying to reclaim this + * symlink vnode. If this is the case, then allowing this fsync to + * proceed WITHOUT flushing the journal could result in the vclean + * invalidating the buffer's blocks before the journal transaction is + * written to disk. To prevent this, we force a journal flush + * if the vnode is in the middle of a recycle (VL_TERMINATE or VL_DEAD is set). + */ + if (vnode_isrecycled(vp)) { + fullsync = 1; + } + } metasync: if (vnode_isreg(vp) && vnode_issystem(vp)) { @@ -2653,7 +2691,7 @@ hfs_removedir(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, * the current directory and thus be * non-empty.) */ - if ((dcp->c_flags & APPEND) || (cp->c_flags & (IMMUTABLE | APPEND))) { + if ((dcp->c_bsdflags & APPEND) || (cp->c_bsdflags & (IMMUTABLE | APPEND))) { error = EPERM; goto out; } @@ -2750,17 +2788,16 @@ hfs_vnop_remove(ap) struct cnode *dcp = VTOC(dvp); struct cnode *cp; struct vnode *rvp = NULL; - struct hfsmount *hfsmp = VTOHFS(vp); int error=0, recycle_rsrc=0; - int drop_rsrc_vnode = 0; time_t orig_ctime; + uint32_t rsrc_vid = 0; if (dvp == vp) { return (EINVAL); } orig_ctime = VTOC(vp)->c_ctime; - if (!vnode_isnamedstream(vp)) { + if ( (!vnode_isnamedstream(vp)) && ((ap->a_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) == 0)) { error = check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_DELETE_OP, NULL); if (error) { // XXXdbg - decide on a policy for handling namespace handler failures! @@ -2771,51 +2808,56 @@ hfs_vnop_remove(ap) cp = VTOC(vp); - /* - * We need to grab the cnode lock on 'cp' before the lockpair() - * to get an iocount on the rsrc fork BEFORE we enter hfs_removefile. - * To prevent other deadlocks, it's best to call hfs_vgetrsrc in a way that - * allows it to drop the cnode lock that it expects to be held coming in. - * If we don't, we could commit a lock order violation, causing a deadlock. - * In order to safely get the rsrc vnode with an iocount, we need to only hold the - * lock on the file temporarily. 
Unlike hfs_vnop_rename, we don't have to worry - * about one rsrc fork getting recycled for another, but we do want to ensure - * that there are no deadlocks due to lock ordering issues. - * +relock: + + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); + + if ((error = hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK))) { + hfs_unlock_truncate(cp, 0); + if (rvp) { + vnode_put (rvp); + } + return (error); + } + + /* + * Lazily respond to determining if there is a valid resource fork + * vnode attached to 'cp' if it is a regular file or symlink. + * If the vnode does not exist, then we may proceed without having to + * create it. + * + * If, however, it does exist, then we need to acquire an iocount on the + * vnode after acquiring its vid. This ensures that if we have to do I/O + * against it, it can't get recycled from underneath us in the middle + * of this call. + * * Note: this function may be invoked for directory hardlinks, so just skip these * steps if 'vp' is a directory. */ if ((vp->v_type == VLNK) || (vp->v_type == VREG)) { + if ((cp->c_rsrc_vp) && (rvp == NULL)) { + /* We need to acquire the rsrc vnode */ + rvp = cp->c_rsrc_vp; + rsrc_vid = vnode_vid (rvp); + + /* Unlock everything to acquire iocount on the rsrc vnode */ + hfs_unlock_truncate (cp, 0); + hfs_unlockpair (dcp, cp); - if ((error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK))) { - return (error); - } - - error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, TRUE); - hfs_unlock(cp); - if (error) { - /* we may have gotten an rsrc vp even though we got an error */ - if (rvp) { - vnode_put(rvp); + /* Use the vid to maintain identity on rvp */ + if (vnode_getwithvid(rvp, rsrc_vid)) { + /* + * If this fails, then it was recycled or + * reclaimed in the interim. Reset fields and + * start over. + */ rvp = NULL; + rsrc_vid = 0; } - return (error); - } - drop_rsrc_vnode = 1; - } - /* Now that we may have an iocount on rvp, do the lock pair */ - - hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); - - if ((error = hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK))) { - hfs_unlock_truncate(cp, 0); - /* drop the iocount on rvp if necessary */ - if (drop_rsrc_vnode) { - vnode_put (rvp); + goto relock; } - return (error); } /* @@ -2827,7 +2869,7 @@ hfs_vnop_remove(ap) goto rm_done; } - error = hfs_removefile(dvp, vp, ap->a_cnp, ap->a_flags, 0, 0, rvp, 0); + error = hfs_removefile(dvp, vp, ap->a_cnp, ap->a_flags, 0, 0, NULL, 0); /* * If the remove succeeded in deleting the file, then we may need to mark @@ -2866,7 +2908,7 @@ rm_done: vnode_recycle(rvp); } - if (drop_rsrc_vnode) { + if (rvp) { /* drop iocount on rsrc fork, was obtained at beginning of fxn */ vnode_put(rvp); } @@ -2875,7 +2917,7 @@ rm_done: } -static int +int hfs_removefile_callback(struct buf *bp, void *hfsmp) { if ( !(buf_flags(bp) & B_META)) @@ -2895,22 +2937,27 @@ hfs_removefile_callback(struct buf *bp, void *hfsmp) { * This function may be used to remove directories if they have * lots of EA's -- note the 'allow_dirs' argument. * - * The 'rvp' argument is used to pass in a resource fork vnode with - * an iocount to prevent it from getting recycled during usage. If it - * is NULL, then it is assumed the caller is a VNOP that cannot operate - * on resource forks, like hfs_vnop_symlink or hfs_removedir. Otherwise in - * a VNOP that takes multiple vnodes, we could violate lock order and - * cause a deadlock. + * This function is able to delete blocks & fork data for the resource + * fork even if it does not exist in core (and have a backing vnode). 
+ * It should infer the correct behavior based on the number of blocks + * in the cnode and whether or not the resource fork pointer exists or + * not. As a result, one only need pass in the 'vp' corresponding to the + * data fork of this file (or main vnode in the case of a directory). + * Passing in a resource fork will result in an error. + * + * Because we do not create any vnodes in this function, we are not at + * risk of deadlocking against ourselves by double-locking. * * Requires cnode and truncate locks to be held. */ int hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, int flags, int skip_reserve, int allow_dirs, - struct vnode *rvp, int only_unlink) + __unused struct vnode *rvp, int only_unlink) { struct cnode *cp; struct cnode *dcp; + struct vnode *rsrc_vp = NULL; struct hfsmount *hfsmp; struct cat_desc desc; struct timeval tv; @@ -2921,7 +2968,7 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, int started_tr = 0; int isbigfile = 0, defer_remove=0, isdir=0; int update_vh = 0; - + cp = VTOC(vp); dcp = VTOC(dvp); hfsmp = VTOHFS(vp); @@ -2939,11 +2986,37 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, if (VNODE_IS_RSRC(vp)) { return (EPERM); } + else { + /* + * We know it's a data fork. + * Probe the cnode to see if we have a valid resource fork + * in hand or not. + */ + rsrc_vp = cp->c_rsrc_vp; + } + /* Don't allow deleting the journal or journal_info_block. */ - if (hfsmp->jnl && - (cp->c_fileid == hfsmp->hfs_jnlfileid || cp->c_fileid == hfsmp->hfs_jnlinfoblkid)) { + if (hfs_is_journal_file(hfsmp, cp)) { return (EPERM); } + + /* + * If removing a symlink, then we need to ensure that the + * data blocks for the symlink are not still in-flight or pending. + * If so, we will unlink the symlink here, making its blocks + * available for re-allocation by a subsequent transaction. That is OK, but + * then the I/O for the data blocks could then go out before the journal + * transaction that created it was flushed, leading to I/O ordering issues. + */ + if (vp->v_type == VLNK) { + /* + * This will block if the asynchronous journal flush is in progress. + * If this symlink is not being renamed over and doesn't have any open FDs, + * then we'll remove it from the journal's bufs below in kill_block. + */ + buf_wait_for_shadow_io (vp, 0); + } + /* * Hard links require special handling. */ @@ -2962,6 +3035,7 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, return hfs_unlink(hfsmp, dvp, vp, cnp, skip_reserve); } } + /* Directories should call hfs_rmdir! (unless they have a lot of attributes) */ if (vnode_isdir(vp)) { if (allow_dirs == 0) @@ -2982,23 +3056,30 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, /* Remove our entry from the namei cache. */ cache_purge(vp); - + /* - * We expect the caller, if operating on files, - * will have passed in a resource fork vnode with - * an iocount, even if there was no content. - * We only do the hfs_truncate on the rsrc fork - * if we know that it DID have content, however. - * This has the bonus of not requiring us to defer - * its removal, unless it is in use. + * If the caller was operating on a file (as opposed to a + * directory with EAs), then we need to figure out + * whether or not it has a valid resource fork vnode. + * + * If there was a valid resource fork vnode, then we need + * to use hfs_truncate to eliminate its data. 
If there is + * no vnode, then we hold the cnode lock which would + * prevent it from being created. As a result, + * we can use the data deletion functions which do not + * require that a cnode/vnode pair exist. */ /* Check if this file is being used. */ if (isdir == 0) { dataforkbusy = vnode_isinuse(vp, 0); - /* Only need to defer resource fork removal if in use and has content */ - if (rvp && (cp->c_blocks - VTOF(vp)->ff_blocks)) { - rsrcforkbusy = vnode_isinuse(rvp, 0); + /* + * At this point, we know that 'vp' points to the + * a data fork because we checked it up front. And if + * there is no rsrc fork, rsrc_vp will be NULL. + */ + if (rsrc_vp && (cp->c_blocks - VTOF(vp)->ff_blocks)) { + rsrcforkbusy = vnode_isinuse(rsrc_vp, 0); } } @@ -3054,7 +3135,7 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, if (!dataforkbusy && cp->c_datafork->ff_blocks && !isbigfile) { cp->c_flag |= C_NEED_DATA_SETSIZE; } - if (!rsrcforkbusy && rvp) { + if (!rsrcforkbusy && rsrc_vp) { cp->c_flag |= C_NEED_RSRC_SETSIZE; } } @@ -3096,8 +3177,12 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, } update_vh = 1; } - if (!rsrcforkbusy && rvp) { - error = hfs_prepare_release_storage (hfsmp, rvp); + + /* + * If the resource fork vnode does not exist, we can skip this step. + */ + if (!rsrcforkbusy && rsrc_vp) { + error = hfs_prepare_release_storage (hfsmp, rsrc_vp); if (error) { goto out; } @@ -3205,17 +3290,55 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, goto out; } - else /* Not busy */ { - + else { + /* + * Nobody is using this item; we can safely remove everything. + */ + struct filefork *temp_rsrc_fork = NULL; #if QUOTA off_t savedbytes; int blksize = hfsmp->blockSize; #endif u_int32_t fileid = cp->c_fileid; - + + /* + * Figure out if we need to read the resource fork data into + * core before wiping out the catalog record. + * + * 1) Must not be a directory + * 2) cnode's c_rsrcfork ptr must be NULL. + * 3) rsrc fork must have actual blocks + */ + if ((isdir == 0) && (cp->c_rsrcfork == NULL) && + (cp->c_blocks - VTOF(vp)->ff_blocks)) { + /* + * The resource fork vnode & filefork did not exist. + * Create a temporary one for use in this function only. 
+ */ + MALLOC_ZONE (temp_rsrc_fork, struct filefork *, sizeof (struct filefork), M_HFSFORK, M_WAITOK); + bzero(temp_rsrc_fork, sizeof(struct filefork)); + temp_rsrc_fork->ff_cp = cp; + rl_init(&temp_rsrc_fork->ff_invalidranges); + } + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + /* Look up the resource fork first, if necessary */ + if (temp_rsrc_fork) { + error = cat_lookup (hfsmp, &desc, 1, (struct cat_desc*) NULL, + (struct cat_attr*) NULL, &temp_rsrc_fork->ff_data, NULL); + if (error) { + FREE_ZONE (temp_rsrc_fork, sizeof(struct filefork), M_HFSFORK); + hfs_systemfile_unlock (hfsmp, lockflags); + goto out; + } + } + if (!skip_reserve) { if ((error = cat_preflight(hfsmp, CAT_DELETE, NULL, 0))) { + if (temp_rsrc_fork) { + FREE_ZONE (temp_rsrc_fork, sizeof(struct filefork), M_HFSFORK); + } hfs_systemfile_unlock(hfsmp, lockflags); goto out; } @@ -3238,7 +3361,11 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, (void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL); } hfs_systemfile_unlock(hfsmp, lockflags); + if (error) { + if (temp_rsrc_fork) { + FREE_ZONE (temp_rsrc_fork, sizeof(struct filefork), M_HFSFORK); + } goto out; } @@ -3253,9 +3380,23 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, (void) hfs_chkdq(cp, (int64_t)-(savedbytes), NOCRED, 0); } - if (cp->c_rsrcfork && (cp->c_rsrcfork->ff_blocks > 0)) { - savedbytes = ((off_t)cp->c_rsrcfork->ff_blocks * (off_t)blksize); - (void) hfs_chkdq(cp, (int64_t)-(savedbytes), NOCRED, 0); + /* + * We may have just deleted the catalog record for a resource fork even + * though it did not exist in core as a vnode. However, just because there + * was a resource fork pointer in the cnode does not mean that it had any blocks. + */ + if (temp_rsrc_fork || cp->c_rsrcfork) { + if (cp->c_rsrcfork) { + if (cp->c_rsrcfork->ff_blocks > 0) { + savedbytes = ((off_t)cp->c_rsrcfork->ff_blocks * (off_t)blksize); + (void) hfs_chkdq(cp, (int64_t)-(savedbytes), NOCRED, 0); + } + } + else { + /* we must have used a temporary fork */ + savedbytes = ((off_t)temp_rsrc_fork->ff_blocks * (off_t)blksize); + (void) hfs_chkdq(cp, (int64_t)-(savedbytes), NOCRED, 0); + } } if (hfsmp->hfs_flags & HFS_QUOTAS) { @@ -3263,13 +3404,17 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, } #endif - /* * If we didn't get any errors deleting the catalog entry, then go ahead * and release the backing store now. The filefork pointers are still valid. - */ - error = hfs_release_storage (hfsmp, cp->c_datafork, cp->c_rsrcfork, fileid); - + */ + if (temp_rsrc_fork) { + error = hfs_release_storage (hfsmp, cp->c_datafork, temp_rsrc_fork, fileid); + } + else { + /* if cp->c_rsrcfork == NULL, hfs_release_storage will skip over it. */ + error = hfs_release_storage (hfsmp, cp->c_datafork, cp->c_rsrcfork, fileid); + } if (error) { /* * If we encountered an error updating the extents and bitmap, @@ -3284,7 +3429,12 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, /* reset update_vh to 0, since hfs_release_storage should have done it for us */ update_vh = 0; } - + + /* Get rid of the temporary rsrc fork */ + if (temp_rsrc_fork) { + FREE_ZONE (temp_rsrc_fork, sizeof(struct filefork), M_HFSFORK); + } + cp->c_flag |= C_NOEXISTS; cp->c_flag &= ~C_DELETED; @@ -3358,6 +3508,7 @@ replace_desc(struct cnode *cp, struct cat_desc *cdp) cdp->cd_flags &= ~CD_HASBUF; } + /* * Rename a cnode. 
* @@ -3397,8 +3548,13 @@ hfs_vnop_rename(ap) struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; - struct vnode *fvp_rsrc = NULLVP; + /* + * Note that we only need locals for the target/destination's + * resource fork vnode (and only if necessary). We don't care if the + * source has a resource fork vnode or not. + */ struct vnode *tvp_rsrc = NULLVP; + uint32_t tvp_rsrc_vid = 0; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct proc *p = vfs_context_proc(ap->a_context); @@ -3479,77 +3635,13 @@ hfs_vnop_rename(ap) } } - /* - * Before grabbing the four locks, we may need to get an iocount on the resource fork - * vnodes in question, just like hfs_vnop_remove. If fvp and tvp are not - * directories, then go ahead and grab the resource fork vnodes now - * one at a time. We don't actively need the fvp_rsrc to do the rename operation, - * but we need the iocount to prevent the vnode from getting recycled/reclaimed - * during the middle of the VNOP. - */ - - - if ((vnode_isreg(fvp)) || (vnode_islnk(fvp))) { - - if ((error = hfs_lock (VTOC(fvp), HFS_EXCLUSIVE_LOCK))) { - return (error); - } - /* - * We care if we race against rename/delete with this cp, so we'll error out - * if the file becomes open-unlinked during this call. - */ - error = hfs_vgetrsrc(VTOHFS(fvp), fvp, &fvp_rsrc, TRUE, TRUE); - hfs_unlock (VTOC(fvp)); - if (error) { - if (fvp_rsrc) { - vnode_put(fvp_rsrc); - } - return error; - } - } - - if (tvp && (vnode_isreg(tvp) || vnode_islnk(tvp))) { - /* - * Lock failure is OK on tvp, since we may race with a remove on the dst. - * But this shouldn't stop rename from proceeding, so only try to - * grab the resource fork if the lock succeeded. - */ - if (hfs_lock (VTOC(tvp), HFS_EXCLUSIVE_LOCK) == 0) { - tcp = VTOC(tvp); - /* - * We only care if we get an open-unlinked file on the dst so we - * know to null out tvp/tcp to make the rename operation act - * as if they never existed. Because they're effectively out of the - * namespace already it's fine to do this. If this is true, then - * make sure to unlock the cnode and drop the iocount only after the unlock. - */ - - error = hfs_vgetrsrc(VTOHFS(tvp), tvp, &tvp_rsrc, TRUE, TRUE); - hfs_unlock (tcp); - if (error) { - /* - * Since we specify TRUE for error_on_unlinked in hfs_vgetrsrc, - * we can get a rsrc fork vnode even if it returns an error. - */ - tcp = NULL; - tvp = NULL; - if (tvp_rsrc) { - vnode_put (tvp_rsrc); - tvp_rsrc = NULL; - } - /* just bypass truncate lock and act as if we never got tcp/tvp */ - goto retry; - } - } - } - +retry: /* When tvp exists, take the truncate lock for hfs_removefile(). */ if (tvp && (vnode_isreg(tvp) || vnode_islnk(tvp))) { hfs_lock_truncate(VTOC(tvp), HFS_EXCLUSIVE_LOCK); took_trunc_lock = 1; } - retry: error = hfs_lockfour(VTOC(fdvp), VTOC(fvp), VTOC(tdvp), tvp ? VTOC(tvp) : NULL, HFS_EXCLUSIVE_LOCK, &error_cnode); if (error) { @@ -3557,6 +3649,20 @@ hfs_vnop_rename(ap) hfs_unlock_truncate(VTOC(tvp), 0); took_trunc_lock = 0; } + + /* + * We hit an error path. If we were trying to re-acquire the locks + * after coming through here once, we might have already obtained + * an iocount on tvp's resource fork vnode. Drop that before dealing + * with the failure. Note this is safe -- since we are in an + * error handling path, we can't be holding the cnode locks. + */ + if (tvp_rsrc) { + vnode_put (tvp_rsrc); + tvp_rsrc_vid = 0; + tvp_rsrc = NULL; + } + /* * tvp might no longer exist. 
If the cause of the lock failure * was tvp, then we can try again with tvp/tcp set to NULL. @@ -3568,13 +3674,7 @@ hfs_vnop_rename(ap) tvp = NULL; goto retry; } - /* otherwise, drop iocounts on the rsrc forks and bail out */ - if (fvp_rsrc) { - vnode_put (fvp_rsrc); - } - if (tvp_rsrc) { - vnode_put (tvp_rsrc); - } + return (error); } @@ -3583,6 +3683,37 @@ hfs_vnop_rename(ap) tdcp = VTOC(tdvp); tcp = tvp ? VTOC(tvp) : NULL; + /* + * Acquire iocounts on the destination's resource fork vnode + * if necessary. If dst/src are files and the dst has a resource + * fork vnode, then we need to try and acquire an iocount on the rsrc vnode. + * If it does not exist, then we don't care and can skip it. + */ + if ((vnode_isreg(fvp)) || (vnode_islnk(fvp))) { + if ((tvp) && (tcp->c_rsrc_vp) && (tvp_rsrc == NULL)) { + tvp_rsrc = tcp->c_rsrc_vp; + /* + * We can look at the vid here because we're holding the + * cnode lock on the underlying cnode for this rsrc vnode. + */ + tvp_rsrc_vid = vnode_vid (tvp_rsrc); + + /* Unlock everything to acquire iocount on this rsrc vnode */ + if (took_trunc_lock) { + hfs_unlock_truncate (VTOC(tvp), 0); + took_trunc_lock = 0; + } + hfs_unlockfour(fdcp, fcp, tdcp, tcp); + + if (vnode_getwithvid (tvp_rsrc, tvp_rsrc_vid)) { + /* iocount acquisition failed. Reset fields and start over.. */ + tvp_rsrc_vid = 0; + tvp_rsrc = NULL; + } + goto retry; + } + } + /* Ensure we didn't race src or dst parent directories with rmdir. */ if (fdcp->c_flag & (C_NOEXISTS | C_DELETED)) { error = ENOENT; @@ -3709,7 +3840,7 @@ hfs_vnop_rename(ap) /* * Make sure "from" vnode and its parent are changeable. */ - if ((fcp->c_flags & (IMMUTABLE | APPEND)) || (fdcp->c_flags & APPEND)) { + if ((fcp->c_bsdflags & (IMMUTABLE | APPEND)) || (fdcp->c_bsdflags & APPEND)) { error = EPERM; goto out; } @@ -3731,6 +3862,13 @@ hfs_vnop_rename(ap) goto out; } + /* Don't allow modification of the journal or journal_info_block */ + if (hfs_is_journal_file(hfsmp, fcp) || + (tcp && hfs_is_journal_file(hfsmp, tcp))) { + error = EPERM; + goto out; + } + #if QUOTA if (tvp) (void)hfs_getinoquota(tcp); @@ -3908,7 +4046,7 @@ hfs_vnop_rename(ap) error = hfs_removedir(tdvp, tvp, tcnp, HFSRM_SKIP_RESERVE, 1); } else { - error = hfs_removefile(tdvp, tvp, tcnp, 0, HFSRM_SKIP_RESERVE, 0, tvp_rsrc, 1); + error = hfs_removefile(tdvp, tvp, tcnp, 0, HFSRM_SKIP_RESERVE, 0, NULL, 1); /* * If the destination file had a resource fork vnode, then we need to get rid of @@ -3980,7 +4118,7 @@ skip_rm: replace_desc(fcp, &out_desc); fcp->c_parentcnid = tdcp->c_fileid; fcp->c_hint = 0; - + /* Now indicate this cnode needs to have date-added written to the finderinfo */ fcp->c_flag |= C_NEEDS_DATEADDED; (void) hfs_update (fvp, 0); @@ -4029,42 +4167,6 @@ skip_rm: tdcp->c_flag |= C_FORCEUPDATE; // XXXdbg - force it out! (void) hfs_update(tdvp, 0); - - - /* Update the vnode's name now that the rename has completed. */ - vnode_update_identity(fvp, tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, - tcnp->cn_hash, (VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME)); - /* - * At this point, we may have a resource fork vnode attached to the - * 'from' vnode. If it exists, we will want to update its name, because - * it contains the old name + _PATH_RSRCFORKSPEC. ("/..namedfork/rsrc"). - * - * Note that the only thing we need to update here is the name attached to - * the vnode, since a resource fork vnode does not have a separate resource - * cnode -- it's still 'fcp'. 
- */ - if (fcp->c_rsrc_vp) { - char* rsrc_path = NULL; - int len; - - /* Create a new temporary buffer that's going to hold the new name */ - MALLOC_ZONE (rsrc_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); - len = snprintf (rsrc_path, MAXPATHLEN, "%s%s", tcnp->cn_nameptr, _PATH_RSRCFORKSPEC); - len = MIN(len, MAXPATHLEN); - - /* - * vnode_update_identity will do the following for us: - * 1) release reference on the existing rsrc vnode's name. - * 2) copy/insert new name into the name cache - * 3) attach the new name to the resource vnode - * 4) update the vnode's vid - */ - vnode_update_identity (fcp->c_rsrc_vp, fvp, rsrc_path, len, 0, (VNODE_UPDATE_NAME | VNODE_UPDATE_CACHE)); - - /* Free the memory associated with the resource fork's name */ - FREE_ZONE (rsrc_path, MAXPATHLEN, M_NAMEI); - } - out: if (got_cookie) { cat_postflight(hfsmp, &cookie, p); @@ -4086,12 +4188,10 @@ out: hfs_unlockfour(fdcp, fcp, tdcp, tcp); - /* Now vnode_put the resource forks vnodes if necessary */ + /* Now vnode_put the resource fork vnode if necessary */ if (tvp_rsrc) { vnode_put(tvp_rsrc); - } - if (fvp_rsrc) { - vnode_put(fvp_rsrc); + tvp_rsrc = NULL; } /* After tvp is removed the only acceptable error is EIO */ @@ -4339,7 +4439,7 @@ hfs_vnop_readdir(ap) if (uio_iovcnt(uio) > 1) return (EINVAL); - if (VTOC(vp)->c_flags & UF_COMPRESSED) { + if (VTOC(vp)->c_bsdflags & UF_COMPRESSED) { int compressed = hfs_file_is_compressed(VTOC(vp), 0); /* 0 == take the cnode lock */ if (VTOCMP(vp) != NULL && !compressed) { error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); @@ -4512,7 +4612,7 @@ hfs_vnop_readdir(ap) } /* Pack the buffer with dirent entries. */ - error = cat_getdirentries(hfsmp, cp->c_entries, dirhint, uio, extended, &items, &eofflag); + error = cat_getdirentries(hfsmp, cp->c_entries, dirhint, uio, ap->a_flags, &items, &eofflag); if (index == 0 && error == 0) { cp->c_dirthreadhint = dirhint->dh_threadhint; @@ -4794,52 +4894,52 @@ hfs_update(struct vnode *vp, __unused int waitfor) return error; } - /* - * Modify the values passed to cat_update based on whether or not - * the file has invalid ranges or borrowed blocks. - */ - if (dataforkp) { - off_t numbytes = 0; - - /* copy the datafork into a temporary copy so we don't pollute the cnode's */ - bcopy(dataforkp, &datafork, sizeof(datafork)); - dataforkp = &datafork; - - /* - * If there are borrowed blocks, ensure that they are subtracted - * from the total block count before writing the cnode entry to disk. - * Only extents that have actually been marked allocated in the bitmap - * should be reflected in the total block count for this fork. - */ - if (cp->c_datafork->ff_unallocblocks != 0) { - // make sure that we don't assign a negative block count - if (cp->c_datafork->ff_blocks < cp->c_datafork->ff_unallocblocks) { - panic("hfs: ff_blocks %d is less than unalloc blocks %d\n", - cp->c_datafork->ff_blocks, cp->c_datafork->ff_unallocblocks); - } - - /* Also cap the LEOF to the total number of bytes that are allocated. */ - datafork.cf_blocks = (cp->c_datafork->ff_blocks - cp->c_datafork->ff_unallocblocks); - datafork.cf_size = datafork.cf_blocks * HFSTOVCB(hfsmp)->blockSize; - } - - /* - * For files with invalid ranges (holes) the on-disk - * field representing the size of the file (cf_size) - * must be no larger than the start of the first hole. - * However, note that if the first invalid range exists - * solely within borrowed blocks, then our LEOF and block - * count should both be zero. 
As a result, set it to the - * min of the current cf_size and the start of the first - * invalid range, because it may have already been reduced - * to zero by the borrowed blocks check above. - */ - if (!TAILQ_EMPTY(&cp->c_datafork->ff_invalidranges)) { - numbytes = TAILQ_FIRST(&cp->c_datafork->ff_invalidranges)->rl_start; - datafork.cf_size = MIN((numbytes), (datafork.cf_size)); - } - } - + /* + * Modify the values passed to cat_update based on whether or not + * the file has invalid ranges or borrowed blocks. + */ + if (dataforkp) { + off_t numbytes = 0; + + /* copy the datafork into a temporary copy so we don't pollute the cnode's */ + bcopy(dataforkp, &datafork, sizeof(datafork)); + dataforkp = &datafork; + + /* + * If there are borrowed blocks, ensure that they are subtracted + * from the total block count before writing the cnode entry to disk. + * Only extents that have actually been marked allocated in the bitmap + * should be reflected in the total block count for this fork. + */ + if (cp->c_datafork->ff_unallocblocks != 0) { + // make sure that we don't assign a negative block count + if (cp->c_datafork->ff_blocks < cp->c_datafork->ff_unallocblocks) { + panic("hfs: ff_blocks %d is less than unalloc blocks %d\n", + cp->c_datafork->ff_blocks, cp->c_datafork->ff_unallocblocks); + } + + /* Also cap the LEOF to the total number of bytes that are allocated. */ + datafork.cf_blocks = (cp->c_datafork->ff_blocks - cp->c_datafork->ff_unallocblocks); + datafork.cf_size = datafork.cf_blocks * HFSTOVCB(hfsmp)->blockSize; + } + + /* + * For files with invalid ranges (holes) the on-disk + * field representing the size of the file (cf_size) + * must be no larger than the start of the first hole. + * However, note that if the first invalid range exists + * solely within borrowed blocks, then our LEOF and block + * count should both be zero. As a result, set it to the + * min of the current cf_size and the start of the first + * invalid range, because it may have already been reduced + * to zero by the borrowed blocks check above. 
+ */ + if (!TAILQ_EMPTY(&cp->c_datafork->ff_invalidranges)) { + numbytes = TAILQ_FIRST(&cp->c_datafork->ff_invalidranges)->rl_start; + datafork.cf_size = MIN((numbytes), (datafork.cf_size)); + } + } + /* * For resource forks with delayed allocations, make sure * the block count and file size match the number of blocks @@ -4890,8 +4990,18 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, enum vtype vnodetype; int mode; int newvnode_flags = 0; - int nocache = 0; u_int32_t gnv_flags = 0; + int protectable_target = 0; + +#if CONFIG_PROTECT + struct cprotect *entry = NULL; + uint32_t cp_class = 0; + if (VATTR_IS_ACTIVE(vap, va_dataprotect_class)) { + cp_class = vap->va_dataprotect_class; + } + int protected_mount = 0; +#endif + if ((error = hfs_lock(VTOC(dvp), HFS_EXCLUSIVE_LOCK))) return (error); @@ -4906,8 +5016,9 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, } dcp->c_flag |= C_DIR_MODIFICATION; - + hfsmp = VTOHFS(dvp); + *vpp = NULL; tvp = NULL; out_desc.cd_flags = 0; @@ -4918,13 +5029,11 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, vnodetype = VREG; mode = MAKEIMODE(vnodetype, vap->va_mode); -#if CONFIG_PROTECT - /* If we're creating a regular file on a CP filesystem, then delay caching */ - if ((vnodetype == VREG ) && (cp_fs_protected (VTOVFS(dvp)))) { - nocache = 1; + if (S_ISDIR (mode) || S_ISREG (mode)) { + protectable_target = 1; } -#endif + /* Check if were out of usable disk space. */ if ((hfs_freeblks(hfsmp, 1) == 0) && (vfs_context_suser(ctx) != 0)) { error = ENOSPC; @@ -4955,7 +5064,7 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, VATTR_SET_SUPPORTED(vap, va_flags); attr.ca_flags = vap->va_flags; } - + /* * HFS+ only: all files get ThreadExists * HFSX only: dirs get HasFolderCount @@ -4969,7 +5078,29 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, } } - /* Add the date added to the item */ +#if CONFIG_PROTECT + if (cp_fs_protected(hfsmp->hfs_mp)) { + protected_mount = 1; + } + /* + * On a content-protected HFS+/HFSX filesystem, files and directories + * cannot be created without atomically setting/creating the EA that + * contains the protection class metadata and keys at the same time, in + * the same transaction. As a result, pre-set the "EAs exist" flag + * on the cat_attr for protectable catalog record creations. This will + * cause the cnode creation routine in hfs_getnewvnode to mark the cnode + * as having EAs. + */ + if ((protected_mount) && (protectable_target)) { + attr.ca_recflags |= kHFSHasAttributesMask; + } +#endif + + + /* + * Add the date added to the item. See above, as + * all of the dates are set to the itime. + */ hfs_write_dateadded (&attr, attr.ca_atime); attr.ca_uid = vap->va_uid; @@ -5010,6 +5141,22 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, in_desc.cd_hint = dcp->c_childhint; in_desc.cd_encoding = 0; +#if CONFIG_PROTECT + /* + * To preserve file creation atomicity with regards to the content protection EA, + * we must create the file in the catalog and then write out the EA in the same + * transaction. Pre-flight any operations that we can (such as allocating/preparing + * the buffer, wrapping the keys) before we start the txn and take the requisite + * b-tree locks. We pass '0' as the fileid because we do not know it yet. 
+ */ + if ((protected_mount) && (protectable_target)) { + error = cp_entry_create_keys (&entry, dcp, hfsmp, cp_class, 0, attr.ca_mode); + if (error) { + goto exit; + } + } +#endif + if ((error = hfs_start_transaction(hfsmp)) != 0) { goto exit; } @@ -5037,6 +5184,40 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, dcp->c_ctime = tv.tv_sec; dcp->c_mtime = tv.tv_sec; (void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL); + +#if CONFIG_PROTECT + /* + * If we are creating a content protected file, now is when + * we create the EA. We must create it in the same transaction + * that creates the file. We can also guarantee that the file + * MUST exist because we are still holding the catalog lock + * at this point. + */ + if ((attr.ca_fileid != 0) && (protected_mount) && (protectable_target)) { + error = cp_setxattr (NULL, entry, hfsmp, attr.ca_fileid, XATTR_CREATE); + + if (error) { + int delete_err; + /* + * If we fail the EA creation, then we need to delete the file. + * Luckily, we are still holding all of the right locks. + */ + delete_err = cat_delete (hfsmp, &out_desc, &attr); + if (delete_err == 0) { + /* Update the parent directory */ + if (dcp->c_entries > 0) + dcp->c_entries--; + dcp->c_dirchangecnt++; + dcp->c_ctime = tv.tv_sec; + dcp->c_mtime = tv.tv_sec; + (void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL); + } + + /* Emit EINVAL if we fail to create EA*/ + error = EINVAL; + } + } +#endif } hfs_systemfile_unlock(hfsmp, lockflags); if (error) @@ -5068,15 +5249,26 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, started_tr = 0; } +#if CONFIG_PROTECT + /* + * At this point, we must have encountered success with writing the EA. + * Update MKB with the data for the cached key, then destroy it. This may + * prevent information leakage by ensuring the cache key is only unwrapped + * to perform file I/O and it is allowed. + */ + + if ((attr.ca_fileid != 0) && (protected_mount) && (protectable_target)) { + cp_update_mkb (entry, attr.ca_fileid); + cp_entry_destroy (&entry); + } +#endif + /* Do not create vnode for whiteouts */ if (S_ISWHT(mode)) { goto exit; } gnv_flags |= GNV_CREATE; - if (nocache) { - gnv_flags |= GNV_NOCACHE; - } /* * Create a vnode for the object just created. @@ -5102,49 +5294,6 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, cp = VTOC(tvp); *vpp = tvp; -#if CONFIG_PROTECT - error = cp_entry_create_keys(cp); - /* - * If we fail to create keys, then do NOT allow this vnode to percolate out into the - * namespace. Delete it and return the errno that cp_entry_create_keys generated. - * Luckily, we can do this without issues because the entry was newly created - * and we're still holding the directory cnode lock. Because we prevented it from - * getting inserted into the namecache upon vnode creation, all accesss to this file - * would have to go through the directory, whose lock we are still holding. - */ - if (error) { - /* - * If we fail to remove/recycle the item here, we can't do much about it. Log - * a message to the console and then we can backtrack it. The ultimate error - * that will get emitted to userland will be from the failure to create the EA blob. 
- */ - int err = hfs_removefile (dvp, tvp, cnp, 0, 0, 0, NULL, 0); - if (err) { - printf("hfs_makenode: removefile failed (%d) for CP file %p\n", err, tvp); - } - hfs_unlock (cp); - err = vnode_recycle (tvp); - if (err) { - printf("hfs_makenode: vnode_recycle failed (%d) for CP file %p\n", err, tvp); - } - /* Drop the iocount on the new vnode to force reclamation/recycling */ - vnode_put (tvp); - cp = NULL; - *vpp = NULL; - } - else { - /* insert item into name cache if it wasn't already inserted.*/ - if (nocache) { - cache_enter (dvp, tvp, cnp); - } - } - -#endif -/* - * If CONFIG_PROTECT is not enabled, then all items will get automatically added into - * the namecache, as nocache will be set to 0. - */ - #if QUOTA /* * Once we create this vnode, we need to initialize its quota data @@ -5160,6 +5309,18 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, exit: cat_releasedesc(&out_desc); +#if CONFIG_PROTECT + /* + * We may have jumped here in error-handling various situations above. + * If we haven't already dumped the temporary CP used to initialize + * the file atomically, then free it now. cp_entry_destroy should null + * out the pointer if it was called already. + */ + if (entry) { + cp_entry_destroy (&entry); + } +#endif + /* * Make sure we release cnode lock on dcp. */ @@ -5554,7 +5715,7 @@ hfs_vnop_fsync(ap) } #if CONFIG_PROTECT - if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) { + if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) { return (error); } #endif /* CONFIG_PROTECT */ @@ -5685,7 +5846,11 @@ struct vnodeopv_entry_desc hfs_standard_vnodeop_entries[] = { { &vnop_pathconf_desc, (VOPFUNC)hfs_vnop_pathconf }, /* pathconf */ { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ { &vnop_allocate_desc, (VOPFUNC)hfs_readonly_op }, /* allocate (READONLY) */ +#if CONFIG_SEARCHFS { &vnop_searchfs_desc, (VOPFUNC)hfs_vnop_search }, /* search fs */ +#else + { &vnop_searchfs_desc, (VOPFUNC)err_searchfs }, /* search fs */ +#endif { &vnop_bwrite_desc, (VOPFUNC)hfs_readonly_op }, /* bwrite (READONLY) */ { &vnop_pagein_desc, (VOPFUNC)hfs_vnop_pagein }, /* pagein */ { &vnop_pageout_desc,(VOPFUNC) hfs_readonly_op }, /* pageout (READONLY) */ @@ -5743,7 +5908,11 @@ struct vnodeopv_entry_desc hfs_vnodeop_entries[] = { { &vnop_pathconf_desc, (VOPFUNC)hfs_vnop_pathconf }, /* pathconf */ { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ { &vnop_allocate_desc, (VOPFUNC)hfs_vnop_allocate }, /* allocate */ +#if CONFIG_SEARCHFS { &vnop_searchfs_desc, (VOPFUNC)hfs_vnop_search }, /* search fs */ +#else + { &vnop_searchfs_desc, (VOPFUNC)err_searchfs }, /* search fs */ +#endif { &vnop_bwrite_desc, (VOPFUNC)hfs_vnop_bwrite }, /* bwrite */ { &vnop_pagein_desc, (VOPFUNC)hfs_vnop_pagein }, /* pagein */ { &vnop_pageout_desc,(VOPFUNC) hfs_vnop_pageout }, /* pageout */ diff --git a/bsd/hfs/hfs_xattr.c b/bsd/hfs/hfs_xattr.c index 8091dfaa2..e7a91addd 100644 --- a/bsd/hfs/hfs_xattr.c +++ b/bsd/hfs/hfs_xattr.c @@ -232,6 +232,7 @@ out: } #endif + /* Zero out the date added field for the specified cnode */ static int hfs_zero_dateadded (struct cnode *cp, u_int8_t *finderinfo) { u_int8_t *finfo = finderinfo; @@ -255,7 +256,6 @@ static int hfs_zero_dateadded (struct cnode *cp, u_int8_t *finderinfo) { } - /* * Retrieve the data of an extended attribute. 
*/ @@ -785,9 +785,9 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) */ fdFlags = *((u_int16_t *) &cp->c_finderinfo[8]); if (fdFlags & OSSwapHostToBigConstInt16(kFinderInvisibleMask)) - cp->c_flags |= UF_HIDDEN; + cp->c_bsdflags |= UF_HIDDEN; else - cp->c_flags &= ~UF_HIDDEN; + cp->c_bsdflags &= ~UF_HIDDEN; result = hfs_update(vp, FALSE); @@ -953,12 +953,22 @@ int hfs_setxattr_internal (struct cnode *cp, caddr_t data_ptr, size_t attrsize, int exists = 0; int allocatedblks = 0; u_int32_t target_id; + int takelock = 1; if (cp) { target_id = cp->c_fileid; } else { target_id = fileid; + if (target_id != 1) { + /* + * If we are manipulating something other than + * the root folder (id 1), and do not have a cnode-in-hand, + * then we must already hold the requisite b-tree locks from + * earlier up the call stack. (See hfs_makenode) + */ + takelock = 0; + } } /* Start a transaction for our changes. */ @@ -990,10 +1000,12 @@ int hfs_setxattr_internal (struct cnode *cp, caddr_t data_ptr, size_t attrsize, if (hfsmp->hfs_max_inline_attrsize == 0) { hfsmp->hfs_max_inline_attrsize = getmaxinlineattrsize(hfsmp->hfs_attribute_vp); } - - /* Take exclusive access to the attributes b-tree. */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - + + if (takelock) { + /* Take exclusive access to the attributes b-tree. */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + } + /* Build the b-tree key. */ MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); if (iterator == NULL) { @@ -1349,7 +1361,7 @@ hfs_vnop_removexattr(struct vnop_removexattr_args *ap) /* Do the byte compare against the local copy */ if (bcmp(finderinfo, emptyfinfo, sizeof(emptyfinfo)) == 0) { - hfs_unlock (cp); + hfs_unlock(cp); return (ENOATTR); } @@ -1640,6 +1652,7 @@ hfs_vnop_listxattr(struct vnop_listxattr_args *ap) int result; u_int8_t finderinfo[32]; + if (VNODE_IS_RSRC(vp)) { return (EPERM); } @@ -1671,9 +1684,9 @@ hfs_vnop_listxattr(struct vnop_listxattr_args *ap) fip->fdType = 0; fip->fdCreator = 0; } + - - /* If Finder Info is non-empty then export it's name. */ + /* If Finder Info is non-empty then export it's name. 
*/ if (bcmp(finderinfo, emptyfinfo, sizeof(emptyfinfo)) != 0) { if (uio == NULL) { *ap->a_size += sizeof(XATTR_FINDERINFO_NAME); diff --git a/bsd/hfs/hfscommon/Catalog/CatalogUtilities.c b/bsd/hfs/hfscommon/Catalog/CatalogUtilities.c index 219cd538f..114104c6f 100644 --- a/bsd/hfs/hfscommon/Catalog/CatalogUtilities.c +++ b/bsd/hfs/hfscommon/Catalog/CatalogUtilities.c @@ -28,6 +28,9 @@ #include #include #include +#include +#include +#include #include "../headers/FileMgrInternal.h" #include "../headers/BTreesInternal.h" @@ -52,10 +55,15 @@ LocateCatalogNodeByKey(const ExtendedVCB *volume, u_int32_t hint, CatalogKey *ke HFSCatalogNodeID threadParentID; u_int16_t tempSize; FSBufferDescriptor btRecord; - BTreeIterator searchIterator; + struct BTreeIterator *searchIterator; FCB *fcb; - bzero(&searchIterator, sizeof(searchIterator)); + MALLOC (searchIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); + if (searchIterator == NULL) { + return memFullErr; // translates to ENOMEM + } + + bzero(searchIterator, sizeof(*searchIterator)); fcb = GetFileControlBlock(volume->catalogRefNum); @@ -63,22 +71,27 @@ LocateCatalogNodeByKey(const ExtendedVCB *volume, u_int32_t hint, CatalogKey *ke btRecord.itemCount = 1; btRecord.itemSize = sizeof(CatalogRecord); - searchIterator.hint.nodeNum = hint; + searchIterator->hint.nodeNum = hint; - bcopy(keyPtr, &searchIterator.key, sizeof(CatalogKey)); + bcopy(keyPtr, &searchIterator->key, sizeof(CatalogKey)); - result = BTSearchRecord( fcb, &searchIterator, &btRecord, &tempSize, &searchIterator ); + result = BTSearchRecord( fcb, searchIterator, &btRecord, &tempSize, searchIterator ); if (result == noErr) { - *newHint = searchIterator.hint.nodeNum; + *newHint = searchIterator->hint.nodeNum; - BlockMoveData(&searchIterator.key, keyPtr, sizeof(CatalogKey)); + BlockMoveData(&searchIterator->key, keyPtr, sizeof(CatalogKey)); } - if (result == btNotFound) - result = cmNotFound; - ReturnIfError(result); + if (result == btNotFound) { + result = cmNotFound; + } + + if (result) { + FREE(searchIterator, M_TEMP); + return result; + } // if we got a thread record, then go look up real record switch ( dataPtr->recordType ) @@ -103,6 +116,7 @@ LocateCatalogNodeByKey(const ExtendedVCB *volume, u_int32_t hint, CatalogKey *ke if ( threadParentID ) // found a thread result = LocateCatalogRecord(volume, threadParentID, nodeName, kNoHint, keyPtr, dataPtr, newHint); + FREE (searchIterator, M_TEMP); return result; } @@ -122,11 +136,17 @@ LocateCatalogRecord(const ExtendedVCB *volume, HFSCatalogNodeID folderID, const OSErr result; uint16_t tempSize; FSBufferDescriptor btRecord; - BTreeIterator searchIterator; + struct BTreeIterator *searchIterator = NULL; FCB *fcb; BTreeControlBlock *btcb; - bzero(&searchIterator, sizeof(searchIterator)); + MALLOC (searchIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); + if (searchIterator == NULL) { + return memFullErr; // translates to ENOMEM + } + + bzero(searchIterator, sizeof(*searchIterator)); + fcb = GetFileControlBlock(volume->catalogRefNum); btcb = (BTreeControlBlock *)fcb->fcbBTCBPtr; @@ -135,14 +155,15 @@ LocateCatalogRecord(const ExtendedVCB *volume, HFSCatalogNodeID folderID, const btRecord.itemCount = 1; btRecord.itemSize = sizeof(CatalogRecord); - BuildCatalogKey(folderID, name, (volume->vcbSigWord == kHFSPlusSigWord), (CatalogKey *)&searchIterator.key); + BuildCatalogKey(folderID, name, (volume->vcbSigWord == kHFSPlusSigWord), (CatalogKey *)&searchIterator->key); - result = 
BTSearchRecord(fcb, &searchIterator, &btRecord, &tempSize, &searchIterator); + result = BTSearchRecord(fcb, searchIterator, &btRecord, &tempSize, searchIterator); if (result == noErr) { - *newHint = searchIterator.hint.nodeNum; - BlockMoveData(&searchIterator.key, keyPtr, CalcKeySize(btcb, &searchIterator.key)); + *newHint = searchIterator->hint.nodeNum; + BlockMoveData(&searchIterator->key, keyPtr, CalcKeySize(btcb, &searchIterator->key)); } + FREE (searchIterator, M_TEMP); return (result == btNotFound ? cmNotFound : result); } diff --git a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c b/bsd/hfs/hfscommon/Catalog/FileIDsServices.c index fee50fe6d..29242d367 100644 --- a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c +++ b/bsd/hfs/hfscommon/Catalog/FileIDsServices.c @@ -32,6 +32,9 @@ #include "../headers/FileMgrInternal.h" #include "../headers/HFSUnicodeWrappers.h" #include "../headers/CatalogPrivate.h" +#include +#include +#include struct ExtentsRecBuffer { @@ -112,13 +115,13 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param int16_t numDestExtentBlocks; OSErr err; Boolean isHFSPlus = ( vcb->vcbSigWord == kHFSPlusSigWord ); - + err = BuildCatalogKeyUTF8(vcb, srcID, srcName, kUndefinedStrLen, &srcKey, NULL); ReturnIfError(err); - + err = BuildCatalogKeyUTF8(vcb, destID, destName, kUndefinedStrLen, &destKey, NULL); ReturnIfError(err); - + if ( isHFSPlus ) { //-- Step 1: Check the catalog nodes for extents @@ -126,27 +129,27 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param //-- locate the source file, test for extents in extent file, and copy the cat record for later err = LocateCatalogNodeByKey( vcb, srcHint, &srcKey, &srcData, &srcHint ); ReturnIfError( err ); - + if ( srcData.recordType != kHFSPlusFileRecord ) return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - + //-- Check if there are any extents in the source file //€€ I am only checling the extents in the low 32 bits, routine will fail if files extents after 2 gig are in overflow numSrcExtentBlocks = CheckExtents( srcData.hfsPlusFile.dataFork.extents, srcData.hfsPlusFile.dataFork.totalBlocks, isHFSPlus ); if ( numSrcExtentBlocks == 0 ) // then check the resource fork extents numSrcExtentBlocks = CheckExtents( srcData.hfsPlusFile.resourceFork.extents, srcData.hfsPlusFile.resourceFork.totalBlocks, isHFSPlus ); - + //-- Check if there are any extents in the destination file err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); ReturnIfError( err ); - + if ( destData.recordType != kHFSPlusFileRecord ) return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - + numDestExtentBlocks = CheckExtents( destData.hfsPlusFile.dataFork.extents, destData.hfsPlusFile.dataFork.totalBlocks, isHFSPlus ); if ( numDestExtentBlocks == 0 ) // then check the resource fork extents numDestExtentBlocks = CheckExtents( destData.hfsPlusFile.resourceFork.extents, destData.hfsPlusFile.resourceFork.totalBlocks, isHFSPlus ); - + //-- Step 2: Exchange the Extent key in the extent file //-- Exchange the extents key in the extent file @@ -156,7 +159,7 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param if ( numSrcExtentBlocks && numDestExtentBlocks ) // if both files have extents { //-- Change the source extents file ids to our known bogus value - err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); + err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, kHFSBogusExtentFileID, 0,0, 
isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) @@ -171,13 +174,13 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param { if ( err != dskFulErr ) return( err ); - - ExUndo2aPlus: err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); + +ExUndo2aPlus: err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - + err = MoveExtents( vcb, kHFSBogusExtentFileID, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); // Move the extents back ReturnIfError( err ); // we are doomed. Just QUIT! - + goto ExUndo1a; } @@ -187,13 +190,13 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param { if ( err != dskFulErr ) return( err ); - + err = DeleteExtents( vcb, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - + err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); // Move the extents back ReturnIfError( err ); // we are doomed. Just QUIT! - + goto ExUndo2aPlus; } @@ -205,10 +208,10 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param { if ( err != dskFulErr ) return( err ); - + err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - + goto FlushAndReturn; } } @@ -219,14 +222,14 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param { if ( err != dskFulErr ) return( err ); - + err = DeleteExtents( vcb, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - + goto FlushAndReturn; } } - + //-- Step 3: Change the data in the catalog nodes //-- find the source cnode and put dest info in it @@ -239,12 +242,12 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param err = ReplaceBTreeRecord( vcb->catalogRefNum, &srcKey, srcHint, &srcData, sizeof(HFSPlusCatalogFile), &srcHint ); ReturnIfError( err ); - + // find the destination cnode and put source info in it err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); if ( err != noErr ) return( cmBadNews ); - + CopyBigCatalogNodeInfo( &swapData, &destData ); err = ReplaceBTreeRecord( vcb->catalogRefNum, &destKey, destHint, &destData, sizeof(HFSPlusCatalogFile), &destHint ); ReturnIfError( err ); @@ -256,10 +259,10 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param //-- locate the source file, test for extents in extent file, and copy the cat record for later err = LocateCatalogNodeByKey( vcb, srcHint, &srcKey, &srcData, &srcHint ); ReturnIfError( err ); - + if ( srcData.recordType != kHFSFileRecord ) return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - + //-- Check if there are any extents in the source file numSrcExtentBlocks = CheckExtents( srcData.hfsFile.dataExtents, srcData.hfsFile.dataPhysicalSize / vcb->blockSize, isHFSPlus ); if ( numSrcExtentBlocks == 0 ) // then check the resource fork extents @@ -268,21 +271,21 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param //€€ Do we save the found source node for later use? 
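/*
 * Annotation (not part of the patch): both the HFS+ and plain-HFS
 * branches of ExchangeFileIDs() swap overflow-extent records by
 * re-keying them through the reserved kHFSBogusExtentFileID scratch ID.
 * The forward path, condensed into a hypothetical helper (the real code
 * inlines these calls and unwinds failures via the ExUndo labels):
 */
static OSErr
swap_extent_keys(ExtendedVCB *vcb, u_int32_t srcID, u_int32_t dstID,
		 Boolean isHFSPlus)
{
	OSErr err;

	/* 1) park the source's overflow extents under the bogus ID */
	err = MoveExtents(vcb, srcID, kHFSBogusExtentFileID, 0, 0, isHFSPlus);
	if (err != noErr)
		return err;

	/* 2) re-key the destination's extents to the source's file ID */
	err = MoveExtents(vcb, dstID, srcID, 0, 0, isHFSPlus);
	if (err != noErr)
		return err;	/* real code rolls back via ExUndo2a/ExUndo1a */

	/* 3) re-key the parked extents to the destination's file ID */
	return MoveExtents(vcb, kHFSBogusExtentFileID, dstID, 0, 0, isHFSPlus);
}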
- + //-- Check if there are any extents in the destination file err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); ReturnIfError( err ); - + if ( destData.recordType != kHFSFileRecord ) return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - + numDestExtentBlocks = CheckExtents( destData.hfsFile.dataExtents, destData.hfsFile.dataPhysicalSize / vcb->blockSize, isHFSPlus ); if ( numDestExtentBlocks == 0 ) // then check the resource fork extents numDestExtentBlocks = CheckExtents( destData.hfsFile.rsrcExtents, destData.hfsFile.rsrcPhysicalSize / vcb->blockSize, isHFSPlus ); - + //€€ Do we save the found destination node for later use? - - + + //-- Step 2: Exchange the Extent key in the extent file //-- Exchange the extents key in the extent file @@ -292,15 +295,15 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param if ( numSrcExtentBlocks && numDestExtentBlocks ) // if both files have extents { //-- Change the source extents file ids to our known bogus value - err = MoveExtents( vcb, srcData.hfsFile.fileID, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); + err = MoveExtents( vcb, srcData.hfsFile.fileID, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - - ExUndo1a: err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); + +ExUndo1a: err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - + err = FlushCatalog( vcb ); // flush the catalog err = FlushExtentFile( vcb ); // flush the extent file (unneeded for common case, but it's cheap) return( dskFulErr ); @@ -312,13 +315,13 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param { if ( err != dskFulErr ) return( err ); - - ExUndo2a: err = DeleteExtents( vcb, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); + +ExUndo2a: err = DeleteExtents( vcb, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - + err = MoveExtents( vcb, kHFSBogusExtentFileID, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); // Move the extents back ReturnIfError( err ); // we are doomed. Just QUIT! - + goto ExUndo1a; } @@ -328,13 +331,13 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param { if ( err != dskFulErr ) return( err ); - + err = DeleteExtents( vcb, destData.hfsFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - + err = MoveExtents( vcb, srcData.hfsFile.fileID, destData.hfsFile.fileID, 0, 0, isHFSPlus ); // Move the extents back ReturnIfError( err ); // we are doomed. Just QUIT! - + goto ExUndo2a; } @@ -346,10 +349,10 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param { if ( err != dskFulErr ) return( err ); - + err = DeleteExtents( vcb, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - + goto FlushAndReturn; } } @@ -360,14 +363,14 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param { if ( err != dskFulErr ) return( err ); - + err = DeleteExtents( vcb, destData.hfsFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! 
- + goto FlushAndReturn; } } - + //-- Step 3: Change the data in the catalog nodes //-- find the source cnode and put dest info in it @@ -381,23 +384,23 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param err = ReplaceBTreeRecord( vcb->catalogRefNum, &srcKey, srcHint, &srcData, sizeof(HFSCatalogFile), &srcHint ); ReturnIfError( err ); - + // find the destination cnode and put source info in it err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); if ( err != noErr ) return( cmBadNews ); - + CopyCatalogNodeInfo( &swapData, &destData ); err = ReplaceBTreeRecord( vcb->catalogRefNum, &destKey, destHint, &destData, sizeof(HFSCatalogFile), &destHint ); ReturnIfError( err ); } err = noErr; - + //-- Step 4: Error Handling section - - + + FlushAndReturn: err = FlushCatalog( vcb ); // flush the catalog err = FlushExtentFile( vcb ); // flush the extent file (unneeded for common case, but it's cheap) @@ -430,23 +433,39 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest ExtentsRecBuffer extentsBuffer[kNumExtentsToCache]; ExtentKey * extentKeyPtr; ExtentRecord extentData; - BTreeIterator btIterator; + struct BTreeIterator *btIterator = NULL; + struct BTreeIterator *tmpIterator = NULL; FSBufferDescriptor btRecord; u_int16_t btKeySize; u_int16_t btRecordSize; int16_t i, j; OSErr err; - + MALLOC (btIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); + if (btIterator == NULL) { + return memFullErr; // translates to ENOMEM + } + + + MALLOC (tmpIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); + if (tmpIterator == NULL) { + FREE (btIterator, M_TEMP); + return memFullErr; // translates to ENOMEM + } + + bzero(btIterator, sizeof(*btIterator)); + bzero (tmpIterator, sizeof(*tmpIterator)); + + fcb = GetFileControlBlock(vcb->extentsRefNum); - (void) BTInvalidateHint(&btIterator); - extentKeyPtr = (ExtentKey*) &btIterator.key; + (void) BTInvalidateHint(btIterator); + extentKeyPtr = (ExtentKey*) &btIterator->key; btRecord.bufferAddress = &extentData; btRecord.itemCount = 1; - + //-- Collect the extent records - + // // A search on the following key will cause the BTree to be positioned immediately // before the first extent record for file #srcFileID, but not actually positioned @@ -459,7 +478,7 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest if (isHFSPlus) { btRecord.itemSize = sizeof(HFSPlusExtentRecord); btKeySize = sizeof(HFSPlusExtentKey); - + extentKeyPtr->hfsPlus.keyLength = kHFSPlusExtentKeyMaximumLength; extentKeyPtr->hfsPlus.forkType = forkType; extentKeyPtr->hfsPlus.pad = 0; @@ -469,7 +488,7 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest else { btRecord.itemSize = sizeof(HFSExtentRecord); btKeySize = sizeof(HFSExtentKey); - + extentKeyPtr->hfs.keyLength = kHFSExtentKeyMaximumLength; extentKeyPtr->hfs.forkType = 0; extentKeyPtr->hfs.fileID = srcFileID; @@ -491,8 +510,8 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest // of BTIterateRecord. We'd need to set up the key for BTSearchRecord to find the last record // we found, so that BTIterateRecord would get the next one (the first we haven't processed). // - - err = BTSearchRecord(fcb, &btIterator, &btRecord, &btRecordSize, &btIterator); + + err = BTSearchRecord(fcb, btIterator, &btRecord, &btRecordSize, btIterator); // We expect a btNotFound here, since there shouldn't be an extent record with FABN = 0. 
if (err != btNotFound) @@ -503,24 +522,28 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest if (err == noErr) // If we found such a bogus extent record, then the tree is really messed up err = cmBadNews; // so return an error that conveys the disk is hosed. + FREE (tmpIterator, M_TEMP); + FREE (btIterator, M_TEMP); return err; } - + do { btRecord.bufferAddress = &extentData; btRecord.itemCount = 1; - + for ( i=0 ; ihfsPlus.fileID : extentKeyPtr->hfs.fileID; if ( foundFileID == srcFileID ) { /* Check if we need to quit early. */ @@ -537,39 +560,45 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest } } + + //-- edit each extent key, and reinsert each extent record in the extent file if (isHFSPlus) btRecordSize = sizeof(HFSPlusExtentRecord); else btRecordSize = sizeof(HFSExtentRecord); - - for ( j=0 ; jkey, btKeySize); btRecord.bufferAddress = &(extentsBuffer[j].extentData); - - err = BTInsertRecord(fcb, &tmpIterator, &btRecord, btRecordSize); - if ( err != noErr ) - { // parse the error + + err = BTInsertRecord(fcb, tmpIterator, &btRecord, btRecordSize); + if ( err != noErr ) { + /* Parse the error and free iterators */ + FREE (btIterator, M_TEMP); + FREE (tmpIterator, M_TEMP); if ( err == btExists ) { - if ( DEBUG_BUILD ) - DebugStr("Can't insert record -- already exists"); + if ( DEBUG_BUILD ) { + DebugStr("Can't insert record -- already exists"); + } return( cmBadNews ); } - else + else { return( err ); + } } } - + //-- okay, done with this buffered batch, go get the next set of extent records // If our buffer is not full, we must be done, or received an error @@ -582,6 +611,9 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest } } while ( true ); + FREE (tmpIterator, M_TEMP); + FREE (btIterator, M_TEMP); + return( err ); } @@ -593,33 +625,47 @@ static void CopyExtentInfo( ExtentKey *key, ExtentRecord *data, ExtentsRecBuffe } - - //-- Delete all extents in extent file that have the ID given. static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileID, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ) { FCB * fcb; ExtentKey * extentKeyPtr; ExtentRecord extentData; - BTreeIterator btIterator; + struct BTreeIterator *btIterator = NULL; + struct BTreeIterator *tmpIterator = NULL; FSBufferDescriptor btRecord; u_int16_t btRecordSize; OSErr err; - + + + MALLOC (btIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); + if (btIterator == NULL) { + return memFullErr; // translates to ENOMEM + } + + MALLOC (tmpIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); + if (tmpIterator == NULL) { + FREE (btIterator, M_TEMP); + return memFullErr; // translates to ENOMEM + } + + bzero(btIterator, sizeof(*btIterator)); + bzero (tmpIterator, sizeof(*tmpIterator)); + fcb = GetFileControlBlock(vcb->extentsRefNum); - - (void) BTInvalidateHint(&btIterator); - extentKeyPtr = (ExtentKey*) &btIterator.key; + + (void) BTInvalidateHint(btIterator); + extentKeyPtr = (ExtentKey*) &btIterator->key; btRecord.bufferAddress = &extentData; btRecord.itemCount = 1; - + // The algorithm is to position the BTree just before any extent records for fileID. // Then just keep getting successive records. If the record is still for fileID, // then delete it.
if (isHFSPlus) { btRecord.itemSize = sizeof(HFSPlusExtentRecord); - + extentKeyPtr->hfsPlus.keyLength = kHFSPlusExtentKeyMaximumLength; extentKeyPtr->hfsPlus.forkType = forkType; extentKeyPtr->hfsPlus.pad = 0; @@ -628,14 +674,14 @@ static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileID, int quitEarly, } else { btRecord.itemSize = sizeof(HFSExtentRecord); - + extentKeyPtr->hfs.keyLength = kHFSExtentKeyMaximumLength; extentKeyPtr->hfs.forkType = forkType; extentKeyPtr->hfs.fileID = fileID; extentKeyPtr->hfs.startBlock = 0; } - - err = BTSearchRecord(fcb, &btIterator, &btRecord, &btRecordSize, &btIterator); + + err = BTSearchRecord(fcb, btIterator, &btRecord, &btRecordSize, btIterator); if ( err != btNotFound ) { if (err == noErr) { // Did we find a bogus extent record? @@ -644,18 +690,17 @@ static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileID, int quitEarly, return err; // Got some unexpected error, so return it } - + do { - BTreeIterator tmpIterator; HFSCatalogNodeID foundFileID; - - err = BTIterateRecord(fcb, kBTreeNextRecord, &btIterator, &btRecord, &btRecordSize); + + err = BTIterateRecord(fcb, kBTreeNextRecord, btIterator, &btRecord, &btRecordSize); if ( err != noErr ) { if (err == btNotFound) // If we hit the end of the BTree err = noErr; // then it's OK - + break; // We're done now. } @@ -670,12 +715,15 @@ static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileID, int quitEarly, } } - tmpIterator = btIterator; - err = BTDeleteRecord( fcb, &tmpIterator ); + *tmpIterator = *btIterator; + err = BTDeleteRecord( fcb, tmpIterator ); if (err != noErr) break; } while ( true ); + FREE (tmpIterator, M_TEMP); + FREE (btIterator, M_TEMP); + return( err ); } diff --git a/bsd/hfs/hfscommon/Misc/BTreeWrapper.c b/bsd/hfs/hfscommon/Misc/BTreeWrapper.c index 5ce31fd3c..590fd9397 100644 --- a/bsd/hfs/hfscommon/Misc/BTreeWrapper.c +++ b/bsd/hfs/hfscommon/Misc/BTreeWrapper.c @@ -27,6 +27,9 @@ */ #include "../headers/BTreesPrivate.h" +#include +#include +#include // local routines @@ -37,11 +40,16 @@ static Boolean ValidHFSRecord(const void *record, const BTreeControlBlock *btcb, OSErr ReplaceBTreeRecord(FileReference refNum, const void* key, u_int32_t hint, void *newData, u_int16_t dataSize, u_int32_t *newHint) { FSBufferDescriptor btRecord; - BTreeIterator iterator; + struct BTreeIterator *iterator = NULL; FCB *fcb; BTreeControlBlock *btcb; OSStatus result; + MALLOC (iterator, struct BTreeIterator *, sizeof (struct BTreeIterator), M_TEMP, M_WAITOK); + if (iterator == NULL) { + return memFullErr; //translates to ENOMEM + } + bzero (iterator, sizeof (*iterator)); fcb = GetFileControlBlock(refNum); btcb = (BTreeControlBlock*) fcb->fcbBTCBPtr; @@ -50,24 +58,25 @@ OSErr ReplaceBTreeRecord(FileReference refNum, const void* key, u_int32_t hint, btRecord.itemSize = dataSize; btRecord.itemCount = 1; - iterator.hint.nodeNum = hint; + iterator->hint.nodeNum = hint; result = CheckBTreeKey((const BTreeKey *) key, btcb); - ExitOnError(result); + if (result) { + goto ErrorExit; + } - BlockMoveData(key, &iterator.key, CalcKeySize(btcb, (const BTreeKey *) key)); //€€ should we range check against maxkeylen? + BlockMoveData(key, &iterator->key, CalcKeySize(btcb, (const BTreeKey *) key)); //€€ should we range check against maxkeylen? 
if ( DEBUG_BUILD && !ValidHFSRecord(newData, btcb, dataSize) ) DebugStr("ReplaceBTreeRecord: bad record?"); - result = BTReplaceRecord( fcb, &iterator, &btRecord, dataSize ); - - *newHint = iterator.hint.nodeNum; + result = BTReplaceRecord( fcb, iterator, &btRecord, dataSize ); - //€€ do we need to invalidate the iterator? + *newHint = iterator->hint.nodeNum; ErrorExit: + FREE (iterator, M_TEMP); return result; } diff --git a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c b/bsd/hfs/hfscommon/Misc/FileExtentMapping.c index 998f97fa9..cae3db8e2 100644 --- a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c +++ b/bsd/hfs/hfscommon/Misc/FileExtentMapping.c @@ -225,7 +225,7 @@ static OSErr FindExtentRecord( u_int32_t *foundHint) { FCB * fcb; - BTreeIterator btIterator; + struct BTreeIterator *btIterator = NULL; FSBufferDescriptor btRecord; OSErr err; u_int16_t btRecordSize; @@ -234,14 +234,18 @@ static OSErr FindExtentRecord( if (foundHint) *foundHint = 0; fcb = GetFileControlBlock(vcb->extentsRefNum); - - bzero(&btIterator, sizeof(btIterator)); + + MALLOC (btIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); + if (btIterator == NULL) { + return memFullErr; // translates to ENOMEM + } + bzero(btIterator, sizeof(*btIterator)); if (vcb->vcbSigWord == kHFSSigWord) { HFSExtentKey * extentKeyPtr; HFSExtentRecord extentData; - extentKeyPtr = (HFSExtentKey*) &btIterator.key; + extentKeyPtr = (HFSExtentKey*) &btIterator->key; extentKeyPtr->keyLength = kHFSExtentKeyMaximumLength; extentKeyPtr->forkType = forkType; extentKeyPtr->fileID = fileID; @@ -251,10 +255,10 @@ static OSErr FindExtentRecord( btRecord.itemSize = sizeof(HFSExtentRecord); btRecord.itemCount = 1; - err = BTSearchRecord(fcb, &btIterator, &btRecord, &btRecordSize, &btIterator); + err = BTSearchRecord(fcb, btIterator, &btRecord, &btRecordSize, btIterator); if (err == btNotFound && allowPrevious) { - err = BTIterateRecord(fcb, kBTreePrevRecord, &btIterator, &btRecord, &btRecordSize); + err = BTIterateRecord(fcb, kBTreePrevRecord, btIterator, &btRecord, &btRecordSize); // A previous record may not exist, so just return btNotFound (like we would if // it was for the wrong file/fork). @@ -298,7 +302,7 @@ static OSErr FindExtentRecord( HFSPlusExtentKey * extentKeyPtr; HFSPlusExtentRecord extentData; - extentKeyPtr = (HFSPlusExtentKey*) &btIterator.key; + extentKeyPtr = (HFSPlusExtentKey*) &btIterator->key; extentKeyPtr->keyLength = kHFSPlusExtentKeyMaximumLength; extentKeyPtr->forkType = forkType; extentKeyPtr->pad = 0; @@ -309,10 +313,10 @@ static OSErr FindExtentRecord( btRecord.itemSize = sizeof(HFSPlusExtentRecord); btRecord.itemCount = 1; - err = BTSearchRecord(fcb, &btIterator, &btRecord, &btRecordSize, &btIterator); + err = BTSearchRecord(fcb, btIterator, &btRecord, &btRecordSize, btIterator); if (err == btNotFound && allowPrevious) { - err = BTIterateRecord(fcb, kBTreePrevRecord, &btIterator, &btRecord, &btRecordSize); + err = BTIterateRecord(fcb, kBTreePrevRecord, btIterator, &btRecord, &btRecordSize); // A previous record may not exist, so just return btNotFound (like we would if // it was for the wrong file/fork). 
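/*
 * [Editor's note] The hunks above and below repeat one transformation across
 * every B-tree helper in these files: a struct BTreeIterator (a few hundred
 * bytes) that used to live on the kernel stack is now heap-allocated with
 * MALLOC(..., M_TEMP, M_WAITOK), zeroed with bzero(), and released with
 * FREE(..., M_TEMP) on every exit path. The condensed sketch below shows the
 * shape of that pattern, not the literal patch code; it assumes the XNU
 * kernel environment (MALLOC/FREE from <sys/malloc.h>, memFullErr from the
 * HFS headers), and do_btree_work() is a hypothetical stand-in for the
 * search/iterate/delete body of each routine.
 */
static OSErr
with_heap_iterators(FCB *fcb)
{
	struct BTreeIterator *btIterator = NULL;
	struct BTreeIterator *tmpIterator = NULL;
	OSErr err;

	MALLOC(btIterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK);
	if (btIterator == NULL)
		return memFullErr;		/* translates to ENOMEM */

	MALLOC(tmpIterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK);
	if (tmpIterator == NULL) {
		FREE(btIterator, M_TEMP);	/* unwind the first allocation */
		return memFullErr;
	}

	/* The B-tree calls expect zeroed iterators (no stale hint/key bytes). */
	bzero(btIterator, sizeof(*btIterator));
	bzero(tmpIterator, sizeof(*tmpIterator));

	err = do_btree_work(fcb, btIterator, tmpIterator);	/* hypothetical helper */

	/* Both iterators must be freed on every path out of the routine. */
	FREE(tmpIterator, M_TEMP);
	FREE(btIterator, M_TEMP);
	return err;
}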
@@ -336,7 +340,9 @@ static OSErr FindExtentRecord( } if (foundHint) - *foundHint = btIterator.hint.nodeNum; + *foundHint = btIterator->hint.nodeNum; + + FREE(btIterator, M_TEMP); return err; } @@ -348,7 +354,7 @@ static OSErr CreateExtentRecord( HFSPlusExtentRecord extents, u_int32_t *hint) { - BTreeIterator btIterator; + struct BTreeIterator *btIterator = NULL; FSBufferDescriptor btRecord; u_int16_t btRecordSize; int lockflags; @@ -357,7 +363,11 @@ static OSErr CreateExtentRecord( err = noErr; *hint = 0; - bzero(&btIterator, sizeof(btIterator)); + MALLOC (btIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); + if (btIterator == NULL) { + return memFullErr; // translates to ENOMEM + } + bzero(btIterator, sizeof(*btIterator)); /* * The lock taken by callers of ExtendFileC is speculative and @@ -377,7 +387,7 @@ static OSErr CreateExtentRecord( btRecord.itemSize = btRecordSize; btRecord.itemCount = 1; - keyPtr = (HFSExtentKey*) &btIterator.key; + keyPtr = (HFSExtentKey*) &btIterator->key; keyPtr->keyLength = kHFSExtentKeyMaximumLength; keyPtr->forkType = key->forkType; keyPtr->fileID = key->fileID; @@ -391,19 +401,20 @@ static OSErr CreateExtentRecord( btRecord.itemSize = btRecordSize; btRecord.itemCount = 1; - BlockMoveData(key, &btIterator.key, sizeof(HFSPlusExtentKey)); + BlockMoveData(key, &btIterator->key, sizeof(HFSPlusExtentKey)); } if (err == noErr) - err = BTInsertRecord(GetFileControlBlock(vcb->extentsRefNum), &btIterator, &btRecord, btRecordSize); + err = BTInsertRecord(GetFileControlBlock(vcb->extentsRefNum), btIterator, &btRecord, btRecordSize); if (err == noErr) - *hint = btIterator.hint.nodeNum; + *hint = btIterator->hint.nodeNum; (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum)); hfs_systemfile_unlock(vcb, lockflags); - + + FREE (btIterator, M_TEMP); return err; } @@ -414,17 +425,21 @@ static OSErr DeleteExtentRecord( u_int32_t fileID, u_int32_t startBlock) { - BTreeIterator btIterator; + struct BTreeIterator *btIterator = NULL; OSErr err; err = noErr; - bzero(&btIterator, sizeof(btIterator)); + MALLOC (btIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); + if (btIterator == NULL) { + return memFullErr; // translates to ENOMEM + } + bzero(btIterator, sizeof(*btIterator)); if (vcb->vcbSigWord == kHFSSigWord) { HFSExtentKey * keyPtr; - keyPtr = (HFSExtentKey*) &btIterator.key; + keyPtr = (HFSExtentKey*) &btIterator->key; keyPtr->keyLength = kHFSExtentKeyMaximumLength; keyPtr->forkType = forkType; keyPtr->fileID = fileID; @@ -433,7 +448,7 @@ static OSErr DeleteExtentRecord( else { // HFS Plus volume HFSPlusExtentKey * keyPtr; - keyPtr = (HFSPlusExtentKey*) &btIterator.key; + keyPtr = (HFSPlusExtentKey*) &btIterator->key; keyPtr->keyLength = kHFSPlusExtentKeyMaximumLength; keyPtr->forkType = forkType; keyPtr->pad = 0; @@ -441,9 +456,11 @@ static OSErr DeleteExtentRecord( keyPtr->startBlock = startBlock; } - err = BTDeleteRecord(GetFileControlBlock(vcb->extentsRefNum), &btIterator); + err = BTDeleteRecord(GetFileControlBlock(vcb->extentsRefNum), btIterator); (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum)); + + FREE(btIterator, M_TEMP); return err; } @@ -497,7 +514,6 @@ OSErr MapFileBlockC ( // // Determine the end of the available space. It will either be the end of the extent, // or the file's PEOF, whichever is smaller. 
- // dataEnd = (off_t)((off_t)(nextFABN) * (off_t)(allocBlockSize)); // Assume valid data through end of this extent if (((off_t)fcb->ff_blocks * (off_t)allocBlockSize) < dataEnd) // Is PEOF shorter? @@ -536,10 +552,13 @@ OSErr MapFileBlockC ( if (tmpOff <= 0) { return EINVAL; } - if (tmpOff > (off_t)(numberOfBytes)) + + if (tmpOff > (off_t)(numberOfBytes)) { *availableBytes = numberOfBytes; // more there than they asked for, so pin the output - else + } + else { *availableBytes = tmpOff; + } } return noErr; @@ -1890,7 +1909,7 @@ static OSErr UpdateExtentRecord (ExtendedVCB *vcb, FCB *fcb, int deleted, } } else { - BTreeIterator btIterator; + struct BTreeIterator *btIterator = NULL; FSBufferDescriptor btRecord; u_int16_t btRecordSize; FCB * btFCB; @@ -1900,8 +1919,12 @@ static OSErr UpdateExtentRecord (ExtendedVCB *vcb, FCB *fcb, int deleted, // Need to find and change a record in Extents BTree // btFCB = GetFileControlBlock(vcb->extentsRefNum); - - bzero(&btIterator, sizeof(btIterator)); + + MALLOC (btIterator, struct BTreeIterator*, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK); + if (btIterator == NULL) { + return memFullErr; // translates to ENOMEM + } + bzero(btIterator, sizeof(*btIterator)); /* * The lock taken by callers of ExtendFileC/TruncateFileC is @@ -1916,49 +1939,51 @@ static OSErr UpdateExtentRecord (ExtendedVCB *vcb, FCB *fcb, int deleted, HFSExtentKey * key; // Actual extent key used on disk in HFS HFSExtentRecord foundData; // The extent data actually found - key = (HFSExtentKey*) &btIterator.key; + key = (HFSExtentKey*) &btIterator->key; key->keyLength = kHFSExtentKeyMaximumLength; key->forkType = extentFileKey->forkType; key->fileID = extentFileKey->fileID; key->startBlock = extentFileKey->startBlock; - btIterator.hint.index = 0; - btIterator.hint.nodeNum = extentBTreeHint; + btIterator->hint.index = 0; + btIterator->hint.nodeNum = extentBTreeHint; btRecord.bufferAddress = &foundData; btRecord.itemSize = sizeof(HFSExtentRecord); btRecord.itemCount = 1; - err = BTSearchRecord(btFCB, &btIterator, &btRecord, &btRecordSize, &btIterator); + err = BTSearchRecord(btFCB, btIterator, &btRecord, &btRecordSize, btIterator); if (err == noErr) err = HFSPlusToHFSExtents(extentData, (HFSExtentDescriptor *)&foundData); if (err == noErr) - err = BTReplaceRecord(btFCB, &btIterator, &btRecord, btRecordSize); + err = BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize); (void) BTFlushPath(btFCB); } else { // HFS Plus volume HFSPlusExtentRecord foundData; // The extent data actually found - BlockMoveData(extentFileKey, &btIterator.key, sizeof(HFSPlusExtentKey)); + BlockMoveData(extentFileKey, &btIterator->key, sizeof(HFSPlusExtentKey)); - btIterator.hint.index = 0; - btIterator.hint.nodeNum = extentBTreeHint; + btIterator->hint.index = 0; + btIterator->hint.nodeNum = extentBTreeHint; btRecord.bufferAddress = &foundData; btRecord.itemSize = sizeof(HFSPlusExtentRecord); btRecord.itemCount = 1; - err = BTSearchRecord(btFCB, &btIterator, &btRecord, &btRecordSize, &btIterator); + err = BTSearchRecord(btFCB, btIterator, &btRecord, &btRecordSize, btIterator); if (err == noErr) { BlockMoveData(extentData, &foundData, sizeof(HFSPlusExtentRecord)); - err = BTReplaceRecord(btFCB, &btIterator, &btRecord, btRecordSize); + err = BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize); } (void) BTFlushPath(btFCB); } hfs_systemfile_unlock(vcb, lockflags); + + FREE(btIterator, M_TEMP); } return err; diff --git a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c 
b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c index de2858418..a8a874c90 100644 --- a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c +++ b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c @@ -32,7 +32,7 @@ Version: HFS Plus 1.0 - Copyright: ÔøΩ 1996-2009 by Apple Computer, Inc., all rights reserved. + Copyright: � 1996-2009 by Apple Computer, Inc., all rights reserved. */ @@ -70,6 +70,10 @@ Public routines: filesystem. It is also used to shrink or grow the number of blocks that the red-black tree should know about. If growing, scan the new range of bitmap, and if shrinking, reduce the number of items in the tree that we can allocate from. + + UnmapBlocks + Issues DKIOCUNMAPs to the device as it fills the internal volume buffer when iterating + the volume bitmap. Internal routines: Note that the RBTree routines are guarded by a cpp check for CONFIG_HFS_ALLOC_RBTREE. This @@ -141,6 +145,16 @@ Internal routines: ReleaseBitmapBlock Release a bitmap block back into the buffer cache. + + remove_free_extent_cache + Remove an extent from the free extent cache. Handles overlaps + with multiple extents in the cache, and handles splitting an + extent in the cache if the extent to be removed is in the middle + of a cached extent. + + add_free_extent_cache + Add an extent to the free extent cache. It will merge the + input extent with extents already in the cache. Debug/Test Routines @@ -204,6 +218,8 @@ Red Black Tree Specific Routines #include #include #include +/* For VM Page size */ +#include #include "../../hfs.h" #include "../../hfs_dbg.h" @@ -214,6 +230,10 @@ Red Black Tree Specific Routines #include "../headers/HybridAllocator.h" #include "../../hfs_kdebug.h" +/* Headers for unmap-on-mount support */ +#include +#include + #ifndef CONFIG_HFS_TRIM #define CONFIG_HFS_TRIM 0 #endif @@ -339,10 +359,25 @@ static OSErr BlockMarkFreeInternal( u_int32_t numBlocks, Boolean do_validate); -#if CONFIG_HFS_ALLOC_RBTREE -static OSErr ReleaseRBScanBitmapBlock( struct buf *bp ); +static OSErr ReleaseScanBitmapBlock( struct buf *bp ); + +static int hfs_track_unmap_blocks (struct hfsmount *hfsmp, u_int32_t offset, + u_int32_t numBlocks, struct jnl_trim_list *list); + +static int hfs_issue_unmap (struct hfsmount *hfsmp, struct jnl_trim_list *list); + +static int hfs_alloc_scan_block(struct hfsmount *hfsmp, + u_int32_t startbit, + u_int32_t endBit, + u_int32_t *bitToScan, + struct jnl_trim_list *list); + +int hfs_isallocated_scan (struct hfsmount *hfsmp, + u_int32_t startingBlock, + u_int32_t *bp_buf); +#if CONFIG_HFS_ALLOC_RBTREE static OSErr BlockAllocateAnyRBTree( ExtendedVCB *vcb, u_int32_t startingBlock, @@ -390,15 +425,6 @@ void check_rbtree_extents (struct hfsmount *hfsmp, u_int32_t numBlocks, int shouldBeFree); -int hfs_isallocated_scan (struct hfsmount *hfsmp, - u_int32_t startingBlock, - u_int32_t *bp_buf); - -static int hfs_alloc_scan_block(struct hfsmount *hfsmp, - u_int32_t startbit, - u_int32_t endBit, - u_int32_t *bitToScan); - #define ASSERT_FREE 1 #define ASSERT_ALLOC 0 @@ -410,19 +436,13 @@ static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBloc static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated); #if ALLOC_DEBUG -/* - * Extra #includes for the debug function below. These are not normally #included because - * they would constitute a layering violation - */ -#include -#include - /* * Validation Routine to verify that the TRIM list maintained by the journal * is in good shape relative to what we think the bitmap should have. 
We should * never encounter allocated blocks in the TRIM list, so if we ever encounter them, * we panic. */ +int trim_validate_bitmap (struct hfsmount *hfsmp); int trim_validate_bitmap (struct hfsmount *hfsmp) { u_int64_t blockno_offset; u_int64_t numblocks; @@ -459,43 +479,60 @@ int trim_validate_bitmap (struct hfsmount *hfsmp) { #endif + /* -;________________________________________________________________________________ -; -; Routine: hfs_unmap_free_extent -; -; Function: Make note of a range of allocation blocks that should be -; unmapped (trimmed). That is, the given range of blocks no -; longer have useful content, and the device can unmap the -; previous contents. For example, a solid state disk may reuse -; the underlying storage for other blocks. -; -; This routine is only supported for journaled volumes. The extent -; being freed is passed to the journal code, and the extent will -; be unmapped after the current transaction is written to disk. -; -; Input Arguments: -; hfsmp - The volume containing the allocation blocks. -; startingBlock - The first allocation block of the extent being freed. -; numBlocks - The number of allocation blocks of the extent being freed. -;________________________________________________________________________________ -*/ + ;________________________________________________________________________________ + ; + ; Routine: hfs_unmap_free_extent + ; + ; Function: Make note of a range of allocation blocks that should be + ; unmapped (trimmed). That is, the given range of blocks no + ; longer have useful content, and the device can unmap the + ; previous contents. For example, a solid state disk may reuse + ; the underlying storage for other blocks. + ; + ; This routine is only supported for journaled volumes. The extent + ; being freed is passed to the journal code, and the extent will + ; be unmapped after the current transaction is written to disk. + ; + ; Input Arguments: + ; hfsmp - The volume containing the allocation blocks. + ; startingBlock - The first allocation block of the extent being freed. + ; numBlocks - The number of allocation blocks of the extent being freed. 
+ ;________________________________________________________________________________ + */ static void hfs_unmap_free_extent(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks) { u_int64_t offset; u_int64_t length; - int err; - + u_int64_t device_sz; + int err = 0; + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_FREE | DBG_FUNC_START, startingBlock, numBlocks, 0, 0, 0); + if (ALLOC_DEBUG) { + if (hfs_isallocated(hfsmp, startingBlock, numBlocks)) { + panic("hfs: %p: (%u,%u) unmapping allocated blocks", hfsmp, startingBlock, numBlocks); + } + } + if (hfsmp->jnl != NULL) { + device_sz = hfsmp->hfs_logical_bytes; offset = (u_int64_t) startingBlock * hfsmp->blockSize + (u_int64_t) hfsmp->hfsPlusIOPosOffset; length = (u_int64_t) numBlocks * hfsmp->blockSize; - err = journal_trim_add_extent(hfsmp->jnl, offset, length); - if (err) { - printf("hfs_unmap_free_extent: error %d from journal_trim_add_extent", err); + /* Validate that the trim is in a valid range of bytes */ + if ((offset >= device_sz) || ((offset + length) > device_sz)) { + printf("hfs_unmap_free_ext: ignoring trim @ off %lld len %lld \n", offset, length); + err = EINVAL; + } + + if (err == 0) { + err = journal_trim_add_extent(hfsmp->jnl, offset, length); + if (err) { + printf("hfs_unmap_free_extent: error %d from journal_trim_add_extent", err); + } } } @@ -504,22 +541,107 @@ static void hfs_unmap_free_extent(struct hfsmount *hfsmp, u_int32_t startingBloc } + /* -;________________________________________________________________________________ -; -; Routine: hfs_unmap_alloc_extent -; -; Function: Make note of a range of allocation blocks, some of -; which may have previously been passed to hfs_unmap_free_extent, -; is now in use on the volume. The given blocks will be removed -; from any pending DKIOCUNMAP. -; -; Input Arguments: -; hfsmp - The volume containing the allocation blocks. -; startingBlock - The first allocation block of the extent being allocated. -; numBlocks - The number of allocation blocks being allocated. -;________________________________________________________________________________ -*/ + ;________________________________________________________________________________ + ; + ; Routine: hfs_track_unmap_blocks + ; + ; Function: Make note of a range of allocation blocks that should be + ; unmapped (trimmed). That is, the given range of blocks no + ; longer have useful content, and the device can unmap the + ; previous contents. For example, a solid state disk may reuse + ; the underlying storage for other blocks. + ; + ; This routine is only supported for journaled volumes. + ; + ; *****NOTE*****: + ; This function should *NOT* be used when the volume is fully + ; mounted. This function is intended to support a bitmap iteration + ; at mount time to fully inform the SSD driver of the state of all blocks + ; at mount time, and assumes that there is no allocation/deallocation + ; interference during its iteration. + ; + ; Input Arguments: + ; hfsmp - The volume containing the allocation blocks. + ; offset - The first allocation block of the extent being freed. + ; numBlocks - The number of allocation blocks of the extent being freed. + ; list - The list of currently tracked trim ranges.
+ ;________________________________________________________________________________ + */ +static int hfs_track_unmap_blocks (struct hfsmount *hfsmp, u_int32_t start, + u_int32_t numBlocks, struct jnl_trim_list *list) { + + u_int64_t offset; + u_int64_t length; + int error = 0; + + if ((hfsmp->hfs_flags & HFS_UNMAP) && (hfsmp->jnl != NULL)) { + int extent_no = list->extent_count; + offset = (u_int64_t) start * hfsmp->blockSize + (u_int64_t) hfsmp->hfsPlusIOPosOffset; + length = (u_int64_t) numBlocks * hfsmp->blockSize; + + + list->extents[extent_no].offset = offset; + list->extents[extent_no].length = length; + list->extent_count++; + if (list->extent_count == list->allocated_count) { + error = hfs_issue_unmap (hfsmp, list); + } + } + + return error; +} + +/* + ;________________________________________________________________________________ + ; + ; Routine: hfs_issue_unmap + ; + ; Function: Issue a DKIOCUNMAP for all blocks currently tracked by the jnl_trim_list + ; + ; Input Arguments: + ; hfsmp - The volume containing the allocation blocks. + ; list - The list of currently tracked trim ranges. + ;________________________________________________________________________________ + */ + +static int hfs_issue_unmap (struct hfsmount *hfsmp, struct jnl_trim_list *list) { + dk_unmap_t unmap; + int error = 0; + + if (list->extent_count > 0) { + bzero(&unmap, sizeof(unmap)); + unmap.extents = list->extents; + unmap.extentsCount = list->extent_count; + + /* Issue a TRIM and flush them out */ + error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel()); + + bzero (list->extents, (list->allocated_count * sizeof(dk_extent_t))); + list->extent_count = 0; + } + return error; +} + + + +/* + ;________________________________________________________________________________ + ; + ; Routine: hfs_unmap_alloc_extent + ; + ; Function: Make note of a range of allocation blocks, some of + ; which may have previously been passed to hfs_unmap_free_extent, + ; is now in use on the volume. The given blocks will be removed + ; from any pending DKIOCUNMAP. + ; + ; Input Arguments: + ; hfsmp - The volume containing the allocation blocks. + ; startingBlock - The first allocation block of the extent being allocated. + ; numBlocks - The number of allocation blocks being allocated. + ;________________________________________________________________________________ + */ static void hfs_unmap_alloc_extent(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks) { u_int64_t offset; @@ -594,6 +716,64 @@ hfs_trim_callback(void *arg, uint32_t extent_count, const dk_extent_t *extents) } +/* + ;________________________________________________________________________________ + ; + ; Routine: UnmapBlocks + ; + ; Function: Traverse the bitmap, and issue DKIOCUNMAPs to the underlying + ; device as needed so that the underlying disk device is as + ; up-to-date as possible with which blocks are unmapped. + ; + ; Input Arguments: + ; hfsmp - The volume containing the allocation blocks. 
+ ;________________________________________________________________________________ + */ + +__private_extern__ +u_int32_t UnmapBlocks (struct hfsmount *hfsmp) { + u_int32_t blocks_scanned = 0; + int error = 0; + struct jnl_trim_list trimlist; + + /* + *struct jnl_trim_list { + uint32_t allocated_count; + uint32_t extent_count; + dk_extent_t *extents; + }; + */ + bzero (&trimlist, sizeof(trimlist)); + if (CONFIG_HFS_TRIM) { + int alloc_count = PAGE_SIZE / sizeof(dk_extent_t); + void *extents = kalloc (alloc_count * sizeof(dk_extent_t)); + if (extents == NULL) { + return ENOMEM; + } + trimlist.extents = (dk_extent_t*)extents; + trimlist.allocated_count = alloc_count; + trimlist.extent_count = 0; + + + + while ((blocks_scanned < hfsmp->totalBlocks) && (error == 0)){ + error = hfs_alloc_scan_block (hfsmp, blocks_scanned, hfsmp->totalBlocks, + &blocks_scanned, &trimlist); + if (error) { + printf("HFS: bitmap unmap scan error: %d\n", error); + break; + } + } + if (error == 0) { + hfs_issue_unmap(hfsmp, &trimlist); + } + if (trimlist.extents) { + kfree (trimlist.extents, (trimlist.allocated_count * sizeof(dk_extent_t))); + } + } + return error; +} + /* ;________________________________________________________________________________ ; @@ -1256,16 +1436,15 @@ static OSErr ReleaseBitmapBlock( return (0); } -#if CONFIG_HFS_ALLOC_RBTREE /* - * ReleaseRBScanBitmapBlock is used to release struct bufs that were - * created for use by the Red-Black tree generation code. We want to force + * ReleaseScanBitmapBlock is used to release struct bufs that were + * created for use by bitmap scanning code. We want to force * them to be purged out of the buffer cache ASAP, so we'll release them differently * than in the ReleaseBitmapBlock case. Alternately, we know that we're only reading * the blocks, so we will never dirty them as part of the tree building scan. */ -static OSErr ReleaseRBScanBitmapBlock(struct buf *bp ) { +static OSErr ReleaseScanBitmapBlock(struct buf *bp ) { if (bp == NULL) { return (0); @@ -1284,9 +1463,6 @@ static OSErr ReleaseRBScanBitmapBlock(struct buf *bp ) { } -#endif - - /* _______________________________________________________________________ @@ -1906,9 +2082,9 @@ Exit: *actualStartBlock = 0; *actualNumBlocks = 0; } - - if (currCache) - (void) ReleaseBitmapBlock(vcb, blockRef, dirty); + + if (currCache) + (void) ReleaseBitmapBlock(vcb, blockRef, dirty); if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_ANY_BITMAP | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0); @@ -1945,9 +2121,7 @@ static OSErr BlockAllocateKnown( u_int32_t *actualNumBlocks) { OSErr err; - u_int32_t i; u_int32_t foundBlocks; - u_int32_t newStartBlock, newBlockCount; if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_KNOWN_BITMAP | DBG_FUNC_START, 0, 0, maxBlocks, 0, 0); @@ -1975,59 +2149,10 @@ static OSErr BlockAllocateKnown( foundBlocks = maxBlocks; *actualNumBlocks = foundBlocks; - if (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE) { - // since sparse volumes keep the free extent list sorted by starting - // block number, the list won't get re-ordered, it may only shrink - // - vcb->vcbFreeExt[0].startBlock += foundBlocks; - vcb->vcbFreeExt[0].blockCount -= foundBlocks; - if (vcb->vcbFreeExt[0].blockCount == 0) { - for(i=1; i < vcb->vcbFreeExtCnt; i++) { - vcb->vcbFreeExt[i-1] = vcb->vcbFreeExt[i]; - } - vcb->vcbFreeExtCnt--; - } - - goto done; - } + lck_spin_unlock(&vcb->vcbFreeExtLock); - // Adjust the start and length of that extent. 
- newStartBlock = vcb->vcbFreeExt[0].startBlock + foundBlocks; - newBlockCount = vcb->vcbFreeExt[0].blockCount - foundBlocks; - + remove_free_extent_cache(vcb, *actualStartBlock, *actualNumBlocks); - // The first extent might not be the largest anymore. Bubble up any - // (now larger) extents to the top of the list. - for (i=1; ivcbFreeExtCnt; ++i) - { - if (vcb->vcbFreeExt[i].blockCount > newBlockCount) - { - vcb->vcbFreeExt[i-1].startBlock = vcb->vcbFreeExt[i].startBlock; - vcb->vcbFreeExt[i-1].blockCount = vcb->vcbFreeExt[i].blockCount; - } - else - { - break; - } - } - - // If this is now the smallest known free extent, then it might be smaller than - // other extents we didn't keep track of. So, just forget about this extent. - // After the previous loop, (i-1) is the index of the extent we just allocated from. - if (newBlockCount == 0) - { - // then just reduce the number of free extents since this guy got deleted - --vcb->vcbFreeExtCnt; - } - else - { - // It's not the smallest, so store it in its proper place - vcb->vcbFreeExt[i-1].startBlock = newStartBlock; - vcb->vcbFreeExt[i-1].blockCount = newBlockCount; - } - -done: - lck_spin_unlock(&vcb->vcbFreeExtLock); // sanity check if ((*actualStartBlock + *actualNumBlocks) > vcb->allocLimit) { @@ -2546,21 +2671,24 @@ OSErr BlockMarkFreeInternal( register u_int32_t numBlocks_in, Boolean do_validate) { - OSErr err; + OSErr err; u_int32_t startingBlock = startingBlock_in; u_int32_t numBlocks = numBlocks_in; - register u_int32_t *currentWord; // Pointer to current word within bitmap block - register u_int32_t wordsLeft; // Number of words left in this bitmap block - register u_int32_t bitMask; // Word with given bits already set (ready to OR in) - u_int32_t firstBit; // Bit index within word of first bit to allocate - u_int32_t numBits; // Number of bits in word to allocate - u_int32_t *buffer = NULL; - uintptr_t blockRef; - u_int32_t bitsPerBlock; - u_int32_t wordsPerBlock; + uint32_t unmapStart = startingBlock_in; + uint32_t unmapCount = numBlocks_in; + uint32_t wordIndexInBlock; + u_int32_t *currentWord; // Pointer to current word within bitmap block + u_int32_t wordsLeft; // Number of words left in this bitmap block + u_int32_t bitMask; // Word with given bits already set (ready to OR in) + u_int32_t currentBit; // Bit index within word of current bit to allocate + u_int32_t numBits; // Number of bits in word to allocate + u_int32_t *buffer = NULL; + uintptr_t blockRef; + u_int32_t bitsPerBlock; + u_int32_t wordsPerBlock; // XXXdbg struct hfsmount *hfsmp = VCBTOHFS(vcb); - + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_FREE_BITMAP | DBG_FUNC_START, startingBlock_in, numBlocks_in, do_validate, 0, 0); @@ -2579,30 +2707,46 @@ OSErr BlockMarkFreeInternal( err = EIO; goto Exit; } - + // // Pre-read the bitmap block containing the first word of allocation // - + err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); if (err != noErr) goto Exit; // XXXdbg if (hfsmp->jnl) { journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); } - + // - // Initialize currentWord, and wordsLeft. + // Figure out how many bits and words per bitmap block. // - { - u_int32_t wordIndexInBlock; + bitsPerBlock = vcb->vcbVBMIOSize * kBitsPerByte; + wordsPerBlock = vcb->vcbVBMIOSize / kBytesPerWord; + wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; + + // + // Look for a range of free blocks immediately before startingBlock + // (up to the start of the current bitmap block). 
Set unmapStart to + // the first free block. + // + currentWord = buffer + wordIndexInBlock; + currentBit = startingBlock % kBitsPerWord; + bitMask = kHighBitInWordMask >> currentBit; + while (true) { + // Move currentWord/bitMask back by one bit + bitMask <<= 1; + if (bitMask == 0) { + if (--currentWord < buffer) + break; + bitMask = kLowBitInWordMask; + } - bitsPerBlock = vcb->vcbVBMIOSize * kBitsPerByte; - wordsPerBlock = vcb->vcbVBMIOSize / kBytesPerWord; - - wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; - currentWord = buffer + wordIndexInBlock; - wordsLeft = wordsPerBlock - wordIndexInBlock; + if (*currentWord & SWAP_BE32(bitMask)) + break; // Found an allocated block. Stop searching. + --unmapStart; + ++unmapCount; } // @@ -2610,14 +2754,16 @@ OSErr BlockMarkFreeInternal( // boundary in the bitmap, then treat that first word // specially. // - - firstBit = startingBlock % kBitsPerWord; - if (firstBit != 0) { - bitMask = kAllBitsSetInWord >> firstBit; // turn off all bits before firstBit - numBits = kBitsPerWord - firstBit; // number of remaining bits in this word + + currentWord = buffer + wordIndexInBlock; + wordsLeft = wordsPerBlock - wordIndexInBlock; + currentBit = startingBlock % kBitsPerWord; + if (currentBit != 0) { + bitMask = kAllBitsSetInWord >> currentBit; // turn off all bits before currentBit + numBits = kBitsPerWord - currentBit; // number of remaining bits in this word if (numBits > numBlocks) { numBits = numBlocks; // entire allocation is inside this one word - bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); // turn off bits after last + bitMask &= ~(kAllBitsSetInWord >> (currentBit + numBits)); // turn off bits after last } if ((do_validate == true) && (*currentWord & SWAP_BE32 (bitMask)) != SWAP_BE32 (bitMask)) { @@ -2625,15 +2771,15 @@ OSErr BlockMarkFreeInternal( } *currentWord &= SWAP_BE32 (~bitMask); // clear the bits in the bitmap numBlocks -= numBits; // adjust number of blocks left to free - + ++currentWord; // move to next word --wordsLeft; // one less word left in this block } - + // // Free whole words (32 blocks) at a time. // - + while (numBlocks >= kBitsPerWord) { if (wordsLeft == 0) { // Read in the next bitmap block @@ -2642,15 +2788,15 @@ OSErr BlockMarkFreeInternal( buffer = NULL; err = ReleaseBitmapBlock(vcb, blockRef, true); if (err != noErr) goto Exit; - + err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); if (err != noErr) goto Exit; - + // XXXdbg if (hfsmp->jnl) { journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); } - + // Readjust currentWord and wordsLeft currentWord = buffer; wordsLeft = wordsPerBlock; @@ -2663,7 +2809,7 @@ OSErr BlockMarkFreeInternal( numBlocks -= kBitsPerWord; ++currentWord; // move to next word - --wordsLeft; // one less word left in this block + --wordsLeft; // one less word left in this block } // @@ -2679,10 +2825,10 @@ OSErr BlockMarkFreeInternal( buffer = NULL; err = ReleaseBitmapBlock(vcb, blockRef, true); if (err != noErr) goto Exit; - + err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); if (err != noErr) goto Exit; - + // XXXdbg if (hfsmp->jnl) { journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); @@ -2697,35 +2843,60 @@ OSErr BlockMarkFreeInternal( goto Corruption; } *currentWord &= SWAP_BE32 (~bitMask); // clear the bits in the bitmap - + // No need to update currentWord or wordsLeft } - + + // + // Look for a range of free blocks immediately after the range we just freed + // (up to the end of the current bitmap block). 
+ // + wordIndexInBlock = ((startingBlock_in + numBlocks_in - 1) & (bitsPerBlock-1)) / kBitsPerWord; + wordsLeft = wordsPerBlock - wordIndexInBlock; + currentWord = buffer + wordIndexInBlock; + currentBit = (startingBlock_in + numBlocks_in - 1) % kBitsPerWord; + bitMask = kHighBitInWordMask >> currentBit; + while (true) { + // Move currentWord/bitMask/wordsLeft forward one bit + bitMask >>= 1; + if (bitMask == 0) { + if (--wordsLeft == 0) + break; + ++currentWord; + bitMask = kHighBitInWordMask; + } + + if (*currentWord & SWAP_BE32(bitMask)) + break; // Found an allocated block. Stop searching. + ++unmapCount; + } + Exit: - + if (buffer) (void)ReleaseBitmapBlock(vcb, blockRef, true); - + if (err == noErr) { - hfs_unmap_free_extent(vcb, startingBlock_in, numBlocks_in); + hfs_unmap_free_extent(vcb, unmapStart, unmapCount); } if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_FREE_BITMAP | DBG_FUNC_END, err, 0, 0, 0, 0); return err; - + Corruption: #if DEBUG_BUILD panic("hfs: BlockMarkFreeInternal: blocks not allocated!"); #else - printf ("hfs: BlockMarkFreeInternal() trying to free unallocated blocks (%u,%u) on volume %s\n", startingBlock, numBlocks, vcb->vcbVN); + printf ("hfs: BlockMarkFreeInternal() trying to free unallocated blocks on volume %s\n", vcb->vcbVN); hfs_mark_volume_inconsistent(vcb); err = EIO; goto Exit; #endif } + #if CONFIG_HFS_ALLOC_RBTREE /* * This is a wrapper function around BlockMarkFree. This function is @@ -3628,104 +3799,31 @@ hfs_isrbtree_active(struct hfsmount *hfsmp){ return 0; } -#if CONFIG_HFS_ALLOC_RBTREE -/* - * This function is basically the same as hfs_isallocated, except it's designed for - * use with the red-black tree validation code. It assumes we're only checking whether - * one bit is active, and that we're going to pass in the buf to use, since GenerateTree - * calls ReadBitmapBlock and will have that buf locked down for the duration of its operation. + +/* + * This function scans the specified bitmap block and acts on it as necessary. + * We may add it to the list of blocks to be UNMAP/TRIM'd or add it to allocator + * data structures. This function is not #if'd to the CONFIG_RB case because + * we want to use it unilaterally at mount time if on a UNMAP-capable device. + * + * Additionally, we may want an allocating thread to invoke this if the tree + * does not have enough extents to satisfy an allocation request. + * + * startbit - the allocation block represented by a bit in 'allocblock' where we need to + * start our scan. For instance, we may need to start the normal allocation scan + * in the middle of an existing allocation block. + * endBit - the allocation block where we should end this search (inclusive). + * bitToScan - output argument for this function to specify the next bit to scan. * - * This should not be called in general purpose scanning code. - */ -int hfs_isallocated_scan(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t *bp_buf) { - - u_int32_t *currentWord; // Pointer to current word within bitmap block - u_int32_t bitMask; // Word with given bits already set (ready to test) - u_int32_t firstBit; // Bit index within word of first bit to allocate - u_int32_t numBits; // Number of bits in word to allocate - u_int32_t bitsPerBlock; - uintptr_t blockRef; - u_int32_t wordsPerBlock; - u_int32_t numBlocks = 1; - u_int32_t *buffer = NULL; - - int inuse = 0; - int error; - - - if (bp_buf) { - /* just use passed-in buffer if avail. 
*/ - buffer = bp_buf; - } - else { - /* - * Pre-read the bitmap block containing the first word of allocation - */ - error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef); - if (error) - return (error); - } - - /* - * Initialize currentWord, and wordsLeft. - */ - u_int32_t wordIndexInBlock; - - bitsPerBlock = hfsmp->vcbVBMIOSize * kBitsPerByte; - wordsPerBlock = hfsmp->vcbVBMIOSize / kBytesPerWord; - - wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; - currentWord = buffer + wordIndexInBlock; - - /* - * First test any non word aligned bits. - */ - firstBit = startingBlock % kBitsPerWord; - bitMask = kAllBitsSetInWord >> firstBit; - numBits = kBitsPerWord - firstBit; - if (numBits > numBlocks) { - numBits = numBlocks; - bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); - } - if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { - inuse = 1; - goto Exit; - } - numBlocks -= numBits; - ++currentWord; - -Exit: - if(bp_buf == NULL) { - if (buffer) { - (void)ReleaseBitmapBlock(hfsmp, blockRef, false); - } - } - return (inuse); - - - -} - -/* - * This function scans the specified block and adds it to the pair of trees specified - * in its arguments. We break this behavior out of GenerateTree so that an allocating - * thread can invoke this if the tree does not have enough extents to satisfy - * an allocation request. - * - * startbit - the allocation block represented by a bit in 'allocblock' where we need to - * start our scan. For instance, we may need to start the normal allocation scan - * in the middle of an existing allocation block. - * endBit - the allocation block where we should end this search (inclusive). - * bitToScan - output argument for this function to specify the next bit to scan. - * - * Returns: - * 0 on success - * nonzero on failure. + * Returns: + * 0 on success + * nonzero on failure. */ static int hfs_alloc_scan_block(struct hfsmount *hfsmp, u_int32_t startbit, - u_int32_t endBit, u_int32_t *bitToScan) { - + u_int32_t endBit, u_int32_t *bitToScan, + struct jnl_trim_list *list) { + int error; u_int32_t curAllocBlock; struct buf *blockRef = NULL; @@ -3735,7 +3833,7 @@ static int hfs_alloc_scan_block(struct hfsmount *hfsmp, u_int32_t startbit, u_int32_t wordsPerBlock = blockSize / kBytesPerWord; u_int32_t offset = 0; u_int32_t size = 0; - + /* * Read the appropriate block from the bitmap file. ReadBitmapBlock * figures out which actual on-disk block corresponds to the bit we're @@ -3748,7 +3846,7 @@ static int hfs_alloc_scan_block(struct hfsmount *hfsmp, u_int32_t startbit, /* curAllocBlock represents the logical block we're analyzing. */ curAllocBlock = startbit; - + /* Figure out which word curAllocBlock corresponds to in the block we read */ wordIndexInBlock = (curAllocBlock / kBitsPerWord) % wordsPerBlock; @@ -3784,9 +3882,12 @@ static int hfs_alloc_scan_block(struct hfsmount *hfsmp, u_int32_t startbit, * we saw, and reset our tally counter. */ if (size != 0) { +#if CONFIG_HFS_ALLOC_RBTREE extent_tree_free_space(&hfsmp->offset_tree, size, offset); - size = 0; - offset = 0; +#endif + hfs_track_unmap_blocks (hfsmp, offset, size, list); + size = 0; + offset = 0; } } curAllocBlock++; @@ -3804,7 +3905,10 @@ DoneScanning: /* We may have been tracking a range of free blocks that hasn't been inserted yet. 
*/ if (size != 0) { - extent_tree_free_space(&hfsmp->offset_tree, size, offset); +#if CONFIG_HFS_ALLOC_RBTREE + extent_tree_free_space(&hfsmp->offset_tree, size, offset); +#endif + hfs_track_unmap_blocks (hfsmp, offset, size, list); } /* * curAllocBlock represents the next block we need to scan while we're in this @@ -3812,11 +3916,91 @@ DoneScanning: */ *bitToScan = curAllocBlock; - ReleaseRBScanBitmapBlock(blockRef); - + ReleaseScanBitmapBlock(blockRef); + return 0; } + +/* + * This function is basically the same as hfs_isallocated, except it's designed for + * use with the red-black tree validation code. It assumes we're only checking whether + * one bit is active, and that we're going to pass in the buf to use, since GenerateTree + * calls ReadBitmapBlock and will have that buf locked down for the duration of its operation. + * + * This should not be called in general purpose scanning code. + */ +int hfs_isallocated_scan(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t *bp_buf) { + + u_int32_t *currentWord; // Pointer to current word within bitmap block + u_int32_t bitMask; // Word with given bits already set (ready to test) + u_int32_t firstBit; // Bit index within word of first bit to allocate + u_int32_t numBits; // Number of bits in word to allocate + u_int32_t bitsPerBlock; + uintptr_t blockRef; + u_int32_t wordsPerBlock; + u_int32_t numBlocks = 1; + u_int32_t *buffer = NULL; + + int inuse = 0; + int error; + + + if (bp_buf) { + /* just use passed-in buffer if avail. */ + buffer = bp_buf; + } + else { + /* + * Pre-read the bitmap block containing the first word of allocation + */ + error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef); + if (error) + return (error); + } + + /* + * Initialize currentWord, and wordsLeft. + */ + u_int32_t wordIndexInBlock; + + bitsPerBlock = hfsmp->vcbVBMIOSize * kBitsPerByte; + wordsPerBlock = hfsmp->vcbVBMIOSize / kBytesPerWord; + + wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; + currentWord = buffer + wordIndexInBlock; + + /* + * First test any non word aligned bits. + */ + firstBit = startingBlock % kBitsPerWord; + bitMask = kAllBitsSetInWord >> firstBit; + numBits = kBitsPerWord - firstBit; + if (numBits > numBlocks) { + numBits = numBlocks; + bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); + } + if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { + inuse = 1; + goto Exit; + } + numBlocks -= numBits; + ++currentWord; + +Exit: + if(bp_buf == NULL) { + if (buffer) { + (void)ReleaseBitmapBlock(hfsmp, blockRef, false); + } + } + return (inuse); + + + +} + +#if CONFIG_HFS_ALLOC_RBTREE + /* * Extern function that is called from mount and upgrade mount routines * that enable us to initialize the tree. @@ -4166,103 +4350,276 @@ u_int32_t UpdateAllocLimit (struct hfsmount *hfsmp, u_int32_t new_end_block) { } +/* + * Remove an extent from the list of free extents. + * + * This is a low-level routine. It does not handle overlaps or splitting; + * that is the responsibility of the caller. The input extent must exactly + * match an extent already in the list; it will be removed, and any following + * extents in the list will be shifted up. + * + * Inputs: + * startBlock - Start of extent to remove + * blockCount - Number of blocks in extent to remove + * + * Result: + * The index of the extent that was removed. 
+ */ +static void remove_free_extent_list(struct hfsmount *hfsmp, int index) +{ + if (index < 0 || (uint32_t)index >= hfsmp->vcbFreeExtCnt) { + if (ALLOC_DEBUG) + panic("hfs: remove_free_extent_list: %p: index (%d) out of range (0, %u)", hfsmp, index, hfsmp->vcbFreeExtCnt); + else + printf("hfs: remove_free_extent_list: %p: index (%d) out of range (0, %u)", hfsmp, index, hfsmp->vcbFreeExtCnt); + return; + } + int shift_count = hfsmp->vcbFreeExtCnt - index - 1; + if (shift_count > 0) { + memmove(&hfsmp->vcbFreeExt[index], &hfsmp->vcbFreeExt[index+1], shift_count * sizeof(hfsmp->vcbFreeExt[0])); + } + hfsmp->vcbFreeExtCnt--; +} + + +/* + * Add an extent to the list of free extents. + * + * This is a low-level routine. It does not handle overlaps or coalescing; + * that is the responsibility of the caller. This routine *does* make + * sure that the extent it is adding is inserted in the correct location. + * If the list is full, this routine will handle either removing the last + * extent in the list to make room for the new extent, or ignoring the + * new extent if it is "worse" than the last extent in the list. + * + * Inputs: + * startBlock - Start of extent to add + * blockCount - Number of blocks in extent to add + * + * Result: + * The index where the extent that was inserted, or kMaxFreeExtents + * if the extent was not inserted (the list was full, and the extent + * being added was "worse" than everything in the list). + */ +static int add_free_extent_list(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount) +{ + uint32_t i; + + /* ALLOC_DEBUG: Make sure no extents in the list overlap or are contiguous with the input extent. */ + if (ALLOC_DEBUG) { + uint32_t endBlock = startBlock + blockCount; + for (i = 0; i < hfsmp->vcbFreeExtCnt; ++i) { + if (endBlock < hfsmp->vcbFreeExt[i].startBlock || + startBlock > (hfsmp->vcbFreeExt[i].startBlock + hfsmp->vcbFreeExt[i].blockCount)) { + continue; + } + panic("hfs: add_free_extent_list: %p: extent(%u %u) overlaps existing extent (%u %u) at index %d", + hfsmp, startBlock, blockCount, hfsmp->vcbFreeExt[i].startBlock, hfsmp->vcbFreeExt[i].blockCount, i); + } + } + + /* Figure out what index the new extent should be inserted at. */ + for (i = 0; i < hfsmp->vcbFreeExtCnt; ++i) { + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + /* The list is sorted by increasing offset. */ + if (startBlock < hfsmp->vcbFreeExt[i].startBlock) { + break; + } + } else { + /* The list is sorted by decreasing size. */ + if (blockCount > hfsmp->vcbFreeExt[i].blockCount) { + break; + } + } + } + + /* When we get here, i is the index where the extent should be inserted. */ + if (i == kMaxFreeExtents) { + /* + * The new extent is worse than anything already in the list, + * and the list is full, so just ignore the extent to be added. + */ + return i; + } + + /* + * Grow the list (if possible) to make room for an insert. + */ + if (hfsmp->vcbFreeExtCnt < kMaxFreeExtents) + hfsmp->vcbFreeExtCnt++; + + /* + * If we'll be keeping any extents after the insert position, then shift them. + */ + int shift_count = hfsmp->vcbFreeExtCnt - i - 1; + if (shift_count > 0) { + memmove(&hfsmp->vcbFreeExt[i+1], &hfsmp->vcbFreeExt[i], shift_count * sizeof(hfsmp->vcbFreeExt[0])); + } + + /* Finally, store the new extent at its correct position. */ + hfsmp->vcbFreeExt[i].startBlock = startBlock; + hfsmp->vcbFreeExt[i].blockCount = blockCount; + return i; +} + + /* * Remove an entry from free extent cache after it has been allocated. 
* - * This function does not split extents to remove them from the allocated list. + * This is a high-level routine. It handles removing a portion of a + * cached extent, potentially splitting it into two (if the cache was + * already full, throwing away the extent that would sort last). It + * also handles removing an extent that overlaps multiple extents in + * the cache. * * Inputs: - * hfsmp - mount point structure - * startBlock - starting block of the extent to be removed. - * blockCount - number of blocks of the extent to be removed. + * hfsmp - mount point structure + * startBlock - starting block of the extent to be removed. + * blockCount - number of blocks of the extent to be removed. */ static void remove_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount) { - int i, j; + u_int32_t i, insertedIndex; + u_int32_t currentStart, currentEnd, endBlock; int extentsRemoved = 0; - u_int32_t start, end; - + #if CONFIG_HFS_ALLOC_RBTREE /* If red-black tree is enabled, no free extent cache is necessary */ if (hfs_isrbtree_active(hfsmp) == true) { return; } #endif - + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) KERNEL_DEBUG_CONSTANT(HFSDBG_REMOVE_EXTENT_CACHE | DBG_FUNC_START, startBlock, blockCount, 0, 0, 0); - + + endBlock = startBlock + blockCount; + lck_spin_lock(&hfsmp->vcbFreeExtLock); - - for (i = 0; i < (int)hfsmp->vcbFreeExtCnt; i++) { - start = hfsmp->vcbFreeExt[i].startBlock; - end = start + hfsmp->vcbFreeExt[i].blockCount; - - /* If the extent to remove from free extent list starts within - * this free extent, or, if it starts before this free extent - * but ends in this free extent, remove it by shifting all other - * extents. + + /* + * Iterate over all of the extents in the free extent cache, removing or + * updating any entries that overlap with the input extent. + */ + for (i = 0; i < hfsmp->vcbFreeExtCnt; ++i) { + currentStart = hfsmp->vcbFreeExt[i].startBlock; + currentEnd = currentStart + hfsmp->vcbFreeExt[i].blockCount; + + /* + * If the current extent is entirely before or entirely after the + * the extent to be removed, then we keep it as-is. */ - if (((startBlock >= start) && (startBlock < end)) || - ((startBlock < start) && (startBlock + blockCount) > start)) { - for (j = i; j < (int)hfsmp->vcbFreeExtCnt - 1; j++) { - hfsmp->vcbFreeExt[j] = hfsmp->vcbFreeExt[j+1]; - } - hfsmp->vcbFreeExtCnt--; - /* Decrement the index so that we check the extent - * that just got shifted to the current index. + if (currentEnd <= startBlock || currentStart >= endBlock) { + continue; + } + + /* + * If the extent being removed entirely contains the current extent, + * then remove the current extent. + */ + if (startBlock <= currentStart && endBlock >= currentEnd) { + remove_free_extent_list(hfsmp, i); + + /* + * We just removed the extent at index i. The extent at + * index i+1 just got shifted to index i. So decrement i + * to undo the loop's "++i", and the next iteration will + * examine index i again, which contains the next extent + * in the list. */ - i--; - extentsRemoved++; + --i; + ++extentsRemoved; + continue; } - /* Continue looping as we might have to invalidate multiple extents, - * probably not possible in normal case, but does not hurt. + + /* + * If the extent being removed is strictly "in the middle" of the + * current extent, then we need to split the current extent into + * two discontiguous extents (the "head" and "tail"). The good + * news is that we don't need to examine any other extents in + * the list. 
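+ *
+ * For example, if the cache holds the free extent (100, 50), i.e.
+ * blocks 100 through 149, and the extent (110, 10) is removed, that
+ * entry is replaced by the head (100, 10) and the tail (120, 30).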
*/ + if (startBlock > currentStart && endBlock < currentEnd) { + remove_free_extent_list(hfsmp, i); + add_free_extent_list(hfsmp, currentStart, startBlock - currentStart); + add_free_extent_list(hfsmp, endBlock, currentEnd - endBlock); + break; + } + + /* + * The only remaining possibility is that the extent to be removed + * overlaps the start or end (but not both!) of the current extent. + * So we need to replace the current extent with a shorter one. + * + * The only tricky part is that the updated extent might be at a + * different index than the original extent. If the updated extent + * was inserted after the current extent, then we need to re-examine + * the entry at index i, since it now contains the extent that was + * previously at index i+1. If the updated extent was inserted + * before or at the same index as the removed extent, then the + * following extents haven't changed position. + */ + remove_free_extent_list(hfsmp, i); + if (startBlock > currentStart) { + /* Remove the tail of the current extent. */ + insertedIndex = add_free_extent_list(hfsmp, currentStart, startBlock - currentStart); + } else { + /* Remove the head of the current extent. */ + insertedIndex = add_free_extent_list(hfsmp, endBlock, currentEnd - endBlock); + } + if (insertedIndex > i) { + --i; /* Undo the "++i" in the loop, so we examine the entry at index i again. */ + } } lck_spin_unlock(&hfsmp->vcbFreeExtLock); - + sanity_check_free_ext(hfsmp, 0); - + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) KERNEL_DEBUG_CONSTANT(HFSDBG_REMOVE_EXTENT_CACHE | DBG_FUNC_END, 0, 0, 0, extentsRemoved, 0); - + return; } + /* - * Add an entry to free extent cache after it has been deallocated. + * Add an entry to free extent cache after it has been deallocated. * - * If the extent provided has blocks beyond current allocLimit, it - * is clipped to allocLimit. This function does not merge contiguous - * extents, if they already exist in the list. + * This is a high-level routine. It will merge overlapping or contiguous + * extents into a single, larger extent. + * + * If the extent provided has blocks beyond current allocLimit, it is + * clipped to allocLimit (so that we won't accidentally find and allocate + * space beyond allocLimit). * * Inputs: - * hfsmp - mount point structure - * startBlock - starting block of the extent to be removed. - * blockCount - number of blocks of the extent to be removed. + * hfsmp - mount point structure + * startBlock - starting block of the extent to be removed. + * blockCount - number of blocks of the extent to be removed. * * Returns: - * true - if the extent was added successfully to the list - * false - if the extent was no added to the list, maybe because - * the extent was beyond allocLimit, or is not best - * candidate to be put in the cache. + * true - if the extent was added successfully to the list + * false - if the extent was not added to the list, maybe because + * the extent was beyond allocLimit, or is not best + * candidate to be put in the cache. 
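+ *
+ * For example, if the cache already holds (100, 20) and (130, 20),
+ * then adding the extent (120, 10) coalesces all three into the
+ * single cached extent (100, 50).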
*/ -static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount) +static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount) { Boolean retval = false; - u_int32_t start, end; - int i; + uint32_t endBlock; + uint32_t currentEnd; + uint32_t i; if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) KERNEL_DEBUG_CONSTANT(HFSDBG_ADD_EXTENT_CACHE | DBG_FUNC_START, startBlock, blockCount, 0, 0, 0); - + /* * If using the red-black tree allocator, then there's no need to special case - * for the sparse device case. We'll simply add the region we've recently freed + * for the sparse device case. We'll simply add the region we've recently freed * to the red-black tree, where it will get sorted by offset and length. The only special * casing will need to be done on the allocation side, where we may favor free extents - * based on offset even if it will cause fragmentation. This may be true, for example, if + * based on offset even if it will cause fragmentation. This may be true, for example, if * we are trying to reduce the number of bandfiles created in a sparse bundle disk image. */ #if CONFIG_HFS_ALLOC_RBTREE @@ -4270,93 +4627,58 @@ static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBloc goto out_not_locked; } #endif - + /* No need to add extent that is beyond current allocLimit */ if (startBlock >= hfsmp->allocLimit) { goto out_not_locked; } - + /* If end of the free extent is beyond current allocLimit, clip the extent */ if ((startBlock + blockCount) > hfsmp->allocLimit) { blockCount = hfsmp->allocLimit - startBlock; } - + lck_spin_lock(&hfsmp->vcbFreeExtLock); - - /* If the free extent cache is full and the new extent fails to - * compare with the last extent, skip adding it to the list. - */ - if (hfsmp->vcbFreeExtCnt == kMaxFreeExtents) { - if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { - /* For sparse disks, free extent cache list is sorted by start block, lowest first */ - if (startBlock > hfsmp->vcbFreeExt[kMaxFreeExtents-1].startBlock) { - goto out; - } - } else { - /* For normal mounts, free extent cache list is sorted by total blocks, highest first */ - if (blockCount <= hfsmp->vcbFreeExt[kMaxFreeExtents-1].blockCount) { - goto out; - } - } - } - - /* Check if the current extent overlaps with any of the existing - * extents. If yes, just skip adding it to the list. We have - * to do this check before shifting the extent records. - */ - for (i = 0; i < (int)hfsmp->vcbFreeExtCnt; i++) { - - start = hfsmp->vcbFreeExt[i].startBlock; - end = start + hfsmp->vcbFreeExt[i].blockCount; - - if (((startBlock >= start) && (startBlock < end)) || - ((startBlock < start) && (startBlock + blockCount) > start)) { - goto out; - } - } - - /* Scan the free extent cache array from tail to head till - * we find the entry after which our new entry should be - * inserted. After we break out of this loop, the new entry - * will be inserted at 'i+1'. + + /* + * Make a pass through the free extent cache, looking for known extents that + * overlap or are contiguous with the extent to be added. We'll remove those + * extents from the cache, and incorporate them into the new extent to be added. 
*/ - for (i = (int)hfsmp->vcbFreeExtCnt-1; i >= 0; i--) { - if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { - /* For sparse devices, find entry with smaller start block than ours */ - if (hfsmp->vcbFreeExt[i].startBlock < startBlock) { - break; - } + endBlock = startBlock + blockCount; + for (i=0; i < hfsmp->vcbFreeExtCnt; ++i) { + currentEnd = hfsmp->vcbFreeExt[i].startBlock + hfsmp->vcbFreeExt[i].blockCount; + if (hfsmp->vcbFreeExt[i].startBlock > endBlock || currentEnd < startBlock) { + /* Extent i does not overlap and is not contiguous, so keep it. */ + continue; } else { - /* For normal devices, find entry with greater block count than ours */ - if (hfsmp->vcbFreeExt[i].blockCount >= blockCount) { - break; - } - } - - /* If this is not the right spot to insert, and this is - * not the last entry in the array, just shift it and - * continue check another one. - */ - if ((i+1) < kMaxFreeExtents) { - hfsmp->vcbFreeExt[i+1] = hfsmp->vcbFreeExt[i]; + /* We need to remove extent i and combine it with the input extent. */ + if (hfsmp->vcbFreeExt[i].startBlock < startBlock) + startBlock = hfsmp->vcbFreeExt[i].startBlock; + if (currentEnd > endBlock) + endBlock = currentEnd; + + remove_free_extent_list(hfsmp, i); + /* + * We just removed the extent at index i. The extent at + * index i+1 just got shifted to index i. So decrement i + * to undo the loop's "++i", and the next iteration will + * examine index i again, which contains the next extent + * in the list. + */ + --i; } } - /* 'i' points to one index offset before which the new extent should be inserted */ - hfsmp->vcbFreeExt[i+1].startBlock = startBlock; - hfsmp->vcbFreeExt[i+1].blockCount = blockCount; - if (hfsmp->vcbFreeExtCnt < kMaxFreeExtents) { - hfsmp->vcbFreeExtCnt++; - } - retval = true; - -out: + add_free_extent_list(hfsmp, startBlock, endBlock - startBlock); + lck_spin_unlock(&hfsmp->vcbFreeExtLock); + out_not_locked: sanity_check_free_ext(hfsmp, 0); - + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) KERNEL_DEBUG_CONSTANT(HFSDBG_ADD_EXTENT_CACHE | DBG_FUNC_END, 0, 0, 0, retval, 0); - + return retval; } @@ -4372,6 +4694,9 @@ static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated) lck_spin_lock(&hfsmp->vcbFreeExtLock); + if (hfsmp->vcbFreeExtCnt > kMaxFreeExtents) + panic("hfs: %p: free extent count (%u) is too large", hfsmp, hfsmp->vcbFreeExtCnt); + /* * Iterate the Free extent cache and ensure no entries are bogus or refer to * allocated blocks. 
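The two low-level list helpers introduced above are easiest to see in isolation. The following stand-alone sketch is user-space C, not the kernel code itself: MAX_FREE_EXTENTS and the freeExt/freeExtCnt names are simplified stand-ins for kMaxFreeExtents and the vcbFreeExt fields, and the spinlock, ALLOC_DEBUG checks, and sparse-device (sorted-by-offset) policy are omitted. It models only the non-sparse behavior, where the list is kept sorted by decreasing extent size:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define MAX_FREE_EXTENTS 10   /* stand-in for kMaxFreeExtents */

struct extent { uint32_t start, count; };

static struct extent freeExt[MAX_FREE_EXTENTS];
static uint32_t freeExtCnt = 0;

/* Mirror of remove_free_extent_list(): close the gap at 'index'. */
static void list_remove(int index)
{
    int shift = freeExtCnt - index - 1;
    if (shift > 0)
        memmove(&freeExt[index], &freeExt[index + 1],
                shift * sizeof(freeExt[0]));
    freeExtCnt--;
}

/* Mirror of add_free_extent_list() for the "sorted by decreasing
 * size" policy: find the insert position, shift the tail (dropping
 * the last entry if the list is already full), and store. Returns
 * the insert index, or MAX_FREE_EXTENTS if the extent was ignored. */
static int list_add(uint32_t start, uint32_t count)
{
    uint32_t i;
    for (i = 0; i < freeExtCnt; ++i)
        if (count > freeExt[i].count)
            break;
    if (i == MAX_FREE_EXTENTS)
        return i;                 /* worse than everything; ignored */
    if (freeExtCnt < MAX_FREE_EXTENTS)
        freeExtCnt++;             /* grow; otherwise the last entry is lost */
    int shift = freeExtCnt - i - 1;
    if (shift > 0)
        memmove(&freeExt[i + 1], &freeExt[i],
                shift * sizeof(freeExt[0]));
    freeExt[i].start = start;
    freeExt[i].count = count;
    return i;
}

int main(void)
{
    list_add(500, 8);
    list_add(100, 32);
    list_add(300, 16);
    list_remove(1);               /* drop (300, 16) */
    for (uint32_t i = 0; i < freeExtCnt; ++i)
        printf("extent %u: start=%u count=%u\n",
               i, freeExt[i].start, freeExt[i].count);
    return 0;
}

Running the sketch prints (100, 32) followed by (500, 8): the largest extent stays at index 0, and removing an entry closes the gap with a single memmove, the same mechanism the kernel routines use.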
diff --git a/bsd/hfs/hfscommon/headers/FileMgrInternal.h b/bsd/hfs/hfscommon/headers/FileMgrInternal.h index 7276daa26..e8ddcac86 100644 --- a/bsd/hfs/hfscommon/headers/FileMgrInternal.h +++ b/bsd/hfs/hfscommon/headers/FileMgrInternal.h @@ -250,7 +250,10 @@ MetaZoneFreeBlocks(ExtendedVCB *vcb); EXTERN_API_C( u_int32_t ) UpdateAllocLimit (struct hfsmount *hfsmp, u_int32_t new_end_block); - + +EXTERN_API_C( u_int32_t ) +UnmapBlocks(struct hfsmount *hfsmp); + #if CONFIG_HFS_ALLOC_RBTREE EXTERN_API_C( u_int32_t ) GenerateTree( struct hfsmount *hfsmp, u_int32_t end_block, int *flags, int initialscan); diff --git a/bsd/kern/bsd_init.c b/bsd/kern/bsd_init.c index e60df12e0..2a04fff66 100644 --- a/bsd/kern/bsd_init.c +++ b/bsd/kern/bsd_init.c @@ -133,11 +133,12 @@ #include /* for mcache_init() */ #include /* for mbinit() */ #include /* for knote_init() */ -#include /* for kern_memorystatus_init() */ +#include /* for memorystatus_init() */ #include /* for aio_init() */ #include /* for psem_cache_init() */ #include /* for dlil_init() */ #include /* for proto_kpi_init() */ +#include /* for iptap_init() */ #include /* for pipeinit() */ #include /* for socketinit() */ #include /* for domaininit() */ @@ -154,6 +155,7 @@ #include /* for utun_register_control() */ #include /* for net_str_id_init() */ #include /* for netsrc_init() */ +#include /* for nstat_init() */ #include /* for assert() */ #include @@ -223,13 +225,6 @@ char hostname[MAXHOSTNAMELEN]; int hostnamelen; char domainname[MAXDOMNAMELEN]; int domainnamelen; -#if defined(__i386__) || defined(__x86_64__) -struct exec_archhandler exec_archhandler_ppc = { - .path = "/usr/libexec/oah/RosettaNonGrata", -}; -#else /* __i386__ */ -struct exec_archhandler exec_archhandler_ppc; -#endif /* __i386__ */ char rootdevice[16]; /* hfs device names have at least 9 chars */ @@ -250,6 +245,7 @@ extern void klogwakeup(void); extern void file_lock_init(void); extern void kmeminit(void); extern void bsd_bufferinit(void); +extern void throttle_init(void); extern int serverperfmode; extern int ncl; @@ -263,13 +259,8 @@ __private_extern__ int execargs_cache_size = 0; __private_extern__ int execargs_free_count = 0; __private_extern__ vm_offset_t * execargs_cache = NULL; -void bsd_exec_setup(int); +void bsd_exec_setup(int) __attribute__((aligned(4096))); -/* - * Set to disable grading 64 bit Mach-o binaries as executable, for testing; - * Intel only. - */ -__private_extern__ int bootarg_no64exec = 0; __private_extern__ int bootarg_vnode_cache_defeat = 0; /* @@ -330,7 +321,6 @@ extern void stackshot_lock_init(void); extern void dtrace_postinit(void); #endif - /* * Initialization code. * Called from cold start routine as @@ -394,14 +384,15 @@ void (*unmountroot_pre_hook)(void); * of the uu_context.vc_ucred field so that the uthread structure can be * used like any other. */ +extern void run_bringup_tests(void); + +extern void IOServicePublishResource(const char *, boolean_t); + void bsd_init(void) { struct uthread *ut; unsigned int i; -#if __i386__ || __x86_64__ - int error; -#endif struct vfs_context context; kern_return_t ret; struct ucred temp_cred; @@ -412,6 +403,8 @@ bsd_init(void) #define bsd_init_kprintf(x...) 
/* kprintf("bsd_init: " x) */ + throttle_init(); + kernel_flock = funnel_alloc(KERNEL_FUNNEL); if (kernel_flock == (funnel_t *)0 ) { panic("bsd_init: Failed to allocate kernel funnel"); @@ -775,22 +768,26 @@ bsd_init(void) socketinit(); bsd_init_kprintf("calling domaininit\n"); domaininit(); + iptap_init(); #endif /* SOCKETS */ kernproc->p_fd->fd_cdir = NULL; kernproc->p_fd->fd_rdir = NULL; #if CONFIG_FREEZE - /* Initialise background hibernation */ - bsd_init_kprintf("calling kern_hibernation_init\n"); - kern_hibernation_init(); +#ifndef CONFIG_MEMORYSTATUS + #error "CONFIG_FREEZE defined without matching CONFIG_MEMORYSTATUS" +#endif + /* Initialise background freezing */ + bsd_init_kprintf("calling memorystatus_freeze_init\n"); + memorystatus_freeze_init(); #endif -#if CONFIG_EMBEDDED +#if CONFIG_MEMORYSTATUS /* Initialize kernel memory status notifications */ - bsd_init_kprintf("calling kern_memorystatus_init\n"); - kern_memorystatus_init(); -#endif + bsd_init_kprintf("calling memorystatus_init\n"); + memorystatus_init(); +#endif /* CONFIG_MEMORYSTATUS */ #ifdef GPROF /* Initialize kernel profiling. */ @@ -837,10 +834,8 @@ bsd_init(void) /* register user tunnel kernel control handler */ utun_register_control(); - netsrc_init(); - - /* wait for network domain to finish */ - domainfin(); + netsrc_init(); + nstat_init(); #endif /* NETWORKING */ bsd_init_kprintf("calling vnode_pager_bootstrap\n"); @@ -963,13 +958,6 @@ bsd_init(void) pal_kernel_announce(); -#if __i386__ || __x86_64__ - /* this should be done after the root filesystem is mounted */ - error = set_archhandler(kernproc, CPU_TYPE_POWERPC); - if (error) /* XXX make more generic */ - exec_archhandler_ppc.path[0] = 0; -#endif - bsd_init_kprintf("calling mountroot_post_hook\n"); /* invoke post-root-mount hook */ @@ -1131,10 +1119,6 @@ parse_bsd_args(void) if (PE_parse_boot_argn("-x", namep, sizeof (namep))) /* safe boot */ boothowto |= RB_SAFEBOOT; - /* disable 64 bit grading */ - if (PE_parse_boot_argn("-no64exec", namep, sizeof (namep))) - bootarg_no64exec = 1; - /* disable vnode_cache_is_authorized() by setting vnode_cache_defeat */ if (PE_parse_boot_argn("-vnode_cache_defeat", namep, sizeof (namep))) bootarg_vnode_cache_defeat = 1; diff --git a/bsd/kern/bsd_stubs.c b/bsd/kern/bsd_stubs.c index 19da61270..bc4537d35 100644 --- a/bsd/kern/bsd_stubs.c +++ b/bsd/kern/bsd_stubs.c @@ -120,24 +120,29 @@ struct cdevsw nocdev = NO_CDEVICE; * else see whether the index is free * return the major number that is free else -1 * + * if index is negative, we start + * looking for a free slot at the absolute value of index, + * instead of starting at 0 */ int bdevsw_isfree(int index) { struct bdevsw *devsw; - if (index == -1) { - devsw = bdevsw; - for(index=0; index < nblkdev; index++, devsw++) { - if(memcmp((char *)devsw, - (char *)&nobdev, - sizeof(struct bdevsw)) == 0) - break; + + if (index < 0) { + if (index == -1) + index = 1; /* start at 1 to avoid collision with volfs (Radar 2842228) */ + else + index = -index; /* start at least this far up in the table */ + devsw = &bdevsw[index]; + for(; index < nblkdev; index++, devsw++) { + if(memcmp((char *)devsw, + (char *)&nobdev, + sizeof(struct bdevsw)) == 0) + break; } - } else { - /* NB: Not used below unless index is in range */ - devsw = &bdevsw[index]; } - + devsw = &bdevsw[index]; if ((index < 0) || (index >= nblkdev) || (memcmp((char *)devsw, (char *)&nobdev, @@ -151,33 +156,22 @@ bdevsw_isfree(int index) * if index is -1, find a free slot to add * else see whether the slot is free * 
return the major number that is used else -1 + * + * if index is negative, we start + * looking for a free slot at the absolute value of index, + * instead of starting at 0 */ int bdevsw_add(int index, struct bdevsw * bsw) { - struct bdevsw *devsw; - - if (index == -1) { - devsw = &bdevsw[1]; /* Start at slot 1 - this is a hack to fix the index=1 hack */ - /* yes, start at 1 to avoid collision with volfs (Radar 2842228) */ - for(index=1; index < nblkdev; index++, devsw++) { - if(memcmp((char *)devsw, - (char *)&nobdev, - sizeof(struct bdevsw)) == 0) - break; - } - } - devsw = &bdevsw[index]; - if ((index < 0) || (index >= nblkdev) || - (memcmp((char *)devsw, - (char *)&nobdev, - sizeof(struct bdevsw)) != 0)) { + index = bdevsw_isfree(index); + if (index < 0) { return(-1); } bdevsw[index] = *bsw; return(index); } -/* +/* * if the slot has the same bsw, then remove * else -1 */ @@ -201,19 +195,27 @@ bdevsw_remove(int index, struct bdevsw * bsw) * if index is -1, return a free slot if avaliable * else see whether the index is free * return the major number that is free else -1 + * + * if index is negative, we start + * looking for a free slot at the absolute value of index, + * instead of starting at 0 */ int cdevsw_isfree(int index) { struct cdevsw *devsw; - if (index == -1) { - devsw = cdevsw; - for(index=0; index < nchrdev; index++, devsw++) { - if(memcmp((char *)devsw, - (char *)&nocdev, - sizeof(struct cdevsw)) == 0) - break; + if (index < 0) { + if (index == -1) + index = 0; + else + index = -index; /* start at least this far up in the table */ + devsw = &cdevsw[index]; + for(; index < nchrdev; index++, devsw++) { + if(memcmp((char *)devsw, + (char *)&nocdev, + sizeof(struct cdevsw)) == 0) + break; } } devsw = &cdevsw[index]; @@ -231,45 +233,27 @@ cdevsw_isfree(int index) * else see whether the slot is free * return the major number that is used else -1 * + * if index is negative, we start + * looking for a free slot at the absolute value of index, + * instead of starting at 0 + * * NOTE: In practice, -1 is unusable, since there are kernel internal * devices that call this function with absolute index values, * which will stomp on free-slot based assignments that happen - * before them. Therefore, if index is negative, we start - * looking for a free slot at the absolute value of index, - * instead of starting at 0 (lets out slot 1, but that's one - * of the problem slots down low - the vndevice). -12 is - * currently a safe starting point. + * before them. -24 is currently a safe starting point. 
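+ *
+ * For example, cdevsw_add(-24, &csw) installs csw at the first free
+ * slot at or above major number 24, while cdevsw_add(12, &csw)
+ * succeeds only if slot 12 itself is free.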
*/ int cdevsw_add(int index, struct cdevsw * csw) { - struct cdevsw *devsw; - + index = cdevsw_isfree(index); if (index < 0) { - if (index == -1) - index = 0; /* historical behaviour; XXX broken */ - else - index = -index; /* start at least this far up in the table */ - devsw = &cdevsw[index]; - for(; index < nchrdev; index++, devsw++) { - if(memcmp((char *)devsw, - (char *)&nocdev, - sizeof(struct cdevsw)) == 0) - break; - } - } - devsw = &cdevsw[index]; - if ((index < 0) || (index >= nchrdev) || - (memcmp((char *)devsw, - (char *)&nocdev, - sizeof(struct cdevsw)) != 0)) { return(-1); } cdevsw[index] = *csw; return(index); } /* - * if the index has the same bsw, then remove + * if the slot has the same csw, then remove * else -1 */ int diff --git a/bsd/kern/decmpfs.c b/bsd/kern/decmpfs.c index 33e3b3040..ef8057a4e 100644 --- a/bsd/kern/decmpfs.c +++ b/bsd/kern/decmpfs.c @@ -168,13 +168,27 @@ vfs_context_t decmpfs_ctx; #define offsetof_func(func) ((uintptr_t)(&(((decmpfs_registration*)NULL)->func))) static void * -_func_from_offset(uint32_t type, int offset) +_func_from_offset(uint32_t type, uintptr_t offset) { /* get the function at the given offset in the registration for the given type */ decmpfs_registration *reg = decompressors[type]; char *regChar = (char*)reg; char *func = ®Char[offset]; void **funcPtr = (void**)func; + + switch (reg->decmpfs_registration) { + case DECMPFS_REGISTRATION_VERSION_V1: + if (offset > offsetof_func(free_data)) + return NULL; + break; + case DECMPFS_REGISTRATION_VERSION_V3: + if (offset > offsetof_func(get_flags)) + return NULL; + break; + default: + return NULL; + } + return funcPtr[0]; } @@ -183,7 +197,7 @@ extern boolean_t IOServiceWaitForMatchingResource( const char * property, uint64 extern boolean_t IOCatalogueMatchingDriversPresent( const char * property ); static void * -_decmp_get_func(uint32_t type, int offset) +_decmp_get_func(uint32_t type, uintptr_t offset) { /* this function should be called while holding a shared lock to decompressorsLock, @@ -208,13 +222,15 @@ _decmp_get_func(uint32_t type, int offset) snprintf(resourceName, sizeof(resourceName), "com.apple.AppleFSCompression.Type%u", type); printf("waiting for %s\n", resourceName); while(decompressors[type] == NULL) { - lck_rw_done(decompressorsLock); // we have to unlock to allow the kext to register + lck_rw_unlock_shared(decompressorsLock); // we have to unlock to allow the kext to register if (IOServiceWaitForMatchingResource(resourceName, delay)) { + lck_rw_lock_shared(decompressorsLock); break; } if (!IOCatalogueMatchingDriversPresent(providesName)) { // printf("the kext with %s is no longer present\n", providesName); + lck_rw_lock_shared(decompressorsLock); break; } printf("still waiting for %s\n", resourceName); @@ -273,18 +289,12 @@ decmpfs_cnode_init(decmpfs_cnode *cp) { memset(cp, 0, sizeof(*cp)); lck_rw_init(&cp->compressed_data_lock, decmpfs_lockgrp, NULL); -#if !DECMPFS_SUPPORTS_SWAP64 - lck_mtx_init(&cp->uncompressed_size_mtx, decmpfs_lockgrp, NULL); -#endif } void decmpfs_cnode_destroy(decmpfs_cnode *cp) { lck_rw_destroy(&cp->compressed_data_lock, decmpfs_lockgrp); -#if !DECMPFS_SUPPORTS_SWAP64 - lck_mtx_destroy(&cp->uncompressed_size_mtx, decmpfs_lockgrp); -#endif } boolean_t @@ -382,25 +392,12 @@ decmpfs_cnode_set_vnode_minimal_xattr(decmpfs_cnode *cp, int minimal_xattr, int uint64_t decmpfs_cnode_get_vnode_cached_size(decmpfs_cnode *cp) { -#if DECMPFS_SUPPORTS_SWAP64 return cp->uncompressed_size; -#else - /* - since this is a 64-bit field, we may not be able to access it 
atomically - so lock access - */ - - lck_mtx_lock(&(cp->uncompressed_size_mtx)); - uint64_t ret = cp->uncompressed_size; - lck_mtx_unlock(&(cp->uncompressed_size_mtx)); - return ret; -#endif } static void decmpfs_cnode_set_vnode_cached_size(decmpfs_cnode *cp, uint64_t size) { -#if DECMPFS_SUPPORTS_SWAP64 while(1) { uint64_t old = cp->uncompressed_size; if (OSCompareAndSwap64(old, size, (UInt64*)&cp->uncompressed_size)) { @@ -409,16 +406,25 @@ decmpfs_cnode_set_vnode_cached_size(decmpfs_cnode *cp, uint64_t size) /* failed to write our value, so loop */ } } -#else - /* - since this is a 64-bit field, we may not be able to access it atomically - so lock access - */ - - lck_mtx_lock(&(cp->uncompressed_size_mtx)); - cp->uncompressed_size = size; - lck_mtx_unlock(&(cp->uncompressed_size_mtx)); -#endif +} + +static uint64_t +decmpfs_cnode_get_decompression_flags(decmpfs_cnode *cp) +{ + return cp->decompression_flags; +} + +static void +decmpfs_cnode_set_decompression_flags(decmpfs_cnode *cp, uint64_t flags) +{ + while(1) { + uint64_t old = cp->decompression_flags; + if (OSCompareAndSwap64(old, flags, (UInt64*)&cp->decompression_flags)) { + return; + } else { + /* failed to write our value, so loop */ + } + } } #pragma mark --- decmpfs state routines --- @@ -602,7 +608,7 @@ decmpfs_validate_compressed_file(vnode_t vp, decmpfs_cnode *cp) /* no validate registered, so nothing to do */ err = 0; } - lck_rw_done(decompressorsLock); + lck_rw_unlock_shared(decompressorsLock); out: if (hdr) FREE(hdr, M_TEMP); #if COMPRESSION_DEBUG @@ -632,6 +638,7 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp) mount_t mp = NULL; int cnode_locked = 0; int saveInvalid = 0; // save the header data even though the type was out of range + uint64_t decompression_flags = 0; if (vnode_isnamedstream(vp)) { /* @@ -738,6 +745,15 @@ done: if (ret == FILE_IS_COMPRESSED) { /* update the ubc's size for this file */ ubc_setsize(vp, hdr->uncompressed_size); + + /* update the decompression flags in the decmpfs cnode */ + lck_rw_lock_shared(decompressorsLock); + decmpfs_get_decompression_flags_func get_flags = decmp_get_func(hdr->compression_type, get_flags); + if (get_flags) { + decompression_flags = get_flags(vp, decmpfs_ctx, hdr); + } + lck_rw_unlock_shared(decompressorsLock); + decmpfs_cnode_set_decompression_flags(cp, decompression_flags); } } else { /* we might have already taken the lock above; if so, skip taking it again by passing cnode_locked as the skiplock parameter */ @@ -885,6 +901,11 @@ decmpfs_hides_xattr(vfs_context_t ctx, decmpfs_cnode *cp, const char *xattr) #pragma mark --- registration/validation routines --- +static inline int registration_valid(decmpfs_registration *registration) +{ + return registration && ((registration->decmpfs_registration == DECMPFS_REGISTRATION_VERSION_V1) || (registration->decmpfs_registration == DECMPFS_REGISTRATION_VERSION_V3)); +} + errno_t register_decmpfs_decompressor(uint32_t compression_type, decmpfs_registration *registration) { @@ -894,9 +915,7 @@ register_decmpfs_decompressor(uint32_t compression_type, decmpfs_registration *r int locked = 0; char resourceName[80]; - if ((compression_type >= CMP_MAX) || - (!registration) || - (registration->decmpfs_registration != DECMPFS_REGISTRATION_VERSION)) { + if ((compression_type >= CMP_MAX) || !registration_valid(registration)) { ret = EINVAL; goto out; } @@ -911,10 +930,9 @@ register_decmpfs_decompressor(uint32_t compression_type, decmpfs_registration *r decompressors[compression_type] = registration; snprintf(resourceName, 
sizeof(resourceName), "com.apple.AppleFSCompression.Type%u", compression_type); IOServicePublishResource(resourceName, TRUE); - wakeup((caddr_t)&decompressors); out: - if (locked) lck_rw_done(decompressorsLock); + if (locked) lck_rw_unlock_exclusive(decompressorsLock); return ret; } @@ -927,9 +945,7 @@ unregister_decmpfs_decompressor(uint32_t compression_type, decmpfs_registration int locked = 0; char resourceName[80]; - if ((compression_type >= CMP_MAX) || - (!registration) || - (registration->decmpfs_registration != DECMPFS_REGISTRATION_VERSION)) { + if ((compression_type >= CMP_MAX) || !registration_valid(registration)) { ret = EINVAL; goto out; } @@ -942,10 +958,9 @@ unregister_decmpfs_decompressor(uint32_t compression_type, decmpfs_registration decompressors[compression_type] = NULL; snprintf(resourceName, sizeof(resourceName), "com.apple.AppleFSCompression.Type%u", compression_type); IOServicePublishResource(resourceName, FALSE); - wakeup((caddr_t)&decompressors); out: - if (locked) lck_rw_done(decompressorsLock); + if (locked) lck_rw_unlock_exclusive(decompressorsLock); return ret; } @@ -960,7 +975,7 @@ compression_type_valid(decmpfs_header *hdr) if (decmp_get_func(hdr->compression_type, fetch) != NULL) { ret = 1; } - lck_rw_done(decompressorsLock); + lck_rw_unlock_shared(decompressorsLock); return ret; } @@ -968,7 +983,7 @@ compression_type_valid(decmpfs_header *hdr) #pragma mark --- compression/decompression routines --- static int -decmpfs_fetch_uncompressed_data(vnode_t vp, decmpfs_header *hdr, off_t offset, user_ssize_t size, int nvec, decmpfs_vector *vec, uint64_t *bytes_read) +decmpfs_fetch_uncompressed_data(vnode_t vp, decmpfs_cnode *cp, decmpfs_header *hdr, off_t offset, user_ssize_t size, int nvec, decmpfs_vector *vec, uint64_t *bytes_read) { /* get the uncompressed bytes for the specified region of vp by calling out to the registered compressor */ @@ -1000,10 +1015,22 @@ decmpfs_fetch_uncompressed_data(vnode_t vp, decmpfs_header *hdr, off_t offset, u decmpfs_fetch_uncompressed_data_func fetch = decmp_get_func(hdr->compression_type, fetch); if (fetch) { err = fetch(vp, decmpfs_ctx, hdr, offset, size, nvec, vec, bytes_read); + lck_rw_unlock_shared(decompressorsLock); + if (err == 0) { + uint64_t decompression_flags = decmpfs_cnode_get_decompression_flags(cp); + if (decompression_flags & DECMPFS_FLAGS_FORCE_FLUSH_ON_DECOMPRESS) { +#if !defined(__i386__) && !defined(__x86_64__) + int i; + for (i = 0; i < nvec; i++) { + flush_dcache64((addr64_t)(uintptr_t)vec[i].buf, vec[i].size, FALSE); + } +#endif + } + } } else { err = ENOTSUP; + lck_rw_unlock_shared(decompressorsLock); } - lck_rw_done(decompressorsLock); out: return err; @@ -1105,7 +1132,7 @@ decompress: err = 0; did_read = 0; } else { - err = decmpfs_fetch_uncompressed_data(vp, hdr, uplPos, uplSize, 1, &vec, &did_read); + err = decmpfs_fetch_uncompressed_data(vp, cp, hdr, uplPos, uplSize, 1, &vec, &did_read); } if (err) { DebugLog("decmpfs_fetch_uncompressed_data err %d\n", err); @@ -1234,7 +1261,7 @@ decmpfs_read_compressed(struct vnop_read_args *ap, int *is_compressed, decmpfs_c adjust_fetch(vp, decmpfs_ctx, hdr, &uplPos, &uplSize); VerboseLog("adjusted uplPos %lld uplSize %lld\n", (uint64_t)uplPos, (uint64_t)uplSize); } - lck_rw_done(decompressorsLock); + lck_rw_unlock_shared(decompressorsLock); /* clip the adjusted size to the size of the file */ if ((uint64_t)uplPos + uplSize > cachedSize) { @@ -1304,7 +1331,7 @@ decmpfs_read_compressed(struct vnop_read_args *ap, int *is_compressed, decmpfs_c decmpfs_vector vec; 
decompress: vec = (decmpfs_vector){ .buf = data, .size = curUplSize }; - err = decmpfs_fetch_uncompressed_data(vp, hdr, curUplPos, curUplSize, 1, &vec, &did_read); + err = decmpfs_fetch_uncompressed_data(vp, cp, hdr, curUplPos, curUplSize, 1, &vec, &did_read); if (err) { ErrorLog("decmpfs_fetch_uncompressed_data err %d\n", err); @@ -1409,7 +1436,7 @@ decmpfs_free_compressed_data(vnode_t vp, decmpfs_cnode *cp) /* nothing to do, so no error */ err = 0; } - lck_rw_done(decompressorsLock); + lck_rw_unlock_shared(decompressorsLock); if (err != 0) { ErrorLog("decompressor err %d\n", err); @@ -1559,7 +1586,7 @@ decompress: uint64_t bytes_read = 0; decmpfs_vector vec = { .buf = data, .size = MIN(allocSize, remaining) }; - err = decmpfs_fetch_uncompressed_data(vp, hdr, offset, vec.size, 1, &vec, &bytes_read); + err = decmpfs_fetch_uncompressed_data(vp, cp, hdr, offset, vec.size, 1, &vec, &bytes_read); if (err != 0) { ErrorLog("decmpfs_fetch_uncompressed_data err %d\n", err); goto out; @@ -1733,7 +1760,8 @@ static decmpfs_registration Type1Reg = .validate = decmpfs_validate_compressed_file_Type1, .adjust_fetch = NULL, /* no adjust necessary */ .fetch = decmpfs_fetch_uncompressed_data_Type1, - .free_data = NULL /* no free necessary */ + .free_data = NULL, /* no free necessary */ + .get_flags = NULL /* no flags */ }; #pragma mark --- decmpfs initialization --- diff --git a/bsd/kern/imageboot.c b/bsd/kern/imageboot.c index 8bc4ede36..9fa89e443 100644 --- a/bsd/kern/imageboot.c +++ b/bsd/kern/imageboot.c @@ -195,6 +195,7 @@ imageboot_mount_image(const char *root_path, int height) vnode_get_and_drop_always(old_rootvnode); } #else + height = 0; /* keep the compiler from complaining */ vnode_get_and_drop_always(old_rootvnode); #endif /* CONFIG_IMGSRC_ACCESS */ } diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c index f7c7fa73a..b25b3f9d4 100644 --- a/bsd/kern/kdebug.c +++ b/bsd/kern/kdebug.c @@ -65,6 +65,7 @@ #include #include #include +#include /* for isset() */ #include /* for host_info() */ #include @@ -90,6 +91,9 @@ void kdbg_mapinit(void); int kdbg_reinit(boolean_t); int kdbg_bootstrap(boolean_t); +static int kdbg_enable_typefilter(void); +static int kdbg_disable_typefilter(void); + static int create_buffers(boolean_t); static void delete_buffers(void); @@ -189,7 +193,13 @@ struct kd_bufinfo *kdbip = NULL; kd_buf *kdcopybuf = NULL; -unsigned int nkdbufs = 8192; +int kdlog_sched_events = 0; + +boolean_t kdlog_bg_trace = FALSE; +boolean_t kdlog_bg_trace_running = FALSE; +unsigned int bg_nkdbufs = 0; + +unsigned int nkdbufs = 0; unsigned int kdlog_beg=0; unsigned int kdlog_end=0; unsigned int kdlog_value1=0; @@ -237,6 +247,18 @@ pid_t global_state_pid = -1; /* Used to control exclusive use of kd_buffer #define DBG_FUNC_MASK 0xfffffffc +/* TODO: move to kdebug.h */ +#define CLASS_MASK 0xff000000 +#define CLASS_OFFSET 24 +#define SUBCLASS_MASK 0x00ff0000 +#define SUBCLASS_OFFSET 16 +#define CSC_MASK 0xffff0000 /* class and subclass mask */ +#define CSC_OFFSET SUBCLASS_OFFSET + +#define EXTRACT_CLASS(debugid) ( (uint8_t) ( ((debugid) & CLASS_MASK ) >> CLASS_OFFSET ) ) +#define EXTRACT_SUBCLASS(debugid) ( (uint8_t) ( ((debugid) & SUBCLASS_MASK) >> SUBCLASS_OFFSET ) ) +#define EXTRACT_CSC(debugid) ( (uint16_t)( ((debugid) & CSC_MASK ) >> CSC_OFFSET ) ) + #define INTERRUPT 0x01050000 #define MACH_vmfault 0x01300008 #define BSC_SysCall 0x040c0000 @@ -273,18 +295,20 @@ volatile kd_chudhook_fn kdebug_chudhook = 0; /* pointer to CHUD toolkit functi __private_extern__ void stackshot_lock_init( void ) 
__attribute__((section("__TEXT, initcode"))); +static uint8_t *type_filter_bitmap; + static void -kdbg_set_tracing_enabled(boolean_t enabled) +kdbg_set_tracing_enabled(boolean_t enabled, uint32_t trace_type) { int s = ml_set_interrupts_enabled(FALSE); lck_spin_lock(kds_spin_lock); if (enabled) { - kdebug_enable |= KDEBUG_ENABLE_TRACE; + kdebug_enable |= trace_type; kd_ctrl_page.kdebug_slowcheck &= ~SLOW_NOLOG; kd_ctrl_page.enabled = 1; } else { - kdebug_enable &= ~KDEBUG_ENABLE_TRACE; + kdebug_enable &= ~(KDEBUG_ENABLE_TRACE|KDEBUG_ENABLE_PPT); kd_ctrl_page.kdebug_slowcheck |= SLOW_NOLOG; kd_ctrl_page.enabled = 0; } @@ -578,7 +602,7 @@ boolean_t allocate_storage_unit(int cpu) { union kds_ptr kdsp; - struct kd_storage *kdsp_actual; + struct kd_storage *kdsp_actual, *kdsp_next_actual; struct kd_bufinfo *kdbp, *kdbp_vict, *kdbp_try; uint64_t oldest_ts, ts; boolean_t retval = TRUE; @@ -652,9 +676,14 @@ allocate_storage_unit(int cpu) } kdsp = kdbp_vict->kd_list_head; kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); - kdbp_vict->kd_list_head = kdsp_actual->kds_next; + if (kdbp_vict->kd_list_head.raw != KDS_PTR_NULL) { + kdsp_next_actual = POINTER_FROM_KDS_PTR(kdbp_vict->kd_list_head); + kdsp_next_actual->kds_lostevents = TRUE; + } else + kdbp_vict->kd_lostevents = TRUE; + kd_ctrl_page.kdebug_flags |= KDBG_WRAPPED; } kdsp_actual->kds_timestamp = mach_absolute_time(); @@ -707,7 +736,9 @@ kernel_debug_internal( int cpu; struct kd_bufinfo *kdbp; struct kd_storage *kdsp_actual; + union kds_ptr kds_raw; + if (kd_ctrl_page.kdebug_slowcheck) { @@ -748,7 +779,7 @@ kernel_debug_internal( lck_spin_unlock(kds_spin_lock); ml_set_interrupts_enabled(s); } - if ( (kd_ctrl_page.kdebug_slowcheck & SLOW_NOLOG) || !(kdebug_enable & KDEBUG_ENABLE_TRACE)) + if ( (kd_ctrl_page.kdebug_slowcheck & SLOW_NOLOG) || !(kdebug_enable & (KDEBUG_ENABLE_TRACE|KDEBUG_ENABLE_PPT))) goto out1; if ( !ml_at_interrupt_context()) { @@ -759,7 +790,8 @@ kernel_debug_internal( curproc = current_proc(); if ((curproc && !(curproc->p_kdebug)) && - ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE))) + ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE)) && + (debugid >> 24 != DBG_TRACE)) goto out1; } else if (kd_ctrl_page.kdebug_flags & KDBG_PIDEXCLUDE) { @@ -769,30 +801,46 @@ kernel_debug_internal( curproc = current_proc(); if ((curproc && curproc->p_kdebug) && - ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE))) + ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE)) && + (debugid >> 24 != DBG_TRACE)) goto out1; } } - if (kd_ctrl_page.kdebug_flags & KDBG_RANGECHECK) { - if ((debugid < kdlog_beg) - || ((debugid >= kdlog_end) && (debugid >> 24 != DBG_TRACE))) - goto out1; + + if (kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) { + /* Always record trace system info */ + if (EXTRACT_CLASS(debugid) == DBG_TRACE) + goto record_event; + + if (isset(type_filter_bitmap, EXTRACT_CSC(debugid))) + goto record_event; + goto out1; + } + else if (kd_ctrl_page.kdebug_flags & KDBG_RANGECHECK) { + if ((debugid >= kdlog_beg && debugid <= kdlog_end) || (debugid >> 24) == DBG_TRACE) + goto record_event; + if (kdlog_sched_events && (debugid & 0xffff0000) == (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE)) + goto record_event; + goto out1; } else if (kd_ctrl_page.kdebug_flags & KDBG_VALCHECK) { if ((debugid & DBG_FUNC_MASK) != kdlog_value1 && - (debugid & DBG_FUNC_MASK) != kdlog_value2 && - (debugid & DBG_FUNC_MASK) != kdlog_value3 && - (debugid & DBG_FUNC_MASK) 
!= kdlog_value4 && - (debugid >> 24 != DBG_TRACE)) + (debugid & DBG_FUNC_MASK) != kdlog_value2 && + (debugid & DBG_FUNC_MASK) != kdlog_value3 && + (debugid & DBG_FUNC_MASK) != kdlog_value4 && + (debugid >> 24 != DBG_TRACE)) goto out1; } } +record_event: disable_preemption(); cpu = cpu_number(); kdbp = &kdbip[cpu]; retry_q: - if (kdbp->kd_list_tail.raw != KDS_PTR_NULL) { - kdsp_actual = POINTER_FROM_KDS_PTR(kdbp->kd_list_tail); + kds_raw = kdbp->kd_list_tail; + + if (kds_raw.raw != KDS_PTR_NULL) { + kdsp_actual = POINTER_FROM_KDS_PTR(kds_raw); bindx = kdsp_actual->kds_bufindx; } else kdsp_actual = NULL; @@ -963,7 +1011,7 @@ kdbg_reinit(boolean_t early_trace) * First make sure we're not in * the middle of cutting a trace */ - kdbg_set_tracing_enabled(FALSE); + kdbg_set_tracing_enabled(FALSE, KDEBUG_ENABLE_TRACE); /* * make sure the SLOW_NOLOG is seen @@ -1167,7 +1215,7 @@ kdbg_clear(void) * First make sure we're not in * the middle of cutting a trace */ - kdbg_set_tracing_enabled(FALSE); + kdbg_set_tracing_enabled(FALSE, KDEBUG_ENABLE_TRACE); /* * make sure the SLOW_NOLOG is seen @@ -1176,12 +1224,16 @@ kdbg_clear(void) */ IOSleep(100); + kdlog_sched_events = 0; global_state_pid = -1; kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; kd_ctrl_page.kdebug_flags &= ~(KDBG_NOWRAP | KDBG_RANGECHECK | KDBG_VALCHECK); kd_ctrl_page.kdebug_flags &= ~(KDBG_PIDCHECK | KDBG_PIDEXCLUDE); + + kdbg_disable_typefilter(); delete_buffers(); + nkdbufs = 0; /* Clean up the thread map buffer */ kd_ctrl_page.kdebug_flags &= ~KDBG_MAPINIT; @@ -1299,16 +1351,68 @@ kdbg_setrtcdec(kd_regtype *kdr) return(ret); } +int +kdbg_enable_typefilter(void) +{ + if (kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) { + /* free the old filter */ + kdbg_disable_typefilter(); + } + + if (kmem_alloc(kernel_map, (vm_offset_t *)&type_filter_bitmap, KDBG_TYPEFILTER_BITMAP_SIZE) != KERN_SUCCESS) { + return ENOSPC; + } + + bzero(type_filter_bitmap, KDBG_TYPEFILTER_BITMAP_SIZE); + + /* Turn off range and value checks */ + kd_ctrl_page.kdebug_flags &= ~(KDBG_RANGECHECK | KDBG_VALCHECK); + + /* Enable filter checking */ + kd_ctrl_page.kdebug_flags |= KDBG_TYPEFILTER_CHECK; + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); + return 0; +} + +int +kdbg_disable_typefilter(void) +{ + /* Disable filter checking */ + kd_ctrl_page.kdebug_flags &= ~KDBG_TYPEFILTER_CHECK; + + /* Turn off slow checks unless pid checks are using them */ + if ( (kd_ctrl_page.kdebug_flags & (KDBG_PIDCHECK | KDBG_PIDEXCLUDE)) ) + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); + else + kdbg_set_flags(SLOW_CHECKS, 0, FALSE); + + if(type_filter_bitmap == NULL) + return 0; + + vm_offset_t old_bitmap = (vm_offset_t)type_filter_bitmap; + type_filter_bitmap = NULL; + + kmem_free(kernel_map, old_bitmap, KDBG_TYPEFILTER_BITMAP_SIZE); + return 0; +} + int kdbg_setreg(kd_regtype * kdr) { int ret=0; unsigned int val_1, val_2, val; + + kdlog_sched_events = 0; + switch (kdr->type) { case KDBG_CLASSTYPE : val_1 = (kdr->value1 & 0xff); val_2 = (kdr->value2 & 0xff); + + if (val_1 == DBG_FSYSTEM && val_2 == (DBG_FSYSTEM + 1)) + kdlog_sched_events = 1; + kdlog_beg = (val_1<<24); kdlog_end = (val_2<<24); kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; @@ -1348,7 +1452,9 @@ kdbg_setreg(kd_regtype * kdr) case KDBG_TYPENONE : kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - if ( (kd_ctrl_page.kdebug_flags & (KDBG_RANGECHECK | KDBG_VALCHECK | KDBG_PIDCHECK | KDBG_PIDEXCLUDE)) ) + if ( (kd_ctrl_page.kdebug_flags & (KDBG_RANGECHECK | KDBG_VALCHECK | + KDBG_PIDCHECK | KDBG_PIDEXCLUDE | 
+ KDBG_TYPEFILTER_CHECK)) ) kdbg_set_flags(SLOW_CHECKS, 0, TRUE); else kdbg_set_flags(SLOW_CHECKS, 0, FALSE); @@ -1612,7 +1718,7 @@ kdbg_getentropy (user_addr_t buffer, size_t *number, int ms_timeout) } -static void +static int kdbg_set_nkdbufs(unsigned int value) { /* @@ -1622,12 +1728,34 @@ kdbg_set_nkdbufs(unsigned int value) unsigned int max_entries = (sane_size/2) / sizeof(kd_buf); if (value <= max_entries) - nkdbufs = value; + return (value); else - nkdbufs = max_entries; + return (max_entries); } +static void +kdbg_enable_bg_trace(void) +{ + if (kdlog_bg_trace == TRUE && kdlog_bg_trace_running == FALSE && n_storage_buffers == 0) { + nkdbufs = bg_nkdbufs; + kdbg_reinit(FALSE); + kdbg_set_tracing_enabled(TRUE, KDEBUG_ENABLE_TRACE); + kdlog_bg_trace_running = TRUE; + } +} + +static void +kdbg_disable_bg_trace(void) +{ + if (kdlog_bg_trace_running == TRUE) { + kdlog_bg_trace_running = FALSE; + kdbg_clear(); + } +} + + + /* * This function is provided for the CHUD toolkit only. * int val: @@ -1672,6 +1800,7 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) name[0] == KERN_KDEFLAGS || name[0] == KERN_KDDFLAGS || name[0] == KERN_KDENABLE || + name[0] == KERN_KDENABLE_BG_TRACE || name[0] == KERN_KDSETBUF) { if ( namelen < 2 ) @@ -1686,7 +1815,9 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) lck_mtx_lock(kd_trace_mtx_sysctl); - if (name[0] == KERN_KDGETBUF) { + switch(name[0]) { + + case KERN_KDGETBUF: /* * Does not alter the global_state_pid * This is a passive request. @@ -1701,7 +1832,7 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) } kd_bufinfo.nkdbufs = nkdbufs; kd_bufinfo.nkdthreads = kd_mapsize / sizeof(kd_threadmap); - + if ( (kd_ctrl_page.kdebug_slowcheck & SLOW_NOLOG) ) kd_bufinfo.nolog = 1; else @@ -1728,13 +1859,28 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) ret = EINVAL; } goto out; + break; - } else if (name[0] == KERN_KDGETENTROPY) { + case KERN_KDGETENTROPY: if (kd_entropy_buffer) ret = EBUSY; else ret = kdbg_getentropy(where, sizep, value); goto out; + break; + + case KERN_KDENABLE_BG_TRACE: + bg_nkdbufs = kdbg_set_nkdbufs(value); + kdlog_bg_trace = TRUE; + kdbg_enable_bg_trace(); + goto out; + break; + + case KERN_KDDISABLE_BG_TRACE: + kdlog_bg_trace = FALSE; + kdbg_disable_bg_trace(); + goto out; + break; } if ((curproc = current_proc()) != NULL) @@ -1764,40 +1910,55 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) switch(name[0]) { case KERN_KDEFLAGS: + kdbg_disable_bg_trace(); + value &= KDBG_USERFLAGS; kd_ctrl_page.kdebug_flags |= value; break; case KERN_KDDFLAGS: + kdbg_disable_bg_trace(); + value &= KDBG_USERFLAGS; kd_ctrl_page.kdebug_flags &= ~value; break; case KERN_KDENABLE: /* - * used to enable or disable + * Enable tracing mechanism. Two types: + * KDEBUG_TRACE is the standard one, + * and KDEBUG_PPT which is a carefully + * chosen subset to avoid performance impact. 
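+ * Any other value is rejected: the code below returns EINVAL
+ * unless the buffers are initialized and value is exactly
+ * KDEBUG_ENABLE_TRACE or KDEBUG_ENABLE_PPT.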
*/ if (value) { /* * enable only if buffer is initialized */ - if (!(kd_ctrl_page.kdebug_flags & KDBG_BUFINIT)) { + if (!(kd_ctrl_page.kdebug_flags & KDBG_BUFINIT) || + !(value == KDEBUG_ENABLE_TRACE || value == KDEBUG_ENABLE_PPT)) { ret = EINVAL; break; } kdbg_mapinit(); - kdbg_set_tracing_enabled(TRUE); + kdbg_set_tracing_enabled(TRUE, value); } else - kdbg_set_tracing_enabled(FALSE); + { + kdbg_set_tracing_enabled(FALSE, 0); + } break; case KERN_KDSETBUF: - kdbg_set_nkdbufs(value); + kdbg_disable_bg_trace(); + + nkdbufs = kdbg_set_nkdbufs(value); break; case KERN_KDSETUP: + kdbg_disable_bg_trace(); + ret = kdbg_reinit(FALSE); break; case KERN_KDREMOVE: kdbg_clear(); + kdbg_enable_bg_trace(); break; case KERN_KDSETREG: if(size < sizeof(kd_regtype)) { @@ -1808,6 +1969,8 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) ret = EINVAL; break; } + kdbg_disable_bg_trace(); + ret = kdbg_setreg(&kd_Reg); break; case KERN_KDGETREG: @@ -1819,6 +1982,8 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) if (copyout(&kd_Reg, where, sizeof(kd_regtype))) { ret = EINVAL; } + kdbg_disable_bg_trace(); + break; case KERN_KDREADTR: ret = kdbg_read(where, sizep, NULL, NULL); @@ -1832,6 +1997,8 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) vnode_t vp; int fd; + kdbg_disable_bg_trace(); + if (name[0] == KERN_KDWRITETR) { int s; int wait_result = THREAD_AWAKENED; @@ -1912,6 +2079,8 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) ret = EINVAL; break; } + kdbg_disable_bg_trace(); + ret = kdbg_setpid(&kd_Reg); break; case KERN_KDPIDEX: @@ -1923,6 +2092,8 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) ret = EINVAL; break; } + kdbg_disable_bg_trace(); + ret = kdbg_setpidex(&kd_Reg); break; case KERN_KDTHRMAP: @@ -1937,9 +2108,28 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) ret = EINVAL; break; } + kdbg_disable_bg_trace(); + ret = kdbg_setrtcdec(&kd_Reg); break; - + case KERN_KDSET_TYPEFILTER: + kdbg_disable_bg_trace(); + + if ((kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) == 0){ + if ((ret = kdbg_enable_typefilter())) + break; + } + + if (size != KDBG_TYPEFILTER_BITMAP_SIZE) { + ret = EINVAL; + break; + } + + if (copyin(where, type_filter_bitmap, KDBG_TYPEFILTER_BITMAP_SIZE)) { + ret = EINVAL; + break; + } + break; default: ret = EINVAL; } @@ -2006,17 +2196,23 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) tempbuf = kdcopybuf; tempbuf_number = 0; + // While space while (tempbuf_count) { mintime = 0xffffffffffffffffULL; min_kdbp = NULL; min_cpu = 0; + // Check all CPUs for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_cpus; cpu++, kdbp++) { + // Find one with raw data if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) continue; + + // Get from cpu data to buffer header to buffer kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); + // See if there are actual data left in this buffer rcursor = kdsp_actual->kds_readlast; if (rcursor == kdsp_actual->kds_bufindx) @@ -2052,11 +2248,13 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) out_of_events = TRUE; break; } + + // Get data kdsp = min_kdbp->kd_list_head; kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); if (kdsp_actual->kds_lostevents == TRUE) { - lostevent.timestamp = kdsp_actual->kds_records[kdsp_actual->kds_readlast].timestamp; + kdbg_set_timestamp_and_cpu(&lostevent, kdsp_actual->kds_records[kdsp_actual->kds_readlast].timestamp, min_cpu); *tempbuf = 
lostevent; kdsp_actual->kds_lostevents = FALSE; @@ -2064,6 +2262,8 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) goto nextevent; } + + // Copy into buffer *tempbuf = kdsp_actual->kds_records[kdsp_actual->kds_readlast++]; if (kdsp_actual->kds_readlast == EVENTS_PER_STORAGE_UNIT) @@ -2263,10 +2463,10 @@ start_kern_tracing(unsigned int new_nkdbufs) { if (!new_nkdbufs) return; - kdbg_set_nkdbufs(new_nkdbufs); + nkdbufs = kdbg_set_nkdbufs(new_nkdbufs); kdbg_lock_init(); kdbg_reinit(TRUE); - kdbg_set_tracing_enabled(TRUE); + kdbg_set_tracing_enabled(TRUE, KDEBUG_ENABLE_TRACE); #if defined(__i386__) || defined(__x86_64__) uint64_t now = mach_absolute_time(); diff --git a/bsd/kern/kern_authorization.c b/bsd/kern/kern_authorization.c index f30df6d2d..1cf74dc41 100644 --- a/bsd/kern/kern_authorization.c +++ b/bsd/kern/kern_authorization.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 Apple Inc. All rights reserved. + * Copyright (c) 2004-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -154,11 +154,14 @@ kauth_init(void) /* bring up kauth subsystem components */ kauth_cred_init(); +#if CONFIG_EXT_RESOLVER kauth_identity_init(); kauth_groups_init(); +#endif kauth_scope_init(); +#if CONFIG_EXT_RESOLVER kauth_resolver_init(); - +#endif /* can't alloc locks after this */ lck_grp_free(kauth_lck_grp); kauth_lck_grp = NULL; diff --git a/bsd/kern/kern_callout.c b/bsd/kern/kern_callout.c deleted file mode 100644 index 58df65fa7..000000000 --- a/bsd/kern/kern_callout.c +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2004-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * Kernel callout related functions, including moving average calculation - * to permit the kernel to know about insufficiently responsive user space - * processes. - */ - -#include /* memove, memset */ -#include /* uint64_t */ -#include - -/* - * kco_ma_init - * - * Initialize a moving average structure for use - * - * Parameters: map Pointer to the moving average state - * threshold Threshold % at which to trigger (>100) - * kind Kind of trigger(s) to set - * - * Returns: (void) - * - * Notes: The number of samples in a simple moving average is not - * controllable; this might be a future direction. 
- * - * The simple and weighted thresholds are not separately - * controllable; this might be a future direction, but - * will likely be unnecessary due to one type being in use - * at a time in the most likely scenarios. - */ -void -kco_ma_init(struct kco_moving_average *map, int32_t threshold, int kind) -{ - memset(map, 0, sizeof(*map)); - - /* per algorithm init required */ - map->ma_flags |= KCO_MA_F_NEEDS_INIT; - - /* set algorithm selector flags */ - map->ma_flags |= kind; - - /* set thresholds */ - map->ma_sma_threshold = threshold; - map->ma_wma_threshold = threshold; -} - - -/* - * kco_ma_info - * - * Report on the current moving average information; this is typically only - * called after a trigger event. - * - * Parameters: map Pointer to the moving average state - * kind Kind of trigger to report on - * averagep Pointer to area to receive current - * old_averagep Pointer to area to receive previous - * thresholdp Pointer to area to receive threshold - * - * Returns: 0 Information not available - * 1 Information retrieved - * - * Notes: You can only retrieve one kind of average information at a - * time; if you are collecting multiple types, then you must - * call this function one time for each type you are interested - * in obtaining. - */ -int -kco_ma_info(struct kco_moving_average *map, int kind, uint64_t *averagep, uint64_t *old_averagep, int32_t *thresholdp, int *countp) -{ - uint64_t average; - uint64_t old_average; - int32_t threshold; - int count; - - /* Not collecting this type of data or no data yet*/ - if (!(map->ma_flags & kind) || (map->ma_flags & KCO_MA_F_NEEDS_INIT)) - return(0); - - switch(kind) { - case KCO_MA_F_SMA: - average = map->ma_sma; - old_average = map->ma_old_sma; - threshold = map->ma_sma_threshold; - count = map->ma_sma_trigger_count; - break; - - case KCO_MA_F_WMA: - average = map->ma_wma; - old_average = map->ma_old_wma; - threshold = map->ma_wma_threshold; - count = map->ma_wma_trigger_count; - break; - - default: - /* - * Asking for data we don't have or more than one kind of - * data at the same time. - */ - return(0); - } - - if (averagep != NULL) - *averagep = average; - if (old_averagep != NULL) - *old_averagep = old_average; - if (thresholdp != NULL) - *thresholdp = threshold; - if (countp != NULL) - *countp = count; - - return(1); -} - - -/* - * kco_ma_addsample - * - * Accumulate a sample into a moving average - * - * Parameters: map Pointer to the moving average state - * sample_time latency delta time - * - * Returns: 0 Nothing triggered - * !0 Bitmap of KCO_MA_F_* flags for the - * algorithms which triggered - * - * Notes: Add a delta time sample to the moving average; this function - * will return bits for each algorithm which went over its - * trigger threshold as a result of receiving the sample. - * Callers can then log/complain/panic over the unresponsive - * process to which they are calling out. - */ -int -kco_ma_addsample(struct kco_moving_average *map, uint64_t sample_time) -{ - int triggered = 0; - int do_init = (map->ma_flags & KCO_MA_F_NEEDS_INIT); - - /* - * SIMPLE MOVING AVERAGE - * - * Compute simple moving average over MA_SMA_SAMPLES; incremental is - * cheaper than re-sum. 
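(For reference: the incremental update this comment describes is sma_new = sma_old + (x_new - x_out) / N, where x_out is the sample leaving the window. E.g. with N = 4 and window {2, 4, 6, 8}, mean 5, replacing the outgoing 2 with a new sample 10 gives 5 + (10 - 2)/4 = 7.)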
- */ - if (map->ma_flags & KCO_MA_F_SMA) { - map->ma_old_sma = map->ma_sma; - - map->ma_sma = ((map->ma_sma * MA_SMA_SAMPLES) - map->ma_sma_samples[0] + sample_time) / MA_SMA_SAMPLES; - memmove(&map->ma_sma_samples[1], &map->ma_sma_samples[0], sizeof(map->ma_sma_samples[0]) *(MA_SMA_SAMPLES - 1)); - map->ma_sma_samples[0] = sample_time; - /* - * Check if percentage change exceeds the allowed trigger - * threshold; this will only happen if the sample time - * increases more than an acceptable amount; decreases will - * not cause a trigger (but will decrease the overall average, - * which can cause a trigger the next time). - * - * Note: We don't start triggering on the simple moving - * average until after we have enough samples for - * the delta to be statistically valid; this is - * defined to be MA_SMA_SAMPLES. - */ - if (map->ma_sma_samples[MA_SMA_SAMPLES-1] && ((int)((map->ma_sma * 100) / map->ma_old_sma)) > map->ma_sma_threshold) { - triggered |= KCO_MA_F_SMA; - map->ma_sma_trigger_count++; - } - } - - /* - * WEIGHTED MOVING AVERAGE - * - * Compute the weighted moving average. Do this by averaging over - * two values, one with a lesser weighting than the other; the lesser - * weighted value is the persistent historical value, whose sample - * weight decreases over time, the older the samples get. Be careful - * here to permit strict integer artimatic. - */ - if (map->ma_flags & KCO_MA_F_WMA) { - map->ma_old_wma = map->ma_wma; - - /* Prime the pump, if necessary */ - if (do_init) - map->ma_old_wma = sample_time; - - map->ma_wma = ((((map->ma_wma * 90) + sample_time * ((100*2) - 90))/100) / 2); - - /* - * Check if percentage change exceeds the allowed trigger - * threshold; this will only happen if the sample time - * increases more than an acceptable amount; decreases will - * not cause a trigger (but will decrease the overall average, - * which can cause a trigger the next time). - */ - if (((int)(((map->ma_wma * 100) / map->ma_old_wma))) > map->ma_wma_threshold) { - triggered |= KCO_MA_F_WMA; - map->ma_wma_trigger_count++; - } - } - - if (do_init) - map->ma_flags &= ~KCO_MA_F_NEEDS_INIT; - - return (triggered); -} diff --git a/bsd/kern/kern_control.c b/bsd/kern/kern_control.c index 92b15bc40..ae00ee73b 100644 --- a/bsd/kern/kern_control.c +++ b/bsd/kern/kern_control.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -654,34 +654,38 @@ ctl_ioctl(__unused struct socket *so, u_long cmd, caddr_t data, /* get the number of controllers */ case CTLIOCGCOUNT: { struct kctl *kctl; - int n = 0; + u_int32_t n = 0; lck_mtx_lock(ctl_mtx); TAILQ_FOREACH(kctl, &ctl_head, next) n++; lck_mtx_unlock(ctl_mtx); - - *(u_int32_t *)data = n; + + bcopy(&n, data, sizeof (n)); error = 0; break; } case CTLIOCGINFO: { - struct ctl_info *ctl_info = (struct ctl_info *)data; + struct ctl_info ctl_info; struct kctl *kctl = 0; - size_t name_len = strlen(ctl_info->ctl_name); - + size_t name_len; + + bcopy(data, &ctl_info, sizeof (ctl_info)); + name_len = strnlen(ctl_info.ctl_name, MAX_KCTL_NAME); + if (name_len == 0 || name_len + 1 > MAX_KCTL_NAME) { error = EINVAL; break; } lck_mtx_lock(ctl_mtx); - kctl = ctl_find_by_name(ctl_info->ctl_name); + kctl = ctl_find_by_name(ctl_info.ctl_name); lck_mtx_unlock(ctl_mtx); if (kctl == 0) { error = ENOENT; break; } - ctl_info->ctl_id = kctl->id; + ctl_info.ctl_id = kctl->id; + bcopy(&ctl_info, data, sizeof (ctl_info)); error = 0; break; } diff --git a/bsd/kern/kern_core.c b/bsd/kern/kern_core.c index cf63621d9..1c3093e4d 100644 --- a/bsd/kern/kern_core.c +++ b/bsd/kern/kern_core.c @@ -211,9 +211,9 @@ coredump(proc_t core_proc) int command_size, header_size, tstate_size; int hoffset; off_t foffset; - vm_map_offset_t vmoffset; + mach_vm_offset_t vmoffset; vm_offset_t header; - vm_map_size_t vmsize; + mach_vm_size_t vmsize; vm_prot_t prot; vm_prot_t maxprot; vm_inherit_t inherit; diff --git a/bsd/kern/kern_credential.c b/bsd/kern/kern_credential.c index 4a6ee0386..7569cf5eb 100644 --- a/bsd/kern/kern_credential.c +++ b/bsd/kern/kern_credential.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2010 Apple Inc. All rights reserved. + * Copyright (c) 2004-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -48,21 +48,16 @@ #include #include #include +#include #include #include #include /* For manifest constants in posix_cred_access */ #include -#include #include #include -/* mach_absolute_time() */ -#include -#include -#include - #include #include @@ -76,6 +71,7 @@ #if CONFIG_MACF #include #include +#include #endif void mach_kauth_cred_uthread_update( void ); @@ -130,6 +126,7 @@ cred_debug_buffer * cred_debug_buf_p = NULL; #endif /* !DEBUG_CRED */ +#if CONFIG_EXT_RESOLVER /* * Interface to external identity resolver. 
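Stepping back to the kern_control.c hunk above: CTLIOCGINFO used to cast the ioctl buffer and write through it in place; it now bcopy()s into a stack copy, validates with strnlen(), and copies the whole reply back in one step. The shape of that pattern, as a user-space sketch with a made-up message structure:

    #include <string.h>

    struct info_msg {
        unsigned long id;
        char          name[96];
    };

    static int
    handle_info(void *data)  /* 'data' is an unaligned, caller-owned buffer */
    {
        struct info_msg msg;

        memcpy(&msg, data, sizeof(msg));   /* never dereference in place */
        if (strnlen(msg.name, sizeof(msg.name)) == sizeof(msg.name))
            return -1;                     /* unterminated name: reject */
        msg.id = 42;                       /* ...do the actual lookup... */
        memcpy(data, &msg, sizeof(msg));   /* publish the whole reply */
        return 0;
    }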
* @@ -153,7 +150,6 @@ struct kauth_resolver_work { struct kauth_identity_extlookup kr_work; uint64_t kr_extend; uint32_t kr_seqno; - uint64_t kr_subtime; /* submission time */ int kr_refs; int kr_flags; #define KAUTH_REQUEST_UNSUBMITTED (1<<0) @@ -166,11 +162,82 @@ TAILQ_HEAD(kauth_resolver_unsubmitted_head, kauth_resolver_work) kauth_resolver_ TAILQ_HEAD(kauth_resolver_submitted_head, kauth_resolver_work) kauth_resolver_submitted; TAILQ_HEAD(kauth_resolver_done_head, kauth_resolver_work) kauth_resolver_done; +/* Number of resolver timeouts between logged complaints */ +#define KAUTH_COMPLAINT_INTERVAL 1000 +int kauth_resolver_timeout_cnt = 0; + static int kauth_resolver_submit(struct kauth_identity_extlookup *lkp, uint64_t extend_data); static int kauth_resolver_complete(user_addr_t message); static int kauth_resolver_getwork(user_addr_t message); static int kauth_resolver_getwork2(user_addr_t message); +#define KAUTH_CACHES_MAX_SIZE 10000 /* Max # entries for both groups and id caches */ + +struct kauth_identity { + TAILQ_ENTRY(kauth_identity) ki_link; + int ki_valid; + uid_t ki_uid; + gid_t ki_gid; + guid_t ki_guid; + ntsid_t ki_ntsid; + const char *ki_name; /* string name from string cache */ + /* + * Expiry times are the earliest time at which we will disregard the + * cached state and go to userland. Before then if the valid bit is + * set, we will return the cached value. If it's not set, we will + * not go to userland to resolve, just assume that there is no answer + * available. + */ + time_t ki_guid_expiry; + time_t ki_ntsid_expiry; +}; + +static TAILQ_HEAD(kauth_identity_head, kauth_identity) kauth_identities; +static lck_mtx_t *kauth_identity_mtx; +#define KAUTH_IDENTITY_LOCK() lck_mtx_lock(kauth_identity_mtx); +#define KAUTH_IDENTITY_UNLOCK() lck_mtx_unlock(kauth_identity_mtx); +#define KAUTH_IDENTITY_CACHEMAX_DEFAULT 100 /* XXX default sizing? 
*/ +static int kauth_identity_cachemax = KAUTH_IDENTITY_CACHEMAX_DEFAULT; +static int kauth_identity_count; + +static struct kauth_identity *kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry, + ntsid_t *ntsidp, time_t ntsid_expiry, const char *name, int nametype); +static void kauth_identity_register_and_free(struct kauth_identity *kip); +static void kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_identity *kip, uint64_t extend_data); +static void kauth_identity_trimcache(int newsize); +static void kauth_identity_lru(struct kauth_identity *kip); +static int kauth_identity_guid_expired(struct kauth_identity *kip); +static int kauth_identity_ntsid_expired(struct kauth_identity *kip); +static int kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir, char *getname); +static int kauth_identity_find_gid(gid_t gid, struct kauth_identity *kir, char *getname); +static int kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir, char *getname); +static int kauth_identity_find_ntsid(ntsid_t *ntsid, struct kauth_identity *kir, char *getname); +static int kauth_identity_find_nam(char *name, int valid, struct kauth_identity *kir); + +struct kauth_group_membership { + TAILQ_ENTRY(kauth_group_membership) gm_link; + uid_t gm_uid; /* the identity whose membership we're recording */ + gid_t gm_gid; /* group of which they are a member */ + time_t gm_expiry; /* TTL for the membership, or 0 for persistent entries */ + int gm_flags; +#define KAUTH_GROUP_ISMEMBER (1<<0) +}; + +TAILQ_HEAD(kauth_groups_head, kauth_group_membership) kauth_groups; +static lck_mtx_t *kauth_groups_mtx; +#define KAUTH_GROUPS_LOCK() lck_mtx_lock(kauth_groups_mtx); +#define KAUTH_GROUPS_UNLOCK() lck_mtx_unlock(kauth_groups_mtx); +#define KAUTH_GROUPS_CACHEMAX_DEFAULT 100 /* XXX default sizing? */ +static int kauth_groups_cachemax = KAUTH_GROUPS_CACHEMAX_DEFAULT; +static int kauth_groups_count; + +static int kauth_groups_expired(struct kauth_group_membership *gm); +static void kauth_groups_lru(struct kauth_group_membership *gm); +static void kauth_groups_updatecache(struct kauth_identity_extlookup *el); +static void kauth_groups_trimcache(int newsize); + +#endif /* CONFIG_EXT_RESOLVER */ + static const int kauth_cred_primes[KAUTH_CRED_PRIMES_COUNT] = KAUTH_CRED_PRIMES; static int kauth_cred_primes_index = 0; static int kauth_cred_table_size = 0; @@ -178,9 +245,6 @@ static int kauth_cred_table_size = 0; TAILQ_HEAD(kauth_cred_entry_head, ucred); static struct kauth_cred_entry_head * kauth_cred_table_anchor = NULL; -/* Weighted moving average for resolver response time */ -static struct kco_moving_average resolver_ma; - #define KAUTH_CRED_HASH_DEBUG 0 static int kauth_cred_add(kauth_cred_t new_cred); @@ -196,7 +260,7 @@ static void kauth_cred_hash_print(void); static void kauth_cred_print(kauth_cred_t cred); #endif - +#if CONFIG_EXT_RESOLVER /* * kauth_resolver_init * @@ -206,7 +270,7 @@ static void kauth_cred_print(kauth_cred_t cred); * * Returns: (void) * - * Notes: Intialize the credential identity resolver for use; the + * Notes: Initialize the credential identity resolver for use; the * credential identity resolver is the KPI used by the user * space credential identity resolver daemon to communicate * with the kernel via the identitysvc() system call.. 
@@ -232,11 +296,6 @@ kauth_resolver_init(void) TAILQ_INIT(&kauth_resolver_done); kauth_resolver_sequence = 31337; kauth_resolver_mtx = lck_mtx_alloc_init(kauth_lck_grp, 0/*LCK_ATTR_NULL*/); - - /* - * 110% of average response time is "too long" and should be reported - */ - kco_ma_init(&resolver_ma, 110, KCO_MA_F_WMA); } @@ -283,7 +342,6 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp, uint64_t extend_data struct kauth_resolver_work *workp, *killp; struct timespec ts; int error, shouldfree; - uint64_t duration; /* no point actually blocking if the resolver isn't up yet */ if (kauth_resolver_identity == 0) { @@ -326,7 +384,7 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp, uint64_t extend_data workp->kr_work.el_result = KAUTH_EXTLOOKUP_INPROG; /* - * XXX We *MUST NOT* attempt to coelesce identical work items due to + * XXX We *MUST NOT* attempt to coalesce identical work items due to * XXX the inability to ensure order of update of the request item * XXX extended data vs. the wakeup; instead, we let whoever is waiting * XXX for each item repeat the update when they wake up. @@ -335,7 +393,7 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp, uint64_t extend_data /* * Wake up an external resolver thread to deal with the new work; one - * may not be available, and if not, then the request will be grabed + * may not be available, and if not, then the request will be grabbed * when a resolver thread comes back into the kernel to request new * work. */ @@ -358,57 +416,44 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp, uint64_t extend_data break; } - /* - * Update the moving average of how long the request took; if it - * took longer than the time threshold, then we complain about it - * being slow. - */ - duration = mach_absolute_time() - workp->kr_subtime; - if (kco_ma_addsample(&resolver_ma, duration)) { - uint64_t average; - uint64_t old_average; - int32_t threshold; - int count; - - /* If we can't get information, don't log anything */ - if (kco_ma_info(&resolver_ma, KCO_MA_F_WMA, &average, &old_average, &threshold, &count)) { - char pname[MAXCOMLEN+1] = "(NULL)"; - proc_name(kauth_resolver_identity, pname, sizeof(pname)); - // printf("kauth_resolver_submit: External resolver pid %d (name %s) response time %lld, average %lld new %lld threshold %d%% actual %d%% count %d\n", kauth_resolver_identity, pname, duration, old_average, average, threshold, (int)((duration * 100) / old_average), count); - } - } - /* if the request was processed, copy the result */ if (error == 0) *lkp = workp->kr_work; - /* - * If the request timed out and was never collected, the resolver - * is dead and probably not coming back anytime soon. In this - * case we revert to no-resolver behaviour, and punt all the other - * sleeping requests to clear the backlog. - */ - if ((error == EWOULDBLOCK) && (workp->kr_flags & KAUTH_REQUEST_UNSUBMITTED)) { - KAUTH_DEBUG("RESOLVER - request timed out without being collected for processing, resolver dead"); + if (error == EWOULDBLOCK) { + if ((kauth_resolver_timeout_cnt++ % KAUTH_COMPLAINT_INTERVAL) == 0) { + printf("kauth external resolver timed out (%d timeout(s) of %d seconds).\n", + kauth_resolver_timeout_cnt, kauth_resolver_timeout); + } + + if (workp->kr_flags & KAUTH_REQUEST_UNSUBMITTED) { + /* + * If the request timed out and was never collected, the resolver + * is dead and probably not coming back anytime soon. 
In this + * case we revert to no-resolver behaviour, and punt all the other + * sleeping requests to clear the backlog. + */ + KAUTH_DEBUG("RESOLVER - request timed out without being collected for processing, resolver dead"); + + /* + * Make the current resolver non-authoritative, and mark it as + * no longer registered to prevent kauth_cred_ismember_gid() + * enqueueing more work until a new one is registered. This + * mitigates the damage a crashing resolver may inflict. + */ + kauth_resolver_identity = 0; + kauth_resolver_registered = 0; + + /* kill all the other requests that are waiting as well */ + TAILQ_FOREACH(killp, &kauth_resolver_submitted, kr_link) + wakeup(killp); + TAILQ_FOREACH(killp, &kauth_resolver_unsubmitted, kr_link) + wakeup(killp); + /* Cause all waiting-for-work threads to return EIO */ + wakeup((caddr_t)&kauth_resolver_unsubmitted); + } + } - /* - * Make the current resolver non-authoritative, and mark it as - * no longer registered to prevent kauth_cred_ismember_gid() - * enqueueing more work until a new one is registered. This - * mitigates the damage a crashing resolver may inflict. - */ - kauth_resolver_identity = 0; - kauth_resolver_registered = 0; - - /* kill all the other requestes that are waiting as well */ - TAILQ_FOREACH(killp, &kauth_resolver_submitted, kr_link) - wakeup(killp); - TAILQ_FOREACH(killp, &kauth_resolver_unsubmitted, kr_link) - wakeup(killp); - /* Cause all waiting-for-work threads to return EIO */ - wakeup((caddr_t)&kauth_resolver_unsubmitted); - } - /* * drop our reference on the work item, and note whether we should * free it or not @@ -477,6 +522,7 @@ identitysvc(__unused struct proc *p, struct identitysvc_args *uap, __unused int3 int opcode = uap->opcode; user_addr_t message = uap->message; struct kauth_resolver_work *workp; + struct kauth_cache_sizes sz_arg; int error; pid_t new_id; @@ -524,8 +570,51 @@ identitysvc(__unused struct proc *p, struct identitysvc_args *uap, __unused int3 KAUTH_DEBUG("RESOLVER - call from bogus resolver %d\n", current_proc()->p_pid); return(EPERM); } + + if (opcode == KAUTH_GET_CACHE_SIZES) { + KAUTH_IDENTITY_LOCK(); + sz_arg.kcs_id_size = kauth_identity_cachemax; + KAUTH_IDENTITY_UNLOCK(); + + KAUTH_GROUPS_LOCK(); + sz_arg.kcs_group_size = kauth_groups_cachemax; + KAUTH_GROUPS_UNLOCK(); - if (opcode == KAUTH_EXTLOOKUP_DEREGISTER) { + if ((error = copyout(&sz_arg, uap->message, sizeof (sz_arg))) != 0) { + return (error); + } + + return (0); + } else if (opcode == KAUTH_SET_CACHE_SIZES) { + if ((error = copyin(uap->message, &sz_arg, sizeof (sz_arg))) != 0) { + return (error); + } + + if ((sz_arg.kcs_group_size > KAUTH_CACHES_MAX_SIZE) || + (sz_arg.kcs_id_size > KAUTH_CACHES_MAX_SIZE)) { + return (EINVAL); + } + + KAUTH_IDENTITY_LOCK(); + kauth_identity_cachemax = sz_arg.kcs_id_size; + kauth_identity_trimcache(kauth_identity_cachemax); + KAUTH_IDENTITY_UNLOCK(); + + KAUTH_GROUPS_LOCK(); + kauth_groups_cachemax = sz_arg.kcs_group_size; + kauth_groups_trimcache(kauth_groups_cachemax); + KAUTH_GROUPS_UNLOCK(); + + return (0); + } else if (opcode == KAUTH_CLEAR_CACHES) { + KAUTH_IDENTITY_LOCK(); + kauth_identity_trimcache(0); + KAUTH_IDENTITY_UNLOCK(); + + KAUTH_GROUPS_LOCK(); + kauth_groups_trimcache(0); + KAUTH_GROUPS_UNLOCK(); + } else if (opcode == KAUTH_EXTLOOKUP_DEREGISTER) { /* * Terminate outstanding requests; without an authoritative * resolver, we are now back on our own authority. 
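The timeout path added above complains once per KAUTH_COMPLAINT_INTERVAL timeouts rather than on every one, so a dead resolver cannot flood the console. The idiom in isolation (illustrative names, plain C):

    #include <stdio.h>

    #define COMPLAINT_INTERVAL 1000

    static int timeout_cnt = 0;

    /*
     * The modulo test runs against the pre-increment value, so the very
     * first timeout is always reported; thereafter one line is emitted
     * per COMPLAINT_INTERVAL events.
     */
    static void
    note_timeout(int timeout_secs)
    {
        if ((timeout_cnt++ % COMPLAINT_INTERVAL) == 0)
            printf("resolver timed out (%d timeout(s) of %d seconds)\n",
                timeout_cnt, timeout_secs);
    }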
@@ -638,7 +727,7 @@ kauth_resolver_getwork_continue(int result) * EFAULT Bad user space message address * * Notes: This common function exists to permit the use of continuations - * in the identity resoultion process. This frees up the stack + * in the identity resolution process. This frees up the stack * while we are waiting for the user space resolver to complete * a request. This is specifically used so that our per thread * cost can be small, and we will therefore be willing to run a @@ -708,7 +797,6 @@ kauth_resolver_getwork2(user_addr_t message) TAILQ_REMOVE(&kauth_resolver_unsubmitted, workp, kr_link); workp->kr_flags &= ~KAUTH_REQUEST_UNSUBMITTED; workp->kr_flags |= KAUTH_REQUEST_SUBMITTED; - workp->kr_subtime = mach_absolute_time(); TAILQ_INSERT_TAIL(&kauth_resolver_submitted, workp, kr_link); out: @@ -734,7 +822,7 @@ out: * identity resolution daemon makes a request for work. This * permits a large number of threads to be used by the daemon, * without using a lot of wired kernel memory when there are no - * acctual request outstanding. + * actual requests outstanding. */ static int kauth_resolver_getwork(user_addr_t message) @@ -925,60 +1013,21 @@ kauth_resolver_complete(user_addr_t message) return(error); } +#endif /* CONFIG_EXT_RESOLVER */ /* * Identity cache. */ -struct kauth_identity { - TAILQ_ENTRY(kauth_identity) ki_link; - int ki_valid; #define KI_VALID_UID (1<<0) /* UID and GID are mutually exclusive */ #define KI_VALID_GID (1<<1) #define KI_VALID_GUID (1<<2) #define KI_VALID_NTSID (1<<3) #define KI_VALID_PWNAM (1<<4) /* Used for translation */ #define KI_VALID_GRNAM (1<<5) /* Used for translation */ - uid_t ki_uid; - gid_t ki_gid; - guid_t ki_guid; - ntsid_t ki_ntsid; - const char *ki_name; /* string name from string cache */ - /* - * Expiry times are the earliest time at which we will disregard the - * cached state and go to userland. Before then if the valid bit is - * set, we will return the cached value. If it's not set, we will - * not go to userland to resolve, just assume that there is no answer - * available. - */ - time_t ki_guid_expiry; - time_t ki_ntsid_expiry; -}; - -static TAILQ_HEAD(kauth_identity_head, kauth_identity) kauth_identities; -#define KAUTH_IDENTITY_CACHEMAX 100 /* XXX sizing? 
*/ -static int kauth_identity_count; - -static lck_mtx_t *kauth_identity_mtx; -#define KAUTH_IDENTITY_LOCK() lck_mtx_lock(kauth_identity_mtx); -#define KAUTH_IDENTITY_UNLOCK() lck_mtx_unlock(kauth_identity_mtx); - - -static struct kauth_identity *kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry, - ntsid_t *ntsidp, time_t ntsid_expiry, const char *name, int nametype); -static void kauth_identity_register_and_free(struct kauth_identity *kip); -static void kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_identity *kip, uint64_t extend_data); -static void kauth_identity_lru(struct kauth_identity *kip); -static int kauth_identity_guid_expired(struct kauth_identity *kip); -static int kauth_identity_ntsid_expired(struct kauth_identity *kip); -static int kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir, char *getname); -static int kauth_identity_find_gid(gid_t gid, struct kauth_identity *kir, char *getname); -static int kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir, char *getname); -static int kauth_identity_find_ntsid(ntsid_t *ntsid, struct kauth_identity *kir, char *getname); -static int kauth_identity_find_nam(char *name, int valid, struct kauth_identity *kir); - +#if CONFIG_EXT_RESOLVER /* * kauth_identity_init * @@ -988,7 +1037,7 @@ static int kauth_identity_find_nam(char *name, int valid, struct kauth_identity * * Returns: (void) * - * Notes: Intialize the credential identity resolver for use; the + * Notes: Initialize the credential identity resolver for use; the * credential identity resolver is the KPI used to communicate * with a user space credential identity resolver daemon. * @@ -1013,7 +1062,7 @@ kauth_identity_init(void) * * Returns: NULL Insufficient memory to satisfy * the request - * !NULL A pointer to the applocated + * !NULL A pointer to the allocated * structure, filled in * * Notes: It is illegal to translate between UID and GID; any given UUID @@ -1125,7 +1174,7 @@ kauth_identity_register_and_free(struct kauth_identity *kip) * if it pushes us over our limit, discard the oldest one. */ TAILQ_INSERT_HEAD(&kauth_identities, kip, ki_link); - if (++kauth_identity_count > KAUTH_IDENTITY_CACHEMAX) { + if (++kauth_identity_count > kauth_identity_cachemax) { ip = TAILQ_LAST(&kauth_identities, kauth_identity_head); TAILQ_REMOVE(&kauth_identities, ip, ki_link); kauth_identity_count--; @@ -1203,12 +1252,12 @@ kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_id kip->ki_guid = elp->el_uguid; kip->ki_valid |= KI_VALID_GUID; } - kip->ki_guid_expiry = tv.tv_sec + elp->el_uguid_valid; + kip->ki_guid_expiry = (elp->el_uguid_valid) ? tv.tv_sec + elp->el_uguid_valid : 0; if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_USID) { kip->ki_ntsid = elp->el_usid; kip->ki_valid |= KI_VALID_NTSID; } - kip->ki_ntsid_expiry = tv.tv_sec + elp->el_usid_valid; + kip->ki_ntsid_expiry = (elp->el_usid_valid) ? tv.tv_sec + elp->el_usid_valid : 0; if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_PWNAM) { const char *oname = kip->ki_name; kip->ki_name = speculative_name; @@ -1234,9 +1283,9 @@ kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_id if (kip == NULL) { kip = kauth_identity_alloc(elp->el_uid, KAUTH_GID_NONE, (elp->el_flags & KAUTH_EXTLOOKUP_VALID_UGUID) ? &elp->el_uguid : NULL, - tv.tv_sec + elp->el_uguid_valid, + (elp->el_uguid_valid) ? tv.tv_sec + elp->el_uguid_valid : 0, (elp->el_flags & KAUTH_EXTLOOKUP_VALID_USID) ? 
&elp->el_usid : NULL, - tv.tv_sec + elp->el_usid_valid, + (elp->el_usid_valid) ? tv.tv_sec + elp->el_usid_valid : 0, (elp->el_flags & KAUTH_EXTLOOKUP_VALID_PWNAM) ? speculative_name : NULL, KI_VALID_PWNAM); if (kip != NULL) { @@ -1260,12 +1309,12 @@ kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_id kip->ki_guid = elp->el_gguid; kip->ki_valid |= KI_VALID_GUID; } - kip->ki_guid_expiry = tv.tv_sec + elp->el_gguid_valid; + kip->ki_guid_expiry = (elp->el_gguid_valid) ? tv.tv_sec + elp->el_gguid_valid : 0; if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GSID) { kip->ki_ntsid = elp->el_gsid; kip->ki_valid |= KI_VALID_NTSID; } - kip->ki_ntsid_expiry = tv.tv_sec + elp->el_gsid_valid; + kip->ki_ntsid_expiry = (elp->el_gsid_valid) ? tv.tv_sec + elp->el_gsid_valid : 0; if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GRNAM) { const char *oname = kip->ki_name; kip->ki_name = speculative_name; @@ -1291,9 +1340,9 @@ kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_id if (kip == NULL) { kip = kauth_identity_alloc(KAUTH_UID_NONE, elp->el_gid, (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GGUID) ? &elp->el_gguid : NULL, - tv.tv_sec + elp->el_gguid_valid, + (elp->el_gguid_valid) ? tv.tv_sec + elp->el_gguid_valid : 0, (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GSID) ? &elp->el_gsid : NULL, - tv.tv_sec + elp->el_gsid_valid, + (elp->el_gsid_valid) ? tv.tv_sec + elp->el_gsid_valid : 0, (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GRNAM) ? speculative_name : NULL, KI_VALID_GRNAM); if (kip != NULL) { @@ -1314,6 +1363,25 @@ kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_id } +/* + * Trim older entries from the identity cache. + * + * Must be called with the identity cache lock held. + */ +static void +kauth_identity_trimcache(int newsize) { + struct kauth_identity *kip; + + lck_mtx_assert(kauth_identity_mtx, LCK_MTX_ASSERT_OWNED); + + while (kauth_identity_count > newsize) { + kip = TAILQ_LAST(&kauth_identities, kauth_identity_head); + TAILQ_REMOVE(&kauth_identities, kip, ki_link); + kauth_identity_count--; + FREE(kip, M_KAUTH); + } +} + /* * kauth_identity_lru * @@ -1357,8 +1425,15 @@ kauth_identity_guid_expired(struct kauth_identity *kip) { struct timeval tv; + /* + * Expiration time of 0 means this entry is persistent. + */ + if (kip->ki_guid_expiry == 0) + return (0); + microuptime(&tv); KAUTH_DEBUG("CACHE - GUID expires @ %d now %d", kip->ki_guid_expiry, tv.tv_sec); + return((kip->ki_guid_expiry <= tv.tv_sec) ? 1 : 0); } @@ -1379,8 +1454,15 @@ kauth_identity_ntsid_expired(struct kauth_identity *kip) { struct timeval tv; + /* + * Expiration time of 0 means this entry is persistent. + */ + if (kip->ki_ntsid_expiry == 0) + return (0); + microuptime(&tv); KAUTH_DEBUG("CACHE - NTSID expires @ %d now %d", kip->ki_ntsid_expiry, tv.tv_sec); + return((kip->ki_ntsid_expiry <= tv.tv_sec) ? 1 : 0); } @@ -1505,7 +1587,7 @@ kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir, char *getnam * * Parameters: name Pointer to name to find * valid KI_VALID_PWNAM or KI_VALID_GRNAM - * kir Pointer to return aread + * kir Pointer to return area * * Returns: 0 Found * ENOENT Not found @@ -1570,6 +1652,7 @@ kauth_identity_find_ntsid(ntsid_t *ntsid, struct kauth_identity *kir, char *getn KAUTH_IDENTITY_UNLOCK(); return((kip == NULL) ? 
ENOENT : 0); } +#endif /* CONFIG_EXT_RESOLVER */ /* @@ -1586,7 +1669,7 @@ guid_t kauth_null_guid; * Parameters: guid1 Pointer to first GUID * guid2 Pointer to second GUID * - * Returns: 0 If GUIDs are inequal + * Returns: 0 If GUIDs are unequal * !0 If GUIDs are equal */ int @@ -1603,7 +1686,7 @@ kauth_guid_equal(guid_t *guid1, guid_t *guid2) * * Parameters: guid Pointer to GUID to check * - * Returns: KAUTH_WKG_NOT Not a wel known GUID + * Returns: KAUTH_WKG_NOT Not a well known GUID * KAUTH_WKG_EVERYBODY "Everybody" * KAUTH_WKG_NOBODY "Nobody" * KAUTH_WKG_OWNER "Other" @@ -1642,10 +1725,10 @@ kauth_wellknown_guid(guid_t *guid) * * Description: Determine the equality of two NTSIDs (NT Security Identifiers) * - * Paramters: sid1 Pointer to first NTSID + * Parameters: sid1 Pointer to first NTSID * sid2 Pointer to second NTSID * - * Returns: 0 If GUIDs are inequal + * Returns: 0 If GUIDs are unequal * !0 If GUIDs are equal */ int @@ -1673,7 +1756,6 @@ kauth_ntsid_equal(ntsid_t *sid1, ntsid_t *sid2) * be done using it. */ -static int kauth_cred_cache_lookup(int from, int to, void *src, void *dst); /* @@ -1860,6 +1942,21 @@ kauth_cred_getsvgid(kauth_cred_t cred) } +static int kauth_cred_cache_lookup(int from, int to, void *src, void *dst); + +#if CONFIG_EXT_RESOLVER == 0 +/* + * If there's no resolver, short-circuit the kauth_cred_x2y() lookups. + */ +static __inline int +kauth_cred_cache_lookup(__unused int from, __unused int to, + __unused void *src, __unused void *dst) +{ + return (EWOULDBLOCK); + +} +#endif + /* * kauth_cred_guid2pwnam * @@ -2225,6 +2322,7 @@ kauth_cred_guid2ntsid(guid_t *guidp, ntsid_t *sidp) * Returns: 0 Success * EINVAL Unknown source identity type */ +#if CONFIG_EXT_RESOLVER static int kauth_cred_cache_lookup(int from, int to, void *src, void *dst) { @@ -2325,6 +2423,7 @@ kauth_cred_cache_lookup(int from, int to, void *src, void *dst) /* do we have a translation? */ if (ki.ki_valid & to) { KAUTH_DEBUG("CACHE - found matching entry with valid 0x%08x", ki.ki_valid); + DTRACE_PROC4(kauth__identity__cache__hit, int, from, int, to, void *, src, void *, dst); goto found; } else { /* @@ -2425,7 +2524,13 @@ kauth_cred_cache_lookup(int from, int to, void *src, void *dst) /* Call resolver */ KAUTH_DEBUG("CACHE - calling resolver for %x", el.el_flags); + + DTRACE_PROC3(kauth__id__resolver__submitted, int, from, int, to, uintptr_t, src); + error = kauth_resolver_submit(&el, extend_data); + + DTRACE_PROC2(kauth__id__resolver__returned, int, error, struct kauth_identity_extlookup *, &el) + KAUTH_DEBUG("CACHE - resolver returned %d", error); /* was the external lookup successful? */ @@ -2490,28 +2595,6 @@ found: * XXX the linked-list implementation here needs to be optimized. */ -struct kauth_group_membership { - TAILQ_ENTRY(kauth_group_membership) gm_link; - uid_t gm_uid; /* the identity whose membership we're recording */ - gid_t gm_gid; /* group of which they are a member */ - time_t gm_expiry; /* TTL for the membership */ - int gm_flags; -#define KAUTH_GROUP_ISMEMBER (1<<0) -}; - -TAILQ_HEAD(kauth_groups_head, kauth_group_membership) kauth_groups; -#define KAUTH_GROUPS_CACHEMAX 100 /* XXX sizing? 
*/ -static int kauth_groups_count; - -static lck_mtx_t *kauth_groups_mtx; -#define KAUTH_GROUPS_LOCK() lck_mtx_lock(kauth_groups_mtx); -#define KAUTH_GROUPS_UNLOCK() lck_mtx_unlock(kauth_groups_mtx); - -static int kauth_groups_expired(struct kauth_group_membership *gm); -static void kauth_groups_lru(struct kauth_group_membership *gm); -static void kauth_groups_updatecache(struct kauth_identity_extlookup *el); - - /* * kauth_groups_init * @@ -2521,7 +2604,7 @@ static void kauth_groups_updatecache(struct kauth_identity_extlookup *el); * * Returns: (void) * - * Notes: Intialize the groups cache for use; the group cache is used + * Notes: Initialize the groups cache for use; the group cache is used * to avoid unnecessary calls out to user space. * * This function is called from kauth_init() in the file @@ -2551,7 +2634,14 @@ kauth_groups_expired(struct kauth_group_membership *gm) { struct timeval tv; + /* + * Expiration time of 0 means this entry is persistent. + */ + if (gm->gm_expiry == 0) + return (0); + microuptime(&tv); + return((gm->gm_expiry <= tv.tv_sec) ? 1 : 0); } @@ -2623,7 +2713,7 @@ kauth_groups_updatecache(struct kauth_identity_extlookup *el) } else { gm->gm_flags &= ~KAUTH_GROUP_ISMEMBER; } - gm->gm_expiry = el->el_member_valid + tv.tv_sec; + gm->gm_expiry = (el->el_member_valid) ? el->el_member_valid + tv.tv_sec : 0; kauth_groups_lru(gm); break; } @@ -2644,7 +2734,7 @@ kauth_groups_updatecache(struct kauth_identity_extlookup *el) } else { gm->gm_flags &= ~KAUTH_GROUP_ISMEMBER; } - gm->gm_expiry = el->el_member_valid + tv.tv_sec; + gm->gm_expiry = (el->el_member_valid) ? el->el_member_valid + tv.tv_sec : 0; } /* @@ -2655,7 +2745,7 @@ kauth_groups_updatecache(struct kauth_identity_extlookup *el) */ KAUTH_GROUPS_LOCK(); TAILQ_INSERT_HEAD(&kauth_groups, gm, gm_link); - if (kauth_groups_count++ > KAUTH_GROUPS_CACHEMAX) { + if (++kauth_groups_count > kauth_groups_cachemax) { gm = TAILQ_LAST(&kauth_groups, kauth_groups_head); TAILQ_REMOVE(&kauth_groups, gm, gm_link); kauth_groups_count--; @@ -2669,6 +2759,25 @@ kauth_groups_updatecache(struct kauth_identity_extlookup *el) FREE(gm, M_KAUTH); } +/* + * Trim older entries from the group membership cache. + * + * Must be called with the group cache lock held. + */ +static void +kauth_groups_trimcache(int new_size) { + struct kauth_group_membership *gm; + + lck_mtx_assert(kauth_groups_mtx, LCK_MTX_ASSERT_OWNED); + + while (kauth_groups_count > new_size) { + gm = TAILQ_LAST(&kauth_groups, kauth_groups_head); + TAILQ_REMOVE(&kauth_groups, gm, gm_link); + kauth_groups_count--; + FREE(gm, M_KAUTH); + } +} +#endif /* CONFIG_EXT_RESOLVER */ /* * Group membership KPI @@ -2687,7 +2796,7 @@ kauth_groups_updatecache(struct kauth_identity_extlookup *el) * result of the call * * Returns: 0 Success - * ENOENT Could not proform lookup + * ENOENT Could not perform lookup * kauth_resolver_submit:EWOULDBLOCK * kauth_resolver_submit:EINTR * kauth_resolver_submit:ENOMEM @@ -2702,16 +2811,14 @@ kauth_groups_updatecache(struct kauth_identity_extlookup *el) * Notes: This function guarantees not to modify resultp when returning * an error. * - * This function effectively checkes the EGID as well, since the + * This function effectively checks the EGID as well, since the * EGID is cr_groups[0] as an implementation detail. 
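A convention that recurs through the expiry changes above: a TTL of 0 now marks a cache entry as persistent, which is what the code stores when the resolver reports no validity period. Condensed into a sketch (generic types; a monotonic 'now' stands in for microuptime()):

    #include <time.h>

    struct cache_entry {
        time_t expiry;      /* absolute expiry time; 0 == persistent */
    };

    static int
    entry_expired(const struct cache_entry *e, time_t now)
    {
        if (e->expiry == 0)
            return 0;       /* persistent entries never expire */
        return (e->expiry <= now) ? 1 : 0;
    }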
*/ int kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) { posix_cred_t pcred = posix_cred_get(cred); - struct kauth_group_membership *gm; - struct kauth_identity_extlookup el; - int i, error; + int i; /* * Check the per-credential list of override groups. @@ -2735,7 +2842,11 @@ kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) return(0); } - +#if CONFIG_EXT_RESOLVER + struct kauth_group_membership *gm; + struct kauth_identity_extlookup el; + int error; + /* * If the resolver hasn't checked in yet, we are early in the boot * phase and the local group list is complete and authoritative. @@ -2744,7 +2855,7 @@ kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) *resultp = 0; return(0); } - + /* TODO: */ /* XXX check supplementary groups */ /* XXX check whiteout groups */ @@ -2767,9 +2878,11 @@ kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) KAUTH_GROUPS_UNLOCK(); /* if we did, we can return now */ - if (gm != NULL) + if (gm != NULL) { + DTRACE_PROC2(kauth__group__cache__hit, int, pcred->cr_gmuid, int, gid); return(0); - + } + /* nothing in the cache, need to go to userland */ bzero(&el, sizeof(el)); el.el_info_pid = current_proc()->p_pid; @@ -2777,7 +2890,13 @@ kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) el.el_uid = pcred->cr_gmuid; el.el_gid = gid; el.el_member_valid = 0; /* XXX set by resolver? */ + + DTRACE_PROC2(kauth__group__resolver__submitted, int, el.el_uid, int, el.el_gid); + error = kauth_resolver_submit(&el, 0ULL); + + DTRACE_PROC2(kauth__group__resolver__returned, int, error, int, el.el_flags); + if (error != 0) return(error); /* save the results from the lookup */ @@ -2790,9 +2909,12 @@ kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) } return(ENOENT); +#else + *resultp = 0; + return(0); +#endif } - /* * kauth_cred_ismember_guid * @@ -2820,15 +2942,11 @@ kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) * 0 Is not member */ int -kauth_cred_ismember_guid(kauth_cred_t cred, guid_t *guidp, int *resultp) +kauth_cred_ismember_guid(__unused kauth_cred_t cred, guid_t *guidp, int *resultp) { - struct kauth_identity ki; - gid_t gid; - int error, wkg; + int error = 0; - error = 0; - wkg = kauth_wellknown_guid(guidp); - switch(wkg) { + switch (kauth_wellknown_guid(guidp)) { case KAUTH_WKG_NOBODY: *resultp = 0; break; @@ -2836,6 +2954,10 @@ kauth_cred_ismember_guid(kauth_cred_t cred, guid_t *guidp, int *resultp) *resultp = 1; break; default: +#if CONFIG_EXT_RESOLVER + { + struct kauth_identity ki; + gid_t gid; #if 6603280 /* * Grovel the identity cache looking for this GUID. @@ -2884,6 +3006,11 @@ kauth_cred_ismember_guid(kauth_cred_t cred, guid_t *guidp, int *resultp) error = kauth_cred_ismember_gid(cred, gid, resultp); } } +#else /* CONFIG_EXT_RESOLVER */ + error = ENOENT; +#endif /* CONFIG_EXT_RESOLVER */ + break; + } return(error); } @@ -3670,7 +3797,7 @@ kauth_cred_setresgid(kauth_cred_t cred, gid_t rgid, gid_t egid, gid_t svgid) * Parameters: cred The original credential * groups Pointer to gid_t array which * contains the new group list - * groupcount The cound of valid groups which + * groupcount The count of valid groups which * are contained in 'groups' * gmuid KAUTH_UID_NONE -or- the new * group membership UID @@ -3693,7 +3820,7 @@ kauth_cred_setresgid(kauth_cred_t cred, gid_t rgid, gid_t egid, gid_t svgid) * that is returned to them, if it is not intended to be a * persistent reference. 
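kauth_cred_ismember_gid() above now resolves membership in tiers, and collapses to a constant "not a member" answer when CONFIG_EXT_RESOLVER is compiled out. A compilable sketch of that control flow; check_local(), cache_lookup() and resolver_ask() are hypothetical stand-ins for the per-credential group list, the kauth_groups cache, and kauth_resolver_submit():

    #include <errno.h>

    static int check_local(int gid, int *r)  { (void)gid; (void)r; return ENOENT; }
    static int cache_lookup(int gid, int *r) { (void)gid; (void)r; return ENOENT; }
    static int resolver_ask(int gid, int *r) { (void)gid; *r = 0; return 0; }

    static int
    is_member(int gid, int *resultp)
    {
        if (check_local(gid, resultp) == 0)
            return 0;               /* answered by the credential itself */
        if (cache_lookup(gid, resultp) == 0)
            return 0;               /* answered by the in-kernel cache */
        return resolver_ask(gid, resultp);  /* upcall to userland; may block */
    }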
* - * XXX: Changes are determined in ordinal order - if the caller pasess + * XXX: Changes are determined in ordinal order - if the caller passes * in the same groups list that is already present in the * credential, but the members are in a different order, even if * the EGID is not modified (i.e. cr_groups[0] is the same), it @@ -3753,7 +3880,7 @@ kauth_cred_setgroups(kauth_cred_t cred, gid_t *groups, int groupcount, uid_t gmu * XXX temporary, for NFS support until we can come up with a better * XXX enumeration/comparison mechanism * - * Notes: The return value exists to account for the possbility of a + * Notes: The return value exists to account for the possibility of a * kauth_cred_t without a POSIX label. This will be the case in * the future (see posix_cred_get() below, for more details). */ @@ -4326,12 +4453,12 @@ kauth_cred_ref(kauth_cred_t cred) * scoped to this compilation unit. * * This function destroys the contents of the pointer passed by - * the caller to prevent the caller accidently attempting to + * the caller to prevent the caller accidentally attempting to * release a given reference twice in error. * * The last reference is considered to be released when a release * of a credential of a reference count of 2 occurs; this is an - * intended effect, to take into accout the reference held by + * intended effect, to take into account the reference held by * the credential hash, which is released at the same time. */ static void @@ -4447,11 +4574,11 @@ kauth_cred_rele(kauth_cred_t cred) * referencing them, prior to making them visible in an externally * visible pointer (e.g. by adding them to the credential hash * cache) is the only legal time in which an existing credential - * can be safely iinitialized or modified directly. + * can be safely initialized or modified directly. * * After initialization, the caller is expected to call the * function kauth_cred_add() to add the credential to the hash - * cache, after which time it's frozen and becomes publically + * cache, after which time it's frozen and becomes publicly * visible. * * The release protocol depends on kauth_hash_add() being called @@ -4502,7 +4629,7 @@ kauth_cred_dup(kauth_cred_t cred) * result, the caller is responsible for dropping BOTH the * additional reference on the passed cred (if any), and the * credential returned by this function. The drop should be - * via the satnadr kauth_cred_unref() KPI. + * via the kauth_cred_unref() KPI. */ kauth_cred_t kauth_cred_copy_real(kauth_cred_t cred) @@ -4787,7 +4914,7 @@ kauth_cred_remove(kauth_cred_t cred) * hash cache * * Returns: NULL Not found - * !NULL Matching cedential already in + * !NULL Matching credential already in * cred hash cache * * Locks: Caller is expected to hold KAUTH_CRED_HASH_LOCK @@ -4822,21 +4949,15 @@ kauth_cred_find(kauth_cred_t cred) * don't worry about the label unless the flags in * either credential tell us to. */ - if ((found_pcred->cr_flags & CRF_MAC_ENFORCE) != 0 || - (pcred->cr_flags & CRF_MAC_ENFORCE) != 0) { - /* include the label pointer in the compare */ - match = (bcmp(&found_pcred->cr_uid, &pcred->cr_uid, - (sizeof(struct ucred) - - offsetof(struct ucred, cr_posix))) == 0); - } else { - /* flags have to match, but skip the label in bcmp */ - match = (found_pcred->cr_flags == pcred->cr_flags && - bcmp(&found_pcred->cr_uid, &pcred->cr_uid, - sizeof(struct posix_cred)) == 0 && - bcmp(&found_cred->cr_audit, &cred->cr_audit, - sizeof(cred->cr_audit)) == 0); - + match = (bcmp(found_pcred, pcred, sizeof (*pcred)) == 0) ? 
TRUE : FALSE; + match = match && ((bcmp(&found_cred->cr_audit, &cred->cr_audit, + sizeof(cred->cr_audit)) == 0) ? TRUE : FALSE); + if (((found_pcred->cr_flags & CRF_MAC_ENFORCE) != 0) || + ((pcred->cr_flags & CRF_MAC_ENFORCE) != 0)) { + match = match && mac_cred_label_compare(found_cred->cr_label, + cred->cr_label); } + if (match) { /* found a match */ return(found_cred); @@ -4901,17 +5022,16 @@ kauth_cred_get_hashkey(kauth_cred_t cred) posix_cred_t pcred = posix_cred_get(cred); u_long hash_key = 0; + hash_key = kauth_cred_hash((uint8_t *)&cred->cr_posix, + sizeof (struct posix_cred), + hash_key); + hash_key = kauth_cred_hash((uint8_t *)&cred->cr_audit, + sizeof(struct au_session), + hash_key); + if (pcred->cr_flags & CRF_MAC_ENFORCE) { - hash_key = kauth_cred_hash((uint8_t *)&cred->cr_posix, - sizeof(struct ucred) - offsetof(struct ucred, cr_posix), - hash_key); - } else { - /* skip label */ - hash_key = kauth_cred_hash((uint8_t *)&cred->cr_posix, - sizeof(struct posix_cred), - hash_key); - hash_key = kauth_cred_hash((uint8_t *)&cred->cr_audit, - sizeof(struct au_session), + hash_key = kauth_cred_hash((uint8_t *)cred->cr_label, + sizeof (struct label), hash_key); } return(hash_key); @@ -5286,9 +5406,9 @@ sysctl_dump_cred_backtraces( __unused struct sysctl_oid *oidp, __unused void *ar * attach a label to the new credential * * Notes: This function currently wraps kauth_cred_create(), and is the - * only consume of tht ill-fated function, apart from bsd_init(). + * only consumer of that ill-fated function, apart from bsd_init(). * It exists solely to support the NFS server code creation of - * credentials based on the over-the-wire RPC cals containing + * credentials based on the over-the-wire RPC calls containing * traditional POSIX credential information being tunneled to * the server host from the client machine. * @@ -5296,7 +5416,7 @@ sysctl_dump_cred_backtraces( __unused struct sysctl_oid *oidp, __unused void *ar * * In the short term, it creates a temporary credential, puts * the POSIX information from NFS into it, and then calls - * kauth_cred_create(), as an internal implementaiton detail. + * kauth_cred_create(), as an internal implementation detail. * * If we have to keep it around in the medium term, it will * create a new kauth_cred_t, then label it with a POSIX label @@ -5332,7 +5452,7 @@ posix_cred_create(posix_cred_t pcred) * this function will return a pointer to a posix_cred_t which * GRANTS all access (effectively, a "root" credential). This is * necessary to support legacy code which insists on tightly - * integrating POSIX credentails into its APIs, including, but + * integrating POSIX credentials into its APIs, including, but * not limited to, System V IPC mechanisms, POSIX IPC mechanisms, * NFSv3, signals, dtrace, and a large number of kauth routines * used to implement POSIX permissions related system calls. @@ -5369,13 +5489,13 @@ posix_cred_get(kauth_cred_t cred) * Returns: (void) * * Notes: This function is currently void in order to permit it to fit - * in with the currrent MACF framework label methods which allow - * labelling to fail silently. This is like acceptable for + * in with the current MACF framework label methods which allow + * labeling to fail silently. This is like acceptable for * mandatory access controls, but not for POSIX, since those * access controls are advisory. We will need to consider a * return value in a future version of the MACF API. 
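The kauth_cred_get_hashkey() rework above always hashes the POSIX region and the audit session, then folds the MAC label in only when CRF_MAC_ENFORCE is set, matching the new comparison logic in kauth_cred_find(). A sketch of region-chained hashing, using FNV-1a purely as a stand-in for kauth_cred_hash() and toy field sizes:

    #include <stdint.h>
    #include <stddef.h>

    static uint64_t
    hash_region(const void *buf, size_t len, uint64_t key)
    {
        const uint8_t *p = buf;

        if (key == 0)
            key = 14695981039346656037ULL;          /* FNV-1a offset basis */
        while (len--)
            key = (key ^ *p++) * 1099511628211ULL;  /* FNV prime */
        return key;
    }

    struct cred_like {
        uint32_t posix_part[4];  /* stands in for struct posix_cred */
        uint32_t audit_part[2];  /* stands in for struct au_session */
        uint32_t label;          /* stands in for the MAC label */
        int      mac_enforce;
    };

    static uint64_t
    cred_hashkey(const struct cred_like *c)
    {
        uint64_t key = 0;

        key = hash_region(c->posix_part, sizeof(c->posix_part), key);
        key = hash_region(c->audit_part, sizeof(c->audit_part), key);
        if (c->mac_enforce)      /* label contributes only when enforced */
            key = hash_region(&c->label, sizeof(c->label), key);
        return key;
    }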
* - * This operation currenty can not fail, as currently the POSIX + * This operation currently cannot fail, as currently the POSIX * credential is a subfield of the kauth_cred_t (ucred), which * MUST be valid. In the future, this will not be the case. */ diff --git a/bsd/kern/kern_descrip.c b/bsd/kern/kern_descrip.c index 3283ee3c0..5913c5456 100644 --- a/bsd/kern/kern_descrip.c +++ b/bsd/kern/kern_descrip.c @@ -173,7 +173,6 @@ extern struct wait_queue select_conflict_queue; /* * Descriptor management. */ -struct filelist filehead; /* head of list of open files */ struct fmsglist fmsghead; /* head of list of open files */ struct fmsglist fmsg_ithead; /* head of list of open files */ int nfiles; /* actual number of open files */ @@ -184,7 +183,6 @@ lck_grp_t * file_lck_grp; lck_attr_t * file_lck_attr; lck_mtx_t * uipc_lock; -lck_mtx_t * file_flist_lock; /* @@ -210,7 +208,6 @@ file_lock_init(void) file_lck_attr = lck_attr_alloc_init(); uipc_lock = lck_mtx_alloc_init(file_lck_grp, file_lck_attr); - file_flist_lock = lck_mtx_alloc_init(file_lck_grp, file_lck_attr); } @@ -866,7 +863,9 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto outdrop; } - if ((fl.l_whence == SEEK_CUR) && (fl.l_start + offset < fl.l_start)) { + volatile off_t affected_lock_area_set = 0; + affected_lock_area_set = fl.l_start + offset; + if ((fl.l_whence == SEEK_CUR) && (affected_lock_area_set < fl.l_start)) { error = EOVERFLOW; goto outdrop; } @@ -941,11 +940,13 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) if (error) goto outdrop; + volatile off_t affected_lock_area_end = 0; + affected_lock_area_end = fl.l_start + offset; /* Check starting byte and ending byte for EOVERFLOW in SEEK_CUR */ /* and ending byte for EOVERFLOW in SEEK_SET */ if (((fl.l_whence == SEEK_CUR) && - ((fl.l_start + offset < fl.l_start) || - ((fl.l_len > 0) && (fl.l_start+offset + fl.l_len - 1 < fl.l_start+offset)))) || + ((affected_lock_area_end < fl.l_start) || + ((fl.l_len > 0) && (affected_lock_area_end + fl.l_len - 1 < affected_lock_area_end)))) || ((fl.l_whence == SEEK_SET) && (fl.l_len > 0) && (fl.l_start + fl.l_len - 1 < fl.l_start))) { /* lf_advlock doesn't check start/end for F_GETLK if file has no locks */ @@ -1161,6 +1162,18 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto out; + case F_SINGLE_WRITER: + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + if (uap->arg) + fp->f_fglob->fg_flag |= FSINGLE_WRITER; + else + fp->f_fglob->fg_flag &= ~FSINGLE_WRITER; + + goto out; + case F_GLOBAL_NOCACHE: if (fp->f_type != DTYPE_VNODE) { error = EBADF; @@ -1239,58 +1252,6 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) } goto outdrop; - - case F_READBOOTSTRAP: - case F_WRITEBOOTSTRAP: { - user32_fbootstraptransfer_t user32_fbt_struct; - user_fbootstraptransfer_t user_fbt_struct; - int sizeof_struct; - caddr_t boot_structp; - - if (fp->f_type != DTYPE_VNODE) { - error = EBADF; - goto out; - } - vp = (struct vnode *)fp->f_data; - proc_fdunlock(p); - - if (IS_64BIT_PROCESS(p)) { - sizeof_struct = sizeof(user_fbt_struct); - boot_structp = (caddr_t) &user_fbt_struct; - } - else { - sizeof_struct = sizeof(user32_fbt_struct); - boot_structp = (caddr_t) &user32_fbt_struct; - } - error = copyin(argp, boot_structp, sizeof_struct); - if (error) - goto outdrop; - if ( (error = vnode_getwithref(vp)) ) { - goto outdrop; - } - if (uap->cmd == F_WRITEBOOTSTRAP) { - /* - * Make sure that we are root. 
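One detail of the fcntl() overflow hunks earlier in this file deserves a note: fl.l_start + offset is computed into a volatile off_t before the "sum < addend" test. Signed overflow is undefined behavior, so an optimizer is entitled to delete such a test as unreachable; routing the sum through a volatile temporary is a common (if blunt) way to keep it. A sketch of the idiom, under that reading:

    #include <sys/types.h>

    /* Nonzero if start + offset would wrap for a non-negative offset. */
    static int
    range_overflows(off_t start, off_t offset)
    {
        volatile off_t end = start + offset;

        return (offset >= 0 && end < start);
    }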
Updating the - * bootstrap on a disk could be a security hole - */ - if (!is_suser()) { - (void)vnode_put(vp); - error = EACCES; - goto outdrop; - } - } - if (strncmp(vnode_mount(vp)->mnt_vfsstat.f_fstypename, "hfs", - sizeof(vnode_mount(vp)->mnt_vfsstat.f_fstypename)) != 0) { - error = EINVAL; - } else { - /* - * call vnop_ioctl to handle the I/O - */ - error = VNOP_IOCTL(vp, uap->cmd, boot_structp, 0, &context); - } - (void)vnode_put(vp); - goto outdrop; - } case F_LOG2PHYS: case F_LOG2PHYS_EXT: { struct log2phys l2p_struct; /* structure for allocate command */ @@ -1577,7 +1538,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto outdrop; } -#define CS_MAX_BLOB_SIZE (1ULL * 1024 * 1024) /* XXX ? */ +#define CS_MAX_BLOB_SIZE (1280ULL * 1024) /* max shared cache file XXX ? */ if (fs.fs_blob_size > CS_MAX_BLOB_SIZE) { error = E2BIG; vnode_put(vp); @@ -1692,7 +1653,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) break; } -#ifdef CONFIG_PROTECT +#if CONFIG_PROTECT case F_GETPROTECTIONCLASS: { int class = 0; @@ -1746,28 +1707,80 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) vnode_put(vp); break; } + + case F_TRANSCODEKEY: { + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + + vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); + + if (vnode_getwithref(vp)) { + error = ENOENT; + goto outdrop; + } + + error = cp_vnode_transcode (vp); + vnode_put(vp); + break; + } + + case F_GETPROTECTIONLEVEL: { + uint32_t cp_version = 0; + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + + vp = (struct vnode*) fp->f_data; + proc_fdunlock (p); + + if (vnode_getwithref(vp)) { + error = ENOENT; + goto outdrop; + } + + /* + * if cp_get_major_vers fails, error will be set to proper errno + * and cp_version will still be 0. + */ + + error = cp_get_root_major_vers (vp, &cp_version); + *retval = cp_version; + + vnode_put (vp); + break; + } + #endif /* CONFIG_PROTECT */ - + case F_MOVEDATAEXTENTS: { struct fileproc *fp2 = NULL; struct vnode *src_vp = NULLVP; struct vnode *dst_vp = NULLVP; /* We need to grab the 2nd FD out of the argments before moving on. */ int fd2 = CAST_DOWN_EXPLICIT(int32_t, uap->arg); - + if (fp->f_type != DTYPE_VNODE) { error = EBADF; goto out; } - vp = src_vp = (struct vnode *)fp->f_data; /* For now, special case HFS+ only, since this is SPI. */ + src_vp = (struct vnode *)fp->f_data; if (src_vp->v_tag != VT_HFS) { error = EINVAL; goto out; } - /* We're still holding the proc FD lock */ + /* + * Get the references before we start acquiring iocounts on the vnodes, + * while we still hold the proc fd lock + */ if ( (error = fp_lookup(p, fd2, &fp2, 1)) ) { error = EBADF; goto out; @@ -1778,8 +1791,6 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto out; } dst_vp = (struct vnode *)fp2->f_data; - - /* For now, special case HFS+ only, since this is SPI. */ if (dst_vp->v_tag != VT_HFS) { fp_drop(p, fd2, fp2, 1); error = EINVAL; @@ -1799,8 +1810,6 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) proc_fdunlock(p); - /* Proc lock dropped; now we have a legit pair of FDs. Go to work */ - if (vnode_getwithref(src_vp)) { fp_drop(p, fd2, fp2, 0); error = ENOENT; @@ -1812,12 +1821,11 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = ENOENT; goto outdrop; } - + /* * Basic asserts; validate they are not the same and that * both live on the same filesystem. 
*/ - if (dst_vp == src_vp) { vnode_put (src_vp); vnode_put (dst_vp); @@ -1825,7 +1833,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = EINVAL; goto outdrop; } - + if (dst_vp->v_mount != src_vp->v_mount) { vnode_put (src_vp); vnode_put (dst_vp); @@ -1834,31 +1842,33 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto outdrop; } + /* Now we have a legit pair of FDs. Go to work */ + /* Now check for write access to the target files */ if(vnode_authorize(src_vp, NULLVP, - (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), &context) != 0) { + (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), &context) != 0) { vnode_put(src_vp); vnode_put(dst_vp); fp_drop(p, fd2, fp2, 0); error = EBADF; goto outdrop; } - + if(vnode_authorize(dst_vp, NULLVP, - (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), &context) != 0) { + (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), &context) != 0) { vnode_put(src_vp); vnode_put(dst_vp); fp_drop(p, fd2, fp2, 0); error = EBADF; goto outdrop; } - + /* Verify that both vps point to files and not directories */ - if (!vnode_isreg(src_vp) || !vnode_isreg(dst_vp)) { - vnode_put(src_vp); - vnode_put(dst_vp); - fp_drop(p, fd2, fp2, 0); + if ( !vnode_isreg(src_vp) || !vnode_isreg(dst_vp)) { error = EINVAL; + vnode_put (src_vp); + vnode_put (dst_vp); + fp_drop (p, fd2, fp2, 0); goto outdrop; } @@ -1866,15 +1876,54 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) * The exchangedata syscall handler passes in 0 for the flags to VNOP_EXCHANGE. * We'll pass in our special bit indicating that the new behavior is expected */ - + error = VNOP_EXCHANGE(src_vp, dst_vp, FSOPT_EXCHANGE_DATA_ONLY, &context); - + vnode_put (src_vp); vnode_put (dst_vp); fp_drop(p, fd2, fp2, 0); break; } + + + /* + * SPI (private) for indicating to a filesystem that subsequent writes to + * the open FD will represent static content. + */ + case F_SETSTATICCONTENT: { + caddr_t ioctl_arg = NULL; + if (uap->arg) { + ioctl_arg = (caddr_t) 1; + } + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); + + error = vnode_getwithref(vp); + if (error) { + error = ENOENT; + goto outdrop; + } + + /* Only go forward if you have write access */ + vfs_context_t ctx = vfs_context_current(); + if(vnode_authorize(vp, NULLVP, (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), ctx) != 0) { + vnode_put(vp); + error = EBADF; + goto outdrop; + } + + error = VNOP_IOCTL(vp, uap->cmd, ioctl_arg, 0, &context); + (void)vnode_put(vp); + + break; + } + /* * Set the vnode pointed to by 'fd' * and tag it as the (potentially future) backing store @@ -1885,12 +1934,12 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = EBADF; goto out; } - vp = (struct vnode *)fp->f_data; + vp = (struct vnode *)fp->f_data; + if (vp->v_tag != VT_HFS) { error = EINVAL; goto out; - } proc_fdunlock(p); @@ -1910,14 +1959,12 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) /* If arg != 0, set, otherwise unset */ if (uap->arg) { - error = hfs_set_backingstore (vp, 1); + error = VNOP_IOCTL (vp, uap->cmd, (caddr_t)1, 0, &context); } else { - error = hfs_set_backingstore (vp, 0); + error = VNOP_IOCTL (vp, uap->cmd, (caddr_t)NULL, 0, &context); } - /* Success. explicitly set error to 0. 
*/ - error = 0; - + vnode_put(vp); break; } @@ -1949,7 +1996,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) /* Check for error from vn_getpath before moving on */ if ((error = vn_getpath(vp, pathbufp, &pathlen)) == 0) { if (vp->v_tag == VT_HFS) { - error = hfs_is_backingstore (vp, &backingstore); + error = VNOP_IOCTL (vp, uap->cmd, (caddr_t) &backingstore, 0, &context); } (void)vnode_put(vp); @@ -1973,7 +2020,6 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto outdrop; } - default: /* * This is an fcntl() that we d not recognize at this level; @@ -3920,7 +3966,7 @@ int falloc_locked(proc_t p, struct fileproc **resultfp, int *resultfd, vfs_context_t ctx, int locked) { - struct fileproc *fp, *fq; + struct fileproc *fp; struct fileglob *fg; int error, nfd; @@ -3988,16 +4034,7 @@ falloc_locked(proc_t p, struct fileproc **resultfp, int *resultfd, mac_file_label_associate(fp->f_cred, fg); #endif - lck_mtx_lock_spin(file_flist_lock); - - nfiles++; - - if ( (fq = p->p_fd->fd_ofiles[0]) ) { - LIST_INSERT_AFTER(fq->f_fglob, fg, f_list); - } else { - LIST_INSERT_HEAD(&filehead, fg, f_list); - } - lck_mtx_unlock(file_flist_lock); + OSAddAtomic(1, &nfiles); p->p_fd->fd_ofiles[nfd] = fp; @@ -4028,10 +4065,7 @@ falloc_locked(proc_t p, struct fileproc **resultfp, int *resultfd, void fg_free(struct fileglob *fg) { - lck_mtx_lock_spin(file_flist_lock); - LIST_REMOVE(fg, f_list); - nfiles--; - lck_mtx_unlock(file_flist_lock); + OSAddAtomic(-1, &nfiles); if (IS_VALID_CRED(fg->fg_cred)) { kauth_cred_unref(&fg->fg_cred); @@ -4089,7 +4123,7 @@ fdexec(proc_t p, short flags) struct fileproc *fp = fdp->fd_ofiles[i]; char *flagp = &fdp->fd_ofileflags[i]; - if (cloexec_default) { + if (fp && cloexec_default) { /* * Reverse the usual semantics of file descriptor * inheritance - all of them should be closed diff --git a/bsd/kern/kern_event.c b/bsd/kern/kern_event.c index 632501473..ee3d66b09 100644 --- a/bsd/kern/kern_event.c +++ b/bsd/kern/kern_event.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
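Also worth flagging in the falloc_locked()/fg_free() hunks above: the global filehead list, its spin lock, and the insert/remove on every open and close are replaced with a single atomic counter, apparently because nothing remaining needs to walk the list and a count suffices. In portable C11 the same bookkeeping looks roughly like this:

    #include <stdatomic.h>

    static atomic_int nfiles;   /* number of open files; no lock required */

    static void
    file_alloc_note(void)
    {
        atomic_fetch_add_explicit(&nfiles, 1, memory_order_relaxed);
    }

    static void
    file_free_note(void)
    {
        atomic_fetch_sub_explicit(&nfiles, 1, memory_order_relaxed);
    }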
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -93,7 +93,10 @@ #include "net/net_str_id.h" #include + +#if VM_PRESSURE_EVENTS #include +#endif MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); @@ -188,6 +191,7 @@ static struct filterops proc_filtops = { .f_event = filt_proc, }; +#if VM_PRESSURE_EVENTS static int filt_vmattach(struct knote *kn); static void filt_vmdetach(struct knote *kn); static int filt_vm(struct knote *kn, long hint); @@ -196,6 +200,7 @@ static struct filterops vm_filtops = { .f_detach = filt_vmdetach, .f_event = filt_vm, }; +#endif /* VM_PRESSURE_EVENTS */ extern struct filterops fs_filtops; @@ -271,7 +276,12 @@ static struct filterops *sysfilt_ops[] = { &fs_filtops, /* EVFILT_FS */ &user_filtops, /* EVFILT_USER */ &bad_filtops, /* unused */ +#if VM_PRESSURE_EVENTS &vm_filtops, /* EVFILT_VM */ +#else + &bad_filtops, /* EVFILT_VM */ +#endif + &file_filtops, /* EVFILT_SOCK */ }; /* @@ -549,12 +559,21 @@ filt_proc(struct knote *kn, long hint) kn->kn_fflags |= NOTE_RESOURCEEND; kn->kn_data = (hint & NOTE_PDATAMASK); } +#if CONFIG_EMBEDDED + /* If the event is one of the APPSTATE events, remove the rest */ + if (((event & NOTE_APPALLSTATES) != 0) && ((kn->kn_sfflags & NOTE_APPALLSTATES) != 0)) { + /* only one state at a time */ + kn->kn_fflags &= ~NOTE_APPALLSTATES; + kn->kn_fflags |= event; + } +#endif /* CONFIG_EMBEDDED */ } /* atomic check, no locking need when called from above */ return (kn->kn_fflags != 0); } +#if VM_PRESSURE_EVENTS /* * Virtual memory kevents * @@ -584,14 +603,15 @@ filt_vm(struct knote *kn, long hint) { /* hint == 0 means this is just an alive? check (always true) */ if (hint != 0) { - /* If this knote is interested in the event specified in hint... */ - if ((kn->kn_sfflags & hint) != 0) { - kn->kn_fflags |= hint; + const pid_t pid = (pid_t)hint; + if ((kn->kn_sfflags & NOTE_VM_PRESSURE) && (kn->kn_kq->kq_p->p_pid == pid)) { + kn->kn_fflags |= NOTE_VM_PRESSURE; } } return (kn->kn_fflags != 0); } +#endif /* VM_PRESSURE_EVENTS */ /* * filt_timervalidate - process data from user @@ -2405,19 +2425,21 @@ knote_detach(struct klist *list, struct knote *kn) * we permanently enqueue them here. * * kqueue and knote references are held by caller. + * + * caller provides the wait queue link structure. */ int -knote_link_wait_queue(struct knote *kn, struct wait_queue *wq) +knote_link_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link_t wql) { struct kqueue *kq = kn->kn_kq; kern_return_t kr; - kr = wait_queue_link(wq, kq->kq_wqs); + kr = wait_queue_link_noalloc(wq, kq->kq_wqs, wql); if (kr == KERN_SUCCESS) { knote_markstayqueued(kn); return 0; } else { - return ENOMEM; + return EINVAL; } } @@ -2427,17 +2449,21 @@ knote_link_wait_queue(struct knote *kn, struct wait_queue *wq) * * Note that the unlink may have already happened from the other side, so * ignore any failures to unlink and just remove it from the kqueue list. + * + * On success, caller is responsible for the link structure. */ -void -knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq) +int +knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link_t *wqlp) { struct kqueue *kq = kn->kn_kq; + kern_return_t kr; - (void) wait_queue_unlink(wq, kq->kq_wqs); + kr = wait_queue_unlink_nofree(wq, kq->kq_wqs, wqlp); kqlock(kq); kn->kn_status &= ~KN_STAYQUEUED; knote_dequeue(kn); kqunlock(kq); + return (kr != KERN_SUCCESS) ? 
EINVAL : 0; } /* @@ -2487,7 +2513,7 @@ knote_fdclose(struct proc *p, int fd) /* proc_fdlock held on entry (and exit) */ static int -knote_fdpattach(struct knote *kn, struct filedesc *fdp, __unused struct proc *p) +knote_fdpattach(struct knote *kn, struct filedesc *fdp, struct proc *p) { struct klist *list = NULL; @@ -2500,10 +2526,18 @@ knote_fdpattach(struct knote *kn, struct filedesc *fdp, __unused struct proc *p) if ((u_int)fdp->fd_knlistsize <= kn->kn_id) { u_int size = 0; + if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur + || kn->kn_id >= (uint64_t)maxfiles) + return (EINVAL); + /* have to grow the fd_knlist */ size = fdp->fd_knlistsize; while (size <= kn->kn_id) size += KQEXTENT; + + if (size >= (UINT_MAX/sizeof(struct klist *))) + return (EINVAL); + MALLOC(list, struct klist *, size * sizeof(struct klist *), M_KQUEUE, M_WAITOK); if (list == NULL) @@ -2630,7 +2664,11 @@ knote_init(void) /* Initialize the timer filter lock */ lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr); - lck_mtx_init(&vm_pressure_klist_mutex, kq_lck_grp, kq_lck_attr); + +#if VM_PRESSURE_EVENTS + /* Initialize the vm pressure list lock */ + vm_pressure_init(kq_lck_grp, kq_lck_attr); +#endif } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL) diff --git a/bsd/kern/kern_exec.c b/bsd/kern/kern_exec.c index 4c17cd232..dde1b3c40 100644 --- a/bsd/kern/kern_exec.c +++ b/bsd/kern/kern_exec.c @@ -81,6 +81,7 @@ * Version 2.0. */ #include +#include #include #include @@ -128,6 +129,7 @@ #include /* thread_wakeup() */ #include #include +#include #if CONFIG_MACF #include @@ -138,9 +140,18 @@ #include #include #include +#include + +#include #include +#include + +#if CONFIG_MEMORYSTATUS +#include +#endif + #if CONFIG_DTRACE /* Do not include dtrace.h, it redefines kmem_[alloc/free] */ extern void (*dtrace_fasttrap_exec_ptr)(proc_t); @@ -154,7 +165,8 @@ extern void dtrace_lazy_dofs_destroy(proc_t); thread_t fork_create_child(task_t parent_task, proc_t child_proc, int inherit_memory, int is64bit); void vfork_exit(proc_t p, int rv); int setsigvec(proc_t, thread_t, int, struct __kern_sigaction *, boolean_t in_sigstart); -extern void proc_apply_task_networkbg_internal(proc_t); +extern void proc_apply_task_networkbg_internal(proc_t, thread_t); +int task_set_cpuusage(task_t task, uint64_t percentage, uint64_t interval, uint64_t deadline, int scope); /* * Mach things for which prototypes are unavailable from Mach headers @@ -218,11 +230,27 @@ static int exec_add_apple_strings(struct image_params *imgp); static int exec_handle_sugid(struct image_params *imgp); static int sugid_scripts = 0; SYSCTL_INT (_kern, OID_AUTO, sugid_scripts, CTLFLAG_RW | CTLFLAG_LOCKED, &sugid_scripts, 0, ""); -static kern_return_t create_unix_stack(vm_map_t map, user_addr_t user_stack, - int customstack, proc_t p); +static kern_return_t create_unix_stack(vm_map_t map, load_result_t* load_result, proc_t p); static int copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size); static void exec_resettextvp(proc_t, struct image_params *); static int check_for_signature(proc_t, struct image_params *); +static void exec_prefault_data(proc_t, struct image_params *, load_result_t *); + +#if !CONFIG_EMBEDDED + +/* Identify process during exec and opt into legacy behaviors */ + +struct legacy_behavior { + uuid_t process_uuid; + uint32_t legacy_mask; +}; + +static const struct legacy_behavior legacy_behaviors[] = +{ + {{ 0xF8, 0x7C, 0xC3, 0x67, 0xFB, 0x68, 0x37, 0x93, 0xBC, 0x34, 0xB2, 0xB6, 0x05, 0x2B, 0xCD, 0xE2 }, 
PROC_LEGACY_BEHAVIOR_IOTHROTTLE }, + {{ 0x0B, 0x4E, 0xDF, 0xD8, 0x76, 0xD1, 0x3D, 0x4D, 0x9D, 0xD7, 0x37, 0x43, 0x1C, 0xA8, 0xFB, 0x26 }, PROC_LEGACY_BEHAVIOR_IOTHROTTLE }, +}; +#endif /* !CONFIG_EMBEDDED */ /* We don't want this one exported */ __private_extern__ @@ -374,96 +402,6 @@ exec_reset_save_path(struct image_params *imgp) return (0); } -#ifdef IMGPF_POWERPC -/* - * exec_powerpc32_imgact - * - * Implicitly invoke the PowerPC handler for a byte-swapped image magic - * number. This may happen either as a result of an attempt to invoke a - * PowerPC image directly, or indirectly as the interpreter used in an - * interpreter script. - * - * Parameters; struct image_params * image parameter block - * - * Returns: -1 not an PowerPC image (keep looking) - * -3 Success: exec_archhandler_ppc: relookup - * >0 Failure: exec_archhandler_ppc: error number - * - * Note: This image activator does not handle the case of a direct - * invocation of the exec_archhandler_ppc, since in that case, the - * exec_archhandler_ppc itself is not a PowerPC binary; instead, - * binary image activators must recognize the exec_archhandler_ppc; - * This is managed in exec_check_permissions(). - * - * Note: This image activator is limited to 32 bit powerpc images; - * if support for 64 bit powerpc images is desired, it would - * be more in line with this design to write a separate 64 bit - * image activator. - */ -static int -exec_powerpc32_imgact(struct image_params *imgp) -{ - struct mach_header *mach_header = (struct mach_header *)imgp->ip_vdata; - int error; - size_t len = 0; - - /* - * Make sure it's a PowerPC binary. If we've already redirected - * from an interpreted file once, don't do it again. - */ - if (mach_header->magic != MH_CIGAM) { - /* - * If it's a cross-architecture 64 bit binary, then claim - * it, but refuse to run it. - */ - if (mach_header->magic == MH_CIGAM_64) - return (EBADARCH); - return (-1); - } - - /* If there is no exec_archhandler_ppc, we can't run it */ - if (exec_archhandler_ppc.path[0] == 0) - return (EBADARCH); - - /* Remember the type of the original file for later grading */ - if (!imgp->ip_origcputype) { - imgp->ip_origcputype = - OSSwapBigToHostInt32(mach_header->cputype); - imgp->ip_origcpusubtype = - OSSwapBigToHostInt32(mach_header->cpusubtype); - } - - /* - * The PowerPC flag will be set by the exec_check_permissions() - * call anyway; however, we set this flag here so that the relookup - * in execve() does not follow symbolic links, as a side effect. - */ - imgp->ip_flags |= IMGPF_POWERPC; - - /* impute an interpreter */ - error = copystr(exec_archhandler_ppc.path, imgp->ip_interp_buffer, - IMG_SHSIZE, &len); - if (error) - return (error); - - exec_reset_save_path(imgp); - exec_save_path(imgp, CAST_USER_ADDR_T(imgp->ip_interp_buffer), - UIO_SYSSPACE); - - /* - * provide a replacement string for p->p_comm; we have to use an - * alternate buffer for this, rather than replacing it directly, - * since the exec may fail and return to the parent. In that case, - * we would have erroneously changed the parent p->p_comm instead. 
- */ - strlcpy(imgp->ip_p_comm, imgp->ip_ndp->ni_cnd.cn_nameptr, MAXCOMLEN+1); - /* +1 to allow MAXCOMLEN characters to be copied */ - - return (-3); -} -#endif /* IMGPF_POWERPC */ - - /* * exec_shell_imgact * @@ -511,11 +449,6 @@ exec_shell_imgact(struct image_params *imgp) return (-1); } -#ifdef IMGPF_POWERPC - if ((imgp->ip_flags & IMGPF_POWERPC) != 0) - return (EBADARCH); -#endif /* IMGPF_POWERPC */ - imgp->ip_flags |= IMGPF_INTERPRET; imgp->ip_interp_sugid_fd = -1; imgp->ip_interp_buffer[0] = '\0'; @@ -792,8 +725,15 @@ exec_mach_imgact(struct image_params *imgp) /* * make sure it's a Mach-O 1.0 or Mach-O 2.0 binary; the difference * is a reserved field on the end, so for the most part, we can - * treat them as if they were identical. - */ + * treat them as if they were identical. Reverse-endian Mach-O + * binaries are recognized but not compatible. + */ + if ((mach_header->magic == MH_CIGAM) || + (mach_header->magic == MH_CIGAM_64)) { + error = EBADARCH; + goto bad; + } + if ((mach_header->magic != MH_MAGIC) && (mach_header->magic != MH_MAGIC_64)) { error = -1; @@ -874,21 +814,6 @@ grade: AUDIT_ARG(envv, imgp->ip_endargv, imgp->ip_envc, imgp->ip_endenvv - imgp->ip_endargv); -#ifdef IMGPF_POWERPC - /* - * XXX - * - * Should be factored out; this is here because we might be getting - * invoked this way as the result of a shell script, and the check - * in exec_check_permissions() is not interior to the jump back up - * to the "encapsulated_binary:" label in exec_activate_image(). - */ - if (imgp->ip_vattr->va_fsid == exec_archhandler_ppc.fsid && - imgp->ip_vattr->va_fileid == exec_archhandler_ppc.fileid) { - imgp->ip_flags |= IMGPF_POWERPC; - } -#endif /* IMGPF_POWERPC */ - /* * We are being called to activate an image subsequent to a vfork() * operation; in this case, we know that our task, thread, and @@ -971,10 +896,6 @@ grade: vm_map_exec(get_task_map(task), task, (void *) p->p_fd->fd_rdir, -#ifdef IMGPF_POWERPC - imgp->ip_flags & IMGPF_POWERPC ? - CPU_TYPE_POWERPC : -#endif cpu_type()); /* @@ -997,8 +918,7 @@ grade: if (load_result.unixproc && create_unix_stack(get_task_map(task), - load_result.user_stack, - load_result.customstack, + &load_result, p) != KERN_SUCCESS) { error = load_return_to_errno(LOAD_NOSPACE); goto badtoolate; @@ -1043,6 +963,9 @@ grade: load_result.all_image_info_size); } + /* Avoid immediate VM faults back into kernel */ + exec_prefault_data(p, imgp, &load_result); + if (vfexec || spawn) { vm_map_switch(old_map); } @@ -1096,6 +1019,28 @@ grade: memcpy(&p->p_uuid[0], &load_result.uuid[0], sizeof(p->p_uuid)); +#if !CONFIG_EMBEDDED + unsigned int i; + + if (!vfexec && !spawn) { + if (p->p_legacy_behavior & PROC_LEGACY_BEHAVIOR_IOTHROTTLE) { + throttle_legacy_process_decr(); + } + } + + p->p_legacy_behavior = 0; + for (i=0; i < sizeof(legacy_behaviors)/sizeof(legacy_behaviors[0]); i++) { + if (0 == uuid_compare(legacy_behaviors[i].process_uuid, p->p_uuid)) { + p->p_legacy_behavior = legacy_behaviors[i].legacy_mask; + break; + } + } + + if (p->p_legacy_behavior & PROC_LEGACY_BEHAVIOR_IOTHROTTLE) { + throttle_legacy_process_incr(); + } +#endif + // dtrace code cleanup needed #if CONFIG_DTRACE /* @@ -1154,18 +1099,11 @@ grade: } } -#ifdef IMGPF_POWERPC /* - * Mark the process as powerpc or not. If powerpc, set the affinity - * flag, which will be used for grading binaries in future exec's - * from the process. + * Ensure the 'translated' and 'affinity' flags are cleared, since we + * no longer run PowerPC binaries. 
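+ * (Both are cleared below in a single atomic pass over p_flag.)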
*/ - if (((imgp->ip_flags & IMGPF_POWERPC) != 0)) - OSBitOrAtomic(P_TRANSLATED, &p->p_flag); - else -#endif /* IMGPF_POWERPC */ - OSBitAndAtomic(~((uint32_t)P_TRANSLATED), &p->p_flag); - OSBitAndAtomic(~((uint32_t)P_AFFINITY), &p->p_flag); + OSBitAndAtomic(~((uint32_t)(P_TRANSLATED | P_AFFINITY)), &p->p_flag); /* * If posix_spawned with the START_SUSPENDED flag, stop the @@ -1179,22 +1117,54 @@ grade: proc_unlock(p); (void) task_suspend(p->task); } - if ((psa->psa_flags & POSIX_SPAWN_OSX_TALAPP_START) || (psa->psa_flags & POSIX_SPAWN_OSX_DBCLIENT_START) || (psa->psa_flags & POSIX_SPAWN_IOS_APP_START)) { +#if CONFIG_EMBEDDED + if ((psa->psa_flags & POSIX_SPAWN_IOS_RESV1_APP_START) || (psa->psa_flags & POSIX_SPAWN_IOS_APPLE_DAEMON_START) || (psa->psa_flags & POSIX_SPAWN_IOS_APP_START)) { + if ((psa->psa_flags & POSIX_SPAWN_IOS_RESV1_APP_START)) + apptype = PROC_POLICY_IOS_RESV1_APPTYPE; + else if (psa->psa_flags & POSIX_SPAWN_IOS_APPLE_DAEMON_START) + apptype = PROC_POLICY_IOS_APPLE_DAEMON; + else if (psa->psa_flags & POSIX_SPAWN_IOS_APP_START) + apptype = PROC_POLICY_IOS_APPTYPE; + else + apptype = PROC_POLICY_OSX_APPTYPE_NONE; + proc_set_task_apptype(p->task, apptype, imgp->ip_new_thread); + if (apptype == PROC_POLICY_IOS_RESV1_APPTYPE) + proc_apply_task_networkbg_internal(p, NULL); + } + + if (psa->psa_apptype & POSIX_SPAWN_APPTYPE_IOS_APPLEDAEMON) { + apptype = PROC_POLICY_IOS_APPLE_DAEMON; + proc_set_task_apptype(p->task, apptype, imgp->ip_new_thread); + } +#else /* CONFIG_EMBEDDED */ + if ((psa->psa_flags & POSIX_SPAWN_OSX_TALAPP_START) || (psa->psa_flags & POSIX_SPAWN_OSX_DBCLIENT_START)) { if ((psa->psa_flags & POSIX_SPAWN_OSX_TALAPP_START)) apptype = PROC_POLICY_OSX_APPTYPE_TAL; else if (psa->psa_flags & POSIX_SPAWN_OSX_DBCLIENT_START) apptype = PROC_POLICY_OSX_APPTYPE_DBCLIENT; - else if (psa->psa_flags & POSIX_SPAWN_IOS_APP_START) - apptype = PROC_POLICY_IOS_APPTYPE; else - apptype = 0; - proc_set_task_apptype(p->task, apptype); + apptype = PROC_POLICY_OSX_APPTYPE_NONE; + proc_set_task_apptype(p->task, apptype, NULL); if ((apptype == PROC_POLICY_OSX_APPTYPE_TAL) || (apptype == PROC_POLICY_OSX_APPTYPE_DBCLIENT)) { - - proc_apply_task_networkbg_internal(p); + proc_apply_task_networkbg_internal(p, NULL); } } + if ((psa->psa_apptype & POSIX_SPAWN_APPTYPE_OSX_TAL) || + (psa->psa_apptype & POSIX_SPAWN_APPTYPE_OSX_WIDGET)) { + if ((psa->psa_apptype & POSIX_SPAWN_APPTYPE_OSX_TAL)) + apptype = PROC_POLICY_OSX_APPTYPE_TAL; + else if (psa->psa_flags & POSIX_SPAWN_APPTYPE_OSX_WIDGET) + apptype = PROC_POLICY_OSX_APPTYPE_DBCLIENT; + else + apptype = PROC_POLICY_OSX_APPTYPE_NONE; + proc_set_task_apptype(p->task, apptype, imgp->ip_new_thread); + if ((apptype == PROC_POLICY_OSX_APPTYPE_TAL) || + (apptype == PROC_POLICY_OSX_APPTYPE_DBCLIENT)) { + proc_apply_task_networkbg_internal(p, NULL); + } + } +#endif /* CONFIG_EMBEDDED */ } /* @@ -1249,9 +1219,6 @@ struct execsw { } execsw[] = { { exec_mach_imgact, "Mach-o Binary" }, { exec_fat_imgact, "Fat Binary" }, -#ifdef IMGPF_POWERPC - { exec_powerpc32_imgact, "PowerPC binary" }, -#endif /* IMGPF_POWERPC */ { exec_shell_imgact, "Interpreter Script" }, { NULL, NULL} }; @@ -1393,16 +1360,6 @@ encapsulated_binary: NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(imgp->ip_strings), imgp->ip_vfs_context); -#ifdef IMGPF_POWERPC - /* - * PowerPC does not follow symlinks because the - * code which sets exec_archhandler_ppc.fsid and - * exec_archhandler_ppc.fileid doesn't follow them. 
- */ - if (imgp->ip_flags & IMGPF_POWERPC) - nd.ni_cnd.cn_flags &= ~FOLLOW; -#endif /* IMGPF_POWERPC */ - proc_transend(p, 0); goto again; @@ -1455,57 +1412,55 @@ exec_handle_port_actions(struct image_params *imgp, short psa_flags) _ps_port_action_t *act = NULL; task_t task = p->task; ipc_port_t port = NULL; - errno_t ret = KERN_SUCCESS; + errno_t ret = 0; int i; for (i = 0; i < pacts->pspa_count; i++) { act = &pacts->pspa_actions[i]; if (ipc_object_copyin(get_task_ipcspace(current_task()), - CAST_MACH_PORT_TO_NAME(act->new_port), - MACH_MSG_TYPE_COPY_SEND, - (ipc_object_t *) &port) != KERN_SUCCESS) - return EINVAL; - - if (ret) - return ret; + act->new_port, MACH_MSG_TYPE_COPY_SEND, + (ipc_object_t *) &port) != KERN_SUCCESS) + return (EINVAL); switch (act->port_type) { - case PSPA_SPECIAL: - /* Only allowed when not under vfork */ - if (!(psa_flags & POSIX_SPAWN_SETEXEC)) - return ENOTSUP; - ret = (task_set_special_port(task, - act->which, - port) == KERN_SUCCESS) ? 0 : EINVAL; - break; - case PSPA_EXCEPTION: - /* Only allowed when not under vfork */ - if (!(psa_flags & POSIX_SPAWN_SETEXEC)) - return ENOTSUP; - ret = (task_set_exception_ports(task, - act->mask, - port, - act->behavior, - act->flavor) == KERN_SUCCESS) ? 0 : EINVAL; - break; + case PSPA_SPECIAL: + /* Only allowed when not under vfork */ + if (!(psa_flags & POSIX_SPAWN_SETEXEC)) + ret = ENOTSUP; + else if (task_set_special_port(task, + act->which, port) != KERN_SUCCESS) + ret = EINVAL; + break; + + case PSPA_EXCEPTION: + /* Only allowed when not under vfork */ + if (!(psa_flags & POSIX_SPAWN_SETEXEC)) + ret = ENOTSUP; + else if (task_set_exception_ports(task, + act->mask, port, act->behavior, + act->flavor) != KERN_SUCCESS) + ret = EINVAL; + break; #if CONFIG_AUDIT - case PSPA_AU_SESSION: - ret = audit_session_spawnjoin(p, - port); - break; + case PSPA_AU_SESSION: + ret = audit_session_spawnjoin(p, port); + break; #endif - default: - ret = EINVAL; + default: + ret = EINVAL; + break; } + /* action failed, so release port resources */ + if (ret) { ipc_port_release_send(port); - return ret; + break; } } - return ret; + return (ret); } /* @@ -2059,10 +2014,30 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) error = setsigvec(p, child_thread, sig + 1, &vec, spawn_no_exec); } } + + /* + * Activate the CPU usage monitor, if requested. This is done via a task-wide, per-thread CPU + * usage limit, which will generate a resource exceeded exception if any one thread exceeds the + * limit. + * + * Userland gives us interval in seconds, and the kernel SPI expects nanoseconds. + */ + if (px_sa.psa_cpumonitor_percent != 0) { + error = proc_set_task_ruse_cpu(p->task, + TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY_EXC, + px_sa.psa_cpumonitor_percent, + px_sa.psa_cpumonitor_interval * NSEC_PER_SEC, + 0); + } } bad: if (error == 0) { + /* reset delay idle sleep status if set */ +#if !CONFIG_EMBEDDED + if ((p->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP) + OSBitAndAtomic(~((uint32_t)P_DELAYIDLESLEEP), &p->p_flag); +#endif /* !CONFIG_EMBEDDED */ /* upon successful spawn, re/set the proc control state */ if (imgp->ip_px_sa != NULL) { switch (px_sa.psa_pcontrol) { @@ -2080,8 +2055,20 @@ bad: p->p_pcaction = 0; break; }; +#if !CONFIG_EMBEDDED + if ((px_sa.psa_apptype & POSIX_SPAWN_APPTYPE_DELAYIDLESLEEP) != 0) + OSBitOrAtomic(P_DELAYIDLESLEEP, &p->p_flag); +#endif /* !CONFIG_EMBEDDED */ } exec_resettextvp(p, imgp); + +#if CONFIG_EMBEDDED + /* Has jetsam attributes? 
*/ + if (imgp->ip_px_sa != NULL) { + memorystatus_list_change((px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_USE_EFFECTIVE_PRIORITY), + p->p_pid, px_sa.psa_priority, -1, px_sa.psa_high_water_mark); + } +#endif } /* @@ -2969,6 +2956,9 @@ random_hex_str(char *str, int len) #define ENTROPY_VALUES 2 #define ENTROPY_KEY "malloc_entropy=" +#define PFZ_KEY "pfz=" +extern user32_addr_t commpage_text32_location; +extern user64_addr_t commpage_text64_location; /* * Build up the contents of the apple[] string vector */ @@ -2976,16 +2966,31 @@ static int exec_add_apple_strings(struct image_params *imgp) { int i, error; - int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4; + int new_ptr_size = 4; char guard[19]; char guard_vec[strlen(GUARD_KEY) + 19 * GUARD_VALUES + 1]; char entropy[19]; char entropy_vec[strlen(ENTROPY_KEY) + 19 * ENTROPY_VALUES + 1]; + char pfz_string[strlen(PFZ_KEY) + 16 + 4 + 1]; + + if (imgp->ip_flags & IMGPF_IS_64BIT) { + new_ptr_size = 8; + snprintf(pfz_string, sizeof(pfz_string), PFZ_KEY "0x%llx", commpage_text64_location); + } else { + snprintf(pfz_string, sizeof(pfz_string), PFZ_KEY "0x%x", commpage_text32_location); + } + /* exec_save_path stored the first string */ imgp->ip_applec = 1; + /* adding the pfz string */ + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(pfz_string), UIO_SYSSPACE, FALSE); + if (error) + goto bad; + imgp->ip_applec++; + /* * Supply libc with a collection of random values to use when * implementing -fstack-protector. @@ -3116,18 +3121,6 @@ exec_check_permissions(struct image_params *imgp) #endif -#ifdef IMGPF_POWERPC - /* - * If the file we are about to attempt to load is the exec_handler_ppc, - * which is determined by matching the vattr fields against previously - * cached values, then we set the PowerPC environment flag. - */ - if (vap->va_fsid == exec_archhandler_ppc.fsid && - vap->va_fileid == exec_archhandler_ppc.fileid) { - imgp->ip_flags |= IMGPF_POWERPC; - } -#endif /* IMGPF_POWERPC */ - /* XXX May want to indicate to underlying FS that vnode is open */ return (error); @@ -3390,64 +3383,79 @@ handle_mac_transition: * limits on stack growth, if they end up being needed. * * Parameters: p Process to set stack on - * user_stack Address to set stack for process to - * customstack FALSE if no custom stack in binary - * map Address map in which to allocate the - * new stack, if 'customstack' is FALSE + * load_result Information from mach-o load commands + * map Address map in which to allocate the new stack * * Returns: KERN_SUCCESS Stack successfully created * !KERN_SUCCESS Mach failure code */ static kern_return_t -create_unix_stack(vm_map_t map, user_addr_t user_stack, int customstack, +create_unix_stack(vm_map_t map, load_result_t* load_result, proc_t p) { mach_vm_size_t size, prot_size; mach_vm_offset_t addr, prot_addr; kern_return_t kr; + mach_vm_address_t user_stack = load_result->user_stack; + proc_lock(p); p->user_stack = user_stack; proc_unlock(p); - if (!customstack) { + if (!load_result->prog_allocated_stack) { /* * Allocate enough space for the maximum stack size we * will ever authorize and an extra page to act as - * a guard page for stack overflows. + * a guard page for stack overflows. For default stacks, + * vm_initial_limit_stack takes care of the extra guard page. + * Otherwise we must allocate it ourselves.
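+ * The guard takes effect below when the page(s) above the stack limit are marked VM_PROT_NONE.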
*/ - size = mach_vm_round_page(MAXSSIZ); -#if STACK_GROWTH_UP - addr = mach_vm_trunc_page(user_stack); -#else /* STACK_GROWTH_UP */ - addr = mach_vm_trunc_page(user_stack - size); -#endif /* STACK_GROWTH_UP */ + + size = mach_vm_round_page(load_result->user_stack_size); + if (load_result->prog_stack_size) + size += PAGE_SIZE; + addr = mach_vm_trunc_page(load_result->user_stack - size); kr = mach_vm_allocate(map, &addr, size, VM_MAKE_TAG(VM_MEMORY_STACK) | - VM_FLAGS_FIXED); + VM_FLAGS_FIXED); if (kr != KERN_SUCCESS) { - return kr; + /* If can't allocate at default location, try anywhere */ + addr = 0; + kr = mach_vm_allocate(map, &addr, size, + VM_MAKE_TAG(VM_MEMORY_STACK) | + VM_FLAGS_ANYWHERE); + if (kr != KERN_SUCCESS) + return kr; + + user_stack = addr + size; + load_result->user_stack = user_stack; + + proc_lock(p); + p->user_stack = user_stack; + proc_unlock(p); } + /* * And prevent access to what's above the current stack * size limit for this process. */ prot_addr = addr; -#if STACK_GROWTH_UP - prot_addr += unix_stack_size(p); -#endif /* STACK_GROWTH_UP */ - prot_addr = mach_vm_round_page(prot_addr); - prot_size = mach_vm_trunc_page(size - unix_stack_size(p)); + if (load_result->prog_stack_size) + prot_size = PAGE_SIZE; + else + prot_size = mach_vm_trunc_page(size - unix_stack_size(p)); kr = mach_vm_protect(map, - prot_addr, - prot_size, - FALSE, - VM_PROT_NONE); + prot_addr, + prot_size, + FALSE, + VM_PROT_NONE); if (kr != KERN_SUCCESS) { (void) mach_vm_deallocate(map, addr, size); return kr; } } + return KERN_SUCCESS; } @@ -3891,3 +3899,131 @@ done: return error; } +/* + * Typically as soon as we start executing this process, the + * first instruction will trigger a VM fault to bring the text + * pages (as executable) into the address space, followed soon + * thereafter by dyld data structures (for dynamic executable). + * To optimize this, as well as improve support for hardware + * debuggers that can only access resident pages present + * in the process' page tables, we prefault some pages if + * possible. Errors are non-fatal. + */ +static void exec_prefault_data(proc_t p __unused, struct image_params *imgp, load_result_t *load_result) +{ + int ret; + size_t expected_all_image_infos_size; + + /* + * Prefault executable or dyld entry point. + */ + vm_fault( current_map(), + vm_map_trunc_page(load_result->entry_point), + VM_PROT_READ | VM_PROT_EXECUTE, + FALSE, + THREAD_UNINT, NULL, 0); + + if (imgp->ip_flags & IMGPF_IS_64BIT) { + expected_all_image_infos_size = sizeof(struct user64_dyld_all_image_infos); + } else { + expected_all_image_infos_size = sizeof(struct user32_dyld_all_image_infos); + } + + /* Decode dyld anchor structure from */ + if (load_result->dynlinker && + load_result->all_image_info_addr && + load_result->all_image_info_size >= expected_all_image_infos_size) { + union { + struct user64_dyld_all_image_infos infos64; + struct user32_dyld_all_image_infos infos32; + } all_image_infos; + + /* + * Pre-fault to avoid copyin() going through the trap handler + * and recovery path. 
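+ * As with the other prefaults in this function, a failure here is harmless; copyin() simply takes the normal fault path.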
+ */ + vm_fault( current_map(), + vm_map_trunc_page(load_result->all_image_info_addr), + VM_PROT_READ | VM_PROT_WRITE, + FALSE, + THREAD_UNINT, NULL, 0); + if ((load_result->all_image_info_addr & PAGE_MASK) + expected_all_image_infos_size > PAGE_SIZE) { + /* all_image_infos straddles a page */ + vm_fault( current_map(), + vm_map_trunc_page(load_result->all_image_info_addr + expected_all_image_infos_size - 1), + VM_PROT_READ | VM_PROT_WRITE, + FALSE, + THREAD_UNINT, NULL, 0); + } + + ret = copyin(load_result->all_image_info_addr, + &all_image_infos, + expected_all_image_infos_size); + if (ret == 0 && all_image_infos.infos32.version >= 9) { + + user_addr_t notification_address; + user_addr_t dyld_image_address; + user_addr_t dyld_version_address; + user_addr_t dyld_all_image_infos_address; + user_addr_t dyld_slide_amount; + + if (imgp->ip_flags & IMGPF_IS_64BIT) { + notification_address = all_image_infos.infos64.notification; + dyld_image_address = all_image_infos.infos64.dyldImageLoadAddress; + dyld_version_address = all_image_infos.infos64.dyldVersion; + dyld_all_image_infos_address = all_image_infos.infos64.dyldAllImageInfosAddress; + } else { + notification_address = all_image_infos.infos32.notification; + dyld_image_address = all_image_infos.infos32.dyldImageLoadAddress; + dyld_version_address = all_image_infos.infos32.dyldVersion; + dyld_all_image_infos_address = all_image_infos.infos32.dyldAllImageInfosAddress; + } + + /* + * dyld statically sets up the all_image_infos in its Mach-O + * binary at static link time, with pointers relative to its default + * load address. Since ASLR might slide dyld before its first + * instruction is executed, "dyld_slide_amount" tells us how far + * dyld was loaded compared to its default expected load address. + * All other pointers into dyld's image should be adjusted by this + * amount. At some point later, dyld will fix up pointers to take + * into account the slide, at which point the all_image_infos_address + * field in the structure will match the runtime load address, and + * "dyld_slide_amount" will be 0, if we were to consult it again. + */ + + dyld_slide_amount = load_result->all_image_info_addr - dyld_all_image_infos_address; + +#if 0 + kprintf("exec_prefault: 0x%016llx 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n", + (uint64_t)load_result->all_image_info_addr, + all_image_infos.infos32.version, + (uint64_t)notification_address, + (uint64_t)dyld_image_address, + (uint64_t)dyld_version_address, + (uint64_t)dyld_all_image_infos_address); +#endif + + vm_fault( current_map(), + vm_map_trunc_page(notification_address + dyld_slide_amount), + VM_PROT_READ | VM_PROT_EXECUTE, + FALSE, + THREAD_UNINT, NULL, 0); + vm_fault( current_map(), + vm_map_trunc_page(dyld_image_address + dyld_slide_amount), + VM_PROT_READ | VM_PROT_EXECUTE, + FALSE, + THREAD_UNINT, NULL, 0); + vm_fault( current_map(), + vm_map_trunc_page(dyld_version_address + dyld_slide_amount), + VM_PROT_READ, + FALSE, + THREAD_UNINT, NULL, 0); + vm_fault( current_map(), + vm_map_trunc_page(dyld_all_image_infos_address + dyld_slide_amount), + VM_PROT_READ | VM_PROT_WRITE, + FALSE, + THREAD_UNINT, NULL, 0); + } + } +} diff --git a/bsd/kern/kern_exit.c b/bsd/kern/kern_exit.c index 6d04dd6a9..4e9f418be 100644 --- a/bsd/kern/kern_exit.c +++ b/bsd/kern/kern_exit.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -119,6 +119,14 @@ #include #include +#if VM_PRESSURE_EVENTS +#include +#endif + +#if CONFIG_MEMORYSTATUS +#include +#endif + #if CONFIG_DTRACE /* Do not include dtrace.h, it redefines kmem_[alloc/free] */ extern void (*dtrace_fasttrap_exit_ptr)(proc_t); @@ -140,7 +148,7 @@ extern void dtrace_lazy_dofs_destroy(proc_t); #include extern char init_task_failure_data[]; -void proc_prepareexit(proc_t p, int rv); +void proc_prepareexit(proc_t p, int rv, boolean_t perf_notify); void vfork_exit(proc_t p, int rv); void vproc_exit(proc_t p); __private_extern__ void munge_user64_rusage(struct rusage *a_rusage_p, struct user64_rusage *a_user_rusage_p); @@ -156,15 +164,15 @@ int wait1continue(int result); int waitidcontinue(int result); int *get_bsduthreadrval(thread_t); kern_return_t sys_perf_notify(thread_t thread, int pid); -kern_return_t abnormal_exit_notify(mach_exception_data_type_t code, - mach_exception_data_type_t subcode); +kern_return_t task_exception_notify(exception_type_t exception, + mach_exception_data_type_t code, mach_exception_data_type_t subcode); void delay(int); /* * NOTE: Source and target may *NOT* overlap! * XXX Should share code with bsd/dev/ppc/unix_signal.c */ -static void +void siginfo_user_to_user32(user_siginfo_t *in, user32_siginfo_t *out) { out->si_signo = in->si_signo; @@ -179,7 +187,7 @@ siginfo_user_to_user32(user_siginfo_t *in, user32_siginfo_t *out) out->si_band = in->si_band; /* range reduction */ } -static void +void siginfo_user_to_user64(user_siginfo_t *in, user64_siginfo_t *out) { out->si_signo = in->si_signo; @@ -194,6 +202,24 @@ siginfo_user_to_user64(user_siginfo_t *in, user64_siginfo_t *out) out->si_band = in->si_band; /* range reduction */ } +static int +copyoutsiginfo(user_siginfo_t *native, boolean_t is64, user_addr_t uaddr) +{ + if (is64) { + user64_siginfo_t sinfo64; + + bzero(&sinfo64, sizeof (sinfo64)); + siginfo_user_to_user64(native, &sinfo64); + return (copyout(&sinfo64, uaddr, sizeof (sinfo64))); + } else { + user32_siginfo_t sinfo32; + + bzero(&sinfo32, sizeof (sinfo32)); + siginfo_user_to_user32(native, &sinfo32); + return (copyout(&sinfo32, uaddr, sizeof (sinfo32))); + } +} + /* * exit -- * Death of process. @@ -218,10 +244,17 @@ exit(proc_t p, struct exit_args *uap, int *retval) */ int exit1(proc_t p, int rv, int *retval) +{ + return exit1_internal(p, rv, retval, TRUE, TRUE); +} + +int +exit1_internal(proc_t p, int rv, int *retval, boolean_t thread_can_terminate, boolean_t perf_notify) { thread_t self = current_thread(); struct task *task = p->task; struct uthread *ut; + int error = 0; /* * If a thread in this task has already @@ -231,10 +264,14 @@ exit1(proc_t p, int rv, int *retval) ut = get_bsdthread_info(self); if (ut->uu_flag & UT_VFORK) { - vfork_exit(p, rv); - vfork_return(p , retval, p->p_pid); - unix_syscall_return(0); - /* NOT REACHED */ + if (!thread_can_terminate) { + return EINVAL; + } + + vfork_exit(p, rv); + vfork_return(p , retval, p->p_pid); + unix_syscall_return(0); + /* NOT REACHED */ } /* @@ -254,8 +291,30 @@ exit1(proc_t p, int rv, int *retval) DTRACE_PROC1(exit, int, CLD_EXITED); + /* mark that the process is going to exit and pull it out of the DBG/disk throttle */ + proc_removethrottle(p); + +#if CONFIG_MEMORYSTATUS + memorystatus_list_remove(p->p_pid); +#endif + proc_lock(p); - proc_transstart(p, 1); + error = proc_transstart(p, 1); + if (error == EDEADLK) { + /* Temp: A deadlock error implies that a multithreaded exec is + * in progress.
Instead of letting exit continue and + * corrupting the freed memory, let the exiting thread + * return. This avoids corruption in the remote-termination + * case. + */ + proc_unlock(p); + if (current_proc() == p) { + thread_exception_return(); + } else { + /* external termination like jetsam */ + return(error); + } + } + while (p->exit_thread != self) { if (sig_try_locked(p) <= 0) { proc_transend(p, 1); @@ -264,7 +323,12 @@ exit1(proc_t p, int rv, int *retval) return(0); } proc_unlock(p); + thread_terminate(self); + if (!thread_can_terminate) { + return 0; + } + thread_exception_return(); /* NOTREACHED */ } @@ -287,7 +351,7 @@ exit1(proc_t p, int rv, int *retval) proc_transend(p, 1); proc_unlock(p); - proc_prepareexit(p, rv); + proc_prepareexit(p, rv, perf_notify); /* Last thread to terminate will call proc_exit() */ task_terminate_internal(task); @@ -296,7 +360,7 @@ exit1(proc_t p, int rv, int *retval) } void -proc_prepareexit(proc_t p, int rv) +proc_prepareexit(proc_t p, int rv, boolean_t perf_notify) { mach_exception_data_type_t code, subcode; struct uthread *ut; @@ -323,12 +387,14 @@ proc_prepareexit(proc_t p, int rv) ((ut->uu_exception & 0x0f) << 20) | ((int)ut->uu_code & 0xfffff); subcode = ut->uu_subcode; - (void) abnormal_exit_notify(code, subcode); + (void) task_exception_notify(EXC_CRASH, code, subcode); } skipcheck: - /* Notify the perf server */ - (void)sys_perf_notify(self, p->p_pid); + /* Notify the perf server? */ + if (perf_notify) { + (void)sys_perf_notify(self, p->p_pid); + } /* * Remove proc from allproc queue and from pidhash chain. @@ -386,7 +452,7 @@ proc_exit(proc_t p) p->p_lflag |= P_LEXIT; proc_transend(p, 1); proc_unlock(p); - proc_prepareexit(p, 0); + proc_prepareexit(p, 0, TRUE); (void) task_terminate_internal(task); proc_lock(p); } else { @@ -410,8 +476,9 @@ proc_exit(proc_t p) proc_unlock(p); pid = p->p_pid; exitval = p->p_xstat; - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT) | DBG_FUNC_START, - pid, exitval, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, + BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT) | DBG_FUNC_START, + pid, exitval, 0, 0, 0); #if CONFIG_DTRACE /* @@ -451,6 +518,10 @@ proc_exit(proc_t p) nspace_proc_exit(p); +#if VM_PRESSURE_EVENTS + vm_pressure_proc_cleanup(p); +#endif + /* * need to cancel async IO requests that can be cancelled and wait for those * already active. MAY BLOCK! @@ -458,6 +529,9 @@ proc_exit(proc_t p) proc_refdrain(p); + /* if any pending cpu limits action, clear it */ + task_clear_cpuusage(p->task); + workqueue_mark_exiting(p); workqueue_exit(p); @@ -480,6 +554,12 @@ proc_exit(proc_t p) throttle_lowpri_io(FALSE); } +#if !CONFIG_EMBEDDED + if (p->p_legacy_behavior & PROC_LEGACY_BEHAVIOR_IOTHROTTLE) { + throttle_legacy_process_decr(); + } +#endif + #if SYSV_SHM /* Close ref SYSV Shared memory */ if (p->vm_shm) @@ -501,8 +581,7 @@ proc_exit(proc_t p) struct vnode *ttyvp; int ttyvid; struct vfs_context context; - struct tty * tp; - + struct tty *tp; /* * Controlling process. @@ -510,55 +589,43 @@ proc_exit(proc_t p) * drain controlling terminal * and revoke access to controlling terminal.
*/ + session_lock(sessp); tp = SESSION_TP(sessp); - if ((tp != TTY_NULL) && (tp->t_session == sessp)) { - tty_pgsignal(tp, SIGHUP, 1); - - session_lock(sessp); - /* reget potentially tp due to revocation */ - tp = SESSION_TP(sessp); - ttyvp = sessp->s_ttyvp; - ttyvid = sessp->s_ttyvid; - sessp->s_ttyvp = NULLVP; - sessp->s_ttyvid = 0; - sessp->s_ttyp = TTY_NULL; - sessp->s_ttypgrpid = NO_PID; session_unlock(sessp); - if ((ttyvp != NULLVP) && (vnode_getwithvid(ttyvp, ttyvid) == 0)) { + tty_pgsignal(tp, SIGHUP, 1); - if (tp != TTY_NULL) { - tty_lock(tp); - (void) ttywait(tp); - tty_unlock(tp); - } - context.vc_thread = proc_thread(p); /* XXX */ - context.vc_ucred = kauth_cred_proc_ref(p); - VNOP_REVOKE(ttyvp, REVOKEALL, &context); - vnode_put(ttyvp); - kauth_cred_unref(&context.vc_ucred); - } - } else { session_lock(sessp); - /* reget potentially tp due to revocation */ tp = SESSION_TP(sessp); - ttyvp = sessp->s_ttyvp; - sessp->s_ttyvp = NULLVP; - sessp->s_ttyvid = 0; - sessp->s_ttyp = TTY_NULL; - sessp->s_ttypgrpid = NO_PID; - session_unlock(sessp); + } + ttyvp = sessp->s_ttyvp; + ttyvid = sessp->s_ttyvid; + sessp->s_ttyvp = NULLVP; + sessp->s_ttyvid = 0; + sessp->s_ttyp = TTY_NULL; + sessp->s_ttypgrpid = NO_PID; + session_unlock(sessp); + + if ((ttyvp != NULLVP) && (vnode_getwithvid(ttyvp, ttyvid) == 0)) { + if (tp != TTY_NULL) { + tty_lock(tp); + (void) ttywait(tp); + tty_unlock(tp); + } + context.vc_thread = proc_thread(p); /* XXX */ + context.vc_ucred = kauth_cred_proc_ref(p); + vnode_rele(ttyvp); + VNOP_REVOKE(ttyvp, REVOKEALL, &context); + vnode_put(ttyvp); + kauth_cred_unref(&context.vc_ucred); + ttyvp = NULLVP; } if (ttyvp) vnode_rele(ttyvp); - /* - * s_ttyp is not zero'd; we use this to indicate - * that the session once had a controlling terminal. - * (for logging and informational purposes) - */ + if (tp) + ttyfree(tp); } - session_lock(sessp); sessp->s_leader = NULL; session_unlock(sessp); @@ -783,8 +850,9 @@ proc_exit(proc_t p) * The write is to an int and is coherent. 
Also parent is * keyed off of list lock for reaping */ - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT) | DBG_FUNC_END, - pid, exitval, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, + BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT) | DBG_FUNC_END, + pid, exitval, 0, 0, 0); p->p_stat = SZOMB; /* * The current process can be reaped so, no one @@ -806,8 +874,9 @@ proc_exit(proc_t p) * keyed off of list lock for reaping */ proc_list_lock(); - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT) | DBG_FUNC_END, - pid, exitval, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, + BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT) | DBG_FUNC_END, + pid, exitval, 0, 0, 0); /* check for sysctl zomb lookup */ while ((p->p_listflag & P_LIST_WAITING) == P_LIST_WAITING) { msleep(&p->p_stat, proc_list_mlock, PWAIT, "waitcoll", 0); @@ -1209,6 +1278,12 @@ out: return (error); } +#if DEBUG +#define ASSERT_LCK_MTX_OWNED(lock) \ + lck_mtx_assert(lock, LCK_MTX_ASSERT_OWNED) +#else +#define ASSERT_LCK_MTX_OWNED(lock) /* nothing */ +#endif int waitidcontinue(int result) @@ -1218,12 +1293,12 @@ waitidcontinue(int result) int *retval; if (result) - return(result); + return (result); thread = current_thread(); vt = get_bsduthreadarg(thread); retval = get_bsduthreadrval(thread); - return(waitid(current_proc(), (struct waitid_args *)vt, retval)); + return (waitid(current_proc(), (struct waitid_args *)vt, retval)); } /* @@ -1232,7 +1307,7 @@ waitidcontinue(int result) * * Parameters: uap->idtype one of P_PID, P_PGID, P_ALL * uap->id pid_t or gid_t or ignored - * uap->infop Address of signinfo_t struct in + * uap->infop Address of siginfo_t struct in * user space into which to return status * uap->options flag values * @@ -1243,33 +1318,24 @@ int waitid(proc_t q, struct waitid_args *uap, int32_t *retval) { __pthread_testcancel(1); - return(waitid_nocancel(q, (struct waitid_nocancel_args *)uap, retval)); + return (waitid_nocancel(q, (struct waitid_nocancel_args *)uap, retval)); } int -waitid_nocancel(proc_t q, struct waitid_nocancel_args *uap, __unused int32_t *retval) +waitid_nocancel(proc_t q, struct waitid_nocancel_args *uap, + __unused int32_t *retval) { - user_siginfo_t collect64; /* siginfo data to return to caller */ - + user_siginfo_t siginfo; /* siginfo data to return to caller */ + boolean_t caller64 = IS_64BIT_PROCESS(q); int nfound; proc_t p; int error; - /* - * Forced validation of options for T.waitpid 21; should be a TSD! - * This will pass the test, but note that we have more bits than the - * standard specifies that we will allow in, in this case. The test - * passes because they light all the bits, not just the ones we allow, - * and so the following check returns EINVAL like the test wants. - */ - if (((uap->options & (WNOHANG|WNOWAIT|WCONTINUED|WUNTRACED|WSTOPPED|WEXITED)) != uap->options) || - (uap->options == 0)) + if (uap->options == 0 || + (uap->options & ~(WNOHANG|WNOWAIT|WCONTINUED|WSTOPPED|WEXITED))) return (EINVAL); /* bits set that aren't recognized */ - /* - * Overly critical options checking, per POSIX - */ - switch(uap->idtype) { + switch (uap->idtype) { case P_PID: /* child with process ID equal to... */ case P_PGID: /* child with process group ID equal to... */ if (((int)uap->id) < 0) @@ -1284,7 +1350,8 @@ loop: loop1: nfound = 0; for (p = q->p_children.lh_first; p != 0; p = p->p_sibling.le_next) { - switch(uap->idtype) { + + switch (uap->idtype) { case P_PID: /* child with process ID equal to... 
*/ if (p->p_pid != (pid_t)uap->id) continue; @@ -1304,68 +1371,44 @@ loop1: * the single return for waited process guarantee. */ if (p->p_listflag & P_LIST_WAITING) { - (void)msleep(&p->p_stat, proc_list_mlock, PWAIT, "waitidcoll", 0); + (void) msleep(&p->p_stat, proc_list_mlock, + PWAIT, "waitidcoll", 0); goto loop1; } p->p_listflag |= P_LIST_WAITING; /* mark busy */ nfound++; - /* - * Types of processes we are interested in - * - * XXX Don't know what to do for WCONTINUED?!? - */ - switch(p->p_stat) { + bzero(&siginfo, sizeof (siginfo)); + + switch (p->p_stat) { case SZOMB: /* Exited */ if (!(uap->options & WEXITED)) break; - - /* drop the lock and the thread is going to return */ proc_list_unlock(); +#if CONFIG_MACF + if ((error = mac_proc_check_wait(q, p)) != 0) + goto out; +#endif + siginfo.si_signo = SIGCHLD; + siginfo.si_pid = p->p_pid; + siginfo.si_status = WEXITSTATUS(p->p_xstat); + if (WIFSIGNALED(p->p_xstat)) { + siginfo.si_code = WCOREDUMP(p->p_xstat) ? + CLD_DUMPED : CLD_KILLED; + } else + siginfo.si_code = CLD_EXITED; - /* Collect "siginfo" information for caller */ - collect64.si_signo = SIGCHLD; - collect64.si_code = 0; - collect64.si_errno = 0; - collect64.si_pid = 0; - collect64.si_uid = 0; - collect64.si_addr = 0; - collect64.si_status = WEXITSTATUS(p->p_xstat); - collect64.si_band = 0; - - if (IS_64BIT_PROCESS(p)) { - user64_siginfo_t sinfo64; - - siginfo_user_to_user64(&collect64, &sinfo64); - - error = copyout((caddr_t)&sinfo64, - uap->infop, - sizeof(sinfo64)); - } else { - user32_siginfo_t sinfo32; - - siginfo_user_to_user32(&collect64, &sinfo32); - - error = copyout((caddr_t)&sinfo32, - uap->infop, - sizeof(sinfo32)); - } - /* information unavailable? */ - if (error) + if ((error = copyoutsiginfo(&siginfo, + caller64, uap->infop)) != 0) goto out; /* Prevent other process for waiting for this event? */ if (!(uap->options & WNOWAIT)) { - /* Clean up */ - (void)reap_child_locked(q, p, 0, 0, 0); - } else { - proc_list_lock(); - p->p_listflag &= ~P_LIST_WAITING; - proc_list_unlock(); + (void) reap_child_locked(q, p, 0, 0, 0); + return (0); } - - return (0); + goto out; case SSTOP: /* Stopped */ /* @@ -1381,41 +1424,18 @@ loop1: */ if ((p->p_lflag & P_LWAITED) != 0) break; - - /* drop the lock and the thread is going to return */ proc_list_unlock(); +#if CONFIG_MACF + if ((error = mac_proc_check_wait(q, p)) != 0) + goto out; +#endif + siginfo.si_signo = SIGCHLD; + siginfo.si_pid = p->p_pid; + siginfo.si_status = p->p_xstat; /* signal number */ + siginfo.si_code = CLD_STOPPED; - /* Collect "siginfo" information for caller */ - collect64.si_signo = SIGCHLD; - collect64.si_code = 0; - collect64.si_errno = 0; - collect64.si_pid = 0; - collect64.si_uid = 0; - collect64.si_addr = 0; - proc_lock(p); - collect64.si_status = p->p_xstat; - proc_unlock(p); - collect64.si_band = 0; - - if (IS_64BIT_PROCESS(p)) { - user64_siginfo_t sinfo64; - - siginfo_user_to_user64(&collect64, &sinfo64); - - error = copyout((caddr_t)&sinfo64, - uap->infop, - sizeof(sinfo64)); - } else { - user32_siginfo_t sinfo32; - - siginfo_user_to_user32(&collect64, &sinfo32); - - error = copyout((caddr_t)&sinfo32, - uap->infop, - sizeof(sinfo32)); - } - /* information unavailable? */ - if (error) + if ((error = copyoutsiginfo(&siginfo, + caller64, uap->infop)) != 0) goto out; /* Prevent other process for waiting for this event? 
*/ @@ -1424,12 +1444,9 @@ loop1: p->p_lflag |= P_LWAITED; proc_unlock(p); } - - error = 0; goto out; - default: /* All others */ - /* ...meaning Continued */ + default: /* All other states => Continued */ if (!(uap->options & WCONTINUED)) break; @@ -1440,60 +1457,40 @@ loop1: */ if ((p->p_flag & P_CONTINUED) == 0) break; - - /* drop the lock and the thread is going to return */ proc_list_unlock(); - - /* Collect "siginfo" information for caller */ +#if CONFIG_MACF + if ((error = mac_proc_check_wait(q, p)) != 0) + goto out; +#endif + siginfo.si_signo = SIGCHLD; + siginfo.si_code = CLD_CONTINUED; proc_lock(p); - collect64.si_signo = SIGCHLD; - collect64.si_code = CLD_CONTINUED; - collect64.si_errno = 0; - collect64.si_pid = p->p_contproc; - collect64.si_uid = 0; - collect64.si_addr = 0; - collect64.si_status = p->p_xstat; - collect64.si_band = 0; + siginfo.si_pid = p->p_contproc; + siginfo.si_status = p->p_xstat; proc_unlock(p); - if (IS_64BIT_PROCESS(p)) { - user64_siginfo_t sinfo64; - - siginfo_user_to_user64(&collect64, &sinfo64); - - error = copyout((caddr_t)&sinfo64, - uap->infop, - sizeof(sinfo64)); - } else { - user32_siginfo_t sinfo32; - - siginfo_user_to_user32(&collect64, &sinfo32); - - error = copyout((caddr_t)&sinfo32, - uap->infop, - sizeof(sinfo32)); - } - /* information unavailable? */ - if (error) + if ((error = copyoutsiginfo(&siginfo, + caller64, uap->infop)) != 0) goto out; /* Prevent other process for waiting for this event? */ if (!(uap->options & WNOWAIT)) { - OSBitAndAtomic(~((uint32_t)P_CONTINUED), &p->p_flag); + OSBitAndAtomic(~((uint32_t)P_CONTINUED), + &p->p_flag); } - - error = 0; goto out; } - /* LIST LOCK IS HELD HERE */ + ASSERT_LCK_MTX_OWNED(proc_list_mlock); + /* Not a process we are interested in; go on to next child */ - + p->p_listflag &= ~P_LIST_WAITING; wakeup(&p->p_stat); } + ASSERT_LCK_MTX_OWNED(proc_list_mlock); - /* list lock is always held */ /* No child processes that could possibly satisfy the request? */ + if (nfound == 0) { proc_list_unlock(); return (ECHILD); @@ -1501,10 +1498,24 @@ loop1: if (uap->options & WNOHANG) { proc_list_unlock(); +#if CONFIG_MACF + if ((error = mac_proc_check_wait(q, p)) != 0) + return (error); +#endif + /* + * The state of the siginfo structure in this case + * is undefined. Some implementations bzero it, some + * (like here) leave it untouched for efficiency. + * + * Thus the most portable check for "no matching pid with + * WNOHANG" is to store a zero into si_pid before + * invocation, then check for a non-zero value afterwards. + */ return (0); } - if ((error = msleep0((caddr_t)q, proc_list_mlock, PWAIT | PCATCH | PDROP, "waitid", 0, waitidcontinue))) + if ((error = msleep0(q, proc_list_mlock, + PWAIT | PCATCH | PDROP, "waitid", 0, waitidcontinue)) != 0) return (error); goto loop; @@ -1682,6 +1693,12 @@ vproc_exit(proc_t p) */ fdfree(p); +#if !CONFIG_EMBEDDED + if (p->p_legacy_behavior & PROC_LEGACY_BEHAVIOR_IOTHROTTLE) { + throttle_legacy_process_decr(); + } +#endif + sessp = proc_session(p); if (SESS_LEADER(p, sessp)) { @@ -1689,7 +1706,7 @@ vproc_exit(proc_t p) struct vnode *ttyvp; int ttyvid; struct vfs_context context; - struct tty * tp; + struct tty *tp; /* * Controlling process. @@ -1697,54 +1714,43 @@ vproc_exit(proc_t p) * drain controlling terminal * and revoke access to controlling terminal. 
*/ + session_lock(sessp); tp = SESSION_TP(sessp); - if ((tp != TTY_NULL) && (tp->t_session == sessp)) { + session_unlock(sessp); + tty_pgsignal(tp, SIGHUP, 1); - tty_lock(tp); - (void) ttywait(tp); - tty_unlock(tp); - /* - * The tty could have been revoked - * if we blocked. - */ session_lock(sessp); - /* reget in case of race */ tp = SESSION_TP(sessp); - ttyvp = sessp->s_ttyvp; - ttyvid = sessp->s_ttyvid; - sessp->s_ttyvp = NULL; - sessp->s_ttyvid = 0; - sessp->s_ttyp = TTY_NULL; - sessp->s_ttypgrpid = NO_PID; - session_unlock(sessp); - - if ((ttyvp != NULLVP) && (vnode_getwithvid(ttyvp, ttyvid) == 0)) { - context.vc_thread = proc_thread(p); /* XXX */ - context.vc_ucred = kauth_cred_proc_ref(p); - VNOP_REVOKE(ttyvp, REVOKEALL, &context); - vnode_put(ttyvp); - kauth_cred_unref(&context.vc_ucred); + } + ttyvp = sessp->s_ttyvp; + ttyvid = sessp->s_ttyvid; + sessp->s_ttyvp = NULL; + sessp->s_ttyvid = 0; + sessp->s_ttyp = TTY_NULL; + sessp->s_ttypgrpid = NO_PID; + session_unlock(sessp); + + if ((ttyvp != NULLVP) && (vnode_getwithvid(ttyvp, ttyvid) == 0)) { + if (tp != TTY_NULL) { + tty_lock(tp); + (void) ttywait(tp); + tty_unlock(tp); } - } else { - session_lock(sessp); - ttyvp = sessp->s_ttyvp; - sessp->s_ttyvp = NULL; - sessp->s_ttyvid = 0; - sessp->s_ttyp = TTY_NULL; - sessp->s_ttypgrpid = NO_PID; - session_unlock(sessp); + context.vc_thread = proc_thread(p); /* XXX */ + context.vc_ucred = kauth_cred_proc_ref(p); + vnode_rele(ttyvp); + VNOP_REVOKE(ttyvp, REVOKEALL, &context); + vnode_put(ttyvp); + kauth_cred_unref(&context.vc_ucred); + ttyvp = NULLVP; } if (ttyvp) vnode_rele(ttyvp); - /* - * s_ttyp is not zero'd; we use this to indicate - * that the session once had a controlling terminal. - * (for logging and informational purposes) - */ + if (tp) + ttyfree(tp); } - session_lock(sessp); sessp->s_leader = NULL; session_unlock(sessp); @@ -1838,13 +1844,13 @@ vproc_exit(proc_t p) #ifdef FIXME if (task) { - task_basic_info_data_t tinfo; + mach_task_basic_info_data_t tinfo; task_thread_times_info_data_t ttimesinfo; int task_info_stuff, task_ttimes_stuff; struct timeval ut,st; - task_info_stuff = TASK_BASIC_INFO_COUNT; - task_info(task, TASK_BASIC_INFO, + task_info_stuff = MACH_TASK_BASIC_INFO_COUNT; + task_info(task, MACH_TASK_BASIC_INFO, &tinfo, &task_info_stuff); p->p_ru->ru_utime.tv_sec = tinfo.user_time.seconds; p->p_ru->ru_utime.tv_usec = tinfo.user_time.microseconds; diff --git a/bsd/kern/kern_fork.c b/bsd/kern/kern_fork.c index 76c1fbae6..37d02887f 100644 --- a/bsd/kern/kern_fork.c +++ b/bsd/kern/kern_fork.c @@ -129,6 +129,10 @@ extern void dtrace_lazy_dofs_duplicate(proc_t, proc_t); #include +#if CONFIG_MEMORYSTATUS +#include +#endif + /* XXX routines which should have Mach prototypes, but don't */ void thread_set_parent(thread_t parent, int pid); extern void act_thread_catt(void *ctx); @@ -158,8 +162,8 @@ void proc_vfork_end(proc_t parent_proc); * Notes: Although this function increments a count, a count in * excess of 1 is not currently supported. 
According to the * POSIX standard, calling anything other than execve() or - * _exit() fillowing a vfork(), including calling vfork() - * itself again, will result in undefned behaviour + * _exit() following a vfork(), including calling vfork() + * itself again, will result in undefined behaviour */ void proc_vfork_begin(proc_t parent_proc) @@ -179,7 +183,7 @@ proc_vfork_begin(proc_t parent_proc) * * Returns: (void) * - * Notes: Decerements the count; currently, reentrancy of vfork() + * Notes: Decrements the count; currently, reentrancy of vfork() * is unsupported on the current process */ void @@ -189,7 +193,6 @@ proc_vfork_end(proc_t parent_proc) parent_proc->p_vforkcnt--; if (parent_proc->p_vforkcnt < 0) panic("vfork cnt is -ve"); - /* resude the vfork count; clear the flag when it goes to 0 */ if (parent_proc->p_vforkcnt == 0) parent_proc->p_lflag &= ~P_LVFORK; proc_unlock(parent_proc); @@ -650,6 +653,12 @@ fork1(proc_t parent_proc, thread_t *child_threadp, int kind) /* return the thread pointer to the caller */ *child_threadp = child_thread; +#if CONFIG_MEMORYSTATUS + if (!err) { + memorystatus_list_add(child_proc->p_pid, DEFAULT_JETSAM_PRIORITY, -1); + } +#endif + bad: /* * In the error case, we return a 0 value for the returned pid (but @@ -671,48 +680,53 @@ bad: * this is done by reassociating the parent process structure * with the task, thread, and uthread. * + * Refer to the ASCII art above vfork() to figure out the + * state we're undoing. + * * Parameters: child_proc Child process * retval System call return value array * rval Return value to present to parent * * Returns: void * - * Note: The caller resumes or exits the parent, as appropriate, after - * callling this function. + * Notes: The caller resumes or exits the parent, as appropriate, after + * calling this function. 
*/ void vfork_return(proc_t child_proc, int32_t *retval, int rval) { - proc_t parent_proc = child_proc->p_pptr; - thread_t parent_thread = (thread_t)current_thread(); - uthread_t parent_uthread = (uthread_t)get_bsdthread_info(parent_thread); + task_t parent_task = get_threadtask(child_proc->p_vforkact); + proc_t parent_proc = get_bsdtask_info(parent_task); + thread_t th = current_thread(); + uthread_t uth = get_bsdthread_info(th); - act_thread_catt(parent_uthread->uu_userstate); + act_thread_catt(uth->uu_userstate); - /* end vfork in parent */ + /* clear vfork state in parent proc structure */ proc_vfork_end(parent_proc); /* REPATRIATE PARENT TASK, THREAD, UTHREAD */ - parent_uthread->uu_userstate = 0; - parent_uthread->uu_flag &= ~UT_VFORK; + uth->uu_userstate = 0; + uth->uu_flag &= ~UT_VFORK; /* restore thread-set-id state */ - if (parent_uthread->uu_flag & UT_WASSETUID) { - parent_uthread->uu_flag |= UT_SETUID; - parent_uthread->uu_flag &= UT_WASSETUID; + if (uth->uu_flag & UT_WASSETUID) { + uth->uu_flag |= UT_SETUID; + uth->uu_flag &= UT_WASSETUID; } - parent_uthread->uu_proc = 0; - parent_uthread->uu_sigmask = parent_uthread->uu_vforkmask; - child_proc->p_lflag &= ~P_LINVFORK; - child_proc->p_vforkact = (void *)0; + uth->uu_proc = 0; + uth->uu_sigmask = uth->uu_vforkmask; + + proc_lock(child_proc); + child_proc->p_lflag &= ~P_LINVFORK; + child_proc->p_vforkact = 0; + proc_unlock(child_proc); - thread_set_parent(parent_thread, rval); + thread_set_parent(th, rval); if (retval) { retval[0] = rval; retval[1] = 0; /* mark parent */ } - - return; } @@ -1006,6 +1020,12 @@ forkproc_free(proc_t p) /* Need to undo the effects of the fdcopy(), if any */ fdfree(p); +#if !CONFIG_EMBEDDED + if (p->p_legacy_behavior & PROC_LEGACY_BEHAVIOR_IOTHROTTLE) { + throttle_legacy_process_decr(); + } +#endif + /* * Drop the reference on a text vnode pointer, if any * XXX This code is broken in forkproc(); see ; @@ -1174,9 +1194,20 @@ retry: * Increase reference counts on shared objects. * The p_stats and p_sigacts substructs are set in vm_fork. */ +#if !CONFIG_EMBEDDED + child_proc->p_flag = (parent_proc->p_flag & (P_LP64 | P_TRANSLATED | P_AFFINITY | P_DISABLE_ASLR | P_DELAYIDLESLEEP)); +#else /* !CONFIG_EMBEDDED */ child_proc->p_flag = (parent_proc->p_flag & (P_LP64 | P_TRANSLATED | P_AFFINITY | P_DISABLE_ASLR)); +#endif /* !CONFIG_EMBEDDED */ if (parent_proc->p_flag & P_PROFIL) startprofclock(child_proc); + +#if !CONFIG_EMBEDDED + if (child_proc->p_legacy_behavior & PROC_LEGACY_BEHAVIOR_IOTHROTTLE) { + throttle_legacy_process_incr(); + } +#endif + /* * Note that if the current thread has an assumed identity, this * credential will be granted to the new process. 
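The vfork() notes above restrict a vforked child to calling only execve() or _exit(). A minimal userland sketch of that single portable pattern (spawn_ls is a hypothetical name; illustrative only, not part of the patch):

	#include <sys/types.h>
	#include <unistd.h>

	/*
	 * Spawn /bin/ls via vfork(). Until execve()/_exit() the child borrows
	 * the parent's address space, so argv/envp are prepared beforehand and
	 * the only variable written after vfork() is the pid_t return value.
	 */
	static int
	spawn_ls(void)
	{
		char *const argv[] = { "ls", NULL };
		char *const envp[] = { NULL };
		pid_t pid;

		pid = vfork();
		if (pid == 0) {
			execve("/bin/ls", argv, envp);	/* replaces the address space */
			_exit(127);	/* reached only if execve() fails */
		}
		return (pid == -1) ? -1 : 0;
	}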
@@ -1319,6 +1350,9 @@ retry: } #endif + /* Default to no tracking of dirty state */ + child_proc->p_dirty = 0; + bad: return(child_proc); } @@ -1393,6 +1427,7 @@ uthread_alloc(task_t task, thread_t thread, int noinherit) p = (proc_t) get_bsdtask_info(task); uth = (uthread_t)ut; uth->uu_kwe.kwe_uth = uth; + uth->uu_thread = thread; /* * Thread inherits credential from the creating thread, if both @@ -1445,6 +1480,9 @@ uthread_alloc(task_t task, thread_t thread, int noinherit) if (p->p_dtrace_ptss_pages != NULL) { uth->t_dtrace_scratch = dtrace_ptss_claim_entry(p); } +#endif +#if CONFIG_MACF + mac_thread_label_init(uth); #endif } @@ -1532,6 +1570,9 @@ uthread_cleanup(task_t task, void *uthread, void * bsd_info) if (tmpptr != NULL) { dtrace_ptss_release_entry(p, tmpptr); } +#endif +#if CONFIG_MACF + mac_thread_label_destroy(uth); #endif } } diff --git a/bsd/kern/kern_lockf.c b/bsd/kern/kern_lockf.c index b7775864c..13e4c97db 100644 --- a/bsd/kern/kern_lockf.c +++ b/bsd/kern/kern_lockf.c @@ -127,11 +127,8 @@ typedef enum { static int lf_clearlock(struct lockf *); static overlap_t lf_findoverlap(struct lockf *, struct lockf *, int, struct lockf ***, struct lockf **); -static struct lockf *lf_getblock(struct lockf *); -static int lf_getlock(struct lockf *, struct flock *); -#if CONFIG_EMBEDDED -static int lf_getlockpid(struct vnode *, struct flock *); -#endif +static struct lockf *lf_getblock(struct lockf *, pid_t); +static int lf_getlock(struct lockf *, struct flock *, pid_t); static int lf_setlock(struct lockf *); static int lf_split(struct lockf *, struct lockf *); static void lf_wakelock(struct lockf *, boolean_t); @@ -174,11 +171,6 @@ lf_advlock(struct vnop_advlock_args *ap) /* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */ -#if CONFIG_EMBEDDED - if (ap->a_op == F_GETLKPID) - return lf_getlockpid(vp, fl); -#endif - /* * Avoid the common case of unlocking when inode has no locks. */ @@ -287,9 +279,16 @@ lf_advlock(struct vnop_advlock_args *ap) break; case F_GETLK: - error = lf_getlock(lock, fl); + error = lf_getlock(lock, fl, -1); + FREE(lock, M_LOCKF); + break; + +#if CONFIG_EMBEDDED + case F_GETLKPID: + error = lf_getlock(lock, fl, fl->l_pid); FREE(lock, M_LOCKF); break; +#endif default: FREE(lock, M_LOCKF); @@ -302,6 +301,36 @@ lf_advlock(struct vnop_advlock_args *ap) return (error); } +/* + * Empty the queue of msleeping requests for a lock on the given vnode. + * Called with the vnode already locked. Used for forced unmount, where + * a flock(2) invoker sleeping on a blocked lock holds an iocount reference + * that prevents the vnode from ever being drained. Force unmounting wins. + */ +void +lf_abort_advlocks(vnode_t vp) +{ + struct lockf *lock; + + if ((lock = vp->v_lockf) == NULL) + return; + + lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); + + if (!TAILQ_EMPTY(&lock->lf_blkhd)) { + struct lockf *tlock; + + TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) { + /* + * Setting this flag should cause all + * currently blocked F_SETLK request to + * return to userland with an errno. + */ + tlock->lf_flags |= F_ABORT; + } + lf_wakelock(lock, TRUE); + } +} /* * Take any lock attempts which are currently blocked by a given lock ("from") @@ -351,8 +380,6 @@ lf_coalesce_adjacent(struct lockf *lock) * NOTE: Assumes that if two locks are adjacent on the number line * and belong to the same owner, then they are adjacent on the list. 
*/ - - /* If the lock ends adjacent to us, we can coelesce it */ if ((*lf)->lf_end != -1 && ((*lf)->lf_end + 1) == lock->lf_start) { struct lockf *adjacent = *lf; @@ -439,7 +466,7 @@ lf_setlock(struct lockf *lock) /* * Scan lock list for this file looking for locks that would block us. */ - while ((block = lf_getblock(lock))) { + while ((block = lf_getblock(lock, -1))) { /* * Free the structure and return if nonblocking. */ @@ -553,10 +580,14 @@ lf_setlock(struct lockf *lock) error = msleep(lock, &vp->v_lock, priority, lockstr, 0); if (!TAILQ_EMPTY(&lock->lf_blkhd)) { - if ((block = lf_getblock(lock))) { + if ((block = lf_getblock(lock, -1))) { lf_move_blocked(block, lock); } } + + if (error == 0 && (lock->lf_flags & F_ABORT) != 0) + error = EBADF; + if (error) { /* XXX */ /* * We may have been awakened by a signal and/or by a @@ -816,6 +847,7 @@ lf_clearlock(struct lockf *unlock) * fl Pointer to flock structure to receive * the blocking lock information, if a * blocking lock is found. + * matchpid -1, or pid value to match in lookup. * * Returns: 0 Success * @@ -828,7 +860,7 @@ lf_clearlock(struct lockf *unlock) * the blocking process ID for advisory record locks. */ static int -lf_getlock(struct lockf *lock, struct flock *fl) +lf_getlock(struct lockf *lock, struct flock *fl, pid_t matchpid) { struct lockf *block; @@ -837,7 +869,7 @@ lf_getlock(struct lockf *lock, struct flock *fl) lf_print("lf_getlock", lock); #endif /* LOCKF_DEBUGGING */ - if ((block = lf_getblock(lock))) { + if ((block = lf_getblock(lock, matchpid))) { fl->l_type = block->lf_type; fl->l_whence = SEEK_SET; fl->l_start = block->lf_start; @@ -855,56 +887,6 @@ lf_getlock(struct lockf *lock, struct flock *fl) return (0); } -#if CONFIG_EMBEDDED -int lf_getlockpid(struct vnode *vp, struct flock *fl) -{ - struct lockf *lf, *blk; - - if (vp == 0) - return EINVAL; - - fl->l_type = F_UNLCK; - - lck_mtx_lock(&vp->v_lock); - - for (lf = vp->v_lockf; lf; lf = lf->lf_next) { - - if (lf->lf_flags & F_POSIX) { - if ((((struct proc *)lf->lf_id)->p_pid) == fl->l_pid) { - fl->l_type = lf->lf_type; - fl->l_whence = SEEK_SET; - fl->l_start = lf->lf_start; - if (lf->lf_end == -1) - fl->l_len = 0; - else - fl->l_len = lf->lf_end - lf->lf_start + 1; - - break; - } - } - - TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) { - if (blk->lf_flags & F_POSIX) { - if ((((struct proc *)blk->lf_id)->p_pid) == fl->l_pid) { - fl->l_type = blk->lf_type; - fl->l_whence = SEEK_SET; - fl->l_start = blk->lf_start; - if (blk->lf_end == -1) - fl->l_len = 0; - else - fl->l_len = blk->lf_end - blk->lf_start + 1; - - break; - } - } - } - } - - lck_mtx_unlock(&vp->v_lock); - return (0); -} -#endif - /* * lf_getblock * @@ -915,29 +897,35 @@ int lf_getlockpid(struct vnode *vp, struct flock *fl) * * Parameters: lock The lock for which we are interested * in obtaining the blocking lock, if any + * matchpid -1, or pid value to match in lookup. * * Returns: NOLOCKF No blocking lock exists * !NOLOCKF The address of the blocking lock's * struct lockf. */ static struct lockf * -lf_getblock(struct lockf *lock) +lf_getblock(struct lockf *lock, pid_t matchpid) { struct lockf **prev, *overlap, *lf = *(lock->lf_head); - int ovcase; - prev = lock->lf_head; - while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap)) != OVERLAP_NONE) { + for (prev = lock->lf_head; + lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != OVERLAP_NONE; + lf = overlap->lf_next) { /* - * We've found an overlap, see if it blocks us + * Found an overlap. 
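+ * (The OTHERS scan means the overlapping lock always belongs to another owner.)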
+ * + * If we're matching pids, and it's a record lock, + * but the pid doesn't match, then keep on looking .. */ - if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) - return (overlap); + if (matchpid != -1 && + (overlap->lf_flags & F_POSIX) != 0 && + proc_pid((struct proc *)(overlap->lf_id)) != matchpid) + continue; /* - * Nope, point to the next one on the list and - * see if it blocks us + * does it block us? */ - lf = overlap->lf_next; + if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) + return (overlap); } return (NOLOCKF); } @@ -970,7 +958,7 @@ lf_getblock(struct lockf *lock) * this is generally used to relink the * lock list, avoiding a second iteration. * *overlap The pointer to the overlapping lock - * itself; this is ussed to return data in + * itself; this is used to return data in * the check == OTHERS case, and for the * caller to modify the overlapping lock, * in the check == SELF case diff --git a/bsd/kern/kern_malloc.c b/bsd/kern/kern_malloc.c index c1700ee51..ee021079a 100644 --- a/bsd/kern/kern_malloc.c +++ b/bsd/kern/kern_malloc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -273,11 +273,11 @@ const char *memname[] = { "fileglob", /* 99 M_FILEGLOB */ "kauth", /* 100 M_KAUTH */ "dummynet", /* 101 M_DUMMYNET */ -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL "unsafe_fsnode", /* 102 M_UNSAFEFS */ #else "", /* 102 M_UNSAFEFS */ -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ "macpipelabel", /* 103 M_MACPIPELABEL */ "mactemp", /* 104 M_MACTEMP */ "sbuf", /* 105 M_SBUF */ @@ -459,11 +459,11 @@ struct kmzones { { SOS(fileglob), KMZ_CREATEZONE, TRUE }, /* 99 M_FILEGLOB */ { 0, KMZ_MALLOC, FALSE }, /* 100 M_KAUTH */ { 0, KMZ_MALLOC, FALSE }, /* 101 M_DUMMYNET */ -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL { SOS(unsafe_fsnode),KMZ_CREATEZONE, TRUE }, /* 102 M_UNSAFEFS */ #else { 0, KMZ_MALLOC, FALSE }, /* 102 M_UNSAFEFS */ -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ { 0, KMZ_MALLOC, FALSE }, /* 103 M_MACPIPELABEL */ { 0, KMZ_MALLOC, FALSE }, /* 104 M_MACTEMP */ { 0, KMZ_MALLOC, FALSE }, /* 105 M_SBUF */ @@ -562,11 +562,23 @@ _MALLOC( return (NULL); if (flags & M_NOWAIT) { - hdr = (void *)kalloc_noblock(memsize); + if (size > memsize) /* overflow detected */ + return (NULL); + else + hdr = (void *)kalloc_noblock(memsize); } else { - hdr = (void *)kalloc(memsize); - - if (hdr == NULL) { + if (size > memsize) { + /* + * We get here when the caller told us to block, waiting for memory but an overflow + * has been detected. The caller isn't expecting a NULL return code so we panic + * with a descriptive message. 
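+ * (The M_NOWAIT path above handles the same overflow by returning NULL instead.)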
+ */ + panic("_MALLOC: overflow detected, size %llu ", (uint64_t) size); + } + else + hdr = (void *)kalloc(memsize); + + if (hdr == NULL) { /* * We get here when the caller told us to block waiting for memory, but diff --git a/bsd/kern/kern_memorystatus.c b/bsd/kern/kern_memorystatus.c index 489ddd2be..1bf8dd616 100644 --- a/bsd/kern/kern_memorystatus.c +++ b/bsd/kern/kern_memorystatus.c @@ -27,245 +27,895 @@ * */ -#include -#include - #include #include +#include #include #include #include #include +#include #include +#include #include #include +#include +#include #include #include #include #include +#include #include #include +#include #include #if CONFIG_FREEZE #include #include +#endif -enum { - kProcessSuspended = (1 << 0), - kProcessHibernated = (1 << 1), - kProcessNoReclaimWorth = (1 << 2), - kProcessIgnored = (1 << 3), - kProcessBusy = (1 << 4) -}; +#include -static lck_mtx_t * hibernation_mlock; -static lck_attr_t * hibernation_lck_attr; -static lck_grp_t * hibernation_lck_grp; -static lck_grp_attr_t * hibernation_lck_grp_attr; - -typedef struct hibernation_node { - RB_ENTRY(hibernation_node) link; - pid_t pid; - uint32_t state; - mach_timespec_t hibernation_ts; -} hibernation_node; - -static int hibernation_tree_compare(hibernation_node *n1, hibernation_node *n2) { - if (n1->pid < n2->pid) - return -1; - else if (n1->pid > n2->pid) - return 1; - else - return 0; -} +/* These are very verbose printfs(), enable with + * MEMORYSTATUS_DEBUG_LOG + */ +#if MEMORYSTATUS_DEBUG_LOG +#define MEMORYSTATUS_DEBUG(cond, format, ...) \ +do { \ + if (cond) { printf(format, ##__VA_ARGS__); } \ +} while(0) +#else +#define MEMORYSTATUS_DEBUG(cond, format, ...) +#endif -static RB_HEAD(hibernation_tree, hibernation_node) hibernation_tree_head; -RB_PROTOTYPE_SC(static, hibernation_tree, hibernation_node, link, hibernation_tree_compare); +/* General memorystatus stuff */ -RB_GENERATE(hibernation_tree, hibernation_node, link, hibernation_tree_compare); +static void memorystatus_add_node(memorystatus_node *node); +static void memorystatus_remove_node(memorystatus_node *node); +static memorystatus_node *memorystatus_get_node(pid_t pid); +static void memorystatus_release_node(memorystatus_node *node); -static inline boolean_t kern_hibernation_can_hibernate_processes(void); -static boolean_t kern_hibernation_can_hibernate(void); +int memorystatus_wakeup = 0; -static void kern_hibernation_add_node(hibernation_node *node); -static hibernation_node *kern_hibernation_get_node(pid_t pid); -static void kern_hibernation_release_node(hibernation_node *node); -static void kern_hibernation_free_node(hibernation_node *node, boolean_t unlock); +static void memorystatus_thread(void *param __unused, wait_result_t wr __unused); -static void kern_hibernation_register_pid(pid_t pid); -static void kern_hibernation_unregister_pid(pid_t pid); +static memorystatus_node *next_memorystatus_node = NULL; -static int kern_hibernation_get_process_state(pid_t pid, uint32_t *state, mach_timespec_t *ts); -static int kern_hibernation_set_process_state(pid_t pid, uint32_t state); +static int memorystatus_list_count = 0; -static void kern_hibernation_cull(void); +static lck_mtx_t * memorystatus_list_mlock; +static lck_attr_t * memorystatus_lck_attr; +static lck_grp_t * memorystatus_lck_grp; +static lck_grp_attr_t * memorystatus_lck_grp_attr; -static void kern_hibernation_thread(void); +static TAILQ_HEAD(memorystatus_list_head, memorystatus_node) memorystatus_list; -extern boolean_t vm_freeze_enabled; +static uint64_t 
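
/*
 * [Editor's note] The _MALLOC hunk above guards the "header + size" sum:
 * if the addition wraps around, memsize comes out smaller than size, so the
 * "size > memsize" test catches the overflow before kalloc()/kalloc_noblock()
 * is ever reached. A minimal userspace sketch of the same idiom follows;
 * checked_malloc and alloc_hdr are illustrative names, not xnu API.
 */
#include <stdlib.h>

struct alloc_hdr { size_t mlen; };

static void *
checked_malloc(size_t size)
{
    size_t memsize = sizeof(struct alloc_hdr) + size;

    if (size > memsize)     /* overflow detected: the addition wrapped */
        return NULL;

    struct alloc_hdr *hdr = malloc(memsize);
    if (hdr == NULL)
        return NULL;
    hdr->mlen = memsize;    /* remember the true allocation length */
    return hdr + 1;         /* hand back the space past the header */
}
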
memorystatus_idle_delay_time = 0; -int kern_hibernation_wakeup = 0; +static unsigned int memorystatus_dirty_count = 0; -static int jetsam_priority_list_hibernation_index = 0; +extern void proc_dirty_start(struct proc *p); +extern void proc_dirty_end(struct proc *p); -/* Thresholds */ -static int kern_memorystatus_level_hibernate = 50; +/* Jetsam */ + +#if CONFIG_JETSAM + +extern unsigned int vm_page_free_count; +extern unsigned int vm_page_active_count; +extern unsigned int vm_page_inactive_count; +extern unsigned int vm_page_throttled_count; +extern unsigned int vm_page_purgeable_count; +extern unsigned int vm_page_wire_count; + +static lck_mtx_t * exit_list_mlock; + +static TAILQ_HEAD(exit_list_head, memorystatus_node) exit_list; + +static unsigned int memorystatus_kev_failure_count = 0; + +/* Counted in pages... */ +unsigned int memorystatus_delta = 0; + +unsigned int memorystatus_available_pages = (unsigned int)-1; +unsigned int memorystatus_available_pages_critical = 0; +unsigned int memorystatus_available_pages_highwater = 0; + +/* ...with the exception of the legacy level in percent. */ +unsigned int memorystatus_level = 0; + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_kev_failure_count, CTLFLAG_RD, &memorystatus_kev_failure_count, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD, &memorystatus_level, 0, ""); + +unsigned int memorystatus_jetsam_policy = kPolicyDefault; + +unsigned int memorystatus_jetsam_policy_offset_pages_more_free = 0; +#if DEVELOPMENT || DEBUG +unsigned int memorystatus_jetsam_policy_offset_pages_diagnostic = 0; +#endif + +static memorystatus_jetsam_snapshot_t memorystatus_jetsam_snapshot; +#define memorystatus_jetsam_snapshot_list memorystatus_jetsam_snapshot.entries + +static int memorystatus_jetsam_snapshot_list_count = 0; + +int memorystatus_jetsam_wakeup = 0; +unsigned int memorystatus_jetsam_running = 1; + +static uint32_t memorystatus_task_page_count(task_t task); + +static void memorystatus_move_node_to_exit_list(memorystatus_node *node); + +static void memorystatus_update_levels_locked(void); + +static void memorystatus_jetsam_thread_block(void); +static void memorystatus_jetsam_thread(void *param __unused, wait_result_t wr __unused); + +static int memorystatus_send_note(int event_code, void *data, size_t data_length); + +static uint32_t memorystatus_build_flags_from_state(uint32_t state); -#define HIBERNATION_PAGES_MIN ( 1 * 1024 * 1024 / PAGE_SIZE) -#define HIBERNATION_PAGES_MAX (16 * 1024 * 1024 / PAGE_SIZE) +/* VM pressure */ -static unsigned int kern_memorystatus_hibernation_pages_min = HIBERNATION_PAGES_MIN; -static unsigned int kern_memorystatus_hibernation_pages_max = HIBERNATION_PAGES_MAX; +#if VM_PRESSURE_EVENTS -static unsigned int kern_memorystatus_suspended_count = 0; -static unsigned int kern_memorystatus_hibernated_count = 0; +typedef enum vm_pressure_level { + kVMPressureNormal = 0, + kVMPressureWarning = 1, + kVMPressureUrgent = 2, + kVMPressureCritical = 3, +} vm_pressure_level_t; -static unsigned int kern_memorystatus_hibernation_suspended_minimum = 4; +static vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal; -static unsigned int kern_memorystatus_low_swap_pages = 0; +unsigned int memorystatus_available_pages_pressure = 0; + +static inline boolean_t memorystatus_get_pressure_locked(void); +static void memorystatus_check_pressure_reset(void); + +#endif /* VM_PRESSURE_EVENTS */ + +#endif /* CONFIG_JETSAM */ + +/* Freeze */ + +#if CONFIG_FREEZE + +static unsigned int 
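
/*
 * [Editor's note] memorystatus_level is the one percentage left in an
 * otherwise page-based scheme; memorystatus_update(), later in this patch,
 * derives it as available_pages * 100 / total_pages. A self-contained
 * illustration of that conversion:
 */
#include <stdio.h>
#include <stdint.h>

static unsigned int
level_percent(uint64_t pages_avail, uint64_t pages_total)
{
    /* guard the divide; the kernel can rely on max_mem being nonzero */
    return pages_total ? (unsigned int)(pages_avail * 100 / pages_total) : 0;
}

int
main(void)
{
    printf("%u%%\n", level_percent(52000, 262144));    /* prints 19% */
    return 0;
}
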
memorystatus_suspended_resident_count = 0; +static unsigned int memorystatus_suspended_count = 0; + +boolean_t memorystatus_freeze_enabled = FALSE; +int memorystatus_freeze_wakeup = 0; + +static inline boolean_t memorystatus_can_freeze_processes(void); +static boolean_t memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low); + +static void memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused); + +/* Thresholds */ +static unsigned int memorystatus_freeze_threshold = 0; + +static unsigned int memorystatus_freeze_pages_min = FREEZE_PAGES_MIN; +static unsigned int memorystatus_freeze_pages_max = FREEZE_PAGES_MAX; + +static unsigned int memorystatus_frozen_count = 0; + +static unsigned int memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT; + +/* Stats */ +static uint64_t memorystatus_freeze_count = 0; +static uint64_t memorystatus_freeze_pageouts = 0; /* Throttling */ -#define HIBERNATION_DAILY_MB_MAX 1024 -#define HIBERNATION_DAILY_PAGEOUTS_MAX (HIBERNATION_DAILY_MB_MAX * (1024 * 1024 / PAGE_SIZE)) - -static struct throttle_interval_t { - uint32_t mins; - uint32_t burst_multiple; - uint32_t pageouts; - uint32_t max_pageouts; - mach_timespec_t ts; - boolean_t throttle; -} throttle_intervals[] = { - { 60, 8, 0, 0, { 0, 0 }, FALSE }, /* 1 hour intermediate interval, 8x burst */ +static throttle_interval_t throttle_intervals[] = { + { 60, 8, 0, 0, { 0, 0 }, FALSE }, /* 1 hour intermediate interval, 8x burst */ { 24 * 60, 1, 0, 0, { 0, 0 }, FALSE }, /* 24 hour long interval, no burst */ }; -/* Stats */ -static uint64_t kern_memorystatus_hibernation_count = 0; -SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_hibernation_count, CTLFLAG_RD, &kern_memorystatus_hibernation_count, ""); +static uint64_t memorystatus_freeze_throttle_count = 0; -static uint64_t kern_memorystatus_hibernation_pageouts = 0; -SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_hibernation_pageouts, CTLFLAG_RD, &kern_memorystatus_hibernation_pageouts, ""); +#endif /* CONFIG_FREEZE */ -static uint64_t kern_memorystatus_hibernation_throttle_count = 0; -SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_hibernation_throttle_count, CTLFLAG_RD, &kern_memorystatus_hibernation_throttle_count, ""); +#if CONFIG_JETSAM -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_hibernation_min_processes, CTLFLAG_RW, &kern_memorystatus_hibernation_suspended_minimum, 0, ""); +/* Debug */ #if DEVELOPMENT || DEBUG -/* Allow parameter tweaking in these builds */ -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_level_hibernate, CTLFLAG_RW, &kern_memorystatus_level_hibernate, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_hibernation_pages_min, CTLFLAG_RW, &kern_memorystatus_hibernation_pages_min, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_hibernation_pages_max, CTLFLAG_RW, &kern_memorystatus_hibernation_pages_max, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD, &memorystatus_available_pages, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical, CTLFLAG_RW, &memorystatus_available_pages_critical, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_highwater, CTLFLAG_RW, &memorystatus_available_pages_highwater, 0, ""); +#if VM_PRESSURE_EVENTS +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW, &memorystatus_available_pages_pressure, 0, ""); +#endif /* VM_PRESSURE_EVENTS */ + +/* Diagnostic code */ +enum { + kJetsamDiagnosticModeNone = 0, + kJetsamDiagnosticModeAll = 1, + kJetsamDiagnosticModeStopAtFirstActive = 2, + 
kJetsamDiagnosticModeCount +} jetsam_diagnostic_mode = kJetsamDiagnosticModeNone; + +static int jetsam_diagnostic_suspended_one_active_proc = 0; + +static int +sysctl_jetsam_diagnostic_mode SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + + const char *diagnosticStrings[] = { + "jetsam: diagnostic mode: resetting critical level.", + "jetsam: diagnostic mode: will examine all processes", + "jetsam: diagnostic mode: will stop at first active process" + }; + + int error, val = jetsam_diagnostic_mode; + boolean_t changed = FALSE; + + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || !req->newptr) + return (error); + if ((val < 0) || (val >= kJetsamDiagnosticModeCount)) { + printf("jetsam: diagnostic mode: invalid value - %d\n", val); + return EINVAL; + } + + lck_mtx_lock(memorystatus_list_mlock); + + if ((unsigned int) val != jetsam_diagnostic_mode) { + jetsam_diagnostic_mode = val; + + memorystatus_jetsam_policy &= ~kPolicyDiagnoseActive; + + switch (jetsam_diagnostic_mode) { + case kJetsamDiagnosticModeNone: + /* Already cleared */ + break; + case kJetsamDiagnosticModeAll: + memorystatus_jetsam_policy |= kPolicyDiagnoseAll; + break; + case kJetsamDiagnosticModeStopAtFirstActive: + memorystatus_jetsam_policy |= kPolicyDiagnoseFirst; + break; + default: + /* Already validated */ + break; + } + + memorystatus_update_levels_locked(); + changed = TRUE; + } + + lck_mtx_unlock(memorystatus_list_mlock); + + if (changed) { + printf("%s\n", diagnosticStrings[val]); + } + + return (0); +} + +SYSCTL_PROC(_debug, OID_AUTO, jetsam_diagnostic_mode, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, + &jetsam_diagnostic_mode, 0, sysctl_jetsam_diagnostic_mode, "I", "Jetsam Diagnostic Mode"); + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jetsam_policy_offset_pages_more_free, CTLFLAG_RW, &memorystatus_jetsam_policy_offset_pages_more_free, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jetsam_policy_offset_pages_diagnostic, CTLFLAG_RW, &memorystatus_jetsam_policy_offset_pages_diagnostic, 0, ""); + +#if VM_PRESSURE_EVENTS + +#include "vm_pressure.h" + +static int +sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp) + int error = 0; + + error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0); + if (error) + return (error); + + return SYSCTL_OUT(req, &memorystatus_vm_pressure_level, sizeof(memorystatus_vm_pressure_level)); +} + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_LOCKED|CTLFLAG_MASKED, + 0, 0, &sysctl_memorystatus_vm_pressure_level, "I", ""); + +static int +sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + + int error, pid = 0; + + error = sysctl_handle_int(oidp, &pid, 0, req); + if (error || !req->newptr) + return (error); + + if (vm_dispatch_pressure_note_to_pid(pid)) { + return 0; + } + + return EINVAL; +} + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED, + 0, 0, &sysctl_memorystatus_vm_pressure_send, "I", ""); + +#endif /* VM_PRESSURE_EVENTS */ + +#endif /* CONFIG_JETSAM */ + +#if CONFIG_FREEZE + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW, &memorystatus_freeze_threshold, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW, &memorystatus_freeze_pages_min, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW, &memorystatus_freeze_pages_max, 0, ""); + +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_count, 
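
/*
 * [Editor's note] The knobs in this block are ordinary sysctls, so the
 * readable ones can be queried from userspace with sysctlbyname(3). A
 * small consumer, assuming a kernel built with this patch so that
 * kern.memorystatus_level (declared earlier in this file) is present:
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int
main(void)
{
    int level = 0;
    size_t len = sizeof(level);

    if (sysctlbyname("kern.memorystatus_level", &level, &len, NULL, 0) == -1) {
        perror("sysctlbyname");
        return 1;
    }
    printf("memorystatus level: %d%%\n", level);
    return 0;
}
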
CTLFLAG_RD, &memorystatus_freeze_count, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD, &memorystatus_freeze_pageouts, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_throttle_count, CTLFLAG_RD, &memorystatus_freeze_throttle_count, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW, &memorystatus_freeze_suspended_threshold, 0, ""); + +boolean_t memorystatus_freeze_throttle_enabled = TRUE; +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW, &memorystatus_freeze_throttle_enabled, 0, ""); + +/* + * Manual trigger of freeze and thaw for dev / debug kernels only. + */ +static int +sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + + int error, pid = 0; + proc_t p; + + error = sysctl_handle_int(oidp, &pid, 0, req); + if (error || !req->newptr) + return (error); + + p = proc_find(pid); + if (p != NULL) { + uint32_t purgeable, wired, clean, dirty; + boolean_t shared; + uint32_t max_pages = MIN(default_pager_swap_pages_free(), memorystatus_freeze_pages_max); + task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE); + proc_rele(p); + return 0; + } + + return EINVAL; +} + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freeze, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED, + 0, 0, &sysctl_memorystatus_freeze, "I", ""); + +static int +sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + + int error, pid = 0; + proc_t p; + + error = sysctl_handle_int(oidp, &pid, 0, req); + if (error || !req->newptr) + return (error); + + p = proc_find(pid); + if (p != NULL) { + task_thaw(p->task); + proc_rele(p); + return 0; + } + + return EINVAL; +} + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_thaw, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED, + 0, 0, &sysctl_memorystatus_available_pages_thaw, "I", ""); -boolean_t kern_memorystatus_hibernation_throttle_enabled = TRUE; -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_hibernation_throttle_enabled, CTLFLAG_RW, &kern_memorystatus_hibernation_throttle_enabled, 0, ""); -#endif /* DEVELOPMENT || DEBUG */ #endif /* CONFIG_FREEZE */ -extern unsigned int vm_page_free_count; -extern unsigned int vm_page_active_count; -extern unsigned int vm_page_inactive_count; -extern unsigned int vm_page_purgeable_count; -extern unsigned int vm_page_wire_count; +#endif /* DEVELOPMENT || DEBUG */ + +__private_extern__ void +memorystatus_init(void) +{ + thread_t thread = THREAD_NULL; + kern_return_t result; + + memorystatus_lck_attr = lck_attr_alloc_init(); + memorystatus_lck_grp_attr = lck_grp_attr_alloc_init(); + memorystatus_lck_grp = lck_grp_alloc_init("memorystatus", memorystatus_lck_grp_attr); + memorystatus_list_mlock = lck_mtx_alloc_init(memorystatus_lck_grp, memorystatus_lck_attr); + TAILQ_INIT(&memorystatus_list); + +#if CONFIG_JETSAM + exit_list_mlock = lck_mtx_alloc_init(memorystatus_lck_grp, memorystatus_lck_attr); + TAILQ_INIT(&exit_list); + + memorystatus_delta = DELTA_PERCENT * atop_64(max_mem) / 100; +#endif + +#if CONFIG_FREEZE + memorystatus_freeze_threshold = (FREEZE_PERCENT / DELTA_PERCENT) * memorystatus_delta; +#endif + + nanoseconds_to_absolutetime((uint64_t)IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_idle_delay_time); + + result = kernel_thread_start(memorystatus_thread, NULL, &thread); + if (result == KERN_SUCCESS) { + thread_deallocate(thread); + } else { + panic("Could not create memorystatus_thread"); + } + +#if CONFIG_JETSAM + 
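
/*
 * [Editor's note] memorystatus_init() above uses the usual xnu idiom for
 * fire-and-forget kernel threads: kernel_thread_start() hands back a
 * reference on the new thread, and the creator immediately drops it with
 * thread_deallocate() because it will never join the thread. A loose
 * userspace analogue with pthreads (the idiom, not the xnu API, is what
 * is being illustrated):
 */
#include <pthread.h>
#include <stdlib.h>

static void *
worker(void *arg)
{
    (void)arg;
    /* park here waiting for work, as memorystatus_thread() does */
    return NULL;
}

int
main(void)
{
    pthread_t thread;

    if (pthread_create(&thread, NULL, worker, NULL) != 0) {
        abort();    /* memorystatus_init() panics on failure */
    }
    pthread_detach(thread);    /* drop our reference, like thread_deallocate() */
    return 0;
}
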
memorystatus_jetsam_policy_offset_pages_more_free = (POLICY_MORE_FREE_OFFSET_PERCENT / DELTA_PERCENT) * memorystatus_delta; +#if DEVELOPMENT || DEBUG + memorystatus_jetsam_policy_offset_pages_diagnostic = (POLICY_DIAGNOSTIC_OFFSET_PERCENT / DELTA_PERCENT) * memorystatus_delta; +#endif + + /* No contention at this point */ + memorystatus_update_levels_locked(); + + result = kernel_thread_start(memorystatus_jetsam_thread, NULL, &thread); + if (result == KERN_SUCCESS) { + thread_deallocate(thread); + } else { + panic("Could not create memorystatus_jetsam_thread"); + } +#endif +} + +/* + * Node manipulation + */ + +static void +memorystatus_add_node(memorystatus_node *new_node) +{ + memorystatus_node *node; + + /* Make sure we're called with the list lock held */ + lck_mtx_assert(memorystatus_list_mlock, LCK_MTX_ASSERT_OWNED); + + TAILQ_FOREACH(node, &memorystatus_list, link) { + if (node->priority <= new_node->priority) { + break; + } + } + + if (node) { + TAILQ_INSERT_BEFORE(node, new_node, link); + } else { + TAILQ_INSERT_TAIL(&memorystatus_list, new_node, link); + } + + next_memorystatus_node = TAILQ_FIRST(&memorystatus_list); + + memorystatus_list_count++; +} + +static void +memorystatus_remove_node(memorystatus_node *node) +{ + /* Make sure we're called with the list lock held */ + lck_mtx_assert(memorystatus_list_mlock, LCK_MTX_ASSERT_OWNED); + + TAILQ_REMOVE(&memorystatus_list, node, link); + next_memorystatus_node = TAILQ_FIRST(&memorystatus_list); + +#if CONFIG_FREEZE + if (node->state & (kProcessFrozen)) { + memorystatus_frozen_count--; + } + + if (node->state & kProcessSuspended) { + memorystatus_suspended_resident_count -= node->resident_pages; + memorystatus_suspended_count--; + } +#endif + + memorystatus_list_count--; +} + +/* Returns with the lock taken if found */ +static memorystatus_node * +memorystatus_get_node(pid_t pid) +{ + memorystatus_node *node; + + lck_mtx_lock(memorystatus_list_mlock); + + TAILQ_FOREACH(node, &memorystatus_list, link) { + if (node->pid == pid) { + break; + } + } + + if (!node) { + lck_mtx_unlock(memorystatus_list_mlock); + } + + return node; +} + +static void +memorystatus_release_node(memorystatus_node *node) +{ +#pragma unused(node) + lck_mtx_unlock(memorystatus_list_mlock); +} + +/* + * List manipulation + */ + +kern_return_t +memorystatus_list_add(pid_t pid, int priority, int high_water_mark) +{ + +#if !CONFIG_JETSAM +#pragma unused(high_water_mark) +#endif + + memorystatus_node *new_node; + + new_node = (memorystatus_node*)kalloc(sizeof(memorystatus_node)); + if (!new_node) { + assert(FALSE); + } + memset(new_node, 0, sizeof(memorystatus_node)); + + MEMORYSTATUS_DEBUG(1, "memorystatus_list_add: adding process %d with priority %d, high water mark %d.\n", pid, priority, high_water_mark); + + new_node->pid = pid; + new_node->priority = priority; +#if CONFIG_JETSAM + new_node->hiwat_pages = high_water_mark; +#endif + + lck_mtx_lock(memorystatus_list_mlock); + + memorystatus_add_node(new_node); + + lck_mtx_unlock(memorystatus_list_mlock); + + return KERN_SUCCESS; +} + +kern_return_t +memorystatus_list_change(boolean_t effective, pid_t pid, int priority, int state_flags, int high_water_mark) +{ + +#if !CONFIG_JETSAM +#pragma unused(high_water_mark) +#endif + + kern_return_t ret; + memorystatus_node *node, *search; + + MEMORYSTATUS_DEBUG(1, "memorystatus_list_change: changing process %d to priority %d with flags %d\n", pid, priority, state_flags); + + lck_mtx_lock(memorystatus_list_mlock); + + TAILQ_FOREACH(node, &memorystatus_list, link) { + if 
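
/*
 * [Editor's note] memorystatus_add_node() above keeps the list ordered by
 * priority band, with the largest priority value (the least important
 * process, and therefore the first jetsam victim) at the head. A
 * self-contained sketch of the same insertion with <sys/queue.h>:
 */
#include <stdio.h>
#include <sys/queue.h>

struct qnode {
    TAILQ_ENTRY(qnode) link;
    int priority;    /* larger value = less important, as in the patch */
};

TAILQ_HEAD(qnode_list, qnode);

static void
add_in_order(struct qnode_list *head, struct qnode *new_node)
{
    struct qnode *n;

    /* Stop at the first entry no more important than the new one... */
    TAILQ_FOREACH(n, head, link) {
        if (n->priority <= new_node->priority)
            break;
    }
    /* ...and insert in front of it, or at the tail if none was found. */
    if (n)
        TAILQ_INSERT_BEFORE(n, new_node, link);
    else
        TAILQ_INSERT_TAIL(head, new_node, link);
}

int
main(void)
{
    struct qnode_list head = TAILQ_HEAD_INITIALIZER(head);
    struct qnode a = { .priority = 5 }, b = { .priority = 9 }, c = { .priority = 5 };
    struct qnode *n;

    add_in_order(&head, &a);
    add_in_order(&head, &b);
    add_in_order(&head, &c);
    TAILQ_FOREACH(n, &head, link)
        printf("%d ", n->priority);    /* prints: 9 5 5 */
    printf("\n");
    return 0;
}
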
(node->pid == pid) { + break; + } + } + + if (!node) { + ret = KERN_FAILURE; + goto out; + } + + if (effective && (node->state & kProcessPriorityUpdated)) { + MEMORYSTATUS_DEBUG(1, "memorystatus_list_change: effective change specified for pid %d, but change already occurred.\n", pid); + ret = KERN_FAILURE; + goto out; + } + + node->state |= kProcessPriorityUpdated; + + if (state_flags != -1) { + node->state &= ~(kProcessActive|kProcessForeground); + if (state_flags & kMemorystatusFlagsFrontmost) { + node->state |= kProcessForeground; + } + if (state_flags & kMemorystatusFlagsActive) { + node->state |= kProcessActive; + } + } + +#if CONFIG_JETSAM + if (high_water_mark != -1) { + node->hiwat_pages = high_water_mark; + } +#endif + + if (node->priority == priority) { + /* Priority unchanged */ + MEMORYSTATUS_DEBUG(1, "memorystatus_list_change: same priority set for pid %d\n", pid); + ret = KERN_SUCCESS; + goto out; + } + + if (node->priority < priority) { + /* Higher priority value (ie less important) - search backwards */ + search = TAILQ_PREV(node, memorystatus_list_head, link); + TAILQ_REMOVE(&memorystatus_list, node, link); + + node->priority = priority; + while (search && (search->priority <= node->priority)) { + search = TAILQ_PREV(search, memorystatus_list_head, link); + } + if (search) { + TAILQ_INSERT_AFTER(&memorystatus_list, search, node, link); + } else { + TAILQ_INSERT_HEAD(&memorystatus_list, node, link); + } + } else { + /* Lower priority value (ie more important) - search forwards */ + search = TAILQ_NEXT(node, link); + TAILQ_REMOVE(&memorystatus_list, node, link); + + node->priority = priority; + while (search && (search->priority >= node->priority)) { + search = TAILQ_NEXT(search, link); + } + if (search) { + TAILQ_INSERT_BEFORE(search, node, link); + } else { + TAILQ_INSERT_TAIL(&memorystatus_list, node, link); + } + } + + next_memorystatus_node = TAILQ_FIRST(&memorystatus_list); + ret = KERN_SUCCESS; + +out: + lck_mtx_unlock(memorystatus_list_mlock); + return ret; +} + +kern_return_t memorystatus_list_remove(pid_t pid) +{ + kern_return_t ret; + memorystatus_node *node = NULL; + + MEMORYSTATUS_DEBUG(1, "memorystatus_list_remove: removing process %d\n", pid); + +#if CONFIG_JETSAM + /* Did we mark this as a exited process? */ + lck_mtx_lock(exit_list_mlock); + + TAILQ_FOREACH(node, &exit_list, link) { + if (node->pid == pid) { + /* We did, so remove it from the list. The stats were updated when the queues were shifted. 
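[Editor's note: "when the queues were shifted" refers to memorystatus_move_node_to_exit_list(), defined later in this file, which runs memorystatus_remove_node(), settling memorystatus_list_count and the freeze counters, before parking the node on the exit queue; only the exit-queue unlink is left to do here.]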
*/ + TAILQ_REMOVE(&exit_list, node, link); + break; + } + } + + lck_mtx_unlock(exit_list_mlock); +#endif + + /* If not, search the main list */ + if (!node) { + lck_mtx_lock(memorystatus_list_mlock); + + TAILQ_FOREACH(node, &memorystatus_list, link) { + if (node->pid == pid) { + /* Remove from the list, and update accounting accordingly */ + memorystatus_remove_node(node); + break; + } + } + + lck_mtx_unlock(memorystatus_list_mlock); + } + + if (node) { + kfree(node, sizeof(memorystatus_node)); + ret = KERN_SUCCESS; + } else { + ret = KERN_FAILURE; + } + + return ret; +} + +kern_return_t +memorystatus_on_track_dirty(int pid, boolean_t track) +{ + kern_return_t ret = KERN_FAILURE; + memorystatus_node *node; + + node = memorystatus_get_node((pid_t)pid); + if (!node) { + return KERN_FAILURE; + } + + if (track & !(node->state & kProcessSupportsIdleExit)) { + node->state |= kProcessSupportsIdleExit; + node->clean_time = mach_absolute_time() + memorystatus_idle_delay_time; + ret = KERN_SUCCESS; + } else if (!track & (node->state & kProcessSupportsIdleExit)) { + node->state &= ~kProcessSupportsIdleExit; + node->clean_time = 0; + ret = KERN_SUCCESS; + } + + memorystatus_release_node(node); + + return ret; +} -static void kern_memorystatus_thread(void); +kern_return_t +memorystatus_on_dirty(int pid, boolean_t dirty) +{ + kern_return_t ret = KERN_FAILURE; + memorystatus_node *node; + + node = memorystatus_get_node((pid_t)pid); + if (!node) { + return KERN_FAILURE; + } + + if (dirty) { + if (!(node->state & kProcessDirty)) { + node->state |= kProcessDirty; + node->clean_time = 0; + memorystatus_dirty_count++; + ret = KERN_SUCCESS; + } + } else { + if (node->state & kProcessDirty) { + node->state &= ~kProcessDirty; + node->clean_time = mach_absolute_time() + memorystatus_idle_delay_time; + memorystatus_dirty_count--; + ret = KERN_SUCCESS; + } + } + + memorystatus_release_node(node); + + return ret; +} -int kern_memorystatus_wakeup = 0; -int kern_memorystatus_level = 0; -int kern_memorystatus_last_level = 0; -unsigned int kern_memorystatus_delta; +void +memorystatus_on_suspend(int pid) +{ + memorystatus_node *node = memorystatus_get_node((pid_t)pid); -unsigned int kern_memorystatus_kev_failure_count = 0; -int kern_memorystatus_level_critical = 5; -#define kern_memorystatus_level_highwater (kern_memorystatus_level_critical + 5) + if (node) { +#if CONFIG_FREEZE + proc_t p; -static struct { - jetsam_kernel_stats_t stats; - size_t entry_count; - jetsam_snapshot_entry_t entries[kMaxSnapshotEntries]; -} jetsam_snapshot; + p = proc_find(pid); + if (p != NULL) { + uint32_t pages = memorystatus_task_page_count(p->task); + proc_rele(p); + node->resident_pages = pages; + memorystatus_suspended_resident_count += pages; + } + memorystatus_suspended_count++; +#endif -static jetsam_priority_entry_t jetsam_priority_list[kMaxPriorityEntries]; -#define jetsam_snapshot_list jetsam_snapshot.entries + node->state |= kProcessSuspended; -static int jetsam_priority_list_index = 0; -static int jetsam_priority_list_count = 0; -static int jetsam_snapshot_list_count = 0; + memorystatus_release_node(node); + } +} -static lck_mtx_t * jetsam_list_mlock; -static lck_attr_t * jetsam_lck_attr; -static lck_grp_t * jetsam_lck_grp; -static lck_grp_attr_t * jetsam_lck_grp_attr; +void +memorystatus_on_resume(int pid) +{ + memorystatus_node *node = memorystatus_get_node((pid_t)pid); -SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_memorystatus_level, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, 
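
/*
 * [Editor's note] Both hooks above arm the idle-exit timer by stamping
 * clean_time = mach_absolute_time() + memorystatus_idle_delay_time, the
 * delay having been converted from seconds exactly once in
 * memorystatus_init() via nanoseconds_to_absolutetime(). The same deadline
 * arithmetic is available to userspace through the public mach_time API;
 * the 10s value below is purely illustrative, since IDLE_EXIT_TIME_SECS is
 * not visible in this hunk.
 */
#include <stdio.h>
#include <mach/mach_time.h>

int
main(void)
{
    mach_timebase_info_data_t tb;
    uint64_t delay_abs, deadline;

    mach_timebase_info(&tb);
    /* convert 10s worth of nanoseconds into absolute-time units */
    delay_abs = 10ULL * 1000000000ULL * tb.denom / tb.numer;
    deadline = mach_absolute_time() + delay_abs;
    printf("idle-exit deadline (abs units): %llu\n",
        (unsigned long long)deadline);
    return 0;
}
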
memorystatus_kev_failure_count, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_memorystatus_kev_failure_count, 0, ""); + if (node) { +#if CONFIG_FREEZE + boolean_t frozen = (node->state & kProcessFrozen); + if (node->state & (kProcessFrozen)) { + memorystatus_frozen_count--; + } + memorystatus_suspended_resident_count -= node->resident_pages; + memorystatus_suspended_count--; +#endif -#if DEVELOPMENT || DEBUG + node->state &= ~(kProcessSuspended | kProcessFrozen | kProcessIgnored); -enum { - kJetsamDiagnosticModeNone = 0, - kJetsamDiagnosticModeAll = 1, - kJetsamDiagnosticModeStopAtFirstActive = 2 -} jetsam_diagnostic_mode = kJetsamDiagnosticModeNone; + memorystatus_release_node(node); -static int jetsam_diagnostic_suspended_one_active_proc = 0; +#if CONFIG_FREEZE + if (frozen) { + memorystatus_freeze_entry_t data = { pid, kMemorystatusFlagsThawed, 0 }; + memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data)); + } +#endif + } +} -static int -sysctl_jetsam_diagnostic_mode SYSCTL_HANDLER_ARGS +void +memorystatus_on_inactivity(int pid) { -#pragma unused(arg1, arg2) - int error, val = jetsam_diagnostic_mode; - boolean_t disabled; +#pragma unused(pid) +#if CONFIG_FREEZE + /* Wake the freeze thread */ + thread_wakeup((event_t)&memorystatus_freeze_wakeup); +#endif +} - error = sysctl_handle_int(oidp, &val, 0, req); - if (error || !req->newptr) - return (error); - if ((val < 0) || (val > 2)) { - printf("jetsam: diagnostic mode: invalid value - %d\n", val); - return (0); +static void +memorystatus_thread(void *param __unused, wait_result_t wr __unused) +{ + static boolean_t initialized = FALSE; + memorystatus_node *node; + uint64_t current_time; + pid_t victim_pid = -1; + + if (initialized == FALSE) { + initialized = TRUE; + assert_wait(&memorystatus_wakeup, THREAD_UNINT); + (void)thread_block((thread_continue_t)memorystatus_thread); } + + /* Pick next idle exit victim. For now, just iterate through; ideally, this would be be more intelligent. 
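[Editor's note: the scan below is a linear walk of memorystatus_list under the list lock, taking the first clean process that supports idle exit and whose clean_time deadline has passed; a queue ordered by clean_time would presumably be the "more intelligent" alternative alluded to here.]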
*/ + current_time = mach_absolute_time(); - /* - * If jetsam_diagnostic_mode is set, we need to lower memory threshold for jetsam - */ - disabled = (val == 0) && (jetsam_diagnostic_mode != kJetsamDiagnosticModeNone); + /* Set a cutoff so that we don't idle exit processes that went recently clean */ - jetsam_diagnostic_mode = val; + lck_mtx_lock(memorystatus_list_mlock); - if (disabled) { - kern_memorystatus_level_critical = 5; - printf("jetsam: diagnostic mode: resetting critical level to %d\n", kern_memorystatus_level_critical); - } else { - kern_memorystatus_level_critical = 10; - printf("jetsam: diagnostic mode: %d: increasing critical level to %d\n", (int) jetsam_diagnostic_mode, kern_memorystatus_level_critical); - if (jetsam_diagnostic_mode == kJetsamDiagnosticModeStopAtFirstActive) - printf("jetsam: diagnostic mode: will stop at first active app\n"); + if (memorystatus_dirty_count) { + TAILQ_FOREACH(node, &memorystatus_list, link) { + if ((node->state & kProcessSupportsIdleExit) && !(node->state & (kProcessDirty|kProcessIgnoreIdleExit))) { + if (current_time >= node->clean_time) { + victim_pid = node->pid; + break; + } + } + } } - - return (0); -} - -SYSCTL_PROC(_debug, OID_AUTO, jetsam_diagnostic_mode, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, - &jetsam_diagnostic_mode, 0, sysctl_jetsam_diagnostic_mode, "I", "Jetsam Diagnostic Mode"); -#endif /* DEVELOPMENT || DEBUG */ -__private_extern__ void -kern_memorystatus_init(void) -{ - jetsam_lck_attr = lck_attr_alloc_init(); - jetsam_lck_grp_attr= lck_grp_attr_alloc_init(); - jetsam_lck_grp = lck_grp_alloc_init("jetsam", jetsam_lck_grp_attr); - jetsam_list_mlock = lck_mtx_alloc_init(jetsam_lck_grp, jetsam_lck_attr); - kern_memorystatus_delta = 5 * atop_64(max_mem) / 100; + lck_mtx_unlock(memorystatus_list_mlock); + + if (-1 != victim_pid) { + proc_t p = proc_find(victim_pid); + if (p != NULL) { + boolean_t kill = FALSE; + proc_dirty_start(p); + /* Ensure process is still marked for idle exit and is clean */ + if ((p->p_dirty & (P_DIRTY_ALLOW_IDLE_EXIT|P_DIRTY_IS_DIRTY|P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) { + /* Clean; issue SIGKILL */ + p->p_dirty |= P_DIRTY_TERMINATED; + kill = TRUE; + } + proc_dirty_end(p); + if (TRUE == kill) { + printf("memorystatus_thread: idle exiting pid %d [%s]\n", victim_pid, (p->p_comm ? 
p->p_comm : "(unknown)")); + psignal(p, SIGKILL); + } + proc_rele(p); + } + } - (void)kernel_thread(kernel_task, kern_memorystatus_thread); + assert_wait(&memorystatus_wakeup, THREAD_UNINT); + (void)thread_block((thread_continue_t)memorystatus_thread); } +#if CONFIG_JETSAM + static uint32_t -jetsam_task_page_count(task_t task) +memorystatus_task_page_count(task_t task) { kern_return_t ret; static task_info_data_t data; @@ -279,704 +929,644 @@ jetsam_task_page_count(task_t task) return 0; } +static int +memorystatus_send_note(int event_code, void *data, size_t data_length) { + int ret; + struct kev_msg ev_msg; + + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_SYSTEM_CLASS; + ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS; + + ev_msg.event_code = event_code; + + ev_msg.dv[0].data_length = data_length; + ev_msg.dv[0].data_ptr = data; + ev_msg.dv[1].data_length = 0; + + ret = kev_post_msg(&ev_msg); + if (ret) { + memorystatus_kev_failure_count++; + printf("%s: kev_post_msg() failed, err %d\n", __func__, ret); + } + + return ret; +} + static uint32_t -jetsam_flags_for_pid(pid_t pid) +memorystatus_build_flags_from_state(uint32_t state) { + uint32_t flags = 0; + + if (state & kProcessForeground) { + flags |= kMemorystatusFlagsFrontmost; + } + if (state & kProcessActive) { + flags |= kMemorystatusFlagsActive; + } + if (state & kProcessSupportsIdleExit) { + flags |= kMemorystatusFlagsSupportsIdleExit; + } + if (state & kProcessDirty) { + flags |= kMemorystatusFlagsDirty; + } + + return flags; +} + +static void +memorystatus_move_node_to_exit_list(memorystatus_node *node) { - int i; + /* Make sure we're called with the list lock held */ + lck_mtx_assert(memorystatus_list_mlock, LCK_MTX_ASSERT_OWNED); + + /* Now, acquire the exit list lock... */ + lck_mtx_lock(exit_list_mlock); + + /* Remove from list + update accounting... 
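[Editor's note: the accounting lives in memorystatus_remove_node(), which decrements memorystatus_list_count and, under CONFIG_FREEZE, the frozen/suspended counters; note also the lock ordering here, the list lock first and the exit-list lock nested inside it.]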
*/ + memorystatus_remove_node(node); + + /* ...then insert at the end of the exit queue */ + TAILQ_INSERT_TAIL(&exit_list, node, link); + + /* And relax */ + lck_mtx_unlock(exit_list_mlock); +} - for (i = 0; i < jetsam_priority_list_count; i++) { - if (pid == jetsam_priority_list[i].pid) { - return jetsam_priority_list[i].flags; +void memorystatus_update(unsigned int pages_avail) +{ + if (!memorystatus_delta) { + return; + } + + if ((pages_avail < memorystatus_available_pages_critical) || + (pages_avail >= (memorystatus_available_pages + memorystatus_delta)) || + (memorystatus_available_pages >= (pages_avail + memorystatus_delta))) { + memorystatus_available_pages = pages_avail; + memorystatus_level = memorystatus_available_pages * 100 / atop_64(max_mem); + /* Only wake the thread if currently blocked */ + if (OSCompareAndSwap(0, 1, &memorystatus_jetsam_running)) { + thread_wakeup((event_t)&memorystatus_jetsam_wakeup); } } - return 0; +} + +static boolean_t +memorystatus_get_snapshot_properties_for_proc_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry) +{ + memorystatus_node *node; + + TAILQ_FOREACH(node, &memorystatus_list, link) { + if (node->pid == p->p_pid) { + break; + } + } + + if (!node) { + return FALSE; + } + + entry->pid = p->p_pid; + strlcpy(&entry->name[0], p->p_comm, MAXCOMLEN+1); + entry->priority = node->priority; + entry->pages = memorystatus_task_page_count(p->task); + entry->flags = memorystatus_build_flags_from_state(node->state); + memcpy(&entry->uuid[0], &p->p_uuid[0], sizeof(p->p_uuid)); + + return TRUE; } static void -jetsam_snapshot_procs(void) +memorystatus_jetsam_snapshot_procs_locked(void) { proc_t p; int i = 0; - jetsam_snapshot.stats.free_pages = vm_page_free_count; - jetsam_snapshot.stats.active_pages = vm_page_active_count; - jetsam_snapshot.stats.inactive_pages = vm_page_inactive_count; - jetsam_snapshot.stats.purgeable_pages = vm_page_purgeable_count; - jetsam_snapshot.stats.wired_pages = vm_page_wire_count; + memorystatus_jetsam_snapshot.stats.free_pages = vm_page_free_count; + memorystatus_jetsam_snapshot.stats.active_pages = vm_page_active_count; + memorystatus_jetsam_snapshot.stats.inactive_pages = vm_page_inactive_count; + memorystatus_jetsam_snapshot.stats.throttled_pages = vm_page_throttled_count; + memorystatus_jetsam_snapshot.stats.purgeable_pages = vm_page_purgeable_count; + memorystatus_jetsam_snapshot.stats.wired_pages = vm_page_wire_count; proc_list_lock(); LIST_FOREACH(p, &allproc, p_list) { - task_t task = p->task; - jetsam_snapshot_list[i].pid = p->p_pid; - jetsam_snapshot_list[i].pages = jetsam_task_page_count(task); - jetsam_snapshot_list[i].flags = jetsam_flags_for_pid(p->p_pid); - strlcpy(&jetsam_snapshot_list[i].name[0], p->p_comm, MAXCOMLEN+1); -#ifdef DEBUG - printf("jetsam snapshot pid = %d, uuid = %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", + if (FALSE == memorystatus_get_snapshot_properties_for_proc_locked(p, &memorystatus_jetsam_snapshot_list[i])) { + continue; + } + + MEMORYSTATUS_DEBUG(0, "jetsam snapshot pid = %d, uuid = %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", p->p_pid, p->p_uuid[0], p->p_uuid[1], p->p_uuid[2], p->p_uuid[3], p->p_uuid[4], p->p_uuid[5], p->p_uuid[6], p->p_uuid[7], p->p_uuid[8], p->p_uuid[9], p->p_uuid[10], p->p_uuid[11], p->p_uuid[12], p->p_uuid[13], p->p_uuid[14], p->p_uuid[15]); -#endif - memcpy(&jetsam_snapshot_list[i].uuid[0], &p->p_uuid[0], sizeof(p->p_uuid)); - i++; - if (i == kMaxSnapshotEntries) { + + if (++i == kMaxSnapshotEntries) { 
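
/*
 * [Editor's note] memorystatus_update() above applies hysteresis: it only
 * records a new page count (and wakes the jetsam thread, gated by a
 * compare-and-swap so that only a blocked thread is woken) when the value
 * crosses the critical floor or moves by at least memorystatus_delta in
 * either direction. A self-contained mirror of that test:
 */
#include <stdio.h>
#include <stdbool.h>

static bool
significant_change(unsigned int avail, unsigned int last,
    unsigned int critical, unsigned int delta)
{
    return (avail < critical) ||
        (avail >= last + delta) ||
        (last >= avail + delta);
}

int
main(void)
{
    printf("%d\n", significant_change(1000, 1100, 200, 150));    /* 0: within delta */
    printf("%d\n", significant_change(1000, 1200, 200, 150));    /* 1: fell by >= delta */
    return 0;
}
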
break; } } proc_list_unlock(); - jetsam_snapshot.entry_count = jetsam_snapshot_list_count = i - 1; + memorystatus_jetsam_snapshot.snapshot_time = mach_absolute_time(); + memorystatus_jetsam_snapshot.entry_count = memorystatus_jetsam_snapshot_list_count = i - 1; } static void -jetsam_mark_pid_in_snapshot(pid_t pid, int flags) +memorystatus_mark_pid_in_snapshot(pid_t pid, int flags) { - int i = 0; - for (i = 0; i < jetsam_snapshot_list_count; i++) { - if (jetsam_snapshot_list[i].pid == pid) { - jetsam_snapshot_list[i].flags |= flags; + for (i = 0; i < memorystatus_jetsam_snapshot_list_count; i++) { + if (memorystatus_jetsam_snapshot_list[i].pid == pid) { + memorystatus_jetsam_snapshot_list[i].flags |= flags; return; } } } int -jetsam_kill_top_proc(boolean_t any, uint32_t cause) +memorystatus_kill_top_proc(boolean_t any, uint32_t cause) { proc_t p; + int pending_snapshot = 0; #ifndef CONFIG_FREEZE #pragma unused(any) #endif + + lck_mtx_lock(memorystatus_list_mlock); - if (jetsam_snapshot_list_count == 0) { - jetsam_snapshot_procs(); + if (memorystatus_jetsam_snapshot_list_count == 0) { + memorystatus_jetsam_snapshot_procs_locked(); + } else { + pending_snapshot = 1; } - lck_mtx_lock(jetsam_list_mlock); - while (jetsam_priority_list_index < jetsam_priority_list_count) { - jetsam_priority_entry_t* jetsam_priority_entry = &jetsam_priority_list[jetsam_priority_list_index]; - pid_t aPid = jetsam_priority_entry->pid; + + while (next_memorystatus_node) { + memorystatus_node *node; + pid_t aPid; +#if DEVELOPMENT || DEBUG + int activeProcess; + int procSuspendedForDiagnosis; +#endif /* DEVELOPMENT || DEBUG */ + + node = next_memorystatus_node; + next_memorystatus_node = TAILQ_NEXT(next_memorystatus_node, link); + #if DEVELOPMENT || DEBUG - int activeProcess = jetsam_priority_entry->flags & kJetsamFlagsFrontmost; - int procSuspendedForDiagnosis = jetsam_priority_entry->flags & kJetsamFlagsSuspForDiagnosis; + activeProcess = node->state & kProcessForeground; + procSuspendedForDiagnosis = node->state & kProcessSuspendedForDiag; #endif /* DEVELOPMENT || DEBUG */ - jetsam_priority_list_index++; + + aPid = node->pid; + /* skip empty slots in the list */ - if (aPid == 0) { + if (aPid == 0 || (node->state & kProcessKilled)) { continue; // with lock held } - lck_mtx_unlock(jetsam_list_mlock); + p = proc_find(aPid); if (p != NULL) { int flags = cause; + #if DEVELOPMENT || DEBUG - if ((jetsam_diagnostic_mode != kJetsamDiagnosticModeNone) && procSuspendedForDiagnosis) { + if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) { printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid); proc_rele(p); - lck_mtx_lock(jetsam_list_mlock); continue; } #endif /* DEVELOPMENT || DEBUG */ + #if CONFIG_FREEZE - hibernation_node *node; boolean_t skip; - if ((node = kern_hibernation_get_node(aPid))) { - boolean_t reclaim_proc = !(node->state & (kProcessBusy | kProcessNoReclaimWorth)); - if (any || reclaim_proc) { - if (node->state & kProcessHibernated) { - flags |= kJetsamFlagsHibernated; - } - skip = FALSE; - } else { - skip = TRUE; + boolean_t reclaim_proc = !(node->state & (kProcessLocked | kProcessNoReclaimWorth)); + if (any || reclaim_proc) { + if (node->state & kProcessFrozen) { + flags |= kMemorystatusFlagsFrozen; } - kern_hibernation_release_node(node); - } else { skip = FALSE; + } else { + skip = TRUE; } + if (skip) { proc_rele(p); } else #endif { #if DEVELOPMENT || DEBUG - if ((jetsam_diagnostic_mode != kJetsamDiagnosticModeNone) && activeProcess) { 
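
/*
 * [Editor's note] In the rewritten block that follows, the diagnostic
 * policy (kPolicyDiagnoseActive) suspends an active task with
 * task_suspend() instead of killing it, so the process can be examined
 * afterwards; the normal path moves the node to the exit queue, marks the
 * snapshot entry, and only then drops the list lock and issues SIGKILL
 * via exit1_internal().
 */
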
-#if DEBUG - printf("jetsam: suspending pid %d [%s] (active) for diagnosis - memory_status_level: %d\n", - aPid, (p->p_comm ? p->p_comm: "(unknown)"), kern_memorystatus_level); -#endif /* DEBUG */ - jetsam_mark_pid_in_snapshot(aPid, kJetsamFlagsSuspForDiagnosis); - jetsam_priority_entry->flags |= kJetsamFlagsSuspForDiagnosis; - task_suspend(p->task); - proc_rele(p); - if (jetsam_diagnostic_mode == kJetsamDiagnosticModeStopAtFirstActive) { + if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && activeProcess) { + MEMORYSTATUS_DEBUG(1, "jetsam: suspending pid %d [%s] (active) for diagnosis - memory_status_level: %d\n", + aPid, (p->p_comm ? p->p_comm: "(unknown)"), memorystatus_level); + memorystatus_mark_pid_in_snapshot(aPid, kMemorystatusFlagsSuspForDiagnosis); + node->state |= kProcessSuspendedForDiag; + if (memorystatus_jetsam_policy & kPolicyDiagnoseFirst) { jetsam_diagnostic_suspended_one_active_proc = 1; printf("jetsam: returning after suspending first active proc - %d\n", aPid); } + lck_mtx_unlock(memorystatus_list_mlock); + task_suspend(p->task); + proc_rele(p); return 0; } else #endif /* DEVELOPMENT || DEBUG */ { - printf("jetsam: killing pid %d [%s] - memory_status_level: %d\n", - aPid, (p->p_comm ? p->p_comm : "(unknown)"), kern_memorystatus_level); - jetsam_mark_pid_in_snapshot(aPid, flags); - exit1(p, W_EXITCODE(0, SIGKILL), (int *)NULL); + printf("memorystatus: jetsam killing pid %d [%s] - memorystatus_available_pages: %d\n", + aPid, (p->p_comm ? p->p_comm : "(unknown)"), memorystatus_available_pages); + /* Shift queue, update stats */ + memorystatus_move_node_to_exit_list(node); + memorystatus_mark_pid_in_snapshot(aPid, flags); + lck_mtx_unlock(memorystatus_list_mlock); + exit1_internal(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE); proc_rele(p); -#if DEBUG - printf("jetsam: pid %d killed - memory_status_level: %d\n", aPid, kern_memorystatus_level); -#endif /* DEBUG */ return 0; } } } - lck_mtx_lock(jetsam_list_mlock); } - lck_mtx_unlock(jetsam_list_mlock); + + lck_mtx_unlock(memorystatus_list_mlock); + + // If we didn't kill anything, toss any newly-created snapshot + if (!pending_snapshot) { + memorystatus_jetsam_snapshot.entry_count = memorystatus_jetsam_snapshot_list_count = 0; + } + return -1; } +int memorystatus_kill_top_proc_from_VM(void) { + return memorystatus_kill_top_proc(TRUE, kMemorystatusFlagsKilledVM); +} + static int -jetsam_kill_hiwat_proc(void) +memorystatus_kill_hiwat_proc(void) { proc_t p; - int i; - if (jetsam_snapshot_list_count == 0) { - jetsam_snapshot_procs(); + int pending_snapshot = 0; + memorystatus_node *next_hiwat_node; + + lck_mtx_lock(memorystatus_list_mlock); + + if (memorystatus_jetsam_snapshot_list_count == 0) { + memorystatus_jetsam_snapshot_procs_locked(); + } else { + pending_snapshot = 1; } - lck_mtx_lock(jetsam_list_mlock); - for (i = jetsam_priority_list_index; i < jetsam_priority_list_count; i++) { + + next_hiwat_node = next_memorystatus_node; + + while (next_hiwat_node) { pid_t aPid; int32_t hiwat; - aPid = jetsam_priority_list[i].pid; - hiwat = jetsam_priority_list[i].hiwat_pages; + memorystatus_node *node; + + node = next_hiwat_node; + next_hiwat_node = TAILQ_NEXT(next_hiwat_node, link); + + aPid = node->pid; + hiwat = node->hiwat_pages; + /* skip empty or non-hiwat slots in the list */ - if (aPid == 0 || (hiwat < 0)) { + if (aPid == 0 || (hiwat < 0) || (node->state & kProcessKilled)) { continue; // with lock held } + p = proc_find(aPid); if (p != NULL) { - int32_t pages = (int32_t)jetsam_task_page_count(p->task); + 
int32_t pages = (int32_t)memorystatus_task_page_count(p->task); boolean_t skip = (pages <= hiwat); #if DEVELOPMENT || DEBUG - if (!skip && (jetsam_diagnostic_mode != kJetsamDiagnosticModeNone)) { - if (jetsam_priority_list[i].flags & kJetsamFlagsSuspForDiagnosis) { + if (!skip && (memorystatus_jetsam_policy & kPolicyDiagnoseActive)) { + if (node->state & kProcessSuspendedForDiag) { proc_rele(p); continue; } } #endif /* DEVELOPMENT || DEBUG */ + #if CONFIG_FREEZE if (!skip) { - hibernation_node *node; - if ((node = kern_hibernation_get_node(aPid))) { - if (node->state & kProcessBusy) { - kern_hibernation_release_node(node); - skip = TRUE; - } else { - kern_hibernation_free_node(node, TRUE); - skip = FALSE; - } + if (node->state & kProcessLocked) { + skip = TRUE; + } else { + skip = FALSE; } } #endif + if (!skip) { -#if DEBUG - printf("jetsam: %s pid %d [%s] - %d pages > hiwat (%d)\n", - (jetsam_diagnostic_mode != kJetsamDiagnosticModeNone)?"suspending": "killing", aPid, p->p_comm, pages, hiwat); -#endif /* DEBUG */ + MEMORYSTATUS_DEBUG(1, "jetsam: %s pid %d [%s] - %d pages > 1 (%d)\n", + (memorystatus_jetsam_policy & kPolicyDiagnoseActive) ? "suspending": "killing", aPid, p->p_comm, pages, hiwat); #if DEVELOPMENT || DEBUG - if (jetsam_diagnostic_mode != kJetsamDiagnosticModeNone) { - lck_mtx_unlock(jetsam_list_mlock); + if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) { + memorystatus_mark_pid_in_snapshot(aPid, kMemorystatusFlagsSuspForDiagnosis); + node->state |= kProcessSuspendedForDiag; + lck_mtx_unlock(memorystatus_list_mlock); task_suspend(p->task); proc_rele(p); -#if DEBUG - printf("jetsam: pid %d suspended for diagnosis - memory_status_level: %d\n", aPid, kern_memorystatus_level); -#endif /* DEBUG */ - jetsam_mark_pid_in_snapshot(aPid, kJetsamFlagsSuspForDiagnosis); - jetsam_priority_list[i].flags |= kJetsamFlagsSuspForDiagnosis; + MEMORYSTATUS_DEBUG(1, "jetsam: pid %d suspended for diagnosis - memorystatus_available_pages: %d\n", aPid, memorystatus_available_pages); } else #endif /* DEVELOPMENT || DEBUG */ - { - jetsam_priority_list[i].pid = 0; - lck_mtx_unlock(jetsam_list_mlock); + { + printf("memorystatus: jetsam killing pid %d [%s] (highwater) - memorystatus_available_pages: %d\n", + aPid, (p->p_comm ? 
p->p_comm : "(unknown)"), memorystatus_available_pages); + /* Shift queue, update stats */ + memorystatus_move_node_to_exit_list(node); + memorystatus_mark_pid_in_snapshot(aPid, kMemorystatusFlagsKilledHiwat); + lck_mtx_unlock(memorystatus_list_mlock); exit1(p, W_EXITCODE(0, SIGKILL), (int *)NULL); proc_rele(p); -#if DEBUG - printf("jetsam: pid %d killed - memory_status_level: %d\n", aPid, kern_memorystatus_level); -#endif /* DEBUG */ - jetsam_mark_pid_in_snapshot(aPid, kJetsamFlagsKilledHiwat); - } - return 0; - } else { - proc_rele(p); - } - - } - } - lck_mtx_unlock(jetsam_list_mlock); - return -1; -} - -#if CONFIG_FREEZE -static void -jetsam_send_hibernation_note(uint32_t flags, pid_t pid, uint32_t pages) { - int ret; - struct kev_msg ev_msg; - jetsam_hibernation_entry_t data; - - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_SYSTEM_CLASS; - ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS; - - ev_msg.event_code = kMemoryStatusHibernationNote; - - ev_msg.dv[0].data_length = sizeof data; - ev_msg.dv[0].data_ptr = &data; - ev_msg.dv[1].data_length = 0; - - data.pid = pid; - data.flags = flags; - data.pages = pages; - - ret = kev_post_msg(&ev_msg); - if (ret) { - kern_memorystatus_kev_failure_count++; - printf("%s: kev_post_msg() failed, err %d\n", __func__, ret); - } -} - -static int -jetsam_hibernate_top_proc(void) -{ - int hibernate_index; - proc_t p; - uint32_t i; - - lck_mtx_lock(jetsam_list_mlock); - - for (hibernate_index = jetsam_priority_list_index; hibernate_index < jetsam_priority_list_count; hibernate_index++) { - pid_t aPid; - uint32_t state = 0; - - aPid = jetsam_priority_list[hibernate_index].pid; - - /* skip empty slots in the list */ - if (aPid == 0) { - continue; // with lock held - } - - if (kern_hibernation_get_process_state(aPid, &state, NULL) != 0) { - continue; // with lock held - } - - /* ensure the process isn't marked as busy and is suspended */ - if ((state & kProcessBusy) || !(state & kProcessSuspended)) { - continue; // with lock held - } - - p = proc_find(aPid); - if (p != NULL) { - hibernation_node *node; - boolean_t skip; - uint32_t purgeable, wired, clean, dirty; - boolean_t shared; - - lck_mtx_unlock(jetsam_list_mlock); - - if ((node = kern_hibernation_get_node(aPid))) { - if (node->state & kProcessBusy) { - skip = TRUE; - } else { - node->state |= kProcessBusy; - /* Whether we hibernate or not, increase the count so can we maintain the gap between hibernated and suspended processes. */ - kern_memorystatus_hibernated_count++; - skip = FALSE; - } - kern_hibernation_release_node(node); - } else { - skip = TRUE; - } - - if (!skip) { - /* Only hibernate processes meeting our size criteria. If not met, mark it as such and return. */ - task_freeze(p->task, &purgeable, &wired, &clean, &dirty, &shared, TRUE); - skip = (dirty < kern_memorystatus_hibernation_pages_min) || (dirty > kern_memorystatus_hibernation_pages_max); - } - - if (!skip) { - unsigned int swap_pages_free = default_pager_swap_pages_free(); - - /* Ensure there's actually enough space free to hibernate this process. */ - if (dirty > swap_pages_free) { - kern_memorystatus_low_swap_pages = swap_pages_free; - skip = TRUE; } - } - - if (skip) { - kern_hibernation_set_process_state(aPid, kProcessIgnored); - proc_rele(p); return 0; + } else { + proc_rele(p); } -#if DEBUG - printf("jetsam: pid %d [%s] hibernating - memory_status_level: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, shared %d, free swap: %d\n", - aPid, (p->p_comm ? 
p->p_comm : "(unknown)"), kern_memorystatus_level, purgeable, wired, clean, dirty, shared, default_pager_swap_pages_free()); -#endif - - task_freeze(p->task, &purgeable, &wired, &clean, &dirty, &shared, FALSE); - proc_rele(p); - - kern_hibernation_set_process_state(aPid, kProcessHibernated | (shared ? 0: kProcessNoReclaimWorth)); - - /* Update stats */ - for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { - throttle_intervals[i].pageouts += dirty; - } - kern_memorystatus_hibernation_pageouts += dirty; - kern_memorystatus_hibernation_count++; - - jetsam_send_hibernation_note(kJetsamFlagsHibernated, aPid, dirty); - - return dirty; } } - lck_mtx_unlock(jetsam_list_mlock); + + lck_mtx_unlock(memorystatus_list_mlock); + + // If we didn't kill anything, toss any newly-created snapshot + if (!pending_snapshot) { + memorystatus_jetsam_snapshot.entry_count = memorystatus_jetsam_snapshot_list_count = 0; + } + return -1; } -#endif /* CONFIG_FREEZE */ static void -kern_memorystatus_thread(void) +memorystatus_jetsam_thread_block(void) { - struct kev_msg ev_msg; - jetsam_kernel_stats_t data; - boolean_t post_memorystatus_snapshot = FALSE; - int ret; + assert_wait(&memorystatus_jetsam_wakeup, THREAD_UNINT); + assert(memorystatus_jetsam_running == 1); + OSDecrementAtomic(&memorystatus_jetsam_running); + (void)thread_block((thread_continue_t)memorystatus_jetsam_thread); +} - bzero(&data, sizeof(jetsam_kernel_stats_t)); - bzero(&ev_msg, sizeof(struct kev_msg)); +static void +memorystatus_jetsam_thread(void *param __unused, wait_result_t wr __unused) +{ + boolean_t post_snapshot = FALSE; + static boolean_t is_vm_privileged = FALSE; + + if (is_vm_privileged == FALSE) { + /* + * It's the first time the thread has run, so just mark the thread as privileged and block. + * This avoids a spurious pass with unset variables, as set out in . + */ + thread_wire(host_priv_self(), current_thread(), TRUE); + is_vm_privileged = TRUE; + memorystatus_jetsam_thread_block(); + } + + assert(memorystatus_available_pages != (unsigned)-1); + while(1) { + unsigned int last_available_pages; #if DEVELOPMENT || DEBUG jetsam_diagnostic_suspended_one_active_proc = 0; #endif /* DEVELOPMENT || DEBUG */ - - while (kern_memorystatus_level <= kern_memorystatus_level_highwater) { - if (jetsam_kill_hiwat_proc() < 0) { + + while (memorystatus_available_pages <= memorystatus_available_pages_highwater) { + if (memorystatus_kill_hiwat_proc() < 0) { break; } - post_memorystatus_snapshot = TRUE; + post_snapshot = TRUE; } - while (kern_memorystatus_level <= kern_memorystatus_level_critical) { - if (jetsam_kill_top_proc(FALSE, kJetsamFlagsKilled) < 0) { - break; + while (memorystatus_available_pages <= memorystatus_available_pages_critical) { + if (memorystatus_kill_top_proc(FALSE, kMemorystatusFlagsKilled) < 0) { + /* No victim was found - panic */ + panic("memorystatus_jetsam_thread: no victim! 
available pages:%d, critical page level: %d\n", + memorystatus_available_pages, memorystatus_available_pages_critical); } - post_memorystatus_snapshot = TRUE; + post_snapshot = TRUE; #if DEVELOPMENT || DEBUG - if ((jetsam_diagnostic_mode == kJetsamDiagnosticModeStopAtFirstActive) && jetsam_diagnostic_suspended_one_active_proc) { + if ((memorystatus_jetsam_policy & kPolicyDiagnoseFirst) && jetsam_diagnostic_suspended_one_active_proc) { printf("jetsam: stopping killing since 1 active proc suspended already for diagnosis\n"); break; // we found first active proc, let's not kill any more } #endif /* DEVELOPMENT || DEBUG */ } + + last_available_pages = memorystatus_available_pages; - kern_memorystatus_last_level = kern_memorystatus_level; - - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_SYSTEM_CLASS; - ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS; - - /* pass the memory status level (percent free) */ - ev_msg.event_code = kMemoryStatusLevelNote; - - ev_msg.dv[0].data_length = sizeof kern_memorystatus_last_level; - ev_msg.dv[0].data_ptr = &kern_memorystatus_last_level; - ev_msg.dv[1].data_length = sizeof data; - ev_msg.dv[1].data_ptr = &data; - ev_msg.dv[2].data_length = 0; - - data.free_pages = vm_page_free_count; - data.active_pages = vm_page_active_count; - data.inactive_pages = vm_page_inactive_count; - data.purgeable_pages = vm_page_purgeable_count; - data.wired_pages = vm_page_wire_count; - - ret = kev_post_msg(&ev_msg); - if (ret) { - kern_memorystatus_kev_failure_count++; - printf("%s: kev_post_msg() failed, err %d\n", __func__, ret); + if (post_snapshot) { + size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_list_count - 1); + memorystatus_jetsam_snapshot.notification_time = mach_absolute_time(); + memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size)); } - if (post_memorystatus_snapshot) { - size_t snapshot_size = sizeof(jetsam_kernel_stats_t) + sizeof(size_t) + sizeof(jetsam_snapshot_entry_t) * jetsam_snapshot_list_count; - ev_msg.event_code = kMemoryStatusSnapshotNote; - ev_msg.dv[0].data_length = sizeof snapshot_size; - ev_msg.dv[0].data_ptr = &snapshot_size; - ev_msg.dv[1].data_length = 0; - - ret = kev_post_msg(&ev_msg); - if (ret) { - kern_memorystatus_kev_failure_count++; - printf("%s: kev_post_msg() failed, err %d\n", __func__, ret); - } + if (memorystatus_available_pages >= (last_available_pages + memorystatus_delta) || + last_available_pages >= (memorystatus_available_pages + memorystatus_delta)) { + continue; } - if (kern_memorystatus_level >= kern_memorystatus_last_level + 5 || - kern_memorystatus_level <= kern_memorystatus_last_level - 5) - continue; +#if VM_PRESSURE_EVENTS + memorystatus_check_pressure_reset(); +#endif - assert_wait(&kern_memorystatus_wakeup, THREAD_UNINT); - (void)thread_block((thread_continue_t)kern_memorystatus_thread); + memorystatus_jetsam_thread_block(); } } +#endif /* CONFIG_JETSAM */ + #if CONFIG_FREEZE __private_extern__ void -kern_hibernation_init(void) +memorystatus_freeze_init(void) { - hibernation_lck_attr = lck_attr_alloc_init(); - hibernation_lck_grp_attr = lck_grp_attr_alloc_init(); - hibernation_lck_grp = lck_grp_alloc_init("hibernation", hibernation_lck_grp_attr); - hibernation_mlock = lck_mtx_alloc_init(hibernation_lck_grp, hibernation_lck_attr); + kern_return_t result; + thread_t thread; - RB_INIT(&hibernation_tree_head); - - (void)kernel_thread(kernel_task, kern_hibernation_thread); + result 
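
/*
 * [Editor's note] The snapshot note above advertises only a size; the
 * payload is fetched separately. The arithmetic works because the snapshot
 * structure already embeds one entry, so only count - 1 further entries
 * are appended to it. A sketch with hypothetical stand-in types (the real
 * layouts live in the memorystatus headers):
 */
#include <stddef.h>
#include <stdio.h>

struct snap_entry { int pid; unsigned int pages; };

struct snap {
    unsigned long long notification_time;
    int entry_count;
    struct snap_entry entries[1];    /* entries[0] lives in the header */
};

static size_t
snapshot_bytes(int entry_count)
{
    /* sizeof(struct snap) already accounts for entries[0] */
    return sizeof(struct snap) + sizeof(struct snap_entry) * (entry_count - 1);
}

int
main(void)
{
    printf("%zu bytes for a 4-entry snapshot\n", snapshot_bytes(4));
    return 0;
}
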
= kernel_thread_start(memorystatus_freeze_thread, NULL, &thread); + if (result == KERN_SUCCESS) { + thread_deallocate(thread); + } else { + panic("Could not create memorystatus_freeze_thread"); + } } -static inline boolean_t -kern_hibernation_can_hibernate_processes(void) +static int +memorystatus_freeze_top_proc(boolean_t *memorystatus_freeze_swap_low) { - boolean_t ret; - - lck_mtx_lock_spin(hibernation_mlock); - ret = (kern_memorystatus_suspended_count - kern_memorystatus_hibernated_count) > - kern_memorystatus_hibernation_suspended_minimum ? TRUE : FALSE; - lck_mtx_unlock(hibernation_mlock); - - return ret; -} + proc_t p; + uint32_t i; + memorystatus_node *next_freeze_node; -static boolean_t -kern_hibernation_can_hibernate(void) -{ - /* Only hibernate if we're sufficiently low on memory; this holds off hibernation right after boot, - and is generally is a no-op once we've reached steady state. */ - if (kern_memorystatus_level > kern_memorystatus_level_hibernate) { - return FALSE; - } + lck_mtx_lock(memorystatus_list_mlock); - /* Check minimum suspended process threshold. */ - if (!kern_hibernation_can_hibernate_processes()) { - return FALSE; - } - - /* Is swap running low? */ - if (kern_memorystatus_low_swap_pages) { - /* If there's been no movement in free swap pages since we last attempted hibernation, return. */ - if (default_pager_swap_pages_free() <= kern_memorystatus_low_swap_pages) { - return FALSE; - } - - /* Pages have been freed, so we can retry. */ - kern_memorystatus_low_swap_pages = 0; - } + next_freeze_node = next_memorystatus_node; - /* OK */ - return TRUE; -} - -static void -kern_hibernation_add_node(hibernation_node *node) -{ - lck_mtx_lock_spin(hibernation_mlock); - - RB_INSERT(hibernation_tree, &hibernation_tree_head, node); - kern_memorystatus_suspended_count++; - - lck_mtx_unlock(hibernation_mlock); -} - -/* Returns with the hibernation lock taken */ -static hibernation_node * -kern_hibernation_get_node(pid_t pid) -{ - hibernation_node sought, *found; - sought.pid = pid; - lck_mtx_lock_spin(hibernation_mlock); - found = RB_FIND(hibernation_tree, &hibernation_tree_head, &sought); - if (!found) { - lck_mtx_unlock(hibernation_mlock); - } - return found; -} - -static void -kern_hibernation_release_node(hibernation_node *node) -{ -#pragma unused(node) - lck_mtx_unlock(hibernation_mlock); -} - -static void -kern_hibernation_free_node(hibernation_node *node, boolean_t unlock) -{ - /* make sure we're called with the hibernation_mlock held */ - lck_mtx_assert(hibernation_mlock, LCK_MTX_ASSERT_OWNED); - - if (node->state & (kProcessHibernated | kProcessIgnored)) { - kern_memorystatus_hibernated_count--; - } + while (next_freeze_node) { + memorystatus_node *node; + pid_t aPid; + uint32_t state; + + node = next_freeze_node; + next_freeze_node = TAILQ_NEXT(next_freeze_node, link); - kern_memorystatus_suspended_count--; - - RB_REMOVE(hibernation_tree, &hibernation_tree_head, node); - kfree(node, sizeof(hibernation_node)); + aPid = node->pid; + state = node->state; - if (unlock) { - lck_mtx_unlock(hibernation_mlock); - } -} + /* skip empty slots in the list */ + if (aPid == 0) { + continue; // with lock held + } -static void -kern_hibernation_register_pid(pid_t pid) -{ - hibernation_node *node; + /* Ensure the process is eligible for freezing */ + if ((state & (kProcessKilled | kProcessLocked | kProcessFrozen)) || !(state & kProcessSuspended)) { + continue; // with lock held + } -#if DEVELOPMENT || DEBUG - node = kern_hibernation_get_node(pid); - if (node) { - 
printf("kern_hibernation_register_pid: pid %d already registered!\n", pid); - kern_hibernation_release_node(node); - return; - } -#endif + p = proc_find(aPid); + if (p != NULL) { + kern_return_t kr; + uint32_t purgeable, wired, clean, dirty; + boolean_t shared; + uint32_t max_pages = 0; + + /* Only freeze processes meeting our minimum resident page criteria */ + if (memorystatus_task_page_count(p->task) < memorystatus_freeze_pages_min) { + proc_rele(p); + continue; + } - /* Register as a candiate for hibernation */ - node = (hibernation_node *)kalloc(sizeof(hibernation_node)); - if (node) { - clock_sec_t sec; - clock_nsec_t nsec; - mach_timespec_t ts; - - memset(node, 0, sizeof(hibernation_node)); + /* Ensure there's enough free space to freeze this process. */ + max_pages = MIN(default_pager_swap_pages_free(), memorystatus_freeze_pages_max); + if (max_pages < memorystatus_freeze_pages_min) { + *memorystatus_freeze_swap_low = TRUE; + proc_rele(p); + lck_mtx_unlock(memorystatus_list_mlock); + return 0; + } + + /* Mark as locked temporarily to avoid kill */ + node->state |= kProcessLocked; + + kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE); + + MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_top_proc: task_freeze %s for pid %d [%s] - " + "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, shared %d, free swap: %d\n", + (kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (p->p_comm ? p->p_comm : "(unknown)"), + memorystatus_available_pages, purgeable, wired, clean, dirty, shared, default_pager_swap_pages_free()); + + proc_rele(p); + + node->state &= ~kProcessLocked; + + if (KERN_SUCCESS == kr) { + memorystatus_freeze_entry_t data = { aPid, kMemorystatusFlagsFrozen, dirty }; + + memorystatus_frozen_count++; + + node->state |= (kProcessFrozen | (shared ? 
0: kProcessNoReclaimWorth)); + + /* Update stats */ + for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { + throttle_intervals[i].pageouts += dirty; + } + + memorystatus_freeze_pageouts += dirty; + memorystatus_freeze_count++; - node->pid = pid; - node->state = kProcessSuspended; + lck_mtx_unlock(memorystatus_list_mlock); - clock_get_system_nanotime(&sec, &nsec); - ts.tv_sec = sec; - ts.tv_nsec = nsec; - - node->hibernation_ts = ts; + memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data)); - kern_hibernation_add_node(node); + return dirty; + } + + /* Failed; go round again */ + } } + + lck_mtx_unlock(memorystatus_list_mlock); + + return -1; } -static void -kern_hibernation_unregister_pid(pid_t pid) +static inline boolean_t +memorystatus_can_freeze_processes(void) { - hibernation_node *node; + boolean_t ret; - node = kern_hibernation_get_node(pid); - if (node) { - kern_hibernation_free_node(node, TRUE); - } -} - -void -kern_hibernation_on_pid_suspend(pid_t pid) -{ - kern_hibernation_register_pid(pid); -} - -/* If enabled, we bring all the hibernated pages back prior to resumption; otherwise, they're faulted back in on demand */ -#define THAW_ON_RESUME 1 - -void -kern_hibernation_on_pid_resume(pid_t pid, task_t task) -{ -#if THAW_ON_RESUME - hibernation_node *node; - if ((node = kern_hibernation_get_node(pid))) { - if (node->state & kProcessHibernated) { - node->state |= kProcessBusy; - kern_hibernation_release_node(node); - task_thaw(task); - jetsam_send_hibernation_note(kJetsamFlagsThawed, pid, 0); + lck_mtx_lock(memorystatus_list_mlock); + + if (memorystatus_suspended_count) { + uint32_t average_resident_pages, estimated_processes; + + /* Estimate the number of suspended processes we can fit */ + average_resident_pages = memorystatus_suspended_resident_count / memorystatus_suspended_count; + estimated_processes = memorystatus_suspended_count + + ((memorystatus_available_pages - memorystatus_available_pages_critical) / average_resident_pages); + + /* If it's predicted that no freeze will occur, lower the threshold temporarily */ + if (estimated_processes <= FREEZE_SUSPENDED_THRESHOLD_DEFAULT) { + memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_LOW; } else { - kern_hibernation_release_node(node); + memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT; } - } -#else -#pragma unused(task) -#endif - kern_hibernation_unregister_pid(pid); -} - -void -kern_hibernation_on_pid_hibernate(pid_t pid) -{ -#pragma unused(pid) - - /* Wake the hibernation thread */ - thread_wakeup((event_t)&kern_hibernation_wakeup); -} -static int -kern_hibernation_get_process_state(pid_t pid, uint32_t *state, mach_timespec_t *ts) -{ - hibernation_node *found; - int err = ESRCH; + MEMORYSTATUS_DEBUG(1, "memorystatus_can_freeze_processes: %d suspended processes, %d average resident pages / process, %d suspended processes estimated\n", + memorystatus_suspended_count, average_resident_pages, estimated_processes); - *state = 0; - - found = kern_hibernation_get_node(pid); - if (found) { - *state = found->state; - if (ts) { - *ts = found->hibernation_ts; + if ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold) { + ret = TRUE; + } else { + ret = FALSE; } - err = 0; - kern_hibernation_release_node(found); + } else { + ret = FALSE; } + + lck_mtx_unlock(memorystatus_list_mlock); - return err; + return ret; } -static int -kern_hibernation_set_process_state(pid_t pid, uint32_t state) +static 
boolean_t
+memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low)
 {
-	hibernation_node *found;
-	int err = ESRCH;
+	/* Only freeze if we're sufficiently low on memory; this holds off freeze right
+	   after boot, and is generally a no-op once we've reached steady state. */
+	if (memorystatus_available_pages > memorystatus_freeze_threshold) {
+		return FALSE;
+	}
+
+	/* Check minimum suspended process threshold. */
+	if (!memorystatus_can_freeze_processes()) {
+		return FALSE;
+	}
-	found = kern_hibernation_get_node(pid);
-	if (found) {
-		found->state = state;
-		err = 0;
-		kern_hibernation_release_node(found);
+	/* Is swap running low? */
+	if (*memorystatus_freeze_swap_low) {
+		/* If there's been no movement in free swap pages since we last attempted freeze, return. */
+		if (default_pager_swap_pages_free() < memorystatus_freeze_pages_min) {
+			return FALSE;
+		}
+
+		/* Pages have been freed - we can retry. */
+		*memorystatus_freeze_swap_low = FALSE;
 	}
-	return err;
+	/* OK */
+	return TRUE;
 }
 static void
-kern_hibernation_update_throttle_interval(mach_timespec_t *ts, struct throttle_interval_t *interval)
+memorystatus_freeze_update_throttle_interval(mach_timespec_t *ts, struct throttle_interval_t *interval)
 {
 	if (CMP_MACH_TIMESPEC(ts, &interval->ts) >= 0) {
 		if (!interval->max_pageouts) {
-			interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * HIBERNATION_DAILY_PAGEOUTS_MAX) / (24 * 60)));
+			interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * FREEZE_DAILY_PAGEOUTS_MAX) / (24 * 60)));
 		} else {
-			printf("jetsam: %d minute throttle timeout, resetting\n", interval->mins);
+			printf("memorystatus_freeze_update_throttle_interval: %d minute throttle timeout, resetting\n", interval->mins);
 		}
 		interval->ts.tv_sec = interval->mins * 60;
 		interval->ts.tv_nsec = 0;
 		ADD_MACH_TIMESPEC(&interval->ts, ts);
-		/* Since we update the throttle stats pre-hibernation, adjust for overshoot here */
+		/* Since we update the throttle stats pre-freeze, adjust for overshoot here */
 		if (interval->pageouts > interval->max_pageouts) {
 			interval->pageouts -= interval->max_pageouts;
 		} else {
@@ -984,18 +1574,17 @@ kern_hibernation_update_throttle_interval(mach_timespec_t *ts, struct throttle_i
 		}
 		interval->throttle = FALSE;
 	} else if (!interval->throttle && interval->pageouts >= interval->max_pageouts) {
-		printf("jetsam: %d minute pageout limit exceeded; enabling throttle\n", interval->mins);
+		printf("memorystatus_freeze_update_throttle_interval: %d minute pageout limit exceeded; enabling throttle\n", interval->mins);
 		interval->throttle = TRUE;
 	}
-#ifdef DEBUG
-	printf("jetsam: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n",
+
+	MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n",
 		interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60,
 		interval->throttle ? "on" : "off");
-#endif
 }
 static boolean_t
-kern_hibernation_throttle_update(void)
+memorystatus_freeze_update_throttle(void)
 {
 	clock_sec_t sec;
 	clock_nsec_t nsec;
@@ -1004,7 +1593,7 @@ kern_hibernation_throttle_update(void)
 	boolean_t throttled = FALSE;
 #if DEVELOPMENT || DEBUG
-	if (!kern_memorystatus_hibernation_throttle_enabled)
+	if (!memorystatus_freeze_throttle_enabled)
 		return FALSE;
 #endif
@@ -1012,14 +1601,14 @@ kern_hibernation_throttle_update(void)
 	ts.tv_sec = sec;
 	ts.tv_nsec = nsec;
-	/* Check hibernation pageouts over multiple intervals and throttle if we've exceeded our budget.
+	/* Check freeze pageouts over multiple intervals and throttle if we've exceeded our budget.
 	 *
-	 * This ensures that periods of inactivity can't be used as 'credit' towards hibernation if the device has
+	 * This ensures that periods of inactivity can't be used as 'credit' towards freeze if the device has
 	 * remained dormant for a long period. We do, however, allow increased thresholds for shorter intervals in
 	 * order to allow for bursts of activity.
 	 */
 	for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
-		kern_hibernation_update_throttle_interval(&ts, &throttle_intervals[i]);
+		memorystatus_freeze_update_throttle_interval(&ts, &throttle_intervals[i]);
 		if (throttle_intervals[i].throttle == TRUE)
 			throttled = TRUE;
 	}
@@ -1028,159 +1617,276 @@
 }
 static void
-kern_hibernation_cull(void)
+memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused)
 {
-	hibernation_node *node, *next;
-	lck_mtx_lock(hibernation_mlock);
+	static boolean_t memorystatus_freeze_swap_low = FALSE;
+
+	if (memorystatus_freeze_enabled) {
+		if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
+			/* Only freeze if we've not exceeded our pageout budgets */
+			if (!memorystatus_freeze_update_throttle()) {
+				memorystatus_freeze_top_proc(&memorystatus_freeze_swap_low);
+			} else {
+				printf("memorystatus_freeze_thread: in throttle, ignoring freeze\n");
+				memorystatus_freeze_throttle_count++;	/* Throttled, update stats */
+			}
+		}
+	}
-	for (node = RB_MIN(hibernation_tree, &hibernation_tree_head); node != NULL; node = next) {
-		proc_t p;
+	assert_wait((event_t) &memorystatus_freeze_wakeup, THREAD_UNINT);
+	thread_block((thread_continue_t) memorystatus_freeze_thread);
+}
+
+#endif /* CONFIG_FREEZE */
-		next = RB_NEXT(hibernation_tree, &hibernation_tree_head, node);
+#if CONFIG_JETSAM
-		/* TODO: probably suboptimal, so revisit should it cause a performance issue */
-		p = proc_find(node->pid);
-		if (p) {
-			proc_rele(p);
-		} else {
-			kern_hibernation_free_node(node, FALSE);
+#if VM_PRESSURE_EVENTS
+
+static inline boolean_t
+memorystatus_get_pressure_locked(void) {
+	if (memorystatus_available_pages > memorystatus_available_pages_pressure) {
+		/* Too many free pages */
+		return kVMPressureNormal;
+	}
+
+#if CONFIG_FREEZE
+	if (memorystatus_frozen_count > 0) {
+		/* Frozen processes exist */
+		return kVMPressureNormal;
+	}
+#endif
+
+	if (memorystatus_suspended_count > MEMORYSTATUS_SUSPENDED_THRESHOLD) {
+		/* Too many suspended processes */
+		return kVMPressureNormal;
+	}
+
+	if (memorystatus_suspended_count > 0) {
+		/* Some suspended processes - warn */
+		return kVMPressureWarning;
+	}
+
+	/* Otherwise, pressure level is urgent */
+	return kVMPressureUrgent;
+}
+
+pid_t
+memorystatus_request_vm_pressure_candidate(void) {
+	memorystatus_node *node;
+	pid_t pid = -1;
+
+	lck_mtx_lock(memorystatus_list_mlock);
+
+	/* Are we in a low memory state?
*/ + memorystatus_vm_pressure_level = memorystatus_get_pressure_locked(); + if (kVMPressureNormal != memorystatus_vm_pressure_level) { + TAILQ_FOREACH(node, &memorystatus_list, link) { + /* Skip ineligible processes */ + if (node->state & (kProcessKilled | kProcessLocked | kProcessSuspended | kProcessFrozen | kProcessNotifiedForPressure)) { + continue; + } + node->state |= kProcessNotifiedForPressure; + pid = node->pid; + break; } } + + lck_mtx_unlock(memorystatus_list_mlock); - lck_mtx_unlock(hibernation_mlock); + return pid; +} + +void +memorystatus_send_pressure_note(pid_t pid) { + memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid)); } static void -kern_hibernation_thread(void) -{ - if (vm_freeze_enabled) { - if (kern_hibernation_can_hibernate()) { - - /* Cull dead processes */ - kern_hibernation_cull(); - - /* Only hibernate if we've not exceeded our pageout budgets */ - if (!kern_hibernation_throttle_update()) { - jetsam_hibernate_top_proc(); - } else { - printf("kern_hibernation_thread: in throttle, ignoring hibernation\n"); - kern_memorystatus_hibernation_throttle_count++; /* Throttled, update stats */ +memorystatus_check_pressure_reset() { + lck_mtx_lock(memorystatus_list_mlock); + + if (kVMPressureNormal != memorystatus_vm_pressure_level) { + memorystatus_vm_pressure_level = memorystatus_get_pressure_locked(); + if (kVMPressureNormal == memorystatus_vm_pressure_level) { + memorystatus_node *node; + TAILQ_FOREACH(node, &memorystatus_list, link) { + node->state &= ~kProcessNotifiedForPressure; } } } - - assert_wait((event_t) &kern_hibernation_wakeup, THREAD_UNINT); - thread_block((thread_continue_t) kern_hibernation_thread); + + lck_mtx_unlock(memorystatus_list_mlock); } -#endif /* CONFIG_FREEZE */ +#endif /* VM_PRESSURE_EVENTS */ + +/* Sysctls... 
*/ static int -sysctl_io_variable(struct sysctl_req *req, void *pValue, size_t currentsize, size_t maxsize, size_t *newsize) +sysctl_memorystatus_list_change SYSCTL_HANDLER_ARGS { - int error; - - /* Copy blob out */ - error = SYSCTL_OUT(req, pValue, currentsize); + int ret; + memorystatus_priority_entry_t entry; - /* error or nothing to set */ - if (error || !req->newptr) - return(error); +#pragma unused(oidp, arg1, arg2) - if (req->newlen > maxsize) { + if (!req->newptr || req->newlen > sizeof(entry)) { return EINVAL; } - error = SYSCTL_IN(req, pValue, req->newlen); - if (!error) { - *newsize = req->newlen; + ret = SYSCTL_IN(req, &entry, req->newlen); + if (ret) { + return ret; } - return(error); + memorystatus_list_change(FALSE, entry.pid, entry.priority, entry.flags, -1); + + return ret; } +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_jetsam_change, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED, + 0, 0, &sysctl_memorystatus_list_change, "I", ""); + static int -sysctl_handle_kern_memorystatus_priority_list(__unused struct sysctl_oid *oid, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +sysctl_memorystatus_priority_list(__unused struct sysctl_oid *oid, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { - int i, ret; - jetsam_priority_entry_t temp_list[kMaxPriorityEntries]; - size_t newsize, currentsize; - - if (req->oldptr) { - lck_mtx_lock(jetsam_list_mlock); - for (i = 0; i < jetsam_priority_list_count; i++) { - temp_list[i] = jetsam_priority_list[i]; + int ret; + size_t allocated_size, list_size = 0; + memorystatus_priority_entry_t *list; + uint32_t list_count, i = 0; + memorystatus_node *node; + + /* Races, but this is only for diagnostic purposes */ + list_count = memorystatus_list_count; + allocated_size = sizeof(memorystatus_priority_entry_t) * list_count; + list = kalloc(allocated_size); + if (!list) { + return ENOMEM; + } + + memset(list, 0, allocated_size); + + lck_mtx_lock(memorystatus_list_mlock); + + TAILQ_FOREACH(node, &memorystatus_list, link) { + list[i].pid = node->pid; + list[i].priority = node->priority; + list[i].flags = memorystatus_build_flags_from_state(node->state); + list[i].hiwat_pages = node->hiwat_pages; + list_size += sizeof(memorystatus_priority_entry_t); + if (++i >= list_count) { + break; + } + } + + lck_mtx_unlock(memorystatus_list_mlock); + + if (!list_size) { + if (req->oldptr) { + MEMORYSTATUS_DEBUG(1, "kern.memorystatus_priority_list returning EINVAL\n"); + return EINVAL; + } + else { + MEMORYSTATUS_DEBUG(1, "kern.memorystatus_priority_list returning 0 for size\n"); } - lck_mtx_unlock(jetsam_list_mlock); + } else { + MEMORYSTATUS_DEBUG(1, "kern.memorystatus_priority_list returning %ld for size\n", (long)list_size); } + + ret = SYSCTL_OUT(req, list, list_size); - currentsize = sizeof(jetsam_priority_list[0]) * jetsam_priority_list_count; + kfree(list, allocated_size); + + return ret; +} - ret = sysctl_io_variable(req, &temp_list[0], currentsize, sizeof(temp_list), &newsize); +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_priority_list, CTLTYPE_OPAQUE|CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_memorystatus_priority_list, "S,jetsam_priorities", ""); - if (!ret && req->newptr) { - int temp_list_count = newsize / sizeof(jetsam_priority_list[0]); -#if DEBUG - printf("set jetsam priority pids = { "); - for (i = 0; i < temp_list_count; i++) { - printf("(%d, 0x%08x, %d) ", temp_list[i].pid, temp_list[i].flags, temp_list[i].hiwat_pages); - } - printf("}\n"); -#endif /* DEBUG */ - lck_mtx_lock(jetsam_list_mlock); -#if CONFIG_FREEZE 
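A userspace consumer of the kern.memorystatus_priority_list sysctl registered above would use the standard two-call protocol: a first call with a NULL buffer asks the handler for the required size, and a second call copies the entries out. The following is a minimal sketch, not part of the patch; it assumes the entry layout visible in this diff (pid, priority, flags, hiwat_pages) and that the OID is present on the target system. memorystatus_priority_entry_t is not exported to user headers here, so the struct below is a hypothetical mirror of it.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/sysctl.h>

/* Hypothetical userspace mirror of memorystatus_priority_entry_t;
 * the layout is assumed from the fields the handler fills in. */
typedef struct {
	pid_t    pid;
	int32_t  priority;
	uint32_t flags;
	int32_t  hiwat_pages;
} priority_entry_t;

int main(void)
{
	size_t len = 0;

	/* First call with a NULL buffer: the kernel reports the required size in len. */
	if (sysctlbyname("kern.memorystatus_priority_list", NULL, &len, NULL, 0) != 0) {
		perror("sysctlbyname(size)");
		return 1;
	}
	if (len == 0)
		return 0;	/* no entries */

	priority_entry_t *list = malloc(len);
	if (list == NULL)
		return 1;

	/* Second call copies the entries out; the list can race with priority
	 * changes between the two calls, so a robust reader would retry on failure. */
	if (sysctlbyname("kern.memorystatus_priority_list", list, &len, NULL, 0) != 0) {
		perror("sysctlbyname(fetch)");
		free(list);
		return 1;
	}

	for (size_t i = 0; i < len / sizeof(priority_entry_t); i++)
		printf("pid %d priority %d flags 0x%x hiwat %d\n",
		    (int)list[i].pid, (int)list[i].priority,
		    (unsigned)list[i].flags, (int)list[i].hiwat_pages);

	free(list);
	return 0;
}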
- jetsam_priority_list_hibernation_index = 0; +static void +memorystatus_update_levels_locked(void) { + /* Set the baseline levels in pages */ + memorystatus_available_pages_critical = (CRITICAL_PERCENT / DELTA_PERCENT) * memorystatus_delta; + memorystatus_available_pages_highwater = (HIGHWATER_PERCENT / DELTA_PERCENT) * memorystatus_delta; +#if VM_PRESSURE_EVENTS + memorystatus_available_pages_pressure = (PRESSURE_PERCENT / DELTA_PERCENT) * memorystatus_delta; #endif - jetsam_priority_list_index = 0; - jetsam_priority_list_count = temp_list_count; - for (i = 0; i < temp_list_count; i++) { - jetsam_priority_list[i] = temp_list[i]; - } - for (i = temp_list_count; i < kMaxPriorityEntries; i++) { - jetsam_priority_list[i].pid = 0; - jetsam_priority_list[i].flags = 0; - jetsam_priority_list[i].hiwat_pages = -1; - jetsam_priority_list[i].hiwat_reserved1 = -1; - jetsam_priority_list[i].hiwat_reserved2 = -1; - jetsam_priority_list[i].hiwat_reserved3 = -1; - } - lck_mtx_unlock(jetsam_list_mlock); - } - return ret; + +#if DEBUG || DEVELOPMENT + if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) { + memorystatus_available_pages_critical += memorystatus_jetsam_policy_offset_pages_diagnostic; + memorystatus_available_pages_highwater += memorystatus_jetsam_policy_offset_pages_diagnostic; +#if VM_PRESSURE_EVENTS + memorystatus_available_pages_pressure += memorystatus_jetsam_policy_offset_pages_diagnostic; +#endif + } +#endif + + /* Only boost the critical level - it's more important to kill right away than issue warnings */ + if (memorystatus_jetsam_policy & kPolicyMoreFree) { + memorystatus_available_pages_critical += memorystatus_jetsam_policy_offset_pages_more_free; + } +} + +static int +sysctl_memorystatus_jetsam_policy_more_free SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp) + int error, more_free = 0; + + error = priv_check_cred(kauth_cred_get(), PRIV_VM_JETSAM, 0); + if (error) + return (error); + + error = sysctl_handle_int(oidp, &more_free, 0, req); + if (error || !req->newptr) + return (error); + + lck_mtx_lock(memorystatus_list_mlock); + + if (more_free) { + memorystatus_jetsam_policy |= kPolicyMoreFree; + } else { + memorystatus_jetsam_policy &= ~kPolicyMoreFree; + } + + memorystatus_update_levels_locked(); + + lck_mtx_unlock(memorystatus_list_mlock); + + return 0; } +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_jetsam_policy_more_free, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED|CTLFLAG_ANYBODY, + 0, 0, &sysctl_memorystatus_jetsam_policy_more_free, "I", ""); + static int -sysctl_handle_kern_memorystatus_snapshot(__unused struct sysctl_oid *oid, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +sysctl_handle_memorystatus_snapshot(__unused struct sysctl_oid *oid, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { int ret; size_t currentsize = 0; - if (jetsam_snapshot_list_count > 0) { - currentsize = sizeof(jetsam_kernel_stats_t) + sizeof(size_t) + sizeof(jetsam_snapshot_entry_t) * jetsam_snapshot_list_count; + if (memorystatus_jetsam_snapshot_list_count > 0) { + currentsize = sizeof(memorystatus_jetsam_snapshot_t) + sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_list_count - 1); } if (!currentsize) { if (req->oldptr) { -#ifdef DEBUG - printf("kern.memorystatus_snapshot returning EINVAL\n"); -#endif + MEMORYSTATUS_DEBUG(1, "kern.memorystatus_snapshot returning EINVAL\n"); return EINVAL; } else { -#ifdef DEBUG - printf("kern.memorystatus_snapshot returning 0 for size\n"); -#endif + MEMORYSTATUS_DEBUG(1, 
"kern.memorystatus_snapshot returning 0 for size\n"); } } else { -#ifdef DEBUG - printf("kern.memorystatus_snapshot returning %ld for size\n", (long)currentsize); -#endif + MEMORYSTATUS_DEBUG(1, "kern.memorystatus_snapshot returning %ld for size\n", (long)currentsize); } - ret = sysctl_io_variable(req, &jetsam_snapshot, currentsize, 0, NULL); + ret = SYSCTL_OUT(req, &memorystatus_jetsam_snapshot, currentsize); if (!ret && req->oldptr) { - jetsam_snapshot.entry_count = jetsam_snapshot_list_count = 0; + memorystatus_jetsam_snapshot.entry_count = memorystatus_jetsam_snapshot_list_count = 0; } return ret; } -SYSCTL_PROC(_kern, OID_AUTO, memorystatus_priority_list, CTLTYPE_OPAQUE|CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_handle_kern_memorystatus_priority_list, "S,jetsam_priorities", ""); -SYSCTL_PROC(_kern, OID_AUTO, memorystatus_snapshot, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_handle_kern_memorystatus_snapshot, "S,jetsam_snapshot", ""); +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_snapshot, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_handle_memorystatus_snapshot, "S,memorystatus_snapshot", ""); + +#endif /* CONFIG_JETSAM */ diff --git a/bsd/kern/kern_mib.c b/bsd/kern/kern_mib.c index 5cc239bd0..7c27eb16d 100644 --- a/bsd/kern/kern_mib.c +++ b/bsd/kern/kern_mib.c @@ -405,7 +405,7 @@ SYSCTL_PROC(_hw, HW_L2SETTINGS, l2settings, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG SYSCTL_PROC(_hw, HW_L3SETTINGS, l3settings, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_L3SETTINGS, sysctl_hw_generic, "I", ""); SYSCTL_INT (_hw, OID_AUTO, cputhreadtype, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, &cputhreadtype, 0, ""); -#if defined (__i386__) || defined (__x86_64__) +#if defined(__i386__) || defined(__x86_64__) int mmx_flag = -1; int sse_flag = -1; int sse2_flag = -1; @@ -435,6 +435,8 @@ SYSCTL_INT(_hw_optional, OID_AUTO, avx1_0, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_L SYSCTL_INT(_hw_optional, OID_AUTO, rdrand, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &rdrand_flag, 0, ""); SYSCTL_INT(_hw_optional, OID_AUTO, f16c, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &f16c_flag, 0, ""); SYSCTL_INT(_hw_optional, OID_AUTO, enfstrg, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &enfstrg_flag, 0, ""); +#else +#error Unsupported arch #endif /* !__i386__ && !__x86_64 && !__arm__ */ /* @@ -479,6 +481,8 @@ sysctl_mib_init(void) cputhreadtype = cpu_threadtype(); #if defined(__i386__) || defined (__x86_64__) cpu64bit = (_get_cpu_capabilities() & k64Bit) == k64Bit; +#else +#error Unsupported arch #endif /* @@ -530,8 +534,8 @@ sysctl_mib_init(void) packages = roundup(ml_cpu_cache_sharing(0), cpuid_info()->thread_count) / cpuid_info()->thread_count; -#else /* end __arm__ */ -# error unknown architecture +#else +#error unknown architecture #endif /* !__i386__ && !__x86_64 && !__arm__ */ } diff --git a/bsd/kern/kern_mman.c b/bsd/kern/kern_mman.c index 979af3e5d..13a64cb93 100644 --- a/bsd/kern/kern_mman.c +++ b/bsd/kern/kern_mman.c @@ -115,8 +115,11 @@ #include #include +#include + #include #include +#include #include #include @@ -145,8 +148,8 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) int err=0; vm_map_t user_map; kern_return_t result; - mach_vm_offset_t user_addr; - mach_vm_size_t user_size; + vm_map_offset_t user_addr; + vm_map_size_t user_size; vm_object_offset_t pageoff; vm_object_offset_t file_pos; int alloc_flags=0; @@ -161,8 +164,8 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) int fd = uap->fd; int num_retries = 0; - user_addr = (mach_vm_offset_t)uap->addr; - user_size 
= (mach_vm_size_t) uap->len; + user_addr = (vm_map_offset_t)uap->addr; + user_size = (vm_map_size_t) uap->len; AUDIT_ARG(addr, user_addr); AUDIT_ARG(len, user_size); @@ -207,7 +210,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) user_size += pageoff; /* low end... */ user_size = mach_vm_round_page(user_size); /* hi end */ - if ((flags & MAP_JIT) && ((flags & MAP_FIXED) || (flags & MAP_SHARED) || (flags & MAP_FILE))){ + if ((flags & MAP_JIT) && ((flags & MAP_FIXED) || (flags & MAP_SHARED) || !(flags & MAP_ANON))){ return EINVAL; } /* @@ -247,12 +250,11 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) #if CONFIG_MACF /* * Entitlement check. - * Re-enable once mac* is implemented. */ - /*error = mac_proc_check_map_anon(p, user_addr, user_size, prot, flags, &maxprot); + error = mac_proc_check_map_anon(p, user_addr, user_size, prot, flags, &maxprot); if (error) { return EINVAL; - }*/ + } #endif /* MAC */ /* @@ -279,6 +281,9 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) struct vnode_attr va; vfs_context_t ctx = vfs_context_current(); + if (flags & MAP_JIT) + return EINVAL; + /* * Mapping file, get fp for validation. Obtain vnode and make * sure it is of appropriate type. @@ -403,13 +408,10 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) #if CONFIG_PROTECT { - void *cnode; - if ((cnode = cp_get_protected_cnode(vp)) != NULL) { - error = cp_handle_vnop(cnode, CP_READ_ACCESS | CP_WRITE_ACCESS); - if (error) { - (void) vnode_put(vp); - goto bad; - } + error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0); + if (error) { + (void) vnode_put(vp); + goto bad; } } #endif /* CONFIG_PROTECT */ @@ -616,7 +618,6 @@ bad: KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_mmap) | DBG_FUNC_NONE), fd, (uint32_t)(*retval), (uint32_t)user_size, error, 0); KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO2, SYS_mmap) | DBG_FUNC_NONE), (uint32_t)(*retval >> 32), (uint32_t)(user_size >> 32), (uint32_t)(file_pos >> 32), (uint32_t)file_pos, 0); - return(error); } @@ -639,9 +640,7 @@ msync_nocancel(__unused proc_t p, struct msync_nocancel_args *uap, __unused int3 addr = (mach_vm_offset_t) uap->addr; size = (mach_vm_size_t)uap->len; - KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_msync) | DBG_FUNC_NONE), (uint32_t)(addr >> 32), (uint32_t)(size >> 32), 0, 0, 0); - if (addr & PAGE_MASK_64) { /* UNIX SPEC: user address is not page-aligned, return EINVAL */ return EINVAL; @@ -1175,14 +1174,11 @@ map_fd_funneled( #if CONFIG_PROTECT /* check for content protection access */ { - void *cnode; - if ((cnode = cp_get_protected_cnode(vp)) != NULL) { - err = cp_handle_vnop(cnode, CP_READ_ACCESS | CP_WRITE_ACCESS); - if (err != 0) { - (void)vnode_put(vp); - goto bad; - } - } + err = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0); + if (err != 0) { + (void) vnode_put(vp); + goto bad; + } } #endif /* CONFIG_PROTECT */ diff --git a/bsd/kern/kern_proc.c b/bsd/kern/kern_proc.c index f352a55bf..6d696b424 100644 --- a/bsd/kern/kern_proc.c +++ b/bsd/kern/kern_proc.c @@ -103,6 +103,7 @@ #include #include /* vm_map_switch_protect() */ #include +#include #if CONFIG_MACF #include @@ -174,6 +175,7 @@ static void pgrp_remove(proc_t p); static void pgrp_replace(proc_t p, struct pgrp *pgrp); static void pgdelete_dropref(struct pgrp *pgrp); extern void pg_rele_dropref(struct pgrp * pgrp); +static int csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user_addr_t uaddittoken); struct fixjob_iterargs { struct 
pgrp * pg; @@ -353,6 +355,23 @@ proc_findinternal(int pid, int locked) return(p); } +proc_t +proc_findthread(thread_t thread) +{ + proc_t p = PROC_NULL; + struct uthread *uth; + + proc_list_lock(); + uth = get_bsdthread_info(thread); + if (uth && (uth->uu_flag & UT_VFORK)) + p = uth->uu_proc; + else + p = (proc_t)(get_bsdthreadtask_info(thread)); + p = proc_ref_locked(p); + proc_list_unlock(); + return(p); +} + int proc_rele(proc_t p) { @@ -733,6 +752,12 @@ proc_suser(proc_t p) return(error); } +task_t +proc_task(proc_t proc) +{ + return (task_t)proc->task; +} + /* * Obtain the first thread in a process * @@ -1686,10 +1711,31 @@ SYSCTL_INT(_kern_lctx, OID_AUTO, max, CTLFLAG_RW | CTLFLAG_LOCKED, &maxlcid, 0, int csops(__unused proc_t p, struct csops_args *uap, __unused int32_t *retval) { - int ops = uap->ops; - pid_t pid = uap->pid; - user_addr_t uaddr = uap->useraddr; - size_t usize = (size_t)CAST_DOWN(size_t, uap->usersize); + return(csops_internal(uap->pid, uap->ops, uap->useraddr, + uap->usersize, USER_ADDR_NULL)); +} + +int +csops_audittoken(__unused proc_t p, struct csops_audittoken_args *uap, __unused int32_t *retval) +{ + if (uap->uaudittoken == USER_ADDR_NULL) + return(EINVAL); + switch (uap->ops) { + case CS_OPS_PIDPATH: + case CS_OPS_ENTITLEMENTS_BLOB: + break; + default: + return(EINVAL); + }; + + return(csops_internal(uap->pid, uap->ops, uap->useraddr, + uap->usersize, uap->uaudittoken)); +} + +static int +csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user_addr_t uaudittoken) +{ + size_t usize = (size_t)CAST_DOWN(size_t, usersize); proc_t pt; uint32_t retflags; int vid, forself; @@ -1698,6 +1744,8 @@ csops(__unused proc_t p, struct csops_args *uap, __unused int32_t *retval) off_t toff; char * buf; unsigned char cdhash[SHA1_RESULTLEN]; + audit_token_t token; + unsigned int upid=0, uidversion = 0; forself = error = 0; @@ -1714,15 +1762,37 @@ csops(__unused proc_t p, struct csops_args *uap, __unused int32_t *retval) return(EOVERFLOW); if (kauth_cred_issuser(kauth_cred_get()) != TRUE) return(EPERM); - } else if ((forself == 0) && ((ops != CS_OPS_STATUS) && (ops != CS_OPS_CDHASH) && (ops != CS_OPS_PIDOFFSET) && (kauth_cred_issuser(kauth_cred_get()) != TRUE))) { - return(EPERM); + } else { + switch (ops) { + case CS_OPS_STATUS: + case CS_OPS_CDHASH: + case CS_OPS_PIDOFFSET: + case CS_OPS_ENTITLEMENTS_BLOB: + break; /* unrestricted */ + default: + if (forself == 0 && kauth_cred_issuser(kauth_cred_get()) != TRUE) + return(EPERM); + break; + } } pt = proc_find(pid); if (pt == PROC_NULL) return(ESRCH); - + upid = pt->p_pid; + uidversion = pt->p_idversion; + if (uaudittoken != USER_ADDR_NULL) { + + error = copyin(uaudittoken, &token, sizeof(audit_token_t)); + if (error != 0) + goto out; + /* verify the audit token pid/idversion matches with proc */ + if ((token.val[5] != upid) || (token.val[7] != uidversion)) { + error = ESRCH; + goto out; + } + } switch (ops) { @@ -1833,20 +1903,34 @@ csops(__unused proc_t p, struct csops_args *uap, __unused int32_t *retval) return error; case CS_OPS_ENTITLEMENTS_BLOB: { - char zeros[8] = { 0 }; + char fakeheader[8] = { 0 }; void *start; size_t length; - if (0 != (error = cs_entitlements_blob_get(pt, - &start, &length))) + if ((pt->p_csflags & CS_VALID) == 0) { + error = EINVAL; break; - if (usize < sizeof(zeros) || usize < length) { + } + if (usize < sizeof(fakeheader)) { error = ERANGE; break; } + if (0 != (error = cs_entitlements_blob_get(pt, + &start, &length))) + break; + /* if no entitlement, fill in zero header */ if 
(NULL == start) {
-		start = zeros;
-		length = sizeof(zeros);
+		start = fakeheader;
+		length = sizeof(fakeheader);
+	} else if (usize < length) {
+		/* ... if input too short, copy out length of entitlement */
+		uint32_t length32 = htonl((uint32_t)length);
+		memcpy(&fakeheader[4], &length32, sizeof(length32));
+
+		error = copyout(fakeheader, uaddr, sizeof(fakeheader));
+		if (error == 0)
+			error = ERANGE;	/* input buffer too short, ERANGE signals that */
+		break;
 	}
 	error = copyout(start, uaddr, length);
 	break;
@@ -1867,7 +1951,6 @@ out:
 	return(error);
 }
-
 int
 proc_iterate(flags, callout, arg, filterfn, filterarg)
 	int flags;
diff --git a/bsd/kern/kern_resource.c b/bsd/kern/kern_resource.c
index d2473dbf0..ca41339ea 100644
--- a/bsd/kern/kern_resource.c
+++ b/bsd/kern/kern_resource.c
@@ -113,7 +113,8 @@ int uthread_get_background_state(uthread_t);
 static void do_background_socket(struct proc *p, thread_t thread, int priority);
 static int do_background_thread(struct proc *curp, thread_t thread, int priority);
 static int do_background_proc(struct proc *curp, struct proc *targetp, int priority);
-void proc_apply_task_networkbg_internal(proc_t);
+void proc_apply_task_networkbg_internal(proc_t, thread_t);
+void proc_restore_task_networkbg_internal(proc_t, thread_t);
 rlim_t maxdmap = MAXDSIZ;	/* XXX */
 rlim_t maxsmap = MAXSSIZ - PAGE_SIZE;	/* XXX */
@@ -368,6 +369,9 @@ setpriority(struct proc *curp, struct setpriority_args *uap, __unused int32_t *r
 	}
 	case PRIO_DARWIN_THREAD: {
+		/* process marked for termination: no priority management */
+		if ((curp->p_lflag & P_LPTERMINATE) != 0)
+			return(EINVAL);
 		/* we currently only support the current thread */
 		if (uap->who != 0) {
 			return (EINVAL);
@@ -390,11 +394,16 @@ setpriority(struct proc *curp, struct setpriority_args *uap, __unused int32_t *r
 				refheld = 1;
 			}
-			error = do_background_proc(curp, p, uap->prio);
-			if (!error) {
-				(void) do_background_socket(p, NULL, uap->prio);
-			}
+			/* process marked for termination: no priority management */
+			if ((p->p_lflag & P_LPTERMINATE) != 0) {
+				error = EINVAL;
+			} else {
+				error = do_background_proc(curp, p, uap->prio);
+				if (!error) {
+					(void) do_background_socket(p, NULL, uap->prio);
+				}
+			}
 			found++;
 			if (refheld != 0)
 				proc_rele(p);
@@ -461,16 +470,13 @@ do_background_proc(struct proc *curp, struct proc *targetp, int priority)
 	int error = 0;
 	kauth_cred_t ucred;
 	kauth_cred_t target_cred;
-#if CONFIG_EMBEDDED
-	task_category_policy_data_t info;
-#endif
 	ucred = kauth_cred_get();
 	target_cred = kauth_cred_proc_ref(targetp);
 	if (!kauth_cred_issuser(ucred) && kauth_cred_getruid(ucred) &&
-	    kauth_cred_getuid(ucred) != kauth_cred_getuid(target_cred) &&
-	    kauth_cred_getruid(ucred) != kauth_cred_getuid(target_cred))
+		kauth_cred_getuid(ucred) != kauth_cred_getuid(target_cred) &&
+		kauth_cred_getruid(ucred) != kauth_cred_getuid(target_cred))
 	{
 		error = EPERM;
 		goto out;
@@ -482,49 +488,12 @@ do_background_proc(struct proc *curp, struct proc *targetp, int priority)
 		goto out;
 #endif
-#if !CONFIG_EMBEDDED
 	if (priority == PRIO_DARWIN_NONUI)
 		error = proc_apply_task_gpuacc(targetp->task, TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS);
 	else
-		error = proc_set1_bgtaskpolicy(targetp->task, priority);
+		error = proc_set_and_apply_bgtaskpolicy(targetp->task, priority);
 	if (error)
 		goto out;
-#else /* !CONFIG_EMBEDDED */
-
-	/* set the max scheduling priority on the task */
-	if (priority == PRIO_DARWIN_BG) {
-		info.role = TASK_THROTTLE_APPLICATION;
-	}
-	else if (priority == PRIO_DARWIN_NONUI) {
-		info.role = TASK_NONUI_APPLICATION;
-	}
-	else {
-
info.role = TASK_DEFAULT_APPLICATION; - } - - error = task_policy_set(targetp->task, - TASK_CATEGORY_POLICY, - (task_policy_t) &info, - TASK_CATEGORY_POLICY_COUNT); - - if (error) - goto out; - - proc_lock(targetp); - - /* mark proc structure as backgrounded */ - if (priority == PRIO_DARWIN_BG) { - targetp->p_lflag |= P_LBACKGROUND; - } else { - targetp->p_lflag &= ~P_LBACKGROUND; - } - - /* set or reset the disk I/O priority */ - targetp->p_iopol_disk = (priority == PRIO_DARWIN_BG ? - IOPOL_THROTTLE : IOPOL_DEFAULT); - - proc_unlock(targetp); -#endif /* !CONFIG_EMBEDDED */ out: kauth_cred_unref(&target_cred); @@ -610,11 +579,7 @@ static int do_background_thread(struct proc *curp __unused, thread_t thread, int priority) { struct uthread *ut; -#if !CONFIG_EMBEDDED int error = 0; -#else /* !CONFIG_EMBEDDED */ - thread_precedence_policy_data_t policy; -#endif /* !CONFIG_EMBEDDED */ ut = get_bsdthread_info(thread); @@ -623,61 +588,9 @@ do_background_thread(struct proc *curp __unused, thread_t thread, int priority) return(EPERM); } -#if !CONFIG_EMBEDDED - error = proc_set1_bgthreadpolicy(curp->task, thread_tid(thread), priority); + error = proc_set_and_apply_bgthreadpolicy(curp->task, thread_tid(thread), priority); return(error); -#else /* !CONFIG_EMBEDDED */ - if ( (priority & PRIO_DARWIN_BG) == 0 ) { - /* turn off backgrounding of thread */ - if ( (ut->uu_flag & UT_BACKGROUND) == 0 ) { - /* already off */ - return(0); - } - - /* - * Clear background bit in thread and disable disk IO - * throttle as well as network traffic management. - * The corresponding socket flags for sockets created by - * this thread will be cleared in do_background_socket(). - */ - ut->uu_flag &= ~(UT_BACKGROUND | UT_BACKGROUND_TRAFFIC_MGT); - ut->uu_iopol_disk = IOPOL_NORMAL; - - /* reset thread priority (we did not save previous value) */ - policy.importance = 0; - thread_policy_set( thread, THREAD_PRECEDENCE_POLICY, - (thread_policy_t)&policy, - THREAD_PRECEDENCE_POLICY_COUNT ); - return(0); - } - - /* background this thread */ - if ( (ut->uu_flag & UT_BACKGROUND) != 0 ) { - /* already backgrounded */ - return(0); - } - /* - * Tag thread as background and throttle disk IO, as well - * as regulate network traffics. Future sockets created - * by this thread will have their corresponding socket - * flags set at socket create time. - */ - ut->uu_flag |= (UT_BACKGROUND | UT_BACKGROUND_TRAFFIC_MGT); - ut->uu_iopol_disk = IOPOL_THROTTLE; - - policy.importance = INT_MIN; - thread_policy_set( thread, THREAD_PRECEDENCE_POLICY, - (thread_policy_t)&policy, - THREAD_PRECEDENCE_POLICY_COUNT ); - - /* throttle networking IO happens in socket( ) syscall. - * If UT_{BACKGROUND,BACKGROUND_TRAFFIC_MGT} is set in the current - * thread then TRAFFIC_MGT_SO_{BACKGROUND,BG_REGULATE} is set. - * Existing sockets are taken care of by do_background_socket(). - */ -#endif /* !CONFIG_EMBEDDED */ - return(0); } #if CONFIG_EMBEDDED @@ -726,29 +639,6 @@ out: } #endif /* CONFIG_EMBEDDED */ -#if CONFIG_EMBEDDED -/* - * If the thread or its proc has been put into the background - * with setpriority(PRIO_DARWIN_{THREAD,PROCESS}, *, PRIO_DARWIN_BG), - * report that status. 
- * - * Returns: PRIO_DARWIN_BG if background - * 0 if foreground - */ -int -uthread_get_background_state(uthread_t uth) -{ - proc_t p = uth->uu_proc; - if (p && (p->p_lflag & P_LBACKGROUND)) - return PRIO_DARWIN_BG; - - if (uth->uu_flag & UT_BACKGROUND) - return PRIO_DARWIN_BG; - - return 0; -} -#endif /* CONFIG_EMBEDDED */ - /* * Returns: 0 Success * copyin:EFAULT @@ -891,12 +781,7 @@ dosetrlimit(struct proc *p, u_int which, struct rlimit *limp) size = round_page_64(limp->rlim_cur); size -= round_page_64(alimp->rlim_cur); -#if STACK_GROWTH_UP - /* go to top of current stack */ - addr = p->user_stack + round_page_64(alimp->rlim_cur); -#else /* STACK_GROWTH_UP */ addr = p->user_stack - round_page_64(limp->rlim_cur); -#endif /* STACK_GROWTH_UP */ kr = mach_vm_protect(current_map(), addr, size, FALSE, VM_PROT_DEFAULT); @@ -918,28 +803,6 @@ dosetrlimit(struct proc *p, u_int which, struct rlimit *limp) */ cur_sp = thread_adjuserstack(current_thread(), 0); -#if STACK_GROWTH_UP - if (cur_sp >= p->user_stack && - cur_sp < (p->user_stack + - round_page_64(alimp->rlim_cur))) { - /* current stack pointer is in main stack */ - if (cur_sp >= (p->user_stack + - round_page_64(limp->rlim_cur))) { - /* - * New limit would cause - * current usage to be invalid: - * reject new limit. - */ - error = EINVAL; - goto out; - } - } else { - /* not on the main stack: reject */ - error = EINVAL; - goto out; - } - -#else /* STACK_GROWTH_UP */ if (cur_sp <= p->user_stack && cur_sp > (p->user_stack - round_page_64(alimp->rlim_cur))) { @@ -959,16 +822,11 @@ dosetrlimit(struct proc *p, u_int which, struct rlimit *limp) error = EINVAL; goto out; } -#endif /* STACK_GROWTH_UP */ size = round_page_64(alimp->rlim_cur); size -= round_page_64(limp->rlim_cur); -#if STACK_GROWTH_UP - addr = p->user_stack + round_page_64(limp->rlim_cur); -#else /* STACK_GROWTH_UP */ addr = p->user_stack - round_page_64(alimp->rlim_cur); -#endif /* STACK_GROWTH_UP */ kr = mach_vm_protect(current_map(), addr, size, @@ -1092,15 +950,15 @@ calcru(struct proc *p, struct timeval *up, struct timeval *sp, struct timeval *i task = p->task; if (task) { - task_basic_info_32_data_t tinfo; + mach_task_basic_info_data_t tinfo; task_thread_times_info_data_t ttimesinfo; task_events_info_data_t teventsinfo; mach_msg_type_number_t task_info_count, task_ttimes_count; mach_msg_type_number_t task_events_count; struct timeval ut,st; - task_info_count = TASK_BASIC_INFO_32_COUNT; - task_info(task, TASK_BASIC2_INFO_32, + task_info_count = MACH_TASK_BASIC_INFO_COUNT; + task_info(task, MACH_TASK_BASIC_INFO, (task_info_t)&tinfo, &task_info_count); ut.tv_sec = tinfo.user_time.seconds; ut.tv_usec = tinfo.user_time.microseconds; @@ -1136,7 +994,7 @@ calcru(struct proc *p, struct timeval *up, struct timeval *sp, struct timeval *i if (p->p_stats->p_ru.ru_nivcsw < 0) p->p_stats->p_ru.ru_nivcsw = 0; - p->p_stats->p_ru.ru_maxrss = tinfo.resident_size; + p->p_stats->p_ru.ru_maxrss = tinfo.resident_size_max; } } @@ -1330,13 +1188,7 @@ iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __un { int error = 0; struct _iopol_param_t iop_param; -#if !CONFIG_EMBEDDED int processwide = 0; -#else /* !CONFIG_EMBEDDED */ - thread_t thread = THREAD_NULL; - struct uthread *ut = NULL; - int *policy; -#endif /* !CONFIG_EMBEDDED */ if ((error = copyin(uap->arg, &iop_param, sizeof(iop_param))) != 0) goto out; @@ -1346,7 +1198,6 @@ iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __un goto out; } -#if !CONFIG_EMBEDDED switch (iop_param.iop_scope) { 
case IOPOL_SCOPE_PROCESS: processwide = 1; @@ -1366,6 +1217,7 @@ iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __un case IOPOL_NORMAL: case IOPOL_THROTTLE: case IOPOL_PASSIVE: + case IOPOL_UTILITY: if(processwide != 0) proc_apply_task_diskacc(current_task(), iop_param.iop_policy); else @@ -1392,61 +1244,6 @@ iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __un break; } -#else /* !CONFIG_EMBEDDED */ - switch (iop_param.iop_scope) { - case IOPOL_SCOPE_PROCESS: - policy = &p->p_iopol_disk; - break; - case IOPOL_SCOPE_THREAD: - thread = current_thread(); - ut = get_bsdthread_info(thread); - policy = &ut->uu_iopol_disk; - break; - default: - error = EINVAL; - goto out; - } - - switch(uap->cmd) { - case IOPOL_CMD_SET: - switch (iop_param.iop_policy) { - case IOPOL_DEFAULT: - case IOPOL_NORMAL: - case IOPOL_THROTTLE: - case IOPOL_PASSIVE: - proc_lock(p); - *policy = iop_param.iop_policy; - proc_unlock(p); - break; - default: - error = EINVAL; - goto out; - } - break; - case IOPOL_CMD_GET: - switch (*policy) { - case IOPOL_DEFAULT: - case IOPOL_NORMAL: - case IOPOL_THROTTLE: - case IOPOL_PASSIVE: - iop_param.iop_policy = *policy; - break; - default: // in-kernel - // this should never happen - printf("%s: unknown I/O policy %d\n", __func__, *policy); - // restore to default value - *policy = IOPOL_DEFAULT; - iop_param.iop_policy = *policy; - } - - error = copyout((caddr_t)&iop_param, uap->arg, sizeof(iop_param)); - break; - default: - error = EINVAL; // unknown command - break; - } - -#endif /* !CONFIG_EMBEDDED */ out: *retval = error; return (error); @@ -1458,28 +1255,7 @@ boolean_t thread_is_io_throttled(void); boolean_t thread_is_io_throttled(void) { - -#if !CONFIG_EMBEDDED - return(proc_get_task_selfdiskacc() == IOPOL_THROTTLE); - -#else /* !CONFIG_EMBEDDED */ - int policy; - struct uthread *ut; - - ut = get_bsdthread_info(current_thread()); - - if(ut){ - policy = current_proc()->p_iopol_disk; - - if (ut->uu_iopol_disk != IOPOL_DEFAULT) - policy = ut->uu_iopol_disk; - - if (policy == IOPOL_THROTTLE) - return TRUE; - } - return FALSE; -#endif /* !CONFIG_EMBEDDED */ } void @@ -1523,10 +1299,17 @@ proc_set_task_networkbg(void * bsdinfo, int setbg) } void -proc_apply_task_networkbg_internal(proc_t p) +proc_apply_task_networkbg_internal(proc_t p, thread_t thread) { if (p != PROC_NULL) { - do_background_socket(p, NULL, PRIO_DARWIN_BG); + do_background_socket(p, thread, PRIO_DARWIN_BG); + } +} +void +proc_restore_task_networkbg_internal(proc_t p, thread_t thread) +{ + if (p != PROC_NULL) { + do_background_socket(p, thread, PRIO_DARWIN_BG); } } diff --git a/bsd/kern/kern_sig.c b/bsd/kern/kern_sig.c index de5455812..d656dcaf3 100644 --- a/bsd/kern/kern_sig.c +++ b/bsd/kern/kern_sig.c @@ -121,7 +121,6 @@ extern int thread_enable_fpe(thread_t act, int onoff); extern thread_t port_name_to_thread(mach_port_name_t port_name); extern kern_return_t get_signalact(task_t , thread_t *, int); -extern boolean_t thread_should_abort(thread_t); extern unsigned int get_useraddr(void); /* @@ -655,7 +654,7 @@ siginit(proc_t p) { int i; - for (i = 0; i < NSIG; i++) + for (i = 1; i < NSIG; i++) if (sigprop[i] & SA_IGNORE && i != SIGCONT) p->p_sigignore |= sigmask(i); } @@ -1637,7 +1636,7 @@ threadsignal(thread_t sig_actthread, int signum, mach_exception_code_t code) p = (proc_t)(get_bsdtask_info(sig_task)); uth = get_bsdthread_info(sig_actthread); - if (uth && (uth->uu_flag & UT_VFORK)) + if (uth->uu_flag & UT_VFORK) p = uth->uu_proc; proc_lock(p); @@ -2069,7 
+2068,6 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) * Process will be running after 'run' */ sig_proc->p_stat = SRUN; - proc_unlock(sig_proc); /* * In scenarios where suspend/resume are racing * the signal we are missing AST_BSD by the time @@ -2079,6 +2077,7 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) */ act_set_astbsd(sig_thread); thread_abort(sig_thread); + proc_unlock(sig_proc); goto psigout; @@ -2206,7 +2205,7 @@ psignal_uthread(thread_t thread, int signum) * postsig(signum); */ int -issignal(proc_t p) +issignal_locked(proc_t p) { int signum, mask, prop, sigbits; thread_t cur_act; @@ -2223,13 +2222,11 @@ issignal(proc_t p) ram_printf(3); } #endif /* SIGNAL_DEBUG */ - proc_lock(p); /* * Try to grab the signal lock. */ if (sig_try_locked(p) <= 0) { - proc_unlock(p); return(0); } @@ -2362,6 +2359,7 @@ issignal(proc_t p) KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_FRCEXIT) | DBG_FUNC_NONE, p->p_pid, W_EXITCODE(0, SIGKILL), 2, 0, 0); exit1(p, W_EXITCODE(0, SIGKILL), (int *)NULL); + proc_lock(p); return(0); } @@ -2503,7 +2501,6 @@ issignal(proc_t p) /* NOTREACHED */ out: proc_signalend(p, 1); - proc_unlock(p); return(retval); } @@ -2653,7 +2650,7 @@ stop(proc_t p, proc_t parent) * from the current set of pending signals. */ void -postsig(int signum) +postsig_locked(int signum) { proc_t p = current_proc(); struct sigacts *ps = p->p_sigacts; @@ -2672,12 +2669,10 @@ postsig(int signum) panic("psig not on master"); #endif - proc_lock(p); /* * Try to grab the signal lock. */ if (sig_try_locked(p) <= 0) { - proc_unlock(p); return; } @@ -2713,6 +2708,16 @@ postsig(int signum) ut->t_dtrace_siginfo.si_uid = p->si_uid; ut->t_dtrace_siginfo.si_status = WEXITSTATUS(p->si_status); + /* Fire DTrace proc:::fault probe when signal is generated by hardware. 
*/ + switch (signum) { + case SIGILL: case SIGBUS: case SIGSEGV: case SIGFPE: case SIGTRAP: + DTRACE_PROC2(fault, int, (int)(ut->uu_code), siginfo_t *, &(ut->t_dtrace_siginfo)); + break; + default: + break; + } + + DTRACE_PROC3(signal__handle, int, signum, siginfo_t *, &(ut->t_dtrace_siginfo), void (*)(void), SIG_DFL); #endif @@ -2720,6 +2725,7 @@ postsig(int signum) KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_FRCEXIT) | DBG_FUNC_NONE, p->p_pid, W_EXITCODE(0, signum), 3, 0, 0); exit1(p, W_EXITCODE(0, signum), (int *)NULL); + proc_lock(p); return; } else { /* @@ -2767,7 +2773,6 @@ postsig(int signum) sendsig(p, catcher, signum, returnmask, code); } proc_signalend(p, 1); - proc_unlock(p); } /* @@ -2963,10 +2968,12 @@ bsd_ast(thread_t thread) #endif /* CONFIG_DTRACE */ + proc_lock(p); if (CHECK_SIGNALS(p, current_thread(), ut)) { - while ( (signum = issignal(p)) ) - postsig(signum); + while ( (signum = issignal_locked(p)) ) + postsig_locked(signum); } + proc_unlock(p); if (!bsd_init_done) { bsd_init_done = 1; @@ -3116,7 +3123,10 @@ sig_lock_to_exit(proc_t p) p->exit_thread = self; proc_unlock(p); - (void) task_suspend(p->task); + + task_hold(p->task); + task_wait(p->task, FALSE); + proc_lock(p); } diff --git a/bsd/kern/kern_subr.c b/bsd/kern/kern_subr.c index cc05a7db7..9e9587bea 100644 --- a/bsd/kern/kern_subr.c +++ b/bsd/kern/kern_subr.c @@ -153,7 +153,7 @@ uiomove64(const addr64_t c_cp, int n, struct uio *uio) if (n > 0 && acnt > (uint64_t)n) acnt = n; - switch (uio->uio_segflg) { + switch ((int) uio->uio_segflg) { case UIO_USERSPACE64: case UIO_USERISPACE64: @@ -280,7 +280,7 @@ ureadc(int c, struct uio *uio) if (uio_curriovlen(uio) <= 0) panic("ureadc: non-positive iovlen"); - switch (uio->uio_segflg) { + switch ((int) uio->uio_segflg) { case UIO_USERSPACE32: case UIO_USERSPACE: diff --git a/bsd/kern/kern_symfile.c b/bsd/kern/kern_symfile.c index b1db73f0c..d0d467494 100644 --- a/bsd/kern/kern_symfile.c +++ b/bsd/kern/kern_symfile.c @@ -78,7 +78,8 @@ struct kern_direct_file_io_ref_t struct vnode * vp; dev_t device; uint32_t blksize; - off_t filelength; + off_t filelength; + char pinned; }; @@ -95,8 +96,82 @@ static int device_ioctl(void * p1, __unused void * p2, u_long theIoctl, caddr_t return (VNOP_IOCTL(p1, theIoctl, result, 0, p2)); } -void -kern_unmap_file(struct kern_direct_file_io_ref_t * ref, off_t f_offset, off_t end); +static int +kern_ioctl_file_extents(struct kern_direct_file_io_ref_t * ref, u_long theIoctl, off_t offset, off_t end) +{ + int error; + int (*do_ioctl)(void * p1, void * p2, u_long theIoctl, caddr_t result); + void * p1; + void * p2; + uint64_t fileblk; + size_t filechunk; + dk_extent_t extent; + dk_unmap_t unmap; + _dk_cs_pin_t pin; + + bzero(&extent, sizeof(dk_extent_t)); + bzero(&unmap, sizeof(dk_unmap_t)); + bzero(&pin, sizeof(pin)); + if (ref->vp->v_type == VREG) + { + p1 = &ref->device; + p2 = kernproc; + do_ioctl = &file_ioctl; + } + else + { + /* Partition. 
*/ + p1 = ref->vp; + p2 = ref->ctx; + do_ioctl = &device_ioctl; + } + while (offset < end) + { + if (ref->vp->v_type == VREG) + { + daddr64_t blkno; + filechunk = 1*1024*1024*1024; + if (filechunk > (size_t)(end - offset)) + filechunk = (size_t)(end - offset); + error = VNOP_BLOCKMAP(ref->vp, offset, filechunk, &blkno, &filechunk, NULL, 0, NULL); + if (error) break; + fileblk = blkno * ref->blksize; + } + else if ((ref->vp->v_type == VBLK) || (ref->vp->v_type == VCHR)) + { + fileblk = offset; + filechunk = ref->filelength; + } + + if (DKIOCUNMAP == theIoctl) + { + extent.offset = fileblk; + extent.length = filechunk; + unmap.extents = &extent; + unmap.extentsCount = 1; + error = do_ioctl(p1, p2, theIoctl, (caddr_t)&unmap); +// printf("DKIOCUNMAP(%d) 0x%qx, 0x%qx\n", error, extent.offset, extent.length); + } + else if (_DKIOCCSPINEXTENT == theIoctl) + { + pin.cp_extent.offset = fileblk; + pin.cp_extent.length = filechunk; + pin.cp_flags = _DKIOCSPINDISCARDDATA; + error = do_ioctl(p1, p2, theIoctl, (caddr_t)&pin); + if (error && (ENOTTY != error)) + { + printf("_DKIOCCSPINEXTENT(%d) 0x%qx, 0x%qx\n", + error, pin.cp_extent.offset, pin.cp_extent.length); + } + } + else error = EINVAL; + + if (error) break; + offset += filechunk; + } + return (error); +} + int kern_write_file(struct kern_direct_file_io_ref_t * ref, off_t offset, caddr_t addr, vm_size_t len); @@ -143,7 +218,7 @@ kern_open_file_for_direct_io(const char * name, goto out; } - ref->vp = NULL; + bzero(ref, sizeof(*ref)); p = kernproc; ref->ctx = vfs_context_create(vfs_context_current()); @@ -197,13 +272,6 @@ kern_open_file_for_direct_io(const char * name, } ref->device = device; - // generate the block list - - error = do_ioctl(p1, p2, DKIOCLOCKPHYSICALEXTENTS, NULL); - if (error) - goto out; - locked = TRUE; - // get block size error = do_ioctl(p1, p2, DKIOCGETBLOCKSIZE, (caddr_t) &ref->blksize); @@ -220,6 +288,19 @@ kern_open_file_for_direct_io(const char * name, ref->filelength = fileblk * ref->blksize; } + // pin logical extents + + error = kern_ioctl_file_extents(ref, _DKIOCCSPINEXTENT, 0, ref->filelength); + if (error && (ENOTTY != error)) goto out; + ref->pinned = (error == 0); + + // generate the block list + + error = do_ioctl(p1, p2, DKIOCLOCKPHYSICALEXTENTS, NULL); + if (error) + goto out; + locked = TRUE; + f_offset = 0; while (f_offset < ref->filelength) { @@ -370,60 +451,6 @@ kern_write_file(struct kern_direct_file_io_ref_t * ref, off_t offset, caddr_t ad vfs_context_proc(ref->ctx))); } -void -kern_unmap_file(struct kern_direct_file_io_ref_t * ref, off_t offset, off_t end) -{ - int error; - int (*do_ioctl)(void * p1, void * p2, u_long theIoctl, caddr_t result); - void * p1; - void * p2; - dk_extent_t extent; - dk_unmap_t unmap; - uint64_t fileblk; - size_t filechunk; - - bzero(&extent, sizeof(dk_extent_t)); - bzero(&unmap, sizeof(dk_unmap_t)); - if (ref->vp->v_type == VREG) - { - p1 = &ref->device; - p2 = kernproc; - do_ioctl = &file_ioctl; - } - else - { - /* Partition. 
*/ - p1 = ref->vp; - p2 = ref->ctx; - do_ioctl = &device_ioctl; - } - while (offset < end) - { - if (ref->vp->v_type == VREG) - { - daddr64_t blkno; - filechunk = 1*1024*1024*1024; - if (filechunk > (size_t)(end - offset)) - filechunk = (size_t)(end - offset); - error = VNOP_BLOCKMAP(ref->vp, offset, filechunk, &blkno, &filechunk, NULL, 0, NULL); - if (error) break; - fileblk = blkno * ref->blksize; - } - else if ((ref->vp->v_type == VBLK) || (ref->vp->v_type == VCHR)) - { - fileblk = offset; - filechunk = ref->filelength; - } - extent.offset = fileblk; - extent.length = filechunk; - unmap.extents = &extent; - unmap.extentsCount = 1; - error = do_ioctl(p1, p2, DKIOCUNMAP, (caddr_t)&unmap); -// kprintf("DKIOCUNMAP(%d) 0x%qx, 0x%qx\n", error, extent.offset, extent.length); - if (error) break; - offset += filechunk; - } -} void kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref, @@ -460,9 +487,9 @@ kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref, { (void) kern_write_file(ref, write_offset, addr, write_length); } - if (discard_offset && discard_end) + if (discard_offset && discard_end && !ref->pinned) { - (void) kern_unmap_file(ref, discard_offset, discard_end); + (void) kern_ioctl_file_extents(ref, DKIOCUNMAP, discard_offset, discard_end); } error = vnode_close(ref->vp, FWRITE, ref->ctx); diff --git a/bsd/kern/kern_synch.c b/bsd/kern/kern_synch.c index c6b4888c3..34cb1520a 100644 --- a/bsd/kern/kern_synch.c +++ b/bsd/kern/kern_synch.c @@ -58,7 +58,6 @@ #include /* for unix_syscall_return() */ #include -extern boolean_t thread_should_abort(thread_t); /* XXX */ extern void compute_averunnable(void *); /* XXX */ diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index e1f693be2..56782c39c 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -112,6 +112,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -136,6 +137,16 @@
 #include
 #endif
+#if CONFIG_FREEZE
+#include
+#endif
+
+/*
+ * deliberately setting max requests to a really high number
+ * so that runaway settings do not cause MALLOC overflows
+ */
+#define AIO_MAX_REQUESTS (128 * CONFIG_AIO_MAX)
+
 extern sysctlfn net_sysctl;
 extern sysctlfn cpu_sysctl;
 extern int aio_max_requests;
@@ -147,6 +158,7 @@ extern int nx_enabled;
 extern int speculative_reads_disabled;
 extern int ignore_is_ssd;
 extern unsigned int speculative_prefetch_max;
+extern unsigned int speculative_prefetch_max_iosize;
 extern unsigned int preheat_pages_max;
 extern unsigned int preheat_pages_min;
 extern long numvnodes;
@@ -175,15 +187,18 @@ extern unsigned int vm_page_speculative_q_age_ms;
 extern boolean_t mach_timer_coalescing_enabled;
 STATIC void
-fill_user32_eproc(proc_t p, struct user32_eproc *ep);
+fill_user32_eproc(proc_t, struct user32_eproc *__restrict);
+STATIC void
+fill_user32_externproc(proc_t, struct user32_extern_proc *__restrict);
 STATIC void
-fill_user32_externproc(proc_t p, struct user32_extern_proc *exp);
+fill_user64_eproc(proc_t, struct user64_eproc *__restrict);
 STATIC void
-fill_user64_eproc(proc_t p, struct user64_eproc *ep);
+fill_user64_proc(proc_t, struct user64_kinfo_proc *__restrict);
 STATIC void
-fill_user64_proc(proc_t p, struct user64_kinfo_proc *kp);
+fill_user64_externproc(proc_t, struct user64_extern_proc *__restrict);
 STATIC void
-fill_user64_externproc(proc_t p, struct user64_extern_proc *exp);
+fill_user32_proc(proc_t, struct user32_kinfo_proc *__restrict);
+
 extern int kdbg_control(int *name, u_int namelen, user_addr_t where, size_t * sizep);
 #if NFSCLIENT
@@ -195,8 +210,6 @@
 pcsamples_ops(int *name, u_int namelen, user_addr_t where, size_t *sizep, proc_t p);
 __private_extern__ kern_return_t
 reset_vmobjectcache(unsigned int val1, unsigned int val2);
-STATIC void
-fill_user32_proc(proc_t p, struct user32_kinfo_proc *kp);
 int
 sysctl_procargs(int *name, u_int namelen, user_addr_t where, size_t *sizep, proc_t cur_proc);
@@ -221,11 +234,9 @@ int sysdoproc_callback(proc_t p, void *arg);
 /* forward declarations for non-static STATIC */
 STATIC void fill_loadavg64(struct loadavg *la, struct user64_loadavg *la64);
 STATIC void fill_loadavg32(struct loadavg *la, struct user32_loadavg *la32);
-STATIC int sysctl_handle_exec_archhandler_ppc(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
 STATIC int sysctl_handle_kern_threadname(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
 STATIC int sysctl_sched_stats(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
 STATIC int sysctl_sched_stats_enable(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
-STATIC int sysctl_file(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
 STATIC int sysctl_kdebug_ops SYSCTL_HANDLER_ARGS;
 STATIC int sysctl_dotranslate SYSCTL_HANDLER_ARGS;
 STATIC int sysctl_doaffinity SYSCTL_HANDLER_ARGS;
@@ -278,6 +289,7 @@ STATIC int sysctl_sysctl_native(struct sysctl_oid *oidp, void *arg1, int arg2, s
 STATIC int sysctl_sysctl_cputype(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
 STATIC int sysctl_safeboot(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
 STATIC int sysctl_singleuser(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
+STATIC int sysctl_slide(struct sysctl_oid
*oidp, void *arg1, int arg2, struct sysctl_req *req); extern void IORegistrySetOSBuildVersion(char * build_version); @@ -653,75 +665,6 @@ SYSCTL_PROC(_kern, KERN_TRANSLATE, translate, CTLTYPE_NODE|CTLFLAG_RW | CTLFLAG_ NULL, /* Data pointer */ ""); -int -set_archhandler(__unused proc_t p, int arch) -{ - int error; - struct nameidata nd; - struct vnode_attr va; - vfs_context_t ctx = vfs_context_current(); - struct exec_archhandler *archhandler; - - switch(arch) { - case CPU_TYPE_POWERPC: - archhandler = &exec_archhandler_ppc; - break; - default: - return (EBADARCH); - } - - NDINIT(&nd, LOOKUP, OP_GETATTR, FOLLOW | LOCKLEAF, UIO_SYSSPACE, - CAST_USER_ADDR_T(archhandler->path), ctx); - error = namei(&nd); - if (error) - return (error); - nameidone(&nd); - - /* Check mount point */ - if ((nd.ni_vp->v_mount->mnt_flag & MNT_NOEXEC) || - (nd.ni_vp->v_type != VREG)) { - vnode_put(nd.ni_vp); - return (EACCES); - } - - VATTR_INIT(&va); - VATTR_WANTED(&va, va_fsid); - VATTR_WANTED(&va, va_fileid); - error = vnode_getattr(nd.ni_vp, &va, ctx); - if (error) { - vnode_put(nd.ni_vp); - return (error); - } - vnode_put(nd.ni_vp); - - archhandler->fsid = va.va_fsid; - archhandler->fileid = va.va_fileid; - return 0; -} - - -STATIC int -sysctl_handle_exec_archhandler_ppc(struct sysctl_oid *oidp, void *arg1, - int arg2, struct sysctl_req *req) -{ - int error = 0; - - if (req->newptr && !kauth_cred_issuser(kauth_cred_get())) - return (EPERM); - - error = sysctl_handle_string(oidp, arg1, arg2, req); - - if (error) - goto done; - - if (req->newptr) - error = set_archhandler(req->p, CPU_TYPE_POWERPC); - -done: - return error; - -} - STATIC int sysctl_handle_kern_threadname( __unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) @@ -781,16 +724,6 @@ sysctl_handle_kern_threadname( __unused struct sysctl_oid *oidp, __unused void * SYSCTL_PROC(_kern, KERN_THREADNAME, threadname, CTLFLAG_ANYBODY | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_handle_kern_threadname,"A",""); -SYSCTL_NODE(_kern, KERN_EXEC, exec, CTLFLAG_RD|CTLFLAG_LOCKED, 0, ""); - -SYSCTL_NODE(_kern_exec, OID_AUTO, archhandler, CTLFLAG_RD|CTLFLAG_LOCKED, 0, ""); - -SYSCTL_PROC(_kern_exec_archhandler, OID_AUTO, powerpc, - CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, - exec_archhandler_ppc.path, - sizeof(exec_archhandler_ppc.path), - sysctl_handle_exec_archhandler_ppc, "A", ""); - #define BSD_HOST 1 STATIC int sysctl_sched_stats(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) @@ -1244,57 +1177,6 @@ sysctl_rdstruct(user_addr_t oldp, size_t *oldlenp, return (error); } -/* - * Get file structures. 
- */ -STATIC int -sysctl_file -(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) -{ - int error; - struct fileglob *fg; - struct extern_file nef; - - if (req->oldptr == USER_ADDR_NULL) { - /* - * overestimate by 10 files - */ - req->oldidx = sizeof(filehead) + (nfiles + 10) * sizeof(struct extern_file); - return (0); - } - - /* - * first copyout filehead - */ - error = SYSCTL_OUT(req, &filehead, sizeof(filehead)); - if (error) - return (error); - - /* - * followed by an array of file structures - */ - for (fg = filehead.lh_first; fg != 0; fg = fg->f_list.le_next) { - nef.f_list.le_next = (struct extern_file *)fg->f_list.le_next; - nef.f_list.le_prev = (struct extern_file **)fg->f_list.le_prev; - nef.f_flag = (fg->fg_flag & FMASK); - nef.f_type = fg->fg_type; - nef.f_count = fg->fg_count; - nef.f_msgcount = fg->fg_msgcount; - nef.f_cred = fg->fg_cred; - nef.f_ops = fg->fg_ops; - nef.f_offset = fg->fg_offset; - nef.f_data = fg->fg_data; - error = SYSCTL_OUT(req, &nef, sizeof(nef)); - if (error) - return (error); - } - return (0); -} - -SYSCTL_PROC(_kern, KERN_FILE, file, - CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED, - 0, 0, sysctl_file, "S,filehead", ""); - STATIC int sysdoproc_filt_KERN_PROC_PID(proc_t p, void * arg) { @@ -1391,12 +1273,12 @@ sysdoproc_filt_KERN_PROC_LCID(proc_t p, void * arg) #define KERN_PROCSLOP (5 * sizeof (struct kinfo_proc)) struct sysdoproc_args { int buflen; - caddr_t kprocp; + void *kprocp; boolean_t is_64_bit; user_addr_t dp; size_t needed; int sizeof_kproc; - int * errorp; + int *errorp; int uidcheck; int ruidcheck; int ttycheck; @@ -1404,37 +1286,33 @@ struct sysdoproc_args { }; int -sysdoproc_callback(proc_t p, void * arg) +sysdoproc_callback(proc_t p, void *arg) { - struct sysdoproc_args * args = (struct sysdoproc_args *)arg; - int error=0; + struct sysdoproc_args *args = arg; if (args->buflen >= args->sizeof_kproc) { - if ((args->ruidcheck != 0) && (sysdoproc_filt_KERN_PROC_RUID(p, &args->uidval) == 0)) - return(PROC_RETURNED); - if ((args->uidcheck != 0) && (sysdoproc_filt_KERN_PROC_UID(p, &args->uidval) == 0)) - return(PROC_RETURNED); - if ((args->ttycheck != 0) && (sysdoproc_filt_KERN_PROC_TTY(p, &args->uidval) == 0)) - return(PROC_RETURNED); + if ((args->ruidcheck != 0) && (sysdoproc_filt_KERN_PROC_RUID(p, &args->uidval) == 0)) + return (PROC_RETURNED); + if ((args->uidcheck != 0) && (sysdoproc_filt_KERN_PROC_UID(p, &args->uidval) == 0)) + return (PROC_RETURNED); + if ((args->ttycheck != 0) && (sysdoproc_filt_KERN_PROC_TTY(p, &args->uidval) == 0)) + return (PROC_RETURNED); bzero(args->kprocp, args->sizeof_kproc); - if (args->is_64_bit) { - fill_user64_proc(p, (struct user64_kinfo_proc *) args->kprocp); - } - else { - fill_user32_proc(p, (struct user32_kinfo_proc *) args->kprocp); - } - error = copyout(args->kprocp, args->dp, args->sizeof_kproc); + if (args->is_64_bit) + fill_user64_proc(p, args->kprocp); + else + fill_user32_proc(p, args->kprocp); + int error = copyout(args->kprocp, args->dp, args->sizeof_kproc); if (error) { *args->errorp = error; - return(PROC_RETURNED_DONE); - return (error); + return (PROC_RETURNED_DONE); } args->dp += args->sizeof_kproc; args->buflen -= args->sizeof_kproc; } args->needed += args->sizeof_kproc; - return(PROC_RETURNED); + return (PROC_RETURNED); } SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD | CTLFLAG_LOCKED, 0, ""); @@ -1450,11 +1328,11 @@ sysctl_prochandle SYSCTL_HANDLER_ARGS size_t needed = 0; int buflen = where != USER_ADDR_NULL ? 
req->oldlen : 0; int error = 0; - boolean_t is_64_bit = FALSE; + boolean_t is_64_bit = proc_is64bit(current_proc()); struct user32_kinfo_proc user32_kproc; struct user64_kinfo_proc user_kproc; int sizeof_kproc; - caddr_t kprocp; + void *kprocp; int (*filterfn)(proc_t, void *) = 0; struct sysdoproc_args args; int uidcheck = 0; @@ -1464,17 +1342,14 @@ sysctl_prochandle SYSCTL_HANDLER_ARGS if (namelen != 1 && !(namelen == 0 && cmd == KERN_PROC_ALL)) return (EINVAL); - is_64_bit = proc_is64bit(current_proc()); if (is_64_bit) { sizeof_kproc = sizeof(user_kproc); - kprocp = (caddr_t) &user_kproc; - } - else { + kprocp = &user_kproc; + } else { sizeof_kproc = sizeof(user32_kproc); - kprocp = (caddr_t) &user32_kproc; + kprocp = &user32_kproc; } - switch (cmd) { case KERN_PROC_PID: @@ -1522,12 +1397,13 @@ sysctl_prochandle SYSCTL_HANDLER_ARGS args.ttycheck = ttycheck; args.sizeof_kproc = sizeof_kproc; if (namelen) - args.uidval = name[0]; + args.uidval = name[0]; - proc_iterate((PROC_ALLPROCLIST | PROC_ZOMBPROCLIST), sysdoproc_callback, &args, filterfn, name); + proc_iterate((PROC_ALLPROCLIST | PROC_ZOMBPROCLIST), + sysdoproc_callback, &args, filterfn, name); if (error) - return(error); + return (error); dp = args.dp; needed = args.needed; @@ -1544,6 +1420,7 @@ sysctl_prochandle SYSCTL_HANDLER_ARGS req->oldidx += req->oldlen; return (0); } + /* * We specify the subcommand code for multiple nodes as the 'req->arg2' value * in the sysctl declaration itself, which comes into the handler function @@ -1611,42 +1488,30 @@ SYSCTL_PROC(_kern_proc, KERN_PROC_LCID, lcid, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_ /* - * Fill in an eproc structure for the specified process. + * Fill in non-zero fields of an eproc structure for the specified process. */ STATIC void -fill_user32_eproc(proc_t p, struct user32_eproc *ep) +fill_user32_eproc(proc_t p, struct user32_eproc *__restrict ep) { struct tty *tp; + struct pgrp *pg; + struct session *sessp; kauth_cred_t my_cred; - struct pgrp * pg; - struct session * sessp; pg = proc_pgrp(p); sessp = proc_session(p); - ep->e_paddr = CAST_DOWN_EXPLICIT(uint32_t,p); - if (pg != PGRP_NULL) { - ep->e_sess = CAST_DOWN_EXPLICIT(uint32_t,sessp); ep->e_pgid = p->p_pgrpid; ep->e_jobc = pg->pg_jobc; - if ((sessp != SESSION_NULL) && sessp->s_ttyvp) + if (sessp != SESSION_NULL && sessp->s_ttyvp) ep->e_flag = EPROC_CTTY; - } else { - ep->e_sess = 0; - ep->e_pgid = 0; - ep->e_jobc = 0; } #if CONFIG_LCTX - if (p->p_lctx) { + if (p->p_lctx) ep->e_lcid = p->p_lctx->lc_id; - } else { - ep->e_lcid = 0; - } #endif ep->e_ppid = p->p_ppid; - /* Pre-zero the fake historical pcred */ - bzero(&ep->e_pcred, sizeof(ep->e_pcred)); if (p->p_ucred) { my_cred = kauth_cred_proc_ref(p); @@ -1655,78 +1520,58 @@ fill_user32_eproc(proc_t p, struct user32_eproc *ep) ep->e_pcred.p_svuid = kauth_cred_getsvuid(my_cred); ep->e_pcred.p_rgid = kauth_cred_getrgid(my_cred); ep->e_pcred.p_svgid = kauth_cred_getsvgid(my_cred); + /* A fake historical *kauth_cred_t */ ep->e_ucred.cr_ref = my_cred->cr_ref; ep->e_ucred.cr_uid = kauth_cred_getuid(my_cred); ep->e_ucred.cr_ngroups = posix_cred_get(my_cred)->cr_ngroups; - bcopy(posix_cred_get(my_cred)->cr_groups, ep->e_ucred.cr_groups, NGROUPS*sizeof(gid_t)); + bcopy(posix_cred_get(my_cred)->cr_groups, + ep->e_ucred.cr_groups, NGROUPS * sizeof (gid_t)); kauth_cred_unref(&my_cred); } - if (p->p_stat == SIDL || p->p_stat == SZOMB) { - ep->e_vm.vm_tsize = 0; - ep->e_vm.vm_dsize = 0; - ep->e_vm.vm_ssize = 0; - } - ep->e_vm.vm_rssize = 0; if ((p->p_flag & P_CONTROLT) && (sessp != 
SESSION_NULL) && (tp = SESSION_TP(sessp))) { ep->e_tdev = tp->t_dev; ep->e_tpgid = sessp->s_ttypgrpid; - ep->e_tsess = CAST_DOWN_EXPLICIT(uint32_t,tp->t_session); } else ep->e_tdev = NODEV; - if (SESS_LEADER(p, sessp)) - ep->e_flag |= EPROC_SLEADER; - bzero(&ep->e_wmesg[0], WMESGLEN+1); - ep->e_xsize = ep->e_xrssize = 0; - ep->e_xccount = ep->e_xswrss = 0; - if (sessp != SESSION_NULL) + if (sessp != SESSION_NULL) { + if (SESS_LEADER(p, sessp)) + ep->e_flag |= EPROC_SLEADER; session_rele(sessp); - if(pg != PGRP_NULL) + } + if (pg != PGRP_NULL) pg_rele(pg); } /* - * Fill in an LP64 version of eproc structure for the specified process. + * Fill in non-zero fields of an LP64 eproc structure for the specified process. */ STATIC void -fill_user64_eproc(proc_t p, struct user64_eproc *ep) +fill_user64_eproc(proc_t p, struct user64_eproc *__restrict ep) { struct tty *tp; - struct session *sessp = NULL; - struct pgrp * pg; + struct pgrp *pg; + struct session *sessp; kauth_cred_t my_cred; pg = proc_pgrp(p); sessp = proc_session(p); - ep->e_paddr = CAST_USER_ADDR_T(p); if (pg != PGRP_NULL) { - ep->e_sess = CAST_USER_ADDR_T(sessp); ep->e_pgid = p->p_pgrpid; ep->e_jobc = pg->pg_jobc; - if (sessp != SESSION_NULL) { - if (sessp->s_ttyvp) - ep->e_flag = EPROC_CTTY; - } - } else { - ep->e_sess = USER_ADDR_NULL; - ep->e_pgid = 0; - ep->e_jobc = 0; + if (sessp != SESSION_NULL && sessp->s_ttyvp) + ep->e_flag = EPROC_CTTY; } #if CONFIG_LCTX - if (p->p_lctx) { + if (p->p_lctx) ep->e_lcid = p->p_lctx->lc_id; - } else { - ep->e_lcid = 0; - } #endif ep->e_ppid = p->p_ppid; - /* Pre-zero the fake historical pcred */ - bzero(&ep->e_pcred, sizeof(ep->e_pcred)); if (p->p_ucred) { my_cred = kauth_cred_proc_ref(p); @@ -1740,180 +1585,135 @@ fill_user64_eproc(proc_t p, struct user64_eproc *ep) ep->e_ucred.cr_ref = my_cred->cr_ref; ep->e_ucred.cr_uid = kauth_cred_getuid(my_cred); ep->e_ucred.cr_ngroups = posix_cred_get(my_cred)->cr_ngroups; - bcopy(posix_cred_get(my_cred)->cr_groups, ep->e_ucred.cr_groups, NGROUPS*sizeof(gid_t)); + bcopy(posix_cred_get(my_cred)->cr_groups, + ep->e_ucred.cr_groups, NGROUPS * sizeof (gid_t)); kauth_cred_unref(&my_cred); } - if (p->p_stat == SIDL || p->p_stat == SZOMB) { - ep->e_vm.vm_tsize = 0; - ep->e_vm.vm_dsize = 0; - ep->e_vm.vm_ssize = 0; - } - ep->e_vm.vm_rssize = 0; if ((p->p_flag & P_CONTROLT) && (sessp != SESSION_NULL) && (tp = SESSION_TP(sessp))) { ep->e_tdev = tp->t_dev; ep->e_tpgid = sessp->s_ttypgrpid; - ep->e_tsess = CAST_USER_ADDR_T(tp->t_session); } else ep->e_tdev = NODEV; - if (SESS_LEADER(p, sessp)) - ep->e_flag |= EPROC_SLEADER; - bzero(&ep->e_wmesg[0], WMESGLEN+1); - ep->e_xsize = ep->e_xrssize = 0; - ep->e_xccount = ep->e_xswrss = 0; - if (sessp != SESSION_NULL) + if (sessp != SESSION_NULL) { + if (SESS_LEADER(p, sessp)) + ep->e_flag |= EPROC_SLEADER; session_rele(sessp); + } if (pg != PGRP_NULL) pg_rele(pg); } /* * Fill in an eproc structure for the specified process. + * bzeroed by our caller, so only set non-zero fields. 
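The eproc and extern_proc fill routines being rewritten here can set only non-zero fields because sysdoproc_callback now bzeroes each record before filling and copying it out; the combined record is what the KERN_PROC sysctl hands back. A minimal userspace consumer sketch, using only the public structures:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/sysctl.h>

    int main(void)
    {
        int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, getpid() };
        struct kinfo_proc kp;
        size_t len = sizeof(kp);

        if (sysctl(mib, 4, &kp, &len, NULL, 0) == -1) {
            perror("sysctl");
            return 1;
        }
        /* kp_proc is the extern_proc view, kp_eproc the eproc view. */
        printf("%s: pid=%d ppid=%d\n",
            kp.kp_proc.p_comm, kp.kp_proc.p_pid, kp.kp_eproc.e_ppid);
        return 0;
    }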
*/ STATIC void -fill_user32_externproc(proc_t p, struct user32_extern_proc *exp) +fill_user32_externproc(proc_t p, struct user32_extern_proc *__restrict exp) { - exp->p_forw = exp->p_back = 0; exp->p_starttime.tv_sec = p->p_start.tv_sec; exp->p_starttime.tv_usec = p->p_start.tv_usec; - exp->p_vmspace = 0; - exp->p_sigacts = CAST_DOWN_EXPLICIT(uint32_t,p->p_sigacts); - exp->p_flag = p->p_flag; + exp->p_flag = p->p_flag; if (p->p_lflag & P_LTRACED) exp->p_flag |= P_TRACED; if (p->p_lflag & P_LPPWAIT) exp->p_flag |= P_PPWAIT; if (p->p_lflag & P_LEXIT) exp->p_flag |= P_WEXIT; - exp->p_stat = p->p_stat ; - exp->p_pid = p->p_pid ; - exp->p_oppid = p->p_oppid ; + exp->p_stat = p->p_stat; + exp->p_pid = p->p_pid; + exp->p_oppid = p->p_oppid; /* Mach related */ - exp->user_stack = p->user_stack; - exp->exit_thread = CAST_DOWN_EXPLICIT(uint32_t,p->exit_thread); - exp->p_debugger = p->p_debugger ; - exp->sigwait = p->sigwait ; + exp->user_stack = p->user_stack; + exp->p_debugger = p->p_debugger; + exp->sigwait = p->sigwait; /* scheduling */ #ifdef _PROC_HAS_SCHEDINFO_ - exp->p_estcpu = p->p_estcpu ; - exp->p_pctcpu = p->p_pctcpu ; - exp->p_slptime = p->p_slptime ; -#else - exp->p_estcpu = 0 ; - exp->p_pctcpu = 0 ; - exp->p_slptime = 0 ; + exp->p_estcpu = p->p_estcpu; + exp->p_pctcpu = p->p_pctcpu; + exp->p_slptime = p->p_slptime; #endif - exp->p_cpticks = 0 ; - exp->p_wchan = 0 ; - exp->p_wmesg = 0 ; - exp->p_swtime = 0 ; - bcopy(&p->p_realtimer, &exp->p_realtimer,sizeof(struct itimerval)); - bcopy(&p->p_rtime, &exp->p_rtime,sizeof(struct timeval)); - exp->p_uticks = 0 ; - exp->p_sticks = 0 ; - exp->p_iticks = 0 ; - exp->p_traceflag = 0; - exp->p_tracep = 0 ; - exp->p_siglist = 0 ; /* No longer relevant */ - exp->p_textvp = CAST_DOWN_EXPLICIT(uint32_t,p->p_textvp) ; - exp->p_holdcnt = 0 ; - exp->p_sigmask = 0 ; /* no longer avaialable */ - exp->p_sigignore = p->p_sigignore ; - exp->p_sigcatch = p->p_sigcatch ; - exp->p_priority = p->p_priority ; - exp->p_usrpri = 0 ; - exp->p_nice = p->p_nice ; - bcopy(&p->p_comm, &exp->p_comm,MAXCOMLEN); - exp->p_comm[MAXCOMLEN] = '\0'; - exp->p_pgrp = CAST_DOWN_EXPLICIT(uint32_t,p->p_pgrp) ; - exp->p_addr = 0; - exp->p_xstat = p->p_xstat ; - exp->p_acflag = p->p_acflag ; - exp->p_ru = CAST_DOWN_EXPLICIT(uint32_t,p->p_ru) ; /* XXX may be NULL */ + exp->p_realtimer.it_interval.tv_sec = + (user32_time_t)p->p_realtimer.it_interval.tv_sec; + exp->p_realtimer.it_interval.tv_usec = + (__int32_t)p->p_realtimer.it_interval.tv_usec; + + exp->p_realtimer.it_value.tv_sec = + (user32_time_t)p->p_realtimer.it_value.tv_sec; + exp->p_realtimer.it_value.tv_usec = + (__int32_t)p->p_realtimer.it_value.tv_usec; + + exp->p_rtime.tv_sec = (user32_time_t)p->p_rtime.tv_sec; + exp->p_rtime.tv_usec = (__int32_t)p->p_rtime.tv_usec; + + exp->p_sigignore = p->p_sigignore; + exp->p_sigcatch = p->p_sigcatch; + exp->p_priority = p->p_priority; + exp->p_nice = p->p_nice; + bcopy(&p->p_comm, &exp->p_comm, MAXCOMLEN); + exp->p_xstat = p->p_xstat; + exp->p_acflag = p->p_acflag; } /* * Fill in an LP64 version of extern_proc structure for the specified process. 
*/ STATIC void -fill_user64_externproc(proc_t p, struct user64_extern_proc *exp) +fill_user64_externproc(proc_t p, struct user64_extern_proc *__restrict exp) { - exp->p_forw = exp->p_back = USER_ADDR_NULL; exp->p_starttime.tv_sec = p->p_start.tv_sec; exp->p_starttime.tv_usec = p->p_start.tv_usec; - exp->p_vmspace = USER_ADDR_NULL; - exp->p_sigacts = CAST_USER_ADDR_T(p->p_sigacts); - exp->p_flag = p->p_flag; + exp->p_flag = p->p_flag; if (p->p_lflag & P_LTRACED) exp->p_flag |= P_TRACED; if (p->p_lflag & P_LPPWAIT) exp->p_flag |= P_PPWAIT; if (p->p_lflag & P_LEXIT) exp->p_flag |= P_WEXIT; - exp->p_stat = p->p_stat ; - exp->p_pid = p->p_pid ; - exp->p_oppid = p->p_oppid ; + exp->p_stat = p->p_stat; + exp->p_pid = p->p_pid; + exp->p_oppid = p->p_oppid; /* Mach related */ - exp->user_stack = p->user_stack; - exp->exit_thread = CAST_USER_ADDR_T(p->exit_thread); - exp->p_debugger = p->p_debugger ; - exp->sigwait = p->sigwait ; + exp->user_stack = p->user_stack; + exp->p_debugger = p->p_debugger; + exp->sigwait = p->sigwait; /* scheduling */ #ifdef _PROC_HAS_SCHEDINFO_ - exp->p_estcpu = p->p_estcpu ; - exp->p_pctcpu = p->p_pctcpu ; - exp->p_slptime = p->p_slptime ; -#else - exp->p_estcpu = 0 ; - exp->p_pctcpu = 0 ; - exp->p_slptime = 0 ; + exp->p_estcpu = p->p_estcpu; + exp->p_pctcpu = p->p_pctcpu; + exp->p_slptime = p->p_slptime; #endif - exp->p_cpticks = 0 ; - exp->p_wchan = 0; - exp->p_wmesg = 0; - exp->p_swtime = 0 ; exp->p_realtimer.it_interval.tv_sec = p->p_realtimer.it_interval.tv_sec; exp->p_realtimer.it_interval.tv_usec = p->p_realtimer.it_interval.tv_usec; + exp->p_realtimer.it_value.tv_sec = p->p_realtimer.it_value.tv_sec; exp->p_realtimer.it_value.tv_usec = p->p_realtimer.it_value.tv_usec; + exp->p_rtime.tv_sec = p->p_rtime.tv_sec; exp->p_rtime.tv_usec = p->p_rtime.tv_usec; - exp->p_uticks = 0 ; - exp->p_sticks = 0 ; - exp->p_iticks = 0 ; - exp->p_traceflag = 0 ; - exp->p_tracep = 0; - exp->p_siglist = 0 ; /* No longer relevant */ - exp->p_textvp = CAST_USER_ADDR_T(p->p_textvp); - exp->p_holdcnt = 0 ; - exp->p_sigmask = 0 ; /* no longer avaialable */ - exp->p_sigignore = p->p_sigignore ; - exp->p_sigcatch = p->p_sigcatch ; - exp->p_priority = p->p_priority ; - exp->p_usrpri = 0 ; - exp->p_nice = p->p_nice ; - bcopy(&p->p_comm, &exp->p_comm,MAXCOMLEN); - exp->p_comm[MAXCOMLEN] = '\0'; - exp->p_pgrp = CAST_USER_ADDR_T(p->p_pgrp); - exp->p_addr = USER_ADDR_NULL; - exp->p_xstat = p->p_xstat ; - exp->p_acflag = p->p_acflag ; - exp->p_ru = CAST_USER_ADDR_T(p->p_ru); /* XXX may be NULL */ + + exp->p_sigignore = p->p_sigignore; + exp->p_sigcatch = p->p_sigcatch; + exp->p_priority = p->p_priority; + exp->p_nice = p->p_nice; + bcopy(&p->p_comm, &exp->p_comm, MAXCOMLEN); + exp->p_xstat = p->p_xstat; + exp->p_acflag = p->p_acflag; } STATIC void -fill_user32_proc(proc_t p, struct user32_kinfo_proc *kp) +fill_user32_proc(proc_t p, struct user32_kinfo_proc *__restrict kp) { - /* on a 64 bit kernel, 32 bit users will get some truncated information */ + /* on a 64 bit kernel, 32 bit users get some truncated information */ fill_user32_externproc(p, &kp->kp_proc); fill_user32_eproc(p, &kp->kp_eproc); } STATIC void -fill_user64_proc(proc_t p, struct user64_kinfo_proc *kp) +fill_user64_proc(proc_t p, struct user64_kinfo_proc *__restrict kp) { fill_user64_externproc(p, &kp->kp_proc); fill_user64_eproc(p, &kp->kp_eproc); @@ -1958,6 +1758,10 @@ sysctl_kdebug_ops SYSCTL_HANDLER_ARGS case KERN_KDSETRTCDEC: case KERN_KDSETBUF: case KERN_KDGETENTROPY: + case KERN_KDENABLE_BG_TRACE: + case KERN_KDDISABLE_BG_TRACE: 
+ case KERN_KDSET_TYPEFILTER: + ret = kdbg_control(name, namelen, oldp, oldlenp); break; default: @@ -2303,7 +2107,7 @@ sysctl_aiomax int error = sysctl_io_number(req, aio_max_requests, sizeof(int), &new_value, &changed); if (changed) { /* make sure the system-wide limit is greater than the per process limit */ - if (new_value >= aio_max_requests_per_process) + if (new_value >= aio_max_requests_per_process && new_value <= AIO_MAX_REQUESTS) aio_max_requests = new_value; else error = EINVAL; @@ -2642,6 +2446,10 @@ SYSCTL_UINT(_kern, OID_AUTO, speculative_prefetch_max, CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &speculative_prefetch_max, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, speculative_prefetch_max_iosize, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &speculative_prefetch_max_iosize, 0, ""); + SYSCTL_UINT(_kern, OID_AUTO, vm_page_free_target, CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &vm_page_free_target, 0, ""); @@ -2976,6 +2784,40 @@ SYSCTL_PROC(_kern, KERN_RAGEVNODE, rage_vnode, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, sysctl_rage_vnode, "I", ""); +/* XXX move this interface into libproc and remove this sysctl */ +STATIC int +sysctl_setthread_cpupercent +(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int new_value, old_value; + int error = 0; + kern_return_t kret = KERN_SUCCESS; + uint8_t percent = 0; + int ms_refill = 0; + + old_value = 0; + + if ((error = sysctl_io_number(req, old_value, sizeof(old_value), &new_value, NULL)) != 0) + return (error); + + percent = new_value & 0xff; /* low 8 bits for percent */ + ms_refill = (new_value >> 8) & 0xffffff; /* upper 24 bits represent ms refill value */ + if (percent > 100) + return (EINVAL); + + /* + * If the caller is specifying a percentage of 0, this will unset the CPU limit, if present. + */ + if ((kret = thread_set_cpulimit(THREAD_CPULIMIT_BLOCK, percent, ms_refill * NSEC_PER_MSEC)) != 0) + return (EIO); + + return (0); +} + +SYSCTL_PROC(_kern, OID_AUTO, setthread_cpupercent, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY, + 0, 0, sysctl_setthread_cpupercent, "I", "set thread cpu percentage limit"); + STATIC int sysctl_kern_check_openevt @@ -3119,12 +2961,6 @@ SYSCTL_PROC(_vm, VM_SWAPUSAGE, swapusage, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_swapusage, "S,xsw_usage", ""); -#if CONFIG_EMBEDDED -/* */ -boolean_t vm_freeze_enabled = FALSE; -#endif /* CONFIG_EMBEDDED */ - - #if CONFIG_FREEZE extern void vm_page_reactivate_all_throttled(void); @@ -3132,7 +2968,7 @@ static int sysctl_freeze_enabled SYSCTL_HANDLER_ARGS { #pragma unused(arg1, arg2) - int error, val = vm_freeze_enabled ? 1 : 0; + int error, val = memorystatus_freeze_enabled ? 1 : 0; boolean_t disabled; error = sysctl_handle_int(oidp, &val, 0, req); @@ -3142,9 +2978,9 @@ sysctl_freeze_enabled SYSCTL_HANDLER_ARGS /* * If freeze is being disabled, we need to move dirty pages out from the throttle to the active queue. */ - disabled = (!val && vm_freeze_enabled); + disabled = (!val && memorystatus_freeze_enabled); - vm_freeze_enabled = val ?
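The sysctl_setthread_cpupercent handler above packs two fields into a single int: bits 0-7 carry the CPU percentage and bits 8-31 the refill period in milliseconds; thread_set_cpulimit then applies the limit to the calling thread. A userspace sketch, assuming the OID is reachable under the name in its SYSCTL_PROC declaration; the read-only kern.slide OID declared a little further below is read the same way:

    #include <sys/sysctl.h>

    /* Limit the calling thread to 25% CPU, refilled over a 100 ms window. */
    static int limit_current_thread(void)
    {
        int val = 25 | (100 << 8);  /* percent in bits 0-7, ms refill in bits 8-31 */

        return sysctlbyname("kern.setthread_cpupercent",
            NULL, NULL, &val, sizeof(val));
    }

    /* Query whether KASLR is active (see the kern.slide handler below). */
    static int kaslr_active(void)
    {
        int slide = 0;
        size_t len = sizeof(slide);

        if (sysctlbyname("kern.slide", &slide, &len, NULL, 0) != 0)
            return -1;
        return slide;   /* 1 if the kernel is slid, else 0 */
    }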
TRUE : FALSE; if (disabled) { vm_page_reactivate_all_throttled(); @@ -3153,7 +2989,7 @@ sysctl_freeze_enabled SYSCTL_HANDLER_ARGS return (0); } -SYSCTL_PROC(_vm, OID_AUTO, freeze_enabled, CTLTYPE_INT|CTLFLAG_RW, &vm_freeze_enabled, 0, sysctl_freeze_enabled, "I", ""); +SYSCTL_PROC(_vm, OID_AUTO, freeze_enabled, CTLTYPE_INT|CTLFLAG_RW, &memorystatus_freeze_enabled, 0, sysctl_freeze_enabled, "I", ""); #endif /* CONFIG_FREEZE */ /* this kernel does NOT implement shared_region_make_private_np() */ @@ -3296,6 +3132,24 @@ SYSCTL_INT (_kern, OID_AUTO, affinity_sets_enabled, SYSCTL_INT (_kern, OID_AUTO, affinity_sets_mapping, CTLFLAG_RW | CTLFLAG_LOCKED, &affinity_sets_mapping, 0, "mapping policy"); +/* + * Boolean indicating if KASLR is active. + */ +STATIC int +sysctl_slide +(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + uint32_t slide; + + slide = vm_kernel_slide ? 1 : 0; + + return sysctl_io_number( req, slide, sizeof(int), NULL, NULL); +} + +SYSCTL_PROC(_kern, OID_AUTO, slide, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, sysctl_slide, "I", ""); + /* * Limit on total memory users can wire. * @@ -3316,7 +3170,6 @@ vm_map_size_t vm_user_wire_limit; /* * There needs to be a more automatic/elegant way to do this */ - SYSCTL_QUAD(_vm, OID_AUTO, global_no_user_wire_amount, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_global_no_user_wire_amount, ""); SYSCTL_QUAD(_vm, OID_AUTO, global_user_wire_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_global_user_wire_limit, ""); SYSCTL_QUAD(_vm, OID_AUTO, user_wire_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_user_wire_limit, ""); @@ -3370,3 +3223,16 @@ SYSCTL_STRING(_kern, OID_AUTO, sched, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, sched_string, sizeof(sched_string), "Timeshare scheduler implementation"); + +/* + * Only support runtime modification on embedded platforms + * with development config enabled + */ +#if CONFIG_EMBEDDED +#if !SECURE_KERNEL +extern int precise_user_kernel_time; +SYSCTL_INT(_kern, OID_AUTO, precise_user_kernel_time, + CTLFLAG_RW | CTLFLAG_LOCKED, + &precise_user_kernel_time, 0, "Precise accounting of kernel vs. 
user time"); +#endif +#endif diff --git a/bsd/kern/kpi_mbuf.c b/bsd/kern/kpi_mbuf.c index 70ab53b31..259873728 100644 --- a/bsd/kern/kpi_mbuf.c +++ b/bsd/kern/kpi_mbuf.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -44,6 +45,9 @@ static const mbuf_flags_t mbuf_flags_mask = (MBUF_EXT | MBUF_PKTHDR | MBUF_EOR | MBUF_LOOP | MBUF_BCAST | MBUF_MCAST | MBUF_FRAG | MBUF_FIRSTFRAG | MBUF_LASTFRAG | MBUF_PROMISC | MBUF_HASFCS); +#define MBUF_PKTAUXF_MASK \ + (MBUF_PKTAUXF_INET_RESOLVE_RTR | MBUF_PKTAUXF_INET6_RESOLVE_RTR) + void* mbuf_data(mbuf_t mbuf) { return mbuf->m_data; @@ -1105,38 +1109,67 @@ mbuf_get_minclsize(void) return (MHLEN + MLEN); } -mbuf_traffic_class_t +mbuf_traffic_class_t mbuf_get_traffic_class(mbuf_t m) { - mbuf_traffic_class_t prio = MBUF_TC_BE; - if (m == NULL || !(m->m_flags & M_PKTHDR)) - return (prio); - - if (m->m_pkthdr.prio <= MBUF_TC_VO) - prio = m->m_pkthdr.prio; + return (MBUF_TC_BE); - return (prio); + return (m_get_traffic_class(m)); } -errno_t +errno_t mbuf_set_traffic_class(mbuf_t m, mbuf_traffic_class_t tc) { - errno_t error = 0; - + if (m == NULL || !(m->m_flags & M_PKTHDR) || + ((u_int32_t)tc >= MBUF_TC_MAX)) + return (EINVAL); + + return (m_set_traffic_class(m, tc)); +} + +int +mbuf_is_traffic_class_privileged(mbuf_t m) +{ + if (m == NULL || !(m->m_flags & M_PKTHDR) || + !MBUF_VALID_SC(m->m_pkthdr.svc)) + return (0); + + return (m->m_pkthdr.aux_flags & MAUXF_PRIO_PRIVILEGED); +} + +mbuf_svc_class_t +mbuf_get_service_class(mbuf_t m) +{ if (m == NULL || !(m->m_flags & M_PKTHDR)) - return EINVAL; + return (MBUF_SC_BE); - switch (tc) { - case MBUF_TC_BE: - case MBUF_TC_BK: - case MBUF_TC_VI: - case MBUF_TC_VO: - m->m_pkthdr.prio = tc; - break; - default: - error = EINVAL; - break; - } - return error; + return (m_get_service_class(m)); +} + +errno_t +mbuf_set_service_class(mbuf_t m, mbuf_svc_class_t sc) +{ + if (m == NULL || !(m->m_flags & M_PKTHDR)) + return (EINVAL); + + return (m_set_service_class(m, sc)); +} + +errno_t +mbuf_pkthdr_aux_flags(mbuf_t m, mbuf_pkthdr_aux_flags_t *flagsp) +{ + u_int32_t flags; + if (m == NULL || !(m->m_flags & M_PKTHDR) || flagsp == NULL) + return (EINVAL); + + flags = m->m_pkthdr.aux_flags & MBUF_PKTAUXF_MASK; + + /* These 2 flags are mutually exclusive */ + VERIFY((flags & + (MBUF_PKTAUXF_INET_RESOLVE_RTR | MBUF_PKTAUXF_INET6_RESOLVE_RTR)) != + (MBUF_PKTAUXF_INET_RESOLVE_RTR | MBUF_PKTAUXF_INET6_RESOLVE_RTR)); + + *flagsp = flags; + return (0); } diff --git a/bsd/kern/kpi_socket.c b/bsd/kern/kpi_socket.c index 70507beff..3de525cbe 100644 --- a/bsd/kern/kpi_socket.c +++ b/bsd/kern/kpi_socket.c @@ -514,11 +514,11 @@ sock_setsockopt( /* * This follows the recommended mappings between DSCP code points and WMM access classes */ -static u_int8_t so_tc_from_dscp(u_int8_t dscp); -static u_int8_t +static u_int32_t so_tc_from_dscp(u_int8_t dscp); +static u_int32_t so_tc_from_dscp(u_int8_t dscp) { - u_int8_t tc; + u_int32_t tc; if (dscp >= 0x30 && dscp <= 0x3f) tc = SO_TC_VO; @@ -529,7 +529,7 @@ so_tc_from_dscp(u_int8_t dscp) else tc = SO_TC_BE; - return tc; + return (tc); } errno_t @@ -946,6 +946,8 @@ sock_socket( #endif (*new_so)->so_upcall = (so_upcall)callback; (*new_so)->so_upcallarg = context; + (*new_so)->last_pid = 0; + (*new_so)->last_upid = 0; } return error; } @@ -978,7 +980,7 @@ sock_release(socket_t sock) return; socket_lock(sock, 1); - if (sock->so_flags & SOF_UPCALLINUSE) + if (sock->so_upcallusecount) soclose_wait_locked(sock); sock->so_retaincnt--; diff --git a/bsd/kern/kpi_socketfilter.c 
b/bsd/kern/kpi_socketfilter.c index 67a944c2d..f456d6246 100644 --- a/bsd/kern/kpi_socketfilter.c +++ b/bsd/kern/kpi_socketfilter.c @@ -33,11 +33,20 @@ #include #include #include +#include #include #include #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -258,63 +267,64 @@ sflt_attach_locked( struct socket_filter_entry *entry = NULL; if (filter == NULL) - error = ENOENT; - - if (error == 0) { - /* allocate the socket filter entry */ - MALLOC(entry, struct socket_filter_entry *, sizeof(*entry), M_IFADDR, M_WAITOK); - if (entry == NULL) { - error = ENOMEM; - } - } - - if (error == 0) { - /* Initialize the socket filter entry */ - entry->sfe_cookie = NULL; - entry->sfe_flags = SFEF_ATTACHED; - entry->sfe_refcount = 1; // corresponds to SFEF_ATTACHED flag set + return ENOENT; + + for (entry = so->so_filt; entry; entry = entry->sfe_next_onfilter) + if (entry->sfe_filter->sf_filter.sf_handle == + filter->sf_filter.sf_handle) + return EEXIST; + + /* allocate the socket filter entry */ + MALLOC(entry, struct socket_filter_entry *, sizeof(*entry), M_IFADDR, + M_WAITOK); + if (entry == NULL) + return ENOMEM; + + /* Initialize the socket filter entry */ + entry->sfe_cookie = NULL; + entry->sfe_flags = SFEF_ATTACHED; + entry->sfe_refcount = 1; // corresponds to SFEF_ATTACHED flag set + + /* Put the entry in the filter list */ + sflt_retain_locked(filter); + entry->sfe_filter = filter; + entry->sfe_next_onfilter = filter->sf_entry_head; + filter->sf_entry_head = entry; + + /* Put the entry on the socket filter list */ + entry->sfe_socket = so; + entry->sfe_next_onsocket = so->so_filt; + so->so_filt = entry; + + if (entry->sfe_filter->sf_filter.sf_attach) { + // Retain the entry while we call attach + sflt_entry_retain(entry); - /* Put the entry in the filter list */ - sflt_retain_locked(filter); - entry->sfe_filter = filter; - entry->sfe_next_onfilter = filter->sf_entry_head; - filter->sf_entry_head = entry; + // Release the filter lock -- callers must be aware we will do this + lck_rw_unlock_exclusive(sock_filter_lock); - /* Put the entry on the socket filter list */ - entry->sfe_socket = so; - entry->sfe_next_onsocket = so->so_filt; - so->so_filt = entry; + // Unlock the socket + if (socklocked) + socket_unlock(so, 0); - if (entry->sfe_filter->sf_filter.sf_attach) { - // Retain the entry while we call attach - sflt_entry_retain(entry); - - // Release the filter lock -- callers must be aware we will do this - lck_rw_unlock_exclusive(sock_filter_lock); - - // Unlock the socket - if (socklocked) - socket_unlock(so, 0); - - // It's finally safe to call the filter function - error = entry->sfe_filter->sf_filter.sf_attach(&entry->sfe_cookie, so); - - // Lock the socket again - if (socklocked) - socket_lock(so, 0); - - // Lock the filters again - lck_rw_lock_exclusive(sock_filter_lock); - - // If the attach function returns an error, this filter must be detached - if (error) { - entry->sfe_flags |= SFEF_NODETACH; // don't call sf_detach - sflt_detach_locked(entry); - } - - // Release the retain we held through the attach call - sflt_entry_release(entry); + // It's finally safe to call the filter function + error = entry->sfe_filter->sf_filter.sf_attach(&entry->sfe_cookie, so); + + // Lock the socket again + if (socklocked) + socket_lock(so, 0); + + // Lock the filters again + lck_rw_lock_exclusive(sock_filter_lock); + + // If the attach function returns an error, this filter must be detached + if (error) { + entry->sfe_flags |= 
SFEF_NODETACH; // don't call sf_detach + sflt_detach_locked(entry); } + + // Release the retain we held through the attach call + sflt_entry_release(entry); } return error; @@ -450,21 +460,25 @@ sflt_termsock( lck_rw_unlock_exclusive(sock_filter_lock); } -__private_extern__ void -sflt_notify( + +static void +sflt_notify_internal( struct socket *so, sflt_event_t event, - void *param) + void *param, + sflt_handle handle) { if (so->so_filt == NULL) return; struct socket_filter_entry *entry; - int unlocked = 0; + int unlocked = 0; lck_rw_lock_shared(sock_filter_lock); for (entry = so->so_filt; entry; entry = entry->sfe_next_onsocket) { if ((entry->sfe_flags & SFEF_ATTACHED) - && entry->sfe_filter->sf_filter.sf_notify) { + && entry->sfe_filter->sf_filter.sf_notify && + ((handle && entry->sfe_filter->sf_filter.sf_handle != handle) || + !handle)) { // Retain the filter entry and release the socket filter lock sflt_entry_retain(entry); lck_rw_unlock_shared(sock_filter_lock); @@ -491,6 +505,24 @@ sflt_notify( } } +__private_extern__ void +sflt_notify( + struct socket *so, + sflt_event_t event, + void *param) +{ + sflt_notify_internal(so, event, param, 0); +} + +static void +sflt_notify_after_register( + struct socket *so, + sflt_event_t event, + sflt_handle handle) +{ + sflt_notify_internal(so, event, NULL, handle); +} + __private_extern__ int sflt_ioctl( struct socket *so, @@ -1075,6 +1107,11 @@ sflt_detach( return result; } +struct solist { + struct solist *next; + struct socket *so; +}; + errno_t sflt_register( const struct sflt_filter *filter, @@ -1087,6 +1124,9 @@ sflt_register( int error = 0; struct protosw *pr = pffindproto(domain, protocol, type); unsigned int len; + struct socket *so; + struct inpcb *inp; + struct solist *solisthead = NULL, *solist = NULL; if (pr == NULL) return ENOENT; @@ -1141,12 +1181,95 @@ sflt_register( sflt_retain_locked(sock_filt); } lck_rw_unlock_exclusive(sock_filter_lock); - + if (match != NULL) { FREE(sock_filt, M_IFADDR); return EEXIST; } + if (!(filter->sf_flags & SFLT_EXTENDED_REGISTRY)) + return error; + + /* + * Setup the filter on the TCP and UDP sockets already created. 
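The kpi_mbuf.c changes above retire direct access to m_pkthdr.prio in favor of the m_{get,set}_traffic_class helpers and add a parallel service-class KPI. A sketch of how a network kext might use the new calls, assuming a packet mbuf with a valid header:

    #include <sys/kpi_mbuf.h>

    /* Promote an outbound packet to the voice service class, unless it
     * already carries a privileged classification. */
    static errno_t tag_for_voice(mbuf_t m)
    {
        if (mbuf_is_traffic_class_privileged(m))
            return 0;   /* leave privileged classifications alone */

        return mbuf_set_service_class(m, MBUF_SC_VO);
    }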
+ */ +#define SOLIST_ADD(_so) do { \ + solist->next = solisthead; \ + sock_retain((_so)); \ + solist->so = (_so); \ + solisthead = solist; \ +} while (0) + if (protocol == IPPROTO_TCP) { + lck_rw_lock_shared(tcbinfo.mtx); + LIST_FOREACH(inp, tcbinfo.listhead, inp_list) { + so = inp->inp_socket; + if (so == NULL || so->so_state & SS_DEFUNCT || + so->so_state & SS_NOFDREF || + !INP_CHECK_SOCKAF(so, domain) || + !INP_CHECK_SOCKTYPE(so, type)) + continue; + MALLOC(solist, struct solist *, sizeof(*solist), + M_IFADDR, M_NOWAIT); + if (!solist) + continue; + SOLIST_ADD(so); + } + lck_rw_done(tcbinfo.mtx); + } else if (protocol == IPPROTO_UDP) { + lck_rw_lock_shared(udbinfo.mtx); + LIST_FOREACH(inp, udbinfo.listhead, inp_list) { + so = inp->inp_socket; + if (so == NULL || so->so_state & SS_DEFUNCT || + so->so_state & SS_NOFDREF || + !INP_CHECK_SOCKAF(so, domain) || + !INP_CHECK_SOCKTYPE(so, type)) + continue; + MALLOC(solist, struct solist *, sizeof(*solist), + M_IFADDR, M_NOWAIT); + if (!solist) + continue; + SOLIST_ADD(so); + } + lck_rw_done(udbinfo.mtx); + } + /* XXX it's possible to walk the raw socket list as well */ +#undef SOLIST_ADD + + while (solisthead) { + sflt_handle handle = filter->sf_handle; + + so = solisthead->so; + sflt_initsock(so); + + if (so->so_state & SS_ISCONNECTING) + sflt_notify_after_register(so, sock_evt_connecting, + handle); + else if (so->so_state & SS_ISCONNECTED) + sflt_notify_after_register(so, sock_evt_connected, + handle); + else if ((so->so_state & + (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE)) == + (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE)) + sflt_notify_after_register(so, sock_evt_disconnecting, + handle); + else if ((so->so_state & + (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED)) == + (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED)) + sflt_notify_after_register(so, sock_evt_disconnected, + handle); + else if (so->so_state & SS_CANTSENDMORE) + sflt_notify_after_register(so, sock_evt_cantsendmore, + handle); + else if (so->so_state & SS_CANTRCVMORE) + sflt_notify_after_register(so, sock_evt_cantrecvmore, + handle); + /* XXX no easy way to post the sock_evt_closing event */ + sock_release(so); + solist = solisthead; + solisthead = solisthead->next; + FREE(solist, M_IFADDR); + } + return error; } diff --git a/bsd/kern/mach_fat.c b/bsd/kern/mach_fat.c index 9811047d7..e2fd1e350 100644 --- a/bsd/kern/mach_fat.c +++ b/bsd/kern/mach_fat.c @@ -198,30 +198,14 @@ fatfile_getarch_affinity( struct vnode *vp, vm_offset_t data_ptr, struct fat_arch *archret, - int affinity) + int affinity __unused) { - load_return_t lret; - int handler = (exec_archhandler_ppc.path[0] != 0); - cpu_type_t primary_type, fallback_type; - - if (handler && affinity) { - primary_type = CPU_TYPE_POWERPC; - fallback_type = cpu_type(); - } else { - primary_type = cpu_type(); - fallback_type = CPU_TYPE_POWERPC; - } /* * Ignore all architectural bits when determining if an image * in a fat file should be skipped or graded. 
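With the new SFLT_EXTENDED_REGISTRY behavior, sflt_register above also walks the existing TCP or UDP PCB lists, attaches the filter to live sockets, and replays each socket's connection state through sflt_notify_after_register. A registration sketch; the handle, name, and callbacks are hypothetical, and minimal attach/detach callbacks are included defensively:

    #include <sys/kpi_socketfilter.h>
    #include <netinet/in.h>

    #define DEMO_FILTER_HANDLE 0x44454d4f  /* hypothetical; must be system-unique */

    static errno_t demo_attach(void **cookie, socket_t so)
    {
        *cookie = NULL;  /* no per-socket state in this sketch */
        return 0;        /* attach to every socket offered */
    }

    static void demo_detach(void *cookie, socket_t so)
    {
    }

    static struct sflt_filter demo_filter = {
        .sf_handle = DEMO_FILTER_HANDLE,
        .sf_flags = SFLT_GLOBAL | SFLT_EXTENDED_REGISTRY,
        .sf_name = (char *)"com.example.demofilter",
        .sf_attach = demo_attach,
        .sf_detach = demo_detach,
    };

    static errno_t demo_install(void)
    {
        /* Existing TCP sockets are picked up thanks to SFLT_EXTENDED_REGISTRY. */
        return sflt_register(&demo_filter, PF_INET, SOCK_STREAM, IPPROTO_TCP);
    }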
*/ - lret = fatfile_getarch2(vp, data_ptr, primary_type, + return fatfile_getarch2(vp, data_ptr, cpu_type(), CPU_ARCH_MASK, archret); - if ((lret != 0) && handler) { - lret = fatfile_getarch2(vp, data_ptr, fallback_type, - CPU_SUBTYPE_LIB64, archret); - } - return lret; } /********************************************************************** diff --git a/bsd/kern/mach_loader.c b/bsd/kern/mach_loader.c index 47253a898..02b660be1 100644 --- a/bsd/kern/mach_loader.c +++ b/bsd/kern/mach_loader.c @@ -84,19 +84,8 @@ * XXX vm/pmap.h should not treat these prototypes as MACH_KERNEL_PRIVATE * when KERNEL is defined. */ -extern pmap_t pmap_create(vm_map_size_t size, boolean_t is_64bit); -extern void pmap_switch(pmap_t); - -/* - * XXX kern/thread.h should not treat these prototypes as MACH_KERNEL_PRIVATE - * when KERNEL is defined. - */ -extern kern_return_t thread_setstatus(thread_t thread, int flavor, - thread_state_t tstate, - mach_msg_type_number_t count); - -extern kern_return_t thread_state_initialize(thread_t thread); - +extern pmap_t pmap_create(ledger_t ledger, vm_map_size_t size, + boolean_t is_64bit); /* XXX should have prototypes in a shared header file */ extern int get_map_nentries(vm_map_t); @@ -109,12 +98,15 @@ static load_result_t load_result_null = { .mach_header = MACH_VM_MIN_ADDRESS, .entry_point = MACH_VM_MIN_ADDRESS, .user_stack = MACH_VM_MIN_ADDRESS, + .user_stack_size = 0, .all_image_info_addr = MACH_VM_MIN_ADDRESS, .all_image_info_size = 0, .thread_count = 0, .unixproc = 0, .dynlinker = 0, - .customstack = 0, + .needs_dynlinker = 0, + .prog_allocated_stack = 0, + .prog_stack_size = 0, .validentry = 0, .csflags = 0, .uuid = { 0 }, @@ -166,9 +158,19 @@ set_code_unprotect( struct encryption_info_command *lcp, caddr_t addr, vm_map_t map, + int64_t slide, struct vnode *vp); #endif +static +load_return_t +load_main( + struct entry_point_command *epc, + thread_t thread, + int64_t slide, + load_result_t *result +); + static load_return_t load_unixthread( struct thread_command *tcp, @@ -282,6 +284,7 @@ load_machfile( struct vnode *vp = imgp->ip_vp; off_t file_offset = imgp->ip_arch_offset; off_t macho_size = imgp->ip_arch_size; + off_t file_size = imgp->ip_vattr->va_data_size; pmap_t pmap = 0; /* protected by create_map */ vm_map_t map; @@ -296,6 +299,10 @@ load_machfile( mach_vm_offset_t aslr_offset = 0; kern_return_t kret; + if (macho_size > file_size) { + return(LOAD_BADMACHO); + } + if (new_map == VM_MAP_NULL) { create_map = TRUE; old_task = current_task(); @@ -313,7 +320,8 @@ load_machfile( } if (create_map) { - pmap = pmap_create((vm_map_size_t) 0, (imgp->ip_flags & IMGPF_IS_64BIT)); + pmap = pmap_create(get_task_ledger(task), (vm_map_size_t) 0, + (imgp->ip_flags & IMGPF_IS_64BIT)); pal_switch_pmap(thread, pmap, imgp->ip_flags & IMGPF_IS_64BIT); map = vm_map_create(pmap, 0, @@ -359,6 +367,19 @@ load_machfile( return(lret); } +#if CONFIG_EMBEDDED + /* + * Check to see if the page zero is enforced by the map->min_offset. 
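fatfile_getarch_affinity above now grades fat-file slices against the host cpu_type() alone, passing CPU_ARCH_MASK so architectural feature bits are ignored, since the PowerPC archhandler fallback is gone. For context, the on-disk fat structures it parses look like this from userspace (list_fat_archs and the mapped fh pointer are hypothetical; fat headers are big-endian):

    #include <stdio.h>
    #include <mach-o/fat.h>
    #include <libkern/OSByteOrder.h>

    /* Walk the slices of a mapped universal-binary header. */
    static void list_fat_archs(const struct fat_header *fh)
    {
        if (OSSwapBigToHostInt32(fh->magic) != FAT_MAGIC)
            return;  /* not a fat header */

        const struct fat_arch *fa = (const struct fat_arch *)(fh + 1);
        uint32_t n = OSSwapBigToHostInt32(fh->nfat_arch);

        for (uint32_t i = 0; i < n; i++)
            printf("cputype=%d offset=%u size=%u\n",
                (int)OSSwapBigToHostInt32((uint32_t)fa[i].cputype),
                OSSwapBigToHostInt32(fa[i].offset),
                OSSwapBigToHostInt32(fa[i].size));
    }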
+ */ + if (vm_map_has_hard_pagezero(map, 0x1000) == FALSE) { + if (create_map) { + vm_map_deallocate(map); /* will lose pmap reference too */ + } + printf("Cannot enforce a hard page-zero for %s\n", imgp->ip_strings); + psignal(vfs_context_proc(imgp->ip_vfs_context), SIGKILL); + return (LOAD_BADMACHO); + } +#else /* * For 64-bit users, check for presence of a 4GB page zero * which will enable the kernel to share the user's address space @@ -366,9 +387,10 @@ load_machfile( */ if ((imgp->ip_flags & IMGPF_IS_64BIT) && - vm_map_has_4GB_pagezero(map)) + vm_map_has_4GB_pagezero(map)) { vm_map_set_4GB_pagezero(map); - + } +#endif /* * Commit to new map. * @@ -396,23 +418,23 @@ load_machfile( * * NOTE: task_start_halt() makes sure that no new * threads are created in the task during the transition. - * We need to mark the workqueue as exiting before we - * wait for threads to terminate (at the end of which - * we no longer have a prohibition on thread creation). - * - * Finally, clean up any lingering workqueue data structures - * that may have been left behind by the workqueue threads - * as they exited (and then clean up the work queue itself). - */ - kret = task_start_halt(task); - if (kret != KERN_SUCCESS) { - return(kret); - } - proc_transcommit(p, 0); - workqueue_mark_exiting(p); - task_complete_halt(task); - workqueue_exit(p); - } + * We need to mark the workqueue as exiting before we + * wait for threads to terminate (at the end of which + * we no longer have a prohibition on thread creation). + * + * Finally, clean up any lingering workqueue data structures + * that may have been left behind by the workqueue threads + * as they exited (and then clean up the work queue itself). + */ + kret = task_start_halt(task); + if (kret != KERN_SUCCESS) { + return(kret); + } + proc_transcommit(p, 0); + workqueue_mark_exiting(p); + task_complete_halt(task); + workqueue_exit(p); + } old_map = swap_task_map(old_task, thread, map, !spawn); vm_map_clear_4GB_pagezero(old_map); vm_map_deallocate(old_map); @@ -566,7 +588,6 @@ parse_machfile( */ for (pass = 1; pass <= 3; pass++) { -#if CONFIG_EMBEDDED /* * Check that the entry point is contained in an executable segments */ @@ -575,7 +596,6 @@ parse_machfile( ret = LOAD_FAILURE; break; } -#endif /* * Loop through each of the load_commands indicated by the @@ -637,6 +657,17 @@ parse_machfile( slide, result); break; + case LC_MAIN: + if (pass != 1) + break; + if (depth != 1) + break; + ret = load_main( + (struct entry_point_command *) lcp, + thread, + slide, + result); + break; case LC_LOAD_DYLINKER: if (pass != 3) break; @@ -683,7 +714,7 @@ parse_machfile( break; ret = set_code_unprotect( (struct encryption_info_command *) lcp, - addr, map, vp); + addr, map, slide, vp); if (ret != LOAD_SUCCESS) { printf("proc %d: set_code_unprotect() error %d " "for file \"%s\"\n", @@ -717,16 +748,21 @@ parse_machfile( } } - if (dlp != 0) { + /* Make sure if we need dyld, we got it */ + if (result->needs_dynlinker && !dlp) { + ret = LOAD_FAILURE; + } + + if ((ret == LOAD_SUCCESS) && (dlp != 0)) { /* load the dylinker, and always slide it by the ASLR * offset regardless of PIE */ ret = load_dylinker(dlp, dlarchbits, map, thread, depth, aslr_offset, result); } - if(depth == 1) { - if (result->thread_count == 0) { - ret = LOAD_FAILURE; - } + if((ret == LOAD_SUCCESS) && (depth == 1)) { + if (result->thread_count == 0) { + ret = LOAD_FAILURE; + } } } @@ -823,12 +859,13 @@ load_segment( { struct segment_command_64 segment_command, *scp; kern_return_t ret; - mach_vm_offset_t map_addr, 
map_offset; - mach_vm_size_t map_size, seg_size, delta_size; + vm_map_offset_t map_addr, map_offset; + vm_map_size_t map_size, seg_size, delta_size; vm_prot_t initprot; vm_prot_t maxprot; size_t segment_command_size, total_section_size, single_section_size; + boolean_t prohibit_pagezero_mapping = FALSE; if (LC_SEGMENT_64 == lcp->cmd) { segment_command_size = sizeof(struct segment_command_64); @@ -888,9 +925,15 @@ load_segment( */ seg_size += slide; slide = 0; - +#if CONFIG_EMBEDDED + prohibit_pagezero_mapping = TRUE; +#endif /* XXX (4596982) this interferes with Rosetta, so limit to 64-bit tasks */ if (scp->cmd == LC_SEGMENT_64) { + prohibit_pagezero_mapping = TRUE; + } + + if (prohibit_pagezero_mapping) { /* * This is a "page zero" segment: it starts at address 0, * is not mapped from the binary file and is not accessible. @@ -1001,6 +1044,65 @@ load_segment( return ret; } + + +static +load_return_t +load_main( + struct entry_point_command *epc, + thread_t thread, + int64_t slide, + load_result_t *result +) +{ + mach_vm_offset_t addr; + kern_return_t ret; + + if (epc->cmdsize < sizeof(*epc)) + return (LOAD_BADMACHO); + if (result->thread_count != 0) { + printf("load_main: already have a thread!"); + return (LOAD_FAILURE); + } + + if (thread == THREAD_NULL) + return (LOAD_SUCCESS); + + /* LC_MAIN specifies stack size but not location */ + if (epc->stacksize) { + result->prog_stack_size = 1; + result->user_stack_size = epc->stacksize; + } else { + result->prog_stack_size = 0; + result->user_stack_size = MAXSSIZ; + } + result->prog_allocated_stack = 0; + + /* use default location for stack */ + ret = thread_userstackdefault(thread, &addr); + if (ret != KERN_SUCCESS) + return(LOAD_FAILURE); + + /* The stack slides down from the default location */ + result->user_stack = addr; + result->user_stack -= slide; + + /* kernel does *not* use entryoff from LC_MAIN. Dyld uses it. 
*/ + result->needs_dynlinker = TRUE; + result->validentry = TRUE; + + ret = thread_state_initialize( thread ); + if (ret != KERN_SUCCESS) { + return(LOAD_FAILURE); + } + + result->unixproc = TRUE; + result->thread_count++; + + return(LOAD_SUCCESS); +} + + static load_return_t load_unixthread( @@ -1012,6 +1114,7 @@ load_unixthread( { load_return_t ret; int customstack =0; + mach_vm_offset_t addr; if (tcp->cmdsize < sizeof(*tcp)) return (LOAD_BADMACHO); @@ -1027,26 +1130,35 @@ load_unixthread( (uint32_t *)(((vm_offset_t)tcp) + sizeof(struct thread_command)), tcp->cmdsize - sizeof(struct thread_command), - &result->user_stack, + &addr, &customstack); if (ret != LOAD_SUCCESS) return(ret); - if (customstack) - result->customstack = 1; - else - result->customstack = 0; + /* LC_UNIXTHREAD optionally specifies stack size and location */ + + if (customstack) { + result->prog_stack_size = 0; /* unknown */ + result->prog_allocated_stack = 1; + } else { + result->prog_allocated_stack = 0; + result->prog_stack_size = 0; + result->user_stack_size = MAXSSIZ; + } - result->user_stack += slide; + /* The stack slides down from the default location */ + result->user_stack = addr; + result->user_stack -= slide; ret = load_threadentry(thread, (uint32_t *)(((vm_offset_t)tcp) + sizeof(struct thread_command)), tcp->cmdsize - sizeof(struct thread_command), - &result->entry_point); + &addr); if (ret != LOAD_SUCCESS) return(ret); + result->entry_point = addr; result->entry_point += slide; ret = load_threadstate(thread, @@ -1325,6 +1437,7 @@ load_dylinker( if (ret == LOAD_SUCCESS) { result->dynlinker = TRUE; result->entry_point = myresult->entry_point; + result->validentry = myresult->validentry; result->all_image_info_addr = myresult->all_image_info_addr; result->all_image_info_size = myresult->all_image_info_size; } @@ -1439,6 +1552,7 @@ set_code_unprotect( struct encryption_info_command *eip, caddr_t addr, vm_map_t map, + int64_t slide, struct vnode *vp) { int result, len; @@ -1517,7 +1631,7 @@ set_code_unprotect( if ((seg64->fileoff <= eip->cryptoff) && (seg64->fileoff+seg64->filesize >= eip->cryptoff+eip->cryptsize)) { - map_offset = seg64->vmaddr + eip->cryptoff - seg64->fileoff; + map_offset = seg64->vmaddr + eip->cryptoff - seg64->fileoff + slide; map_size = eip->cryptsize; goto remap_now; } @@ -1526,7 +1640,7 @@ set_code_unprotect( if ((seg32->fileoff <= eip->cryptoff) && (seg32->fileoff+seg32->filesize >= eip->cryptoff+eip->cryptsize)) { - map_offset = seg32->vmaddr + eip->cryptoff - seg32->fileoff; + map_offset = seg32->vmaddr + eip->cryptoff - seg32->fileoff + slide; map_size = eip->cryptsize; goto remap_now; } diff --git a/bsd/kern/mach_loader.h b/bsd/kern/mach_loader.h index fd8e585db..ece41929e 100644 --- a/bsd/kern/mach_loader.h +++ b/bsd/kern/mach_loader.h @@ -53,13 +53,16 @@ typedef struct _load_result { user_addr_t mach_header; user_addr_t entry_point; user_addr_t user_stack; + mach_vm_size_t user_stack_size; mach_vm_address_t all_image_info_addr; mach_vm_size_t all_image_info_size; int thread_count; unsigned int /* boolean_t */ unixproc :1, + needs_dynlinker : 1, dynlinker :1, - customstack :1, + prog_allocated_stack :1, + prog_stack_size : 1, validentry :1, :0; unsigned int csflags; diff --git a/bsd/kern/mach_process.c b/bsd/kern/mach_process.c index 7ec55c799..1a5cac88f 100644 --- a/bsd/kern/mach_process.c +++ b/bsd/kern/mach_process.c @@ -313,6 +313,11 @@ ptrace(struct proc *p, struct ptrace_args *uap, int32_t *retval) * is resumed by adding NSIG to p_cursig. 
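load_main above is the kernel half of the new LC_MAIN command: it records the requested stack size (or falls back to MAXSSIZ), marks the image as needing dyld, and ignores entryoff, which dyld consumes. The command itself is a small fixed struct; a sketch of reading it from a mapped 64-bit image (print_lc_main and base are hypothetical):

    #include <stdio.h>
    #include <mach-o/loader.h>

    /* Find and print the LC_MAIN command of a mapped 64-bit Mach-O image. */
    static void print_lc_main(const struct mach_header_64 *base)
    {
        const struct load_command *lc = (const struct load_command *)(base + 1);

        for (uint32_t i = 0; i < base->ncmds; i++) {
            if (lc->cmd == LC_MAIN) {
                const struct entry_point_command *ep =
                    (const struct entry_point_command *)lc;
                printf("entryoff=0x%llx stacksize=0x%llx\n",
                    ep->entryoff, ep->stacksize);
                return;
            }
            lc = (const struct load_command *)((const char *)lc + lc->cmdsize);
        }
    }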
[see issig] */ proc_unlock(t); +#if NOTYET + error = mac_proc_check_signal(p, t, SIGKILL); + if (0 != error) + goto resume; +#endif psignal(t, SIGKILL); goto resume; @@ -342,8 +347,15 @@ ptrace(struct proc *p, struct ptrace_args *uap, int32_t *retval) if (uap->req == PT_STEP) { /* - * set trace bit + * set trace bit + * we use sending SIGSTOP as a comparable security check. */ +#if NOTYET + error = mac_proc_check_signal(p, t, SIGSTOP); + if (0 != error) { + goto out; + } +#endif if (thread_setsinglestep(th_act, 1) != KERN_SUCCESS) { error = ENOTSUP; goto out; @@ -351,7 +363,14 @@ ptrace(struct proc *p, struct ptrace_args *uap, int32_t *retval) } else { /* * clear trace bit if on + * we use sending SIGCONT as a comparable security check. */ +#if NOTYET + error = mac_proc_check_signal(p, t, SIGCONT); + if (0 != error) { + goto out; + } +#endif if (thread_setsinglestep(th_act, 0) != KERN_SUCCESS) { error = ENOTSUP; goto out; diff --git a/bsd/kern/makesyscalls.sh b/bsd/kern/makesyscalls.sh index 301905871..d585708d1 100755 --- a/bsd/kern/makesyscalls.sh +++ b/bsd/kern/makesyscalls.sh @@ -172,7 +172,7 @@ s/\$//g } NR == 1 { printf "\n/* The casts are bogus but will do for now. */\n" > sysent - printf "__private_extern__ struct sysent %s[] = {\n",switchname > sysent + printf "__private_extern__ const struct sysent %s[] = {\n",switchname > sysent printf "#ifndef %s\n", sysproto_h > sysarg printf "#define\t%s\n\n", sysproto_h > sysarg @@ -210,7 +210,7 @@ s/\$//g printf "#define\tPADR_(t)\t0\n" > sysarg printf "#endif\n" > sysarg printf "\n__BEGIN_DECLS\n" > sysarg - printf "#ifndef __arm__\n" > sysarg + printf "#if !defined(__arm__)\n" > sysarg printf "void munge_w(const void *, void *); \n" > sysarg printf "void munge_ww(const void *, void *); \n" > sysarg printf "void munge_www(const void *, void *); \n" > sysarg @@ -243,7 +243,7 @@ s/\$//g printf "void munge_wwwsw(const void *, void *); \n" > sysarg printf "void munge_llllll(const void *, void *); \n" > sysarg printf "#else \n" > sysarg - printf "/* ARM does not need mungers for BSD system calls */\n" > sysarg + printf "/* ARM does not need mungers for BSD system calls. */\n" > sysarg printf "#define munge_w NULL \n" > sysarg printf "#define munge_ww NULL \n" > sysarg printf "#define munge_www NULL \n" > sysarg diff --git a/bsd/kern/mcache.c b/bsd/kern/mcache.c index a0c6cfb69..f3570ae41 100644 --- a/bsd/kern/mcache.c +++ b/bsd/kern/mcache.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2010 Apple Inc. All rights reserved. + * Copyright (c) 2006-2011 Apple Inc. All rights reserved. 
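The ptrace hunks above stage MAC hooks (under NOTYET) in front of the PT_KILL and PT_STEP paths, treating SIGKILL, SIGSTOP, and SIGCONT as the comparable permission checks. The userspace requests keep their BSD shape; a sketch of the PT_KILL request that reaches the psignal(t, SIGKILL) path above (pid is a hypothetical, already-attached tracee):

    #include <sys/types.h>
    #include <sys/ptrace.h>

    /* Ask the kernel to SIGKILL an attached tracee. */
    static int kill_tracee(pid_t pid)
    {
        return ptrace(PT_KILL, pid, (caddr_t)0, 0);
    }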
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -75,7 +75,7 @@ (sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_SIZE) #define MCACHE_CPU(c) \ - (mcache_cpu_t *)((char *)(c) + MCACHE_SIZE(cpu_number())) + (mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number()))) /* * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used @@ -1416,7 +1416,7 @@ mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp) __private_extern__ void mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size) { - u_int64_t *buf_end = (u_int64_t *)((char *)buf_arg + size); + u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size)); u_int64_t *buf = (u_int64_t *)buf_arg; VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t))); @@ -1429,7 +1429,7 @@ mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size) __private_extern__ void * mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size) { - u_int64_t *buf_end = (u_int64_t *)((char *)buf_arg + size); + u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size)); u_int64_t *buf; VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t))); @@ -1446,7 +1446,7 @@ __private_extern__ void * mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg, size_t size) { - u_int64_t *buf_end = (u_int64_t *)((char *)buf_arg + size); + u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size)); u_int64_t *buf; VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t))); diff --git a/bsd/kern/netboot.c b/bsd/kern/netboot.c index 1eb975ed2..dd238f066 100644 --- a/bsd/kern/netboot.c +++ b/bsd/kern/netboot.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2012 Apple Inc. All rights reserved. + * Copyright (c) 2001-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * diff --git a/bsd/kern/policy_check.c b/bsd/kern/policy_check.c index e5573a99f..09c792833 100644 --- a/bsd/kern/policy_check.c +++ b/bsd/kern/policy_check.c @@ -229,6 +229,7 @@ static struct mac_policy_ops policy_ops = { CHECK_SET_INT_HOOK(proc_check_getaudit) CHECK_SET_INT_HOOK(proc_check_getauid) CHECK_SET_INT_HOOK(proc_check_getlcid) + CHECK_SET_INT_HOOK(proc_check_ledger) CHECK_SET_INT_HOOK(proc_check_map_anon) CHECK_SET_INT_HOOK(proc_check_mprotect) CHECK_SET_INT_HOOK(proc_check_sched) @@ -271,6 +272,7 @@ static struct mac_policy_ops policy_ops = { CHECK_SET_INT_HOOK(system_check_swapoff) CHECK_SET_INT_HOOK(system_check_swapon) CHECK_SET_INT_HOOK(system_check_sysctl) + CHECK_SET_INT_HOOK(system_check_kas_info) CHECK_SET_INT_HOOK(sysvmsq_check_enqueue) CHECK_SET_INT_HOOK(sysvmsq_check_msgrcv) CHECK_SET_INT_HOOK(sysvmsq_check_msgrmid) @@ -447,7 +449,7 @@ static struct mac_policy_ops policy_ops = { CHECK_SET_VOID_HOOK(task_label_update) CHECK_SET_VOID_HOOK(vnode_label_associate_devfs) CHECK_SET_VOID_HOOK(vnode_label_associate_file) - CHECK_SET_VOID_HOOK(vnode_label_associate_pipe) + CHECK_SET_VOID_HOOK(thread_userret) CHECK_SET_VOID_HOOK(vnode_label_associate_posixsem) CHECK_SET_VOID_HOOK(vnode_label_associate_posixshm) CHECK_SET_VOID_HOOK(vnode_label_associate_singlelabel) @@ -458,11 +460,8 @@ static struct mac_policy_ops policy_ops = { CHECK_SET_VOID_HOOK(vnode_label_recycle) CHECK_SET_VOID_HOOK(vnode_label_update) CHECK_SET_VOID_HOOK(vnode_notify_rename) - .mpo_reserved12 = common_void_hook, - .mpo_reserved14 = common_void_hook, - .mpo_reserved15 = common_void_hook, - .mpo_reserved16 = common_void_hook, - .mpo_reserved17 = common_void_hook, + CHECK_SET_VOID_HOOK(thread_label_init) + CHECK_SET_VOID_HOOK(thread_label_destroy) 
.mpo_reserved18 = common_void_hook, .mpo_reserved19 = common_void_hook, .mpo_reserved20 = common_void_hook, diff --git a/bsd/kern/posix_shm.c b/bsd/kern/posix_shm.c index 617d1dc9f..d46eb0b3c 100644 --- a/bsd/kern/posix_shm.c +++ b/bsd/kern/posix_shm.c @@ -499,8 +499,8 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval) pinfo->pshm_mode = cmode; pinfo->pshm_uid = kauth_getuid(); pinfo->pshm_gid = kauth_getgid(); - bcopy(pnbuf, &pinfo->pshm_name[0], PSHMNAMLEN); - pinfo->pshm_name[PSHMNAMLEN]=0; + bcopy(pnbuf, &pinfo->pshm_name[0], pathlen); + pinfo->pshm_name[pathlen]=0; #if CONFIG_MACF error = mac_posixshm_check_create(kauth_cred_get(), nameptr); if (error) { @@ -530,7 +530,7 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval) AUDIT_ARG(posix_ipc_perm, pinfo->pshm_uid, pinfo->pshm_gid, pinfo->pshm_mode); #if CONFIG_MACF - if ((error = mac_posixshm_check_open(kauth_cred_get(), pinfo))) { + if ((error = mac_posixshm_check_open(kauth_cred_get(), pinfo, fmode))) { goto bad; } #endif @@ -550,7 +550,7 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval) goto bad; } #if CONFIG_MACF - if ((error = mac_posixshm_check_open(kauth_cred_get(), pinfo))) { + if ((error = mac_posixshm_check_open(kauth_cred_get(), pinfo, fmode))) { goto bad; } #endif @@ -829,10 +829,10 @@ pshm_access(struct pshminfo *pinfo, int mode, kauth_cred_t cred, __unused proc_t int pshm_mmap(__unused proc_t p, struct mmap_args *uap, user_addr_t *retval, struct fileproc *fp, off_t pageoff) { - mach_vm_offset_t user_addr = (mach_vm_offset_t)uap->addr; - mach_vm_size_t user_size = (mach_vm_size_t)uap->len ; - mach_vm_offset_t user_start_addr; - mach_vm_size_t map_size, mapped_size; + vm_map_offset_t user_addr = (vm_map_offset_t)uap->addr; + vm_map_size_t user_size = (vm_map_size_t)uap->len ; + vm_map_offset_t user_start_addr; + vm_map_size_t map_size, mapped_size; int prot = uap->prot; int flags = uap->flags; vm_object_offset_t file_pos = (vm_object_offset_t)uap->pos; @@ -898,9 +898,9 @@ pshm_mmap(__unused proc_t p, struct mmap_args *uap, user_addr_t *retval, struct if ((flags & MAP_FIXED) == 0) { alloc_flags = VM_FLAGS_ANYWHERE; - user_addr = mach_vm_round_page(user_addr); + user_addr = vm_map_round_page(user_addr); } else { - if (user_addr != mach_vm_trunc_page(user_addr)) + if (user_addr != vm_map_round_page(user_addr)) return (EINVAL); /* * We do not get rid of the existing mappings here because @@ -1099,15 +1099,23 @@ shm_unlink(__unused proc_t p, struct shm_unlink_args *uap, AUDIT_ARG(posix_ipc_perm, pinfo->pshm_uid, pinfo->pshm_gid, pinfo->pshm_mode); - /* - * JMM - How should permissions be checked? + /* + * following file semantics, unlink should be allowed + * for users with write permission only. 
*/ + if ( (error = pshm_access(pinfo, FWRITE, kauth_cred_get(), p)) ) { + PSHM_SUBSYS_UNLOCK(); + goto bad; + } pinfo->pshm_flags |= PSHM_INDELETE; pshm_cache_delete(pcache); pinfo->pshm_flags |= PSHM_REMOVED; /* release the existence reference */ if (!--pinfo->pshm_usecount) { +#if CONFIG_MACF + mac_posixshm_label_destroy(pinfo); +#endif PSHM_SUBSYS_UNLOCK(); /* * If this is the last reference going away on the object, diff --git a/bsd/kern/proc_info.c b/bsd/kern/proc_info.c index a907fad59..482f83e0e 100644 --- a/bsd/kern/proc_info.c +++ b/bsd/kern/proc_info.c @@ -48,6 +48,7 @@ #include #include #include +#include #include @@ -96,6 +97,8 @@ int proc_pidfdinfo(int pid, int flavor,int fd, user_addr_t buffer, uint32_t buff int proc_kernmsgbuf(user_addr_t buffer, uint32_t buffersize, int32_t * retval); int proc_setcontrol(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t * retval); int proc_pidfileportinfo(int pid, int flavor, mach_port_name_t name, user_addr_t buffer, uint32_t buffersize, int32_t *retval); +int proc_dirtycontrol(int pid, int flavor, uint64_t arg, int32_t * retval); +int proc_terminate(int pid, int32_t * retval); /* protos for procpidinfo calls */ int proc_pidfdlist(proc_t p, user_addr_t buffer, uint32_t buffersize, int32_t *retval); @@ -103,7 +106,7 @@ int proc_pidbsdinfo(proc_t p, struct proc_bsdinfo *pbsd, int zombie); int proc_pidshortbsdinfo(proc_t p, struct proc_bsdshortinfo *pbsd_shortp, int zombie); int proc_pidtaskinfo(proc_t p, struct proc_taskinfo *ptinfo); int proc_pidallinfo(proc_t p, int flavor, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t *retval); -int proc_pidthreadinfo(proc_t p, uint64_t arg, struct proc_threadinfo *pthinfo); +int proc_pidthreadinfo(proc_t p, uint64_t arg, int thuniqueid, struct proc_threadinfo *pthinfo); int proc_pidthreadpathinfo(proc_t p, uint64_t arg, struct proc_threadwithpathinfo *pinfo); int proc_pidlistthreads(proc_t p, user_addr_t buffer, uint32_t buffersize, int32_t *retval); int proc_pidregioninfo(proc_t p, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t *retval); @@ -127,11 +130,16 @@ int pid_atalkinfo(struct atalk * at, struct fileproc * fp, int closeonexec, us /* protos for misc */ +void proc_dirty_start(struct proc *p); +void proc_dirty_end(struct proc *p); + int fill_vnodeinfo(vnode_t vp, struct vnode_info *vinfo); void fill_fileinfo(struct fileproc * fp, int closeonexec, struct proc_fileinfo * finfo); static int proc_security_policy(proc_t p); static void munge_vinfo_stat(struct stat64 *sbp, struct vinfo_stat *vsbp); +extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int, int); + uint64_t get_dispatchqueue_offset_from_proc(void *p) { if(p != NULL) { @@ -169,7 +177,10 @@ proc_info_internal(int callnum, int pid, int flavor, uint64_t arg, user_addr_t b return(proc_setcontrol(pid, flavor, arg, buffer, buffersize, retval)); case 6: /* proc_pidfileportinfo */ return(proc_pidfileportinfo(pid, flavor, (mach_port_name_t)arg, buffer, buffersize, retval)); - + case 7: /* proc_terminate */ + return(proc_terminate(pid, retval)); + case 8: /* proc_dirtycontrol */ + return(proc_dirtycontrol(pid, flavor, arg, retval)); default: return(EINVAL); } @@ -525,6 +536,10 @@ proc_pidbsdinfo(proc_t p, struct proc_bsdinfo * pbsd, int zombie) pbsd->pbi_flags |= PROC_FLAG_CTTY; } +#if !CONFIG_EMBEDDED + if ((p->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP) + pbsd->pbi_flags |= PROC_FLAG_DELAYIDLESLEEP; +#endif /* !CONFIG_EMBEDDED */ switch(PROC_CONTROL_STATE(p)) { case 
P_PCTHROTTLE: @@ -553,6 +568,8 @@ proc_pidbsdinfo(proc_t p, struct proc_bsdinfo * pbsd, int zombie) if (zombie == 0) pbsd->pbi_nfiles = p->p_fd->fd_nfiles; + + pbsd->e_tdev = NODEV; if (pg != PGRP_NULL) { pbsd->pbi_pgid = p->p_pgrpid; pbsd->pbi_pjobc = pg->pg_jobc; @@ -600,6 +617,10 @@ proc_pidshortbsdinfo(proc_t p, struct proc_bsdshortinfo * pbsd_shortp, int zombi pbsd_shortp->pbsi_flags |= PROC_FLAG_PSUGID; if ((p->p_flag & P_EXEC) == P_EXEC) pbsd_shortp->pbsi_flags |= PROC_FLAG_EXEC; +#if !CONFIG_EMBEDDED + if ((p->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP) + pbsd_shortp->pbsi_flags |= PROC_FLAG_DELAYIDLESLEEP; +#endif /* !CONFIG_EMBEDDED */ switch(PROC_CONTROL_STATE(p)) { case P_PCTHROTTLE: @@ -652,14 +673,14 @@ proc_pidtaskinfo(proc_t p, struct proc_taskinfo * ptinfo) int -proc_pidthreadinfo(proc_t p, uint64_t arg, struct proc_threadinfo *pthinfo) +proc_pidthreadinfo(proc_t p, uint64_t arg, int thuniqueid, struct proc_threadinfo *pthinfo) { int error = 0; uint64_t threadaddr = (uint64_t)arg; bzero(pthinfo, sizeof(struct proc_threadinfo)); - error = fill_taskthreadinfo(p->task, threadaddr, (struct proc_threadinfo_internal *)pthinfo, NULL, NULL); + error = fill_taskthreadinfo(p->task, threadaddr, thuniqueid, (struct proc_threadinfo_internal *)pthinfo, NULL, NULL); if (error) return(ESRCH); else @@ -704,7 +725,7 @@ proc_pidthreadpathinfo(proc_t p, uint64_t arg, struct proc_threadwithpathinfo * bzero(pinfo, sizeof(struct proc_threadwithpathinfo)); - error = fill_taskthreadinfo(p->task, threadaddr, (struct proc_threadinfo_internal *)&pinfo->pt, (void *)&vp, &vid); + error = fill_taskthreadinfo(p->task, threadaddr, 0, (struct proc_threadinfo_internal *)&pinfo->pt, (void *)&vp, &vid); if (error) return(ESRCH); @@ -937,6 +958,7 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu int refheld = 0, shortversion = 0; uint32_t size; int zombie = 0; + int thuniqueid = 0; switch (flavor) { case PROC_PIDLISTFDS: @@ -989,6 +1011,9 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu if (buffer == (user_addr_t)0) size = 0; break; + case PROC_PIDTHREADID64INFO: + size = PROC_PIDTHREADID64INFO_SIZE; + break; default: return(EINVAL); } @@ -1099,10 +1124,12 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu } break; + case PROC_PIDTHREADID64INFO: + thuniqueid = 1; case PROC_PIDTHREADINFO:{ struct proc_threadinfo pthinfo; - error = proc_pidthreadinfo(p, arg, &pthinfo); + error = proc_pidthreadinfo(p, arg, thuniqueid, &pthinfo); if (error == 0) { error = copyout(&pthinfo, buffer, sizeof(struct proc_threadinfo)); if (error == 0) @@ -1756,6 +1783,15 @@ proc_setcontrol(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t } break; + case PROC_SELFSET_DELAYIDLESLEEP: { + /* mark or clear the process property to delay idle sleep disk IO */ + if (pcontrol != 0) + OSBitOrAtomic(P_DELAYIDLESLEEP, &pself->p_flag); + else + OSBitAndAtomic(~((uint32_t)P_DELAYIDLESLEEP), &pself->p_flag); + } + break; + default: error = ENOTSUP; } @@ -1764,6 +1800,262 @@ out: return(error); } +void +proc_dirty_start(struct proc *p) +{ + proc_lock(p); + while (p->p_dirty & P_DIRTY_BUSY) { + msleep(&p->p_dirty, &p->p_mlock, 0, "proc_dirty_start", NULL); + } + p->p_dirty |= P_DIRTY_BUSY; + proc_unlock(p); +} + +void +proc_dirty_end(struct proc *p) +{ + proc_lock(p); + if (p->p_dirty & P_DIRTY_BUSY) { + p->p_dirty &= ~P_DIRTY_BUSY; + wakeup(&p->p_dirty); + } + proc_unlock(p); +} + +static boolean_t +proc_validate_track_flags(uint32_t 
pcontrol, struct proc *target_p) { + /* Check idle exit isn't specified independently */ + if ((pcontrol & PROC_DIRTY_TRACK_MASK) == PROC_DIRTY_ALLOW_IDLE_EXIT) { + return false; + } + + /* See that the process isn't marked for termination */ + if (target_p->p_dirty & P_DIRTY_TERMINATED) { + return false; + } + + return true; +} + +int +proc_dirtycontrol(int pid, int flavor, uint64_t arg, int32_t *retval) { + struct proc *target_p; + int error = 0; + uint32_t pcontrol = (uint32_t)arg; + kauth_cred_t my_cred, target_cred; + boolean_t self = FALSE; + boolean_t child = FALSE; + pid_t selfpid; + + target_p = proc_find(pid); + if (target_p == PROC_NULL) { + return(ESRCH); + } + + my_cred = kauth_cred_get(); + target_cred = kauth_cred_proc_ref(target_p); + + selfpid = proc_selfpid(); + if (pid == selfpid) { + self = TRUE; + } else if (target_p->p_ppid == selfpid) { + child = TRUE; + } + + switch (flavor) { + case PROC_DIRTYCONTROL_TRACK: { + /* Only allow the process itself, its parent, or root */ + if ((self == FALSE) && (child == FALSE) && kauth_cred_issuser(kauth_cred_get()) != TRUE) { + error = EPERM; + goto out; + } + + proc_dirty_start(target_p); + + if (proc_validate_track_flags(pcontrol, target_p)) { + /* Cumulative, as per */ + target_p->p_dirty |= + ((pcontrol & PROC_DIRTY_TRACK) ? P_DIRTY_TRACK : 0) | + ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) ? P_DIRTY_ALLOW_IDLE_EXIT : 0); +#if CONFIG_MEMORYSTATUS + if ((target_p->p_dirty & P_DIRTY_CAN_IDLE_EXIT) == P_DIRTY_CAN_IDLE_EXIT) { + memorystatus_on_track_dirty(pid, TRUE); + } +#endif + } else { + error = EINVAL; + } + + proc_dirty_end(target_p); + } + break; + + case PROC_DIRTYCONTROL_SET: { + boolean_t kill = false; + + /* Check privileges; use cansignal() here since the process could be terminated */ + if (!cansignal(current_proc(), my_cred, target_p, SIGKILL, 0)) { + error = EPERM; + goto out; + } + + proc_dirty_start(target_p); + + if (!(target_p->p_dirty & P_DIRTY_TRACK)) { + /* Dirty tracking not enabled */ + error = EINVAL; + } else if (pcontrol && (target_p->p_dirty & P_DIRTY_TERMINATED)) { + /* + * Process is set to be terminated and we're attempting to mark it dirty. + * Set for termination and marking as clean is OK - see . + */ + error = EBUSY; + } else { + int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN; + if (pcontrol && !(target_p->p_dirty & flag)) { + target_p->p_dirty |= flag; + } else if ((pcontrol == 0) && (target_p->p_dirty & flag)) { + if ((flag == P_DIRTY_SHUTDOWN) && (!target_p->p_dirty & P_DIRTY)) { + /* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */ + target_p->p_dirty |= P_DIRTY_TERMINATED; + kill = true; + } else if ((flag == P_DIRTY) && (target_p->p_dirty & P_DIRTY_TERMINATED)) { + /* Kill previously terminated processes if set clean */ + kill = true; + } + target_p->p_dirty &= ~flag; + } else { + /* Already set */ + error = EALREADY; + } + } +#if CONFIG_MEMORYSTATUS + if ((error == 0) && ((target_p->p_dirty & P_DIRTY_CAN_IDLE_EXIT) == P_DIRTY_CAN_IDLE_EXIT)) { + memorystatus_on_dirty(pid, pcontrol ? 
TRUE : FALSE); + } +#endif + proc_dirty_end(target_p); + + if ((error == 0) && (kill == true)) { + psignal(target_p, SIGKILL); + } + } + break; + + case PROC_DIRTYCONTROL_GET: { + /* No permissions check - dirty state is freely available */ + if (retval) { + proc_dirty_start(target_p); + + *retval = 0; + if (target_p->p_dirty & P_DIRTY_TRACK) { + *retval |= PROC_DIRTY_TRACKED; + if (target_p->p_dirty & P_DIRTY_ALLOW_IDLE_EXIT) { + *retval |= PROC_DIRTY_ALLOWS_IDLE_EXIT; + } + if (target_p->p_dirty & P_DIRTY) { + *retval |= PROC_DIRTY_IS_DIRTY; + } + } + + proc_dirty_end(target_p); + } else { + error = EINVAL; + } + } + break; + } + +out: + proc_rele(target_p); + kauth_cred_unref(&target_cred); + + return(error); +} + +/* + * proc_terminate() provides support for sudden termination. + * SIGKILL is issued to tracked, clean processes; otherwise, + * SIGTERM is sent. + */ + +int +proc_terminate(int pid, int32_t *retval) +{ + int error = 0; + proc_t p; + kauth_cred_t uc = kauth_cred_get(); + int sig; + +#if 0 + /* XXX: Check if these are necessary */ + AUDIT_ARG(pid, pid); + AUDIT_ARG(signum, sig); +#endif + + if (pid <= 0 || retval == NULL) { + return (EINVAL); + } + + if ((p = proc_find(pid)) == NULL) { + return (ESRCH); + } + +#if 0 + /* XXX: Check if these are necessary */ + AUDIT_ARG(process, p); +#endif + + /* Check privileges; if SIGKILL can be issued, then SIGTERM is also OK */ + if (!cansignal(current_proc(), uc, p, SIGKILL, 0)) { + error = EPERM; + goto out; + } + + proc_dirty_start(p); + + p->p_dirty |= P_DIRTY_TERMINATED; + + if ((p->p_dirty & (P_DIRTY_TRACK|P_DIRTY_IS_DIRTY)) == P_DIRTY_TRACK) { + /* Clean; mark as terminated and issue SIGKILL */ + sig = SIGKILL; + } else { + /* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */ + sig = SIGTERM; + } + + proc_dirty_end(p); + + proc_removethrottle(p); + + psignal(p, sig); + *retval = sig; + +out: + proc_rele(p); + + return error; +} + +void +proc_removethrottle(proc_t p) + +{ + /* remove throttled states in all threads; process is going to terminate soon */ + proc_lock(p); + + /* if already marked marked for proc_termiantion.. */ + if ((p->p_lflag & P_LPTERMINATE) != 0) { + proc_unlock(p); + return; + } + p->p_lflag |= P_LPTERMINATE; + proc_unlock(p); + + (void)proc_task_remove_throttle(p->task); + +} + /* * copy stat64 structure into vinfo_stat structure. 
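proc_dirtycontrol() and proc_terminate() above implement the dirty-state machine: P_DIRTY_BUSY serializes access through proc_dirty_start()/proc_dirty_end(), PROC_DIRTYCONTROL_SET flips the tracking bits, and sudden termination sends SIGKILL only to a tracked, clean process. One hazard worth flagging in the SET path: the expression (!target_p->p_dirty & P_DIRTY) parses as ((!p_dirty) & P_DIRTY), because logical-not binds tighter than bitwise-and; the clean test the comment describes would read !(p_dirty & P_DIRTY). A self-contained sketch of the signal decision follows, with stand-in flag values (xnu's P_DIRTY_* constants are defined elsewhere in proc_internal.h and are not reproduced here).

/*
 * Sketch of the decision in proc_terminate() above: SIGKILL for a
 * tracked, clean process; SIGTERM otherwise so it can clean up.
 * Flag values are hypothetical stand-ins. Note the precedence trap
 * mentioned in the text: "!dirty & FLAG" is "(!dirty) & FLAG", so a
 * clean test must be written "!(dirty & FLAG)".
 */
#include <signal.h>
#include <stdio.h>

#define D_TRACK      0x1   /* stands in for P_DIRTY_TRACK */
#define D_DIRTY      0x2   /* stands in for the dirty bit */

static int termination_signal(unsigned dirty)
{
    if ((dirty & (D_TRACK | D_DIRTY)) == D_TRACK)
        return SIGKILL;    /* tracked and clean: safe to kill outright */
    return SIGTERM;        /* dirty or untracked: allow cleanup */
}

int main(void)
{
    printf("tracked+clean -> %d (SIGKILL=%d)\n",
        termination_signal(D_TRACK), SIGKILL);
    printf("tracked+dirty -> %d (SIGTERM=%d)\n",
        termination_signal(D_TRACK | D_DIRTY), SIGTERM);
    printf("untracked     -> %d (SIGTERM=%d)\n",
        termination_signal(0), SIGTERM);
    return 0;
}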
diff --git a/bsd/kern/process_policy.c b/bsd/kern/process_policy.c index e6596dad4..ff919538f 100644 --- a/bsd/kern/process_policy.c +++ b/bsd/kern/process_policy.c @@ -69,6 +69,10 @@ #include #include +#if CONFIG_EMBEDDED +#include +#include +#endif /* CONFIG_EMBEDDED */ static int handle_background(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid); static int handle_hwaccess(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid); @@ -79,6 +83,11 @@ static int handle_apptype(int scope, int action, int policy, int policy_subtype, extern kern_return_t task_suspend(task_t); extern kern_return_t task_resume(task_t); +#if CONFIG_EMBEDDED +static int handle_applifecycle(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid); +#endif /* CONFIG_EMBEDDED */ + + /***************************** process_policy ********************/ /* @@ -91,7 +100,7 @@ extern kern_return_t task_resume(task_t); /* system call implementaion */ int -process_policy(struct proc *p, struct process_policy_args * uap, __unused int32_t *retval) +process_policy(__unused struct proc *p, struct process_policy_args * uap, __unused int32_t *retval) { int error = 0; int scope = uap->scope; @@ -101,7 +110,7 @@ process_policy(struct proc *p, struct process_policy_args * uap, __unused int32_ user_addr_t attrp = uap->attrp; pid_t target_pid = uap->target_pid; uint64_t target_threadid = uap->target_threadid; - proc_t proc = PROC_NULL; + proc_t target_proc = PROC_NULL; proc_t curp = current_proc(); kauth_cred_t my_cred; #if CONFIG_EMBEDDED @@ -111,17 +120,17 @@ process_policy(struct proc *p, struct process_policy_args * uap, __unused int32_ if ((scope != PROC_POLICY_SCOPE_PROCESS) && (scope != PROC_POLICY_SCOPE_THREAD)) { return(EINVAL); } - proc = proc_find(target_pid); - if (proc == PROC_NULL) { - return(EINVAL); + target_proc = proc_find(target_pid); + if (target_proc == PROC_NULL) { + return(ESRCH); } - my_cred = kauth_cred_proc_ref(curp); + my_cred = kauth_cred_get(); #if CONFIG_EMBEDDED - target_cred = kauth_cred_proc_ref(proc); + target_cred = kauth_cred_proc_ref(target_proc); - if (suser(my_cred, NULL) && kauth_cred_getruid(my_cred) && + if (!kauth_cred_issuser(my_cred) && kauth_cred_getruid(my_cred) && kauth_cred_getuid(my_cred) != kauth_cred_getuid(target_cred) && kauth_cred_getruid(my_cred) != kauth_cred_getuid(target_cred)) #else @@ -131,7 +140,7 @@ process_policy(struct proc *p, struct process_policy_args * uap, __unused int32_ */ if ((policy != PROC_POLICY_RESOURCE_STARVATION) && (policy != PROC_POLICY_APPTYPE) && - (suser(my_cred, NULL) && curp != p)) + (!kauth_cred_issuser(my_cred) && curp != p)) #endif { error = EPERM; @@ -139,27 +148,39 @@ process_policy(struct proc *p, struct process_policy_args * uap, __unused int32_ } #if CONFIG_MACF - error = mac_proc_check_sched(curp, p); - if (error) - goto out; -#endif +#if CONFIG_EMBEDDED + /* Lifecycle management will invoke approp macf checks */ + if (policy != PROC_POLICY_APP_LIFECYCLE) { +#endif /* CONFIG_EMBEDDED */ + error = mac_proc_check_sched(curp, target_proc); + if (error) + goto out; +#if CONFIG_EMBEDDED + } +#endif /* CONFIG_EMBEDDED */ +#endif /* CONFIG_MACF */ switch(policy) { case PROC_POLICY_BACKGROUND: - error = handle_background(scope, action, policy, policy_subtype, attrp, proc, target_threadid); + error = handle_background(scope, action, policy, policy_subtype, attrp, target_proc, 
target_threadid); break; case PROC_POLICY_HARDWARE_ACCESS: - error = handle_hwaccess(scope, action, policy, policy_subtype, attrp, proc, target_threadid); + error = handle_hwaccess(scope, action, policy, policy_subtype, attrp, target_proc, target_threadid); break; case PROC_POLICY_RESOURCE_STARVATION: - error = handle_lowresrouce(scope, action, policy, policy_subtype, attrp, proc, target_threadid); + error = handle_lowresrouce(scope, action, policy, policy_subtype, attrp, target_proc, target_threadid); break; case PROC_POLICY_RESOURCE_USAGE: - error = handle_resourceuse(scope, action, policy, policy_subtype, attrp, proc, target_threadid); + error = handle_resourceuse(scope, action, policy, policy_subtype, attrp, target_proc, target_threadid); + break; +#if CONFIG_EMBEDDED + case PROC_POLICY_APP_LIFECYCLE: + error = handle_applifecycle(scope, action, policy, policy_subtype, attrp, target_proc, target_threadid); break; +#endif /* CONFIG_EMBEDDED */ case PROC_POLICY_APPTYPE: - error = handle_apptype(scope, action, policy, policy_subtype, attrp, proc, target_threadid); + error = handle_apptype(scope, action, policy, policy_subtype, attrp, target_proc, target_threadid); break; default: error = EINVAL; @@ -167,8 +188,7 @@ process_policy(struct proc *p, struct process_policy_args * uap, __unused int32_ } out: - proc_rele(proc); - kauth_cred_unref(&my_cred); + proc_rele(target_proc); #if CONFIG_EMBEDDED kauth_cred_unref(&target_cred); #endif @@ -355,6 +375,12 @@ handle_resourceuse(__unused int scope, __unused int action, __unused int policy, cpuattr.ppattr_cpu_attr_interval, cpuattr.ppattr_cpu_attr_deadline); } + break; + + case PROC_POLICY_ACTION_RESTORE: + error = proc_clear_task_ruse_cpu(proc->task); + break; + default: error = EINVAL; break; @@ -364,13 +390,123 @@ handle_resourceuse(__unused int scope, __unused int action, __unused int policy, return(error); } +#if CONFIG_EMBEDDED +static int +handle_applifecycle(__unused int scope, int action, __unused int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid) +{ + + int error = 0; + int state = 0, oldstate = 0; + int noteval = 0; + + + + switch(policy_subtype) { + case PROC_POLICY_APPLIFE_NONE: + error = 0; + break; + + case PROC_POLICY_APPLIFE_STATE: +#if CONFIG_MACF + error = mac_proc_check_sched(current_proc(), proc); + if (error) + goto out; +#endif + switch (action) { + case PROC_POLICY_ACTION_GET : + state = proc_lf_getappstate(proc->task); + error = copyout((int *)&state, (user_addr_t)attrp, sizeof(int)); + break; + case PROC_POLICY_ACTION_APPLY : + case PROC_POLICY_ACTION_SET : + error = copyin((user_addr_t)attrp, (int *)&state, sizeof(int)); + if ((error == 0) && (state != TASK_APPSTATE_NONE)) { + oldstate = proc_lf_getappstate(proc->task); + error = proc_lf_setappstate(proc->task, state); + if (error == 0) { + switch (state) { + case TASK_APPSTATE_ACTIVE: + noteval = NOTE_APPACTIVE; + break; + case TASK_APPSTATE_BACKGROUND: + noteval = NOTE_APPBACKGROUND; + break; + case TASK_APPSTATE_NONUI: + noteval = NOTE_APPNONUI; + break; + case TASK_APPSTATE_INACTIVE: + noteval = NOTE_APPINACTIVE; + break; + } + + proc_lock(proc); + proc_knote(proc, noteval); + proc_unlock(proc); + } + } + break; + + default: + error = EINVAL; + break; + } + break; + + case PROC_POLICY_APPLIFE_DEVSTATUS: +#if CONFIG_MACF + /* ToDo - this should be a generic check, since we could potentially hang other behaviours here. 
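handle_applifecycle() above copies the requested app state in from userspace, applies it with proc_lf_setappstate(), maps it to a kevent note, and publishes that note with proc_knote() under the proc lock; the MAC gate for the DEVSTATUS subtype continues just below. A compact sketch of the state-to-note mapping, with hypothetical stand-ins for the TASK_APPSTATE_* and NOTE_APP* constants:

/*
 * Sketch of the app-state -> knote mapping in handle_applifecycle().
 * Enum and note values are illustrative, not xnu's real constants.
 */
#include <stdio.h>

enum appstate { AS_NONE, AS_ACTIVE, AS_BACKGROUND, AS_NONUI, AS_INACTIVE };

#define NOTE_ACTIVE     0x1
#define NOTE_BACKGROUND 0x2
#define NOTE_NONUI      0x4
#define NOTE_INACTIVE   0x8

/* returns the kevent note to publish for a state change, 0 if none */
static int note_for_state(enum appstate s)
{
    switch (s) {
    case AS_ACTIVE:     return NOTE_ACTIVE;
    case AS_BACKGROUND: return NOTE_BACKGROUND;
    case AS_NONUI:      return NOTE_NONUI;
    case AS_INACTIVE:   return NOTE_INACTIVE;
    default:            return 0;   /* AS_NONE: nothing to post */
    }
}

int main(void)
{
    printf("background note: 0x%x\n", note_for_state(AS_BACKGROUND));
    return 0;
}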
*/ + error = mac_proc_check_suspend_resume(current_proc(), MAC_PROC_CHECK_HIBERNATE); + if (error) { + error = EPERM; + goto out; + } +#endif + if (action == PROC_POLICY_ACTION_APPLY) { + /* Used as a freeze hint */ + memorystatus_on_inactivity(-1); + + /* in future use devicestatus for pid_socketshutdown() */ + error = 0; + } else { + error = EINVAL; + } + break; + + case PROC_POLICY_APPLIFE_PIDBIND: +#if CONFIG_MACF + error = mac_proc_check_suspend_resume(current_proc(), MAC_PROC_CHECK_PIDBIND); + if (error) { + error = EPERM; + goto out; + } +#endif + error = copyin((user_addr_t)attrp, (int *)&state, sizeof(int)); + if (error != 0) + goto out; + if (action == PROC_POLICY_ACTION_APPLY) { + /* bind the thread in target_thread in current process to target_proc */ + error = proc_lf_pidbind(current_task(), target_threadid, proc->task, state); + } else + error = EINVAL; + break; + default: + error = EINVAL; + break; + } + +out: + return(error); +} +#endif /* CONFIG_EMBEDDED */ + static int -handle_apptype(__unused int scope, int action, __unused int policy, int policy_subtype, __unused user_addr_t attrp, proc_t proc, __unused uint64_t target_threadid) +handle_apptype(__unused int scope, int action, __unused int policy, int policy_subtype, __unused user_addr_t attrp, proc_t target_proc, __unused uint64_t target_threadid) { int error = 0; switch(policy_subtype) { +#if !CONFIG_EMBEDDED case PROC_POLICY_OSX_APPTYPE_TAL: /* need to be super user to do this */ if (kauth_cred_issuser(kauth_cred_get()) == 0) { @@ -381,9 +517,14 @@ handle_apptype(__unused int scope, int action, __unused int policy, int policy_s case PROC_POLICY_OSX_APPTYPE_DASHCLIENT: /* no special priv needed */ break; +#endif /* !CONFIG_EMBEDDED */ case PROC_POLICY_OSX_APPTYPE_NONE: +#if CONFIG_EMBEDDED + case PROC_POLICY_IOS_RESV1_APPTYPE: + case PROC_POLICY_IOS_APPLE_DAEMON: case PROC_POLICY_IOS_APPTYPE: case PROC_POLICY_IOS_NONUITYPE: +#endif /* CONFIG_EMBEDDED */ return(ENOTSUP); break; default: @@ -393,21 +534,24 @@ handle_apptype(__unused int scope, int action, __unused int policy, int policy_s switch (action) { case PROC_POLICY_ACTION_ENABLE: /* reapply the app foreground/background policy */ - error = proc_enable_task_apptype(proc->task, policy_subtype); + error = proc_enable_task_apptype(target_proc->task, policy_subtype); break; case PROC_POLICY_ACTION_DISABLE: /* remove the app foreground/background policy */ - error = proc_disable_task_apptype(proc->task, policy_subtype); + error = proc_disable_task_apptype(target_proc->task, policy_subtype); break; default: error = EINVAL; break; } +#if !CONFIG_EMBEDDED out: +#endif /* !CONFIG_EMBEDDED */ return(error); } + int proc_apply_resource_actions(void * bsdinfo, int type, int action) { @@ -426,11 +570,15 @@ proc_apply_resource_actions(void * bsdinfo, int type, int action) psignal(p, SIGKILL); break; - case PROC_POLICY_RSRCACT_NOTIFY: + case PROC_POLICY_RSRCACT_NOTIFY_KQ: proc_lock(p); proc_knote(p, NOTE_RESOURCEEND | (type & 0xff)); proc_unlock(p); break; + + case PROC_POLICY_RSRCACT_NOTIFY_EXC: + panic("shouldn't be applying exception notification to process!"); + break; } return(0); @@ -445,7 +593,8 @@ proc_restore_resource_actions(void * bsdinfo, __unused int type, int action) switch(action) { case PROC_POLICY_RSRCACT_THROTTLE: case PROC_POLICY_RSRCACT_TERMINATE: - case PROC_POLICY_RSRCACT_NOTIFY: + case PROC_POLICY_RSRCACT_NOTIFY_KQ: + case PROC_POLICY_RSRCACT_NOTIFY_EXC: /* no need to do anything */ break; diff --git a/bsd/kern/pthread_support.c 
b/bsd/kern/pthread_support.c index bcb0b0997..6e96434e1 100644 --- a/bsd/kern/pthread_support.c +++ b/bsd/kern/pthread_support.c @@ -87,7 +87,9 @@ #include #define __PSYNCH_DEBUG__ 0 /* debug panic actions */ +#if (KDEBUG && STANDARD_KDEBUG) #define _PSYNCH_TRACE_ 1 /* kdebug trace */ +#endif #define __TESTMODE__ 2 /* 0 - return error on user error conditions */ /* 1 - log error on user error conditions */ @@ -1739,8 +1741,13 @@ out: * psynch_rw_longrdlock: This system call is used for psync rwlock long readers to block. */ int +#ifdef NOTYET +psynch_rw_longrdlock(__unused proc_t p, struct psynch_rw_longrdlock_args * uap, __unused uint32_t * retval) +#else /* NOTYET */ psynch_rw_longrdlock(__unused proc_t p, __unused struct psynch_rw_longrdlock_args * uap, __unused uint32_t * retval) +#endif /* NOTYET */ { +#ifdef NOTYET user_addr_t rwlock = uap->rwlock; uint32_t lgen = uap->lgenval; uint32_t ugen = uap->ugenval; @@ -1875,8 +1882,12 @@ out: __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 0, returnbits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); +#else /* NOTYET */ + return(ESRCH); +#endif /* NOTYET */ } + /* * psynch_rw_wrlock: This system call is used for psync rwlock writers to block. */ @@ -2029,8 +2040,13 @@ out1: * psynch_rw_yieldwrlock: This system call is used for psync rwlock yielding writers to block. */ int +#ifdef NOTYET psynch_rw_yieldwrlock(__unused proc_t p, __unused struct psynch_rw_yieldwrlock_args * uap, __unused uint32_t * retval) +#else /* NOTYET */ +psynch_rw_yieldwrlock(__unused proc_t p, __unused struct __unused psynch_rw_yieldwrlock_args * uap, __unused uint32_t * retval) +#endif /* NOTYET */ { +#ifdef NOTYET user_addr_t rwlock = uap->rwlock; uint32_t lgen = uap->lgenval; uint32_t ugen = uap->ugenval; @@ -2166,6 +2182,9 @@ out: __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, returnbits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); +#else /* NOTYET */ + return(ESRCH); +#endif /* NOTYET */ } #if NOTYET @@ -2657,12 +2676,13 @@ pth_proc_hashdelete(proc_t p) pthread_debug_proc = PROC_NULL; #endif /* _PSYNCH_TRACE_ */ hashptr = p->p_pthhash; + p->p_pthhash = NULL; if (hashptr == NULL) return; + pthread_list_lock(); for(i= 0; i < hashsize; i++) { while ((kwq = LIST_FIRST(&hashptr[i])) != NULL) { - pthread_list_lock(); if ((kwq->kw_pflags & KSYN_WQ_INHASH) != 0) { kwq->kw_pflags &= ~KSYN_WQ_INHASH; LIST_REMOVE(kwq, kw_hash); @@ -2679,10 +2699,11 @@ pth_proc_hashdelete(proc_t p) ksyn_freeallkwe(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER]); lck_mtx_destroy(&kwq->kw_lock, pthread_lck_grp); zfree(kwq_zone, kwq); + pthread_list_lock(); } } - FREE(p->p_pthhash, M_PROC); - p->p_pthhash = NULL; + pthread_list_unlock(); + FREE(hashptr, M_PROC); } /* no lock held for this as the waitqueue is getting freed */ @@ -3066,8 +3087,8 @@ ksyn_block_thread_locked(ksyn_wait_queue_t kwq, uint64_t abstime, ksyn_waitq_ele #endif { kern_return_t kret; - int error = 0; #if _PSYNCH_TRACE_ + int error = 0; uthread_t uth = NULL; #endif /* _PSYNCH_TRACE_ */ @@ -4161,7 +4182,7 @@ update_low_high(ksyn_wait_queue_t kwq, uint32_t lockseq) uint32_t find_nextlowseq(ksyn_wait_queue_t kwq) { - uint32_t numbers[4]; + uint32_t numbers[KSYN_QUEUE_MAX]; int count = 0, i; uint32_t lowest; @@ -4188,7 +4209,7 @@ find_nextlowseq(ksyn_wait_queue_t kwq) uint32_t find_nexthighseq(ksyn_wait_queue_t kwq) { - uint32_t numbers[4]; + uint32_t numbers[KSYN_QUEUE_MAX]; int count = 0, i; uint32_t highest; diff --git a/bsd/kern/pthread_synch.c 
b/bsd/kern/pthread_synch.c index 80112d7c2..d037ee0a1 100644 --- a/bsd/kern/pthread_synch.c +++ b/bsd/kern/pthread_synch.c @@ -114,28 +114,34 @@ lck_grp_attr_t *pthread_lck_grp_attr; lck_grp_t *pthread_lck_grp; lck_attr_t *pthread_lck_attr; -extern kern_return_t thread_getstatus(register thread_t act, int flavor, - thread_state_t tstate, mach_msg_type_number_t *count); -extern kern_return_t thread_setstatus(thread_t thread, int flavor, - thread_state_t tstate, mach_msg_type_number_t count); extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64); extern kern_return_t mach_port_deallocate(ipc_space_t, mach_port_name_t); extern kern_return_t semaphore_signal_internal_trap(mach_port_name_t); extern void workqueue_thread_yielded(void); -static int workqueue_additem(struct workqueue *wq, int prio, user_addr_t item, int affinity); -static boolean_t workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t th, - user_addr_t oc_item, int oc_prio, int oc_affinity); -static void wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl, +#if defined(__i386__) || defined(__x86_64__) +extern boolean_t is_useraddr64_canonical(uint64_t addr64); +#endif + +static boolean_t workqueue_run_nextreq(proc_t p, struct workqueue *wq, thread_t th, boolean_t force_oc, + boolean_t overcommit, int oc_prio, int oc_affinity); + +static boolean_t workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, int priority); + +static void wq_runreq(proc_t p, boolean_t overcommit, uint32_t priority, thread_t th, struct threadlist *tl, int reuse_thread, int wake_thread, int return_directly); + +static int setup_wqthread(proc_t p, thread_t th, boolean_t overcommit, uint32_t priority, int reuse_thread, struct threadlist *tl); + static void wq_unpark_continue(void); static void wq_unsuspend_continue(void); -static int setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl); + static boolean_t workqueue_addnewthread(struct workqueue *wq, boolean_t oc_thread); static void workqueue_removethread(struct threadlist *tl, int fromexit); static void workqueue_lock_spin(proc_t); static void workqueue_unlock(proc_t); + int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc); int proc_setalltargetconc(pid_t pid, int32_t * targetconcp); @@ -150,6 +156,12 @@ int proc_setalltargetconc(pid_t pid, int32_t * targetconcp); #define TRUNC_DOWN64(a,c) ((((uint64_t)a)-(c)) & ((uint64_t)(-(c)))) +/* flag values for reuse field in the libc side _pthread_wqthread */ +#define WQ_FLAG_THREAD_PRIOMASK 0x0000ffff +#define WQ_FLAG_THREAD_OVERCOMMIT 0x00010000 /* thread is with overcommit prio */ +#define WQ_FLAG_THREAD_REUSE 0x00020000 /* thread is being reused */ +#define WQ_FLAG_THREAD_NEWSPI 0x00040000 /* the call is with new SPIs */ + /* * Flags filed passed to bsdthread_create and back in pthread_start 31 <---------------------------------> 0 @@ -322,6 +334,13 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args *uap, us */ ts64->rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN); + /* Disallow setting non-canonical PC or stack */ + if (!is_useraddr64_canonical(ts64->rsp) || + !is_useraddr64_canonical(ts64->rip)) { + error = EINVAL; + goto out; + } + thread_set_wq_state64(th, (thread_state_t)ts64); } } @@ -332,8 +351,16 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args *uap, us if ((flags & PTHREAD_START_SETSCHED) != 0) { thread_extended_policy_data_t extinfo; thread_precedence_policy_data_t precedinfo; 
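The WQ_FLAG_* values defined above replace the old per-item user address: under the new SPI the kernel hands the userspace _pthread_wqthread trampoline a single 32-bit word, priority in the low 16 bits (WQ_FLAG_THREAD_PRIOMASK) and the overcommit/reuse/new-SPI attribute bits above it, which is exactly the word setup_wqthread() later builds into edi/r8. A standalone sketch of the packing and the checks a consumer would make; the bsdthread_create() scheduling hunk continues below.

/*
 * Sketch of the flags word handed to the userspace workqueue
 * trampoline, using the WQ_FLAG_* layout defined above: priority
 * in the low 16 bits, attribute bits above it.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define WQ_FLAG_THREAD_PRIOMASK   0x0000ffffu
#define WQ_FLAG_THREAD_OVERCOMMIT 0x00010000u
#define WQ_FLAG_THREAD_REUSE      0x00020000u
#define WQ_FLAG_THREAD_NEWSPI     0x00040000u

static uint32_t pack_wq_flags(uint32_t priority, int overcommit, int reuse)
{
    uint32_t flags = (priority & WQ_FLAG_THREAD_PRIOMASK) | WQ_FLAG_THREAD_NEWSPI;

    if (overcommit)
        flags |= WQ_FLAG_THREAD_OVERCOMMIT;
    if (reuse)
        flags |= WQ_FLAG_THREAD_REUSE;
    return flags;
}

int main(void)
{
    uint32_t f = pack_wq_flags(2, 1, 0);

    assert((f & WQ_FLAG_THREAD_PRIOMASK) == 2);   /* priority survives */
    assert(f & WQ_FLAG_THREAD_OVERCOMMIT);        /* overcommit set */
    assert(!(f & WQ_FLAG_THREAD_REUSE));          /* fresh thread */
    printf("flags = 0x%08x\n", (unsigned)f);
    return 0;
}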
+#if CONFIG_EMBEDDED + int ret = 0; +#endif /* CONFIG_EMBEDDED */ importance = (flags & PTHREAD_START_IMPORTANCE_MASK); +#if CONFIG_EMBEDDED + /* sets the saved importance for apple ios daemon if backgrounded. else returns 0 */ + ret = proc_setthread_saved_importance(th, importance); + if (ret == 0) { +#endif /* CONFIG_EMBEDDED */ policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK; if (policy == SCHED_OTHER) @@ -345,6 +372,9 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args *uap, us #define BASEPRI_DEFAULT 31 precedinfo.importance = (importance - BASEPRI_DEFAULT); thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT); +#if CONFIG_EMBEDDED + } +#endif /* CONFIG_EMBEDDED */ } kret = thread_resume(th); @@ -510,7 +540,7 @@ workqueue_interval_timer_start(struct workqueue *wq) thread_call_enter_delayed(wq->wq_atimer_call, deadline); - KERNEL_DEBUG(0xefffd110, wq, wq->wq_itemcount, wq->wq_flags, wq->wq_timer_interval, 0); + KERNEL_DEBUG(0xefffd110, wq, wq->wq_reqcount, wq->wq_flags, wq->wq_timer_interval, 0); } @@ -630,14 +660,14 @@ again: * new work within our acceptable time interval because * there were no idle threads left to schedule */ - if (wq->wq_itemcount) { + if (wq->wq_reqcount) { uint32_t priority; uint32_t affinity_tag; uint32_t i; uint64_t curtime; for (priority = 0; priority < WORKQUEUE_NUMPRIOS; priority++) { - if (wq->wq_list_bitmap & (1 << priority)) + if (wq->wq_requests[priority]) break; } assert(priority < WORKQUEUE_NUMPRIOS); @@ -675,23 +705,23 @@ again: break; } } - if (wq->wq_itemcount) { + if (wq->wq_reqcount) { /* * as long as we have threads to schedule, and we successfully * scheduled new work, keep trying */ while (wq->wq_thidlecount && !(wq->wq_flags & WQ_EXITING)) { /* - * workqueue_run_nextitem is responsible for + * workqueue_run_nextreq is responsible for * dropping the workqueue lock in all cases */ - retval = workqueue_run_nextitem(p, wq, THREAD_NULL, 0, 0, 0); + retval = workqueue_run_nextreq(p, wq, THREAD_NULL, FALSE, FALSE, 0, 0); workqueue_lock_spin(p); if (retval == FALSE) break; } - if ( !(wq->wq_flags & WQ_EXITING) && wq->wq_itemcount) { + if ( !(wq->wq_flags & WQ_EXITING) && wq->wq_reqcount) { if (wq->wq_thidlecount == 0 && retval == TRUE && add_thread == TRUE) goto again; @@ -699,7 +729,7 @@ again: if (wq->wq_thidlecount == 0 || busycount) WQ_TIMER_NEEDED(wq, start_timer); - KERNEL_DEBUG(0xefffd108 | DBG_FUNC_NONE, wq, wq->wq_itemcount, wq->wq_thidlecount, busycount, 0); + KERNEL_DEBUG(0xefffd108 | DBG_FUNC_NONE, wq, wq->wq_reqcount, wq->wq_thidlecount, busycount, 0); } } } @@ -734,12 +764,12 @@ workqueue_thread_yielded(void) p = current_proc(); - if ((wq = p->p_wqptr) == NULL || wq->wq_itemcount == 0) + if ((wq = p->p_wqptr) == NULL || wq->wq_reqcount == 0) return; workqueue_lock_spin(p); - if (wq->wq_itemcount) { + if (wq->wq_reqcount) { uint64_t curtime; uint64_t elapsed; clock_sec_t secs; @@ -752,7 +782,7 @@ workqueue_thread_yielded(void) workqueue_unlock(p); return; } - KERNEL_DEBUG(0xefffd138 | DBG_FUNC_START, wq, wq->wq_thread_yielded_count, wq->wq_itemcount, 0, 0); + KERNEL_DEBUG(0xefffd138 | DBG_FUNC_START, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 0, 0); wq->wq_thread_yielded_count = 0; @@ -768,11 +798,11 @@ workqueue_thread_yielded(void) * 'workqueue_addnewthread' drops the workqueue lock * when creating the new thread and then retakes it before * returning... 
this window allows other threads to process - * work on the queue, so we need to recheck for available work + * requests, so we need to recheck for available work * if none found, we just return... the newly created thread * will eventually get used (if it hasn't already)... */ - if (wq->wq_itemcount == 0) { + if (wq->wq_reqcount == 0) { workqueue_unlock(p); return; } @@ -780,9 +810,8 @@ workqueue_thread_yielded(void) if (wq->wq_thidlecount) { uint32_t priority; uint32_t affinity = -1; - user_addr_t item; - struct workitem *witem = NULL; - struct workitemlist *wl = NULL; + boolean_t overcommit = FALSE; + boolean_t force_oc = FALSE; struct uthread *uth; struct threadlist *tl; @@ -791,38 +820,31 @@ workqueue_thread_yielded(void) affinity = tl->th_affinity_tag; for (priority = 0; priority < WORKQUEUE_NUMPRIOS; priority++) { - if (wq->wq_list_bitmap & (1 << priority)) { - wl = (struct workitemlist *)&wq->wq_list[priority]; + if (wq->wq_requests[priority]) break; - } } - assert(wl != NULL); - assert(!(TAILQ_EMPTY(&wl->wl_itemlist))); + assert(priority < WORKQUEUE_NUMPRIOS); - witem = TAILQ_FIRST(&wl->wl_itemlist); - TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry); + wq->wq_reqcount--; + wq->wq_requests[priority]--; - if (TAILQ_EMPTY(&wl->wl_itemlist)) - wq->wq_list_bitmap &= ~(1 << priority); - wq->wq_itemcount--; + if (wq->wq_ocrequests[priority]) { + wq->wq_ocrequests[priority]--; + overcommit = TRUE; + } else + force_oc = TRUE; - item = witem->wi_item; - witem->wi_item = (user_addr_t)0; - witem->wi_affinity = 0; - - TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry); - - (void)workqueue_run_nextitem(p, wq, THREAD_NULL, item, priority, affinity); + (void)workqueue_run_nextreq(p, wq, THREAD_NULL, force_oc, overcommit, priority, affinity); /* - * workqueue_run_nextitem is responsible for + * workqueue_run_nextreq is responsible for * dropping the workqueue lock in all cases */ - KERNEL_DEBUG(0xefffd138 | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_itemcount, 1, 0); + KERNEL_DEBUG(0xefffd138 | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 1, 0); return; } } - KERNEL_DEBUG(0xefffd138 | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_itemcount, 2, 0); + KERNEL_DEBUG(0xefffd138 | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 2, 0); } workqueue_unlock(p); } @@ -868,7 +890,7 @@ workqueue_callback(int type, thread_t thread) OSCompareAndSwap64(*lastblocked_ptr, (UInt64)curtime, lastblocked_ptr); - if (wq->wq_itemcount) + if (wq->wq_reqcount) WQ_TIMER_NEEDED(wq, start_timer); if (start_timer == TRUE) @@ -1090,13 +1112,11 @@ workq_open(struct proc *p, __unused struct workq_open_args *uap, __unused int32 int wq_size; char * ptr; char * nptr; - int j; uint32_t i; uint32_t num_cpus; int error = 0; boolean_t need_wakeup = FALSE; - struct workitem * witem; - struct workitemlist *wl; + if ((p->p_lflag & P_LREGISTER) == 0) return(EINVAL); @@ -1138,10 +1158,10 @@ workq_open(struct proc *p, __unused struct workq_open_args *uap, __unused int32 workqueue_unlock(p); wq_size = sizeof(struct workqueue) + - (num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint32_t)) + + (num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint16_t)) + (num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint32_t)) + (num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint64_t)) + - sizeof(uint64_t); + sizeof(uint32_t) + sizeof(uint64_t); ptr = (char *)kalloc(wq_size); bzero(ptr, wq_size); @@ -1153,25 +1173,20 @@ workq_open(struct proc *p, __unused struct workq_open_args *uap, __unused int32 wq->wq_task = current_task(); 
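workq_open() above sizes one kalloc'd region for the workqueue header plus the per-priority, per-CPU counter arrays; the hunk continuing below carves wq_thscheduled_count out first as uint16_t arrays and then rounds nptr up to a 4-byte boundary before placing the uint32_t wq_thactive_count arrays, since an odd number of 16-bit counters can leave the cursor misaligned. The same carve-and-realign idiom as a self-contained sketch (sizes illustrative, and one flat array per counter kind rather than xnu's per-priority pointer tables):

/*
 * Sketch of the single-allocation layout in workq_open(): a header
 * followed by a uint16_t array and a uint32_t array, with the cursor
 * realigned between them exactly as the patch does (add alignment-1,
 * then mask).
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NPRIO 4
#define NCPU  8

struct wq_demo {
    uint16_t *scheduled;   /* NPRIO * NCPU 16-bit counters */
    uint32_t *active;      /* NPRIO * NCPU 32-bit counters */
};

int main(void)
{
    size_t sz = sizeof(struct wq_demo)
        + NPRIO * NCPU * sizeof(uint16_t)
        + sizeof(uint32_t)              /* slack for the realignment */
        + NPRIO * NCPU * sizeof(uint32_t);
    char *ptr = calloc(1, sz);
    struct wq_demo *wq = (struct wq_demo *)ptr;
    char *nptr = ptr + sizeof(struct wq_demo);

    wq->scheduled = (uint16_t *)nptr;
    nptr += NPRIO * NCPU * sizeof(uint16_t);

    /* round nptr up to a uint32_t boundary, as the patch does */
    nptr += sizeof(uint32_t) - 1;
    nptr = (char *)((uintptr_t)nptr & ~(uintptr_t)(sizeof(uint32_t) - 1));

    wq->active = (uint32_t *)nptr;

    printf("active[] aligned: %s\n",
        ((uintptr_t)wq->active % sizeof(uint32_t)) == 0 ? "yes" : "no");
    free(ptr);
    return 0;
}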
wq->wq_map = current_map(); - for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) { - wl = (struct workitemlist *)&wq->wq_list[i]; - TAILQ_INIT(&wl->wl_itemlist); - TAILQ_INIT(&wl->wl_freelist); - - for (j = 0; j < WORKITEM_SIZE; j++) { - witem = &wq->wq_array[(i*WORKITEM_SIZE) + j]; - TAILQ_INSERT_TAIL(&wl->wl_freelist, witem, wi_entry); - } + for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) wq->wq_reqconc[i] = wq->wq_affinity_max; - } + nptr = ptr + sizeof(struct workqueue); for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) { - wq->wq_thactive_count[i] = (uint32_t *)nptr; - nptr += (num_cpus * sizeof(uint32_t)); + wq->wq_thscheduled_count[i] = (uint16_t *)nptr; + nptr += (num_cpus * sizeof(uint16_t)); } + nptr += (sizeof(uint32_t) - 1); + nptr = (char *)((uintptr_t)nptr & ~(sizeof(uint32_t) - 1)); + for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) { - wq->wq_thscheduled_count[i] = (uint32_t *)nptr; + wq->wq_thactive_count[i] = (uint32_t *)nptr; nptr += (num_cpus * sizeof(uint32_t)); } /* @@ -1208,59 +1223,86 @@ out: return(error); } + int workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, __unused int32_t *retval) { - user_addr_t item = uap->item; - int options = uap->options; - int prio = uap->prio; /* should be used to find the right workqueue */ - int affinity = uap->affinity; - int error = 0; - thread_t th = THREAD_NULL; - user_addr_t oc_item = 0; struct workqueue *wq; + int error = 0; if ((p->p_lflag & P_LREGISTER) == 0) return(EINVAL); - /* - * affinity not yet hooked up on this path - */ - affinity = -1; + switch (uap->options) { - switch (options) { + case WQOPS_QUEUE_NEWSPISUPP: + break; + + case WQOPS_QUEUE_REQTHREADS: { + /* + * for this operation, we re-purpose the affinity + * argument as the number of threads to start + */ + boolean_t overcommit = FALSE; + int priority = uap->prio; + int reqcount = uap->affinity; - case WQOPS_QUEUE_ADD: { - - if (prio & WORKQUEUE_OVERCOMMIT) { - prio &= ~WORKQUEUE_OVERCOMMIT; - oc_item = item; + if (priority & WORKQUEUE_OVERCOMMIT) { + priority &= ~WORKQUEUE_OVERCOMMIT; + overcommit = TRUE; } - if ((prio < 0) || (prio >= WORKQUEUE_NUMPRIOS)) - return (EINVAL); + if ((reqcount <= 0) || (priority < 0) || (priority >= WORKQUEUE_NUMPRIOS)) { + error = EINVAL; + break; + } + workqueue_lock_spin(p); - workqueue_lock_spin(p); + if ((wq = (struct workqueue *)p->p_wqptr) == NULL) { + workqueue_unlock(p); - if ((wq = (struct workqueue *)p->p_wqptr) == NULL) { - workqueue_unlock(p); - return (EINVAL); - } - if (wq->wq_thidlecount == 0 && (oc_item || (wq->wq_constrained_threads_scheduled < wq->wq_affinity_max))) { + error = EINVAL; + break; + } + if (overcommit == FALSE) { + wq->wq_reqcount += reqcount; + wq->wq_requests[priority] += reqcount; + + KERNEL_DEBUG(0xefffd008 | DBG_FUNC_NONE, wq, priority, wq->wq_requests[priority], reqcount, 0); - workqueue_addnewthread(wq, oc_item ? TRUE : FALSE); + while (wq->wq_reqcount) { + if (workqueue_run_one(p, wq, overcommit, priority) == FALSE) + break; + } + } else { + KERNEL_DEBUG(0xefffd13c | DBG_FUNC_NONE, wq, priority, wq->wq_requests[priority], reqcount, 0); - if (wq->wq_thidlecount == 0) - oc_item = 0; + while (reqcount) { + if (workqueue_run_one(p, wq, overcommit, priority) == FALSE) + break; + reqcount--; + } + if (reqcount) { + /* + * we need to delay starting some of the overcommit requests... + * we should only fail to create the overcommit threads if + * we're at the max thread limit... 
as existing threads + * return to the kernel, we'll notice the ocrequests + * and spin them back to user space as the overcommit variety + */ + wq->wq_reqcount += reqcount; + wq->wq_requests[priority] += reqcount; + wq->wq_ocrequests[priority] += reqcount; + + KERNEL_DEBUG(0xefffd140 | DBG_FUNC_NONE, wq, priority, wq->wq_requests[priority], reqcount, 0); + } } - if (oc_item == 0) - error = workqueue_additem(wq, prio, item, affinity); + workqueue_unlock(p); - KERNEL_DEBUG(0xefffd008 | DBG_FUNC_NONE, wq, prio, affinity, oc_item, 0); } break; - case WQOPS_THREAD_RETURN: { - th = current_thread(); + case WQOPS_THREAD_RETURN: { + thread_t th = current_thread(); struct uthread *uth = get_bsdthread_info(th); /* reset signal mask on the workqueue thread to default state */ @@ -1269,50 +1311,29 @@ workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, __unused in uth->uu_sigmask = ~workq_threadmask; proc_unlock(p); } - workqueue_lock_spin(p); if ((wq = (struct workqueue *)p->p_wqptr) == NULL || (uth->uu_threadlist == NULL)) { workqueue_unlock(p); - return (EINVAL); + + error = EINVAL; + break; } KERNEL_DEBUG(0xefffd004 | DBG_FUNC_END, wq, 0, 0, 0, 0); - } - break; - case WQOPS_THREAD_SETCONC: { - if ((prio < 0) || (prio > WORKQUEUE_NUMPRIOS)) - return (EINVAL); - - workqueue_lock_spin(p); - - if ((wq = (struct workqueue *)p->p_wqptr) == NULL) { - workqueue_unlock(p); - return (EINVAL); - } + (void)workqueue_run_nextreq(p, wq, th, FALSE, FALSE, 0, -1); /* - * for this operation, we re-purpose the affinity - * argument as the concurrency target + * workqueue_run_nextreq is responsible for + * dropping the workqueue lock in all cases */ - if (prio < WORKQUEUE_NUMPRIOS) - wq->wq_reqconc[prio] = affinity; - else { - for (prio = 0; prio < WORKQUEUE_NUMPRIOS; prio++) - wq->wq_reqconc[prio] = affinity; - - } } break; + default: - return (EINVAL); + error = EINVAL; + break; } - (void)workqueue_run_nextitem(p, wq, th, oc_item, prio, affinity); - /* - * workqueue_run_nextitem is responsible for - * dropping the workqueue lock in all cases - */ return (error); - } /* @@ -1426,30 +1447,6 @@ workqueue_exit(struct proc *p) } } -static int -workqueue_additem(struct workqueue *wq, int prio, user_addr_t item, int affinity) -{ - struct workitem *witem; - struct workitemlist *wl; - - wl = (struct workitemlist *)&wq->wq_list[prio]; - - if (TAILQ_EMPTY(&wl->wl_freelist)) - return (ENOMEM); - - witem = (struct workitem *)TAILQ_FIRST(&wl->wl_freelist); - TAILQ_REMOVE(&wl->wl_freelist, witem, wi_entry); - - witem->wi_item = item; - witem->wi_affinity = affinity; - TAILQ_INSERT_TAIL(&wl->wl_itemlist, witem, wi_entry); - - wq->wq_list_bitmap |= (1 << prio); - - wq->wq_itemcount++; - - return (0); -} static int workqueue_importance[WORKQUEUE_NUMPRIOS] = { @@ -1464,37 +1461,69 @@ static int workqueue_policy[WORKQUEUE_NUMPRIOS] = }; + +static boolean_t +workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, int priority) +{ + boolean_t ran_one; + + if (wq->wq_thidlecount == 0) { + if (overcommit == FALSE) { + if (wq->wq_constrained_threads_scheduled < wq->wq_affinity_max) + workqueue_addnewthread(wq, overcommit); + } else { + workqueue_addnewthread(wq, overcommit); + + if (wq->wq_thidlecount == 0) + return (FALSE); + } + } + ran_one = workqueue_run_nextreq(p, wq, THREAD_NULL, FALSE, overcommit, priority, -1); + /* + * workqueue_run_nextreq is responsible for + * dropping the workqueue lock in all cases + */ + workqueue_lock_spin(p); + + return (ran_one); +} + + + /* - * workqueue_run_nextitem: + * 
workqueue_run_nextreq: * called with the workqueue lock held... * responsible for dropping it in all cases */ static boolean_t -workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_addr_t oc_item, int oc_prio, int oc_affinity) +workqueue_run_nextreq(proc_t p, struct workqueue *wq, thread_t thread, + boolean_t force_oc, boolean_t overcommit, int oc_prio, int oc_affinity) { - struct workitem *witem = NULL; - user_addr_t item = 0; thread_t th_to_run = THREAD_NULL; thread_t th_to_park = THREAD_NULL; int wake_thread = 0; - int reuse_thread = 1; + int reuse_thread = WQ_FLAG_THREAD_REUSE; uint32_t priority, orig_priority; uint32_t affinity_tag, orig_affinity_tag; uint32_t i, n; - uint32_t activecount; uint32_t busycount; uint32_t us_to_wait; struct threadlist *tl = NULL; struct threadlist *ttl = NULL; struct uthread *uth = NULL; - struct workitemlist *wl = NULL; boolean_t start_timer = FALSE; boolean_t adjust_counters = TRUE; uint64_t curtime; - KERNEL_DEBUG(0xefffd000 | DBG_FUNC_START, wq, thread, wq->wq_thidlecount, wq->wq_itemcount, 0); + KERNEL_DEBUG(0xefffd000 | DBG_FUNC_START, wq, thread, wq->wq_thidlecount, wq->wq_reqcount, 0); + + if (thread != THREAD_NULL) { + uth = get_bsdthread_info(thread); + if ( (tl = uth->uu_threadlist) == NULL) + panic("wq thread with no threadlist "); + } /* * from here until we drop the workq lock * we can't be pre-empted since we hold @@ -1504,14 +1533,15 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add * and these values are used to index the multi-dimensional * counter arrays in 'workqueue_callback' */ - if (oc_item) { +dispatch_overcommit: + + if (overcommit == TRUE || force_oc == TRUE) { uint32_t min_scheduled = 0; uint32_t scheduled_count; uint32_t active_count; uint32_t t_affinity = 0; priority = oc_prio; - item = oc_item; if ((affinity_tag = oc_affinity) == (uint32_t)-1) { for (affinity_tag = 0; affinity_tag < wq->wq_reqconc[priority]; affinity_tag++) { @@ -1536,37 +1566,55 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add } affinity_tag = t_affinity; } + if (thread != THREAD_NULL) { + th_to_run = thread; + goto pick_up_work; + } goto grab_idle_thread; } + if (wq->wq_reqcount) { + for (priority = 0; priority < WORKQUEUE_NUMPRIOS; priority++) { + if (wq->wq_requests[priority]) + break; + } + assert(priority < WORKQUEUE_NUMPRIOS); + + if (wq->wq_ocrequests[priority] && (thread != THREAD_NULL || wq->wq_thidlecount)) { + /* + * handle delayed overcommit request... + * they have priority over normal requests + * within a given priority level + */ + wq->wq_reqcount--; + wq->wq_requests[priority]--; + wq->wq_ocrequests[priority]--; + + oc_prio = priority; + overcommit = TRUE; + + goto dispatch_overcommit; + } + } /* * if we get here, the work should be handled by a constrained thread */ - if (wq->wq_itemcount == 0 || wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) { + if (wq->wq_reqcount == 0 || wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) { /* * no work to do, or we're already at or over the scheduling limit for * constrained threads... just return or park the thread... * do not start the timer for this condition... if we don't have any work, * we'll check again when new work arrives... 
if we're over the limit, we need 1 or more - * constrained threads to return to the kernel before we can dispatch work from our queue + * constrained threads to return to the kernel before we can dispatch additional work */ if ((th_to_park = thread) == THREAD_NULL) goto out_of_work; goto parkit; } - for (priority = 0; priority < WORKQUEUE_NUMPRIOS; priority++) { - if (wq->wq_list_bitmap & (1 << priority)) { - wl = (struct workitemlist *)&wq->wq_list[priority]; - break; - } - } - assert(wl != NULL); - assert(!(TAILQ_EMPTY(&wl->wl_itemlist))); curtime = mach_absolute_time(); if (thread != THREAD_NULL) { - uth = get_bsdthread_info(thread); - tl = uth->uu_threadlist; + affinity_tag = tl->th_affinity_tag; /* @@ -1576,6 +1624,10 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add * we're considering running work for */ if (affinity_tag < wq->wq_reqconc[priority]) { + uint32_t bcount = 0; + uint32_t acount = 0; + uint32_t tcount = 0; + /* * we're a worker thread from the pool... currently we * are considered 'active' which means we're counted @@ -1583,56 +1635,84 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add * add up the active counts of all the priority levels * up to and including the one we want to schedule */ - for (activecount = 0, i = 0; i <= priority; i++) { - uint32_t acount; + for (i = 0; i <= priority; i++) { - acount = wq->wq_thactive_count[i][affinity_tag]; + tcount = wq->wq_thactive_count[i][affinity_tag]; + acount += tcount; - if (acount == 0 && wq->wq_thscheduled_count[i][affinity_tag]) { + if (tcount == 0 && wq->wq_thscheduled_count[i][affinity_tag]) { if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i][affinity_tag])) - acount = 1; + bcount++; } - activecount += acount; } - if (activecount == 1) { + if ((acount + bcount) == 1) { /* * we're the only active thread associated with our * affinity group at this priority level and higher, + * and there are no threads considered 'busy', * so pick up some work and keep going */ th_to_run = thread; goto pick_up_work; } + if (wq->wq_reqconc[priority] == 1) { + /* + * we have at least one other active or busy thread running at this + * priority level or higher and since we only have + * 1 affinity group to schedule against, no need + * to try and find another... we can't start up another thread to + * service the request and we already have the info + * needed to determine if we need to start a timer or not + */ + if (acount == 1) { + /* + * we're the only active thread, but we must have found + * at least 1 busy thread, so indicate that we need + * to start a timer + */ + busycount = 1; + } else + busycount = 0; + + affinity_tag = 1; + goto cant_schedule; + } } /* * there's more than 1 thread running in this affinity group * or the concurrency level has been cut back for this priority... - * lets continue on and look for an 'empty' group to run this - * work item in + * let's continue on and look for an 'empty' group to run this + * work request in */ } busycount = 0; for (affinity_tag = 0; affinity_tag < wq->wq_reqconc[priority]; affinity_tag++) { + boolean_t can_schedule; + /* * look for first affinity group that is currently not active * i.e. 
no active threads at this priority level or higher * and no threads that have run recently */ - for (activecount = 0, i = 0; i <= priority; i++) { - if ((activecount = wq->wq_thactive_count[i][affinity_tag])) + for (i = 0; i <= priority; i++) { + can_schedule = FALSE; + + if (wq->wq_thactive_count[i][affinity_tag]) break; - if (wq->wq_thscheduled_count[i][affinity_tag]) { - if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i][affinity_tag])) { - busycount++; - break; - } + if (wq->wq_thscheduled_count[i][affinity_tag] && + wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i][affinity_tag])) { + busycount++; + break; } + can_schedule = TRUE; } - if (activecount == 0 && busycount == 0) + if (can_schedule == TRUE) break; } +cant_schedule: + if (affinity_tag >= wq->wq_reqconc[priority]) { /* * we've already got at least 1 thread per @@ -1644,7 +1724,7 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add * 'busy' state... make sure we start * the timer because if they are the only * threads keeping us from scheduling - * this workitem, we won't get a callback + * this work request, we won't get a callback * to kick off the timer... we need to * start it now... */ @@ -1671,6 +1751,8 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add th_to_run = thread; goto pick_up_work; } + +grab_idle_thread: if (wq->wq_thidlecount == 0) { /* * we don't have a thread to schedule, but we have @@ -1683,14 +1765,12 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add goto no_thread_to_run; } - -grab_idle_thread: /* * we've got a candidate (affinity group with no currently * active threads) to start a new thread on... * we already know there is both work available * and an idle thread, so activate a thread and then - * fall into the code that pulls a new workitem... + * fall into the code that pulls a new work request... */ TAILQ_FOREACH(ttl, &wq->wq_thidlelist, th_entry) { if (ttl->th_affinity_tag == affinity_tag || ttl->th_affinity_tag == (uint16_t)-1) { @@ -1727,18 +1807,9 @@ grab_idle_thread: th_to_run = tl->th_thread; pick_up_work: - if (item == 0) { - witem = TAILQ_FIRST(&wl->wl_itemlist); - TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry); - - if (TAILQ_EMPTY(&wl->wl_itemlist)) - wq->wq_list_bitmap &= ~(1 << priority); - wq->wq_itemcount--; - - item = witem->wi_item; - witem->wi_item = (user_addr_t)0; - witem->wi_affinity = 0; - TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry); + if (overcommit == FALSE && force_oc == FALSE) { + wq->wq_reqcount--; + wq->wq_requests[priority]--; if ( !(tl->th_flags & TH_LIST_CONSTRAINED)) { wq->wq_constrained_threads_scheduled++; @@ -1792,38 +1863,25 @@ pick_up_work: thread_precedence_policy_data_t precedinfo; thread_extended_policy_data_t extinfo; uint32_t policy; +#if CONFIG_EMBEDDED + int retval = 0; + /* sets the saved importance for apple ios daemon if backgrounded. 
else returns 0 */ + retval = proc_setthread_saved_importance(th_to_run, workqueue_importance[priority]); + if (retval == 0) { +#endif /* CONFIG_EMBEDDED */ policy = workqueue_policy[priority]; KERNEL_DEBUG(0xefffd120 | DBG_FUNC_START, wq, orig_priority, tl->th_policy, 0, 0); if ((orig_priority == WORKQUEUE_BG_PRIOQUEUE) || (priority == WORKQUEUE_BG_PRIOQUEUE)) { - struct uthread *ut = NULL; - - ut = get_bsdthread_info(th_to_run); - if (orig_priority == WORKQUEUE_BG_PRIOQUEUE) { /* remove the disk throttle, importance will be reset in anycase */ -#if !CONFIG_EMBEDDED proc_restore_workq_bgthreadpolicy(th_to_run); -#else /* !CONFIG_EMBEDDED */ - if ((ut->uu_flag & UT_BACKGROUND) != 0) { - ut->uu_flag &= ~UT_BACKGROUND; - ut->uu_iopol_disk = IOPOL_NORMAL; - } -#endif /* !CONFIG_EMBEDDED */ } if (priority == WORKQUEUE_BG_PRIOQUEUE) { -#if !CONFIG_EMBEDDED - proc_apply_workq_bgthreadpolicy(th_to_run); -#else /* !CONFIG_EMBEDDED */ - if ((ut->uu_flag & UT_BACKGROUND) == 0) { - /* set diskthrottling */ - ut->uu_flag |= UT_BACKGROUND; - ut->uu_iopol_disk = IOPOL_THROTTLE; - } -#endif /* !CONFIG_EMBEDDED */ + proc_apply_workq_bgthreadpolicy(th_to_run); } } @@ -1839,6 +1897,9 @@ pick_up_work: KERNEL_DEBUG(0xefffd120 | DBG_FUNC_END, wq, priority, policy, 0, 0); +#if CONFIG_EMBEDDED + } +#endif /* CONFIG_EMBEDDED */ } if (kdebug_enable) { int lpri = -1; @@ -1866,11 +1927,11 @@ pick_up_work: } } /* - * if current thread is reused for workitem, does not return via unix_syscall + * if current thread is reused for work request, does not return via unix_syscall */ - wq_runitem(p, item, th_to_run, tl, reuse_thread, wake_thread, (thread == th_to_run)); + wq_runreq(p, overcommit, priority, th_to_run, tl, reuse_thread, wake_thread, (thread == th_to_run)); - KERNEL_DEBUG(0xefffd000 | DBG_FUNC_END, wq, thread_tid(th_to_run), item, 1, 0); + KERNEL_DEBUG(0xefffd000 | DBG_FUNC_END, wq, thread_tid(th_to_run), overcommit, 1, 0); return (TRUE); @@ -1894,11 +1955,6 @@ parkit: * this is a workqueue thread with no more * work to do... park it for now */ - uth = get_bsdthread_info(th_to_park); - tl = uth->uu_threadlist; - if (tl == 0) - panic("wq thread with no threadlist "); - TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry); tl->th_flags &= ~TH_LIST_RUNNING; @@ -2032,7 +2088,7 @@ wq_unpark_continue(void) if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) { /* * a normal wakeup of this thread occurred... 
no need - * for any synchronization with the timer and wq_runitem + * for any synchronization with the timer and wq_runreq */ normal_return_to_user: thread_sched_call(th_to_unpark, workqueue_callback); @@ -2088,7 +2144,7 @@ normal_return_to_user: static void -wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl, +wq_runreq(proc_t p, boolean_t overcommit, uint32_t priority, thread_t th, struct threadlist *tl, int reuse_thread, int wake_thread, int return_directly) { int ret = 0; @@ -2096,7 +2152,7 @@ wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl, KERNEL_DEBUG1(0xefffd004 | DBG_FUNC_START, tl->th_workq, tl->th_priority, tl->th_affinity_tag, thread_tid(current_thread()), thread_tid(th)); - ret = setup_wqthread(p, th, item, reuse_thread, tl); + ret = setup_wqthread(p, th, overcommit, priority, reuse_thread, tl); if (ret != 0) panic("setup_wqthread failed %x\n", ret); @@ -2106,7 +2162,7 @@ wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl, thread_exception_return(); - panic("wq_runitem: thread_exception_return returned ...\n"); + panic("wq_runreq: thread_exception_return returned ...\n"); } if (wake_thread) { workqueue_lock_spin(p); @@ -2141,8 +2197,15 @@ wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl, int -setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl) +setup_wqthread(proc_t p, thread_t th, boolean_t overcommit, uint32_t priority, int reuse_thread, struct threadlist *tl) { + uint32_t flags = reuse_thread | WQ_FLAG_THREAD_NEWSPI; + + if (overcommit == TRUE) + flags |= WQ_FLAG_THREAD_OVERCOMMIT; + + flags |= priority; + #if defined(__i386__) || defined(__x86_64__) int isLP64 = 0; @@ -2158,16 +2221,14 @@ setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct ts->eax = (unsigned int)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE); ts->ebx = (unsigned int)tl->th_thport; ts->ecx = (unsigned int)(tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE); - ts->edx = (unsigned int)item; - ts->edi = (unsigned int)reuse_thread; + ts->edx = (unsigned int)0; + ts->edi = (unsigned int)flags; ts->esi = (unsigned int)0; /* * set stack pointer */ ts->esp = (int)((vm_offset_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE) - C_32_STK_ALIGN)); - if ((reuse_thread != 0) && (ts->eax == (unsigned int)0)) - panic("setup_wqthread: setting reuse thread with null pthread\n"); thread_set_wq_state32(th, (thread_state_t)ts); } else { @@ -2178,8 +2239,8 @@ setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct ts64->rdi = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE); ts64->rsi = (uint64_t)(tl->th_thport); ts64->rdx = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE); - ts64->rcx = (uint64_t)item; - ts64->r8 = (uint64_t)reuse_thread; + ts64->rcx = (uint64_t)0; + ts64->r8 = (uint64_t)flags; ts64->r9 = (uint64_t)0; /* @@ -2187,8 +2248,6 @@ setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct */ ts64->rsp = (uint64_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE) - C_64_REDZONE_LEN); - if ((reuse_thread != 0) && (ts64->rdi == (uint64_t)0)) - panic("setup_wqthread: setting reuse thread with null pthread\n"); thread_set_wq_state64(th, (thread_state_t)ts64); } #else diff --git a/bsd/kern/socket_info.c b/bsd/kern/socket_info.c index 9d489e122..ffbaaf456 100644 --- a/bsd/kern/socket_info.c +++ b/bsd/kern/socket_info.c @@ -1,5 
+1,5 @@ /* - * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2005-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -67,12 +68,12 @@ fill_sockbuf_info(struct sockbuf *sb, struct sockbuf_info *sbi) static void fill_common_sockinfo(struct socket *so, struct socket_info *si) { - si->soi_so = (u_int64_t)((uintptr_t)so); + si->soi_so = (u_int64_t)VM_KERNEL_ADDRPERM(so); si->soi_type = so->so_type; - si->soi_options = so->so_options; + si->soi_options = (short)(so->so_options & 0xffff); si->soi_linger = so->so_linger; si->soi_state = so->so_state; - si->soi_pcb = (u_int64_t)((uintptr_t)so->so_pcb); + si->soi_pcb = (u_int64_t)VM_KERNEL_ADDRPERM(so->so_pcb); if (so->so_proto) { si->soi_protocol = so->so_proto->pr_protocol; if (so->so_proto->pr_domain) @@ -148,7 +149,8 @@ fill_socketinfo(struct socket *so, struct socket_info *si) tcpsi->tcpsi_timer[TCPT_2MSL] = tp->t_timer[TCPT_2MSL]; tcpsi->tcpsi_mss = tp->t_maxseg; tcpsi->tcpsi_flags = tp->t_flags; - tcpsi->tcpsi_tp = (u_int64_t)((uintptr_t)tp); + tcpsi->tcpsi_tp = + (u_int64_t)VM_KERNEL_ADDRPERM(tp); } break; } @@ -158,10 +160,11 @@ fill_socketinfo(struct socket *so, struct socket_info *si) si->soi_kind = SOCKINFO_UN; - unsi->unsi_conn_pcb = (uint64_t)((uintptr_t)unp->unp_conn); + unsi->unsi_conn_pcb = + (uint64_t)VM_KERNEL_ADDRPERM(unp->unp_conn); if (unp->unp_conn) - unsi->unsi_conn_so = (uint64_t)((uintptr_t)unp->unp_conn->unp_socket); - + unsi->unsi_conn_so = (uint64_t) + VM_KERNEL_ADDRPERM(unp->unp_conn->unp_socket); if (unp->unp_addr) { size_t addrlen = unp->unp_addr->sun_len; diff --git a/bsd/kern/subr_prf.c b/bsd/kern/subr_prf.c index 45bddb431..3d4e18b60 100644 --- a/bsd/kern/subr_prf.c +++ b/bsd/kern/subr_prf.c @@ -446,6 +446,8 @@ vprintf(const char *fmt, va_list ap) return 0; } +#if !CONFIG_EMBEDDED + /* * Scaled down version of vsprintf(3). * @@ -467,6 +469,7 @@ vsprintf(char *buf, const char *cfmt, va_list ap) } return 0; } +#endif /* !CONFIG_EMBEDDED */ /* * Scaled down version of snprintf(3). diff --git a/bsd/kern/subr_prof.c b/bsd/kern/subr_prof.c index 4d07853d9..80b6edc27 100644 --- a/bsd/kern/subr_prof.c +++ b/bsd/kern/subr_prof.c @@ -343,30 +343,6 @@ overflow: #define PROFILE_UNLOCK(x) -int -profil(struct proc *p, struct profil_args *uap, int32_t *retval) -{ - void *tmp; - - tmp = p; - tmp = uap; - tmp = retval; - - return EINVAL; -} - -int -add_profil(struct proc *p, struct add_profil_args *uap, int32_t *retval) -{ - void *tmp; - - tmp = p; - tmp = uap; - tmp = retval; - - return EINVAL; -} - /* * Scale is a fixed-point number with the binary point 16 bits * into the value, and is <= 1.0. pc is at most 32 bits, so the diff --git a/bsd/kern/sys_generic.c b/bsd/kern/sys_generic.c index bacd02b79..cc950bd84 100644 --- a/bsd/kern/sys_generic.c +++ b/bsd/kern/sys_generic.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -93,6 +93,7 @@ #include #include #include +#include #include #include @@ -100,6 +101,8 @@ #include #include #include +#include +#include #include #include @@ -1567,6 +1570,7 @@ poll_callback(__unused struct kqueue *kq, struct kevent64_s *kevp, void *data) { struct poll_continue_args *cont = (struct poll_continue_args *)data; struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata); + short prev_revents = fds->revents; short mask; /* convert the results back into revents */ @@ -1606,7 +1610,7 @@ poll_callback(__unused struct kqueue *kq, struct kevent64_s *kevp, void *data) break; } - if (fds->revents) + if (fds->revents != 0 && prev_revents == 0) cont->pca_rfds++; return 0; @@ -2044,14 +2048,14 @@ postpipeevent(struct pipe *pipep, int event) evq->ee_req.er_rcnt = pipep->pipe_buffer.cnt; } if ((evq->ee_eventmask & EV_WR) && - (pipep->pipe_buffer.size - pipep->pipe_buffer.cnt) >= PIPE_BUF) { + (MAX(pipep->pipe_buffer.size,PIPE_SIZE) - pipep->pipe_buffer.cnt) >= PIPE_BUF) { if (pipep->pipe_state & PIPE_EOF) { mask |= EV_WR|EV_RESET; break; } mask |= EV_WR; - evq->ee_req.er_wcnt = pipep->pipe_buffer.size - pipep->pipe_buffer.cnt; + evq->ee_req.er_wcnt = MAX(pipep->pipe_buffer.size, PIPE_SIZE) - pipep->pipe_buffer.cnt; } break; @@ -2819,3 +2823,111 @@ gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retv return (error); } + +/* + * ledger + * + * Description: Omnibus system call for ledger operations + */ +int +ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval) +{ + int rval, pid, len, error; +#ifdef LEDGER_DEBUG + struct ledger_limit_args lla; +#endif + task_t task; + proc_t proc; + + /* Finish copying in the necessary args before taking the proc lock */ + error = 0; + len = 0; + if (args->cmd == LEDGER_ENTRY_INFO) + error = copyin(args->arg3, (char *)&len, sizeof (len)); + else if (args->cmd == LEDGER_TEMPLATE_INFO) + error = copyin(args->arg2, (char *)&len, sizeof (len)); +#ifdef LEDGER_DEBUG + else if (args->cmd == LEDGER_LIMIT) + error = copyin(args->arg2, (char *)&lla, sizeof (lla)); +#endif + if (error) + return (error); + if (len < 0) + return (EINVAL); + + rval = 0; + if (args->cmd != LEDGER_TEMPLATE_INFO) { + pid = args->arg1; + proc = proc_find(pid); + if (proc == NULL) + return (ESRCH); + +#if CONFIG_MACF + error = mac_proc_check_ledger(p, proc, args->cmd); + if (error) { + proc_rele(proc); + return (error); + } +#endif + + task = proc->task; + } + + switch (args->cmd) { +#ifdef LEDGER_DEBUG + case LEDGER_LIMIT: { + if (!is_suser()) + rval = EPERM; + rval = ledger_limit(task, &lla); + proc_rele(proc); + break; + } +#endif + case LEDGER_INFO: { + struct ledger_info info; + + rval = ledger_info(task, &info); + proc_rele(proc); + if (rval == 0) + rval = copyout(&info, args->arg2, + sizeof (info)); + break; + } + + case LEDGER_ENTRY_INFO: { + void *buf; + int sz; + + rval = ledger_entry_info(task, &buf, &len); + proc_rele(proc); + if ((rval == 0) && (len > 0)) { + sz = len * sizeof (struct ledger_entry_info); + rval = copyout(buf, args->arg2, sz); + kfree(buf, sz); + } + if (rval == 0) + rval = copyout(&len, args->arg3, sizeof (len)); + break; + } + + case LEDGER_TEMPLATE_INFO: { + void *buf; + int sz; + + rval = ledger_template_info(&buf, &len); + if ((rval == 0) && (len > 0)) { + sz = len * sizeof (struct ledger_template_info); + rval = copyout(buf, args->arg1, sz); + kfree(buf, sz); + } + if (rval == 0) + rval = copyout(&len, args->arg2, sizeof (len)); + break; + } + + default: + 
rval = EINVAL; + } + + return (rval); +} diff --git a/bsd/kern/sys_pipe.c b/bsd/kern/sys_pipe.c index 27f2461b4..9aa8ac04c 100644 --- a/bsd/kern/sys_pipe.c +++ b/bsd/kern/sys_pipe.c @@ -55,46 +55,65 @@ * pipes scheme originally used in FreeBSD/4.4Lite. It does not support * all features of sockets, but does do everything that pipes normally * do. + * + * Pipes are implemented as circular buffers. The following are the valid states in pipe operations + * + * _________________________________ + * 1. |_________________________________| r=w, c=0 + * + * _________________________________ + * 2. |__r:::::wc_______________________| r <= w , c > 0 + * + * _________________________________ + * 3. |::::wc_____r:::::::::::::::::::::| r>w , c > 0 + * + * _________________________________ + * 4. |:::::::wrc:::::::::::::::::::::::| w=r, c = Max size + * + * + * Nomenclature: + * a-z define the steps in a program flow + * 1-4 are the states as defined above + * Action: the file operation performed on the pipe + * + * Current:None Action: initialize with size M=200 + * a. State 1 ( r=0, w=0, c=0) + * + * Current: a Action: write(100) (w < M) + * b. State 2 (r=0, w=100, c=100) + * + * Current: b Action: write(100) (w = M-w) + * c. State 4 (r=0,w=0,c=200) + * + * Current: b Action: read(70) ( r < c ) + * d. State 2 (r=70,w=100,c=30) + * + * Current: d Action: write(75) ( w < (M-w)) + * e. State 2 (r=70,w=175,c=105) + * + * Current: d Action: write(110) ( w > (M-w)) + * f. State 3 (r=70,w=10,c=140) + * + * Current: d Action: read(30) (r >= c ) + * g. State 1 (r=100,w=100,c=0) + * */ /* - * This code has two modes of operation, a small write mode and a large - * write mode. The small write mode acts like conventional pipes with - * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the - * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT - * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and - * the receiving process can copy it directly from the pages in the sending - * process. - * - * If the sending process receives a signal, it is possible that it will - * go away, and certainly its address space can change, because control - * is returned back to the user-mode side. In that case, the pipe code - * arranges to copy the buffer supplied by the user process, to a pageable - * kernel buffer, and the receiving process will grab the data from the - * pageable kernel buffer. Since signals don't happen all that often, - * the copy operation is normally eliminated. - * - * The constant PIPE_MINDIRECT is chosen to make sure that buffering will - * happen for small transfers so that the system will not spend all of - * its time context switching. + * This code creates half-duplex pipe buffers to facilitate file-like + * operations on pipes. The initial buffer is very small, but this can + * dynamically change to larger sizes based on usage. The buffer size is never + * reduced. The total amount of kernel memory used is governed by maxpipekva. + * If the dynamic expansion limit is reached, the writing thread is blocked + * until the pipe buffer empties enough to continue. * * In order to limit the resource use of pipes, two sysctls exist: * * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable - * address space available to us in pipe_map. Whenever the amount in use - * exceeds half of this value, all new pipes will be created with size - * SMALL_PIPE_SIZE, rather than PIPE_SIZE. Big pipe creation will be limited - * as well.
This value is loader tunable only. - * - * kern.ipc.maxpipekvawired - This value limits the amount of memory that may - * be wired in order to facilitate direct copies using page flipping. - * Whenever this value is exceeded, pipes will fall back to using regular - * copies. This value is sysctl controllable at all times. - * - * These values are autotuned in subr_param.c. + * address space available to us in pipe_map. * * Memory usage may be monitored through the sysctls - * kern.ipc.pipes, kern.ipc.pipekva and kern.ipc.pipekvawired. + * kern.ipc.pipes, kern.ipc.pipekva. * */ @@ -124,6 +143,7 @@ #include #include +#include #include #include @@ -134,53 +154,23 @@ #define f_ops f_fglob->fg_ops #define f_offset f_fglob->fg_offset #define f_data f_fglob->fg_data -/* - * Use this define if you want to disable *fancy* VM things. Expect an - * approx 30% decrease in transfer rate. This could be useful for - * NetBSD or OpenBSD. - * - * this needs to be ported to X and the performance measured - * before committing to supporting it - */ -#define PIPE_NODIRECT 1 - -#ifndef PIPE_NODIRECT - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#endif /* - * interfaces to the outside world + * interfaces to the outside world exported through file operations */ static int pipe_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx); - static int pipe_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx); - static int pipe_close(struct fileglob *fg, vfs_context_t ctx); - static int pipe_select(struct fileproc *fp, int which, void * wql, vfs_context_t ctx); - static int pipe_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx); - static int pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data, vfs_context_t ctx); - static int pipe_drain(struct fileproc *fp,vfs_context_t ctx); - struct fileops pipeops = { pipe_read, pipe_write, @@ -190,7 +180,6 @@ struct fileops pipeops = pipe_kqfilter, pipe_drain }; - static void filt_pipedetach(struct knote *kn); static int filt_piperead(struct knote *kn, long hint); static int filt_pipewrite(struct knote *kn, long hint); @@ -200,33 +189,18 @@ static struct filterops pipe_rfiltops = { .f_detach = filt_pipedetach, .f_event = filt_piperead, }; + static struct filterops pipe_wfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_pipewrite, }; -/* - * Default pipe buffer size(s), this can be kind-of large now because pipe - * space is pageable. The pipe code will try to maintain locality of - * reference for performance reasons, so small amounts of outstanding I/O - * will not wipe the cache. - */ -#define MINPIPESIZE (PIPE_SIZE/3) +static int nbigpipe; /* for compatibility sake. no longer used */ +static int amountpipes; /* total number of pipes in system */ +static int amountpipekva; /* total memory used by pipes */ -/* - * Limit the number of "big" pipes - */ -#define LIMITBIGPIPES 32 -static int nbigpipe; - -static int amountpipes; -static int amountpipekva; - -#ifndef PIPE_NODIRECT -static int amountpipekvawired; -#endif -int maxpipekva = 1024 * 1024 * 16; +int maxpipekva = PIPE_KVAMAX; /* allowing 16MB max. 
*/ #if PIPE_SYSCTLS SYSCTL_DECL(_kern_ipc); @@ -248,29 +222,24 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD|CTLFLAG_LOCKED, static void pipeclose(struct pipe *cpipe); static void pipe_free_kmem(struct pipe *cpipe); static int pipe_create(struct pipe **cpipep); +static int pipespace(struct pipe *cpipe, int size); +static int choose_pipespace(unsigned long current, unsigned long expected); +static int expand_pipespace(struct pipe *p, int target_size); static void pipeselwakeup(struct pipe *cpipe, struct pipe *spipe); -static __inline int pipelock(struct pipe *cpipe, int catch); -static __inline void pipeunlock(struct pipe *cpipe); - -#ifndef PIPE_NODIRECT -static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); -static void pipe_destroy_write_buffer(struct pipe *wpipe); -static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); -static void pipe_clone_write_buffer(struct pipe *wpipe); -#endif +static __inline int pipeio_lock(struct pipe *cpipe, int catch); +static __inline void pipeio_unlock(struct pipe *cpipe); extern int postpipeevent(struct pipe *, int); extern void evpipefree(struct pipe *cpipe); - -static int pipespace(struct pipe *cpipe, int size); - static lck_grp_t *pipe_mtx_grp; static lck_attr_t *pipe_mtx_attr; static lck_grp_attr_t *pipe_mtx_grp_attr; static zone_t pipe_zone; +#define MAX_PIPESIZE(pipe) ( MAX(PIPE_SIZE, (pipe)->pipe_buffer.size) ) + #define PIPE_GARBAGE_AGE_LIMIT 5000 /* In milliseconds */ #define PIPE_GARBAGE_QUEUE_LIMIT 32000 @@ -286,26 +255,26 @@ static struct pipe_garbage *pipe_garbage_tail = NULL; static uint64_t pipe_garbage_age_limit = PIPE_GARBAGE_AGE_LIMIT; static int pipe_garbage_count = 0; static lck_mtx_t *pipe_garbage_lock; +static void pipe_garbage_collect(struct pipe *cpipe); SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); +/* initial setup done at time of sysinit */ void pipeinit(void) { + nbigpipe=0; vm_size_t zone_size; - + zone_size = 8192 * sizeof(struct pipe); pipe_zone = zinit(sizeof(struct pipe), zone_size, 4096, "pipe zone"); - /* - * allocate lock group attribute and group for pipe mutexes - */ + + /* allocate lock group attribute and group for pipe mutexes */ pipe_mtx_grp_attr = lck_grp_attr_alloc_init(); pipe_mtx_grp = lck_grp_alloc_init("pipe", pipe_mtx_grp_attr); - /* - * allocate the lock attribute for pipe mutexes - */ + /* allocate the lock attribute for pipe mutexes */ pipe_mtx_attr = lck_attr_alloc_init(); /* @@ -316,6 +285,7 @@ pipeinit(void) pipe_garbage_zone = (zone_t)zinit(sizeof(struct pipe_garbage), zone_size, 4096, "pipe garbage zone"); pipe_garbage_lock = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr); + } /* Bitmap for things to touch in pipe_touch() */ @@ -346,10 +316,80 @@ pipe_touch(struct pipe *tpipe, int touch) } } +static const unsigned int pipesize_blocks[] = {128,256,1024,2048,PAGE_SIZE, PAGE_SIZE * 2, PIPE_SIZE , PIPE_SIZE * 4 }; + +/* + * finds the right size from possible sizes in pipesize_blocks + * returns the size which matches max(current,expected) + */ +static int +choose_pipespace(unsigned long current, unsigned long expected) +{ + int i = sizeof(pipesize_blocks)/sizeof(unsigned int) -1; + unsigned long target; + + if (expected > current) + target = expected; + else + target = current; + + while ( i >0 && pipesize_blocks[i-1] > target) { + i=i-1; + + } + + return pipesize_blocks[i]; +} +/* + * expand the size of pipe while there is data to be read, + * and then free the old buffer once the current buffered + * data has been transferred to new storage. 
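+ * For example (illustrative numbers, not taken from the code): growing a
+ * 4096-byte buffer that is in State 3 with out=3000, in=1000, cnt=2096
+ * to 8192 bytes copies the old contents into the new buffer twice, back
+ * to back, and advances `in' by the old size to 5096; the reader then
+ * drains bytes 3000..5095 of the new buffer linearly. The target size
+ * itself comes from choose_pipespace() above, which would pick 8192 as
+ * the smallest block that fits a request of, say, 6144 bytes.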
+ * Required: PIPE_LOCK and io lock to be held by caller. + * returns 0 on success or when no expansion is possible + */ +static int +expand_pipespace(struct pipe *p, int target_size) +{ + struct pipe tmp, oldpipe; + int error; + tmp.pipe_buffer.buffer = 0; + + if (p->pipe_buffer.size >= (unsigned) target_size) { + return 0; /* the existing buffer is max size possible */ + } + + /* create enough space in the target */ + error = pipespace(&tmp, target_size); + if (error != 0) + return (error); + + oldpipe.pipe_buffer.buffer = p->pipe_buffer.buffer; + oldpipe.pipe_buffer.size = p->pipe_buffer.size; + + memcpy(tmp.pipe_buffer.buffer, p->pipe_buffer.buffer, p->pipe_buffer.size); + if (p->pipe_buffer.cnt > 0 && p->pipe_buffer.in <= p->pipe_buffer.out ){ + /* we are in State 3 and need extra copying for reads to stay consistent */ + memcpy(&tmp.pipe_buffer.buffer[p->pipe_buffer.size], p->pipe_buffer.buffer, p->pipe_buffer.size); + p->pipe_buffer.in += p->pipe_buffer.size; + } + + p->pipe_buffer.buffer = tmp.pipe_buffer.buffer; + p->pipe_buffer.size = tmp.pipe_buffer.size; + + + pipe_free_kmem(&oldpipe); + return 0; +} + /* * The pipe system call for the DTYPE_PIPE type of pipes + * + * returns: + * FREAD | fd0 | -->[struct rpipe] --> |~~buffer~~| \ + * (pipe_mutex) + * FWRITE | fd1 | -->[struct wpipe] --X / */ /* ARGSUSED */ @@ -372,22 +412,12 @@ pipe(proc_t p, __unused struct pipe_args *uap, int32_t *retval) /* * allocate the space for the normal I/O direction up * front... we'll delay the allocation for the other - * direction until a write actually occurs (most - * likely it won't)... - * - * Reduce to 1/4th pipe size if we're over our global max. + * direction until a write actually occurs (most likely it won't)... */ - if (amountpipekva > maxpipekva / 2) - error = pipespace(rpipe, SMALL_PIPE_SIZE); - else - error = pipespace(rpipe, PIPE_SIZE); + error = pipespace(rpipe, choose_pipespace(rpipe->pipe_buffer.size, 0)); if (error) goto freepipes; -#ifndef PIPE_NODIRECT - rpipe->pipe_state |= PIPE_DIRECTOK; - wpipe->pipe_state |= PIPE_DIRECTOK; -#endif TAILQ_INIT(&rpipe->pipe_evlist); TAILQ_INIT(&wpipe->pipe_evlist); @@ -398,9 +428,8 @@ pipe(proc_t p, __unused struct pipe_args *uap, int32_t *retval) retval[0] = fd; /* - * for now we'll create half-duplex - * pipes... this is what we've always - * supported.. + * for now we'll create half-duplex pipes (see the returns diagram above). + * this is what we've always supported.
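+ * A sketch of the resulting userspace contract (error handling elided;
+ * the fd indices follow from the FREAD/FWRITE setup just below):
+ *
+ *	int fd[2];
+ *	pipe(fd);
+ *	write(fd[1], "x", 1);	/* fd[1] is the write end */
+ *	char c;
+ *	read(fd[0], &c, 1);	/* fd[0] is the read end */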
*/ rf->f_flag = FREAD; rf->f_type = DTYPE_PIPE; @@ -419,7 +448,8 @@ pipe(proc_t p, __unused struct pipe_args *uap, int32_t *retval) rpipe->pipe_peer = wpipe; wpipe->pipe_peer = rpipe; - rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx; + /* both structures share the same mutex */ + rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx; retval[1] = fd; #if CONFIG_MACF @@ -476,20 +506,16 @@ pipe_stat(struct pipe *cpipe, void *ub, int isstat64) } #endif if (cpipe->pipe_buffer.buffer == 0) { - /* - * must be stat'ing the write fd - */ + /* must be stat'ing the write fd */ if (cpipe->pipe_peer) { - /* - * the peer still exists, use its info - */ - pipe_size = cpipe->pipe_peer->pipe_buffer.size; + /* the peer still exists, use its info */ + pipe_size = MAX_PIPESIZE(cpipe->pipe_peer); pipe_count = cpipe->pipe_peer->pipe_buffer.cnt; } else { pipe_count = 0; } } else { - pipe_size = cpipe->pipe_buffer.size; + pipe_size = MAX_PIPESIZE(cpipe); pipe_count = cpipe->pipe_buffer.cnt; } /* @@ -497,7 +523,7 @@ pipe_stat(struct pipe *cpipe, void *ub, int isstat64) * we might catch it in transient state */ if (pipe_size == 0) - pipe_size = PIPE_SIZE; + pipe_size = MAX(PIPE_SIZE, pipesize_blocks[0]); if (isstat64 != 0) { sb64 = (struct stat64 *)ub; @@ -525,7 +551,7 @@ * address of this pipe's struct pipe. This number may be recycled * relatively quickly. */ - sb64->st_ino = (ino64_t)((uintptr_t)cpipe); + sb64->st_ino = (ino64_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe); } else { sb = (struct stat *)ub; @@ -552,7 +578,7 @@ * address of this pipe's struct pipe. This number may be recycled * relatively quickly. */ - sb->st_ino = (ino_t)(uintptr_t)cpipe; + sb->st_ino = (ino_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe); } PIPE_UNLOCK(cpipe); @@ -579,10 +605,11 @@ pipespace(struct pipe *cpipe, int size) { vm_offset_t buffer; - size = round_page(size); + if (size <= 0) + return(EINVAL); - if (kmem_alloc(kernel_map, &buffer, size) != KERN_SUCCESS) - return(ENOMEM); + if ((buffer = (vm_offset_t)kalloc(size)) == 0 ) + return(ENOMEM); /* free old resources if we're resizing */ pipe_free_kmem(cpipe); @@ -605,7 +632,6 @@ static int pipe_create(struct pipe **cpipep) { struct pipe *cpipe; - cpipe = (struct pipe *)zalloc(pipe_zone); if ((*cpipep = cpipe) == NULL) @@ -619,7 +645,6 @@ pipe_create(struct pipe **cpipep) /* Initial times are all the time of creation of the pipe */ pipe_touch(cpipe, PIPE_ATIME | PIPE_MTIME | PIPE_CTIME); - return (0); } @@ -628,20 +653,17 @@ * lock a pipe for I/O, blocking other access */ static inline int -pipelock(struct pipe *cpipe, int catch) +pipeio_lock(struct pipe *cpipe, int catch) { int error; - while (cpipe->pipe_state & PIPE_LOCKFL) { cpipe->pipe_state |= PIPE_LWANT; - error = msleep(cpipe, PIPE_MTX(cpipe), catch ? (PRIBIO | PCATCH) : PRIBIO, "pipelk", 0); if (error != 0) return (error); } cpipe->pipe_state |= PIPE_LOCKFL; - return (0); } @@ -649,16 +671,18 @@ * unlock a pipe I/O lock */ static inline void -pipeunlock(struct pipe *cpipe) +pipeio_unlock(struct pipe *cpipe) { cpipe->pipe_state &= ~PIPE_LOCKFL; - if (cpipe->pipe_state & PIPE_LWANT) { cpipe->pipe_state &= ~PIPE_LWANT; wakeup(cpipe); } } +/* + * wake up anyone who's blocked in select + */ static void pipeselwakeup(struct pipe *cpipe, struct pipe *spipe) { @@ -679,6 +703,10 @@ } } +/* + * Read n bytes from the buffer.
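+ * (A wrapped buffer is drained one segment per loop pass: with, say,
+ * size=200, out=150, cnt=100, the first pass copies bytes 150..199 and
+ * wraps out to 0, and the next pass copies bytes 0..49.)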
Semantics are similar to file read. + * returns: number of bytes read from the buffer + */ /* ARGSUSED */ static int pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags, @@ -692,7 +720,7 @@ pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags, PIPE_LOCK(rpipe); ++rpipe->pipe_busy; - error = pipelock(rpipe, 1); + error = pipeio_lock(rpipe, 1); if (error) goto unlocked_error; @@ -702,11 +730,17 @@ pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags, goto locked_error; #endif + while (uio_resid(uio)) { /* * normal pipe buffer receive */ if (rpipe->pipe_buffer.cnt > 0) { + /* + * # bytes to read is min( bytes from read pointer until end of buffer, + * total unread bytes, + * user requested byte count) + */ size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; if (size > rpipe->pipe_buffer.cnt) size = rpipe->pipe_buffer.cnt; @@ -714,7 +748,7 @@ pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags, if (size > (u_int) uio_resid(uio)) size = (u_int) uio_resid(uio); - PIPE_UNLOCK(rpipe); + PIPE_UNLOCK(rpipe); /* we still hold io lock.*/ error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], size, uio); @@ -727,7 +761,7 @@ pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags, rpipe->pipe_buffer.out = 0; rpipe->pipe_buffer.cnt -= size; - + /* * If there is no more to read in the pipe, reset * its pointers to the beginning. This improves @@ -738,32 +772,6 @@ pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags, rpipe->pipe_buffer.out = 0; } nread += size; -#ifndef PIPE_NODIRECT - /* - * Direct copy, bypassing a kernel buffer. - */ - } else if ((size = rpipe->pipe_map.cnt) && - (rpipe->pipe_state & PIPE_DIRECTW)) { - caddr_t va; - // LP64todo - fix this! - if (size > (u_int) uio_resid(uio)) - size = (u_int) uio_resid(uio); - - va = (caddr_t) rpipe->pipe_map.kva + - rpipe->pipe_map.pos; - PIPE_UNLOCK(rpipe); - error = uiomove(va, size, uio); - PIPE_LOCK(rpipe); - if (error) - break; - nread += size; - rpipe->pipe_map.pos += size; - rpipe->pipe_map.cnt -= size; - if (rpipe->pipe_map.cnt == 0) { - rpipe->pipe_state &= ~PIPE_DIRECTW; - wakeup(rpipe); - } -#endif } else { /* * detect EOF condition @@ -782,7 +790,7 @@ pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags, } /* - * Break if some data was read. + * Break if some data was read in previous iteration. */ if (nread > 0) break; @@ -792,7 +800,7 @@ pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags, * We will either break out with an error or we will * sleep and relock to loop. */ - pipeunlock(rpipe); + pipeio_unlock(rpipe); /* * Handle non-blocking mode operation or @@ -802,11 +810,9 @@ pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags, error = EAGAIN; } else { rpipe->pipe_state |= PIPE_WANTR; - error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0); - if (error == 0) - error = pipelock(rpipe, 1); + error = pipeio_lock(rpipe, 1); } if (error) goto unlocked_error; @@ -815,7 +821,7 @@ pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags, #if CONFIG_MACF locked_error: #endif - pipeunlock(rpipe); + pipeio_unlock(rpipe); unlocked_error: --rpipe->pipe_busy; @@ -826,7 +832,7 @@ unlocked_error: if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); wakeup(rpipe); - } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { + } else if (rpipe->pipe_buffer.cnt < rpipe->pipe_buffer.size) { /* * Handle write blocking hysteresis. 
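 * i.e. a writer that blocked waiting for buffer space is woken as soon
 * as the reader has drained the buffer below its full capacity, rather
 * than waiting for it to empty completely.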
*/ @@ -836,7 +842,7 @@ unlocked_error: } } - if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) + if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) > 0) pipeselwakeup(rpipe, rpipe->pipe_peer); /* update last read time */ @@ -847,250 +853,10 @@ unlocked_error: return (error); } - - -#ifndef PIPE_NODIRECT /* - * Map the sending processes' buffer into kernel space and wire it. - * This is similar to a physical write operation. + * perform a write of n bytes into the read side of buffer. Since + * pipes are unidirectional a write is meant to be read by the otherside only. */ -static int -pipe_build_write_buffer(wpipe, uio) - struct pipe *wpipe; - struct uio *uio; -{ - pmap_t pmap; - u_int size; - int i, j; - vm_offset_t addr, endaddr; - - - size = (u_int) uio->uio_iov->iov_len; - if (size > wpipe->pipe_buffer.size) - size = wpipe->pipe_buffer.size; - - pmap = vmspace_pmap(curproc->p_vmspace); - endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size); - addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base); - for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) { - /* - * vm_fault_quick() can sleep. Consequently, - * vm_page_lock_queue() and vm_page_unlock_queue() - * should not be performed outside of this loop. - */ - race: - if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) { - vm_page_lock_queues(); - for (j = 0; j < i; j++) - vm_page_unhold(wpipe->pipe_map.ms[j]); - vm_page_unlock_queues(); - return (EFAULT); - } - wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr, - VM_PROT_READ); - if (wpipe->pipe_map.ms[i] == NULL) - goto race; - } - -/* - * set up the control block - */ - wpipe->pipe_map.npages = i; - wpipe->pipe_map.pos = - ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; - wpipe->pipe_map.cnt = size; - -/* - * and map the buffer - */ - if (wpipe->pipe_map.kva == 0) { - /* - * We need to allocate space for an extra page because the - * address range might (will) span pages at times. - */ - wpipe->pipe_map.kva = kmem_alloc_nofault(kernel_map, - wpipe->pipe_buffer.size + PAGE_SIZE); - atomic_add_int(&amountpipekvawired, - wpipe->pipe_buffer.size + PAGE_SIZE); - } - pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms, - wpipe->pipe_map.npages); - -/* - * and update the uio data - */ - - uio->uio_iov->iov_len -= size; - uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; - if (uio->uio_iov->iov_len == 0) - uio->uio_iov++; - uio_setresid(uio, (uio_resid(uio) - size)); - uio->uio_offset += size; - return (0); -} - -/* - * unmap and unwire the process buffer - */ -static void -pipe_destroy_write_buffer(wpipe) - struct pipe *wpipe; -{ - int i; - - if (wpipe->pipe_map.kva) { - pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages); - - if (amountpipekvawired > maxpipekvawired / 2) { - /* Conserve address space */ - vm_offset_t kva = wpipe->pipe_map.kva; - wpipe->pipe_map.kva = 0; - kmem_free(kernel_map, kva, - wpipe->pipe_buffer.size + PAGE_SIZE); - atomic_subtract_int(&amountpipekvawired, - wpipe->pipe_buffer.size + PAGE_SIZE); - } - } - vm_page_lock_queues(); - for (i = 0; i < wpipe->pipe_map.npages; i++) { - vm_page_unhold(wpipe->pipe_map.ms[i]); - } - vm_page_unlock_queues(); - wpipe->pipe_map.npages = 0; -} - -/* - * In the case of a signal, the writing process might go away. This - * code copies the data into the circular buffer so that the source - * pages can be freed without loss of data. 
- */ -static void -pipe_clone_write_buffer(wpipe) - struct pipe *wpipe; -{ - int size; - int pos; - - size = wpipe->pipe_map.cnt; - pos = wpipe->pipe_map.pos; - - wpipe->pipe_buffer.in = size; - wpipe->pipe_buffer.out = 0; - wpipe->pipe_buffer.cnt = size; - wpipe->pipe_state &= ~PIPE_DIRECTW; - - PIPE_UNLOCK(wpipe); - bcopy((caddr_t) wpipe->pipe_map.kva + pos, - wpipe->pipe_buffer.buffer, size); - pipe_destroy_write_buffer(wpipe); - PIPE_LOCK(wpipe); -} - -/* - * This implements the pipe buffer write mechanism. Note that only - * a direct write OR a normal pipe write can be pending at any given time. - * If there are any characters in the pipe buffer, the direct write will - * be deferred until the receiving process grabs all of the bytes from - * the pipe buffer. Then the direct mapping write is set-up. - */ -static int -pipe_direct_write(wpipe, uio) - struct pipe *wpipe; - struct uio *uio; -{ - int error; - -retry: - while (wpipe->pipe_state & PIPE_DIRECTW) { - if (wpipe->pipe_state & PIPE_WANTR) { - wpipe->pipe_state &= ~PIPE_WANTR; - wakeup(wpipe); - } - wpipe->pipe_state |= PIPE_WANTW; - error = msleep(wpipe, PIPE_MTX(wpipe), - PRIBIO | PCATCH, "pipdww", 0); - if (error) - goto error1; - if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) { - error = EPIPE; - goto error1; - } - } - wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ - if (wpipe->pipe_buffer.cnt > 0) { - if (wpipe->pipe_state & PIPE_WANTR) { - wpipe->pipe_state &= ~PIPE_WANTR; - wakeup(wpipe); - } - - wpipe->pipe_state |= PIPE_WANTW; - error = msleep(wpipe, PIPE_MTX(wpipe), - PRIBIO | PCATCH, "pipdwc", 0); - if (error) - goto error1; - if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) { - error = EPIPE; - goto error1; - } - goto retry; - } - - wpipe->pipe_state |= PIPE_DIRECTW; - - pipelock(wpipe, 0); - PIPE_UNLOCK(wpipe); - error = pipe_build_write_buffer(wpipe, uio); - PIPE_LOCK(wpipe); - pipeunlock(wpipe); - if (error) { - wpipe->pipe_state &= ~PIPE_DIRECTW; - goto error1; - } - - error = 0; - while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { - if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) { - pipelock(wpipe, 0); - PIPE_UNLOCK(wpipe); - pipe_destroy_write_buffer(wpipe); - PIPE_LOCK(wpipe); - pipeselwakeup(wpipe, wpipe); - pipeunlock(wpipe); - error = EPIPE; - goto error1; - } - if (wpipe->pipe_state & PIPE_WANTR) { - wpipe->pipe_state &= ~PIPE_WANTR; - wakeup(wpipe); - } - pipeselwakeup(wpipe, wpipe); - error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, - "pipdwt", 0); - } - - pipelock(wpipe,0); - if (wpipe->pipe_state & PIPE_DIRECTW) { - /* - * this bit of trickery substitutes a kernel buffer for - * the process that might be going away. - */ - pipe_clone_write_buffer(wpipe); - } else { - PIPE_UNLOCK(wpipe); - pipe_destroy_write_buffer(wpipe); - PIPE_LOCK(wpipe); - } - pipeunlock(wpipe); - return (error); - -error1: - wakeup(wpipe); - return (error); -} -#endif - - - static int pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags, __unused vfs_context_t ctx) @@ -1099,6 +865,9 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags, int orig_resid; int pipe_size; struct pipe *wpipe, *rpipe; + // LP64todo - fix this! + orig_resid = uio_resid(uio); + int space; rpipe = (struct pipe *)fp->f_data; @@ -1123,54 +892,35 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags, pipe_size = 0; - if (wpipe->pipe_buffer.buffer == 0) { - /* - * need to allocate some storage... 
we delay the allocation - * until the first write on fd[0] to avoid allocating storage for both - * 'pipe ends'... most pipes are half-duplex with the writes targeting - * fd[1], so allocating space for both ends is a waste... - * - * Reduce to 1/4th pipe size if we're over our global max. - */ - if (amountpipekva > maxpipekva / 2) - pipe_size = SMALL_PIPE_SIZE; - else - pipe_size = PIPE_SIZE; - } - /* - * If it is advantageous to resize the pipe buffer, do - * so. + * need to allocate some storage... we delay the allocation + * until the first write on fd[0] to avoid allocating storage for both + * 'pipe ends'... most pipes are half-duplex with the writes targeting + * fd[1], so allocating space for both ends is a waste... */ - if ((uio_resid(uio) > PIPE_SIZE) && - (wpipe->pipe_buffer.size <= PIPE_SIZE) && - (amountpipekva < maxpipekva / 2) && - (nbigpipe < LIMITBIGPIPES) && -#ifndef PIPE_NODIRECT - (wpipe->pipe_state & PIPE_DIRECTW) == 0 && -#endif - (wpipe->pipe_buffer.cnt == 0)) { - pipe_size = BIG_PIPE_SIZE; + if ( wpipe->pipe_buffer.buffer == 0 || ( + (unsigned)orig_resid > wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt && + amountpipekva < maxpipekva ) ) { + pipe_size = choose_pipespace(wpipe->pipe_buffer.size, wpipe->pipe_buffer.cnt + orig_resid); } if (pipe_size) { /* * need to do initial allocation or resizing of pipe + * holding both structure and io locks. */ - if ((error = pipelock(wpipe, 1)) == 0) { - PIPE_UNLOCK(wpipe); - if (pipespace(wpipe, pipe_size) == 0) - OSAddAtomic(1, &nbigpipe); - PIPE_LOCK(wpipe); - pipeunlock(wpipe); - - if (wpipe->pipe_buffer.buffer == 0) { - /* - * initial allocation failed - */ + if ((error = pipeio_lock(wpipe, 1)) == 0) { + if (wpipe->pipe_buffer.cnt == 0) + error = pipespace(wpipe, pipe_size); + else + error = expand_pipespace(wpipe, pipe_size); + + pipeio_unlock(wpipe); + + /* allocation failed */ + if (wpipe->pipe_buffer.buffer == 0) error = ENOMEM; - } } if (error) { /* @@ -1187,91 +937,35 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags, return(error); } } - // LP64todo - fix this! - orig_resid = uio_resid(uio); while (uio_resid(uio)) { - int space; - -#ifndef PIPE_NODIRECT - /* - * If the transfer is large, we can gain performance if - * we do process-to-process copies directly. - * If the write is non-blocking, we don't use the - * direct write mechanism. - * - * The direct write mechanism will detect the reader going - * away on us. - */ - if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) && - (fp->f_flag & FNONBLOCK) == 0 && - amountpipekvawired + uio_resid(uio) < maxpipekvawired) { - error = pipe_direct_write(wpipe, uio); - if (error) - break; - continue; - } - /* - * Pipe buffered writes cannot be coincidental with - * direct writes. We wait until the currently executing - * direct write is completed before we start filling the - * pipe buffer. We break out if a signal occurs or the - * reader goes away. - */ retrywrite: - while (wpipe->pipe_state & PIPE_DIRECTW) { - if (wpipe->pipe_state & PIPE_WANTR) { - wpipe->pipe_state &= ~PIPE_WANTR; - wakeup(wpipe); - } - error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipbww", 0); - - if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) - break; - if (error) - break; - } -#else - retrywrite: -#endif space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; - /* - * Writes of size <= PIPE_BUF must be atomic. - */ + /* Writes of size <= PIPE_BUF must be atomic. 
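+ * (Per POSIX, writes of up to PIPE_BUF bytes must not be interleaved
+ * with writes by other threads; so if the whole request no longer fits
+ * in the free space, write nothing yet and wait for the reader.)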
*/ if ((space < uio_resid(uio)) && (orig_resid <= PIPE_BUF)) space = 0; if (space > 0) { - if ((error = pipelock(wpipe,1)) == 0) { + if ((error = pipeio_lock(wpipe,1)) == 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) { - pipeunlock(wpipe); + pipeio_unlock(wpipe); error = EPIPE; break; } -#ifndef PIPE_NODIRECT - /* - * It is possible for a direct write to - * slip in on us... handle it here... - */ - if (wpipe->pipe_state & PIPE_DIRECTW) { - pipeunlock(wpipe); - goto retrywrite; - } -#endif /* - * If a process blocked in pipelock, our + * If a process blocked in pipeio_lock, our * value for space might be bad... the mutex * is dropped while we're blocked */ if (space > (int)(wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt)) { - pipeunlock(wpipe); + pipeio_unlock(wpipe); goto retrywrite; } @@ -1307,7 +1001,7 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags, /* * Transfer remaining part now, to * support atomic writes. Wraparound - * happened. + * happened. (State 3) */ if (wpipe->pipe_buffer.in + segsize != wpipe->pipe_buffer.size) @@ -1320,9 +1014,12 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags, size - segsize, uio); PIPE_LOCK(rpipe); } + /* + * readers never know to read until count is updated. + */ if (error == 0) { wpipe->pipe_buffer.in += size; - if (wpipe->pipe_buffer.in >= + if (wpipe->pipe_buffer.in > wpipe->pipe_buffer.size) { if (wpipe->pipe_buffer.in != size - segsize + @@ -1339,7 +1036,7 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags, panic("Pipe buffer overflow"); } - pipeunlock(wpipe); + pipeio_unlock(wpipe); } if (error) break; @@ -1453,12 +1150,7 @@ pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data, return (0); case FIONREAD: -#ifndef PIPE_NODIRECT - if (mpipe->pipe_state & PIPE_DIRECTW) - *(int *)data = mpipe->pipe_map.cnt; - else -#endif - *(int *)data = mpipe->pipe_buffer.cnt; + *(int *)data = mpipe->pipe_buffer.cnt; PIPE_UNLOCK(mpipe); return (0); @@ -1493,6 +1185,7 @@ pipe_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx) PIPE_LOCK(rpipe); wpipe = rpipe->pipe_peer; + #if CONFIG_MACF /* @@ -1524,7 +1217,7 @@ pipe_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx) wpipe->pipe_state |= PIPE_WSELECT; if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) || (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && - (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) { + (MAX_PIPESIZE(wpipe) - wpipe->pipe_buffer.cnt) > 0)) { retnum = 1; } else { @@ -1553,7 +1246,6 @@ pipe_close(struct fileglob *fg, __unused vfs_context_t ctx) cpipe = (struct pipe *)fg->fg_data; fg->fg_data = NULL; proc_fdunlock(vfs_context_proc(ctx)); - if (cpipe) pipeclose(cpipe); @@ -1563,102 +1255,14 @@ pipe_close(struct fileglob *fg, __unused vfs_context_t ctx) static void pipe_free_kmem(struct pipe *cpipe) { - if (cpipe->pipe_buffer.buffer != NULL) { - if (cpipe->pipe_buffer.size > PIPE_SIZE) - OSAddAtomic(-1, &nbigpipe); OSAddAtomic(-(cpipe->pipe_buffer.size), &amountpipekva); OSAddAtomic(-1, &amountpipes); - - kmem_free(kernel_map, (vm_offset_t)cpipe->pipe_buffer.buffer, + kfree((void *)cpipe->pipe_buffer.buffer, cpipe->pipe_buffer.size); cpipe->pipe_buffer.buffer = NULL; + cpipe->pipe_buffer.size = 0; } -#ifndef PIPE_NODIRECT - if (cpipe->pipe_map.kva != 0) { - atomic_subtract_int(&amountpipekvawired, - cpipe->pipe_buffer.size + PAGE_SIZE); - kmem_free(kernel_map, - cpipe->pipe_map.kva, - 
cpipe->pipe_buffer.size + PAGE_SIZE); - cpipe->pipe_map.cnt = 0; - cpipe->pipe_map.kva = 0; - cpipe->pipe_map.pos = 0; - cpipe->pipe_map.npages = 0; - } -#endif -} - -/* - * When a thread sets a write-select on a pipe, it creates an implicit, - * untracked dependency between that thread and the peer of the pipe - * on which the select is set. If the peer pipe is closed and freed - * before the select()ing thread wakes up, the system will panic as - * it attempts to unwind the dangling select(). To avoid that panic, - * we notice whenever a dangerous select() is set on a pipe, and - * defer the final deletion of the pipe until that select()s are all - * resolved. Since we can't currently detect exactly when that - * resolution happens, we use a simple garbage collection queue to - * reap the at-risk pipes 'later'. - */ -static void -pipe_garbage_collect(struct pipe *cpipe) -{ - uint64_t old, now; - struct pipe_garbage *pgp; - - /* Convert msecs to nsecs and then to abstime */ - old = pipe_garbage_age_limit * 1000000; - nanoseconds_to_absolutetime(old, &old); - - lck_mtx_lock(pipe_garbage_lock); - - /* Free anything that's been on the queue for seconds */ - now = mach_absolute_time(); - old = now - old; - while ((pgp = pipe_garbage_head) && pgp->pg_timestamp < old) { - pipe_garbage_head = pgp->pg_next; - if (pipe_garbage_head == NULL) - pipe_garbage_tail = NULL; - pipe_garbage_count--; - zfree(pipe_zone, pgp->pg_pipe); - zfree(pipe_garbage_zone, pgp); - } - - /* Add the new pipe (if any) to the tail of the garbage queue */ - if (cpipe) { - cpipe->pipe_state = PIPE_DEAD; - pgp = (struct pipe_garbage *)zalloc(pipe_garbage_zone); - if (pgp == NULL) { - /* - * We're too low on memory to garbage collect the - * pipe. Freeing it runs the risk of panicing the - * system. All we can do is leak it and leave - * a breadcrumb behind. The good news, such as it - * is, is that this will probably never happen. - * We will probably hit the panic below first. - */ - printf("Leaking pipe %p - no room left in the queue", - cpipe); - lck_mtx_unlock(pipe_garbage_lock); - return; - } - - pgp->pg_pipe = cpipe; - pgp->pg_timestamp = now; - pgp->pg_next = NULL; - - if (pipe_garbage_tail) - pipe_garbage_tail->pg_next = pgp; - pipe_garbage_tail = pgp; - if (pipe_garbage_head == NULL) - pipe_garbage_head = pipe_garbage_tail; - - if (pipe_garbage_count++ >= PIPE_GARBAGE_QUEUE_LIMIT) - panic("Length of pipe garbage queue exceeded %d", - PIPE_GARBAGE_QUEUE_LIMIT); - } - lck_mtx_unlock(pipe_garbage_lock); } /* @@ -1671,7 +1275,6 @@ pipeclose(struct pipe *cpipe) if (cpipe == NULL) return; - /* partially created pipes won't have a valid mutex. 
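 * (the shared mutex is allocated in pipe() and attached only once both
 * ends exist, so a pipe torn down after an early allocation failure can
 * reach here with pipe_mtxp still NULL)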
*/ if (PIPE_MTX(cpipe) != NULL) PIPE_LOCK(cpipe); @@ -1745,6 +1348,7 @@ pipeclose(struct pipe *cpipe) zfree(pipe_zone, cpipe); pipe_garbage_collect(NULL); } + } /*ARGSUSED*/ @@ -1838,11 +1442,6 @@ filt_piperead(struct knote *kn, long hint) wpipe = rpipe->pipe_peer; kn->kn_data = rpipe->pipe_buffer.cnt; - -#ifndef PIPE_NODIRECT - if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) - kn->kn_data = rpipe->pipe_map.cnt; -#endif if ((rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) || (wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) { kn->kn_flags |= EV_EOF; @@ -1850,8 +1449,8 @@ filt_piperead(struct knote *kn, long hint) } else { int64_t lowwat = 1; if (kn->kn_sfflags & NOTE_LOWAT) { - if (rpipe->pipe_buffer.size && kn->kn_sdata > rpipe->pipe_buffer.size) - lowwat = rpipe->pipe_buffer.size; + if (rpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(rpipe)) + lowwat = MAX_PIPESIZE(rpipe); else if (kn->kn_sdata > lowwat) lowwat = kn->kn_sdata; } @@ -1890,18 +1489,12 @@ filt_pipewrite(struct knote *kn, long hint) PIPE_UNLOCK(rpipe); return (1); } - kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; - if (!kn->kn_data && wpipe->pipe_buffer.size == 0) - kn->kn_data = PIPE_BUF; /* unwritten pipe is ready for write */ + kn->kn_data = MAX_PIPESIZE(wpipe) - wpipe->pipe_buffer.cnt; -#ifndef PIPE_NODIRECT - if (wpipe->pipe_state & PIPE_DIRECTW) - kn->kn_data = 0; -#endif int64_t lowwat = PIPE_BUF; if (kn->kn_sfflags & NOTE_LOWAT) { - if (wpipe->pipe_buffer.size && kn->kn_sdata > wpipe->pipe_buffer.size) - lowwat = wpipe->pipe_buffer.size; + if (wpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(wpipe)) + lowwat = MAX_PIPESIZE(wpipe); else if (kn->kn_sdata > lowwat) lowwat = kn->kn_sdata; } @@ -1942,13 +1535,13 @@ fill_pipeinfo(struct pipe * cpipe, struct pipe_info * pinfo) /* * the peer still exists, use it's info */ - pipe_size = cpipe->pipe_peer->pipe_buffer.size; + pipe_size = MAX_PIPESIZE(cpipe->pipe_peer); pipe_count = cpipe->pipe_peer->pipe_buffer.cnt; } else { pipe_count = 0; } } else { - pipe_size = cpipe->pipe_buffer.size; + pipe_size = MAX_PIPESIZE(cpipe); pipe_count = cpipe->pipe_buffer.cnt; } /* @@ -2024,6 +1617,75 @@ pipe_drain(struct fileproc *fp, __unused vfs_context_t ctx) } + /* + * When a thread sets a write-select on a pipe, it creates an implicit, + * untracked dependency between that thread and the peer of the pipe + * on which the select is set. If the peer pipe is closed and freed + * before the select()ing thread wakes up, the system will panic as + * it attempts to unwind the dangling select(). To avoid that panic, + * we notice whenever a dangerous select() is set on a pipe, and + * defer the final deletion of the pipe until that select()s are all + * resolved. Since we can't currently detect exactly when that + * resolution happens, we use a simple garbage collection queue to + * reap the at-risk pipes 'later'. 
+ */ +static void +pipe_garbage_collect(struct pipe *cpipe) +{ + uint64_t old, now; + struct pipe_garbage *pgp; + + /* Convert msecs to nsecs and then to abstime */ + old = pipe_garbage_age_limit * 1000000; + nanoseconds_to_absolutetime(old, &old); + + lck_mtx_lock(pipe_garbage_lock); + + /* Free anything that's been on the queue for seconds */ + now = mach_absolute_time(); + old = now - old; + while ((pgp = pipe_garbage_head) && pgp->pg_timestamp < old) { + pipe_garbage_head = pgp->pg_next; + if (pipe_garbage_head == NULL) + pipe_garbage_tail = NULL; + pipe_garbage_count--; + zfree(pipe_zone, pgp->pg_pipe); + zfree(pipe_garbage_zone, pgp); + } + + /* Add the new pipe (if any) to the tail of the garbage queue */ + if (cpipe) { + cpipe->pipe_state = PIPE_DEAD; + pgp = (struct pipe_garbage *)zalloc(pipe_garbage_zone); + if (pgp == NULL) { + /* + * We're too low on memory to garbage collect the + * pipe. Freeing it runs the risk of panicing the + * system. All we can do is leak it and leave + * a breadcrumb behind. The good news, such as it + * is, is that this will probably never happen. + * We will probably hit the panic below first. + */ + printf("Leaking pipe %p - no room left in the queue", + cpipe); + lck_mtx_unlock(pipe_garbage_lock); + return; + } + + pgp->pg_pipe = cpipe; + pgp->pg_timestamp = now; + pgp->pg_next = NULL; + if (pipe_garbage_tail) + pipe_garbage_tail->pg_next = pgp; + pipe_garbage_tail = pgp; + if (pipe_garbage_head == NULL) + pipe_garbage_head = pipe_garbage_tail; + if (pipe_garbage_count++ >= PIPE_GARBAGE_QUEUE_LIMIT) + panic("Length of pipe garbage queue exceeded %d", + PIPE_GARBAGE_QUEUE_LIMIT); + } + lck_mtx_unlock(pipe_garbage_lock); +} diff --git a/bsd/kern/sys_socket.c b/bsd/kern/sys_socket.c index 431e47658..d06b9cb9c 100644 --- a/bsd/kern/sys_socket.c +++ b/bsd/kern/sys_socket.c @@ -189,6 +189,7 @@ soioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) { int error = 0; int dropsockref = -1; + int int_arg; socket_lock(so, 1); @@ -201,16 +202,18 @@ soioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) switch (cmd) { - case FIONBIO: - if (*(int *)data) + case FIONBIO: /* int */ + bcopy(data, &int_arg, sizeof (int_arg)); + if (int_arg) so->so_state |= SS_NBIO; else so->so_state &= ~SS_NBIO; goto out; - case FIOASYNC: - if (*(int *)data) { + case FIOASYNC: /* int */ + bcopy(data, &int_arg, sizeof (int_arg)); + if (int_arg) { so->so_state |= SS_ASYNC; so->so_rcv.sb_flags |= SB_ASYNC; so->so_snd.sb_flags |= SB_ASYNC; @@ -221,29 +224,32 @@ soioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) } goto out; - case FIONREAD: - *(int *)data = so->so_rcv.sb_cc; + case FIONREAD: /* int */ + bcopy(&so->so_rcv.sb_cc, data, sizeof (u_int32_t)); goto out; - case SIOCSPGRP: - so->so_pgid = *(int *)data; + case SIOCSPGRP: /* int */ + bcopy(data, &so->so_pgid, sizeof (pid_t)); goto out; - case SIOCGPGRP: - *(int *)data = so->so_pgid; + case SIOCGPGRP: /* int */ + bcopy(&so->so_pgid, data, sizeof (pid_t)); goto out; - case SIOCATMARK: - *(int *)data = (so->so_state&SS_RCVATMARK) != 0; + case SIOCATMARK: /* int */ + int_arg = (so->so_state & SS_RCVATMARK) != 0; + bcopy(&int_arg, data, sizeof (int_arg)); goto out; - case SIOCSETOT: { + case SIOCSETOT: { /* int */ /* * Set socket level options here and then call protocol * specific routine. 
*/ struct socket *cloned_so = NULL; - int cloned_fd = *(int *)data; + int cloned_fd; + + bcopy(data, &cloned_fd, sizeof (cloned_fd)); /* let's make sure it's either -1 or a valid file descriptor */ if (cloned_fd != -1) { @@ -441,8 +447,8 @@ soo_stat(struct socket *so, void *ub, int isstat64) if ((so->so_state & SS_CANTSENDMORE) == 0) sb64->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; sb64->st_size = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; - sb64->st_uid = so->so_uid; - sb64->st_gid = so->so_gid; + sb64->st_uid = kauth_cred_getuid(so->so_cred); + sb64->st_gid = kauth_cred_getgid(so->so_cred); } else { sb->st_mode = S_IFSOCK; if ((so->so_state & SS_CANTRCVMORE) == 0 || @@ -451,8 +457,8 @@ soo_stat(struct socket *so, void *ub, int isstat64) if ((so->so_state & SS_CANTSENDMORE) == 0) sb->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; sb->st_size = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; - sb->st_uid = so->so_uid; - sb->st_gid = so->so_gid; + sb->st_uid = kauth_cred_getuid(so->so_cred); + sb->st_gid = kauth_cred_getgid(so->so_cred); } ret = (*so->so_proto->pr_usrreqs->pru_sense)(so, ub, isstat64); @@ -489,6 +495,7 @@ soo_drain(struct fileproc *fp, __unused vfs_context_t ctx) wakeup((caddr_t)&so->so_timeo); sorwakeup(so); sowwakeup(so); + soevent(so, SO_FILT_HINT_LOCKED); socket_unlock(so, 1); } diff --git a/bsd/kern/syscalls.master b/bsd/kern/syscalls.master index 009dd377b..0a2583804 100644 --- a/bsd/kern/syscalls.master +++ b/bsd/kern/syscalls.master @@ -91,7 +91,7 @@ 41 AUE_DUP ALL { int dup(u_int fd); } 42 AUE_PIPE ALL { int pipe(void); } 43 AUE_GETEGID ALL { int getegid(void); } -44 AUE_PROFILE ALL { int profil(short *bufbase, size_t bufsize, u_long pcoffset, u_int pcscale); } +44 AUE_NULL ALL { int nosys(void); } { old profil } 45 AUE_NULL ALL { int nosys(void); } { old ktrace } 46 AUE_SIGACTION ALL { int sigaction(int signum, struct __sigaction *nsa, struct sigaction *osa) NO_SYSCALL_STUB; } 47 AUE_GETGID ALL { int getgid(void); } @@ -255,13 +255,13 @@ 167 AUE_MOUNT ALL { int mount(char *type, char *path, int flags, caddr_t data); } 168 AUE_NULL ALL { int nosys(void); } { old ustat } 169 AUE_CSOPS ALL { int csops(pid_t pid, uint32_t ops, user_addr_t useraddr, user_size_t usersize); } -170 AUE_NULL HN { int nosys(void); } { old table } +170 AUE_CSOPS ALL { int csops_audittoken(pid_t pid, uint32_t ops, user_addr_t useraddr, user_size_t usersize, user_addr_t uaudittoken); } 171 AUE_NULL ALL { int nosys(void); } { old wait3 } 172 AUE_NULL ALL { int nosys(void); } { old rpause } 173 AUE_WAITID ALL { int waitid(idtype_t idtype, id_t id, siginfo_t *infop, int options); } 174 AUE_NULL ALL { int nosys(void); } { old getdents } 175 AUE_NULL ALL { int nosys(void); } { old gc_control } -176 AUE_ADDPROFILE ALL { int add_profil(short *bufbase, size_t bufsize, u_long pcoffset, u_int pcscale); } +176 AUE_NULL ALL { int nosys(void); } { old add_profil } 177 AUE_NULL ALL { int nosys(void); } 178 AUE_NULL ALL { int nosys(void); } 179 AUE_NULL ALL { int nosys(void); } @@ -322,10 +322,12 @@ ; to HFS semantics, they are not specific to the HFS filesystem. ; We expect all filesystems to recognize the call and report that it is ; not supported or to actually implement it. 
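; A hedged usage sketch for the open_dprotected_np() entry added below.
; The path, protection class and flag values are illustrative assumptions,
; not values taken from this patch:
;
;	int fd = open_dprotected_np("/var/db/example", O_CREAT | O_RDWR,
;	    3 /* hypothetical protection class */, 0 /* dpflags */, 0600);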
-216 AUE_MKCOMPLEX UHN { int mkcomplex(const char *path, mode_t mode, u_long type); } { soon to be obsolete } -217 AUE_STATV UHN { int statv(const char *path, struct vstat *vsb); } { soon to be obsolete } -218 AUE_LSTATV UHN { int lstatv(const char *path, struct vstat *vsb); } { soon to be obsolete } -219 AUE_FSTATV UHN { int fstatv(int fd, struct vstat *vsb); } { soon to be obsolete } + +; 216-> 219 used to be mkcomplex and {f,l}statv variants. They are gone now. +216 AUE_NULL ALL { int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode) NO_SYSCALL_STUB; } +217 AUE_NULL ALL { int nosys(void); } +218 AUE_NULL ALL { int nosys(void); } +219 AUE_NULL ALL { int nosys(void); } 220 AUE_GETATTRLIST ALL { int getattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options) NO_SYSCALL_STUB; } 221 AUE_SETATTRLIST ALL { int setattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options) NO_SYSCALL_STUB; } 222 AUE_GETDIRENTRIESATTR ALL { int getdirentriesattr(int fd, struct attrlist *alist, void *buffer, size_t buffersize, u_long *count, u_long *basep, u_long *newstate, u_long options); } @@ -417,7 +419,7 @@ #endif 266 AUE_SHMOPEN ALL { int shm_open(const char *name, int oflag, int mode); } 267 AUE_SHMUNLINK ALL { int shm_unlink(const char *name); } -268 AUE_SEMOPEN ALL { user_addr_t sem_open(const char *name, int oflag, int mode, int value); } +268 AUE_SEMOPEN ALL { user_addr_t sem_open(const char *name, int oflag, int mode, int value) NO_SYSCALL_STUB; } 269 AUE_SEMCLOSE ALL { int sem_close(sem_t *sem); } 270 AUE_SEMUNLINK ALL { int sem_unlink(const char *name); } 271 AUE_SEMWAIT ALL { int sem_wait(sem_t *sem); } @@ -442,7 +444,11 @@ 290 AUE_GETWGROUPS ALL { int getwgroups(user_addr_t setlen, user_addr_t guidset) NO_SYSCALL_STUB; } 291 AUE_MKFIFO_EXTENDED ALL { int mkfifo_extended(user_addr_t path, uid_t uid, gid_t gid, int mode, user_addr_t xsecurity) NO_SYSCALL_STUB; } 292 AUE_MKDIR_EXTENDED ALL { int mkdir_extended(user_addr_t path, uid_t uid, gid_t gid, int mode, user_addr_t xsecurity) NO_SYSCALL_STUB; } +#if CONFIG_EXT_RESOLVER 293 AUE_IDENTITYSVC ALL { int identitysvc(int opcode, user_addr_t message) NO_SYSCALL_STUB; } +#else +293 AUE_NULL ALL { int nosys(void); } +#endif 294 AUE_NULL ALL { int shared_region_check_np(uint64_t *start_address) NO_SYSCALL_STUB; } 295 AUE_NULL ALL { int nosys(void); } { old shared_region_map_np } 296 AUE_NULL ALL { int vm_pressure_monitor(int wait_for_pressure, int nsecs_monitored, uint32_t *pages_reclaimed); } @@ -534,8 +540,8 @@ 352 AUE_NULL ALL { int nosys(void); } 353 AUE_GETAUID ALL { int getauid(au_id_t *auid); } 354 AUE_SETAUID ALL { int setauid(au_id_t *auid); } -355 AUE_GETAUDIT ALL { int getaudit(struct auditinfo *auditinfo); } -356 AUE_SETAUDIT ALL { int setaudit(struct auditinfo *auditinfo); } +355 AUE_NULL ALL { int nosys(void); } { old getaudit } +356 AUE_NULL ALL { int nosys(void); } { old setaudit } 357 AUE_GETAUDIT_ADDR ALL { int getaudit_addr(struct auditinfo_addr *auditinfo_addr, int length); } 358 AUE_SETAUDIT_ADDR ALL { int setaudit_addr(struct auditinfo_addr *auditinfo_addr, int length); } 359 AUE_AUDITCTL ALL { int auditctl(char *path); } @@ -568,7 +574,7 @@ 371 AUE_NULL ALL { int nosys(void); } { old __semwait_signal } #endif 372 AUE_NULL ALL { uint64_t thread_selfid (void) NO_SYSCALL_STUB; } -373 AUE_NULL ALL { int nosys(void); } +373 AUE_LEDGER ALL { int ledger(int cmd, caddr_t arg1, caddr_t arg2, caddr_t arg3); } 
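; A minimal sketch of driving the new ledger() syscall from userspace,
; using the LEDGER_INFO command implemented in bsd/kern/sys_generic.c
; above (calling the stub directly is an assumption; for LEDGER_INFO,
; arg1 is the target pid and arg2 receives a struct ledger_info):
;
;	struct ledger_info li;
;	int rc = ledger(LEDGER_INFO, (caddr_t)(intptr_t)getpid(),
;	    (caddr_t)&li, (caddr_t)0);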
374 AUE_NULL ALL { int nosys(void); } 375 AUE_NULL ALL { int nosys(void); } 376 AUE_NULL ALL { int nosys(void); } @@ -658,9 +664,9 @@ 435 AUE_NULL ALL { int pid_hibernate(int pid); } 436 AUE_NULL ALL { int pid_shutdown_sockets(int pid, int level); } #else -435 AUE_NULL ALL { int nosys(void); } +435 AUE_NULL ALL { int nosys(void); } 436 AUE_NULL ALL { int nosys(void); } #endif 437 AUE_NULL ALL { int nosys(void); } { old shared_region_slide_np } 438 AUE_NULL ALL { int shared_region_map_and_slide_np(int fd, uint32_t count, const struct shared_file_mapping_np *mappings, uint32_t slide, uint64_t* slide_start, uint32_t slide_size) NO_SYSCALL_STUB; } - +439 AUE_NULL ALL { int kas_info(int selector, void *value, size_t *size); } diff --git a/bsd/kern/sysv_sem.c b/bsd/kern/sysv_sem.c index ed43ec893..f172333ef 100644 --- a/bsd/kern/sysv_sem.c +++ b/bsd/kern/sysv_sem.c @@ -1044,7 +1044,7 @@ semop(struct proc *p, struct semop_args *uap, int32_t *retval) { int semid = uap->semid; int nsops = uap->nsops; - struct sembuf sops[MAX_SOPS]; + struct sembuf sops[seminfo.semopm]; register struct semid_kernel *semakptr; register struct sembuf *sopptr = NULL; /* protected by 'semptr' */ register struct sem *semptr = NULL; /* protected by 'if' */ @@ -1084,14 +1084,15 @@ semop(struct proc *p, struct semop_args *uap, int32_t *retval) goto semopout; } - if (nsops < 0 || nsops > MAX_SOPS) { + if (nsops < 0 || nsops > seminfo.semopm) { #ifdef SEM_DEBUG - printf("too many sops (max=%d, nsops=%d)\n", MAX_SOPS, nsops); + printf("too many sops (max=%d, nsops=%d)\n", + seminfo.semopm, nsops); #endif eval = E2BIG; goto semopout; } - + /* OK for LP64, since sizeof(struct sembuf) is currently invariant */ if ((eval = copyin(uap->sops, &sops, nsops * sizeof(struct sembuf))) != 0) { #ifdef SEM_DEBUG diff --git a/bsd/kern/trace.codes b/bsd/kern/trace.codes index fbc026fb2..fd215a76a 100644 --- a/bsd/kern/trace.codes +++ b/bsd/kern/trace.codes @@ -57,20 +57,17 @@ 0x10c001c MSC_kern_invalid_#7 0x10c0020 MSC_kern_invalid_#8 0x10c0024 MSC_kern_invalid_#9 -0x10c0028 MSC_kern_invalid_#10 -0x10c002c MSC_kern_invalid_#11 -0x10c0030 MSC_kern_invalid_#12 -0x10c0034 MSC_kern_invalid_#13 -0x10c0038 MSC_kern_invalid_#14 -0x10c003c MSC_kern_invalid_#15 -0x10c0040 MSC_kern_invalid_#16 -0x10c0044 MSC_kern_invalid_#17 -0x10c0048 MSC_kern_invalid_#18 -0x10c004c MSC_kern_invalid_#19 -0x10c0050 MSC_kern_invalid_#20 -0x10c0054 MSC_kern_invalid_#21 -0x10c0058 MSC_kern_invalid_#22 -0x10c005c MSC_kern_invalid_#23 +0x10c0028 MSC_mach_vm_allocate_trap +0x10c0030 MSC_mach_vm_deallocate_trap +0x10c0038 MSC_mach_vm_protect_trap +0x10c0040 MSC_mach_port_allocate_trap +0x10c0044 MSC_mach_port_destroy_trap +0x10c0048 MSC_mach_port_deallocate_trap +0x10c004c MSC_mach_port_mod_refs_trap +0x10c0050 MSC_mach_port_move_member_trap +0x10c0054 MSC_mach_port_insert_right_trap +0x10c0058 MSC_mach_port_insert_member_trap +0x10c005c MSC_mach_port_extract_member_trap 0x10c0060 MSC_kern_invalid_#24 0x10c0064 MSC_kern_invalid_#25 0x10c0068 MSC_mach_reply_port @@ -181,6 +178,10 @@ 0x1300104 MACH_purgable_token_delete 0x1300108 MACH_purgable_token_ripened 0x130010c MACH_purgable_token_purged +0x1300120 MACH_purgable_object_add +0x1300124 MACH_purgable_object_remove +0x1300128 MACH_purgable_object_purge +0x130012c MACH_purgable_object_purge_all 0x1300400 MACH_vm_check_zf_delay 0x1300404 MACH_vm_cow_delay 0x1300408 MACH_vm_zf_delay @@ -204,10 +205,11 @@ 0x1400024 MACH_IDLE 0x1400028 MACH_STACK_DEPTH 0x140002c MACH_MOVED -0x1400030 MACH_FAIRSHARE_ENTER -0x1400034 
MACH_FAIRSHARE_EXIT +0x1400030 MACH_FAIRSHARE_ENTER +0x1400034 MACH_FAIRSHARE_EXIT 0x1400038 MACH_FAILSAFE -0x1400040 MACH_STKHANDOFF_BT +0x140003C MACH_BLOCK +0x1400040 MACH_WAIT 0x1400044 MACH_SCHED_BT 0x1400048 MACH_IDLE_BT 0x1400050 MACH_SCHED_GET_URGENCY @@ -548,7 +550,33 @@ 0x3020154 P_PgOutAsyncPDone 0x3020158 P_PgInAsyncP 0x302015C P_PgInAsyncPDone +0x3020200 P_WrDataN +0x3020208 P_RdDataN +0x3020210 P_WrDataAsyncN +0x3020218 P_RdDataAsyncN +0x3020204 P_WrDataNDone +0x302020C P_RdDataNDone +0x3020214 P_WrDataAsyncNDone +0x302021C P_RdDataAsyncNDone +0x3020280 P_WrDataNT +0x3020288 P_RdDataNT +0x3020290 P_WrDataAsyncNT +0x3020298 P_RdDataAsyncNT +0x3020284 P_WrDataNTDone +0x302028C P_RdDataNTDone +0x3020294 P_WrDataAsyncNTDone +0x302029C P_RdDataAsyncNTDone +0x3020300 P_WrDataNP +0x3020308 P_RdDataNP +0x3020310 P_WrDataAsyncNP +0x3020318 P_RdDataAsyncNP +0x3020304 P_WrDataNPDone +0x302030C P_RdDataNPDone +0x3020314 P_WrDataAsyncNPDone +0x302031C P_RdDataAsyncNPDone 0x3050004 journal_flush +0x3060000 SPEC_ioctl +0x3060004 SPEC_trim_extent 0x3070004 BootCache_tag 0x3070008 BootCache_batch 0x4010004 proc_exit @@ -769,10 +797,10 @@ 0x40c0354 BSC_#213 0x40c0358 BSC_#214 0x40c035c BSC_#215 -0x40c0360 BSC_mkcomplex -0x40c0364 BSC_statv -0x40c0368 BSC_lstatv -0x40c036c BSC_fstatv +0x40c0360 BSC_obs_mkcomplex +0x40c0364 BSC_obs_statv +0x40c0368 BSC_obs_lstatv +0x40c036c BSC_obs_fstatv 0x40c0370 BSC_getattrlist 0x40c0374 BSC_setattrlist 0x40c0378 BSC_getdirentriesattr @@ -926,7 +954,7 @@ 0x40c05c8 BSC_obs_semwait_signal 0x40c05cc BSC_obs_semwait_signal_nocancel 0x40c05d0 BSC_thread_selfid -0x40c05d4 BSC_#373 +0x40c05d4 BSC_ledger 0x40c05d8 BSC_#374 0x40c05dc BSC_#375 0x40c05e0 BSC_#376 @@ -990,7 +1018,8 @@ 0x40c06cc BSC_pid_hibernate 0x40c06d0 BSC_pid_shutdown_sockets 0x40c06d4 BSC_shared_region_slide_np -0x40c06fc BSC_shared_region_map_and_slide_np +0x40c06d8 BSC_shared_region_map_and_slide_np +0x40c06dc BSC_kas_info 0x40e0104 BSC_msync_extended_info 0x40e0264 BSC_pread_extended_info 0x40e0268 BSC_pwrite_extended_info @@ -1099,8 +1128,12 @@ 0x5230030 HID_DispatchKeyboard 0x5230034 HID_EjectCallback 0x5230038 HID_CapsCallback -0x523003c HID_#3c -0x523004c HID_#4c +0x523003c HID_HandleReport +0x5230040 HID_DispatchTabletPointer +0x5230044 HID_DispatchTabletProx +0x5230048 HID_DispatchHIDEvent +0x523004c HID_CalculateCapsDelay +0x5230050 HID_Invalid 0x5310004 CPUPM_PSTATE 0x5310008 CPUPM_IDLE_CSTATE 0x531000c CPUPM_IDLE_HALT @@ -1768,6 +1801,28 @@ 0x2200002c LAUNCHD_bsd_kevent 0x22000030 LAUNCHD_vproc_trans_incr 0x22000034 LAUNCHD_vproc_trans_decr +0x25000000 PERF_Event +0x25010000 PERF_THD_Sample +0x25010004 PERF_THD_Data +0x25010008 PERF_THD_XSample +0x2501000c PERF_THD_XPend +0x25010010 PERF_THD_XData +0x25020000 PERF_STK_KSample +0x25020004 PERF_STK_USched +0x25020008 PERF_STK_USample +0x2502000c PERF_STK_KData +0x25020010 PERF_STK_UData +0x25030000 PERF_TMR_AllSched +0x25030004 PERF_TMR_Schedule +0x25030008 PERF_TMR_Handler +0x25040000 PERF_ATS_Thread +0x25040004 PERF_ATS_Error +0x25040008 PERF_ATS_Run +0x2504000c PERF_ATS_Pause +0x25040010 PERF_ATS_Idle +0x25040014 PERF_ATS_Sample +0x25050000 PERF_AST_Handler +0x25050004 PERF_AST_Error 0xff000104 MSG_mach_notify_port_deleted 0xff000114 MSG_mach_notify_port_destroyed 0xff000118 MSG_mach_notify_no_senders @@ -1916,7 +1971,6 @@ 0xff002c40 MSG_io_service_wait_quiet 0xff002c44 MSG_io_registry_entry_create_iterator 0xff002c48 MSG_io_iterator_is_valid -0xff002c4c MSG_io_make_matching 0xff002c50 MSG_io_catalog_send_data 0xff002c54 
MSG_io_catalog_terminate 0xff002c58 MSG_io_catalog_get_data diff --git a/bsd/kern/tty.c b/bsd/kern/tty.c index b97eb780e..4d7c5b9fa 100644 --- a/bsd/kern/tty.c +++ b/bsd/kern/tty.c @@ -142,6 +142,9 @@ static void ttyunblock(struct tty *tp); static int ttywflush(struct tty *tp); static int proc_compare(proc_t p1, proc_t p2); +static void ttyhold(struct tty *tp); +static void ttydeallocate(struct tty *tp); + static int isctty(proc_t p, struct tty *tp); static int isctty_sp(proc_t p, struct tty *tp, struct session *sessp); @@ -339,8 +342,9 @@ int ttyopen(dev_t device, struct tty *tp) { proc_t p = current_proc(); - struct pgrp * pg, * oldpg; + struct pgrp *pg, *oldpg; struct session *sessp, *oldsess; + struct tty *oldtp; TTY_LOCK_OWNED(tp); /* debug assert */ @@ -359,15 +363,15 @@ ttyopen(dev_t device, struct tty *tp) /* * First tty open after setsid() call makes this tty its controlling * tty, if the tty does not already have a session associated with it. - * Only do this if the process */ - if (SESS_LEADER(p, sessp) && /* process is session leader */ + if (SESS_LEADER(p, sessp) && /* the process is the session leader */ sessp->s_ttyvp == NULL && /* but has no controlling tty */ - tp->t_session == NULL ) { /* and tty not controlling */ + tp->t_session == NULL ) { /* and tty not controlling */ session_lock(sessp); if ((sessp->s_flags & S_NOCTTY) == 0) { /* and no O_NOCTTY */ - /* Hold on to the reference */ - sessp->s_ttyp = tp; /* XXX NOT A REFERENCE */ + oldtp = sessp->s_ttyp; + ttyhold(tp); + sessp->s_ttyp = tp; OSBitOrAtomic(P_CONTROLT, &p->p_flag); session_unlock(sessp); proc_list_lock(); @@ -385,6 +389,8 @@ ttyopen(dev_t device, struct tty *tp) pg_rele(oldpg); if (oldsess != SESSION_NULL) session_rele(oldsess); + if (NULL != oldtp) + ttyfree(oldtp); tty_lock(tp); goto out; } @@ -1047,8 +1053,9 @@ ttioctl_locked(struct tty *tp, u_long cmd, caddr_t data, int flag, proc_t p) { int error = 0; struct uthread *ut; - struct pgrp * pg, *oldpg; - struct session *sessp, * oldsessp; + struct pgrp *pg, *oldpg; + struct session *sessp, *oldsessp; + struct tty *oldtp; TTY_LOCK_OWNED(tp); /* debug assert */ @@ -1404,7 +1411,9 @@ ttioctl_locked(struct tty *tp, u_long cmd, caddr_t data, int flag, proc_t p) tp->t_pgrp = pg; proc_list_unlock(); session_lock(sessp); - sessp->s_ttyp = tp; /* XXX NOT A REFERENCE */ + oldtp = sessp->s_ttyp; + ttyhold(tp); + sessp->s_ttyp = tp; session_unlock(sessp); OSBitOrAtomic(P_CONTROLT, &p->p_flag); /* SAFE: All callers drop the lock on return */ @@ -1414,6 +1423,8 @@ ttioctl_locked(struct tty *tp, u_long cmd, caddr_t data, int flag, proc_t p) session_rele(oldsessp); if (oldpg != PGRP_NULL) pg_rele(oldpg); + if (NULL != oldtp) + ttyfree(oldtp); tty_lock(tp); break; @@ -3038,19 +3049,48 @@ ttymalloc(void) lck_mtx_init(&tp->t_lock, tty_lck_grp, tty_lck_attr); klist_init(&tp->t_rsel.si_note); klist_init(&tp->t_wsel.si_note); + tp->t_refcnt = 1; } - return(tp); + return (tp); } +/* + * Increment the reference count on a tty. + */ +static void +ttyhold(struct tty *tp) +{ + TTY_LOCK_OWNED(tp); + tp->t_refcnt++; +} /* - * Free a tty structure and its buffers. - * - * Locks: The tty_lock() is assumed to not be held at the time of - * the free; this functions destroys the mutex. + * Drops a reference count on a tty structure; if the reference count reaches + * zero, then also frees the structure and associated buffers.
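+ *
+ * A sketch of the intended hold/free pairing, using the helpers in this
+ * file (illustrative; the locking shown is the documented requirement):
+ *
+ *	tty_lock(tp);
+ *	ttyhold(tp);		<- take a reference before publishing tp
+ *	tty_unlock(tp);
+ *	...
+ *	ttyfree(tp);		<- drop it; ttydeallocate() runs at zero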
*/ void ttyfree(struct tty *tp) +{ + TTY_LOCK_NOTOWNED(tp); + + tty_lock(tp); + if (--tp->t_refcnt == 0) { + tty_unlock(tp); + ttydeallocate(tp); + } else if (tp->t_refcnt < 0) { + panic("%s: freeing free tty %p", __func__, tp); + } else + tty_unlock(tp); +} + +/* + * Deallocate a tty structure and its buffers. + * + * Locks: The tty_lock() is assumed to not be held at the time of + * the free; this function destroys the mutex. + */ +static void +ttydeallocate(struct tty *tp) { TTY_LOCK_NOTOWNED(tp); /* debug assert */ @@ -3097,4 +3137,3 @@ isctty_sp(proc_t p, struct tty *tp, struct session *sessp) return(sessp == tp->t_session && p->p_flag & P_CONTROLT); } - diff --git a/bsd/kern/tty_tty.c b/bsd/kern/tty_tty.c index ec7bee445..9cb8339bf 100644 --- a/bsd/kern/tty_tty.c +++ b/bsd/kern/tty_tty.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1997-2012 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,6 +76,7 @@ /* Forward declarations for cdevsw[] entry */ /* XXX we should consider making these static */ int cttyopen(dev_t dev, int flag, int mode, proc_t p); +int cttyclose(dev_t dev, int flag, int mode, proc_t p); int cttyread(dev_t dev, struct uio *uio, int flag); int cttywrite(dev_t dev, struct uio *uio, int flag); int cttyioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, proc_t p); @@ -85,31 +86,65 @@ static vnode_t cttyvp(proc_t p); int cttyopen(dev_t dev, int flag, __unused int mode, proc_t p) { - vnode_t ttyvp = cttyvp(p); - struct vfs_context context; + vnode_t ttyvp; int error; - if (ttyvp == NULL) - return (ENXIO); - - context.vc_thread = current_thread(); - context.vc_ucred = kauth_cred_proc_ref(p); - /* * A little hack--this device, used by many processes, - * happens to do an open on another device, which can - * cause unhappiness if the second-level open blocks indefinitely - * (as could be the case if the master side has hung up). Since - * we know that this driver doesn't care about the serializing - * opens and closes, we can drop the lock. + * does an open on another device, which can cause unhappiness + * if the second-level open blocks indefinitely (e.g. if the + * master side has hung up). This driver doesn't care + * about serializing opens and closes, so drop the lock. */ devsw_unlock(dev, S_IFCHR); - error = VNOP_OPEN(ttyvp, flag, &context); + + if ((ttyvp = cttyvp(p)) == NULL) { + error = ENXIO; + } else { + struct vfs_context context; + + context.vc_thread = current_thread(); + context.vc_ucred = kauth_cred_proc_ref(p); + + error = VNOP_OPEN(ttyvp, flag, &context); + + kauth_cred_unref(&context.vc_ucred); + vnode_put(ttyvp); + } + devsw_lock(dev, S_IFCHR); + return (error); +} - vnode_put(ttyvp); - kauth_cred_unref(&context.vc_ucred); +/* + * This driver is marked D_TRACKCLOSE and so gets a close + * for every open so that ttyvp->v_specinfo->si_count can be kept sane. + */ +int +cttyclose(dev_t dev, int flag, __unused int mode, proc_t p) +{ + vnode_t ttyvp; + int error; + + /* See locking commentary above. 
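+	 * The same drop/reacquire of the devsw lock around the second-level
+	 * VNOP applies here as in cttyopen() above.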
*/ + + devsw_unlock(dev, S_IFCHR); + + if ((ttyvp = cttyvp(p)) == NULL) { + error = ENXIO; + } else { + struct vfs_context context; + + context.vc_thread = current_thread(); + context.vc_ucred = kauth_cred_proc_ref(p); + + error = VNOP_CLOSE(ttyvp, flag, &context); + kauth_cred_unref(&context.vc_ucred); + vnode_put(ttyvp); + } + + devsw_lock(dev, S_IFCHR); return (error); } diff --git a/bsd/kern/ubc_subr.c b/bsd/kern/ubc_subr.c index c7661e41b..c89ea82ab 100644 --- a/bsd/kern/ubc_subr.c +++ b/bsd/kern/ubc_subr.c @@ -1984,9 +1984,20 @@ ubc_create_upl( uplflags |= UPL_FOR_PAGEOUT | UPL_CLEAN_IN_PLACE | UPL_COPYOUT_FROM | UPL_SET_INTERNAL | UPL_SET_LITE; } else { - uplflags |= UPL_RET_ONLY_ABSENT | UPL_NOBLOCK | + uplflags |= UPL_RET_ONLY_ABSENT | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE; + + /* + * if the requested size == PAGE_SIZE, we don't want to set + * the UPL_NOBLOCK since we may be trying to recover from a + * previous partial pagein I/O that occurred because we were low + * on memory and bailed early in order to honor the UPL_NOBLOCK... + * since we're only asking for a single page, we can block w/o fear + * of tying up pages while waiting for more to become available + */ + if (bufsize > PAGE_SIZE) + uplflags |= UPL_NOBLOCK; } } else { uplflags &= ~UPL_FOR_PAGEOUT; @@ -2344,6 +2355,16 @@ UBCINFOEXISTS(struct vnode * vp) } +void +ubc_upl_range_needed( + upl_t upl, + int index, + int count) +{ + upl_range_needed(upl, index, count); +} + + /* * CODE SIGNING */ @@ -2356,7 +2377,9 @@ static SInt32 cs_blob_count_peak = 0; int cs_validation = 1; +#ifndef SECURE_KERNEL SYSCTL_INT(_vm, OID_AUTO, cs_validation, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_validation, 0, "Do validate code signatures"); +#endif SYSCTL_INT(_vm, OID_AUTO, cs_blob_count, CTLFLAG_RD | CTLFLAG_LOCKED, (int *)(uintptr_t)&cs_blob_count, 0, "Current number of code signature blobs"); SYSCTL_INT(_vm, OID_AUTO, cs_blob_size, CTLFLAG_RD | CTLFLAG_LOCKED, (int *)(uintptr_t)&cs_blob_size, 0, "Current size of all code signature blobs"); SYSCTL_INT(_vm, OID_AUTO, cs_blob_count_peak, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_blob_count_peak, 0, "Peak number of code signature blobs"); @@ -2760,6 +2783,7 @@ unsigned long cs_validate_page_bad_hash = 0; boolean_t cs_validate_page( void *_blobs, + memory_object_t pager, memory_object_offset_t page_offset, const void *data, boolean_t *tainted) @@ -2868,8 +2892,8 @@ cs_validate_page( cs_validate_page_no_hash++; if (cs_debug > 1) { printf("CODE SIGNING: cs_validate_page: " - "off 0x%llx: no hash to validate !?\n", - page_offset); + "mobj %p off 0x%llx: no hash to validate !?\n", + pager, page_offset); } validated = FALSE; *tainted = FALSE; @@ -2893,10 +2917,10 @@ cs_validate_page( if (bcmp(expected_hash, actual_hash, SHA1_RESULTLEN) != 0) { if (cs_debug) { printf("CODE SIGNING: cs_validate_page: " - "off 0x%llx size 0x%lx: " + "mobj %p off 0x%llx size 0x%lx: " "actual [0x%x 0x%x 0x%x 0x%x 0x%x] != " "expected [0x%x 0x%x 0x%x 0x%x 0x%x]\n", - page_offset, size, + pager, page_offset, size, asha1[0], asha1[1], asha1[2], asha1[3], asha1[4], esha1[0], esha1[1], esha1[2], @@ -2907,8 +2931,9 @@ cs_validate_page( } else { if (cs_debug > 1) { printf("CODE SIGNING: cs_validate_page: " - "off 0x%llx size 0x%lx: SHA1 OK\n", - page_offset, size); + "mobj %p off 0x%llx size 0x%lx: " + "SHA1 OK\n", + pager, page_offset, size); } *tainted = FALSE; } diff --git a/bsd/kern/uipc_domain.c b/bsd/kern/uipc_domain.c index 1065d3683..a015bddae 100644 --- a/bsd/kern/uipc_domain.c +++ 
b/bsd/kern/uipc_domain.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2011 Apple Inc. All rights reserved. + * Copyright (c) 1998-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -65,6 +65,7 @@ #include #include #include +#include #include #include #include @@ -74,6 +75,8 @@ #include #include +#include + #include void init_domain(struct domain *dp) __attribute__((section("__TEXT, initcode"))); @@ -95,16 +98,12 @@ static void net_update_uptime(void); lck_grp_t *domain_proto_mtx_grp; lck_attr_t *domain_proto_mtx_attr; static lck_grp_attr_t *domain_proto_mtx_grp_attr; -lck_mtx_t *domain_proto_mtx; +decl_lck_mtx_data(static, domain_proto_mtx); extern int do_reclaim; extern sysctlfn net_sysctl; -static u_int64_t uptime; - -#ifdef INET6 -extern void ip6_fin(void); -#endif +static u_int64_t _net_uptime; static void init_proto(struct protosw *pr) @@ -153,10 +152,14 @@ init_domain(struct domain *dp) } /* Recompute for new protocol */ - if (max_linkhdr < 16) /* XXX - Sheesh; everything's ether? */ - max_linkhdr = 16; - if (dp->dom_protohdrlen > max_protohdr) - max_protohdr = dp->dom_protohdrlen; + if (_max_linkhdr < 16) /* XXX - Sheesh; everything's ether? */ + _max_linkhdr = 16; + _max_linkhdr = max_linkhdr; /* round it up */ + + if (dp->dom_protohdrlen > _max_protohdr) + _max_protohdr = dp->dom_protohdrlen; + _max_protohdr = max_protohdr; /* round it up */ + max_hdr = max_linkhdr + max_protohdr; max_datalen = MHLEN - max_hdr; } @@ -164,7 +167,7 @@ init_domain(struct domain *dp) void prepend_domain(struct domain *dp) { - lck_mtx_assert(domain_proto_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&domain_proto_mtx, LCK_MTX_ASSERT_OWNED); dp->dom_next = domains; domains = dp; } @@ -172,15 +175,17 @@ prepend_domain(struct domain *dp) void net_add_domain(struct domain *dp) { + int do_unlock; + kprintf("Adding domain %s (family %d)\n", dp->dom_name, dp->dom_family); /* First, link in the domain */ - lck_mtx_lock(domain_proto_mtx); + do_unlock = domain_proto_mtx_lock(); prepend_domain(dp); init_domain(dp); - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(do_unlock); } @@ -188,11 +193,12 @@ int net_del_domain(struct domain *dp) { register struct domain *dp1, *dp2; register int retval = 0; + int do_unlock; - lck_mtx_lock(domain_proto_mtx); + do_unlock = domain_proto_mtx_lock(); if (dp->dom_refs) { - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(do_unlock); return(EBUSY); } @@ -207,7 +213,7 @@ net_del_domain(struct domain *dp) domains = dp1->dom_next; } else retval = EPFNOSUPPORT; - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(do_unlock); return(retval); } @@ -294,6 +300,7 @@ void domaininit(void) { register struct domain *dp; + int do_unlock; /* * allocate lock group attribute and group for domain mutexes @@ -307,15 +314,13 @@ domaininit(void) */ domain_proto_mtx_attr = lck_attr_alloc_init(); - if ((domain_proto_mtx = lck_mtx_alloc_init(domain_proto_mtx_grp, domain_proto_mtx_attr)) == NULL) { - printf("domaininit: can't init domain mtx for domain list\n"); - return; /* we have a problem... 
*/ - } + lck_mtx_init(&domain_proto_mtx, domain_proto_mtx_grp, + domain_proto_mtx_attr); /* * Add all the static domains to the domains list */ - lck_mtx_lock(domain_proto_mtx); + do_unlock = domain_proto_mtx_lock(); prepend_domain(&localdomain); prepend_domain(&inetdomain); @@ -351,18 +356,10 @@ domaininit(void) for (dp = domains; dp; dp = dp->dom_next) init_domain(dp); - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(do_unlock); timeout(pfslowtimo, NULL, 1); } -void -domainfin(void) -{ -#ifdef INET6 - ip6_fin(); -#endif -} - static __inline__ struct domain * pffinddomain_locked(int pf) { @@ -383,20 +380,20 @@ pffindtype(int family, int type) { register struct domain *dp; register struct protosw *pr; + int do_unlock; - lck_mtx_assert(domain_proto_mtx, LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_lock(domain_proto_mtx); + do_unlock = domain_proto_mtx_lock(); dp = pffinddomain_locked(family); if (dp == NULL) { - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(do_unlock); return (NULL); } for (pr = dp->dom_protosw; pr; pr = pr->pr_next) if (pr->pr_type && pr->pr_type == type) { - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(do_unlock); return (pr); } - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(do_unlock); return (0); } @@ -404,22 +401,22 @@ struct domain * pffinddomain(int pf) { struct domain *dp; + int do_unlock; - lck_mtx_assert(domain_proto_mtx, LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_lock(domain_proto_mtx); + do_unlock = domain_proto_mtx_lock(); dp = pffinddomain_locked(pf); - lck_mtx_unlock(domain_proto_mtx); - return(dp); - } + domain_proto_mtx_unlock(do_unlock); + return(dp); +} struct protosw * pffindproto(int family, int protocol, int type) { register struct protosw *pr; - lck_mtx_assert(domain_proto_mtx, LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_lock(domain_proto_mtx); + int do_unlock; + do_unlock = domain_proto_mtx_lock(); pr = pffindproto_locked(family, protocol, type); - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(do_unlock); return (pr); } @@ -471,13 +468,13 @@ struct protosw * pffindprotonotype(int family, int protocol) { register struct protosw *pr; + int do_unlock; if (protocol == 0) { return (NULL); } - lck_mtx_assert(domain_proto_mtx, LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_lock(domain_proto_mtx); + do_unlock = domain_proto_mtx_lock(); pr = pffindprotonotype_locked(family, protocol, 0); - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(do_unlock); return (pr); } @@ -488,6 +485,7 @@ net_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, register struct domain *dp; register struct protosw *pr; int family, protocol, error; + int do_unlock; /* * All sysctl names at this level are nonterminal; @@ -501,21 +499,21 @@ net_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, if (family == 0) return (0); - lck_mtx_lock(domain_proto_mtx); + do_unlock = domain_proto_mtx_lock(); for (dp = domains; dp; dp = dp->dom_next) if (dp->dom_family == family) goto found; - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(do_unlock); return (ENOPROTOOPT); found: for (pr = dp->dom_protosw; pr; pr = pr->pr_next) if (pr->pr_protocol == protocol && pr->pr_sysctl) { error = (*pr->pr_sysctl)(name + 2, namelen - 2, (void *)(uintptr_t)oldp, oldlenp, (void *)(uintptr_t)newp, newlen); - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(do_unlock); return (error); } - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(do_unlock); return (ENOPROTOOPT); } @@ -530,16 +528,17 @@ 
pfctlinput2(int cmd, struct sockaddr *sa, void *ctlparam) { struct domain *dp; struct protosw *pr; + int do_unlock; if (!sa) return; - lck_mtx_lock(domain_proto_mtx); + do_unlock = domain_proto_mtx_lock(); for (dp = domains; dp; dp = dp->dom_next) for (pr = dp->dom_protosw; pr; pr = pr->pr_next) if (pr->pr_ctlinput) (*pr->pr_ctlinput)(cmd, sa, ctlparam); - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(do_unlock); } void @@ -547,6 +546,7 @@ pfslowtimo(__unused void *arg) { register struct domain *dp; register struct protosw *pr; + int do_unlock; /* * Update coarse-grained networking timestamp (in sec.); the idea @@ -555,7 +555,7 @@ pfslowtimo(__unused void *arg) */ net_update_uptime(); - lck_mtx_lock(domain_proto_mtx); + do_unlock = domain_proto_mtx_lock(); for (dp = domains; dp; dp = dp->dom_next) for (pr = dp->dom_protosw; pr; pr = pr->pr_next) { if (pr->pr_slowtimo) @@ -565,7 +565,7 @@ pfslowtimo(__unused void *arg) (*pr->pr_drain)(); } do_reclaim = 0; - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(do_unlock); timeout(pfslowtimo, NULL, hz/PR_SLOWHZ); } @@ -575,7 +575,7 @@ net_update_uptime(void) struct timeval tv; microuptime(&tv); - uptime = tv.tv_sec; + _net_uptime = tv.tv_sec; } /* @@ -587,8 +587,30 @@ u_int64_t net_uptime(void) { /* If we get here before pfslowtimo() fires for the first time */ - if (uptime == 0) + if (_net_uptime == 0) net_update_uptime(); - return (uptime); + return (_net_uptime); +} + +int +domain_proto_mtx_lock(void) +{ + int held = net_thread_check_lock(NET_THREAD_HELD_DOMAIN); + if (!held) { + lck_mtx_lock(&domain_proto_mtx); + net_thread_set_lock(NET_THREAD_HELD_DOMAIN); + } + lck_mtx_assert(&domain_proto_mtx, LCK_MTX_ASSERT_OWNED); + return !held; +} + +void +domain_proto_mtx_unlock(int do_unlock) +{ + if (do_unlock) { + net_thread_unset_lock(NET_THREAD_HELD_DOMAIN); + lck_mtx_unlock(&domain_proto_mtx); + lck_mtx_assert(&domain_proto_mtx, LCK_MTX_ASSERT_NOTOWNED); + } } diff --git a/bsd/kern/uipc_mbuf.c b/bsd/kern/uipc_mbuf.c index d8d3ce857..0112f0c02 100644 --- a/bsd/kern/uipc_mbuf.c +++ b/bsd/kern/uipc_mbuf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2011 Apple Inc. All rights reserved. + * Copyright (c) 1998-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -303,7 +303,8 @@ extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); extern vm_map_t mb_map; /* special map */ /* Global lock */ -static lck_mtx_t *mbuf_mlock; +decl_lck_mtx_data(static, mbuf_mlock_data); +static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data; static lck_attr_t *mbuf_mlock_attr; static lck_grp_t *mbuf_mlock_grp; static lck_grp_attr_t *mbuf_mlock_grp_attr; @@ -449,14 +450,15 @@ int njcl; /* # of clusters for jumbo sizes */ int njclbytes; /* size of a jumbo cluster */ union mbigcluster *mbutl; /* first mapped cluster address */ union mbigcluster *embutl; /* ending virtual address of mclusters */ -int max_linkhdr; /* largest link-level header */ -int max_protohdr; /* largest protocol header */ +int _max_linkhdr; /* largest link-level header */ +int _max_protohdr; /* largest protocol header */ int max_hdr; /* largest link+protocol header */ int max_datalen; /* MHLEN - max_hdr */ static boolean_t mclverify; /* debug: pattern-checking */ static boolean_t mcltrace; /* debug: stack tracing */ static boolean_t mclfindleak; /* debug: leak detection */ +static boolean_t mclexpleak; /* debug: expose leak info to user space */ /* mbuf leak detection variables */ static struct mleak_table mleak_table; @@ -495,6 +497,22 @@ struct mtrace { */ #define MLEAK_NUM_TRACES 5 +#define MB_LEAK_SPACING_64 " " +#define MB_LEAK_SPACING_32 " " + + +#define MB_LEAK_HDR_32 "\n\ + trace [1] trace [2] trace [3] trace [4] trace [5] \n\ + ---------- ---------- ---------- ---------- ---------- \n\ +" + +#define MB_LEAK_HDR_64 "\n\ + trace [1] trace [2] trace [3] \ + trace [4] trace [5] \n\ + ------------------ ------------------ ------------------ \ + ------------------ ------------------ \n\ +" + static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM; static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM; @@ -504,7 +522,8 @@ static struct mtrace *mleak_traces; static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES]; /* Lock to protect mleak tables from concurrent modification */ -static lck_mtx_t *mleak_lock; +decl_lck_mtx_data(static, mleak_lock_data); +static lck_mtx_t *mleak_lock = &mleak_lock_data; static lck_attr_t *mleak_lock_attr; static lck_grp_t *mleak_lock_grp; static lck_grp_attr_t *mleak_lock_grp_attr; @@ -588,7 +607,9 @@ static int mb_waiters; /* number of waiters */ #define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */ static struct timeval mb_wdtstart; /* watchdog start timestamp */ -static char mbuf_dump_buf[256]; +static char *mbuf_dump_buf; + +#define MBUF_DUMP_BUF_SIZE 2048 /* * mbuf watchdog is enabled by default on embedded platforms. 
It is @@ -656,6 +677,8 @@ static void mleak_activate(void); static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t); static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int); static void mleak_free(mcache_obj_t *); +static void mleak_sort_traces(void); +static void mleak_update_stats(void); static mcl_slab_t *slab_get(void *); static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t, @@ -769,8 +792,9 @@ static struct mbuf *m_split0(struct mbuf *, int, int, int); (m)->m_pkthdr.vlan_tag = 0; \ (m)->m_pkthdr.socket_id = 0; \ (m)->m_pkthdr.vt_nrecs = 0; \ + (m)->m_pkthdr.aux_flags = 0; \ m_tag_init(m); \ - m_prio_init(m); \ + m_service_class_init(m); \ } \ } @@ -856,7 +880,7 @@ static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */ ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n])) #define MTYPES_CPU(p) \ - ((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number()))) + ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number()))) #define mtype_stat_add(type, n) { \ if ((unsigned)(type) < MT_MAX) { \ @@ -1032,42 +1056,14 @@ static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS { #pragma unused(oidp, arg1, arg2) - mleak_trace_stat_t *mltr; int i; /* Ensure leak tracing turned on */ - if (!mclfindleak) + if (!mclfindleak || !mclexpleak) return (ENXIO); - VERIFY(mleak_stat != NULL); -#ifdef __LP64__ - VERIFY(mleak_stat->ml_isaddr64); -#else - VERIFY(!mleak_stat->ml_isaddr64); -#endif /* !__LP64__ */ - VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES); - lck_mtx_lock(mleak_lock); - mltr = &mleak_stat->ml_trace[0]; - bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES); - for (i = 0; i < MLEAK_NUM_TRACES; i++) { - int j; - - if (mleak_top_trace[i] == NULL || - mleak_top_trace[i]->allocs == 0) - continue; - - mltr->mltr_collisions = mleak_top_trace[i]->collisions; - mltr->mltr_hitcount = mleak_top_trace[i]->hitcount; - mltr->mltr_allocs = mleak_top_trace[i]->allocs; - mltr->mltr_depth = mleak_top_trace[i]->depth; - - VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH); - for (j = 0; j < mltr->mltr_depth; j++) - mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j]; - - mltr++; - } + mleak_update_stats(); i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES)); lck_mtx_unlock(mleak_lock); @@ -1081,7 +1077,7 @@ mleak_table_sysctl SYSCTL_HANDLER_ARGS int i = 0; /* Ensure leak tracing turned on */ - if (!mclfindleak) + if (!mclfindleak || !mclexpleak) return (ENXIO); lck_mtx_lock(mleak_lock); @@ -1264,7 +1260,7 @@ typedef struct ncl_tbl { /* Non-server */ static ncl_tbl_t ncl_table[] = { - { (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ }, + { (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ }, { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (96 << MBSHIFT) /* 96 MB */ }, { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ }, { 0, 0 } @@ -1272,7 +1268,7 @@ static ncl_tbl_t ncl_table[] = { /* Server */ static ncl_tbl_t ncl_table_srv[] = { - { (1ULL << GBSHIFT) /* 1 GB */, (96 << MBSHIFT) /* 96 MB */ }, + { (1ULL << GBSHIFT) /* 1 GB */, (96 << MBSHIFT) /* 96 MB */ }, { (1ULL << (GBSHIFT + 2)) /* 4 GB */, (128 << MBSHIFT) /* 128 MB */ }, { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (160 << MBSHIFT) /* 160 MB */ }, { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ }, @@ -1318,6 +1314,74 @@ mbinit(void) void *buf; thread_t thread = THREAD_NULL; + /* + * These MBUF_ values must be equal to their private counterparts. 
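+ *
+ * (_CASSERT() is a compile-time assertion; if MBUF_EXT ever stopped
+ * matching M_EXT, for example, the build would fail rather than flag
+ * tests silently misbehaving at run time.)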
+ */ + _CASSERT(MBUF_EXT == M_EXT); + _CASSERT(MBUF_PKTHDR == M_PKTHDR); + _CASSERT(MBUF_EOR == M_EOR); + _CASSERT(MBUF_LOOP == M_LOOP); + _CASSERT(MBUF_BCAST == M_BCAST); + _CASSERT(MBUF_MCAST == M_MCAST); + _CASSERT(MBUF_FRAG == M_FRAG); + _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG); + _CASSERT(MBUF_LASTFRAG == M_LASTFRAG); + _CASSERT(MBUF_PROMISC == M_PROMISC); + _CASSERT(MBUF_HASFCS == M_HASFCS); + + _CASSERT(MBUF_TYPE_FREE == MT_FREE); + _CASSERT(MBUF_TYPE_DATA == MT_DATA); + _CASSERT(MBUF_TYPE_HEADER == MT_HEADER); + _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET); + _CASSERT(MBUF_TYPE_PCB == MT_PCB); + _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE); + _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE); + _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE); + _CASSERT(MBUF_TYPE_SONAME == MT_SONAME); + _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS); + _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE); + _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS); + _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR); + _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL); + _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA); + + _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4); + _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6); + _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_TCP_SUM16); + _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16); + _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP); + _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP); + _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP); + _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6); + _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6); + _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED); + _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID); + _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID); + _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR); + + _CASSERT(MBUF_WAITOK == M_WAIT); + _CASSERT(MBUF_DONTWAIT == M_DONTWAIT); + _CASSERT(MBUF_COPYALL == M_COPYALL); + + _CASSERT(MBUF_PKTAUXF_INET_RESOLVE_RTR == MAUXF_INET_RESOLVE_RTR); + _CASSERT(MBUF_PKTAUXF_INET6_RESOLVE_RTR == MAUXF_INET6_RESOLVE_RTR); + + _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK); + _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK); + _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE); + _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE); + _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE); + _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI); + _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI); + _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI); + _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO); + _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO); + + _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK); + _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE); + _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI); + _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO); + if (nmbclusters == 0) nmbclusters = NMBCLUSTERS; @@ -1331,7 +1395,7 @@ mbinit(void) mbuf_mlock_grp_attr = lck_grp_attr_alloc_init(); mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr); mbuf_mlock_attr = lck_attr_alloc_init(); - mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr); + lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr); /* * Allocate cluster slabs table: @@ -1369,13 +1433,14 @@ mbinit(void) mclverify = (mbuf_debug & MCF_VERIFY); mcltrace = (mbuf_debug & MCF_TRACE); mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG); + mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG); /* Enable mbuf leak logging, with a lock to protect the tables */ mleak_lock_grp_attr = lck_grp_attr_alloc_init(); mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr); mleak_lock_attr = lck_attr_alloc_init(); - mleak_lock = lck_mtx_alloc_init(mleak_lock_grp, 
mleak_lock_attr); + lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr); mleak_activate(); @@ -1390,7 +1455,7 @@ mbinit(void) bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t)); embutl = (union mbigcluster *) - ((unsigned char *)mbutl + (nmbclusters * MCLBYTES)); + ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES))); VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0); /* Prime up the freelist */ @@ -1501,6 +1566,10 @@ mbinit(void) } } + /* allocate space for mbuf_dump_buf */ + MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK); + VERIFY(mbuf_dump_buf != NULL); + printf("mbinit: done [%d MB total pool size, (%d/%d) split]\n", (nmbclusters << MCLSHIFT) >> MBSHIFT, (nclusters << MCLSHIFT) >> MBSHIFT, @@ -2177,7 +2246,7 @@ cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged) MEXT_REF(m) = 0; MEXT_FLAGS(m) = 0; - rfa = (mcache_obj_t *)MEXT_RFA(m); + rfa = (mcache_obj_t *)(void *)MEXT_RFA(m); rfa->obj_next = ref_list; ref_list = rfa; MEXT_RFA(m) = NULL; @@ -2331,7 +2400,7 @@ mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, rfa = (struct ext_ref *)ref_list; ref_list = ref_list->obj_next; - ((mcache_obj_t *)rfa)->obj_next = NULL; + ((mcache_obj_t *)(void *)rfa)->obj_next = NULL; /* * If auditing is enabled, construct the shadow mbuf @@ -3514,7 +3583,8 @@ m_copy_pkthdr(struct mbuf *to, struct mbuf *from) m_tag_delete_chain(to, NULL); to->m_pkthdr = from->m_pkthdr; /* especially tags */ m_tag_init(from); /* purge tags from src */ - m_prio_init(from); /* reset priority from src */ + m_service_class_init(from); /* reset svc class from src */ + from->m_pkthdr.aux_flags = 0; /* clear aux flags from src */ to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; @@ -3538,6 +3608,14 @@ m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how) return (m_tag_copy_chain(to, from, how)); } +void +m_copy_pftag(struct mbuf *to, struct mbuf *from) +{ + to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag; + to->m_pkthdr.pf_mtag.pftag_hdr = NULL; + to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6); +} + /* * Return a list of mbuf hdrs that point to clusters. Try for num_needed; * if wantall is not set, return whatever number were available. Set up the @@ -3645,6 +3723,12 @@ m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, return (NULL); } + if (pnum > *num_needed) { + printf("%s: File a radar related to . 
\ needed = %u, pnum = %u, num_needed = %u \n", + __func__, needed, pnum, *num_needed); + } + *num_needed = pnum; return (top); } @@ -4086,7 +4170,7 @@ m_freem_list(struct mbuf *m) if (!(m->m_flags & M_EXT)) goto simple_free; - o = (mcache_obj_t *)m->m_ext.ext_buf; + o = (mcache_obj_t *)(void *)m->m_ext.ext_buf; refcnt = m_decref(m); composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); if (refcnt == 0 && !composite) { @@ -4104,7 +4188,7 @@ m_freem_list(struct mbuf *m) m->m_ext.ext_size, m->m_ext.ext_arg); } - rfa = (mcache_obj_t *)MEXT_RFA(m); + rfa = (mcache_obj_t *)(void *)MEXT_RFA(m); rfa->obj_next = ref_list; ref_list = rfa; MEXT_RFA(m) = NULL; @@ -5597,6 +5681,123 @@ m_last(struct mbuf *m) return (m); } +unsigned int +m_fixhdr(struct mbuf *m0) +{ + u_int len; + + len = m_length2(m0, NULL); + m0->m_pkthdr.len = len; + return (len); +} + +unsigned int +m_length2(struct mbuf *m0, struct mbuf **last) +{ + struct mbuf *m; + u_int len; + + len = 0; + for (m = m0; m != NULL; m = m->m_next) { + len += m->m_len; + if (m->m_next == NULL) + break; + } + if (last != NULL) + *last = m; + return (len); +} + +/* + * Defragment a mbuf chain, returning the shortest possible chain of mbufs + * and clusters. If allocation fails and this cannot be completed, NULL will + * be returned, but the passed in chain will be unchanged. Upon success, + * the original chain will be freed, and the new chain will be returned. + * + * If a non-packet header is passed in, the original mbuf (chain?) will + * be returned unharmed. + * + * If offset is specified, the first mbuf in the chain will have a leading + * space of the amount stated by the "off" parameter. + * + * This routine requires that the m_pkthdr.header field of the original + * mbuf chain is cleared by the caller. + */ +struct mbuf * +m_defrag_offset(struct mbuf *m0, u_int32_t off, int how) +{ + struct mbuf *m_new = NULL, *m_final = NULL; + int progress = 0, length, pktlen; + + if (!(m0->m_flags & M_PKTHDR)) + return (m0); + + VERIFY(off < MHLEN); + m_fixhdr(m0); /* Needed sanity check */ + + pktlen = m0->m_pkthdr.len + off; + if (pktlen > MHLEN) + m_final = m_getcl(how, MT_DATA, M_PKTHDR); + else + m_final = m_gethdr(how, MT_DATA); + + if (m_final == NULL) + goto nospace; + + if (off > 0) { + pktlen -= off; + m_final->m_len -= off; + m_final->m_data += off; + } + + /* + * Caller must have handled the contents pointed to by this + * pointer before coming here, as otherwise it will point to + * the original mbuf which will get freed upon success.
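+ *
+ * A typical caller therefore does something like the following
+ * (a sketch, not part of this change; error policy is the caller's):
+ *
+ *	m0->m_pkthdr.header = NULL;
+ *	if ((m0 = m_defrag(m0, M_DONTWAIT)) == NULL)
+ *		return (ENOBUFS);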
+ */ + VERIFY(m0->m_pkthdr.header == NULL); + + if (m_dup_pkthdr(m_final, m0, how) == 0) + goto nospace; + + m_new = m_final; + + while (progress < pktlen) { + length = pktlen - progress; + if (length > MCLBYTES) + length = MCLBYTES; + + if (m_new == NULL) { + if (length > MLEN) + m_new = m_getcl(how, MT_DATA, 0); + else + m_new = m_get(how, MT_DATA); + if (m_new == NULL) + goto nospace; + } + + m_copydata(m0, progress, length, mtod(m_new, caddr_t)); + progress += length; + m_new->m_len = length; + if (m_new != m_final) + m_cat(m_final, m_new); + m_new = NULL; + } + m_freem(m0); + m0 = m_final; + return (m0); +nospace: + if (m_final) + m_freem(m_final); + return (NULL); +} + +struct mbuf * +m_defrag(struct mbuf *m0, int how) +{ + return (m_defrag_offset(m0, 0, how)); +} + void m_mchtype(struct mbuf *m, int t) { @@ -6315,7 +6516,6 @@ mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num) struct mallocation *allocation; struct mtrace *trace; uint32_t trace_index; - int i; /* Quit if someone else modifying the tables */ if (!lck_mtx_try_lock_spin(mleak_lock)) { @@ -6389,22 +6589,6 @@ mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num) mleak_table.alloc_recorded++; mleak_table.outstanding_allocs++; - /* keep a log of the last 5 traces to be top trace, in order */ - for (i = 0; i < MLEAK_NUM_TRACES; i++) { - if (mleak_top_trace[i] == NULL || - mleak_top_trace[i]->allocs <= trace->allocs) { - if (mleak_top_trace[i] != trace) { - int j = MLEAK_NUM_TRACES; - while (--j > i) { - mleak_top_trace[j] = - mleak_top_trace[j - 1]; - } - mleak_top_trace[i] = trace; - } - break; - } - } - lck_mtx_unlock(mleak_lock); return (TRUE); } @@ -6438,6 +6622,90 @@ mleak_free(mcache_obj_t *addr) } } +static void +mleak_sort_traces() +{ + int i, j, k; + struct mtrace *swap; + + for(i = 0; i < MLEAK_NUM_TRACES; i++) + mleak_top_trace[i] = NULL; + + for(i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++) + { + if (mleak_traces[i].allocs <= 0) + continue; + + mleak_top_trace[j] = &mleak_traces[i]; + for (k = j; k > 0; k--) { + if (mleak_top_trace[k]->allocs <= + mleak_top_trace[k-1]->allocs) + break; + + swap = mleak_top_trace[k-1]; + mleak_top_trace[k-1] = mleak_top_trace[k]; + mleak_top_trace[k] = swap; + } + j++; + } + + j--; + for(; i < mleak_trace_buckets; i++) { + if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs) + continue; + + mleak_top_trace[j] = &mleak_traces[i]; + + for (k = j; k > 0; k--) { + if (mleak_top_trace[k]->allocs <= + mleak_top_trace[k-1]->allocs) + break; + + swap = mleak_top_trace[k-1]; + mleak_top_trace[k-1] = mleak_top_trace[k]; + mleak_top_trace[k] = swap; + } + } +} + +static void +mleak_update_stats() +{ + mleak_trace_stat_t *mltr; + int i; + + VERIFY(mleak_stat != NULL); +#ifdef __LP64__ + VERIFY(mleak_stat->ml_isaddr64); +#else + VERIFY(!mleak_stat->ml_isaddr64); +#endif /* !__LP64__ */ + VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES); + + mleak_sort_traces(); + + mltr = &mleak_stat->ml_trace[0]; + bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES); + for (i = 0; i < MLEAK_NUM_TRACES; i++) { + int j; + + if (mleak_top_trace[i] == NULL || + mleak_top_trace[i]->allocs == 0) + continue; + + mltr->mltr_collisions = mleak_top_trace[i]->collisions; + mltr->mltr_hitcount = mleak_top_trace[i]->hitcount; + mltr->mltr_allocs = mleak_top_trace[i]->allocs; + mltr->mltr_depth = mleak_top_trace[i]->depth; + + VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH); + for (j = 0; j < mltr->mltr_depth; j++) + mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j]; + + 
mltr++; + } +} + static struct mbtypes { int mt_type; const char *mt_name; @@ -6478,8 +6746,9 @@ mbuf_dump(void) uint8_t seen[256]; struct mbtypes *mp; mb_class_stat_t *sp; + mleak_trace_stat_t *mltr; char *c = mbuf_dump_buf; - int i, k, clen = sizeof (mbuf_dump_buf); + int i, k, clen = MBUF_DUMP_BUF_SIZE; mbuf_dump_buf[0] = '\0'; @@ -6577,6 +6846,77 @@ mbuf_dump(void) "in use)\n", totmem / 1024, totpct); MBUF_DUMP_BUF_CHK(); + /* mbuf leak detection statistics */ + mleak_update_stats(); + + k = snprintf(c, clen, "\nmbuf leak detection table:\n"); + MBUF_DUMP_BUF_CHK(); + k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n", + mleak_table.mleak_capture / mleak_table.mleak_sample_factor, + mleak_table.mleak_sample_factor); + MBUF_DUMP_BUF_CHK(); + k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n", + mleak_table.outstanding_allocs); + MBUF_DUMP_BUF_CHK(); + k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n", + mleak_table.alloc_recorded, mleak_table.trace_recorded); + MBUF_DUMP_BUF_CHK(); + k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n", + mleak_table.alloc_collisions, mleak_table.trace_collisions); + MBUF_DUMP_BUF_CHK(); + k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n", + mleak_table.alloc_overwrites, mleak_table.trace_overwrites); + MBUF_DUMP_BUF_CHK(); + k = snprintf(c, clen, "\tlock conflicts: %llu\n\n", + mleak_table.total_conflicts); + MBUF_DUMP_BUF_CHK(); + + k = snprintf(c, clen, "top %d outstanding traces:\n", + mleak_stat->ml_cnt); + MBUF_DUMP_BUF_CHK(); + for (i = 0; i < mleak_stat->ml_cnt; i++) { + mltr = &mleak_stat->ml_trace[i]; + k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), " + "%llu hit(s), %llu collision(s)\n", (i + 1), + mltr->mltr_allocs, mltr->mltr_hitcount, + mltr->mltr_collisions); + MBUF_DUMP_BUF_CHK(); + } + + if (mleak_stat->ml_isaddr64) + k = snprintf(c, clen, MB_LEAK_HDR_64); + else + k = snprintf(c, clen, MB_LEAK_HDR_32); + MBUF_DUMP_BUF_CHK(); + + for (i = 0; i < MLEAK_STACK_DEPTH; i++) { + int j; + k = snprintf(c, clen, "%2d: ", (i + 1)); + MBUF_DUMP_BUF_CHK(); + for (j = 0; j < mleak_stat->ml_cnt; j++) { + mltr = &mleak_stat->ml_trace[j]; + if (i < mltr->mltr_depth) { + if (mleak_stat->ml_isaddr64) { + k = snprintf(c, clen, "0x%0llx ", + mltr->mltr_addr[i]); + } else { + k = snprintf(c, clen, + "0x%08x ", + (u_int32_t)mltr->mltr_addr[i]); + } + } else { + if (mleak_stat->ml_isaddr64) + k = snprintf(c, clen, + MB_LEAK_SPACING_64); + else + k = snprintf(c, clen, + MB_LEAK_SPACING_32); + } + MBUF_DUMP_BUF_CHK(); + } + k = snprintf(c, clen, "\n"); + MBUF_DUMP_BUF_CHK(); + } done: return (mbuf_dump_buf); } diff --git a/bsd/kern/uipc_mbuf2.c b/bsd/kern/uipc_mbuf2.c index 386238460..565d005f9 100644 --- a/bsd/kern/uipc_mbuf2.c +++ b/bsd/kern/uipc_mbuf2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -399,13 +399,15 @@ m_tag_create(u_int32_t id, u_int16_t type, int len, int wait, struct mbuf *buf) VERIFY(p->m_tag_cookie == M_TAG_VALID_PATTERN); struct mbuf *m = m_dtom(p); - struct m_taghdr *hdr = (struct m_taghdr *)m->m_data; + struct m_taghdr *hdr = (struct m_taghdr *)(void *)m->m_data; + VERIFY(IS_P2ALIGNED(hdr + 1, sizeof (u_int64_t))); VERIFY(m->m_flags & M_TAGHDR && !(m->m_flags & M_EXT)); /* The mbuf can store this m_tag */ if (M_TAG_ALIGN(len) <= MLEN - m->m_len) { - t = (struct m_tag *)(m->m_data + m->m_len); + t = (struct m_tag *)(void *)(m->m_data + m->m_len); + VERIFY(IS_P2ALIGNED(t, sizeof (u_int64_t))); hdr->refcnt++; m->m_len += M_TAG_ALIGN(len); VERIFY(m->m_len <= MLEN); @@ -445,14 +447,16 @@ m_tag_alloc(u_int32_t id, u_int16_t type, int len, int wait) m->m_flags |= M_TAGHDR; - hdr = (struct m_taghdr *)m->m_data; + hdr = (struct m_taghdr *)(void *)m->m_data; + VERIFY(IS_P2ALIGNED(hdr + 1, sizeof (u_int64_t))); hdr->refcnt = 1; m->m_len += sizeof (struct m_taghdr); - t = (struct m_tag *)(m->m_data + m->m_len); + t = (struct m_tag *)(void *)(m->m_data + m->m_len); + VERIFY(IS_P2ALIGNED(t, sizeof (u_int64_t))); m->m_len += M_TAG_ALIGN(len); VERIFY(m->m_len <= MLEN); } else if (len + sizeof (struct m_tag) <= MCLBYTES) { - t = (struct m_tag *)m_mclalloc(wait); + t = (struct m_tag *)(void *)m_mclalloc(wait); } else { t = NULL; } @@ -460,6 +464,7 @@ m_tag_alloc(u_int32_t id, u_int16_t type, int len, int wait) if (t == NULL) return (NULL); + VERIFY(IS_P2ALIGNED(t, sizeof (u_int64_t))); t->m_tag_cookie = M_TAG_VALID_PATTERN; t->m_tag_type = type; t->m_tag_len = len; @@ -489,10 +494,15 @@ m_tag_free(struct m_tag *t) #endif /* INET6 */ if (t == NULL) return; + + VERIFY(t->m_tag_cookie == M_TAG_VALID_PATTERN); + if (M_TAG_ALIGN(t->m_tag_len) + sizeof (struct m_taghdr) <= MLEN) { struct mbuf * m = m_dtom(t); VERIFY(m->m_flags & M_TAGHDR); - struct m_taghdr *hdr = (struct m_taghdr *)m->m_data; + struct m_taghdr *hdr = (struct m_taghdr *)(void *)m->m_data; + + VERIFY(IS_P2ALIGNED(hdr + 1, sizeof (u_int64_t))); /* No other tags in this mbuf */ if(--hdr->refcnt == 0) { @@ -665,9 +675,8 @@ m_tag_init(struct mbuf *m) VERIFY(m != NULL); SLIST_INIT(&m->m_pkthdr.tags); -#if PF_PKTHDR bzero(&m->m_pkthdr.pf_mtag, sizeof (m->m_pkthdr.pf_mtag)); -#endif + bzero(&m->m_pkthdr.tcp_mtag, sizeof (m->m_pkthdr.tcp_mtag)); } /* Get first tag in chain. 
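 * A caller typically walks the chain with, e.g.:
 *	for (t = m_tag_first(m); t != NULL; t = m_tag_next(m, t))
 *		(inspect t->m_tag_id and t->m_tag_type here)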
*/ @@ -690,9 +699,143 @@ m_tag_next(struct mbuf *m, struct m_tag *t) return (SLIST_NEXT(t, m_tag_link)); } +int +m_set_traffic_class(struct mbuf *m, mbuf_traffic_class_t tc) +{ + u_int32_t val = MBUF_TC2SCVAL(tc); /* just the val portion */ + + return (m_set_service_class(m, m_service_class_from_val(val))); +} + +mbuf_traffic_class_t +m_get_traffic_class(struct mbuf *m) +{ + return (MBUF_SC2TC(m_get_service_class(m))); +} + void -m_prio_init(struct mbuf *m) +m_service_class_init(struct mbuf *m) { if (m->m_flags & M_PKTHDR) - m->m_pkthdr.prio = MBUF_TC_BE; + (void) m_set_service_class(m, MBUF_SC_BE); +} + +int +m_set_service_class(struct mbuf *m, mbuf_svc_class_t sc) +{ + int error = 0; + + VERIFY(m->m_flags & M_PKTHDR); + + if (MBUF_VALID_SC(sc)) + m->m_pkthdr.svc = sc; + else + error = EINVAL; + + return (error); +} + +mbuf_svc_class_t +m_get_service_class(struct mbuf *m) +{ + mbuf_svc_class_t sc; + + VERIFY(m->m_flags & M_PKTHDR); + + if (MBUF_VALID_SC(m->m_pkthdr.svc)) + sc = m->m_pkthdr.svc; + else + sc = MBUF_SC_BE; + + return (sc); +} + +mbuf_svc_class_t +m_service_class_from_idx(u_int32_t i) +{ + mbuf_svc_class_t sc = MBUF_SC_BE; + + switch (i) { + case SCIDX_BK_SYS: + return (MBUF_SC_BK_SYS); + + case SCIDX_BK: + return (MBUF_SC_BK); + + case SCIDX_BE: + return (MBUF_SC_BE); + + case SCIDX_RD: + return (MBUF_SC_RD); + + case SCIDX_OAM: + return (MBUF_SC_OAM); + + case SCIDX_AV: + return (MBUF_SC_AV); + + case SCIDX_RV: + return (MBUF_SC_RV); + + case SCIDX_VI: + return (MBUF_SC_VI); + + case SCIDX_VO: + return (MBUF_SC_VO); + + case SCIDX_CTL: + return (MBUF_SC_CTL); + + default: + break; + } + + VERIFY(0); + /* NOTREACHED */ + return (sc); +} + +mbuf_svc_class_t +m_service_class_from_val(u_int32_t v) +{ + mbuf_svc_class_t sc = MBUF_SC_BE; + + switch (v) { + case SCVAL_BK_SYS: + return (MBUF_SC_BK_SYS); + + case SCVAL_BK: + return (MBUF_SC_BK); + + case SCVAL_BE: + return (MBUF_SC_BE); + + case SCVAL_RD: + return (MBUF_SC_RD); + + case SCVAL_OAM: + return (MBUF_SC_OAM); + + case SCVAL_AV: + return (MBUF_SC_AV); + + case SCVAL_RV: + return (MBUF_SC_RV); + + case SCVAL_VI: + return (MBUF_SC_VI); + + case SCVAL_VO: + return (MBUF_SC_VO); + + case SCVAL_CTL: + return (MBUF_SC_CTL); + + default: + break; + } + + VERIFY(0); + /* NOTREACHED */ + return (sc); } diff --git a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index b496895f6..af4b4fbe1 100644 --- a/bsd/kern/uipc_socket.c +++ b/bsd/kern/uipc_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2011 Apple Inc. All rights reserved. + * Copyright (c) 1998-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -93,7 +93,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -105,7 +107,7 @@ #include #include #include - +#include #include #if CONFIG_MACF @@ -113,7 +115,6 @@ #include #endif /* MAC */ -extern int in6_init_done; int so_cache_hw = 0; int so_cache_timeouts = 0; @@ -137,6 +138,8 @@ static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); +static void filt_sockdetach(struct knote *kn); +static int filt_sockev(struct knote *kn, long hint); static int sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p); @@ -154,6 +157,11 @@ static struct filterops sowrite_filtops = { .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; +static struct filterops sock_filtops = { + .f_isfd = 1, + .f_detach = filt_sockdetach, + .f_event = filt_sockev, +}; #define EVEN_MORE_LOCKING_DEBUG 0 int socket_debug = 0; @@ -214,6 +222,10 @@ int sodefunctlog = 0; SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED, &sodefunctlog, 0, ""); +int sothrottlelog = 0; +SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED, + &sothrottlelog, 0, ""); + /* * Socket operation routines. * These routines are called by the routines in @@ -225,6 +237,7 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED, /* sys_generic.c */ extern void postevent(struct socket *, struct sockbuf *, int); extern void evsofree(struct socket *); +extern int tcp_notsent_lowat_check(struct socket *so); /* TODO: these should be in header file */ extern int get_inpcb_str_size(void); @@ -234,10 +247,6 @@ extern struct protosw *pffindprotonotype(int, int); extern int soclose_locked(struct socket *); extern int soo_kqfilter(struct fileproc *, struct knote *, struct proc *); -#if CONFIG_EMBEDDED -extern int uthread_get_background_state(uthread_t); -#endif /*CONFIG_EMBEDDED */ - #ifdef __APPLE__ vm_size_t so_cache_zone_element_size; @@ -250,7 +259,12 @@ static void so_cache_timer(void *); void soclose_wait_locked(struct socket *so); int so_isdstlocal(struct socket *so); -__private_extern__ u_int32_t sotcdb = 0; +/* + * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from + * setting the DSCP code on the packet based on the service class; see + * for details. 
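+ *
+ * The default can be flipped at run time through the sysctl registered
+ * just below, e.g. (illustrative): sysctl -w kern.ipc.sotcdb=0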
+ */ +__private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP; SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED, &sotcdb, 0, ""); @@ -302,9 +316,11 @@ socketinit(void) sflt_init(); - VERIFY(SO_TC_MAX == SO_TC_STATS_MAX); - + _CASSERT(_SO_TC_MAX == SO_TC_STATS_MAX); + socket_tclass_init(); + + socket_flowadv_init(); } static void @@ -369,7 +385,7 @@ cached_sock_alloc(struct socket **so, int waitok) offset = ALIGN(offset); - ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb = + ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb = (caddr_t)offset; #if TEMPDEBUG kprintf("Allocating cached socket - %p, pcb=%p tcpcb=%p\n", @@ -424,13 +440,16 @@ so_update_last_owner_locked( struct socket *so, proc_t self) { - if (self == NULL) - self = current_proc(); - - if (self) + if (so->last_pid != 0) { - so->last_upid = proc_uniqueid(self); - so->last_pid = proc_pid(self); + if (self == NULL) + self = current_proc(); + + if (self) + { + so->last_upid = proc_uniqueid(self); + so->last_pid = proc_pid(self); + } } } @@ -500,7 +519,6 @@ soalloc(int waitok, int dom, int type) return (NULL); } #endif /* MAC_SOCKET */ - so_update_last_owner_locked(so, NULL); } return (so); @@ -525,10 +543,6 @@ socreate(int dom, struct socket **aso, int type, int proto) register struct protosw *prp; register struct socket *so; register int error = 0; -#if CONFIG_EMBEDDED - thread_t thread; - struct uthread *ut; -#endif /* CONFIG_EMBEDDED */ #if TCPDEBUG extern int tcpconsdebug; @@ -558,9 +572,10 @@ socreate(int dom, struct socket **aso, int type, int proto) TAILQ_INIT(&so->so_incomp); TAILQ_INIT(&so->so_comp); so->so_type = type; + so->last_upid = proc_uniqueid(p); + so->last_pid = proc_pid(p); - so->so_uid = kauth_cred_getuid(kauth_cred_get()); - so->so_gid = kauth_cred_getgid(kauth_cred_get()); + so->so_cred = kauth_cred_proc_ref(p); if (!suser(kauth_cred_get(), NULL)) so->so_state = SS_PRIV; @@ -610,38 +625,21 @@ socreate(int dom, struct socket **aso, int type, int proto) /* * If this is a background thread/task, mark the socket as such. */ -#if !CONFIG_EMBEDDED - if (proc_get_self_isbackground() != 0) -#else /* !CONFIG_EMBEDDED */ - thread = current_thread(); - ut = get_bsdthread_info(thread); - if (uthread_get_background_state(ut)) -#endif /* !CONFIG_EMBEDDED */ - { + if (proc_get_self_isbackground() != 0) { socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND); so->so_background_thread = current_thread(); } switch (dom) { - /* - * Don't mark Unix domain sockets as eligible for defunct by default. - */ + /* + * Don't mark Unix domain or system sockets as eligible for defunct by default. + */ case PF_LOCAL: + case PF_SYSTEM: so->so_flags |= SOF_NODEFUNCT; break; - /* - * Radar 9119053 - * Since v6 initialization is asynchronous and we can't hold - * up the main boot path, we need to at least hold off any - * sockets attempting to be created until the v6 stack is - * up and ready. 
- */ - case PF_INET6: - if (in6_init_done == 0) - ip6_fin(); - break; - default: - break; + default: + break; } *aso = so; @@ -677,7 +675,7 @@ sobind(struct socket *so, struct sockaddr *nam) int error = 0; socket_lock(so, 1); - + VERIFY(so->so_usecount > 1); so_update_last_owner_locked(so, p); /* @@ -709,6 +707,8 @@ out: void sodealloc(struct socket *so) { + kauth_cred_unref(&so->so_cred); + /* Remove any filters */ sflt_termsock(so); @@ -754,8 +754,6 @@ solisten(struct socket *so, int backlog) socket_lock(so, 1); - so_update_last_owner_locked(so, p); - if (so->so_proto == NULL) { error = EINVAL; goto out; @@ -898,10 +896,10 @@ soclose_wait_locked(struct socket *so) * Double check here and return if there's no outstanding upcall; * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set. */ - if (!(so->so_flags & SOF_UPCALLINUSE) || - !(so->so_flags & SOF_UPCALLCLOSEWAIT)) + if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) return; - + so->so_rcv.sb_flags &= ~SB_UPCALL; + so->so_snd.sb_flags &= ~SB_UPCALL; so->so_flags |= SOF_CLOSEWAIT; (void) msleep((caddr_t)&so->so_upcall, mutex_held, (PZERO - 1), "soclose_wait_locked", NULL); @@ -1037,6 +1035,15 @@ drop: if (so->so_usecount == 0) panic("soclose: usecount is zero so=%p\n", so); if (so->so_pcb && !(so->so_flags & SOF_PCBCLEARING)) { + /* + * Let NetworkStatistics know this PCB is going away + * before we detach it. + */ + if (nstat_collect && + (so->so_proto->pr_domain->dom_family == AF_INET || + so->so_proto->pr_domain->dom_family == AF_INET6)) + nstat_pcb_detach(so->so_pcb); + int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); if (error == 0) error = error2; @@ -1047,6 +1054,9 @@ discard: if (so->so_pcb && so->so_state & SS_NOFDREF) panic("soclose: NOFDREF"); so->so_state |= SS_NOFDREF; + + if ((so->so_flags & SOF_KNOTE) != 0) + KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED); #ifdef __APPLE__ so->so_proto->pr_domain->dom_refs--; evsofree(so); @@ -1062,7 +1072,7 @@ soclose(struct socket *so) int error = 0; socket_lock(so, 1); - if (so->so_flags & SOF_UPCALLINUSE) + if (so->so_upcallusecount) soclose_wait_locked(so); if (so->so_retaincnt == 0) { @@ -1216,8 +1226,6 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock) if (dolock) socket_lock(so, 1); - - so_update_last_owner_locked(so, p); /* * If this is a listening socket or if this is a previously-accepted @@ -1419,8 +1427,9 @@ defunct: if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) return (EMSGSIZE); - if (space < resid + clen && - (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) { + if ((space < resid + clen && + (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) || + (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) { if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) || assumelock) { return (EWOULDBLOCK); @@ -2052,9 +2061,7 @@ restart: * end up with false positives during select() or poll() * which could put the application in a bad state. */ - if (m == NULL && so->so_rcv.sb_cc != 0) - panic("soreceive corrupted so_rcv: m %p cc %u", - m, so->so_rcv.sb_cc); + SB_MB_CHECK(&so->so_rcv); if (so->so_error) { if (m) @@ -2327,11 +2334,16 @@ dontblock: } cm = cmn; } - orig_resid = 0; - if (sb_rcv->sb_mb != NULL) + /* + * Update the value of nextrecord in case we received new + * records when the socket was unlocked above for + * externalizing SCM_RIGHTS. 
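+ *
+ * That is, the invariant re-established below is:
+ *
+ *	nextrecord == (m != NULL) ? sb_rcv->sb_mb->m_nextpkt
+ *				  : sb_rcv->sb_mb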
+ */ + if (m != NULL) nextrecord = sb_rcv->sb_mb->m_nextpkt; else - nextrecord = NULL; + nextrecord = sb_rcv->sb_mb; + orig_resid = 0; } if (m != NULL) { @@ -2356,7 +2368,6 @@ dontblock: flags |= MSG_OOB; } else { if (!(flags & MSG_PEEK)) { - so->so_rcv.sb_mb = nextrecord; SB_EMPTY_FIXUP(&so->so_rcv); } } @@ -2603,6 +2614,7 @@ dontblock: if (m) { nextrecord = m->m_nextpkt; } + SB_MB_CHECK(&so->so_rcv); } } #ifdef MORE_LOCKING_DEBUG @@ -2650,6 +2662,7 @@ dontblock: } else if (nextrecord->m_nextpkt == NULL) { so->so_rcv.sb_lastrecord = nextrecord; } + SB_MB_CHECK(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); @@ -2975,7 +2988,6 @@ sosetopt(struct socket *so, struct sockopt *sopt) #endif /* MAC_SOCKET */ socket_lock(so, 1); - so_update_last_owner_locked(so, NULL); if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) == (SS_CANTRCVMORE | SS_CANTSENDMORE) && @@ -3066,17 +3078,18 @@ sosetopt(struct socket *so, struct sockopt *sopt) switch (sopt->sopt_name) { case SO_SNDBUF: case SO_RCVBUF: - if (sbreserve(sopt->sopt_name == SO_SNDBUF ? - &so->so_snd : &so->so_rcv, - (u_int32_t) optval) == 0) { + { + struct sockbuf *sb = (sopt->sopt_name == SO_SNDBUF) ? + &so->so_snd : &so->so_rcv; + if (sbreserve(sb, (u_int32_t) optval) == 0) { error = ENOBUFS; goto bad; } - if (sopt->sopt_name == SO_SNDBUF) - so->so_snd.sb_flags |= SB_USRSIZE; - else - so->so_rcv.sb_flags |= SB_USRSIZE; + sb->sb_flags |= SB_USRSIZE; + sb->sb_flags &= ~SB_AUTOSIZE; + sb->sb_idealsize = (u_int32_t)optval; break; + } /* * Make sure the low-water is never greater than @@ -3268,12 +3281,12 @@ sosetopt(struct socket *so, struct sockopt *sopt) so->so_flags |= SOF_RECV_TRAFFIC_CLASS; break; } - + case SO_TRAFFIC_CLASS_DBG: { struct so_tcdbg so_tcdbg; - - error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg), - sizeof (struct so_tcdbg)); + + error = sooptcopyin(sopt, &so_tcdbg, + sizeof (struct so_tcdbg), sizeof (struct so_tcdbg)); if (error) goto bad; error = so_set_tcdbg(so, &so_tcdbg); @@ -3281,7 +3294,22 @@ sosetopt(struct socket *so, struct sockopt *sopt) goto bad; break; } - + + case SO_PRIVILEGED_TRAFFIC_CLASS: + error = priv_check_cred(kauth_cred_get(), + PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0); + if (error) + goto bad; + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + if (error) + goto bad; + if (optval == 0) + so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS; + else + so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS; + break; + case SO_DEFUNCTOK: error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval)); @@ -3317,6 +3345,25 @@ sosetopt(struct socket *so, struct sockopt *sopt) error = EINVAL; break; + case SO_OPPORTUNISTIC: + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + if (error == 0) + error = so_set_opportunistic(so, optval); + break; + + case SO_FLUSH: + /* This option is handled by lower layer(s) */ + error = 0; + break; + + case SO_RECV_ANYIF: + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + if (error == 0) + error = so_set_recv_anyif(so, optval); + break; + default: error = ENOPROTOOPT; break; @@ -3414,7 +3461,6 @@ sogetopt(struct socket *so, struct sockopt *sopt) } socket_lock(so, 1); - so_update_last_owner_locked(so, NULL); error = sflt_getsockopt(so, sopt); if (error) { @@ -3593,18 +3639,23 @@ integer: case SO_TRAFFIC_CLASS: optval = so->so_traffic_class; goto integer; - + case SO_RECV_TRAFFIC_CLASS: optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS); goto integer; case 
SO_TRAFFIC_CLASS_STATS: error = sooptcopyout(sopt, &so->so_tc_stats, sizeof(so->so_tc_stats)); + break; case SO_TRAFFIC_CLASS_DBG: error = sogetopt_tcdbg(so, sopt); break; - + + case SO_PRIVILEGED_TRAFFIC_CLASS: + optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS); + goto integer; + case SO_DEFUNCTOK: optval = !(so->so_flags & SOF_NODEFUNCT); goto integer; @@ -3613,6 +3664,19 @@ integer: optval = (so->so_flags & SOF_DEFUNCT); goto integer; + case SO_OPPORTUNISTIC: + optval = so_get_opportunistic(so); + goto integer; + + case SO_FLUSH: + /* This option is not gettable */ + error = EINVAL; + break; + + case SO_RECV_ANYIF: + optval = so_get_recv_anyif(so); + goto integer; + default: error = ENOPROTOOPT; break; @@ -3763,7 +3827,6 @@ sopoll(struct socket *so, int events, __unused kauth_cred_t cred, void * wql) int revents = 0; socket_lock(so, 1); - so_update_last_owner_locked(so, p); if (events & (POLLIN | POLLRDNORM)) if (soreadable(so)) @@ -3806,7 +3869,7 @@ soo_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused struct proc *p) { struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; - struct sockbuf *sb; + struct klist *skl; socket_lock(so, 1); @@ -3820,19 +3883,37 @@ soo_kqfilter(__unused struct fileproc *fp, struct knote *kn, switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; - sb = &so->so_rcv; + skl = &so->so_rcv.sb_sel.si_note; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; - sb = &so->so_snd; + skl = &so->so_snd.sb_sel.si_note; + break; + case EVFILT_SOCK: + kn->kn_fop = &sock_filtops; + skl = &so->so_klist; break; default: socket_unlock(so, 1); return (1); } - if (KNOTE_ATTACH(&sb->sb_sel.si_note, kn)) - sb->sb_flags |= SB_KNOTE; + if (KNOTE_ATTACH(skl, kn)) { + switch(kn->kn_filter) { + case EVFILT_READ: + so->so_rcv.sb_flags |= SB_KNOTE; + break; + case EVFILT_WRITE: + so->so_snd.sb_flags |= SB_KNOTE; + break; + case EVFILT_SOCK: + so->so_flags |= SOF_KNOTE; + break; + default: + socket_unlock(so, 1); + return (1); + } + } socket_unlock(so, 1); return (0); } @@ -3945,11 +4026,25 @@ filt_sowdetach(struct knote *kn) socket_unlock(so, 1); } +int +so_wait_for_if_feedback(struct socket *so) +{ + if ((so->so_proto->pr_domain->dom_family == AF_INET || + so->so_proto->pr_domain->dom_family == AF_INET6) && + (so->so_state & SS_ISCONNECTED)) { + struct inpcb *inp = sotoinpcb(so); + if (INP_WAIT_FOR_IF_FEEDBACK(inp)) + return (1); + } + return (0); +} + /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + int ret = 0; if ((hint & SO_FILT_HINT_LOCKED) == 0) socket_lock(so, 1); @@ -3958,20 +4053,17 @@ filt_sowrite(struct knote *kn, long hint) if (so->so_state & SS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_unlock(so, 1); - return (1); + ret = 1; + goto out; } if (so->so_error) { /* temporary udp error */ - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_unlock(so, 1); - return (1); + ret = 1; + goto out; } if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) { - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_unlock(so, 1); - return (0); + ret = 0; + goto out; } int64_t lowwat = so->so_snd.sb_lowat; if (kn->kn_sfflags & NOTE_LOWAT) @@ -3981,9 +4073,119 @@ filt_sowrite(struct knote *kn, long hint) else if (kn->kn_sdata > lowwat) lowwat = kn->kn_sdata; } + if (kn->kn_data >= lowwat) { + if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) { + ret = 
tcp_notsent_lowat_check(so); + } else { + ret = 1; + } + } + if (so_wait_for_if_feedback(so)) + ret = 0; +out: if ((hint & SO_FILT_HINT_LOCKED) == 0) socket_unlock(so, 1); - return (kn->kn_data >= lowwat); + return(ret); +} + +static void +filt_sockdetach(struct knote *kn) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + socket_lock(so, 1); + + if ((so->so_flags & SOF_KNOTE) != 0) + if (KNOTE_DETACH(&so->so_klist, kn)) + so->so_flags &= ~SOF_KNOTE; + socket_unlock(so, 1); +} + +static int +filt_sockev(struct knote *kn, long hint) +{ + int ret = 0, locked = 0; + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + + if ((hint & SO_FILT_HINT_LOCKED) == 0) { + socket_lock(so, 1); + locked = 1; + } + + switch (hint & SO_FILT_HINT_EV) { + case SO_FILT_HINT_CONNRESET: + if (kn->kn_sfflags & NOTE_CONNRESET) + kn->kn_fflags |= NOTE_CONNRESET; + break; + case SO_FILT_HINT_TIMEOUT: + if (kn->kn_sfflags & NOTE_TIMEOUT) + kn->kn_fflags |= NOTE_TIMEOUT; + break; + case SO_FILT_HINT_NOSRCADDR: + if (kn->kn_sfflags & NOTE_NOSRCADDR) + kn->kn_fflags |= NOTE_NOSRCADDR; + break; + case SO_FILT_HINT_IFDENIED: + if ((kn->kn_sfflags & NOTE_IFDENIED)) + kn->kn_fflags |= NOTE_IFDENIED; + break; + case SO_FILT_HINT_KEEPALIVE: + if (kn->kn_sfflags & NOTE_KEEPALIVE) + kn->kn_fflags |= NOTE_KEEPALIVE; + } + + if ((kn->kn_sfflags & NOTE_READCLOSED) && + (so->so_state & SS_CANTRCVMORE)) + kn->kn_fflags |= NOTE_READCLOSED; + + if ((kn->kn_sfflags & NOTE_WRITECLOSED) && + (so->so_state & SS_CANTSENDMORE)) + kn->kn_fflags |= NOTE_WRITECLOSED; + + if ((kn->kn_sfflags & NOTE_SUSPEND) && + ((hint & SO_FILT_HINT_SUSPEND) || + (so->so_flags & SOF_SUSPENDED))) { + kn->kn_fflags &= + ~(NOTE_SUSPEND | NOTE_RESUME); + kn->kn_fflags |= NOTE_SUSPEND; + } + + if ((kn->kn_sfflags & NOTE_RESUME) && + ((hint & SO_FILT_HINT_RESUME) || + (so->so_flags & SOF_SUSPENDED) == 0)) { + kn->kn_fflags &= + ~(NOTE_SUSPEND | NOTE_RESUME); + kn->kn_fflags |= NOTE_RESUME; + } + + if (so->so_error != 0) { + ret = 1; + kn->kn_data = so->so_error; + kn->kn_flags |= EV_EOF; + } else { + get_sockev_state(so, (u_int32_t *)&(kn->kn_data)); + } + + if (kn->kn_fflags != 0) + ret = 1; + + if (locked) + socket_unlock(so, 1); + + return(ret); +} + +void +get_sockev_state(struct socket *so, u_int32_t *statep) { + u_int32_t state = *(statep); + + if (so->so_state & SS_ISCONNECTED) + state |= SOCKEV_CONNECTED; + else + state &= ~(SOCKEV_CONNECTED); + state |= ((so->so_state & SS_ISDISCONNECTED) ? + SOCKEV_DISCONNECTED : 0); + *(statep) = state; + return; } #define SO_LOCK_HISTORY_STR_LEN (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof(void *)) + 1) + 1) @@ -4227,3 +4429,40 @@ sodefunct(struct proc *p, struct socket *so, int level) done: return (0); } + +__private_extern__ int +so_set_recv_anyif(struct socket *so, int optval) +{ + int ret = 0; + +#if INET6 + if (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6) { +#else + if (INP_SOCKAF(so) == AF_INET) { +#endif /* !INET6 */ + if (optval) + sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF; + else + sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF; + } else { + ret = EPROTONOSUPPORT; + } + + return (ret); +} + +__private_extern__ int +so_get_recv_anyif(struct socket *so) +{ + int ret = 0; + +#if INET6 + if (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6) { +#else + if (INP_SOCKAF(so) == AF_INET) { +#endif /* !INET6 */ + ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 
1 : 0; + } + + return (ret); +} diff --git a/bsd/kern/uipc_socket2.c b/bsd/kern/uipc_socket2.c index 4b71dd80c..32b896ee8 100644 --- a/bsd/kern/uipc_socket2.c +++ b/bsd/kern/uipc_socket2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2011 Apple Inc. All rights reserved. + * Copyright (c) 1998-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,6 +76,7 @@ #include #include #include +#include #include #include #include @@ -94,6 +95,8 @@ #include #endif +#include + /* TODO: this should be in a header file somewhere */ extern void postevent(struct socket *, struct sockbuf *, int); @@ -122,9 +125,11 @@ u_int32_t sb_max = SB_MAX; /* XXX should be static */ u_int32_t high_sb_max = SB_MAX; static u_int32_t sb_efficiency = 8; /* parameter for sbreserve() */ -__private_extern__ unsigned int total_mb_cnt = 0; -__private_extern__ unsigned int total_cl_cnt = 0; -__private_extern__ int sbspace_factor = 8; +__private_extern__ int32_t total_sbmb_cnt = 0; + +/* Control whether to throttle sockets eligible to be throttled */ +__private_extern__ u_int32_t net_io_policy_throttled = 0; +static int sysctl_io_policy_throttled SYSCTL_HANDLER_ARGS; /* * Procedures to manipulate state flags of socket @@ -197,6 +202,7 @@ soisconnected(struct socket *so) wakeup((caddr_t)&so->so_timeo); sorwakeup(so); sowwakeup(so); + soevent(so, SO_FILT_HINT_LOCKED); } } @@ -205,6 +211,7 @@ soisdisconnecting(struct socket *so) { so->so_state &= ~SS_ISCONNECTING; so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); + soevent(so, SO_FILT_HINT_LOCKED); sflt_notify(so, sock_evt_disconnecting, NULL); wakeup((caddr_t)&so->so_timeo); sowwakeup(so); @@ -216,6 +223,7 @@ soisdisconnected(struct socket *so) { so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); + soevent(so, SO_FILT_HINT_LOCKED); sflt_notify(so, sock_evt_disconnected, NULL); wakeup((caddr_t)&so->so_timeo); sowwakeup(so); @@ -231,6 +239,7 @@ sodisconnectwakeup(struct socket *so) { so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); + soevent(so, SO_FILT_HINT_LOCKED); wakeup((caddr_t)&so->so_timeo); sowwakeup(so); sorwakeup(so); @@ -297,8 +306,10 @@ sonewconn_internal(struct socket *head, int connstatus) so->so_proto = head->so_proto; so->so_timeo = head->so_timeo; so->so_pgid = head->so_pgid; - so->so_uid = head->so_uid; - so->so_gid = head->so_gid; + kauth_cred_ref(head->so_cred); + so->so_cred = head->so_cred; + so->last_pid = head->last_pid; + so->last_upid = head->last_upid; /* inherit socket options stored in so_flags */ so->so_flags = head->so_flags & (SOF_NOSIGPIPE | SOF_NOADDRAVAIL | @@ -306,7 +317,10 @@ sonewconn_internal(struct socket *head, int connstatus) SOF_NOTIFYCONFLICT | SOF_BINDRANDOMPORT | SOF_NPX_SETOPTSHUT | - SOF_NODEFUNCT); + SOF_NODEFUNCT | + SOF_PRIVILEGED_TRAFFIC_CLASS| + SOF_NOTSENT_LOWAT | + SOF_USELRO); so->so_usecount = 1; so->next_lock_lr = 0; so->next_unlock_lr = 0; @@ -330,6 +344,8 @@ sonewconn_internal(struct socket *head, int connstatus) sodealloc(so); return ((struct socket *)0); } + so->so_rcv.sb_flags |= (head->so_rcv.sb_flags & SB_USRSIZE); + so->so_snd.sb_flags |= (head->so_snd.sb_flags & SB_USRSIZE); /* * Must be done with head unlocked to avoid deadlock @@ -419,6 +435,7 @@ void socantsendmore(struct socket *so) { so->so_state |= SS_CANTSENDMORE; + soevent(so, SO_FILT_HINT_LOCKED); sflt_notify(so, sock_evt_cantsendmore, 
NULL); sowwakeup(so); } @@ -427,6 +444,7 @@ void socantrcvmore(struct socket *so) { so->so_state |= SS_CANTRCVMORE; + soevent(so, SO_FILT_HINT_LOCKED); sflt_notify(so, sock_evt_cantrecvmore, NULL); sorwakeup(so); } @@ -576,15 +594,15 @@ sowakeup(struct socket *so, struct sockbuf *sb) so_upcall = so->so_upcall; so_upcallarg = so->so_upcallarg; /* Let close know that we're about to do an upcall */ - so->so_flags |= SOF_UPCALLINUSE; + so->so_upcallusecount++; socket_unlock(so, 0); (*so_upcall)(so, so_upcallarg, M_DONTWAIT); socket_lock(so, 0); - so->so_flags &= ~SOF_UPCALLINUSE; + so->so_upcallusecount--; /* Tell close that it's safe to proceed */ - if (so->so_flags & SOF_CLOSEWAIT) + if (so->so_flags & SOF_CLOSEWAIT && so->so_upcallusecount == 0) wakeup((caddr_t)&so->so_upcall); } } @@ -631,8 +649,14 @@ soreserve(struct socket *so, u_int32_t sndcc, u_int32_t rcvcc) if (sbreserve(&so->so_snd, sndcc) == 0) goto bad; + else + so->so_snd.sb_idealsize = sndcc; + if (sbreserve(&so->so_rcv, rcvcc) == 0) goto bad2; + else + so->so_rcv.sb_idealsize = rcvcc; + if (so->so_rcv.sb_lowat == 0) so->so_rcv.sb_lowat = 1; if (so->so_snd.sb_lowat == 0) @@ -1445,6 +1469,7 @@ sbcreatecontrol(caddr_t p, int size, int type, int level) if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) return ((struct mbuf *)NULL); cp = mtod(m, struct cmsghdr *); + VERIFY(IS_P2ALIGNED(cp, sizeof (u_int32_t))); /* XXX check size? */ (void) memcpy(CMSG_DATA(cp), p, size); m->m_len = CMSG_SPACE(size); @@ -1464,24 +1489,26 @@ sbcreatecontrol_mbuf(caddr_t p, int size, int type, int level, struct mbuf** mp) *mp = sbcreatecontrol(p, size, type, level); return mp; } - + if (CMSG_SPACE((u_int)size) + (*mp)->m_len > MLEN){ mp = &(*mp)->m_next; *mp = sbcreatecontrol(p, size, type, level); return mp; } - + m = *mp; - - cp = (struct cmsghdr *) (mtod(m, char *) + m->m_len); + + cp = (struct cmsghdr *)(void *)(mtod(m, char *) + m->m_len); + /* CMSG_SPACE ensures 32-bit alignment */ + VERIFY(IS_P2ALIGNED(cp, sizeof (u_int32_t))); m->m_len += CMSG_SPACE(size); - + /* XXX check size? 
*/ (void) memcpy(CMSG_DATA(cp), p, size); cp->cmsg_len = CMSG_LEN(size); cp->cmsg_level = level; cp->cmsg_type = type; - + return mp; } @@ -1699,9 +1726,10 @@ soreadable(struct socket *so) int sowriteable(struct socket *so) { - return ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat && - ((so->so_state&SS_ISCONNECTED) || - (so->so_proto->pr_flags&PR_CONNREQUIRED) == 0)) || + return ((!so_wait_for_if_feedback(so) && + sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat && + ((so->so_state & SS_ISCONNECTED) || + (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) || (so->so_state & SS_CANTSENDMORE) || so->so_error); } @@ -1711,7 +1739,7 @@ sowriteable(struct socket *so) void sballoc(struct sockbuf *sb, struct mbuf *m) { - int cnt = 1; + u_int32_t cnt = 1; sb->sb_cc += m->m_len; if (m->m_type != MT_DATA && m->m_type != MT_HEADER && m->m_type != MT_OOBDATA) @@ -1720,9 +1748,10 @@ sballoc(struct sockbuf *sb, struct mbuf *m) if (m->m_flags & M_EXT) { sb->sb_mbcnt += m->m_ext.ext_size; - cnt += m->m_ext.ext_size / MSIZE ; + cnt += (m->m_ext.ext_size >> MSIZESHIFT) ; } - OSAddAtomic(cnt, &total_mb_cnt); + OSAddAtomic(cnt, &total_sbmb_cnt); + VERIFY(total_sbmb_cnt > 0); } /* adjust counters in sb reflecting freeing of m */ @@ -1730,6 +1759,7 @@ void sbfree(struct sockbuf *sb, struct mbuf *m) { int cnt = -1; + sb->sb_cc -= m->m_len; if (m->m_type != MT_DATA && m->m_type != MT_HEADER && m->m_type != MT_OOBDATA) @@ -1737,9 +1767,10 @@ sbfree(struct sockbuf *sb, struct mbuf *m) sb->sb_mbcnt -= MSIZE; if (m->m_flags & M_EXT) { sb->sb_mbcnt -= m->m_ext.ext_size; - cnt -= m->m_ext.ext_size / MSIZE ; + cnt -= (m->m_ext.ext_size >> MSIZESHIFT) ; } - OSAddAtomic(cnt, &total_mb_cnt); + OSAddAtomic(cnt, &total_sbmb_cnt); + VERIFY(total_sbmb_cnt >= 0); } /* @@ -1818,6 +1849,14 @@ sowwakeup(struct socket *so) if (sb_notify(&so->so_snd)) sowakeup(so, &so->so_snd); } + +void +soevent(struct socket *so, long hint) +{ + if (so->so_flags & SOF_KNOTE) + KNOTE(&so->so_klist, hint); +} + #endif /* __APPLE__ */ /* @@ -1847,12 +1886,12 @@ void sotoxsocket(struct socket *so, struct xsocket *xso) { xso->xso_len = sizeof (*xso); - xso->xso_so = (_XSOCKET_PTR(struct socket *))(uintptr_t)so; + xso->xso_so = (_XSOCKET_PTR(struct socket *))VM_KERNEL_ADDRPERM(so); xso->so_type = so->so_type; - xso->so_options = so->so_options; + xso->so_options = (short)(so->so_options & 0xffff); xso->so_linger = so->so_linger; xso->so_state = so->so_state; - xso->so_pcb = (_XSOCKET_PTR(caddr_t))(uintptr_t)so->so_pcb; + xso->so_pcb = (_XSOCKET_PTR(caddr_t))VM_KERNEL_ADDRPERM(so->so_pcb); if (so->so_proto) { xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; @@ -1868,7 +1907,7 @@ sotoxsocket(struct socket *so, struct xsocket *xso) xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); - xso->so_uid = so->so_uid; + xso->so_uid = kauth_cred_getuid(so->so_cred); } @@ -1878,12 +1917,12 @@ void sotoxsocket64(struct socket *so, struct xsocket64 *xso) { xso->xso_len = sizeof (*xso); - xso->xso_so = (u_int64_t)(uintptr_t)so; + xso->xso_so = (u_int64_t)VM_KERNEL_ADDRPERM(so); xso->so_type = so->so_type; - xso->so_options = so->so_options; + xso->so_options = (short)(so->so_options & 0xffff); xso->so_linger = so->so_linger; xso->so_state = so->so_state; - xso->so_pcb = (u_int64_t)(uintptr_t)so->so_pcb; + xso->so_pcb = (u_int64_t)VM_KERNEL_ADDRPERM(so->so_pcb); if (so->so_proto) { xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = 
so->so_proto->pr_domain->dom_family; @@ -1899,7 +1938,7 @@ sotoxsocket64(struct socket *so, struct xsocket64 *xso) xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); - xso->so_uid = so->so_uid; + xso->so_uid = kauth_cred_getuid(so->so_cred); } #endif /* !CONFIG_EMBEDDED */ @@ -1925,12 +1964,29 @@ sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) xsb->sb_timeo = 1; } +/* + * Based on the policy set by an all-knowing decision maker, throttle sockets + * that have been marked as belonging to a "background" process. + */ int -soisbackground(struct socket *so) +soisthrottled(struct socket *so) { - return (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND); + /* + * On non-embedded, we rely on implicit throttling by the application, + * as we're missing the system-wide "decision maker". + */ + return ( +#if CONFIG_EMBEDDED + net_io_policy_throttled && +#endif /* CONFIG_EMBEDDED */ + (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND)); } +int +soisprivilegedtraffic(struct socket *so) +{ + return (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS); +} /* * Here is the definition of some of the basic objects in the kern.ipc @@ -1959,6 +2015,27 @@ sysctl_sb_max(__unused struct sysctl_oid *oidp, __unused void *arg1, return error; } +static int +sysctl_io_policy_throttled SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int i, err; + + i = net_io_policy_throttled; + + err = sysctl_handle_int(oidp, &i, 0, req); + if (err != 0 || req->newptr == USER_ADDR_NULL) + return (err); + + if (i != net_io_policy_throttled) + SOTHROTTLELOG(("throttle: network IO policy throttling is " + "now %s\n", i ? "ON" : "OFF")); + + net_io_policy_throttled = i; + + return (err); +} + SYSCTL_PROC(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &sb_max, 0, &sysctl_sb_max, "IU", "Maximum socket buffer size"); @@ -1966,8 +2043,6 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD | CTLFLAG_LOCKED, &maxsockets, 0, "Maximum number of sockets avaliable"); SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW | CTLFLAG_LOCKED, &sb_efficiency, 0, ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, sbspace_factor, CTLFLAG_RW | CTLFLAG_LOCKED, - &sbspace_factor, 0, "Ratio of mbuf/cluster use for socket layers"); SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD | CTLFLAG_LOCKED, &nmbclusters, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, njcl, CTLFLAG_RD | CTLFLAG_LOCKED, &njcl, 0, ""); @@ -1976,3 +2051,9 @@ SYSCTL_INT(_kern_ipc, KIPC_SOQLIMITCOMPAT, soqlimitcompat, CTLFLAG_RW | CTLFLAG_ &soqlimitcompat, 1, "Enable socket queue limit compatibility"); SYSCTL_INT(_kern_ipc, OID_AUTO, soqlencomp, CTLFLAG_RW | CTLFLAG_LOCKED, &soqlencomp, 0, "Listen backlog represents only complete queue"); + +SYSCTL_NODE(_kern_ipc, OID_AUTO, io_policy, CTLFLAG_RW, 0, "network IO policy"); + +SYSCTL_PROC(_kern_ipc_io_policy, OID_AUTO, throttled, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &net_io_policy_throttled, 0, + sysctl_io_policy_throttled, "I", ""); diff --git a/bsd/kern/uipc_syscalls.c b/bsd/kern/uipc_syscalls.c index 8a6356d5a..bd2bd5dd3 100644 --- a/bsd/kern/uipc_syscalls.c +++ b/bsd/kern/uipc_syscalls.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
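The SYSCTL_NODE/SYSCTL_PROC pair above publishes net_io_policy_throttled as kern.ipc.io_policy.throttled. A minimal sketch of inspecting it from userland with sysctlbyname(3) follows; the OID name is derived from those declarations, and writing the value requires appropriate privilege.

/*
 * Read the throttling switch exposed above.  Sketch only; the OID name
 * follows from SYSCTL_NODE(_kern_ipc, ..., io_policy, ...) plus
 * SYSCTL_PROC(_kern_ipc_io_policy, ..., throttled, ...).
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int val = 0;
	size_t len = sizeof(val);

	if (sysctlbyname("kern.ipc.io_policy.throttled",
	    &val, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("network IO policy throttling is %s\n", val ? "ON" : "OFF");
	return (0);
}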
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -199,12 +199,7 @@ socket(struct proc *p, struct socket_args *uap, int32_t *retval) ut = get_bsdthread_info(thread); /* if this is a backgrounded thread then throttle all new sockets */ -#if !CONFIG_EMBEDDED - if (proc_get_selfthread_isbackground() != 0) -#else /* !CONFIG_EMBEDDED */ - if ( (ut->uu_flag & UT_BACKGROUND) != 0 ) -#endif /* !CONFIG_EMBEDDED */ - { + if (proc_get_selfthread_isbackground() != 0) { so->so_traffic_mgt_flags |= TRAFFIC_MGT_SO_BACKGROUND; so->so_background_thread = thread; } @@ -475,17 +470,17 @@ accept_nocancel(struct proc *p, struct accept_nocancel_args *uap, fflag = fp->f_flag; error = falloc(p, &fp, &newfd, vfs_context_current()); if (error) { - /* - * Probably ran out of file descriptors. Put the - * unaccepted connection back onto the queue and - * do another wakeup so some other process might - * have a chance at it. + /* + * Probably ran out of file descriptors. + * + * Don't put this back on the socket like we used to; that + * just causes the client to spin. Drop the socket. */ - socket_lock(head, 0); - TAILQ_INSERT_HEAD(&head->so_comp, so, so_list); - head->so_qlen++; - wakeup_one((caddr_t)&head->so_timeo); - socket_unlock(head, 1); + so->so_state &= ~(SS_NOFDREF | SS_COMP); + so->so_head = NULL; + soclose(so); + sodereference(head); goto out; } *retval = newfd; @@ -864,9 +859,10 @@ sendit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop, /* * We check the state without holding the socket lock; * if a race condition occurs, it would simply result - * in an extra call to the MAC check function. + * in an extra call to the MAC check function. */ - if (!(so->so_state & SS_ISCONNECTED) && + if (to != NULL && + !(so->so_state & SS_DEFUNCT) && (error = mac_socket_check_send(kauth_cred_get(), so, to)) != 0) goto bad; #endif /* MAC_SOCKET_SUBSET */ @@ -1149,7 +1145,8 @@ recvit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop, * if a race condition occurs, it would simply result * in an extra call to the MAC check function.
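The accept_nocancel() change above drops a pending connection when no descriptor can be allocated, instead of requeueing it. The visible consequence is that accept(2) fails with EMFILE or ENFILE and the connection is gone, so a server should back off rather than spin. A hedged sketch (lsock is assumed to be a listening socket):

/*
 * Sketch of an accept() loop that tolerates descriptor exhaustion.
 * After the kernel change above, the connection that failed to get an
 * fd has already been dropped, so retrying immediately buys nothing.
 */
#include <sys/socket.h>
#include <errno.h>
#include <unistd.h>

static void
accept_loop(int lsock)
{
	for (;;) {
		int fd = accept(lsock, NULL, NULL);

		if (fd < 0) {
			if (errno == EMFILE || errno == ENFILE) {
				usleep(100000);	/* back off, then retry */
				continue;
			}
			if (errno == EINTR || errno == ECONNABORTED)
				continue;
			break;		/* unrecoverable */
		}
		close(fd);	/* real code would service the connection */
	}
}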
*/ - if (!(so->so_state & SS_ISCONNECTED) && + if (!(so->so_state & SS_DEFUNCT) && + !(so->so_state & SS_ISCONNECTED) && (error = mac_socket_check_receive(kauth_cred_get(), so)) != 0) goto out1; #endif /* MAC_SOCKET_SUBSET */ @@ -1220,15 +1217,15 @@ recvit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop, */ if (cp->cmsg_level == SOL_SOCKET && cp->cmsg_type == SCM_TIMESTAMP) { unsigned char tmp_buffer[CMSG_SPACE(sizeof(struct user64_timeval))]; - struct cmsghdr *tmp_cp = (struct cmsghdr *)tmp_buffer; + struct cmsghdr *tmp_cp = (struct cmsghdr *)(void *)tmp_buffer; int tmp_space; - struct timeval *tv = (struct timeval *)CMSG_DATA(cp); + struct timeval *tv = (struct timeval *)(void *)CMSG_DATA(cp); tmp_cp->cmsg_level = SOL_SOCKET; tmp_cp->cmsg_type = SCM_TIMESTAMP; if (proc_is64bit(p)) { - struct user64_timeval *tv64 = (struct user64_timeval *)CMSG_DATA(tmp_cp); + struct user64_timeval *tv64 = (struct user64_timeval *)(void *)CMSG_DATA(tmp_cp); tv64->tv_sec = tv->tv_sec; tv64->tv_usec = tv->tv_usec; @@ -1236,7 +1233,7 @@ recvit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop, tmp_cp->cmsg_len = CMSG_LEN(sizeof(struct user64_timeval)); tmp_space = CMSG_SPACE(sizeof(struct user64_timeval)); } else { - struct user32_timeval *tv32 = (struct user32_timeval *)CMSG_DATA(tmp_cp); + struct user32_timeval *tv32 = (struct user32_timeval *)(void *)CMSG_DATA(tmp_cp); tv32->tv_sec = tv->tv_sec; tv32->tv_usec = tv->tv_usec; @@ -1278,7 +1275,7 @@ recvit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop, len -= tocopy; buflen -= cp_size; - cp = (struct cmsghdr *) ((unsigned char *) cp + cp_size); + cp = (struct cmsghdr *)(void *)((unsigned char *) cp + cp_size); cp_size = CMSG_ALIGN(cp->cmsg_len); } @@ -2073,7 +2070,13 @@ sendfile(struct proc *p, struct sendfile_args *uap, __unused int *retval) size_t sizeof_hdtr; off_t file_size; struct vfs_context context = *vfs_context_current(); - +#define ENXIO_10146739_DBG(err_str) { \ + if (error == ENXIO) { \ + printf(err_str, \ + __func__, \ + "File a radar related to rdar://10146739 \n"); \ + } \ +} KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE | DBG_FUNC_START), uap->s, 0, 0, 0, 0); @@ -2085,6 +2088,7 @@ sendfile(struct proc *p, struct sendfile_args *uap, __unused int *retval) * type and connected socket out, positive offset. */ if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) { + ENXIO_10146739_DBG("%s: fp_getfvp error. %s"); goto done; } if ((fp->f_flag & FREAD) == 0) { @@ -2097,6 +2101,7 @@ sendfile(struct proc *p, struct sendfile_args *uap, __unused int *retval) } error = file_socket(uap->s, &so); if (error) { + ENXIO_10146739_DBG("%s: file_socket error. %s"); goto done1; } if (so == NULL) { @@ -2179,8 +2184,10 @@ sendfile(struct proc *p, struct sendfile_args *uap, __unused int *retval) nuap.iovp = user_hdtr.headers; nuap.iovcnt = user_hdtr.hdr_cnt; error = writev_nocancel(p, &nuap, &writev_retval); - if (error) + if (error) { + ENXIO_10146739_DBG("%s: writev_nocancel error. %s"); goto done2; + } sbytes += writev_retval; } } @@ -2190,8 +2197,10 @@ sendfile(struct proc *p, struct sendfile_args *uap, __unused int *retval) * 1. We don't want to allocate more mbufs than necessary * 2. We don't want to read past the end of file */ - if ((error = vnode_size(vp, &file_size, vfs_context_current())) != 0) + if ((error = vnode_size(vp, &file_size, vfs_context_current())) != 0) { + ENXIO_10146739_DBG("%s: vnode_size error. 
%s"); goto done2; + } /* * Simply read file data into a chain of mbufs that used with scatter @@ -2264,11 +2273,12 @@ sendfile(struct proc *p, struct sendfile_args *uap, __unused int *retval) pktlen = mbuf_pkt_maxlen(m0); if (pktlen < (size_t)xfsize) xfsize = pktlen; - + auio = uio_createwithbuffer(nbufs, off, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof (uio_buf)); if (auio == NULL) { - //printf("sendfile: uio_createwithbuffer failed\n"); + printf("sendfile failed. nbufs = %d. %s", nbufs, + "File a radar related to rdar://10146739.\n"); mbuf_freem(m0); error = ENXIO; socket_lock(so, 0); @@ -2302,6 +2312,7 @@ sendfile(struct proc *p, struct sendfile_args *uap, __unused int *retval) error == EINTR || error == EWOULDBLOCK)) { error = 0; } else { + ENXIO_10146739_DBG("%s: fo_read error. %s"); mbuf_freem(m0); goto done3; } @@ -2351,6 +2362,7 @@ retry_space: so->so_error = 0; } m_freem(m0); + ENXIO_10146739_DBG("%s: Unexpected socket error. %s"); goto done3; } /* @@ -2393,6 +2405,7 @@ retry_space: error = 0; continue; } + ENXIO_10146739_DBG("%s: sflt_data_out error. %s"); goto done3; } /* @@ -2406,6 +2419,7 @@ retry_space: KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE_SEND | DBG_FUNC_START), uap->s, 0, 0, 0, 0); if (error) { + ENXIO_10146739_DBG("%s: pru_send error. %s"); goto done3; } } @@ -2420,8 +2434,10 @@ retry_space: nuap.iovp = user_hdtr.trailers; nuap.iovcnt = user_hdtr.trl_cnt; error = writev_nocancel(p, &nuap, &writev_retval); - if (error) + if (error) { + ENXIO_10146739_DBG("%s: writev_nocancel error. %s"); goto done2; + } sbytes += writev_retval; } done2: diff --git a/bsd/kern/uipc_usrreq.c b/bsd/kern/uipc_usrreq.c index c64053a2c..2368c19bd 100644 --- a/bsd/kern/uipc_usrreq.c +++ b/bsd/kern/uipc_usrreq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -96,6 +96,8 @@ #include #endif /* CONFIG_MACF */ +#include + #define f_msgcount f_fglob->fg_msgcount #define f_cred f_fglob->fg_cred #define f_ops f_fglob->fg_ops @@ -699,6 +701,18 @@ uipc_ctloutput(struct socket *so, struct sockopt *sopt) error = EINVAL; } break; + case LOCAL_PEERPID: + if (unp->unp_conn != NULL) { + if (unp->unp_conn->unp_socket != NULL) { + pid_t peerpid = unp->unp_conn->unp_socket->last_pid; + error = sooptcopyout(sopt, &peerpid, sizeof (peerpid)); + } else { + panic("peer is connected but has no socket?"); + } + } else { + error = ENOTCONN; + } + break; default: error = EOPNOTSUPP; break; @@ -821,6 +835,8 @@ unp_detach(struct unpcb *unp) lck_rw_lock_exclusive(unp_list_mtx); LIST_REMOVE(unp, unp_link); + --unp_count; + ++unp_gencnt; lck_rw_done(unp_list_mtx); if (unp->unp_vnode) { struct vnode *tvp = NULL; @@ -1122,8 +1138,16 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p) if ((so2->so_options & SO_ACCEPTCONN) == 0 || (so3 = sonewconn(so2, 0, nam)) == 0) { error = ECONNREFUSED; - socket_unlock(so2, 1); - socket_lock(so, 0); + if (so != so2) { + socket_unlock(so2, 1); + socket_lock(so, 0); + } else { + socket_lock(so, 0); + /* Release the reference held for + * listen socket. 
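The LOCAL_PEERPID case added to uipc_ctloutput() above reports the last_pid recorded on the peer socket. From userland it is reached through getsockopt(2) at the SOL_LOCAL level; a minimal sketch follows (sock is assumed to be a connected AF_UNIX socket, and the call fails with ENOTCONN otherwise):

/*
 * Query the pid of a Unix-domain peer via the LOCAL_PEERPID option
 * implemented above.  SOL_LOCAL and LOCAL_PEERPID come from <sys/un.h>.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <stdio.h>

static int
print_peer_pid(int sock)
{
	pid_t pid = 0;
	socklen_t len = sizeof(pid);

	if (getsockopt(sock, SOL_LOCAL, LOCAL_PEERPID, &pid, &len) == -1) {
		perror("getsockopt(LOCAL_PEERPID)");
		return (-1);
	}
	printf("peer pid: %d\n", (int)pid);
	return (0);
}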
+ */ + so2->so_usecount--; + } goto out; } unp2 = sotounpcb(so2); @@ -1455,31 +1479,37 @@ static void unpcb_to_compat(struct unpcb *up, struct unpcb_compat *cp) { #if defined(__LP64__) - cp->unp_link.le_next = (u_int32_t)(uintptr_t)up->unp_link.le_next; - cp->unp_link.le_prev = (u_int32_t)(uintptr_t)up->unp_link.le_prev; + cp->unp_link.le_next = (u_int32_t) + VM_KERNEL_ADDRPERM(up->unp_link.le_next); + cp->unp_link.le_prev = (u_int32_t) + VM_KERNEL_ADDRPERM(up->unp_link.le_prev); #else - cp->unp_link.le_next = (struct unpcb_compat *)up->unp_link.le_next; - cp->unp_link.le_prev = (struct unpcb_compat **)up->unp_link.le_prev; + cp->unp_link.le_next = (struct unpcb_compat *) + VM_KERNEL_ADDRPERM(up->unp_link.le_next); + cp->unp_link.le_prev = (struct unpcb_compat **) + VM_KERNEL_ADDRPERM(up->unp_link.le_prev); #endif - cp->unp_socket = (_UNPCB_PTR(struct socket *))(uintptr_t)up->unp_socket; - cp->unp_vnode = (_UNPCB_PTR(struct vnode *))(uintptr_t)up->unp_vnode; + cp->unp_socket = (_UNPCB_PTR(struct socket *)) + VM_KERNEL_ADDRPERM(up->unp_socket); + cp->unp_vnode = (_UNPCB_PTR(struct vnode *)) + VM_KERNEL_ADDRPERM(up->unp_vnode); cp->unp_ino = up->unp_ino; cp->unp_conn = (_UNPCB_PTR(struct unpcb_compat *)) - (uintptr_t)up->unp_conn; - cp->unp_refs = (u_int32_t)(uintptr_t)up->unp_refs.lh_first; + VM_KERNEL_ADDRPERM(up->unp_conn); + cp->unp_refs = (u_int32_t)VM_KERNEL_ADDRPERM(up->unp_refs.lh_first); #if defined(__LP64__) cp->unp_reflink.le_next = - (u_int32_t)(uintptr_t)up->unp_reflink.le_next; + (u_int32_t)VM_KERNEL_ADDRPERM(up->unp_reflink.le_next); cp->unp_reflink.le_prev = - (u_int32_t)(uintptr_t)up->unp_reflink.le_prev; + (u_int32_t)VM_KERNEL_ADDRPERM(up->unp_reflink.le_prev); #else cp->unp_reflink.le_next = - (struct unpcb_compat *)up->unp_reflink.le_next; + (struct unpcb_compat *)VM_KERNEL_ADDRPERM(up->unp_reflink.le_next); cp->unp_reflink.le_prev = - (struct unpcb_compat **)up->unp_reflink.le_prev; + (struct unpcb_compat **)VM_KERNEL_ADDRPERM(up->unp_reflink.le_prev); #endif cp->unp_addr = (_UNPCB_PTR(struct sockaddr_un *)) - (uintptr_t)up->unp_addr; + VM_KERNEL_ADDRPERM(up->unp_addr); cp->unp_cc = up->unp_cc; cp->unp_mbcnt = up->unp_mbcnt; cp->unp_gencnt = up->unp_gencnt; @@ -1563,7 +1593,7 @@ unp_pcblist SYSCTL_HANDLER_ARGS bzero(&xu, sizeof (xu)); xu.xu_len = sizeof (xu); xu.xu_unpp = (_UNPCB_PTR(struct unpcb_compat *)) - (uintptr_t)unp; + VM_KERNEL_ADDRPERM(unp); /* * XXX - need more locking here to protect against * connect/disconnect races for SMP. 
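The conversions above stop exporting raw kernel pointers to userland: every address placed in an xunpcb now goes through VM_KERNEL_ADDRPERM first. Conceptually this permutes pointers with a boot-time random value so they remain comparable within a boot without revealing kernel layout; the stand-alone model below is a hypothetical illustration of that idea, not the xnu implementation.

/*
 * Hypothetical model of VM_KERNEL_ADDRPERM-style obfuscation: add a
 * per-boot random slide, preserving NULL.  Illustrative only.
 */
#include <stddef.h>
#include <stdint.h>

static uintptr_t addrperm_slide;	/* set once at startup from an RNG */

static uintptr_t
addr_perm(const void *addr)
{
	if (addr == NULL)		/* keep NULL recognizable */
		return (0);
	return ((uintptr_t)addr + addrperm_slide);
}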
@@ -1687,20 +1717,24 @@ unp_pcblist64 SYSCTL_HANDLER_ARGS bzero(&xu, xu_len); xu.xu_len = xu_len; - xu.xu_unpp = (u_int64_t)(uintptr_t)unp; - xu.xunp_link.le_next = - (u_int64_t)(uintptr_t)unp->unp_link.le_next; - xu.xunp_link.le_prev = - (u_int64_t)(uintptr_t)unp->unp_link.le_prev; - xu.xunp_socket = (u_int64_t)(uintptr_t)unp->unp_socket; - xu.xunp_vnode = (u_int64_t)(uintptr_t)unp->unp_vnode; + xu.xu_unpp = (u_int64_t)VM_KERNEL_ADDRPERM(unp); + xu.xunp_link.le_next = (u_int64_t) + VM_KERNEL_ADDRPERM(unp->unp_link.le_next); + xu.xunp_link.le_prev = (u_int64_t) + VM_KERNEL_ADDRPERM(unp->unp_link.le_prev); + xu.xunp_socket = (u_int64_t) + VM_KERNEL_ADDRPERM(unp->unp_socket); + xu.xunp_vnode = (u_int64_t) + VM_KERNEL_ADDRPERM(unp->unp_vnode); xu.xunp_ino = unp->unp_ino; - xu.xunp_conn = (u_int64_t)(uintptr_t)unp->unp_conn; - xu.xunp_refs = (u_int64_t)(uintptr_t)unp->unp_refs.lh_first; - xu.xunp_reflink.le_next = - (u_int64_t)(uintptr_t)unp->unp_reflink.le_next; - xu.xunp_reflink.le_prev = - (u_int64_t)(uintptr_t)unp->unp_reflink.le_prev; + xu.xunp_conn = (u_int64_t) + VM_KERNEL_ADDRPERM(unp->unp_conn); + xu.xunp_refs = (u_int64_t) + VM_KERNEL_ADDRPERM(unp->unp_refs.lh_first); + xu.xunp_reflink.le_next = (u_int64_t) + VM_KERNEL_ADDRPERM(unp->unp_reflink.le_next); + xu.xunp_reflink.le_prev = (u_int64_t) + VM_KERNEL_ADDRPERM(unp->unp_reflink.le_prev); xu.xunp_cc = unp->unp_cc; xu.xunp_mbcnt = unp->unp_mbcnt; xu.xunp_gencnt = unp->unp_gencnt; @@ -2327,9 +2361,8 @@ unp_unlock(struct socket *so, int refcount, void * lr) lck_mtx_unlock(mutex_held); - unp->unp_gencnt = ++unp_gencnt; + lck_mtx_destroy(&unp->unp_mtx, unp_mtx_grp); zfree(unp_zone, unp); - --unp_count; unp_gc(); } else { diff --git a/bsd/kern/vm_pressure.c b/bsd/kern/vm_pressure.c index b5fc2f072..f2ae46185 100644 --- a/bsd/kern/vm_pressure.c +++ b/bsd/kern/vm_pressure.c @@ -40,23 +40,49 @@ #include #include #include +#include +#include -void vm_pressure_klist_lock(void); -void vm_pressure_klist_unlock(void); +#if CONFIG_MEMORYSTATUS +#include +#endif + +/* + * This value is the threshold that a process must meet to be considered for scavenging. + */ +#define VM_PRESSURE_MINIMUM_RSIZE 10 /* MB */ +#define VM_PRESSURE_NOTIFY_WAIT_PERIOD 10000 /* milliseconds */ + +static void vm_pressure_klist_lock(void); +static void vm_pressure_klist_unlock(void); -void vm_dispatch_memory_pressure(void); -int vm_try_terminate_candidates(void); -int vm_try_pressure_candidates(void); -void vm_recharge_active_list(void); +static void vm_dispatch_memory_pressure(void); +static kern_return_t vm_try_pressure_candidates(void); +static void vm_reset_active_list(void); + +static lck_mtx_t vm_pressure_klist_mutex; struct klist vm_pressure_klist; struct klist vm_pressure_klist_dormant; -void vm_pressure_klist_lock(void) { +#if DEBUG +#define VM_PRESSURE_DEBUG(cond, format, ...) \ +do { \ + if (cond) { printf(format, ##__VA_ARGS__); } \ +} while(0) +#else +#define VM_PRESSURE_DEBUG(cond, format, ...) 
+#endif + +void vm_pressure_init(lck_grp_t *grp, lck_attr_t *attr) { + lck_mtx_init(&vm_pressure_klist_mutex, grp, attr); +} + +static void vm_pressure_klist_lock(void) { lck_mtx_lock(&vm_pressure_klist_mutex); } -void vm_pressure_klist_unlock(void) { +static void vm_pressure_klist_unlock(void) { lck_mtx_unlock(&vm_pressure_klist_mutex); } @@ -65,13 +91,11 @@ int vm_knote_register(struct knote *kn) { vm_pressure_klist_lock(); - if ((kn->kn_sfflags & (NOTE_VM_PRESSURE))) { -#if DEBUG - printf("[vm_pressure] process %d registering pressure notification\n", kn->kn_kq->kq_p->p_pid); -#endif + if ((kn->kn_sfflags) & (NOTE_VM_PRESSURE)) { KNOTE_ATTACH(&vm_pressure_klist, kn); - } else + } else { rv = ENOTSUP; + } vm_pressure_klist_unlock(); @@ -83,9 +107,7 @@ void vm_knote_unregister(struct knote *kn) { vm_pressure_klist_lock(); -#if DEBUG - printf("[vm_pressure] process %d cancelling pressure notification\n", kn->kn_kq->kq_p->p_pid); -#endif + VM_PRESSURE_DEBUG(0, "[vm_pressure] process %d cancelling pressure notification\n", kn->kn_kq->kq_p->p_pid); SLIST_FOREACH(kn_temp, &vm_pressure_klist, kn_selnext) { if (kn_temp == kn) { @@ -94,139 +116,249 @@ void vm_knote_unregister(struct knote *kn) { return; } } - KNOTE_DETACH(&vm_pressure_klist_dormant, kn); + + SLIST_FOREACH(kn_temp, &vm_pressure_klist_dormant, kn_selnext) { + if (kn_temp == kn) { + KNOTE_DETACH(&vm_pressure_klist_dormant, kn); + vm_pressure_klist_unlock(); + return; + } + } + + vm_pressure_klist_unlock(); +} + +void vm_pressure_proc_cleanup(proc_t p) +{ + struct knote *kn = NULL; + + vm_pressure_klist_lock(); + + VM_PRESSURE_DEBUG(0, "[vm_pressure] process %d exiting pressure notification\n", p->p_pid); + + SLIST_FOREACH(kn, &vm_pressure_klist, kn_selnext) { + if (kn->kn_kq->kq_p == p) { + KNOTE_DETACH(&vm_pressure_klist, kn); + vm_pressure_klist_unlock(); + return; + } + } + + SLIST_FOREACH(kn, &vm_pressure_klist_dormant, kn_selnext) { + if (kn->kn_kq->kq_p == p) { + KNOTE_DETACH(&vm_pressure_klist_dormant, kn); + vm_pressure_klist_unlock(); + return; + } + } vm_pressure_klist_unlock(); } -/* Interface for event dispatch from vm_pageout_garbage_collect thread */ -void consider_pressure_events(void) { +void consider_vm_pressure_events(void) +{ vm_dispatch_memory_pressure(); } -void vm_dispatch_memory_pressure(void) { +static void vm_dispatch_memory_pressure(void) +{ vm_pressure_klist_lock(); if (!SLIST_EMPTY(&vm_pressure_klist)) { -#if DEBUG - printf("[vm_pressure] vm_dispatch_memory_pressure\n"); -#endif + VM_PRESSURE_DEBUG(1, "[vm_pressure] vm_dispatch_memory_pressure\n"); - if (vm_try_pressure_candidates()) { + if (vm_try_pressure_candidates() == KERN_SUCCESS) { vm_pressure_klist_unlock(); return; } } - /* Else... */ + VM_PRESSURE_DEBUG(1, "[vm_pressure] could not find suitable event candidate\n"); -#if DEBUG - printf("[vm_pressure] could not find suitable event candidate\n"); -#endif - - vm_recharge_active_list(); + vm_reset_active_list(); vm_pressure_klist_unlock(); } -/* - * Try standard pressure event candidates. Called with klist lock held. - */ -int vm_try_pressure_candidates(void) { - /* - * This value is the threshold that a process must meet to be considered for scavenging. - * If a process has sufficiently little resident memory, there is probably no use scavenging it. - * At best, we'll scavenge very little memory. At worst, we'll page in code pages or malloc metadata. 
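vm_knote_register() above attaches any knote whose filter flags include NOTE_VM_PRESSURE to the active klist. Userland reaches it through kqueue with the EVFILT_VM filter; a hedged sketch follows (EVFILT_VM has not been a stable public interface on every release, so treat this as illustrative):

/*
 * Register for the memory-pressure notification dispatched above.
 * Sketch only; assumes EVFILT_VM/NOTE_VM_PRESSURE are visible in
 * <sys/event.h> on the target system.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent ev;
	int kq = kqueue();

	if (kq < 0)
		return (1);
	EV_SET(&ev, 0, EVFILT_VM, EV_ADD, NOTE_VM_PRESSURE, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1) {
		perror("kevent(EVFILT_VM)");
		return (1);
	}
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)	/* blocks */
		printf("memory pressure notification received\n");
	close(kq);
	return (0);
}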
- */ - -#define VM_PRESSURE_MINIMUM_RSIZE (10 * 1024 * 1024) - - struct proc *p_max = NULL; - unsigned int resident_max = 0; - struct knote *kn_max = NULL; - struct knote *kn; - +#if CONFIG_JETSAM + +/* Jetsam aware version. Called with lock held */ + +static struct knote * vm_find_knote_from_pid(pid_t pid) { + struct knote *kn = NULL; + SLIST_FOREACH(kn, &vm_pressure_klist, kn_selnext) { - if ( (kn != NULL ) && ( kn->kn_kq != NULL ) && ( kn->kn_kq->kq_p != NULL ) ) { - if (kn->kn_sfflags & NOTE_VM_PRESSURE) { - struct proc *p = kn->kn_kq->kq_p; - if (!(kn->kn_status & KN_DISABLED)) { - kern_return_t kr = KERN_SUCCESS; - struct task *t = (struct task *)(p->task); - struct task_basic_info basic_info; - mach_msg_type_number_t size = TASK_BASIC_INFO_COUNT; - if( ( kr = task_info(t, TASK_BASIC_INFO, (task_info_t)(&basic_info), &size)) == KERN_SUCCESS ) { - unsigned int resident_size = basic_info.resident_size; - /* - * We don't want a small process to block large processes from - * being notified again. - */ - if (resident_size >= VM_PRESSURE_MINIMUM_RSIZE) { - if (resident_size > resident_max) { - p_max = p; - resident_max = resident_size; - kn_max = kn; - } - } else { -#if DEBUG - /* There was no candidate with enough resident memory to scavenge */ - /* This debug print makes too much noise now */ - //printf("[vm_pressure] threshold failed for pid %d with %u resident, skipping...\n", p->p_pid, resident_size); -#endif - } - } else { -#if DEBUG - printf("[vm_pressure] task_info for pid %d failed with %d\n", p->p_pid, kr); -#endif - } - } else { -#if DEBUG - printf("[vm_pressure] pid %d currently disabled, skipping...\n", p->p_pid); -#endif - } - } - } else { -#if DEBUG - if (kn == NULL) { - printf("[vm_pressure] kn is NULL\n"); - } else if (kn->kn_kq == NULL) { - printf("[vm_pressure] kn->kn_kq is NULL\n"); - } else if (kn->kn_kq->kq_p == NULL) { - printf("[vm_pressure] kn->kn_kq->kq_p is NULL\n"); - } -#endif + struct proc *p; + pid_t current_pid; + + p = kn->kn_kq->kq_p; + current_pid = p->p_pid; + + if (current_pid == pid) { + break; } } - - if (kn_max == NULL) return 0; + + return kn; +} -#if DEBUG - printf("[vm_pressure] sending event to pid %d with %u resident\n", kn_max->kn_kq->kq_p->p_pid, resident_max); -#endif +static kern_return_t vm_try_pressure_candidates(void) +{ + struct knote *kn = NULL; + pid_t target_pid = (pid_t)-1; - KNOTE_DETACH(&vm_pressure_klist, kn_max); - struct klist dispatch_klist = { NULL }; - KNOTE_ATTACH(&dispatch_klist, kn_max); - KNOTE(&dispatch_klist, NOTE_VM_PRESSURE); - KNOTE_ATTACH(&vm_pressure_klist_dormant, kn_max); - - return 1; + /* If memory is low, and there's a pid to target... */ + target_pid = memorystatus_request_vm_pressure_candidate(); + while (target_pid != -1) { + /* ...look it up in the list, and break if found... */ + if ((kn = vm_find_knote_from_pid(target_pid))) { + break; + } + + /* ...otherwise, go round again. 
*/ + target_pid = memorystatus_request_vm_pressure_candidate(); + } + + if (NULL == kn) { + VM_PRESSURE_DEBUG(0, "[vm_pressure] can't find candidate pid\n"); + return KERN_FAILURE; + } + + /* ...and dispatch the note */ + VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d, free pages %d\n", kn->kn_kq->kq_p->p_pid, memorystatus_available_pages); + + KNOTE(&vm_pressure_klist, target_pid); + + memorystatus_send_pressure_note(target_pid); + + return KERN_SUCCESS; } +static void vm_reset_active_list(void) { + /* No-op */ +} + +#if DEVELOPMENT || DEBUG + +/* Test purposes only */ +boolean_t vm_dispatch_pressure_note_to_pid(pid_t pid) { + struct knote *kn; + + vm_pressure_klist_lock(); + + kn = vm_find_knote_from_pid(pid); + if (kn) { + KNOTE(&vm_pressure_klist, pid); + } + + vm_pressure_klist_unlock(); + + return kn ? TRUE : FALSE; +} + +#endif /* DEVELOPMENT || DEBUG */ + +#else /* CONFIG_MEMORYSTATUS */ + +static kern_return_t vm_try_pressure_candidates(void) +{ + struct knote *kn = NULL, *kn_max = NULL; + unsigned int resident_max = 0; + pid_t target_pid = -1; + struct klist dispatch_klist = { NULL }; + kern_return_t kr = KERN_SUCCESS; + struct timeval curr_tstamp = {0, 0}; + int elapsed_msecs = 0; + proc_t target_proc = PROC_NULL; + + microuptime(&curr_tstamp); + + SLIST_FOREACH(kn, &vm_pressure_klist, kn_selnext) { + struct mach_task_basic_info basic_info; + mach_msg_type_number_t size = MACH_TASK_BASIC_INFO_COUNT; + unsigned int resident_size = 0; + proc_t p = PROC_NULL; + struct task* t = TASK_NULL; + + p = kn->kn_kq->kq_p; + proc_list_lock(); + if (p != proc_ref_locked(p)) { + p = PROC_NULL; + proc_list_unlock(); + continue; + } + proc_list_unlock(); + + t = (struct task *)(p->task); + + timevalsub(&curr_tstamp, &p->vm_pressure_last_notify_tstamp); + elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000; + + if (elapsed_msecs < VM_PRESSURE_NOTIFY_WAIT_PERIOD) { + proc_rele(p); + continue; + } + + if( ( kr = task_info(t, MACH_TASK_BASIC_INFO, (task_info_t)(&basic_info), &size)) != KERN_SUCCESS ) { + VM_PRESSURE_DEBUG(1, "[vm_pressure] task_info for pid %d failed with %d\n", p->p_pid, kr); + proc_rele(p); + continue; + } + + /* + * We don't want a small process to block large processes from + * being notified again. + */ + resident_size = (basic_info.resident_size)/(MB); + if (resident_size >= VM_PRESSURE_MINIMUM_RSIZE) { + if (resident_size > resident_max) { + resident_max = resident_size; + kn_max = kn; + target_pid = p->p_pid; + target_proc = p; + } + } else { + /* There was no candidate with enough resident memory to scavenge */ + VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %u resident...\n", p->p_pid, resident_size); + } + proc_rele(p); + } + + if (kn_max == NULL || target_pid == -1) { + return KERN_FAILURE; + } + + VM_DEBUG_EVENT(vm_pageout_scan, VM_PRESSURE_EVENT, DBG_FUNC_NONE, target_pid, resident_max, 0, 0); + VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %u resident\n", kn_max->kn_kq->kq_p->p_pid, resident_max); + + KNOTE_DETACH(&vm_pressure_klist, kn_max); + + target_proc = proc_find(target_pid); + if (target_proc != PROC_NULL) { + KNOTE_ATTACH(&dispatch_klist, kn_max); + KNOTE(&dispatch_klist, target_pid); + KNOTE_ATTACH(&vm_pressure_klist_dormant, kn_max); + + microuptime(&target_proc->vm_pressure_last_notify_tstamp); + proc_rele(target_proc); + } + + return KERN_SUCCESS; +} /* * Remove all elements from the dormant list and place them on the active list. * Called with klist lock held. 
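The non-jetsam path above rate-limits notifications per process: it subtracts the last-notify timestamp from the current uptime and compares the difference, in milliseconds, against VM_PRESSURE_NOTIFY_WAIT_PERIOD. The same arithmetic in stand-alone form, where timersub(3) is the userland analogue of the kernel's timevalsub():

/*
 * Stand-alone version of the elapsed-milliseconds computation used by
 * the pressure-notification rate limiter above.
 */
#include <sys/time.h>

static int
elapsed_msecs(struct timeval now, struct timeval then)
{
	struct timeval d;

	timersub(&now, &then, &d);	/* now - then, normalized */
	return ((int)(d.tv_sec * 1000 + d.tv_usec / 1000));
}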
*/ -void vm_recharge_active_list(void) { +static void vm_reset_active_list(void) { /* Re-charge the main list from the dormant list if possible */ if (!SLIST_EMPTY(&vm_pressure_klist_dormant)) { -#if DEBUG - printf("[vm_pressure] recharging main list from dormant list\n"); -#endif struct knote *kn; + + VM_PRESSURE_DEBUG(1, "[vm_pressure] recharging main list from dormant list\n"); + while (!SLIST_EMPTY(&vm_pressure_klist_dormant)) { kn = SLIST_FIRST(&vm_pressure_klist_dormant); SLIST_REMOVE_HEAD(&vm_pressure_klist_dormant, kn_selnext); @@ -234,3 +366,5 @@ void vm_recharge_active_list(void) { } } } + +#endif /* CONFIG_MEMORYSTATUS */ diff --git a/bsd/kern/vm_pressure.h b/bsd/kern/vm_pressure.h index 8063c820a..059e9c23c 100644 --- a/bsd/kern/vm_pressure.h +++ b/bsd/kern/vm_pressure.h @@ -31,11 +31,16 @@ #include -static lck_mtx_t vm_pressure_klist_mutex; +void vm_pressure_init(lck_grp_t *grp, lck_attr_t *attr); int vm_knote_register(struct knote *); void vm_knote_unregister(struct knote *); -void consider_pressure_events(void); +void consider_vm_pressure_events(void); +void vm_pressure_proc_cleanup(proc_t); + +#if CONFIG_MEMORYSTATUS && (DEVELOPMENT || DEBUG) +boolean_t vm_dispatch_pressure_note_to_pid(pid_t pid); +#endif #endif /* VM_PRESSURE_H */ diff --git a/bsd/libkern/libkern.h b/bsd/libkern/libkern.h index 8259186d0..223f3a526 100644 --- a/bsd/libkern/libkern.h +++ b/bsd/libkern/libkern.h @@ -77,6 +77,7 @@ #include #include + #ifdef __APPLE_API_OBSOLETE /* BCD conversions. */ extern u_char const bcd2bin_data[]; diff --git a/bsd/machine/exec.h b/bsd/machine/exec.h index 1a6417179..a38f0dd74 100644 --- a/bsd/machine/exec.h +++ b/bsd/machine/exec.h @@ -41,14 +41,6 @@ struct exec_info { char **ev; }; -struct exec_archhandler { - char path[MAXPATHLEN]; - uint32_t fsid; - uint64_t fileid; -}; - -extern struct exec_archhandler exec_archhandler_ppc; -int set_archhandler(struct proc *, int); int grade_binary(cpu_type_t, cpu_subtype_t); #if defined (__i386__) || defined(__x86_64__) diff --git a/bsd/machine/setjmp.h b/bsd/machine/setjmp.h index 262acfbc8..b349878d0 100644 --- a/bsd/machine/setjmp.h +++ b/bsd/machine/setjmp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -31,10 +31,6 @@ #ifndef _MACHINE_SETJMP_H_ #define _MACHINE_SETJMP_H_ -#if defined (__i386__) || defined(__x86_64__) -#include "i386/setjmp.h" -#else -#error architecture not supported -#endif +#include #endif /* _MACHINE_SETJMP_H_ */ diff --git a/bsd/man/man2/getattrlist.2 b/bsd/man/man2/getattrlist.2 index e2af8fc60..df7e6ac1b 100644 --- a/bsd/man/man2/getattrlist.2 +++ b/bsd/man/man2/getattrlist.2 @@ -678,7 +678,7 @@ natively. Callers should be aware of this when requesting the full path of a har A .Vt timespec that contains the time that the file system object was created or renamed into -its containing directory. Note that inconsistent behavior may obe observed +its containing directory. Note that inconsistent behavior may be observed when this attribute is requested on hard-linked items. .Pp . diff --git a/bsd/man/man2/getaudit.2 b/bsd/man/man2/getaudit.2 index d2895cd33..8966090a6 100644 --- a/bsd/man/man2/getaudit.2 +++ b/bsd/man/man2/getaudit.2 @@ -1,193 +1 @@ -.\" -.\" Copyright (c) 2008-2009 Apple Inc. All rights reserved. 
-.\" -.\" @APPLE_OSREFERENCE_LICENSE_HEADER_START@ -.\" -.\" This file contains Original Code and/or Modifications of Original Code -.\" as defined in and that are subject to the Apple Public Source License -.\" Version 2.0 (the 'License'). You may not use this file except in -.\" compliance with the License. The rights granted to you under the License -.\" may not be used to create, or enable the creation or redistribution of, -.\" unlawful or unlicensed copies of an Apple operating system, or to -.\" circumvent, violate, or enable the circumvention or violation of, any -.\" terms of an Apple operating system software license agreement. -.\" -.\" Please obtain a copy of the License at -.\" http://www.opensource.apple.com/apsl/ and read it before using this file. -.\" -.\" The Original Code and all software distributed under the License are -.\" distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER -.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, -.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, -.\" FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. -.\" Please see the License for the specific language governing rights and -.\" limitations under the License. -.\" -.\" @APPLE_OSREFERENCE_LICENSE_HEADER_END@ -.\" -.Dd March 6, 2009 -.Dt GETAUDIT 2 -.Os -.Sh NAME -.Nm getaudit , -.Nm getaudit_addr -.Nd "retrieve audit session state" -.Sh SYNOPSIS -.In bsm/audit.h -.Ft int -.Fn getaudit "auditinfo_t *auditinfo" -.Ft int -.Fn getaudit_addr "auditinfo_addr_t *auditinfo_addr" "u_int length" -.Sh DESCRIPTION -The -.Fn getaudit -system call -retrieves the active audit session state for the current process via the -.Vt auditinfo_t -pointed to by -.Fa auditinfo . -The -.Fn getaudit_addr -system call -retrieves extended state via -.Fa auditinfo_addr -and -.Fa length . -.Pp -The -.Fa auditinfo_t -data structure is defined as follows: -.nf -.in +4n -struct auditinfo { - au_id_t ai_auid; /* Audit user ID */ - au_mask_t ai_mask; /* Audit masks */ - au_tid_t ai_termid; /* Terminal ID */ - au_asid_t ai_asid; /* Audit session ID */ -}; -typedef struct auditinfo auditinfo_t; -.in -.fi -.Pp -The -.Fa ai_auid -variable contains the audit identifier which is recorded in the audit log for -each event the process caused. -.Pp -The -.Fa au_mask_t -data structure defines the bit mask for auditing successful and failed events -out of the predefined list of event classes. It is defined as follows: -.nf -.in +4n -struct au_mask { - unsigned int am_success; /* success bits */ - unsigned int am_failure; /* failure bits */ -}; -typedef struct au_mask au_mask_t; -.in -.fi -.Pp -The -.Fa au_termid_t -data structure defines the Terminal ID recorded with every event caused by the -process. It is defined as follows: -.nf -.in +4n -struct au_tid { - dev_t port; - u_int32_t machine; -}; -typedef struct au_tid au_tid_t; -.in -.fi -.Pp -The -.Fa ai_asid -variable contains the audit session ID which is recorded with every event -caused by the process. -.Pp -The -.Fn getaudit_addr -system call -uses the expanded -.Fa auditinfo_addr_t -data structure supports Terminal IDs with larger addresses such as those used -in IP version 6. It is defined as follows: -.nf -.in +4n -struct auditinfo_addr { - au_id_t ai_auid; /* Audit user ID. */ - au_mask_t ai_mask; /* Audit masks. */ - au_tid_addr_t ai_termid; /* Terminal ID. */ - au_asid_t ai_asid; /* Audit session ID. */ - u_int64_t ai_flags; /* Audit session flags. 
*/ -}; -typedef struct auditinfo_addr auditinfo_addr_t; -.in -.fi -.Pp -The -.Fa au_tid_addr_t -data structure which includes a larger address storage field and an additional -field with the type of address stored: -.nf -.in +4n -struct au_tid_addr { - dev_t at_port; - u_int32_t at_type; - u_int32_t at_addr[4]; -}; -typedef struct au_tid_addr au_tid_addr_t; -.in -.fi -.Pp -Without appropriate privilege the audit mask fields will be set to all -ones. -.Sh RETURN VALUES -.Rv -std getaudit getaudit_addr -.Sh ERRORS -The -.Fn getaudit -function will fail if: -.Bl -tag -width Er -.It Bq Er EFAULT -A failure occurred while data transferred to or from -the kernel failed. -.It Bq Er EINVAL -Illegal argument was passed by a system call. -.It Bq Er EOVERFLOW -The -.Fa length -argument indicates an overflow condition will occur. -.It Bq Er ERANGE -The address is too big and, therefore, -.Fn getaudit_addr -should be used instead. -.El -.Sh SEE ALSO -.Xr audit 2 , -.Xr auditon 2 , -.Xr getauid 2 , -.Xr setaudit 2 , -.Xr setauid 2 , -.Xr libbsm 3 -.Sh HISTORY -The OpenBSM implementation was created by McAfee Research, the security -division of McAfee Inc., under contract to Apple Computer Inc.\& in 2004. -It was subsequently adopted by the TrustedBSD Project as the foundation for -the OpenBSM distribution. -.Sh AUTHORS -.An -nosplit -This software was created by McAfee Research, the security research division -of McAfee, Inc., under contract to Apple Computer Inc. -Additional authors include -.An Wayne Salamon , -.An Robert Watson , -and SPARTA Inc. -.Pp -The Basic Security Module (BSM) interface to audit records and audit event -stream format were defined by Sun Microsystems. -.Pp -This manual page was written by -.An Robert Watson Aq rwatson@FreeBSD.org . +.so man2/getaudit_addr.2 diff --git a/bsd/man/man2/getaudit_addr.2 b/bsd/man/man2/getaudit_addr.2 index 25e765cd5..26a349b25 100644 --- a/bsd/man/man2/getaudit_addr.2 +++ b/bsd/man/man2/getaudit_addr.2 @@ -1 +1,214 @@ -.so man2/getaudit.2 +.\" +.\" Copyright (c) 2008-2011 Apple Inc. All rights reserved. +.\" +.\" @APPLE_OSREFERENCE_LICENSE_HEADER_START@ +.\" +.\" This file contains Original Code and/or Modifications of Original Code +.\" as defined in and that are subject to the Apple Public Source License +.\" Version 2.0 (the 'License'). You may not use this file except in +.\" compliance with the License. The rights granted to you under the License +.\" may not be used to create, or enable the creation or redistribution of, +.\" unlawful or unlicensed copies of an Apple operating system, or to +.\" circumvent, violate, or enable the circumvention or violation of, any +.\" terms of an Apple operating system software license agreement. +.\" +.\" Please obtain a copy of the License at +.\" http://www.opensource.apple.com/apsl/ and read it before using this file. +.\" +.\" The Original Code and all software distributed under the License are +.\" distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. +.\" Please see the License for the specific language governing rights and +.\" limitations under the License. 
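The replacement page added below documents getaudit_addr(2). As a companion to its SYNOPSIS, a minimal sketch of calling it (without privilege the mask fields come back as all ones, per the DESCRIPTION):

/*
 * Sketch: fetch the current audit session state as described in the
 * manual page being added here.
 */
#include <bsm/audit.h>
#include <bsm/audit_session.h>
#include <stdio.h>

int
main(void)
{
	auditinfo_addr_t aia;

	if (getaudit_addr(&aia, sizeof(aia)) == -1) {
		perror("getaudit_addr");
		return (1);
	}
	printf("audit user ID %d, audit session ID %d\n",
	    (int)aia.ai_auid, (int)aia.ai_asid);
	return (0);
}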
+.\" +.\" @APPLE_OSREFERENCE_LICENSE_HEADER_END@ +.\" +.Dd March 6, 2011 +.Dt GETAUDIT_ADDR 2 +.Os +.Sh NAME +.Nm getaudit_addr , +.Nm getaudit(NOW DEPRECATED) +.Nd "retrieve audit session state" +.Sh SYNOPSIS +.In bsm/audit.h +.In bsm/audit_session.h +.Ft int +.Fn getaudit_addr "auditinfo_addr_t *auditinfo_addr" "u_int length" +.Sh SYNOPSIS (NOW DEPRECATED) +.In bsm/audit.h +.Ft int +.Fn getaudit "auditinfo_t *auditinfo" +.Sh DESCRIPTION +The +.Fn getaudit_addr +system call +retrieves extended state via +.Fa auditinfo_addr +and +.Fa length . +It +uses the +.Fa auditinfo_addr_t +data structure supports Terminal IDs incuding those with larger addresses such +as those used in IP version 6. It is defined as follows: +.nf +.in +4n +struct auditinfo_addr { + au_id_t ai_auid; /* Audit user ID. */ + au_mask_t ai_mask; /* Audit masks. */ + au_tid_addr_t ai_termid; /* Terminal ID. */ + au_asid_t ai_asid; /* Audit session ID. */ + u_int64_t ai_flags; /* Audit session flags. */ +}; +typedef struct auditinfo_addr auditinfo_addr_t; +.in +.fi +.Pp +The +.Fa ai_auid +variable contains the audit identifier which is recorded in the audit log for +each event the process caused. +.Pp +The +.Fa au_mask_t +data structure defines the bit mask for auditing successful and failed events +out of the predefined list of event classes. It is defined as follows: +.nf +.in +4n +struct au_mask { + unsigned int am_success; /* success bits */ + unsigned int am_failure; /* failure bits */ +}; +typedef struct au_mask au_mask_t; +.in +.fi +.Pp +The +.Fa au_tid_addr_t +data structure which includes a larger address storage field and an additional +field with the type of address stored: +.nf +.in +4n +struct au_tid_addr { + dev_t at_port; + u_int32_t at_type; + u_int32_t at_addr[4]; +}; +typedef struct au_tid_addr au_tid_addr_t; +.in +.fi +.Pp +The +.Fa ai_asid +variable contains the audit session ID which is recorded with every event +caused by the process. +.Pp +The +.Fa ai_flags +variable contains flags that are opaque to the kernel and used by various +consumers of the +.Fa auditinfo_addr +data. Please see the +.Ao Pa bsm/audit_session.h Ac +header file for more information +and flag definitions for this platform. +.Pp +Without appropriate privilege the audit mask fields will be set to all +ones. +.Pp +The +.Fn getaudit +system call (NOW DEPRECATED) +retrieves the active audit session state for the current process via the +.Vt auditinfo_t +pointed to by +.Fa auditinfo . +.Pp +The +.Fa auditinfo_t +data structure (NOW DEPRECATED) is defined as follows: +.nf +.in +4n +struct auditinfo { + au_id_t ai_auid; /* Audit user ID */ + au_mask_t ai_mask; /* Audit masks */ + au_tid_t ai_termid; /* Terminal ID */ + au_asid_t ai_asid; /* Audit session ID */ +}; +typedef struct auditinfo auditinfo_t; +.in +.fi +.Pp +The +.Fa au_termid_t +data structure (NOW DEPRECATED) defines the Terminal ID recorded with +every event caused by the process. It is defined as follows: +.nf +.in +4n +struct au_tid { + dev_t port; + u_int32_t machine; +}; +typedef struct au_tid au_tid_t; +.in +.fi +.Sh RETURN VALUES +.Rv -std getaudit_addr +.Sh ERRORS +The +.Fn getaudit_addr +function will fail if: +.Bl -tag -width Er +.It Bq Er EFAULT +A failure occurred while data transferred to or from +the kernel failed. +.It Bq Er EINVAL +Illegal argument was passed by a system call. +.It Bq Er EOVERFLOW +The +.Fa length +argument indicates an overflow condition will occur. +.It Bq Er ERANGE +The address is too big. 
+.El +.Sh SEE ALSO +.Xr audit 2 , +.Xr auditon 2 , +.Xr getauid 2 , +.Xr setaudit 2 , +.Xr setauid 2 , +.Xr libbsm 3 +.Sh HISTORY +The OpenBSM implementation was created by McAfee Research, the security +division of McAfee Inc., under contract to Apple Computer Inc.\& in 2004. +It was subsequently adopted by the TrustedBSD Project as the foundation for +the OpenBSM distribution. +.Pp +.Fn getaudit_addr +replaced +.Fn getaudit +in Mac OS X 10.7 to support longer terminal addresses such as those used +by IP version 6. +.Fn getaudit +is now deprecated and +.Fn getaudit_addr +should be used instead. +.Sh AUTHORS +.An -nosplit +This software was created by McAfee Research, the security research division +of McAfee, Inc., under contract to Apple Computer Inc. +Additional authors include +.An Wayne Salamon , +.An Robert Watson , +and SPARTA Inc. +.Pp +The Basic Security Module (BSM) interface to audit records and audit event +stream format were defined by Sun Microsystems. +.Pp +This manual page was written by +.An Robert Watson Aq rwatson@FreeBSD.org +and +.An Stacey Son Aq sson@FreeBSD.org . diff --git a/bsd/man/man2/getgroups.2 b/bsd/man/man2/getgroups.2 index a941bc389..2ae5297fc 100644 --- a/bsd/man/man2/getgroups.2 +++ b/bsd/man/man2/getgroups.2 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2008, 2010 Apple Inc. All rights reserved. +.\" Copyright (c) 2008-2011 Apple Inc. All rights reserved. .\" .\" @APPLE_LICENSE_HEADER_START@ .\" @@ -56,7 +56,7 @@ .\" .\" @(#)getgroups.2 8.2 (Berkeley) 4/16/94 .\" -.Dd September 17, 2010 +.Dd October 28, 2011 .Dt GETGROUPS 2 .Os BSD 4.2 .Sh NAME @@ -91,6 +91,13 @@ returns the number of groups without modifying the .Fa grouplist[] array. .Pp +Calling +.Xr initgroups 3 +to opt-in for supplementary groups will cause +.Fn getgroups +to return a single entry, the GID that was passed to +.Xr initgroups 3 . +.Pp To provide compatibility with applications that use .Fn getgroups in environments where users may be in more than diff --git a/bsd/man/man2/getrusage.2 b/bsd/man/man2/getrusage.2 index d06572017..c243f97e1 100644 --- a/bsd/man/man2/getrusage.2 +++ b/bsd/man/man2/getrusage.2 @@ -67,7 +67,7 @@ the following structure: struct rusage { struct timeval ru_utime; /* user time used */ struct timeval ru_stime; /* system time used */ - long ru_maxrss; /* integral max resident set size */ + long ru_maxrss; /* max resident set size */ long ru_ixrss; /* integral shared text memory size */ long ru_idrss; /* integral unshared data size */ long ru_isrss; /* integral unshared stack size */ @@ -92,7 +92,7 @@ the total amount of time spent executing in user mode. the total amount of time spent in the system executing on behalf of the process(es). .It Fa ru_maxrss -the maximum resident set size utilized (in kilobytes). +the maximum resident set size utilized (in bytes). .It Fa ru_ixrss an \*(lqintegral\*(rq value indicating the amount of memory used by the text segment diff --git a/bsd/man/man2/getsockopt.2 b/bsd/man/man2/getsockopt.2 index e0408c272..0ec25a000 100644 --- a/bsd/man/man2/getsockopt.2 +++ b/bsd/man/man2/getsockopt.2 @@ -275,7 +275,7 @@ operation testing the ability to write to a socket will return true only if the low-water mark amount could be processed. The default value for .Dv SO_SNDLOWAT -is set to a convenient size for network efficiency, often 1024. +is set to a convenient size for network efficiency, often 2048. .Pp .Dv SO_RCVLOWAT is an option to set the minimum count for input operations. 
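A minimal sketch of calling getaudit_addr(2) as documented above (illustrative only, not part of this patch; recall that without appropriate privilege the mask fields read back as all ones):

#include <bsm/audit.h>
#include <bsm/audit_session.h>
#include <stdio.h>

int
main(void)
{
	auditinfo_addr_t aia;

	/* the length argument lets the kernel reject mismatched sizes (EOVERFLOW) */
	if (getaudit_addr(&aia, sizeof(aia)) == -1) {
		perror("getaudit_addr");
		return 1;
	}
	printf("auid=%d asid=%d tid.type=%u flags=0x%llx\n",
	    (int)aia.ai_auid, (int)aia.ai_asid,
	    (unsigned)aia.ai_termid.at_type,
	    (unsigned long long)aia.ai_flags);
	return 0;
}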
diff --git a/bsd/man/man2/searchfs.2 b/bsd/man/man2/searchfs.2 index 663f38ac0..511776635 100644 --- a/bsd/man/man2/searchfs.2 +++ b/bsd/man/man2/searchfs.2 @@ -638,6 +638,10 @@ ATTR_CMN_OWNERID ATTR_CMN_GRPID .It ATTR_CMN_ACCESSMASK +.It +ATTR_CMN_FILEID +.It +ATTR_CMN_PARENTID .Pp . .It diff --git a/bsd/man/man2/setaudit.2 b/bsd/man/man2/setaudit.2 index b626e0cf8..1fa5dda1f 100644 --- a/bsd/man/man2/setaudit.2 +++ b/bsd/man/man2/setaudit.2 @@ -1,236 +1 @@ -.\" -.\" Copyright (c) 2008-2009 Apple Inc. All rights reserved. -.\" -.\" @APPLE_LICENSE_HEADER_START@ -.\" -.\" This file contains Original Code and/or Modifications of Original Code -.\" as defined in and that are subject to the Apple Public Source License -.\" Version 2.0 (the 'License'). You may not use this file except in -.\" compliance with the License. Please obtain a copy of the License at -.\" http://www.opensource.apple.com/apsl/ and read it before using this -.\" file. -.\" -.\" The Original Code and all software distributed under the License are -.\" distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER -.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, -.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, -.\" FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. -.\" Please see the License for the specific language governing rights and -.\" limitations under the License. -.\" -.\" @APPLE_LICENSE_HEADER_END@ -.\" -.Dd March 23, 2009 -.Dt SETAUDIT 2 -.Os -.Sh NAME -.Nm setaudit , -.Nm setaudit_addr -.Nd "set audit session state" -.Sh SYNOPSIS -.In bsm/audit.h -.Ft int -.Fn setaudit "auditinfo_t *auditinfo" -.Ft int -.Fn setaudit_addr "auditinfo_addr_t *auditinfo_addr" "u_int length" -.Sh DESCRIPTION -The -.Fn setaudit -system call -sets the active audit session state for the current process via the -.Vt auditinfo_t -pointed to by -.Fa auditinfo . -The -.Fn setaudit_addr -system call -sets extended state via -.Fa auditinfo_addr -and -.Fa length . -.Pp -The -.Fa auditinfo_t -data structure is defined as follows: -.nf -.in +4n -struct auditinfo { - au_id_t ai_auid; /* Audit user ID */ - au_mask_t ai_mask; /* Audit masks */ - au_tid_t ai_termid; /* Terminal ID */ - au_asid_t ai_asid; /* Audit session ID */ -}; -typedef struct auditinfo auditinfo_t; -.in -.fi -.Pp -The -.Fa ai_auid -variable contains the audit identifier which is recorded in the audit log for -each event the process caused. -The value of AU_DEFAUDITID (-1) should not be used. -The exception is if the value of audit identifier is known at the -start of the session but will be determined and set later. -Until -.Fa ai_auid -is set to something other than AU_DEFAUDITID any audit events -generated by the system with be filtered by the non-attributed audit -mask. -.Pp -The -.Fa au_mask_t -data structure defines the bit mask for auditing successful and failed events -out of the predefined list of event classes. It is defined as follows: -.nf -.in +4n -struct au_mask { - unsigned int am_success; /* success bits */ - unsigned int am_failure; /* failure bits */ -}; -typedef struct au_mask au_mask_t; -.in -.fi -.Pp -The -.Fa au_termid_t -data structure defines the Terminal ID recorded with every event caused by the -process. It is defined as follows: -.nf -.in +4n -struct au_tid { - dev_t port; - u_int32_t machine; -}; -typedef struct au_tid au_tid_t; -.in -.fi -.Pp -The -.Fa ai_asid -variable contains the audit session ID which is recorded with every event -caused by the process. 
It can be any value in the range 1 to PID_MAX (99999). -If the value of AU_ASSIGN_ASID is used for -.Fa ai_asid -a unique session ID will be generated by the kernel. -The audit session ID will be returned in the -.Fa ai_asid -field on success. -.Pp -The -.Fn setaudit_addr -system call -uses the expanded -.Fa auditinfo_addr_t -data structure which supports Terminal IDs with larger addresses -such as those used in IP version 6. It is defined as follows: -.nf -.in +4n -struct auditinfo_addr { - au_id_t ai_auid; /* Audit user ID. */ - au_mask_t ai_mask; /* Audit masks. */ - au_tid_addr_t ai_termid; /* Terminal ID. */ - au_asid_t ai_asid; /* Audit session ID. */ - u_int64_t ai_flags; /* Audit session flags */ -}; -typedef struct auditinfo_addr auditinfo_addr_t; -.in -.fi -.Pp -The -.Fa au_tid_addr_t -data structure includes a larger address storage field and an additional -field with the type of address stored: -.nf -.in +4n -struct au_tid_addr { - dev_t at_port; - u_int32_t at_type; - u_int32_t at_addr[4]; -}; -typedef struct au_tid_addr au_tid_addr_t; -.in -.fi -.Pp -The -.Fa ai_flags -field is opaque to the kernel and can be used to store user -defined session flags. -.Pp -These system calls require an appropriate privilege to complete. -.Pp -These system calls should only be called once at the start of a new -session and not again during the same session to update the session -information. -There are some exceptions, however. -The -.Fa ai_auid -field may be updated later if initially set to the value of -AU_DEFAUDITID (-1). -Likewise, the -.Fa ai_termid -fields may be updated later if the -.Fa at_type -field in -.Fa au_tid_addr -is set to AU_IPv4 and the other -.Fa ai_tid_addr -fields are all set to zero. -Creating a new session is done by setting the -.Fa ai_asid -field to an unique session value or AU_ASSIGN_ASID. -These system calls will fail when attempting to change the -.Fa ai_auid -or -.Fa ai_termid -fields once set to something other than the default values. -The -.Fa ai_flags -field may be updated only according to local access control -policy but this is usually accomplished with -.Xr auditon 2 -using the A_SETSFLAGS command. -The audit preselection masks may be changed at any time -but are usually updated with -.Xr auditon 2 -using the A_SETPMASK command. -.Sh RETURN VALUES -.Rv -std setaudit setaudit_addr -.Sh ERRORS -.Bl -tag -width Er -.It Bq Er EFAULT -A failure occurred while data transferred to or from -the kernel failed. -.It Bq Er EINVAL -Illegal argument was passed by a system call. -.It Bq Er EPERM -The process does not have sufficient permission to complete -the operation. -.El -.Sh SEE ALSO -.Xr audit 2 , -.Xr auditon 2 , -.Xr getaudit 2 , -.Xr getauid 2 , -.Xr setauid 2 , -.Xr libbsm 3 -.Sh HISTORY -The OpenBSM implementation was created by McAfee Research, the security -division of McAfee Inc., under contract to Apple Computer Inc.\& in 2004. -It was subsequently adopted by the TrustedBSD Project as the foundation for -the OpenBSM distribution. -.Sh AUTHORS -.An -nosplit -This software was created by McAfee Research, the security research division -of McAfee, Inc., under contract to Apple Computer Inc. -Additional authors include -.An Wayne Salamon , -.An Robert Watson , -and SPARTA Inc. -.Pp -The Basic Security Module (BSM) interface to audit records and audit event -stream format were defined by Sun Microsystems. -.Pp -This manual page was written by -.An Robert Watson Aq rwatson@FreeBSD.org -and -.An Stacey Son Aq sson@FreeBSD.org . 
+.so man2/setaudit_addr.2
diff --git a/bsd/man/man2/setaudit_addr.2 b/bsd/man/man2/setaudit_addr.2
index f11b4169f..d6b48c4d5 100644
--- a/bsd/man/man2/setaudit_addr.2
+++ b/bsd/man/man2/setaudit_addr.2
@@ -1 +1,253 @@
-.so man2/setaudit.2
+.\"
+.\" Copyright (c) 2008-2011 Apple Inc. All rights reserved.
+.\"
+.\" @APPLE_LICENSE_HEADER_START@
+.\"
+.\" This file contains Original Code and/or Modifications of Original Code
+.\" as defined in and that are subject to the Apple Public Source License
+.\" Version 2.0 (the 'License'). You may not use this file except in
+.\" compliance with the License. Please obtain a copy of the License at
+.\" http://www.opensource.apple.com/apsl/ and read it before using this
+.\" file.
+.\"
+.\" The Original Code and all software distributed under the License are
+.\" distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+.\" FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+.\" Please see the License for the specific language governing rights and
+.\" limitations under the License.
+.\"
+.\" @APPLE_LICENSE_HEADER_END@
+.\"
+.Dd March 4, 2011
+.Dt SETAUDIT_ADDR 2
+.Os
+.Sh NAME
+.Nm setaudit_addr ,
+.Nm setaudit(NOW DEPRECATED)
+.Nd "set audit session state"
+.Sh SYNOPSIS
+.In bsm/audit.h
+.In bsm/audit_session.h
+.Ft int
+.Fn setaudit_addr "auditinfo_addr_t *auditinfo_addr" "u_int length"
+.Sh SYNOPSIS (NOW DEPRECATED)
+.In bsm/audit.h
+.Ft int
+.Fn setaudit "auditinfo_t *auditinfo"
+.Sh DESCRIPTION
+The
+.Fn setaudit_addr
+system call
+uses the
+.Fa auditinfo_addr_t
+data structure for the
+.Fa auditinfo_addr
+argument, which supports Terminal IDs with large addresses
+such as those used in IP version 6. It is defined as follows:
+.nf
+.in +4n
+struct auditinfo_addr {
+	au_id_t		ai_auid;	/* Audit user ID. */
+	au_mask_t	ai_mask;	/* Audit masks. */
+	au_tid_addr_t	ai_termid;	/* Terminal ID. */
+	au_asid_t	ai_asid;	/* Audit session ID. */
+	u_int64_t	ai_flags;	/* Audit session flags */
+};
+typedef struct auditinfo_addr auditinfo_addr_t;
+.in
+.fi
+.Pp
+The
+.Fa ai_auid
+variable contains the audit identifier which is recorded in the audit log for
+each event the process caused. The value of AU_DEFAUDITID (-1) should not be
+used. The exception is if the value of the audit identifier is not known at
+the start of the session but will be determined and set later. Until
+.Fa ai_auid
+is set to something other than AU_DEFAUDITID any audit events
+generated by the system will be filtered by the non-attributed audit
+mask.
+.Pp
+The
+.Fa au_mask_t
+data structure defines the bit mask for auditing successful and failed events
+out of the predefined list of event classes. It is defined as follows:
+.nf
+.in +4n
+struct au_mask {
+	unsigned int	am_success;	/* success bits */
+	unsigned int	am_failure;	/* failure bits */
+};
+typedef struct au_mask au_mask_t;
+.in
+.fi
+.Pp
+The
+.Fa au_tid_addr_t
+data structure includes a larger address storage field and an additional
+field with the type of address stored:
+.nf
+.in +4n
+struct au_tid_addr {
+	dev_t		at_port;
+	u_int32_t	at_type;
+	u_int32_t	at_addr[4];
+};
+typedef struct au_tid_addr au_tid_addr_t;
+.in
+.fi
+.Pp
+The
+.Fa ai_asid
+variable contains the audit session ID which is recorded with every event
+caused by the process. It can be any value in the range 1 to PID_MAX (99999).
+If the value of AU_ASSIGN_ASID is used for
+.Fa ai_asid
+a unique session ID will be generated by the kernel.
+The audit session ID will be returned in the
+.Fa ai_asid
+field on success.
+.Pp
+The
+.Fa ai_flags
+field is opaque to the kernel and can be used to store flags associated
+with the audit session. Please see the
+.Ao Pa bsm/audit_session.h Ac
+header file
+for more information and flag definitions for this platform.
+.Pp
+The
+.Fn setaudit_addr
+system call requires an appropriate privilege to complete.
+.Pp
+This system call should only be called once at the start of a new
+session and not again during the same session to update the session
+information.
+There are some exceptions, however.
+The
+.Fa ai_auid
+field may be updated later if initially set to the value of
+AU_DEFAUDITID (-1).
+Likewise, the
+.Fa ai_termid
+fields may be updated later if the
+.Fa at_type
+field in
+.Fa au_tid_addr
+is set to AU_IPv4 and the other
+.Fa au_tid_addr
+fields are all set to zero.
+Creating a new session is done by setting the
+.Fa ai_asid
+field to a unique session value or AU_ASSIGN_ASID.
+This system call will fail when attempting to change the
+.Fa ai_auid
+or
+.Fa ai_termid
+fields once set to something other than the default values.
+The
+.Fa ai_flags
+field may be updated only according to local access control
+policy but this is usually accomplished with
+.Xr auditon 2
+using the A_SETSFLAGS command.
+The audit preselection masks may be changed at any time
+but are usually updated with
+.Xr auditon 2
+using the A_SETPMASK command.
+.Pp
+The
+.Fn setaudit
+system call (NOW DEPRECATED)
+sets the active audit session state for the current process via the
+.Vt auditinfo_t
+pointed to by
+.Fa auditinfo .
+The
+.Fn setaudit_addr
+system call
+sets extended state via
+.Fa auditinfo_addr
+and
+.Fa length .
+.Pp
+The
+.Fa auditinfo_t
+data structure (NOW DEPRECATED) is defined as follows:
+.nf
+.in +4n
+struct auditinfo {
+	au_id_t		ai_auid;	/* Audit user ID */
+	au_mask_t	ai_mask;	/* Audit masks */
+	au_tid_t	ai_termid;	/* Terminal ID */
+	au_asid_t	ai_asid;	/* Audit session ID */
+};
+typedef struct auditinfo auditinfo_t;
+.in
+.fi
+.Pp
+The
+.Fa au_tid_t
+data structure (NOW DEPRECATED) defines the Terminal ID recorded with every
+event caused by the process. It is defined as follows:
+.nf
+.in +4n
+struct au_tid {
+	dev_t		port;
+	u_int32_t	machine;
+};
+typedef struct au_tid au_tid_t;
+.in
+.fi
+.Sh RETURN VALUES
+.Rv -std setaudit_addr
+.Sh ERRORS
+.Bl -tag -width Er
+.It Bq Er EFAULT
+A failure occurred while transferring data to or from
+the kernel.
+.It Bq Er EINVAL
+An invalid argument was passed to the system call.
+.It Bq Er EPERM
+The process does not have sufficient permission to complete
+the operation.
+.El
+.Sh SEE ALSO
+.Xr audit 2 ,
+.Xr auditon 2 ,
+.Xr getaudit 2 ,
+.Xr getauid 2 ,
+.Xr setauid 2 ,
+.Xr libbsm 3
+.Sh HISTORY
+The OpenBSM implementation was created by McAfee Research, the security
+division of McAfee Inc., under contract to Apple Computer Inc.\& in 2004.
+It was subsequently adopted by the TrustedBSD Project as the foundation for
+the OpenBSM distribution.
+.Pp
+.Fn setaudit_addr
+replaced
+.Fn setaudit
+in Mac OS X 10.7 to support longer terminal addresses such as those used
+by IP version 6.
+.Fn setaudit
+is now deprecated and
+.Fn setaudit_addr
+should be used instead.
+.Sh AUTHORS
+.An -nosplit
+This software was created by McAfee Research, the security research division
+of McAfee, Inc., under contract to Apple Computer Inc.
+Additional authors include +.An Wayne Salamon , +.An Robert Watson , +and SPARTA Inc. +.Pp +The Basic Security Module (BSM) interface to audit records and audit event +stream format were defined by Sun Microsystems. +.Pp +This manual page was written by +.An Robert Watson Aq rwatson@FreeBSD.org +and +.An Stacey Son Aq sson@FreeBSD.org . diff --git a/bsd/man/man2/setxattr.2 b/bsd/man/man2/setxattr.2 index 957c5bd77..240e8298d 100644 --- a/bsd/man/man2/setxattr.2 +++ b/bsd/man/man2/setxattr.2 @@ -175,6 +175,15 @@ Not enough space left on the file system. .Xr getxattr 2 , .Xr listxattr 2 , .Xr removexattr 2 +.Sh NOTES +Due to historical reasons, the +.Dv XATTR_FINDERINFO_NAME +(defined to be +.Dq com.apple.FinderInfo ) +extended attribute must be 32 bytes; see the +.Dv ATTR_CMN_FNDRINFO +section in +.Xr getattrlist 2 . .Sh HISTORY .Fn setxattr and diff --git a/bsd/man/man2/statfs.2 b/bsd/man/man2/statfs.2 index 85dce6a80..7e2b9ad27 100644 --- a/bsd/man/man2/statfs.2 +++ b/bsd/man/man2/statfs.2 @@ -49,7 +49,7 @@ .Fn statfs "const char *path" "struct statfs *buf" .Ft int .Fn fstatfs "int fd" "struct statfs *buf" -.Sh TRANSITIIONAL SYNOPSIS (NOW DEPRECATED) +.Sh TRANSITIONAL SYNOPSIS (NOW DEPRECATED) .Ft int .br .Fn statfs64 "const char *path" "struct statfs64 *buf" ; @@ -149,7 +149,7 @@ The routine returns the same information about an open file referenced by descriptor .Fa fd . .Sh FLAGS -.Bl -tag -width MNT_UNKOWNPERMISSIONS +.Bl -tag -width MNT_UNKNOWNPERMISSIONS These are some of the flags that may be present in the f_flags field. .It Dv MNT_RDONLY A read-only filesystem @@ -187,6 +187,8 @@ File system is journaled File system should defer writes .It Dv MNT_MULTILABEL MAC support for individual labels +.It Dv MNT_CPROTECT +File system supports per-file encrypted data protection .El .Sh CAVEATS In Mac OS X versions before 10.4, f_iosize is 4096. On these older diff --git a/bsd/man/man3/posix_spawnattr_setspecialport_np.3 b/bsd/man/man3/posix_spawnattr_setspecialport_np.3 index 8110125e2..485f9db3c 100644 --- a/bsd/man/man3/posix_spawnattr_setspecialport_np.3 +++ b/bsd/man/man3/posix_spawnattr_setspecialport_np.3 @@ -33,9 +33,7 @@ .Sh NAME .Nm posix_spawnattr_setspecialport_np .Nm posix_spawnattr_setexceptionports_np -.Nd set or get the -.Em spawn-binpref -attribute on a +.Nd set special ports on a .Em posix_spawnattr_t .Sh SYNOPSIS .Fd #include diff --git a/bsd/man/man4/Makefile b/bsd/man/man4/Makefile index a0f336945..f385d9990 100644 --- a/bsd/man/man4/Makefile +++ b/bsd/man/man4/Makefile @@ -14,7 +14,6 @@ DATAFILES = \ bpf.4 \ divert.4 \ dummynet.4 \ - faith.4 \ fd.4 \ gif.4 \ icmp.4 \ diff --git a/bsd/man/man4/inet6.4 b/bsd/man/man4/inet6.4 index be9d1e818..74fe8baf8 100644 --- a/bsd/man/man4/inet6.4 +++ b/bsd/man/man4/inet6.4 @@ -314,17 +314,17 @@ The node must be a host (not a router) for the option to be meaningful. Defaults to off. -.It Dv IPV6CTL_KEEPFAITH -.Pq ip6.keepfaith -Boolean: enable/disable -.Dq FAITH -TCP relay IPv6-to-IPv4 translator code in the kernel. -Refer -.Xr faith 4 -and -.Xr faithd 8 -for detail. -Defaults to off. +.\".It Dv IPV6CTL_KEEPFAITH +.\".Pq ip6.keepfaith +.\"Boolean: enable/disable +.\".Dq FAITH +.\"TCP relay IPv6-to-IPv4 translator code in the kernel. +.\"Refer +.\".Xr faith 4 +.\"and +.\".Xr faithd 8 +.\"for detail. +.\"Defaults to off. 
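Tying the setaudit_addr(2) page above together, a minimal sketch of starting a new audit session (illustrative only, not part of this patch; appropriate privilege is required, and AU_ASSIGN_ASID asks the kernel to pick the session ID, which is written back into ai_asid):

#include <bsm/audit.h>
#include <bsm/audit_session.h>
#include <stdio.h>
#include <string.h>

static int
start_audit_session(void)
{
	auditinfo_addr_t aia;

	memset(&aia, 0, sizeof(aia));
	aia.ai_auid = AU_DEFAUDITID;		/* audit ID determined and set later */
	aia.ai_asid = AU_ASSIGN_ASID;		/* let the kernel pick a unique ID */
	aia.ai_termid.at_type = AU_IPv4;	/* zeroed terminal ID, updatable later */

	if (setaudit_addr(&aia, sizeof(aia)) == -1) {
		perror("setaudit_addr");
		return -1;
	}
	return (int)aia.ai_asid;		/* session ID written back by the kernel */
}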
.It Dv IPV6CTL_LOG_INTERVAL .Pq ip6.log_interval Integer: default interval between diff --git a/bsd/man/man4/ip6.4 b/bsd/man/man4/ip6.4 index 25df62c8e..473ed190e 100644 --- a/bsd/man/man4/ip6.4 +++ b/bsd/man/man4/ip6.4 @@ -379,10 +379,10 @@ For wildcard sockets, this can restrict connections to IPv6 only. .\".Ox .\"IPv6 sockets are always IPv6-only, so the socket option is read-only .\"(not modifiable). -.It Dv IPV6_FAITH Fa "int *" -Get or set the status of whether -.Xr faith 4 -connections can be made to this socket. +.\".It Dv IPV6_FAITH Fa "int *" +.\"Get or set the status of whether +.\".Xr faith 4 +.\"connections can be made to this socket. .It Dv IPV6_USE_MIN_MTU Fa "int *" Get or set whether the minimal IPv6 maximum transmission unit (MTU) size will be used to avoid fragmentation from occurring for subsequent diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index 52cf1c806..11a0369a3 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -94,6 +94,8 @@ extern int iskmemdev(dev_t dev); extern int bpfkqfilter(dev_t dev, struct knote *kn); extern int ptsd_kqfilter(dev_t dev, struct knote *kn); +extern int ignore_is_ssd; + struct vnode *speclisth[SPECHSZ]; /* symbolic sleep message strings for devices */ @@ -154,19 +156,41 @@ struct vnodeopv_desc spec_vnodeop_opv_desc = static void set_blocksize(vnode_t, dev_t); +#define THROTTLE_LEVEL_NONE -1 +#define THROTTLE_LEVEL_TIER0 0 + +#define THROTTLE_LEVEL_THROTTLED 1 +#define THROTTLE_LEVEL_TIER1 1 +#define THROTTLE_LEVEL_TIER2 2 + +#define THROTTLE_LEVEL_START 0 +#define THROTTLE_LEVEL_END 2 + + struct _throttle_io_info_t { - struct timeval last_normal_IO_timestamp; - struct timeval last_IO_timestamp; - SInt32 numthreads_throttling; - SInt32 refcnt; - SInt32 alloc; + struct timeval throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1]; + struct timeval throttle_last_write_timestamp; + struct timeval throttle_start_IO_period_timestamp; + + TAILQ_HEAD( , uthread) throttle_uthlist; /* List of throttled uthreads */ + + lck_mtx_t throttle_lock; + thread_call_t throttle_timer_call; + int32_t throttle_timer_running; + int32_t throttle_io_count; + int32_t throttle_io_count_begin; + int32_t throttle_io_period; + uint32_t throttle_io_period_num; + int32_t throttle_refcnt; + int32_t throttle_alloc; }; struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV]; -static void throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd); - +static void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int policy, int flags, boolean_t isssd); +static int throttle_get_thread_throttle_level(uthread_t ut, int policy); +__private_extern__ int32_t throttle_legacy_process_count = 0; /* * Trivial lookup routine that always fails. 
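The reworked _throttle_io_info_t above keeps a last-I/O timestamp per throttle tier instead of a single window: a thread is throttled whenever a tier below its own has issued I/O within the throttle window. A rough sketch of that check (names follow the patch, but this is an illustration, not the kernel code itself):

#include <stdint.h>

#define THROTTLE_LEVEL_START	0	/* tier constants as defined in the patch */

/* returns nonzero when a thread at 'thread_level' should back off */
static int
io_will_be_throttled_sketch(const uint64_t last_io_ms[], int thread_level,
    uint64_t now_ms, uint64_t window_ms)
{
	int level;

	/* only tiers below (i.e. higher priority than) the thread's can throttle it */
	for (level = THROTTLE_LEVEL_START; level < thread_level; level++) {
		if (now_ms - last_io_ms[level] < window_ms)
			return 1;	/* recent I/O at a lower tier within the window */
	}
	return 0;	/* outside every relevant window: treat as normal I/O */
}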
@@ -259,12 +283,7 @@ spec_open(struct vnop_open_args *ap) return (EPERM); } } - if (cdevsw[maj].d_type == D_TTY) { - vnode_lock(vp); - vp->v_flag |= VISTTY; - vnode_unlock(vp); - } - + devsw_lock(dev, S_IFCHR); error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p); @@ -274,14 +293,15 @@ spec_open(struct vnop_open_args *ap) devsw_unlock(dev, S_IFCHR); - if (error == 0 && cdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) { + if (error == 0 && (D_TYPEMASK & cdevsw[maj].d_type) == D_DISK && !vp->v_un.vu_specinfo->si_initted) { int isssd = 0; uint64_t throttle_mask = 0; uint32_t devbsdunit = 0; if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) { - - if (VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) { + + if (throttle_mask != 0 && + VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) { /* * as a reasonable approximation, only use the lowest bit of the mask * to generate a disk unit number @@ -315,7 +335,7 @@ spec_open(struct vnop_open_args *ap) * opens for writing of any disk block devices. */ if (securelevel >= 2 && cred != FSCRED && - (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK) + (ap->a_mode & FWRITE) && isdisk(dev, VBLK)) return (EPERM); /* * Do not allow opens of block devices that are @@ -403,14 +423,13 @@ spec_read(struct vnop_read_args *ap) switch (vp->v_type) { case VCHR: - if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) { + if ((D_TYPEMASK & cdevsw[major(vp->v_rdev)].d_type) == D_DISK && vp->v_un.vu_specinfo->si_throttleable) { struct _throttle_io_info_t *throttle_info; throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit]; - throttle_info_update_internal(throttle_info, 0, vp->v_un.vu_specinfo->si_isssd); + throttle_info_update_internal(throttle_info, NULL, -1, 0, vp->v_un.vu_specinfo->si_isssd); } - error = (*cdevsw[major(vp->v_rdev)].d_read) (vp->v_rdev, uio, ap->a_ioflag); @@ -497,16 +516,15 @@ spec_write(struct vnop_write_args *ap) switch (vp->v_type) { case VCHR: - if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) { + if ((D_TYPEMASK & cdevsw[major(vp->v_rdev)].d_type) == D_DISK && vp->v_un.vu_specinfo->si_throttleable) { struct _throttle_io_info_t *throttle_info; throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit]; - throttle_info_update_internal(throttle_info, 0, vp->v_un.vu_specinfo->si_isssd); + throttle_info_update_internal(throttle_info, NULL, -1, 0, vp->v_un.vu_specinfo->si_isssd); - microuptime(&throttle_info->last_IO_timestamp); + microuptime(&throttle_info->throttle_last_write_timestamp); } - error = (*cdevsw[major(vp->v_rdev)].d_write) (vp->v_rdev, uio, ap->a_ioflag); @@ -615,8 +633,21 @@ spec_ioctl(struct vnop_ioctl_args *ap) break; case VBLK: - retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, - ap->a_fflag, p); + if (kdebug_enable) { + if (ap->a_command == DKIOCUNMAP) { + dk_unmap_t *unmap; + dk_extent_t *extent; + uint32_t i; + + unmap = (dk_unmap_t *)ap->a_data; + extent = unmap->extents; + + for (i = 0; i < unmap->extentsCount; i++, extent++) { + KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 1) | DBG_FUNC_NONE, dev, extent->offset/ap->a_vp->v_specsize, extent->length, 0, 0); + } + } + } + retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p); break; default: @@ -693,29 +724,38 @@ spec_fsync(struct vnop_fsync_args *ap) return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context); 
} + /* * Just call the device strategy routine */ extern int hard_throttle_on_root; -void IOSleep(int); -// the low priority process may wait for at most LOWPRI_MAX_DELAY millisecond -#define LOWPRI_INITIAL_WINDOW_MSECS 100 -#define LOWPRI_WINDOW_MSECS_INC 50 -#define LOWPRI_MAX_WINDOW_MSECS 200 -#define LOWPRI_MAX_WAITING_MSECS 200 +void throttle_init(void); + +#define LOWPRI_THROTTLE_WINDOW_MSECS 500 +#define LOWPRI_LEGACY_THROTTLE_WINDOW_MSECS 200 +#define LOWPRI_IO_PERIOD_MSECS 200 +#define LOWPRI_IO_PERIOD_SSD_MSECS 20 +#define LOWPRI_TIMER_PERIOD_MSECS 10 + + +int lowpri_throttle_window_msecs = LOWPRI_THROTTLE_WINDOW_MSECS; +int lowpri_legacy_throttle_window_msecs = LOWPRI_LEGACY_THROTTLE_WINDOW_MSECS; +int lowpri_io_period_msecs = LOWPRI_IO_PERIOD_MSECS; +int lowpri_io_period_ssd_msecs = LOWPRI_IO_PERIOD_SSD_MSECS; +int lowpri_timer_period_msecs = LOWPRI_TIMER_PERIOD_MSECS; + +/* + * If a process requiring legacy iothrottle behavior is running on the + * system, use legacy limits for throttle window and max IO size. + */ #if CONFIG_EMBEDDED -#define LOWPRI_SLEEP_INTERVAL 5 +#define THROTTLE_WINDOW (lowpri_throttle_window_msecs) #else -#define LOWPRI_SLEEP_INTERVAL 2 +#define THROTTLE_WINDOW (throttle_legacy_process_count == 0 ? lowpri_throttle_window_msecs : lowpri_legacy_throttle_window_msecs) #endif -int lowpri_IO_initial_window_msecs = LOWPRI_INITIAL_WINDOW_MSECS; -int lowpri_IO_window_msecs_inc = LOWPRI_WINDOW_MSECS_INC; -int lowpri_max_window_msecs = LOWPRI_MAX_WINDOW_MSECS; -int lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS; - #if 0 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...) \ do { \ @@ -727,10 +767,17 @@ int lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS; #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...) 
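/*
 * Worked numbers for the defaults above (a summary, not part of the patch):
 * the throttle window is 500ms (200ms when a legacy-throttle process is
 * present, on non-embedded builds), the I/O period is 200ms on rotating
 * media and 20ms on SSDs, and the throttle timer fires every 10ms, so a
 * throttled thread now waits in whole I/O periods rather than the old
 * fixed 2ms/5ms IOSleep() intervals.
 */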
#endif -SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); -SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, ""); -SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); -SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_window_msecs, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_throttle_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_legacy_throttle_window_msecs, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_io_period_msecs, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_io_period_ssd_msecs, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_timer_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_timer_period_msecs, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_process_count, CTLFLAG_RD | CTLFLAG_LOCKED, &throttle_legacy_process_count, 0, ""); + +static lck_grp_t *throttle_mtx_grp; +static lck_attr_t *throttle_mtx_attr; +static lck_grp_attr_t *throttle_mtx_grp_attr; + /* * throttled I/O helper function @@ -741,7 +788,7 @@ num_trailing_0(uint64_t n) { /* * since in most cases the number of trailing 0s is very small, - * we simply counting sequentially from the lowest bit + * we simply counting sequentially from the lowest bit */ if (n == 0) return sizeof(n) * 8; @@ -753,6 +800,7 @@ num_trailing_0(uint64_t n) return count; } + /* * Release the reference and if the item was allocated and this is the last * reference then free it. @@ -762,7 +810,7 @@ num_trailing_0(uint64_t n) static int throttle_info_rel(struct _throttle_io_info_t *info) { - SInt32 oldValue = OSDecrementAtomic(&info->refcnt); + SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt); DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n", info, (int)(oldValue -1), info ); @@ -775,13 +823,16 @@ throttle_info_rel(struct _throttle_io_info_t *info) * Once reference count is zero, no one else should be able to take a * reference */ - if ((info->refcnt == 0) && (info->alloc)) { - DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info, info ); + if ((info->throttle_refcnt == 0) && (info->throttle_alloc)) { + DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info); + + lck_mtx_destroy(&info->throttle_lock, throttle_mtx_grp); FREE(info, M_TEMP); } return oldValue; } + /* * Just take a reference on the throttle info structure. * @@ -790,17 +841,211 @@ throttle_info_rel(struct _throttle_io_info_t *info) static SInt32 throttle_info_ref(struct _throttle_io_info_t *info) { - SInt32 oldValue = OSIncrementAtomic(&info->refcnt); + SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt); DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n", info, (int)(oldValue -1), info ); /* Allocated items should never have a reference of zero */ - if (info->alloc && (oldValue == 0)) + if (info->throttle_alloc && (oldValue == 0)) panic("Taking a reference without calling create throttle info!\n"); return oldValue; } + +/* + * on entry the throttle_lock is held... 
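 * (it runs on every firing of the throttle timer and is also called from
 * throttle_lowpri_io() when the timer is not yet running; passing
 * update_io_count == TRUE begins a new I/O period)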
+ * this function is responsible for taking + * and dropping the reference on the info + * structure which will keep it from going + * away while the timer is running if it + * happens to have been dynamically allocated by + * a network fileystem kext which is now trying + * to free it + */ +static uint32_t +throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count) +{ + struct timeval elapsed; + int elapsed_msecs; + int throttle_level; + uint64_t deadline; + + if (update_io_count == TRUE) { + info->throttle_io_count_begin = info->throttle_io_count; + info->throttle_io_period_num++; + + microuptime(&info->throttle_start_IO_period_timestamp); + } + for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) { + + microuptime(&elapsed); + timevalsub(&elapsed, &info->throttle_last_IO_timestamp[throttle_level]); + elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000; + + if (elapsed_msecs < THROTTLE_WINDOW) { + /* + * we had an I/O occur in this level within + * our throttle window, so we need to + * to make sure the timer continues to run + */ + break; + } + } + if (throttle_level >= THROTTLE_LEVEL_END) { + /* + * we're outside all of the throttle windows... + * don't start a new timer + */ + info->throttle_timer_running = 0; + + return (THROTTLE_LEVEL_END); + } + if (info->throttle_timer_running == 0) { + /* + * take a reference for the timer + */ + throttle_info_ref(info); + + info->throttle_timer_running = 1; + } + clock_interval_to_deadline(lowpri_timer_period_msecs, 1000000, &deadline); + + thread_call_enter_delayed(info->throttle_timer_call, deadline); + + return (throttle_level); +} + + +static void +throttle_timer(struct _throttle_io_info_t *info) +{ + uthread_t ut, utlist; + struct timeval elapsed; + int elapsed_msecs; + int throttle_level; + boolean_t update_io_count = FALSE; + boolean_t need_wakeup = FALSE; + boolean_t need_release = FALSE; + + lck_mtx_lock(&info->throttle_lock); + + microuptime(&elapsed); + timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp); + elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000; + + if (elapsed_msecs >= info->throttle_io_period) { + /* + * we're closing out the current IO period... + * if we have a waiting thread, wake it up + * after we have reset the I/O window info + */ + need_wakeup = TRUE; + update_io_count = TRUE; + } + if ((throttle_level = throttle_timer_start(info, update_io_count)) == THROTTLE_LEVEL_END) { + /* + * we are now outside of the throttle window + * for all throttle levels... + * + * the timer is not restarted in this case, so + * we need to get rid of the reference we took when + * we started up the timer... 
we can't do this + * until we are entirely done playing with 'info' + */ + need_release = TRUE; + } + + TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist, uu_throttlelist, utlist) { + /* + * if we are now outside of the throttle window release + * all of the currently blocked threads, otherwise + * look for threads that have had their IO policy changed + * by someone else and are no longer throttleable, or are + * not at the current throttle level and unblock them + */ + if (throttle_level == THROTTLE_LEVEL_END || throttle_get_thread_throttle_level(ut, -1) <= throttle_level) { + + TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist); + ut->uu_on_throttlelist = 0; + + wakeup(&ut->uu_on_throttlelist); + } + } + if (need_wakeup && !TAILQ_EMPTY(&info->throttle_uthlist)) { + /* + * we've entered a new I/O period and we're still + * in the throttle window, so wakeup the next guy in line + */ + ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist); + TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist); + ut->uu_on_throttlelist = 0; + + wakeup(&ut->uu_on_throttlelist); + } + lck_mtx_unlock(&info->throttle_lock); + + if (need_release == TRUE) + throttle_info_rel(info); +} + + +void +throttle_init(void) +{ + struct _throttle_io_info_t *info; + int i; + + /* + * allocate lock group attribute and group + */ + throttle_mtx_grp_attr = lck_grp_attr_alloc_init(); + throttle_mtx_grp = lck_grp_alloc_init("throttle I/O", throttle_mtx_grp_attr); + + /* + * allocate the lock attribute + */ + throttle_mtx_attr = lck_attr_alloc_init(); + + for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) { + info = &_throttle_io_info[i]; + + lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr); + info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info); + + TAILQ_INIT(&info->throttle_uthlist); + } +} + + +/* + * KPI routine + * + * wakeup and remove the specified thread from the throttle queue + * if it's no longer in a throttleable state... 
+ * takes a valid uthread (which may or may not be on the + * throttle queue) as input + */ +void +unthrottle_thread(uthread_t ut) +{ + struct _throttle_io_info_t *info; + + if ((info = ut->uu_throttle_info) == NULL) + return; + + lck_mtx_lock(&info->throttle_lock); + + if (ut->uu_on_throttlelist && throttle_get_thread_throttle_level(ut, -1) <= THROTTLE_LEVEL_THROTTLED) { + TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist); + ut->uu_on_throttlelist = 0; + + wakeup(&ut->uu_on_throttlelist); + } + lck_mtx_unlock(&info->throttle_lock); +} + + /* * KPI routine * @@ -819,9 +1064,15 @@ throttle_info_create(void) return NULL; /* Mark that this one was allocated and needs to be freed */ DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info ); - info->alloc = TRUE; + info->throttle_alloc = TRUE; + + lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr); + info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info); + + TAILQ_INIT(&info->throttle_uthlist); + /* Take a reference */ - OSIncrementAtomic(&info->refcnt); + OSIncrementAtomic(&info->throttle_refcnt); return info; } @@ -855,7 +1106,10 @@ throttle_info_mount_ref(mount_t mp, void *throttle_info) if ((throttle_info == NULL) || (mp == NULL)) return; throttle_info_ref(throttle_info); - /* We already have a reference release it before adding the new one */ + + /* + * We already have a reference release it before adding the new one + */ if (mp->mnt_throttle_info) throttle_info_rel(mp->mnt_throttle_info); mp->mnt_throttle_info = throttle_info; @@ -868,10 +1122,9 @@ throttle_info_mount_ref(mount_t mp, void *throttle_info) * handle must be released by throttle_info_rel_by_mask */ int -throttle_info_ref_by_mask(uint64_t throttle_mask, - throttle_info_handle_t *throttle_info_handle) +throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle) { - int dev_index; + int dev_index; struct _throttle_io_info_t *info; if (throttle_info_handle == NULL) @@ -881,6 +1134,7 @@ throttle_info_ref_by_mask(uint64_t throttle_mask, info = &_throttle_io_info[dev_index]; throttle_info_ref(info); *(struct _throttle_io_info_t**)throttle_info_handle = info; + return 0; } @@ -892,7 +1146,9 @@ throttle_info_ref_by_mask(uint64_t throttle_mask, void throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle) { - /* for now the handle is just a pointer to _throttle_io_info_t */ + /* + * for now the handle is just a pointer to _throttle_io_info_t + */ throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle); } @@ -916,13 +1172,13 @@ throttle_info_get_last_io_time(mount_t mp, struct timeval *tv) struct _throttle_io_info_t *info; if (mp == NULL) - info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; + info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; else if (mp->mnt_throttle_info == NULL) - info = &_throttle_io_info[mp->mnt_devbsdunit]; + info = &_throttle_io_info[mp->mnt_devbsdunit]; else - info = mp->mnt_throttle_info; + info = mp->mnt_throttle_info; - *tv = info->last_IO_timestamp; + *tv = info->throttle_last_write_timestamp; } void @@ -931,69 +1187,101 @@ update_last_io_time(mount_t mp) struct _throttle_io_info_t *info; if (mp == NULL) - info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; + info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; else if (mp->mnt_throttle_info == NULL) - info = &_throttle_io_info[mp->mnt_devbsdunit]; + info = &_throttle_io_info[mp->mnt_devbsdunit]; else - info = mp->mnt_throttle_info; + info = 
mp->mnt_throttle_info; - microuptime(&info->last_IO_timestamp); + microuptime(&info->throttle_last_write_timestamp); } -#if CONFIG_EMBEDDED - -int throttle_get_io_policy(struct uthread **ut) +int +throttle_get_io_policy(uthread_t *ut) { - int policy = IOPOL_DEFAULT; - proc_t p = current_proc(); - *ut = get_bsdthread_info(current_thread()); - - if (p != NULL) - policy = p->p_iopol_disk; - - if (*ut != NULL) { - // the I/O policy of the thread overrides that of the process - // unless the I/O policy of the thread is default - if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT) - policy = (*ut)->uu_iopol_disk; - } - return policy; + + return (proc_get_task_selfdiskacc()); } -#else -int throttle_get_io_policy(__unused struct uthread **ut) -{ - *ut = get_bsdthread_info(current_thread()); - return (proc_get_task_selfdiskacc()); + +static int +throttle_get_thread_throttle_level(uthread_t ut, int policy) +{ + int thread_throttle_level = THROTTLE_LEVEL_NONE; + + if (ut == NULL) + ut = get_bsdthread_info(current_thread()); + + if (policy == -1) + policy = proc_get_diskacc(ut->uu_thread); + + switch (policy) { + + case IOPOL_DEFAULT: + case IOPOL_NORMAL: + thread_throttle_level = THROTTLE_LEVEL_TIER0; + case IOPOL_PASSIVE: + if (ut->uu_throttle_bc == TRUE) + thread_throttle_level = THROTTLE_LEVEL_TIER2; + break; + case IOPOL_THROTTLE: + thread_throttle_level = THROTTLE_LEVEL_TIER2; + break; + case IOPOL_UTILITY: + thread_throttle_level = THROTTLE_LEVEL_TIER1; + break; + default: + printf("unknown I/O policy %d", policy); + break; + } + return (thread_throttle_level); } -#endif static int -throttle_io_will_be_throttled_internal(int lowpri_window_msecs, void * throttle_info) +throttle_io_will_be_throttled_internal(void * throttle_info) { struct _throttle_io_info_t *info = throttle_info; struct timeval elapsed; - int elapsed_msecs; - int policy; - struct uthread *ut; - - policy = throttle_get_io_policy(&ut); + int elapsed_msecs; + int thread_throttle_level; + int throttle_level; - if (ut->uu_throttle_bc == FALSE && policy != IOPOL_THROTTLE) + if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL, -1)) < THROTTLE_LEVEL_THROTTLED) return (0); - microuptime(&elapsed); - timevalsub(&elapsed, &info->last_normal_IO_timestamp); - elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000; + for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) { - if (lowpri_window_msecs == -1) // use the max waiting time - lowpri_window_msecs = lowpri_max_waiting_msecs; + microuptime(&elapsed); + timevalsub(&elapsed, &info->throttle_last_IO_timestamp[throttle_level]); + elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000; - return elapsed_msecs < lowpri_window_msecs; + if (elapsed_msecs < THROTTLE_WINDOW) + break; + } + if (throttle_level >= thread_throttle_level) { + /* + * we're beyond all of the throttle windows + * that affect the throttle level of this thread, + * so go ahead and treat as normal I/O + */ + return (0); + } + if (info->throttle_io_count != info->throttle_io_count_begin) { + /* + * we've already issued at least one throttleable I/O + * in the current I/O window, so avoid issuing another one + */ + return (2); + } + /* + * we're in the throttle window, so + * cut the I/O size back + */ + return (1); } /* @@ -1002,83 +1290,123 @@ throttle_io_will_be_throttled_internal(int lowpri_window_msecs, void * throttle_ * the correct throttle info array element. 
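 * (a NULL mount point falls back to the shared default element at index
 * LOWPRI_MAX_NUM_DEV - 1, while a mount without a private throttle_info
 * uses the element indexed by its devbsdunit)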
*/ int -throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp) +throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp) { - void *info; + void *info; - /* Should we just return zero if no mount point */ + /* + * Should we just return zero if no mount point + */ if (mp == NULL) - info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; + info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; else if (mp->mnt_throttle_info == NULL) - info = &_throttle_io_info[mp->mnt_devbsdunit]; + info = &_throttle_io_info[mp->mnt_devbsdunit]; else - info = mp->mnt_throttle_info; - return throttle_io_will_be_throttled_internal(lowpri_window_msecs, info); + info = mp->mnt_throttle_info; + + return throttle_io_will_be_throttled_internal(info); } + uint32_t throttle_lowpri_io(int sleep_amount) { - int sleep_cnt = 0; - int numthreads_throttling; - int max_try_num; - struct uthread *ut; + uthread_t ut; struct _throttle_io_info_t *info; - int max_waiting_msecs; + int throttle_type = 0; + int sleep_cnt = 0; + int locked = 0; + uint32_t throttle_io_period_num = 0; + boolean_t insert_tail = TRUE; ut = get_bsdthread_info(current_thread()); - if ((ut->uu_lowpri_window == 0) || (ut->uu_throttle_info == NULL)) - goto done; + if (ut->uu_lowpri_window == 0) + return (0); info = ut->uu_throttle_info; - if (sleep_amount != 0) { -#if CONFIG_EMBEDDED - max_waiting_msecs = lowpri_max_waiting_msecs; -#else - if (ut->uu_throttle_isssd == TRUE) - max_waiting_msecs = lowpri_max_waiting_msecs / 100; - else - max_waiting_msecs = lowpri_max_waiting_msecs; -#endif - if (max_waiting_msecs < LOWPRI_SLEEP_INTERVAL) - max_waiting_msecs = LOWPRI_SLEEP_INTERVAL; + if ((sleep_amount == 0) || (info == NULL)) + goto done; - numthreads_throttling = info->numthreads_throttling + MIN(10, MAX(1, sleep_amount)) - 1; - max_try_num = max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, numthreads_throttling); + if (sleep_amount == 1 && ut->uu_throttle_bc == FALSE) + sleep_amount = 0; - for (sleep_cnt = 0; sleep_cnt < max_try_num; sleep_cnt++) { - if (throttle_io_will_be_throttled_internal(ut->uu_lowpri_window, info)) { - if (sleep_cnt == 0) { - KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, - ut->uu_lowpri_window, max_try_num, numthreads_throttling, 0, 0); - } - IOSleep(LOWPRI_SLEEP_INTERVAL); - DEBUG_ALLOC_THROTTLE_INFO("sleeping because of info = %p\n", info, info ); - } else { + throttle_io_period_num = info->throttle_io_period_num; + + while ( (throttle_type = throttle_io_will_be_throttled_internal(info)) ) { + + if (throttle_type == 1) { + if (sleep_amount == 0) + break; + if (info->throttle_io_period_num < throttle_io_period_num) + break; + if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) break; - } } - if (sleep_cnt) { - KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, - ut->uu_lowpri_window, sleep_cnt, 0, 0, 0); + if (!locked) { + lck_mtx_lock(&info->throttle_lock); + locked = 1; } - } - SInt32 oldValue; - oldValue = OSDecrementAtomic(&info->numthreads_throttling); + if (info->throttle_timer_running == 0) { + /* + * try to start the timer since it's + * currently not running. 
on failure, no + * timer reference to drop since it wasn't started + */ + if (throttle_timer_start(info, TRUE) == THROTTLE_LEVEL_END) + goto done; + } + if (sleep_cnt == 0) { + KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, + ut->uu_lowpri_window, info->throttle_io_period, info->throttle_io_count, 0, 0); + } + if (ut->uu_on_throttlelist == 0) { + if (insert_tail == TRUE) + TAILQ_INSERT_TAIL(&info->throttle_uthlist, ut, uu_throttlelist); + else + TAILQ_INSERT_HEAD(&info->throttle_uthlist, ut, uu_throttlelist); + + ut->uu_on_throttlelist = 1; + } + msleep((caddr_t)&ut->uu_on_throttlelist, &info->throttle_lock, PRIBIO + 1, "throttle_lowpri_io", NULL); - if (oldValue <= 0) { - panic("%s: numthreads negative", __func__); + sleep_cnt++; + + if (sleep_amount == 0) + insert_tail = FALSE; + else if (info->throttle_io_period_num < throttle_io_period_num || + (info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) { + insert_tail = FALSE; + sleep_amount = 0; + } } done: - ut->uu_lowpri_window = 0; - if (ut->uu_throttle_info) - throttle_info_rel(ut->uu_throttle_info); + if (ut->uu_on_throttlelist) { + if (!locked) { + lck_mtx_lock(&info->throttle_lock); + locked = 1; + } + if (ut->uu_on_throttlelist) { + TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist); + + ut->uu_on_throttlelist = 0; + } + } + if (locked) + lck_mtx_unlock(&info->throttle_lock); + + if (sleep_cnt) + KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, + ut->uu_lowpri_window, info->throttle_io_period, info->throttle_io_count, 0, 0); + if (info) + throttle_info_rel(info); + ut->uu_throttle_info = NULL; ut->uu_throttle_bc = FALSE; + ut->uu_lowpri_window = 0; - return (sleep_cnt * LOWPRI_SLEEP_INTERVAL); + return (sleep_cnt); } /* @@ -1091,85 +1419,59 @@ done: */ void throttle_set_thread_io_policy(int policy) { -#if !CONFIG_EMBEDDED proc_apply_thread_selfdiskacc(policy); -#else /* !CONFIG_EMBEDDED */ - struct uthread *ut; - ut = get_bsdthread_info(current_thread()); - ut->uu_iopol_disk = policy; -#endif /* !CONFIG_EMBEDDED */ } static -void throttle_info_reset_window(struct uthread *ut) +void throttle_info_reset_window(uthread_t ut) { struct _throttle_io_info_t *info; - info = ut->uu_throttle_info; + if ( (info = ut->uu_throttle_info) ) { + throttle_info_rel(info); - OSDecrementAtomic(&info->numthreads_throttling); - throttle_info_rel(info); - ut->uu_throttle_info = NULL; - ut->uu_lowpri_window = 0; + ut->uu_throttle_info = NULL; + ut->uu_lowpri_window = 0; + ut->uu_throttle_bc = FALSE; + } } static -void throttle_info_set_initial_window(struct uthread *ut, struct _throttle_io_info_t *info, boolean_t isssd, boolean_t BC_throttle) +void throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle) { - SInt32 oldValue; + if (ut->uu_throttle_info == NULL) { - ut->uu_throttle_info = info; - throttle_info_ref(info); - DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info ); + ut->uu_throttle_info = info; + throttle_info_ref(info); + DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info ); - oldValue = OSIncrementAtomic(&info->numthreads_throttling); - if (oldValue < 0) { - panic("%s: numthreads negative", __func__); + ut->uu_lowpri_window = THROTTLE_WINDOW; + ut->uu_throttle_bc = BC_throttle; } - ut->uu_lowpri_window = lowpri_IO_initial_window_msecs; - ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue; - ut->uu_throttle_isssd = isssd; - ut->uu_throttle_bc = BC_throttle; } static -void 
throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd) +void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int policy, int flags, boolean_t isssd) { - struct _throttle_io_info_t *info = throttle_info; - struct uthread *ut; - int policy; - int is_throttleable_io = 0; - int is_passive_io = 0; + int thread_throttle_level; - if (!lowpri_IO_initial_window_msecs || (info == NULL)) + if (THROTTLE_WINDOW == 0) return; - policy = throttle_get_io_policy(&ut); - switch (policy) { - case IOPOL_DEFAULT: - case IOPOL_NORMAL: - break; - case IOPOL_THROTTLE: - is_throttleable_io = 1; - break; - case IOPOL_PASSIVE: - is_passive_io = 1; - break; - default: - printf("unknown I/O policy %d", policy); - break; - } + if (ut == NULL) + ut = get_bsdthread_info(current_thread()); - if (!is_throttleable_io && ISSET(flags, B_PASSIVE)) - is_passive_io |= 1; + thread_throttle_level = throttle_get_thread_throttle_level(ut, policy); - if (!is_throttleable_io) { - if (!is_passive_io){ - microuptime(&info->last_normal_IO_timestamp); - } - } else if (ut) { + if (thread_throttle_level == THROTTLE_LEVEL_TIER0 && ISSET(flags, B_PASSIVE)) + thread_throttle_level = THROTTLE_LEVEL_NONE; + + if (thread_throttle_level != THROTTLE_LEVEL_NONE) + microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]); + + if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) { /* * I'd really like to do the IOSleep here, but * we may be holding all kinds of filesystem related locks @@ -1180,42 +1482,54 @@ void throttle_info_update_internal(void *throttle_info, int flags, boolean_t iss * do the delay just before we return from the system * call that triggered this I/O or from vnode_pagein */ - if (ut->uu_lowpri_window == 0) - throttle_info_set_initial_window(ut, info, isssd, FALSE); - else { - /* The thread sends I/Os to different devices within the same system call */ - if (ut->uu_throttle_info != info) { - struct _throttle_io_info_t *old_info = ut->uu_throttle_info; - - // keep track of the numthreads in the right device - OSDecrementAtomic(&old_info->numthreads_throttling); - OSIncrementAtomic(&info->numthreads_throttling); - - DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info ); - DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info ); - /* This thread no longer needs a reference on that throttle info */ - throttle_info_rel(ut->uu_throttle_info); - ut->uu_throttle_info = info; - /* Need to take a reference on this throttle info */ - throttle_info_ref(ut->uu_throttle_info); - } - int numthreads = MAX(1, info->numthreads_throttling); - ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads; - if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads) - ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads; - - if (isssd == FALSE) { - /* - * we're here because we've actually issued I/Os to different devices... 
- * if at least one of them was a non SSD, then thottle the thread - * using the policy for non SSDs - */ - ut->uu_throttle_isssd = FALSE; - } + if (info->throttle_io_period == 0) { + + if (isssd == TRUE) + info->throttle_io_period = lowpri_io_period_ssd_msecs; + else + info->throttle_io_period = lowpri_io_period_msecs; + + if (info->throttle_io_period < lowpri_timer_period_msecs) + info->throttle_io_period = lowpri_timer_period_msecs; } + OSAddAtomic(1, &info->throttle_io_count); + + throttle_info_set_initial_window(ut, info, FALSE); + } +} + +void throttle_info_update_by_mount(mount_t mp) +{ + struct _throttle_io_info_t *info; + uthread_t ut; + boolean_t isssd = FALSE; + + ut = get_bsdthread_info(current_thread()); + + if (ut->uu_lowpri_window) + return; + + if (mp != NULL) { + if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) + isssd = TRUE; + info = &_throttle_io_info[mp->mnt_devbsdunit]; + } else + info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; + + if (info->throttle_io_period == 0) { + + if (isssd == TRUE) + info->throttle_io_period = lowpri_io_period_ssd_msecs; + else + info->throttle_io_period = lowpri_io_period_msecs; + + if (info->throttle_io_period < lowpri_timer_period_msecs) + info->throttle_io_period = lowpri_timer_period_msecs; } + throttle_info_set_initial_window(ut, info, FALSE); } + /* * KPI routine * @@ -1224,7 +1538,8 @@ void throttle_info_update_internal(void *throttle_info, int flags, boolean_t iss */ void throttle_info_update(void *throttle_info, int flags) { - throttle_info_update_internal(throttle_info, flags, FALSE); + if (throttle_info) + throttle_info_update_internal(throttle_info, NULL, -1, flags, FALSE); } /* @@ -1236,7 +1551,9 @@ void throttle_info_update(void *throttle_info, int flags) void throttle_info_update_by_mask(void *throttle_info_handle, int flags) { void *throttle_info = throttle_info_handle; - /* for now we only use the lowest bit of the throttle mask, so the + + /* + * for now we only use the lowest bit of the throttle mask, so the * handle is the same as the throttle_info. 
Later if we store a * set of throttle infos in the handle, we will want to loop through * them and call throttle_info_update in a loop @@ -1244,20 +1561,77 @@ void throttle_info_update_by_mask(void *throttle_info_handle, int flags) throttle_info_update(throttle_info, flags); } -extern int ignore_is_ssd; + +int throttle_info_io_will_be_throttled(void * throttle_info, int policy) +{ + struct _throttle_io_info_t *info = throttle_info; + struct timeval elapsed; + int elapsed_msecs; + int throttle_level; + int thread_throttle_level; + + switch (policy) { + + case IOPOL_THROTTLE: + thread_throttle_level = THROTTLE_LEVEL_TIER2; + break; + case IOPOL_UTILITY: + thread_throttle_level = THROTTLE_LEVEL_TIER1; + break; + default: + thread_throttle_level = THROTTLE_LEVEL_TIER0; + break; + } + for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) { + + microuptime(&elapsed); + timevalsub(&elapsed, &info->throttle_last_IO_timestamp[throttle_level]); + elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000; + + if (elapsed_msecs < THROTTLE_WINDOW) + break; + } + if (throttle_level >= thread_throttle_level) { + /* + * we're beyond all of the throttle windows + * so go ahead and treat as normal I/O + */ + return (0); + } + /* + * we're in the throttle window + */ + return (1); +} + +void +throttle_legacy_process_incr(void) +{ + OSIncrementAtomic(&throttle_legacy_process_count); +} + +void +throttle_legacy_process_decr(void) +{ + OSDecrementAtomic(&throttle_legacy_process_count); +} + int spec_strategy(struct vnop_strategy_args *ap) { - buf_t bp; + buf_t bp; int bflags; int policy; dev_t bdev; uthread_t ut; mount_t mp; - int strategy_ret; + int strategy_ret; struct _throttle_io_info_t *throttle_info; boolean_t isssd = FALSE; +#if !CONFIG_EMBEDDED + proc_t curproc = current_proc(); +#endif /* !CONFIG_EMBEDDED */ bp = ap->a_bp; bdev = buf_device(bp); @@ -1265,13 +1639,21 @@ spec_strategy(struct vnop_strategy_args *ap) policy = throttle_get_io_policy(&ut); - if (policy == IOPOL_THROTTLE) { + if (bp->b_flags & B_META) + bp->b_attr.ba_flags |= BA_META; + + if (policy == IOPOL_THROTTLE || policy == IOPOL_UTILITY) { bp->b_flags |= B_THROTTLED_IO; bp->b_attr.ba_flags |= BA_THROTTLED_IO; bp->b_flags &= ~B_PASSIVE; } else if (policy == IOPOL_PASSIVE) bp->b_flags |= B_PASSIVE; +#if !CONFIG_EMBEDDED + if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP)) + bp->b_attr.ba_flags |= BA_DELAYIDLESLEEP; +#endif /* !CONFIG_EMBEDDED */ + bflags = bp->b_flags; if (kdebug_enable) { @@ -1292,10 +1674,13 @@ spec_strategy(struct vnop_strategy_args *ap) else if (bflags & B_PASSIVE) code |= DKIO_PASSIVE; - KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, - bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0); + if (bp->b_attr.ba_flags & BA_NOCACHE) + code |= DKIO_NOCACHE; + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, + bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0); } - if (((bflags & (B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) && + if (((bflags & (B_THROTTLED_IO | B_PASSIVE | B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) && mp && (mp->mnt_kern_flag & MNTK_ROOTDEV)) hard_throttle_on_root = 1; @@ -1306,10 +1691,11 @@ spec_strategy(struct vnop_strategy_args *ap) } else throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; - throttle_info_update_internal(throttle_info, bflags, isssd); + throttle_info_update_internal(throttle_info, ut, policy, bflags, isssd); if 
((bflags & B_READ) == 0) { - microuptime(&throttle_info->last_IO_timestamp); + microuptime(&throttle_info->throttle_last_write_timestamp); + if (mp) { INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size); } @@ -1342,19 +1728,19 @@ spec_strategy(struct vnop_strategy_args *ap) strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp); - if ((IO_SATISFIED_BY_CACHE == strategy_ret) && (ut->uu_lowpri_window != 0) && (ut->uu_throttle_info != NULL)) { + if (IO_SATISFIED_BY_CACHE == strategy_ret) { /* * If this was a throttled IO satisfied by the boot cache, * don't delay the thread. */ throttle_info_reset_window(ut); - } else if ((IO_SHOULD_BE_THROTTLED == strategy_ret) && (ut->uu_lowpri_window == 0) && (ut->uu_throttle_info == NULL)) { + } else if (IO_SHOULD_BE_THROTTLED == strategy_ret) { /* * If the boot cache indicates this IO should be throttled, * delay the thread. */ - throttle_info_set_initial_window(ut, throttle_info, isssd, TRUE); + throttle_info_set_initial_window(ut, throttle_info, TRUE); } return (0); } @@ -1394,15 +1780,16 @@ spec_close(struct vnop_close_args *ap) * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 1 (this is the very - * last close) + * last close) */ sessp = proc_session(p); if (sessp != SESSION_NULL) { - if ((vcount(vp) == 1) && - (vp == sessp->s_ttyvp)) { + if (vp == sessp->s_ttyvp && vcount(vp) == 1) { + struct tty *tp; session_lock(sessp); if (vp == sessp->s_ttyvp) { + tp = SESSION_TP(sessp); sessp->s_ttyvp = NULL; sessp->s_ttyvid = 0; sessp->s_ttyp = TTY_NULL; @@ -1413,6 +1800,8 @@ spec_close(struct vnop_close_args *ap) if (do_rele) { vnode_rele(vp); + if (NULL != tp) + ttyfree(tp); } } session_rele(sessp); @@ -1420,20 +1809,15 @@ spec_close(struct vnop_close_args *ap) devsw_lock(dev, S_IFCHR); - vp->v_specinfo->si_opencount--; + if (--vp->v_specinfo->si_opencount < 0) + panic("negative open count (c, %u, %u)", major(dev), minor(dev)); - if (vp->v_specinfo->si_opencount < 0) { - panic("Negative open count?"); - } /* - * close on last reference or on vnode revoke call + * close always, or close on last reference, or close on revoke */ - if ((vcount(vp) > 0) && ((flags & IO_REVOKE) == 0)) { - devsw_unlock(dev, S_IFCHR); - return (0); - } - - error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p); + if ((D_TRACKCLOSE & cdevsw[major(dev)].d_type) != 0 || + vcount(vp) == 0 || (flags & IO_REVOKE) != 0) + error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p); devsw_unlock(dev, S_IFCHR); break; @@ -1465,18 +1849,11 @@ spec_close(struct vnop_close_args *ap) devsw_lock(dev, S_IFBLK); - vp->v_specinfo->si_opencount--; - - if (vp->v_specinfo->si_opencount < 0) { - panic("Negative open count?"); - } - - if (vcount(vp) > 0) { - devsw_unlock(dev, S_IFBLK); - return (0); - } + if (--vp->v_specinfo->si_opencount < 0) + panic("negative open count (b, %u, %u)", major(dev), minor(dev)); - error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p); + if (vcount(vp) == 0) + error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p); devsw_unlock(dev, S_IFBLK); break; diff --git a/bsd/miscfs/specfs/specdev.h b/bsd/miscfs/specfs/specdev.h index 7b44d40e3..2b14d796b 100644 --- a/bsd/miscfs/specfs/specdev.h +++ b/bsd/miscfs/specfs/specdev.h @@ -112,9 +112,9 @@ struct specinfo { */ #define SPECHSZ 64 #if ((SPECHSZ&(SPECHSZ-1)) == 0) -#define SPECHASH(rdev) (((rdev>>5)+(rdev))&(SPECHSZ-1)) +#define SPECHASH(rdev) (((rdev>>21)+(rdev))&(SPECHSZ-1)) #else -#define SPECHASH(rdev) 
(((unsigned)((rdev>>5)+(rdev)))%SPECHSZ) +#define SPECHASH(rdev) (((unsigned)((rdev>>21)+(rdev)))%SPECHSZ) #endif extern struct vnode *speclisth[SPECHSZ]; diff --git a/bsd/net/Makefile b/bsd/net/Makefile index 79c622bf8..5a186e2b6 100644 --- a/bsd/net/Makefile +++ b/bsd/net/Makefile @@ -8,10 +8,12 @@ include $(MakeInc_cmd) include $(MakeInc_def) INSTINC_SUBDIRS = \ + altq classq pktsched INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ + altq classq pktsched EXPINC_SUBDIRS_I386 = \ @@ -19,7 +21,8 @@ DATAFILES= \ bpf.h dlil.h \ ethernet.h if.h if_arp.h \ if_dl.h if_llc.h if_media.h if_mib.h \ - if_types.h if_utun.h if_var.h \ + if_types.h if_var.h \ + if_utun.h if_utun_crypto.h if_utun_crypto_ipsec.h \ kext_net.h ndrv.h pfkeyv2.h \ route.h @@ -29,14 +32,14 @@ KERNELFILES= \ PRIVATE_DATAFILES = \ if_vlan_var.h if_ppp.h firewire.h \ - ppp_defs.h radix.h if_bond_var.h lacp.h ndrv_var.h \ + ppp_defs.h radix.h if_bond_var.h if_bond_internal.h lacp.h ndrv_var.h \ netsrc.h raw_cb.h etherdefs.h iso88025.h if_pflog.h pfvar.h \ - if_bridgevar.h ntstat.h if_llreach.h + if_bridgevar.h ntstat.h iptap.h if_llreach.h PRIVATE_KERNELFILES = ${KERNELFILES} \ - bpfdesc.h dlil_pvt.h ppp_comp.h \ + bpfdesc.h ppp_comp.h \ zlib.h bpf_compat.h net_osdep.h \ - ntstat.h if_llreach.h + ntstat.h iptap.h if_llreach.h flowadv.h INSTALL_MI_LIST = ${DATAFILES} diff --git a/bsd/crypto/aes/Makefile b/bsd/net/altq/Makefile similarity index 52% rename from bsd/crypto/aes/Makefile rename to bsd/net/altq/Makefile index 6b96dbd34..b8ad37152 100644 --- a/bsd/crypto/aes/Makefile +++ b/bsd/net/altq/Makefile @@ -3,42 +3,41 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_I386 = \ - i386 - -INSTINC_SUBDIRS_X86_64 = \ - i386 +INSTINC_SUBDIRS_PPC = \ -INSTINC_SUBDIRS_ARM = \ +INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ +EXPINC_SUBDIRS_PPC = \ + EXPINC_SUBDIRS_I386 = \ -EXPINC_SUBDIRS_X86_64 = \ +DATAFILES= \ -EXPINC_SUBDIRS_ARM = \ +KERNELFILES= \ PRIVATE_DATAFILES = \ - aes.h + altq.h altq_cbq.h altq_fairq.h altq_hfsc.h altq_priq.h altq_qfq.h -INSTALL_MI_DIR = crypto +PRIVATE_KERNELFILES = ${KERNELFILES} -EXPORT_MI_DIR = ${INSTALL_MI_DIR} +INSTALL_MI_LIST = ${DATAFILES} -EXPORT_MI_LIST = aes.h +INSTALL_MI_DIR = net/altq -INSTALL_KF_MI_LIST = +EXPORT_MI_LIST = ${INSTALL_MI_LIST} ${KERNELFILES} -INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} +EXPORT_MI_DIR = ${INSTALL_MI_DIR} -include $(MakeInc_rule) -include $(MakeInc_dir) +INSTALL_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} +INSTALL_KF_MI_LCL_LIST = ${INSTALL_MI_LCL_LIST} ${PRIVATE_KERNELFILES} +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/bsd/net/altq/altq.h b/bsd/net/altq/altq.h new file mode 100644 index 000000000..590c6810b --- /dev/null +++ b/bsd/net/altq/altq.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $NetBSD: altq.h,v 1.4 2006/10/12 19:59:08 peter Exp $ */ +/* $KAME: altq.h,v 1.10 2003/07/10 12:07:47 kjc Exp $ */ + +/* + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _ALTQ_ALTQ_H_ +#define _ALTQ_ALTQ_H_ + +#ifdef PRIVATE +#include + +/* altq discipline type */ +#define ALTQT_NONE PKTSCHEDT_NONE /* reserved */ +#define ALTQT_CBQ PKTSCHEDT_CBQ /* cbq */ +#define ALTQT_HFSC PKTSCHEDT_HFSC /* hfsc */ +#define ALTQT_PRIQ PKTSCHEDT_PRIQ /* priority queue */ +#define ALTQT_FAIRQ PKTSCHEDT_FAIRQ /* fairq */ +#define ALTQT_QFQ PKTSCHEDT_QFQ /* quick fair queueing */ +#define ALTQT_MAX PKTSCHEDT_MAX /* should be max disc type + 1 */ +#endif /* PRIVATE */ +#ifdef BSD_KERNEL_PRIVATE +#include +#endif /* BSD_KERNEL_PRIVATE */ +#endif /* _ALTQ_ALTQ_H_ */ diff --git a/bsd/net/altq/altq_cbq.c b/bsd/net/altq/altq_cbq.c new file mode 100644 index 000000000..31b3573c9 --- /dev/null +++ b/bsd/net/altq/altq_cbq.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2007-2011 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $OpenBSD: altq_cbq.c,v 1.23 2007/09/13 20:40:02 chl Exp $ */ +/* $KAME: altq_cbq.c,v 1.9 2000/12/14 08:12:45 thorpej Exp $ */ + +/* + * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. + */ + +#if PF_ALTQ && PKTSCHED_CBQ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * Forward Declarations. 
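+ *
+ * These callbacks form the glue between the PF/ALTQ configuration
+ * path and the CBQ scheduler proper: altq_cbq_pfattach() below hands
+ * them to altq_attach(), after which they are reached only through
+ * the function pointers in struct ifaltq.  As a hedged sketch (not a
+ * definitive call site), a consumer holding the appropriate ifclassq
+ * lock would drive the discipline roughly as follows:
+ *
+ *     error = (*altq->altq_enqueue)(altq, m);
+ *     m = (*altq->altq_dequeue)(altq, ALTDQ_REMOVE);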
+ */ +static int altq_cbq_request(struct ifaltq *, enum altrq, void *); +static int altq_cbq_enqueue(struct ifaltq *, struct mbuf *); +static struct mbuf *altq_cbq_dequeue(struct ifaltq *, enum altdq_op); + +int +altq_cbq_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int error; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); + + IFCQ_LOCK(&ifp->if_snd); + error = altq_attach(IFCQ_ALTQ(&ifp->if_snd), ALTQT_CBQ, a->altq_disc, + altq_cbq_enqueue, altq_cbq_dequeue, NULL, altq_cbq_request); + IFCQ_UNLOCK(&ifp->if_snd); + + return (error); +} + +int +altq_cbq_add(struct pf_altq *a) +{ + cbq_state_t *cbqp; + struct ifnet *ifp; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ALTQ_IS_READY(IFCQ_ALTQ(&ifp->if_snd))) + return (ENODEV); + + cbqp = cbq_alloc(ifp, M_WAITOK, TRUE); + if (cbqp == NULL) + return (ENOMEM); + + /* keep the state in pf_altq */ + a->altq_disc = cbqp; + + return (0); +} + +int +altq_cbq_remove(struct pf_altq *a) +{ + cbq_state_t *cbqp; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((cbqp = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + return (cbq_destroy(cbqp)); +} + +int +altq_cbq_add_queue(struct pf_altq *a) +{ + struct cbq_opts *opts = &a->pq_u.cbq_opts; + cbq_state_t *cbqp; + int err; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((cbqp = a->altq_disc) == NULL) + return (EINVAL); + + IFCQ_LOCK(cbqp->ifnp.ifq_); + err = cbq_add_queue(cbqp, a->qlimit, a->priority, + opts->minburst, opts->maxburst, opts->pktsize, opts->maxpktsize, + opts->ns_per_byte, opts->maxidle, opts->minidle, opts->offtime, + opts->flags, a->parent_qid, a->qid, NULL); + IFCQ_UNLOCK(cbqp->ifnp.ifq_); + + return (err); +} + +int +altq_cbq_remove_queue(struct pf_altq *a) +{ + cbq_state_t *cbqp; + int err; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((cbqp = a->altq_disc) == NULL) + return (EINVAL); + + IFCQ_LOCK(cbqp->ifnp.ifq_); + err = cbq_remove_queue(cbqp, a->qid); + IFCQ_UNLOCK(cbqp->ifnp.ifq_); + + return (err); +} + +int +altq_cbq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + struct ifclassq *ifq = NULL; + cbq_state_t *cbqp; + class_stats_t stats; + int error = 0; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((unsigned)*nbytes < sizeof (stats)) + return (EINVAL); + + if ((cbqp = altq_lookup(a->ifname, ALTQT_CBQ)) == NULL) + return (EBADF); + + ifq = cbqp->ifnp.ifq_; + IFCQ_LOCK_ASSERT_HELD(ifq); /* lock held by altq_lookup */ + error = cbq_get_class_stats(cbqp, a->qid, &stats); + IFCQ_UNLOCK(ifq); + if (error != 0) + return (error); + + if ((error = copyout((caddr_t)&stats, (user_addr_t)(uintptr_t)ubuf, + sizeof (stats))) != 0) + return (error); + + *nbytes = sizeof (stats); + + return (0); +} + +static int +altq_cbq_request(struct ifaltq *altq, enum altrq req, void *arg) +{ + cbq_state_t *cbqp = (cbq_state_t *)altq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + cbq_purge(cbqp); + break; + + case ALTRQ_PURGE_SC: + /* not supported for ALTQ instance */ + break; + + case ALTRQ_EVENT: + cbq_event(cbqp, (cqev_t)arg); + break; + } + return (0); +} + +/* + * altq_cbq_enqueue is an enqueue function to be registered to + * (*altq_enqueue) in struct ifaltq. 
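+ *
+ * The discipline relies on the pf(4) classifier having already run on
+ * the packet: the selected class travels in the pf mbuf tag fetched
+ * via m_pftag(), which presumes a packet header.  An mbuf arriving
+ * here without M_PKTHDR therefore cannot be classified and is freed
+ * rather than handed down to cbq_enqueue().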
+ */ +static int +altq_cbq_enqueue(struct ifaltq *altq, struct mbuf *m) +{ + /* grab class set by classifier */ + if (!(m->m_flags & M_PKTHDR)) { + /* should not happen */ + printf("%s: packet for %s does not have pkthdr\n", __func__, + if_name(altq->altq_ifcq->ifcq_ifp)); + m_freem(m); + return (ENOBUFS); + } + + return (cbq_enqueue(altq->altq_disc, NULL, m, m_pftag(m))); +} + +/* + * altq_cbq_dequeue is a dequeue function to be registered to + * (*altq_dequeue) in struct ifaltq. + * + * note: ALTDQ_POLL returns the next packet without removing the packet + * from the queue. ALTDQ_REMOVE is a normal dequeue operation. + * ALTDQ_REMOVE must return the same packet if called immediately + * after ALTDQ_POLL. + */ +static struct mbuf * +altq_cbq_dequeue(struct ifaltq *altq, enum altdq_op op) +{ + return (cbq_dequeue(altq->altq_disc, (cqdq_op_t)op)); +} +#endif /* PF_ALTQ && PKTSCHED_CBQ */ diff --git a/bsd/net/altq/altq_cbq.h b/bsd/net/altq/altq_cbq.h new file mode 100644 index 000000000..fba7310c9 --- /dev/null +++ b/bsd/net/altq/altq_cbq.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $NetBSD: altq_cbq.h,v 1.8 2006/10/12 19:59:08 peter Exp $ */ +/* $KAME: altq_cbq.h,v 1.12 2003/10/03 05:05:15 kjc Exp $ */ + +/* + * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. 
The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. + */ + +#ifndef _NET_ALTQ_ALTQ_CBQ_H_ +#define _NET_ALTQ_ALTQ_CBQ_H_ + +#include +#include +#include + +#ifdef BSD_KERNEL_PRIVATE +#ifdef __cplusplus +extern "C" { +#endif + +extern int altq_cbq_pfattach(struct pf_altq *); +extern int altq_cbq_add(struct pf_altq *); +extern int altq_cbq_remove(struct pf_altq *); +extern int altq_cbq_add_queue(struct pf_altq *); +extern int altq_cbq_remove_queue(struct pf_altq *); +extern int altq_cbq_getqstats(struct pf_altq *, void *, int *); + +#ifdef __cplusplus +} +#endif + +#endif /* BSD_KERNEL_PRIVATE */ +#endif /* !_NET_ALTQ_ALTQ_CBQ_H_ */ diff --git a/bsd/net/altq/altq_fairq.c b/bsd/net/altq/altq_fairq.c new file mode 100644 index 000000000..284654761 --- /dev/null +++ b/bsd/net/altq/altq_fairq.c @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 2008 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $DragonFly: src/sys/net/altq/altq_fairq.c,v 1.2 2008/05/14 11:59:23 sephe Exp $ + */ +/* + * Matt: I gutted altq_priq.c and used it as a skeleton on which to build + * fairq. The fairq algorithm is completely different than priq, of course, + * but because I used priq's skeleton I believe I should include priq's + * copyright. + * + * Copyright (C) 2000-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#if PF_ALTQ && PKTSCHED_FAIRQ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * function prototypes + */ +static int altq_fairq_enqueue(struct ifaltq *, struct mbuf *); +static struct mbuf *altq_fairq_dequeue(struct ifaltq *, enum altdq_op); +static int altq_fairq_request(struct ifaltq *, enum altrq, void *); + +int +altq_fairq_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int error; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); + + IFCQ_LOCK(&ifp->if_snd); + error = altq_attach(IFCQ_ALTQ(&ifp->if_snd), ALTQT_FAIRQ, a->altq_disc, + altq_fairq_enqueue, altq_fairq_dequeue, NULL, altq_fairq_request); + IFCQ_UNLOCK(&ifp->if_snd); + + return (error); +} + +int +altq_fairq_add(struct pf_altq *a) +{ + struct fairq_if *fif; + struct ifnet *ifp; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ALTQ_IS_READY(IFCQ_ALTQ(&ifp->if_snd))) + return (ENODEV); + + fif = fairq_alloc(ifp, M_WAITOK, TRUE); + if (fif == NULL) + return (ENOMEM); + + /* keep the state in pf_altq */ + a->altq_disc = fif; + + return (0); +} + +int +altq_fairq_remove(struct pf_altq *a) +{ + struct fairq_if *fif; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((fif = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + return (fairq_destroy(fif)); +} + +int +altq_fairq_add_queue(struct pf_altq *a) +{ + struct fairq_if *fif; + struct fairq_opts *opts = &a->pq_u.fairq_opts; + int err; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((fif = a->altq_disc) == NULL) + return (EINVAL); + + IFCQ_LOCK(fif->fif_ifq); + err = fairq_add_queue(fif, a->priority, a->qlimit, a->bandwidth, + opts->nbuckets, opts->flags, opts->hogs_m1, opts->lssc_m1, + opts->lssc_d, opts->lssc_m2, a->qid, NULL); + IFCQ_UNLOCK(fif->fif_ifq); + + return (err); +} + +int +altq_fairq_remove_queue(struct pf_altq *a) +{ + struct fairq_if *fif; + int err; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((fif = a->altq_disc) == NULL) + return (EINVAL); + + IFCQ_LOCK(fif->fif_ifq); + err = fairq_remove_queue(fif, a->qid); + IFCQ_UNLOCK(fif->fif_ifq); + + return (err); +} + +int +altq_fairq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + struct ifclassq *ifq = NULL; + struct fairq_if *fif; + struct fairq_classstats stats; + int error = 0; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((unsigned)*nbytes < sizeof (stats)) + return (EINVAL); + + if ((fif = altq_lookup(a->ifname, ALTQT_FAIRQ)) == NULL) + return (EBADF); + + ifq = fif->fif_ifq; + IFCQ_LOCK_ASSERT_HELD(ifq); /* lock held by altq_lookup */ + error = fairq_get_class_stats(fif, a->qid, &stats); + IFCQ_UNLOCK(ifq); + if (error != 0) + return (error); + + if ((error = copyout((caddr_t)&stats, (user_addr_t)(uintptr_t)ubuf, + sizeof (stats))) != 0) + return (error); + + *nbytes = sizeof (stats); + + return (0); +} + +static int +altq_fairq_request(struct ifaltq *altq, enum altrq req, void *arg) +{ + struct fairq_if *fif = (struct fairq_if *)altq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + fairq_purge(fif); + break; + + case ALTRQ_PURGE_SC: + /* not supported for ALTQ instance */ + break; + + case ALTRQ_EVENT: + fairq_event(fif, (cqev_t)arg); + break; + } + return (0); +} + +/* + * altq_fairq_enqueue is an enqueue function to be registered to + * (*altq_enqueue) in struct ifaltq. 
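+ *
+ * Note that altq_fairq_getqstats() above follows the size-negotiated
+ * copyout contract shared by all of the shims in this directory: the
+ * caller passes its buffer size in *nbytes, anything smaller than the
+ * stats structure is rejected with EINVAL, and on success the number
+ * of bytes actually copied out is written back.  A minimal sketch of
+ * the expected usage, assuming ubuf names a user-space buffer of at
+ * least that size (hypothetical caller):
+ *
+ *     int nbytes = sizeof (struct fairq_classstats);
+ *     if (altq_fairq_getqstats(a, ubuf, &nbytes) == 0)
+ *         VERIFY(nbytes == sizeof (struct fairq_classstats));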
+ */ +static int +altq_fairq_enqueue(struct ifaltq *altq, struct mbuf *m) +{ + /* grab class set by classifier */ + if (!(m->m_flags & M_PKTHDR)) { + /* should not happen */ + printf("%s: packet for %s does not have pkthdr\n", __func__, + if_name(altq->altq_ifcq->ifcq_ifp)); + m_freem(m); + return (ENOBUFS); + } + + return (fairq_enqueue(altq->altq_disc, NULL, m, m_pftag(m))); +} + +/* + * altq_fairq_dequeue is a dequeue function to be registered to + * (*altq_dequeue) in struct ifaltq. + * + * note: ALTDQ_POLL returns the next packet without removing the packet + * from the queue. ALTDQ_REMOVE is a normal dequeue operation. + * ALTDQ_REMOVE must return the same packet if called immediately + * after ALTDQ_POLL. + */ +static struct mbuf * +altq_fairq_dequeue(struct ifaltq *altq, enum altdq_op op) +{ + return (fairq_dequeue(altq->altq_disc, (cqdq_op_t)op)); +} +#endif /* PF_ALTQ && PKTSCHED_FAIRQ */ diff --git a/bsd/net/altq/altq_fairq.h b/bsd/net/altq/altq_fairq.h new file mode 100644 index 000000000..d9d536ca8 --- /dev/null +++ b/bsd/net/altq/altq_fairq.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 2008 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $DragonFly: src/sys/net/altq/altq_fairq.h,v 1.1 2008/04/06 18:58:15 dillon Exp $ + */ + +#ifndef _NET_ALTQ_ALTQ_FAIRQ_H_ +#define _NET_ALTQ_ALTQ_FAIRQ_H_ + +#include +#include +#include + +#ifdef BSD_KERNEL_PRIVATE +#ifdef __cplusplus +extern "C" { +#endif + +extern int altq_fairq_pfattach(struct pf_altq *); +extern int altq_fairq_add(struct pf_altq *); +extern int altq_fairq_remove(struct pf_altq *); +extern int altq_fairq_add_queue(struct pf_altq *); +extern int altq_fairq_remove_queue(struct pf_altq *); +extern int altq_fairq_getqstats(struct pf_altq *, void *, int *); + +#ifdef __cplusplus +} +#endif +#endif /* BSD_KERNEL_PRIVATE */ +#endif /* _NET_ALTQ_ALTQ_FAIRQ_H_ */ diff --git a/bsd/net/altq/altq_hfsc.c b/bsd/net/altq/altq_hfsc.c new file mode 100644 index 000000000..1e58df421 --- /dev/null +++ b/bsd/net/altq/altq_hfsc.c @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2007-2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $OpenBSD: altq_hfsc.c,v 1.25 2007/09/13 20:40:02 chl Exp $ */ +/* $KAME: altq_hfsc.c,v 1.17 2002/11/29 07:48:33 kjc Exp $ */ + +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. 
+ * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + */ + +#include + +#if PF_ALTQ && PKTSCHED_HFSC + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * function prototypes + */ +static int altq_hfsc_request(struct ifaltq *, enum altrq, void *); +static int altq_hfsc_enqueue(struct ifaltq *, struct mbuf *); +static struct mbuf *altq_hfsc_dequeue(struct ifaltq *, enum altdq_op); + +int +altq_hfsc_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int error; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); + + IFCQ_LOCK(&ifp->if_snd); + error = altq_attach(IFCQ_ALTQ(&ifp->if_snd), ALTQT_HFSC, a->altq_disc, + altq_hfsc_enqueue, altq_hfsc_dequeue, NULL, altq_hfsc_request); + IFCQ_UNLOCK(&ifp->if_snd); + + return (error); +} + +int +altq_hfsc_add(struct pf_altq *a) +{ + struct hfsc_if *hif; + struct ifnet *ifp; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ALTQ_IS_READY(IFCQ_ALTQ(&ifp->if_snd))) + return (ENODEV); + + hif = hfsc_alloc(ifp, M_WAITOK, TRUE); + if (hif == NULL) + return (ENOMEM); + + /* keep the state in pf_altq */ + a->altq_disc = hif; + + return (0); +} + +int +altq_hfsc_remove(struct pf_altq *a) +{ + struct hfsc_if *hif; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((hif = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + return (hfsc_destroy(hif)); +} + +int +altq_hfsc_add_queue(struct pf_altq *a) +{ + struct hfsc_if *hif; + struct hfsc_opts *opts = &a->pq_u.hfsc_opts; + struct service_curve rtsc, lssc, ulsc; + int err; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((hif = a->altq_disc) == NULL) + return (EINVAL); + + bzero(&rtsc, sizeof (rtsc)); + bzero(&lssc, sizeof (lssc)); + bzero(&ulsc, sizeof (ulsc)); + + rtsc.m1 = opts->rtsc_m1; + rtsc.d = opts->rtsc_d; + rtsc.m2 = opts->rtsc_m2; + rtsc.fl = opts->rtsc_fl; + lssc.m1 = opts->lssc_m1; + lssc.d = opts->lssc_d; + lssc.m2 = 
opts->lssc_m2; + lssc.fl = opts->lssc_fl; + ulsc.m1 = opts->ulsc_m1; + ulsc.d = opts->ulsc_d; + ulsc.m2 = opts->ulsc_m2; + ulsc.fl = opts->ulsc_fl; + + IFCQ_LOCK(hif->hif_ifq); + err = hfsc_add_queue(hif, &rtsc, &lssc, &ulsc, a->qlimit, + opts->flags, a->parent_qid, a->qid, NULL); + IFCQ_UNLOCK(hif->hif_ifq); + + return (err); +} + +int +altq_hfsc_remove_queue(struct pf_altq *a) +{ + struct hfsc_if *hif; + int err; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((hif = a->altq_disc) == NULL) + return (EINVAL); + + IFCQ_LOCK(hif->hif_ifq); + err = hfsc_remove_queue(hif, a->qid); + IFCQ_UNLOCK(hif->hif_ifq); + + return (err); +} + +int +altq_hfsc_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + struct ifclassq *ifq = NULL; + struct hfsc_if *hif; + struct hfsc_classstats stats; + int error = 0; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((unsigned)*nbytes < sizeof (stats)) + return (EINVAL); + + if ((hif = altq_lookup(a->ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + ifq = hif->hif_ifq; + IFCQ_LOCK_ASSERT_HELD(ifq); /* lock held by altq_lookup */ + error = hfsc_get_class_stats(hif, a->qid, &stats); + IFCQ_UNLOCK(ifq); + if (error != 0) + return (error); + + if ((error = copyout((caddr_t)&stats, (user_addr_t)(uintptr_t)ubuf, + sizeof (stats))) != 0) + return (error); + + *nbytes = sizeof (stats); + + return (0); +} + +static int +altq_hfsc_request(struct ifaltq *altq, enum altrq req, void *arg) +{ + struct hfsc_if *hif = (struct hfsc_if *)altq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + hfsc_purge(hif); + break; + + case ALTRQ_PURGE_SC: + /* not supported for ALTQ instance */ + break; + + case ALTRQ_EVENT: + hfsc_event(hif, (cqev_t)arg); + break; + } + return (0); +} + +/* + * altq_hfsc_enqueue is an enqueue function to be registered to + * (*altq_enqueue) in struct ifaltq. + */ +static int +altq_hfsc_enqueue(struct ifaltq *altq, struct mbuf *m) +{ + /* grab class set by classifier */ + if (!(m->m_flags & M_PKTHDR)) { + /* should not happen */ + printf("%s: packet for %s does not have pkthdr\n", __func__, + if_name(altq->altq_ifcq->ifcq_ifp)); + m_freem(m); + return (ENOBUFS); + } + + return (hfsc_enqueue(altq->altq_disc, NULL, m, m_pftag(m))); +} + +/* + * altq_hfsc_dequeue is a dequeue function to be registered to + * (*altq_dequeue) in struct ifaltq. + * + * note: ALTDQ_POLL returns the next packet without removing the packet + * from the queue. ALTDQ_REMOVE is a normal dequeue operation. + * ALTDQ_REMOVE must return the same packet if called immediately + * after ALTDQ_POLL. + */ +static struct mbuf * +altq_hfsc_dequeue(struct ifaltq *altq, enum altdq_op op) +{ + return (hfsc_dequeue(altq->altq_disc, (cqdq_op_t)op)); +} +#endif /* PF_ALTQ && PKTSCHED_HFSC */ diff --git a/bsd/net/altq/altq_hfsc.h b/bsd/net/altq/altq_hfsc.h new file mode 100644 index 000000000..6b46293e7 --- /dev/null +++ b/bsd/net/altq/altq_hfsc.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $NetBSD: altq_hfsc.h,v 1.8 2006/10/12 19:59:08 peter Exp $ */ +/* $KAME: altq_hfsc.h,v 1.12 2003/12/05 05:40:46 kjc Exp $ */ + +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + */ +#ifndef _NET_ALTQ_ALTQ_HFSC_H_ +#define _NET_ALTQ_ALTQ_HFSC_H_ + +#include +#include +#include + +#ifdef BSD_KERNEL_PRIVATE +#ifdef __cplusplus +extern "C" { +#endif + +extern int altq_hfsc_pfattach(struct pf_altq *); +extern int altq_hfsc_add(struct pf_altq *); +extern int altq_hfsc_remove(struct pf_altq *); +extern int altq_hfsc_add_queue(struct pf_altq *); +extern int altq_hfsc_remove_queue(struct pf_altq *); +extern int altq_hfsc_getqstats(struct pf_altq *, void *, int *); + +#ifdef __cplusplus +} +#endif +#endif /* BSD_KERNEL_PRIVATE */ +#endif /* _NET_ALTQ_ALTQ_HFSC_H_ */ diff --git a/bsd/net/altq/altq_priq.c b/bsd/net/altq/altq_priq.c new file mode 100644 index 000000000..a86a48383 --- /dev/null +++ b/bsd/net/altq/altq_priq.c @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2007-2012 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $OpenBSD: altq_priq.c,v 1.21 2007/09/13 20:40:02 chl Exp $ */ +/* $KAME: altq_priq.c,v 1.1 2000/10/18 09:15:23 kjc Exp $ */ + +/* + * Copyright (C) 2000-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * priority queue + */ + +#if PF_ALTQ && PKTSCHED_PRIQ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * function prototypes + */ +static int altq_priq_enqueue(struct ifaltq *, struct mbuf *); +static struct mbuf *altq_priq_dequeue(struct ifaltq *, enum altdq_op); +static int altq_priq_request(struct ifaltq *, enum altrq, void *); + +int +altq_priq_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int error; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); + + IFCQ_LOCK(&ifp->if_snd); + error = altq_attach(IFCQ_ALTQ(&ifp->if_snd), ALTQT_PRIQ, a->altq_disc, + altq_priq_enqueue, altq_priq_dequeue, NULL, altq_priq_request); + IFCQ_UNLOCK(&ifp->if_snd); + + return (error); +} + +int +altq_priq_add(struct pf_altq *a) +{ + struct priq_if *pif; + struct ifnet *ifp; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ALTQ_IS_READY(IFCQ_ALTQ(&ifp->if_snd))) + return (ENODEV); + + pif = priq_alloc(ifp, M_WAITOK, TRUE); + if (pif == NULL) + return (ENOMEM); + + /* keep the state in pf_altq */ + a->altq_disc = pif; + + return (0); +} + +int +altq_priq_remove(struct pf_altq *a) +{ + struct priq_if *pif; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + return (priq_destroy(pif)); +} + +int +altq_priq_add_queue(struct pf_altq *a) +{ + struct priq_if *pif; + int err; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + + IFCQ_LOCK(pif->pif_ifq); + err = priq_add_queue(pif, a->priority, a->qlimit, + a->pq_u.priq_opts.flags, a->qid, NULL); + IFCQ_UNLOCK(pif->pif_ifq); + + return (err); +} + +int +altq_priq_remove_queue(struct pf_altq *a) +{ + struct priq_if *pif; + int err; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + + IFCQ_LOCK(pif->pif_ifq); + err = priq_remove_queue(pif, a->qid); + IFCQ_UNLOCK(pif->pif_ifq); + + return (err); +} + +int +altq_priq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + struct ifclassq *ifq = NULL; + struct priq_if *pif; + struct priq_classstats stats; + int error = 0; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((unsigned)*nbytes < sizeof (stats)) + return (EINVAL); + + if ((pif = altq_lookup(a->ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + ifq = pif->pif_ifq; + IFCQ_LOCK_ASSERT_HELD(ifq); /* lock held by altq_lookup */ + error = priq_get_class_stats(pif, a->qid, &stats); + IFCQ_UNLOCK(ifq); + if (error != 0) + return (error); + + if ((error = copyout((caddr_t)&stats, (user_addr_t)(uintptr_t)ubuf, + sizeof (stats))) != 0) + return (error); + + *nbytes = sizeof (stats); + + return (0); +} + +static int +altq_priq_request(struct ifaltq *altq, enum altrq req, void *arg) +{ + struct priq_if *pif = (struct priq_if *)altq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + priq_purge(pif); + break; + + case ALTRQ_PURGE_SC: + case ALTRQ_THROTTLE: + /* not supported for ALTQ instance */ + break; + + case ALTRQ_EVENT: + priq_event(pif, (cqev_t)arg); + break; + } + return (0); +} + +/* + * altq_priq_enqueue is an enqueue function to be registered to + * (*altq_enqueue) in struct ifaltq. 
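+ *
+ * The matching dequeue entry point below must also honor the
+ * ALTDQ_POLL/ALTDQ_REMOVE pairing documented for every discipline
+ * here: a poll followed immediately by a remove has to hand back the
+ * same packet.  An illustrative sketch of the invariant a caller may
+ * rely on (not a real call site):
+ *
+ *     m = (*altq->altq_dequeue)(altq, ALTDQ_POLL);
+ *     if (m != NULL)
+ *         VERIFY(m == (*altq->altq_dequeue)(altq, ALTDQ_REMOVE));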
+ */ +static int +altq_priq_enqueue(struct ifaltq *altq, struct mbuf *m) +{ + /* grab class set by classifier */ + if (!(m->m_flags & M_PKTHDR)) { + /* should not happen */ + printf("%s: packet for %s does not have pkthdr\n", __func__, + if_name(altq->altq_ifcq->ifcq_ifp)); + m_freem(m); + return (ENOBUFS); + } + + return (priq_enqueue(altq->altq_disc, NULL, m, m_pftag(m))); +} + +/* + * altq_priq_dequeue is a dequeue function to be registered to + * (*altq_dequeue) in struct ifaltq. + * + * note: ALTDQ_POLL returns the next packet without removing the packet + * from the queue. ALTDQ_REMOVE is a normal dequeue operation. + * ALTDQ_REMOVE must return the same packet if called immediately + * after ALTDQ_POLL. + */ +static struct mbuf * +altq_priq_dequeue(struct ifaltq *altq, enum altdq_op op) +{ + return (priq_dequeue(altq->altq_disc, (cqdq_op_t)op)); +} +#endif /* PF_ALTQ && PKTSCHED_PRIQ */ diff --git a/bsd/net/altq/altq_priq.h b/bsd/net/altq/altq_priq.h new file mode 100644 index 000000000..f6b6372e9 --- /dev/null +++ b/bsd/net/altq/altq_priq.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $NetBSD: altq_priq.h,v 1.7 2006/10/12 19:59:08 peter Exp $ */ +/* $KAME: altq_priq.h,v 1.7 2003/10/03 05:05:15 kjc Exp $ */ +/* + * Copyright (C) 2000-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NET_ALTQ_ALTQ_PRIQ_H_ +#define _NET_ALTQ_ALTQ_PRIQ_H_ + +#include +#include +#include + +#ifdef BSD_KERNEL_PRIVATE +#ifdef __cplusplus +extern "C" { +#endif + +extern int altq_priq_pfattach(struct pf_altq *); +extern int altq_priq_add(struct pf_altq *); +extern int altq_priq_remove(struct pf_altq *); +extern int altq_priq_add_queue(struct pf_altq *); +extern int altq_priq_remove_queue(struct pf_altq *); +extern int altq_priq_getqstats(struct pf_altq *, void *, int *); + +#ifdef __cplusplus +} +#endif +#endif /* BSD_KERNEL_PRIVATE */ +#endif /* _NET_ALTQ_ALTQ_PRIQ_H_ */ diff --git a/bsd/net/altq/altq_qfq.c b/bsd/net/altq/altq_qfq.c new file mode 100644 index 000000000..d45437e25 --- /dev/null +++ b/bsd/net/altq/altq_qfq.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * quick fair queueing + */ + +#if PF_ALTQ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * function prototypes + */ +static int altq_qfq_enqueue(struct ifaltq *, struct mbuf *); +static struct mbuf *altq_qfq_dequeue(struct ifaltq *, enum altdq_op); +static int altq_qfq_request(struct ifaltq *, enum altrq, void *); + +int +altq_qfq_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int error; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); + + IFCQ_LOCK(&ifp->if_snd); + error = altq_attach(IFCQ_ALTQ(&ifp->if_snd), ALTQT_QFQ, a->altq_disc, + altq_qfq_enqueue, altq_qfq_dequeue, NULL, altq_qfq_request); + IFCQ_UNLOCK(&ifp->if_snd); + + return (error); +} + +int +altq_qfq_add(struct pf_altq *a) +{ + struct qfq_if *qif; + struct ifnet *ifp; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ALTQ_IS_READY(IFCQ_ALTQ(&ifp->if_snd))) + return (ENODEV); + + qif = qfq_alloc(ifp, M_WAITOK, TRUE); + if (qif == NULL) + return (ENOMEM); + + /* keep the state in pf_altq */ + a->altq_disc = qif; + + return (0); +} + +int +altq_qfq_remove(struct pf_altq *a) +{ + struct qfq_if *qif; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((qif = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + return (qfq_destroy(qif)); +} + +int +altq_qfq_add_queue(struct pf_altq *a) +{ + struct qfq_if *qif; + int err; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((qif = a->altq_disc) == NULL) + return (EINVAL); + + IFCQ_LOCK(qif->qif_ifq); + err = qfq_add_queue(qif, a->qlimit, a->weight, a->pq_u.qfq_opts.lmax, + a->pq_u.qfq_opts.flags, a->qid, NULL); + IFCQ_UNLOCK(qif->qif_ifq); + + return (err); +} + +int +altq_qfq_remove_queue(struct pf_altq *a) +{ + struct qfq_if *qif; + int err; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((qif = a->altq_disc) == NULL) + return (EINVAL); + + IFCQ_LOCK(qif->qif_ifq); + err = qfq_remove_queue(qif, a->qid); + IFCQ_UNLOCK(qif->qif_ifq); + + return (err); +} + +int +altq_qfq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + struct ifclassq *ifq = NULL; + struct qfq_if *qif; + struct qfq_classstats stats; + int error = 0; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((unsigned)*nbytes < sizeof (stats)) + return (EINVAL); + + if ((qif = altq_lookup(a->ifname, ALTQT_QFQ)) == NULL) + return (EBADF); + + ifq = qif->qif_ifq; + IFCQ_LOCK_ASSERT_HELD(ifq); /* lock held by altq_lookup */ + error = qfq_get_class_stats(qif, a->qid, &stats); + IFCQ_UNLOCK(ifq); + if (error != 0) + return (error); + + if ((error = copyout((caddr_t)&stats, (user_addr_t)(uintptr_t)ubuf, + sizeof (stats))) != 0) + return (error); + + *nbytes = sizeof (stats); + + return (0); +} + +static int +altq_qfq_request(struct ifaltq *altq, enum altrq req, void *arg) +{ + struct qfq_if *qif = (struct qfq_if *)altq->altq_disc; + + switch (req) { + case ALTRQ_PURGE: + qfq_purge(qif); + break; + + case ALTRQ_PURGE_SC: + /* not supported for ALTQ instance */ + break; + + case ALTRQ_EVENT: + qfq_event(qif, (cqev_t)arg); + break; + } + return (0); +} + +/* + * altq_qfq_enqueue is an enqueue function to be registered to + * (*altq_enqueue) in struct ifaltq. 
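+ *
+ * The body below refuses any mbuf without M_PKTHDR: the classifier
+ * tag it forwards to qfq_enqueue() via m_pftag() lives in the packet
+ * header, so a header-less mbuf carries no classification and is
+ * freed rather than queued.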
+ */ +static int +altq_qfq_enqueue(struct ifaltq *altq, struct mbuf *m) +{ + /* grab class set by classifier */ + if (!(m->m_flags & M_PKTHDR)) { + /* should not happen */ + printf("%s: packet for %s does not have pkthdr\n", __func__, + if_name(altq->altq_ifcq->ifcq_ifp)); + m_freem(m); + return (ENOBUFS); + } + + return (qfq_enqueue(altq->altq_disc, NULL, m, m_pftag(m))); +} + +/* + * altq_qfq_dequeue is a dequeue function to be registered to + * (*altq_dequeue) in struct ifaltq. + * + * note: ALTDQ_POLL returns the next packet without removing the packet + * from the queue. ALTDQ_REMOVE is a normal dequeue operation. + * ALTDQ_REMOVE must return the same packet if called immediately + * after ALTDQ_POLL. + */ +static struct mbuf * +altq_qfq_dequeue(struct ifaltq *altq, enum altdq_op op) +{ + return (qfq_dequeue(altq->altq_disc, (cqdq_op_t)op)); +} +#endif /* PF_ALTQ */ diff --git a/bsd/net/altq/altq_qfq.h b/bsd/net/altq/altq_qfq.h new file mode 100644 index 000000000..790742229 --- /dev/null +++ b/bsd/net/altq/altq_qfq.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_ALTQ_ALTQ_QFQ_H_ +#define _NET_ALTQ_ALTQ_QFQ_H_ + +#include +#include +#include + +#ifdef BSD_KERNEL_PRIVATE +#ifdef __cplusplus +extern "C" { +#endif + +extern int altq_qfq_pfattach(struct pf_altq *); +extern int altq_qfq_add(struct pf_altq *); +extern int altq_qfq_remove(struct pf_altq *); +extern int altq_qfq_add_queue(struct pf_altq *); +extern int altq_qfq_remove_queue(struct pf_altq *); +extern int altq_qfq_getqstats(struct pf_altq *, void *, int *); + +#ifdef __cplusplus +} +#endif +#endif /* BSD_KERNEL_PRIVATE */ +#endif /* _NET_ALTQ_ALTQ_QFQ_H_ */ diff --git a/bsd/net/altq/altq_subr.c b/bsd/net/altq/altq_subr.c new file mode 100644 index 000000000..5b00e6f5b --- /dev/null +++ b/bsd/net/altq/altq_subr.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2007-2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $OpenBSD: altq_subr.c,v 1.24 2007/12/11 00:30:14 mikeb Exp $ */ +/* $KAME: altq_subr.c,v 1.11 2002/01/11 08:11:49 kjc Exp $ */ + +/* + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +SYSCTL_NODE(_net, OID_AUTO, altq, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "ALTQ"); + +static u_int32_t altq_debug; +SYSCTL_UINT(_net_altq, OID_AUTO, debug, CTLFLAG_RW, &altq_debug, 0, + "Enable ALTQ debugging"); + +/* + * look up the queue state by the interface name and the queueing type; + * upon success, returns with the interface send queue lock held, and + * the caller is responsible for releasing it. 
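+ *
+ * A sketch of the intended calling pattern, mirroring
+ * altq_priq_getqstats() above (error handling elided):
+ *
+ *	if ((pif = altq_lookup(a->ifname, ALTQT_PRIQ)) == NULL)
+ *		return (EBADF);
+ *	error = priq_get_class_stats(pif, a->qid, &stats);
+ *	IFCQ_UNLOCK(pif->pif_ifq);
+ *
+ * Note that it is the caller, not altq_lookup(), that drops the
+ * send queue lock.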
+ */ +void * +altq_lookup(char *name, u_int32_t type) +{ + struct ifnet *ifp; + void *state = NULL; + + if ((ifp = ifunit(name)) != NULL) { + IFCQ_LOCK(&ifp->if_snd); + if (type != ALTQT_NONE && + IFCQ_ALTQ(&ifp->if_snd)->altq_type == type) + state = IFCQ_ALTQ(&ifp->if_snd)->altq_disc; + if (state == NULL) + IFCQ_UNLOCK(&ifp->if_snd); + } + + if (state != NULL) + IFCQ_LOCK_ASSERT_HELD(&ifp->if_snd); + + return (state); +} + +int +altq_attach(struct ifaltq *altq, u_int32_t type, void *discipline, + altq_enq_func enqueue, altq_deq_func dequeue, + altq_deq_sc_func dequeue_sc, altq_req_func request) +{ + IFCQ_LOCK_ASSERT_HELD(altq->altq_ifcq); + + if (!ALTQ_IS_READY(altq)) + return (ENXIO); + + VERIFY(enqueue != NULL); + VERIFY(!(dequeue != NULL && dequeue_sc != NULL)); + VERIFY(request != NULL); + + altq->altq_type = type; + altq->altq_disc = discipline; + altq->altq_enqueue = enqueue; + altq->altq_dequeue = dequeue; + altq->altq_dequeue_sc = dequeue_sc; + altq->altq_request = request; + altq->altq_flags &= (ALTQF_CANTCHANGE|ALTQF_ENABLED); + + return (0); +} + +int +altq_detach(struct ifaltq *altq) +{ + IFCQ_LOCK_ASSERT_HELD(altq->altq_ifcq); + + if (!ALTQ_IS_READY(altq)) + return (ENXIO); + if (ALTQ_IS_ENABLED(altq)) + return (EBUSY); + if (!ALTQ_IS_ATTACHED(altq)) + return (0); + + altq->altq_type = ALTQT_NONE; + altq->altq_disc = NULL; + altq->altq_enqueue = NULL; + altq->altq_dequeue = NULL; + altq->altq_dequeue_sc = NULL; + altq->altq_request = NULL; + altq->altq_flags &= ALTQF_CANTCHANGE; + + return (0); +} + +int +altq_enable(struct ifaltq *altq) +{ + struct ifclassq *ifq = altq->altq_ifcq; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if (!ALTQ_IS_READY(altq)) + return (ENXIO); + if (ALTQ_IS_ENABLED(altq)) + return (0); + + altq->altq_flags |= ALTQF_ENABLED; + + return (0); +} + +int +altq_disable(struct ifaltq *altq) +{ + struct ifclassq *ifq = altq->altq_ifcq; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if (!ALTQ_IS_ENABLED(altq)) + return (0); + + if_qflush(ifq->ifcq_ifp, 1); + + altq->altq_flags &= ~ALTQF_ENABLED; + + return (0); +} + +/* + * add a discipline or a queue + */ +int +altq_add(struct pf_altq *a) +{ + int error = 0; + + VERIFY(machclk_freq != 0); + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if (a->qname[0] != 0) + return (altq_add_queue(a)); + + switch (a->scheduler) { +#if PKTSCHED_CBQ + case ALTQT_CBQ: + error = altq_cbq_add(a); + break; +#endif /* PKTSCHED_CBQ */ +#if PKTSCHED_PRIQ + case ALTQT_PRIQ: + error = altq_priq_add(a); + break; +#endif /* PKTSCHED_PRIQ */ +#if PKTSCHED_HFSC + case ALTQT_HFSC: + error = altq_hfsc_add(a); + break; +#endif /* PKTSCHED_HFSC */ +#if PKTSCHED_FAIRQ + case ALTQT_FAIRQ: + error = altq_fairq_add(a); + break; +#endif /* PKTSCHED_FAIRQ */ + case ALTQT_QFQ: + error = altq_qfq_add(a); + break; + default: + error = ENXIO; + } + + return (error); +} + +/* + * remove a discipline or a queue + */ +int +altq_remove(struct pf_altq *a) +{ + int error = 0; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if (a->qname[0] != 0) + return (altq_remove_queue(a)); + + switch (a->scheduler) { +#if PKTSCHED_CBQ + case ALTQT_CBQ: + error = altq_cbq_remove(a); + break; +#endif /* PKTSCHED_CBQ */ +#if PKTSCHED_PRIQ + case ALTQT_PRIQ: + error = altq_priq_remove(a); + break; +#endif /* PKTSCHED_PRIQ */ +#if PKTSCHED_HFSC + case ALTQT_HFSC: + error = altq_hfsc_remove(a); + break; +#endif /* PKTSCHED_HFSC */ +#if PKTSCHED_FAIRQ + case ALTQT_FAIRQ: + error = altq_fairq_remove(a); + break; +#endif /* PKTSCHED_FAIRQ */ + case ALTQT_QFQ: + error = 
altq_qfq_remove(a); + break; + default: + error = ENXIO; + } + + return (error); +} + +/* + * add a queue to the discipline + */ +int +altq_add_queue(struct pf_altq *a) +{ + int error = 0; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + switch (a->scheduler) { +#if PKTSCHED_CBQ + case ALTQT_CBQ: + error = altq_cbq_add_queue(a); + break; +#endif /* PKTSCHED_CBQ */ +#if PKTSCHED_PRIQ + case ALTQT_PRIQ: + error = altq_priq_add_queue(a); + break; +#endif /* PKTSCHED_PRIQ */ +#if PKTSCHED_HFSC + case ALTQT_HFSC: + error = altq_hfsc_add_queue(a); + break; +#endif /* PKTSCHED_HFSC */ +#if PKTSCHED_FAIRQ + case ALTQT_FAIRQ: + error = altq_fairq_add_queue(a); + break; +#endif /* PKTSCHED_FAIRQ */ + case ALTQT_QFQ: + error = altq_qfq_add_queue(a); + break; + default: + error = ENXIO; + } + + return (error); +} + +/* + * remove a queue from the discipline + */ +int +altq_remove_queue(struct pf_altq *a) +{ + int error = 0; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + switch (a->scheduler) { +#if PKTSCHED_CBQ + case ALTQT_CBQ: + error = altq_cbq_remove_queue(a); + break; +#endif /* PKTSCHED_CBQ */ +#if PKTSCHED_PRIQ + case ALTQT_PRIQ: + error = altq_priq_remove_queue(a); + break; +#endif /* PKTSCHED_PRIQ */ +#if PKTSCHED_HFSC + case ALTQT_HFSC: + error = altq_hfsc_remove_queue(a); + break; +#endif /* PKTSCHED_HFSC */ +#if PKTSCHED_FAIRQ + case ALTQT_FAIRQ: + error = altq_fairq_remove_queue(a); + break; +#endif /* PKTSCHED_FAIRQ */ + case ALTQT_QFQ: + error = altq_qfq_remove_queue(a); + break; + default: + error = ENXIO; + } + + return (error); +} + +/* + * get queue statistics + */ +int +altq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + int error = 0; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + switch (a->scheduler) { +#if PKTSCHED_CBQ + case ALTQT_CBQ: + error = altq_cbq_getqstats(a, ubuf, nbytes); + break; +#endif /* PKTSCHED_CBQ */ +#if PKTSCHED_PRIQ + case ALTQT_PRIQ: + error = altq_priq_getqstats(a, ubuf, nbytes); + break; +#endif /* PKTSCHED_PRIQ */ +#if PKTSCHED_HFSC + case ALTQT_HFSC: + error = altq_hfsc_getqstats(a, ubuf, nbytes); + break; +#endif /* PKTSCHED_HFSC */ +#if PKTSCHED_FAIRQ + case ALTQT_FAIRQ: + error = altq_fairq_getqstats(a, ubuf, nbytes); + break; +#endif /* PKTSCHED_FAIRQ */ + case ALTQT_QFQ: + error = altq_qfq_getqstats(a, ubuf, nbytes); + break; + default: + error = ENXIO; + } + + return (error); +} + +/* + * attach a discipline to the interface. if one already exists, it is + * overridden. + */ +int +altq_pfattach(struct pf_altq *a) +{ + int error = 0; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + switch (a->scheduler) { + case ALTQT_NONE: + break; +#if PKTSCHED_CBQ + case ALTQT_CBQ: + error = altq_cbq_pfattach(a); + break; +#endif /* PKTSCHED_CBQ */ +#if PKTSCHED_PRIQ + case ALTQT_PRIQ: + error = altq_priq_pfattach(a); + break; +#endif /* PKTSCHED_PRIQ */ +#if PKTSCHED_HFSC + case ALTQT_HFSC: + error = altq_hfsc_pfattach(a); + break; +#endif /* PKTSCHED_HFSC */ +#if PKTSCHED_FAIRQ + case ALTQT_FAIRQ: + error = altq_fairq_pfattach(a); + break; +#endif /* PKTSCHED_FAIRQ */ + case ALTQT_QFQ: + error = altq_qfq_pfattach(a); + break; + default: + error = ENXIO; + } + + return (error); +} + +/* + * detach a discipline from the interface. + * it is possible that the discipline was already overridden by another + * discipline. 
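+ *
+ * That is why the body below compares a->altq_disc with the
+ * discipline currently installed on IFCQ_ALTQ(&ifp->if_snd) and
+ * returns quietly on a mismatch: tearing that state down would pull
+ * the queue out from under whichever attach now owns it.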
+ */ +int +altq_pfdetach(struct pf_altq *a) +{ + struct ifnet *ifp; + int error = 0; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + + /* if this discipline is no longer referenced, just return */ + IFCQ_LOCK(&ifp->if_snd); + if (a->altq_disc == NULL || + a->altq_disc != IFCQ_ALTQ(&ifp->if_snd)->altq_disc) { + IFCQ_UNLOCK(&ifp->if_snd); + return (0); + } + + if (ALTQ_IS_ENABLED(IFCQ_ALTQ(&ifp->if_snd))) + error = altq_disable(IFCQ_ALTQ(&ifp->if_snd)); + if (error == 0) + error = altq_detach(IFCQ_ALTQ(&ifp->if_snd)); + IFCQ_UNLOCK(&ifp->if_snd); + return (error); +} + + diff --git a/bsd/net/altq/altq_var.h b/bsd/net/altq/altq_var.h new file mode 100644 index 000000000..e866a4dc4 --- /dev/null +++ b/bsd/net/altq/altq_var.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $NetBSD: altq_var.h,v 1.10 2006/10/15 13:17:13 peter Exp $ */ +/* $KAME: altq_var.h,v 1.18 2005/04/13 03:44:25 suz Exp $ */ + +/* + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _NET_ALTQ_ALTQ_VAR_H_ +#define _NET_ALTQ_ALTQ_VAR_H_ + +#ifdef BSD_KERNEL_PRIVATE +#if PF_ALTQ +#include +#include +#include +#include +#include +#include +#if PKTSCHED_HFSC +#include +#endif /* PKTSCHED_HFSC */ +#if PKTSCHED_FAIRQ +#include +#endif /* PKTSCHED_FAIRQ */ +#if PKTSCHED_CBQ +#include +#endif /* PKTSCHED_CBQ */ +#if PKTSCHED_PRIQ +#include +#endif /* PKTSCHED_PRIQ */ +#include + +struct pf_altq; + +extern void *altq_lookup(char *, u_int32_t); +extern int altq_pfattach(struct pf_altq *); +extern int altq_pfdetach(struct pf_altq *); +extern int altq_add(struct pf_altq *); +extern int altq_remove(struct pf_altq *); +extern int altq_add_queue(struct pf_altq *); +extern int altq_remove_queue(struct pf_altq *); +extern int altq_getqstats(struct pf_altq *, void *, int *); + +#endif /* PF_ALTQ */ +#endif /* BSD_KERNEL_PRIVATE */ +#endif /* _NET_ALTQ_ALTQ_VAR_H_ */ diff --git a/bsd/net/altq/if_altq.h b/bsd/net/altq/if_altq.h new file mode 100644 index 000000000..6d634cf5d --- /dev/null +++ b/bsd/net/altq/if_altq.h @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2008-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* $OpenBSD: if_altq.h,v 1.11 2007/11/18 12:51:48 mpf Exp $ */ +/* $KAME: if_altq.h,v 1.6 2001/01/29 19:59:09 itojun Exp $ */ + +/* + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _NET_ALTQ_IF_ALTQ_H_ +#define _NET_ALTQ_IF_ALTQ_H_ + +#ifdef BSD_KERNEL_PRIVATE +#if PF_ALTQ +#include + +/* altq request types */ +typedef enum altrq { + ALTRQ_PURGE = CLASSQRQ_PURGE, /* purge all packets */ + ALTRQ_PURGE_SC = CLASSQRQ_PURGE_SC, /* purge SC flow */ + ALTRQ_EVENT = CLASSQRQ_EVENT, /* interface events */ + ALTRQ_THROTTLE = CLASSQRQ_THROTTLE, /* throttle packets */ +} altrq_t; + +struct ifaltq; +enum altdq_op; + +typedef int (*altq_enq_func)(struct ifaltq *, struct mbuf *); +typedef struct mbuf *(*altq_deq_func)(struct ifaltq *, enum altdq_op); +typedef struct mbuf *(*altq_deq_sc_func)(struct ifaltq *, + mbuf_svc_class_t, enum altdq_op); +typedef int (*altq_req_func)(struct ifaltq *, enum altrq, void *); + +/* + * Structure defining a queue for a network interface. + */ +struct ifaltq { + struct ifclassq *altq_ifcq; /* back pointer to interface queue */ + + /* alternate queueing related fields */ + u_int32_t altq_type; /* discipline type */ + u_int32_t altq_flags; /* flags (e.g. 
ready, in-use) */ + void *altq_disc; /* for discipline-specific use */ + + altq_enq_func altq_enqueue; + altq_deq_func altq_dequeue; + altq_deq_sc_func altq_dequeue_sc; + altq_req_func altq_request; +}; + +/* altq_flags */ +#define ALTQF_READY 0x01 /* driver supports alternate queueing */ +#define ALTQF_ENABLED 0x02 /* altq is in use */ +#define ALTQF_DRIVER1 0x40 /* driver specific */ + +/* altq_flags set internally only: */ +#define ALTQF_CANTCHANGE (ALTQF_READY) + +/* altq_dequeue op arg */ +typedef enum altdq_op { + ALTDQ_REMOVE = CLASSQDQ_REMOVE, /* dequeue mbuf from the queue */ + ALTDQ_POLL = CLASSQDQ_POLL, /* don't dequeue mbuf from the queue */ +} altdq_op_t; + +#define ALTQ_IS_READY(_altq) ((_altq)->altq_flags & ALTQF_READY) +#define ALTQ_IS_ENABLED(_altq) ((_altq)->altq_flags & ALTQF_ENABLED) +#define ALTQ_IS_ATTACHED(_altq) ((_altq)->altq_disc != NULL) + +#define ALTQ_ENQUEUE(_altq, _m, _err) do { \ + (_err) = (*(_altq)->altq_enqueue)(_altq, _m); \ +} while (0) + +#define ALTQ_DEQUEUE(_altq, _m) do { \ + (_m) = (*(_altq)->altq_dequeue)(_altq, ALTDQ_REMOVE); \ +} while (0) + +#define ALTQ_DEQUEUE_SC(_altq, _sc, _m) do { \ + (_m) = (*(_altq)->altq_dequeue_sc)(_altq, _sc, ALTDQ_REMOVE); \ +} while (0) + +#define ALTQ_POLL(_altq, _m) do { \ + (_m) = (*(_altq)->altq_dequeue)(_altq, ALTDQ_POLL); \ +} while (0) + +#define ALTQ_POLL_SC(_altq, _sc, _m) do { \ + (_m) = (*(_altq)->altq_dequeue_sc)(_altq, _sc, ALTDQ_POLL); \ +} while (0) + +#define ALTQ_PURGE(_altq) do { \ + (void) (*(_altq)->altq_request)(_altq, ALTRQ_PURGE, NULL); \ +} while (0) + +#define ALTQ_PURGE_SC(_altq, _sc, _flow, _packets, _bytes) do { \ + cqrq_purge_sc_t _req = { _sc, _flow, 0, 0 }; \ + (void) (*(_altq)->altq_request)(_altq, ALTRQ_PURGE_SC, &_req); \ + (_packets) = _req.packets; \ + (_bytes) = _req.bytes; \ +} while (0) + +#define ALTQ_UPDATE(_altq, _ev) do { \ + (void) (*(_altq)->altq_request)(_altq, ALTRQ_EVENT, \ + (void *)(_ev)); \ +} while (0) + +#define ALTQ_SET_READY(_altq) do { \ + IFCQ_LOCK_ASSERT_HELD((_altq)->altq_ifcq); \ + (_altq)->altq_flags |= ALTQF_READY; \ +} while (0) + +#define ALTQ_CLEAR_READY(_altq) do { \ + IFCQ_LOCK_ASSERT_HELD((_altq)->altq_ifcq); \ + (_altq)->altq_flags &= ~ALTQF_READY; \ +} while (0) + +extern int altq_attach(struct ifaltq *, u_int32_t, void *, + altq_enq_func, altq_deq_func, altq_deq_sc_func, altq_req_func); +extern int altq_detach(struct ifaltq *); +extern int altq_enable(struct ifaltq *); +extern int altq_disable(struct ifaltq *); +#endif /* PF_ALTQ */ +#endif /* BSD_KERNEL_PRIVATE */ +#endif /* _NET_ALTQ_IF_ALTQ_H_ */ diff --git a/bsd/net/bpf.c b/bsd/net/bpf.c index e370dfc5e..b1ac5f1e8 100644 --- a/bsd/net/bpf.c +++ b/bsd/net/bpf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -97,12 +97,10 @@ #include #include -#if defined(sparc) && BSD < 199103 -#include -#endif #include #include +#include #include #include @@ -110,6 +108,13 @@ #include #include +#include +#include +#include +#include +#include +#include +#include #include #include #include @@ -127,19 +132,8 @@ extern int tvtohz(struct timeval *); -/* - * Older BSDs don't have kernel malloc. 
- */ -#if BSD < 199103 -extern bcopy(); -static caddr_t bpf_alloc(); -#include -#define BPF_BUFSIZE (MCLBYTES-8) -#define UIOMOVE(cp, len, code, uio) uiomove(cp, len, code, uio) -#else #define BPF_BUFSIZE 4096 #define UIOMOVE(cp, len, code, uio) uiomove(cp, len, uio) -#endif #define PRINET 26 /* interruptible */ @@ -177,7 +171,8 @@ static struct bpf_d **bpf_dtab = NULL; static unsigned int bpf_dtab_size = 0; static unsigned int nbpfilter = 0; -static lck_mtx_t *bpf_mlock; +decl_lck_mtx_data(static, bpf_mlock_data); +static lck_mtx_t *bpf_mlock = &bpf_mlock_data; static lck_grp_t *bpf_mlock_grp; static lck_grp_attr_t *bpf_mlock_grp_attr; static lck_attr_t *bpf_mlock_attr; @@ -199,13 +194,14 @@ static int bpf_movein(struct uio *, int, static int bpf_setif(struct bpf_d *, ifnet_t ifp, u_int32_t dlt); static void bpf_timed_out(void *, void *); static void bpf_wakeup(struct bpf_d *); -static void catchpacket(struct bpf_d *, u_char *, u_int, - u_int, void (*)(const void *, void *, size_t)); +static void catchpacket(struct bpf_d *, u_char *, struct mbuf *, u_int, + u_int, int, void (*)(const void *, void *, size_t)); static void reset_d(struct bpf_d *); static int bpf_setf(struct bpf_d *, u_int bf_len, user_addr_t bf_insns); -static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *, - struct proc *); +static int bpf_getdltlist(struct bpf_d *, caddr_t, struct proc *); static int bpf_setdlt(struct bpf_d *, u_int); +static int bpf_set_traffic_class(struct bpf_d *, int); +static void bpf_set_packet_service_class(struct mbuf *, int); /*static void *bpf_devfs_token[MAXBPFILTER];*/ @@ -222,8 +218,8 @@ static int bpf_tap_callback(struct ifnet *ifp, struct mbuf *m); d_close_t bpfclose; d_read_t bpfread; d_write_t bpfwrite; - ioctl_fcn_t bpfioctl; - select_fcn_t bpfselect; + ioctl_fcn_t bpfioctl; + select_fcn_t bpfselect; /* Darwin's cdevsw struct differs slightly from BSDs */ @@ -234,15 +230,15 @@ static struct cdevsw bpf_cdevsw = { /* read */ bpfread, /* write */ bpfwrite, /* ioctl */ bpfioctl, - /* stop */ eno_stop, - /* reset */ eno_reset, - /* tty */ NULL, - /* select */ bpfselect, - /* mmap */ eno_mmap, - /* strategy*/ eno_strat, - /* getc */ eno_getc, - /* putc */ eno_putc, - /* type */ 0 + /* stop */ eno_stop, + /* reset */ eno_reset, + /* tty */ NULL, + /* select */ bpfselect, + /* mmap */ eno_mmap, + /* strategy*/ eno_strat, + /* getc */ eno_getc, + /* putc */ eno_putc, + /* type */ 0 }; #define SOCKADDR_HDR_LEN offsetof(struct sockaddr, sa_data) @@ -316,7 +312,7 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc sa_family = AF_IEEE80211; hlen = 0; break; - + case DLT_IEEE802_11_RADIO: sa_family = AF_IEEE80211; hlen = 0; @@ -360,13 +356,8 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc if (m == 0) return (ENOBUFS); if ((unsigned)len > MHLEN) { -#if BSD >= 199103 MCLGET(m, M_WAIT); if ((m->m_flags & M_EXT) == 0) { -#else - MCLGET(m); - if (m->m_len != MCLBYTES) { -#endif error = ENOBUFS; goto bad; } @@ -381,11 +372,7 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc if (hlen != 0) { m->m_pkthdr.len -= hlen; m->m_len -= hlen; -#if BSD >= 199103 m->m_data += hlen; /* XXX */ -#else - m->m_off += hlen; -#endif error = UIOMOVE((caddr_t)sockp->sa_data, hlen, UIO_WRITE, uio); if (error) goto bad; @@ -691,7 +678,8 @@ bpfopen(dev_t dev, int flags, __unused int fmt, d->bd_seesent = 1; d->bd_oflags = flags; d->bd_state = BPF_IDLE; - d->bd_thread_call = thread_call_allocate(bpf_timed_out, d); + 
d->bd_thread_call = thread_call_allocate(bpf_timed_out, d); + d->bd_traffic_class = SO_TC_BE; if (d->bd_thread_call == NULL) { printf("bpfopen: malloc thread call failed\n"); @@ -817,6 +805,26 @@ bpf_sleep(struct bpf_d *d, int pri, const char *wmesg, int timo) return msleep1((caddr_t)d, bpf_mlock, pri, wmesg, abstime); } +static struct inpcb * +bpf_findinpcb(struct inpcbinfo *pcbinfo, uint32_t flowhash) +{ + struct inpcb *inp = NULL; + + if (!flowhash) return (NULL); + + lck_rw_lock_shared(pcbinfo->mtx); + LIST_FOREACH(inp, pcbinfo->listhead, inp_list) { + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) { + if (inp->inp_flowhash == flowhash) + break; + in_pcb_checkstate(inp, WNT_RELEASE, 0); + } + } + lck_rw_done(pcbinfo->mtx); + + return (inp); +} + /* * Rotate the packet buffers in descriptor d. Move the store buffer * into the hold slot, and the free buffer into the store slot. @@ -936,6 +944,39 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) * At this point, we know we have something in the hold slot. */ + /* + * Before we move data to userland, we fill out the extended + * header fields. + */ + if (d->bd_extendedhdr) { + char *p; + + p = d->bd_hbuf; + while (p < d->bd_hbuf + d->bd_hlen) { + struct bpf_hdr_ext *ehp; + struct inpcb *inp; + uint32_t flowhash; + pid_t pid; + + ehp = (struct bpf_hdr_ext *)(void *)p; + if ((flowhash = ehp->bh_flowhash)) { + if (ehp->bh_flags & BPF_HDR_EXT_FLAGS_TCP) + inp = bpf_findinpcb(&tcbinfo, flowhash); + else + inp = bpf_findinpcb(&udbinfo, flowhash); + if (inp) { + socket_lock(inp->inp_socket, 0); + pid = inp->inp_socket->last_pid; + in_pcb_checkstate(inp, WNT_RELEASE, 1); + socket_unlock(inp->inp_socket, 0); + ehp->bh_pid = pid; + proc_name(pid, ehp->bh_comm, MAXCOMLEN); + } + ehp->bh_flowhash = 0; + } + p += BPF_WORDALIGN(ehp->bh_hdrlen + ehp->bh_caplen); + } + } /* * Move data from hold buffer into user space. * We know the entire buffer is transferred since @@ -965,20 +1006,12 @@ bpf_wakeup(struct bpf_d *d) if (d->bd_async && d->bd_sig && d->bd_sigio) pgsigio(d->bd_sigio, d->bd_sig); -#if BSD >= 199103 selwakeup(&d->bd_sel); KNOTE(&d->bd_sel.si_note, 1); #ifndef __APPLE__ /* XXX */ d->bd_sel.si_pid = 0; #endif -#else - if (d->bd_selproc) { - selwakeup(d->bd_selproc, (int)d->bd_selcoll); - d->bd_selcoll = 0; - d->bd_selproc = 0; - } -#endif } @@ -1050,26 +1083,26 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag) } ((struct sockaddr *)dst_buf)->sa_len = sizeof(dst_buf); - /* - * fix for PR-6849527 - * geting variables onto stack before dropping lock for bpf_movein() - */ - bif_dlt = (int)d->bd_bif->bif_dlt; - bd_hdrcmplt = d->bd_hdrcmplt; - + /* + * fix for PR-6849527 + * geting variables onto stack before dropping lock for bpf_movein() + */ + bif_dlt = (int)d->bd_bif->bif_dlt; + bd_hdrcmplt = d->bd_hdrcmplt; + /* bpf_movein allocating mbufs; drop lock */ - lck_mtx_unlock(bpf_mlock); + lck_mtx_unlock(bpf_mlock); error = bpf_movein(uio, bif_dlt, &m, - bd_hdrcmplt ? NULL : (struct sockaddr *)dst_buf, - &datlen); - - if (error) { + bd_hdrcmplt ? 
NULL : (struct sockaddr *)dst_buf, + &datlen); + + if (error) { return (error); } /* taking the lock again and verifying whether device is open */ - lck_mtx_lock(bpf_mlock); + lck_mtx_lock(bpf_mlock); d = bpf_dtab[minor(dev)]; if (d == 0 || d == (void *)1) { lck_mtx_unlock(bpf_mlock); @@ -1093,16 +1126,19 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag) #if CONFIG_MACF_NET mac_mbuf_label_associate_bpfdesc(d, m); #endif + + bpf_set_packet_service_class(m, d->bd_traffic_class); + lck_mtx_unlock(bpf_mlock); if (d->bd_hdrcmplt) { if (d->bd_bif->bif_send) error = d->bd_bif->bif_send(ifp, d->bd_bif->bif_dlt, m); else - error = dlil_output(ifp, 0, m, NULL, NULL, 1); - } - else { - error = dlil_output(ifp, PF_INET, m, NULL, (struct sockaddr *)dst_buf, 0); + error = dlil_output(ifp, 0, m, NULL, NULL, 1, NULL); + } else { + error = dlil_output(ifp, PF_INET, m, NULL, + (struct sockaddr *)dst_buf, 0, NULL); } /* @@ -1148,6 +1184,9 @@ reset_d(struct bpf_d *d) * BIOCSHDRCMPLT Set "header already complete" flag * BIOCGSEESENT Get "see packets sent" flag * BIOCSSEESENT Set "see packets sent" flag + * BIOCSETTC Set traffic class. + * BIOCGETTC Get traffic class. + * BIOCSEXTHDR Set "extended header" flag */ /* ARGSUSED */ int @@ -1155,7 +1194,8 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, struct proc *p) { struct bpf_d *d; - int error = 0; + int error = 0, int_arg; + struct ifreq ifr; lck_mtx_lock(bpf_mlock); @@ -1178,7 +1218,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, /* * Check for read packet available. */ - case FIONREAD: + case FIONREAD: /* int */ { int n; @@ -1186,11 +1226,11 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, if (d->bd_hbuf) n += d->bd_hlen; - *(int *)addr = n; + bcopy(&n, addr, sizeof (n)); break; } - case SIOCGIFADDR: + case SIOCGIFADDR: /* struct ifreq */ { struct ifnet *ifp; @@ -1206,44 +1246,47 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, /* * Get buffer len [for read()]. */ - case BIOCGBLEN: - *(u_int *)addr = d->bd_bufsize; + case BIOCGBLEN: /* u_int */ + bcopy(&d->bd_bufsize, addr, sizeof (u_int)); break; /* * Set buffer length. */ - case BIOCSBLEN: -#if BSD < 199103 - error = EINVAL; -#else + case BIOCSBLEN: /* u_int */ if (d->bd_bif != 0) error = EINVAL; else { - u_int size = *(u_int *)addr; + u_int size; + + bcopy(addr, &size, sizeof (size)); if (size > bpf_maxbufsize) - *(u_int *)addr = size = bpf_maxbufsize; + size = bpf_maxbufsize; else if (size < BPF_MINBUFSIZE) - *(u_int *)addr = size = BPF_MINBUFSIZE; + size = BPF_MINBUFSIZE; + bcopy(&size, addr, sizeof (size)); d->bd_bufsize = size; } -#endif break; /* * Set link layer read filter. */ - case BIOCSETF32: { - struct bpf_program32 *prg32 = (struct bpf_program32 *)addr; - error = bpf_setf(d, prg32->bf_len, - CAST_USER_ADDR_T(prg32->bf_insns)); + case BIOCSETF32: { /* struct bpf_program32 */ + struct bpf_program32 prg32; + + bcopy(addr, &prg32, sizeof (prg32)); + error = bpf_setf(d, prg32.bf_len, + CAST_USER_ADDR_T(prg32.bf_insns)); break; } - case BIOCSETF64: { - struct bpf_program64 *prg64 = (struct bpf_program64 *)addr; - error = bpf_setf(d, prg64->bf_len, prg64->bf_insns); + case BIOCSETF64: { /* struct bpf_program64 */ + struct bpf_program64 prg64; + + bcopy(addr, &prg64, sizeof (prg64)); + error = bpf_setf(d, prg64.bf_len, prg64.bf_insns); break; } @@ -1277,56 +1320,62 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, /* * Get device parameters. 
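+ *
+ * (Throughout this handler the rewrite copies scalar arguments with
+ * bcopy() instead of dereferencing addr through a typed pointer;
+ * like the (void *) casts added in bpf_filter.c below, this avoids
+ * making alignment assumptions about the argument buffer.)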
*/ - case BIOCGDLT: + case BIOCGDLT: /* u_int */ if (d->bd_bif == 0) error = EINVAL; else - *(u_int *)addr = d->bd_bif->bif_dlt; + bcopy(&d->bd_bif->bif_dlt, addr, sizeof (u_int)); break; /* * Get a list of supported data link types. */ - case BIOCGDLTLIST: + case BIOCGDLTLIST: /* struct bpf_dltlist */ if (d->bd_bif == NULL) { error = EINVAL; } else { - error = bpf_getdltlist(d, - (struct bpf_dltlist *)addr, p); + error = bpf_getdltlist(d, addr, p); } break; /* * Set data link type. */ - case BIOCSDLT: - if (d->bd_bif == NULL) - error = EINVAL; - else - error = bpf_setdlt(d, *(u_int *)addr); - break; + case BIOCSDLT: /* u_int */ + if (d->bd_bif == NULL) { + error = EINVAL; + } else { + u_int dlt; + + bcopy(addr, &dlt, sizeof (dlt)); + error = bpf_setdlt(d, dlt); + } + break; /* * Get interface name. */ - case BIOCGETIF: + case BIOCGETIF: /* struct ifreq */ if (d->bd_bif == 0) error = EINVAL; else { struct ifnet *const ifp = d->bd_bif->bif_ifp; - struct ifreq *const ifr = (struct ifreq *)addr; - snprintf(ifr->ifr_name, sizeof(ifr->ifr_name), - "%s%d", ifp->if_name, ifp->if_unit); + snprintf(((struct ifreq *)(void *)addr)->ifr_name, + sizeof (ifr.ifr_name), "%s%d", ifp->if_name, + ifp->if_unit); } break; /* * Set interface. */ - case BIOCSETIF: { + case BIOCSETIF: { /* struct ifreq */ ifnet_t ifp; - ifp = ifunit(((struct ifreq *)addr)->ifr_name); + + bcopy(addr, &ifr, sizeof (ifr)); + ifr.ifr_name[IFNAMSIZ - 1] = '\0'; + ifp = ifunit(ifr.ifr_name); if (ifp == NULL) error = ENXIO; else @@ -1337,122 +1386,145 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, /* * Set read timeout. */ - case BIOCSRTIMEOUT32: - { - struct user32_timeval *_tv = (struct user32_timeval *)addr; - struct timeval tv; + case BIOCSRTIMEOUT32: { /* struct user32_timeval */ + struct user32_timeval _tv; + struct timeval tv; - tv.tv_sec = _tv->tv_sec; - tv.tv_usec = _tv->tv_usec; + bcopy(addr, &_tv, sizeof (_tv)); + tv.tv_sec = _tv.tv_sec; + tv.tv_usec = _tv.tv_usec; + + /* + * Subtract 1 tick from tvtohz() since this isn't + * a one-shot timer. + */ + if ((error = itimerfix(&tv)) == 0) + d->bd_rtout = tvtohz(&tv) - 1; + break; + } + + case BIOCSRTIMEOUT64: { /* struct user64_timeval */ + struct user64_timeval _tv; + struct timeval tv; + + bcopy(addr, &_tv, sizeof (_tv)); + tv.tv_sec = _tv.tv_sec; + tv.tv_usec = _tv.tv_usec; + + /* + * Subtract 1 tick from tvtohz() since this isn't + * a one-shot timer. + */ + if ((error = itimerfix(&tv)) == 0) + d->bd_rtout = tvtohz(&tv) - 1; + break; + } - /* - * Subtract 1 tick from tvtohz() since this isn't - * a one-shot timer. - */ - if ((error = itimerfix(&tv)) == 0) - d->bd_rtout = tvtohz(&tv) - 1; - break; - } - - case BIOCSRTIMEOUT64: - { - struct user64_timeval *_tv = (struct user64_timeval *)addr; - struct timeval tv; - - tv.tv_sec = _tv->tv_sec; - tv.tv_usec = _tv->tv_usec; - - /* - * Subtract 1 tick from tvtohz() since this isn't - * a one-shot timer. - */ - if ((error = itimerfix(&tv)) == 0) - d->bd_rtout = tvtohz(&tv) - 1; - break; - } - /* * Get read timeout. 
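+ *
+ * (bd_rtout is kept in clock ticks, which is why the handlers below
+ * convert with hz and tick when moving to and from the user-visible
+ * struct timeval.)
 */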
*/ - case BIOCGRTIMEOUT32: - { - struct user32_timeval *tv = (struct user32_timeval *)addr; + case BIOCGRTIMEOUT32: { /* struct user32_timeval */ + struct user32_timeval tv; - tv->tv_sec = d->bd_rtout / hz; - tv->tv_usec = (d->bd_rtout % hz) * tick; - break; - } + bzero(&tv, sizeof (tv)); + tv.tv_sec = d->bd_rtout / hz; + tv.tv_usec = (d->bd_rtout % hz) * tick; + bcopy(&tv, addr, sizeof (tv)); + break; + } - case BIOCGRTIMEOUT64: - { - struct user64_timeval *tv = (struct user64_timeval *)addr; + case BIOCGRTIMEOUT64: { /* struct user64_timeval */ + struct user64_timeval tv; - tv->tv_sec = d->bd_rtout / hz; - tv->tv_usec = (d->bd_rtout % hz) * tick; - break; - } + bzero(&tv, sizeof (tv)); + tv.tv_sec = d->bd_rtout / hz; + tv.tv_usec = (d->bd_rtout % hz) * tick; + bcopy(&tv, addr, sizeof (tv)); + break; + } /* * Get packet stats. */ - case BIOCGSTATS: - { - struct bpf_stat *bs = (struct bpf_stat *)addr; + case BIOCGSTATS: { /* struct bpf_stat */ + struct bpf_stat bs; - bs->bs_recv = d->bd_rcount; - bs->bs_drop = d->bd_dcount; - break; - } + bzero(&bs, sizeof (bs)); + bs.bs_recv = d->bd_rcount; + bs.bs_drop = d->bd_dcount; + bcopy(&bs, addr, sizeof (bs)); + break; + } /* * Set immediate mode. */ - case BIOCIMMEDIATE: - d->bd_immediate = *(u_int *)addr; + case BIOCIMMEDIATE: /* u_int */ + bcopy(addr, &d->bd_immediate, sizeof (u_int)); break; - case BIOCVERSION: - { - struct bpf_version *bv = (struct bpf_version *)addr; + case BIOCVERSION: { /* struct bpf_version */ + struct bpf_version bv; - bv->bv_major = BPF_MAJOR_VERSION; - bv->bv_minor = BPF_MINOR_VERSION; - break; - } + bzero(&bv, sizeof (bv)); + bv.bv_major = BPF_MAJOR_VERSION; + bv.bv_minor = BPF_MINOR_VERSION; + bcopy(&bv, addr, sizeof (bv)); + break; + } /* * Get "header already complete" flag */ - case BIOCGHDRCMPLT: - *(u_int *)addr = d->bd_hdrcmplt; + case BIOCGHDRCMPLT: /* u_int */ + bcopy(&d->bd_hdrcmplt, addr, sizeof (u_int)); break; /* * Set "header already complete" flag */ - case BIOCSHDRCMPLT: - d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0; + case BIOCSHDRCMPLT: /* u_int */ + bcopy(addr, &int_arg, sizeof (int_arg)); + d->bd_hdrcmplt = int_arg ? 
1 : 0; break; /* * Get "see sent packets" flag */ - case BIOCGSEESENT: - *(u_int *)addr = d->bd_seesent; + case BIOCGSEESENT: /* u_int */ + bcopy(&d->bd_seesent, addr, sizeof (u_int)); break; /* * Set "see sent packets" flag */ - case BIOCSSEESENT: - d->bd_seesent = *(u_int *)addr; + case BIOCSSEESENT: /* u_int */ + bcopy(addr, &d->bd_seesent, sizeof (u_int)); + break; + + /* + * Set traffic service class + */ + case BIOCSETTC: { /* int */ + int tc; + + bcopy(addr, &tc, sizeof (int)); + error = bpf_set_traffic_class(d, tc); break; + } - case FIONBIO: /* Non-blocking I/O */ + /* + * Get traffic service class + */ + case BIOCGETTC: /* int */ + bcopy(&d->bd_traffic_class, addr, sizeof (int)); break; - case FIOASYNC: /* Send signal on receive packets */ - d->bd_async = *(int *)addr; + case FIONBIO: /* Non-blocking I/O; int */ + break; + + case FIOASYNC: /* Send signal on receive packets; int */ + bcopy(addr, &d->bd_async, sizeof (int)); break; #ifndef __APPLE__ case FIOSETOWN: @@ -1473,23 +1545,25 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, *(int *)addr = -fgetown(d->bd_sigio); break; #endif - case BIOCSRSIG: /* Set receive signal */ - { - u_int sig; + case BIOCSRSIG: { /* Set receive signal; u_int */ + u_int sig; - sig = *(u_int *)addr; + bcopy(addr, &sig, sizeof (u_int)); - if (sig >= NSIG) - error = EINVAL; - else - d->bd_sig = sig; - break; - } - case BIOCGRSIG: - *(u_int *)addr = d->bd_sig; + if (sig >= NSIG) + error = EINVAL; + else + d->bd_sig = sig; break; } - + case BIOCGRSIG: /* u_int */ + bcopy(&d->bd_sig, addr, sizeof (u_int)); + break; + case BIOCSEXTHDR: + bcopy(addr, &d->bd_extendedhdr, sizeof (u_int)); + break; + } + lck_mtx_unlock(bpf_mlock); return (error); @@ -1592,18 +1666,20 @@ bpf_setif(struct bpf_d *d, ifnet_t theywant, u_int32_t dlt) * Get a list of available data link type of the interface. */ static int -bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl, struct proc *p) +bpf_getdltlist(struct bpf_d *d, caddr_t addr, struct proc *p) { u_int n; int error; struct ifnet *ifp; struct bpf_if *bp; user_addr_t dlist; + struct bpf_dltlist bfl; + bcopy(addr, &bfl, sizeof (bfl)); if (proc_is64bit(p)) { - dlist = (user_addr_t)bfl->bfl_u.bflu_pad; + dlist = (user_addr_t)bfl.bfl_u.bflu_pad; } else { - dlist = CAST_USER_ADDR_T(bfl->bfl_u.bflu_list); + dlist = CAST_USER_ADDR_T(bfl.bfl_u.bflu_list); } ifp = d->bd_bif->bif_ifp; @@ -1613,16 +1689,20 @@ bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl, struct proc *p) if (bp->bif_ifp != ifp) continue; if (dlist != USER_ADDR_NULL) { - if (n >= bfl->bfl_len) { + if (n >= bfl.bfl_len) { return (ENOMEM); } error = copyout(&bp->bif_dlt, dlist, sizeof (bp->bif_dlt)); + if (error != 0) + break; dlist += sizeof (bp->bif_dlt); } n++; } - bfl->bfl_len = n; + bfl.bfl_len = n; + bcopy(&bfl, addr, sizeof (bfl)); + return (error); } @@ -1669,6 +1749,29 @@ bpf_setdlt(struct bpf_d *d, uint32_t dlt) return (bp == NULL ? 
EINVAL : 0); } +static int +bpf_set_traffic_class(struct bpf_d *d, int tc) +{ + int error = 0; + + if (!SO_VALID_TC(tc)) + error = EINVAL; + else + d->bd_traffic_class = tc; + + return (error); +} + +static void +bpf_set_packet_service_class(struct mbuf *m, int tc) +{ + if (!(m->m_flags & M_PKTHDR)) + return; + + VERIFY(SO_VALID_TC(tc)); + (void) m_set_service_class(m, so_tc2msc(tc)); +} + /* * Support for select() * @@ -1843,17 +1946,6 @@ filt_bpfread(struct knote *kn, long hint) return (ready); } -static inline void* -_cast_non_const(const void * ptr) { - union { - const void* cval; - void* val; - } ret; - - ret.cval = ptr; - return (ret.val); -} - /* * Copy data from an mbuf chain into a buffer. This code is derived * from m_copydata in sys/uipc_mbuf.c. @@ -1861,7 +1953,7 @@ _cast_non_const(const void * ptr) { static void bpf_mcopy(const void *src_arg, void *dst_arg, size_t len) { - struct mbuf *m = _cast_non_const(src_arg); + struct mbuf *m = (struct mbuf *)(uintptr_t)(src_arg); u_int count; u_char *dst; @@ -1884,9 +1976,10 @@ bpf_tap_imp( mbuf_t m, void* hdr, size_t hlen, - int outbound) + int outbound) { struct bpf_if *bp; + struct mbuf *savedm = m; /* * It's possible that we get here after the bpf descriptor has been @@ -1953,7 +2046,8 @@ bpf_tap_imp( if (mac_bpfdesc_check_receive(d, bp->bif_ifp) != 0) continue; #endif - catchpacket(d, (u_char *)m, pktlen, slen, bpf_mcopy); + catchpacket(d, (u_char *)m, savedm, pktlen, + slen, outbound, bpf_mcopy); } } } @@ -1999,13 +2093,19 @@ static int bpf_tap_callback(struct ifnet *ifp, struct mbuf *m) * pkt is really an mbuf. */ static void -catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, +catchpacket(struct bpf_d *d, u_char *pkt, struct mbuf *m, u_int pktlen, + u_int snaplen, int outbound, void (*cpfn)(const void *, void *, size_t)) { struct bpf_hdr *hp; + struct bpf_hdr_ext *ehp; int totlen, curlen; - int hdrlen = d->bd_bif->bif_hdrlen; + int hdrlen, caplen; int do_wakeup = 0; + u_char *payload; + + hdrlen = d->bd_extendedhdr ? d->bd_bif->bif_exthdrlen : + d->bd_bif->bif_hdrlen; /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that @@ -2049,17 +2149,41 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, /* * Append the bpf header. 
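+ *
+ * Userland consumes the store buffer with the same arithmetic used
+ * below to lay packets down; a sketch of the read-side loop, where
+ * buf, buflen and deliver() are hypothetical and stand in for the
+ * consumer's own buffer and handler:
+ *
+ *	char *p = buf;
+ *	while (p < buf + buflen) {
+ *		struct bpf_hdr *hp = (struct bpf_hdr *)(void *)p;
+ *		deliver(p + hp->bh_hdrlen, hp->bh_caplen);
+ *		p += BPF_WORDALIGN(hp->bh_hdrlen + hp->bh_caplen);
+ *	}
 */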
*/ - hp = (struct bpf_hdr *)(d->bd_sbuf + curlen); struct timeval tv; microtime(&tv); - hp->bh_tstamp.tv_sec = tv.tv_sec; - hp->bh_tstamp.tv_usec = tv.tv_usec; - hp->bh_datalen = pktlen; - hp->bh_hdrlen = hdrlen; + if (d->bd_extendedhdr) { + ehp = (struct bpf_hdr_ext *)(void *)(d->bd_sbuf + curlen); + memset(ehp, 0, sizeof(*ehp)); + ehp->bh_tstamp.tv_sec = tv.tv_sec; + ehp->bh_tstamp.tv_usec = tv.tv_usec; + ehp->bh_datalen = pktlen; + ehp->bh_hdrlen = hdrlen; + ehp->bh_caplen = totlen - hdrlen; + if (outbound) { + if (m->m_pkthdr.m_fhflags & PF_TAG_FLOWHASH) + ehp->bh_flowhash = m->m_pkthdr.m_flowhash; + ehp->bh_svc = so_svc2tc(m->m_pkthdr.svc); + ehp->bh_flags |= BPF_HDR_EXT_FLAGS_DIR_OUT; + if (m->m_pkthdr.m_fhflags & PF_TAG_TCP) + ehp->bh_flags |= BPF_HDR_EXT_FLAGS_TCP; + } else + ehp->bh_flags |= BPF_HDR_EXT_FLAGS_DIR_IN; + payload = (u_char *)ehp + hdrlen; + caplen = ehp->bh_caplen; + } else { + hp = (struct bpf_hdr *)(void *)(d->bd_sbuf + curlen); + hp->bh_tstamp.tv_sec = tv.tv_sec; + hp->bh_tstamp.tv_usec = tv.tv_usec; + hp->bh_datalen = pktlen; + hp->bh_hdrlen = hdrlen; + hp->bh_caplen = totlen - hdrlen; + payload = (u_char *)hp + hdrlen; + caplen = hp->bh_caplen; + } /* * Copy the packet data into the store buffer and update its length. */ - (*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen)); + (*cpfn)(pkt, payload, caplen); d->bd_slen = curlen + totlen; if (do_wakeup) @@ -2180,6 +2304,8 @@ bpf_attach( * performance reasons and to alleviate alignment restrictions). */ bp_new->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen; + bp_new->bif_exthdrlen = BPF_WORDALIGN(hdrlen + + sizeof(struct bpf_hdr_ext)) - hdrlen; /* Take a reference on the interface */ ifnet_reference(ifp); @@ -2258,18 +2384,10 @@ bpf_init(__unused void *unused) bpf_mlock_attr = lck_attr_alloc_init(); - bpf_mlock = lck_mtx_alloc_init(bpf_mlock_grp, bpf_mlock_attr); + lck_mtx_init(bpf_mlock, bpf_mlock_grp, bpf_mlock_attr); - if (bpf_mlock == 0) { - printf("bpf_init: failed to allocate bpf_mlock\n"); - bpf_devsw_installed = 0; - return; - } - maj = cdevsw_add(CDEV_MAJOR, &bpf_cdevsw); if (maj == -1) { - if (bpf_mlock) - lck_mtx_free(bpf_mlock, bpf_mlock_grp); if (bpf_mlock_attr) lck_attr_free(bpf_mlock_attr); if (bpf_mlock_grp) diff --git a/bsd/net/bpf.h b/bsd/net/bpf.h index 92a5f31a0..3ed4d951f 100644 --- a/bsd/net/bpf.h +++ b/bsd/net/bpf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,6 +76,7 @@ #ifndef _NET_BPF_H_ #define _NET_BPF_H_ +#include #include #include #include @@ -193,6 +194,11 @@ struct bpf_version { #define BIOCSSEESENT _IOW('B',119, u_int) #define BIOCSDLT _IOW('B',120, u_int) #define BIOCGDLTLIST _IOWR('B',121, struct bpf_dltlist) +#ifdef PRIVATE +#define BIOCGETTC _IOR('B', 122, int) +#define BIOCSETTC _IOW('B', 123, int) +#define BIOCSEXTHDR _IOW('B', 124, u_int) +#endif /* PRIVATE */ /* * Structure prepended to each packet. @@ -204,15 +210,36 @@ struct bpf_hdr { u_short bh_hdrlen; /* length of bpf header (this struct plus alignment padding) */ }; +#ifdef KERNEL /* * Because the structure above is not a multiple of 4 bytes, some compilers * will insist on inserting padding; hence, sizeof(struct bpf_hdr) won't work. * Only the kernel needs to know about it; applications use bh_hdrlen. */ -#ifdef KERNEL #define SIZEOF_BPF_HDR (sizeof(struct bpf_hdr) <= 20 ? 
18 : \ sizeof(struct bpf_hdr)) #endif +#ifdef PRIVATE +/* + * This structure must be a multiple of 4 bytes. + * It includes padding and spare fields that we can use later if desired. + */ +struct bpf_hdr_ext { + struct BPF_TIMEVAL bh_tstamp; /* time stamp */ + bpf_u_int32 bh_caplen; /* length of captured portion */ + bpf_u_int32 bh_datalen; /* original length of packet */ + u_short bh_hdrlen; /* length of bpf header */ + u_short bh_flags; +#define BPF_HDR_EXT_FLAGS_DIR_IN 0x0000 +#define BPF_HDR_EXT_FLAGS_DIR_OUT 0x0001 +#define BPF_HDR_EXT_FLAGS_TCP 0x0002 + pid_t bh_pid; /* process PID */ + char bh_comm[MAXCOMLEN+1]; /* process command */ + u_char _bh_pad2[3]; + bpf_u_int32 bh_svc; /* service class */ + bpf_u_int32 bh_flowhash; /* kernel reserved; 0 in userland */ +}; +#endif /* PRIVATE */ /* * Data-link level type codes. diff --git a/bsd/net/bpf_filter.c b/bsd/net/bpf_filter.c index 69d35371f..3ec0f2866 100644 --- a/bsd/net/bpf_filter.c +++ b/bsd/net/bpf_filter.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -68,18 +68,21 @@ */ #include +#include #ifdef sun #include #endif -#if defined(sparc) || defined(mips) || defined(ibm032) || defined(__alpha__) -#define BPF_ALIGN -#endif +#if !defined(__i386__) && !defined(__x86_64__) +#define BPF_ALIGN 1 +#else /* defined(__i386__) || defined(__x86_64__) */ +#define BPF_ALIGN 0 +#endif /* defined(__i386__) || defined(__x86_64__) */ -#ifndef BPF_ALIGN -#define EXTRACT_SHORT(p) ((u_int16_t)ntohs(*(u_int16_t *)p)) -#define EXTRACT_LONG(p) (ntohl(*(u_int32_t *)p)) +#if !BPF_ALIGN +#define EXTRACT_SHORT(p) ((u_int16_t)ntohs(*(u_int16_t *)(void *)p)) +#define EXTRACT_LONG(p) (ntohl(*(u_int32_t *)(void *)p)) #else #define EXTRACT_SHORT(p)\ ((u_int16_t)\ @@ -211,6 +214,8 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) register bpf_u_int32 k; int32_t mem[BPF_MEMWORDS]; + bzero(mem, sizeof(mem)); + if (pc == 0) /* * No filter means accept all. 
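Taken together, the new PRIVATE ioctls (BIOCSETTC, BIOCSEXTHDR) and struct bpf_hdr_ext change what a privileged capture client reads from a BPF device. A minimal user-space sketch, assuming the PRIVATE definitions above are visible to the client, that /dev/bpf0 and en0 exist, and that traffic class 0 is one the kernel's SO_VALID_TC check accepts (the real constants live in the private sys/socket.h); not part of this patch:

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/bpf.h>

int
main(void)
{
	u_int on = 1, bufsize;
	int fd, tc = 0;		/* assumed valid per SO_VALID_TC */
	char buf[65536], *p;
	struct ifreq ifr;
	ssize_t n;

	if ((fd = open("/dev/bpf0", O_RDONLY)) == -1)
		err(1, "open");
	memset(&ifr, 0, sizeof (ifr));
	strlcpy(ifr.ifr_name, "en0", sizeof (ifr.ifr_name));
	if (ioctl(fd, BIOCSETIF, &ifr) == -1)
		err(1, "BIOCSETIF");
	if (ioctl(fd, BIOCGBLEN, &bufsize) == -1 || bufsize > sizeof (buf))
		err(1, "BIOCGBLEN");
	if (ioctl(fd, BIOCSEXTHDR, &on) == -1)	/* ask for bpf_hdr_ext records */
		err(1, "BIOCSEXTHDR");
	if (ioctl(fd, BIOCSETTC, &tc) == -1)	/* traffic class for writes */
		err(1, "BIOCSETTC");

	while ((n = read(fd, buf, bufsize)) > 0) {
		for (p = buf; p < buf + n; ) {
			struct bpf_hdr_ext *eh =
			    (struct bpf_hdr_ext *)(void *)p;

			printf("%s len %u svc %u flowhash 0x%x\n",
			    (eh->bh_flags & BPF_HDR_EXT_FLAGS_DIR_OUT) ?
			    "out" : "in", eh->bh_datalen, eh->bh_svc,
			    eh->bh_flowhash);
			/* records stay word-aligned, as with plain bpf_hdr */
			p += BPF_WORDALIGN(eh->bh_hdrlen + eh->bh_caplen);
		}
	}
	return (0);
}

As catchpacket() above shows, bh_flowhash and the TCP flag are only filled in for outbound packets, so an inbound record carries just the direction flag.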
@@ -242,7 +247,7 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) if (buflen != 0) return 0; - A = m_xword((struct mbuf *)p, k, &merr); + A = m_xword((struct mbuf *)(void *)p, k, &merr); if (merr != 0) return 0; continue; @@ -255,7 +260,7 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) A = EXTRACT_LONG(&p[k]); else #endif - A = ntohl(*(int32_t *)(p + k)); + A = ntohl(*(int32_t *)(void *)(p + k)); continue; case BPF_LD|BPF_H|BPF_ABS: @@ -266,7 +271,7 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) if (buflen != 0) return 0; - A = m_xhalf((struct mbuf *)p, k, &merr); + A = m_xhalf((struct mbuf *)(void *)p, k, &merr); continue; #else return 0; @@ -283,7 +288,7 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) if (buflen != 0) return 0; - m = (struct mbuf *)p; + m = (struct mbuf *)(void *)p; MINDEX(m, k); A = mtod(m, u_char *)[k]; continue; @@ -311,7 +316,7 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) if (buflen != 0) return 0; - A = m_xword((struct mbuf *)p, k, &merr); + A = m_xword((struct mbuf *)(void *)p, k, &merr); if (merr != 0) return 0; continue; @@ -324,7 +329,7 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) A = EXTRACT_LONG(&p[k]); else #endif - A = ntohl(*(int32_t *)(p + k)); + A = ntohl(*(int32_t *)(void *)(p + k)); continue; case BPF_LD|BPF_H|BPF_IND: @@ -336,7 +341,7 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) if (buflen != 0) return 0; - A = m_xhalf((struct mbuf *)p, k, &merr); + A = m_xhalf((struct mbuf *)(void *)p, k, &merr); if (merr != 0) return 0; continue; @@ -355,7 +360,7 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) if (buflen != 0) return 0; - m = (struct mbuf *)p; + m = (struct mbuf *)(void *)p; MINDEX(m, k); A = mtod(m, u_char *)[k]; continue; @@ -374,7 +379,7 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) if (buflen != 0) return 0; - m = (struct mbuf *)p; + m = (struct mbuf *)(void *)p; MINDEX(m, k); X = (mtod(m, u_char *)[k] & 0xf) << 2; continue; diff --git a/bsd/net/bpfdesc.h b/bsd/net/bpfdesc.h index e0507f935..d96300bc6 100644 --- a/bsd/net/bpfdesc.h +++ b/bsd/net/bpfdesc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -132,6 +132,8 @@ struct bpf_d { #if CONFIG_MACF_NET struct label * bd_label; /* MAC label for descriptor */ #endif + int bd_traffic_class; /* traffic service class */ + int bd_extendedhdr; /* process req. 
the extended header */ }; /* Values for bd_state */ @@ -154,6 +156,7 @@ struct bpf_if { struct bpf_d *bif_dlist; /* descriptor list */ u_int bif_dlt; /* link layer type */ u_int bif_hdrlen; /* length of header (with padding) */ + u_int bif_exthdrlen; /* length of ext header */ struct ifnet *bif_ifp; /* corresponding interface */ bpf_send_func bif_send; bpf_tap_func bif_tap; diff --git a/bsd/net/bridgestp.c b/bsd/net/bridgestp.c index 1d6922f28..c7fc659a7 100644 --- a/bsd/net/bridgestp.c +++ b/bsd/net/bridgestp.c @@ -94,7 +94,8 @@ #include -static lck_mtx_t *bstp_task_mtx = NULL; +decl_lck_mtx_data(static, bstp_task_mtx_data); +static lck_mtx_t *bstp_task_mtx = &bstp_task_mtx_data; static lck_grp_t *bstp_task_grp = NULL; static lck_attr_t *bstp_task_attr = NULL; static thread_t bstp_task_thread; @@ -142,10 +143,9 @@ static void bstp_task_drain(struct bstp_task *); #define INFO_SAME 0 #define INFO_WORSE -1 -const uint8_t bstp_etheraddr[] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; - LIST_HEAD(, bstp_state) bstp_list; -static lck_mtx_t *bstp_list_mtx; +decl_lck_mtx_data(static, bstp_list_mtx_data); +static lck_mtx_t *bstp_list_mtx = &bstp_list_mtx_data; static lck_grp_t *bstp_lock_grp = NULL; static lck_attr_t *bstp_lock_attr = NULL; @@ -2326,7 +2326,7 @@ bstp_sys_init(void) #if BRIDGE_DEBUG lck_attr_setdebug(bstp_lock_attr); #endif - bstp_list_mtx = lck_mtx_alloc_init(bstp_lock_grp, bstp_lock_attr); + lck_mtx_init(bstp_list_mtx, bstp_lock_grp, bstp_lock_attr); lck_grp_attr_free(lck_grp_attr); LIST_INIT(&bstp_list); @@ -2349,7 +2349,7 @@ bstp_create_task_thread(void) #if BRIDGE_DEBUG lck_attr_setdebug(bstp_task_attr); #endif - bstp_task_mtx = lck_mtx_alloc_init(bstp_lock_grp, bstp_lock_attr); + lck_mtx_init(bstp_task_mtx, bstp_lock_grp, bstp_lock_attr); lck_grp_attr_free(lck_grp_attr); error = kernel_thread_start((thread_continue_t)bstp_task_thread_func, NULL, &bstp_task_thread); diff --git a/bsd/net/bridgestp.h b/bsd/net/bridgestp.h index a70f7aaba..412fface8 100644 --- a/bsd/net/bridgestp.h +++ b/bsd/net/bridgestp.h @@ -408,8 +408,6 @@ struct bstp_state { bstp_rtage_cb_t bs_rtage_cb; }; -extern const uint8_t bstp_etheraddr[]; - void bstp_attach(struct bstp_state *, struct bstp_cb_ops *); void bstp_detach(struct bstp_state *); void bstp_init(struct bstp_state *); diff --git a/bsd/crypto/des/Makefile b/bsd/net/classq/Makefile similarity index 51% rename from bsd/crypto/des/Makefile rename to bsd/net/classq/Makefile index 2eee6301a..9e99d6fbe 100644 --- a/bsd/crypto/des/Makefile +++ b/bsd/net/classq/Makefile @@ -3,36 +3,42 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_I386 = \ - -INSTINC_SUBDIRS_X86_64 = \ +INSTINC_SUBDIRS_PPC = \ -INSTINC_SUBDIRS_ARM = \ +INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ +EXPINC_SUBDIRS_PPC = \ + EXPINC_SUBDIRS_I386 = \ -EXPINC_SUBDIRS_X86_64 = \ +DATAFILES= \ -EXPINC_SUBDIRS_ARM = \ +KERNELFILES= \ PRIVATE_DATAFILES = \ - des.h + classq.h classq_blue.h classq_red.h classq_rio.h classq_sfb.h \ + if_classq.h + +PRIVATE_KERNELFILES = ${KERNELFILES} -INSTALL_MI_DIR = crypto +INSTALL_MI_LIST = ${DATAFILES} + +INSTALL_MI_DIR = net/classq + +EXPORT_MI_LIST = ${INSTALL_MI_LIST} ${KERNELFILES} EXPORT_MI_DIR = ${INSTALL_MI_DIR} -INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} +INSTALL_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} + +INSTALL_KF_MI_LCL_LIST = ${INSTALL_MI_LCL_LIST} 
${PRIVATE_KERNELFILES} include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/bsd/net/classq/classq.c b/bsd/net/classq/classq.c new file mode 100644 index 000000000..92b76007f --- /dev/null +++ b/bsd/net/classq/classq.c @@ -0,0 +1,366 @@ +/* + * Copyright (c) 2007-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
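The new classq.c that follows implements the generic class_queue_t primitives (_qinit, _addq, _getq and friends) that the AQM disciplines added later in this patch build on. A hedged sketch of the intended call pattern, assuming kernel context and the declarations from the net/classq headers below; the function names here are illustrative, not part of the patch:

static class_queue_t cq;

static void
classq_setup(void)
{
	_qinit(&cq, Q_DROPTAIL, DEFAULT_QLIMIT);	/* tail-drop, 128 packets */
}

static int
enqueue_or_drop(struct mbuf *m)
{
	if (qlen(&cq) >= qlimit(&cq)) {
		m_freem(m);		/* hard limit hit: drop */
		return (ENOBUFS);
	}
	_addq(&cq, m);			/* tail enqueue; updates qlen/qsize */
	return (0);
}

static void
drain(void)
{
	struct mbuf *m;

	while ((m = _getq(&cq)) != NULL)	/* head dequeue */
		m_freem(m);
	VERIFY(qempty(&cq));
}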
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +u_int32_t classq_verbose; /* more noise if greater than 1 */ + +SYSCTL_NODE(_net, OID_AUTO, classq, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "classq"); + +SYSCTL_UINT(_net_classq, OID_AUTO, verbose, CTLFLAG_RW|CTLFLAG_LOCKED, + &classq_verbose, 0, "Class queue verbosity level"); + +void +_qinit(class_queue_t *q, int type, int lim) +{ + MBUFQ_INIT(&q->mbufq); + qlimit(q) = lim; + qlen(q) = 0; + qsize(q) = 0; + qtype(q) = type; + qstate(q) = QS_RUNNING; +} + +/* add a packet at the tail of the queue */ +void +_addq(class_queue_t *q, struct mbuf *m) +{ + MBUFQ_ENQUEUE(&q->mbufq, m); + qlen(q)++; + VERIFY(qlen(q) != 0); + qsize(q) += m_length(m); +} + +/* add one or more packets at the tail of the queue */ +void +_addq_multi(class_queue_t *q, struct mbuf *m_head, struct mbuf *m_tail, + u_int32_t cnt, u_int32_t size) +{ + MBUFQ_ENQUEUE_MULTI(&q->mbufq, m_head, m_tail); + qlen(q) += cnt; + qsize(q) += size; +} + +/* get a packet at the head of the queue */ +struct mbuf * +_getq(class_queue_t *q) +{ + struct mbuf *m; + + MBUFQ_DEQUEUE(&q->mbufq, m); + if (m == NULL) { + VERIFY(qlen(q) == 0); + if (qsize(q) > 0) + qsize(q) = 0; + return (NULL); + } + VERIFY(qlen(q) > 0); + qlen(q)--; + + /* qsize is an approximation, so adjust if necessary */ + if (((int)qsize(q) - m_length(m)) > 0) + qsize(q) -= m_length(m); + else if (qsize(q) != 0) + qsize(q) = 0; + + return (m); +} + +/* get a packet of a specific flow beginning from the head of the queue */ +struct mbuf * +_getq_flow(class_queue_t *q, u_int32_t flow) +{ + struct mbuf *m, *m_tmp; + + MBUFQ_FOREACH_SAFE(m, &q->mbufq, m_tmp) { + if (flow == 0 || ((m->m_flags & M_PKTHDR) && + m->m_pkthdr.m_flowhash == flow)) { + /* remove it from the class queue */ + MBUFQ_REMOVE(&q->mbufq, m); + MBUFQ_NEXT(m) = NULL; + break; + } + } + + if (m != NULL) { + u_int32_t l = m_length(m); + + VERIFY(qlen(q) > 0); + qlen(q)--; + + /* qsize is an approximation, so adjust if necessary */ + if (((int)qsize(q) - l) > 0) + qsize(q) -= l; + else if (qsize(q) != 0) + qsize(q) = 0; + } + + return (m); +} + +/* get all packets starting from the head of the queue */ +struct mbuf * +_getq_all(class_queue_t *q) +{ + struct mbuf *m; + + m = MBUFQ_FIRST(&q->mbufq); + MBUFQ_INIT(&q->mbufq); + qlen(q) = 0; + qsize(q) = 0; + + return (m); +} + +/* drop a packet at the tail of the queue */ +struct mbuf * +_getq_tail(class_queue_t *q) +{ + struct mq_head *head = &q->mbufq; + struct mbuf *m = MBUFQ_LAST(head); + + if (m != NULL) { + struct mbuf *n = MBUFQ_FIRST(head); + + while (n != NULL) { + struct mbuf *next = MBUFQ_NEXT(n); + if (next == m) { + MBUFQ_NEXT(n) = NULL; + break; + } + n = next; + } + VERIFY(n != NULL || + (qlen(q) == 1 && m == MBUFQ_FIRST(head))); + VERIFY(qlen(q) > 0); + --qlen(q); + + /* qsize is an approximation, so adjust if necessary */ + if (((int)qsize(q) - m_length(m)) > 0) + qsize(q) -= m_length(m); + else if (qsize(q) != 0) + qsize(q) = 0; + + if (qempty(q)) { + VERIFY(MBUFQ_EMPTY(head)); + MBUFQ_INIT(head); + } else { + VERIFY(n != NULL); + head->mq_last = &MBUFQ_NEXT(n); + } + } + return (m); +} + +/* randomly select a packet in the queue */ +struct mbuf * +_getq_random(class_queue_t *q) +{ + struct mq_head *head = &q->mbufq; + struct mbuf *m = NULL; + unsigned int n; + u_int32_t rnd; + + n = qlen(q); + if (n == 0) { + VERIFY(MBUFQ_EMPTY(head)); + if (qsize(q) > 0) + qsize(q) = 0; + return (NULL); + } + + m = MBUFQ_FIRST(head); + read_random(&rnd, 
sizeof (rnd)); + n = (rnd % n) + 1; + + if (n == 1) { + if ((MBUFQ_FIRST(head) = MBUFQ_NEXT(m)) == NULL) + (head)->mq_last = &MBUFQ_FIRST(head); + } else { + struct mbuf *p = NULL; + + VERIFY(n > 1); + while (n--) { + if (MBUFQ_NEXT(m) == NULL) + break; + p = m; + m = MBUFQ_NEXT(m); + } + VERIFY(p != NULL && MBUFQ_NEXT(p) == m); + + if ((MBUFQ_NEXT(p) = MBUFQ_NEXT(m)) == NULL) + (head)->mq_last = &MBUFQ_NEXT(p); + } + + VERIFY(qlen(q) > 0); + --qlen(q); + + /* qsize is an approximation, so adjust if necessary */ + if (((int)qsize(q) - m_length(m)) > 0) + qsize(q) -= m_length(m); + else if (qsize(q) != 0) + qsize(q) = 0; + + MBUFQ_NEXT(m) = NULL; + + return (m); +} + +/* remove a packet from the queue */ +void +_removeq(class_queue_t *q, struct mbuf *m) +{ + struct mq_head *head = &q->mbufq; + struct mbuf *m0, **mtail; + + m0 = MBUFQ_FIRST(head); + if (m0 == NULL) + return; + + if (m0 != m) { + while (MBUFQ_NEXT(m0) != m) { + if (m0 == NULL) + return; + m0 = MBUFQ_NEXT(m0); + } + mtail = &MBUFQ_NEXT(m0); + } else { + mtail = &MBUFQ_FIRST(head); + } + + *mtail = MBUFQ_NEXT(m); + if (*mtail == NULL) + head->mq_last = mtail; + + VERIFY(qlen(q) > 0); + --qlen(q); + + /* qsize is an approximation, so adjust if necessary */ + if (((int)qsize(q) - m_length(m)) > 0) + qsize(q) -= m_length(m); + else if (qsize(q) != 0) + qsize(q) = 0; + + MBUFQ_NEXT(m) = NULL; +} + +void +_flushq(class_queue_t *q) +{ + (void) _flushq_flow(q, 0, NULL, NULL); +} + +void +_flushq_flow(class_queue_t *q, u_int32_t flow, u_int32_t *cnt, u_int32_t *len) +{ + MBUFQ_HEAD(mq_freeq) freeq; + struct mbuf *m, *m_tmp; + u_int32_t c = 0, l = 0; + + MBUFQ_INIT(&freeq); + + MBUFQ_FOREACH_SAFE(m, &q->mbufq, m_tmp) { + if (flow == 0 || ((m->m_flags & M_PKTHDR) && + m->m_pkthdr.m_flowhash == flow)) { + /* remove it from the class queue */ + MBUFQ_REMOVE(&q->mbufq, m); + MBUFQ_NEXT(m) = NULL; + + /* and add it to the free queue */ + MBUFQ_ENQUEUE(&freeq, m); + + l += m_length(m); + c++; + } + } + VERIFY(c == 0 || !MBUFQ_EMPTY(&freeq)); + + if (c > 0) { + VERIFY(qlen(q) >= c); + qlen(q) -= c; + + /* qsize is an approximation, so adjust if necessary */ + if (((int)qsize(q) - l) > 0) + qsize(q) -= l; + else if (qsize(q) != 0) + qsize(q) = 0; + } + + if (!MBUFQ_EMPTY(&freeq)) + m_freem_list(MBUFQ_FIRST(&freeq)); + + if (cnt != NULL) + *cnt = c; + if (len != NULL) + *len = l; +} diff --git a/bsd/net/classq/classq.h b/bsd/net/classq/classq.h new file mode 100644 index 000000000..fa18ae4ca --- /dev/null +++ b/bsd/net/classq/classq.h @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2007-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
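The flow-aware variants (_getq_flow() above, _flushq_flow() below) match packets on m_pkthdr.m_flowhash, with flow 0 meaning "any packet". A hedged sketch of purging a single flow and accounting for what was dropped; the wrapper name is illustrative:

static void
purge_one_flow(class_queue_t *q, u_int32_t flowhash)
{
	u_int32_t cnt, len;

	/* drop every queued packet whose m_flowhash matches */
	_flushq_flow(q, flowhash, &cnt, &len);
	if (cnt > 0)
		printf("flushed %u packets (%u bytes) of flow 0x%x\n",
		    cnt, len, flowhash);
}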
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $NetBSD: altq_classq.h,v 1.7 2006/10/12 19:59:08 peter Exp $ */ +/* $KAME: altq_classq.h,v 1.6 2003/01/07 07:33:38 kjc Exp $ */ + +/* + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * class queue definitions extracted from rm_class.h. + */ +#ifndef _NET_CLASSQ_CLASSQ_H_ +#define _NET_CLASSQ_CLASSQ_H_ + +#ifdef PRIVATE +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Packet Queue types + */ +typedef enum classq_type { + Q_DROPHEAD, + Q_DROPTAIL, + Q_RED, + Q_RIO, + Q_BLUE, + Q_SFB +} classq_type_t; + +/* + * Packet Queue states + */ +typedef enum classq_state { + QS_RUNNING, + QS_SUSPENDED +} classq_state_t; + +#define DEFAULT_QLIMIT 128 /* default */ + +/* + * generic packet counter + */ +struct pktcntr { + u_int64_t packets; + u_int64_t bytes; +}; + +#ifdef BSD_KERNEL_PRIVATE +#include +#include +#include + +/* + * Packet Queue structures and macros to manipulate them. + */ +typedef struct _class_queue_ { + MBUFQ_HEAD(mq_head) mbufq; /* Packet queue */ + u_int32_t qlen; /* Queue length (in number of packets) */ + u_int32_t qsize; /* Approx. 
queue size (in number of bytes) */ + u_int32_t qlim; /* Queue limit (in number of packets*) */ + classq_type_t qtype; /* Queue type */ + classq_state_t qstate; /* Queue state */ +} class_queue_t; + +#define qtype(q) (q)->qtype /* Get queue type */ +#define qstate(q) (q)->qstate /* Get queue state */ +#define qlimit(q) (q)->qlim /* Max packets to be queued */ +#define qlen(q) (q)->qlen /* Current queue length. */ +#define qsize(q) (q)->qsize /* Approx. bytes in queue */ +/* #define qtail(q) MBUFQ_LAST(&(q)->mbufq) */ +#define qhead(q) MBUFQ_FIRST(&(q)->mbufq) + +#define qempty(q) (qlen(q) == 0) /* Is the queue empty?? */ +#define q_is_red(q) (qtype(q) == Q_RED) /* Is the queue a RED queue */ +#define q_is_rio(q) (qtype(q) == Q_RIO) /* Is the queue a RIO queue */ +#define q_is_blue(q) (qtype(q) == Q_BLUE) /* Is the queue a BLUE queue */ +#define q_is_sfb(q) (qtype(q) == Q_SFB) /* Is the queue a SFB queue */ +#define q_is_red_or_rio(q) (qtype(q) == Q_RED || qtype(q) == Q_RIO) +#define q_is_suspended(q) (qstate(q) == QS_SUSPENDED) + +#define PKTCNTR_ADD(_cntr, _pkt, _len) do { \ + (_cntr)->packets += (_pkt); \ + (_cntr)->bytes += (_len); \ +} while (0) + +#define PKTCNTR_CLEAR(_cntr) do { \ + (_cntr)->packets = 0; \ + (_cntr)->bytes = 0; \ +} while (0) + +/* flags for mark_ecn() */ +#define CLASSQF_ECN4 0x01 /* use packet marking for IPv4 packets */ +#define CLASSQF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define CLASSQF_ECN (CLASSQF_ECN4 | CLASSQF_ECN6) + +extern u_int32_t classq_verbose; + +SYSCTL_DECL(_net_classq); + +extern void _qinit(class_queue_t *, int, int); +extern void _addq(class_queue_t *, struct mbuf *); +extern void _addq_multi(class_queue_t *, struct mbuf *, struct mbuf *, + u_int32_t, u_int32_t); +extern struct mbuf *_getq(class_queue_t *); +extern struct mbuf *_getq_all(class_queue_t *); +extern struct mbuf *_getq_tail(class_queue_t *); +extern struct mbuf *_getq_random(class_queue_t *); +extern struct mbuf *_getq_flow(class_queue_t *, u_int32_t); +extern void _removeq(class_queue_t *, struct mbuf *); +extern void _flushq(class_queue_t *); +extern void _flushq_flow(class_queue_t *, u_int32_t, u_int32_t *, u_int32_t *); + +extern void classq_init(void); + +extern u_int8_t read_dsfield(struct mbuf *, struct pf_mtag *); +extern void write_dsfield(struct mbuf *, struct pf_mtag *, u_int8_t); +extern int mark_ecn(struct mbuf *, struct pf_mtag *, int); +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef __cplusplus +} +#endif +#endif /* PRIVATE */ +#endif /* _NET_CLASSQ_CLASSQ_H_ */ diff --git a/bsd/net/classq/classq_blue.c b/bsd/net/classq/classq_blue.c new file mode 100644 index 000000000..6b67d94d1 --- /dev/null +++ b/bsd/net/classq/classq_blue.c @@ -0,0 +1,369 @@ +/* + * Copyright (c) 2007-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $NetBSD: altq_blue.c,v 1.21 2006/11/16 01:32:37 christos Exp $ */ +/* $KAME: altq_blue.c,v 1.15 2005/04/13 03:44:24 suz Exp $ */ + +/* + * Copyright (C) 1997-2002 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * Copyright (c) 1990-1994 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Computer Systems + * Engineering Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#if CLASSQ_BLUE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#if INET6 +#include +#endif + +#include +#include + +/* + * Blue is proposed and implemented by Wu-chang Feng . + * more information on Blue is available from + * http://www.eecs.umich.edu/~wuchang/blue/ + */ + +#define BLUE_LIMIT 200 /* default max queue lenght */ + +#define BLUE_ZONE_MAX 32 /* maximum elements in zone */ +#define BLUE_ZONE_NAME "classq_blue" /* zone name */ + +static unsigned int blue_size; /* size of zone element */ +static struct zone *blue_zone; /* zone for blue */ + +/* internal function prototypes */ +static struct mbuf *blue_getq_flow(struct blue *, class_queue_t *, + u_int32_t, boolean_t); +static int blue_drop_early(struct blue *); + +void +blue_init(void) +{ + _CASSERT(BLUEF_ECN4 == CLASSQF_ECN4); + _CASSERT(BLUEF_ECN6 == CLASSQF_ECN6); + + blue_size = sizeof (struct blue); + blue_zone = zinit(blue_size, BLUE_ZONE_MAX * blue_size, + 0, BLUE_ZONE_NAME); + if (blue_zone == NULL) { + panic("%s: failed allocating %s", __func__, BLUE_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(blue_zone, Z_EXPAND, TRUE); + zone_change(blue_zone, Z_CALLERACCT, TRUE); +} + +/* + * blue support routines + */ +struct blue * +blue_alloc(struct ifnet *ifp, u_int32_t max_pmark, u_int32_t hold_time, + u_int32_t flags) +{ + struct blue *bp; + + VERIFY(ifp != NULL); + + bp = zalloc(blue_zone); + if (bp == NULL) + return (NULL); + + bzero(bp, blue_size); + bp->blue_idle = 1; + bp->blue_flags = (flags & BLUEF_USERFLAGS); + bp->blue_ifp = ifp; + + if (max_pmark == 0) + bp->blue_max_pmark = 1000; + else + bp->blue_max_pmark = max_pmark; + + if (hold_time == 0) + bp->blue_hold_time = 50000; + else + bp->blue_hold_time = hold_time; + + microuptime(&bp->blue_last); + + return (bp); +} + +void +blue_destroy(struct blue *bp) +{ + zfree(blue_zone, bp); +} + +void +blue_getstats(struct blue *bp, struct blue_stats *sp) +{ + sp->q_pmark = bp->blue_pmark; + sp->drop_forced = bp->blue_stats.drop_forced; + sp->drop_unforced = bp->blue_stats.drop_unforced; + sp->marked_packets = bp->blue_stats.marked_packets; +} + +#define DTYPE_NODROP 0 /* no drop */ +#define DTYPE_FORCED 1 /* a "forced" drop */ +#define DTYPE_EARLY 2 /* an "unforced" (early) drop */ + +int +blue_addq(struct blue *bp, class_queue_t *q, struct mbuf *m, + struct pf_mtag *tag) +{ + int droptype; + + /* + * if we were idle, this is an enqueue onto an empty queue + * and we should decrement marking probability + */ + if (bp->blue_idle) { + struct timeval now; + u_int32_t t; + + bp->blue_idle = 0; + microuptime(&now); + t = (now.tv_sec - bp->blue_last.tv_sec); + if (t > 1) { + bp->blue_pmark = 1; + microuptime(&bp->blue_last); + } else { + t = t * 1000000 + (now.tv_usec - bp->blue_last.tv_usec); + if (t > bp->blue_hold_time) { + bp->blue_pmark--; + if (bp->blue_pmark < 0) + bp->blue_pmark = 0; + 
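+				/*
+				 * pmark was just eased off by one step and
+				 * clamped at zero; resetting blue_last here
+				 * makes the next decrement wait another full
+				 * blue_hold_time.  The opposite adjustment
+				 * (+max_pmark/8) happens on forced drops
+				 * below, gated by the same hold time.
+				 */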
microuptime(&bp->blue_last); + } + } + } + + /* see if we drop early */ + droptype = DTYPE_NODROP; + if (blue_drop_early(bp) && qlen(q) > 1) { + /* mark or drop by blue */ + if ((bp->blue_flags & BLUEF_ECN) && + (tag->pftag_flags & PF_TAG_TCP) && /* only for TCP */ + mark_ecn(m, tag, bp->blue_flags)) { + /* successfully marked. do not drop. */ + bp->blue_stats.marked_packets++; + } else { + /* unforced drop by blue */ + droptype = DTYPE_EARLY; + } + } + + /* if the queue length hits the hard limit, it's a forced drop */ + if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) + droptype = DTYPE_FORCED; + + /* if successful or forced drop, enqueue this packet. */ + if (droptype != DTYPE_EARLY) + _addq(q, m); + + if (droptype != DTYPE_NODROP) { + if (droptype == DTYPE_EARLY) { + /* drop the incoming packet */ + bp->blue_stats.drop_unforced++; + } else { + struct timeval now; + u_int32_t t; + /* forced drop, select a victim packet in the queue. */ + m = _getq_random(q); + microuptime(&now); + t = (now.tv_sec - bp->blue_last.tv_sec); + t = t * 1000000 + (now.tv_usec - bp->blue_last.tv_usec); + if (t > bp->blue_hold_time) { + bp->blue_pmark += bp->blue_max_pmark >> 3; + if (bp->blue_pmark > bp->blue_max_pmark) + bp->blue_pmark = bp->blue_max_pmark; + microuptime(&bp->blue_last); + } + bp->blue_stats.drop_forced++; + } + IFCQ_CONVERT_LOCK(&bp->blue_ifp->if_snd); + m_freem(m); + return (CLASSQEQ_DROPPED); + } + /* successfully queued */ + return (CLASSQEQ_SUCCESS); +} + +static struct mbuf * +blue_getq_flow(struct blue *bp, class_queue_t *q, u_int32_t flow, + boolean_t purge) +{ +#pragma unused(purge) + struct mbuf *m; + + /* flow of 0 means head of queue */ + if ((m = ((flow == 0) ? _getq(q) : _getq_flow(q, flow))) == NULL) { + if (bp->blue_idle == 0) { + bp->blue_idle = 1; + microuptime(&bp->blue_last); + } + return (NULL); + } + + bp->blue_idle = 0; + return (m); +} + +struct mbuf * +blue_getq(struct blue *bp, class_queue_t *q) +{ + return (blue_getq_flow(bp, q, 0, FALSE)); +} + +void +blue_purgeq(struct blue *bp, class_queue_t *q, u_int32_t flow, + u_int32_t *packets, u_int32_t *bytes) +{ + u_int32_t cnt = 0, len = 0; + struct mbuf *m; + + IFCQ_CONVERT_LOCK(&bp->blue_ifp->if_snd); + + while ((m = blue_getq_flow(bp, q, flow, TRUE)) != NULL) { + cnt++; + len += m_pktlen(m); + m_freem(m); + } + + if (packets != NULL) + *packets = cnt; + if (bytes != NULL) + *bytes = len; +} + +/* + * early-drop probability is kept in blue_pmark + */ +static int +blue_drop_early(struct blue *bp) +{ + if ((random() % (unsigned)bp->blue_max_pmark) < + (unsigned)bp->blue_pmark) { + /* drop or mark */ + return (1); + } + /* no drop/mark */ + return (0); +} + +void +blue_updateq(struct blue *bp, cqev_t ev) +{ +#pragma unused(bp, ev) + /* nothing for now */ +} + +int +blue_suspendq(struct blue *bp, class_queue_t *q, boolean_t on) +{ +#pragma unused(bp, q, on) + return (ENOTSUP); +} +#endif /* CLASSQ_BLUE */ diff --git a/bsd/net/classq/classq_blue.h b/bsd/net/classq/classq_blue.h new file mode 100644 index 000000000..e6c546e23 --- /dev/null +++ b/bsd/net/classq/classq_blue.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $NetBSD: altq_blue.h,v 1.5 2006/10/12 19:59:08 peter Exp $ */ +/* $KAME: altq_blue.h,v 1.7 2002/11/29 04:36:22 kjc Exp $ */ + +/* + * Copyright (C) 1997-2002 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
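Between blue_addq() above and the struct blue layout below, the whole discipline reduces to one adaptive probability: blue_pmark rises by max_pmark/8 whenever the hard queue limit forces a drop, decays one step per hold time of idleness, and gates early drops or ECN marks via a uniform draw. A hedged standalone model of that control loop, simplified in that the kernel code additionally rate-limits both updates to one per blue_hold_time; constants mirror the blue_alloc() defaults (max_pmark 1000, hold time 50 ms):

#include <stdint.h>
#include <stdlib.h>

#define MAX_PMARK	1000		/* default blue_max_pmark */
static int32_t pmark;			/* models bp->blue_pmark */

static void
on_forced_drop(void)			/* queue hit its hard limit */
{
	pmark += MAX_PMARK >> 3;
	if (pmark > MAX_PMARK)
		pmark = MAX_PMARK;
}

static void
on_idle_interval(void)			/* one hold time with an empty queue */
{
	if (--pmark < 0)
		pmark = 0;
}

static int
should_mark_or_drop(void)		/* mirrors blue_drop_early() */
{
	return ((random() % MAX_PMARK) < pmark);
}

Note that on enqueue the kernel code consults this probability only when the queue already holds more than one packet, and for TCP flows it prefers ECN marking (mark_ecn()) over dropping.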
+ */ + +#ifndef _NET_CLASSQ_CLASSQ_BLUE_H_ +#define _NET_CLASSQ_CLASSQ_BLUE_H_ + +#ifdef PRIVATE +#ifdef BSD_KERNEL_PRIVATE +#include +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef __cplusplus +extern "C" { +#endif + +struct blue_stats { + int32_t q_pmark; + u_int32_t _pad; + u_int64_t drop_forced; + u_int64_t drop_unforced; + u_int64_t marked_packets; +}; + +#ifdef BSD_KERNEL_PRIVATE +/* blue flags */ +#define BLUEF_ECN4 0x01 /* use packet marking for IPv4 packets */ +#define BLUEF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define BLUEF_ECN (BLUEF_ECN4 | BLUEF_ECN6) + +#define BLUEF_USERFLAGS \ + (BLUEF_ECN4 | BLUEF_ECN6) + +typedef struct blue { + u_int32_t blue_flags; /* blue flags */ + + /* blue parameters */ + int32_t blue_pmark; /* 0-1000 (mark probability*10000) */ + int32_t blue_max_pmark; /* sets precision of marking probability */ + u_int32_t blue_hold_time; /* hold time in usec */ + struct ifnet *blue_ifp; /* back pointer to ifnet */ + + /* variables for internal use */ + u_int32_t blue_idle; /* queue was empty */ + struct timeval blue_last; /* timestamp when the queue becomes idle */ + + /* statistics */ + struct { + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int64_t drop_forced; + u_int64_t drop_unforced; + u_int64_t marked_packets; + } blue_stats; +} blue_t; + +extern void blue_init(void); +extern struct blue *blue_alloc(struct ifnet *, u_int32_t, u_int32_t, u_int32_t); +extern void blue_destroy(struct blue *); +extern int blue_addq(struct blue *, class_queue_t *, struct mbuf *, + struct pf_mtag *); +extern struct mbuf *blue_getq(struct blue *, class_queue_t *); +extern void blue_purgeq(struct blue *, class_queue_t *, u_int32_t, + u_int32_t *, u_int32_t *); +extern void blue_getstats(struct blue *, struct blue_stats *); +extern void blue_updateq(struct blue *, cqev_t); +extern int blue_suspendq(struct blue *, class_queue_t *, boolean_t); +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef __cplusplus +} +#endif +#endif /* PRIVATE */ +#endif /* _NET_CLASSQ_CLASSQ_BLUE_H_ */ diff --git a/bsd/net/classq/classq_red.c b/bsd/net/classq/classq_red.c new file mode 100644 index 000000000..825b62db8 --- /dev/null +++ b/bsd/net/classq/classq_red.c @@ -0,0 +1,615 @@ +/* + * Copyright (c) 2007-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $OpenBSD: altq_red.c,v 1.14 2007/09/13 20:40:02 chl Exp $ */ +/* $KAME: altq_red.c,v 1.10 2002/04/03 05:38:51 kjc Exp $ */ + +/* + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * Copyright (c) 1990-1994 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Computer Systems + * Engineering Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#if CLASSQ_RED + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#if INET6 +#include +#endif + +#include + +/* + * ALTQ/RED (Random Early Detection) implementation using 32-bit + * fixed-point calculation. + * + * written by kjc using the ns code as a reference. + * you can learn more about red and ns from Sally's home page at + * http://www-nrg.ee.lbl.gov/floyd/ + * + * most of the red parameter values are fixed in this implementation + * to prevent fixed-point overflow/underflow. + * if you change the parameters, watch out for overflow/underflow! + * + * the parameters used are recommended values by Sally. + * the corresponding ns config looks: + * q_weight=0.00195 + * minthresh=5 maxthresh=15 queue-size=60 + * linterm=30 + * dropmech=drop-tail + * bytes=false (can't be handled by 32-bit fixed-point) + * doubleq=false dqthresh=false + * wait=true + */ +/* + * alternative red parameters for a slow link. + * + * assume the queue length becomes from zero to L and keeps L, it takes + * N packets for q_avg to reach 63% of L. + * when q_weight is 0.002, N is about 500 packets. + * for a slow link like dial-up, 500 packets takes more than 1 minute! + * when q_weight is 0.008, N is about 127 packets. + * when q_weight is 0.016, N is about 63 packets. + * bursts of 50 packets are allowed for 0.002, bursts of 25 packets + * are allowed for 0.016. + * see Sally's paper for more details. + */ +/* normal red parameters */ +#define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ + /* q_weight = 0.00195 */ + +/* red parameters for a slow link */ +#define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ + /* q_weight = 0.0078125 */ + +/* red parameters for a very slow link (e.g., dialup) */ +#define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ + /* q_weight = 0.015625 */ + +/* fixed-point uses 12-bit decimal places */ +#define FP_SHIFT 12 /* fixed-point shift */ + +/* red parameters for drop probability */ +#define INV_P_MAX 10 /* inverse of max drop probability */ +#define TH_MIN 5 /* min threshold */ +#define TH_MAX 15 /* max threshold */ + +#define RED_LIMIT 60 /* default max queue lenght */ + +#define RED_ZONE_MAX 32 /* maximum elements in zone */ +#define RED_ZONE_NAME "classq_red" /* zone name */ + +static unsigned int red_size; /* size of zone element */ +static struct zone *red_zone; /* zone for red */ + +/* + * our default policy for forced-drop is drop-tail. + * (in altq-1.1.2 or earlier, the default was random-drop. + * but it makes more sense to punish the cause of the surge.) + * to switch to the random-drop policy, define "RED_RANDOM_DROP". 
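+ *
+ * a worked instance of the fixed-point scaling used below: with the
+ * default W_WEIGHT (512), red_wshift is 9, and the estimator runs
+ *	avg += (qlen << FP_SHIFT) - (avg >> red_wshift);
+ * at a steady queue length L this converges to L << (FP_SHIFT + 9),
+ * which is exactly the scale of red_thmin_s/red_thmax_s
+ * (th_min << (red_wshift + FP_SHIFT)), so the threshold comparisons
+ * in red_addq() need no further shifting.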
+ */ + +/* default red parameter values */ +static int default_th_min = TH_MIN; +static int default_th_max = TH_MAX; +static int default_inv_pmax = INV_P_MAX; + +static struct mbuf *red_getq_flow(struct red *, class_queue_t *, + u_int32_t, boolean_t); + +void +red_init(void) +{ + _CASSERT(REDF_ECN4 == CLASSQF_ECN4); + _CASSERT(REDF_ECN6 == CLASSQF_ECN6); + + red_size = sizeof (red_t); + red_zone = zinit(red_size, RED_ZONE_MAX * red_size, + 0, RED_ZONE_NAME); + if (red_zone == NULL) { + panic("%s: failed allocating %s", __func__, RED_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(red_zone, Z_EXPAND, TRUE); + zone_change(red_zone, Z_CALLERACCT, TRUE); +} + +/* + * red support routines + */ +red_t * +red_alloc(struct ifnet *ifp, int weight, int inv_pmax, int th_min, + int th_max, int flags, int pkttime) +{ + red_t *rp; + int w, i; + int npkts_per_sec; + + VERIFY(ifp != NULL); + + rp = zalloc(red_zone); + if (rp == NULL) + return (NULL); + + bzero(rp, red_size); + rp->red_avg = 0; + rp->red_idle = 1; + + if (weight == 0) + rp->red_weight = W_WEIGHT; + else + rp->red_weight = weight; + if (inv_pmax == 0) + rp->red_inv_pmax = default_inv_pmax; + else + rp->red_inv_pmax = inv_pmax; + if (th_min == 0) + rp->red_thmin = default_th_min; + else + rp->red_thmin = th_min; + if (th_max == 0) + rp->red_thmax = default_th_max; + else + rp->red_thmax = th_max; + + rp->red_flags = (flags & REDF_USERFLAGS); + rp->red_ifp = ifp; + + if (pkttime == 0) + /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ + rp->red_pkttime = 800; + else + rp->red_pkttime = pkttime; + + if (weight == 0) { + /* when the link is very slow, adjust red parameters */ + npkts_per_sec = 1000000 / rp->red_pkttime; + if (npkts_per_sec < 50) { + /* up to about 400Kbps */ + rp->red_weight = W_WEIGHT_2; + } else if (npkts_per_sec < 300) { + /* up to about 2.4Mbps */ + rp->red_weight = W_WEIGHT_1; + } + } + + /* calculate wshift. weight must be power of 2 */ + w = rp->red_weight; + for (i = 0; w > 1; i++) + w = w >> 1; + rp->red_wshift = i; + w = 1 << rp->red_wshift; + if (w != rp->red_weight) { + printf("invalid weight value %d for red! use %d\n", + rp->red_weight, w); + rp->red_weight = w; + } + + /* + * thmin_s and thmax_s are scaled versions of th_min and th_max + * to be compared with avg. + */ + rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT); + rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT); + + /* + * precompute probability denominator + * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point + */ + rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) * + rp->red_inv_pmax) << FP_SHIFT; + + /* allocate weight table */ + rp->red_wtab = wtab_alloc(rp->red_weight); + if (rp->red_wtab == NULL) { + red_destroy(rp); + return (NULL); + } + + microuptime(&rp->red_last); + return (rp); +} + +void +red_destroy(red_t *rp) +{ + if (rp->red_wtab != NULL) { + wtab_destroy(rp->red_wtab); + rp->red_wtab = NULL; + } + zfree(red_zone, rp); +} + +void +red_getstats(red_t *rp, struct red_stats *sp) +{ + sp->q_avg = rp->red_avg >> rp->red_wshift; + sp->drop_forced = rp->red_stats.drop_forced; + sp->drop_unforced = rp->red_stats.drop_unforced; + sp->marked_packets = rp->red_stats.marked_packets; +} + +int +red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, struct pf_mtag *tag) +{ + int avg, droptype; + int n; + + avg = rp->red_avg; + + /* + * if we were idle, we pretend that n packets arrived during + * the idle period. 
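+	 * for instance, with the default red_pkttime of 800 usec, an
+	 * 8 ms idle gap gives n = 8000/800 - 1 = 9, and pow_w() then
+	 * scales avg by (1 - Wq)^9 in a single multiply instead of
+	 * nine separate EWMA steps.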
+ */ + if (rp->red_idle) { + struct timeval now; + int t; + + rp->red_idle = 0; + microuptime(&now); + t = (now.tv_sec - rp->red_last.tv_sec); + if (t > 60) { + /* + * being idle for more than 1 minute, set avg to zero. + * this prevents t from overflow. + */ + avg = 0; + } else { + t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec); + n = t / rp->red_pkttime - 1; + + /* the following line does (avg = (1 - Wq)^n * avg) */ + if (n > 0) + avg = (avg >> FP_SHIFT) * + pow_w(rp->red_wtab, n); + } + } + + /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */ + avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift); + rp->red_avg = avg; /* save the new value */ + + /* + * red_count keeps a tally of arriving traffic that has not + * been dropped. + */ + rp->red_count++; + + /* see if we drop early */ + droptype = DTYPE_NODROP; + if (avg >= rp->red_thmin_s && qlen(q) > 1) { + if (avg >= rp->red_thmax_s) { + /* avg >= th_max: forced drop */ + droptype = DTYPE_FORCED; + } else if (rp->red_old == 0) { + /* first exceeds th_min */ + rp->red_count = 1; + rp->red_old = 1; + } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift, + rp->red_probd, rp->red_count)) { + /* mark or drop by red */ + if ((rp->red_flags & REDF_ECN) && + (tag->pftag_flags & PF_TAG_TCP) && /* only TCP */ + mark_ecn(m, tag, rp->red_flags)) { + /* successfully marked. do not drop. */ + rp->red_count = 0; + rp->red_stats.marked_packets++; + } else { + /* unforced drop by red */ + droptype = DTYPE_EARLY; + } + } + } else { + /* avg < th_min */ + rp->red_old = 0; + } + + /* + * if the queue length hits the hard limit, it's a forced drop. + */ + if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) + droptype = DTYPE_FORCED; + +#ifdef RED_RANDOM_DROP + /* if successful or forced drop, enqueue this packet. */ + if (droptype != DTYPE_EARLY) + _addq(q, m); +#else + /* if successful, enqueue this packet. */ + if (droptype == DTYPE_NODROP) + _addq(q, m); +#endif + if (droptype != DTYPE_NODROP) { + if (droptype == DTYPE_EARLY) { + /* drop the incoming packet */ + rp->red_stats.drop_unforced++; + } else { + /* forced drop, select a victim packet in the queue. */ +#ifdef RED_RANDOM_DROP + m = _getq_random(q); +#endif + rp->red_stats.drop_forced++; + } + rp->red_count = 0; + IFCQ_CONVERT_LOCK(&rp->red_ifp->if_snd); + m_freem(m); + return (CLASSQEQ_DROPPED); + } + /* successfully queued */ + return (CLASSQEQ_SUCCESS); +} + +/* + * early-drop probability is calculated as follows: + * prob = p_max * (avg - th_min) / (th_max - th_min) + * prob_a = prob / (2 - count*prob) + * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min)) + * here prob_a increases as successive undrop count increases. + * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)), + * becomes 1 when (count >= (2 / prob))). + */ +int +drop_early(int fp_len, int fp_probd, int count) +{ + int d; /* denominator of drop-probability */ + + d = fp_probd - count * fp_len; + if (d <= 0) + /* count exceeds the hard limit: drop or mark */ + return (1); + + /* + * now the range of d is [1..600] in fixed-point. (when + * th_max-th_min=10 and p_max=1/30) + * drop probability = (avg - TH_MIN) / d + */ + + if ((random() % d) < (unsigned)fp_len) { + /* drop or mark */ + return (1); + } + /* no drop/mark */ + return (0); +} + +static struct mbuf * +red_getq_flow(struct red *rp, class_queue_t *q, u_int32_t flow, boolean_t purge) +{ +#pragma unused(purge) + struct mbuf *m; + + /* flow of 0 means head of queue */ + if ((m = ((flow == 0) ? 
_getq(q) : _getq_flow(q, flow))) == NULL) { + if (rp->red_idle == 0) { + rp->red_idle = 1; + microuptime(&rp->red_last); + } + return (NULL); + } + + rp->red_idle = 0; + return (m); +} + +struct mbuf * +red_getq(red_t *rp, class_queue_t *q) +{ + return (red_getq_flow(rp, q, 0, FALSE)); +} + +void +red_purgeq(struct red *rp, class_queue_t *q, u_int32_t flow, u_int32_t *packets, + u_int32_t *bytes) +{ + u_int32_t cnt = 0, len = 0; + struct mbuf *m; + + IFCQ_CONVERT_LOCK(&rp->red_ifp->if_snd); + + while ((m = red_getq_flow(rp, q, flow, TRUE)) != NULL) { + cnt++; + len += m_pktlen(m); + m_freem(m); + } + + if (packets != NULL) + *packets = cnt; + if (bytes != NULL) + *bytes = len; +} + +void +red_updateq(red_t *rp, cqev_t ev) +{ +#pragma unused(rp, ev) + /* nothing for now */ +} + +int +red_suspendq(red_t *rp, class_queue_t *q, boolean_t on) +{ +#pragma unused(rp, q, on) + return (ENOTSUP); +} + +/* + * helper routine to calibrate avg during idle. + * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point + * here Wq = 1/weight and the code assumes Wq is close to zero. + * + * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point. + */ +static struct wtab *wtab_list = NULL; /* pointer to wtab list */ + +struct wtab * +wtab_alloc(int weight) +{ + struct wtab *w; + int i; + + for (w = wtab_list; w != NULL; w = w->w_next) + if (w->w_weight == weight) { + w->w_refcount++; + return (w); + } + + w = _MALLOC(sizeof (struct wtab), M_DEVBUF, M_WAITOK|M_ZERO); + if (w == NULL) + return (NULL); + + w->w_weight = weight; + w->w_refcount = 1; + w->w_next = wtab_list; + wtab_list = w; + + /* initialize the weight table */ + w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight; + for (i = 1; i < 32; i++) { + w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT; + if (w->w_tab[i] == 0 && w->w_param_max == 0) + w->w_param_max = 1 << i; + } + + return (w); +} + +void +wtab_destroy(struct wtab *w) +{ + struct wtab *prev; + + if (--w->w_refcount > 0) + return; + + if (wtab_list == w) + wtab_list = w->w_next; + else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next) + if (prev->w_next == w) { + prev->w_next = w->w_next; + break; + } + + _FREE(w, M_DEVBUF); +} + +int32_t +pow_w(struct wtab *w, int n) +{ + int i, bit; + int32_t val; + + if (n >= w->w_param_max) + return (0); + + val = 1 << FP_SHIFT; + if (n <= 0) + return (val); + + bit = 1; + i = 0; + while (n) { + if (n & bit) { + val = (val * w->w_tab[i]) >> FP_SHIFT; + n &= ~bit; + } + i++; + bit <<= 1; + } + return (val); +} + +#endif /* CLASSQ_RED */ diff --git a/bsd/net/classq/classq_red.h b/bsd/net/classq/classq_red.h new file mode 100644 index 000000000..58956b504 --- /dev/null +++ b/bsd/net/classq/classq_red.h @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $NetBSD: altq_red.h,v 1.5 2006/10/12 19:59:08 peter Exp $ */ +/* $KAME: altq_red.h,v 1.8 2003/07/10 12:07:49 kjc Exp $ */ + +/* + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NET_CLASSQ_CLASSQ_RED_H_ +#define _NET_CLASSQ_CLASSQ_RED_H_ + +#ifdef PRIVATE +#ifdef BSD_KERNEL_PRIVATE +#include +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * simpler versions of red parameters and statistics used by other + * disciplines (e.g., CBQ) + */ +struct redparams { + int th_min; /* red min threshold */ + int th_max; /* red max threshold */ + int inv_pmax; /* inverse of max drop probability */ +}; + +struct red_stats { + int32_t q_avg; + u_int32_t _pad; + u_int32_t drop_forced; + u_int32_t drop_unforced; + u_int32_t marked_packets; +}; + +#ifdef BSD_KERNEL_PRIVATE +/* weight table structure for idle time calibration */ +struct wtab { + struct wtab *w_next; + int w_weight; + int w_param_max; + int w_refcount; + int32_t w_tab[32]; +}; + +/* red flags */ +#define REDF_ECN4 0x01 /* use packet marking for IPv4 packets */ +#define REDF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define REDF_ECN (REDF_ECN4 | REDF_ECN6) +#define REDF_FLOWVALVE 0x04 /* use flowvalve (aka penalty-box) */ + +#define REDF_USERFLAGS \ + (REDF_ECN4 | REDF_ECN6 | REDF_FLOWVALVE) + +typedef struct red { + int red_pkttime; /* average packet time in micro sec */ + /* used for idle calibration */ + int red_flags; /* red flags */ + struct ifnet *red_ifp; /* back pointer to ifnet */ + + /* red parameters */ + int red_weight; /* weight for EWMA */ + int red_inv_pmax; /* inverse of max drop probability */ + int red_thmin; /* red min threshold */ + int red_thmax; /* red max threshold */ + + /* variables for internal use */ + int red_wshift; /* log(red_weight) */ + int red_thmin_s; /* th_min scaled by avgshift */ + int red_thmax_s; /* th_max scaled by avgshift */ + int red_probd; /* drop probability denominator */ + + int red_avg; /* queue len avg scaled by avgshift */ + int red_count; /* packet count since last dropped/ */ + /* marked packet */ + int red_idle; /* queue was empty */ + int red_old; /* avg is above th_min */ + struct wtab *red_wtab; /* weight table */ + struct timeval red_last; /* time when the queue becomes idle */ + + struct { + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int32_t drop_forced; + u_int32_t drop_unforced; + u_int32_t marked_packets; + } red_stats; +} red_t; + +/* red drop types */ +#define DTYPE_NODROP 0 /* no drop */ +#define DTYPE_FORCED 1 /* a "forced" drop */ +#define DTYPE_EARLY 2 /* an "unforced" (early) drop */ + +extern void red_init(void); +extern red_t *red_alloc(struct ifnet *, int, int, int, int, int, int); +extern void red_destroy(red_t *); +extern void red_getstats(red_t *, struct red_stats *); +extern int red_addq(red_t *, class_queue_t *, struct mbuf *, struct pf_mtag *); +extern struct mbuf *red_getq(red_t *, class_queue_t *); +extern void red_purgeq(struct red *, class_queue_t *, u_int32_t, + u_int32_t *, u_int32_t *); +extern void red_updateq(red_t *, cqev_t); +extern int red_suspendq(red_t *, class_queue_t *, boolean_t); + +extern int drop_early(int, int, int); +extern struct wtab *wtab_alloc(int); +extern void wtab_destroy(struct wtab *); +extern int32_t pow_w(struct wtab *, int); +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef __cplusplus +} +#endif +#endif /* PRIVATE */ +#endif /* _NET_CLASSQ_CLASSQ_RED_H_ */ diff --git a/bsd/net/classq/classq_rio.c b/bsd/net/classq/classq_rio.c new file mode 100644 index 000000000..20a44ee2e --- /dev/null +++ b/bsd/net/classq/classq_rio.c @@ -0,0 +1,528 @@ +/* + * Copyright (c) 2007-2012 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $OpenBSD: altq_rio.c,v 1.11 2007/09/13 20:40:02 chl Exp $ */ +/* $KAME: altq_rio.c,v 1.8 2000/12/14 08:12:46 thorpej Exp $ */ + +/* + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Copyright (c) 1990-1994 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Computer Systems + * Engineering Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#if CLASSQ_RIO + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#if INET6 +#include +#endif + +#include +#include + +/* + * RIO: RED with IN/OUT bit + * described in + * "Explicit Allocation of Best Effort Packet Delivery Service" + * David D. Clark and Wenjia Fang, MIT Lab for Computer Science + * http://diffserv.lcs.mit.edu/Papers/exp-alloc-ddc-wf.{ps,pdf} + * + * this implementation is extended to support more than 2 drop precedence + * values as described in RFC2597 (Assured Forwarding PHB Group). + * + */ +/* + * AF DS (differentiated service) codepoints. + * (classes can be mapped to CBQ or H-FSC classes.) 
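+ *
+ * dscp2index() below maps the 2-bit DropPre field (mask 0x18 of the
+ * DS byte) to a drop precedence index 0-2; e.g. AF11 (DS byte 0x28)
+ * maps to index 0 (low) and AF13 (DS byte 0x38) to index 2 (high).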
+ *
+ * 0 1 2 3 4 5 6 7
+ * +---+---+---+---+---+---+---+---+
+ * | CLASS |DropPre| 0 | CU |
+ * +---+---+---+---+---+---+---+---+
+ *
+ * class 1: 001
+ * class 2: 010
+ * class 3: 011
+ * class 4: 100
+ *
+ * low drop prec: 01
+ * medium drop prec: 10
+ * high drop prec: 11
+ */
+
+/* normal red parameters */
+#define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */
+ /* q_weight = 0.00195 */
+
+/* red parameters for a slow link */
+#define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */
+ /* q_weight = 0.0078125 */
+
+/* red parameters for a very slow link (e.g., dialup) */
+#define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */
+ /* q_weight = 0.015625 */
+
+/* fixed-point uses 12-bit decimal places */
+#define FP_SHIFT 12 /* fixed-point shift */
+
+/* red parameters for drop probability */
+#define INV_P_MAX 10 /* inverse of max drop probability */
+#define TH_MIN 5 /* min threshold */
+#define TH_MAX 15 /* max threshold */
+
+#define RIO_LIMIT 60 /* default max queue length */
+
+/* default rio parameter values */
+static struct redparams default_rio_params[RIO_NDROPPREC] = {
+ /* th_min, th_max, inv_pmax */
+ { TH_MAX * 2 + TH_MIN, TH_MAX * 3, INV_P_MAX }, /* low drop precedence */
+ { TH_MAX + TH_MIN, TH_MAX * 2, INV_P_MAX }, /* medium drop precedence */
+ { TH_MIN, TH_MAX, INV_P_MAX } /* high drop precedence */
+};
+
+#define RIO_ZONE_MAX 32 /* maximum elements in zone */
+#define RIO_ZONE_NAME "classq_rio" /* zone name */
+
+static unsigned int rio_size; /* size of zone element */
+static struct zone *rio_zone; /* zone for rio */
+
+/* internal function prototypes */
+static struct mbuf *rio_getq_flow(struct rio *, class_queue_t *,
+ u_int32_t, boolean_t);
+static int dscp2index(u_int8_t);
+
+void
+rio_init(void)
+{
+ _CASSERT(RIOF_ECN4 == CLASSQF_ECN4);
+ _CASSERT(RIOF_ECN6 == CLASSQF_ECN6);
+
+ rio_size = sizeof (rio_t);
+ rio_zone = zinit(rio_size, RIO_ZONE_MAX * rio_size,
+ 0, RIO_ZONE_NAME);
+ if (rio_zone == NULL) {
+ panic("%s: failed allocating %s", __func__, RIO_ZONE_NAME);
+ /* NOTREACHED */
+ }
+ zone_change(rio_zone, Z_EXPAND, TRUE);
+ zone_change(rio_zone, Z_CALLERACCT, TRUE);
+}
+
+rio_t *
+rio_alloc(struct ifnet *ifp, int weight, struct redparams *params,
+ int flags, int pkttime)
+{
+ rio_t *rp;
+ int w, i;
+ int npkts_per_sec;
+
+ VERIFY(ifp != NULL);
+
+ rp = zalloc(rio_zone);
+ if (rp == NULL)
+ return (NULL);
+
+ bzero(rp, rio_size);
+ rp->rio_flags = (flags & RIOF_USERFLAGS);
+ rp->rio_ifp = ifp;
+
+ if (pkttime == 0)
+ /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
+ rp->rio_pkttime = 800;
+ else
+ rp->rio_pkttime = pkttime;
+
+ if (weight != 0)
+ rp->rio_weight = weight;
+ else {
+ /* use default */
+ rp->rio_weight = W_WEIGHT;
+
+ /* when the link is very slow, adjust red parameters */
+ npkts_per_sec = 1000000 / rp->rio_pkttime;
+ if (npkts_per_sec < 50) {
+ /* up to about 400Kbps */
+ rp->rio_weight = W_WEIGHT_2;
+ } else if (npkts_per_sec < 300) {
+ /* up to about 2.4Mbps */
+ rp->rio_weight = W_WEIGHT_1;
+ }
+ }
+
+ /* calculate wshift. weight must be power of 2 */
+ w = rp->rio_weight;
+ for (i = 0; w > 1; i++)
+ w = w >> 1;
+ rp->rio_wshift = i;
+ w = 1 << rp->rio_wshift;
+ if (w != rp->rio_weight) {
+ printf("invalid weight value %d for red! 
use %d\n", + rp->rio_weight, w); + rp->rio_weight = w; + } + + /* allocate weight table */ + rp->rio_wtab = wtab_alloc(rp->rio_weight); + if (rp->rio_wtab == NULL) { + rio_destroy(rp); + return (NULL); + } + + for (i = 0; i < RIO_NDROPPREC; i++) { + struct dropprec_state *prec = &rp->rio_precstate[i]; + + prec->avg = 0; + prec->idle = 1; + + if (params == NULL || params[i].inv_pmax == 0) + prec->inv_pmax = default_rio_params[i].inv_pmax; + else + prec->inv_pmax = params[i].inv_pmax; + if (params == NULL || params[i].th_min == 0) + prec->th_min = default_rio_params[i].th_min; + else + prec->th_min = params[i].th_min; + if (params == NULL || params[i].th_max == 0) + prec->th_max = default_rio_params[i].th_max; + else + prec->th_max = params[i].th_max; + + /* + * th_min_s and th_max_s are scaled versions of th_min + * and th_max to be compared with avg. + */ + prec->th_min_s = prec->th_min << (rp->rio_wshift + FP_SHIFT); + prec->th_max_s = prec->th_max << (rp->rio_wshift + FP_SHIFT); + + /* + * precompute probability denominator + * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point + */ + prec->probd = (2 * (prec->th_max - prec->th_min) * + prec->inv_pmax) << FP_SHIFT; + + microuptime(&prec->last); + } + + return (rp); +} + +void +rio_destroy(rio_t *rp) +{ + if (rp->rio_wtab != NULL) { + wtab_destroy(rp->rio_wtab); + rp->rio_wtab = NULL; + } + zfree(rio_zone, rp); +} + +void +rio_getstats(rio_t *rp, struct red_stats *sp) +{ + int i; + + for (i = 0; i < RIO_NDROPPREC; i++) { + bcopy(&rp->q_stats[i], sp, sizeof (struct red_stats)); + sp->q_avg = rp->rio_precstate[i].avg >> rp->rio_wshift; + sp++; + } +} + +#if (RIO_NDROPPREC == 3) +/* + * internally, a drop precedence value is converted to an index + * starting from 0. + */ +static int +dscp2index(u_int8_t dscp) +{ +#define AF_DROPPRECMASK 0x18 + + int dpindex = dscp & AF_DROPPRECMASK; + + if (dpindex == 0) + return (0); + return ((dpindex >> 3) - 1); +} +#endif + +#define RIOM_SET_PRECINDEX(t, idx) do { \ + (t)->pftag_qpriv32 = (idx); \ +} while (0) + +#define RIOM_GET_PRECINDEX(t) \ + ({ u_int32_t idx; idx = (t)->pftag_qpriv32; \ + RIOM_SET_PRECINDEX(t, 0); idx; }) + +int +rio_addq(rio_t *rp, class_queue_t *q, struct mbuf *m, struct pf_mtag *tag) +{ +#define DSCP_MASK 0xfc + int avg, droptype; + u_int8_t dsfield, odsfield; + int dpindex, i, n, t; + struct timeval now; + struct dropprec_state *prec; + + dsfield = odsfield = read_dsfield(m, tag); + dpindex = dscp2index(dsfield); + + /* + * update avg of the precedence states whose drop precedence + * is larger than or equal to the drop precedence of the packet + */ + now.tv_sec = 0; + for (i = dpindex; i < RIO_NDROPPREC; i++) { + prec = &rp->rio_precstate[i]; + avg = prec->avg; + if (prec->idle) { + prec->idle = 0; + if (now.tv_sec == 0) + microuptime(&now); + t = (now.tv_sec - prec->last.tv_sec); + if (t > 60) + avg = 0; + else { + t = t * 1000000 + + (now.tv_usec - prec->last.tv_usec); + n = t / rp->rio_pkttime; + /* calculate (avg = (1 - Wq)^n * avg) */ + if (n > 0) { + avg = (avg >> FP_SHIFT) * + pow_w(rp->rio_wtab, n); + } + } + } + + /* run estimator. (avg is scaled by WEIGHT in fixed-point) */ + avg += (prec->qlen << FP_SHIFT) - (avg >> rp->rio_wshift); + prec->avg = avg; /* save the new value */ + /* + * count keeps a tally of arriving traffic that has not + * been dropped. 
+ */ + prec->count++; + } + + prec = &rp->rio_precstate[dpindex]; + avg = prec->avg; + + /* see if we drop early */ + droptype = DTYPE_NODROP; + if (avg >= prec->th_min_s && prec->qlen > 1) { + if (avg >= prec->th_max_s) { + /* avg >= th_max: forced drop */ + droptype = DTYPE_FORCED; + } else if (prec->old == 0) { + /* first exceeds th_min */ + prec->count = 1; + prec->old = 1; + } else if (drop_early((avg - prec->th_min_s) >> rp->rio_wshift, + prec->probd, prec->count)) { + /* unforced drop by red */ + droptype = DTYPE_EARLY; + } + } else { + /* avg < th_min */ + prec->old = 0; + } + + /* + * if the queue length hits the hard limit, it's a forced drop. + */ + if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) + droptype = DTYPE_FORCED; + + if (droptype != DTYPE_NODROP) { + /* always drop incoming packet (as opposed to randomdrop) */ + for (i = dpindex; i < RIO_NDROPPREC; i++) + rp->rio_precstate[i].count = 0; + + if (droptype == DTYPE_EARLY) + rp->q_stats[dpindex].drop_unforced++; + else + rp->q_stats[dpindex].drop_forced++; + + IFCQ_CONVERT_LOCK(&rp->rio_ifp->if_snd); + m_freem(m); + return (CLASSQEQ_DROPPED); + } + + for (i = dpindex; i < RIO_NDROPPREC; i++) + rp->rio_precstate[i].qlen++; + + /* save drop precedence index in mbuf hdr */ + RIOM_SET_PRECINDEX(tag, dpindex); + + if (rp->rio_flags & RIOF_CLEARDSCP) + dsfield &= ~DSCP_MASK; + + if (dsfield != odsfield) + write_dsfield(m, tag, dsfield); + + _addq(q, m); + + return (CLASSQEQ_SUCCESS); +} + +static struct mbuf * +rio_getq_flow(struct rio *rp, class_queue_t *q, u_int32_t flow, boolean_t purge) +{ +#pragma unused(purge) + struct mbuf *m; + int dpindex, i; + + /* flow of 0 means head of queue */ + if ((m = ((flow == 0) ? _getq(q) : _getq_flow(q, flow))) == NULL) + return (NULL); + + VERIFY(m->m_flags & M_PKTHDR); + + dpindex = RIOM_GET_PRECINDEX(m_pftag(m)); + for (i = dpindex; i < RIO_NDROPPREC; i++) { + if (--rp->rio_precstate[i].qlen == 0) { + if (rp->rio_precstate[i].idle == 0) { + rp->rio_precstate[i].idle = 1; + microuptime(&rp->rio_precstate[i].last); + } + } + } + return (m); +} + +struct mbuf * +rio_getq(rio_t *rp, class_queue_t *q) +{ + return (rio_getq_flow(rp, q, 0, FALSE)); +} + +void +rio_purgeq(struct rio *rp, class_queue_t *q, u_int32_t flow, u_int32_t *packets, + u_int32_t *bytes) +{ + u_int32_t cnt = 0, len = 0; + struct mbuf *m; + + IFCQ_CONVERT_LOCK(&rp->rio_ifp->if_snd); + + while ((m = rio_getq_flow(rp, q, flow, TRUE)) != NULL) { + cnt++; + len += m_pktlen(m); + m_freem(m); + } + + if (packets != NULL) + *packets = cnt; + if (bytes != NULL) + *bytes = len; +} + +void +rio_updateq(rio_t *rp, cqev_t ev) +{ +#pragma unused(rp, ev) + /* nothing for now */ +} + +int +rio_suspendq(rio_t *rp, class_queue_t *q, boolean_t on) +{ +#pragma unused(rp, q, on) + return (ENOTSUP); +} +#endif /* CLASSQ_RIO */ diff --git a/bsd/net/classq/classq_rio.h b/bsd/net/classq/classq_rio.h new file mode 100644 index 000000000..fb3c24199 --- /dev/null +++ b/bsd/net/classq/classq_rio.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $NetBSD: altq_rio.h,v 1.5 2006/10/12 19:59:08 peter Exp $ */ +/* $KAME: altq_rio.h,v 1.9 2003/07/10 12:07:49 kjc Exp $ */ + +/* + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NET_CLASSQ_CLASSQ_RIO_H_ +#define _NET_CLASSQ_CLASSQ_RIO_H_ + +#ifdef PRIVATE +#ifdef BSD_KERNEL_PRIVATE +#include +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * RIO: RED with IN/OUT bit + * (extended to support more than 2 drop precedence values) + */ +#define RIO_NDROPPREC 3 /* number of drop precedence values */ + +#ifdef BSD_KERNEL_PRIVATE +/* rio flags */ +#define RIOF_ECN4 0x01 /* use packet marking for IPv4 packets */ +#define RIOF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define RIOF_ECN (RIOF_ECN4 | RIOF_ECN6) +#define RIOF_CLEARDSCP 0x200 /* clear diffserv codepoint */ + +#define RIOF_USERFLAGS \ + (RIOF_ECN4 | RIOF_ECN6 | RIOF_CLEARDSCP) + +typedef struct rio { + /* per drop precedence structure */ + struct dropprec_state { + /* red parameters */ + int inv_pmax; /* inverse of max drop probability */ + int th_min; /* red min threshold */ + int th_max; /* red max threshold */ + + /* variables for internal use */ + int th_min_s; /* th_min scaled by avgshift */ + int th_max_s; /* th_max scaled by avgshift */ + int probd; /* drop probability denominator */ + + int qlen; /* queue length */ + int avg; /* (scaled) queue length average */ + int count; /* packet count since the last */ + /* dropped/marked packet */ + int idle; /* queue was empty */ + int old; /* avg is above th_min */ + struct timeval last; /* timestamp when queue becomes idle */ + } rio_precstate[RIO_NDROPPREC]; + + int rio_wshift; /* log(red_weight) */ + int rio_weight; /* weight for EWMA */ + struct wtab *rio_wtab; /* weight table */ + + int rio_pkttime; /* average packet time in micro sec */ + /* used for idle calibration */ + int rio_flags; /* rio flags */ + struct ifnet *rio_ifp; /* back pointer to ifnet */ + + u_int8_t rio_codepoint; /* codepoint value to tag packets */ + u_int8_t rio_codepointmask; /* codepoint mask bits */ + + struct red_stats q_stats[RIO_NDROPPREC]; /* statistics */ +} rio_t; + +extern void rio_init(void); +extern rio_t *rio_alloc(struct ifnet *, int, struct redparams *, int, int); +extern void rio_destroy(rio_t *); +extern void rio_getstats(rio_t *, struct red_stats *); +extern int rio_addq(rio_t *, class_queue_t *, struct mbuf *, struct pf_mtag *); +extern struct mbuf *rio_getq(rio_t *, class_queue_t *); +extern void rio_purgeq(struct rio *, class_queue_t *, u_int32_t, + u_int32_t *, u_int32_t *); +extern void rio_updateq(rio_t *, cqev_t); +extern int rio_suspendq(rio_t *, class_queue_t *, boolean_t); +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef __cplusplus +} +#endif +#endif /* PRIVATE */ +#endif /* _NET_CLASSQ_CLASSQ_RIO_H_ */ diff --git a/bsd/net/classq/classq_sfb.c b/bsd/net/classq/classq_sfb.c new file mode 100644 index 000000000..c0f575a3e --- /dev/null +++ b/bsd/net/classq/classq_sfb.c @@ -0,0 +1,1184 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include +#if INET6 +#include +#endif + +#include +#include +#include + +/* + * Stochastic Fair Blue + * + * Wu-chang Feng, Dilip D. Kandlur, Debanjan Saha, Kang G. Shin + * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf + * + * Based on the NS code with the following parameters: + * + * bytes: false + * decrement: 0.001 + * increment: 0.005 + * hold-time: 10ms-50ms (randomized) + * algorithm: 0 + * pbox: 1 + * pbox-time: 50-100ms (randomized) + * hinterval: 11-23 (randomized) + * + * This implementation uses L = 2 and N = 32 for 2 sets of: + * + * B[L][N]: L x N array of bins (L levels, N bins per level) + * + * Each set effectively creates 32^2 virtual buckets (bin combinations) + * while using only O(32*2) states. + * + * Given a 32-bit hash value, we divide it such that octets [0,1,2,3] are + * used as index for the bins across the 2 levels, where level 1 uses [0,2] + * and level 2 uses [1,3]. The 2 values per level correspond to the indices + * for the current and warm-up sets (section 4.4. in the SFB paper regarding + * Moving Hash Functions explains the purposes of these 2 sets.) + */ + +/* + * Use Murmur3A_x86_32 for hash function. It seems to perform consistently + * across platforms for 1-word key (32-bit flowhash value). See flowhash.h + * for other alternatives. We only need 16-bit hash output. + */ +#define SFB_HASH net_flowhash_mh3_x86_32 +#define SFB_HASHMASK HASHMASK(16) + +#define SFB_BINMASK(_x) \ + ((_x) & HASHMASK(SFB_BINS_SHIFT)) + +#define SFB_BINST(_sp, _l, _n, _c) \ + (&(*(_sp)->sfb_bins)[_c].stats[_l][_n]) + +#define SFB_BINFT(_sp, _l, _n, _c) \ + (&(*(_sp)->sfb_bins)[_c].freezetime[_l][_n]) + +#define SFB_FC_LIST(_sp, _n) \ + (&(*(_sp)->sfb_fc_lists)[_n]) + +/* + * The holdtime parameter determines the minimum time interval between + * two successive updates of the marking probability. In the event the + * uplink speed is not known, a default value is chosen and is randomized + * to be within the following range. + */ +#define HOLDTIME_BASE (100ULL * 1000 * 1000) /* 100ms */ +#define HOLDTIME_MIN (10ULL * 1000 * 1000) /* 10ms */ +#define HOLDTIME_MAX (100ULL * 1000 * 1000) /* 100ms */ + +/* + * The pboxtime parameter determines the bandwidth allocated for rogue + * flows, i.e. the rate limiting bandwidth. In the event the uplink speed + * is not known, a default value is chosen and is randomized to be within + * the following range. 
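+ *
+ * When the speed is known, sfb_ttbl[] below scales the hold and
+ * penalty box times inversely with it; e.g. a 1 Gbps link uses the
+ * base values (100ms hold, 300ms pbox) while a 10 Mbps link uses
+ * 100x those values.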
+ */ +#define PBOXTIME_BASE (300ULL * 1000 * 1000) /* 300ms */ +#define PBOXTIME_MIN (30ULL * 1000 * 1000) /* 30ms */ +#define PBOXTIME_MAX (300ULL * 1000 * 1000) /* 300ms */ + +#define SFB_RANDOM(sp, tmin, tmax) ((sfb_random(sp) % (tmax)) + (tmin)) + +#define SFB_PKT_PBOX PF_TAG_QUEUE1 /* in penalty box */ + +/* The following mantissa values are in SFB_FP_SHIFT Q format */ +#define SFB_MAX_PMARK (1 << SFB_FP_SHIFT) /* Q14 representation of 1.00 */ + +/* + * These are d1 (increment) and d2 (decrement) parameters, used to determine + * the amount by which the marking probability is incremented when the queue + * overflows, or is decremented when the link is idle. d1 is set higher than + * d2, because link underutilization can occur when congestion management is + * either too conservative or too aggressive, but packet loss occurs only + * when congestion management is too conservative. By weighing heavily + * against packet loss, it can quickly reach to a substantial increase in + * traffic load. + */ +#define SFB_INCREMENT 82 /* Q14 representation of 0.005 */ +#define SFB_DECREMENT 16 /* Q14 representation of 0.001 */ + +#define SFB_PMARK_TH 16056 /* Q14 representation of 0.98 */ +#define SFB_PMARK_WARM 3276 /* Q14 representation of 0.2 */ + +#define SFB_PMARK_INC(_bin) do { \ + (_bin)->pmark += sfb_increment; \ + if ((_bin)->pmark > SFB_MAX_PMARK) \ + (_bin)->pmark = SFB_MAX_PMARK; \ +} while (0) + +#define SFB_PMARK_DEC(_bin) do { \ + if ((_bin)->pmark > 0) { \ + (_bin)->pmark -= sfb_decrement; \ + if ((_bin)->pmark < 0) \ + (_bin)->pmark = 0; \ + } \ +} while (0) + +#define HINTERVAL_MIN (10) /* 10 seconds */ +#define HINTERVAL_MAX (20) /* 20 seconds */ +#define SFB_HINTERVAL(sp) ((sfb_random(sp) % HINTERVAL_MAX) + HINTERVAL_MIN) + +#define DEQUEUE_DECAY 7 /* ilog2 of EWMA decay rate, (128) */ +#define DEQUEUE_SPIKE(_new, _old) \ + ((u_int64_t)ABS((int64_t)(_new) - (int64_t)(_old)) > ((_old) << 11)) + +#define ABS(v) (((v) > 0) ? 
(v) : -(v)) + +#define SFB_ZONE_MAX 32 /* maximum elements in zone */ +#define SFB_ZONE_NAME "classq_sfb" /* zone name */ + +/* Place the flow control entries in current bin on level 0 */ +#define SFB_FC_LEVEL 0 + +static unsigned int sfb_size; /* size of zone element */ +static struct zone *sfb_zone; /* zone for sfb */ + +/* internal function prototypes */ +static u_int32_t sfb_random(struct sfb *); +static struct mbuf *sfb_getq_flow(struct sfb *, class_queue_t *, u_int32_t, + boolean_t); +static void sfb_resetq(struct sfb *, cqev_t); +static void sfb_calc_holdtime(struct sfb *, u_int64_t); +static void sfb_calc_pboxtime(struct sfb *, u_int64_t); +static void sfb_calc_hinterval(struct sfb *, u_int64_t *); +static void sfb_swap_bins(struct sfb *, u_int32_t); +static inline int sfb_pcheck(struct sfb *, struct pf_mtag *); +static int sfb_penalize(struct sfb *, struct pf_mtag *, struct timespec *); +static void sfb_adjust_bin(struct sfb *, struct sfbbinstats *, + struct timespec *, struct timespec *, boolean_t); +static void sfb_decrement_bin(struct sfb *, struct sfbbinstats *, + struct timespec *, struct timespec *); +static void sfb_increment_bin(struct sfb *, struct sfbbinstats *, + struct timespec *, struct timespec *); +static inline void sfb_dq_update_bins(struct sfb *, struct pf_mtag *, + struct timespec *); +static inline void sfb_eq_update_bins(struct sfb *, struct pf_mtag *); +static int sfb_drop_early(struct sfb *, struct pf_mtag *, u_int16_t *, + struct timespec *); +static boolean_t sfb_bin_addfcentry(struct sfb *, struct pf_mtag *); +static void sfb_fclist_append(struct sfb *, struct sfb_fc_list *); +static void sfb_fclists_clean(struct sfb *sp); + +SYSCTL_NODE(_net_classq, OID_AUTO, sfb, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "SFB"); + +static u_int64_t sfb_holdtime = 0; /* 0 indicates "automatic" */ +SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, holdtime, CTLFLAG_RW|CTLFLAG_LOCKED, + &sfb_holdtime, "SFB freeze time in nanoseconds"); + +static u_int64_t sfb_pboxtime = 0; /* 0 indicates "automatic" */ +SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, pboxtime, CTLFLAG_RW|CTLFLAG_LOCKED, + &sfb_pboxtime, "SFB penalty box time in nanoseconds"); + +static u_int64_t sfb_hinterval; +SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, hinterval, CTLFLAG_RW|CTLFLAG_LOCKED, + &sfb_hinterval, "SFB hash interval in nanoseconds"); + +static u_int32_t sfb_increment = SFB_INCREMENT; +SYSCTL_UINT(_net_classq_sfb, OID_AUTO, increment, CTLFLAG_RW|CTLFLAG_LOCKED, + &sfb_increment, SFB_INCREMENT, "SFB increment [d1]"); + +static u_int32_t sfb_decrement = SFB_DECREMENT; +SYSCTL_UINT(_net_classq_sfb, OID_AUTO, decrement, CTLFLAG_RW|CTLFLAG_LOCKED, + &sfb_decrement, SFB_DECREMENT, "SFB decrement [d2]"); + +static u_int32_t sfb_allocation = 0; /* 0 means "automatic" */ +SYSCTL_UINT(_net_classq_sfb, OID_AUTO, allocation, CTLFLAG_RW|CTLFLAG_LOCKED, + &sfb_allocation, 0, "SFB bin allocation"); + +static u_int32_t sfb_ratelimit = 0; +SYSCTL_UINT(_net_classq_sfb, OID_AUTO, ratelimit, CTLFLAG_RW|CTLFLAG_LOCKED, + &sfb_ratelimit, 0, "SFB rate limit"); + +#define MBPS (1ULL * 1000 * 1000) +#define GBPS (MBPS * 1000) + +struct sfb_time_tbl { + u_int64_t speed; /* uplink speed */ + u_int64_t holdtime; /* hold time */ + u_int64_t pboxtime; /* penalty box time */ +}; + +static struct sfb_time_tbl sfb_ttbl[] = { + { 1 * MBPS, HOLDTIME_BASE * 1000, PBOXTIME_BASE * 1000 }, + { 10 * MBPS, HOLDTIME_BASE * 100, PBOXTIME_BASE * 100 }, + { 100 * MBPS, HOLDTIME_BASE * 10, PBOXTIME_BASE * 10 }, + { 1 * GBPS, HOLDTIME_BASE, PBOXTIME_BASE }, + { 10 * GBPS, 
HOLDTIME_BASE / 10, PBOXTIME_BASE / 10 }, + { 100 * GBPS, HOLDTIME_BASE / 100, PBOXTIME_BASE / 100 }, + { 0, 0, 0 } +}; + +void +sfb_init(void) +{ + _CASSERT(SFBF_ECN4 == CLASSQF_ECN4); + _CASSERT(SFBF_ECN6 == CLASSQF_ECN6); + + sfb_size = sizeof (struct sfb); + sfb_zone = zinit(sfb_size, SFB_ZONE_MAX * sfb_size, + 0, SFB_ZONE_NAME); + if (sfb_zone == NULL) { + panic("%s: failed allocating %s", __func__, SFB_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(sfb_zone, Z_EXPAND, TRUE); + zone_change(sfb_zone, Z_CALLERACCT, TRUE); +} + +static u_int32_t +sfb_random(struct sfb *sp) +{ + IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd); + return (random()); +} + +static void +sfb_calc_holdtime(struct sfb *sp, u_int64_t outbw) +{ + u_int64_t holdtime; + + if (sfb_holdtime != 0) { + holdtime = sfb_holdtime; + } else if (outbw == 0) { + holdtime = SFB_RANDOM(sp, HOLDTIME_MIN, HOLDTIME_MAX); + } else { + unsigned int n, i; + + n = sfb_ttbl[0].holdtime; + for (i = 0; sfb_ttbl[i].speed != 0; i++) { + if (outbw < sfb_ttbl[i].speed) + break; + n = sfb_ttbl[i].holdtime; + } + holdtime = n; + } + net_nsectimer(&holdtime, &sp->sfb_holdtime); +} + +static void +sfb_calc_pboxtime(struct sfb *sp, u_int64_t outbw) +{ + u_int64_t pboxtime; + + if (sfb_pboxtime != 0) { + pboxtime = sfb_pboxtime; + } else if (outbw == 0) { + pboxtime = SFB_RANDOM(sp, PBOXTIME_MIN, PBOXTIME_MAX); + } else { + unsigned int n, i; + + n = sfb_ttbl[0].pboxtime; + for (i = 0; sfb_ttbl[i].speed != 0; i++) { + if (outbw < sfb_ttbl[i].speed) + break; + n = sfb_ttbl[i].pboxtime; + } + pboxtime = n; + } + net_nsectimer(&pboxtime, &sp->sfb_pboxtime); + net_timerclear(&sp->sfb_pboxfreeze); +} + +static void +sfb_calc_hinterval(struct sfb *sp, u_int64_t *t) +{ + u_int64_t hinterval; + struct timespec now; + + if (t != NULL) { + /* + * TODO adi@apple.com: use dq_avg to derive hinterval. 
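+ * For now, note that when no interval is given the fallback
+ * below randomizes it; with HINTERVAL_MIN/MAX of 10/20 above,
+ * SFB_HINTERVAL() yields a value in the 10-29 second range.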
+ */ + hinterval = *t; + } + + if (sfb_hinterval != 0) + hinterval = sfb_hinterval; + else if (t == NULL || hinterval == 0) + hinterval = ((u_int64_t)SFB_HINTERVAL(sp) * NSEC_PER_SEC); + + net_nsectimer(&hinterval, &sp->sfb_hinterval); + + nanouptime(&now); + net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset); +} + +/* + * sfb support routines + */ +struct sfb * +sfb_alloc(struct ifnet *ifp, u_int32_t qid, u_int32_t qlim, u_int32_t flags) +{ + struct sfb *sp; + + VERIFY(ifp != NULL && qlim > 0); + + sp = zalloc(sfb_zone); + if (sp == NULL) { + log(LOG_ERR, "%s: SFB unable to allocate\n", if_name(ifp)); + return (NULL); + } + + bzero(sp, sfb_size); + if ((sp->sfb_bins = _MALLOC(sizeof (*sp->sfb_bins), M_DEVBUF, + M_WAITOK|M_ZERO)) == NULL) { + log(LOG_ERR, "%s: SFB unable to allocate bins\n", if_name(ifp)); + sfb_destroy(sp); + return (NULL); + } + + if ((sp->sfb_fc_lists = _MALLOC(sizeof (*sp->sfb_fc_lists), M_DEVBUF, + M_WAITOK|M_ZERO)) == NULL) { + log(LOG_ERR, "%s: SFB unable to allocate flow control lists\n", + if_name(ifp)); + sfb_destroy(sp); + return(NULL); + } + + sp->sfb_flags = (flags & SFBF_USERFLAGS); + sp->sfb_ifp = ifp; + sp->sfb_qlim = qlim; + sp->sfb_qid = qid; + + sfb_resetq(sp, -1); + + return (sp); +} + +static void +sfb_fclist_append(struct sfb *sp, struct sfb_fc_list *fcl) +{ + IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd); + ifnet_fclist_append(sp, fcl); +} + +static void +sfb_fclists_clean(struct sfb *sp) +{ + int i; + + /* Move all the flow control entries to the ifnet list */ + for (i = 0; i < SFB_BINS; ++i) { + struct sfb_fc_list *fcl = SFB_FC_LIST(sp, i); + if (!SLIST_EMPTY(fcl)) + sfb_fclist_append(sp, fcl); + } +} + +void +sfb_destroy(struct sfb *sp) +{ + sfb_fclists_clean(sp); + if (sp->sfb_bins != NULL) { + _FREE(sp->sfb_bins, M_DEVBUF); + sp->sfb_bins = NULL; + } + if (sp->sfb_fc_lists != NULL) { + _FREE(sp->sfb_fc_lists, M_DEVBUF); + sp->sfb_fc_lists = NULL; + } + zfree(sfb_zone, sp); +} + +static void +sfb_resetq(struct sfb *sp, cqev_t ev) +{ + struct ifnet *ifp = sp->sfb_ifp; + u_int64_t eff_rate; + + VERIFY(ifp != NULL); + + if (ev != CLASSQ_EV_LINK_DOWN) { + (*sp->sfb_bins)[0].fudge = sfb_random(sp); + (*sp->sfb_bins)[1].fudge = sfb_random(sp); + sp->sfb_allocation = ((sfb_allocation == 0) ? 
+ (sp->sfb_qlim / 3) : sfb_allocation); + sp->sfb_drop_thresh = sp->sfb_allocation + + (sp->sfb_allocation >> 1); + } + + sp->sfb_clearpkts = 0; + sp->sfb_current = 0; + + eff_rate = ifnet_output_linkrate(ifp); + sp->sfb_eff_rate = eff_rate; + + sfb_calc_holdtime(sp, eff_rate); + sfb_calc_pboxtime(sp, eff_rate); + sfb_calc_hinterval(sp, NULL); + + if (ev == CLASSQ_EV_LINK_DOWN || + ev == CLASSQ_EV_LINK_UP) + sfb_fclists_clean(sp); + + bzero(sp->sfb_bins, sizeof (*sp->sfb_bins)); + bzero(&sp->sfb_stats, sizeof (sp->sfb_stats)); + + if (ev == CLASSQ_EV_LINK_DOWN || !classq_verbose) + return; + + log(LOG_DEBUG, "%s: SFB qid=%d, holdtime=%llu nsec, " + "pboxtime=%llu nsec, allocation=%d, drop_thresh=%d, " + "hinterval=%d sec, sfb_bins=%d bytes, eff_rate=%llu bps\n", + if_name(ifp), sp->sfb_qid, (u_int64_t)sp->sfb_holdtime.tv_nsec, + (u_int64_t)sp->sfb_pboxtime.tv_nsec, + (u_int32_t)sp->sfb_allocation, (u_int32_t)sp->sfb_drop_thresh, + (int)sp->sfb_hinterval.tv_sec, (int)sizeof (*sp->sfb_bins), + eff_rate); +} + +void +sfb_getstats(struct sfb *sp, struct sfb_stats *sps) +{ + sps->allocation = sp->sfb_allocation; + sps->dropthresh = sp->sfb_drop_thresh; + sps->clearpkts = sp->sfb_clearpkts; + sps->current = sp->sfb_current; + + net_timernsec(&sp->sfb_holdtime, &sp->sfb_stats.hold_time); + net_timernsec(&sp->sfb_pboxtime, &sp->sfb_stats.pbox_time); + net_timernsec(&sp->sfb_hinterval, &sp->sfb_stats.rehash_intval); + *(&(sps->sfbstats)) = *(&(sp->sfb_stats)); + + _CASSERT(sizeof ((*sp->sfb_bins)[0].stats) == + sizeof (sps->binstats[0].stats)); + + bcopy(&(*sp->sfb_bins)[0].stats, &sps->binstats[0].stats, + sizeof (sps->binstats[0].stats)); + bcopy(&(*sp->sfb_bins)[1].stats, &sps->binstats[1].stats, + sizeof (sps->binstats[1].stats)); +} + +static void +sfb_swap_bins(struct sfb *sp, u_int32_t len) +{ + int i, j, s; + + if (sp->sfb_flags & SFBF_SUSPENDED) + return; + + s = sp->sfb_current; + VERIFY((s + (s ^ 1)) == 1); + + (*sp->sfb_bins)[s].fudge = sfb_random(sp); /* recompute perturbation */ + sp->sfb_clearpkts = len; + sp->sfb_stats.num_rehash++; + + s = (sp->sfb_current ^= 1); /* flip the bit (swap current) */ + + if (classq_verbose) { + log(LOG_DEBUG, "%s: SFB qid=%d, set %d is now current, " + "qlen=%d\n", if_name(sp->sfb_ifp), sp->sfb_qid, s, len); + } + + /* clear freezetime for all current bins */ + bzero(&(*sp->sfb_bins)[s].freezetime, + sizeof ((*sp->sfb_bins)[s].freezetime)); + + /* clear/adjust bin statistics and flow control lists */ + for (i = 0; i < SFB_BINS; i++) { + struct sfb_fc_list *fcl = SFB_FC_LIST(sp, i); + + if (!SLIST_EMPTY(fcl)) + sfb_fclist_append(sp, fcl); + + for (j = 0; j < SFB_LEVELS; j++) { + struct sfbbinstats *cbin, *wbin; + + cbin = SFB_BINST(sp, j, i, s); /* current */ + wbin = SFB_BINST(sp, j, i, s ^ 1); /* warm-up */ + + cbin->pkts = 0; + if (cbin->pmark > SFB_MAX_PMARK) + cbin->pmark = SFB_MAX_PMARK; + if (cbin->pmark < 0) + cbin->pmark = 0; + + /* + * Keep pmark from before to identify + * non-responsives immediately. + */ + if (wbin->pmark > SFB_PMARK_WARM) + wbin->pmark = SFB_PMARK_WARM; + } + } +} + +static inline int +sfb_pcheck(struct sfb *sp, struct pf_mtag *t) +{ +#if SFB_LEVELS != 2 + int i, n; +#endif /* SFB_LEVELS != 2 */ + int s; + + s = sp->sfb_current; + VERIFY((s + (s ^ 1)) == 1); + + /* + * For current bins, returns 1 if all pmark >= SFB_PMARK_TH, + * 0 otherwise; optimize for SFB_LEVELS=2. 
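+ * That is, a flow is eligible for the penalty box only when every
+ * bin it hashes into is marking; with SFB_PMARK_TH at 16056, all
+ * of its bins must be at a marking probability of at least 0.98.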
+ */ +#if SFB_LEVELS == 2 + /* + * Level 0: bin index at [0] for set 0; [2] for set 1 + * Level 1: bin index at [1] for set 0; [3] for set 1 + */ + if (SFB_BINST(sp, 0, SFB_BINMASK(t->pftag_qpriv8[(s << 1)]), + s)->pmark < SFB_PMARK_TH || + SFB_BINST(sp, 1, SFB_BINMASK(t->pftag_qpriv8[(s << 1) + 1]), + s)->pmark < SFB_PMARK_TH) + return (0); +#else /* SFB_LEVELS != 2 */ + for (i = 0; i < SFB_LEVELS; i++) { + if (s == 0) /* set 0, bin index [0,1] */ + n = SFB_BINMASK(t->pftag_qpriv8[i]); + else /* set 1, bin index [2,3] */ + n = SFB_BINMASK(t->pftag_qpriv8[i + 2]); + + if (SFB_BINST(sp, i, n, s)->pmark < SFB_PMARK_TH) + return (0); + } +#endif /* SFB_LEVELS != 2 */ + return (1); +} + +static int +sfb_penalize(struct sfb *sp, struct pf_mtag *t, struct timespec *now) +{ + struct timespec delta = { 0, 0 }; + + /* If minimum pmark of current bins is < SFB_PMARK_TH, we're done */ + if (!sfb_ratelimit || !sfb_pcheck(sp, t)) + return (0); + + net_timersub(now, &sp->sfb_pboxfreeze, &delta); + if (net_timercmp(&delta, &sp->sfb_pboxtime, <)) { +#if SFB_LEVELS != 2 + int i; +#endif /* SFB_LEVELS != 2 */ + struct sfbbinstats *bin; + int n, w; + + w = sp->sfb_current ^ 1; + VERIFY((w + (w ^ 1)) == 1); + + /* + * Update warm-up bins; optimize for SFB_LEVELS=2 + */ +#if SFB_LEVELS == 2 + /* Level 0: bin index at [0] for set 0; [2] for set 1 */ + n = SFB_BINMASK(t->pftag_qpriv8[(w << 1)]); + bin = SFB_BINST(sp, 0, n, w); + if (bin->pkts >= sp->sfb_allocation) + sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, w), now); + + /* Level 0: bin index at [1] for set 0; [3] for set 1 */ + n = SFB_BINMASK(t->pftag_qpriv8[(w << 1) + 1]); + bin = SFB_BINST(sp, 1, n, w); + if (bin->pkts >= sp->sfb_allocation) + sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, w), now); +#else /* SFB_LEVELS != 2 */ + for (i = 0; i < SFB_LEVELS; i++) { + if (w == 0) /* set 0, bin index [0,1] */ + n = SFB_BINMASK(t->pftag_qpriv8[i]); + else /* set 1, bin index [2,3] */ + n = SFB_BINMASK(t->pftag_qpriv8[i + 2]); + + bin = SFB_BINST(sp, i, n, w); + if (bin->pkts >= sp->sfb_allocation) { + sfb_increment_bin(sp, bin, + SFB_BINFT(sp, i, n, w), now); + } + } +#endif /* SFB_LEVELS != 2 */ + return (1); + } + + /* non-conformant or else misclassified flow; queue it anyway */ + t->pftag_flags |= SFB_PKT_PBOX; + *(&sp->sfb_pboxfreeze) = *now; + + return (0); +} + +static void +sfb_adjust_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft, + struct timespec *now, boolean_t inc) +{ + struct timespec delta; + + net_timersub(now, ft, &delta); + if (net_timercmp(&delta, &sp->sfb_holdtime, <)) { + if (classq_verbose > 1) { + log(LOG_DEBUG, "%s: SFB qid=%d, %s update frozen " + "(delta=%llu nsec)\n", if_name(sp->sfb_ifp), + sp->sfb_qid, inc ? 
"increment" : "decrement", + (u_int64_t)delta.tv_nsec); + } + return; + } + + /* increment/decrement marking probability */ + *ft = *now; + if (inc) + SFB_PMARK_INC(bin); + else + SFB_PMARK_DEC(bin); +} + +static void +sfb_decrement_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft, + struct timespec *now) +{ + return (sfb_adjust_bin(sp, bin, ft, now, FALSE)); +} + +static void +sfb_increment_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft, + struct timespec *now) +{ + return (sfb_adjust_bin(sp, bin, ft, now, TRUE)); +} + +static inline void +sfb_dq_update_bins(struct sfb *sp, struct pf_mtag *t, struct timespec *now) +{ +#if SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 + int i; +#endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */ + struct sfbbinstats *bin; + int s, n; + struct sfb_fc_list *fcl = NULL; + + s = sp->sfb_current; + VERIFY((s + (s ^ 1)) == 1); + + /* + * Update current bins; optimize for SFB_LEVELS=2 and SFB_FC_LEVEL=0 + */ +#if SFB_LEVELS == 2 && SFB_FC_LEVEL == 0 + /* Level 0: bin index at [0] for set 0; [2] for set 1 */ + n = SFB_BINMASK(t->pftag_qpriv8[(s << 1)]); + bin = SFB_BINST(sp, 0, n, s); + + VERIFY(bin->pkts > 0); + if (--bin->pkts == 0) { + sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now); + } + if (bin->pkts <= (sp->sfb_allocation >> 2)) { + /* deliver flow control feedback to the sockets */ + fcl = SFB_FC_LIST(sp, n); + if (!SLIST_EMPTY(fcl)) + sfb_fclist_append(sp, fcl); + } + + /* Level 1: bin index at [1] for set 0; [3] for set 1 */ + n = SFB_BINMASK(t->pftag_qpriv8[(s << 1) + 1]); + bin = SFB_BINST(sp, 1, n, s); + + VERIFY(bin->pkts > 0); + if (--bin->pkts == 0) + sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now); +#else /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */ + for (i = 0; i < SFB_LEVELS; i++) { + if (s == 0) /* set 0, bin index [0,1] */ + n = SFB_BINMASK(t->pftag_qpriv8[i]); + else /* set 1, bin index [2,3] */ + n = SFB_BINMASK(t->pftag_qpriv8[i + 2]); + + bin = SFB_BINST(sp, i, n, s); + + VERIFY(bin->pkts > 0); + if (--bin->pkts == 0) { + sfb_decrement_bin(sp, bin, + SFB_BINFT(sp, i, n, s), now); + } + if (bin->pkts <= (sp->sfb_allocation >> 2)) { + /* deliver flow control feedback to the sockets */ + if (i == SFB_FC_LEVEL) { + fcl = SFB_FC_LIST(sp, n); + if (!SLIST_EMPTY(fcl)) + sfb_fclist_append(sp, fcl); + } + } + } +#endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */ +} + +static inline void +sfb_eq_update_bins(struct sfb *sp, struct pf_mtag *t) +{ +#if SFB_LEVELS != 2 + int i, n; +#endif /* SFB_LEVELS != 2 */ + int s; + + s = sp->sfb_current; + VERIFY((s + (s ^ 1)) == 1); + + /* + * Update current bins; optimize for SFB_LEVELS=2 + */ +#if SFB_LEVELS == 2 + /* Level 0: bin index at [0] for set 0; [2] for set 1 */ + SFB_BINST(sp, 0, SFB_BINMASK(t->pftag_qpriv8[(s << 1)]), s)->pkts++; + + /* Level 1: bin index at [1] for set 0; [3] for set 1 */ + SFB_BINST(sp, 1, SFB_BINMASK(t->pftag_qpriv8[(s << 1) + 1]), s)->pkts++; +#else /* SFB_LEVELS != 2 */ + for (i = 0; i < SFB_LEVELS; i++) { + if (s == 0) /* set 0, bin index [0,1] */ + n = SFB_BINMASK(t->pftag_qpriv8[i]); + else /* set 1, bin index [2,3] */ + n = SFB_BINMASK(t->pftag_qpriv8[i + 2]); + + SFB_BINST(sp, i, n, s)->pkts++; + } +#endif /* SFB_LEVELS != 2 */ +} + +static boolean_t +sfb_bin_addfcentry(struct sfb *sp, struct pf_mtag *t) +{ + struct sfb_bin_fcentry *fce; + u_int32_t flowhash; + struct sfb_fc_list *fcl; + int s; + + s = sp->sfb_current; + VERIFY((s + (s ^ 1)) == 1); + + flowhash = t->pftag_flowhash; + + if (flowhash == 0) { + 
sp->sfb_stats.null_flowhash++; + return (FALSE); + } + + /* + * Use value at index 0 for set 0 and + * value at index 2 for set 1 + */ + fcl = SFB_FC_LIST(sp, SFB_BINMASK(t->pftag_qpriv8[(s << 1)])); + SLIST_FOREACH(fce, fcl, fce_link) { + if (fce->fce_flowhash == flowhash) { + /* Already on flow control list; just return */ + return (TRUE); + } + } + + IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd); + fce = ifnet_fce_alloc(M_WAITOK); + if (fce != NULL) { + fce->fce_flowhash = flowhash; + SLIST_INSERT_HEAD(fcl, fce, fce_link); + sp->sfb_stats.flow_controlled++; + } + + return (fce != NULL); +} + +/* + * early-drop probability is kept in pmark of each bin of the flow + */ +static int +sfb_drop_early(struct sfb *sp, struct pf_mtag *t, u_int16_t *pmin, + struct timespec *now) +{ +#if SFB_LEVELS != 2 + int i; +#endif /* SFB_LEVELS != 2 */ + struct sfbbinstats *bin; + int s, n, ret = 0; + + s = sp->sfb_current; + VERIFY((s + (s ^ 1)) == 1); + + *pmin = (u_int16_t)-1; + + /* + * Update current bins; optimize for SFB_LEVELS=2 + */ +#if SFB_LEVELS == 2 + /* Level 0: bin index at [0] for set 0; [2] for set 1 */ + n = SFB_BINMASK(t->pftag_qpriv8[(s << 1)]); + bin = SFB_BINST(sp, 0, n, s); + if (*pmin > (u_int16_t)bin->pmark) + *pmin = (u_int16_t)bin->pmark; + + if (bin->pkts >= sp->sfb_allocation) { + if (bin->pkts >= sp->sfb_drop_thresh) + ret = 1; /* drop or mark */ + sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now); + } + + /* Level 1: bin index at [1] for set 0; [3] for set 1 */ + n = SFB_BINMASK(t->pftag_qpriv8[(s << 1) + 1]); + bin = SFB_BINST(sp, 1, n, s); + if (*pmin > (u_int16_t)bin->pmark) + *pmin = (u_int16_t)bin->pmark; + + if (bin->pkts >= sp->sfb_allocation) { + if (bin->pkts >= sp->sfb_drop_thresh) + ret = 1; /* drop or mark */ + sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now); + } +#else /* SFB_LEVELS != 2 */ + for (i = 0; i < SFB_LEVELS; i++) { + if (s == 0) /* set 0, bin index [0,1] */ + n = SFB_BINMASK(t->pftag_qpriv8[i]); + else /* set 1, bin index [2,3] */ + n = SFB_BINMASK(t->pftag_qpriv8[i + 2]); + + bin = SFB_BINST(sp, i, n, s); + if (*pmin > (u_int16_t)bin->pmark) + *pmin = (u_int16_t)bin->pmark; + + if (bin->pkts >= sp->sfb_allocation) { + if (bin->pkts >= sp->sfb_drop_thresh) + ret = 1; /* drop or mark */ + sfb_increment_bin(sp, bin, + SFB_BINFT(sp, i, n, s), now); + } + } +#endif /* SFB_LEVELS != 2 */ + + if (sp->sfb_flags & SFBF_SUSPENDED) + ret = 1; /* drop or mark */ + + return (ret); +} + +#define DTYPE_NODROP 0 /* no drop */ +#define DTYPE_FORCED 1 /* a "forced" drop */ +#define DTYPE_EARLY 2 /* an "unforced" (early) drop */ + +int +sfb_addq(struct sfb *sp, class_queue_t *q, struct mbuf *m, struct pf_mtag *t) +{ + struct timespec now; + int droptype, s; + u_int16_t pmin; + int fc_adv = 0; + int ret = CLASSQEQ_SUCCESS; + + nanouptime(&now); + + s = sp->sfb_current; + VERIFY((s + (s ^ 1)) == 1); + + /* time to swap the bins? 
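+ * (once sfb_nextreset passes, sfb_swap_bins() gives the
+ * outgoing set a fresh fudge value and flips sfb_current)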
*/ + if (net_timercmp(&now, &sp->sfb_nextreset, >=)) { + net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset); + sfb_swap_bins(sp, qlen(q)); + s = sp->sfb_current; + VERIFY((s + (s ^ 1)) == 1); + } + + t->pftag_flags &= ~SFB_PKT_PBOX; + t->pftag_qpriv16[s] = + (SFB_HASH(&t->pftag_flowhash, sizeof (t->pftag_flowhash), + (*sp->sfb_bins)[s].fudge) & SFB_HASHMASK); + t->pftag_qpriv16[s ^ 1] = + (SFB_HASH(&t->pftag_flowhash, sizeof (t->pftag_flowhash), + (*sp->sfb_bins)[s ^ 1].fudge) & SFB_HASHMASK); + + /* see if we drop early */ + droptype = DTYPE_NODROP; + if (sfb_drop_early(sp, t, &pmin, &now)) { + /* flow control, mark or drop by sfb */ + if ((sp->sfb_flags & SFBF_FLOWCTL) && + (t->pftag_flags & PF_TAG_FLOWADV)) { + fc_adv = 1; + /* drop all during suspension or for non-TCP */ + if ((sp->sfb_flags & SFBF_SUSPENDED) || + !(t->pftag_flags & PF_TAG_TCP)) { + droptype = DTYPE_EARLY; + sp->sfb_stats.drop_early++; + } + } else if ((sp->sfb_flags & SFBF_ECN) && + (t->pftag_flags & PF_TAG_TCP) && /* only for TCP */ + ((sfb_random(sp) & SFB_MAX_PMARK) <= pmin) && + mark_ecn(m, t, sp->sfb_flags) && + !(sp->sfb_flags & SFBF_SUSPENDED)) { + /* successfully marked; do not drop. */ + sp->sfb_stats.marked_packets++; + } else { + /* unforced drop by sfb */ + droptype = DTYPE_EARLY; + sp->sfb_stats.drop_early++; + } + } + + /* non-responsive flow penalty? */ + if (droptype == DTYPE_NODROP && sfb_penalize(sp, t, &now)) { + droptype = DTYPE_FORCED; + sp->sfb_stats.drop_pbox++; + } + + /* if the queue length hits the hard limit, it's a forced drop */ + if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) { + droptype = DTYPE_FORCED; + sp->sfb_stats.drop_queue++; + } + + if (fc_adv == 1 && droptype != DTYPE_FORCED && + sfb_bin_addfcentry(sp, t)) { + /* deliver flow control advisory error */ + if (droptype == DTYPE_NODROP) { + ret = CLASSQEQ_SUCCESS_FC; + VERIFY(!(sp->sfb_flags & SFBF_SUSPENDED)); + } else if (sp->sfb_flags & SFBF_SUSPENDED) { + /* dropped due to suspension */ + ret = CLASSQEQ_DROPPED_SP; + } else { + /* dropped due to flow-control */ + ret = CLASSQEQ_DROPPED_FC; + } + } + + /* if successful enqueue this packet, else drop it */ + if (droptype == DTYPE_NODROP) { + _addq(q, m); + } else { + IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd); + m_freem(m); + return ((ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROPPED); + } + + if (!(t->pftag_flags & SFB_PKT_PBOX)) + sfb_eq_update_bins(sp, t); + else + sp->sfb_stats.pbox_packets++; + + /* successfully queued */ + return (ret); +} + +static struct mbuf * +sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge) +{ + struct timespec now; + struct mbuf *m; + struct pf_mtag *t; + + if (!purge && (sp->sfb_flags & SFBF_SUSPENDED)) + return (NULL); + + nanouptime(&now); + + /* flow of 0 means head of queue */ + if ((m = ((flow == 0) ? _getq(q) : _getq_flow(q, flow))) == NULL) { + if (!purge) + net_timerclear(&sp->sfb_getqtime); + return (NULL); + } + + VERIFY(m->m_flags & M_PKTHDR); + + t = m_pftag(m); + + if (!purge) { + /* calculate EWMA of dequeues */ + if (net_timerisset(&sp->sfb_getqtime)) { + struct timespec delta; + u_int64_t avg, new; + + net_timersub(&now, &sp->sfb_getqtime, &delta); + net_timernsec(&delta, &new); + avg = sp->sfb_stats.dequeue_avg; + if (avg > 0) { + int decay = DEQUEUE_DECAY; + /* + * If the time since last dequeue is + * significantly greater than the current + * average, weight the average more against + * the old value. 
+ */ + if (DEQUEUE_SPIKE(new, avg)) + decay += 5; + avg = (((avg << decay) - avg) + new) >> decay; + } else { + avg = new; + } + sp->sfb_stats.dequeue_avg = avg; + } + *(&sp->sfb_getqtime) = *(&now); + } + + /* + * Clearpkts are the ones which were in the queue when the hash + * function was perturbed. Since the perturbation value (fudge), + * and thus bin information for these packets is not known, we do + * not change accounting information while dequeuing these packets. + * It is important not to set the hash interval too small due to + * this reason. A rule of thumb is to set it to K*D, where D is + * the time taken to drain queue. + */ + if (t->pftag_flags & SFB_PKT_PBOX) { + t->pftag_flags &= ~SFB_PKT_PBOX; + if (sp->sfb_clearpkts > 0) + sp->sfb_clearpkts--; + } else if (sp->sfb_clearpkts > 0) { + sp->sfb_clearpkts--; + } else { + sfb_dq_update_bins(sp, t, &now); + } + + return (m); +} + +struct mbuf * +sfb_getq(struct sfb *sp, class_queue_t *q) +{ + return (sfb_getq_flow(sp, q, 0, FALSE)); +} + +void +sfb_purgeq(struct sfb *sp, class_queue_t *q, u_int32_t flow, u_int32_t *packets, + u_int32_t *bytes) +{ + u_int32_t cnt = 0, len = 0; + struct mbuf *m; + + IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd); + + while ((m = sfb_getq_flow(sp, q, flow, TRUE)) != NULL) { + cnt++; + len += m_pktlen(m); + m_freem(m); + } + + if (packets != NULL) + *packets = cnt; + if (bytes != NULL) + *bytes = len; +} + +void +sfb_updateq(struct sfb *sp, cqev_t ev) +{ + struct ifnet *ifp = sp->sfb_ifp; + + VERIFY(ifp != NULL); + + switch (ev) { + case CLASSQ_EV_LINK_SPEED: { + u_int64_t eff_rate = ifnet_output_linkrate(ifp); + + /* update parameters only if rate has changed */ + if (eff_rate == sp->sfb_eff_rate) + break; + + if (classq_verbose) { + log(LOG_DEBUG, "%s: SFB qid=%d, adapting to new " + "eff_rate=%llu bps\n", if_name(ifp), sp->sfb_qid, + eff_rate); + } + sfb_calc_holdtime(sp, eff_rate); + sfb_calc_pboxtime(sp, eff_rate); + break; + } + + case CLASSQ_EV_LINK_UP: + case CLASSQ_EV_LINK_DOWN: + if (classq_verbose) { + log(LOG_DEBUG, "%s: SFB qid=%d, resetting due to " + "link %s\n", if_name(ifp), sp->sfb_qid, + (ev == CLASSQ_EV_LINK_UP) ? "UP" : "DOWN"); + } + sfb_resetq(sp, ev); + break; + + case CLASSQ_EV_LINK_MTU: + default: + break; + } +} + +int +sfb_suspendq(struct sfb *sp, class_queue_t *q, boolean_t on) +{ +#pragma unused(q) + struct ifnet *ifp = sp->sfb_ifp; + + VERIFY(ifp != NULL); + + if ((on && (sp->sfb_flags & SFBF_SUSPENDED)) || + (!on && !(sp->sfb_flags & SFBF_SUSPENDED))) + return (0); + + if (!(sp->sfb_flags & SFBF_FLOWCTL)) { + log(LOG_ERR, "%s: SFB qid=%d, unable to %s queue since " + "flow-control is not enabled", if_name(ifp), sp->sfb_qid, + (on ? "suspend" : "resume")); + return (ENOTSUP); + } + + if (classq_verbose) { + log(LOG_DEBUG, "%s: SFB qid=%d, setting state to %s", + if_name(ifp), sp->sfb_qid, (on ? "SUSPENDED" : "RUNNING")); + } + + if (on) { + sp->sfb_flags |= SFBF_SUSPENDED; + } else { + sp->sfb_flags &= ~SFBF_SUSPENDED; + sfb_swap_bins(sp, qlen(q)); + } + + return (0); +} diff --git a/bsd/net/classq/classq_sfb.h b/bsd/net/classq/classq_sfb.h new file mode 100644 index 000000000..911ad31ef --- /dev/null +++ b/bsd/net/classq/classq_sfb.h @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_CLASSQ_CLASSQ_SFB_H_ +#define _NET_CLASSQ_CLASSQ_SFB_H_ + +#ifdef PRIVATE +#ifdef BSD_KERNEL_PRIVATE +#include +#include +#include +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define SFB_FP_SHIFT 14 /* fixed-point shift (Q14) */ +#define SFB_LEVELS 2 /* L */ +#define SFB_BINS_SHIFT 5 +#define SFB_BINS (1 << SFB_BINS_SHIFT) /* N */ + +struct sfbstats { + u_int64_t drop_early; + u_int64_t drop_pbox; + u_int64_t drop_queue; + u_int64_t marked_packets; + u_int64_t pbox_packets; + u_int64_t pbox_time; + u_int64_t hold_time; + u_int64_t dequeue_avg; + u_int64_t rehash_intval; + u_int64_t num_rehash; + u_int64_t null_flowhash; + u_int64_t flow_controlled; + u_int64_t flow_feedback; +}; + +struct sfbbinstats { + int16_t pmark; /* marking probability in Q format */ + u_int16_t pkts; /* number of packets */ +}; + +struct sfb_stats { + u_int32_t allocation; + u_int32_t dropthresh; + u_int32_t clearpkts; + u_int32_t current; + struct sfbstats sfbstats; + struct sfbbins { + struct sfbbinstats stats[SFB_LEVELS][SFB_BINS]; + } binstats[2] __attribute__((aligned(8))); +}; + +#ifdef BSD_KERNEL_PRIVATE +struct sfb_bin_fcentry { + SLIST_ENTRY(sfb_bin_fcentry) fce_link; + u_int32_t fce_flowhash; +}; + +SLIST_HEAD(sfb_fc_list, sfb_bin_fcentry); + +struct sfb_bins { + u_int32_t fudge; + struct sfbbinstats stats[SFB_LEVELS][SFB_BINS]; + struct timespec freezetime[SFB_LEVELS][SFB_BINS]; +}; + +/* SFB flags */ +#define SFBF_ECN4 0x01 /* use packet marking for IPv4 packets */ +#define SFBF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define SFBF_ECN (SFBF_ECN4 | SFBF_ECN6) +#define SFBF_FLOWCTL 0x04 /* enable flow control advisories */ +#define SFBF_SUSPENDED 0x1000 /* queue is suspended */ + +#define SFBF_USERFLAGS \ + (SFBF_ECN4 | SFBF_ECN6 | SFBF_FLOWCTL) + +typedef struct sfb { + /* variables for internal use */ + u_int32_t sfb_flags; /* SFB flags */ + u_int32_t sfb_qlim; + u_int32_t sfb_qid; + u_int16_t sfb_allocation; + u_int16_t sfb_drop_thresh; + u_int32_t sfb_clearpkts; + u_int64_t sfb_eff_rate; /* last known effective rate */ + struct timespec sfb_getqtime; /* last dequeue timestamp */ + struct timespec sfb_holdtime; /* random holdtime in nsec */ + struct ifnet *sfb_ifp; /* back pointer to ifnet */ + + /* moving hash function */ + struct timespec sfb_hinterval; /* random reset interval in sec */ + struct timespec sfb_nextreset; /* reset deadline */ + + /* penalty box */ + struct timespec sfb_pboxtime; /* random 
pboxtime in nsec */ + struct timespec sfb_pboxfreeze; + + /* B[L][N] bins (2 sets: current and warm-up) */ + u_int32_t sfb_current; /* current set (0 or 1) */ + struct sfb_bins (*sfb_bins)[2]; + + /* Flow control lists for current set */ + struct sfb_fc_list (*sfb_fc_lists)[SFB_BINS]; + + /* statistics */ + struct sfbstats sfb_stats __attribute__((aligned(8))); +} sfb_t; + +extern void sfb_init(void); +extern struct sfb *sfb_alloc(struct ifnet *, u_int32_t, u_int32_t, u_int32_t); +extern void sfb_destroy(struct sfb *); +extern int sfb_addq(struct sfb *, class_queue_t *, struct mbuf *, + struct pf_mtag *); +extern struct mbuf *sfb_getq(struct sfb *, class_queue_t *); +extern void sfb_purgeq(struct sfb *, class_queue_t *, u_int32_t, + u_int32_t *, u_int32_t *); +extern void sfb_getstats(struct sfb *, struct sfb_stats *); +extern void sfb_updateq(struct sfb *, cqev_t); +extern int sfb_suspendq(struct sfb *, class_queue_t *, boolean_t); +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef __cplusplus +} +#endif +#endif /* PRIVATE */ +#endif /* _NET_CLASSQ_CLASSQ_SFB_H_ */ diff --git a/bsd/net/classq/classq_subr.c b/bsd/net/classq/classq_subr.c new file mode 100644 index 000000000..738c86e23 --- /dev/null +++ b/bsd/net/classq/classq_subr.c @@ -0,0 +1,794 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#if CLASSQ_RED +#include +#endif /* CLASSQ_RED */ +#if CLASSQ_RIO +#include +#endif /* CLASSQ_RIO */ +#if CLASSQ_BLUE +#include +#endif /* CLASSQ_BLUE */ +#include +#include + +#include + +#if PF_ALTQ +#include +#endif /* PF_ALTQ */ + +static errno_t ifclassq_dequeue_common(struct ifclassq *, mbuf_svc_class_t, + u_int32_t, struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *, + boolean_t); +static struct mbuf *ifclassq_poll_common(struct ifclassq *, + mbuf_svc_class_t, boolean_t); +static struct mbuf *ifclassq_tbr_dequeue_common(struct ifclassq *, int, + mbuf_svc_class_t, boolean_t); + +void +classq_init(void) +{ + _CASSERT(MBUF_TC_BE == 0); + _CASSERT(MBUF_SC_BE == 0); + _CASSERT(IFCQ_SC_MAX == MBUF_SC_MAX_CLASSES); + +#if CLASSQ_RED + red_init(); +#endif /* CLASSQ_RED */ +#if CLASSQ_RIO + rio_init(); +#endif /* CLASSQ_RIO */ +#if CLASSQ_BLUE + blue_init(); +#endif /* CLASSQ_BLUE */ + sfb_init(); +} + +int +ifclassq_setup(struct ifnet *ifp, u_int32_t sflags, boolean_t reuse) +{ +#pragma unused(reuse) + struct ifclassq *ifq = &ifp->if_snd; + int err = 0; + + IFCQ_LOCK(ifq); + VERIFY(IFCQ_IS_EMPTY(ifq)); + ifq->ifcq_ifp = ifp; + IFCQ_LEN(ifq) = 0; + bzero(&ifq->ifcq_xmitcnt, sizeof (ifq->ifcq_xmitcnt)); + bzero(&ifq->ifcq_dropcnt, sizeof (ifq->ifcq_dropcnt)); + + VERIFY(!IFCQ_TBR_IS_ENABLED(ifq)); + VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE); + VERIFY(ifq->ifcq_flags == 0); + VERIFY(ifq->ifcq_sflags == 0); + VERIFY(ifq->ifcq_disc == NULL); + VERIFY(ifq->ifcq_enqueue == NULL); + VERIFY(ifq->ifcq_dequeue == NULL); + VERIFY(ifq->ifcq_dequeue_sc == NULL); + VERIFY(ifq->ifcq_request == NULL); + + if (ifp->if_eflags & IFEF_TXSTART) { + u_int32_t maxlen = 0; + + if ((maxlen = IFCQ_MAXLEN(ifq)) == 0) + maxlen = if_sndq_maxlen; + IFCQ_SET_MAXLEN(ifq, maxlen); + + ifq->ifcq_sflags = sflags; + err = ifclassq_pktsched_setup(ifq); + if (err == 0) + ifq->ifcq_flags = (IFCQF_READY | IFCQF_ENABLED); + } + +#if PF_ALTQ + ifq->ifcq_drain = 0; + IFCQ_ALTQ(ifq)->altq_ifcq = ifq; + VERIFY(IFCQ_ALTQ(ifq)->altq_type == ALTQT_NONE); + VERIFY(IFCQ_ALTQ(ifq)->altq_flags == 0); + VERIFY(IFCQ_ALTQ(ifq)->altq_disc == NULL); + VERIFY(IFCQ_ALTQ(ifq)->altq_enqueue == NULL); + VERIFY(IFCQ_ALTQ(ifq)->altq_dequeue == NULL); + VERIFY(IFCQ_ALTQ(ifq)->altq_dequeue_sc == NULL); + VERIFY(IFCQ_ALTQ(ifq)->altq_request == NULL); + + if ((ifp->if_eflags & IFEF_TXSTART) && + ifp->if_output_sched_model != IFNET_SCHED_MODEL_DRIVER_MANAGED) + ALTQ_SET_READY(IFCQ_ALTQ(ifq)); + else + ALTQ_CLEAR_READY(IFCQ_ALTQ(ifq)); +#endif /* PF_ALTQ */ + IFCQ_UNLOCK(ifq); + + return (err); +} + +void +ifclassq_teardown(struct ifnet *ifp) +{ + struct ifclassq *ifq = &ifp->if_snd; + + IFCQ_LOCK(ifq); +#if PF_ALTQ + if (ALTQ_IS_READY(IFCQ_ALTQ(ifq))) { + if (ALTQ_IS_ENABLED(IFCQ_ALTQ(ifq))) + altq_disable(IFCQ_ALTQ(ifq)); + if (ALTQ_IS_ATTACHED(IFCQ_ALTQ(ifq))) + altq_detach(IFCQ_ALTQ(ifq)); + IFCQ_ALTQ(ifq)->altq_flags = 0; + } + ifq->ifcq_drain = 0; + IFCQ_ALTQ(ifq)->altq_ifcq = NULL; + VERIFY(IFCQ_ALTQ(ifq)->altq_type == ALTQT_NONE); + VERIFY(IFCQ_ALTQ(ifq)->altq_flags == 0); + VERIFY(IFCQ_ALTQ(ifq)->altq_disc == NULL); + VERIFY(IFCQ_ALTQ(ifq)->altq_enqueue == NULL); + VERIFY(IFCQ_ALTQ(ifq)->altq_dequeue == NULL); + VERIFY(IFCQ_ALTQ(ifq)->altq_dequeue_sc == NULL); + VERIFY(IFCQ_ALTQ(ifq)->altq_request == NULL); +#endif /* PF_ALTQ */ + + if (IFCQ_IS_READY(ifq)) { + if 
(IFCQ_TBR_IS_ENABLED(ifq)) { + struct tb_profile tb = { 0, 0, 0 }; + (void) ifclassq_tbr_set(ifq, &tb, FALSE); + } + (void) pktsched_teardown(ifq); + ifq->ifcq_flags = 0; + } + ifq->ifcq_sflags = 0; + + VERIFY(IFCQ_IS_EMPTY(ifq)); + VERIFY(!IFCQ_TBR_IS_ENABLED(ifq)); + VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE); + VERIFY(ifq->ifcq_flags == 0); + VERIFY(ifq->ifcq_sflags == 0); + VERIFY(ifq->ifcq_disc == NULL); + VERIFY(ifq->ifcq_enqueue == NULL); + VERIFY(ifq->ifcq_dequeue == NULL); + VERIFY(ifq->ifcq_dequeue_sc == NULL); + VERIFY(ifq->ifcq_request == NULL); + IFCQ_LEN(ifq) = 0; + IFCQ_MAXLEN(ifq) = 0; + bzero(&ifq->ifcq_xmitcnt, sizeof (ifq->ifcq_xmitcnt)); + bzero(&ifq->ifcq_dropcnt, sizeof (ifq->ifcq_dropcnt)); + + IFCQ_UNLOCK(ifq); +} + +int +ifclassq_pktsched_setup(struct ifclassq *ifq) +{ + struct ifnet *ifp = ifq->ifcq_ifp; + int err = 0; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(ifp->if_eflags & IFEF_TXSTART); + + switch (ifp->if_output_sched_model) { + case IFNET_SCHED_MODEL_DRIVER_MANAGED: + err = pktsched_setup(ifq, PKTSCHEDT_TCQ, ifq->ifcq_sflags); + break; + + case IFNET_SCHED_MODEL_NORMAL: + err = pktsched_setup(ifq, PKTSCHEDT_QFQ, ifq->ifcq_sflags); + break; + + default: + VERIFY(0); + /* NOTREACHED */ + } + + return (err); +} + +void +ifclassq_set_maxlen(struct ifclassq *ifq, u_int32_t maxqlen) +{ + IFCQ_LOCK(ifq); + if (maxqlen == 0) + maxqlen = if_sndq_maxlen; + IFCQ_SET_MAXLEN(ifq, maxqlen); + IFCQ_UNLOCK(ifq); +} + +u_int32_t +ifclassq_get_maxlen(struct ifclassq *ifq) +{ + return (IFCQ_MAXLEN(ifq)); +} + +u_int32_t +ifclassq_get_len(struct ifclassq *ifq) +{ + return (IFCQ_LEN(ifq)); +} + +errno_t +ifclassq_enqueue(struct ifclassq *ifq, struct mbuf *m) +{ + errno_t err; + + IFCQ_LOCK_SPIN(ifq); + +#if PF_ALTQ + if (ALTQ_IS_ENABLED(IFCQ_ALTQ(ifq))) { + ALTQ_ENQUEUE(IFCQ_ALTQ(ifq), m, err); + } else { + u_int32_t qlen = IFCQ_LEN(ifq); + IFCQ_ENQUEUE(ifq, m, err); + if (IFCQ_LEN(ifq) > qlen) + ifq->ifcq_drain += (IFCQ_LEN(ifq) - qlen); + } +#else /* !PF_ALTQ */ + IFCQ_ENQUEUE(ifq, m, err); +#endif /* PF_ALTQ */ + + IFCQ_UNLOCK(ifq); + + return (err); +} + +errno_t +ifclassq_dequeue(struct ifclassq *ifq, u_int32_t limit, struct mbuf **head, + struct mbuf **tail, u_int32_t *cnt, u_int32_t *len) +{ + return (ifclassq_dequeue_common(ifq, MBUF_SC_UNSPEC, limit, head, tail, + cnt, len, FALSE)); +} + +errno_t +ifclassq_dequeue_sc(struct ifclassq *ifq, mbuf_svc_class_t sc, + u_int32_t limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, + u_int32_t *len) +{ + return (ifclassq_dequeue_common(ifq, sc, limit, head, tail, + cnt, len, TRUE)); +} + +static errno_t +ifclassq_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, + u_int32_t limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, + u_int32_t *len, boolean_t drvmgt) +{ + struct ifnet *ifp = ifq->ifcq_ifp; + u_int32_t i = 0, l = 0; + struct mbuf **first, *last; +#if PF_ALTQ + struct ifaltq *altq = IFCQ_ALTQ(ifq); + boolean_t draining; +#endif /* PF_ALTQ */ + + VERIFY(!drvmgt || MBUF_VALID_SC(sc)); + + *head = NULL; + first = &(*head); + last = NULL; + + ifq = &ifp->if_snd; + IFCQ_LOCK_SPIN(ifq); + + while (i < limit) { + u_int64_t pktlen; +#if PF_ALTQ + u_int32_t qlen; + + qlen = IFCQ_LEN(ifq); + draining = IFCQ_IS_DRAINING(ifq); + + if (drvmgt) { + if (IFCQ_TBR_IS_ENABLED(ifq)) + IFCQ_TBR_DEQUEUE_SC(ifq, sc, *head); + else if (draining) + IFCQ_DEQUEUE_SC(ifq, sc, *head); + else if (ALTQ_IS_ENABLED(altq)) + ALTQ_DEQUEUE_SC(altq, sc, *head); + else + *head = NULL; + } else { + if (IFCQ_TBR_IS_ENABLED(ifq)) + 
IFCQ_TBR_DEQUEUE(ifq, *head); + else if (draining) + IFCQ_DEQUEUE(ifq, *head); + else if (ALTQ_IS_ENABLED(altq)) + ALTQ_DEQUEUE(altq, *head); + else + *head = NULL; + } + + if (draining && *head != NULL) { + VERIFY(ifq->ifcq_drain >= (qlen - IFCQ_LEN(ifq))); + ifq->ifcq_drain -= (qlen - IFCQ_LEN(ifq)); + } +#else /* ! PF_ALTQ */ + if (drvmgt) { + if (IFCQ_TBR_IS_ENABLED(ifq)) + IFCQ_TBR_DEQUEUE_SC(ifq, sc, *head); + else + IFCQ_DEQUEUE_SC(ifq, sc, *head); + } else { + if (IFCQ_TBR_IS_ENABLED(ifq)) + IFCQ_TBR_DEQUEUE(ifq, *head); + else + IFCQ_DEQUEUE(ifq, *head); + } +#endif /* !PF_ALTQ */ + + if (*head == NULL) + break; + + (*head)->m_nextpkt = NULL; + last = *head; + + l += (*head)->m_pkthdr.len; + pktlen = (*head)->m_pkthdr.len; + + (*head)->m_pkthdr.pf_mtag.pftag_pktseq = + atomic_add_64_ov(&(ifp->if_bw.cur_seq), pktlen); + + head = &(*head)->m_nextpkt; + i++; + } + + IFCQ_UNLOCK(ifq); + + if (tail != NULL) + *tail = last; + if (cnt != NULL) + *cnt = i; + if (len != NULL) + *len = l; + + return ((*first != NULL) ? 0 : EAGAIN); +} + +struct mbuf * +ifclassq_poll(struct ifclassq *ifq) +{ + return (ifclassq_poll_common(ifq, MBUF_SC_UNSPEC, FALSE)); +} + +struct mbuf * +ifclassq_poll_sc(struct ifclassq *ifq, mbuf_svc_class_t sc) +{ + return (ifclassq_poll_common(ifq, sc, TRUE)); +} + +static struct mbuf * +ifclassq_poll_common(struct ifclassq *ifq, mbuf_svc_class_t sc, + boolean_t drvmgt) +{ +#if PF_ALTQ + struct ifaltq *altq = IFCQ_ALTQ(ifq); +#endif /* PF_ALTQ */ + struct mbuf *m; + + VERIFY(!drvmgt || MBUF_VALID_SC(sc)); + +#if PF_ALTQ + if (drvmgt) { + if (IFCQ_TBR_IS_ENABLED(ifq)) + IFCQ_TBR_POLL_SC(ifq, sc, m); + else if (IFCQ_IS_DRAINING(ifq)) + IFCQ_POLL_SC(ifq, sc, m); + else if (ALTQ_IS_ENABLED(altq)) + ALTQ_POLL_SC(altq, sc, m); + else + m = NULL; + } else { + if (IFCQ_TBR_IS_ENABLED(ifq)) + IFCQ_TBR_POLL(ifq, m); + else if (IFCQ_IS_DRAINING(ifq)) + IFCQ_POLL(ifq, m); + else if (ALTQ_IS_ENABLED(altq)) + ALTQ_POLL(altq, m); + else + m = NULL; + } +#else /* ! 
PF_ALTQ */ + if (drvmgt) { + if (IFCQ_TBR_IS_ENABLED(ifq)) + IFCQ_TBR_POLL_SC(ifq, sc, m); + else + IFCQ_POLL_SC(ifq, sc, m); + } else { + if (IFCQ_TBR_IS_ENABLED(ifq)) + IFCQ_TBR_POLL(ifq, m); + else + IFCQ_POLL(ifq, m); + } +#endif /* !PF_ALTQ */ + + return (m); +} + +void +ifclassq_update(struct ifclassq *ifq, cqev_t ev) +{ + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(IFCQ_IS_READY(ifq)); + +#if PF_ALTQ + if (ALTQ_IS_ENABLED(IFCQ_ALTQ(ifq))) + ALTQ_UPDATE(IFCQ_ALTQ(ifq), ev); +#endif /* PF_ALTQ */ + IFCQ_UPDATE(ifq, ev); +} + +int +ifclassq_attach(struct ifclassq *ifq, u_int32_t type, void *discipline, + ifclassq_enq_func enqueue, ifclassq_deq_func dequeue, + ifclassq_deq_sc_func dequeue_sc, ifclassq_req_func request) +{ + IFCQ_LOCK_ASSERT_HELD(ifq); + + VERIFY(ifq->ifcq_disc == NULL); + VERIFY(enqueue != NULL); + VERIFY(!(dequeue != NULL && dequeue_sc != NULL)); + VERIFY(request != NULL); + + ifq->ifcq_type = type; + ifq->ifcq_disc = discipline; + ifq->ifcq_enqueue = enqueue; + ifq->ifcq_dequeue = dequeue; + ifq->ifcq_dequeue_sc = dequeue_sc; + ifq->ifcq_request = request; + + return (0); +} + +int +ifclassq_detach(struct ifclassq *ifq) +{ + IFCQ_LOCK_ASSERT_HELD(ifq); + + VERIFY(ifq->ifcq_disc == NULL); + + ifq->ifcq_type = PKTSCHEDT_NONE; + ifq->ifcq_disc = NULL; + ifq->ifcq_enqueue = NULL; + ifq->ifcq_dequeue = NULL; + ifq->ifcq_dequeue_sc = NULL; + ifq->ifcq_request = NULL; + + return (0); +} + +int +ifclassq_getqstats(struct ifclassq *ifq, u_int32_t qid, void *ubuf, + u_int32_t *nbytes) +{ + struct if_ifclassq_stats *ifqs; + int err; + + if (*nbytes < sizeof (*ifqs)) + return (EINVAL); + + ifqs = _MALLOC(sizeof (*ifqs), M_TEMP, M_WAITOK | M_ZERO); + if (ifqs == NULL) + return (ENOMEM); + + IFCQ_LOCK(ifq); + if (!IFCQ_IS_READY(ifq)) { + IFCQ_UNLOCK(ifq); + _FREE(ifqs, M_TEMP); + return (ENXIO); + } + + ifqs->ifqs_len = IFCQ_LEN(ifq); + ifqs->ifqs_maxlen = IFCQ_MAXLEN(ifq); + *(&ifqs->ifqs_xmitcnt) = *(&ifq->ifcq_xmitcnt); + *(&ifqs->ifqs_dropcnt) = *(&ifq->ifcq_dropcnt); + ifqs->ifqs_scheduler = ifq->ifcq_type; + + err = pktsched_getqstats(ifq, qid, ifqs); + IFCQ_UNLOCK(ifq); + + if (err == 0 && (err = copyout((caddr_t)ifqs, + (user_addr_t)(uintptr_t)ubuf, sizeof (*ifqs))) == 0) + *nbytes = sizeof (*ifqs); + + _FREE(ifqs, M_TEMP); + + return (err); +} + +const char * +ifclassq_ev2str(cqev_t ev) +{ + const char *c; + + switch (ev) { + case CLASSQ_EV_LINK_SPEED: + c = "LINK_SPEED"; + break; + + case CLASSQ_EV_LINK_MTU: + c = "LINK_MTU"; + break; + + case CLASSQ_EV_LINK_UP: + c = "LINK_UP"; + break; + + case CLASSQ_EV_LINK_DOWN: + c = "LINK_DOWN"; + break; + + default: + c = "UNKNOWN"; + break; + } + + return (c); +} + +/* + * internal representation of token bucket parameters + * rate: byte_per_unittime << 32 + * (((bits_per_sec) / 8) << 32) / machclk_freq + * depth: byte << 32 + * + */ +#define TBR_SHIFT 32 +#define TBR_SCALE(x) ((int64_t)(x) << TBR_SHIFT) +#define TBR_UNSCALE(x) ((x) >> TBR_SHIFT) + +struct mbuf * +ifclassq_tbr_dequeue(struct ifclassq *ifq, int op) +{ + return (ifclassq_tbr_dequeue_common(ifq, op, MBUF_SC_UNSPEC, FALSE)); +} + +struct mbuf * +ifclassq_tbr_dequeue_sc(struct ifclassq *ifq, int op, mbuf_svc_class_t sc) +{ + return (ifclassq_tbr_dequeue_common(ifq, op, sc, TRUE)); +} + +static struct mbuf * +ifclassq_tbr_dequeue_common(struct ifclassq *ifq, int op, + mbuf_svc_class_t sc, boolean_t drvmgt) +{ + struct tb_regulator *tbr; + struct mbuf *m; + int64_t interval; + u_int64_t now; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + VERIFY(!drvmgt || MBUF_VALID_SC(sc)); + 
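+ /* + * Editor's sketch (not part of the original patch): the Q32 + * fixed-point arithmetic used by the refill below, + * tbr_token += interval * tbr_rate, can be exercised in + * isolation; machclk_freq is assumed to be 1 GHz here, while + * the kernel measures the real value at boot. + * + * #include <stdint.h> + * #include <stdio.h> + * #define TBR_SHIFT 32 + * #define TBR_SCALE(x) ((int64_t)(x) << TBR_SHIFT) + * #define TBR_UNSCALE(x) ((x) >> TBR_SHIFT) + * int main(void) + * { + * uint64_t machclk_freq = 1000000000ULL; // assumed 1 GHz + * uint64_t rate_bps = 100000000ULL; // 100 Mbps profile + * // scaled bytes per tick, as computed in ifclassq_tbr_set() + * int64_t tbr_rate = TBR_SCALE(rate_bps / 8) / machclk_freq; + * int64_t tokens = 10000 * tbr_rate; // 10000 ticks = 10 us + * // prints ~125 bytes, i.e. 12.5 MB/s for 10 us + * printf("%lld bytes\n", (long long)TBR_UNSCALE(tokens)); + * return (0); + * } + */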
VERIFY(IFCQ_TBR_IS_ENABLED(ifq)); + + tbr = &ifq->ifcq_tbr; + if (op == CLASSQDQ_REMOVE && tbr->tbr_lastop == CLASSQDQ_POLL) { + /* if this is a remove after poll, bypass tbr check */ + } else { + /* update token only when it is negative */ + if (tbr->tbr_token <= 0) { + now = read_machclk(); + interval = now - tbr->tbr_last; + if (interval >= tbr->tbr_filluptime) { + tbr->tbr_token = tbr->tbr_depth; + } else { + tbr->tbr_token += interval * tbr->tbr_rate; + if (tbr->tbr_token > tbr->tbr_depth) + tbr->tbr_token = tbr->tbr_depth; + } + tbr->tbr_last = now; + } + /* if token is still negative, don't allow dequeue */ + if (tbr->tbr_token <= 0) + return (NULL); + } + + /* + * ifclassq takes precedence over ALTQ queue; + * ifcq_drain count is adjusted by the caller. + */ +#if PF_ALTQ + if (IFCQ_IS_DRAINING(ifq)) { +#endif /* PF_ALTQ */ + if (op == CLASSQDQ_POLL) { + if (drvmgt) + IFCQ_POLL_SC(ifq, sc, m); + else + IFCQ_POLL(ifq, m); + } else { + if (drvmgt) + IFCQ_DEQUEUE_SC(ifq, sc, m); + else + IFCQ_DEQUEUE(ifq, m); + } +#if PF_ALTQ + } else { + struct ifaltq *altq = IFCQ_ALTQ(ifq); + if (ALTQ_IS_ENABLED(altq)) { + if (drvmgt) + m = (*altq->altq_dequeue_sc)(altq, sc, op); + else + m = (*altq->altq_dequeue)(altq, op); + } else { + m = NULL; + } + } +#endif /* PF_ALTQ */ + + if (m != NULL && op == CLASSQDQ_REMOVE) + tbr->tbr_token -= TBR_SCALE(m_pktlen(m)); + tbr->tbr_lastop = op; + + return (m); +} + +/* + * set a token bucket regulator. + * if the specified rate is zero, the token bucket regulator is deleted. + */ +int +ifclassq_tbr_set(struct ifclassq *ifq, struct tb_profile *profile, + boolean_t update) +{ + struct tb_regulator *tbr; + struct ifnet *ifp = ifq->ifcq_ifp; + u_int64_t rate, old_rate; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(IFCQ_IS_READY(ifq)); + + VERIFY(machclk_freq != 0); + + tbr = &ifq->ifcq_tbr; + old_rate = tbr->tbr_rate_raw; + + rate = profile->rate; + if (profile->percent > 0) { + u_int64_t eff_rate; + + if (profile->percent > 100) + return (EINVAL); + if ((eff_rate = ifp->if_output_bw.eff_bw) == 0) + return (ENODEV); + rate = (eff_rate * profile->percent) / 100; + } + + if (rate == 0) { + if (!IFCQ_TBR_IS_ENABLED(ifq)) + return (ENOENT); + + if (pktsched_verbose) + printf("%s: TBR disabled\n", if_name(ifp)); + + /* disable this TBR */ + ifq->ifcq_flags &= ~IFCQF_TBR; + bzero(tbr, sizeof (*tbr)); + ifnet_set_start_cycle(ifp, NULL); + if (update) + ifclassq_update(ifq, CLASSQ_EV_LINK_SPEED); + return (0); + } + + if (pktsched_verbose) { + printf("%s: TBR %s (rate %llu bps depth %u)\n", if_name(ifp), + (ifq->ifcq_flags & IFCQF_TBR) ? "reconfigured" : + "enabled", rate, profile->depth); + } + + /* set the new TBR */ + bzero(tbr, sizeof (*tbr)); + tbr->tbr_rate_raw = rate; + tbr->tbr_percent = profile->percent; + ifq->ifcq_flags |= IFCQF_TBR; + + /* + * Note that the TBR fill up time (hence the ifnet restart time) + * is directly related to the specified TBR depth. The ideal + * depth value should be computed such that the interval time + * between each successive wakeup is adequately spaced apart, + * in order to reduce scheduling overheads. A target interval + * of 10 ms seems to provide good performance balance. This can be + * overridden by specifying the depth profile. Values smaller than + * the ideal depth will reduce delay at the expense of CPU cycles. 
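+ * + * (Editor's worked example, not in the original patch: assuming a + * 1500-byte MTU on a 1 Gbps link, draining one MTU at 125 MB/s + * takes 12 us, so the loop below settles on idepth at roughly + * 10 ms / 12 us = 834 MTUs, or about 1.25 MB of bucket depth.)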
+ */ + tbr->tbr_rate = TBR_SCALE(rate / 8) / machclk_freq; + if (tbr->tbr_rate > 0) { + u_int32_t mtu = ifp->if_mtu; + int64_t ival, idepth = 0; + int i; + + if (mtu < IF_MINMTU) + mtu = IF_MINMTU; + + ival = pktsched_nsecs_to_abstime(10 * NSEC_PER_MSEC); /* 10ms */ + + for (i = 1; ; i++) { + idepth = TBR_SCALE(i * mtu); + if ((idepth / tbr->tbr_rate) > ival) + break; + } + VERIFY(idepth > 0); + + tbr->tbr_depth = TBR_SCALE(profile->depth); + if (tbr->tbr_depth == 0) { + tbr->tbr_filluptime = idepth / tbr->tbr_rate; + /* a little fudge factor to get closer to rate */ + tbr->tbr_depth = idepth + (idepth >> 3); + } else { + tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate; + } + } else { + tbr->tbr_depth = TBR_SCALE(profile->depth); + tbr->tbr_filluptime = 0xffffffffffffffffLL; + } + tbr->tbr_token = tbr->tbr_depth; + tbr->tbr_last = read_machclk(); + tbr->tbr_lastop = CLASSQDQ_REMOVE; + + if (tbr->tbr_rate > 0 && (ifp->if_flags & IFF_UP)) { + struct timespec ts = + { 0, pktsched_abs_to_nsecs(tbr->tbr_filluptime) }; + if (pktsched_verbose) { + printf("%s: TBR calculated tokens %lld " + "filluptime %llu ns\n", if_name(ifp), + TBR_UNSCALE(tbr->tbr_token), + pktsched_abs_to_nsecs(tbr->tbr_filluptime)); + } + ifnet_set_start_cycle(ifp, &ts); + } else { + if (pktsched_verbose) { + if (tbr->tbr_rate == 0) { + printf("%s: TBR calculated tokens %lld " + "infinite filluptime\n", if_name(ifp), + TBR_UNSCALE(tbr->tbr_token)); + } else if (!(ifp->if_flags & IFF_UP)) { + printf("%s: TBR suspended (link is down)\n", + if_name(ifp)); + } + } + ifnet_set_start_cycle(ifp, NULL); + } + if (update && tbr->tbr_rate_raw != old_rate) + ifclassq_update(ifq, CLASSQ_EV_LINK_SPEED); + + return (0); +} diff --git a/bsd/net/classq/classq_util.c b/bsd/net/classq/classq_util.c new file mode 100644 index 000000000..e8bf3d5bd --- /dev/null +++ b/bsd/net/classq/classq_util.c @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2007-2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#if INET6 +#include +#endif +#include +#include + +#include + +/* + * read and write diffserv field in IPv4 or IPv6 header + */ +u_int8_t +read_dsfield(struct mbuf *m, struct pf_mtag *t) +{ + struct mbuf *m0; + u_int8_t ds_field = 0; + + if (t->pftag_hdr == NULL || + !(t->pftag_flags & (PF_TAG_HDR_INET|PF_TAG_HDR_INET6))) + return ((u_int8_t)0); + + /* verify that hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if (((caddr_t)t->pftag_hdr >= m0->m_data) && + ((caddr_t)t->pftag_hdr < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { + /* ick, tag info is stale */ + printf("%s: can't locate header!\n", __func__); + return ((u_int8_t)0); + } + + if (t->pftag_flags & PF_TAG_HDR_INET) { + struct ip *ip = (struct ip *)(void *)t->pftag_hdr; + + if (((uintptr_t)ip + sizeof (*ip)) > + ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) + return (0); /* out of bounds */ + + if (ip->ip_v != 4) + return ((u_int8_t)0); /* version mismatch! */ + ds_field = ip->ip_tos; + } +#if INET6 + else if (t->pftag_flags & PF_TAG_HDR_INET6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)(void *)t->pftag_hdr; + u_int32_t flowlabel; + + if (((uintptr_t)ip6 + sizeof (*ip6)) > + ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) + return (0); /* out of bounds */ + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return ((u_int8_t)0); /* version mismatch! 
*/ + ds_field = (flowlabel >> 20) & 0xff; + } +#endif + return (ds_field); +} + +void +write_dsfield(struct mbuf *m, struct pf_mtag *t, u_int8_t dsfield) +{ + struct mbuf *m0; + + if (t->pftag_hdr == NULL || + !(t->pftag_flags & (PF_TAG_HDR_INET|PF_TAG_HDR_INET6))) + return; + + /* verify that hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if (((caddr_t)t->pftag_hdr >= m0->m_data) && + ((caddr_t)t->pftag_hdr < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { + /* ick, tag info is stale */ + printf("%s: can't locate header!\n", __func__); + return; + } + + if (t->pftag_flags & PF_TAG_HDR_INET) { + struct ip *ip = (struct ip *)(void *)t->pftag_hdr; + u_int8_t old; + int32_t sum; + + if (((uintptr_t)ip + sizeof (*ip)) > + ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) + return; /* out of bounds */ + + if (ip->ip_v != 4) + return; /* version mismatch! */ + old = ip->ip_tos; + dsfield |= old & 3; /* leave CU bits */ + if (old == dsfield) + return; + ip->ip_tos = dsfield; + /* + * update checksum (from RFC1624) + * HC' = ~(~HC + ~m + m') + */ + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += 0xff00 + (~old & 0xff) + dsfield; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + + ip->ip_sum = htons(~sum & 0xffff); + } +#if INET6 + else if (t->pftag_flags & PF_TAG_HDR_INET6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)t->pftag_hdr; + u_int32_t flowlabel; + + if (((uintptr_t)ip6 + sizeof (*ip6)) > + ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) + return; /* out of bounds */ + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return; /* version mismatch! */ + flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20); + ip6->ip6_flow = htonl(flowlabel); + } +#endif +} + +/* + * try to mark CE bit to the packet. + * returns 1 if successfully marked, 0 otherwise. + */ +int +mark_ecn(struct mbuf *m, struct pf_mtag *t, int flags) +{ + struct mbuf *m0; + void *hdr; + int af; + + if ((hdr = t->pftag_hdr) == NULL || + !(t->pftag_flags & (PF_TAG_HDR_INET|PF_TAG_HDR_INET6))) + return (0); + + /* verify that hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if (((caddr_t)hdr >= m0->m_data) && + ((caddr_t)hdr < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { + /* ick, tag info is stale */ + printf("%s: can't locate header!\n", __func__); + return (0); + } + + if (t->pftag_flags & PF_TAG_HDR_INET) + af = AF_INET; + else if (t->pftag_flags & PF_TAG_HDR_INET6) + af = AF_INET6; + else + af = AF_UNSPEC; + + switch (af) { + case AF_INET: + if (flags & CLASSQF_ECN4) { /* REDF_ECN4 == BLUEF_ECN4 */ + struct ip *ip = hdr; + u_int8_t otos; + int sum; + + if (((uintptr_t)ip + sizeof (*ip)) > + ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) + return (0); /* out of bounds */ + + if (ip->ip_v != 4) + return (0); /* version mismatch! 
*/ + if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) + return (0); /* not-ECT */ + if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) + return (1); /* already marked */ + + /* + * ecn-capable but not marked, + * mark CE and update checksum + */ + otos = ip->ip_tos; + ip->ip_tos |= IPTOS_ECN_CE; + /* + * update checksum (from RFC1624) + * HC' = ~(~HC + ~m + m') + */ + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += (~otos & 0xffff) + ip->ip_tos; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + ip->ip_sum = htons(~sum & 0xffff); + return (1); + } + break; +#if INET6 + case AF_INET6: + if (flags & CLASSQF_ECN6) { /* REDF_ECN6 == BLUEF_ECN6 */ + struct ip6_hdr *ip6 = hdr; + u_int32_t flowlabel; + + if (((uintptr_t)ip6 + sizeof (*ip6)) > + ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) + return (0); /* out of bounds */ + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return (0); /* version mismatch! */ + if ((flowlabel & (IPTOS_ECN_MASK << 20)) == + (IPTOS_ECN_NOTECT << 20)) + return (0); /* not-ECT */ + if ((flowlabel & (IPTOS_ECN_MASK << 20)) == + (IPTOS_ECN_CE << 20)) + return (1); /* already marked */ + /* + * ecn-capable but not marked, mark CE + */ + flowlabel |= (IPTOS_ECN_CE << 20); + ip6->ip6_flow = htonl(flowlabel); + return (1); + } + break; +#endif /* INET6 */ + } + + /* not marked */ + return (0); +} diff --git a/bsd/net/classq/if_classq.h b/bsd/net/classq/if_classq.h new file mode 100644 index 000000000..9eb32d8c5 --- /dev/null +++ b/bsd/net/classq/if_classq.h @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_CLASSQ_IF_CLASSQ_H_ +#define _NET_CLASSQ_IF_CLASSQ_H_ + +#ifdef PRIVATE +#define IFCQ_SC_MAX 10 /* max number of queues */ + +#ifdef BSD_KERNEL_PRIVATE +#include +/* classq dequeue op arg */ +typedef enum cqdq_op { + CLASSQDQ_REMOVE = 1, /* dequeue mbuf from the queue */ + CLASSQDQ_POLL = 2, /* don't dequeue mbuf from the queue */ +} cqdq_op_t; + +/* classq request types */ +typedef enum cqrq { + CLASSQRQ_PURGE = 1, /* purge all packets */ + CLASSQRQ_PURGE_SC = 2, /* purge service class (and flow) */ + CLASSQRQ_EVENT = 3, /* interface events */ + CLASSQRQ_THROTTLE = 4, /* throttle packets */ +} cqrq_t; + +/* classq purge_sc request argument */ +typedef struct cqrq_purge_sc { + mbuf_svc_class_t sc; /* (in) service class */ + u_int32_t flow; /* (in) 0 means all flows */ + u_int32_t packets; /* (out) purged packets */ + u_int32_t bytes; /* (out) purged bytes */ +} cqrq_purge_sc_t; + +/* classq throttle request argument */ +typedef struct cqrq_throttle { + u_int32_t set; /* set or get */ + u_int32_t level; /* (in/out) throttling level */ +} cqrq_throttle_t; + +#if PF_ALTQ +#include +#endif /* PF_ALTQ */ + +/* + * A token-bucket regulator limits the rate at which a network driver can + * dequeue packets from the output queue. Modern cards are able to buffer + * a large number of packets and dequeue too many packets at a time. This + * bursty dequeue behavior makes it impossible for queueing disciplines to + * schedule packets. A token-bucket is used to control the burst size + * in a device-independent manner. + */ +struct tb_regulator { + u_int64_t tbr_rate_raw; /* (unscaled) token bucket rate */ + u_int32_t tbr_percent; /* token bucket rate in percentage */ + int64_t tbr_rate; /* (scaled) token bucket rate */ + int64_t tbr_depth; /* (scaled) token bucket depth */ + + int64_t tbr_token; /* (scaled) current token */ + int64_t tbr_filluptime; /* (scaled) time to fill up bucket */ + u_int64_t tbr_last; /* last time token was updated */ + + int tbr_lastop; /* last dequeue operation type */ + /* needed for poll-and-dequeue */ +}; + +/* simple token bucket meter profile */ +struct tb_profile { + u_int64_t rate; /* rate in bit-per-sec */ + u_int32_t percent; /* rate in percentage */ + u_int32_t depth; /* depth in bytes */ +}; + +struct ifclassq; +enum cqdq_op; +enum cqrq; + +typedef int (*ifclassq_enq_func)(struct ifclassq *, struct mbuf *); +typedef struct mbuf *(*ifclassq_deq_func)(struct ifclassq *, enum cqdq_op); +typedef struct mbuf *(*ifclassq_deq_sc_func)(struct ifclassq *, + mbuf_svc_class_t, enum cqdq_op); +typedef int (*ifclassq_req_func)(struct ifclassq *, enum cqrq, void *); + +/* + * Structure defining a queue for a network interface. + */ +struct ifclassq { + decl_lck_mtx_data(, ifcq_lock); + + struct ifnet *ifcq_ifp; /* back pointer to interface */ + u_int32_t ifcq_len; + u_int32_t ifcq_maxlen; + struct pktcntr ifcq_xmitcnt; + struct pktcntr ifcq_dropcnt; + + u_int32_t ifcq_type; /* scheduler type */ + u_int32_t ifcq_flags; /* flags */ + u_int32_t ifcq_sflags; /* scheduler flags */ + void *ifcq_disc; /* for scheduler-specific use */ + /* + * ifcq_disc_slots[] represents the leaf classes configured for the + * corresponding discipline/scheduler, ordered by their corresponding + * service class index. Each slot holds the queue ID used to identify + * the class instance, as well as the class instance pointer itself.
+ * The latter is used during enqueue and dequeue in order to avoid the + * costs associated with looking up the class pointer based on the + * queue ID. The queue ID is used when querying the statistics from + * user space. + * + * Avoiding the use of queue ID during enqueue and dequeue is made + * possible by virtue of knowing the particular mbuf service class + * associated with the packets. The service class index of the + * packet is used as the index to ifcq_disc_slots[]. + * + * ifcq_disc_slots[] therefore also acts as a lookup table which + * provides for the mapping between MBUF_SC values and the actual + * scheduler classes. + */ + struct ifclassq_disc_slot { + u_int32_t qid; + void *cl; + } ifcq_disc_slots[IFCQ_SC_MAX]; /* for discipline use */ + + ifclassq_enq_func ifcq_enqueue; + ifclassq_deq_func ifcq_dequeue; + ifclassq_deq_sc_func ifcq_dequeue_sc; + ifclassq_req_func ifcq_request; + + /* token bucket regulator */ + struct tb_regulator ifcq_tbr; /* TBR */ + +#if PF_ALTQ + u_int32_t ifcq_drain; + struct ifaltq ifcq_altq; +#endif /* PF_ALTQ */ +}; + +#if PF_ALTQ +#define IFCQ_ALTQ(_ifcq) (&(_ifcq)->ifcq_altq) +#define IFCQ_IS_DRAINING(_ifcq) ((_ifcq)->ifcq_drain > 0) +#endif /* PF_ALTQ */ + +/* ifcq_flags */ +#define IFCQF_READY 0x01 /* ifclassq supports discipline */ +#define IFCQF_ENABLED 0x02 /* ifclassq is in use */ +#define IFCQF_TBR 0x04 /* Token Bucket Regulator is in use */ + +#define IFCQ_IS_READY(_ifcq) ((_ifcq)->ifcq_flags & IFCQF_READY) +#define IFCQ_IS_ENABLED(_ifcq) ((_ifcq)->ifcq_flags & IFCQF_ENABLED) +#define IFCQ_TBR_IS_ENABLED(_ifcq) ((_ifcq)->ifcq_flags & IFCQF_TBR) + +/* classq enqueue return value */ +#define CLASSQEQ_DROPPED (-1) /* packet dropped (freed) */ +#define CLASSQEQ_SUCCESS 0 /* success, packet enqueued */ +#define CLASSQEQ_SUCCESS_FC 1 /* packet enqueued; */ + /* give flow control feedback */ +#define CLASSQEQ_DROPPED_FC 2 /* packet dropped; */ + /* give flow control feedback */ +#define CLASSQEQ_DROPPED_SP 3 /* packet dropped due to suspension; */ + /* give flow control feedback */ + +/* interface event argument for CLASSQRQ_EVENT */ +typedef enum cqev { + CLASSQ_EV_LINK_SPEED = 1, /* link speed has changed */ + CLASSQ_EV_LINK_MTU = 2, /* link MTU has changed */ + CLASSQ_EV_LINK_UP = 3, /* link is now up */ + CLASSQ_EV_LINK_DOWN = 4, /* link is now down */ +} cqev_t; +#endif /* BSD_KERNEL_PRIVATE */ + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct if_ifclassq_stats { + u_int32_t ifqs_len; + u_int32_t ifqs_maxlen; + struct pktcntr ifqs_xmitcnt; + struct pktcntr ifqs_dropcnt; + u_int32_t ifqs_scheduler; + union { + struct priq_classstats ifqs_priq_stats; + struct fairq_classstats ifqs_fairq_stats; + struct tcq_classstats ifqs_tcq_stats; + struct cbq_classstats ifqs_cbq_stats; + struct hfsc_classstats ifqs_hfsc_stats; + struct qfq_classstats ifqs_qfq_stats; + }; +} __attribute__((aligned(8))); + +#ifdef __cplusplus +} +#endif + +#ifdef BSD_KERNEL_PRIVATE +/* + * For ifclassq lock + */ +#define IFCQ_LOCK_ASSERT_HELD(_ifcq) \ + lck_mtx_assert(&(_ifcq)->ifcq_lock, LCK_MTX_ASSERT_OWNED) + +#define IFCQ_LOCK_ASSERT_NOTHELD(_ifcq) \ + lck_mtx_assert(&(_ifcq)->ifcq_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define IFCQ_LOCK(_ifcq) \ + lck_mtx_lock(&(_ifcq)->ifcq_lock) + +#define IFCQ_LOCK_SPIN(_ifcq) \ + lck_mtx_lock_spin(&(_ifcq)->ifcq_lock) + +#define IFCQ_CONVERT_LOCK(_ifcq) do { \ + IFCQ_LOCK_ASSERT_HELD(_ifcq); \ + lck_mtx_convert_spin(&(_ifcq)->ifcq_lock); \ +} while (0) + +#define 
IFCQ_UNLOCK(_ifcq) \ + lck_mtx_unlock(&(_ifcq)->ifcq_lock) + +/* + * For ifclassq operations + */ +#define IFCQ_ENQUEUE(_ifq, _m, _err) do { \ + (_err) = (*(_ifq)->ifcq_enqueue)(_ifq, _m); \ +} while (0) + +#define IFCQ_DEQUEUE(_ifq, _m) do { \ + (_m) = (*(_ifq)->ifcq_dequeue)(_ifq, CLASSQDQ_REMOVE); \ +} while (0) + +#define IFCQ_DEQUEUE_SC(_ifq, _sc, _m) do { \ + (_m) = (*(_ifq)->ifcq_dequeue_sc)(_ifq, _sc, CLASSQDQ_REMOVE); \ +} while (0) + +#define IFCQ_TBR_DEQUEUE(_ifcq, _m) do { \ + (_m) = ifclassq_tbr_dequeue(_ifcq, CLASSQDQ_REMOVE); \ +} while (0) + +#define IFCQ_TBR_DEQUEUE_SC(_ifcq, _sc, _m) do { \ + (_m) = ifclassq_tbr_dequeue_sc(_ifcq, CLASSQDQ_REMOVE, _sc); \ +} while (0) + +#define IFCQ_POLL(_ifq, _m) do { \ + (_m) = (*(_ifq)->ifcq_dequeue)(_ifq, CLASSQDQ_POLL); \ +} while (0) + +#define IFCQ_POLL_SC(_ifq, _sc, _m) do { \ + (_m) = (*(_ifq)->ifcq_dequeue_sc)(_ifq, _sc, CLASSQDQ_POLL); \ +} while (0) + +#define IFCQ_TBR_POLL(_ifcq, _m) do { \ + (_m) = ifclassq_tbr_dequeue(_ifcq, CLASSQDQ_POLL); \ +} while (0) + +#define IFCQ_TBR_POLL_SC(_ifcq, _sc, _m) do { \ + (_m) = ifclassq_tbr_dequeue_sc(_ifcq, CLASSQDQ_POLL, _sc); \ +} while (0) + +#define IFCQ_PURGE(_ifq) do { \ + (void) (*(_ifq)->ifcq_request)(_ifq, CLASSQRQ_PURGE, NULL); \ +} while (0) + +#define IFCQ_PURGE_SC(_ifq, _sc, _flow, _packets, _bytes) do { \ + cqrq_purge_sc_t _req = { _sc, _flow, 0, 0 }; \ + (void) (*(_ifq)->ifcq_request)(_ifq, CLASSQRQ_PURGE_SC, &_req); \ + (_packets) = _req.packets; \ + (_bytes) = _req.bytes; \ +} while (0) + +#define IFCQ_UPDATE(_ifq, _ev) do { \ + (void) (*(_ifq)->ifcq_request)(_ifq, CLASSQRQ_EVENT, \ + (void *)(_ev)); \ +} while (0) + +#define IFCQ_SET_THROTTLE(_ifq, _level, _err) do { \ + cqrq_throttle_t _req = { 1, _level }; \ + (_err) = (*(_ifq)->ifcq_request) \ + (_ifq, CLASSQRQ_THROTTLE, &_req); \ +} while (0) + +#define IFCQ_GET_THROTTLE(_ifq, _level, _err) do { \ + cqrq_throttle_t _req = { 0, IFNET_THROTTLE_OFF }; \ + (_err) = (*(_ifq)->ifcq_request) \ + (_ifq, CLASSQRQ_THROTTLE, &_req); \ + (_level) = _req.level; \ +} while (0) + +#define IFCQ_LEN(_ifcq) ((_ifcq)->ifcq_len) +#define IFCQ_QFULL(_ifcq) (IFCQ_LEN(_ifcq) >= (_ifcq)->ifcq_maxlen) +#define IFCQ_IS_EMPTY(_ifcq) (IFCQ_LEN(_ifcq) == 0) +#define IFCQ_INC_LEN(_ifcq) (IFCQ_LEN(_ifcq)++) +#define IFCQ_DEC_LEN(_ifcq) (IFCQ_LEN(_ifcq)--) +#define IFCQ_MAXLEN(_ifcq) ((_ifcq)->ifcq_maxlen) +#define IFCQ_SET_MAXLEN(_ifcq, _len) ((_ifcq)->ifcq_maxlen = (_len)) + +#define IFCQ_XMIT_ADD(_ifcq, _pkt, _len) do { \ + PKTCNTR_ADD(&(_ifcq)->ifcq_xmitcnt, _pkt, _len); \ +} while (0) + +#define IFCQ_DROP_ADD(_ifcq, _pkt, _len) do { \ + PKTCNTR_ADD(&(_ifcq)->ifcq_dropcnt, _pkt, _len); \ +} while (0) + +extern int ifclassq_setup(struct ifnet *, u_int32_t, boolean_t); +extern void ifclassq_teardown(struct ifnet *); +extern int ifclassq_pktsched_setup(struct ifclassq *); +extern void ifclassq_set_maxlen(struct ifclassq *, u_int32_t); +extern u_int32_t ifclassq_get_maxlen(struct ifclassq *); +extern u_int32_t ifclassq_get_len(struct ifclassq *); +extern errno_t ifclassq_enqueue(struct ifclassq *, struct mbuf *); +extern errno_t ifclassq_dequeue(struct ifclassq *, u_int32_t, struct mbuf **, + struct mbuf **, u_int32_t *, u_int32_t *); +extern errno_t ifclassq_dequeue_sc(struct ifclassq *, mbuf_svc_class_t, + u_int32_t, struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *); +extern struct mbuf *ifclassq_poll(struct ifclassq *); +extern struct mbuf *ifclassq_poll_sc(struct ifclassq *, mbuf_svc_class_t); +extern void ifclassq_update(struct 
ifclassq *, cqev_t); +extern int ifclassq_attach(struct ifclassq *, u_int32_t, void *, + ifclassq_enq_func, ifclassq_deq_func, ifclassq_deq_sc_func, + ifclassq_req_func); +extern int ifclassq_detach(struct ifclassq *); +extern int ifclassq_getqstats(struct ifclassq *, u_int32_t, + void *, u_int32_t *); +extern const char *ifclassq_ev2str(cqev_t); +extern int ifclassq_tbr_set(struct ifclassq *, struct tb_profile *, boolean_t); +extern struct mbuf *ifclassq_tbr_dequeue(struct ifclassq *, int); +extern struct mbuf *ifclassq_tbr_dequeue_sc(struct ifclassq *, int, + mbuf_svc_class_t); +#endif /* BSD_KERNEL_PRIVATE */ +#endif /* PRIVATE */ +#endif /* _NET_CLASSQ_IF_CLASSQ_H_ */ diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index 51c2d976e..5da68ac68 100644 --- a/bsd/net/dlil.c +++ b/bsd/net/dlil.c @@ -41,12 +41,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -62,10 +64,19 @@ #include #include #include +#include +#include #if INET #include #include +#include +#include +#include +#include +#include +#include +#include #endif /* INET */ #if INET6 @@ -92,6 +103,10 @@ #if PF #include #endif /* PF */ +#if PF_ALTQ +#include +#endif /* PF_ALTQ */ +#include #define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0) #define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2) @@ -110,18 +125,12 @@ #define DLIL_PRINTF kprintf #endif -#define _CASSERT(x) \ - switch (0) { case 0: case (x): ; } - #define IF_DATA_REQUIRE_ALIGNED_64(f) \ _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t))) #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f) \ _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t))) -#define IFNET_IF_TC_REQUIRE_ALIGNED_64(f) \ - _CASSERT(!(offsetof(struct ifnet, if_tc.f) % sizeof (u_int64_t))) - enum { kProtoKPI_v1 = 1, kProtoKPI_v2 = 2 @@ -171,7 +180,7 @@ SLIST_HEAD(proto_hash_entry, if_proto); struct dlil_ifnet { struct ifnet dl_if; /* public ifnet */ /* - * dlil private fields, protected by dl_if_lock + * DLIL private fields, protected by dl_if_lock */ decl_lck_mtx_data(, dl_if_lock); TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet link */ @@ -186,6 +195,8 @@ struct dlil_ifnet { u_int8_t asdl[DLIL_SDLMAXLEN]; /* addr storage */ u_int8_t msdl[DLIL_SDLMAXLEN]; /* mask storage */ } dl_if_lladdr; + u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */ + struct dlil_threading_info dl_if_inpstorage; /* input thread storage */ ctrace_t dl_if_attach; /* attach PC stacktrace */ ctrace_t dl_if_detach; /* detach PC stacktrace */ }; @@ -234,12 +245,26 @@ static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head; static lck_grp_t *dlil_lock_group; lck_grp_t *ifnet_lock_group; static lck_grp_t *ifnet_head_lock_group; +static lck_grp_t *ifnet_snd_lock_group; +static lck_grp_t *ifnet_rcv_lock_group; lck_attr_t *ifnet_lock_attr; decl_lck_rw_data(static, ifnet_head_lock); decl_lck_mtx_data(static, dlil_ifnet_lock); u_int32_t dlil_filter_count = 0; extern u_int32_t ipv4_ll_arp_aware; +struct sfb_fc_list ifnet_fclist; +decl_lck_mtx_data(static, ifnet_fclist_lock); + +static unsigned int ifnet_fcezone_size; /* size of ifnet_fce */ +static struct zone *ifnet_fcezone; /* zone for ifnet_fce */ + +#define IFNET_FCEZONE_MAX 32 /* maximum elements in zone */ +#define IFNET_FCEZONE_NAME "ifnet_fcezone" /* zone name */ + +static void ifnet_fc_thread_func(void *, wait_result_t); +static void ifnet_fc_init(void); + #if DEBUG static unsigned int ifnet_debug = 1; /* debugging (enabled) */ #else @@ -258,12 +283,6 @@ static struct 
zone *dlif_filt_zone; /* zone for ifnet_filter */ #define DLIF_FILT_ZONE_MAX 8 /* maximum elements in zone */ #define DLIF_FILT_ZONE_NAME "ifnet_filter" /* zone name */ -static unsigned int dlif_inp_size; /* size of dlil_threading_info */ -static struct zone *dlif_inp_zone; /* zone for dlil_threading_info */ - -#define DLIF_INP_ZONE_MAX DLIF_ZONE_MAX /* maximum elements in zone */ -#define DLIF_INP_ZONE_NAME "ifnet_thread" /* zone name */ - static unsigned int dlif_phash_size; /* size of ifnet proto hash table */ static struct zone *dlif_phash_zone; /* zone for ifnet proto hash table */ @@ -276,6 +295,20 @@ static struct zone *dlif_proto_zone; /* zone for if_proto */ #define DLIF_PROTO_ZONE_MAX (DLIF_ZONE_MAX*2) /* maximum elements in zone */ #define DLIF_PROTO_ZONE_NAME "ifnet_proto" /* zone name */ +static unsigned int dlif_tcpstat_size; /* size of tcpstat_local to allocate */ +static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */ +static struct zone *dlif_tcpstat_zone; /* zone for tcpstat_local */ + +#define DLIF_TCPSTAT_ZONE_MAX 1 /* maximum elements in zone */ +#define DLIF_TCPSTAT_ZONE_NAME "ifnet_tcpstat" /* zone name */ + +static unsigned int dlif_udpstat_size; /* size of udpstat_local to allocate */ +static unsigned int dlif_udpstat_bufsize; /* size of dlif_udpstat_size + headroom */ +static struct zone *dlif_udpstat_zone; /* zone for udpstat_local */ + +#define DLIF_UDPSTAT_ZONE_MAX 1 /* maximum elements in zone */ +#define DLIF_UDPSTAT_ZONE_NAME "ifnet_udpstat" /* zone name */ + /* * Updating this variable should be done by first acquiring the global * radix node head (rnh_lock), in tandem with settting/clearing the @@ -284,11 +317,9 @@ static struct zone *dlif_proto_zone; /* zone for if_proto */ u_int32_t ifnet_aggressive_drainers; static u_int32_t net_rtref; -static struct dlil_threading_info dlil_lo_thread; -__private_extern__ struct dlil_threading_info *dlil_lo_thread_ptr = &dlil_lo_thread; - -static struct mbuf *dlil_lo_input_mbuf_head = NULL; -static struct mbuf *dlil_lo_input_mbuf_tail = NULL; +static struct dlil_main_threading_info dlil_main_input_thread_info; +__private_extern__ struct dlil_threading_info *dlil_main_input_thread = + (struct dlil_threading_info *)&dlil_main_input_thread_info; static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg); static int dlil_detach_filter_internal(interface_filter_t filter, int detached); @@ -327,6 +358,10 @@ static errno_t ifproto_media_send_arp(struct ifnet *, u_short, const struct sockaddr_dl *, const struct sockaddr *); static errno_t ifp_if_output(struct ifnet *, struct mbuf *); +static void ifp_if_start(struct ifnet *); +static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t, + struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *); +static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *); static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *, protocol_family_t *); static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t, @@ -334,20 +369,42 @@ static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t, static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t); static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *); static errno_t ifp_if_framer(struct ifnet *, struct mbuf **, - const struct sockaddr *, const char *, const char *); -static errno_t ifp_if_ioctl(struct ifnet *, unsigned long, void *); + const struct sockaddr *, const char *, const char * +#if CONFIG_EMBEDDED + , + u_int32_t 
*, u_int32_t * +#endif /* CONFIG_EMBEDDED */ + ); static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func); static void ifp_if_free(struct ifnet *); static void ifp_if_event(struct ifnet *, const struct kev_msg *); +static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *); +static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *); -static void dlil_input_thread_func(struct dlil_threading_info *inpthread); +static void dlil_main_input_thread_func(void *, wait_result_t); +static void dlil_input_thread_func(void *, wait_result_t); +static void dlil_rxpoll_input_thread_func(void *, wait_result_t); +static void dlil_rxpoll_calc_limits(struct dlil_threading_info *); static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *); - -static void ifnet_delayed_thread_func(void); +static void dlil_terminate_input_thread(struct dlil_threading_info *); +static void dlil_input_stats_add(const struct ifnet_stat_increment_param *, + struct dlil_threading_info *, boolean_t); +static void dlil_input_stats_sync(struct ifnet *, struct dlil_threading_info *); +static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *, + u_int32_t, ifnet_model_t, boolean_t); +static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *, + const struct ifnet_stat_increment_param *, boolean_t, boolean_t); + +static void ifnet_detacher_thread_func(void *, wait_result_t); +static int ifnet_detacher_thread_cont(int); static void ifnet_detach_final(struct ifnet *); static void ifnet_detaching_enqueue(struct ifnet *); static struct ifnet *ifnet_detaching_dequeue(void); +static void ifnet_start_thread_fn(void *, wait_result_t); +static void ifnet_poll_thread_fn(void *, wait_result_t); +static void ifnet_poll(struct ifnet *); + static void ifp_src_route_copyout(struct ifnet *, struct route *); static void ifp_src_route_copyin(struct ifnet *, struct route *); #if INET6 @@ -355,6 +412,10 @@ static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *); static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *); #endif /* INET6 */ +static int sysctl_rxpoll SYSCTL_HANDLER_ARGS; +static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS; +static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS; + /* The following are protected by dlil_ifnet_lock */ static TAILQ_HEAD(, ifnet) ifnet_detaching_head; static u_int32_t ifnet_detaching_cnt; @@ -363,6 +424,11 @@ static void *ifnet_delayed_run; /* wait channel for detaching thread */ extern void bpfdetach(struct ifnet*); extern void proto_input_run(void); +extern uint32_t udp_count_opportunistic(unsigned int ifindex, + u_int32_t flags); +extern uint32_t tcp_count_opportunistic(unsigned int ifindex, + u_int32_t flags); + __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *); #if DEBUG @@ -370,28 +436,108 @@ static int dlil_verbose = 1; #else static int dlil_verbose = 0; #endif /* DEBUG */ -static int dlil_multithreaded_input = 1; -static int cur_dlil_input_threads = 0; #if IFNET_INPUT_SANITY_CHK -static int dlil_lo_input_mbuf_count = 0; /* sanity checking of input packet lists received */ -static int dlil_input_sanity_check = 0; -#endif +static u_int32_t dlil_input_sanity_check = 0; +#endif /* IFNET_INPUT_SANITY_CHK */ +/* rate limit debug messages */ +struct timespec dlil_dbgrate = { 1, 0 }; SYSCTL_DECL(_net_link_generic_system); -SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose, CTLFLAG_RW, - &dlil_verbose, 0, "Log DLIL error messages"); - 
-SYSCTL_INT(_net_link_generic_system, OID_AUTO, multi_threaded_input, CTLFLAG_RW, - &dlil_multithreaded_input , 0, "Uses multiple input thread for DLIL input"); +SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose, + CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages"); + +#define IF_SNDQ_MINLEN 32 +u_int32_t if_sndq_maxlen = IFQ_MAXLEN; +SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN, + sysctl_sndq_maxlen, "I", "Default transmit queue max length"); + +#define IF_RCVQ_MINLEN 32 +#define IF_RCVQ_MAXLEN 256 +u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN; +SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN, + sysctl_rcvq_maxlen, "I", "Default receive queue max length"); + +#define IF_RXPOLL_DECAY 2 /* ilog2 of EWMA decay rate (4) */ +static u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY; +SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay, + CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY, + "ilog2 of EWMA decay rate of avg inbound packets"); + +#define IF_RXPOLL_MODE_HOLDTIME (1000ULL * 1000 * 1000) /* 1 sec */ +static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME; +SYSCTL_QUAD(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time, + CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime, + "input poll mode freeze time"); + +#define IF_RXPOLL_SAMPLETIME (10ULL * 1000 * 1000) /* 10 ms */ +static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME; +SYSCTL_QUAD(_net_link_generic_system, OID_AUTO, rxpoll_sample_time, + CTLFLAG_RD | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime, + "input poll sampling time"); + +#define IF_RXPOLL_INTERVAL_TIME (1ULL * 1000 * 1000) /* 1 ms */ +static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVAL_TIME; +SYSCTL_QUAD(_net_link_generic_system, OID_AUTO, rxpoll_interval_time, + CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time, + "input poll interval (time)"); + +#define IF_RXPOLL_INTERVAL_PKTS 0 /* 0 (disabled) */ +static u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS; +SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts, + CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts, + IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)"); + +#define IF_RXPOLL_WLOWAT 5 +static u_int32_t if_rxpoll_wlowat = IF_RXPOLL_WLOWAT; +SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat, + CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_wlowat, IF_RXPOLL_WLOWAT, + "input poll wakeup low watermark"); + +#define IF_RXPOLL_WHIWAT 100 +static u_int32_t if_rxpoll_whiwat = IF_RXPOLL_WHIWAT; +SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat, + CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_whiwat, IF_RXPOLL_WHIWAT, + "input poll wakeup high watermark"); + +static u_int32_t if_rxpoll_max = 0; /* 0 (automatic) */ +SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max, + CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0, + "max packets per poll call"); + +static u_int32_t if_rxpoll = 1; +SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0, + sysctl_rxpoll, "I", "enable opportunistic input polling"); + +u_int32_t if_bw_smoothing_val = 3; +SYSCTL_UINT(_net_link_generic_system, OID_AUTO, if_bw_smoothing_val, + CTLFLAG_RW | CTLFLAG_LOCKED, &if_bw_smoothing_val, 0, ""); + +u_int32_t if_bw_measure_size = 10; 
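+ +/* + * Editor's usage sketch (not in the original patch): the tunables + * above are ordinary sysctl OIDs under net.link.generic.system, so + * they can be inspected from user space; everything in this snippet + * is an assumption layered on those declarations. + * + * #include <stdio.h> + * #include <stdint.h> + * #include <sys/types.h> + * #include <sys/sysctl.h> + * int main(void) + * { + * uint32_t v; + * size_t len = sizeof (v); + * if (sysctlbyname("net.link.generic.system.rxpoll", + * &v, &len, NULL, 0) == 0) + * printf("rxpoll=%u\n", v); + * return (0); + * } + */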
+SYSCTL_INT(_net_link_generic_system, OID_AUTO, if_bw_measure_size, + CTLFLAG_RW | CTLFLAG_LOCKED, &if_bw_measure_size, 0, ""); + +static u_int32_t cur_dlil_input_threads = 0; +SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads, + CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads , 0, + "Current number of DLIL input threads"); #if IFNET_INPUT_SANITY_CHK -SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check, - CTLFLAG_RW, &dlil_input_sanity_check , 0, +SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check, + CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check , 0, "Turn on sanity checking in DLIL input"); -#endif +#endif /* IFNET_INPUT_SANITY_CHK */ +static u_int32_t if_flowadv = 1; +SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory, + CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1, + "enable flow-advisory mechanism"); + +unsigned int net_rxpoll = 1; unsigned int net_affinity = 1; static kern_return_t dlil_affinity_set(struct thread *, u_int32_t); @@ -399,10 +545,47 @@ extern u_int32_t inject_buckets; static lck_grp_attr_t *dlil_grp_attributes = NULL; static lck_attr_t *dlil_lck_attributes = NULL; -static lck_grp_t *dlil_input_lock_grp = NULL; #define PROTO_HASH_SLOTS 0x5 +#define DLIL_INPUT_CHECK(m, ifp) { \ + struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m); \ + if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) || \ + !(mbuf_flags(m) & MBUF_PKTHDR)) { \ + panic_plain("%s: invalid mbuf %p\n", __func__, m); \ + /* NOTREACHED */ \ + } \ +} + +#define DLIL_EWMA(old, new, decay) do { \ + u_int32_t _avg; \ + if ((_avg = (old)) > 0) \ + _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \ + else \ + _avg = (new); \ + (old) = _avg; \ +} while (0) + +#define MBPS (1ULL * 1000 * 1000) +#define GBPS (MBPS * 1000) + +struct rxpoll_time_tbl { + u_int64_t speed; /* downlink speed */ + u_int32_t plowat; /* packets low watermark */ + u_int32_t phiwat; /* packets high watermark */ + u_int32_t blowat; /* bytes low watermark */ + u_int32_t bhiwat; /* bytes high watermark */ +}; + +static struct rxpoll_time_tbl rxpoll_tbl[] = { + { 10 * MBPS, 2, 8, (1 * 1024), (6 * 1024) }, + { 100 * MBPS, 10, 40, (4 * 1024), (64 * 1024) }, + { 1 * GBPS, 10, 40, (4 * 1024), (64 * 1024) }, + { 10 * GBPS, 10, 40, (4 * 1024), (64 * 1024) }, + { 100 * GBPS, 10, 40, (4 * 1024), (64 * 1024) }, + { 0, 0, 0, 0, 0 } +}; + /* * Internal functions. */ @@ -634,38 +817,154 @@ dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass, dlil_event_internal(ifp, &ev_msg); } +__private_extern__ int +dlil_alloc_local_stats(struct ifnet *ifp) +{ + int ret = EINVAL; + void *buf, *base, **pbuf; + + if (ifp == NULL) + goto end; + + if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) { + /* allocate tcpstat_local structure */ + buf = zalloc(dlif_tcpstat_zone); + if (buf == NULL) { + ret = ENOMEM; + goto end; + } + bzero(buf, dlif_tcpstat_bufsize); + + /* Get the 64-bit aligned base address for this object */ + base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t), + sizeof (u_int64_t)); + VERIFY(((intptr_t)base + dlif_tcpstat_size) <= + ((intptr_t)buf + dlif_tcpstat_bufsize)); + + /* + * Wind back a pointer size from the aligned base and + * save the original address so we can free it later. 
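+		 *
+		 * The resulting buffer layout is (sketch, not to scale):
+		 *
+		 *	buf .. padding .. | saved void * (pbuf) | 64-bit aligned base ..
+		 *
+		 * zfree() below recovers the original zalloc() address via *pbuf.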
+ */ + pbuf = (void **)((intptr_t)base - sizeof (void *)); + *pbuf = buf; + ifp->if_tcp_stat = base; + + /* allocate udpstat_local structure */ + buf = zalloc(dlif_udpstat_zone); + if (buf == NULL) { + ret = ENOMEM; + goto end; + } + bzero(buf, dlif_udpstat_bufsize); + + /* Get the 64-bit aligned base address for this object */ + base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t), + sizeof (u_int64_t)); + VERIFY(((intptr_t)base + dlif_udpstat_size) <= + ((intptr_t)buf + dlif_udpstat_bufsize)); + + /* + * Wind back a pointer size from the aligned base and + * save the original address so we can free it later. + */ + pbuf = (void **)((intptr_t)base - sizeof (void *)); + *pbuf = buf; + ifp->if_udp_stat = base; + + VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof (u_int64_t)) && + IS_P2ALIGNED(ifp->if_udp_stat, sizeof (u_int64_t))); + + ret = 0; + } + +end: + if (ret != 0) { + if (ifp->if_tcp_stat != NULL) { + pbuf = (void **) + ((intptr_t)ifp->if_tcp_stat - sizeof (void *)); + zfree(dlif_tcpstat_zone, *pbuf); + ifp->if_tcp_stat = NULL; + } + if (ifp->if_udp_stat != NULL) { + pbuf = (void **) + ((intptr_t)ifp->if_udp_stat - sizeof (void *)); + zfree(dlif_udpstat_zone, *pbuf); + ifp->if_udp_stat = NULL; + } + } + + return (ret); +} + static int -dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inputthread) +dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp) { + thread_continue_t func; + u_int32_t limit; int error; - bzero(inputthread, sizeof(*inputthread)); - /* loopback ifp may not be configured at dlil_init time. */ - if (ifp == lo_ifp) { - (void) strlcat(inputthread->input_name, - "dlil_input_main_thread_mtx", DLIL_THREADNAME_LEN); + /* NULL ifp indicates the main input thread, called at dlil_init time */ + if (ifp == NULL) { + func = dlil_main_input_thread_func; + VERIFY(inp == dlil_main_input_thread); + (void) strlcat(inp->input_name, + "main_input", DLIL_THREADNAME_LEN); + } else if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) { + func = dlil_rxpoll_input_thread_func; + VERIFY(inp != dlil_main_input_thread); + (void) snprintf(inp->input_name, DLIL_THREADNAME_LEN, + "%s%d_input_poll", ifp->if_name, ifp->if_unit); } else { - (void) snprintf(inputthread->input_name, DLIL_THREADNAME_LEN, - "dlil_input_%s%d_mtx", ifp->if_name, ifp->if_unit); + func = dlil_input_thread_func; + VERIFY(inp != dlil_main_input_thread); + (void) snprintf(inp->input_name, DLIL_THREADNAME_LEN, + "%s%d_input", ifp->if_name, ifp->if_unit); } + VERIFY(inp->input_thr == THREAD_NULL); - inputthread->lck_grp = lck_grp_alloc_init(inputthread->input_name, - dlil_grp_attributes); - lck_mtx_init(&inputthread->input_lck, inputthread->lck_grp, - dlil_lck_attributes); + inp->lck_grp = lck_grp_alloc_init(inp->input_name, dlil_grp_attributes); + lck_mtx_init(&inp->input_lck, inp->lck_grp, dlil_lck_attributes); + + inp->mode = IFNET_MODEL_INPUT_POLL_OFF; + inp->ifp = ifp; /* NULL for main input thread */ + + net_timerclear(&inp->mode_holdtime); + net_timerclear(&inp->mode_lasttime); + net_timerclear(&inp->sample_holdtime); + net_timerclear(&inp->sample_lasttime); + net_timerclear(&inp->dbg_lasttime); + + /* + * For interfaces that support opportunistic polling, set the + * low and high watermarks for outstanding inbound packets/bytes. + * Also define freeze times for transitioning between modes + * and updating the average. 
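+	 *
+	 * The averages referred to here are EWMAs maintained by the
+	 * DLIL_EWMA() macro; with the default if_rxpoll_decay of 2,
+	 * each update computes avg = (3 * avg + new) / 4.  With the
+	 * default if_rcvq_maxlen of 256 a polling-capable interface
+	 * gets a 256-packet receive queue, while all other input
+	 * threads use an effectively unlimited ((u_int32_t)-1) queue.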
+ */ + if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) { + limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN); + dlil_rxpoll_calc_limits(inp); + } else { + limit = (u_int32_t)-1; + } + + _qinit(&inp->rcvq_pkts, Q_DROPTAIL, limit); + if (inp == dlil_main_input_thread) { + struct dlil_main_threading_info *inpm = + (struct dlil_main_threading_info *)inp; + _qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit); + } - error= kernel_thread_start((thread_continue_t)dlil_input_thread_func, - inputthread, &inputthread->input_thread); - if (error == 0) { - ml_thread_policy(inputthread->input_thread, MACHINE_GROUP, + error = kernel_thread_start(func, inp, &inp->input_thr); + if (error == KERN_SUCCESS) { + ml_thread_policy(inp->input_thr, MACHINE_GROUP, (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR)); /* - * Except for the loopback dlil input thread, we create - * an affinity set so that the matching workloop thread - * can be scheduled on the same processor set. + * We create an affinity set so that the matching workloop + * thread or the starter thread (for loopback) can be + * scheduled on the same processor set as the input thread. */ - if (net_affinity && inputthread != dlil_lo_thread_ptr) { - struct thread *tp = inputthread->input_thread; + if (net_affinity) { + struct thread *tp = inp->input_thr; u_int32_t tag; /* * Randomize to reduce the probability @@ -674,23 +973,79 @@ dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inputthread) read_random(&tag, sizeof (tag)); if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) { thread_reference(tp); - inputthread->tag = tag; - inputthread->net_affinity = TRUE; + inp->tag = tag; + inp->net_affinity = TRUE; } } + } else if (inp == dlil_main_input_thread) { + panic_plain("%s: couldn't create main input thread", __func__); + /* NOTREACHED */ } else { - panic("%s: couldn't create thread", __func__); + panic_plain("%s: couldn't create %s%d input thread", __func__, + ifp->if_name, ifp->if_unit); /* NOTREACHED */ } OSAddAtomic(1, &cur_dlil_input_threads); -#if DLIL_DEBUG - printf("%s: threadinfo: %p input_thread=%p threads: cur=%d max=%d\n", - __func__, inputthread, inputthread->input_thread, - dlil_multithreaded_input, cur_dlil_input_threads); -#endif + return (error); } +static void +dlil_terminate_input_thread(struct dlil_threading_info *inp) +{ + struct ifnet *ifp; + + VERIFY(current_thread() == inp->input_thr); + VERIFY(inp != dlil_main_input_thread); + + OSAddAtomic(-1, &cur_dlil_input_threads); + + lck_mtx_destroy(&inp->input_lck, inp->lck_grp); + lck_grp_free(inp->lck_grp); + + inp->input_waiting = 0; + inp->wtot = 0; + bzero(inp->input_name, sizeof (inp->input_name)); + ifp = inp->ifp; + inp->ifp = NULL; + VERIFY(qhead(&inp->rcvq_pkts) == NULL && qempty(&inp->rcvq_pkts)); + qlimit(&inp->rcvq_pkts) = 0; + bzero(&inp->stats, sizeof (inp->stats)); + + VERIFY(!inp->net_affinity); + inp->input_thr = THREAD_NULL; + VERIFY(inp->wloop_thr == THREAD_NULL); + VERIFY(inp->poll_thr == THREAD_NULL); + VERIFY(inp->tag == 0); + + inp->mode = IFNET_MODEL_INPUT_POLL_OFF; + bzero(&inp->tstats, sizeof (inp->tstats)); + bzero(&inp->pstats, sizeof (inp->pstats)); + bzero(&inp->sstats, sizeof (inp->sstats)); + + net_timerclear(&inp->mode_holdtime); + net_timerclear(&inp->mode_lasttime); + net_timerclear(&inp->sample_holdtime); + net_timerclear(&inp->sample_lasttime); + net_timerclear(&inp->dbg_lasttime); + +#if IFNET_INPUT_SANITY_CHK + inp->input_mbuf_cnt = 0; +#endif /* IFNET_INPUT_SANITY_CHK */ + + if (dlil_verbose) { + printf("%s%d: input thread 
terminated\n", + ifp->if_name, ifp->if_unit); + } + + /* for the extra refcnt from kernel_thread_start() */ + thread_deallocate(current_thread()); + + /* this is the end */ + thread_terminate(current_thread()); + /* NOTREACHED */ +} + static kern_return_t dlil_affinity_set(struct thread *tp, u_int32_t tag) { @@ -721,6 +1076,7 @@ dlil_init(void) IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts); IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops); IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto); + IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs); IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets); IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors) @@ -733,19 +1089,7 @@ dlil_init(void) IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts); IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops); IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto); - - IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ibkpackets); - IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ibkbytes); - IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_obkpackets); - IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_obkbytes); - IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivipackets); - IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivibytes); - IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovipackets); - IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovibytes); - IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivopackets); - IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivobytes); - IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovopackets); - IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovobytes); + IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs); /* * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts. @@ -765,10 +1109,13 @@ dlil_init(void) * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info. */ _CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN); + _CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN); PE_parse_boot_argn("net_affinity", &net_affinity, sizeof (net_affinity)); + PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof (net_rxpoll)); + PE_parse_boot_argn("net_rtref", &net_rtref, sizeof (net_rtref)); PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof (ifnet_debug)); @@ -781,7 +1128,8 @@ dlil_init(void) dlif_zone = zinit(dlif_bufsize, DLIF_ZONE_MAX * dlif_bufsize, 0, DLIF_ZONE_NAME); if (dlif_zone == NULL) { - panic("%s: failed allocating %s", __func__, DLIF_ZONE_NAME); + panic_plain("%s: failed allocating %s", __func__, + DLIF_ZONE_NAME); /* NOTREACHED */ } zone_change(dlif_zone, Z_EXPAND, TRUE); @@ -791,28 +1139,18 @@ dlil_init(void) dlif_filt_zone = zinit(dlif_filt_size, DLIF_FILT_ZONE_MAX * dlif_filt_size, 0, DLIF_FILT_ZONE_NAME); if (dlif_filt_zone == NULL) { - panic("%s: failed allocating %s", __func__, + panic_plain("%s: failed allocating %s", __func__, DLIF_FILT_ZONE_NAME); /* NOTREACHED */ } zone_change(dlif_filt_zone, Z_EXPAND, TRUE); zone_change(dlif_filt_zone, Z_CALLERACCT, FALSE); - dlif_inp_size = sizeof (struct dlil_threading_info); - dlif_inp_zone = zinit(dlif_inp_size, - DLIF_INP_ZONE_MAX * dlif_inp_size, 0, DLIF_INP_ZONE_NAME); - if (dlif_inp_zone == NULL) { - panic("%s: failed allocating %s", __func__, DLIF_INP_ZONE_NAME); - /* NOTREACHED */ - } - zone_change(dlif_inp_zone, Z_EXPAND, TRUE); - zone_change(dlif_inp_zone, Z_CALLERACCT, FALSE); - dlif_phash_size = sizeof (struct proto_hash_entry) * PROTO_HASH_SLOTS; dlif_phash_zone = zinit(dlif_phash_size, DLIF_PHASH_ZONE_MAX * dlif_phash_size, 0, DLIF_PHASH_ZONE_NAME); if (dlif_phash_zone == NULL) { - panic("%s: failed allocating %s", __func__, + panic_plain("%s: failed allocating %s", __func__, DLIF_PHASH_ZONE_NAME); /* NOTREACHED */ } @@ -823,13 +1161,47 @@ dlil_init(void) dlif_proto_zone = 
zinit(dlif_proto_size, DLIF_PROTO_ZONE_MAX * dlif_proto_size, 0, DLIF_PROTO_ZONE_NAME); if (dlif_proto_zone == NULL) { - panic("%s: failed allocating %s", __func__, + panic_plain("%s: failed allocating %s", __func__, DLIF_PROTO_ZONE_NAME); /* NOTREACHED */ } zone_change(dlif_proto_zone, Z_EXPAND, TRUE); zone_change(dlif_proto_zone, Z_CALLERACCT, FALSE); + dlif_tcpstat_size = sizeof (struct tcpstat_local); + /* Enforce 64-bit alignment for tcpstat_local structure */ + dlif_tcpstat_bufsize = + dlif_tcpstat_size + sizeof (void *) + sizeof (u_int64_t); + dlif_tcpstat_bufsize = + P2ROUNDUP(dlif_tcpstat_bufsize, sizeof (u_int64_t)); + dlif_tcpstat_zone = zinit(dlif_tcpstat_bufsize, + DLIF_TCPSTAT_ZONE_MAX * dlif_tcpstat_bufsize, 0, + DLIF_TCPSTAT_ZONE_NAME); + if (dlif_tcpstat_zone == NULL) { + panic_plain("%s: failed allocating %s", __func__, + DLIF_TCPSTAT_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(dlif_tcpstat_zone, Z_EXPAND, TRUE); + zone_change(dlif_tcpstat_zone, Z_CALLERACCT, FALSE); + + dlif_udpstat_size = sizeof (struct udpstat_local); + /* Enforce 64-bit alignment for udpstat_local structure */ + dlif_udpstat_bufsize = + dlif_udpstat_size + sizeof (void *) + sizeof (u_int64_t); + dlif_udpstat_bufsize = + P2ROUNDUP(dlif_udpstat_bufsize, sizeof (u_int64_t)); + dlif_udpstat_zone = zinit(dlif_udpstat_bufsize, + DLIF_TCPSTAT_ZONE_MAX * dlif_udpstat_bufsize, 0, + DLIF_UDPSTAT_ZONE_NAME); + if (dlif_udpstat_zone == NULL) { + panic_plain("%s: failed allocating %s", __func__, + DLIF_UDPSTAT_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(dlif_udpstat_zone, Z_EXPAND, TRUE); + zone_change(dlif_udpstat_zone, Z_CALLERACCT, FALSE); + ifnet_llreach_init(); TAILQ_INIT(&dlil_ifnet_head); @@ -839,13 +1211,15 @@ dlil_init(void) /* Setup the lock groups we will use */ dlil_grp_attributes = lck_grp_attr_alloc_init(); - dlil_lock_group = lck_grp_alloc_init("dlil internal locks", + dlil_lock_group = lck_grp_alloc_init("DLIL internal locks", dlil_grp_attributes); ifnet_lock_group = lck_grp_alloc_init("ifnet locks", dlil_grp_attributes); ifnet_head_lock_group = lck_grp_alloc_init("ifnet head lock", dlil_grp_attributes); - dlil_input_lock_grp = lck_grp_alloc_init("dlil input lock", + ifnet_rcv_lock_group = lck_grp_alloc_init("ifnet rcv locks", + dlil_grp_attributes); + ifnet_snd_lock_group = lck_grp_alloc_init("ifnet snd locks", dlil_grp_attributes); /* Setup the lock attributes we will use */ @@ -857,20 +1231,21 @@ dlil_init(void) dlil_lck_attributes); lck_mtx_init(&dlil_ifnet_lock, dlil_lock_group, dlil_lck_attributes); + ifnet_fc_init(); + lck_attr_free(dlil_lck_attributes); dlil_lck_attributes = NULL; ifa_init(); - /* - * Create and start up the first dlil input thread once everything - * is initialized. + * Create and start up the main DLIL input thread and the interface + * detacher threads once everything is initialized. 
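+	 * Per-interface input threads are created later, at ifnet_attach()
+	 * time, for drivers that opt into a dedicated or polling input model.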
*/ - dlil_create_input_thread(lo_ifp, dlil_lo_thread_ptr); + dlil_create_input_thread(NULL, dlil_main_input_thread); - if (kernel_thread_start((thread_continue_t)ifnet_delayed_thread_func, - NULL, &thread) != 0) { - panic("%s: couldn't create detach thread", __func__); + if (kernel_thread_start(ifnet_detacher_thread_func, + NULL, &thread) != KERN_SUCCESS) { + panic_plain("%s: couldn't create detacher thread", __func__); /* NOTREACHED */ } thread_deallocate(thread); @@ -879,6 +1254,12 @@ dlil_init(void) /* Initialize the packet filter */ pfinit(); #endif /* PF */ + + /* Initialize queue algorithms */ + classq_init(); + + /* Initialize packet schedulers */ + pktsched_init(); } static void @@ -1080,276 +1461,1157 @@ dlil_detach_filter(interface_filter_t filter) dlil_detach_filter_internal(filter, 0); } +/* + * Main input thread: + * + * a) handles all inbound packets for lo0 + * b) handles all inbound packets for interfaces with no dedicated + * input thread (e.g. anything but Ethernet/PDP or those that support + * opportunistic polling.) + * c) protocol registrations + * d) packet injections + */ static void -dlil_input_thread_func(struct dlil_threading_info *inputthread) +dlil_main_input_thread_func(void *v, wait_result_t w) { +#pragma unused(w) + struct dlil_main_threading_info *inpm = v; + struct dlil_threading_info *inp = v; + + VERIFY(inp == dlil_main_input_thread); + VERIFY(inp->ifp == NULL); + VERIFY(inp->mode == IFNET_MODEL_INPUT_POLL_OFF); + while (1) { struct mbuf *m = NULL, *m_loop = NULL; -#if IFNET_INPUT_SANITY_CHK - int loop_cnt = 0, mbuf_cnt; - int count; - struct mbuf *m1; -#endif /* IFNET_INPUT_SANITY_CHK */ + u_int32_t m_cnt, m_cnt_loop; + boolean_t proto_req; - lck_mtx_lock_spin(&inputthread->input_lck); + lck_mtx_lock_spin(&inp->input_lck); /* Wait until there is work to be done */ - while (!(inputthread->input_waiting & ~DLIL_INPUT_RUNNING)) { - inputthread->input_waiting &= ~DLIL_INPUT_RUNNING; - msleep(&inputthread->input_waiting, - &inputthread->input_lck, 0, - inputthread->input_name, 0); + while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) { + inp->input_waiting &= ~DLIL_INPUT_RUNNING; + (void) msleep(&inp->input_waiting, &inp->input_lck, + (PZERO - 1) | PSPIN, inp->input_name, NULL); } - lck_mtx_assert(&inputthread->input_lck, LCK_MTX_ASSERT_OWNED); + inp->input_waiting |= DLIL_INPUT_RUNNING; + inp->input_waiting &= ~DLIL_INPUT_WAITING; - m = inputthread->mbuf_head; - inputthread->mbuf_head = NULL; - inputthread->mbuf_tail = NULL; + /* Main input thread cannot be terminated */ + VERIFY(!(inp->input_waiting & DLIL_INPUT_TERMINATE)); - if (inputthread->input_waiting & DLIL_INPUT_TERMINATE) { - lck_mtx_unlock(&inputthread->input_lck); + proto_req = (inp->input_waiting & + (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)); - if (m != NULL) - mbuf_freem_list(m); + /* Packets for non-dedicated interfaces other than lo0 */ + m_cnt = qlen(&inp->rcvq_pkts); + m = _getq_all(&inp->rcvq_pkts); - OSAddAtomic(-1, &cur_dlil_input_threads); + /* Packets exclusive for lo0 */ + m_cnt_loop = qlen(&inpm->lo_rcvq_pkts); + m_loop = _getq_all(&inpm->lo_rcvq_pkts); - lck_mtx_destroy(&inputthread->input_lck, - inputthread->lck_grp); - lck_grp_free(inputthread->lck_grp); + inp->wtot = 0; - zfree(dlif_inp_zone, inputthread); + lck_mtx_unlock(&inp->input_lck); - /* for the extra refcnt from kernel_thread_start() */ - thread_deallocate(current_thread()); + /* + * NOTE warning %%% attention !!!! 
+		 * We should think about putting some thread starvation
+		 * safeguards if we deal with long chains of packets.
+		 */
+		if (m_loop != NULL)
+			dlil_input_packet_list_extended(lo_ifp, m_loop,
+			    m_cnt_loop, inp->mode);
-			/* this is the end */
-			thread_terminate(current_thread());
-			/* NOTREACHED */
-			return;
-		}
+		if (m != NULL)
+			dlil_input_packet_list_extended(NULL, m,
+			    m_cnt, inp->mode);
+
+		if (proto_req)
+			proto_input_run();
+	}
+
+	/* NOTREACHED */
+	VERIFY(0);	/* we should never get here */
+}
+
+/*
+ * Input thread for interfaces with legacy input model.
+ */
+static void
+dlil_input_thread_func(void *v, wait_result_t w)
+{
+#pragma unused(w)
+	struct dlil_threading_info *inp = v;
+	struct ifnet *ifp = inp->ifp;
+
+	VERIFY(inp != dlil_main_input_thread);
+	VERIFY(ifp != NULL);
+	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll);
+	VERIFY(inp->mode == IFNET_MODEL_INPUT_POLL_OFF);
-	inputthread->input_waiting |= DLIL_INPUT_RUNNING;
-	inputthread->input_waiting &= ~DLIL_INPUT_WAITING;
+	while (1) {
+		struct mbuf *m = NULL;
+		u_int32_t m_cnt;
+
+		lck_mtx_lock_spin(&inp->input_lck);
-	if (inputthread == dlil_lo_thread_ptr) {
-		m_loop = dlil_lo_input_mbuf_head;
-		dlil_lo_input_mbuf_head = NULL;
-		dlil_lo_input_mbuf_tail = NULL;
+		/* Wait until there is work to be done */
+		while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) {
+			inp->input_waiting &= ~DLIL_INPUT_RUNNING;
+			(void) msleep(&inp->input_waiting, &inp->input_lck,
+			    (PZERO - 1) | PSPIN, inp->input_name, NULL);
 		}
-#if IFNET_INPUT_SANITY_CHK
-	if (dlil_input_sanity_check != 0) {
-		mbuf_cnt = inputthread->mbuf_count;
-		inputthread->mbuf_count = 0;
-		if (inputthread == dlil_lo_thread_ptr) {
-			loop_cnt = dlil_lo_input_mbuf_count;
-			dlil_lo_input_mbuf_count = 0;
-		}
+		inp->input_waiting |= DLIL_INPUT_RUNNING;
+		inp->input_waiting &= ~DLIL_INPUT_WAITING;
-		lck_mtx_unlock(&inputthread->input_lck);
+		/*
+		 * Protocol registration and injection must always use
+		 * the main input thread; in theory the latter can utilize
+		 * the corresponding input thread where the packet arrived
+		 * on, but that requires our knowing the interface in advance
+		 * (and the benefits might not be worth the trouble.)
+		 */
+		VERIFY(!(inp->input_waiting &
+		    (DLIL_PROTO_WAITING|DLIL_PROTO_REGISTER)));
-		for (m1 = m, count = 0; m1; m1 = mbuf_nextpkt(m1)) {
-			count++;
-		}
-		if (count != mbuf_cnt) {
-			panic("%s - thread=%p reg. loop queue "
-			    "has %d packets, should have %d\n",
-			    __func__, inputthread, count, mbuf_cnt);
-			/* NOTREACHED */
-		}
+		/* Packets for this interface */
+		m_cnt = qlen(&inp->rcvq_pkts);
+		m = _getq_all(&inp->rcvq_pkts);
-		if (inputthread == dlil_lo_thread_ptr) {
-			for (m1 = m_loop, count = 0; m1;
-			    m1 = mbuf_nextpkt(m1)) {
-				count++;
-			}
-			if (count != loop_cnt) {
-				panic("%s - thread=%p loop queue "
-				    "has %d packets, should have %d\n",
-				    __func__, inputthread, count,
-				    loop_cnt);
-				/* NOTREACHED */
-			}
-		}
-	} else
-#endif /* IFNET_INPUT_SANITY_CHK */
-	{
-		lck_mtx_unlock(&inputthread->input_lck);
+		if (inp->input_waiting & DLIL_INPUT_TERMINATE) {
+			lck_mtx_unlock(&inp->input_lck);
+
+			/* Free up pending packets */
+			if (m != NULL)
+				mbuf_freem_list(m);
+
+			dlil_terminate_input_thread(inp);
+			/* NOTREACHED */
+			return;
 		}
+		inp->wtot = 0;
+
+		dlil_input_stats_sync(ifp, inp);
+
+		lck_mtx_unlock(&inp->input_lck);
 
 		/*
 		 * NOTE warning %%% attention !!!!
 		 * We should think about putting some thread starvation
 		 * safeguards if we deal with long chains of packets.
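+		 * (When if_rxpoll_interval_pkts is nonzero,
+		 * dlil_input_packet_list_common() already interleaves
+		 * poller wakeups while draining a chain in polling mode;
+		 * a similar cap could bound the work done per wakeup here.)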
*/ - if (m_loop) { - if (inputthread == dlil_lo_thread_ptr) { - dlil_input_packet_list(lo_ifp, m_loop); - } -#if IFNET_INPUT_SANITY_CHK - else { - panic("%s - thread=%p loop queue has %d " - "packets, should have none!\n", __func__, - inputthread, loop_cnt); - /* NOTREACHED */ - } -#endif /* IFNET_INPUT_SANITY_CHK */ - } - if (m != NULL) - dlil_input_packet_list(0, m); - - lck_mtx_lock_spin(&inputthread->input_lck); - - if (inputthread->input_waiting & - (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)) { - lck_mtx_unlock(&inputthread->input_lck); - proto_input_run(); - } else { - lck_mtx_unlock(&inputthread->input_lck); - } + dlil_input_packet_list_extended(NULL, m, + m_cnt, inp->mode); } + + /* NOTREACHED */ + VERIFY(0); /* we should never get here */ } -errno_t -ifnet_input(ifnet_t ifp, mbuf_t m_head, - const struct ifnet_stat_increment_param *stats) +/* + * Input thread for interfaces with opportunistic polling input model. + */ +static void +dlil_rxpoll_input_thread_func(void *v, wait_result_t w) { - struct thread *tp = current_thread(); - mbuf_t m_tail; - struct dlil_threading_info *inp; -#if IFNET_INPUT_SANITY_CHK - u_int32_t pkt_count = 0; -#endif /* IFNET_INPUT_SANITY_CHK */ +#pragma unused(w) + struct dlil_threading_info *inp = v; + struct ifnet *ifp = inp->ifp; + struct timespec ts; - if (ifp == NULL || m_head == NULL) { - if (m_head != NULL) - mbuf_freem_list(m_head); - return (EINVAL); - } + VERIFY(inp != dlil_main_input_thread); + VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL)); - m_tail = m_head; while (1) { -#if IFNET_INPUT_SANITY_CHK - if (dlil_input_sanity_check != 0) { - ifnet_t rcvif; + struct mbuf *m = NULL; + u_int32_t m_cnt, m_size, poll_req = 0; + ifnet_model_t mode; + struct timespec now, delta; - rcvif = mbuf_pkthdr_rcvif(m_tail); - pkt_count++; + lck_mtx_lock_spin(&inp->input_lck); - if (rcvif == NULL || - (ifp->if_type != IFT_LOOP && rcvif != ifp) || - !(mbuf_flags(m_head) & MBUF_PKTHDR)) { - panic("%s - invalid mbuf %p\n", - __func__, m_tail); - /* NOTREACHED */ - } + /* Link parameters changed? */ + if (ifp->if_poll_update != 0) { + ifp->if_poll_update = 0; + dlil_rxpoll_calc_limits(inp); } -#endif /* IFNET_INPUT_SANITY_CHK */ - if (mbuf_nextpkt(m_tail) == NULL) - break; - m_tail = mbuf_nextpkt(m_tail); - } - inp = ifp->if_input_thread; + /* Current operating mode */ + mode = inp->mode; - if (dlil_multithreaded_input == 0 || inp == NULL) - inp = dlil_lo_thread_ptr; - - /* - * If there is a matching dlil input thread associated with an - * affinity set, associate this workloop thread with the same set. - * We will only do this once. - */ - lck_mtx_lock_spin(&inp->input_lck); - if (inp->net_affinity && inp->workloop_thread == NULL) { - u_int32_t tag = inp->tag; - inp->workloop_thread = tp; - lck_mtx_unlock(&inp->input_lck); + /* Wait until there is work to be done */ + while (!(inp->input_waiting & ~DLIL_INPUT_RUNNING) && + qempty(&inp->rcvq_pkts)) { + inp->input_waiting &= ~DLIL_INPUT_RUNNING; + (void) msleep(&inp->input_waiting, &inp->input_lck, + (PZERO - 1) | PSPIN, inp->input_name, NULL); + } - /* Associated the current thread with the new affinity tag */ - (void) dlil_affinity_set(tp, tag); + inp->input_waiting |= DLIL_INPUT_RUNNING; + inp->input_waiting &= ~DLIL_INPUT_WAITING; /* - * Take a reference on the workloop (current) thread; during - * detach, we will need to refer to it in order ot tear down - * its affinity. 
+		 * Protocol registration and injection must always use
+		 * the main input thread; in theory the latter can utilize
+		 * the corresponding input thread where the packet arrived
+		 * on, but that requires our knowing the interface in advance
+		 * (and the benefits might not be worth the trouble.)
 		 */
-		thread_reference(tp);
-		lck_mtx_lock_spin(&inp->input_lck);
-	}
-
-	/* WARNING
-	 * Because of loopbacked multicast we cannot stuff the ifp in
-	 * the rcvif of the packet header: loopback has its own dlil
-	 * input queue
-	 */
+		VERIFY(!(inp->input_waiting &
+		    (DLIL_PROTO_WAITING|DLIL_PROTO_REGISTER)));
-	if (inp == dlil_lo_thread_ptr && ifp->if_type == IFT_LOOP) {
-		if (dlil_lo_input_mbuf_head == NULL)
-			dlil_lo_input_mbuf_head = m_head;
-		else if (dlil_lo_input_mbuf_tail != NULL)
-			dlil_lo_input_mbuf_tail->m_nextpkt = m_head;
-		dlil_lo_input_mbuf_tail = m_tail;
-#if IFNET_INPUT_SANITY_CHK
-		if (dlil_input_sanity_check != 0) {
-			dlil_lo_input_mbuf_count += pkt_count;
-			inp->input_mbuf_cnt += pkt_count;
-			inp->input_wake_cnt++;
+		if (inp->input_waiting & DLIL_INPUT_TERMINATE) {
+			/* Free up pending packets */
+			_flushq(&inp->rcvq_pkts);
+			lck_mtx_unlock(&inp->input_lck);
-			lck_mtx_assert(&inp->input_lck, LCK_MTX_ASSERT_OWNED);
+			dlil_terminate_input_thread(inp);
+			/* NOTREACHED */
+			return;
 		}
-#endif
-	} else {
-		if (inp->mbuf_head == NULL)
-			inp->mbuf_head = m_head;
-		else if (inp->mbuf_tail != NULL)
-			inp->mbuf_tail->m_nextpkt = m_head;
-		inp->mbuf_tail = m_tail;
-#if IFNET_INPUT_SANITY_CHK
-		if (dlil_input_sanity_check != 0) {
-			inp->mbuf_count += pkt_count;
-			inp->input_mbuf_cnt += pkt_count;
-			inp->input_wake_cnt++;
-			lck_mtx_assert(&inp->input_lck, LCK_MTX_ASSERT_OWNED);
-		}
-#endif
-	}
+		/* Total count of all packets */
+		m_cnt = qlen(&inp->rcvq_pkts);
+
+		/* Total bytes of all packets */
+		m_size = qsize(&inp->rcvq_pkts);
+
+		/* Packets for this interface */
+		m = _getq_all(&inp->rcvq_pkts);
+		VERIFY(m != NULL || m_cnt == 0);
+
+		nanouptime(&now);
+		if (!net_timerisset(&inp->sample_lasttime))
+			*(&inp->sample_lasttime) = *(&now);
+
+		net_timersub(&now, &inp->sample_lasttime, &delta);
+		if (if_rxpoll && net_timerisset(&inp->sample_holdtime)) {
+			u_int32_t ptot, btot;
+
+			/* Accumulate statistics for current sampling */
+			PKTCNTR_ADD(&inp->sstats, m_cnt, m_size);
+
+			if (net_timercmp(&delta, &inp->sample_holdtime, <))
+				goto skip;
+
+			*(&inp->sample_lasttime) = *(&now);
+
+			/* Calculate min/max of inbound bytes */
+			btot = (u_int32_t)inp->sstats.bytes;
+			if (inp->rxpoll_bmin == 0 || inp->rxpoll_bmin > btot)
+				inp->rxpoll_bmin = btot;
+			if (btot > inp->rxpoll_bmax)
+				inp->rxpoll_bmax = btot;
+
+			/* Calculate EWMA of inbound bytes */
+			DLIL_EWMA(inp->rxpoll_bavg, btot, if_rxpoll_decay);
+
+			/* Calculate min/max of inbound packets */
+			ptot = (u_int32_t)inp->sstats.packets;
+			if (inp->rxpoll_pmin == 0 || inp->rxpoll_pmin > ptot)
+				inp->rxpoll_pmin = ptot;
+			if (ptot > inp->rxpoll_pmax)
+				inp->rxpoll_pmax = ptot;
+
+			/* Calculate EWMA of inbound packets */
+			DLIL_EWMA(inp->rxpoll_pavg, ptot, if_rxpoll_decay);
+
+			/* Reset sampling statistics */
+			PKTCNTR_CLEAR(&inp->sstats);
+
+			/* Calculate EWMA of wakeup requests */
+			DLIL_EWMA(inp->rxpoll_wavg, inp->wtot, if_rxpoll_decay);
+			inp->wtot = 0;
+
+			if (dlil_verbose) {
+				if (!net_timerisset(&inp->dbg_lasttime))
+					*(&inp->dbg_lasttime) = *(&now);
+				net_timersub(&now, &inp->dbg_lasttime, &delta);
+				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
+					*(&inp->dbg_lasttime) = *(&now);
+					printf("%s%d: [%s] pkts avg %d max %d "
+					    "limits [%d/%d], wreq avg %d "
+					    "limits [%d/%d], bytes avg %d "
+					    "limits [%d/%d]\n", ifp->if_name,
+					    ifp->if_unit, (inp->mode ==
+					    IFNET_MODEL_INPUT_POLL_ON) ?
+					    "ON" : "OFF", inp->rxpoll_pavg,
+					    inp->rxpoll_pmax,
+					    inp->rxpoll_plowat,
+					    inp->rxpoll_phiwat,
+					    inp->rxpoll_wavg,
+					    inp->rxpoll_wlowat,
+					    inp->rxpoll_whiwat,
+					    inp->rxpoll_bavg,
+					    inp->rxpoll_blowat,
+					    inp->rxpoll_bhiwat);
+				}
+			}
-	inp->input_waiting |= DLIL_INPUT_WAITING;
+			/* Perform mode transition, if necessary */
+			if (!net_timerisset(&inp->mode_lasttime))
+				*(&inp->mode_lasttime) = *(&now);
+
+			net_timersub(&now, &inp->mode_lasttime, &delta);
+			if (net_timercmp(&delta, &inp->mode_holdtime, <))
+				goto skip;
+
+			if (inp->rxpoll_pavg <= inp->rxpoll_plowat &&
+			    inp->rxpoll_bavg <= inp->rxpoll_blowat &&
+			    inp->rxpoll_wavg <= inp->rxpoll_wlowat &&
+			    inp->mode != IFNET_MODEL_INPUT_POLL_OFF) {
+				mode = IFNET_MODEL_INPUT_POLL_OFF;
+			} else if (inp->rxpoll_pavg >= inp->rxpoll_phiwat &&
+			    (inp->rxpoll_bavg >= inp->rxpoll_bhiwat ||
+			    inp->rxpoll_wavg >= inp->rxpoll_whiwat) &&
+			    inp->mode != IFNET_MODEL_INPUT_POLL_ON) {
+				mode = IFNET_MODEL_INPUT_POLL_ON;
+			}
-	if ((inp->input_waiting & DLIL_INPUT_RUNNING) == 0) {
-		wakeup((caddr_t)&inp->input_waiting);
-	}
-	lck_mtx_unlock(&inp->input_lck);
+			if (mode != inp->mode) {
+				inp->mode = mode;
+				*(&inp->mode_lasttime) = *(&now);
+				poll_req++;
+			}
+		}
+skip:
+		dlil_input_stats_sync(ifp, inp);
-	if (stats) {
-		atomic_add_64(&ifp->if_data.ifi_ipackets, stats->packets_in);
-		atomic_add_64(&ifp->if_data.ifi_ibytes, stats->bytes_in);
-		atomic_add_64(&ifp->if_data.ifi_ierrors, stats->errors_in);
+		lck_mtx_unlock(&inp->input_lck);
-		atomic_add_64(&ifp->if_data.ifi_opackets, stats->packets_out);
-		atomic_add_64(&ifp->if_data.ifi_obytes, stats->bytes_out);
-		atomic_add_64(&ifp->if_data.ifi_oerrors, stats->errors_out);
+		/*
+		 * If there's a mode change and interface is still attached,
+		 * perform a downcall to the driver for the new mode. Also
+		 * hold an IO refcnt on the interface to prevent it from
+		 * being detached (will be released below.)
+		 */
+		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
+			struct ifnet_model_params p = { mode, { 0 } };
+			errno_t err;
+
+			if (dlil_verbose) {
+				printf("%s%d: polling is now %s, "
+				    "pkts avg %d max %d limits [%d/%d], "
+				    "wreq avg %d limits [%d/%d], "
+				    "bytes avg %d limits [%d/%d]\n",
+				    ifp->if_name, ifp->if_unit,
+				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
+				    "ON" : "OFF", inp->rxpoll_pavg,
+				    inp->rxpoll_pmax, inp->rxpoll_plowat,
+				    inp->rxpoll_phiwat, inp->rxpoll_wavg,
+				    inp->rxpoll_wlowat, inp->rxpoll_whiwat,
+				    inp->rxpoll_bavg, inp->rxpoll_blowat,
+				    inp->rxpoll_bhiwat);
+			}
-		atomic_add_64(&ifp->if_data.ifi_collisions, stats->collisions);
-		atomic_add_64(&ifp->if_data.ifi_iqdrops, stats->dropped);
-	}
+			if ((err = ((*ifp->if_input_ctl)(ifp,
+			    IFNET_CTL_SET_INPUT_MODEL, sizeof (p), &p))) != 0) {
+				printf("%s%d: error setting polling mode "
+				    "to %s (%d)\n", ifp->if_name, ifp->if_unit,
+				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
+ "ON" : "OFF", err); + } -static int -dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p, - char **frame_header_p, protocol_family_t protocol_family) -{ - struct ifnet_filter *filter; + switch (mode) { + case IFNET_MODEL_INPUT_POLL_OFF: + ifnet_set_poll_cycle(ifp, NULL); + inp->rxpoll_offreq++; + if (err != 0) + inp->rxpoll_offerr++; + break; - /* - * Pass the inbound packet to the interface filters + case IFNET_MODEL_INPUT_POLL_ON: + net_nsectimer(&if_rxpoll_interval_time, &ts); + ifnet_set_poll_cycle(ifp, &ts); + ifnet_poll(ifp); + inp->rxpoll_onreq++; + if (err != 0) + inp->rxpoll_onerr++; + break; + + default: + VERIFY(0); + /* NOTREACHED */ + } + + /* Release the IO refcnt */ + ifnet_decr_iorefcnt(ifp); + } + + /* + * NOTE warning %%% attention !!!! + * We should think about putting some thread starvation + * safeguards if we deal with long chains of packets. + */ + if (m != NULL) + dlil_input_packet_list_extended(NULL, m, m_cnt, mode); + } + + /* NOTREACHED */ + VERIFY(0); /* we should never get here */ +} + +static void +dlil_rxpoll_calc_limits(struct dlil_threading_info *inp) +{ + struct ifnet *ifp = inp->ifp; + u_int64_t sample_holdtime, inbw; + + VERIFY(inp != dlil_main_input_thread); + VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL)); + + if ((inbw = ifnet_input_linkrate(ifp)) == 0) { + sample_holdtime = 0; /* polling is disabled */ + inp->rxpoll_wlowat = inp->rxpoll_plowat = + inp->rxpoll_blowat = 0; + inp->rxpoll_whiwat = inp->rxpoll_phiwat = + inp->rxpoll_bhiwat = (u_int32_t)-1; + } else { + unsigned int n, i; + + n = 0; + for (i = 0; rxpoll_tbl[i].speed != 0; i++) { + if (inbw < rxpoll_tbl[i].speed) + break; + n = i; + } + sample_holdtime = if_rxpoll_sample_holdtime; + inp->rxpoll_wlowat = if_rxpoll_wlowat; + inp->rxpoll_whiwat = if_rxpoll_whiwat; + inp->rxpoll_plowat = rxpoll_tbl[n].plowat; + inp->rxpoll_phiwat = rxpoll_tbl[n].phiwat; + inp->rxpoll_blowat = rxpoll_tbl[n].blowat; + inp->rxpoll_bhiwat = rxpoll_tbl[n].bhiwat; + } + + net_nsectimer(&if_rxpoll_mode_holdtime, &inp->mode_holdtime); + net_nsectimer(&sample_holdtime, &inp->sample_holdtime); + + if (dlil_verbose) { + printf("%s%d: speed %llu bps, sample per %llu nsec, " + "pkt limits [%d/%d], wreq limits [%d/%d], " + "bytes limits [%d/%d]\n", ifp->if_name, ifp->if_unit, + inbw, sample_holdtime, inp->rxpoll_plowat, + inp->rxpoll_phiwat, inp->rxpoll_wlowat, inp->rxpoll_whiwat, + inp->rxpoll_blowat, inp->rxpoll_bhiwat); + } +} + +errno_t +ifnet_input(struct ifnet *ifp, struct mbuf *m_head, + const struct ifnet_stat_increment_param *s) +{ + return (ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE)); +} + +errno_t +ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head, + struct mbuf *m_tail, const struct ifnet_stat_increment_param *s) +{ + return (ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE)); +} + +static errno_t +ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, + const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll) +{ + struct thread *tp = current_thread(); + struct mbuf *last; + struct dlil_threading_info *inp; + u_int32_t m_cnt = 0, m_size = 0; + + /* + * Drop the packet(s) if the parameters are invalid, or if the + * interface is no longer attached; else hold an IO refcnt to + * prevent it from being detached (will be released below.) 
+	 */
+	if (ifp == NULL || m_head == NULL || (s == NULL && ext) ||
+	    (ifp != lo_ifp && !ifnet_is_attached(ifp, 1))) {
+		if (m_head != NULL)
+			mbuf_freem_list(m_head);
+		return (EINVAL);
+	}
+
+	VERIFY(m_tail == NULL || ext);
+	VERIFY(s != NULL || !ext);
+
+	if (m_tail == NULL) {
+		last = m_head;
+		while (1) {
+#if IFNET_INPUT_SANITY_CHK
+			if (dlil_input_sanity_check != 0)
+				DLIL_INPUT_CHECK(last, ifp);
+#endif /* IFNET_INPUT_SANITY_CHK */
+			m_cnt++;
+			m_size += m_length(last);
+			if (mbuf_nextpkt(last) == NULL)
+				break;
+			last = mbuf_nextpkt(last);
+		}
+		m_tail = last;
+	} else {
+#if IFNET_INPUT_SANITY_CHK
+		if (dlil_input_sanity_check != 0) {
+			last = m_head;
+			while (1) {
+				DLIL_INPUT_CHECK(last, ifp);
+				m_cnt++;
+				m_size += m_length(last);
+				if (mbuf_nextpkt(last) == NULL)
+					break;
+				last = mbuf_nextpkt(last);
+			}
+		} else {
+			m_cnt = s->packets_in;
+			m_size = s->bytes_in;
+			last = m_tail;
+		}
+#else
+		m_cnt = s->packets_in;
+		m_size = s->bytes_in;
+		last = m_tail;
+#endif /* IFNET_INPUT_SANITY_CHK */
+	}
+
+	if (last != m_tail) {
+		panic_plain("%s: invalid input packet chain for %s%d, "
+		    "tail mbuf %p instead of %p\n", __func__, ifp->if_name,
+		    ifp->if_unit, m_tail, last);
+	}
+
+	/*
+	 * Assert packet count only for the extended variant, for backwards
+	 * compatibility, since this came directly from the device driver.
+	 * Relax this assertion for input bytes, as the driver may have
+	 * included the link-layer headers in the computation; hence
+	 * m_size is just an approximation.
+	 */
+	if (ext && s->packets_in != m_cnt) {
+		panic_plain("%s: input packet count mismatch for %s%d, "
+		    "%d instead of %d\n", __func__, ifp->if_name,
+		    ifp->if_unit, s->packets_in, m_cnt);
+	}
+
+	if ((inp = ifp->if_inp) == NULL)
+		inp = dlil_main_input_thread;
+
+	/*
+	 * If there is a matching DLIL input thread associated with an
+	 * affinity set, associate this thread with the same set. We
+	 * will only do this once.
+	 */
+	lck_mtx_lock_spin(&inp->input_lck);
+	if (inp != dlil_main_input_thread && inp->net_affinity &&
+	    ((!poll && inp->wloop_thr == THREAD_NULL) ||
+	    (poll && inp->poll_thr == THREAD_NULL))) {
+		u_int32_t tag = inp->tag;
+
+		if (poll) {
+			VERIFY(inp->poll_thr == THREAD_NULL);
+			inp->poll_thr = tp;
+		} else {
+			VERIFY(inp->wloop_thr == THREAD_NULL);
+			inp->wloop_thr = tp;
+		}
+		lck_mtx_unlock(&inp->input_lck);
+
+		/* Associate the current thread with the new affinity tag */
+		(void) dlil_affinity_set(tp, tag);
+
+		/*
+		 * Take a reference on the current thread; during detach,
+		 * we will need to refer to it in order to tear down its
+		 * affinity.
+		 */
+		thread_reference(tp);
+		lck_mtx_lock_spin(&inp->input_lck);
+	}
+
+	/*
+	 * Because of loopbacked multicast we cannot stuff the ifp in
+	 * the rcvif of the packet header: loopback (lo0) packets use a
+	 * dedicated list so that we can later associate them with lo_ifp
+	 * on their way up the stack. Packets for other interfaces without
+	 * dedicated input threads go to the regular list.
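+	 * (The rcvif of a loopbacked packet still names the interface
+	 * it was originally sent on, so it cannot be used to identify
+	 * lo0 input; the dedicated queue does that instead.)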
+ */ + if (inp == dlil_main_input_thread && ifp == lo_ifp) { + struct dlil_main_threading_info *inpm = + (struct dlil_main_threading_info *)inp; + _addq_multi(&inpm->lo_rcvq_pkts, m_head, m_tail, m_cnt, m_size); + } else { + _addq_multi(&inp->rcvq_pkts, m_head, m_tail, m_cnt, m_size); + } + +#if IFNET_INPUT_SANITY_CHK + if (dlil_input_sanity_check != 0) { + u_int32_t count; + struct mbuf *m0; + + for (m0 = m_head, count = 0; m0; m0 = mbuf_nextpkt(m0)) + count++; + + if (count != m_cnt) { + panic_plain("%s%d: invalid packet count %d " + "(expected %d)\n", ifp->if_name, ifp->if_unit, + count, m_cnt); + /* NOTREACHED */ + } + + inp->input_mbuf_cnt += m_cnt; + } +#endif /* IFNET_INPUT_SANITY_CHK */ + + if (s != NULL) { + dlil_input_stats_add(s, inp, poll); + /* + * If we're using the main input thread, synchronize the + * stats now since we have the interface context. All + * other cases involving dedicated input threads will + * have their stats synchronized there. + */ + if (inp == dlil_main_input_thread) + dlil_input_stats_sync(ifp, inp); + } + + inp->input_waiting |= DLIL_INPUT_WAITING; + if (!(inp->input_waiting & DLIL_INPUT_RUNNING)) { + inp->wtot++; + wakeup_one((caddr_t)&inp->input_waiting); + } + lck_mtx_unlock(&inp->input_lck); + + if (ifp != lo_ifp) { + /* Release the IO refcnt */ + ifnet_decr_iorefcnt(ifp); + } + + return (0); +} + +void +ifnet_start(struct ifnet *ifp) +{ + /* + * If the starter thread is inactive, signal it to do work. + */ + lck_mtx_lock_spin(&ifp->if_start_lock); + ifp->if_start_req++; + if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL) { + wakeup_one((caddr_t)&ifp->if_start_thread); + } + lck_mtx_unlock(&ifp->if_start_lock); +} + +static void +ifnet_start_thread_fn(void *v, wait_result_t w) +{ +#pragma unused(w) + struct ifnet *ifp = v; + char ifname[IFNAMSIZ + 1]; + struct timespec *ts = NULL; + struct ifclassq *ifq = &ifp->if_snd; + + /* + * Treat the dedicated starter thread for lo0 as equivalent to + * the driver workloop thread; if net_affinity is enabled for + * the main input thread, associate this starter thread to it + * by binding them with the same affinity tag. This is done + * only once (as we only have one lo_ifp which never goes away.) + */ + if (ifp == lo_ifp) { + struct dlil_threading_info *inp = dlil_main_input_thread; + struct thread *tp = current_thread(); + + lck_mtx_lock(&inp->input_lck); + if (inp->net_affinity) { + u_int32_t tag = inp->tag; + + VERIFY(inp->wloop_thr == THREAD_NULL); + VERIFY(inp->poll_thr == THREAD_NULL); + inp->wloop_thr = tp; + lck_mtx_unlock(&inp->input_lck); + + /* Associate this thread with the affinity tag */ + (void) dlil_affinity_set(tp, tag); + } else { + lck_mtx_unlock(&inp->input_lck); + } + } + + snprintf(ifname, sizeof (ifname), "%s%d_starter", + ifp->if_name, ifp->if_unit); + + lck_mtx_lock_spin(&ifp->if_start_lock); + + for (;;) { + (void) msleep(&ifp->if_start_thread, &ifp->if_start_lock, + (PZERO - 1) | PSPIN, ifname, ts); + + /* interface is detached? 
*/ + if (ifp->if_start_thread == THREAD_NULL) { + ifnet_set_start_cycle(ifp, NULL); + lck_mtx_unlock(&ifp->if_start_lock); + ifnet_purge(ifp); + + if (dlil_verbose) { + printf("%s%d: starter thread terminated\n", + ifp->if_name, ifp->if_unit); + } + + /* for the extra refcnt from kernel_thread_start() */ + thread_deallocate(current_thread()); + /* this is the end */ + thread_terminate(current_thread()); + /* NOTREACHED */ + return; + } + + ifp->if_start_active = 1; + for (;;) { + u_int32_t req = ifp->if_start_req; + + lck_mtx_unlock(&ifp->if_start_lock); + /* invoke the driver's start routine */ + ((*ifp->if_start)(ifp)); + lck_mtx_lock_spin(&ifp->if_start_lock); + + /* if there's no pending request, we're done */ + if (req == ifp->if_start_req) + break; + } + ifp->if_start_req = 0; + ifp->if_start_active = 0; + /* + * Wakeup N ns from now if rate-controlled by TBR, and if + * there are still packets in the send queue which haven't + * been dequeued so far; else sleep indefinitely (ts = NULL) + * until ifnet_start() is called again. + */ + ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ? + &ifp->if_start_cycle : NULL); + + if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) + ts = NULL; + } + + /* NOTREACHED */ + lck_mtx_unlock(&ifp->if_start_lock); + VERIFY(0); /* we should never get here */ +} + +void +ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts) +{ + if (ts == NULL) + bzero(&ifp->if_start_cycle, sizeof (ifp->if_start_cycle)); + else + *(&ifp->if_start_cycle) = *ts; + + if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) + printf("%s%d: restart interval set to %lu nsec\n", + ifp->if_name, ifp->if_unit, ts->tv_nsec); +} + +static void +ifnet_poll(struct ifnet *ifp) +{ + /* + * If the poller thread is inactive, signal it to do work. + */ + lck_mtx_lock_spin(&ifp->if_poll_lock); + ifp->if_poll_req++; + if (!ifp->if_poll_active && ifp->if_poll_thread != THREAD_NULL) { + wakeup_one((caddr_t)&ifp->if_poll_thread); + } + lck_mtx_unlock(&ifp->if_poll_lock); +} + +static void +ifnet_poll_thread_fn(void *v, wait_result_t w) +{ +#pragma unused(w) + struct dlil_threading_info *inp; + struct ifnet *ifp = v; + char ifname[IFNAMSIZ + 1]; + struct timespec *ts = NULL; + struct ifnet_stat_increment_param s; + + snprintf(ifname, sizeof (ifname), "%s%d_poller", + ifp->if_name, ifp->if_unit); + bzero(&s, sizeof (s)); + + lck_mtx_lock_spin(&ifp->if_poll_lock); + + inp = ifp->if_inp; + VERIFY(inp != NULL); + + for (;;) { + if (ifp->if_poll_thread != THREAD_NULL) { + (void) msleep(&ifp->if_poll_thread, &ifp->if_poll_lock, + (PZERO - 1) | PSPIN, ifname, ts); + } + + /* interface is detached (maybe while asleep)? */ + if (ifp->if_poll_thread == THREAD_NULL) { + ifnet_set_poll_cycle(ifp, NULL); + lck_mtx_unlock(&ifp->if_poll_lock); + + if (dlil_verbose) { + printf("%s%d: poller thread terminated\n", + ifp->if_name, ifp->if_unit); + } + + /* for the extra refcnt from kernel_thread_start() */ + thread_deallocate(current_thread()); + /* this is the end */ + thread_terminate(current_thread()); + /* NOTREACHED */ + return; + } + + ifp->if_poll_active = 1; + for (;;) { + struct mbuf *m_head, *m_tail; + u_int32_t m_lim, m_cnt, m_totlen; + u_int16_t req = ifp->if_poll_req; + + lck_mtx_unlock(&ifp->if_poll_lock); + + /* + * If no longer attached, there's nothing to do; + * else hold an IO refcnt to prevent the interface + * from being detached (will be released below.) + */ + if (!ifnet_is_attached(ifp, 1)) + break; + + m_lim = (if_rxpoll_max != 0) ? 
if_rxpoll_max : + MAX((qlimit(&inp->rcvq_pkts)), + (inp->rxpoll_phiwat << 2)); + + if (dlil_verbose > 1) { + printf("%s%d: polling up to %d pkts, " + "pkts avg %d max %d, wreq avg %d, " + "bytes avg %d\n", + ifp->if_name, ifp->if_unit, m_lim, + inp->rxpoll_pavg, inp->rxpoll_pmax, + inp->rxpoll_wavg, inp->rxpoll_bavg); + } + + /* invoke the driver's input poll routine */ + ((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail, + &m_cnt, &m_totlen)); + + if (m_head != NULL) { + VERIFY(m_tail != NULL && m_cnt > 0); + + if (dlil_verbose > 1) { + printf("%s%d: polled %d pkts, " + "pkts avg %d max %d, wreq avg %d, " + "bytes avg %d\n", + ifp->if_name, ifp->if_unit, m_cnt, + inp->rxpoll_pavg, inp->rxpoll_pmax, + inp->rxpoll_wavg, inp->rxpoll_bavg); + } + + /* stats are required for extended variant */ + s.packets_in = m_cnt; + s.bytes_in = m_totlen; + + (void) ifnet_input_common(ifp, m_head, m_tail, + &s, TRUE, TRUE); + } else if (dlil_verbose > 1) { + printf("%s%d: no packets, pkts avg %d max %d, " + "wreq avg %d, bytes avg %d\n", ifp->if_name, + ifp->if_unit, inp->rxpoll_pavg, + inp->rxpoll_pmax, inp->rxpoll_wavg, + inp->rxpoll_bavg); + } + + /* Release the io ref count */ + ifnet_decr_iorefcnt(ifp); + + lck_mtx_lock_spin(&ifp->if_poll_lock); + + /* if there's no pending request, we're done */ + if (req == ifp->if_poll_req) + break; + } + ifp->if_poll_req = 0; + ifp->if_poll_active = 0; + + /* + * Wakeup N ns from now, else sleep indefinitely (ts = NULL) + * until ifnet_poll() is called again. + */ + ts = &ifp->if_poll_cycle; + if (ts->tv_sec == 0 && ts->tv_nsec == 0) + ts = NULL; + } + + /* NOTREACHED */ + lck_mtx_unlock(&ifp->if_poll_lock); + VERIFY(0); /* we should never get here */ +} + +void +ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts) +{ + if (ts == NULL) + bzero(&ifp->if_poll_cycle, sizeof (ifp->if_poll_cycle)); + else + *(&ifp->if_poll_cycle) = *ts; + + if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) + printf("%s%d: poll interval set to %lu nsec\n", + ifp->if_name, ifp->if_unit, ts->tv_nsec); +} + +void +ifnet_purge(struct ifnet *ifp) +{ + if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) + if_qflush(ifp, 0); +} + +void +ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev) +{ + IFCQ_LOCK_ASSERT_HELD(ifq); + + if (!(IFCQ_IS_READY(ifq))) + return; + + if (IFCQ_TBR_IS_ENABLED(ifq)) { + struct tb_profile tb = { ifq->ifcq_tbr.tbr_rate_raw, + ifq->ifcq_tbr.tbr_percent, 0 }; + (void) ifclassq_tbr_set(ifq, &tb, FALSE); + } + + ifclassq_update(ifq, ev); +} + +void +ifnet_update_rcv(struct ifnet *ifp, cqev_t ev) +{ + switch (ev) { + case CLASSQ_EV_LINK_SPEED: + if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) + ifp->if_poll_update++; + break; + + default: + break; + } +} + +errno_t +ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model) +{ + struct ifclassq *ifq; + u_int32_t omodel; + errno_t err; + + if (ifp == NULL || (model != IFNET_SCHED_MODEL_DRIVER_MANAGED && + model != IFNET_SCHED_MODEL_NORMAL)) + return (EINVAL); + else if (!(ifp->if_eflags & IFEF_TXSTART)) + return (ENXIO); + + ifq = &ifp->if_snd; + IFCQ_LOCK(ifq); + omodel = ifp->if_output_sched_model; + ifp->if_output_sched_model = model; + if ((err = ifclassq_pktsched_setup(ifq)) != 0) + ifp->if_output_sched_model = omodel; + IFCQ_UNLOCK(ifq); + + return (err); +} + +errno_t +ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen) +{ + if (ifp == NULL) + return (EINVAL); + else if (!(ifp->if_eflags & IFEF_TXSTART)) + return (ENXIO); + + ifclassq_set_maxlen(&ifp->if_snd, maxqlen); + + return (0); +} 
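+
+/*
+ * Sketch of the intended usage from an IFEF_TXSTART driver (the
+ * mydrv_* names are hypothetical and not part of this file): size
+ * the transmit queue once at attach time, then drain it from the
+ * start callback that ifnet_start() schedules:
+ *
+ *	(void) ifnet_set_sndq_maxlen(ifp, 512);
+ *
+ *	static void
+ *	mydrv_start(struct ifnet *ifp)
+ *	{
+ *		struct mbuf *m;
+ *
+ *		while (ifnet_dequeue(ifp, &m) == 0)
+ *			mydrv_tx_submit(ifp, m);
+ *	}
+ */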
+ +errno_t +ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen) +{ + if (ifp == NULL || maxqlen == NULL) + return (EINVAL); + else if (!(ifp->if_eflags & IFEF_TXSTART)) + return (ENXIO); + + *maxqlen = ifclassq_get_maxlen(&ifp->if_snd); + + return (0); +} + +errno_t +ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *qlen) +{ + if (ifp == NULL || qlen == NULL) + return (EINVAL); + else if (!(ifp->if_eflags & IFEF_TXSTART)) + return (ENXIO); + + *qlen = ifclassq_get_len(&ifp->if_snd); + + return (0); +} + +errno_t +ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen) +{ + struct dlil_threading_info *inp; + + if (ifp == NULL) + return (EINVAL); + else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) + return (ENXIO); + + if (maxqlen == 0) + maxqlen = if_rcvq_maxlen; + else if (maxqlen < IF_RCVQ_MINLEN) + maxqlen = IF_RCVQ_MINLEN; + + inp = ifp->if_inp; + lck_mtx_lock(&inp->input_lck); + qlimit(&inp->rcvq_pkts) = maxqlen; + lck_mtx_unlock(&inp->input_lck); + + return (0); +} + +errno_t +ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen) +{ + struct dlil_threading_info *inp; + + if (ifp == NULL || maxqlen == NULL) + return (EINVAL); + else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) + return (ENXIO); + + inp = ifp->if_inp; + lck_mtx_lock(&inp->input_lck); + *maxqlen = qlimit(&inp->rcvq_pkts); + lck_mtx_unlock(&inp->input_lck); + return (0); +} + +errno_t +ifnet_enqueue(struct ifnet *ifp, struct mbuf *m) +{ + int error; + + if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) || + m->m_nextpkt != NULL) { + if (m != NULL) + m_freem_list(m); + return (EINVAL); + } else if (!(ifp->if_eflags & IFEF_TXSTART) || + !(ifp->if_refflags & IFRF_ATTACHED)) { + /* flag tested without lock for performance */ + m_freem(m); + return (ENXIO); + } else if (!(ifp->if_flags & IFF_UP)) { + m_freem(m); + return (ENETDOWN); + + } + + /* enqueue the packet */ + error = ifclassq_enqueue(&ifp->if_snd, m); + + /* + * Tell the driver to start dequeueing; do this even when the queue + * for the packet is suspended (EQSUSPENDED), as the driver could still + * be dequeueing from other unsuspended queues. 
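+	 *
+	 * EQFULL and EQSUSPENDED are also what dlil_output() translates
+	 * into the flow-advisory codes (FADV_FLOW_CONTROLLED and
+	 * FADV_SUSPENDED) reported to callers that pass a non-NULL adv.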
+ */ + if (error == 0 || error == EQFULL || error == EQSUSPENDED) + ifnet_start(ifp); + + return (error); +} + +errno_t +ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp) +{ + if (ifp == NULL || mp == NULL) + return (EINVAL); + else if (!(ifp->if_eflags & IFEF_TXSTART) || + (ifp->if_output_sched_model != IFNET_SCHED_MODEL_NORMAL)) + return (ENXIO); + + return (ifclassq_dequeue(&ifp->if_snd, 1, mp, NULL, NULL, NULL)); +} + +errno_t +ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc, + struct mbuf **mp) +{ + if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) + return (EINVAL); + else if (!(ifp->if_eflags & IFEF_TXSTART) || + (ifp->if_output_sched_model != IFNET_SCHED_MODEL_DRIVER_MANAGED)) + return (ENXIO); + + return (ifclassq_dequeue_sc(&ifp->if_snd, sc, 1, mp, NULL, NULL, NULL)); +} + +errno_t +ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t limit, struct mbuf **head, + struct mbuf **tail, u_int32_t *cnt, u_int32_t *len) +{ + if (ifp == NULL || head == NULL || limit < 1) + return (EINVAL); + else if (!(ifp->if_eflags & IFEF_TXSTART) || + (ifp->if_output_sched_model != IFNET_SCHED_MODEL_NORMAL)) + return (ENXIO); + + return (ifclassq_dequeue(&ifp->if_snd, limit, head, tail, cnt, len)); +} + +errno_t +ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc, + u_int32_t limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, + u_int32_t *len) +{ + + if (ifp == NULL || head == NULL || limit < 1 || !MBUF_VALID_SC(sc)) + return (EINVAL); + else if (!(ifp->if_eflags & IFEF_TXSTART) || + (ifp->if_output_sched_model != IFNET_SCHED_MODEL_DRIVER_MANAGED)) + return (ENXIO); + + return (ifclassq_dequeue_sc(&ifp->if_snd, sc, limit, head, + tail, cnt, len)); +} + +static int +dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p, + char **frame_header_p, protocol_family_t protocol_family) +{ + struct ifnet_filter *filter; + + /* + * Pass the inbound packet to the interface filters */ lck_mtx_lock_spin(&ifp->if_flt_lock); /* prevent filter list from changing in case we drop the lock */ @@ -1458,8 +2720,110 @@ dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m) return; } +static void +dlil_input_stats_add(const struct ifnet_stat_increment_param *s, + struct dlil_threading_info *inp, boolean_t poll) +{ + struct ifnet_stat_increment_param *d = &inp->stats; + + if (s->packets_in != 0) + d->packets_in += s->packets_in; + if (s->bytes_in != 0) + d->bytes_in += s->bytes_in; + if (s->errors_in != 0) + d->errors_in += s->errors_in; + + if (s->packets_out != 0) + d->packets_out += s->packets_out; + if (s->bytes_out != 0) + d->bytes_out += s->bytes_out; + if (s->errors_out != 0) + d->errors_out += s->errors_out; + + if (s->collisions != 0) + d->collisions += s->collisions; + if (s->dropped != 0) + d->dropped += s->dropped; + + if (poll) + PKTCNTR_ADD(&inp->tstats, s->packets_in, s->bytes_in); +} + +static void +dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp) +{ + struct ifnet_stat_increment_param *s = &inp->stats; + + /* + * Use of atomic operations is unavoidable here because + * these stats may also be incremented elsewhere via KPIs. 
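+	 *
+	 * Each accumulated value is folded into the ifnet and then
+	 * cleared, so every increment is published exactly once.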
+ */ + if (s->packets_in != 0) { + atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in); + s->packets_in = 0; + } + if (s->bytes_in != 0) { + atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in); + s->bytes_in = 0; + } + if (s->errors_in != 0) { + atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in); + s->errors_in = 0; + } + + if (s->packets_out != 0) { + atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out); + s->packets_out = 0; + } + if (s->bytes_out != 0) { + atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out); + s->bytes_out = 0; + } + if (s->errors_out != 0) { + atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out); + s->errors_out = 0; + } + + if (s->collisions != 0) { + atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions); + s->collisions = 0; + } + if (s->dropped != 0) { + atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped); + s->dropped = 0; + } + + /* + * No need for atomic operations as they are modified here + * only from within the DLIL input thread context. + */ + if (inp->tstats.packets != 0) { + inp->pstats.ifi_poll_packets += inp->tstats.packets; + inp->tstats.packets = 0; + } + if (inp->tstats.bytes != 0) { + inp->pstats.ifi_poll_bytes += inp->tstats.bytes; + inp->tstats.bytes = 0; + } +} + +__private_extern__ void +dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m) +{ + return (dlil_input_packet_list_common(ifp, m, 0, + IFNET_MODEL_INPUT_POLL_OFF, FALSE)); +} + __private_extern__ void -dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m) +dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m, + u_int32_t cnt, ifnet_model_t mode) +{ + return (dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE)); +} + +static void +dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m, + u_int32_t cnt, ifnet_model_t mode, boolean_t ext) { int error = 0; protocol_family_t protocol_family; @@ -1469,9 +2833,13 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m) struct if_proto * last_ifproto = NULL; mbuf_t pkt_first = NULL; mbuf_t * pkt_next = NULL; + u_int32_t poll_thresh = 0, poll_ival = 0; KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START,0,0,0,0,0); + if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 && + (poll_ival = if_rxpoll_interval_pkts) > 0) + poll_thresh = cnt; while (m != NULL) { struct if_proto *ifproto = NULL; @@ -1480,16 +2848,22 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m) if (ifp_param == NULL) ifp = m->m_pkthdr.rcvif; + if ((ifp->if_eflags & IFEF_RXPOLL) && poll_thresh != 0 && + poll_ival > 0 && (--poll_thresh % poll_ival) == 0) + ifnet_poll(ifp); + /* Check if this mbuf looks valid */ - MBUF_INPUT_CHECK(m, ifp); + MBUF_INPUT_CHECK(m, ifp); next_packet = m->m_nextpkt; m->m_nextpkt = NULL; frame_header = m->m_pkthdr.header; m->m_pkthdr.header = NULL; - /* Get an IO reference count if the interface is not - * loopback and it is attached. + /* + * Get an IO reference count if the interface is not + * loopback (lo0) and it is attached; lo0 never goes + * away, so optimize for that. 
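+		 * (Passing a nonzero second argument makes
+		 * ifnet_is_attached() take the I/O reference as a
+		 * side effect when it succeeds.)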
*/ if (ifp != lo_ifp) { if (!ifnet_is_attached(ifp, 1)) { @@ -1499,22 +2873,7 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m) iorefcnt = 1; } - switch (m->m_pkthdr.prio) { - case MBUF_TC_BK: - atomic_add_64(&ifp->if_tc.ifi_ibkpackets, 1); - atomic_add_64(&ifp->if_tc.ifi_ibkbytes, m->m_pkthdr.len); - break; - case MBUF_TC_VI: - atomic_add_64(&ifp->if_tc.ifi_ivipackets, 1); - atomic_add_64(&ifp->if_tc.ifi_ivibytes, m->m_pkthdr.len); - break; - case MBUF_TC_VO: - atomic_add_64(&ifp->if_tc.ifi_ivopackets, 1); - atomic_add_64(&ifp->if_tc.ifi_ivobytes, m->m_pkthdr.len); - break; - default: - break; - } + ifp_inc_traffic_class_in(ifp, m); /* find which protocol family this packet is for */ ifnet_lock_shared(ifp); @@ -1527,6 +2886,10 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m) protocol_family = 0; } +#if CONFIG_EMBEDDED + iptap_ipf_input(ifp, protocol_family, m, frame_header); +#endif /* CONFIG_EMBEDDED */ + if (m->m_flags & (M_BCAST|M_MCAST)) atomic_add_64(&ifp->if_imcasts, 1); @@ -1595,7 +2958,7 @@ next: if_proto_free(ifproto); ifproto = NULL; } - + m = next_packet; /* update the driver's multicast filter, if needed */ @@ -1606,7 +2969,6 @@ next: } KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END,0,0,0,0,0); - return; } errno_t @@ -1760,27 +3122,78 @@ dlil_get_socket_type(struct mbuf **mp, int family, int raw) } #endif -static void -if_inc_traffic_class_out(ifnet_t ifp, mbuf_t m) +/* + * This is mostly called from the context of the DLIL input thread; + * because of that there is no need for atomic operations. + */ +static __inline void +ifp_inc_traffic_class_in(struct ifnet *ifp, struct mbuf *m) { if (!(m->m_flags & M_PKTHDR)) return; - switch (m->m_pkthdr.prio) { - case MBUF_TC_BK: - atomic_add_64(&ifp->if_tc.ifi_obkpackets, 1); - atomic_add_64(&ifp->if_tc.ifi_obkbytes, m->m_pkthdr.len); - break; - case MBUF_TC_VI: - atomic_add_64(&ifp->if_tc.ifi_ovipackets, 1); - atomic_add_64(&ifp->if_tc.ifi_ovibytes, m->m_pkthdr.len); - break; - case MBUF_TC_VO: - atomic_add_64(&ifp->if_tc.ifi_ovopackets, 1); - atomic_add_64(&ifp->if_tc.ifi_ovobytes, m->m_pkthdr.len); - break; - default: - break; + switch (m_get_traffic_class(m)) { + case MBUF_TC_BE: + ifp->if_tc.ifi_ibepackets++; + ifp->if_tc.ifi_ibebytes += m->m_pkthdr.len; + break; + case MBUF_TC_BK: + ifp->if_tc.ifi_ibkpackets++; + ifp->if_tc.ifi_ibkbytes += m->m_pkthdr.len; + break; + case MBUF_TC_VI: + ifp->if_tc.ifi_ivipackets++; + ifp->if_tc.ifi_ivibytes += m->m_pkthdr.len; + break; + case MBUF_TC_VO: + ifp->if_tc.ifi_ivopackets++; + ifp->if_tc.ifi_ivobytes += m->m_pkthdr.len; + break; + default: + break; + } + + if (mbuf_is_traffic_class_privileged(m)) { + ifp->if_tc.ifi_ipvpackets++; + ifp->if_tc.ifi_ipvbytes += m->m_pkthdr.len; + } +} + +/* + * This is called from DLIL output, hence multiple threads could end + * up modifying the statistics. We trade off accuracy for performance + * by not using atomic operations here. 
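[Sketch] The per-class counters above only move when packets actually carry a service class, which senders set on the mbuf before handing it to the stack. A hedged sketch using the public mbuf KPI; the raw transmit call is illustrative:

/* Tag an outbound packet as background class, then send it raw */
if (mbuf_set_traffic_class(m, MBUF_TC_BK) == 0)
    (void) ifnet_output_raw(ifp, PF_INET, m);
else
    m_freem(m);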
+ */ +static __inline void +ifp_inc_traffic_class_out(struct ifnet *ifp, struct mbuf *m) +{ + if (!(m->m_flags & M_PKTHDR)) + return; + + switch (m_get_traffic_class(m)) { + case MBUF_TC_BE: + ifp->if_tc.ifi_obepackets++; + ifp->if_tc.ifi_obebytes += m->m_pkthdr.len; + break; + case MBUF_TC_BK: + ifp->if_tc.ifi_obkpackets++; + ifp->if_tc.ifi_obkbytes += m->m_pkthdr.len; + break; + case MBUF_TC_VI: + ifp->if_tc.ifi_ovipackets++; + ifp->if_tc.ifi_ovibytes += m->m_pkthdr.len; + break; + case MBUF_TC_VO: + ifp->if_tc.ifi_ovopackets++; + ifp->if_tc.ifi_ovobytes += m->m_pkthdr.len; + break; + default: + break; + } + + if (mbuf_is_traffic_class_privileged(m)) { + ifp->if_tc.ifi_opvpackets++; + ifp->if_tc.ifi_opvbytes += m->m_pkthdr.len; } } @@ -1796,10 +3209,14 @@ if_inc_traffic_class_out(ifnet_t ifp, mbuf_t m) * an interface lock if we're going to take both. This makes sense * because a protocol is likely to interact with an ifp while it * is under the protocol lock. + * + * An advisory code will be returned if adv is not null. This + * can be used to provide feedback about interface queues to the + * application. */ errno_t dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, - void *route, const struct sockaddr *dest, int raw) + void *route, const struct sockaddr *dest, int raw, struct flowadv *adv) { char *frame_type = NULL; char *dst_linkaddr = NULL; @@ -1811,6 +3228,9 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, mbuf_t send_head = NULL; mbuf_t *send_tail = &send_head; int iorefcnt = 0; +#if CONFIG_EMBEDDED + u_int32_t pre = 0, post = 0; +#endif /* CONFIG_EMBEDDED */ KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START,0,0,0,0,0); @@ -1877,13 +3297,13 @@ preout_again: do { #if CONFIG_DTRACE - if (proto_family == PF_INET) { + if (!raw && proto_family == PF_INET) { struct ip *ip = mtod(m, struct ip*); DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, struct ip *, ip, struct ifnet *, ifp, struct ip *, ip, struct ip6_hdr *, NULL); - } else if (proto_family == PF_INET6) { + } else if (!raw && proto_family == PF_INET6) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr*); DTRACE_IP6(send, struct mbuf*, m, struct inpcb *, NULL, struct ip6_hdr *, ip6, struct ifnet*, ifp, @@ -1909,7 +3329,12 @@ preout_again: } retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr, - frame_type); + frame_type +#if CONFIG_EMBEDDED + , + &pre, &post +#endif /* CONFIG_EMBEDDED */ + ); if (retval) { if (retval != EJUSTRETURN) m_freem(m); @@ -1987,10 +3412,22 @@ preout_again: *send_tail = m; send_tail = &m->m_nextpkt; } else { - if_inc_traffic_class_out(ifp, m); +#if CONFIG_EMBEDDED + iptap_ipf_output(ifp, proto_family, (struct mbuf *)m, + pre, post); +#endif /* CONFIG_EMBEDDED */ + ifp_inc_traffic_class_out(ifp, m); KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0); - retval = ifp->if_output(ifp, m); + retval = (*ifp->if_output)(ifp, m); + if (retval == EQFULL || retval == EQSUSPENDED) { + if (adv != NULL && adv->code == FADV_SUCCESS) { + adv->code = (retval == EQFULL ? 
+ FADV_FLOW_CONTROLLED : + FADV_SUSPENDED); + } + retval = 0; + } if (retval && dlil_verbose) { printf("%s: output error on %s%d retval = %d\n", __func__, ifp->if_name, ifp->if_unit, @@ -2010,10 +3447,21 @@ next: } while (m); if (send_head) { - if_inc_traffic_class_out(ifp, send_head); +#if CONFIG_EMBEDDED + iptap_ipf_output(ifp, proto_family, (struct mbuf *)send_head, + pre, post); +#endif /* CONFIG_EMBEDDED */ + ifp_inc_traffic_class_out(ifp, send_head); KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0); - retval = ifp->if_output(ifp, send_head); + retval = (*ifp->if_output)(ifp, send_head); + if (retval == EQFULL || retval == EQSUSPENDED) { + if (adv != NULL) { + adv->code = (retval == EQFULL ? + FADV_FLOW_CONTROLLED : FADV_SUSPENDED); + } + retval = 0; + } if (retval && dlil_verbose) { printf("%s: output error on %s%d retval = %d\n", __func__, ifp->if_name, ifp->if_unit, retval); @@ -2183,7 +3631,7 @@ dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr, proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi); if (resolvep != NULL) result = resolvep(ifp, proto_addr, - (struct sockaddr_dl*)ll_addr, ll_len); + (struct sockaddr_dl*)(void *)ll_addr, ll_len); if_proto_free(proto); } @@ -2229,6 +3677,31 @@ dlil_send_arp_internal(ifnet_t ifp, u_short arpop, return (result); } +__private_extern__ errno_t +net_thread_check_lock(u_int32_t flag) +{ + struct uthread *uth = get_bsdthread_info(current_thread()); + return ((uth->uu_network_lock_held & flag) == flag); +} + +__private_extern__ void +net_thread_set_lock(u_int32_t flag) +{ + struct uthread *uth = get_bsdthread_info(current_thread()); + + VERIFY((uth->uu_network_lock_held & flag) != flag); + uth->uu_network_lock_held |= flag; +} + +__private_extern__ void +net_thread_unset_lock(u_int32_t flag) +{ + struct uthread *uth = get_bsdthread_info(current_thread()); + + VERIFY((uth->uu_network_lock_held & flag) == flag); + uth->uu_network_lock_held &= (~flag); +} + static __inline__ int _is_announcement(const struct sockaddr_in * sender_sin, const struct sockaddr_in * target_sin) @@ -2242,24 +3715,37 @@ _is_announcement(const struct sockaddr_in * sender_sin, __private_extern__ errno_t dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl* sender_hw, const struct sockaddr* sender_proto, const struct sockaddr_dl* target_hw, - const struct sockaddr* target_proto) + const struct sockaddr* target_proto0, u_int32_t rtflags) { errno_t result = 0; const struct sockaddr_in * sender_sin; const struct sockaddr_in * target_sin; + struct sockaddr_inarp target_proto_sinarp; + struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0; if (target_proto == NULL || (sender_proto != NULL && sender_proto->sa_family != target_proto->sa_family)) return (EINVAL); + /* + * If the target is a (default) router, provide that + * information to the send_arp callback routine. + */ + if (rtflags & RTF_ROUTER) { + bcopy(target_proto, &target_proto_sinarp, + sizeof (struct sockaddr_in)); + target_proto_sinarp.sin_other |= SIN_ROUTER; + target_proto = (struct sockaddr *)&target_proto_sinarp; + } + /* * If this is an ARP request and the target IP is IPv4LL, * send the request on all interfaces. The exception is * an announcement, which must only appear on the specific * interface. 
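[Sketch] With the new struct flowadv argument, a caller of dlil_output() can distinguish "packet accepted, but please back off" from a hard failure: EQFULL/EQSUSPENDED are absorbed above and surface only as an advisory code. The caller side, sketched with the transport reaction elided:

struct flowadv adv;
errno_t err;

adv.code = FADV_SUCCESS;
err = dlil_output(ifp, PF_INET, m, NULL, dest, 0, &adv);
if (err == 0 && adv.code != FADV_SUCCESS) {
    /*
     * FADV_FLOW_CONTROLLED or FADV_SUSPENDED: the packet was
     * queued, but the flow should pause until feedback arrives
     * (e.g. via the SFB flow-control wakeup seen elsewhere here).
     */
}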
*/ - sender_sin = (const struct sockaddr_in *)sender_proto; - target_sin = (const struct sockaddr_in *)target_proto; + sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto; + target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto; if (target_proto->sa_family == AF_INET && IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) && ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST && @@ -2298,7 +3784,7 @@ dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl* sender_hw, /* Copy the source IP address */ source_ip_copy = *(struct sockaddr_in *) - source_ip->ifa_addr; + (void *)source_ip->ifa_addr; IFA_UNLOCK(source_ip); break; } @@ -2316,8 +3802,8 @@ dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl* sender_hw, /* Send the ARP */ new_result = dlil_send_arp_internal(cur_ifp, - arpop, - (struct sockaddr_dl *)source_hw->ifa_addr, + arpop, (struct sockaddr_dl *)(void *) + source_hw->ifa_addr, (struct sockaddr *)&source_ip_copy, NULL, target_proto); @@ -2385,10 +3871,6 @@ ifnet_decr_iorefcnt(struct ifnet *ifp) */ if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING) != 0) { - /* Convert the spinlock to a regular mutex if we have - * to wait for any reason while doing a wakeup. - */ - lck_mtx_convert_spin(&ifp->if_ref_lock); wakeup(&(ifp->if_refio)); } lck_mtx_unlock(&ifp->if_ref_lock); @@ -2767,6 +4249,9 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) struct ifaddr *ifa; struct if_data_internal if_data_saved; struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp; + struct dlil_threading_info *dl_inp; + u_int32_t sflags = 0; + int err; if (ifp == NULL) return (EINVAL); @@ -2790,7 +4275,7 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) lck_mtx_lock_spin(&ifp->if_ref_lock); if (ifp->if_refflags & IFRF_ATTACHED) { - panic("%s: flags mismatch (attached set) ifp=%p", + panic_plain("%s: flags mismatch (attached set) ifp=%p", __func__, ifp); /* NOTREACHED */ } @@ -2856,9 +4341,6 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) VERIFY(TAILQ_EMPTY(&ifp->if_addrhead)); TAILQ_INIT(&ifp->if_addrhead); - if (ifp->if_snd.ifq_maxlen == 0) - ifp->if_snd.ifq_maxlen = ifqmaxlen; - if (ifp->if_index == 0) { int idx = if_next_index(); @@ -2905,52 +4387,125 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) /* Hold a reference to the underlying dlil_ifnet */ ifnet_reference(ifp); + /* Clear stats (save and restore other fields that we care) */ + if_data_saved = ifp->if_data; + bzero(&ifp->if_data, sizeof (ifp->if_data)); + ifp->if_data.ifi_type = if_data_saved.ifi_type; + ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen; + ifp->if_data.ifi_physical = if_data_saved.ifi_physical; + ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen; + ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen; + ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu; + ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate; + ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist; + ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu; + ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu; + ifnet_touch_lastchange(ifp); + + VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL || + ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED); + + /* By default, use SFB and enable flow advisory */ + sflags = PKTSCHEDF_QALG_SFB; + if (if_flowadv) + sflags |= PKTSCHEDF_QALG_FLOWCTL; + + /* Initialize transmit queue(s) */ + err = ifclassq_setup(ifp, sflags, (dl_if->dl_if_flags & DLIF_REUSE)); + if (err != 0) { + 
panic_plain("%s: ifp=%p couldn't initialize transmit queue; " + "err=%d", __func__, ifp, err); + /* NOTREACHED */ + } + + /* Sanity checks on the input thread storage */ + dl_inp = &dl_if->dl_if_inpstorage; + bzero(&dl_inp->stats, sizeof (dl_inp->stats)); + VERIFY(dl_inp->input_waiting == 0); + VERIFY(dl_inp->wtot == 0); + VERIFY(dl_inp->ifp == NULL); + VERIFY(qhead(&dl_inp->rcvq_pkts) == NULL && qempty(&dl_inp->rcvq_pkts)); + VERIFY(qlimit(&dl_inp->rcvq_pkts) == 0); + VERIFY(!dl_inp->net_affinity); + VERIFY(ifp->if_inp == NULL); + VERIFY(dl_inp->input_thr == THREAD_NULL); + VERIFY(dl_inp->wloop_thr == THREAD_NULL); + VERIFY(dl_inp->poll_thr == THREAD_NULL); + VERIFY(dl_inp->tag == 0); + VERIFY(dl_inp->mode == IFNET_MODEL_INPUT_POLL_OFF); + bzero(&dl_inp->tstats, sizeof (dl_inp->tstats)); + bzero(&dl_inp->pstats, sizeof (dl_inp->pstats)); + bzero(&dl_inp->sstats, sizeof (dl_inp->sstats)); +#if IFNET_INPUT_SANITY_CHK + VERIFY(dl_inp->input_mbuf_cnt == 0); +#endif /* IFNET_INPUT_SANITY_CHK */ + + /* + * A specific DLIL input thread is created per Ethernet/cellular + * interface or for an interface which supports opportunistic + * input polling. Pseudo interfaces or other types of interfaces + * use the main input thread instead. + */ + if ((net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) || + ifp->if_type == IFT_ETHER || ifp->if_type == IFT_CELLULAR) { + ifp->if_inp = dl_inp; + err = dlil_create_input_thread(ifp, ifp->if_inp); + if (err != 0) { + panic_plain("%s: ifp=%p couldn't get an input thread; " + "err=%d", __func__, ifp, err); + /* NOTREACHED */ + } + } + /* - * A specific dlil input thread is created per Ethernet/cellular - * interface. pseudo interfaces or other types of interfaces use - * the main ("loopback") thread. - * - * If the sysctl "net.link.generic.system.multi_threaded_input" is set - * to zero, all packets will be handled by the main loopback thread, - * reverting to 10.4.x behaviour. + * If the driver supports the new transmit model, create a workloop + * starter thread to invoke the if_start callback where the packets + * may be dequeued and transmitted. */ - if (dlil_multithreaded_input && - (ifp->if_type == IFT_ETHER || ifp->if_type == IFT_CELLULAR)) { - int err; - - ifp->if_input_thread = zalloc(dlif_inp_zone); - if (ifp->if_input_thread == NULL) { - panic("%s: ifp=%p couldn't alloc threading", - __func__, ifp); + if (ifp->if_eflags & IFEF_TXSTART) { + VERIFY(ifp->if_start != NULL); + VERIFY(ifp->if_start_thread == THREAD_NULL); + + ifnet_set_start_cycle(ifp, NULL); + ifp->if_start_active = 0; + ifp->if_start_req = 0; + if ((err = kernel_thread_start(ifnet_start_thread_fn, ifp, + &ifp->if_start_thread)) != KERN_SUCCESS) { + panic_plain("%s: ifp=%p couldn't get a start thread; " + "err=%d", __func__, ifp, err); /* NOTREACHED */ } - bzero(ifp->if_input_thread, dlif_inp_size); - err = dlil_create_input_thread(ifp, ifp->if_input_thread); - if (err != 0) { - panic("%s: ifp=%p couldn't get a thread. " + ml_thread_policy(ifp->if_start_thread, MACHINE_GROUP, + (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_WORKLOOP)); + } + + /* + * If the driver supports the new receive model, create a poller + * thread to invoke if_input_poll callback where the packets may + * be dequeued from the driver and processed for reception. 
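[Sketch] A driver lands in these attach-time branches by declaring the new model up front when it allocates its ifnet. A sketch of the opt-in, assuming the extended allocation KPI that accompanies this change; treat the field and constant names (ifnet_init_eparams, IFNET_INIT_INPUT_POLL, ifnet_allocate_extended) as indicative rather than authoritative, and mydrv_* as hypothetical:

struct ifnet_init_eparams ep;
ifnet_t ifp;
errno_t err;

bzero(&ep, sizeof (ep));
ep.ver = IFNET_INIT_CURRENT_VERSION;
ep.len = sizeof (ep);
ep.flags = IFNET_INIT_INPUT_POLL;       /* requests IFEF_RXPOLL */
ep.start = mydrv_start;                 /* non-NULL => IFEF_TXSTART */
ep.input_poll = mydrv_input_poll;       /* stack dequeues RX from driver */
ep.input_ctl = mydrv_input_ctl;         /* poll on/off transitions */
/* ... plus the usual name/unit/family/demux initialization ... */
err = ifnet_allocate_extended(&ep, &ifp);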
+ */ + if (ifp->if_eflags & IFEF_RXPOLL) { + VERIFY(ifp->if_input_poll != NULL); + VERIFY(ifp->if_input_ctl != NULL); + VERIFY(ifp->if_poll_thread == THREAD_NULL); + + ifnet_set_poll_cycle(ifp, NULL); + ifp->if_poll_update = 0; + ifp->if_poll_active = 0; + ifp->if_poll_req = 0; + if ((err = kernel_thread_start(ifnet_poll_thread_fn, ifp, + &ifp->if_poll_thread)) != KERN_SUCCESS) { + panic_plain("%s: ifp=%p couldn't get a poll thread; " "err=%d", __func__, ifp, err); /* NOTREACHED */ } -#ifdef DLIL_DEBUG - printf("%s: dlil thread for ifp=%p if_index=%d\n", - __func__, ifp, ifp->if_index); -#endif + ml_thread_policy(ifp->if_poll_thread, MACHINE_GROUP, + (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_WORKLOOP)); } - /* Clear stats (save and restore other fields that we care) */ - if_data_saved = ifp->if_data; - bzero(&ifp->if_data, sizeof (ifp->if_data)); - ifp->if_data.ifi_type = if_data_saved.ifi_type; - ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen; - ifp->if_data.ifi_physical = if_data_saved.ifi_physical; - ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen; - ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen; - ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu; - ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate; - ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist; - ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu; - ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu; - ifnet_touch_lastchange(ifp); + VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE); + VERIFY(ifp->if_desc.ifd_len == 0); + VERIFY(ifp->if_desc.ifd_desc != NULL); /* Record attach PC stacktrace */ ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach); @@ -3024,6 +4579,9 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) */ lck_mtx_lock(rnh_lock); ifnet_lock_exclusive(ifp); + /* Initialize Link Quality Metric (loopback [lo0] is always good) */ + ifp->if_lqm = (ifp == lo_ifp) ? 
IFNET_LQM_THRESH_GOOD : + IFNET_LQM_THRESH_UNKNOWN; lck_mtx_lock_spin(&ifp->if_ref_lock); ifp->if_refflags = IFRF_ATTACHED; lck_mtx_unlock(&ifp->if_ref_lock); @@ -3113,7 +4671,8 @@ dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr) /* address and mask sockaddr_dl locations */ asdl = (struct sockaddr_dl *)(ifa + 1); bzero(asdl, SOCK_MAXADDRLEN); - msdl = (struct sockaddr_dl *)((char *)asdl + SOCK_MAXADDRLEN); + msdl = (struct sockaddr_dl *)(void *) + ((char *)asdl + SOCK_MAXADDRLEN); bzero(msdl, SOCK_MAXADDRLEN); } else { VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa); @@ -3129,9 +4688,9 @@ dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr) } IFA_LOCK(ifa); /* address and mask sockaddr_dl locations */ - asdl = (struct sockaddr_dl *)&dl_if->dl_if_lladdr.asdl; + asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl; bzero(asdl, sizeof (dl_if->dl_if_lladdr.asdl)); - msdl = (struct sockaddr_dl *)&dl_if->dl_if_lladdr.msdl; + msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl; bzero(msdl, sizeof (dl_if->dl_if_lladdr.msdl)); } @@ -3188,8 +4747,8 @@ ifnet_detach(ifnet_t ifp) if (ifp == NULL) return (EINVAL); - ifnet_head_lock_exclusive(); lck_mtx_lock(rnh_lock); + ifnet_head_lock_exclusive(); ifnet_lock_exclusive(ifp); /* @@ -3240,12 +4799,27 @@ ifnet_detach(ifnet_t ifp) ifnet_head_done(); lck_mtx_unlock(rnh_lock); + /* Reset Link Quality Metric (unless loopback [lo0]) */ + if (ifp != lo_ifp) + if_lqm_update(ifp, IFNET_LQM_THRESH_OFF); + + /* Reset TCP local statistics */ + if (ifp->if_tcp_stat != NULL) + bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat)); + + /* Reset UDP local statistics */ + if (ifp->if_udp_stat != NULL) + bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat)); + /* Let BPF know we're detaching */ bpfdetach(ifp); /* Mark the interface as DOWN */ if_down(ifp); + /* Drain send queue */ + ifclassq_teardown(ifp); + /* Disable forwarding cached route */ lck_mtx_lock(&ifp->if_cached_route_lock); ifp->if_fwd_cacheok = 0; @@ -3303,26 +4877,48 @@ ifnet_detaching_dequeue(void) return (ifp); } -static void -ifnet_delayed_thread_func(void) +static int +ifnet_detacher_thread_cont(int err) { +#pragma unused(err) struct ifnet *ifp; for (;;) { - dlil_if_lock(); + dlil_if_lock_assert(); while (ifnet_detaching_cnt == 0) { - (void) msleep(&ifnet_delayed_run, &dlil_ifnet_lock, - (PZERO - 1), "ifnet_delayed_thread", NULL); + (void) msleep0(&ifnet_delayed_run, &dlil_ifnet_lock, + (PZERO - 1), "ifnet_detacher_cont", 0, + ifnet_detacher_thread_cont); + /* NOTREACHED */ } VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL); /* Take care of detaching ifnet */ ifp = ifnet_detaching_dequeue(); - dlil_if_unlock(); - if (ifp != NULL) + if (ifp != NULL) { + dlil_if_unlock(); ifnet_detach_final(ifp); + dlil_if_lock(); + } } + /* NOTREACHED */ + return (0); +} + +static void +ifnet_detacher_thread_func(void *v, wait_result_t w) +{ +#pragma unused(v, w) + dlil_if_lock(); + (void) msleep0(&ifnet_delayed_run, &dlil_ifnet_lock, + (PZERO - 1), "ifnet_detacher", 0, ifnet_detacher_thread_cont); + /* + * msleep0() shouldn't have returned as PCATCH was not set; + * therefore assert in this case. 
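[Sketch] The detacher thread above is written in continuation style: msleep0() does not return to its caller on wakeup, it re-enters the supplied continuation from the top, so no kernel stack is held while asleep; that is why a real return trips VERIFY(0). The skeleton of the pattern, with lock, event, and work functions as placeholders:

static int
worker_cont(int err)
{
#pragma unused(err)
    for (;;) {
        /* the mutex is held each time the continuation runs */
        while (!work_pending()) {
            (void) msleep0(&work_event, &work_mtx,
                (PZERO - 1), "worker_cont", 0, worker_cont);
            /* NOTREACHED: a wakeup restarts worker_cont() */
        }
        process_one();      /* may drop and retake work_mtx */
    }
    /* NOTREACHED */
    return (0);
}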
+ */ + dlil_if_unlock(); + VERIFY(0); } static void @@ -3330,7 +4926,7 @@ ifnet_detach_final(struct ifnet *ifp) { struct ifnet_filter *filter, *filter_next; struct ifnet_filter_head fhead; - struct dlil_threading_info *inputthread; + struct dlil_threading_info *inp; struct ifaddr *ifa; ifnet_detached_func if_free; int i; @@ -3342,8 +4938,10 @@ ifnet_detach_final(struct ifnet *ifp) /* NOTREACHED */ } - /* Wait until the existing IO references get released - * before we proceed with ifnet_detach + /* + * Wait until the existing IO references get released + * before we proceed with ifnet_detach. This is not a + * common case, so block without using a continuation. */ while (ifp->if_refio > 0) { printf("%s: Waiting for IO references on %s%d interface " @@ -3420,65 +5018,87 @@ ifnet_detach_final(struct ifnet *ifp) /* There should not be any addresses left */ VERIFY(TAILQ_EMPTY(&ifp->if_addrhead)); + /* + * Signal the starter thread to terminate itself. + */ + if (ifp->if_start_thread != THREAD_NULL) { + lck_mtx_lock_spin(&ifp->if_start_lock); + ifp->if_start_thread = THREAD_NULL; + wakeup_one((caddr_t)&ifp->if_start_thread); + lck_mtx_unlock(&ifp->if_start_lock); + } + + /* + * Signal the poller thread to terminate itself. + */ + if (ifp->if_poll_thread != THREAD_NULL) { + lck_mtx_lock_spin(&ifp->if_poll_lock); + ifp->if_poll_thread = THREAD_NULL; + wakeup_one((caddr_t)&ifp->if_poll_thread); + lck_mtx_unlock(&ifp->if_poll_lock); + } + /* * If thread affinity was set for the workloop thread, we will need * to tear down the affinity and release the extra reference count - * taken at attach time; + * taken at attach time. Does not apply to lo0 or other interfaces + * without dedicated input threads. */ - if ((inputthread = ifp->if_input_thread) != NULL) { - if (inputthread->net_affinity) { - struct thread *tp; - - if (inputthread == dlil_lo_thread_ptr) { - panic("%s: Thread affinity should not be " - "enabled on the loopback dlil input " - "thread", __func__); - /* NOTREACHED */ + if ((inp = ifp->if_inp) != NULL) { + VERIFY(inp != dlil_main_input_thread); + + if (inp->net_affinity) { + struct thread *tp, *wtp, *ptp; + + lck_mtx_lock_spin(&inp->input_lck); + wtp = inp->wloop_thr; + inp->wloop_thr = THREAD_NULL; + ptp = inp->poll_thr; + inp->poll_thr = THREAD_NULL; + tp = inp->input_thr; /* don't nullify now */ + inp->tag = 0; + inp->net_affinity = FALSE; + lck_mtx_unlock(&inp->input_lck); + + /* Tear down poll thread affinity */ + if (ptp != NULL) { + VERIFY(ifp->if_eflags & IFEF_RXPOLL); + (void) dlil_affinity_set(ptp, + THREAD_AFFINITY_TAG_NULL); + thread_deallocate(ptp); } - lck_mtx_lock_spin(&inputthread->input_lck); - tp = inputthread->workloop_thread; - inputthread->workloop_thread = NULL; - inputthread->tag = 0; - inputthread->net_affinity = FALSE; - lck_mtx_unlock(&inputthread->input_lck); - /* Tear down workloop thread affinity */ - if (tp != NULL) { - (void) dlil_affinity_set(tp, + if (wtp != NULL) { + (void) dlil_affinity_set(wtp, THREAD_AFFINITY_TAG_NULL); - thread_deallocate(tp); + thread_deallocate(wtp); } - /* Tear down dlil input thread affinity */ - tp = inputthread->input_thread; + /* Tear down DLIL input thread affinity */ (void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL); thread_deallocate(tp); } - /* cleanup ifp dlil input thread, if any */ - ifp->if_input_thread = NULL; - - if (inputthread != dlil_lo_thread_ptr) { -#ifdef DLIL_DEBUG - printf("%s: wakeup thread threadinfo: %p " - "input_thread=%p threads: cur=%d max=%d\n", - __func__, inputthread, 
inputthread->input_thread, - dlil_multithreaded_input, cur_dlil_input_threads); -#endif - lck_mtx_lock_spin(&inputthread->input_lck); - - inputthread->input_waiting |= DLIL_INPUT_TERMINATE; - if (!(inputthread->input_waiting & DLIL_INPUT_RUNNING)) - wakeup((caddr_t)&inputthread->input_waiting); + /* disassociate ifp DLIL input thread */ + ifp->if_inp = NULL; - lck_mtx_unlock(&inputthread->input_lck); + lck_mtx_lock_spin(&inp->input_lck); + inp->input_waiting |= DLIL_INPUT_TERMINATE; + if (!(inp->input_waiting & DLIL_INPUT_RUNNING)) { + wakeup_one((caddr_t)&inp->input_waiting); } + lck_mtx_unlock(&inp->input_lck); } /* The driver might unload, so point these to ourselves */ if_free = ifp->if_free; ifp->if_output = ifp_if_output; + ifp->if_pre_enqueue = ifp_if_output; + ifp->if_start = ifp_if_start; + ifp->if_output_ctl = ifp_if_ctl; + ifp->if_input_poll = ifp_if_input_poll; + ifp->if_input_ctl = ifp_if_ctl; ifp->if_ioctl = ifp_if_ioctl; ifp->if_set_bpf_tap = ifp_if_set_bpf_tap; ifp->if_free = ifp_if_free; @@ -3489,6 +5109,12 @@ ifnet_detach_final(struct ifnet *ifp) ifp->if_del_proto = ifp_if_del_proto; ifp->if_check_multi = ifp_if_check_multi; + /* wipe out interface description */ + VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE); + ifp->if_desc.ifd_len = 0; + VERIFY(ifp->if_desc.ifd_desc != NULL); + bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE); + ifnet_lock_done(ifp); #if PF @@ -3505,6 +5131,9 @@ ifnet_detach_final(struct ifnet *ifp) VERIFY(ifp->if_flt_waiters == 0); lck_mtx_unlock(&ifp->if_flt_lock); + /* Last chance to drain send queue */ + if_qflush(ifp, 0); + /* Last chance to cleanup any cached route */ lck_mtx_lock(&ifp->if_cached_route_lock); VERIFY(!ifp->if_fwd_cacheok); @@ -3553,6 +5182,34 @@ ifp_if_output(struct ifnet *ifp, struct mbuf *m) return (0); } +static void +ifp_if_start(struct ifnet *ifp) +{ + ifnet_purge(ifp); +} + +static void +ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt, + struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len) +{ +#pragma unused(ifp, flags, max_cnt) + if (m_head != NULL) + *m_head = NULL; + if (m_tail != NULL) + *m_tail = NULL; + if (cnt != NULL) + *cnt = 0; + if (len != NULL) + *len = 0; +} + +static errno_t +ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg) +{ +#pragma unused(ifp, cmd, arglen, arg) + return (EOPNOTSUPP); +} + static errno_t ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf) { @@ -3583,17 +5240,25 @@ ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa) return (EOPNOTSUPP); } -static errno_t -ifp_if_framer(struct ifnet *ifp, struct mbuf **m, - const struct sockaddr *sa, const char *ll, const char *t) +static errno_t ifp_if_framer(struct ifnet *ifp, struct mbuf **m, +const struct sockaddr *sa, const char *ll, const char *t +#if CONFIG_EMBEDDED + , + u_int32_t *pre, u_int32_t *post +#endif /* CONFIG_EMBEDDED */ + ) { #pragma unused(ifp, m, sa, ll, t) m_freem(*m); *m = NULL; +#if CONFIG_EMBEDDED + *pre = 0; + *post = 0; +#endif /* CONFIG_EMBEDDED */ return (EJUSTRETURN); } -static errno_t +errno_t ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg) { #pragma unused(ifp, cmd, arg) @@ -3697,20 +5362,43 @@ int dlil_if_acquire(u_int32_t family, const void *uniqueid, dlifp1->dl_if_trace = dlil_if_trace; } ifp1->if_name = dlifp1->dl_if_namestorage; + + /* initialize interface description */ + ifp1->if_desc.ifd_maxlen = IF_DESCSIZE; + ifp1->if_desc.ifd_len = 0; + ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage; + 
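[Sketch] Repointing if_output, if_start, if_ioctl and friends at the ifp_if_* stubs during detach (seen above) is a defensive idiom: a racing caller that still holds an I/O reference lands in a harmless no-op instead of jumping into driver text that may already be unloading. In miniature, with hypothetical names:

static errno_t
stub_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
{
#pragma unused(ifp, cmd, arg)
    return (EOPNOTSUPP);    /* safe answer for late callers */
}

static void
dev_detach_stubs(struct ifnet *ifp)
{
    ifp->if_ioctl = stub_ioctl;
    /* ... likewise for if_output, if_start, if_input_poll ... */
}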
#if CONFIG_MACF_NET mac_ifnet_label_init(ifp1); #endif + if ((ret = dlil_alloc_local_stats(ifp1)) != 0) { + DLIL_PRINTF("%s: failed to allocate if local stats, " + "error: %d\n", __func__, ret); + /* This probably shouldn't be fatal */ + ret = 0; + } + lck_mtx_init(&dlifp1->dl_if_lock, ifnet_lock_group, ifnet_lock_attr); lck_rw_init(&ifp1->if_lock, ifnet_lock_group, ifnet_lock_attr); lck_mtx_init(&ifp1->if_ref_lock, ifnet_lock_group, ifnet_lock_attr); lck_mtx_init(&ifp1->if_flt_lock, ifnet_lock_group, ifnet_lock_attr); - lck_mtx_init(&ifp1->if_cached_route_lock, ifnet_lock_group, - ifnet_lock_attr); lck_mtx_init(&ifp1->if_addrconfig_lock, ifnet_lock_group, ifnet_lock_attr); lck_rw_init(&ifp1->if_llreach_lock, ifnet_lock_group, ifnet_lock_attr); + /* for send data paths */ + lck_mtx_init(&ifp1->if_start_lock, ifnet_snd_lock_group, + ifnet_lock_attr); + lck_mtx_init(&ifp1->if_cached_route_lock, ifnet_snd_lock_group, + ifnet_lock_attr); + lck_mtx_init(&ifp1->if_snd.ifcq_lock, ifnet_snd_lock_group, + ifnet_lock_attr); + + /* for receive data paths */ + lck_mtx_init(&ifp1->if_poll_lock, ifnet_rcv_lock_group, + ifnet_lock_attr); + TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link); *ifp = ifp1; @@ -3846,7 +5534,9 @@ struct rtentry * ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip) { struct route src_rt; - struct sockaddr_in *dst = (struct sockaddr_in *)(&src_rt.ro_dst); + struct sockaddr_in *dst; + + dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst); ifp_src_route_copyout(ifp, &src_rt); @@ -3900,7 +5590,8 @@ ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6) src_rt.ro_dst.sin6_family = AF_INET6; } src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6); - src_rt.ro_dst.sin6_addr = *src_ip6; + bcopy(src_ip6, &src_rt.ro_dst.sin6_addr, + sizeof (src_rt.ro_dst.sin6_addr)); if (src_rt.ro_rt == NULL) { src_rt.ro_rt = rtalloc1_scoped( @@ -3920,3 +5611,394 @@ ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6) return (src_rt.ro_rt); } #endif /* INET6 */ + +void +if_lqm_update(struct ifnet *ifp, int lqm) +{ + struct kev_dl_link_quality_metric_data ev_lqm_data; + + VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX); + + /* Normalize to edge */ + if (lqm > IFNET_LQM_THRESH_UNKNOWN && lqm <= IFNET_LQM_THRESH_POOR) + lqm = IFNET_LQM_THRESH_POOR; + else if (lqm > IFNET_LQM_THRESH_POOR && lqm <= IFNET_LQM_THRESH_GOOD) + lqm = IFNET_LQM_THRESH_GOOD; + + ifnet_lock_exclusive(ifp); + if (lqm == ifp->if_lqm) { + ifnet_lock_done(ifp); + return; /* nothing to update */ + } + ifp->if_lqm = lqm; + ifnet_lock_done(ifp); + + bzero(&ev_lqm_data, sizeof (ev_lqm_data)); + ev_lqm_data.link_quality_metric = lqm; + + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED, + (struct net_event_data *)&ev_lqm_data, sizeof (ev_lqm_data)); +} + +/* for uuid.c */ +int +uuid_get_ethernet(u_int8_t *node) +{ + struct ifnet *ifp; + struct sockaddr_dl *sdl; + + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + ifnet_lock_shared(ifp); + IFA_LOCK_SPIN(ifp->if_lladdr); + sdl = (struct sockaddr_dl *)(void *)ifp->if_lladdr->ifa_addr; + if (sdl->sdl_type == IFT_ETHER) { + memcpy(node, LLADDR(sdl), ETHER_ADDR_LEN); + IFA_UNLOCK(ifp->if_lladdr); + ifnet_lock_done(ifp); + ifnet_head_done(); + return (0); + } + IFA_UNLOCK(ifp->if_lladdr); + ifnet_lock_done(ifp); + } + ifnet_head_done(); + + return (-1); +} + +static int +sysctl_rxpoll SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int i, err; + + i = if_rxpoll; + + err = 
sysctl_handle_int(oidp, &i, 0, req); + if (err != 0 || req->newptr == USER_ADDR_NULL) + return (err); + + if (net_rxpoll == 0) + return (ENXIO); + + if_rxpoll = i; + return (err); +} + +static int +sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int i, err; + + i = if_sndq_maxlen; + + err = sysctl_handle_int(oidp, &i, 0, req); + if (err != 0 || req->newptr == USER_ADDR_NULL) + return (err); + + if (i < IF_SNDQ_MINLEN) + i = IF_SNDQ_MINLEN; + + if_sndq_maxlen = i; + return (err); +} + +static int +sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int i, err; + + i = if_rcvq_maxlen; + + err = sysctl_handle_int(oidp, &i, 0, req); + if (err != 0 || req->newptr == USER_ADDR_NULL) + return (err); + + if (i < IF_RCVQ_MINLEN) + i = IF_RCVQ_MINLEN; + + if_rcvq_maxlen = i; + return (err); +} + +void +ifnet_fclist_append(struct sfb *sp, struct sfb_fc_list *fcl) +{ + struct sfb_bin_fcentry *fce, *tfce; + + lck_mtx_lock_spin(&ifnet_fclist_lock); + + SLIST_FOREACH_SAFE(fce, fcl, fce_link, tfce) { + SLIST_REMOVE(fcl, fce, sfb_bin_fcentry, fce_link); + SLIST_INSERT_HEAD(&ifnet_fclist, fce, fce_link); + sp->sfb_stats.flow_feedback++; + } + VERIFY(SLIST_EMPTY(fcl) && !SLIST_EMPTY(&ifnet_fclist)); + + wakeup(&ifnet_fclist); + + lck_mtx_unlock(&ifnet_fclist_lock); +} + +struct sfb_bin_fcentry * +ifnet_fce_alloc(int how) +{ + struct sfb_bin_fcentry *fce; + + fce = (how == M_WAITOK) ? zalloc(ifnet_fcezone) : + zalloc_noblock(ifnet_fcezone); + if (fce != NULL) + bzero(fce, ifnet_fcezone_size); + + return (fce); +} + +void +ifnet_fce_free(struct sfb_bin_fcentry *fce) +{ + zfree(ifnet_fcezone, fce); +} + +static void +ifnet_fc_init(void) +{ + thread_t thread = THREAD_NULL; + + SLIST_INIT(&ifnet_fclist); + lck_mtx_init(&ifnet_fclist_lock, ifnet_snd_lock_group, NULL); + + ifnet_fcezone_size = P2ROUNDUP(sizeof (struct sfb_bin_fcentry), + sizeof (u_int64_t)); + ifnet_fcezone = zinit(ifnet_fcezone_size, + IFNET_FCEZONE_MAX * ifnet_fcezone_size, 0, IFNET_FCEZONE_NAME); + if (ifnet_fcezone == NULL) { + panic("%s: failed allocating %s", __func__, IFNET_FCEZONE_NAME); + /* NOTREACHED */ + } + zone_change(ifnet_fcezone, Z_EXPAND, TRUE); + zone_change(ifnet_fcezone, Z_CALLERACCT, FALSE); + + if (kernel_thread_start(ifnet_fc_thread_func, + NULL, &thread) != KERN_SUCCESS) { + panic("%s: couldn't create flow event advisory thread", + __func__); + /* NOTREACHED */ + } + thread_deallocate(thread); +} + +static int +ifnet_fc_thread_cont(int err) +{ +#pragma unused(err) + struct sfb_bin_fcentry *fce; + struct inp_fc_entry *infc; + + for (;;) { + lck_mtx_assert(&ifnet_fclist_lock, LCK_MTX_ASSERT_OWNED); + while (SLIST_EMPTY(&ifnet_fclist)) { + (void) msleep0(&ifnet_fclist, &ifnet_fclist_lock, + (PSOCK | PSPIN), "ifnet_fc_cont", 0, + ifnet_fc_thread_cont); + /* NOTREACHED */ + } + + fce = SLIST_FIRST(&ifnet_fclist); + SLIST_REMOVE(&ifnet_fclist, fce, sfb_bin_fcentry, fce_link); + SLIST_NEXT(fce, fce_link) = NULL; + lck_mtx_unlock(&ifnet_fclist_lock); + + infc = inp_fc_getinp(fce->fce_flowhash); + if (infc == NULL) { + ifnet_fce_free(fce); + lck_mtx_lock_spin(&ifnet_fclist_lock); + continue; + } + VERIFY(infc->infc_inp != NULL); + + inp_fc_feedback(infc->infc_inp); + + inp_fc_entry_free(infc); + ifnet_fce_free(fce); + lck_mtx_lock_spin(&ifnet_fclist_lock); + } +} + +static void +ifnet_fc_thread_func(void *v, wait_result_t w) +{ +#pragma unused(v, w) + lck_mtx_lock(&ifnet_fclist_lock); + (void) msleep0(&ifnet_fclist, &ifnet_fclist_lock, + (PSOCK | PSPIN), "ifnet_fc", 0, 
ifnet_fc_thread_cont); + /* + * msleep0() shouldn't have returned as PCATCH was not set; + * therefore assert in this case. + */ + lck_mtx_unlock(&ifnet_fclist_lock); + VERIFY(0); +} + +void +dlil_node_present(struct ifnet *ifp, struct sockaddr *sa, + int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48]) +{ + struct kev_dl_node_presence kev; + struct sockaddr_dl *sdl; + struct sockaddr_in6 *sin6; + + VERIFY(ifp); + VERIFY(sa); + VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6); + + bzero(&kev, sizeof (kev)); + sin6 = &kev.sin6_node_address; + sdl = &kev.sdl_node_address; + nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6); + kev.rssi = rssi; + kev.link_quality_metric = lqm; + kev.node_proximity_metric = npm; + bcopy(srvinfo, kev.node_service_info, sizeof (kev.node_service_info)); + + nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm); + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE, + &kev.link_data, sizeof (kev)); +} + +void +dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa) +{ + struct kev_dl_node_absence kev; + struct sockaddr_in6 *sin6; + struct sockaddr_dl *sdl; + + VERIFY(ifp); + VERIFY(sa); + VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6); + + bzero(&kev, sizeof (kev)); + sin6 = &kev.sin6_node_address; + sdl = &kev.sdl_node_address; + nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6); + + nd6_alt_node_absent(ifp, sin6); + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE, + &kev.link_data, sizeof (kev)); +} + +errno_t +ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr, + struct proc *p) +{ + u_int32_t level = IFNET_THROTTLE_OFF; + errno_t result = 0; + + VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC); + + if (cmd == SIOCSIFOPPORTUNISTIC) { + /* + * XXX: Use priv_check_cred() instead of root check? + */ + if ((result = proc_suser(p)) != 0) + return (result); + + if (ifr->ifr_opportunistic.ifo_flags == + IFRIFOF_BLOCK_OPPORTUNISTIC) + level = IFNET_THROTTLE_OPPORTUNISTIC; + else if (ifr->ifr_opportunistic.ifo_flags == 0) + level = IFNET_THROTTLE_OFF; + else + result = EINVAL; + + if (result == 0) + result = ifnet_set_throttle(ifp, level); + } else if ((result = ifnet_get_throttle(ifp, &level)) == 0) { + ifr->ifr_opportunistic.ifo_flags = 0; + if (level == IFNET_THROTTLE_OPPORTUNISTIC) { + ifr->ifr_opportunistic.ifo_flags |= + IFRIFOF_BLOCK_OPPORTUNISTIC; + } + } + + /* + * Return the count of current opportunistic connections + * over the interface. + */ + if (result == 0) { + uint32_t flags = 0; + flags |= (cmd == SIOCSIFOPPORTUNISTIC) ? + INPCB_OPPORTUNISTIC_SETCMD : 0; + flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ? 
+ INPCB_OPPORTUNISTIC_THROTTLEON : 0; + ifr->ifr_opportunistic.ifo_inuse = + udp_count_opportunistic(ifp->if_index, flags) + + tcp_count_opportunistic(ifp->if_index, flags); + } + + if (result == EALREADY) + result = 0; + + return (result); +} + +int +ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level) +{ + struct ifclassq *ifq; + int err = 0; + + if (!(ifp->if_eflags & IFEF_TXSTART)) + return (ENXIO); + + *level = IFNET_THROTTLE_OFF; + + ifq = &ifp->if_snd; + IFCQ_LOCK(ifq); + /* Throttling works only for IFCQ, not ALTQ instances */ + if (IFCQ_IS_ENABLED(ifq)) + IFCQ_GET_THROTTLE(ifq, *level, err); + IFCQ_UNLOCK(ifq); + + return (err); +} + +int +ifnet_set_throttle(struct ifnet *ifp, u_int32_t level) +{ + struct ifclassq *ifq; + int err = 0; + + if (!(ifp->if_eflags & IFEF_TXSTART)) + return (ENXIO); + + switch (level) { + case IFNET_THROTTLE_OFF: + case IFNET_THROTTLE_OPPORTUNISTIC: +#if PF_ALTQ + /* Throttling works only for IFCQ, not ALTQ instances */ + if (ALTQ_IS_ENABLED(IFCQ_ALTQ(ifq))) + return (ENXIO); +#endif /* PF_ALTQ */ + break; + default: + return (EINVAL); + } + + ifq = &ifp->if_snd; + IFCQ_LOCK(ifq); + if (IFCQ_IS_ENABLED(ifq)) + IFCQ_SET_THROTTLE(ifq, level, err); + IFCQ_UNLOCK(ifq); + + if (err == 0) { + printf("%s%d: throttling level set to %d\n", ifp->if_name, + ifp->if_unit, level); + if (level == IFNET_THROTTLE_OFF) + ifnet_start(ifp); + } + + return (err); +} diff --git a/bsd/net/dlil.h b/bsd/net/dlil.h index db1060db8..98ca8e878 100644 --- a/bsd/net/dlil.h +++ b/bsd/net/dlil.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2010 Apple Inc. All rights reserved. + * Copyright (c) 1999-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -39,10 +39,6 @@ enum { BPF_TAP_INPUT_OUTPUT }; -/* Ethernet specific types */ -#define DLIL_DESC_ETYPE2 4 -#define DLIL_DESC_SAP 5 -#define DLIL_DESC_SNAP 6 /* * DLIL_DESC_ETYPE2 - native_type must point to 2 byte ethernet raw protocol, * variants.native_type_length must be set to 2 @@ -58,46 +54,138 @@ enum { * The length of the protocol data specified at native_type must be set in * variants.native_type_length. */ +/* Ethernet specific types */ +#define DLIL_DESC_ETYPE2 4 +#define DLIL_DESC_SAP 5 +#define DLIL_DESC_SNAP 6 #ifdef KERNEL_PRIVATE - #include #include +#include +#include #include #include #include -#if __STDC__ +#ifdef BSD_KERNEL_PRIVATE +/* Operations on timespecs. */ +#define net_timerclear(tvp) (tvp)->tv_sec = (tvp)->tv_nsec = 0 + +#define net_timerisset(tvp) ((tvp)->tv_sec || (tvp)->tv_nsec) + +#define net_timercmp(tvp, uvp, cmp) \ + (((tvp)->tv_sec == (uvp)->tv_sec) ? 
\ + ((tvp)->tv_nsec cmp (uvp)->tv_nsec) : \ + ((tvp)->tv_sec cmp (uvp)->tv_sec)) + +#define net_timeradd(tvp, uvp, vvp) do { \ + (vvp)->tv_sec = (tvp)->tv_sec + (uvp)->tv_sec; \ + (vvp)->tv_nsec = (tvp)->tv_nsec + (uvp)->tv_nsec; \ + if ((vvp)->tv_nsec >= NSEC_PER_SEC) { \ + (vvp)->tv_sec++; \ + (vvp)->tv_nsec -= NSEC_PER_SEC; \ + } \ +} while (0) + +#define net_timersub(tvp, uvp, vvp) do { \ + (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ + (vvp)->tv_nsec = (tvp)->tv_nsec - (uvp)->tv_nsec; \ + if ((vvp)->tv_nsec < 0) { \ + (vvp)->tv_sec--; \ + (vvp)->tv_nsec += NSEC_PER_SEC; \ + } \ +} while (0) + +#define net_timernsec(tvp, nsp) do { \ + *(nsp) = (tvp)->tv_nsec; \ + if ((tvp)->tv_sec > 0) \ + *(nsp) += ((tvp)->tv_sec * NSEC_PER_SEC); \ +} while (0) + +#define net_nsectimer(nsp, tvp) do { \ + u_int64_t __nsp = *(nsp); \ + net_timerclear(tvp); \ + while ((__nsp) >= NSEC_PER_SEC) { \ + (tvp)->tv_sec++; \ + (__nsp) -= NSEC_PER_SEC; \ + } \ + (tvp)->tv_nsec = (__nsp); \ +} while (0) struct ifnet; struct mbuf; struct ether_header; struct sockaddr_dl; - -#endif - struct iff_filter; #define DLIL_THREADNAME_LEN 32 +/* + * DLIL input thread info + */ struct dlil_threading_info { decl_lck_mtx_data(, input_lck); lck_grp_t *lck_grp; /* lock group (for lock stats) */ - mbuf_t mbuf_head; /* start of mbuf list from if */ - mbuf_t mbuf_tail; - u_int32_t mbuf_count; - boolean_t net_affinity; /* affinity set is available */ u_int32_t input_waiting; /* DLIL condition of thread */ - struct thread *input_thread; /* thread data for this input */ - struct thread *workloop_thread; /* current workloop thread */ - u_int32_t tag; /* current affinity tag */ - char input_name[DLIL_THREADNAME_LEN]; + u_int32_t wtot; /* # of wakeup requests */ + char input_name[DLIL_THREADNAME_LEN]; /* name storage */ + struct ifnet *ifp; /* pointer to interface */ + class_queue_t rcvq_pkts; /* queue of pkts */ + struct ifnet_stat_increment_param stats; /* incremental statistics */ + /* + * Thread affinity (workloop and DLIL threads). + */ + boolean_t net_affinity; /* affinity set is available */ + struct thread *input_thr; /* input thread */ + struct thread *wloop_thr; /* workloop thread */ + struct thread *poll_thr; /* poll thread */ + u_int32_t tag; /* affinity tag */ + /* + * Opportunistic polling. 
+ */ + ifnet_model_t mode; /* current mode */ + struct pktcntr tstats; /* incremental polling statistics */ + struct if_rxpoll_stats pstats; /* polling statistics */ +#define rxpoll_offreq pstats.ifi_poll_off_req +#define rxpoll_offerr pstats.ifi_poll_off_err +#define rxpoll_onreq pstats.ifi_poll_on_req +#define rxpoll_onerr pstats.ifi_poll_on_err +#define rxpoll_wavg pstats.ifi_poll_wakeups_avg +#define rxpoll_wlowat pstats.ifi_poll_wakeups_lowat +#define rxpoll_whiwat pstats.ifi_poll_wakeups_hiwat +#define rxpoll_pavg pstats.ifi_poll_packets_avg +#define rxpoll_pmin pstats.ifi_poll_packets_min +#define rxpoll_pmax pstats.ifi_poll_packets_max +#define rxpoll_plowat pstats.ifi_poll_packets_lowat +#define rxpoll_phiwat pstats.ifi_poll_packets_hiwat +#define rxpoll_bavg pstats.ifi_poll_bytes_avg +#define rxpoll_bmin pstats.ifi_poll_bytes_min +#define rxpoll_bmax pstats.ifi_poll_bytes_max +#define rxpoll_blowat pstats.ifi_poll_bytes_lowat +#define rxpoll_bhiwat pstats.ifi_poll_bytes_hiwat + struct pktcntr sstats; /* packets and bytes per sampling */ + struct timespec mode_holdtime; /* mode holdtime in nsec */ + struct timespec mode_lasttime; /* last mode change time in nsec */ + struct timespec sample_holdtime; /* sampling holdtime in nsec */ + struct timespec sample_lasttime; /* last sampling time in nsec */ + struct timespec dbg_lasttime; /* last debug message time in nsec */ #if IFNET_INPUT_SANITY_CHK - u_int32_t input_wake_cnt; /* number of times the thread was awaken with packets to process */ - u_long input_mbuf_cnt; /* total number of mbuf packets processed by this thread */ + /* + * For debugging. + */ + u_int64_t input_mbuf_cnt; /* total # of packets processed */ #endif }; +/* + * DLIL input thread info (for main/loopback input thread) + */ +struct dlil_main_threading_info { + struct dlil_threading_info inp; + class_queue_t lo_rcvq_pkts; /* queue of lo0 pkts */ +}; + /* * The following are shared with kpi_protocol.c so that it may wakeup * the input thread to run through packets queued for protocol input. 
@@ -108,8 +196,12 @@ struct dlil_threading_info { #define DLIL_PROTO_WAITING 0x10000000 #define DLIL_INPUT_TERMINATE 0x08000000 +__private_extern__ struct dlil_threading_info *dlil_main_input_thread; + extern void dlil_init(void); +extern errno_t ifp_if_ioctl(struct ifnet *, unsigned long, void *); + extern errno_t dlil_set_bpf_tap(ifnet_t, bpf_tap_mode, bpf_packet_func); /* @@ -119,17 +211,30 @@ extern errno_t dlil_send_arp_internal(ifnet_t, u_int16_t, const struct sockaddr_dl *, const struct sockaddr *, const struct sockaddr_dl *, const struct sockaddr *); +/* + * The following flags used to check if a network thread already + * owns the lock + */ +#define NET_THREAD_HELD_PF 0x1 /* thread is holding PF lock */ +#define NET_THREAD_HELD_DOMAIN 0x2 /* thread is holding domain_proto_mtx */ + +extern errno_t net_thread_check_lock(u_int32_t); +extern void net_thread_set_lock(u_int32_t); +extern void net_thread_unset_lock(u_int32_t); + extern int dlil_output(ifnet_t, protocol_family_t, mbuf_t, void *, - const struct sockaddr *, int); + const struct sockaddr *, int, struct flowadv *); extern void dlil_input_packet_list(struct ifnet *, struct mbuf *); +extern void dlil_input_packet_list_extended(struct ifnet *, struct mbuf *, + u_int32_t, ifnet_model_t); extern errno_t dlil_resolve_multi(struct ifnet *, const struct sockaddr *, struct sockaddr *, size_t); extern errno_t dlil_send_arp(ifnet_t, u_int16_t, const struct sockaddr_dl *, const struct sockaddr *, const struct sockaddr_dl *, - const struct sockaddr *); + const struct sockaddr *, u_int32_t); extern int dlil_attach_filter(ifnet_t, const struct iff_filter *, interface_filter_t *); @@ -140,6 +245,8 @@ extern void dlil_proto_unplumb_all(ifnet_t); extern void dlil_post_msg(struct ifnet *, u_int32_t, u_int32_t, struct net_event_data *, u_int32_t); +extern int dlil_alloc_local_stats(struct ifnet *); + /* * dlil_if_acquire is obsolete. Use ifnet_allocate. */ @@ -155,6 +262,11 @@ extern u_int32_t ifnet_aggressive_drainers; extern errno_t dlil_if_ref(struct ifnet *); extern errno_t dlil_if_free(struct ifnet *); +extern void dlil_node_present(struct ifnet *, struct sockaddr *, int32_t, int, + int, u_int8_t[48]); +extern void dlil_node_absent(struct ifnet *, struct sockaddr *); + +#endif /* BSD_KERNEL_PRIVATE */ #endif /* KERNEL_PRIVATE */ #endif /* KERNEL */ #endif /* DLIL_H */ diff --git a/bsd/net/ether_at_pr_module.c b/bsd/net/ether_at_pr_module.c index 1adcbe27e..e7daa051a 100644 --- a/bsd/net/ether_at_pr_module.c +++ b/bsd/net/ether_at_pr_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000,2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -186,12 +186,16 @@ ether_at_prmod_ioctl( u_int32_t command, void *data) { - struct ifreq *ifr = data; int error = 0; switch (command) { - case SIOCSIFADDR: + case SIOCSIFADDR: /* struct ifaddr pointer */ + /* + * Note: caller of ifnet_ioctl() passes in pointer to + * struct ifaddr as parameter to SIOCSIFADDR, for legacy + * reasons. 
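[Sketch] The NET_THREAD_HELD_* bits declared above live in the current uthread, letting nested code ask whether this thread already holds a given network lock instead of deadlocking on a recursive acquire. Expected shape of use; the pf_lock mutex is a placeholder:

/* owner path: take the lock and record ownership */
lck_mtx_lock(pf_lock);
net_thread_set_lock(NET_THREAD_HELD_PF);
/* ... PF processing ... */
net_thread_unset_lock(NET_THREAD_HELD_PF);
lck_mtx_unlock(pf_lock);

/* re-entrant path: avoid acquiring twice on the same thread */
if (net_thread_check_lock(NET_THREAD_HELD_PF)) {
    /* already held by this thread; proceed without locking */
} else {
    lck_mtx_lock(pf_lock);
    /* ... */
    lck_mtx_unlock(pf_lock);
}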
+ */ if ((ifp->if_flags & IFF_RUNNING) == 0) { ifnet_set_flags(ifp, IFF_UP, IFF_UP); ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL); @@ -199,9 +203,12 @@ ether_at_prmod_ioctl( break; - case SIOCGIFADDR: + case SIOCGIFADDR: { /* struct ifreq */ + struct ifreq *ifr = data; + ifnet_lladdr_copy_bytes(ifp, ifr->ifr_addr.sa_data, ETHER_ADDR_LEN); break; + } default: error = EOPNOTSUPP; diff --git a/bsd/net/ether_if_module.c b/bsd/net/ether_if_module.c index a1cbfb3d1..60b6846d2 100644 --- a/bsd/net/ether_if_module.c +++ b/bsd/net/ether_if_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -97,7 +97,9 @@ #include #include #include -#include +#if BOND +#include +#endif /* BOND */ #if IF_BRIDGE #include #endif /* IF_BRIDGE */ @@ -357,7 +359,7 @@ ether_demux( char *frame_header, protocol_family_t *protocol_family) { - struct ether_header *eh = (struct ether_header *)frame_header; + struct ether_header *eh = (struct ether_header *)(void *)frame_header; u_short ether_type = eh->ether_type; u_int16_t type; u_int8_t *data; @@ -416,7 +418,7 @@ ether_demux( else if (ether_type == htons(ETHERTYPE_VLAN)) { struct ether_vlan_header * evl; - evl = (struct ether_vlan_header *)frame_header; + evl = (struct ether_vlan_header *)(void *)frame_header; if (m->m_len < ETHER_VLAN_ENCAP_LEN || ntohs(evl->evl_proto) == ETHERTYPE_VLAN || EVL_VLANOFTAG(ntohs(evl->evl_tag)) != 0) { @@ -443,13 +445,13 @@ ether_demux( */ if (ntohs(ether_type) <= 1500) { - extProto1 = *(u_int32_t*)data; + bcopy(data, &extProto1, sizeof (u_int32_t)); // SAP or SNAP if ((extProto1 & htonl(0xFFFFFF00)) == htonl(0xAAAA0300)) { // SNAP type = DLIL_DESC_SNAP; - extProto2 = *(u_int32_t*)(data + sizeof(u_int32_t)); + bcopy(data + sizeof(u_int32_t), &extProto2, sizeof (u_int32_t)); extProto1 &= htonl(0x000000FF); } else { type = DLIL_DESC_SAP; @@ -504,11 +506,17 @@ ether_demux( */ int ether_frameout( - struct ifnet *ifp, - struct mbuf **m, - const struct sockaddr *ndest, - const char *edst, - const char *ether_type) + struct ifnet *ifp, + struct mbuf **m, + const struct sockaddr *ndest, + const char *edst, + const char *ether_type +#if KPI_INTERFACE_EMBEDDED + , + u_int32_t *prepend_len, + u_int32_t *postpend_len +#endif /* KPI_INTERFACE_EMBEDDED */ + ) { struct ether_header *eh; int hlen; /* link layer header length */ @@ -530,11 +538,11 @@ ether_frameout( if ((*m)->m_flags & M_BCAST) { struct mbuf *n = m_copy(*m, 0, (int)M_COPYALL); if (n != NULL) - dlil_output(lo_ifp, ndest->sa_family, n, NULL, ndest, 0); + dlil_output(lo_ifp, ndest->sa_family, n, NULL, ndest, 0, NULL); } else { if (_ether_cmp(edst, ifnet_lladdr(ifp)) == 0) { - dlil_output(lo_ifp, ndest->sa_family, *m, NULL, ndest, 0); + dlil_output(lo_ifp, ndest->sa_family, *m, NULL, ndest, 0, NULL); return EJUSTRETURN; } } @@ -550,7 +558,11 @@ ether_frameout( return (EJUSTRETURN); } - +#if KPI_INTERFACE_EMBEDDED + *prepend_len = sizeof (struct ether_header); + *postpend_len = 0; +#endif /* KPI_INTERFACE_EMBEDDED */ + eh = mtod(*m, struct ether_header *); (void)memcpy(&eh->ether_type, ether_type, sizeof(eh->ether_type)); @@ -582,7 +594,8 @@ ether_check_multi( break; case AF_LINK: - e_addr = CONST_LLADDR((const struct sockaddr_dl*)proto_addr); + e_addr = CONST_LLADDR((const struct sockaddr_dl*) + (uintptr_t)(size_t)proto_addr); if ((e_addr[0] & 0x01) != 0x01) result = EADDRNOTAVAIL; else diff --git a/bsd/net/ether_inet6_pr_module.c 
b/bsd/net/ether_inet6_pr_module.c index e8411dec6..78a2b3f07 100644 --- a/bsd/net/ether_inet6_pr_module.c +++ b/bsd/net/ether_inet6_pr_module.c @@ -111,9 +111,12 @@ ether_inet6_input(ifnet_t ifp, protocol_family_t protocol, mbuf_t packet, char *header) { #pragma unused(ifp, protocol) - struct ether_header *eh = (struct ether_header *)header; + struct ether_header *eh = (struct ether_header *)(void *)header; + u_int16_t ether_type; - if (eh->ether_type == htons(ETHERTYPE_IPV6)) { + bcopy(&eh->ether_type, &ether_type, sizeof (ether_type)); + + if (ether_type == htons(ETHERTYPE_IPV6)) { struct ifnet *mifp; /* * Trust the ifp in the mbuf, rather than ifproto's @@ -155,11 +158,13 @@ ether_inet6_pre_output(ifnet_t ifp, protocol_family_t protocol_family, */ m->m_flags |= M_LOOP; - result = nd6_lookup_ipv6(ifp, (const struct sockaddr_in6 *)dst_netaddr, - &sdl, sizeof (sdl), route, *m0); + result = nd6_lookup_ipv6(ifp, (const struct sockaddr_in6 *) + (uintptr_t)(size_t)dst_netaddr, &sdl, sizeof (sdl), route, *m0); if (result == 0) { - *(u_int16_t *)type = htons(ETHERTYPE_IPV6); + u_int16_t ethertype_ipv6 = htons(ETHERTYPE_IPV6); + + bcopy(&ethertype_ipv6, type, sizeof (ethertype_ipv6)); bcopy(LLADDR(&sdl), edst, sdl.sdl_alen); } @@ -173,7 +178,7 @@ ether_inet6_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr, static const size_t minsize = offsetof(struct sockaddr_dl, sdl_data[0]) + ETHER_ADDR_LEN; const struct sockaddr_in6 *sin6 = - (const struct sockaddr_in6 *)proto_addr; + (const struct sockaddr_in6 *)(uintptr_t)(size_t)proto_addr; if (proto_addr->sa_family != AF_INET6) return (EAFNOSUPPORT); @@ -202,21 +207,28 @@ ether_inet6_prmod_ioctl(ifnet_t ifp, protocol_family_t protocol_family, u_long command, void *data) { #pragma unused(protocol_family) - struct ifreq *ifr = (struct ifreq *)data; int error = 0; switch (command) { - case SIOCSIFADDR: + case SIOCSIFADDR: /* struct ifaddr pointer */ + /* + * Note: caller of ifnet_ioctl() passes in pointer to + * struct ifaddr as parameter to SIOCSIFADDR, for legacy + * reasons. + */ if ((ifp->if_flags & IFF_RUNNING) == 0) { ifnet_set_flags(ifp, IFF_UP, IFF_UP); ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL); } break; - case SIOCGIFADDR: + case SIOCGIFADDR: { /* struct ifreq */ + struct ifreq *ifr = (struct ifreq *)(void *)data; + (void) ifnet_lladdr_copy_bytes(ifp, ifr->ifr_addr.sa_data, ETHER_ADDR_LEN); break; + } default: error = EOPNOTSUPP; diff --git a/bsd/net/ether_inet_pr_module.c b/bsd/net/ether_inet_pr_module.c index 12a8ead3c..b8820a5a5 100644 --- a/bsd/net/ether_inet_pr_module.c +++ b/bsd/net/ether_inet_pr_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -109,7 +109,7 @@ extern void kdp_set_ip_and_mac_addresses(struct in_addr *ipaddr, struct ether_addr *macaddr); #define _ip_copy(dst, src) \ - (*(dst) = *(src)) + bcopy(src, dst, sizeof (struct in_addr)) static void ether_inet_arp_input(struct ifnet *ifp, struct mbuf *m) @@ -142,9 +142,9 @@ ether_inet_arp_input(struct ifnet *ifp, struct mbuf *m) bzero(&sender_ip, sizeof (sender_ip)); sender_ip.sin_len = sizeof (sender_ip); sender_ip.sin_family = AF_INET; - _ip_copy(&sender_ip.sin_addr, (const struct in_addr *)ea->arp_spa); + _ip_copy(&sender_ip.sin_addr, ea->arp_spa); target_ip = sender_ip; - _ip_copy(&target_ip.sin_addr, (const struct in_addr *)ea->arp_tpa); + _ip_copy(&target_ip.sin_addr, ea->arp_tpa); bzero(&sender_hw, sizeof (sender_hw)); sender_hw.sdl_len = sizeof (sender_hw); @@ -247,21 +247,24 @@ ether_inet_pre_output(ifnet_t ifp, protocol_family_t protocol_family, struct sockaddr_dl ll_dest; result = arp_lookup_ip(ifp, - (const struct sockaddr_in *)dst_netaddr, &ll_dest, - sizeof (ll_dest), (route_t)route, *m0); + (const struct sockaddr_in *)(uintptr_t)(size_t)dst_netaddr, + &ll_dest, sizeof (ll_dest), (route_t)route, *m0); if (result == 0) { + u_int16_t ethertype_ip = htons(ETHERTYPE_IP); + bcopy(LLADDR(&ll_dest), edst, ETHER_ADDR_LEN); - *(u_int16_t *)type = htons(ETHERTYPE_IP); + bcopy(&ethertype_ip, type, sizeof (ethertype_ip)); } - break; + break; } case pseudo_AF_HDRCMPLT: case AF_UNSPEC: m->m_flags &= ~M_LOOP; - eh = (const struct ether_header *)dst_netaddr->sa_data; + eh = (const struct ether_header *)(uintptr_t)(size_t) + dst_netaddr->sa_data; (void) memcpy(edst, eh->ether_dhost, 6); - *(u_short *)type = eh->ether_type; + bcopy(&eh->ether_type, type, sizeof (u_short)); break; default: @@ -281,7 +284,8 @@ ether_inet_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr, { static const size_t minsize = offsetof(struct sockaddr_dl, sdl_data[0]) + ETHER_ADDR_LEN; - const struct sockaddr_in *sin = (const struct sockaddr_in *)proto_addr; + const struct sockaddr_in *sin = + (const struct sockaddr_in *)(uintptr_t)(size_t)proto_addr; if (proto_addr->sa_family != AF_INET) return (EAFNOSUPPORT); @@ -310,13 +314,18 @@ ether_inet_prmod_ioctl(ifnet_t ifp, protocol_family_t protocol_family, u_long command, void *data) { #pragma unused(protocol_family) - ifaddr_t ifa = data; - struct ifreq *ifr = data; int error = 0; switch (command) { - case SIOCSIFADDR: - case SIOCAIFADDR: + case SIOCSIFADDR: /* struct ifaddr pointer */ + case SIOCAIFADDR: { /* struct ifaddr pointer */ + /* + * Note: caller of ifnet_ioctl() passes in pointer to + * struct ifaddr as parameter to SIOC{A,S}IFADDR, for + * legacy reasons. + */ + struct ifaddr *ifa = data; + if (!(ifnet_flags(ifp) & IFF_RUNNING)) { ifnet_set_flags(ifp, IFF_UP, IFF_UP); ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL); @@ -326,6 +335,10 @@ ether_inet_prmod_ioctl(ifnet_t ifp, protocol_family_t protocol_family, break; inet_arp_init_ifaddr(ifp, ifa); + + if (command != SIOCSIFADDR) + break; + /* * Register new IP and MAC addresses with the kernel * debugger if the interface is the same as was registered * Do this only for the first address of the interface * and not for aliases. 
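[Sketch] The theme running through these protocol-module hunks, casts through (void *)/(uintptr_t) and bcopy() in place of direct *(u_int16_t *) loads and stores, is alignment safety: header fields inside mbufs and sockaddr payloads are byte-aligned at best, and a cast-and-dereference can fault on strict-alignment CPUs besides being undefined behavior in C. The safe idiom, isolated; frame_header comes from the surrounding demux context:

u_int16_t etype;

/* unaligned-safe: copy the 16-bit EtherType out of the frame header */
bcopy(frame_header + 2 * ETHER_ADDR_LEN, &etype, sizeof (etype));

/*
 * The replaced idiom, etype = *(u_int16_t *)(frame_header + 12),
 * assumes 2-byte alignment that the mbuf layout does not guarantee.
 */
if (etype == htons(ETHERTYPE_IP)) {
    /* ... */
}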
*/ - if (command == SIOCSIFADDR && - ((kdp_get_interface() != 0 && + if ((kdp_get_interface() != 0 && kdp_get_interface() == ifp->if_softc) || - (kdp_get_interface() == 0 && ifp->if_unit == 0))) + (kdp_get_interface() == 0 && ifp->if_unit == 0)) kdp_set_ip_and_mac_addresses(&(IA_SIN(ifa)->sin_addr), ifnet_lladdr(ifp)); break; + } + + case SIOCGIFADDR: { /* struct ifreq */ + struct ifreq *ifr = data; - case SIOCGIFADDR: ifnet_lladdr_copy_bytes(ifp, ifr->ifr_addr.sa_data, ETHER_ADDR_LEN); break; + } default: error = EOPNOTSUPP; @@ -390,9 +406,9 @@ ether_inet_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw, struct ether_header *eh; struct ether_arp *ea; const struct sockaddr_in *sender_ip = - (const struct sockaddr_in *)sender_proto; - const struct sockaddr_in *target_ip = - (const struct sockaddr_in *)target_proto; + (const struct sockaddr_in *)(uintptr_t)(size_t)sender_proto; + const struct sockaddr_inarp *target_ip = + (const struct sockaddr_inarp *)(uintptr_t)(size_t)target_proto; char *datap; if (target_ip == NULL) @@ -459,8 +475,9 @@ ether_inet_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw, IFA_LOCK(ifa); if (ifa->ifa_addr != NULL && ifa->ifa_addr->sa_family == AF_INET) { - bcopy(&((struct sockaddr_in *)ifa->ifa_addr)-> - sin_addr, ea->arp_spa, sizeof(ea->arp_spa)); + bcopy(&((struct sockaddr_in *)(void *) + ifa->ifa_addr)->sin_addr, ea->arp_spa, + sizeof (ea->arp_spa)); IFA_UNLOCK(ifa); break; } @@ -489,6 +506,23 @@ ether_inet_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw, /* Target IP */ bcopy(&target_ip->sin_addr, ea->arp_tpa, sizeof (ea->arp_tpa)); + /* + * If this is an ARP request for a (default) router, mark + * the packet accordingly so that the driver can find out, + * in case it needs to perform driver-specific action(s). + */ + if (arpop == ARPOP_REQUEST && (target_ip->sin_other & SIN_ROUTER)) { + m->m_pkthdr.aux_flags |= MAUXF_INET_RESOLVE_RTR; + VERIFY(!(m->m_pkthdr.aux_flags & MAUXF_INET6_RESOLVE_RTR)); + } + + if (ifp->if_eflags & IFEF_TXSTART) { + /* Use control service class if the interface + * supports transmit-start model + */ + (void) m_set_service_class(m, MBUF_SC_CTL); + } + ifnet_output_raw(ifp, PF_INET, m); return (0); diff --git a/bsd/net/pf_mtag.h b/bsd/net/flowadv.h similarity index 67% rename from bsd/net/pf_mtag.h rename to bsd/net/flowadv.h index 218ca4e8e..96e6e9e9c 100644 --- a/bsd/net/pf_mtag.h +++ b/bsd/net/flowadv.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,35 +26,25 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef _NET_PF_MTAG_H_ -#define _NET_PF_MTAG_H_ +#ifndef _NET_FLOWADV_H_ +#define _NET_FLOWADV_H_ -#if PF -#if KERNEL_PRIVATE +#include #ifdef __cplusplus extern "C" { #endif -#define PF_TAG_GENERATED 0x01 -#define PF_TAG_FRAGCACHE 0x02 -#define PF_TAG_TRANSLATE_LOCALHOST 0x04 +#define FADV_SUCCESS 0 /* success */ +#define FADV_FLOW_CONTROLLED 1 /* regular flow control */ +#define FADV_SUSPENDED 2 /* flow control due to suspension */ -struct pf_mtag { - void *hdr; /* saved hdr pos in mbuf, for ECN */ - unsigned int rtableid; /* alternate routing table id */ - u_int32_t qid; /* queue id */ - u_int16_t tag; /* tag id */ - u_int8_t flags; - u_int8_t routed; +struct flowadv { + int32_t code; /* FADV advisory code */ }; -__private_extern__ struct pf_mtag *pf_find_mtag(struct mbuf *); -__private_extern__ struct pf_mtag *pf_get_mtag(struct mbuf *); - #ifdef __cplusplus } #endif -#endif /* KERNEL_PRIVATE */ -#endif /* PF */ -#endif /* _NET_PF_MTAG_H_ */ + +#endif /* _NET_FLOWADV_H_ */ diff --git a/bsd/net/flowhash.c b/bsd/net/flowhash.c new file mode 100644 index 000000000..e63462423 --- /dev/null +++ b/bsd/net/flowhash.c @@ -0,0 +1,825 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * http://code.google.com/p/smhasher/ + * + * Copyright (c) 2009-2011 Austin Appleby. + * + * MurmurHash3 was written by Austin Appleby, and is placed in the public + * domain. The author hereby disclaims copyright to this source code. + */ + +/* + * http://burtleburtle.net/bob/hash/ + * + * lookup3.c, by Bob Jenkins, May 2006, Public Domain. + * + * You can use this free for any purpose. It's in the public domain. + * It has no warranty. 
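The new flowadv.h boils a transmit-path flow advisory down to a single signed code. How the stack acts on the advisory is not shown in this hunk; the following is a self-contained restatement with one hypothetical consumer (sender_should_pause is illustrative only):

    #include <stdint.h>

    #define FADV_SUCCESS            0   /* success */
    #define FADV_FLOW_CONTROLLED    1   /* regular flow control */
    #define FADV_SUSPENDED          2   /* flow control due to suspension */

    struct flowadv {
        int32_t code;                   /* FADV advisory code */
    };

    /* Hypothetical consumer: back off whenever the advisory is not success. */
    static int
    sender_should_pause(const struct flowadv *adv)
    {
        return (adv->code == FADV_FLOW_CONTROLLED ||
            adv->code == FADV_SUSPENDED);
    }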
+ */ + +#include +#include +#include +#include + +static inline u_int32_t getblock32(const u_int32_t *, int); +static inline u_int64_t getblock64(const u_int64_t *, int); +static inline u_int32_t mh3_fmix32(u_int32_t); +static inline u_int64_t mh3_fmix64(u_int64_t); + +#define ALIGNED16(v) ((((uintptr_t)(v)) & 1) == 0) +#define ALIGNED32(v) ((((uintptr_t)(v)) & 3) == 0) +#define ALIGNED64(v) ((((uintptr_t)(v)) & 7) == 0) + +#define ROTL32(x, r) (((x) << (r)) | ((x) >> (32 - (r)))) +#define ROTL64(x, r) (((x) << (r)) | ((x) >> (64 - (r)))) + +/* + * The following hash algorithms are selected based on performance: + * + * Intel 32-bit: MurmurHash3_x86_32 + * Intel 64-bit: MurmurHash3_x64_128 + * ARM, et al: JHash + */ +#if defined(__i386__) +net_flowhash_fn_t *net_flowhash = net_flowhash_mh3_x86_32; +#elif defined(__x86_64__) +net_flowhash_fn_t *net_flowhash = net_flowhash_mh3_x64_128; +#else /* !__i386__ && !__x86_64__ */ +net_flowhash_fn_t *net_flowhash = net_flowhash_jhash; +#endif /* !__i386__ && !__x86_64__ */ + +#if defined(__i386__) || defined(__x86_64__) +static inline u_int32_t +getblock32(const u_int32_t *p, int i) +{ + return (p[i]); +} + +static inline u_int64_t +getblock64(const u_int64_t *p, int i) +{ + return (p[i]); +} +#else /* !__i386__ && !__x86_64 */ +static inline u_int32_t +getblock32(const u_int32_t *p, int i) +{ + const u_int8_t *bytes = (u_int8_t *)(void *)(uintptr_t)(p + i); + u_int32_t value; + + if (ALIGNED32(p)) { + value = p[i]; + } else { +#if BYTE_ORDER == BIG_ENDIAN + value = + (((u_int32_t)bytes[0]) << 24) | + (((u_int32_t)bytes[1]) << 16) | + (((u_int32_t)bytes[2]) << 8) | + ((u_int32_t)bytes[3]); +#else /* LITTLE_ENDIAN */ + value = + (((u_int32_t)bytes[3]) << 24) | + (((u_int32_t)bytes[2]) << 16) | + (((u_int32_t)bytes[1]) << 8) | + ((u_int32_t)bytes[0]); +#endif /* LITTLE_ENDIAN */ + } + return (value); +} + +static inline u_int64_t +getblock64(const u_int64_t *p, int i) +{ + const u_int8_t *bytes = (const u_int8_t *)(void *)(uintptr_t)(p + i); + u_int64_t value; + + if (ALIGNED64(p)) { + value = p[i]; + } else { +#if BYTE_ORDER == BIG_ENDIAN + value = + (((u_int64_t)bytes[0]) << 56) | + (((u_int64_t)bytes[1]) << 48) | + (((u_int64_t)bytes[2]) << 40) | + (((u_int64_t)bytes[3]) << 32) | + (((u_int64_t)bytes[4]) << 24) | + (((u_int64_t)bytes[5]) << 16) | + (((u_int64_t)bytes[6]) << 8) | + ((u_int64_t)bytes[7]); +#else /* LITTLE_ENDIAN */ + value = + (((u_int64_t)bytes[7]) << 56) | + (((u_int64_t)bytes[6]) << 48) | + (((u_int64_t)bytes[5]) << 40) | + (((u_int64_t)bytes[4]) << 32) | + (((u_int64_t)bytes[3]) << 24) | + (((u_int64_t)bytes[2]) << 16) | + (((u_int64_t)bytes[1]) << 8) | + ((u_int64_t)bytes[0]); +#endif /* LITTLE_ENDIAN */ + } + return (value); +} +#endif /* !__i386__ && !__x86_64 */ + +static inline u_int32_t +mh3_fmix32(u_int32_t h) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return (h); +} + +static inline u_int64_t +mh3_fmix64(u_int64_t k) +{ + k ^= k >> 33; + k *= 0xff51afd7ed558ccdLLU; + k ^= k >> 33; + k *= 0xc4ceb9fe1a85ec53LLU; + k ^= k >> 33; + + return (k); +} + +/* + * MurmurHash3_x86_32 + */ +#define MH3_X86_32_C1 0xcc9e2d51 +#define MH3_X86_32_C2 0x1b873593 + +u_int32_t +net_flowhash_mh3_x86_32(const void *key, u_int32_t len, const u_int32_t seed) +{ + const u_int8_t *data = (const u_int8_t *)key; + const u_int32_t nblocks = len / 4; + const u_int32_t *blocks; + const u_int8_t *tail; + u_int32_t h1 = seed, k1; + int i; + + /* body */ + blocks = (const u_int32_t *)(const void *)(data + 
nblocks * 4); + + for (i = -nblocks; i; i++) { + k1 = getblock32(blocks, i); + + k1 *= MH3_X86_32_C1; + k1 = ROTL32(k1, 15); + k1 *= MH3_X86_32_C2; + + h1 ^= k1; + h1 = ROTL32(h1, 13); + h1 = h1 * 5 + 0xe6546b64; + } + + /* tail */ + tail = (const u_int8_t *)(const void *)(data + nblocks * 4); + k1 = 0; + + switch (len & 3) { + case 3: + k1 ^= tail[2] << 16; + /* FALLTHRU */ + case 2: + k1 ^= tail[1] << 8; + /* FALLTHRU */ + case 1: + k1 ^= tail[0]; + k1 *= MH3_X86_32_C1; + k1 = ROTL32(k1, 15); + k1 *= MH3_X86_32_C2; + h1 ^= k1; + }; + + /* finalization */ + h1 ^= len; + + h1 = mh3_fmix32(h1); + + return (h1); +} + +/* + * MurmurHash3_x64_128 + */ +#define MH3_X64_128_C1 0x87c37b91114253d5LLU +#define MH3_X64_128_C2 0x4cf5ad432745937fLLU + +u_int32_t +net_flowhash_mh3_x64_128(const void *key, u_int32_t len, const u_int32_t seed) +{ + const u_int8_t *data = (const u_int8_t *)key; + const u_int32_t nblocks = len / 16; + const u_int64_t *blocks; + const u_int8_t *tail; + u_int64_t h1 = seed, k1; + u_int64_t h2 = seed, k2; + u_int32_t i; + + /* body */ + blocks = (const u_int64_t *)(const void *)data; + + for (i = 0; i < nblocks; i++) { + k1 = getblock64(blocks, i * 2 + 0); + k2 = getblock64(blocks, i * 2 + 1); + + k1 *= MH3_X64_128_C1; + k1 = ROTL64(k1, 31); + k1 *= MH3_X64_128_C2; + h1 ^= k1; + + h1 = ROTL64(h1, 27); + h1 += h2; + h1 = h1 * 5 + 0x52dce729; + + k2 *= MH3_X64_128_C2; + k2 = ROTL64(k2, 33); + k2 *= MH3_X64_128_C1; + h2 ^= k2; + + h2 = ROTL64(h2, 31); + h2 += h1; + h2 = h2 * 5+ 0x38495ab5; + } + + /* tail */ + tail = (const u_int8_t *)(const void *)(data + nblocks * 16); + k1 = 0; + k2 = 0; + + switch (len & 15) { + case 15: + k2 ^= ((u_int64_t)tail[14]) << 48; + /* FALLTHRU */ + case 14: + k2 ^= ((u_int64_t)tail[13]) << 40; + /* FALLTHRU */ + case 13: + k2 ^= ((u_int64_t)tail[12]) << 32; + /* FALLTHRU */ + case 12: + k2 ^= ((u_int64_t)tail[11]) << 24; + /* FALLTHRU */ + case 11: + k2 ^= ((u_int64_t)tail[10]) << 16; + /* FALLTHRU */ + case 10: + k2 ^= ((u_int64_t)tail[9]) << 8; + /* FALLTHRU */ + case 9: + k2 ^= ((u_int64_t)tail[8]) << 0; + k2 *= MH3_X64_128_C2; + k2 = ROTL64(k2, 33); + k2 *= MH3_X64_128_C1; + h2 ^= k2; + /* FALLTHRU */ + case 8: + k1 ^= ((u_int64_t)tail[7]) << 56; + /* FALLTHRU */ + case 7: + k1 ^= ((u_int64_t)tail[6]) << 48; + /* FALLTHRU */ + case 6: + k1 ^= ((u_int64_t)tail[5]) << 40; + /* FALLTHRU */ + case 5: + k1 ^= ((u_int64_t)tail[4]) << 32; + /* FALLTHRU */ + case 4: + k1 ^= ((u_int64_t)tail[3]) << 24; + /* FALLTHRU */ + case 3: + k1 ^= ((u_int64_t)tail[2]) << 16; + /* FALLTHRU */ + case 2: + k1 ^= ((u_int64_t)tail[1]) << 8; + /* FALLTHRU */ + case 1: + k1 ^= ((u_int64_t)tail[0]) << 0; + k1 *= MH3_X64_128_C1; + k1 = ROTL64(k1, 31); + k1 *= MH3_X64_128_C2; + h1 ^= k1; + }; + + /* finalization */ + h1 ^= len; + h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = mh3_fmix64(h1); + h2 = mh3_fmix64(h2); + + h1 += h2; + h2 += h1; + + /* throw all but lowest 32-bit */ + return (h1 & 0xffffffff); +} + +#define JHASH_INIT 0xdeadbeef + +#define JHASH_MIX(a, b, c) { \ + a -= c; a ^= ROTL32(c, 4); c += b; \ + b -= a; b ^= ROTL32(a, 6); a += c; \ + c -= b; c ^= ROTL32(b, 8); b += a; \ + a -= c; a ^= ROTL32(c, 16); c += b; \ + b -= a; b ^= ROTL32(a, 19); a += c; \ + c -= b; c ^= ROTL32(b, 4); b += a; \ +} + +#define JHASH_FINAL(a, b, c) { \ + c ^= b; c -= ROTL32(b, 14); \ + a ^= c; a -= ROTL32(c, 11); \ + b ^= a; b -= ROTL32(a, 25); \ + c ^= b; c -= ROTL32(b, 16); \ + a ^= c; a -= ROTL32(c, 4); \ + b ^= a; b -= ROTL32(a, 14); \ + c ^= b; c -= ROTL32(b, 24); \ +} + +#if 
BYTE_ORDER == BIG_ENDIAN +/* + * hashbig() + */ +u_int32_t +net_flowhash_jhash(const void *key, u_int32_t len, const u_int32_t seed) +{ + u_int32_t a, b, c; + + /* Set up the internal state */ + a = b = c = JHASH_INIT + len + seed; + + if (ALIGNED32(key)) { + /* read 32-bit chunks */ + const u_int32_t *k = (const u_int32_t *)key; + + /* + * all but last block: + * aligned reads and affect 32 bits of (a,b,c) + */ + while (len > 12) { + a += k[0]; + b += k[1]; + c += k[2]; + JHASH_MIX(a, b, c); + len -= 12; + k += 3; + } + + /* + * handle the last (probably partial) block + * + * "k[2] << 8" actually reads beyond the end of the string, + * but then shifts out the part it's not allowed to read. + * Because the string is aligned, the illegal read is in + * the same word as the rest of the string. The masking + * trick does make the hash noticably faster for short + * strings (like English words). + */ + switch (len) { + case 12: + c += k[2]; + b += k[1]; + a += k[0]; + break; + + case 11: + c += k[2] & 0xffffff00; + b += k[1]; + a += k[0]; + break; + + case 10: + c += k[2] & 0xffff0000; + b += k[1]; + a += k[0]; + break; + + case 9: + c += k[2] & 0xff000000; + b += k[1]; + a += k[0]; + break; + + case 8: + b += k[1]; + a += k[0]; + break; + + case 7: + b += k[1] & 0xffffff00; + a += k[0]; + break; + + case 6: + b += k[1] & 0xffff0000; + a += k[0]; + break; + + case 5: + b += k[1] & 0xff000000; + a += k[0]; + break; + + case 4: + a += k[0]; + break; + + case 3: + a += k[0] & 0xffffff00; + break; + + case 2: + a += k[0] & 0xffff0000; + break; + + case 1: + a += k[0] & 0xff000000; + break; + + case 0: + /* zero length requires no mixing */ + return (c); + } + + JHASH_FINAL(a, b, c); + + return (c); + } + + /* need to read the key one byte at a time */ + const u_int8_t *k = (const u_int8_t *)key; + + /* all but the last block: affect some 32 bits of (a,b,c) */ + while (len > 12) { + a += ((u_int32_t)k[0]) << 24; + a += ((u_int32_t)k[1]) << 16; + a += ((u_int32_t)k[2]) << 8; + a += ((u_int32_t)k[3]); + b += ((u_int32_t)k[4]) << 24; + b += ((u_int32_t)k[5]) << 16; + b += ((u_int32_t)k[6]) << 8; + b += ((u_int32_t)k[7]); + c += ((u_int32_t)k[8]) << 24; + c += ((u_int32_t)k[9]) << 16; + c += ((u_int32_t)k[10]) << 8; + c += ((u_int32_t)k[11]); + JHASH_MIX(a, b, c); + len -= 12; + k += 12; + } + + /* last block: affect all 32 bits of (c) */ + switch (len) { + case 12: + c += k[11]; + /* FALLTHRU */ + case 11: + c += ((u_int32_t)k[10]) << 8; + /* FALLTHRU */ + case 10: + c += ((u_int32_t)k[9]) << 16; + /* FALLTHRU */ + case 9: + c += ((u_int32_t)k[8]) << 24; + /* FALLTHRU */ + case 8: + b += k[7]; + /* FALLTHRU */ + case 7: + b += ((u_int32_t)k[6]) << 8; + /* FALLTHRU */ + case 6: + b += ((u_int32_t)k[5]) << 16; + /* FALLTHRU */ + case 5: + b += ((u_int32_t)k[4]) << 24; + /* FALLTHRU */ + case 4: + a += k[3]; + /* FALLTHRU */ + case 3: + a += ((u_int32_t)k[2]) << 8; + /* FALLTHRU */ + case 2: + a += ((u_int32_t)k[1]) << 16; + /* FALLTHRU */ + case 1: + a += ((u_int32_t)k[0]) << 24; + break; + + case 0: + /* zero length requires no mixing */ + return (c); + } + + JHASH_FINAL(a, b, c); + + return (c); +} +#else /* LITTLE_ENDIAN */ +/* + * hashlittle() + */ +u_int32_t +net_flowhash_jhash(const void *key, u_int32_t len, const u_int32_t seed) +{ + u_int32_t a, b, c; + + /* Set up the internal state */ + a = b = c = JHASH_INIT + len + seed; + +#if defined(__i386__) || defined(__x86_64__) + /* + * On i386/x86_64, it is faster to read 32-bit chunks if the key + * is aligned 32-bit OR not 16-bit, and perform 
16-bit reads if it + * is aligned 16-bit. + */ + if (ALIGNED32(key) || !ALIGNED16(key)) { +#else /* !defined(__i386__) && !defined(__x86_64__) */ + if (ALIGNED32(key)) { +#endif /* !defined(__i386__) && !defined(__x86_64__) */ + /* read 32-bit chunks */ + const u_int32_t *k = (const u_int32_t *)key; + + /* + * all but last block: + * aligned reads and affect 32 bits of (a,b,c) + */ + while (len > 12) { + a += k[0]; + b += k[1]; + c += k[2]; + JHASH_MIX(a, b, c); + len -= 12; + k += 3; + } + + /* + * handle the last (probably partial) block + * + * "k[2] & 0xffffff" actually reads beyond the end of the + * string, but then masks off the part it's not allowed + * to read. Because the string is aligned, the masked-off + * tail is in the same word as the rest of the string. + * The masking trick does make the hash noticably faster + * for short strings (like English words). + */ + switch (len) { + case 12: + c += k[2]; + b += k[1]; + a += k[0]; + break; + + case 11: + c += k[2] & 0xffffff; + b += k[1]; + a += k[0]; + break; + + case 10: + c += k[2] & 0xffff; + b += k[1]; + a += k[0]; + break; + + case 9: + c += k[2] & 0xff; + b += k[1]; + a += k[0]; + break; + + case 8: + b += k[1]; + a += k[0]; + break; + + case 7: + b += k[1] & 0xffffff; + a += k[0]; + break; + + case 6: + b += k[1] & 0xffff; + a += k[0]; + break; + + case 5: + b += k[1] & 0xff; + a += k[0]; + break; + + case 4: + a += k[0]; + break; + + case 3: + a += k[0] & 0xffffff; + break; + + case 2: + a += k[0] & 0xffff; + break; + + case 1: + a += k[0] & 0xff; + break; + + case 0: + /* zero length requires no mixing */ + return (c); + } + + JHASH_FINAL(a, b, c); + + return (c); + } +#if !defined(__i386__) && !defined(__x86_64__) + else if (ALIGNED16(key)) { +#endif /* !defined(__i386__) && !defined(__x86_64__) */ + /* read 16-bit chunks */ + const u_int16_t *k = (const u_int16_t *)key; + const u_int8_t *k8; + + /* all but last block: aligned reads and different mixing */ + while (len > 12) { + a += k[0] + (((u_int32_t)k[1]) << 16); + b += k[2] + (((u_int32_t)k[3]) << 16); + c += k[4] + (((u_int32_t)k[5]) << 16); + JHASH_MIX(a, b, c); + len -= 12; + k += 6; + } + + /* handle the last (probably partial) block */ + k8 = (const u_int8_t *)k; + switch (len) { + case 12: + c += k[4] + (((u_int32_t)k[5]) << 16); + b += k[2] + (((u_int32_t)k[3]) << 16); + a += k[0] + (((u_int32_t)k[1]) << 16); + break; + + case 11: + c += ((u_int32_t)k8[10]) << 16; + /* FALLTHRU */ + case 10: + c += k[4]; + b += k[2] + (((u_int32_t)k[3]) << 16); + a += k[0] + (((u_int32_t)k[1]) << 16); + break; + + case 9: + c += k8[8]; + /* FALLTHRU */ + case 8: + b += k[2] + (((u_int32_t)k[3]) << 16); + a += k[0] + (((u_int32_t)k[1]) << 16); + break; + + case 7: + b += ((u_int32_t)k8[6]) << 16; + /* FALLTHRU */ + case 6: + b += k[2]; + a += k[0] + (((u_int32_t)k[1]) << 16); + break; + + case 5: + b += k8[4]; + /* FALLTHRU */ + case 4: + a += k[0] + (((u_int32_t)k[1]) << 16); + break; + + case 3: + a += ((u_int32_t)k8[2]) << 16; + /* FALLTHRU */ + case 2: + a += k[0]; + break; + + case 1: + a += k8[0]; + break; + + case 0: + /* zero length requires no mixing */ + return (c); + } + + JHASH_FINAL(a, b, c); + + return (c); +#if !defined(__i386__) && !defined(__x86_64__) + } + + /* need to read the key one byte at a time */ + const u_int8_t *k = (const u_int8_t *)key; + + /* all but the last block: affect some 32 bits of (a,b,c) */ + while (len > 12) { + a += k[0]; + a += ((u_int32_t)k[1]) << 8; + a += ((u_int32_t)k[2]) << 16; + a += ((u_int32_t)k[3]) << 24; + b += k[4]; + b += 
((u_int32_t)k[5]) << 8; + b += ((u_int32_t)k[6]) << 16; + b += ((u_int32_t)k[7]) << 24; + c += k[8]; + c += ((u_int32_t)k[9]) << 8; + c += ((u_int32_t)k[10]) << 16; + c += ((u_int32_t)k[11]) << 24; + JHASH_MIX(a, b, c); + len -= 12; + k += 12; + } + + /* last block: affect all 32 bits of (c) */ + switch (len) { + case 12: + c += ((u_int32_t)k[11]) << 24; + /* FALLTHRU */ + case 11: + c += ((u_int32_t)k[10]) << 16; + /* FALLTHRU */ + case 10: + c += ((u_int32_t)k[9]) << 8; + /* FALLTHRU */ + case 9: + c += k[8]; + /* FALLTHRU */ + case 8: + b += ((u_int32_t)k[7]) << 24; + /* FALLTHRU */ + case 7: + b += ((u_int32_t)k[6]) << 16; + /* FALLTHRU */ + case 6: + b += ((u_int32_t)k[5]) << 8; + /* FALLTHRU */ + case 5: + b += k[4]; + /* FALLTHRU */ + case 4: + a += ((u_int32_t)k[3]) << 24; + /* FALLTHRU */ + case 3: + a += ((u_int32_t)k[2]) << 16; + /* FALLTHRU */ + case 2: + a += ((u_int32_t)k[1]) << 8; + /* FALLTHRU */ + case 1: + a += k[0]; + break; + + case 0: + /* zero length requires no mixing */ + return (c); + } + + JHASH_FINAL(a, b, c); + + return (c); +#endif /* !defined(__i386__) && !defined(__x86_64__) */ +} +#endif /* LITTLE_ENDIAN */ diff --git a/osfmk/vm/vm_print.h b/bsd/net/flowhash.h similarity index 61% rename from osfmk/vm/vm_print.h rename to bsd/net/flowhash.h index 6decd44f6..dc7c3b58f 100644 --- a/osfmk/vm/vm_print.h +++ b/bsd/net/flowhash.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,48 +22,37 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef VM_PRINT_H -#define VM_PRINT_H - -#include -#include -extern void vm_map_print( - db_addr_t map); +#ifndef _NET_FLOWHASH_H_ +#define _NET_FLOWHASH_H_ -extern void vm_map_copy_print( - db_addr_t copy); +#include -#include +#ifdef __cplusplus +extern "C" { +#endif -extern int vm_follow_object( - vm_object_t object); - -extern void vm_object_print(db_expr_t, boolean_t, db_expr_t, char *); - -#include - -extern void vm_page_print( - db_addr_t p); - -#include -#if MACH_PAGEMAP -#include -extern void vm_external_print( - vm_external_map_t map, - vm_object_size_t size); -#endif /* MACH_PAGEMAP */ +/* + * If 32-bit hash value is too large, use this macro to truncate + * it to n-bit; masking is a faster operation than modulus. + */ +#define HASHMASK(n) ((1UL << (n)) - 1) -extern void db_vm(void); +/* + * Returns 32-bit hash value. 
Hashes which are capable of returning + * more bits currently have their results truncated to 32-bit. + */ +typedef u_int32_t net_flowhash_fn_t(const void *, u_int32_t, const u_int32_t); -extern vm_map_size_t db_vm_map_total_size( - db_addr_t map); +extern net_flowhash_fn_t *net_flowhash; +extern net_flowhash_fn_t net_flowhash_mh3_x86_32; +extern net_flowhash_fn_t net_flowhash_mh3_x64_128; +extern net_flowhash_fn_t net_flowhash_jhash; +#ifdef __cplusplus +} +#endif -#endif /* VM_PRINT_H */ +#endif /* _NET_FLOWHASH_H_ */ diff --git a/bsd/net/if.c b/bsd/net/if.c index 595fcaea9..1097ff570 100644 --- a/bsd/net/if.c +++ b/bsd/net/if.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -111,6 +111,11 @@ #include #include #include +#include +#include +#include +#include +#include #if INET6 #include #include @@ -122,6 +127,9 @@ #include #endif +#if PF_ALTQ +#include +#endif /* !PF_ALTQ */ /* * System initialization @@ -132,8 +140,9 @@ lck_attr_t *ifa_mtx_attr; lck_grp_t *ifa_mtx_grp; static lck_grp_attr_t *ifa_mtx_grp_attr; +static int ifioctl_ifreq(struct socket *, u_long, struct ifreq *, + struct proc *); static int ifconf(u_long cmd, user_addr_t ifrp, int * ret_space); -static void if_qflush(struct ifqueue *); __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *); void if_rtproto_del(struct ifnet *ifp, int protocol); @@ -151,7 +160,6 @@ static int if_clone_list(int count, int * total, user_addr_t dst); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); -int ifqmaxlen = IFQ_MAXLEN; struct ifnethead ifnet_head = TAILQ_HEAD_INITIALIZER(ifnet_head); static int if_cloners_count; @@ -413,8 +421,8 @@ if_next_index(void) } /* switch to the new tables and size */ - ifnet_addrs = (struct ifaddr **)new_ifnet_addrs; - ifindex2ifnet = (struct ifnet **)new_ifindex2ifnet; + ifnet_addrs = (struct ifaddr **)(void *)new_ifnet_addrs; + ifindex2ifnet = (struct ifnet **)(void *)new_ifindex2ifnet; if_indexlim = new_if_indexlim; /* release the old data */ @@ -951,7 +959,8 @@ ifa_ifwithnet_common(const struct sockaddr *addr, unsigned int ifscope) * so do that if we can. 
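flowhash.h above exports the hash behind a function pointer so each architecture binds the fastest implementation at build time, and HASHMASK(n) truncates the 32-bit result to a power-of-two table size with an AND instead of a modulus. A user-space sketch of the intended call pattern, with a toy hash standing in for MurmurHash3/JHash (toy_hash and five_tuple are stand-ins, not kernel definitions):

    #include <stdint.h>

    #define HASHMASK(n) ((1UL << (n)) - 1)

    typedef uint32_t net_flowhash_fn_t(const void *, uint32_t, const uint32_t);

    /* Stand-in hash; the kernel binds MurmurHash3 or JHash here. */
    static uint32_t
    toy_hash(const void *key, uint32_t len, const uint32_t seed)
    {
        const uint8_t *p = key;
        uint32_t h = seed;

        while (len-- != 0)
            h = h * 31 + *p++;
        return (h);
    }

    static net_flowhash_fn_t *net_flowhash = toy_hash;

    /* Packed so the hash covers exactly the tuple bytes, with no padding. */
    struct five_tuple {
        uint32_t laddr, faddr;
        uint16_t lport, fport;
        uint8_t proto;
    } __attribute__((packed));

    /* Map a connection into one of 2^10 buckets. */
    static uint32_t
    flow_bucket(const struct five_tuple *ft, uint32_t seed)
    {
        return (net_flowhash(ft, sizeof (*ft), seed) & HASHMASK(10));
    }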
*/ if (af == AF_LINK) { - const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; + const struct sockaddr_dl *sdl = + (const struct sockaddr_dl *)(uintptr_t)(size_t)addr; if (sdl->sdl_index && sdl->sdl_index <= if_index) { ifa = ifnet_addrs[sdl->sdl_index - 1]; if (ifa != NULL) @@ -1220,6 +1229,7 @@ if_updown( int i; struct ifaddr **ifa; struct timespec tv; + struct ifclassq *ifq = &ifp->if_snd; /* Wait until no one else is changing the up/down state */ while ((ifp->if_eflags & IFEF_UPDOWNCHANGE) != 0) { @@ -1229,16 +1239,16 @@ if_updown( msleep(&ifp->if_eflags, NULL, 0, "if_updown", &tv); ifnet_lock_exclusive(ifp); } - + /* Verify that the interface isn't already in the right state */ if ((!up && (ifp->if_flags & IFF_UP) == 0) || (up && (ifp->if_flags & IFF_UP) == IFF_UP)) { return; } - + /* Indicate that the up/down state is changing */ ifp->if_eflags |= IFEF_UPDOWNCHANGE; - + /* Mark interface up or down */ if (up) { ifp->if_flags |= IFF_UP; @@ -1246,9 +1256,9 @@ if_updown( else { ifp->if_flags &= ~IFF_UP; } - + ifnet_touch_lastchange(ifp); - + /* Drop the lock to notify addresses and route */ ifnet_lock_done(ifp); if (ifnet_get_address_list(ifp, &ifa) == 0) { @@ -1258,15 +1268,19 @@ if_updown( ifnet_free_address_list(ifa); } rt_ifmsg(ifp); - - /* Aquire the lock to clear the changing flag and flush the send queue */ - ifnet_lock_exclusive(ifp); + if (!up) - if_qflush(&ifp->if_snd); + if_qflush(ifp, 0); + + /* Inform all transmit queues about the new link state */ + IFCQ_LOCK(ifq); + ifnet_update_sndq(ifq, up ? CLASSQ_EV_LINK_UP : CLASSQ_EV_LINK_DOWN); + IFCQ_UNLOCK(ifq); + + /* Acquire the lock to clear the changing flag */ + ifnet_lock_exclusive(ifp); ifp->if_eflags &= ~IFEF_UPDOWNCHANGE; wakeup(&ifp->if_eflags); - - return; } /* @@ -1298,19 +1312,61 @@ if_up( /* * Flush an interface queue.
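The replacement that follows no longer walks the mbuf chain by hand; it delegates to the new ifclassq layer (IFCQ_PURGE), which also covers ALTQ when PF_ALTQ is configured. For contrast, the retired pattern looks like this when lifted into a standalone sketch (struct mbuf and struct ifqueue are pared-down stand-ins for the real types):

    #include <stddef.h>
    #include <stdlib.h>

    struct mbuf {
        struct mbuf *m_act;     /* next packet in the queue */
    };

    struct ifqueue {
        struct mbuf *ifq_head;
        struct mbuf *ifq_tail;
        int ifq_len;
    };

    static void
    m_freem(struct mbuf *m)
    {
        free(m);                /* stand-in for the mbuf free routine */
    }

    /* The retired idiom: pop and free every queued packet, then reset. */
    static void
    old_style_qflush(struct ifqueue *ifq)
    {
        struct mbuf *m, *n;

        n = ifq->ifq_head;
        while ((m = n) != NULL) {
            n = m->m_act;
            m_freem(m);
        }
        ifq->ifq_head = ifq->ifq_tail = NULL;
        ifq->ifq_len = 0;
    }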
*/ -static void -if_qflush(struct ifqueue *ifq) +void +if_qflush(struct ifnet *ifp, int ifq_locked) { - struct mbuf *m, *n; + struct ifclassq *ifq = &ifp->if_snd; + + if (!ifq_locked) + IFCQ_LOCK(ifq); + + if (IFCQ_IS_ENABLED(ifq)) + IFCQ_PURGE(ifq); +#if PF_ALTQ + if (IFCQ_IS_DRAINING(ifq)) + ifq->ifcq_drain = 0; + if (ALTQ_IS_ENABLED(IFCQ_ALTQ(ifq))) + ALTQ_PURGE(IFCQ_ALTQ(ifq)); +#endif /* PF_ALTQ */ - n = ifq->ifq_head; - while ((m = n) != 0) { - n = m->m_act; - m_freem(m); + VERIFY(IFCQ_IS_EMPTY(ifq)); + + if (!ifq_locked) + IFCQ_UNLOCK(ifq); +} + +void +if_qflush_sc(struct ifnet *ifp, mbuf_svc_class_t sc, u_int32_t flow, + u_int32_t *packets, u_int32_t *bytes, int ifq_locked) +{ + struct ifclassq *ifq = &ifp->if_snd; + u_int32_t cnt = 0, len = 0; + u_int32_t a_cnt = 0, a_len = 0; + + VERIFY(sc == MBUF_SC_UNSPEC || MBUF_VALID_SC(sc)); + VERIFY(flow != 0); + + if (!ifq_locked) + IFCQ_LOCK(ifq); + + if (IFCQ_IS_ENABLED(ifq)) + IFCQ_PURGE_SC(ifq, sc, flow, cnt, len); +#if PF_ALTQ + if (IFCQ_IS_DRAINING(ifq)) { + VERIFY((signed)(ifq->ifcq_drain - cnt) >= 0); + ifq->ifcq_drain -= cnt; } - ifq->ifq_head = NULL; - ifq->ifq_tail = NULL; - ifq->ifq_len = 0; + if (ALTQ_IS_ENABLED(IFCQ_ALTQ(ifq))) + ALTQ_PURGE_SC(IFCQ_ALTQ(ifq), sc, flow, a_cnt, a_len); +#endif /* PF_ALTQ */ + + if (!ifq_locked) + IFCQ_UNLOCK(ifq); + + if (packets != NULL) + *packets = cnt + a_cnt; + if (bytes != NULL) + *bytes = len + a_len; } /* @@ -1371,7 +1427,7 @@ struct ifnet * if_withname(struct sockaddr *sa) { char ifname[IFNAMSIZ+1]; - struct sockaddr_dl *sdl = (struct sockaddr_dl *)sa; + struct sockaddr_dl *sdl = (struct sockaddr_dl *)(void *)sa; if ( (sa->sa_family != AF_LINK) || (sdl->sdl_nlen == 0) || (sdl->sdl_nlen > IFNAMSIZ) ) @@ -1396,69 +1452,466 @@ if_withname(struct sockaddr *sa) int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) { - struct ifnet *ifp; - struct ifreq *ifr; - struct ifstat *ifs; + char ifname[IFNAMSIZ + 1]; + struct ifnet *ifp = NULL; + struct ifstat *ifs = NULL; int error = 0; - short oif_flags; - struct kev_msg ev_msg; - struct net_event_data ev_data; - bzero(&ev_data, sizeof(struct net_event_data)); - bzero(&ev_msg, sizeof(struct kev_msg)); + bzero(ifname, sizeof (ifname)); + + /* + * ioctls which don't require ifp, or ifreq ioctls + */ switch (cmd) { - case OSIOCGIFCONF32: - case SIOCGIFCONF32: { - struct ifconf32 *ifc = (struct ifconf32 *)data; - return (ifconf(cmd, CAST_USER_ADDR_T(ifc->ifc_req), - &ifc->ifc_len)); - /* NOTREACHED */ + case OSIOCGIFCONF32: /* struct ifconf32 */ + case SIOCGIFCONF32: { /* struct ifconf32 */ + struct ifconf32 ifc; + bcopy(data, &ifc, sizeof (ifc)); + error = ifconf(cmd, CAST_USER_ADDR_T(ifc.ifc_req), + &ifc.ifc_len); + bcopy(&ifc, data, sizeof (ifc)); + goto done; } - case SIOCGIFCONF64: - case OSIOCGIFCONF64: { - struct ifconf64 *ifc = (struct ifconf64 *)data; - return (ifconf(cmd, ifc->ifc_req, &ifc->ifc_len)); - /* NOTREACHED */ + + case SIOCGIFCONF64: /* struct ifconf64 */ + case OSIOCGIFCONF64: { /* struct ifconf64 */ + struct ifconf64 ifc; + bcopy(data, &ifc, sizeof (ifc)); + error = ifconf(cmd, ifc.ifc_req, &ifc.ifc_len); + bcopy(&ifc, data, sizeof (ifc)); + goto done; } + +#if IF_CLONE_LIST + case SIOCIFGCLONERS32: { /* struct if_clonereq32 */ + struct if_clonereq32 ifcr; + bcopy(data, &ifcr, sizeof (ifcr)); + error = if_clone_list(ifcr.ifcr_count, &ifcr.ifcr_total, + CAST_USER_ADDR_T(ifcr.ifcru_buffer)); + bcopy(&ifcr, data, sizeof (ifcr)); + goto done; + } + + case SIOCIFGCLONERS64: { /* struct if_clonereq64 */ + struct 
if_clonereq64 ifcr; + bcopy(data, &ifcr, sizeof (ifcr)); + error = if_clone_list(ifcr.ifcr_count, &ifcr.ifcr_total, + ifcr.ifcru_buffer); + bcopy(&ifcr, data, sizeof (ifcr)); + goto done; } - ifr = (struct ifreq *)data; +#endif /* IF_CLONE_LIST */ + + case SIOCSIFDSTADDR: /* struct ifreq */ + case SIOCSIFADDR: /* struct ifreq */ + case SIOCSIFBRDADDR: /* struct ifreq */ + case SIOCSIFNETMASK: /* struct ifreq */ + case OSIOCGIFADDR: /* struct ifreq */ + case OSIOCGIFDSTADDR: /* struct ifreq */ + case OSIOCGIFBRDADDR: /* struct ifreq */ + case OSIOCGIFNETMASK: /* struct ifreq */ + case SIOCSIFKPI: /* struct ifreq */ + if (so->so_proto == NULL) { + error = EOPNOTSUPP; + goto done; + } + /* FALLTHRU */ + case SIOCIFCREATE: /* struct ifreq */ + case SIOCIFCREATE2: /* struct ifreq */ + case SIOCIFDESTROY: /* struct ifreq */ + case SIOCGIFFLAGS: /* struct ifreq */ + case SIOCGIFEFLAGS: /* struct ifreq */ + case SIOCGIFCAP: /* struct ifreq */ + case SIOCGIFMAC: /* struct ifreq */ + case SIOCGIFMETRIC: /* struct ifreq */ + case SIOCGIFMTU: /* struct ifreq */ + case SIOCGIFPHYS: /* struct ifreq */ + case SIOCSIFFLAGS: /* struct ifreq */ + case SIOCSIFCAP: /* struct ifreq */ + case SIOCSIFPHYS: /* struct ifreq */ + case SIOCSIFMTU: /* struct ifreq */ + case SIOCADDMULTI: /* struct ifreq */ + case SIOCDELMULTI: /* struct ifreq */ + case SIOCDIFPHYADDR: /* struct ifreq */ + case SIOCSIFMEDIA: /* struct ifreq */ + case SIOCSIFGENERIC: /* struct ifreq */ + case SIOCSIFLLADDR: /* struct ifreq */ + case SIOCSIFALTMTU: /* struct ifreq */ + case SIOCSIFVLAN: /* struct ifreq */ + case SIOCSIFBOND: /* struct ifreq */ + case SIOCGIFPSRCADDR: /* struct ifreq */ + case SIOCGIFPDSTADDR: /* struct ifreq */ + case SIOCGIFGENERIC: /* struct ifreq */ + case SIOCGIFDEVMTU: /* struct ifreq */ + case SIOCGIFVLAN: /* struct ifreq */ + case SIOCGIFBOND: /* struct ifreq */ + case SIOCGIFWAKEFLAGS: /* struct ifreq */ + case SIOCGIFGETRTREFCNT: /* struct ifreq */ + case SIOCSIFOPPORTUNISTIC: /* struct ifreq */ + case SIOCGIFOPPORTUNISTIC: /* struct ifreq */ + case SIOCGIFLINKQUALITYMETRIC: { /* struct ifreq */ + struct ifreq ifr; + bcopy(data, &ifr, sizeof (ifr)); + error = ifioctl_ifreq(so, cmd, &ifr, p); + bcopy(&ifr, data, sizeof (ifr)); + goto done; + } + } + + /* + * ioctls which require ifp. Note that we acquire dlil_ifnet_lock + * here to ensure that the ifnet, if found, has been fully attached. 
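A pattern worth noting in the reworked ifioctl() above: instead of casting the caller-supplied buffer and operating on it in place, each handler snapshots it into an aligned local with bcopy(), works on the snapshot, and publishes the result with one copy back. That removes both the alignment hazard and any chance of a handler observing a half-updated request. A compact sketch of the shape, assuming a hypothetical request layout and worker (ifconf32_like and do_ifconf are illustrative):

    #include <stdint.h>
    #include <string.h>

    struct ifconf32_like {              /* stand-in for struct ifconf32 */
        int ifc_len;
        uint32_t ifc_req;               /* user pointer in a 32-bit ABI */
    };

    static int
    do_ifconf(uint32_t ureq, int *len)  /* hypothetical worker */
    {
        (void)ureq;
        *len = 0;                       /* a real worker fills the buffer */
        return (0);
    }

    static int
    handle_ifconf(void *data)           /* caller buffer, alignment unknown */
    {
        struct ifconf32_like ifc;
        int error;

        memcpy(&ifc, data, sizeof (ifc));       /* copy in */
        error = do_ifconf(ifc.ifc_req, &ifc.ifc_len);
        memcpy(data, &ifc, sizeof (ifc));       /* copy out */
        return (error);
    }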
+ */ + dlil_if_lock(); + switch (cmd) { + case SIOCSIFPHYADDR: { /* struct ifaliasreq */ + bcopy(((struct ifaliasreq *)(void *)data)->ifra_name, + ifname, IFNAMSIZ); + ifp = ifunit(ifname); + break; + } + +#if INET6 + case SIOCSIFPHYADDR_IN6_32: { /* struct in6_aliasreq_32 */ + bcopy(((struct in6_aliasreq_32 *)(void *)data)->ifra_name, + ifname, IFNAMSIZ); + ifp = ifunit(ifname); + break; + } + + case SIOCSIFPHYADDR_IN6_64: { /* struct in6_aliasreq_64 */ + bcopy(((struct in6_aliasreq_64 *)(void *)data)->ifra_name, + ifname, IFNAMSIZ); + ifp = ifunit(ifname); + break; + } +#endif + + case SIOCSLIFPHYADDR: /* struct if_laddrreq */ + case SIOCGLIFPHYADDR: { /* struct if_laddrreq */ + bcopy(((struct if_laddrreq *)(void *)data)->iflr_name, + ifname, IFNAMSIZ); + ifp = ifunit(ifname); + break; + } + + case SIOCGIFSTATUS: { /* struct ifstat */ + ifs = _MALLOC(sizeof (*ifs), M_DEVBUF, M_WAITOK); + if (ifs == NULL) { + error = ENOMEM; + dlil_if_unlock(); + goto done; + } + bcopy(data, ifs, sizeof (*ifs)); + ifs->ifs_name[IFNAMSIZ - 1] = '\0'; + ifp = ifunit(ifs->ifs_name); + break; + } + + case SIOCGIFMEDIA32: { /* struct ifmediareq32 */ + bcopy(((struct ifmediareq32 *)(void *)data)->ifm_name, + ifname, IFNAMSIZ); + ifp = ifunit(ifname); + break; + } + + case SIOCGIFMEDIA64: { /* struct ifmediareq64 */ + bcopy(((struct ifmediareq64 *)(void *)data)->ifm_name, + ifname, IFNAMSIZ); + ifp = ifunit(ifname); + break; + } + + case SIOCSIFDESC: /* struct if_descreq */ + case SIOCGIFDESC: { /* struct if_descreq */ + bcopy(((struct if_descreq *)(void *)data)->ifdr_name, + ifname, IFNAMSIZ); + ifp = ifunit(ifname); + break; + } + + case SIOCSIFLINKPARAMS: /* struct if_linkparamsreq */ + case SIOCGIFLINKPARAMS: { /* struct if_linkparamsreq */ + bcopy(((struct if_linkparamsreq *)(void *)data)->iflpr_name, + ifname, IFNAMSIZ); + ifp = ifunit(ifname); + break; + } + + case SIOCGIFQUEUESTATS: { /* struct if_qstatsreq */ + bcopy(((struct if_qstatsreq *)(void *)data)->ifqr_name, + ifname, IFNAMSIZ); + ifp = ifunit(ifname); + break; + } + + case SIOCSIFTHROTTLE: /* struct if_throttlereq */ + case SIOCGIFTHROTTLE: { /* struct if_throttlereq */ + bcopy(((struct if_throttlereq *)(void *)data)->ifthr_name, + ifname, IFNAMSIZ); + ifp = ifunit(ifname); + break; + } + + default: { + /* + * This is a bad assumption, but the code seems to + * have been doing this in the past; caveat emptor. 
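Note how the lookup switch above never trusts the request to carry a terminated string: exactly IFNAMSIZ bytes of the command-specific name field are copied into a local declared one byte larger and zeroed up front, so ifunit() always sees a NUL-terminated name. The same step in isolation (a sketch; extract_ifname is not a kernel function):

    #include <string.h>

    #define IFNAMSIZ 16

    static void
    extract_ifname(const char req_name[IFNAMSIZ], char ifname[IFNAMSIZ + 1])
    {
        /* zero first: guarantees termination even for a full-width name */
        memset(ifname, 0, IFNAMSIZ + 1);
        memcpy(ifname, req_name, IFNAMSIZ);
    }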
+ */ + bcopy(((struct ifreq *)(void *)data)->ifr_name, + ifname, IFNAMSIZ); + ifp = ifunit(ifname); + break; + } + } + dlil_if_unlock(); + + if (ifp == NULL) { + error = ENXIO; + goto done; + } + + switch (cmd) { + case SIOCSIFPHYADDR: /* struct ifaliasreq */ +#if INET6 + case SIOCSIFPHYADDR_IN6_32: /* struct in6_aliasreq_32 */ + case SIOCSIFPHYADDR_IN6_64: /* struct in6_aliasreq_64 */ +#endif + case SIOCSLIFPHYADDR: /* struct if_laddrreq */ + error = proc_suser(p); + if (error != 0) + break; + + error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); + if (error != 0) + break; + + ifnet_touch_lastchange(ifp); + break; + + case SIOCGIFSTATUS: /* struct ifstat */ + VERIFY(ifs != NULL); + ifs->ascii[0] = '\0'; + + error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, (caddr_t)ifs); + + bcopy(ifs, data, sizeof (*ifs)); + break; + + case SIOCGLIFPHYADDR: /* struct if_laddrreq */ + case SIOCGIFMEDIA32: /* struct ifmediareq32 */ + case SIOCGIFMEDIA64: /* struct ifmediareq64 */ + error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); + break; + + case SIOCSIFDESC: { /* struct if_descreq */ + struct if_descreq *ifdr = (struct if_descreq *)(void *)data; + u_int32_t ifdr_len; + + if ((error = proc_suser(p)) != 0) + break; + + ifnet_lock_exclusive(ifp); + bcopy(&ifdr->ifdr_len, &ifdr_len, sizeof (ifdr_len)); + if (ifdr_len > sizeof (ifdr->ifdr_desc) || + ifdr_len > ifp->if_desc.ifd_maxlen) { + error = EINVAL; + ifnet_lock_done(ifp); + break; + } + + bzero(ifp->if_desc.ifd_desc, ifp->if_desc.ifd_maxlen); + if ((ifp->if_desc.ifd_len = ifdr_len) > 0) { + bcopy(ifdr->ifdr_desc, ifp->if_desc.ifd_desc, + MIN(ifdr_len, ifp->if_desc.ifd_maxlen)); + } + ifnet_lock_done(ifp); + break; + } + + case SIOCGIFDESC: { /* struct if_descreq */ + struct if_descreq *ifdr = (struct if_descreq *)(void *)data; + u_int32_t ifdr_len; + + ifnet_lock_shared(ifp); + ifdr_len = MIN(ifp->if_desc.ifd_len, sizeof (ifdr->ifdr_desc)); + bcopy(&ifdr_len, &ifdr->ifdr_len, sizeof (ifdr_len)); + bzero(&ifdr->ifdr_desc, sizeof (ifdr->ifdr_desc)); + if (ifdr_len > 0) { + bcopy(ifp->if_desc.ifd_desc, ifdr->ifdr_desc, ifdr_len); + } + ifnet_lock_done(ifp); + break; + } + + case SIOCSIFLINKPARAMS: { /* struct if_linkparamsreq */ + struct if_linkparamsreq *iflpr = + (struct if_linkparamsreq *)(void *)data; + struct ifclassq *ifq = &ifp->if_snd; + struct tb_profile tb = { 0, 0, 0 }; + + if ((error = proc_suser(p)) != 0) + break; + + IFCQ_LOCK(ifq); + if (!IFCQ_IS_READY(ifq)) { + error = ENXIO; + IFCQ_UNLOCK(ifq); + break; + } + bcopy(&iflpr->iflpr_output_tbr_rate, &tb.rate, + sizeof (tb.rate)); + bcopy(&iflpr->iflpr_output_tbr_percent, &tb.percent, + sizeof (tb.percent)); + error = ifclassq_tbr_set(ifq, &tb, TRUE); + IFCQ_UNLOCK(ifq); + break; + } + + case SIOCGIFLINKPARAMS: { /* struct if_linkparamsreq */ + struct if_linkparamsreq *iflpr = + (struct if_linkparamsreq *)(void *)data; + struct ifclassq *ifq = &ifp->if_snd; + u_int32_t sched_type = PKTSCHEDT_NONE, flags = 0; + u_int64_t tbr_bw = 0, tbr_pct = 0; + + IFCQ_LOCK(ifq); +#if PF_ALTQ + if (ALTQ_IS_ENABLED(IFCQ_ALTQ(ifq))) { + sched_type = IFCQ_ALTQ(ifq)->altq_type; + flags |= IFLPRF_ALTQ; + } else +#endif /* PF_ALTQ */ + { + if (IFCQ_IS_ENABLED(ifq)) + sched_type = ifq->ifcq_type; + } + bcopy(&sched_type, &iflpr->iflpr_output_sched, + sizeof (iflpr->iflpr_output_sched)); + + if (IFCQ_TBR_IS_ENABLED(ifq)) { + tbr_bw = ifq->ifcq_tbr.tbr_rate_raw; + tbr_pct = ifq->ifcq_tbr.tbr_percent; + } + bcopy(&tbr_bw, 
&iflpr->iflpr_output_tbr_rate, + sizeof (iflpr->iflpr_output_tbr_rate)); + bcopy(&tbr_pct, &iflpr->iflpr_output_tbr_percent, + sizeof (iflpr->iflpr_output_tbr_percent)); + IFCQ_UNLOCK(ifq); + + if (ifp->if_output_sched_model == + IFNET_SCHED_MODEL_DRIVER_MANAGED) + flags |= IFLPRF_DRVMANAGED; + bcopy(&flags, &iflpr->iflpr_flags, sizeof (iflpr->iflpr_flags)); + bcopy(&ifp->if_output_bw, &iflpr->iflpr_output_bw, + sizeof (iflpr->iflpr_output_bw)); + bcopy(&ifp->if_input_bw, &iflpr->iflpr_input_bw, + sizeof (iflpr->iflpr_input_bw)); + break; + } + + case SIOCGIFQUEUESTATS: { /* struct if_qstatsreq */ + struct if_qstatsreq *ifqr = (struct if_qstatsreq *)(void *)data; + u_int32_t ifqr_len, ifqr_slot; + + bcopy(&ifqr->ifqr_slot, &ifqr_slot, sizeof (ifqr_slot)); + bcopy(&ifqr->ifqr_len, &ifqr_len, sizeof (ifqr_len)); + error = ifclassq_getqstats(&ifp->if_snd, ifqr_slot, + ifqr->ifqr_buf, &ifqr_len); + if (error != 0) + ifqr_len = 0; + bcopy(&ifqr_len, &ifqr->ifqr_len, sizeof (ifqr_len)); + break; + } + + case SIOCSIFTHROTTLE: { /* struct if_throttlereq */ + struct if_throttlereq *ifthr = + (struct if_throttlereq *)(void *)data; + u_int32_t ifthr_level; + + /* + * XXX: Use priv_check_cred() instead of root check? + */ + if ((error = proc_suser(p)) != 0) + break; + + bcopy(&ifthr->ifthr_level, &ifthr_level, sizeof (ifthr_level)); + error = ifnet_set_throttle(ifp, ifthr_level); + if (error == EALREADY) + error = 0; + break; + } + + case SIOCGIFTHROTTLE: { /* struct if_throttlereq */ + struct if_throttlereq *ifthr = + (struct if_throttlereq *)(void *)data; + u_int32_t ifthr_level; + + if ((error = ifnet_get_throttle(ifp, &ifthr_level)) == 0) { + bcopy(&ifthr_level, &ifthr->ifthr_level, + sizeof (ifthr_level)); + } + break; + } + + default: + if (so->so_proto == NULL) { + error = EOPNOTSUPP; + break; + } + + socket_lock(so, 1); + error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, + data, ifp, p)); + socket_unlock(so, 1); + + if (error == EOPNOTSUPP || error == ENOTSUP) { + error = ifnet_ioctl(ifp, + so->so_proto->pr_domain->dom_family, cmd, data); + } + break; + } + +done: + if (ifs != NULL) + _FREE(ifs, M_DEVBUF); + + return (error); +} + +static int +ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) +{ + struct ifnet *ifp; + u_long ocmd = cmd; + int error = 0; + struct kev_msg ev_msg; + struct net_event_data ev_data; + + bzero(&ev_data, sizeof (struct net_event_data)); + bzero(&ev_msg, sizeof (struct kev_msg)); + + ifr->ifr_name[IFNAMSIZ - 1] = '\0'; + switch (cmd) { case SIOCIFCREATE: case SIOCIFCREATE2: error = proc_suser(p); if (error) return (error); - return if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), - cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL); + return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), + cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL)); case SIOCIFDESTROY: error = proc_suser(p); if (error) return (error); - return if_clone_destroy(ifr->ifr_name); -#if IF_CLONE_LIST - case SIOCIFGCLONERS32: { - struct if_clonereq32 *ifcr = (struct if_clonereq32 *)data; - return (if_clone_list(ifcr->ifcr_count, &ifcr->ifcr_total, - CAST_USER_ADDR_T(ifcr->ifcru_buffer))); - /* NOTREACHED */ - - } - case SIOCIFGCLONERS64: { - struct if_clonereq64 *ifcr = (struct if_clonereq64 *)data; - return (if_clone_list(ifcr->ifcr_count, &ifcr->ifcr_total, - ifcr->ifcru_buffer)); - /* NOTREACHED */ - } -#endif /* IF_CLONE_LIST */ + return (if_clone_destroy(ifr->ifr_name)); } - /* - * ioctls which require ifp. 
Note that we acquire dlil_ifnet_lock - * here to ensure that the ifnet, if found, has been fully attached. - */ - dlil_if_lock(); ifp = ifunit(ifr->ifr_name); - dlil_if_unlock(); if (ifp == NULL) return (ENXIO); @@ -1469,6 +1922,12 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) ifnet_lock_done(ifp); break; + case SIOCGIFEFLAGS: + ifnet_lock_shared(ifp); + ifr->ifr_eflags = ifp->if_eflags; + ifnet_lock_done(ifp); + break; + case SIOCGIFCAP: ifnet_lock_shared(ifp); ifr->ifr_reqcap = ifp->if_capabilities; @@ -1499,6 +1958,24 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) ifnet_lock_done(ifp); break; + case SIOCGIFWAKEFLAGS: + ifnet_lock_shared(ifp); + ifr->ifr_wake_flags = ifnet_get_wake_flags(ifp); + ifnet_lock_done(ifp); + break; + + case SIOCGIFGETRTREFCNT: + ifnet_lock_shared(ifp); + ifr->ifr_route_refcnt = ifp->if_route_refcnt; + ifnet_lock_done(ifp); + break; + + case SIOCGIFLINKQUALITYMETRIC: + ifnet_lock_shared(ifp); + ifr->ifr_link_quality_metric = ifp->if_lqm; + ifnet_lock_done(ifp); + break; + case SIOCSIFFLAGS: error = proc_suser(p); if (error != 0) @@ -1512,7 +1989,7 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) * for the SIOCSIFFLAGS case. */ (void) ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + cmd, (caddr_t)ifr); /* * Send the event even upon error from the driver because @@ -1544,7 +2021,7 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) break; } error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + cmd, (caddr_t)ifr); ifnet_touch_lastchange(ifp); break; @@ -1584,7 +2061,7 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) break; error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + cmd, (caddr_t)ifr); if (error != 0) break; @@ -1604,9 +2081,9 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) ifnet_touch_lastchange(ifp); break; - case SIOCSIFMTU: - { + case SIOCSIFMTU: { u_int32_t oldmtu = ifp->if_mtu; + struct ifclassq *ifq = &ifp->if_snd; error = proc_suser(p); if (error != 0) @@ -1621,7 +2098,7 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) break; } error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + cmd, (caddr_t)ifr); if (error != 0) break; @@ -1651,6 +2128,10 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) #if INET6 nd6_setmtu(ifp); #endif + /* Inform all transmit queues about the new MTU */ + IFCQ_LOCK(ifq); + ifnet_update_sndq(ifq, CLASSQ_EV_LINK_MTU); + IFCQ_UNLOCK(ifq); } break; } @@ -1710,13 +2191,7 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) ifnet_touch_lastchange(ifp); break; - case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: -#if INET6 - case SIOCSIFPHYADDR_IN6_32: - case SIOCSIFPHYADDR_IN6_64: -#endif - case SIOCSLIFPHYADDR: case SIOCSIFMEDIA: case SIOCSIFGENERIC: case SIOCSIFLLADDR: @@ -1728,60 +2203,41 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) break; error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + cmd, (caddr_t)ifr); if (error != 0) break; ifnet_touch_lastchange(ifp); break; - case SIOCGIFSTATUS: - ifs = (struct ifstat *)data; - ifs->ascii[0] = '\0'; - case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: - case SIOCGLIFPHYADDR: - case SIOCGIFMEDIA32: - case SIOCGIFMEDIA64: case SIOCGIFGENERIC: case SIOCGIFDEVMTU: - error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); - break; - case 
SIOCGIFVLAN: case SIOCGIFBOND: error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + cmd, (caddr_t)ifr); break; - case SIOCGIFWAKEFLAGS: - ifnet_lock_shared(ifp); - ifr->ifr_wake_flags = ifnet_get_wake_flags(ifp); - ifnet_lock_done(ifp); + case SIOCSIFOPPORTUNISTIC: + case SIOCGIFOPPORTUNISTIC: + error = ifnet_getset_opportunistic(ifp, cmd, ifr, p); break; - case SIOCGIFGETRTREFCNT: - ifnet_lock_shared(ifp); - ifr->ifr_route_refcnt = ifp->if_route_refcnt; - ifnet_lock_done(ifp); - break; - - default: - oif_flags = ifp->if_flags; - if (so->so_proto == NULL) { - error = EOPNOTSUPP; - break; - } - { - u_long ocmd = cmd; - - switch (cmd) { - case SIOCSIFDSTADDR: - case SIOCSIFADDR: - case SIOCSIFBRDADDR: - case SIOCSIFNETMASK: + case SIOCSIFDSTADDR: + case SIOCSIFADDR: + case SIOCSIFBRDADDR: + case SIOCSIFNETMASK: + case OSIOCGIFADDR: + case OSIOCGIFDSTADDR: + case OSIOCGIFBRDADDR: + case OSIOCGIFNETMASK: + case SIOCSIFKPI: + VERIFY(so->so_proto != NULL); + + if (cmd == SIOCSIFDSTADDR || cmd == SIOCSIFADDR || + cmd == SIOCSIFBRDADDR || cmd == SIOCSIFNETMASK) { #if BYTE_ORDER != BIG_ENDIAN if (ifr->ifr_addr.sa_family == 0 && ifr->ifr_addr.sa_len < 16) { @@ -1792,27 +2248,19 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) if (ifr->ifr_addr.sa_len == 0) ifr->ifr_addr.sa_len = 16; #endif - break; - - case OSIOCGIFADDR: - cmd = SIOCGIFADDR; - break; - - case OSIOCGIFDSTADDR: - cmd = SIOCGIFDSTADDR; - break; - - case OSIOCGIFBRDADDR: - cmd = SIOCGIFBRDADDR; - break; - - case OSIOCGIFNETMASK: - cmd = SIOCGIFNETMASK; + } else if (cmd == OSIOCGIFADDR) { + cmd = SIOCGIFADDR; /* struct ifreq */ + } else if (cmd == OSIOCGIFDSTADDR) { + cmd = SIOCGIFDSTADDR; /* struct ifreq */ + } else if (cmd == OSIOCGIFBRDADDR) { + cmd = SIOCGIFBRDADDR; /* struct ifreq */ + } else if (cmd == OSIOCGIFNETMASK) { + cmd = SIOCGIFNETMASK; /* struct ifreq */ } socket_lock(so, 1); error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, - data, ifp, p)); + (caddr_t)ifr, ifp, p)); socket_unlock(so, 1); switch (ocmd) { @@ -1820,22 +2268,28 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case OSIOCGIFDSTADDR: case OSIOCGIFBRDADDR: case OSIOCGIFNETMASK: - *(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family; - + bcopy(&ifr->ifr_addr.sa_family, &ifr->ifr_addr, + sizeof (u_short)); } - } + if (cmd == SIOCSIFKPI) { int temperr = proc_suser(p); if (temperr != 0) error = temperr; } - if (error == EOPNOTSUPP || error == ENOTSUP) + if (error == EOPNOTSUPP || error == ENOTSUP) { error = ifnet_ioctl(ifp, - so->so_proto->pr_domain->dom_family, cmd, data); - + so->so_proto->pr_domain->dom_family, cmd, + (caddr_t)ifr); + } break; + + default: + VERIFY(0); + /* NOTREACHED */ } + return (error); } @@ -1959,7 +2413,7 @@ ifconf(u_long cmd, user_addr_t ifrp, int * ret_space) addrs++; if (cmd == OSIOCGIFCONF32 || cmd == OSIOCGIFCONF64) { struct osockaddr *osa = - (struct osockaddr *)&ifr.ifr_addr; + (struct osockaddr *)(void *)&ifr.ifr_addr; ifr.ifr_addr = *sa; osa->sa_family = sa->sa_family; error = copyout((caddr_t)&ifr, ifrp, @@ -2018,7 +2472,7 @@ if_allmulti(struct ifnet *ifp, int onswitch) { int error = 0; int modified = 0; - + ifnet_lock_exclusive(ifp); if (onswitch) { @@ -2036,7 +2490,7 @@ if_allmulti(struct ifnet *ifp, int onswitch) } } ifnet_lock_done(ifp); - + if (modified) error = ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL); @@ -2345,56 +2799,58 @@ if_addmulti_doesexist(struct ifnet *ifp, const struct sockaddr *sa, * Radar 3642395, make sure all multicasts are in a 
standard format. */ static struct sockaddr* -copy_and_normalize( - const struct sockaddr *original) +copy_and_normalize(const struct sockaddr *original) { - int alen = 0; + int alen = 0; const u_char *aptr = NULL; struct sockaddr *copy = NULL; struct sockaddr_dl *sdl_new = NULL; - int len = 0; - + int len = 0; + if (original->sa_family != AF_LINK && - original->sa_family != AF_UNSPEC) { + original->sa_family != AF_UNSPEC) { /* Just make a copy */ - MALLOC(copy, struct sockaddr*, original->sa_len, M_IFADDR, M_WAITOK); + MALLOC(copy, struct sockaddr*, original->sa_len, + M_IFADDR, M_WAITOK); if (copy != NULL) bcopy(original, copy, original->sa_len); - return copy; + return (copy); } - + switch (original->sa_family) { case AF_LINK: { - const struct sockaddr_dl *sdl_original = - (const struct sockaddr_dl*)original; - - if (sdl_original->sdl_nlen + sdl_original->sdl_alen + sdl_original->sdl_slen + - offsetof(struct sockaddr_dl, sdl_data) > sdl_original->sdl_len) - return NULL; - + const struct sockaddr_dl *sdl_original = + (struct sockaddr_dl*)(uintptr_t)(size_t)original; + + if (sdl_original->sdl_nlen + sdl_original->sdl_alen + + sdl_original->sdl_slen + + offsetof(struct sockaddr_dl, sdl_data) > + sdl_original->sdl_len) + return (NULL); + alen = sdl_original->sdl_alen; aptr = CONST_LLADDR(sdl_original); } break; - + case AF_UNSPEC: { if (original->sa_len < ETHER_ADDR_LEN + - offsetof(struct sockaddr, sa_data)) { - return NULL; + offsetof(struct sockaddr, sa_data)) { + return (NULL); } - + alen = ETHER_ADDR_LEN; aptr = (const u_char*)original->sa_data; } break; } - + if (alen == 0 || aptr == NULL) - return NULL; - + return (NULL); + len = alen + offsetof(struct sockaddr_dl, sdl_data); MALLOC(sdl_new, struct sockaddr_dl*, len, M_IFADDR, M_WAITOK); - + if (sdl_new != NULL) { bzero(sdl_new, len); sdl_new->sdl_len = len; @@ -2402,8 +2858,8 @@ copy_and_normalize( sdl_new->sdl_alen = alen; bcopy(aptr, LLADDR(sdl_new), alen); } - - return (struct sockaddr*)sdl_new; + + return ((struct sockaddr*)sdl_new); } /* @@ -2888,9 +3344,9 @@ if_data_internal_to_if_data(struct ifnet *ifp, #define COPYFIELD(fld) if_data->fld = if_data_int->fld #define COPYFIELD32(fld) if_data->fld = (u_int32_t)(if_data_int->fld) /* compiler will cast down to 32-bit */ -#define COPYFIELD32_ATOMIC(fld) do { \ - atomic_get_64(if_data->fld, \ - (u_int64_t *)(void *)(uintptr_t)&if_data_int->fld); \ +#define COPYFIELD32_ATOMIC(fld) do { \ + atomic_get_64(if_data->fld, \ + (u_int64_t *)(void *)(uintptr_t)&if_data_int->fld); \ } while (0) COPYFIELD(ifi_type); @@ -2923,7 +3379,7 @@ if_data_internal_to_if_data(struct ifnet *ifp, COPYFIELD(ifi_recvtiming); COPYFIELD(ifi_xmittiming); - + if_data->ifi_lastchange.tv_sec = if_data_int->ifi_lastchange.tv_sec; if_data->ifi_lastchange.tv_usec = if_data_int->ifi_lastchange.tv_usec; @@ -2947,9 +3403,9 @@ if_data_internal_to_if_data64(struct ifnet *ifp, { #pragma unused(ifp) #define COPYFIELD64(fld) if_data64->fld = if_data_int->fld -#define COPYFIELD64_ATOMIC(fld) do { \ - atomic_get_64(if_data64->fld, \ - (u_int64_t *)(void *)(uintptr_t)&if_data_int->fld); \ +#define COPYFIELD64_ATOMIC(fld) do { \ + atomic_get_64(if_data64->fld, \ + (u_int64_t *)(void *)(uintptr_t)&if_data_int->fld); \ } while (0) COPYFIELD64(ifi_type); @@ -2996,11 +3452,16 @@ __private_extern__ void if_copy_traffic_class(struct ifnet *ifp, struct if_traffic_class *if_tc) { -#define COPY_IF_TC_FIELD64_ATOMIC(fld) do { \ - atomic_get_64(if_tc->fld, \ - (u_int64_t *)(void *)(uintptr_t)&ifp->if_tc.fld); \ +#define 
COPY_IF_TC_FIELD64_ATOMIC(fld) do { \ + atomic_get_64(if_tc->fld, \ + (u_int64_t *)(void *)(uintptr_t)&ifp->if_tc.fld); \ } while (0) + bzero(if_tc, sizeof (*if_tc)); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ibepackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ibebytes); + COPY_IF_TC_FIELD64_ATOMIC(ifi_obepackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_obebytes); COPY_IF_TC_FIELD64_ATOMIC(ifi_ibkpackets); COPY_IF_TC_FIELD64_ATOMIC(ifi_ibkbytes); COPY_IF_TC_FIELD64_ATOMIC(ifi_obkpackets); @@ -3013,10 +3474,83 @@ if_copy_traffic_class(struct ifnet *ifp, COPY_IF_TC_FIELD64_ATOMIC(ifi_ivobytes); COPY_IF_TC_FIELD64_ATOMIC(ifi_ovopackets); COPY_IF_TC_FIELD64_ATOMIC(ifi_ovobytes); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ipvpackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ipvbytes); + COPY_IF_TC_FIELD64_ATOMIC(ifi_opvpackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_opvbytes); #undef COPY_IF_TC_FIELD64_ATOMIC } +void +if_copy_data_extended(struct ifnet *ifp, struct if_data_extended *if_de) +{ +#define COPY_IF_DE_FIELD64_ATOMIC(fld) do { \ + atomic_get_64(if_de->fld, \ + (u_int64_t *)(void *)(uintptr_t)&ifp->if_data.fld); \ +} while (0) + + bzero(if_de, sizeof (*if_de)); + COPY_IF_DE_FIELD64_ATOMIC(ifi_alignerrs); + +#undef COPY_IF_DE_FIELD64_ATOMIC +} + +void +if_copy_packet_stats(struct ifnet *ifp, struct if_packet_stats *if_ps) +{ +#define COPY_IF_PS_TCP_FIELD64_ATOMIC(fld) do { \ + atomic_get_64(if_ps->ifi_tcp_##fld, \ + (u_int64_t *)(void *)(uintptr_t)&ifp->if_tcp_stat->fld); \ +} while (0) + +#define COPY_IF_PS_UDP_FIELD64_ATOMIC(fld) do { \ + atomic_get_64(if_ps->ifi_udp_##fld, \ + (u_int64_t *)(void *)(uintptr_t)&ifp->if_udp_stat->fld); \ +} while (0) + + COPY_IF_PS_TCP_FIELD64_ATOMIC(badformat); + COPY_IF_PS_TCP_FIELD64_ATOMIC(unspecv6); + COPY_IF_PS_TCP_FIELD64_ATOMIC(synfin); + COPY_IF_PS_TCP_FIELD64_ATOMIC(badformatipsec); + COPY_IF_PS_TCP_FIELD64_ATOMIC(noconnnolist); + COPY_IF_PS_TCP_FIELD64_ATOMIC(noconnlist); + COPY_IF_PS_TCP_FIELD64_ATOMIC(listbadsyn); + COPY_IF_PS_TCP_FIELD64_ATOMIC(icmp6unreach); + COPY_IF_PS_TCP_FIELD64_ATOMIC(deprecate6); + COPY_IF_PS_TCP_FIELD64_ATOMIC(ooopacket); + COPY_IF_PS_TCP_FIELD64_ATOMIC(rstinsynrcv); + COPY_IF_PS_TCP_FIELD64_ATOMIC(dospacket); + COPY_IF_PS_TCP_FIELD64_ATOMIC(cleanup); + COPY_IF_PS_TCP_FIELD64_ATOMIC(synwindow); + + COPY_IF_PS_UDP_FIELD64_ATOMIC(port_unreach); + COPY_IF_PS_UDP_FIELD64_ATOMIC(faithprefix); + COPY_IF_PS_UDP_FIELD64_ATOMIC(port0); + COPY_IF_PS_UDP_FIELD64_ATOMIC(badlength); + COPY_IF_PS_UDP_FIELD64_ATOMIC(badchksum); + COPY_IF_PS_UDP_FIELD64_ATOMIC(badmcast); + COPY_IF_PS_UDP_FIELD64_ATOMIC(cleanup); + COPY_IF_PS_UDP_FIELD64_ATOMIC(badipsec); + +#undef COPY_IF_PS_TCP_FIELD64_ATOMIC +#undef COPY_IF_PS_UDP_FIELD64_ATOMIC +} + +void +if_copy_rxpoll_stats(struct ifnet *ifp, struct if_rxpoll_stats *if_rs) +{ + bzero(if_rs, sizeof (*if_rs)); + if (!(ifp->if_eflags & IFEF_RXPOLL) || !ifnet_is_attached(ifp, 1)) + return; + + /* by now, ifnet will stay attached so if_inp must be valid */ + VERIFY(ifp->if_inp != NULL); + bcopy(&ifp->if_inp->pstats, if_rs, sizeof (*if_rs)); + + /* Release the IO refcnt */ + ifnet_decr_iorefcnt(ifp); +} struct ifaddr * ifa_remref(struct ifaddr *ifa, int locked) diff --git a/bsd/net/if.h b/bsd/net/if.h index a7974460c..8f80f7b2f 100644 --- a/bsd/net/if.h +++ b/bsd/net/if.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
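The if_copy_* snapshot routines above read every 64-bit counter through atomic_get_64() so a reader cannot observe a torn value while the datapath increments the counter, which matters on 32-bit kernels where a plain 64-bit load takes two instructions. A user-space analogue of the macro pattern using C11 atomics (tc_counters, tc_snapshot, and the two fields are illustrative; the real structures carry one pair per service class):

    #include <stdatomic.h>
    #include <stdint.h>

    struct tc_counters {                /* written by the datapath */
        _Atomic uint64_t ibepackets;
        _Atomic uint64_t ibebytes;
    };

    struct tc_snapshot {                /* handed back to the reader */
        uint64_t ibepackets;
        uint64_t ibebytes;
    };

    #define COPYFIELD64_ATOMIC(dst, src, fld) \
        ((dst)->fld = atomic_load_explicit(&(src)->fld, memory_order_relaxed))

    static void
    tc_copy(struct tc_counters *live, struct tc_snapshot *snap)
    {
        COPYFIELD64_ATOMIC(snap, live, ibepackets);
        COPYFIELD64_ATOMIC(snap, live, ibebytes);
    }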
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -78,28 +78,37 @@ #define KEV_DL_SUBCLASS 2 -#define KEV_DL_SIFFLAGS 1 -#define KEV_DL_SIFMETRICS 2 -#define KEV_DL_SIFMTU 3 -#define KEV_DL_SIFPHYS 4 -#define KEV_DL_SIFMEDIA 5 -#define KEV_DL_SIFGENERIC 6 -#define KEV_DL_ADDMULTI 7 -#define KEV_DL_DELMULTI 8 -#define KEV_DL_IF_ATTACHED 9 -#define KEV_DL_IF_DETACHING 10 -#define KEV_DL_IF_DETACHED 11 -#define KEV_DL_LINK_OFF 12 -#define KEV_DL_LINK_ON 13 -#define KEV_DL_PROTO_ATTACHED 14 -#define KEV_DL_PROTO_DETACHED 15 -#define KEV_DL_LINK_ADDRESS_CHANGED 16 -#define KEV_DL_WAKEFLAGS_CHANGED 17 -#define KEV_DL_IF_IDLE_ROUTE_REFCNT 18 -#define KEV_DL_IFCAP_CHANGED 19 +#define KEV_DL_SIFFLAGS 1 +#define KEV_DL_SIFMETRICS 2 +#define KEV_DL_SIFMTU 3 +#define KEV_DL_SIFPHYS 4 +#define KEV_DL_SIFMEDIA 5 +#define KEV_DL_SIFGENERIC 6 +#define KEV_DL_ADDMULTI 7 +#define KEV_DL_DELMULTI 8 +#define KEV_DL_IF_ATTACHED 9 +#define KEV_DL_IF_DETACHING 10 +#define KEV_DL_IF_DETACHED 11 +#define KEV_DL_LINK_OFF 12 +#define KEV_DL_LINK_ON 13 +#define KEV_DL_PROTO_ATTACHED 14 +#define KEV_DL_PROTO_DETACHED 15 +#define KEV_DL_LINK_ADDRESS_CHANGED 16 +#define KEV_DL_WAKEFLAGS_CHANGED 17 +#define KEV_DL_IF_IDLE_ROUTE_REFCNT 18 +#define KEV_DL_IFCAP_CHANGED 19 +#define KEV_DL_LINK_QUALITY_METRIC_CHANGED 20 +#define KEV_DL_NODE_PRESENCE 21 +#define KEV_DL_NODE_ABSENCE 22 +#define KEV_DL_MASTER_ELECTED 23 #include #include + +#ifdef PRIVATE +#include +#include +#endif #endif #ifdef KERNEL_PRIVATE @@ -142,24 +151,43 @@ struct if_clonereq32 { #define IFF_ALTPHYS IFF_LINK2 /* use alternate physical connection */ #define IFF_MULTICAST 0x8000 /* supports multicast */ -#ifdef KERNEL_PRIVATE +#ifdef PRIVATE /* extended flags definitions: (all bits are reserved for internal/future use) */ -#define IFEF_AUTOCONFIGURING 0x1 -#define IFEF_DVR_REENTRY_OK 0x20 /* When set, driver may be reentered from its own thread */ -#define IFEF_ACCEPT_RTADVD 0x40 /* set to accept IPv6 router advertisement on the interface */ -#define _IFEF_DETACHING 0x80 /* deprecated */ -#define IFEF_USEKPI 0x100 /* Set when interface is created through the KPIs */ +#define IFEF_AUTOCONFIGURING 0x1 /* allow BOOTP/DHCP replies to enter */ +#define _IFEF_DVR_REENTRY_OK 0x20 /* deprecated */ +#define IFEF_ACCEPT_RTADV 0x40 /* set to accept IPv6 Router Advertisement on the interface */ +#define IFEF_TXSTART 0x80 /* interface has start callback */ +#define IFEF_RXPOLL 0x100 /* interface supports opportunistic input polling */ #define IFEF_VLAN 0x200 /* interface has one or more vlans */ #define IFEF_BOND 0x400 /* interface is part of bond */ #define IFEF_ARPLL 0x800 /* ARP for IPv4LL addresses on this port */ #define IFEF_NOWINDOWSCALE 0x1000 /* Don't scale TCP window on iface */ #define IFEF_NOAUTOIPV6LL 0x2000 /* Interface IPv6 LinkLocal address not provided by kernel */ -#define IFEF_SERVICE_TRIGGERED 0x20000 /* interface is on-demand dynamically created/destroyed */ +#define IFEF_IPV4_ROUTER 0x8000 /* set on internal-network-facing interface when in IPv4 router mode */ +#define IFEF_IPV6_ROUTER 0x10000 /* set on internal-network-facing interface when in IPv6 router mode */ +#define IFEF_LOCALNET_PRIVATE 0x20000 /* local private network */ +#define IFEF_IPV6_ND6ALT 0x40000 /* alternative KPI for IPv6 neighbor discovery */ +#define IFEF_SERVICE_TRIGGERED IFEF_LOCALNET_PRIVATE +#define IFEF_RESTRICTED_RECV 0x80000 /* interface restricts inbound pkts */ +#define IFEF_AWDL 0x100000 /* Apple Wireless Direct Link */ +#define IFEF_NOACKPRI 0x200000 /* Don't use 
TCP ACK prioritization on interface */ #define IFEF_SENDLIST 0x10000000 /* Interface supports sending a list of packets */ #define _IFEF_REUSE 0x20000000 /* deprecated */ #define _IFEF_INUSE 0x40000000 /* deprecated */ #define IFEF_UPDOWNCHANGE 0x80000000 /* Interface's up/down state is changing */ +#ifdef XNU_KERNEL_PRIVATE +/* + * Current requirements for an AWDL interface. Setting/clearing IFEF_AWDL + * will also trigger the setting/clearing of the rest of the flags. Once + * IFEF_AWDL is set, the rest of flags cannot be cleared, by definition. + */ +#define IFEF_AWDL_MASK \ + (IFEF_LOCALNET_PRIVATE | IFEF_IPV6_ND6ALT | IFEF_RESTRICTED_RECV | \ + IFEF_AWDL) +#endif /* XNU_KERNEL_PRIVATE */ +#endif /* PRIVATE */ +#ifdef KERNEL_PRIVATE /* * !!! NOTE !!! * @@ -176,7 +204,6 @@ struct if_clonereq32 { #define IFF_CANTCHANGE \ (IFF_BROADCAST|IFF_POINTOPOINT|IFF_RUNNING|IFF_OACTIVE|\ IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI) - #endif /* KERNEL_PRIVATE */ /* @@ -213,7 +240,7 @@ struct if_clonereq32 { #define IFCAP_VALID (IFCAP_HWCSUM | IFCAP_TSO | IFCAP_LRO | IFCAP_VLAN_MTU | \ IFCAP_VLAN_HWTAGGING | IFCAP_JUMBO_MTU | IFCAP_AV) -#define IFQ_MAXLEN 50 +#define IFQ_MAXLEN 128 #define IFNET_SLOWHZ 1 /* granularity is 1 second */ /* @@ -368,7 +395,7 @@ struct ifreq { int ifru_mtu; int ifru_phys; int ifru_media; - int ifru_intval; + int ifru_intval; caddr_t ifru_data; #ifdef KERNEL_PRIVATE u_int64_t ifru_data64; /* 64-bit ifru_data */ @@ -377,7 +404,18 @@ struct ifreq { struct ifkpi ifru_kpi; u_int32_t ifru_wake_flags; u_int32_t ifru_route_refcnt; +#ifdef PRIVATE + int ifru_link_quality_metric; +#endif /* PRIVATE */ int ifru_cap[2]; +#ifdef PRIVATE + struct { + uint32_t ifo_flags; +#define IFRIFOF_BLOCK_OPPORTUNISTIC 0x00000001 + uint32_t ifo_inuse; + } ifru_opportunistic; + u_int64_t ifru_eflags; +#endif /* PRIVATE */ } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ #define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */ @@ -401,8 +439,15 @@ struct ifreq { #define ifr_kpi ifr_ifru.ifru_kpi #define ifr_wake_flags ifr_ifru.ifru_wake_flags /* wake capabilities of devive */ #define ifr_route_refcnt ifr_ifru.ifru_route_refcnt /* route references on interface */ +#ifdef PRIVATE +#define ifr_link_quality_metric ifr_ifru.ifru_link_quality_metric /* LQM */ +#endif /* PRIVATE */ #define ifr_reqcap ifr_ifru.ifru_cap[0] /* requested capabilities */ #define ifr_curcap ifr_ifru.ifru_cap[1] /* current capabilities */ +#ifdef PRIVATE +#define ifr_opportunistic ifr_ifru.ifru_opportunistic /* current capabilities */ +#define ifr_eflags ifr_ifru.ifru_eflags /* extended flags */ +#endif }; #define _SIZEOF_ADDR_IFREQ(ifr) \ @@ -562,6 +607,166 @@ struct if_laddrreq { struct sockaddr_storage dstaddr; /* out */ }; +#ifdef PRIVATE +/* + * Link Quality Metrics + * + * IFNET_LQM_THRESH_OFF Metric is not available; device is off. + * IFNET_LQM_THRESH_UNKNOWN Metric is not (yet) known. + * IFNET_LQM_THRESH_POOR Link quality is considered poor by driver. + * IFNET_LQM_THRESH_GOOD Link quality is considered good by driver. 
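+ *
+ * As a purely illustrative sketch (hypothetical driver logic, not part
+ * of this header), a driver that derives a normalized quality score in
+ * the range 0-100 might bucket it against these thresholds before
+ * reporting it:
+ *
+ *	lqm = (score < 0) ? IFNET_LQM_THRESH_UNKNOWN :
+ *	    (score <= IFNET_LQM_THRESH_POOR) ? IFNET_LQM_THRESH_POOR :
+ *	    IFNET_LQM_THRESH_GOOD;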
+ */
+enum {
+	IFNET_LQM_THRESH_OFF		= (-2),
+	IFNET_LQM_THRESH_UNKNOWN	= (-1),
+	IFNET_LQM_THRESH_POOR		= 50,
+	IFNET_LQM_THRESH_GOOD		= 100
+};
+#ifdef XNU_KERNEL_PRIVATE
+#define	IFNET_LQM_MIN	IFNET_LQM_THRESH_OFF
+#define	IFNET_LQM_MAX	IFNET_LQM_THRESH_GOOD
+#endif /* XNU_KERNEL_PRIVATE */
+
+/*
+ * DLIL KEV_DL_LINK_QUALITY_METRIC_CHANGED structure
+ */
+struct kev_dl_link_quality_metric_data {
+	struct net_event_data	link_data;
+	int			link_quality_metric;
+};
+
+#define	IF_DESCSIZE	128
+
+/*
+ * Structure for SIOC[SG]IFDESC
+ */
+struct if_descreq {
+	char		ifdr_name[IFNAMSIZ];	/* interface name */
+	u_int32_t	ifdr_len;		/* up to IF_DESCSIZE */
+	u_int8_t	ifdr_desc[IF_DESCSIZE];	/* opaque data */
+};
+
+/*
+ * Output packet scheduling models
+ *
+ * IFNET_SCHED_MODEL_NORMAL The default output packet scheduling model,
+ *	where the driver or media does not require a strict scheduling
+ *	strategy, and the networking stack is free to choose the
+ *	most appropriate scheduling and queueing algorithm, including
+ *	shaping traffic.
+ * IFNET_SCHED_MODEL_DRIVER_MANAGED The alternative output packet
+ *	scheduling model, where the driver or media requires a strict
+ *	scheduling strategy (e.g. 802.11 WMM), and the networking
+ *	stack is only responsible for creating multiple queues for the
+ *	corresponding service classes.
+ */
+enum {
+	IFNET_SCHED_MODEL_NORMAL	= 0,
+	IFNET_SCHED_MODEL_DRIVER_MANAGED = 1,
+#ifdef XNU_KERNEL_PRIVATE
+	IFNET_SCHED_MODEL_MAX		= 2,
+#endif /* XNU_KERNEL_PRIVATE */
+};
+
+/*
+ * Values for iflpr_flags
+ */
+#define	IFLPRF_ALTQ		0x1	/* configured via PF/ALTQ */
+#define	IFLPRF_DRVMANAGED	0x2	/* output queue scheduled by drv */
+
+/*
+ * Structure for SIOCGIFLINKPARAMS
+ */
+struct if_linkparamsreq {
+	char			iflpr_name[IFNAMSIZ];	/* interface name */
+	u_int32_t		iflpr_flags;
+	u_int32_t		iflpr_output_sched;
+	u_int64_t		iflpr_output_tbr_rate;
+	u_int32_t		iflpr_output_tbr_percent;
+	struct if_bandwidths	iflpr_output_bw;
+	struct if_bandwidths	iflpr_input_bw;
+};
+
+/*
+ * Structure for SIOCGIFQUEUESTATS
+ */
+struct if_qstatsreq {
+	char		ifqr_name[IFNAMSIZ];	/* interface name */
+	u_int32_t	ifqr_slot;
+	void		*ifqr_buf __attribute__((aligned(8)));
+	int		ifqr_len __attribute__((aligned(8)));
+};
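A sketch of how privileged user space might exercise the new SIOCGIFLINKPARAMS request above. The helper name and the ioctl-over-a-datagram-socket pattern are assumptions of this illustration (the definitions are PRIVATE, visible only to internal builds); the patch itself ships no such consumer:

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>

	/* Hypothetical consumer of SIOCGIFLINKPARAMS (illustration only). */
	static int
	print_output_sched(const char *ifname)
	{
		struct if_linkparamsreq iflpr;
		int s = socket(AF_INET, SOCK_DGRAM, 0);

		if (s < 0)
			return (-1);
		memset(&iflpr, 0, sizeof (iflpr));
		strlcpy(iflpr.iflpr_name, ifname, sizeof (iflpr.iflpr_name));
		if (ioctl(s, SIOCGIFLINKPARAMS, &iflpr) < 0) {
			(void) close(s);
			return (-1);
		}
		printf("%s: %s scheduling, flags 0x%x\n", ifname,
		    (iflpr.iflpr_output_sched ==
		    IFNET_SCHED_MODEL_DRIVER_MANAGED) ?
		    "driver-managed" : "normal", iflpr.iflpr_flags);
		(void) close(s);
		return (0);
	}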
+
+/*
+ * Node Proximity Metrics
+ */
+enum {
+	IFNET_NPM_THRESH_UNKNOWN	= (-1),
+	IFNET_NPM_THRESH_NEAR		= 30,
+	IFNET_NPM_THRESH_GENERAL	= 70,
+	IFNET_NPM_THRESH_FAR		= 100,
+};
+
+/*
+ * Received Signal Strength Indication [special values]
+ *
+ * IFNET_RSSI_UNKNOWN	Metric is not (yet) known.
+ */
+enum {
+	IFNET_RSSI_UNKNOWN	= ((-2147483647)-1),	/* INT32_MIN */
+};
+
+
+/*
+ * DLIL KEV_DL_NODE_PRESENCE/KEV_DL_NODE_ABSENCE event structures
+ */
+struct kev_dl_node_presence {
+	struct net_event_data	link_data;
+	struct sockaddr_in6	sin6_node_address;
+	struct sockaddr_dl	sdl_node_address;
+	int32_t			rssi;
+	int			link_quality_metric;
+	int			node_proximity_metric;
+	u_int8_t		node_service_info[48];
+};
+
+struct kev_dl_node_absence {
+	struct net_event_data	link_data;
+	struct sockaddr_in6	sin6_node_address;
+	struct sockaddr_dl	sdl_node_address;
+};
+
+/*
+ * Structure for SIOC[SG]IFTHROTTLE
+ */
+struct if_throttlereq {
+	char		ifthr_name[IFNAMSIZ];	/* interface name */
+	u_int32_t	ifthr_level;
+};
+
+/*
+ * Interface throttling levels
+ *
+ * IFNET_THROTTLE_OFF The default throttling level (no throttling).
+ *	All service class queues operate normally according to the
+ *	standard packet scheduler configuration.
+ * IFNET_THROTTLE_OPPORTUNISTIC One or more service class queues that
+ *	are responsible for managing "opportunistic" traffic are
+ *	suspended.  Packets enqueued on those queues will be dropped
+ *	and a flow advisory error will be generated to the data
+ *	source.  Existing packets in the queues will stay enqueued
+ *	until the interface is no longer throttled, or until they
+ *	are explicitly flushed.
+ */
+enum {
+	IFNET_THROTTLE_OFF		= 0,
+	IFNET_THROTTLE_OPPORTUNISTIC	= 1,
+#ifdef XNU_KERNEL_PRIVATE
+	IFNET_THROTTLE_MAX		= 2,
+#endif /* XNU_KERNEL_PRIVATE */
+};
+#endif /* PRIVATE */
+
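The throttling levels above pair with the SIOC[SG]IFTHROTTLE request. As a companion sketch, a hypothetical privileged helper that moves an interface into, or back out of, opportunistic throttling; issuing the ioctl over an ordinary datagram socket is again this illustration's assumption, not part of the change:

	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>

	/* Hypothetical SIOCSIFTHROTTLE helper (illustration only). */
	static int
	set_throttle(const char *ifname, u_int32_t level)
	{
		struct if_throttlereq ifthr;
		int s, err;

		if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0)
			return (-1);
		memset(&ifthr, 0, sizeof (ifthr));
		strlcpy(ifthr.ifthr_name, ifname, sizeof (ifthr.ifthr_name));
		/* IFNET_THROTTLE_OFF or IFNET_THROTTLE_OPPORTUNISTIC */
		ifthr.ifthr_level = level;
		err = ioctl(s, SIOCSIFTHROTTLE, &ifthr);
		(void) close(s);
		return (err);
	}

Passing IFNET_THROTTLE_OPPORTUNISTIC suspends the opportunistic service class queues described above; passing IFNET_THROTTLE_OFF lets them resume draining.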
 #ifdef KERNEL
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_IFADDR);
diff --git a/bsd/net/if_bond.c b/bsd/net/if_bond.c
index 91790bd3a..1964a5524 100644
--- a/bsd/net/if_bond.c
+++ b/bsd/net/if_bond.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -512,11 +512,15 @@ packet_buffer_allocate(int length)
 	/* leave room for ethernet header */
 	size = length + sizeof(struct ether_header);
 	if (size > (int)MHLEN) {
-		/* XXX doesn't handle large payloads */
-		printf("bond: packet_buffer_allocate size %d > max %u\n", size, MHLEN);
-		return (NULL);
+		if (size > (int)MCLBYTES) {
+			printf("bond: packet_buffer_allocate size %d > max %u\n",
+			    size, MCLBYTES);
+			return (NULL);
+		}
+		m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
+	} else {
+		m = m_gethdr(M_WAITOK, MT_DATA);
 	}
-	m = m_gethdr(M_WAITOK, MT_DATA);
 	if (m == NULL) {
 		return (NULL);
 	}
@@ -1470,6 +1474,8 @@ bond_output(struct ifnet * ifp, struct mbuf * m)
 	uint32_t h;
 	ifbond_ref ifb;
 	struct ifnet * port_ifp = NULL;
+	int err;
+	struct flowadv adv = { FADV_SUCCESS };
 	if (m == 0) {
 		return (0);
 	}
@@ -1517,7 +1523,17 @@ bond_output(struct ifnet * ifp, struct mbuf * m)
 	}
 	bond_bpf_output(ifp, m, bpf_func);
-	return (ifnet_output_raw(port_ifp, PF_BOND, m));
+	err = dlil_output(port_ifp, PF_BOND, m, NULL, NULL, 1, &adv);
+
+	if (err == 0) {
+		if (adv.code == FADV_FLOW_CONTROLLED) {
+			err = EQFULL;
+		} else if (adv.code == FADV_SUSPENDED) {
+			err = EQSUSPENDED;
+		}
+	}
+
+	return (err);
 done:
 	bond_unlock();
@@ -2561,10 +2577,6 @@ static int
 bond_set_promisc(__unused struct ifnet *ifp)
 {
 	int error = 0;
-	/*
-	 * The benefit of doing this currently does not warrant
-	 * the added code complexity. Do nothing and return.
-	 */
 	return (error);
 }
diff --git a/bsd/net/if_bond_internal.h b/bsd/net/if_bond_internal.h
new file mode 100644
index 000000000..99e6058dc
--- /dev/null
+++ b/bsd/net/if_bond_internal.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2011 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef _NET_IF_BOND_INTERNAL_H_
+#define _NET_IF_BOND_INTERNAL_H_
+
+#ifdef KERNEL_PRIVATE
+int bond_family_init(void) __attribute__((section("__TEXT, initcode")));
+#endif /* KERNEL_PRIVATE */
+
+#endif /* _NET_IF_BOND_INTERNAL_H_ */
+
diff --git a/bsd/net/if_bond_var.h b/bsd/net/if_bond_var.h
index fb17c9a90..f92a3f24c 100644
--- a/bsd/net/if_bond_var.h
+++ b/bsd/net/if_bond_var.h
@@ -95,8 +95,6 @@ struct if_bond_req {
 #pragma pack()
-#ifdef KERNEL_PRIVATE
-int bond_family_init(void) __attribute__((section("__TEXT, initcode")));
-#endif /* KERNEL_PRIVATE */
+#include <net/if_bond_internal.h>
 #endif /* _NET_IF_BOND_VAR_H_ */
diff --git a/bsd/net/if_bridge.c b/bsd/net/if_bridge.c
index fd546fa0e..db581d740 100644
--- a/bsd/net/if_bridge.c
+++ b/bsd/net/if_bridge.c
@@ -1,6 +1,5 @@
-/* $NetBSD: if_bridge.c,v 1.31 2005/06/01 19:45:34 jdc Exp $ */
 /*
- * Copyright (c) 2004-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -27,6 +26,7 @@
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
+/* $NetBSD: if_bridge.c,v 1.31 2005/06/01 19:45:34 jdc Exp $ */
 /*
 * Copyright 2001 Wasabi Systems, Inc.
 * All rights reserved.
@@ -102,11 +102,6 @@
 */
 #include
-//__FBSDID("$FreeBSD$");
-
-//#include "opt_inet.h"
-//#include "opt_inet6.h"
-//#include "opt_carp.h"
 #define BRIDGE_DEBUG 1
 #ifndef BRIDGE_DEBUG
 #endif
@@ -121,17 +116,12 @@
 #include
 #include /* for net/if.h */
 #include
-//#include /* string functions */
 #include
 #include
 #include
 #include
-//#include
-//#include
-//#include
 #include
 #include
-//#include
 #include
 #include
@@ -144,11 +134,9 @@
 #include
 #endif
 #include
-//#include
 #include
 #include
 #include
-//#include
 #include /* for struct arpcom */
 #include
@@ -162,12 +150,13 @@
 #ifdef DEV_CARP
 #include
 #endif
-//#include
 #include /* for struct arpcom */
 #include
 #include
 #include
+#if NVLAN > 0
 #include
+#endif /* NVLAN > 0 */
 #include
 #include
@@ -181,54 +170,57 @@
 #if BRIDGE_DEBUG
-#define BR_LCKDBG_MAX 4
+#define	BR_LCKDBG_MAX	4
-#define BRIDGE_LOCK(_sc) bridge_lock(_sc)
-#define BRIDGE_UNLOCK(_sc) bridge_unlock(_sc)
-#define BRIDGE_LOCK_ASSERT(_sc) lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED)
+#define	BRIDGE_LOCK(_sc)	bridge_lock(_sc)
+#define	BRIDGE_UNLOCK(_sc)	bridge_unlock(_sc)
+#define	BRIDGE_LOCK_ASSERT(_sc)	\
+	lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED)
 #define BRIDGE_LOCK2REF(_sc, _err) _err = bridge_lock2ref(_sc)
-#define BRIDGE_UNREF(_sc) bridge_unref(_sc)
-#define BRIDGE_XLOCK(_sc) bridge_xlock(_sc)
-#define BRIDGE_XDROP(_sc) bridge_xdrop(_sc)
+#define	BRIDGE_UNREF(_sc)	bridge_unref(_sc)
+#define	BRIDGE_XLOCK(_sc)	bridge_xlock(_sc)
+#define	BRIDGE_XDROP(_sc)	bridge_xdrop(_sc)
 #else /* BRIDGE_DEBUG */
 #define BRIDGE_LOCK(_sc) lck_mtx_lock((_sc)->sc_mtx)
 #define BRIDGE_UNLOCK(_sc) lck_mtx_unlock((_sc)->sc_mtx)
-#define BRIDGE_LOCK_ASSERT(_sc) lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED)
-#define BRIDGE_LOCK2REF(_sc, _err) do { \
-	lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED); \
-	if ((_sc)->sc_iflist_xcnt > 0) \
- (_err) = EBUSY; \ - else \ - (_sc)->sc_iflist_ref++; \ - lck_mtx_unlock((_sc)->sc_mtx); \ +#define BRIDGE_LOCK_ASSERT(_sc) \ + lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED) +#define BRIDGE_LOCK2REF(_sc, _err) do { \ + lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED); \ + if ((_sc)->sc_iflist_xcnt > 0) \ + (_err) = EBUSY; \ + else \ + (_sc)->sc_iflist_ref++; \ + lck_mtx_unlock((_sc)->sc_mtx); \ } while (0) #define BRIDGE_UNREF(_sc) do { \ lck_mtx_lock((_sc)->sc_mtx); \ (_sc)->sc_iflist_ref--; \ if (((_sc)->sc_iflist_xcnt > 0) && ((_sc)->sc_iflist_ref == 0)) { \ - lck_mtx_unlock((_sc)->sc_mtx); \ - wakeup(&(_sc)->sc_cv); \ - } else \ - lck_mtx_unlock((_sc)->sc_mtx); \ + lck_mtx_unlock((_sc)->sc_mtx); \ + wakeup(&(_sc)->sc_cv); \ + } else \ + lck_mtx_unlock((_sc)->sc_mtx); \ } while (0) -#define BRIDGE_XLOCK(_sc) do { \ +#define BRIDGE_XLOCK(_sc) do { \ lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED); \ - (_sc)->sc_iflist_xcnt++; \ - while ((_sc)->sc_iflist_ref > 0) \ - msleep(&(_sc)->sc_cv, (_sc)->sc_mtx, PZERO, "BRIDGE_XLOCK", NULL); \ + (_sc)->sc_iflist_xcnt++; \ + while ((_sc)->sc_iflist_ref > 0) \ + msleep(&(_sc)->sc_cv, (_sc)->sc_mtx, PZERO, \ + "BRIDGE_XLOCK", NULL); \ } while (0) -#define BRIDGE_XDROP(_sc) do { \ - lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED); \ - (_sc)->sc_iflist_xcnt--; \ +#define BRIDGE_XDROP(_sc) do { \ + lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED); \ + (_sc)->sc_iflist_xcnt--; \ } while (0) #endif /* BRIDGE_DEBUG */ #if NBPFILTER > 0 -#define BRIDGE_BPF_MTAP_INPUT(sc, m) \ - if (sc->sc_bpf_input) \ +#define BRIDGE_BPF_MTAP_INPUT(sc, m) \ + if (sc->sc_bpf_input) \ bridge_bpf_input(sc->sc_ifp, m) #else /* NBPFILTER */ #define BRIDGE_BPF_MTAP_INPUT(ifp, m) @@ -294,17 +286,17 @@ struct bridge_iflist { TAILQ_ENTRY(bridge_iflist) bif_next; struct ifnet *bif_ifp; /* member if */ struct bstp_port bif_stp; /* STP state */ - uint32_t bif_flags; /* member if flags */ - int bif_savedcaps; /* saved capabilities */ - uint32_t bif_addrmax; /* max # of addresses */ - uint32_t bif_addrcnt; /* cur. # of addresses */ - uint32_t bif_addrexceeded;/* # of address violations */ - - interface_filter_t bif_iff_ref; - struct bridge_softc *bif_sc; - char bif_promisc; /* promiscuous mode set */ - char bif_proto_attached; /* protocol attached */ - char bif_filter_attached; /* interface filter attached */ + uint32_t bif_flags; /* member if flags */ + int bif_savedcaps; /* saved capabilities */ + uint32_t bif_addrmax; /* max # of addresses */ + uint32_t bif_addrcnt; /* cur. # of addresses */ + uint32_t bif_addrexceeded;/* # of address violations */ + + interface_filter_t bif_iff_ref; + struct bridge_softc *bif_sc; + char bif_promisc; /* promiscuous mode set */ + char bif_proto_attached; /* protocol attached */ + char bif_filter_attached; /* interface filter attached */ }; /* @@ -314,10 +306,10 @@ struct bridge_rtnode { LIST_ENTRY(bridge_rtnode) brt_hash; /* hash table linkage */ LIST_ENTRY(bridge_rtnode) brt_list; /* list linkage */ struct bridge_iflist *brt_dst; /* destination if */ - unsigned long brt_expire; /* expiration time */ - uint8_t brt_flags; /* address flags */ - uint8_t brt_addr[ETHER_ADDR_LEN]; - uint16_t brt_vlan; /* vlan id */ + unsigned long brt_expire; /* expiration time */ + uint8_t brt_flags; /* address flags */ + uint8_t brt_addr[ETHER_ADDR_LEN]; + uint16_t brt_vlan; /* vlan id */ }; #define brt_ifp brt_dst->bif_ifp @@ -326,41 +318,41 @@ struct bridge_rtnode { * Software state for each bridge. 
*/ struct bridge_softc { - struct ifnet *sc_ifp; /* make this an interface */ - LIST_ENTRY(bridge_softc) sc_list; - lck_mtx_t *sc_mtx; - void *sc_cv; - uint32_t sc_brtmax; /* max # of addresses */ - uint32_t sc_brtcnt; /* cur. # of addresses */ - uint32_t sc_brttimeout; /* rt timeout in seconds */ - uint32_t sc_iflist_ref; /* refcount for sc_iflist */ - uint32_t sc_iflist_xcnt; /* refcount for sc_iflist */ - TAILQ_HEAD(, bridge_iflist) sc_iflist; /* member interface list */ - LIST_HEAD(, bridge_rtnode) *sc_rthash; /* our forwarding table */ - LIST_HEAD(, bridge_rtnode) sc_rtlist; /* list version of above */ - uint32_t sc_rthash_key; /* key for hash */ - TAILQ_HEAD(, bridge_iflist) sc_spanlist; /* span ports list */ - struct bstp_state sc_stp; /* STP state */ - uint32_t sc_brtexceeded; /* # of cache drops */ - uint32_t sc_filter_flags; /* ipf and flags */ - - char sc_if_xname[IFNAMSIZ]; - bpf_packet_func sc_bpf_input; - bpf_packet_func sc_bpf_output; - u_int32_t sc_flags; + struct ifnet *sc_ifp; /* make this an interface */ + LIST_ENTRY(bridge_softc) sc_list; + lck_mtx_t *sc_mtx; + void *sc_cv; + uint32_t sc_brtmax; /* max # of addresses */ + uint32_t sc_brtcnt; /* cur. # of addresses */ + uint32_t sc_brttimeout; /* rt timeout in seconds */ + uint32_t sc_iflist_ref; /* refcount for sc_iflist */ + uint32_t sc_iflist_xcnt; /* refcount for sc_iflist */ + TAILQ_HEAD(, bridge_iflist) sc_iflist; /* member interface list */ + LIST_HEAD(, bridge_rtnode) *sc_rthash; /* our forwarding table */ + LIST_HEAD(, bridge_rtnode) sc_rtlist; /* list version of above */ + uint32_t sc_rthash_key; /* key for hash */ + TAILQ_HEAD(, bridge_iflist) sc_spanlist; /* span ports list */ + struct bstp_state sc_stp; /* STP state */ + uint32_t sc_brtexceeded; /* # of cache drops */ + uint32_t sc_filter_flags; /* ipf and flags */ + + char sc_if_xname[IFNAMSIZ]; + bpf_packet_func sc_bpf_input; + bpf_packet_func sc_bpf_output; + u_int32_t sc_flags; #if BRIDGE_DEBUG - void *lock_lr[BR_LCKDBG_MAX]; /* locking calling history */ - int next_lock_lr; - void *unlock_lr[BR_LCKDBG_MAX]; /* unlocking caller history */ - int next_unlock_lr; + void *lock_lr[BR_LCKDBG_MAX]; /* locking calling history */ + int next_lock_lr; + void *unlock_lr[BR_LCKDBG_MAX]; /* unlocking caller history */ + int next_unlock_lr; #endif /* BRIDGE_DEBUG */ }; #define SCF_DETACHING 0x1 -static lck_mtx_t *bridge_list_mtx; -//eventhandler_tag bridge_detach_cookie = NULL; +decl_lck_mtx_data(static, bridge_list_mtx_data); +static lck_mtx_t *bridge_list_mtx = &bridge_list_mtx_data; int bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD; @@ -380,15 +372,17 @@ static int bridge_init(struct ifnet *); #if HAS_BRIDGE_DUMMYNET static void bridge_dummynet(struct mbuf *, struct ifnet *); #endif -static void bridge_stop(struct ifnet *, int); -static errno_t bridge_start(struct ifnet *, struct mbuf *); +static void bridge_ifstop(struct ifnet *, int); +static int bridge_output(struct ifnet *, struct mbuf *); +static void bridge_start(struct ifnet *); __private_extern__ errno_t bridge_input(struct ifnet *, struct mbuf *, void *); #if BRIDGE_MEMBER_OUT_FILTER -static errno_t bridge_iff_output(void *, ifnet_t , protocol_family_t , mbuf_t *); -static int bridge_output(struct ifnet *, struct mbuf *, struct sockaddr *, - struct rtentry *); +static errno_t bridge_iff_output(void *, ifnet_t, protocol_family_t , + mbuf_t *); +static int bridge_member_output(struct ifnet *, struct mbuf *, + struct sockaddr *, struct rtentry *); #endif -static void bridge_enqueue(struct 
bridge_softc *, struct ifnet *, +static int bridge_enqueue(struct bridge_softc *, struct ifnet *, struct mbuf *); static void bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int); @@ -421,8 +415,10 @@ static int bridge_rtnode_insert(struct bridge_softc *, struct bridge_rtnode *); static void bridge_rtnode_destroy(struct bridge_softc *, struct bridge_rtnode *); +#if BRIDGESTP static void bridge_rtable_expire(struct ifnet *, int); static void bridge_state_change(struct ifnet *, int); +#endif /* BRIDGESTP */ static struct bridge_iflist *bridge_lookup_member(struct bridge_softc *, const char *name); @@ -495,13 +491,19 @@ static void bridge_detach(ifnet_t ifp); /* The default bridge vlan is 1 (IEEE 802.1Q-2003 Table 9-2) */ #define VLANTAGOF(_m) 0 +u_int8_t bstp_etheraddr[ETHER_ADDR_LEN] = + { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; + +#if BRIDGESTP static struct bstp_cb_ops bridge_ops = { .bcb_state = bridge_state_change, .bcb_rtage = bridge_rtable_expire }; +#endif /* BRIDGESTP */ SYSCTL_DECL(_net_link); -SYSCTL_NODE(_net_link, IFT_BRIDGE, bridge, CTLFLAG_RW, 0, "Bridge"); +SYSCTL_NODE(_net_link, IFT_BRIDGE, bridge, CTLFLAG_RW|CTLFLAG_LOCKED, 0, + "Bridge"); #if defined(PFIL_HOOKS) static int pfil_onlyip = 1; /* only pass IP[46] packets when pfil is enabled */ @@ -511,25 +513,27 @@ static int pfil_ipfw = 0; /* layer2 filter with ipfw */ static int pfil_ipfw_arp = 0; /* layer2 filter with ipfw */ static int pfil_local_phys = 0; /* run pfil hooks on the physical interface for locally destined packets */ -SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_onlyip, CTLFLAG_RW, +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_onlyip, CTLFLAG_RW|CTLFLAG_LOCKED, &pfil_onlyip, 0, "Only pass IP packets when pfil is enabled"); -SYSCTL_INT(_net_link_bridge, OID_AUTO, ipfw_arp, CTLFLAG_RW, +SYSCTL_INT(_net_link_bridge, OID_AUTO, ipfw_arp, CTLFLAG_RW|CTLFLAG_LOCKED, &pfil_ipfw_arp, 0, "Filter ARP packets through IPFW layer2"); -SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_bridge, CTLFLAG_RW, +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_bridge, CTLFLAG_RW|CTLFLAG_LOCKED, &pfil_bridge, 0, "Packet filter on the bridge interface"); -SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_member, CTLFLAG_RW, +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_member, CTLFLAG_RW|CTLFLAG_LOCKED, &pfil_member, 0, "Packet filter on the member interface"); -SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_local_phys, CTLFLAG_RW, - &pfil_local_phys, 0, +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_local_phys, + CTLFLAG_RW|CTLFLAG_LOCKED, &pfil_local_phys, 0, "Packet filter on the physical interface for locally destined packets"); #endif /* PFIL_HOOKS */ +#if BRIDGESTP static int log_stp = 0; /* log STP state changes */ SYSCTL_INT(_net_link_bridge, OID_AUTO, log_stp, CTLFLAG_RW, &log_stp, 0, "Log STP state changes"); +#endif /* BRIDGESTP */ struct bridge_control { - int (*bc_func)(struct bridge_softc *, void *); + int (*bc_func)(struct bridge_softc *, void *); unsigned int bc_argsize; unsigned int bc_flags; }; @@ -539,213 +543,216 @@ struct bridge_control { #define BC_F_SUSER 0x04 /* do super-user check */ static const struct bridge_control bridge_control_table32[] = { - { bridge_ioctl_add, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - { bridge_ioctl_del, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_gifflags, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_COPYOUT }, - { bridge_ioctl_sifflags, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_scache, sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER 
}, - { bridge_ioctl_gcache, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - - { bridge_ioctl_gifs32, sizeof(struct ifbifconf32), - BC_F_COPYIN|BC_F_COPYOUT }, - { bridge_ioctl_rts32, sizeof(struct ifbaconf32), - BC_F_COPYIN|BC_F_COPYOUT }, - - { bridge_ioctl_saddr32, sizeof(struct ifbareq32), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_sto, sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER }, - { bridge_ioctl_gto, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - - { bridge_ioctl_daddr32, sizeof(struct ifbareq32), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_flush, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_gpri, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_spri, sizeof(struct ifbrparam), + { bridge_ioctl_add, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_del, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gifflags, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_COPYOUT }, + { bridge_ioctl_sifflags, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_scache, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gcache, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_gifs32, sizeof (struct ifbifconf32), + BC_F_COPYIN|BC_F_COPYOUT }, + { bridge_ioctl_rts32, sizeof (struct ifbaconf32), + BC_F_COPYIN|BC_F_COPYOUT }, + + { bridge_ioctl_saddr32, sizeof (struct ifbareq32), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sto, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gto, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_daddr32, sizeof (struct ifbareq32), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_flush, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gpri, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_spri, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_ght, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sht, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gfd, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sfd, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gma, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sma, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifprio, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifcost, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gfilt, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sfilt, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_purge, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_addspan, sizeof (struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_ght, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sht, sizeof(struct ifbrparam), + { bridge_ioctl_delspan, sizeof (struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_gfd, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sfd, sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_gma, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sma, sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_sifprio, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_sifcost, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_gfilt, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sfilt, 
sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER }, - { bridge_ioctl_purge, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gbparam32, sizeof (struct ifbropreq32), + BC_F_COPYOUT }, - { bridge_ioctl_addspan, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - { bridge_ioctl_delspan, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_gbparam32, sizeof(struct ifbropreq32), - BC_F_COPYOUT }, - - { bridge_ioctl_grte, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - - { bridge_ioctl_gifsstp32, sizeof(struct ifbpstpconf32), - BC_F_COPYIN|BC_F_COPYOUT }, - - { bridge_ioctl_sproto, sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_stxhc, sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_grte, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_gifsstp32, sizeof (struct ifbpstpconf32), + BC_F_COPYIN|BC_F_COPYOUT }, + + { bridge_ioctl_sproto, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_stxhc, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifmaxaddr, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, }; static const struct bridge_control bridge_control_table64[] = { - { bridge_ioctl_add, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - { bridge_ioctl_del, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_gifflags, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_COPYOUT }, - { bridge_ioctl_sifflags, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_scache, sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER }, - { bridge_ioctl_gcache, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - - { bridge_ioctl_gifs64, sizeof(struct ifbifconf64), - BC_F_COPYIN|BC_F_COPYOUT }, - { bridge_ioctl_rts64, sizeof(struct ifbaconf64), - BC_F_COPYIN|BC_F_COPYOUT }, - - { bridge_ioctl_saddr64, sizeof(struct ifbareq64), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_sto, sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER }, - { bridge_ioctl_gto, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - - { bridge_ioctl_daddr64, sizeof(struct ifbareq64), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_flush, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_gpri, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_spri, sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_ght, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sht, sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_gfd, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sfd, sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_gma, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sma, sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_sifprio, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_sifcost, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_gfilt, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - { bridge_ioctl_sfilt, sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_add, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_del, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, - { bridge_ioctl_purge, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gifflags, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_COPYOUT }, + { bridge_ioctl_sifflags, sizeof (struct ifbreq), 
+ BC_F_COPYIN|BC_F_SUSER }, - { bridge_ioctl_addspan, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - { bridge_ioctl_delspan, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_gbparam64, sizeof(struct ifbropreq64), - BC_F_COPYOUT }, - - { bridge_ioctl_grte, sizeof(struct ifbrparam), - BC_F_COPYOUT }, - - { bridge_ioctl_gifsstp64, sizeof(struct ifbpstpconf64), - BC_F_COPYIN|BC_F_COPYOUT }, - - { bridge_ioctl_sproto, sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_stxhc, sizeof(struct ifbrparam), - BC_F_COPYIN|BC_F_SUSER }, - - { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq), - BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_scache, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gcache, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_gifs64, sizeof (struct ifbifconf64), + BC_F_COPYIN|BC_F_COPYOUT }, + { bridge_ioctl_rts64, sizeof (struct ifbaconf64), + BC_F_COPYIN|BC_F_COPYOUT }, + + { bridge_ioctl_saddr64, sizeof (struct ifbareq64), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sto, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gto, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_daddr64, sizeof (struct ifbareq64), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_flush, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gpri, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_spri, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_ght, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sht, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gfd, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sfd, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gma, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sma, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifprio, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifcost, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gfilt, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sfilt, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_purge, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_addspan, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_delspan, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gbparam64, sizeof (struct ifbropreq64), + BC_F_COPYOUT }, + + { bridge_ioctl_grte, sizeof (struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_gifsstp64, sizeof (struct ifbpstpconf64), + BC_F_COPYIN|BC_F_COPYOUT }, + + { bridge_ioctl_sproto, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_stxhc, sizeof (struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifmaxaddr, sizeof (struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, }; static const unsigned int bridge_control_table_size = -sizeof(bridge_control_table32) / sizeof(bridge_control_table32[0]); + sizeof (bridge_control_table32) / sizeof (bridge_control_table32[0]); -static LIST_HEAD(, bridge_softc) bridge_list = LIST_HEAD_INITIALIZER(bridge_list); +static LIST_HEAD(, bridge_softc) bridge_list = + LIST_HEAD_INITIALIZER(bridge_list); static lck_grp_t *bridge_lock_grp = NULL; static lck_attr_t *bridge_lock_attr = NULL; static if_clone_t bridge_cloner = NULL; -__private_extern__ int _if_brige_debug = 0; - -SYSCTL_INT(_net_link_bridge, 
OID_AUTO, debug, CTLFLAG_RW, - &_if_brige_debug, 0, "Bridge debug"); +static int if_bridge_txstart = 0; +SYSCTL_INT(_net_link_bridge, OID_AUTO, txstart, CTLFLAG_RW | CTLFLAG_LOCKED, + &if_bridge_txstart, 0, "Bridge interface uses TXSTART model"); #if BRIDGE_DEBUG +static int if_bridge_debug = 0; +SYSCTL_INT(_net_link_bridge, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, + &if_bridge_debug, 0, "Bridge debug"); static void printf_ether_header(struct ether_header *eh); static void printf_mbuf_data(mbuf_t m, size_t offset, size_t len); static void printf_mbuf_pkthdr(mbuf_t m, const char *prefix, const char *suffix); static void printf_mbuf(mbuf_t m, const char *prefix, const char *suffix); -static void link_print(struct sockaddr_dl * dl_p); +static void link_print(struct sockaddr_dl *dl_p); static void bridge_lock(struct bridge_softc *); static void bridge_unlock(struct bridge_softc *); @@ -754,35 +761,38 @@ static void bridge_unref(struct bridge_softc *); static void bridge_xlock(struct bridge_softc *); static void bridge_xdrop(struct bridge_softc *); -static void bridge_lock(struct bridge_softc *sc) +static void +bridge_lock(struct bridge_softc *sc) { void *lr_saved = __builtin_return_address(0); - + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); lck_mtx_lock(sc->sc_mtx); - + sc->lock_lr[sc->next_lock_lr] = lr_saved; sc->next_lock_lr = (sc->next_lock_lr+1) % SO_LCKDBG_MAX; } -static void bridge_unlock(struct bridge_softc *sc) +static void +bridge_unlock(struct bridge_softc *sc) { void *lr_saved = __builtin_return_address(0); - + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); sc->unlock_lr[sc->next_unlock_lr] = lr_saved; sc->next_unlock_lr = (sc->next_unlock_lr+1) % SO_LCKDBG_MAX; - + lck_mtx_unlock(sc->sc_mtx); } -static int bridge_lock2ref(struct bridge_softc *sc) +static int +bridge_lock2ref(struct bridge_softc *sc) { int error = 0; void *lr_saved = __builtin_return_address(0); - + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); if (sc->sc_iflist_xcnt > 0) @@ -793,11 +803,12 @@ static int bridge_lock2ref(struct bridge_softc *sc) sc->unlock_lr[sc->next_unlock_lr] = lr_saved; sc->next_unlock_lr = (sc->next_unlock_lr+1) % SO_LCKDBG_MAX; lck_mtx_unlock(sc->sc_mtx); - - return error; + + return (error); } -static void bridge_unref(struct bridge_softc *sc) +static void +bridge_unref(struct bridge_softc *sc) { void *lr_saved = __builtin_return_address(0); @@ -808,17 +819,18 @@ static void bridge_unref(struct bridge_softc *sc) sc->next_lock_lr = (sc->next_lock_lr+1) % SO_LCKDBG_MAX; sc->sc_iflist_ref--; - + sc->unlock_lr[sc->next_unlock_lr] = lr_saved; sc->next_unlock_lr = (sc->next_unlock_lr+1) % SO_LCKDBG_MAX; - if ((sc->sc_iflist_xcnt > 0) && (sc->sc_iflist_ref == 0)) { + if ((sc->sc_iflist_xcnt > 0) && (sc->sc_iflist_ref == 0)) { lck_mtx_unlock(sc->sc_mtx); wakeup(&sc->sc_cv); } else lck_mtx_unlock(sc->sc_mtx); } -static void bridge_xlock(struct bridge_softc *sc) +static void +bridge_xlock(struct bridge_softc *sc) { void *lr_saved = __builtin_return_address(0); @@ -828,7 +840,7 @@ static void bridge_xlock(struct bridge_softc *sc) while (sc->sc_iflist_ref > 0) { sc->unlock_lr[sc->next_unlock_lr] = lr_saved; sc->next_unlock_lr = (sc->next_unlock_lr+1) % SO_LCKDBG_MAX; - + msleep(&sc->sc_cv, sc->sc_mtx, PZERO, "BRIDGE_XLOCK", NULL); sc->lock_lr[sc->next_lock_lr] = lr_saved; @@ -836,7 +848,8 @@ static void bridge_xlock(struct bridge_softc *sc) } } -static void bridge_xdrop(struct bridge_softc *sc) +static void +bridge_xdrop(struct bridge_softc *sc) { lck_mtx_assert(sc->sc_mtx, 
LCK_MTX_ASSERT_OWNED); @@ -848,9 +861,9 @@ printf_mbuf_pkthdr(mbuf_t m, const char *prefix, const char *suffix) { if (m) printf("%spktlen: %u rcvif: %p header: %p nextpkt: %p%s", - prefix ? prefix : "", - (unsigned int)mbuf_pkthdr_len(m), mbuf_pkthdr_rcvif(m), mbuf_pkthdr_header(m), mbuf_nextpkt(m), - suffix ? suffix : ""); + prefix ? prefix : "", (unsigned int)mbuf_pkthdr_len(m), + mbuf_pkthdr_rcvif(m), mbuf_pkthdr_header(m), + mbuf_nextpkt(m), suffix ? suffix : ""); else printf("%s%s\n", prefix, suffix); } @@ -859,11 +872,12 @@ void printf_mbuf(mbuf_t m, const char *prefix, const char *suffix) { if (m) { - printf("%s%p type: %u flags: 0x%x len: %u data: %p maxlen: %u datastart: %p next: %p%s", - prefix ? prefix : "", - m, mbuf_type(m), mbuf_flags(m), (unsigned int)mbuf_len(m), mbuf_data(m), - (unsigned int)mbuf_maxlen(m), mbuf_datastart(m), mbuf_next(m), - !suffix || (mbuf_flags(m) & MBUF_PKTHDR) ? "" : suffix); + printf("%s%p type: %u flags: 0x%x len: %u data: %p maxlen: %u " + "datastart: %p next: %p%s", prefix ? prefix : "", + m, mbuf_type(m), mbuf_flags(m), (unsigned int)mbuf_len(m), + mbuf_data(m), (unsigned int)mbuf_maxlen(m), + mbuf_datastart(m), mbuf_next(m), + !suffix || (mbuf_flags(m) & MBUF_PKTHDR) ? "" : suffix); if ((mbuf_flags(m) & MBUF_PKTHDR)) printf_mbuf_pkthdr(m, " ", suffix); } else @@ -877,12 +891,12 @@ printf_mbuf_data(mbuf_t m, size_t offset, size_t len) size_t i, j; size_t pktlen, mlen, maxlen; unsigned char *ptr; - + pktlen = mbuf_pkthdr_len(m); - + if (offset > pktlen) return; - + maxlen = (pktlen - offset > len) ? len : pktlen; n = m; mlen = mbuf_len(n); @@ -900,25 +914,25 @@ printf_mbuf_data(mbuf_t m, size_t offset, size_t len) printf("%02x%s", ptr[j], i % 2 ? " " : ""); } } - return; } static void printf_ether_header(struct ether_header *eh) { - printf("%02x:%02x:%02x:%02x:%02x:%02x > %02x:%02x:%02x:%02x:%02x:%02x 0x%04x ", - eh->ether_shost[0], eh->ether_shost[1], eh->ether_shost[2], - eh->ether_shost[3], eh->ether_shost[4], eh->ether_shost[5], - eh->ether_dhost[0], eh->ether_dhost[1], eh->ether_dhost[2], - eh->ether_dhost[3], eh->ether_dhost[4], eh->ether_dhost[5], - eh->ether_type); + printf("%02x:%02x:%02x:%02x:%02x:%02x > " + "%02x:%02x:%02x:%02x:%02x:%02x 0x%04x ", + eh->ether_shost[0], eh->ether_shost[1], eh->ether_shost[2], + eh->ether_shost[3], eh->ether_shost[4], eh->ether_shost[5], + eh->ether_dhost[0], eh->ether_dhost[1], eh->ether_dhost[2], + eh->ether_dhost[3], eh->ether_dhost[4], eh->ether_dhost[5], + eh->ether_type); } static void -link_print(struct sockaddr_dl * dl_p) +link_print(struct sockaddr_dl *dl_p) { int i; - + #if 1 printf("sdl len %d index %d family %d type 0x%x nlen %d alen %d" " slen %d addr ", dl_p->sdl_len, @@ -926,10 +940,8 @@ link_print(struct sockaddr_dl * dl_p) dl_p->sdl_nlen, dl_p->sdl_alen, dl_p->sdl_slen); #endif for (i = 0; i < dl_p->sdl_alen; i++) - printf("%s%x", i ? ":" : "", - (CONST_LLADDR(dl_p))[i]); + printf("%s%x", i ? 
":" : "", (CONST_LLADDR(dl_p))[i]); printf("\n"); - return; } #endif /* BRIDGE_DEBUG */ @@ -945,39 +957,41 @@ bridgeattach(__unused int n) int error; lck_grp_attr_t *lck_grp_attr = NULL; struct ifnet_clone_params ifnet_clone_params; - - bridge_rtnode_pool = zinit(sizeof(struct bridge_rtnode), 1024 * sizeof(struct bridge_rtnode), - 0, "bridge_rtnode"); + + bridge_rtnode_pool = zinit(sizeof (struct bridge_rtnode), + 1024 * sizeof (struct bridge_rtnode), 0, "bridge_rtnode"); zone_change(bridge_rtnode_pool, Z_CALLERACCT, FALSE); lck_grp_attr = lck_grp_attr_alloc_init(); - + bridge_lock_grp = lck_grp_alloc_init("if_bridge", lck_grp_attr); - + bridge_lock_attr = lck_attr_alloc_init(); - + #if BRIDGE_DEBUG lck_attr_setdebug(bridge_lock_attr); #endif - bridge_list_mtx = lck_mtx_alloc_init(bridge_lock_grp, bridge_lock_attr); - - // can free the attributes once we've allocated the group lock + lck_mtx_init(bridge_list_mtx, bridge_lock_grp, bridge_lock_attr); + + /* can free the attributes once we've allocated the group lock */ lck_grp_attr_free(lck_grp_attr); - + LIST_INIT(&bridge_list); - + +#if BRIDGESTP bstp_sys_init(); - +#endif /* BRIDGESTP */ + ifnet_clone_params.ifc_name = "bridge"; ifnet_clone_params.ifc_create = bridge_clone_create; ifnet_clone_params.ifc_destroy = bridge_clone_destroy; - + error = ifnet_clone_attach(&ifnet_clone_params, &bridge_cloner); if (error != 0) - printf("bridgeattach: ifnet_clone_attach failed %d\n", error); + printf("%s: ifnet_clone_attach failed %d\n", __func__, error); - return error; + return (error); } #if defined(PFIL_HOOKS) @@ -987,7 +1001,7 @@ bridgeattach(__unused int n) static int sysctl_pfil_ipfw SYSCTL_HANDLER_ARGS { -#pragma unused(arg1,arg2) +#pragma unused(arg1, arg2) int enable = pfil_ipfw; int error; @@ -1012,6 +1026,7 @@ sysctl_pfil_ipfw SYSCTL_HANDLER_ARGS return (error); } + SYSCTL_PROC(_net_link_bridge, OID_AUTO, ipfw, CTLTYPE_INT|CTLFLAG_RW, &pfil_ipfw, 0, &sysctl_pfil_ipfw, "I", "Layer2 filter with IPFW"); #endif /* PFIL_HOOKS */ @@ -1027,13 +1042,14 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) struct ifnet *ifp = NULL; struct bridge_softc *sc; u_char eaddr[6]; - struct ifnet_init_params init_params; + struct ifnet_init_eparams init_params; errno_t error = 0; - uint32_t sdl_buffer[offsetof(struct sockaddr_dl, sdl_data) + IFNAMSIZ + ETHER_ADDR_LEN]; + uint32_t sdl_buffer[offsetof(struct sockaddr_dl, sdl_data) + + IFNAMSIZ + ETHER_ADDR_LEN]; struct sockaddr_dl *sdl = (struct sockaddr_dl *)sdl_buffer; - sc = _MALLOC(sizeof(*sc), M_DEVBUF, M_WAITOK); - memset(sc, 0, sizeof(*sc)); + sc = _MALLOC(sizeof (*sc), M_DEVBUF, M_WAITOK); + memset(sc, 0, sizeof (*sc)); sc->sc_mtx = lck_mtx_alloc_init(bridge_lock_grp, bridge_lock_attr); sc->sc_brtmax = BRIDGE_RTABLE_MAX; @@ -1051,66 +1067,70 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) /* Initialize our routing table. 
*/ error = bridge_rtable_init(sc); if (error != 0) { - printf("bridge_clone_create: bridge_rtable_init failed %d\n", error); + printf("%s: bridge_rtable_init failed %d\n", __func__, error); goto done; } - + TAILQ_INIT(&sc->sc_iflist); TAILQ_INIT(&sc->sc_spanlist); /* use the interface name as the unique id for ifp recycle */ - snprintf(sc->sc_if_xname, sizeof(sc->sc_if_xname), "%s%d", + snprintf(sc->sc_if_xname, sizeof (sc->sc_if_xname), "%s%d", ifc->ifc_name, unit); - memset(&init_params, 0, sizeof(struct ifnet_init_params)); - init_params.uniqueid = sc->sc_if_xname; - init_params.uniqueid_len = strlen(sc->sc_if_xname); - init_params.name = ifc->ifc_name; - init_params.unit = unit; - init_params.family = IFNET_FAMILY_ETHERNET; - init_params.type = IFT_BRIDGE; - init_params.output = bridge_start; - init_params.demux = ether_demux; - init_params.add_proto = ether_add_proto; - init_params.del_proto = ether_del_proto; - init_params.check_multi = ether_check_multi; - init_params.framer = ether_frameout; - init_params.softc = sc; - init_params.ioctl = bridge_ioctl; - init_params.set_bpf_tap = bridge_set_bpf_tap; - init_params.detach = bridge_detach; - init_params.broadcast_addr = etherbroadcastaddr; - init_params.broadcast_len = ETHER_ADDR_LEN; - error = ifnet_allocate(&init_params, &ifp); + bzero(&init_params, sizeof (init_params)); + init_params.ver = IFNET_INIT_CURRENT_VERSION; + init_params.len = sizeof (init_params); + if (if_bridge_txstart) { + init_params.start = bridge_start; + } else { + init_params.flags = IFNET_INIT_LEGACY; + init_params.output = bridge_output; + } + init_params.uniqueid = sc->sc_if_xname; + init_params.uniqueid_len = strlen(sc->sc_if_xname); + init_params.sndq_maxlen = IFQ_MAXLEN; + init_params.name = ifc->ifc_name; + init_params.unit = unit; + init_params.family = IFNET_FAMILY_ETHERNET; + init_params.type = IFT_BRIDGE; + init_params.demux = ether_demux; + init_params.add_proto = ether_add_proto; + init_params.del_proto = ether_del_proto; + init_params.check_multi = ether_check_multi; + init_params.framer = ether_frameout; + init_params.softc = sc; + init_params.ioctl = bridge_ioctl; + init_params.set_bpf_tap = bridge_set_bpf_tap; + init_params.detach = bridge_detach; + init_params.broadcast_addr = etherbroadcastaddr; + init_params.broadcast_len = ETHER_ADDR_LEN; + error = ifnet_allocate_extended(&init_params, &ifp); if (error != 0) { - printf("bridge_clone_create: ifnet_allocate failed %d\n", error); + printf("%s: ifnet_allocate failed %d\n", __func__, error); goto done; } sc->sc_ifp = ifp; - + error = ifnet_set_mtu(ifp, ETHERMTU); if (error != 0) { - printf("bridge_clone_create: ifnet_set_mtu failed %d\n", error); + printf("%s: ifnet_set_mtu failed %d\n", __func__, error); goto done; } error = ifnet_set_addrlen(ifp, ETHER_ADDR_LEN); if (error != 0) { - printf("bridge_clone_create: ifnet_set_addrlen failed %d\n", error); - goto done; - } - error = ifnet_set_baudrate(ifp, 10000000) ; // XXX: this is what IONetworking does - if (error != 0) { - printf("bridge_clone_create: ifnet_set_baudrate failed %d\n", error); + printf("%s: ifnet_set_addrlen failed %d\n", __func__, error); goto done; } error = ifnet_set_hdrlen(ifp, ETHER_HDR_LEN); if (error != 0) { - printf("bridge_clone_create: ifnet_set_hdrlen failed %d\n", error); + printf("%s: ifnet_set_hdrlen failed %d\n", __func__, error); goto done; } - error = ifnet_set_flags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_NOTRAILERS | IFF_MULTICAST, - 0xffff); + error = ifnet_set_flags(ifp, + IFF_BROADCAST | IFF_SIMPLEX | 
IFF_NOTRAILERS | IFF_MULTICAST, + 0xffff); if (error != 0) { - printf("bridge_clone_create: ifnet_set_flags failed %d\n", error); + printf("%s: ifnet_set_flags failed %d\n", __func__, error); goto done; } @@ -1125,11 +1145,11 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) */ { int retry; - + for (retry = 1; retry != 0;) { struct ifnet *bifp; struct bridge_softc *sc2; - + read_random(eaddr, ETHER_ADDR_LEN); eaddr[0] &= ~1; /* clear multicast bit */ eaddr[0] |= 2; /* set the LAA bit */ @@ -1137,7 +1157,8 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) lck_mtx_lock(bridge_list_mtx); LIST_FOREACH(sc2, &bridge_list, sc_list) { bifp = sc2->sc_ifp; - if (memcmp(eaddr, ifnet_lladdr(bifp), ETHER_ADDR_LEN) == 0) + if (memcmp(eaddr, ifnet_lladdr(bifp), + ETHER_ADDR_LEN) == 0) retry = 1; } lck_mtx_unlock(bridge_list_mtx); @@ -1150,8 +1171,8 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) */ { uint32_t r; - - read_random(&r, sizeof(r)); + + read_random(&r, sizeof (r)); eaddr[0] = 0xAC; eaddr[1] = 0xDE; eaddr[2] = 0x48; @@ -1161,59 +1182,65 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) } #endif - memset(sdl, 0, sizeof(sdl_buffer)); + memset(sdl, 0, sizeof (sdl_buffer)); sdl->sdl_family = AF_LINK; sdl->sdl_nlen = strlen(sc->sc_if_xname); sdl->sdl_alen = ETHER_ADDR_LEN; sdl->sdl_len = offsetof(struct sockaddr_dl, sdl_data); memcpy(sdl->sdl_data, sc->sc_if_xname, sdl->sdl_nlen); memcpy(LLADDR(sdl), eaddr, ETHER_ADDR_LEN); - + #if BRIDGE_DEBUG - link_print(sdl); + if (if_bridge_debug) + link_print(sdl); #endif error = ifnet_attach(ifp, NULL); if (error != 0) { - printf("bridge_clone_create: ifnet_attach failed %d\n", error); + printf("%s: ifnet_attach failed %d\n", __func__, error); goto done; } - - error = ifnet_set_lladdr_and_type(ifp, eaddr, ETHER_ADDR_LEN, IFT_ETHER); + + error = ifnet_set_lladdr_and_type(ifp, eaddr, ETHER_ADDR_LEN, + IFT_ETHER); if (error != 0) { - printf("bridge_clone_create: ifnet_set_lladdr_and_type failed %d\n", error); + printf("%s: ifnet_set_lladdr_and_type failed %d\n", __func__, + error); goto done; } - + #if APPLE_BRIDGE_HWCKSUM_SUPPORT - /* - * APPLE MODIFICATION - our bridge can support HW checksums + /* + * APPLE MODIFICATION - our bridge can support HW checksums * (useful if underlying interfaces support them) on TX, * RX is not that interesting, since the stack just looks to * see if the packet has been checksummed already (I think) * but we might as well indicate we support it */ ifp->if_capabilities = - IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_UDPv4_Tx | - IFCAP_CSUM_IPv4_Rx | IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_UDPv4_Rx ; + IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_UDPv4_Tx | + IFCAP_CSUM_IPv4_Rx | IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_UDPv4_Rx; #endif - + +#if BRIDGESTP bstp_attach(&sc->sc_stp, &bridge_ops); +#endif /* BRIDGESTP */ lck_mtx_lock(bridge_list_mtx); LIST_INSERT_HEAD(&bridge_list, sc, sc_list); lck_mtx_unlock(bridge_list_mtx); /* attach as ethernet */ - error = bpf_attach(ifp, DLT_EN10MB, sizeof(struct ether_header), NULL, NULL); + error = bpf_attach(ifp, DLT_EN10MB, sizeof (struct ether_header), + NULL, NULL); done: if (error != 0) { - printf("bridge_clone_create failed error %d\n", error); + printf("%s failed error %d\n", __func__, error); /* Cleanup TBD */ } - - return error; + + return (error); } /* @@ -1231,15 +1258,15 @@ bridge_clone_destroy(struct ifnet *ifp) BRIDGE_LOCK(sc); if 
((sc->sc_flags & SCF_DETACHING)) { BRIDGE_UNLOCK(sc); - return 0; + return (0); } sc->sc_flags |= SCF_DETACHING; - bridge_stop(ifp, 1); + bridge_ifstop(ifp, 1); error = ifnet_set_flags(ifp, 0, IFF_UP); if (error != 0) { - printf("bridge_clone_destroy: ifnet_set_flags failed %d\n", error); + printf("%s: ifnet_set_flags failed %d\n", __func__, error); } while ((bif = TAILQ_FIRST(&sc->sc_iflist)) != NULL) @@ -1253,63 +1280,64 @@ bridge_clone_destroy(struct ifnet *ifp) error = ifnet_detach(ifp); if (error != 0) { - panic("bridge_clone_destroy: ifnet_detach(%p) failed %d\n", ifp, error); + panic("bridge_clone_destroy: ifnet_detach(%p) failed %d\n", + ifp, error); if ((sc = (struct bridge_softc *)ifnet_softc(ifp)) != NULL) { BRIDGE_LOCK(sc); sc->sc_flags &= ~SCF_DETACHING; BRIDGE_UNLOCK(sc); } - return 0; + return (0); } - return 0; + return (0); } #define DRVSPEC do { \ - if (ifd->ifd_cmd >= bridge_control_table_size) { \ - error = EINVAL; \ - break; \ - } \ - bc = &bridge_control_table[ifd->ifd_cmd]; \ - \ - if (cmd == SIOCGDRVSPEC && \ - (bc->bc_flags & BC_F_COPYOUT) == 0) { \ - error = EINVAL; \ - break; \ - } \ - else if (cmd == SIOCSDRVSPEC && \ - (bc->bc_flags & BC_F_COPYOUT) != 0) { \ - error = EINVAL; \ - break; \ - } \ - \ - if (bc->bc_flags & BC_F_SUSER) { \ - error = kauth_authorize_generic(kauth_cred_get(), KAUTH_GENERIC_ISSUSER); \ - if (error) \ - break; \ - } \ - \ - if (ifd->ifd_len != bc->bc_argsize || \ - ifd->ifd_len > sizeof(args)) { \ - error = EINVAL; \ - break; \ - } \ - \ - bzero(&args, sizeof(args)); \ - if (bc->bc_flags & BC_F_COPYIN) { \ - error = copyin(ifd->ifd_data, &args, ifd->ifd_len); \ - if (error) \ - break; \ - } \ - \ - BRIDGE_LOCK(sc); \ - error = (*bc->bc_func)(sc, &args); \ - BRIDGE_UNLOCK(sc); \ - if (error) \ - break; \ - \ - if (bc->bc_flags & BC_F_COPYOUT) \ - error = copyout(&args, ifd->ifd_data, ifd->ifd_len); \ + if (ifd->ifd_cmd >= bridge_control_table_size) { \ + error = EINVAL; \ + break; \ + } \ + bc = &bridge_control_table[ifd->ifd_cmd]; \ + \ + if (cmd == SIOCGDRVSPEC && \ + (bc->bc_flags & BC_F_COPYOUT) == 0) { \ + error = EINVAL; \ + break; \ + } else if (cmd == SIOCSDRVSPEC && \ + (bc->bc_flags & BC_F_COPYOUT) != 0) { \ + error = EINVAL; \ + break; \ + } \ + \ + if (bc->bc_flags & BC_F_SUSER) { \ + error = kauth_authorize_generic(kauth_cred_get(), \ + KAUTH_GENERIC_ISSUSER); \ + if (error) \ + break; \ + } \ + \ + if (ifd->ifd_len != bc->bc_argsize || \ + ifd->ifd_len > sizeof (args)) { \ + error = EINVAL; \ + break; \ + } \ + \ + bzero(&args, sizeof (args)); \ + if (bc->bc_flags & BC_F_COPYIN) { \ + error = copyin(ifd->ifd_data, &args, ifd->ifd_len); \ + if (error) \ + break; \ + } \ + \ + BRIDGE_LOCK(sc); \ + error = (*bc->bc_func)(sc, &args); \ + BRIDGE_UNLOCK(sc); \ + if (error) \ + break; \ + \ + if (bc->bc_flags & BC_F_COPYOUT) \ + error = copyout(&args, ifd->ifd_data, ifd->ifd_len); \ } while (0) @@ -1322,25 +1350,21 @@ static errno_t bridge_ioctl(struct ifnet *ifp, u_long cmd, void *data) { struct bridge_softc *sc = ifp->if_softc; - struct ifreq *ifr = (struct ifreq *) data; + struct ifreq *ifr = (struct ifreq *)data; int error = 0; lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); #if BRIDGE_DEBUG - if (_if_brige_debug) - printf("bridge_ioctl: ifp %p cmd 0x%08lx (%c%c [%lu] %c %lu)\n", - ifp, - cmd, - (cmd & IOC_IN) ? 'I' : ' ', - (cmd & IOC_OUT) ? 
'O' : ' ', - IOCPARM_LEN(cmd), - (char)IOCGROUP(cmd), - cmd & 0xff); + if (if_bridge_debug) + printf("%s: ifp %p cmd 0x%08lx (%c%c [%lu] %c %lu)\n", + __func__, ifp, cmd, (cmd & IOC_IN) ? 'I' : ' ', + (cmd & IOC_OUT) ? 'O' : ' ', IOCPARM_LEN(cmd), + (char)IOCGROUP(cmd), cmd & 0xff); #endif - + switch (cmd) { - + case SIOCSIFADDR: case SIOCAIFADDR: ifnet_set_flags(ifp, IFF_UP, IFF_UP); @@ -1365,9 +1389,10 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, void *data) struct ifbrparam ifbrparam; struct ifbropreq32 ifbropreq; } args; - struct ifdrv32 *ifd = (struct ifdrv32 *) data; - const struct bridge_control *bridge_control_table = bridge_control_table32, *bc; - + struct ifdrv32 *ifd = (struct ifdrv32 *)data; + const struct bridge_control *bridge_control_table = + bridge_control_table32, *bc; + DRVSPEC; break; @@ -1382,11 +1407,12 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, void *data) struct ifbrparam ifbrparam; struct ifbropreq64 ifbropreq; } args; - struct ifdrv64 *ifd = (struct ifdrv64 *) data; - const struct bridge_control *bridge_control_table = bridge_control_table64, *bc; - + struct ifdrv64 *ifd = (struct ifdrv64 *)data; + const struct bridge_control *bridge_control_table = + bridge_control_table64, *bc; + DRVSPEC; - + break; } @@ -1398,7 +1424,7 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, void *data) * then stop and disable it. */ BRIDGE_LOCK(sc); - bridge_stop(ifp, 1); + bridge_ifstop(ifp, 1); BRIDGE_UNLOCK(sc); } else if ((ifp->if_flags & IFF_UP) && !(ifp->if_flags & IFF_RUNNING)) { @@ -1413,9 +1439,11 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, void *data) break; case SIOCSIFLLADDR: - error = ifnet_set_lladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); + error = ifnet_set_lladdr(ifp, ifr->ifr_addr.sa_data, + ifr->ifr_addr.sa_len); if (error != 0) - printf("bridge_ioctl: ifnet_set_lladdr failed %d\n", error); + printf("%s: ifnet_set_lladdr failed %d\n", __func__, + error); break; case SIOCSIFMTU: @@ -1424,22 +1452,15 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, void *data) break; default: - /* - * drop the lock as ether_ioctl() will call bridge_start() and - * cause the lock to be recursed. - */ error = ether_ioctl(ifp, cmd, data); #if BRIDGE_DEBUG - if (error != 0) - printf("bridge_ioctl: ether_ioctl ifp %p cmd 0x%08lx (%c%c [%lu] %c %lu) failed error: %d\n", - ifp, - cmd, - (cmd & IOC_IN) ? 'I' : ' ', - (cmd & IOC_OUT) ? 'O' : ' ', - IOCPARM_LEN(cmd), - (char) IOCGROUP(cmd), - cmd & 0xff, - error); + if (error != 0 && error != EOPNOTSUPP) + printf("%s: ether_ioctl ifp %p cmd 0x%08lx " + "(%c%c [%lu] %c %lu) failed error: %d\n", + __func__, ifp, cmd, (cmd & IOC_IN) ? 'I' : ' ', + (cmd & IOC_OUT) ? 
'O' : ' ', + IOCPARM_LEN(cmd), (char)IOCGROUP(cmd), + cmd & 0xff, error); #endif /* BRIDGE_DEBUG */ break; } @@ -1487,7 +1508,7 @@ bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set) struct ifreq ifr; int error; - bzero(&ifr, sizeof(ifr)); + bzero(&ifr, sizeof (ifr)); ifr.ifr_reqcap = set; if (ifp->if_capenable != set) { @@ -1495,9 +1516,9 @@ bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set) error = (*ifp->if_ioctl)(ifp, SIOCSIFCAP, (caddr_t)&ifr); IFF_UNLOCKGIANT(ifp); if (error) - printf("error setting interface capabilities on %s\n", - ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), - ifp->if_xname); + printf("%s: error setting interface capabilities " + "on %s\n", __func__, ifnet_name(sc->sc_ifp), + ifnet_unit(sc->sc_ifp), ifp->if_xname); } } #endif /* HAS_IF_CAP */ @@ -1518,9 +1539,9 @@ bridge_lookup_member(struct bridge_softc *sc, const char *name) TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { ifp = bif->bif_ifp; - snprintf(if_xname, sizeof(if_xname), "%s%d", + snprintf(if_xname, sizeof (if_xname), "%s%d", ifnet_name(ifp), ifnet_unit(ifp)); - if (strncmp(if_xname, name, sizeof(if_xname)) == 0) + if (strncmp(if_xname, name, sizeof (if_xname)) == 0) return (bif); } @@ -1547,9 +1568,9 @@ bridge_lookup_member_if(struct bridge_softc *sc, struct ifnet *member_ifp) return (NULL); } -static errno_t -bridge_iff_input(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol, - mbuf_t *data, char **frame_ptr) +static errno_t +bridge_iff_input(void *cookie, ifnet_t ifp, __unused protocol_family_t protocol, + mbuf_t *data, char **frame_ptr) { errno_t error = 0; struct bridge_iflist *bif = (struct bridge_iflist *)cookie; @@ -1560,21 +1581,24 @@ bridge_iff_input(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol, if ((m->m_flags & M_PROTO1)) goto out; - - if (*frame_ptr >= (char *)mbuf_datastart(m) && *frame_ptr <= (char *)mbuf_data(m)) { + + if (*frame_ptr >= (char *)mbuf_datastart(m) && + *frame_ptr <= (char *)mbuf_data(m)) { included = 1; frmlen = (char *)mbuf_data(m) - *frame_ptr; } #if BRIDGE_DEBUG - if (_if_brige_debug) { - printf("bridge_iff_input %s%d from %s%d m %p data %p frame %p %s frmlen %lu\n", - ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), - ifnet_name(ifp), ifnet_unit(ifp), - m, mbuf_data(m), *frame_ptr, included ? "inside" : "outside", frmlen); - - if (_if_brige_debug > 1) { + if (if_bridge_debug) { + printf("%s: %s%d from %s%d m %p data %p frame %p %s " + "frmlen %lu\n", __func__, ifnet_name(sc->sc_ifp), + ifnet_unit(sc->sc_ifp), ifnet_name(ifp), ifnet_unit(ifp), + m, mbuf_data(m), *frame_ptr, + included ? 
"inside" : "outside", frmlen); + + if (if_bridge_debug > 1) { printf_mbuf(m, "bridge_iff_input[", "\n"); - printf_ether_header((struct ether_header *)*frame_ptr); + printf_ether_header((struct ether_header *) + (void *)*frame_ptr); printf_mbuf_data(m, 0, 20); printf("\n"); } @@ -1583,22 +1607,24 @@ bridge_iff_input(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol, /* Move data pointer to start of frame to the link layer header */ if (included) { - (void) mbuf_setdata(m, (char *)mbuf_data(m) - frmlen, mbuf_len(m) + frmlen); + (void) mbuf_setdata(m, (char *)mbuf_data(m) - frmlen, + mbuf_len(m) + frmlen); (void) mbuf_pkthdr_adjustlen(m, frmlen); } else { - printf("bridge_iff_input: frame_ptr outside mbuf\n"); + printf("%s: frame_ptr outside mbuf\n", __func__); goto out; } - + error = bridge_input(ifp, m, *frame_ptr); - + /* Adjust packet back to original */ if (error == 0) { - (void) mbuf_setdata(m, (char *)mbuf_data(m) + frmlen, mbuf_len(m) - frmlen); + (void) mbuf_setdata(m, (char *)mbuf_data(m) + frmlen, + mbuf_len(m) - frmlen); (void) mbuf_pkthdr_adjustlen(m, -frmlen); } #if BRIDGE_DEBUG - if (_if_brige_debug > 1) { + if (if_bridge_debug > 1) { printf("\n"); printf_mbuf(m, "bridge_iff_input]", "\n"); } @@ -1606,8 +1632,8 @@ bridge_iff_input(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol, out: lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); - - return error; + + return (error); } @@ -1619,39 +1645,39 @@ bridge_iff_output(void *cookie, ifnet_t ifp, __unused protocol_family_t protocol struct bridge_iflist *bif = (struct bridge_iflist *)cookie; struct bridge_softc *sc = bif->bif_sc; mbuf_t m = *data; - + if ((m->m_flags & M_PROTO1)) goto out; - + #if BRIDGE_DEBUG - if (_if_brige_debug) { - printf("bridge_iff_output %s%d from %s%d m %p data %p\n", - ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), - ifnet_name(ifp), ifnet_unit(ifp), - m, mbuf_data(m)); + if (if_bridge_debug) { + printf("%s: %s%d from %s%d m %p data %p\n", __func__, + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), + ifnet_name(ifp), ifnet_unit(ifp), m, mbuf_data(m)); } #endif /* BRIDGE_DEBUG */ - error = bridge_output(sc, ifp, m); + error = bridge_member_output(sc, ifp, m); if (error != 0) { - printf("bridge_iff_output: bridge_output failed error %d\n", error); + printf("%s: bridge_member_output failed error %d\n", __func__, + error); } -out: +out: lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); - return error; + return (error); } #endif /* BRIDGE_MEMBER_OUT_FILTER */ -static void -bridge_iff_event(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol, - const struct kev_msg *event_msg) +static void +bridge_iff_event(void *cookie, ifnet_t ifp, __unused protocol_family_t protocol, + const struct kev_msg *event_msg) { struct bridge_iflist *bif = (struct bridge_iflist *)cookie; - - if (event_msg->vendor_code == KEV_VENDOR_APPLE && + + if (event_msg->vendor_code == KEV_VENDOR_APPLE && event_msg->kev_class == KEV_NETWORK_CLASS && event_msg->kev_subclass == KEV_DL_SUBCLASS) { switch (event_msg->event_code) { @@ -1659,30 +1685,37 @@ bridge_iff_event(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol, case KEV_DL_IF_DETACHED: bridge_ifdetach(bif, ifp); break; - + case KEV_DL_LINK_OFF: case KEV_DL_LINK_ON: { +#if BRIDGESTP bstp_linkstate(ifp, event_msg->event_code); +#endif /* BRIDGESTP */ break; } - + case KEV_DL_SIFFLAGS: { - if (bif->bif_promisc == 0 && (ifp->if_flags & IFF_UP)) { - errno_t error = ifnet_set_promiscuous(ifp, 1); + if (bif->bif_promisc == 0 && + 
(ifp->if_flags & IFF_UP)) { + errno_t error = + ifnet_set_promiscuous(ifp, 1); if (error != 0) { - printf("bridge_iff_event: ifnet_set_promiscuous(%s%d) failed %d\n", - ifnet_name(ifp), ifnet_unit(ifp), error); + printf("%s: " + "ifnet_set_promiscuous" + "(%s%d) failed %d\n", + __func__, ifnet_name(ifp), + ifnet_unit(ifp), error); } else { bif->bif_promisc = 1; } } break; } - + default: break; } - } + } } /* @@ -1691,48 +1724,44 @@ bridge_iff_event(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol, * Detach an interface from a bridge. Called when a member * interface is detaching. */ -static void -bridge_iff_detached(void* cookie, __unused ifnet_t ifp) +static void +bridge_iff_detached(void *cookie, __unused ifnet_t ifp) { struct bridge_iflist *bif = (struct bridge_iflist *)cookie; -#if BRIDGE_DEBUG - printf("bridge_iff_detached: %s%d\n", - ifnet_name(ifp), ifnet_unit(ifp)); +#if BRIDGE_DEBUG + printf("%s: %s%d\n", __func__, ifnet_name(ifp), ifnet_unit(ifp)); #endif bridge_ifdetach(bif, ifp); _FREE(bif, M_DEVBUF); - - return; } static errno_t -bridge_proto_input(ifnet_t ifp, __unused protocol_family_t protocol, - __unused mbuf_t packet, __unused char *header) +bridge_proto_input(ifnet_t ifp, __unused protocol_family_t protocol, + __unused mbuf_t packet, __unused char *header) { - printf("bridge_proto_input: unexpected packet from %s%d\n", - ifnet_name(ifp), ifnet_unit(ifp)); - return 0; + printf("%s: unexpected packet from %s%d\n", __func__, + ifnet_name(ifp), ifnet_unit(ifp)); + return (0); } static int bridge_attach_protocol(struct ifnet *ifp) { - int error; + int error; struct ifnet_attach_proto_param reg; - printf("bridge_attach_protocol: %s%d\n", - ifnet_name(ifp), ifnet_unit(ifp)); - - bzero(®, sizeof(reg)); + printf("%s: %s%d\n", __func__, ifnet_name(ifp), ifnet_unit(ifp)); + + bzero(®, sizeof (reg)); reg.input = bridge_proto_input; - + error = ifnet_attach_protocol(ifp, PF_BRIDGE, ®); if (error) - printf("bridge_attach_protocol: ifnet_attach_protocol(%s%d) failed, %d\n", - ifnet_name(ifp), ifnet_unit(ifp), error); + printf("%s: ifnet_attach_protocol(%s%d) failed, %d\n", + __func__, ifnet_name(ifp), ifnet_unit(ifp), error); return (error); } @@ -1742,13 +1771,12 @@ bridge_detach_protocol(struct ifnet *ifp) { int error; - printf("bridge_detach_protocol: %s%d\n", - ifnet_name(ifp), ifnet_unit(ifp)); + printf("%s: %s%d\n", __func__, ifnet_name(ifp), ifnet_unit(ifp)); error = ifnet_detach_protocol(ifp, PF_BRIDGE); if (error) - printf("bridge_attach_protocol: ifnet_detach_protocol(%s%d) failed, %d\n", - ifnet_name(ifp), ifnet_unit(ifp), error); + printf("%s: ifnet_detach_protocol(%s%d) failed, %d\n", + __func__, ifnet_name(ifp), ifnet_unit(ifp), error); return (error); } @@ -1799,8 +1827,10 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, (void) bridge_detach_protocol(ifs); BRIDGE_LOCK(sc); } +#if BRIDGESTP if (bif->bif_flags & IFBIF_STP) bstp_disable(&bif->bif_stp); +#endif /* BRIDGESTP */ ifs->if_bridge = NULL; BRIDGE_XLOCK(sc); @@ -1816,10 +1846,12 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, KASSERT(bif->bif_addrcnt == 0, ("%s: %d bridge routes referenced", __func__, bif->bif_addrcnt)); +#if BRIDGESTP BRIDGE_UNLOCK(sc); bstp_destroy(&bif->bif_stp); /* prepare to free */ BRIDGE_LOCK(sc); - +#endif /* BRIDGESTP */ + if (bif->bif_filter_attached) { /* Respect lock ordering with DLIL lock */ BRIDGE_UNLOCK(sc); @@ -1874,9 +1906,9 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) if (TAILQ_EMPTY(&sc->sc_iflist)) 
sc->sc_ifp->if_mtu = ifs->if_mtu; else if (sc->sc_ifp->if_mtu != ifs->if_mtu) { - printf("%s%d: invalid MTU for %s%d", - ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), - ifnet_name(ifs), ifnet_unit(ifs)); + printf("%s: %s%d: invalid MTU for %s%d", __func__, + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), + ifnet_name(ifs), ifnet_unit(ifs)); return (EINVAL); } } @@ -1887,7 +1919,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) if (ifs->if_bridge != NULL) return (EBUSY); - bif = _MALLOC(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); + bif = _MALLOC(sizeof (*bif), M_DEVBUF, M_NOWAIT|M_ZERO); if (bif == NULL) return (ENOMEM); @@ -1901,7 +1933,9 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) ifnet_reference(ifs); ifs->if_bridge = sc; +#if BRIDGESTP bstp_create(&sc->sc_stp, &bif->bif_stp, bif->bif_ifp); +#endif /* BRIDGESTP */ /* * XXX: XLOCK HERE!?! */ @@ -1912,7 +1946,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) bridge_mutecaps(sc); #endif /* HAS_IF_CAP */ - + switch (ifs->if_type) { case IFT_ETHER: case IFT_L2VLAN: @@ -1946,7 +1980,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) /* * install an interface filter */ - memset(&iff, 0, sizeof(struct iff_filter)); + memset(&iff, 0, sizeof (struct iff_filter)); iff.iff_cookie = bif; iff.iff_name = "com.apple.kernel.bsd.net.if_bridge"; iff.iff_input = bridge_iff_input; @@ -1957,7 +1991,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) iff.iff_detached = bridge_iff_detached; error = iflt_attach(ifs, &iff, &bif->bif_iff_ref); if (error != 0) { - printf("bridge_ioctl_add: iflt_attach failed %d\n", error); + printf("%s: iflt_attach failed %d\n", __func__, error); BRIDGE_LOCK(sc); goto out; } @@ -1968,7 +2002,8 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) */ if ((error = bridge_attach_protocol(ifs)) != 0) { if (error != 0) { - printf("bridge_ioctl_add: bridge_attach_protocol failed %d\n", error); + printf("%s: bridge_attach_protocol failed %d\n", + __func__, error); BRIDGE_LOCK(sc); goto out; } @@ -1980,7 +2015,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) out: if (error && bif != NULL) bridge_delete_member(sc, bif, 1); - + return (error); } @@ -2001,7 +2036,7 @@ bridge_ioctl_del(struct bridge_softc *sc, void *arg) static int bridge_ioctl_purge(__unused struct bridge_softc *sc, __unused void *arg) -{ +{ return (0); } @@ -2050,19 +2085,21 @@ bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; +#if BRIDGESTP struct bstp_port *bp; int error; +#endif /* BRIDGESTP */ bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); - bp = &bif->bif_stp; if (req->ifbr_ifsflags & IFBIF_SPAN) /* SPAN is readonly */ return (EINVAL); - + +#if BRIDGESTP if (req->ifbr_ifsflags & IFBIF_STP) { if ((bif->bif_flags & IFBIF_STP) == 0) { error = bstp_enable(&bif->bif_stp); @@ -2075,10 +2112,15 @@ bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg) } /* Pass on STP flags */ + bp = &bif->bif_stp; bstp_set_edge(bp, req->ifbr_ifsflags & IFBIF_BSTP_EDGE ? 1 : 0); bstp_set_autoedge(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOEDGE ? 1 : 0); bstp_set_ptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_PTP ? 1 : 0); bstp_set_autoptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOPTP ? 
1 : 0); +#else /* !BRIDGESTP */ + if (req->ifbr_ifsflags & IFBIF_STP) + return (EOPNOTSUPP); +#endif /* !BRIDGESTP */ /* Save the bits relating to the bridge */ bif->bif_flags = req->ifbr_ifsflags & IFBIFMASK; @@ -2110,64 +2152,66 @@ bridge_ioctl_gcache(struct bridge_softc *sc, void *arg) #define BRIDGE_IOCTL_GIFS do { \ - struct bridge_iflist *bif; \ - struct ifbreq breq; \ - char *buf, *outbuf; \ - unsigned int count, buflen, len; \ - \ - count = 0; \ - TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) \ - count++; \ - TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) \ - count++; \ - \ - buflen = sizeof(breq) * count; \ - if (bifc->ifbic_len == 0) { \ - bifc->ifbic_len = buflen; \ - return (0); \ - } \ - BRIDGE_UNLOCK(sc); \ - outbuf = _MALLOC(buflen, M_TEMP, M_WAITOK | M_ZERO); \ - BRIDGE_LOCK(sc); \ - \ - count = 0; \ - buf = outbuf; \ - len = min(bifc->ifbic_len, buflen); \ - bzero(&breq, sizeof(breq)); \ - TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { \ - if (len < sizeof(breq)) \ - break; \ - \ - snprintf(breq.ifbr_ifsname, sizeof(breq.ifbr_ifsname), "%s%d", \ - ifnet_name(bif->bif_ifp), ifnet_unit(bif->bif_ifp)); \ - /* Fill in the ifbreq structure */ \ - error = bridge_ioctl_gifflags(sc, &breq); \ - if (error) \ - break; \ - memcpy(buf, &breq, sizeof(breq)); \ - count++; \ - buf += sizeof(breq); \ - len -= sizeof(breq); \ - } \ - TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) { \ - if (len < sizeof(breq)) \ - break; \ - \ - snprintf(breq.ifbr_ifsname, sizeof(breq.ifbr_ifsname), "%s%d", \ - ifnet_name(bif->bif_ifp), ifnet_unit(bif->bif_ifp)); \ - breq.ifbr_ifsflags = bif->bif_flags; \ - breq.ifbr_portno = bif->bif_ifp->if_index & 0xfff; \ - memcpy(buf, &breq, sizeof(breq)); \ - count++; \ - buf += sizeof(breq); \ - len -= sizeof(breq); \ - } \ - \ - BRIDGE_UNLOCK(sc); \ - bifc->ifbic_len = sizeof(breq) * count; \ - error = copyout(outbuf, bifc->ifbic_req, bifc->ifbic_len); \ - BRIDGE_LOCK(sc); \ - _FREE(outbuf, M_TEMP); \ + struct bridge_iflist *bif; \ + struct ifbreq breq; \ + char *buf, *outbuf; \ + unsigned int count, buflen, len; \ + \ + count = 0; \ + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) \ + count++; \ + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) \ + count++; \ + \ + buflen = sizeof (breq) * count; \ + if (bifc->ifbic_len == 0) { \ + bifc->ifbic_len = buflen; \ + return (0); \ + } \ + BRIDGE_UNLOCK(sc); \ + outbuf = _MALLOC(buflen, M_TEMP, M_WAITOK | M_ZERO); \ + BRIDGE_LOCK(sc); \ + \ + count = 0; \ + buf = outbuf; \ + len = min(bifc->ifbic_len, buflen); \ + bzero(&breq, sizeof (breq)); \ + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { \ + if (len < sizeof (breq)) \ + break; \ + \ + snprintf(breq.ifbr_ifsname, sizeof (breq.ifbr_ifsname), \ + "%s%d", ifnet_name(bif->bif_ifp), \ + ifnet_unit(bif->bif_ifp)); \ + /* Fill in the ifbreq structure */ \ + error = bridge_ioctl_gifflags(sc, &breq); \ + if (error) \ + break; \ + memcpy(buf, &breq, sizeof (breq)); \ + count++; \ + buf += sizeof (breq); \ + len -= sizeof (breq); \ + } \ + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) { \ + if (len < sizeof (breq)) \ + break; \ + \ + snprintf(breq.ifbr_ifsname, sizeof (breq.ifbr_ifsname), \ + "%s%d", ifnet_name(bif->bif_ifp), \ + ifnet_unit(bif->bif_ifp)); \ + breq.ifbr_ifsflags = bif->bif_flags; \ + breq.ifbr_portno = bif->bif_ifp->if_index & 0xfff; \ + memcpy(buf, &breq, sizeof (breq)); \ + count++; \ + buf += sizeof (breq); \ + len -= sizeof (breq); \ + } \ + \ + BRIDGE_UNLOCK(sc); \ + bifc->ifbic_len = sizeof (breq) * count; \ + error = copyout(outbuf, bifc->ifbic_req, 
bifc->ifbic_len); \ + BRIDGE_LOCK(sc); \ + _FREE(outbuf, M_TEMP); \ } while (0) static int @@ -2175,7 +2219,7 @@ bridge_ioctl_gifs64(struct bridge_softc *sc, void *arg) { struct ifbifconf64 *bifc = arg; int error = 0; - + BRIDGE_IOCTL_GIFS; return (error); @@ -2193,55 +2237,57 @@ bridge_ioctl_gifs32(struct bridge_softc *sc, void *arg) } -#define BRIDGE_IOCTL_RTS do { \ - struct bridge_rtnode *brt; \ - char *buf, *outbuf; \ - unsigned int count, buflen, len; \ - struct timespec now; \ - \ - if (bac->ifbac_len == 0) \ - return (0); \ - \ - count = 0; \ - LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) \ - count++; \ - buflen = sizeof(bareq) * count; \ - \ - BRIDGE_UNLOCK(sc); \ - outbuf = _MALLOC(buflen, M_TEMP, M_WAITOK | M_ZERO); \ - BRIDGE_LOCK(sc); \ - \ - count = 0; \ - buf = outbuf; \ - len = min(bac->ifbac_len, buflen); \ - bzero(&bareq, sizeof(bareq)); \ - LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { \ - if (len < sizeof(bareq)) \ - goto out; \ - snprintf(bareq.ifba_ifsname, sizeof(bareq.ifba_ifsname), "%s%d", \ - ifnet_name(brt->brt_ifp), ifnet_unit(brt->brt_ifp)); \ - memcpy(bareq.ifba_dst, brt->brt_addr, sizeof(brt->brt_addr)); \ - bareq.ifba_vlan = brt->brt_vlan; \ - if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { \ - nanouptime(&now); \ - if ((unsigned long)now.tv_sec < brt->brt_expire) \ - bareq.ifba_expire = brt->brt_expire - now.tv_sec; \ - } else \ - bareq.ifba_expire = 0; \ - bareq.ifba_flags = brt->brt_flags; \ - \ - memcpy(buf, &bareq, sizeof(bareq)); \ - count++; \ - buf += sizeof(bareq); \ - len -= sizeof(bareq); \ - } \ -out: \ - BRIDGE_UNLOCK(sc); \ - bac->ifbac_len = sizeof(bareq) * count; \ - error = copyout(outbuf, bac->ifbac_req, bac->ifbac_len); \ - BRIDGE_LOCK(sc); \ - _FREE(outbuf, M_TEMP); \ - return (error); \ +#define BRIDGE_IOCTL_RTS do { \ + struct bridge_rtnode *brt; \ + char *buf, *outbuf; \ + unsigned int count, buflen, len; \ + struct timespec now; \ + \ + if (bac->ifbac_len == 0) \ + return (0); \ + \ + count = 0; \ + LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) \ + count++; \ + buflen = sizeof (bareq) * count; \ + \ + BRIDGE_UNLOCK(sc); \ + outbuf = _MALLOC(buflen, M_TEMP, M_WAITOK | M_ZERO); \ + BRIDGE_LOCK(sc); \ + \ + count = 0; \ + buf = outbuf; \ + len = min(bac->ifbac_len, buflen); \ + bzero(&bareq, sizeof (bareq)); \ + LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { \ + if (len < sizeof (bareq)) \ + goto out; \ + snprintf(bareq.ifba_ifsname, sizeof (bareq.ifba_ifsname), \ + "%s%d", ifnet_name(brt->brt_ifp), \ + ifnet_unit(brt->brt_ifp)); \ + memcpy(bareq.ifba_dst, brt->brt_addr, sizeof (brt->brt_addr)); \ + bareq.ifba_vlan = brt->brt_vlan; \ + if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { \ + nanouptime(&now); \ + if ((unsigned long)now.tv_sec < brt->brt_expire) \ + bareq.ifba_expire = \ + brt->brt_expire - now.tv_sec; \ + } else \ + bareq.ifba_expire = 0; \ + bareq.ifba_flags = brt->brt_flags; \ + \ + memcpy(buf, &bareq, sizeof (bareq)); \ + count++; \ + buf += sizeof (bareq); \ + len -= sizeof (bareq); \ + } \ +out: \ + BRIDGE_UNLOCK(sc); \ + bac->ifbac_len = sizeof (bareq) * count; \ + error = copyout(outbuf, bac->ifbac_req, bac->ifbac_len); \ + BRIDGE_LOCK(sc); \ + _FREE(outbuf, M_TEMP); \ + return (error); \ } while (0) static int @@ -2250,7 +2296,7 @@ bridge_ioctl_rts64(struct bridge_softc *sc, void *arg) struct ifbaconf64 *bac = arg; struct ifbareq64 bareq; int error = 0; - + BRIDGE_IOCTL_RTS; return (error); @@ -2262,7 +2308,7 @@ bridge_ioctl_rts32(struct bridge_softc *sc, void *arg) struct ifbaconf32 *bac = arg; struct 
ifbareq32 bareq; int error = 0; - + BRIDGE_IOCTL_RTS; return (error); @@ -2358,9 +2404,14 @@ bridge_ioctl_gpri(struct bridge_softc *sc, void *arg) static int bridge_ioctl_spri(struct bridge_softc *sc, void *arg) { +#if BRIDGESTP struct ifbrparam *param = arg; return (bstp_set_priority(&sc->sc_stp, param->ifbrp_prio)); +#else /* !BRIDGESTP */ +#pragma unused(sc, arg) + return (EOPNOTSUPP); +#endif /* !BRIDGESTP */ } static int @@ -2376,9 +2427,14 @@ bridge_ioctl_ght(struct bridge_softc *sc, void *arg) static int bridge_ioctl_sht(struct bridge_softc *sc, void *arg) { +#if BRIDGESTP struct ifbrparam *param = arg; return (bstp_set_htime(&sc->sc_stp, param->ifbrp_hellotime)); +#else /* !BRIDGESTP */ +#pragma unused(sc, arg) + return (EOPNOTSUPP); +#endif /* !BRIDGESTP */ } static int @@ -2394,9 +2450,14 @@ bridge_ioctl_gfd(struct bridge_softc *sc, void *arg) static int bridge_ioctl_sfd(struct bridge_softc *sc, void *arg) { +#if BRIDGESTP struct ifbrparam *param = arg; return (bstp_set_fdelay(&sc->sc_stp, param->ifbrp_fwddelay)); +#else /* !BRIDGESTP */ +#pragma unused(sc, arg) + return (EOPNOTSUPP); +#endif /* !BRIDGESTP */ } static int @@ -2412,14 +2473,20 @@ bridge_ioctl_gma(struct bridge_softc *sc, void *arg) static int bridge_ioctl_sma(struct bridge_softc *sc, void *arg) { +#if BRIDGESTP struct ifbrparam *param = arg; return (bstp_set_maxage(&sc->sc_stp, param->ifbrp_maxage)); +#else /* !BRIDGESTP */ +#pragma unused(sc, arg) + return (EOPNOTSUPP); +#endif /* !BRIDGESTP */ } static int bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg) { +#if BRIDGESTP struct ifbreq *req = arg; struct bridge_iflist *bif; @@ -2428,11 +2495,16 @@ bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg) return (ENOENT); return (bstp_set_port_priority(&bif->bif_stp, req->ifbr_priority)); +#else /* !BRIDGESTP */ +#pragma unused(sc, arg) + return (EOPNOTSUPP); +#endif /* !BRIDGESTP */ } static int bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg) { +#if BRIDGESTP struct ifbreq *req = arg; struct bridge_iflist *bif; @@ -2441,6 +2513,10 @@ bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg) return (ENOENT); return (bstp_set_path_cost(&bif->bif_stp, req->ifbr_path_cost)); +#else /* !BRIDGESTP */ +#pragma unused(sc, arg) + return (EOPNOTSUPP); +#endif /* !BRIDGESTP */ } static int @@ -2512,7 +2588,7 @@ bridge_ioctl_addspan(struct bridge_softc *sc, void *arg) return (EINVAL); } - bif = _MALLOC(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); + bif = _MALLOC(sizeof (*bif), M_DEVBUF, M_NOWAIT|M_ZERO); if (bif == NULL) return (ENOMEM); @@ -2549,29 +2625,29 @@ bridge_ioctl_delspan(struct bridge_softc *sc, void *arg) return (0); } -#define BRIDGE_IOCTL_GBPARAM do { \ - struct bstp_state *bs = &sc->sc_stp; \ - struct bstp_port *root_port; \ - \ - req->ifbop_maxage = bs->bs_bridge_max_age >> 8; \ - req->ifbop_hellotime = bs->bs_bridge_htime >> 8; \ - req->ifbop_fwddelay = bs->bs_bridge_fdelay >> 8; \ - \ - root_port = bs->bs_root_port; \ - if (root_port == NULL) \ - req->ifbop_root_port = 0; \ - else \ - req->ifbop_root_port = root_port->bp_ifp->if_index; \ - \ - req->ifbop_holdcount = bs->bs_txholdcount; \ - req->ifbop_priority = bs->bs_bridge_priority; \ - req->ifbop_protocol = bs->bs_protover; \ - req->ifbop_root_path_cost = bs->bs_root_pv.pv_cost; \ - req->ifbop_bridgeid = bs->bs_bridge_pv.pv_dbridge_id; \ - req->ifbop_designated_root = bs->bs_root_pv.pv_root_id; \ - req->ifbop_designated_bridge = bs->bs_root_pv.pv_dbridge_id; \ - req->ifbop_last_tc_time.tv_sec = bs->bs_last_tc_time.tv_sec; \ - 
req->ifbop_last_tc_time.tv_usec = bs->bs_last_tc_time.tv_usec; \ +#define BRIDGE_IOCTL_GBPARAM do { \ + struct bstp_state *bs = &sc->sc_stp; \ + struct bstp_port *root_port; \ + \ + req->ifbop_maxage = bs->bs_bridge_max_age >> 8; \ + req->ifbop_hellotime = bs->bs_bridge_htime >> 8; \ + req->ifbop_fwddelay = bs->bs_bridge_fdelay >> 8; \ + \ + root_port = bs->bs_root_port; \ + if (root_port == NULL) \ + req->ifbop_root_port = 0; \ + else \ + req->ifbop_root_port = root_port->bp_ifp->if_index; \ + \ + req->ifbop_holdcount = bs->bs_txholdcount; \ + req->ifbop_priority = bs->bs_bridge_priority; \ + req->ifbop_protocol = bs->bs_protover; \ + req->ifbop_root_path_cost = bs->bs_root_pv.pv_cost; \ + req->ifbop_bridgeid = bs->bs_bridge_pv.pv_dbridge_id; \ + req->ifbop_designated_root = bs->bs_root_pv.pv_root_id; \ + req->ifbop_designated_bridge = bs->bs_root_pv.pv_dbridge_id; \ + req->ifbop_last_tc_time.tv_sec = bs->bs_last_tc_time.tv_sec; \ + req->ifbop_last_tc_time.tv_usec = bs->bs_last_tc_time.tv_usec; \ } while (0) static int @@ -2580,7 +2656,7 @@ bridge_ioctl_gbparam32(struct bridge_softc *sc, void *arg) struct ifbropreq32 *req = arg; BRIDGE_IOCTL_GBPARAM; - + return (0); } @@ -2594,7 +2670,6 @@ bridge_ioctl_gbparam64(struct bridge_softc *sc, void *arg) return (0); } - static int bridge_ioctl_grte(struct bridge_softc *sc, void *arg) { @@ -2604,60 +2679,60 @@ bridge_ioctl_grte(struct bridge_softc *sc, void *arg) return (0); } -#define BRIDGE_IOCTL_GIFSSTP do { \ - struct bridge_iflist *bif; \ - struct bstp_port *bp; \ - struct ifbpstpreq bpreq; \ - char *buf, *outbuf; \ - unsigned int count, buflen, len; \ - \ - count = 0; \ - TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { \ - if ((bif->bif_flags & IFBIF_STP) != 0) \ - count++; \ - } \ - \ - buflen = sizeof(bpreq) * count; \ - if (bifstp->ifbpstp_len == 0) { \ - bifstp->ifbpstp_len = buflen; \ - return (0); \ - } \ - \ - BRIDGE_UNLOCK(sc); \ - outbuf = _MALLOC(buflen, M_TEMP, M_WAITOK | M_ZERO); \ - BRIDGE_LOCK(sc); \ - \ - count = 0; \ - buf = outbuf; \ - len = min(bifstp->ifbpstp_len, buflen); \ - bzero(&bpreq, sizeof(bpreq)); \ - TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { \ - if (len < sizeof(bpreq)) \ - break; \ - \ - if ((bif->bif_flags & IFBIF_STP) == 0) \ - continue; \ - \ - bp = &bif->bif_stp; \ - bpreq.ifbp_portno = bif->bif_ifp->if_index & 0xfff; \ - bpreq.ifbp_fwd_trans = bp->bp_forward_transitions; \ - bpreq.ifbp_design_cost = bp->bp_desg_pv.pv_cost; \ - bpreq.ifbp_design_port = bp->bp_desg_pv.pv_port_id; \ +#define BRIDGE_IOCTL_GIFSSTP do { \ + struct bridge_iflist *bif; \ + struct bstp_port *bp; \ + struct ifbpstpreq bpreq; \ + char *buf, *outbuf; \ + unsigned int count, buflen, len; \ + \ + count = 0; \ + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { \ + if ((bif->bif_flags & IFBIF_STP) != 0) \ + count++; \ + } \ + \ + buflen = sizeof (bpreq) * count; \ + if (bifstp->ifbpstp_len == 0) { \ + bifstp->ifbpstp_len = buflen; \ + return (0); \ + } \ + \ + BRIDGE_UNLOCK(sc); \ + outbuf = _MALLOC(buflen, M_TEMP, M_WAITOK | M_ZERO); \ + BRIDGE_LOCK(sc); \ + \ + count = 0; \ + buf = outbuf; \ + len = min(bifstp->ifbpstp_len, buflen); \ + bzero(&bpreq, sizeof (bpreq)); \ + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { \ + if (len < sizeof (bpreq)) \ + break; \ + \ + if ((bif->bif_flags & IFBIF_STP) == 0) \ + continue; \ + \ + bp = &bif->bif_stp; \ + bpreq.ifbp_portno = bif->bif_ifp->if_index & 0xfff; \ + bpreq.ifbp_fwd_trans = bp->bp_forward_transitions; \ + bpreq.ifbp_design_cost = bp->bp_desg_pv.pv_cost; \ + bpreq.ifbp_design_port = 
bp->bp_desg_pv.pv_port_id; \ bpreq.ifbp_design_bridge = bp->bp_desg_pv.pv_dbridge_id; \ - bpreq.ifbp_design_root = bp->bp_desg_pv.pv_root_id; \ - \ - memcpy(buf, &bpreq, sizeof(bpreq)); \ - count++; \ - buf += sizeof(bpreq); \ - len -= sizeof(bpreq); \ - } \ - \ - BRIDGE_UNLOCK(sc); \ - bifstp->ifbpstp_len = sizeof(bpreq) * count; \ + bpreq.ifbp_design_root = bp->bp_desg_pv.pv_root_id; \ + \ + memcpy(buf, &bpreq, sizeof (bpreq)); \ + count++; \ + buf += sizeof (bpreq); \ + len -= sizeof (bpreq); \ + } \ + \ + BRIDGE_UNLOCK(sc); \ + bifstp->ifbpstp_len = sizeof (bpreq) * count; \ error = copyout(outbuf, bifstp->ifbpstp_req, bifstp->ifbpstp_len); \ - BRIDGE_LOCK(sc); \ - _FREE(outbuf, M_TEMP); \ - return (error); \ + BRIDGE_LOCK(sc); \ + _FREE(outbuf, M_TEMP); \ + return (error); \ } while (0) static int @@ -2685,17 +2760,27 @@ bridge_ioctl_gifsstp64(struct bridge_softc *sc, void *arg) static int bridge_ioctl_sproto(struct bridge_softc *sc, void *arg) { +#if BRIDGESTP struct ifbrparam *param = arg; return (bstp_set_protocol(&sc->sc_stp, param->ifbrp_proto)); +#else /* !BRIDGESTP */ +#pragma unused(sc, arg) + return (EOPNOTSUPP); +#endif /* !BRIDGESTP */ } static int bridge_ioctl_stxhc(struct bridge_softc *sc, void *arg) { +#if BRIDGESTP struct ifbrparam *param = arg; return (bstp_set_holdcount(&sc->sc_stp, param->ifbrp_txhc)); +#else /* !BRIDGESTP */ +#pragma unused(sc, arg) + return (EOPNOTSUPP); +#endif /* !BRIDGESTP */ } /* @@ -2710,7 +2795,7 @@ bridge_ifdetach(struct bridge_iflist *bif, struct ifnet *ifp) struct bridge_softc *sc = ifp->if_bridge; #if BRIDGE_DEBUG - printf("bridge_ifdetach %s%d\n", ifnet_name(ifp), ifnet_unit(ifp)); + printf("%s: %s%d\n", __func__, ifnet_name(ifp), ifnet_unit(ifp)); #endif /* Check if the interface is a bridge member */ @@ -2755,26 +2840,28 @@ bridge_init(struct ifnet *ifp) BRIDGE_LOCK_ASSERT(sc); if ((ifnet_flags(ifp) & IFF_RUNNING)) - return 0; + return (0); ts.tv_sec = bridge_rtable_prune_period; ts.tv_nsec = 0; bsd_timeout(bridge_timer, sc, &ts); error = ifnet_set_flags(ifp, IFF_RUNNING, IFF_RUNNING); +#if BRIDGESTP if (error == 0) bstp_init(&sc->sc_stp); /* Initialize Spanning Tree */ +#endif /* BRIDGESTP */ - return error; + return (error); } /* - * bridge_stop: + * bridge_ifstop: * * Stop the bridge interface. */ static void -bridge_stop(struct ifnet *ifp, __unused int disable) +bridge_ifstop(struct ifnet *ifp, __unused int disable) { struct bridge_softc *sc = ifp->if_softc; @@ -2784,7 +2871,9 @@ bridge_stop(struct ifnet *ifp, __unused int disable) return; bsd_untimeout(bridge_timer, sc); +#if BRIDGESTP bstp_stop(&sc->sc_stp); +#endif /* BRIDGESTP */ bridge_rtflush(sc, IFBF_FLUSHDYN); @@ -2797,21 +2886,30 @@ bridge_stop(struct ifnet *ifp, __unused int disable) * Enqueue a packet on a bridge member interface. * */ -static void +static int bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m) { int len, error = 0; short mflags; struct mbuf *m0; - /* We may be sending a fragment so traverse the mbuf */ + VERIFY(dst_ifp != NULL); + + /* + * We may be sending a fragment so traverse the mbuf + * + * NOTE: bridge_fragment() is called only when PFIL_HOOKS is enabled. 
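The STP ioctl handlers above all follow one conditional-compilation template: call into bstp when BRIDGESTP is built in, and reject uniformly when it is compiled out. A minimal sketch of that template, where bridge_ioctl_sfoo, bstp_set_foo and ifbrp_foo are hypothetical names standing in for any new handler:

	static int
	bridge_ioctl_sfoo(struct bridge_softc *sc, void *arg)
	{
	#if BRIDGESTP
		struct ifbrparam *param = arg;

		/* forward the request to the spanning-tree code */
		return (bstp_set_foo(&sc->sc_stp, param->ifbrp_foo));
	#else /* !BRIDGESTP */
	#pragma unused(sc, arg)
		/* STP compiled out: refuse the request rather than panic */
		return (EOPNOTSUPP);
	#endif /* !BRIDGESTP */
	}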
+ */ for (; m; m = m0) { + errno_t _error; + struct flowadv adv = { FADV_SUCCESS }; + m0 = m->m_nextpkt; m->m_nextpkt = NULL; len = m->m_pkthdr.len; mflags = m->m_flags; - m->m_flags |= M_PROTO1; //set to avoid loops + m->m_flags |= M_PROTO1; /* set to avoid loops */ #if HAS_IF_CAP /* @@ -2822,32 +2920,45 @@ bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m) (dst_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) { m = ether_vlanencap(m, m->m_pkthdr.ether_vtag); if (m == NULL) { - printf("%s%d: unable to prepend VLAN header\n", - ifnet_name(dst_ifp), ifnet_unit(dst_ifp)); - (void) ifnet_stat_increment_out(dst_ifp, 0, 0, 1); + printf("%s: %s%d: unable to prepend VLAN " + "header\n", __func__, ifnet_name(dst_ifp), + ifnet_unit(dst_ifp)); + (void) ifnet_stat_increment_out(dst_ifp, + 0, 0, 1); continue; } m->m_flags &= ~M_VLANTAG; } #endif /* HAS_IF_CAP */ - error = ifnet_output_raw(dst_ifp, 0, m); + _error = dlil_output(dst_ifp, 0, m, NULL, NULL, 1, &adv); + + /* Preserve existing error value */ if (error == 0) { + if (_error != 0) + error = _error; + else if (adv.code == FADV_FLOW_CONTROLLED) + error = EQFULL; + else if (adv.code == FADV_SUSPENDED) + error = EQSUSPENDED; + } + + if (_error == 0) { (void) ifnet_stat_increment_out(sc->sc_ifp, 1, len, 0); } else { (void) ifnet_stat_increment_out(sc->sc_ifp, 0, 0, 1); } } - return; + return (error); } #if HAS_BRIDGE_DUMMYNET /* * bridge_dummynet: * - * Receive a queued packet from dummynet and pass it on to the output - * interface. + * Receive a queued packet from dummynet and pass it on to the output + * interface. * * The mbuf has the Ethernet header already attached. */ @@ -2879,13 +2990,13 @@ bridge_dummynet(struct mbuf *m, struct ifnet *ifp) return; } - bridge_enqueue(sc, ifp, m); + (void) bridge_enqueue(sc, ifp, m); } #endif /* HAS_BRIDGE_DUMMYNET */ #if BRIDGE_MEMBER_OUT_FILTER /* - * bridge_output: + * bridge_member_output: * * Send output from a bridge member interface. This * performs the bridging function for locally originated @@ -2895,8 +3006,8 @@ bridge_dummynet(struct mbuf *m, struct ifnet *ifp) * enqueue or free the mbuf before returning. */ static int -bridge_output(struct ifnet *ifp, struct mbuf *m, __unused struct sockaddr *sa, - __unused struct rtentry *rt) +bridge_member_output(struct ifnet *ifp, struct mbuf *m, + __unused struct sockaddr *sa, __unused struct rtentry *rt) { struct ether_header *eh; struct ifnet *dst_if; @@ -2904,10 +3015,11 @@ bridge_output(struct ifnet *ifp, struct mbuf *m, __unused struct sockaddr *sa, uint16_t vlan; #if BRIDGE_DEBUG - if (_if_brige_debug) - printf("bridge_output ifp %p %s%d\n", ifp, ifnet_name(ifp), ifnet_unit(ifp)); + if (if_bridge_debug) + printf("%s: ifp %p %s%d\n", __func__, ifp, ifnet_name(ifp), + ifnet_unit(ifp)); #endif /* BRIDGE_DEBUG */ - + if (m->m_len < ETHER_HDR_LEN) { m = m_pullup(m, ETHER_HDR_LEN); if (m == NULL) @@ -2920,7 +3032,8 @@ bridge_output(struct ifnet *ifp, struct mbuf *m, __unused struct sockaddr *sa, BRIDGE_LOCK(sc); - /* APPLE MODIFICATION + /* + * APPLE MODIFICATION * If the packet is an 802.1X ethertype, then only send on the * original output interface. */ @@ -2928,7 +3041,7 @@ bridge_output(struct ifnet *ifp, struct mbuf *m, __unused struct sockaddr *sa, dst_if = ifp; goto sendunicast; } - + /* * If bridge is down, but the original output interface is up, * go ahead and send out that interface. 
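bridge_enqueue() now reports back-pressure: it hands each packet to dlil_output() with a struct flowadv and folds the advisory into an errno. A sketch of how a caller might act on those values; bridge_transmit_one() and queue_packet_later() are hypothetical, and the note that the mbuf has already been consumed on EQFULL/EQSUSPENDED follows the flow-advisory convention rather than anything stated in this patch:

	static void
	bridge_transmit_one(struct bridge_softc *sc, struct ifnet *dst_ifp,
	    struct mbuf *m)
	{
		int error = bridge_enqueue(sc, dst_ifp, m);

		switch (error) {
		case 0:
			break;			/* accepted by the driver */
		case EQFULL:			/* FADV_FLOW_CONTROLLED */
		case EQSUSPENDED:		/* FADV_SUSPENDED */
			/* mbuf already handed off; only pace the sender */
			queue_packet_later(sc);
			break;
		default:
			break;			/* hard dlil_output() error */
		}
	}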
Otherwise, the packet @@ -2984,12 +3097,13 @@ bridge_output(struct ifnet *ifp, struct mbuf *m, __unused struct sockaddr *sa, } else { mc = m_copypacket(m, M_DONTWAIT); if (mc == NULL) { - (void) ifnet_stat_increment_out(sc->sc_ifp, 0, 0, 1); + (void) ifnet_stat_increment_out( + sc->sc_ifp, 0, 0, 1); continue; } } - bridge_enqueue(sc, dst_if, mc); + (void) bridge_enqueue(sc, dst_if, mc); } if (used == 0) m_freem(m); @@ -3010,64 +3124,66 @@ sendunicast: } BRIDGE_UNLOCK(sc); - bridge_enqueue(sc, dst_if, m); + (void) bridge_enqueue(sc, dst_if, m); return (0); } #endif /* BRIDGE_MEMBER_OUT_FILTER */ #if APPLE_BRIDGE_HWCKSUM_SUPPORT -static struct mbuf* bridge_fix_txcsum( struct mbuf *m ) +static struct mbuf * +bridge_fix_txcsum(struct mbuf *m) { - // basic tests indicate that the vast majority of packets being processed - // here have an Ethernet header mbuf pre-pended to them (the first case below) - // the second highest are those where the Ethernet and IP/TCP/UDP headers are - // all in one mbuf (second case below) - // the third case has, in fact, never hit for me -- although if I comment out - // the first two cases, that code works for them, so I consider it a - // decent general solution - + /* + * basic tests indicate that the vast majority of packets being + * processed here have an Ethernet header mbuf pre-pended to them + * (the first case below) + * + * the second highest are those where the Ethernet and IP/TCP/UDP + * headers are all in one mbuf (second case below) + * + * the third case has, in fact, never hit for me -- although if I + * comment out the first two cases, that code works for them, so I + * consider it a decent general solution + */ int amt = ETHER_HDR_LEN; - int hlen = M_CSUM_DATA_IPv4_IPHL( m->m_pkthdr.csum_data ); - int off = M_CSUM_DATA_IPv4_OFFSET( m->m_pkthdr.csum_data ); - - /* + int hlen = M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data); + int off = M_CSUM_DATA_IPv4_OFFSET(m->m_pkthdr.csum_data); + + /* * NOTE we should never get vlan-attached packets here; * support for those COULD be added, but we don't use them * and it really kinda slows things down to worry about them */ - + #ifdef DIAGNOSTIC - if ( m_tag_find( m, PACKET_TAG_VLAN, NULL ) != NULL ) - { - printf( "bridge: transmitting packet tagged with VLAN?\n" ); - KASSERT( 0 ); - m_freem( m ); - return NULL; + if (m_tag_find(m, PACKET_TAG_VLAN, NULL) != NULL) { + printf("%s: transmitting packet tagged with VLAN?\n", __func__); + KASSERT(0); + m_freem(m); + return (NULL); } #endif - - if ( m->m_pkthdr.csum_flags & M_CSUM_IPv4 ) - { + + if (m->m_pkthdr.csum_flags & M_CSUM_IPv4) { amt += hlen; } - if ( m->m_pkthdr.csum_flags & M_CSUM_TCPv4 ) - { - amt += off + sizeof( uint16_t ); + if (m->m_pkthdr.csum_flags & M_CSUM_TCPv4) { + amt += off + sizeof (uint16_t); } - - if ( m->m_pkthdr.csum_flags & M_CSUM_UDPv4 ) - { - amt += off + sizeof( uint16_t ); + + if (m->m_pkthdr.csum_flags & M_CSUM_UDPv4) { + amt += off + sizeof (uint16_t); } - - if ( m->m_len == ETHER_HDR_LEN ) - { - // this is the case where there's an Ethernet header in an mbuf - - // the first mbuf is the Ethernet header -- just strip it off and do the checksum + + if (m->m_len == ETHER_HDR_LEN) { + /* + * this is the case where there's an Ethernet header in an + * mbuf the first mbuf is the Ethernet header -- just strip + * it off and do the checksum + */ + /* set up m_ip so the cksum operations work */ struct mbuf *m_ip = m->m_next; - - // set up m_ip so the cksum operations work + /* APPLE MODIFICATION 22 Apr 2008 * Clear the m_tag list before 
setting * M_PKTHDR. @@ -3097,298 +3213,306 @@ static struct mbuf* bridge_fix_txcsum( struct mbuf *m ) m_ip->m_pkthdr.csum_flags = m->m_pkthdr.csum_flags; m_ip->m_pkthdr.csum_data = m->m_pkthdr.csum_data; m_ip->m_pkthdr.len = m->m_pkthdr.len - ETHER_HDR_LEN; - - // set up the header mbuf so we can prepend it back on again later + + /* + * set up the header mbuf so we can prepend it + * back on again later + */ m->m_pkthdr.csum_flags = 0; m->m_pkthdr.csum_data = 0; m->m_pkthdr.len = ETHER_HDR_LEN; m->m_next = NULL; - - - // now do the checksums we need -- first IP - if ( m_ip->m_pkthdr.csum_flags & M_CSUM_IPv4 ) - { - // make sure the IP header (or at least the part with the cksum) is there - m_ip = m_pullup( m_ip, sizeof( struct ip ) ); - if ( m_ip == NULL ) - { - printf( "bridge: failed to flatten header\n "); - m_freem( m ); - return NULL; + + /* now do the checksums we need -- first IP */ + if (m_ip->m_pkthdr.csum_flags & M_CSUM_IPv4) { + /* + * make sure the IP header (or at least the part with + * the cksum) is there + */ + m_ip = m_pullup(m_ip, sizeof (struct ip)); + if (m_ip == NULL) { + printf("%s: failed to flatten header\n", + __func__); + m_freem(m); + return (NULL); } - - // now do the checksum + + /* now do the checksum */ { - struct ip *ip = mtod( m_ip, struct ip* ); - ip->ip_sum = in_cksum( m_ip, hlen ); - + struct ip *ip = mtod(m_ip, struct ip *); + ip->ip_sum = in_cksum(m_ip, hlen); + #ifdef VERY_VERY_VERY_DIAGNOSTIC - printf( "bridge: performed IPv4 checksum\n" ); + printf("%s: performed IPv4 checksum\n", + __func__); #endif } } - - // now do a TCP or UDP delayed checksum - if ( m_ip->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4) ) - { - in_delayed_cksum( m_ip ); - + + /* now do a TCP or UDP delayed checksum */ + if (m_ip->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { + in_delayed_cksum(m_ip); + #ifdef VERY_VERY_VERY_DIAGNOSTIC - printf( "bridge: performed TCPv4/UDPv4 checksum\n" ); + printf("%s: performed TCPv4/UDPv4 checksum\n", + __func__); #endif } - - // now attach the ethernet header back onto the IP packet + + /* now attach the ethernet header back onto the IP packet */ m->m_next = m_ip; - m->m_pkthdr.len += m_length( m_ip ); - - // clear the M_PKTHDR flags on the ip packet (again, we re-attach later) + m->m_pkthdr.len += m_length(m_ip); + + /* + * clear the M_PKTHDR flags on the ip packet (again, + * we re-attach later) + */ m_ip->m_flags &= ~M_PKTHDR; - - // and clear any csum flags - m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4); - } - else if ( m->m_len >= amt ) - { - // everything fits in the first mbuf, so futz with m->m_data, m->m_len and m->m_pkthdr.len to - // make it work + + /* and clear any csum flags */ + m->m_pkthdr.csum_flags &= + ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4); + } else if (m->m_len >= amt) { + /* + * everything fits in the first mbuf, so futz with + * m->m_data, m->m_len and m->m_pkthdr.len to make it work + */ m->m_len -= ETHER_HDR_LEN; m->m_data += ETHER_HDR_LEN; m->m_pkthdr.len -= ETHER_HDR_LEN; - - // now do the checksums we need -- first IP - if ( m->m_pkthdr.csum_flags & M_CSUM_IPv4 ) - { - struct ip *ip = mtod( m, struct ip* ); - ip->ip_sum = in_cksum( m, hlen ); - + + /* now do the checksums we need -- first IP */ + if (m->m_pkthdr.csum_flags & M_CSUM_IPv4) { + struct ip *ip = mtod(m, struct ip *); + ip->ip_sum = in_cksum(m, hlen); + #ifdef VERY_VERY_VERY_DIAGNOSTIC - printf( "bridge: performed IPv4 checksum\n" ); + printf("%s: performed IPv4 checksum\n", __func__); #endif } - + // now do a TCP or 
UDP delayed checksum - if ( m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4) ) - { - in_delayed_cksum( m ); - + if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { + in_delayed_cksum(m); + #ifdef VERY_VERY_VERY_DIAGNOSTIC - printf( "bridge: performed TCPv4/UDPv4 checksum\n" ); + printf("%s: performed TCPv4/UDPv4 checksum\n", + __func__); #endif } - - // now stick the ethernet header back on + + /* now stick the ethernet header back on */ m->m_len += ETHER_HDR_LEN; m->m_data -= ETHER_HDR_LEN; m->m_pkthdr.len += ETHER_HDR_LEN; - - // and clear any csum flags - m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4); - } - else - { + + /* and clear any csum flags */ + m->m_pkthdr.csum_flags &= + ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4); + } else { struct mbuf *m_ip; - - // general case -- need to simply split it off and deal - - // first, calculate how much needs to be made writable (we may have a read-only mbuf here) - hlen = M_CSUM_DATA_IPv4_IPHL( m->m_pkthdr.csum_data ); + + /* + * general case -- need to simply split it off and deal + * first, calculate how much needs to be made writable + * (we may have a read-only mbuf here) + */ + hlen = M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data); #if PARANOID - off = M_CSUM_DATA_IPv4_OFFSET( m->m_pkthdr.csum_data ); - - if ( m->m_pkthdr.csum_flags & M_CSUM_IPv4 ) - { + off = M_CSUM_DATA_IPv4_OFFSET(m->m_pkthdr.csum_data); + + if (m->m_pkthdr.csum_flags & M_CSUM_IPv4) { amt += hlen; } - - if ( m->m_pkthdr.csum_flags & M_CSUM_TCPv4 ) - { - amt += sizeof( struct tcphdr * ); + + if (m->m_pkthdr.csum_flags & M_CSUM_TCPv4) { + amt += sizeof (struct tcphdr *); amt += off; } - - if ( m->m_pkthdr.csum_flags & M_CSUM_UDPv4 ) - { - amt += sizeof( struct udphdr * ); + + if (m->m_pkthdr.csum_flags & M_CSUM_UDPv4) { + amt += sizeof (struct udphdr *); amt += off; } #endif - - // now split the ethernet header off of the IP packet (we'll re-attach later) - m_ip = m_split( m, ETHER_HDR_LEN, M_NOWAIT ); - if ( m_ip == NULL ) - { - printf( "bridge_fix_txcsum: could not split ether header\n" ); - - m_freem( m ); - return NULL; + + /* + * now split the ethernet header off of the IP packet + * (we'll re-attach later) + */ + m_ip = m_split(m, ETHER_HDR_LEN, M_NOWAIT); + if (m_ip == NULL) { + printf("%s: could not split ether header\n", __func__); + + m_freem(m); + return (NULL); } - + #if PARANOID - // make sure that the IP packet is writable for the portion we need - if ( m_makewritable( &m_ip, 0, amt, M_DONTWAIT ) != 0 ) - { - printf( "bridge_fix_txcsum: could not make %d bytes writable\n", amt ); - - m_freem( m ); - m_freem( m_ip ); - return NULL; + /* + * make sure that the IP packet is writable + * for the portion we need + */ + if (m_makewritable(&m_ip, 0, amt, M_DONTWAIT) != 0) { + printf("%s: could not make %d bytes writable\n", + __func__, amt); + + m_freem(m); + m_freem(m_ip); + return (NULL); } #endif - + m_ip->m_pkthdr.csum_flags = m->m_pkthdr.csum_flags; m_ip->m_pkthdr.csum_data = m->m_pkthdr.csum_data; - + m->m_pkthdr.csum_flags = 0; m->m_pkthdr.csum_data = 0; - - // now do the checksums we need -- first IP - if ( m_ip->m_pkthdr.csum_flags & M_CSUM_IPv4 ) - { - // make sure the IP header (or at least the part with the cksum) is there - m_ip = m_pullup( m_ip, sizeof( struct ip ) ); - if ( m_ip == NULL ) - { - printf( "bridge: failed to flatten header\n "); - m_freem( m ); - return NULL; + + /* now do the checksums we need -- first IP */ + if (m_ip->m_pkthdr.csum_flags & M_CSUM_IPv4) { + /* + * make sure the IP header (or at least the 
part + * with the cksum) is there + */ + m_ip = m_pullup(m_ip, sizeof (struct ip)); + if (m_ip == NULL) { + printf("%s: failed to flatten header\n", + __func__); + m_freem(m); + return (NULL); } - - // now do the checksum + + /* now do the checksum */ { - struct ip *ip = mtod( m_ip, struct ip* ); - ip->ip_sum = in_cksum( m_ip, hlen ); - + struct ip *ip = mtod(m_ip, struct ip *); + ip->ip_sum = in_cksum(m_ip, hlen); + #ifdef VERY_VERY_VERY_DIAGNOSTIC - printf( "bridge: performed IPv4 checksum\n" ); + printf("%s: performed IPv4 checksum\n", + __func__); #endif } } - - // now do a TCP or UDP delayed checksum - if ( m_ip->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4) ) - { - in_delayed_cksum( m_ip ); - + + /* now do a TCP or UDP delayed checksum */ + if (m_ip->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { + in_delayed_cksum(m_ip); + #ifdef VERY_VERY_VERY_DIAGNOSTIC - printf( "bridge: performed TCPv4/UDPv4 checksum\n" ); + printf("%s: performed TCPv4/UDPv4 checksum\n", + __func__); #endif } - + // now attach the ethernet header back onto the IP packet m->m_next = m_ip; - m->m_pkthdr.len += m_length( m_ip ); - - // clear the M_PKTHDR flags on the ip packet (again, we re-attach later) + m->m_pkthdr.len += m_length(m_ip); + + /* + * clear the M_PKTHDR flags on the ip packet + * (again, we re-attach later) + */ m_ip->m_flags &= ~M_PKTHDR; - - // and clear any csum flags - m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4); + + /* and clear any csum flags */ + m->m_pkthdr.csum_flags &= + ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4); } - - return m; + + return (m); } #endif /* - * bridge_start: - * - * Start output on a bridge. + * Output callback. * + * This routine is called externally from above only when if_bridge_txstart + * is disabled; otherwise it is called internally by bridge_start(). 
*/ -static errno_t -bridge_start(struct ifnet *ifp, struct mbuf *m) +static int +bridge_output(struct ifnet *ifp, struct mbuf *m) { struct bridge_softc *sc = ifnet_softc(ifp); struct ether_header *eh; struct ifnet *dst_if; - - lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + int error = 0; eh = mtod(m, struct ether_header *); - + dst_if = NULL; + BRIDGE_LOCK(sc); - - if ((m->m_flags & (M_BCAST|M_MCAST)) == 0 && - (dst_if = bridge_rtlookup(sc, eh->ether_dhost, 0)) != NULL) { - - { + if (!(m->m_flags & (M_BCAST|M_MCAST))) { + dst_if = bridge_rtlookup(sc, eh->ether_dhost, 0); + } + #if APPLE_BRIDGE_HWCKSUM_SUPPORT - /* - * APPLE MODIFICATION - if the packet needs a checksum (i.e., - * checksum has been deferred for HW support) AND the destination - * interface doesn't support HW checksums, then we - * need to fix-up the checksum here - */ - if ( - ( (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4) ) != 0 ) && - ( (dst_if->if_csum_flags_tx & m->m_pkthdr.csum_flags ) != m->m_pkthdr.csum_flags ) - ) - { - m = bridge_fix_txcsum( m ); - if ( m == NULL ) - { - goto done; - } - } - -#else - if (eh->ether_type == htons(ETHERTYPE_IP)) - mbuf_outbound_finalize(m, PF_INET, sizeof(struct ether_header)); - else - m->m_pkthdr.csum_flags = 0; -#endif - #if NBPFILTER > 0 - if (sc->sc_bpf_output) - bridge_bpf_output(ifp, m); - #endif + /* + * APPLE MODIFICATION - if the packet needs a checksum + * (i.e., checksum has been deferred for HW support) + * AND the destination interface doesn't support HW + * checksums, then we need to fix-up the checksum here + */ + if ((m->m_pkthdr.csum_flags & + (M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4)) && + (dst_if == NULL || + (dst_if->if_csum_flags_tx & m->m_pkthdr.csum_flags) != + m->m_pkthdr.csum_flags)) { + m = bridge_fix_txcsum(m); + if (m == NULL) { BRIDGE_UNLOCK(sc); - bridge_enqueue(sc, dst_if, m); - } - } else - { -#if APPLE_BRIDGE_HWCKSUM_SUPPORT - - /* - * APPLE MODIFICATION - if the MULTICAST packet needs a checksum (i.e., - * checksum has been deferred for HW support) AND at least one destination - * interface doesn't support HW checksums, then we go ahead and fix it up - * here, since it doesn't make sense to do it more than once - */ - - if ( - (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4)) && - /* - * XXX FIX ME: keep track of whether or not we have any interfaces that - * do not support checksums (for now, assume we do) - */ - ( 1 ) - ) - { - m = bridge_fix_txcsum( m ); - if ( m == NULL ) - { - goto done; - } + return (0); } + } #else - if (eh->ether_type == htons(ETHERTYPE_IP)) - mbuf_outbound_finalize(m, PF_INET, sizeof(struct ether_header)); - else - m->m_pkthdr.csum_flags = 0; + if (eh->ether_type == htons(ETHERTYPE_IP)) + mbuf_outbound_finalize(m, PF_INET, sizeof (*eh)); + else + m->m_pkthdr.csum_flags = 0; +#endif /* APPLE_BRIDGE_HWCKSUM_SUPPORT */ + + atomic_add_64(&ifp->if_obytes, m->m_pkthdr.len); + atomic_add_64(&ifp->if_opackets, 1); + +#if NBPFILTER > 0 + if (sc->sc_bpf_output) + bridge_bpf_output(ifp, m); #endif - - #if NBPFILTER > 0 - if (sc->sc_bpf_output) - bridge_bpf_output(ifp, m); - #endif + + if (dst_if == NULL) { + /* callee will unlock */ bridge_broadcast(sc, ifp, m, 0); + } else { + BRIDGE_UNLOCK(sc); + error = bridge_enqueue(sc, dst_if, m); } -#if APPLE_BRIDGE_HWCKSUM_SUPPORT -done: -#endif - return 0; + return (error); +} + +/* + * bridge_start: + * + * Start output on a bridge. 
+ * + * This routine is invoked by the start worker thread; because we never call + * it directly, there is no need to deploy any serialization mechanism other + * than what's already used by the worker thread, i.e. this is already single + * threaded. + * + * This routine is called only when if_bridge_txstart is enabled. + */ +static void +bridge_start(struct ifnet *ifp) +{ + struct mbuf *m; + + for (;;) { + if (ifnet_dequeue(ifp, &m) != 0) + break; + + (void) bridge_output(ifp, m); + } } /* @@ -3412,10 +3536,11 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); #if BRIDGE_DEBUG - if (_if_brige_debug) - printf("bridge_forward %s%d m%p\n", ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), m); + if (if_bridge_debug) + printf("%s: %s%d m%p\n", __func__, ifnet_name(sc->sc_ifp), + ifnet_unit(sc->sc_ifp), m); #endif /* BRIDGE_DEBUG */ - + src_if = m->m_pkthdr.rcvif; ifp = sc->sc_ifp; @@ -3488,10 +3613,10 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, */ #if NBPFILTER > 0 if (eh->ether_type == htons(ETHERTYPE_RSN_PREAUTH) || - dst_if != NULL || (m->m_flags & (M_BCAST | M_MCAST)) == 0) { - m->m_pkthdr.rcvif = ifp; + dst_if != NULL || (m->m_flags & (M_BCAST | M_MCAST)) == 0) { + m->m_pkthdr.rcvif = ifp; if (sc->sc_bpf_input) - bridge_bpf_input(ifp, m); + bridge_bpf_input(ifp, m); } #endif /* NBPFILTER */ @@ -3512,13 +3637,13 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, #endif /* PFIL_HOOKS */ if (dst_if == NULL) { - /* - * Clear any in-bound checksum flags for this packet. - */ + /* + * Clear any in-bound checksum flags for this packet. + */ mbuf_inbound_modified(m); - + bridge_broadcast(sc, src_if, m, 1); - + return; } @@ -3573,7 +3698,7 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, */ mbuf_inbound_modified(m); - bridge_enqueue(sc, dst_if, m); + (void) bridge_enqueue(sc, dst_if, m); return; drop: @@ -3583,15 +3708,15 @@ drop: #if BRIDGE_DEBUG -char * ether_ntop(char *, size_t , const u_char *); +char *ether_ntop(char *, size_t, const u_char *); __private_extern__ char * ether_ntop(char *buf, size_t len, const u_char *ap) { - snprintf(buf, len, "%02x:%02x:%02x:%02x:%02x:%02x", - ap[0], ap[1], ap[2], ap[3], ap[4], ap[5]); - - return buf; + snprintf(buf, len, "%02x:%02x:%02x:%02x:%02x:%02x", + ap[0], ap[1], ap[2], ap[3], ap[4], ap[5]); + + return (buf); } #endif /* BRIDGE_DEBUG */ @@ -3614,22 +3739,22 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, __unused void *frame_header) int error; #if BRIDGE_DEBUG - if (_if_brige_debug) - printf("bridge_input: %s%d from %s%d m %p data %p\n", - ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), - ifnet_name(ifp), ifnet_unit(ifp), - m, mbuf_data(m)); + if (if_bridge_debug) + printf("%s: %s%d from %s%d m %p data %p\n", __func__, + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), + ifnet_name(ifp), ifnet_unit(ifp), m, mbuf_data(m)); #endif /* BRIDGE_DEBUG */ if ((sc->sc_ifp->if_flags & IFF_RUNNING) == 0) { #if BRIDGE_DEBUG - if (_if_brige_debug) - printf( "bridge_input: %s%d not running passing along\n", - ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp)); + if (if_bridge_debug) + printf("%s: %s%d not running passing along\n", + __func__, ifnet_name(sc->sc_ifp), + ifnet_unit(sc->sc_ifp)); #endif /* BRIDGE_DEBUG */ - return 0; + return (0); } - + bifp = sc->sc_ifp; vlan = VLANTAGOF(m); @@ -3645,27 +3770,28 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, __unused void *frame_header) BRIDGE_BPF_MTAP_INPUT(sc, m); (void) 
ifnet_stat_increment_in(bifp, 1, m->m_pkthdr.len, 0); m_freem(m); - return EJUSTRETURN; + return (EJUSTRETURN); } #endif /* IFF_MONITOR */ - /* - * Need to clear the promiscous flags otherwise it will be - * dropped by DLIL after processing filters + /* + * Need to clear the promiscuous flags otherwise it will be + * dropped by DLIL after processing filters */ if ((mbuf_flags(m) & MBUF_PROMISC)) mbuf_setflags_mask(m, 0, MBUF_PROMISC); - + BRIDGE_LOCK(sc); bif = bridge_lookup_member_if(sc, ifp); if (bif == NULL) { BRIDGE_UNLOCK(sc); #if BRIDGE_DEBUG - if (_if_brige_debug) - printf( "bridge_input: %s%d bridge_lookup_member_if failed\n", - ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp)); + if (if_bridge_debug) + printf("%s: %s%d bridge_lookup_member_if failed\n", + __func__, ifnet_name(sc->sc_ifp), + ifnet_unit(sc->sc_ifp)); #endif /* BRIDGE_DEBUG */ - return 0; + return (0); } eh = mtod(m, struct ether_header *); @@ -3675,28 +3801,35 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, __unused void *frame_header) if (m->m_flags & (M_BCAST|M_MCAST)) { #if BRIDGE_DEBUG - if (_if_brige_debug) + if (if_bridge_debug) if ((m->m_flags & M_MCAST)) - printf("mulicast: %02x:%02x:%02x:%02x:%02x:%02x\n", - eh->ether_dhost[0], eh->ether_dhost[1], eh->ether_dhost[2], - eh->ether_dhost[3], eh->ether_dhost[4], eh->ether_dhost[5]); - + printf("%s: multicast: " + "%02x:%02x:%02x:%02x:%02x:%02x\n", + __func__, + eh->ether_dhost[0], eh->ether_dhost[1], + eh->ether_dhost[2], eh->ether_dhost[3], + eh->ether_dhost[4], eh->ether_dhost[5]); #endif /* BRIDGE_DEBUG */ /* Tap off 802.1D packets; they do not get forwarded. */ if (memcmp(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN) == 0) { +#if BRIDGESTP m = bstp_input(&bif->bif_stp, ifp, m); +#else /* !BRIDGESTP */ + m_freem(m); + m = NULL; +#endif /* !BRIDGESTP */ if (m == NULL) { BRIDGE_UNLOCK(sc); - return EJUSTRETURN; + return (EJUSTRETURN); } } if ((bif->bif_flags & IFBIF_STP) && bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { BRIDGE_UNLOCK(sc); - return 0; + return (0); } /* @@ -3707,16 +3840,16 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, __unused void *frame_header) mc = m_dup(m, M_DONTWAIT); if (mc == NULL) { BRIDGE_UNLOCK(sc); - return 0; + return (0); } - /* - * Perform the bridge forwarding function with the copy. + /* + * Perform the bridge forwarding function with the copy. * * Note that bridge_forward calls BRIDGE_UNLOCK */ bridge_forward(sc, bif, mc); - + /* * Reinject the mbuf as arriving on the bridge so we have a * chance at claiming multicast packets. 
We can not loop back @@ -3735,33 +3868,37 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, __unused void *frame_header) // mark packet as arriving on the bridge mc2->m_pkthdr.rcvif = bifp; mc2->m_pkthdr.header = mbuf_data(mc2); - + #if NBPFILTER > 0 if (sc->sc_bpf_input) bridge_bpf_input(bifp, mc2); #endif /* NBPFILTER */ - (void) mbuf_setdata(mc2, (char *)mbuf_data(mc2) + ETHER_HDR_LEN, mbuf_len(mc2) - ETHER_HDR_LEN); + (void) mbuf_setdata(mc2, + (char *)mbuf_data(mc2) + ETHER_HDR_LEN, + mbuf_len(mc2) - ETHER_HDR_LEN); (void) mbuf_pkthdr_adjustlen(mc2, - ETHER_HDR_LEN); - - (void) ifnet_stat_increment_in(bifp, 1, mbuf_pkthdr_len(mc2), 0); - + + (void) ifnet_stat_increment_in(bifp, 1, + mbuf_pkthdr_len(mc2), 0); + #if BRIDGE_DEBUG - if (_if_brige_debug) - printf( "bridge_input: %s%d mcast for us\n", - ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp)); + if (if_bridge_debug) + printf("%s: %s%d mcast for us\n", __func__, + ifnet_name(sc->sc_ifp), + ifnet_unit(sc->sc_ifp)); #endif /* BRIDGE_DEBUG */ - + dlil_input_packet_list(bifp, mc2); } /* Return the original packet for local processing. */ - return 0; + return (0); } if ((bif->bif_flags & IFBIF_STP) && bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { BRIDGE_UNLOCK(sc); - return 0; + return (0); } #ifdef DEV_CARP @@ -3784,52 +3921,49 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, __unused void *frame_header) #endif #if defined(PFIL_HOOKS) -#define PFIL_PHYS(sc, ifp, m) do { \ - if (pfil_local_phys && \ - (PFIL_HOOKED(&inet_pfil_hook) \ - OR_PFIL_HOOKED_INET6)) { \ - if (bridge_pfil(&m, NULL, ifp, \ - PFIL_IN) != 0 || m == NULL) { \ - BRIDGE_UNLOCK(sc); \ - return (NULL); \ - } \ - } \ - } while (0) +#define PFIL_PHYS(sc, ifp, m) do { \ + if (pfil_local_phys && \ + (PFIL_HOOKED(&inet_pfil_hook) OR_PFIL_HOOKED_INET6)) { \ + if (bridge_pfil(&m, NULL, ifp, \ + PFIL_IN) != 0 || m == NULL) { \ + BRIDGE_UNLOCK(sc); \ + return (NULL); \ + } \ + } \ +} while (0) #else /* PFIL_HOOKS */ #define PFIL_PHYS(sc, ifp, m) #endif /* PFIL_HOOKS */ -#define GRAB_OUR_PACKETS(iface) \ - if ((iface)->if_type == IFT_GIF) \ - continue; \ - /* It is destined for us. */ \ - if (memcmp(ifnet_lladdr((iface)), eh->ether_dhost, ETHER_ADDR_LEN) == 0 \ - OR_CARP_CHECK_WE_ARE_DST((iface)) \ - ) { \ +#define GRAB_OUR_PACKETS(iface) \ + if ((iface)->if_type == IFT_GIF) \ + continue; \ + /* It is destined for us. */ \ + if (memcmp(ifnet_lladdr((iface)), eh->ether_dhost, \ + ETHER_ADDR_LEN) == 0 OR_CARP_CHECK_WE_ARE_DST((iface))) { \ if ((iface)->if_type == IFT_BRIDGE) { \ BRIDGE_BPF_MTAP_INPUT(sc, m); \ /* Filter on the physical interface. */ \ - PFIL_PHYS(sc, iface, m); \ + PFIL_PHYS(sc, iface, m); \ } \ if (bif->bif_flags & IFBIF_LEARNING) { \ error = bridge_rtupdate(sc, eh->ether_shost, \ vlan, bif, 0, IFBAF_DYNAMIC); \ if (error && bif->bif_addrmax) { \ BRIDGE_UNLOCK(sc); \ - return EJUSTRETURN; \ + return (EJUSTRETURN); \ } \ } \ m->m_pkthdr.rcvif = iface; \ BRIDGE_UNLOCK(sc); \ - return 0; \ + return (0); \ } \ \ /* We just received a packet that we sent out. */ \ - if (memcmp(ifnet_lladdr((iface)), eh->ether_shost, ETHER_ADDR_LEN) == 0 \ - OR_CARP_CHECK_WE_ARE_SRC((iface)) \ - ) { \ + if (memcmp(ifnet_lladdr((iface)), eh->ether_shost, \ + ETHER_ADDR_LEN) == 0 OR_CARP_CHECK_WE_ARE_SRC((iface))) { \ BRIDGE_UNLOCK(sc); \ - return EJUSTRETURN; \ + return (EJUSTRETURN); \ } /* @@ -3841,68 +3975,68 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, __unused void *frame_header) * local processing. 
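GRAB_OUR_PACKETS above compresses two tests per member interface into a macro. Unrolled for a single member, with the GIF and CARP special cases dropped, it behaves like the sketch below; bridge_grab_our_packet() and the EAGAIN "not ours" sentinel are illustrative, not part of the patch:

	static errno_t
	bridge_grab_our_packet(struct bridge_softc *sc, struct ifnet *iface,
	    struct ether_header *eh, struct mbuf *m)
	{
		if (memcmp(ifnet_lladdr(iface), eh->ether_dhost,
		    ETHER_ADDR_LEN) == 0) {
			/* destined to this member: let DLIL deliver it */
			m->m_pkthdr.rcvif = iface;
			BRIDGE_UNLOCK(sc);
			return (0);
		}
		if (memcmp(ifnet_lladdr(iface), eh->ether_shost,
		    ETHER_ADDR_LEN) == 0) {
			/* our own frame echoed back to us: swallow it */
			BRIDGE_UNLOCK(sc);
			return (EJUSTRETURN);
		}
		return (EAGAIN);	/* not ours; caller keeps scanning */
	}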
*/ if (memcmp(eh->ether_dhost, ifnet_lladdr(bifp), - ETHER_ADDR_LEN) == 0 - OR_CARP_CHECK_WE_ARE_DST(bifp)) { - + ETHER_ADDR_LEN) == 0 OR_CARP_CHECK_WE_ARE_DST(bifp)) { + /* Mark the packet as arriving on the bridge interface */ (void) mbuf_pkthdr_setrcvif(m, bifp); mbuf_pkthdr_setheader(m, frame_header); - + /* * If the interface is learning, and the source * address is valid and not multicast, record * the address. */ if ((bif->bif_flags & IFBIF_LEARNING) != 0 && - ETHER_IS_MULTICAST(eh->ether_shost) == 0 && - (eh->ether_shost[0] | eh->ether_shost[1] | - eh->ether_shost[2] | eh->ether_shost[3] | - eh->ether_shost[4] | eh->ether_shost[5]) != 0) { - (void) bridge_rtupdate(sc, eh->ether_shost, - vlan, bif, 0, IFBAF_DYNAMIC); - } - + ETHER_IS_MULTICAST(eh->ether_shost) == 0 && + (eh->ether_shost[0] | eh->ether_shost[1] | + eh->ether_shost[2] | eh->ether_shost[3] | + eh->ether_shost[4] | eh->ether_shost[5]) != 0) { + (void) bridge_rtupdate(sc, eh->ether_shost, + vlan, bif, 0, IFBAF_DYNAMIC); + } + BRIDGE_BPF_MTAP_INPUT(sc, m); - (void) mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN, mbuf_len(m) - ETHER_HDR_LEN); + (void) mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN, + mbuf_len(m) - ETHER_HDR_LEN); (void) mbuf_pkthdr_adjustlen(m, - ETHER_HDR_LEN); - + (void) ifnet_stat_increment_in(bifp, 1, mbuf_pkthdr_len(m), 0); BRIDGE_UNLOCK(sc); - + #if BRIDGE_DEBUG - if (_if_brige_debug) - printf( "bridge_input: %s%d packet for bridge\n", - ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp)); + if (if_bridge_debug) + printf("%s: %s%d packet for bridge\n", __func__, + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp)); #endif /* BRIDGE_DEBUG */ - + dlil_input_packet_list(bifp, m); - - return EJUSTRETURN; + + return (EJUSTRETURN); } /* - * if the destination of the packet is for the MAC address of + * if the destination of the packet is for the MAC address of * the member interface itself, then we don't need to forward * it -- just pass it back. Note that it'll likely just be - * dropped by the stack, but if something else is bound to + * dropped by the stack, but if something else is bound to * the interface directly (for example, the wireless stats - * protocol -- although that actually uses BPF right now), + * protocol -- although that actually uses BPF right now), * then it will consume the packet * - * ALSO, note that we do this check AFTER checking for the + * ALSO, note that we do this check AFTER checking for the * bridge's own MAC address, because the bridge may be * using the SAME MAC address as one of its interfaces */ - if (memcmp(eh->ether_dhost, ifnet_lladdr(ifp), - ETHER_ADDR_LEN) == 0) { - + if (memcmp(eh->ether_dhost, ifnet_lladdr(ifp), ETHER_ADDR_LEN) == 0) { + #ifdef VERY_VERY_VERY_DIAGNOSTIC - printf("bridge_input: not forwarding packet bound for member interface\n" ); + printf("%s: not forwarding packet bound for member " + "interface\n", __func__); #endif BRIDGE_UNLOCK(sc); - return 0; + return (0); } /* Now check the all bridge members. */ @@ -3915,14 +4049,14 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, __unused void *frame_header) #undef OR_PFIL_HOOKED_INET6 #undef GRAB_OUR_PACKETS - /* - * Perform the bridge forwarding function. + /* + * Perform the bridge forwarding function. 
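
The learning guard above only records a source address that is neither multicast nor all-zero; OR-ing the six octets is simply a branch-free all-zero test. A standalone sketch, assuming the usual I/G-bit definition of a multicast MAC:

#include <stdint.h>
#include <stdio.h>

#define ETHER_ADDR_LEN	6

/* I/G bit: a MAC is multicast when the low bit of the first octet is set. */
#define ETHER_IS_MULTICAST(a)	((a)[0] & 0x01)

static int
source_is_learnable(const uint8_t sa[ETHER_ADDR_LEN])
{
	/* The OR of all six bytes is zero only for 00:00:00:00:00:00. */
	uint8_t any = sa[0] | sa[1] | sa[2] | sa[3] | sa[4] | sa[5];

	return (!ETHER_IS_MULTICAST(sa) && any != 0);
}

int
main(void)
{
	uint8_t zero[ETHER_ADDR_LEN] = { 0 };
	uint8_t host[ETHER_ADDR_LEN] = { 0x00, 0x16, 0xcb, 0x01, 0x02, 0x03 };

	/* prints "0 1": the all-zero source is rejected, the host learned */
	printf("%d %d\n", source_is_learnable(zero), source_is_learnable(host));
	return (0);
}
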
* * Note that bridge_forward calls BRIDGE_UNLOCK */ bridge_forward(sc, bif, m); - return EJUSTRETURN; + return (EJUSTRETURN); } /* @@ -3994,7 +4128,8 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, } else { mc = m_dup(m, M_DONTWAIT); if (mc == NULL) { - (void) ifnet_stat_increment_out(sc->sc_ifp, 0, 0, 1); + (void) ifnet_stat_increment_out(sc->sc_ifp, + 0, 0, 1); continue; } } @@ -4015,7 +4150,8 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, int i = min(mc->m_pkthdr.len, max_protohdr); mc = m_copyup(mc, i, ETHER_ALIGN); if (mc == NULL) { - (void) ifnet_stat_increment_out(sc->sc_ifp, 0, 0, 1); + (void) ifnet_stat_increment_out( + sc->sc_ifp, 0, 0, 1); continue; } } @@ -4026,7 +4162,7 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, } #endif /* PFIL_HOOKS */ - bridge_enqueue(sc, dst_if, mc); + (void) bridge_enqueue(sc, dst_if, mc); } if (used == 0) m_freem(m); @@ -4066,7 +4202,7 @@ bridge_span(struct bridge_softc *sc, struct mbuf *m) continue; } - bridge_enqueue(sc, dst_if, mc); + (void) bridge_enqueue(sc, dst_if, mc); } } @@ -4089,7 +4225,7 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan, /* Check the source address is valid and not multicast. */ if (ETHER_IS_MULTICAST(dst) || (dst[0] == 0 && dst[1] == 0 && dst[2] == 0 && - dst[3] == 0 && dst[4] == 0 && dst[5] == 0) != 0) + dst[3] == 0 && dst[4] == 0 && dst[5] == 0) != 0) return (EINVAL); @@ -4147,14 +4283,14 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan, if ((flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { struct timespec now; - + nanouptime(&now); brt->brt_expire = now.tv_sec + sc->sc_brttimeout; } if (setflags) brt->brt_flags = flags; - + return (0); } @@ -4226,7 +4362,7 @@ bridge_timer(void *arg) if (sc->sc_ifp->if_flags & IFF_RUNNING) { struct timespec ts; - + ts.tv_sec = bridge_rtable_prune_period; ts.tv_nsec = 0; bsd_timeout(bridge_timer, sc, &ts); @@ -4248,7 +4384,7 @@ bridge_rtage(struct bridge_softc *sc) LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { struct timespec now; - + nanouptime(&now); if ((unsigned long)now.tv_sec >= brt->brt_expire) bridge_rtnode_destroy(sc, brt); @@ -4313,7 +4449,7 @@ bridge_rtdelete(struct bridge_softc *sc, struct ifnet *ifp, int full) LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if (brt->brt_ifp == ifp && (full || - (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)) + (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)) bridge_rtnode_destroy(sc, brt); } } @@ -4328,7 +4464,7 @@ bridge_rtable_init(struct bridge_softc *sc) { int i; - sc->sc_rthash = _MALLOC(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE, + sc->sc_rthash = _MALLOC(sizeof (*sc->sc_rthash) * BRIDGE_RTHASH_SIZE, M_DEVBUF, M_NOWAIT); if (sc->sc_rthash == NULL) return (ENOMEM); @@ -4412,7 +4548,8 @@ bridge_rtnode_addr_cmp(const uint8_t *a, const uint8_t *b) * vlan id or if zero then just return the first match. 
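
bridge_rtage sweeps the table and destroys any dynamic entry whose brt_expire, stamped as now.tv_sec + sc_brttimeout on each update, has already passed. A userland analogue of that stamp-and-sweep pair (time(NULL) standing in for nanouptime; 20 minutes is the classic if_bridge default, assumed here):

#include <stdio.h>
#include <time.h>

#define TABLE_SIZE	4
#define BRTTIMEOUT	(20 * 60)	/* assumed default, in seconds */

struct rtnode {
	int	valid;
	time_t	expire;			/* absolute deadline, like brt_expire */
};

/* Refresh on learn: push the deadline out, as bridge_rtupdate does. */
static void
rt_touch(struct rtnode *n)
{
	n->valid = 1;
	n->expire = time(NULL) + BRTTIMEOUT;
}

/* Age: drop every entry whose deadline has already passed. */
static void
rt_age(struct rtnode *tab, int nents)
{
	time_t now = time(NULL);
	int i;

	for (i = 0; i < nents; i++) {
		if (tab[i].valid && now >= tab[i].expire)
			tab[i].valid = 0;
	}
}

int
main(void)
{
	struct rtnode tab[TABLE_SIZE] = { { 0, 0 } };

	rt_touch(&tab[0]);
	tab[1].valid = 1;
	tab[1].expire = time(NULL) - 1;		/* already stale */
	rt_age(tab, TABLE_SIZE);
	printf("entry0=%d entry1=%d\n", tab[0].valid, tab[1].valid);
	return (0);
}
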
*/ static struct bridge_rtnode * -bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan) +bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr, + uint16_t vlan) { struct bridge_rtnode *brt; uint32_t hash; @@ -4499,6 +4636,7 @@ bridge_rtnode_destroy(struct bridge_softc *sc, struct bridge_rtnode *brt) zfree(bridge_rtnode_pool, brt); } +#if BRIDGESTP /* * bridge_rtable_expire: * @@ -4516,18 +4654,19 @@ bridge_rtable_expire(struct ifnet *ifp, int age) * If the age is zero then flush, otherwise set all the expiry times to * age for the interface */ - if (age == 0) + if (age == 0) { bridge_rtdelete(sc, ifp, IFBF_FLUSHDYN); - else { + } else { LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { struct timespec now; - + nanouptime(&now); /* Cap the expiry time to 'age' */ if (brt->brt_ifp == ifp && brt->brt_expire > (unsigned long)now.tv_sec + age && (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) - brt->brt_expire = (unsigned long)now.tv_sec + age; + brt->brt_expire = + (unsigned long)now.tv_sec + age; } } BRIDGE_UNLOCK(sc); @@ -4553,10 +4692,10 @@ bridge_state_change(struct ifnet *ifp, int state) if (log_stp) log(LOG_NOTICE, "%s%d: state changed to %s on %s%d\n", - ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), - stpstates[state], - ifnet_name(ifp), ifnet_unit(ifp)); + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), + stpstates[state], ifnet_name(ifp), ifnet_unit(ifp)); } +#endif /* BRIDGESTP */ #ifdef PFIL_HOOKS /* @@ -4588,11 +4727,11 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) i = min((*mp)->m_pkthdr.len, max_protohdr); if ((*mp)->m_len < i) { - *mp = m_pullup(*mp, i); - if (*mp == NULL) { - printf("%s: m_pullup failed\n", __func__); - return (-1); - } + *mp = m_pullup(*mp, i); + if (*mp == NULL) { + printf("%s: m_pullup failed\n", __func__); + return (-1); + } } eh1 = mtod(*mp, struct ether_header *); @@ -4645,13 +4784,13 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) } /* Strip off the Ethernet header and keep a copy. */ - m_copydata(*mp, 0, ETHER_HDR_LEN, (caddr_t) &eh2); + m_copydata(*mp, 0, ETHER_HDR_LEN, (caddr_t)&eh2); m_adj(*mp, ETHER_HDR_LEN); /* Strip off snap header, if present */ if (snap) { - m_copydata(*mp, 0, sizeof(struct llc), (caddr_t) &llc1); - m_adj(*mp, sizeof(struct llc)); + m_copydata(*mp, 0, sizeof (struct llc), (caddr_t)&llc1); + m_adj(*mp, sizeof (struct llc)); } /* @@ -4704,7 +4843,7 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) * packet will return to us via bridge_dummynet(). 
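
bridge_pfil relies on m_pullup to make min(pkthdr.len, max_protohdr) bytes contiguous before casting the mbuf data pointer to a header. The userland equivalent of that discipline is a bounds check plus a copy into an aligned local; a minimal sketch:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

struct eth_hdr {			/* illustrative 14-byte header */
	uint8_t		dst[6];
	uint8_t		src[6];
	uint16_t	type;		/* network byte order on the wire */
};

/* Fail unless 'len' covers the header, then copy it out so the read is
 * both bounds-safe and alignment-safe: the moral equivalent of the
 * m_pullup() + mtod() pairing in the bridge path. */
static int
parse_eth(const uint8_t *buf, size_t len, struct eth_hdr *out)
{
	if (len < sizeof (*out))
		return (-1);		/* would be m_pullup() failing */
	memcpy(out, buf, sizeof (*out));
	return (0);
}

int
main(void)
{
	uint8_t frame[64] = { 0 };
	struct eth_hdr eh;

	frame[12] = 0x08; frame[13] = 0x00;	/* ETHERTYPE_IP */
	if (parse_eth(frame, sizeof (frame), &eh) == 0)
		printf("ethertype 0x%04x\n", (unsigned)ntohs(eh.type));
	return (0);
}
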
*/ args.oif = ifp; - ip_dn_io_ptr(mp, DN_TO_IFB_FWD, &args); + ip_dn_io_ptr(mp, DN_TO_IFB_FWD, &args, DN_CLIENT_IPFW); return (error); } @@ -4738,21 +4877,21 @@ ipfwpass: */ if (pfil_bridge && dir == PFIL_OUT && bifp != NULL) error = pfil_run_hooks(&inet_pfil_hook, mp, bifp, - dir, NULL); + dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; if (pfil_member && ifp != NULL) error = pfil_run_hooks(&inet_pfil_hook, mp, ifp, - dir, NULL); + dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; if (pfil_bridge && dir == PFIL_IN && bifp != NULL) error = pfil_run_hooks(&inet_pfil_hook, mp, bifp, - dir, NULL); + dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; @@ -4762,7 +4901,7 @@ ipfwpass: i = (*mp)->m_pkthdr.len; if (i > ifp->if_mtu) { error = bridge_fragment(ifp, *mp, &eh2, snap, - &llc1); + &llc1); return (error); } } @@ -4770,7 +4909,7 @@ ipfwpass: /* Recalculate the ip checksum and restore byte ordering */ ip = mtod(*mp, struct ip *); hlen = ip->ip_hl << 2; - if (hlen < sizeof(struct ip)) + if (hlen < sizeof (struct ip)) goto bad; if (hlen > (*mp)->m_len) { if ((*mp = m_pullup(*mp, hlen)) == 0) @@ -4782,7 +4921,7 @@ ipfwpass: ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; - if (hlen == sizeof(struct ip)) + if (hlen == sizeof (struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(*mp, hlen); @@ -4792,21 +4931,21 @@ ipfwpass: case ETHERTYPE_IPV6: if (pfil_bridge && dir == PFIL_OUT && bifp != NULL) error = pfil_run_hooks(&inet6_pfil_hook, mp, bifp, - dir, NULL); + dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; if (pfil_member && ifp != NULL) error = pfil_run_hooks(&inet6_pfil_hook, mp, ifp, - dir, NULL); + dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; if (pfil_bridge && dir == PFIL_IN && bifp != NULL) error = pfil_run_hooks(&inet6_pfil_hook, mp, bifp, - dir, NULL); + dir, NULL); break; #endif default: @@ -4825,10 +4964,10 @@ ipfwpass: * Finally, put everything back the way it was and return */ if (snap) { - M_PREPEND(*mp, sizeof(struct llc), M_DONTWAIT); + M_PREPEND(*mp, sizeof (struct llc), M_DONTWAIT); if (*mp == NULL) return (error); - bcopy(&llc1, mtod(*mp, caddr_t), sizeof(struct llc)); + bcopy(&llc1, mtod(*mp, caddr_t), sizeof (struct llc)); } M_PREPEND(*mp, ETHER_HDR_LEN, M_DONTWAIT); @@ -4869,8 +5008,9 @@ bridge_ip_checkbasic(struct mbuf **mp) return (-1); if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) { - if ((m = m_copyup(m, sizeof(struct ip), - (max_linkhdr + 3) & ~3)) == NULL) { + /* max_linkhdr is already rounded up to nearest 4-byte */ + if ((m = m_copyup(m, sizeof (struct ip), + max_linkhdr)) == NULL) { /* XXXJRT new stat, please */ ipstat.ips_toosmall++; goto bad; @@ -4889,7 +5029,7 @@ bridge_ip_checkbasic(struct mbuf **mp) goto bad; } hlen = ip->ip_hl << 2; - if (hlen < sizeof(struct ip)) { /* minimum header length */ + if (hlen < sizeof (struct ip)) { /* minimum header length */ ipstat.ips_badhlen++; goto bad; } @@ -4905,7 +5045,7 @@ bridge_ip_checkbasic(struct mbuf **mp) if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { - if (hlen == sizeof(struct ip)) { + if (hlen == sizeof (struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); @@ -4966,16 +5106,17 @@ bridge_ip6_checkbasic(struct mbuf **mp) */ if (IP6_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) { struct ifnet *inifp = m->m_pkthdr.rcvif; - if ((m = m_copyup(m, sizeof(struct ip6_hdr), - 
(max_linkhdr + 3) & ~3)) == NULL) { + /* max_linkhdr is already rounded up to nearest 4-byte */ + if ((m = m_copyup(m, sizeof (struct ip6_hdr), + max_linkhdr)) == NULL) { /* XXXJRT new stat, please */ ip6stat.ip6s_toosmall++; in6_ifstat_inc(inifp, ifs6_in_hdrerr); goto bad; } - } else if (__predict_false(m->m_len < sizeof(struct ip6_hdr))) { + } else if (__predict_false(m->m_len < sizeof (struct ip6_hdr))) { struct ifnet *inifp = m->m_pkthdr.rcvif; - if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { + if ((m = m_pullup(m, sizeof (struct ip6_hdr))) == NULL) { ip6stat.ip6s_toosmall++; in6_ifstat_inc(inifp, ifs6_in_hdrerr); goto bad; @@ -5013,13 +5154,13 @@ bridge_fragment(struct ifnet *ifp, struct mbuf *m, struct ether_header *eh, struct ip *ip; int error = -1; - if (m->m_len < sizeof(struct ip) && - (m = m_pullup(m, sizeof(struct ip))) == NULL) + if (m->m_len < sizeof (struct ip) && + (m = m_pullup(m, sizeof (struct ip))) == NULL) goto out; ip = mtod(m, struct ip *); error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, - CSUM_DELAY_IP); + CSUM_DELAY_IP); if (error) goto out; @@ -5027,13 +5168,13 @@ bridge_fragment(struct ifnet *ifp, struct mbuf *m, struct ether_header *eh, for (m0 = m; m0; m0 = m0->m_nextpkt) { if (error == 0) { if (snap) { - M_PREPEND(m0, sizeof(struct llc), M_DONTWAIT); + M_PREPEND(m0, sizeof (struct llc), M_DONTWAIT); if (m0 == NULL) { error = ENOBUFS; continue; } bcopy(llc, mtod(m0, caddr_t), - sizeof(struct llc)); + sizeof (struct llc)); } M_PREPEND(m0, ETHER_HDR_LEN, M_DONTWAIT); if (m0 == NULL) { @@ -5041,8 +5182,9 @@ bridge_fragment(struct ifnet *ifp, struct mbuf *m, struct ether_header *eh, continue; } bcopy(eh, mtod(m0, caddr_t), ETHER_HDR_LEN); - } else + } else { m_freem(m); + } } if (error == 0) @@ -5061,78 +5203,81 @@ static errno_t bridge_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func bpf_callback) { struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); - - //printf("bridge_set_bpf_tap ifp %p mode %d\n", ifp, mode); - + /* TBD locking */ if (sc == NULL || (sc->sc_flags & SCF_DETACHING)) { - return ENODEV; + return (ENODEV); } - + switch (mode) { case BPF_TAP_DISABLE: sc->sc_bpf_input = sc->sc_bpf_output = NULL; break; - + case BPF_TAP_INPUT: sc->sc_bpf_input = bpf_callback; break; - + case BPF_TAP_OUTPUT: sc->sc_bpf_output = bpf_callback; break; - + case BPF_TAP_INPUT_OUTPUT: sc->sc_bpf_input = sc->sc_bpf_output = bpf_callback; break; - + default: break; } - - return 0; + + return (0); } static void bridge_detach(ifnet_t ifp) { struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); - + +#if BRIDGESTP bstp_detach(&sc->sc_stp); +#endif /* BRIDGESTP */ /* Tear down the routing table. 
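
bridge_set_bpf_tap fans one callback out to the input and output slots according to the requested mode. The same shape in a self-contained sketch (types and names invented; no locking, matching the "TBD locking" note above):

#include <stddef.h>
#include <stdio.h>

typedef void (*tap_func)(const void *pkt, size_t len);

enum tap_mode {
	TAP_DISABLE,
	TAP_INPUT,
	TAP_OUTPUT,
	TAP_INPUT_OUTPUT
};

struct softc {
	tap_func tap_in;
	tap_func tap_out;
};

/* Same dispatch as the bridge's set_bpf_tap: one callback, two slots. */
static void
set_tap(struct softc *sc, enum tap_mode mode, tap_func cb)
{
	switch (mode) {
	case TAP_DISABLE:
		sc->tap_in = sc->tap_out = NULL;
		break;
	case TAP_INPUT:
		sc->tap_in = cb;
		break;
	case TAP_OUTPUT:
		sc->tap_out = cb;
		break;
	case TAP_INPUT_OUTPUT:
		sc->tap_in = sc->tap_out = cb;
		break;
	}
}

static void
print_tap(const void *pkt, size_t len)
{
	(void)pkt;
	printf("tapped %zu bytes\n", len);
}

int
main(void)
{
	struct softc sc = { NULL, NULL };

	set_tap(&sc, TAP_INPUT_OUTPUT, print_tap);
	if (sc.tap_in != NULL)
		sc.tap_in("x", 1);
	return (0);
}
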
*/ bridge_rtable_fini(sc); - + lck_mtx_lock(bridge_list_mtx); LIST_REMOVE(sc, sc_list); lck_mtx_unlock(bridge_list_mtx); - + ifnet_release(ifp); - + lck_mtx_free(sc->sc_mtx, bridge_lock_grp); - + _FREE(sc, M_DEVBUF); - return; } -__private_extern__ errno_t bridge_bpf_input(ifnet_t ifp, struct mbuf *m) +__private_extern__ errno_t +bridge_bpf_input(ifnet_t ifp, struct mbuf *m) { struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); - + if (sc->sc_bpf_input) { - if (mbuf_pkthdr_rcvif(m) != ifp) - printf("bridge_bpf_input rcvif: %p != ifp %p\n", mbuf_pkthdr_rcvif(m), ifp); + if (mbuf_pkthdr_rcvif(m) != ifp) { + printf("%s: rcvif: %p != ifp %p\n", __func__, + mbuf_pkthdr_rcvif(m), ifp); + } (*sc->sc_bpf_input)(ifp, m); } - return 0; + return (0); } -__private_extern__ errno_t bridge_bpf_output(ifnet_t ifp, struct mbuf *m) +__private_extern__ errno_t +bridge_bpf_output(ifnet_t ifp, struct mbuf *m) { struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); - + if (sc->sc_bpf_output) { (*sc->sc_bpf_output)(ifp, m); } - return 0; + return (0); } diff --git a/bsd/net/if_bridgevar.h b/bsd/net/if_bridgevar.h index 3d1375aed..f3774c5aa 100644 --- a/bsd/net/if_bridgevar.h +++ b/bsd/net/if_bridgevar.h @@ -492,6 +492,8 @@ struct ifbpstpconf64 { #ifdef XNU_KERNEL_PRIVATE +extern u_int8_t bstp_etheraddr[ETHER_ADDR_LEN]; + int bridgeattach(int); #endif /* XNU_KERNEL_PRIVATE */ diff --git a/bsd/net/if_dl.h b/bsd/net/if_dl.h index 1dd0d8c07..3d086e402 100644 --- a/bsd/net/if_dl.h +++ b/bsd/net/if_dl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -109,7 +109,7 @@ struct sockaddr_dl { #endif #ifdef BSD_KERNEL_PRIVATE -#define SDL(s) ((struct sockaddr_dl *)s) +#define SDL(s) ((struct sockaddr_dl *)(void *)s) #endif #ifndef KERNEL diff --git a/bsd/net/if_ether.h b/bsd/net/if_ether.h index eb29560d2..a0235a74d 100644 --- a/bsd/net/if_ether.h +++ b/bsd/net/if_ether.h @@ -50,7 +50,12 @@ errno_t ether_add_proto(ifnet_t interface, protocol_family_t protocol, errno_t ether_del_proto(ifnet_t interface, protocol_family_t protocol); errno_t ether_frameout(ifnet_t interface, mbuf_t *packet, const struct sockaddr *dest, const char *dest_lladdr, - const char *frame_type); + const char *frame_type +#if KPI_INTERFACE_EMBEDDED + , + u_int32_t *prepend_len, u_int32_t *postpend_len +#endif /* KPI_INTERFACE_EMBEDDED */ + ); errno_t ether_ioctl(ifnet_t interface, u_int32_t command, void* data); errno_t ether_check_multi(ifnet_t ifp, const struct sockaddr *multicast); diff --git a/bsd/net/if_gif.c b/bsd/net/if_gif.c index b25ecb3a5..c638758a2 100644 --- a/bsd/net/if_gif.c +++ b/bsd/net/if_gif.c @@ -737,10 +737,10 @@ gif_ioctl( /* can't configure multiple multi-dest interfaces */ #define multidest(x) \ - (((struct sockaddr_in *)(x))->sin_addr.s_addr == INADDR_ANY) + (((struct sockaddr_in *)(void *)(x))->sin_addr.s_addr == INADDR_ANY) #if INET6 #define multidest6(x) \ - (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6 *)(x))->sin6_addr)) + (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6 *)(void *)(x))->sin6_addr)) #endif if (dst->sa_family == AF_INET && multidest(dst) && multidest(sc2->gif_pdst)) { diff --git a/bsd/net/if_llreach.c b/bsd/net/if_llreach.c index 669beb0f4..db81aa083 100644 --- a/bsd/net/if_llreach.c +++ b/bsd/net/if_llreach.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 Apple Inc. All rights reserved. + * Copyright (c) 2011-2012 Apple Inc. 
All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -122,6 +122,8 @@ #include #include #include +#include +#include #include #include @@ -343,6 +345,9 @@ found: lr->lr_ifp = ifp; lr->lr_key.proto = llproto; bcopy(addr, &lr->lr_key.addr, IF_LLREACH_MAXLEN); + lr->lr_rssi = IFNET_RSSI_UNKNOWN; + lr->lr_lqm = IFNET_LQM_THRESH_UNKNOWN; + lr->lr_npm = IFNET_NPM_THRESH_UNKNOWN; RB_INSERT(ll_reach_tree, &ifp->if_ll_srcs, lr); IFLR_UNLOCK(lr); lck_rw_done(&ifp->if_llreach_lock); @@ -386,7 +391,7 @@ ifnet_llreach_free(struct if_llreach *lr) } u_int64_t -ifnet_llreach_up2cal(struct if_llreach *lr, u_int64_t uptime) +ifnet_llreach_up2calexp(struct if_llreach *lr, u_int64_t uptime) { u_int64_t calendar = 0; @@ -411,6 +416,62 @@ ifnet_llreach_up2cal(struct if_llreach *lr, u_int64_t uptime) return (calendar); } +u_int64_t +ifnet_llreach_up2upexp(struct if_llreach *lr, u_int64_t uptime) +{ + return (lr->lr_reachable + uptime); +} + +int +ifnet_llreach_get_defrouter(struct ifnet *ifp, int af, + struct ifnet_llreach_info *iflri) +{ + struct radix_node_head *rnh; + struct sockaddr_storage dst_ss, mask_ss; + struct rtentry *rt; + int error = ESRCH; + + VERIFY(ifp != NULL && iflri != NULL && + (af == AF_INET || af == AF_INET6)); + + bzero(iflri, sizeof (*iflri)); + + if ((rnh = rt_tables[af]) == NULL) + return (error); + + bzero(&dst_ss, sizeof (dst_ss)); + bzero(&mask_ss, sizeof (mask_ss)); + dst_ss.ss_family = af; + dst_ss.ss_len = (af == AF_INET) ? sizeof (struct sockaddr_in) : + sizeof (struct sockaddr_in6); + + lck_mtx_lock(rnh_lock); + rt = rt_lookup(TRUE, SA(&dst_ss), SA(&mask_ss), rnh, ifp->if_index); + if (rt != NULL) { + struct rtentry *gwrt; + + RT_LOCK(rt); + if ((rt->rt_flags & RTF_GATEWAY) && + (gwrt = rt->rt_gwroute) != NULL && + rt_key(rt)->sa_family == rt_key(gwrt)->sa_family && + (gwrt->rt_flags & RTF_UP)) { + RT_UNLOCK(rt); + RT_LOCK(gwrt); + if (gwrt->rt_llinfo_get_iflri != NULL) { + (*gwrt->rt_llinfo_get_iflri)(gwrt, iflri); + error = 0; + } + RT_UNLOCK(gwrt); + } else { + RT_UNLOCK(rt); + } + rtfree_locked(rt); + } + lck_mtx_unlock(rnh_lock); + + return (error); +} + static struct if_llreach * iflr_alloc(int how) { @@ -495,6 +556,44 @@ ifnet_lr2ri(struct if_llreach *lr, struct rt_reach_info *ri) ri->ri_refcnt = lri.lri_refcnt; ri->ri_probes = lri.lri_probes; ri->ri_rcv_expire = lri.lri_expire; + ri->ri_rssi = lri.lri_rssi; + ri->ri_lqm = lri.lri_lqm; + ri->ri_npm = lri.lri_npm; +} + +void +ifnet_lr2iflri(struct if_llreach *lr, struct ifnet_llreach_info *iflri) +{ + IFLR_LOCK_ASSERT_HELD(lr); + + bzero(iflri, sizeof (*iflri)); + /* + * Note here we return request count, not actual memory refcnt. + */ + iflri->iflri_refcnt = lr->lr_reqcnt; + iflri->iflri_probes = lr->lr_probes; + iflri->iflri_rcv_expire = ifnet_llreach_up2upexp(lr, lr->lr_lastrcvd); + iflri->iflri_curtime = net_uptime(); + switch (lr->lr_key.proto) { + case ETHERTYPE_IP: + iflri->iflri_netproto = PF_INET; + break; + case ETHERTYPE_IPV6: + iflri->iflri_netproto = PF_INET6; + break; + default: + /* + * This shouldn't be possible for the time being, + * since link-layer reachability records are only + * kept for ARP and ND6. 
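
ifnet_llreach_up2calexp rebases an uptime-relative expiry onto the calendar clock so it can be reported to userland. A POSIX analogue samples both clocks and applies the offset (a sketch of the idea, not the kernel's arithmetic):

#include <stdio.h>
#include <time.h>

/* Convert an absolute CLOCK_MONOTONIC deadline (seconds) to a
 * CLOCK_REALTIME (calendar) timestamp by applying the current offset
 * between the two clocks. Only as accurate as the instant sampled. */
static time_t
mono_to_calendar(time_t mono_deadline)
{
	struct timespec mono, wall;

	clock_gettime(CLOCK_MONOTONIC, &mono);
	clock_gettime(CLOCK_REALTIME, &wall);
	return (mono_deadline + (wall.tv_sec - mono.tv_sec));
}

int
main(void)
{
	struct timespec mono;

	clock_gettime(CLOCK_MONOTONIC, &mono);
	/* A reachability record expiring 30 seconds of uptime from now. */
	printf("expires at %ld (unix time)\n",
	    (long)mono_to_calendar(mono.tv_sec + 30));
	return (0);
}
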
+ */ + iflri->iflri_netproto = PF_UNSPEC; + break; + } + bcopy(&lr->lr_key.addr, &iflri->iflri_addr, IF_LLREACH_MAXLEN); + iflri->iflri_rssi = lr->lr_rssi; + iflri->iflri_lqm = lr->lr_lqm; + iflri->iflri_npm = lr->lr_npm; } void @@ -509,9 +608,12 @@ ifnet_lr2lri(struct if_llreach *lr, struct if_llreach_info *lri) lri->lri_refcnt = lr->lr_reqcnt; lri->lri_ifindex = lr->lr_ifp->if_index; lri->lri_probes = lr->lr_probes; - lri->lri_expire = ifnet_llreach_up2cal(lr, lr->lr_lastrcvd); + lri->lri_expire = ifnet_llreach_up2calexp(lr, lr->lr_lastrcvd); lri->lri_proto = lr->lr_key.proto; bcopy(&lr->lr_key.addr, &lri->lri_addr, IF_LLREACH_MAXLEN); + lri->lri_rssi = lr->lr_rssi; + lri->lri_lqm = lr->lr_lqm; + lri->lri_npm = lr->lr_npm; } static int diff --git a/bsd/net/if_llreach.h b/bsd/net/if_llreach.h index e922fb0e4..b36612ce1 100644 --- a/bsd/net/if_llreach.h +++ b/bsd/net/if_llreach.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 Apple Inc. All rights reserved. + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -39,7 +39,8 @@ extern "C" { /* * Per-interface link-layer reachability information (private). */ -#define IF_LLREACHINFO_ADDRLEN 64 /* max ll addr len */ +#define IF_LLREACHINFO_ADDRLEN 64 /* max ll addr len */ +#define IF_LLREACHINFO_RESERVED2 16 /* more reserved bits */ struct if_llreach_info { u_int32_t lri_refcnt; /* reference count */ @@ -49,6 +50,10 @@ struct if_llreach_info { u_int16_t lri_reserved; /* for future use */ u_int16_t lri_proto; /* ll proto */ u_int8_t lri_addr[IF_LLREACHINFO_ADDRLEN]; /* ll addr */ + int32_t lri_rssi; /* received signal strength */ + int32_t lri_lqm; /* link quality metric */ + int32_t lri_npm; /* node proximity metric */ + u_int8_t lri_reserved2[IF_LLREACHINFO_RESERVED2]; }; #ifdef XNU_KERNEL_PRIVATE @@ -92,6 +97,9 @@ struct if_llreach { u_int16_t proto; /* ll proto */ u_int8_t addr[IF_LLREACH_MAXLEN]; /* ll addr */ } lr_key; + int32_t lr_rssi; /* received signal strength */ + int32_t lr_lqm; /* link quality metric */ + int32_t lr_npm; /* node proximity metric */ }; RB_PROTOTYPE_SC_PREV(__private_extern__, ll_reach_tree, if_llreach, @@ -126,6 +134,8 @@ RB_PROTOTYPE_SC_PREV(__private_extern__, ll_reach_tree, if_llreach, #define IFLR_REMREF(_iflr) \ iflr_remref(_iflr) +struct ifnet_llreach_info; /* forward declaration */ + extern void ifnet_llreach_init(void); extern void ifnet_llreach_ifattach(struct ifnet *, boolean_t); extern void ifnet_llreach_ifdetach(struct ifnet *); @@ -136,8 +146,12 @@ extern int ifnet_llreach_reachable(struct if_llreach *); extern int ifnet_llreach_reachable_delta(struct if_llreach *, u_int64_t); extern void ifnet_llreach_set_reachable(struct ifnet *, u_int16_t, void *, unsigned int); -extern u_int64_t ifnet_llreach_up2cal(struct if_llreach *, u_int64_t); +extern u_int64_t ifnet_llreach_up2calexp(struct if_llreach *, u_int64_t); +extern u_int64_t ifnet_llreach_up2upexp(struct if_llreach *, u_int64_t); +extern int ifnet_llreach_get_defrouter(struct ifnet *, int, + struct ifnet_llreach_info *); extern void ifnet_lr2ri(struct if_llreach *, struct rt_reach_info *); +extern void ifnet_lr2iflri(struct if_llreach *, struct ifnet_llreach_info *); extern void ifnet_lr2lri(struct if_llreach *, struct if_llreach_info *); extern void iflr_addref(struct if_llreach *, int); extern void iflr_remref(struct if_llreach *); diff --git a/bsd/net/if_loop.c b/bsd/net/if_loop.c index 5ba5b11a5..d051c7611 100644 --- a/bsd/net/if_loop.c +++ b/bsd/net/if_loop.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 
2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -73,6 +73,10 @@ #include "loop.h" #if NLOOP > 0 +#if NLOOP != 1 +#error "More than one loopback interface is not supported." +#endif + #include #include #include @@ -80,6 +84,7 @@ #include #include #include +#include #include #include @@ -87,7 +92,7 @@ #include #include -#if INET +#if INET #include #include #endif @@ -103,104 +108,183 @@ #include #include -#if NETAT -extern struct ifqueue atalkintrq; -#endif - #if CONFIG_MACF_NET #include #endif -#define NLOOP_ATTACHMENTS (NLOOP * 12) +#include + +#define LOMTU 16384 +#define LOSNDQ_MAXLEN 256 + +#define LO_BPF_TAP_OUT(_m) { \ + if (lo_statics[0].bpf_callback != NULL) { \ + bpf_tap_out(lo_ifp, DLT_NULL, _m, \ + &((struct loopback_header *)_m->m_pkthdr.header)->protocol,\ + sizeof (u_int32_t)); \ + } \ +} + +#define LO_BPF_TAP_OUT_MULTI(_m) { \ + if (lo_statics[0].bpf_callback != NULL) { \ + struct mbuf *_n; \ + for (_n = _m; _n != NULL; _n = _n->m_nextpkt) \ + LO_BPF_TAP_OUT(_n); \ + } \ +} struct lo_statics_str { - int bpf_mode; + int bpf_mode; bpf_packet_func bpf_callback; }; -void loopattach(void); - static struct lo_statics_str lo_statics[NLOOP]; -int loopattach_done = 0; /* used to sync ip6_init2 loopback configuration */ +static int lo_txstart = 0; -#ifdef TINY_LOMTU -#define LOMTU (1024+512) -#else -#define LOMTU 16384 -#endif - -ifnet_t lo_ifp = NULL; +struct ifnet *lo_ifp = NULL; struct loopback_header { protocol_family_t protocol; }; +/* Local forward declerations */ +void loopattach(void); +static errno_t lo_demux(struct ifnet *, struct mbuf *, char *, + protocol_family_t *); +#if !KPI_INTERFACE_EMBEDDED +static errno_t lo_framer(struct ifnet *, struct mbuf **, + const struct sockaddr *, + const char *, const char *); +#else +static errno_t +lo_framer(struct ifnet *, struct mbuf **, const struct sockaddr *, + const char *, const char *, u_int32_t *, u_int32_t *); +#endif +static errno_t lo_add_proto(struct ifnet *, protocol_family_t, + const struct ifnet_demux_desc *, u_int32_t); +static errno_t lo_del_proto(struct ifnet *, protocol_family_t); +static int lo_output(struct ifnet *, struct mbuf *); +static errno_t lo_pre_enqueue(struct ifnet *, struct mbuf *); +static void lo_start(struct ifnet *); +static errno_t lo_pre_output(struct ifnet *, protocol_family_t, struct mbuf **, + const struct sockaddr *, void *, char *, char *); +static errno_t lo_input(struct 
ifnet *, protocol_family_t, struct mbuf *); +static void lo_rtrequest(int, struct rtentry *, struct sockaddr *); +static errno_t lo_ioctl(struct ifnet *, u_long, void *); +static errno_t lo_attach_proto(struct ifnet *, protocol_family_t); static void lo_reg_if_mods(void); +static errno_t lo_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func); +static int sysctl_dequeue_max SYSCTL_HANDLER_ARGS; +static int sysctl_sched_model SYSCTL_HANDLER_ARGS; +static int sysctl_dequeue_scidx SYSCTL_HANDLER_ARGS; -/* Local forward declerations */ +SYSCTL_DECL(_net_link); + +SYSCTL_NODE(_net_link, OID_AUTO, loopback, CTLFLAG_RW | CTLFLAG_LOCKED, 0, + "loopback interface"); + +#define LO_BW_SLEEP 10 +static u_int32_t lo_bw_sleep_usec = LO_BW_SLEEP; +SYSCTL_UINT(_net_link_loopback, OID_AUTO, bw_sleep_usec, + CTLFLAG_RW | CTLFLAG_LOCKED, &lo_bw_sleep_usec, LO_BW_SLEEP, ""); + +static u_int32_t lo_bw_measure = 0; +SYSCTL_UINT(_net_link_loopback, OID_AUTO, bw_measure, + CTLFLAG_RW | CTLFLAG_LOCKED, &lo_bw_measure, 0, ""); + +static u_int32_t lo_dequeue_max = LOSNDQ_MAXLEN; +SYSCTL_PROC(_net_link_loopback, OID_AUTO, max_dequeue, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &lo_dequeue_max, LOSNDQ_MAXLEN, + sysctl_dequeue_max, "I", "Maximum number of packets dequeued at a time"); + +static u_int32_t lo_sched_model = IFNET_SCHED_MODEL_NORMAL; +SYSCTL_PROC(_net_link_loopback, OID_AUTO, sched_model, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &lo_sched_model, + IFNET_SCHED_MODEL_NORMAL, sysctl_sched_model, "I", "Scheduling model"); + +static u_int32_t lo_dequeue_sc = MBUF_SC_BE; +static int lo_dequeue_scidx = MBUF_SCIDX(MBUF_SC_BE); +SYSCTL_PROC(_net_link_loopback, OID_AUTO, dequeue_sc, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &lo_dequeue_scidx, + MBUF_SC_BE, sysctl_dequeue_scidx, "I", "Dequeue a specific SC index"); static errno_t -lo_demux( - __unused ifnet_t ifp, - __unused mbuf_t m, - char *frame_header, - protocol_family_t *protocol_family) +lo_demux(struct ifnet *ifp, struct mbuf *m, char *frame_header, + protocol_family_t *protocol_family) { - struct loopback_header *header = (struct loopback_header *)frame_header; - +#pragma unused(ifp, m) + struct loopback_header *header = + (struct loopback_header *)(void *)frame_header; + *protocol_family = header->protocol; - - return 0; -} + return (0); +} +#if !KPI_INTERFACE_EMBEDDED static errno_t -lo_framer( - __unused ifnet_t ifp, - mbuf_t *m, - __unused const struct sockaddr *dest, - __unused const char *dest_linkaddr, - const char *frame_type) +lo_framer(struct ifnet *ifp, struct mbuf **m, const struct sockaddr *dest, + const char *dest_linkaddr, const char *frame_type) +#else +static errno_t +lo_framer(struct ifnet *ifp, struct mbuf **m, const struct sockaddr *dest, + const char *dest_linkaddr, const char *frame_type, + u_int32_t *prepend_len, u_int32_t *postpend_len) +#endif { +#pragma unused(ifp, dest, dest_linkaddr) struct loopback_header *header; - M_PREPEND(*m, sizeof(struct loopback_header), M_WAITOK); - if (*m == NULL) - return EJUSTRETURN; /* Tell caller not to try to free passed-in mbuf */ - header = mtod(*m, struct loopback_header*); - header->protocol = *(const u_int32_t*)frame_type; - return 0; + M_PREPEND(*m, sizeof (struct loopback_header), M_WAITOK); + if (*m == NULL) { + /* Tell caller not to try to free passed-in mbuf */ + return (EJUSTRETURN); + } + +#if KPI_INTERFACE_EMBEDDED + *prepend_len = sizeof (struct loopback_header); + *postpend_len = 0; +#endif /* KPI_INTERFACE_EMBEDDED */ + + header = mtod(*m, struct loopback_header *); + 
bcopy(frame_type, &header->protocol, sizeof (u_int32_t)); + return (0); } static errno_t -lo_add_proto( - __unused ifnet_t interface, - __unused protocol_family_t protocol_family, - __unused const struct ifnet_demux_desc *demux_array, - __unused u_int32_t demux_count) +lo_add_proto(struct ifnet *interface, protocol_family_t protocol_family, + const struct ifnet_demux_desc *demux_array, u_int32_t demux_count) { - return 0; +#pragma unused(interface, protocol_family, demux_array, demux_count) + return (0); } - static errno_t -lo_del_proto( - __unused ifnet_t ifp, - __unused protocol_family_t protocol) +lo_del_proto(struct ifnet *ifp, protocol_family_t protocol) { - return 0; +#pragma unused(ifp, protocol) + return (0); } +/* + * Output callback. + * + * This routine is called only when lo_txstart is disabled. + */ static int -lo_output( - ifnet_t ifp, - mbuf_t m_list) +lo_output(struct ifnet *ifp, struct mbuf *m_list) { - mbuf_t m; - + struct mbuf *m, *m_tail = NULL; + struct ifnet_stat_increment_param s; + u_int32_t cnt = 0, len = 0; + + bzero(&s, sizeof(s)); + for (m = m_list; m; m = m->m_nextpkt) { if ((m->m_flags & M_PKTHDR) == 0) panic("lo_output: no HDR"); + cnt++; + len += m->m_pkthdr.len; /* * Don't overwrite the rcvif field if it is in use. @@ -210,51 +294,149 @@ lo_output( if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = ifp; - atomic_add_64(&ifp->if_ibytes, m->m_pkthdr.len); - atomic_add_64(&ifp->if_obytes, m->m_pkthdr.len); + m->m_pkthdr.header = mtod(m, char *); + if (apple_hwcksum_tx != 0) { + /* loopback checksums are always OK */ + m->m_pkthdr.csum_data = 0xffff; + m->m_pkthdr.csum_flags = + CSUM_DATA_VALID | CSUM_PSEUDO_HDR | + CSUM_IP_CHECKED | CSUM_IP_VALID; + } + m_adj(m, sizeof (struct loopback_header)); + + LO_BPF_TAP_OUT(m); + if (m->m_nextpkt == NULL) { + m_tail = m; + } + } + + s.packets_in = cnt; + s.packets_out = cnt; + s.bytes_in = len; + s.bytes_out = len; + + return (ifnet_input_extended(ifp, m_list, m_tail, &s)); +} + +/* + * Pre-enqueue callback. + * + * This routine is called only when lo_txstart is enabled. + */ +static errno_t +lo_pre_enqueue(struct ifnet *ifp, struct mbuf *m0) +{ + struct mbuf *m = m0, *n; + int error = 0; + + while (m != NULL) { + VERIFY((m->m_flags & M_PKTHDR)); - atomic_add_64(&ifp->if_opackets, 1); - atomic_add_64(&ifp->if_ipackets, 1); + n = m->m_nextpkt; + m->m_nextpkt = NULL; + + /* + * Don't overwrite the rcvif field if it is in use. + * This is used to match multicast packets, sent looping + * back, with the appropriate group record on input. + */ + if (m->m_pkthdr.rcvif == NULL) + m->m_pkthdr.rcvif = ifp; m->m_pkthdr.header = mtod(m, char *); if (apple_hwcksum_tx != 0) { /* loopback checksums are always OK */ m->m_pkthdr.csum_data = 0xffff; - m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR | - CSUM_IP_CHECKED | CSUM_IP_VALID; - } - m_adj(m, sizeof(struct loopback_header)); - - { - /* We need to prepend the address family as a four byte field. */ - u_int32_t protocol_family = - ((struct loopback_header*)m->m_pkthdr.header)->protocol; - - bpf_tap_out(ifp, DLT_NULL, m, &protocol_family, sizeof(protocol_family)); + m->m_pkthdr.csum_flags = + CSUM_DATA_VALID | CSUM_PSEUDO_HDR | + CSUM_IP_CHECKED | CSUM_IP_VALID; } + m_adj(m, sizeof (struct loopback_header)); + + /* + * Let the callee free it in case of error, + * and perform any necessary accounting. + */ + (void) ifnet_enqueue(ifp, m); + + m = n; } - return ifnet_input(ifp, m_list, NULL); + return (error); } +/* + * Start output callback. 
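
lo_output above walks the chain exactly once, accumulating packet and byte counts and remembering the tail so the whole list can be handed to ifnet_input_extended with complete stats. The single-pass pattern in isolation (invented stand-in types):

#include <stdio.h>
#include <stddef.h>

struct pkt {				/* stand-in for an mbuf chain link */
	struct pkt	*nextpkt;
	size_t		len;
};

struct stats {
	unsigned	packets;
	size_t		bytes;
};

/* One pass: accumulate counts and find the tail, as lo_output does
 * before handing the whole list to the input path. */
static struct pkt *
count_chain(struct pkt *head, struct stats *s)
{
	struct pkt *p, *tail = NULL;

	s->packets = 0;
	s->bytes = 0;
	for (p = head; p != NULL; p = p->nextpkt) {
		s->packets++;
		s->bytes += p->len;
		if (p->nextpkt == NULL)
			tail = p;
	}
	return (tail);
}

int
main(void)
{
	struct pkt c = { NULL, 40 }, b = { &c, 1500 }, a = { &b, 64 };
	struct stats s;
	struct pkt *tail = count_chain(&a, &s);

	/* prints "3 pkts, 1604 bytes, tail=40" */
	printf("%u pkts, %zu bytes, tail=%zu\n", s.packets, s.bytes, tail->len);
	return (0);
}
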
+ * + * This routine is invoked by the start worker thread; because we never call + * it directly, there is no need do deploy any serialization mechanism other + * than what's already used by the worker thread, i.e. this is already single + * threaded. + * + * This routine is called only when lo_txstart is enabled. + */ +static void +lo_start(struct ifnet *ifp) +{ + struct ifnet_stat_increment_param s; + + bzero(&s, sizeof (s)); + + for (;;) { + struct mbuf *m = NULL, *m_tail = NULL; + u_int32_t cnt, len = 0; + int sleep_chan = 0; + struct timespec ts; + + if (lo_sched_model == IFNET_SCHED_MODEL_NORMAL) { + if (ifnet_dequeue_multi(ifp, lo_dequeue_max, &m, + &m_tail, &cnt, &len) != 0) + break; + } else { + if (ifnet_dequeue_service_class_multi(ifp, + lo_dequeue_sc, lo_dequeue_max, &m, + &m_tail, &cnt, &len) != 0) + break; + } + + LO_BPF_TAP_OUT_MULTI(m); + + if (lo_bw_measure) { + if (cnt >= if_bw_measure_size) + ifnet_transmit_burst_start(ifp, m); + if (lo_bw_sleep_usec > 0) { + bzero(&ts, sizeof(ts)); + ts.tv_nsec = (lo_bw_sleep_usec << 10) * cnt; + + /* Add msleep with timeout */ + (void) msleep(&sleep_chan, NULL, + PSOCK, "lo_start", &ts); + } + if (cnt >= if_bw_measure_size) + ifnet_transmit_burst_end(ifp, m_tail); + } + + /* stats are required for extended variant */ + s.packets_in = cnt; + s.packets_out = cnt; + s.bytes_in = len; + s.bytes_out = len; + + (void) ifnet_input_extended(ifp, m, m_tail, &s); + } +} /* * This is a common pre-output route used by INET and INET6. This could * (should?) be split into separate pre-output routines for each protocol. */ - static errno_t -lo_pre_output( - __unused ifnet_t ifp, - protocol_family_t protocol_family, - mbuf_t *m, - __unused const struct sockaddr *dst, - void *route, - char *frame_type, - __unused char *dst_addr) - +lo_pre_output(struct ifnet *ifp, protocol_family_t protocol_family, + struct mbuf **m, const struct sockaddr *dst, void *route, char *frame_type, + char *dst_addr) { - register struct rtentry *rt = route; +#pragma unused(ifp, dst, dst_addr) + struct rtentry *rt = route; (*m)->m_flags |= M_LOOP; @@ -266,7 +448,7 @@ lo_pre_output( if (rt_flags & (RTF_REJECT | RTF_BLACKHOLE)) { if (rt_flags & RTF_BLACKHOLE) { m_freem(*m); - return EJUSTRETURN; + return (EJUSTRETURN); } else { return ((rt_flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH); @@ -274,9 +456,9 @@ lo_pre_output( } } - *(protocol_family_t*)frame_type = protocol_family; + bcopy(&protocol_family, frame_type, sizeof (protocol_family)); - return 0; + return (0); } /* @@ -284,26 +466,19 @@ lo_pre_output( * ifq/schednetisr input mechanism. */ static errno_t -lo_input( - __unused ifnet_t ifp, - __unused protocol_family_t protocol_family, - mbuf_t m) +lo_input(struct ifnet *ifp, protocol_family_t protocol_family, struct mbuf *m) { +#pragma unused(ifp, protocol_family) if (proto_input(protocol_family, m) != 0) m_freem(m); return (0); } - - - /* ARGSUSED */ static void -lortrequest( - __unused int cmd, - struct rtentry *rt, - __unused struct sockaddr *sa) +lo_rtrequest(int cmd, struct rtentry *rt, struct sockaddr *sa) { +#pragma unused(cmd, sa) if (rt != NULL) { RT_LOCK_ASSERT_HELD(rt); rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; /* for ISO */ @@ -312,8 +487,7 @@ lortrequest( * should be at least twice the MTU plus a little more for * overhead. */ - rt->rt_rmx.rmx_recvpipe = - rt->rt_rmx.rmx_sendpipe = 3 * LOMTU; + rt->rt_rmx.rmx_recvpipe = rt->rt_rmx.rmx_sendpipe = 3 * LOMTU; } } @@ -321,31 +495,30 @@ lortrequest( * Process an ioctl request. 
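
lo_start above is the new-model transmit path: a worker thread repeatedly drains the send queue in batches of up to lo_dequeue_max packets and feeds each batch back in as input. A single-threaded sketch of that drain loop, with a trivial ring buffer standing in for ifnet_dequeue_multi (names invented):

#include <stdio.h>

#define QCAP		8
#define DEQUEUE_MAX	3		/* stands in for lo_dequeue_max */

struct queue {
	int	items[QCAP];
	int	head, tail;		/* head == tail means empty */
};

/* Pop up to 'max' items; returns how many were dequeued (0 = empty). */
static int
dequeue_multi(struct queue *q, int max, int *out)
{
	int n = 0;

	while (n < max && q->head != q->tail) {
		out[n++] = q->items[q->head];
		q->head = (q->head + 1) % QCAP;
	}
	return (n);
}

int
main(void)
{
	struct queue q = { { 0 }, 0, 0 };
	int batch[DEQUEUE_MAX], i, n;

	for (i = 1; i <= 5; i++) {	/* enqueue five "packets" */
		q.items[q.tail] = i;
		q.tail = (q.tail + 1) % QCAP;
	}

	/* The drain loop: keep pulling batches until the queue is empty,
	 * as lo_start() loops until its dequeue call fails. */
	while ((n = dequeue_multi(&q, DEQUEUE_MAX, batch)) > 0) {
		printf("batch of %d:", n);
		for (i = 0; i < n; i++)
			printf(" %d", batch[i]);
		printf("\n");
	}
	return (0);
}
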
*/ static errno_t -loioctl( - ifnet_t ifp, - u_long cmd, - void* data) +lo_ioctl(struct ifnet *ifp, u_long cmd, void *data) { - register struct ifaddr *ifa; - register struct ifreq *ifr = (struct ifreq *)data; - register int error = 0; + int error = 0; switch (cmd) { - case SIOCSIFADDR: - ifnet_set_flags(ifp, IFF_UP | IFF_RUNNING, IFF_UP | IFF_RUNNING); - ifa = (struct ifaddr *)data; + case SIOCSIFADDR: { /* struct ifaddr pointer */ + struct ifaddr *ifa = data; + + ifnet_set_flags(ifp, IFF_UP|IFF_RUNNING, IFF_UP|IFF_RUNNING); IFA_LOCK_SPIN(ifa); - ifa->ifa_rtrequest = lortrequest; + ifa->ifa_rtrequest = lo_rtrequest; IFA_UNLOCK(ifa); /* * Everything else is done at a higher level. */ break; + } - case SIOCADDMULTI: - case SIOCDELMULTI: - if (ifr == 0) { + case SIOCADDMULTI: /* struct ifreq */ + case SIOCDELMULTI: { /* struct ifreq */ + struct ifreq *ifr = data; + + if (ifr == NULL) { error = EAFNOSUPPORT; /* XXX */ break; } @@ -365,12 +538,16 @@ loioctl( break; } break; + } + + case SIOCSIFMTU: { /* struct ifreq */ + struct ifreq *ifr = data; - case SIOCSIFMTU: - ifp->if_mtu = ifr->ifr_mtu; + bcopy(&ifr->ifr_mtu, &ifp->if_mtu, sizeof (int)); break; + } - case SIOCSIFFLAGS: + case SIOCSIFFLAGS: /* struct ifreq */ break; default: @@ -382,113 +559,208 @@ loioctl( #endif /* NLOOP > 0 */ -static errno_t lo_attach_proto(ifnet_t ifp, protocol_family_t protocol_family) +static errno_t +lo_attach_proto(struct ifnet *ifp, protocol_family_t protocol_family) { struct ifnet_attach_proto_param_v2 proto; errno_t result = 0; - - bzero(&proto, sizeof(proto)); + + bzero(&proto, sizeof (proto)); proto.input = lo_input; proto.pre_output = lo_pre_output; - + result = ifnet_attach_protocol_v2(ifp, protocol_family, &proto); if (result && result != EEXIST) { - printf("lo_attach_proto: ifnet_attach_protocol for %u returned=%d\n", - protocol_family, result); + printf("lo_attach_proto: ifnet_attach_protocol for %u " + "returned=%d\n", protocol_family, result); } - - return result; + + return (result); } -static void lo_reg_if_mods(void) +static void +lo_reg_if_mods(void) { - int error; + int error; /* Register protocol registration functions */ - if ((error = proto_register_plumber(PF_INET, APPLE_IF_FAM_LOOPBACK, lo_attach_proto, NULL)) != 0) - printf("proto_register_plumber failed for AF_INET error=%d\n", error); - - if ((error = proto_register_plumber(PF_INET6, APPLE_IF_FAM_LOOPBACK, lo_attach_proto, NULL)) != 0) - printf("proto_register_plumber failed for AF_INET6 error=%d\n", error); + if ((error = proto_register_plumber(PF_INET, + APPLE_IF_FAM_LOOPBACK, lo_attach_proto, NULL)) != 0) + printf("proto_register_plumber failed for AF_INET " + "error=%d\n", error); + + if ((error = proto_register_plumber(PF_INET6, + APPLE_IF_FAM_LOOPBACK, lo_attach_proto, NULL)) != 0) + printf("proto_register_plumber failed for AF_INET6 " + "error=%d\n", error); } static errno_t -lo_set_bpf_tap( - ifnet_t ifp, - bpf_tap_mode mode, - bpf_packet_func bpf_callback) +lo_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode mode, + bpf_packet_func bpf_callback) { + VERIFY(ifp == lo_ifp); - /* - * NEED MUTEX HERE XXX - */ - if (mode == BPF_TAP_DISABLE) { - lo_statics[ifp->if_unit].bpf_mode = mode; - lo_statics[ifp->if_unit].bpf_callback = bpf_callback; - } - else { - lo_statics[ifp->if_unit].bpf_callback = bpf_callback; - lo_statics[ifp->if_unit].bpf_mode = mode; + lo_statics[0].bpf_mode = mode; + + switch (mode) { + case BPF_TAP_DISABLE: + case BPF_TAP_INPUT: + lo_statics[0].bpf_callback = NULL; + break; + + case BPF_TAP_OUTPUT: + case 
BPF_TAP_INPUT_OUTPUT: + lo_statics[0].bpf_callback = bpf_callback; + break; } - return 0; + return (0); } - /* ARGSUSED */ void loopattach(void) { - struct ifnet_init_params lo_init; + struct ifnet_init_eparams lo_init; errno_t result = 0; -#if NLOOP != 1 -More than one loopback interface is not supported. -#endif + PE_parse_boot_argn("lo_txstart", &lo_txstart, sizeof (lo_txstart)); lo_reg_if_mods(); - - lo_statics[0].bpf_callback = 0; - lo_statics[0].bpf_mode = BPF_TAP_DISABLE; - - bzero(&lo_init, sizeof(lo_init)); - lo_init.name = "lo"; - lo_init.unit = 0; - lo_init.family = IFNET_FAMILY_LOOPBACK; - lo_init.type = IFT_LOOP; - lo_init.output = lo_output; - lo_init.demux = lo_demux; - lo_init.add_proto = lo_add_proto; - lo_init.del_proto = lo_del_proto; - lo_init.framer = lo_framer; - lo_init.softc = &lo_statics[0]; - lo_init.ioctl = loioctl; - lo_init.set_bpf_tap = lo_set_bpf_tap; - result = ifnet_allocate(&lo_init, &lo_ifp); + + lo_statics[0].bpf_callback = NULL; + lo_statics[0].bpf_mode = BPF_TAP_DISABLE; + + bzero(&lo_init, sizeof (lo_init)); + lo_init.ver = IFNET_INIT_CURRENT_VERSION; + lo_init.len = sizeof (lo_init); + lo_init.sndq_maxlen = LOSNDQ_MAXLEN; + if (lo_txstart) { + lo_init.flags = 0; + lo_init.pre_enqueue = lo_pre_enqueue; + lo_init.start = lo_start; + lo_init.output_sched_model = lo_sched_model; + } else { + lo_init.flags = IFNET_INIT_LEGACY; + lo_init.output = lo_output; + } + lo_init.name = "lo"; + lo_init.unit = 0; + lo_init.family = IFNET_FAMILY_LOOPBACK; + lo_init.type = IFT_LOOP; + lo_init.demux = lo_demux; + lo_init.add_proto = lo_add_proto; + lo_init.del_proto = lo_del_proto; + lo_init.framer = lo_framer; + lo_init.softc = &lo_statics[0]; + lo_init.ioctl = lo_ioctl; + lo_init.set_bpf_tap = lo_set_bpf_tap; + + result = ifnet_allocate_extended(&lo_init, &lo_ifp); if (result != 0) { - printf("ifnet_allocate for lo0 failed - %d\n", result); - return; + panic("%s: couldn't allocate loopback ifnet (%d)\n", + __func__, result); + /* NOTREACHED */ } - + ifnet_set_mtu(lo_ifp, LOMTU); - ifnet_set_flags(lo_ifp, IFF_LOOPBACK | IFF_MULTICAST, IFF_LOOPBACK | IFF_MULTICAST); - ifnet_set_offload(lo_ifp, IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | - IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | IFNET_IPV6_FRAGMENT | - IFNET_CSUM_FRAGMENT | IFNET_IP_FRAGMENT | IFNET_MULTIPAGES); - ifnet_set_hdrlen(lo_ifp, sizeof(struct loopback_header)); + ifnet_set_flags(lo_ifp, IFF_LOOPBACK | IFF_MULTICAST, + IFF_LOOPBACK | IFF_MULTICAST); + ifnet_set_offload(lo_ifp, + IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | + IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | IFNET_IPV6_FRAGMENT | + IFNET_CSUM_FRAGMENT | IFNET_IP_FRAGMENT | IFNET_MULTIPAGES); + ifnet_set_hdrlen(lo_ifp, sizeof (struct loopback_header)); ifnet_set_eflags(lo_ifp, IFEF_SENDLIST, IFEF_SENDLIST); #if CONFIG_MACF_NET - mac_ifnet_label_init(ifp); + mac_ifnet_label_init(ifp); #endif result = ifnet_attach(lo_ifp, NULL); if (result != 0) { - printf("ifnet_attach lo0 failed - %d\n", result); - return; + panic("%s: couldn't attach loopback ifnet (%d)\n", + __func__, result); + /* NOTREACHED */ + } + bpfattach(lo_ifp, DLT_NULL, sizeof (u_int32_t)); +} + +static int +sysctl_dequeue_max SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + u_int32_t i; + int err; + + i = lo_dequeue_max; + + err = sysctl_handle_int(oidp, &i, 0, req); + if (err != 0 || req->newptr == USER_ADDR_NULL) + return (err); + + if (i < 1) + i = 1; + else if (i > LOSNDQ_MAXLEN) + i = LOSNDQ_MAXLEN; + + lo_dequeue_max = i; + + return (err); +} + +static int 
+sysctl_sched_model SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + u_int32_t i; + int err; + + i = lo_sched_model; + + err = sysctl_handle_int(oidp, &i, 0, req); + if (err != 0 || req->newptr == USER_ADDR_NULL) + return (err); + + switch (i) { + case IFNET_SCHED_MODEL_NORMAL: + case IFNET_SCHED_MODEL_DRIVER_MANAGED: + break; + + default: + err = EINVAL; + break; } - bpfattach(lo_ifp, DLT_NULL, sizeof(u_int)); - - loopattach_done = 1; + + if (err == 0 && (err = ifnet_set_output_sched_model(lo_ifp, i)) == 0) + lo_sched_model = i; + + return (err); +} + +static int +sysctl_dequeue_scidx SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + u_int32_t i; + int err; + + i = lo_dequeue_scidx; + + err = sysctl_handle_int(oidp, &i, 0, req); + if (err != 0 || req->newptr == USER_ADDR_NULL) + return (err); + + if (!MBUF_VALID_SCIDX(i)) + return (EINVAL); + + if (lo_sched_model != IFNET_SCHED_MODEL_DRIVER_MANAGED) + return (ENODEV); + + lo_dequeue_sc = m_service_class_from_idx(i); + lo_dequeue_scidx = MBUF_SCIDX(lo_dequeue_sc); + + return (err); } diff --git a/bsd/net/if_media.h b/bsd/net/if_media.h index 32afe224d..12cbc871b 100644 --- a/bsd/net/if_media.h +++ b/bsd/net/if_media.h @@ -221,7 +221,7 @@ int ifmedia_ioctl(struct ifnet *ifp, struct ifreq *ifr, #define IFM_FDX 0x00100000 /* Force full duplex */ #define IFM_HDX 0x00200000 /* Force half duplex */ #define IFM_FLOW 0x00400000 /* enable hardware flow control */ -#define IFM_EEE 0x00800000 /* Driver defined flag */ +#define IFM_EEE 0x00800000 /* Support energy efficient ethernet */ #define IFM_FLAG0 0x01000000 /* Driver defined flag */ #define IFM_FLAG1 0x02000000 /* Driver defined flag */ #define IFM_FLAG2 0x04000000 /* Driver defined flag */ diff --git a/bsd/net/if_mib.c b/bsd/net/if_mib.c index 9ab76f698..a7fd2db5c 100644 --- a/bsd/net/if_mib.c +++ b/bsd/net/if_mib.c @@ -108,12 +108,12 @@ SYSCTL_NODE(_net_link_generic, IFMIB_IFALLDATA, ifalldata, CTLFLAG_RD | CTLFLAG_ static int make_ifmibdata(struct ifnet *, int *, struct sysctl_req *); -int +int make_ifmibdata(struct ifnet *ifp, int *name, struct sysctl_req *req) { struct ifmibdata ifmd; int error = 0; - + switch(name[1]) { default: error = ENOENT; @@ -127,15 +127,15 @@ make_ifmibdata(struct ifnet *ifp, int *name, struct sysctl_req *req) if (ifnet_is_attached(ifp, 0)) { snprintf(ifmd.ifmd_name, sizeof(ifmd.ifmd_name), "%s%d", ifp->if_name, ifp->if_unit); - + #define COPY(fld) ifmd.ifmd_##fld = ifp->if_##fld COPY(pcount); COPY(flags); if_data_internal_to_if_data64(ifp, &ifp->if_data, &ifmd.ifmd_data); #undef COPY - ifmd.ifmd_snd_len = ifp->if_snd.ifq_len; - ifmd.ifmd_snd_maxlen = ifp->if_snd.ifq_maxlen; - ifmd.ifmd_snd_drops = ifp->if_snd.ifq_drops; + ifmd.ifmd_snd_len = IFCQ_LEN(&ifp->if_snd); + ifmd.ifmd_snd_maxlen = IFCQ_MAXLEN(&ifp->if_snd); + ifmd.ifmd_snd_drops = ifp->if_snd.ifcq_dropcnt.packets; } error = SYSCTL_OUT(req, &ifmd, sizeof ifmd); if (error || !req->newptr) @@ -176,15 +176,25 @@ make_ifmibdata(struct ifnet *ifp, int *name, struct sysctl_req *req) break; case IFDATA_SUPPLEMENTAL: { - struct if_traffic_class if_tc; + struct ifmibdata_supplemental *ifmd_supp; + + if ((ifmd_supp = _MALLOC(sizeof (*ifmd_supp), M_TEMP, + M_NOWAIT | M_ZERO)) == NULL) { + error = ENOMEM; + break; + } - if_copy_traffic_class(ifp, &if_tc); - - error = SYSCTL_OUT(req, &if_tc, sizeof(struct if_traffic_class)); + if_copy_traffic_class(ifp, &ifmd_supp->ifmd_traffic_class); + if_copy_data_extended(ifp, &ifmd_supp->ifmd_data_extended); + if_copy_packet_stats(ifp, &ifmd_supp->ifmd_packet_stats); + 
if_copy_rxpoll_stats(ifp, &ifmd_supp->ifmd_rxpoll_stats); + + error = SYSCTL_OUT(req, ifmd_supp, sizeof (*ifmd_supp)); + _FREE(ifmd_supp, M_TEMP); break; } } - + return error; } diff --git a/bsd/net/if_mib.h b/bsd/net/if_mib.h index 5b773bddf..3dbf262a2 100644 --- a/bsd/net/if_mib.h +++ b/bsd/net/if_mib.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -74,6 +74,9 @@ struct ifmibdata { #ifdef PRIVATE struct ifmibdata_supplemental { struct if_traffic_class ifmd_traffic_class; + struct if_data_extended ifmd_data_extended; + struct if_packet_stats ifmd_packet_stats; + struct if_rxpoll_stats ifmd_rxpoll_stats; }; #endif /* PRIVATE */ diff --git a/bsd/net/if_pflog.c b/bsd/net/if_pflog.c index ae2f9254c..18d6435e0 100644 --- a/bsd/net/if_pflog.c +++ b/bsd/net/if_pflog.c @@ -125,10 +125,6 @@ pfloginit(void) { int i; - if (pf_perim_lock == NULL || pf_lock == NULL) { - panic("%s: called before PF is initialized", __func__); - /* NOTREACHED */ - } LIST_INIT(&pflogif_list); for (i = 0; i < PFLOGIFS_MAX; i++) pflogifs[i] = NULL; diff --git a/bsd/net/if_stf.c b/bsd/net/if_stf.c index c9d24e249..41d1c15db 100644 --- a/bsd/net/if_stf.c +++ b/bsd/net/if_stf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -150,7 +150,7 @@ #include #endif -#define GET_V4(x) ((const struct in_addr *)(&(x)->s6_addr16[1])) +#define GET_V4(x) ((const struct in_addr *)(const void *)(&(x)->s6_addr16[1])) static lck_grp_t *stf_mtx_grp; @@ -473,7 +473,7 @@ stf_getsrcifa6(struct ifnet *ifp) IFA_UNLOCK(ia); continue; } - sin6 = (struct sockaddr_in6 *)ia->ifa_addr; + sin6 = (struct sockaddr_in6 *)(void *)ia->ifa_addr; if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) { IFA_UNLOCK(ia); continue; @@ -524,11 +524,11 @@ stf_pre_output( struct ip6_hdr *ip6; struct in6_ifaddr *ia6; struct sockaddr_in *dst4; - struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; + struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF }; errno_t result = 0; sc = ifnet_softc(ifp); - dst6 = (const struct sockaddr_in6 *)dst; + dst6 = (const struct sockaddr_in6 *)(const void *)dst; /* just in case */ if ((ifnet_flags(ifp) & IFF_UP) == 0) { @@ -603,7 +603,7 @@ stf_pre_output( ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); lck_mtx_lock(&sc->sc_ro_mtx); - dst4 = (struct sockaddr_in *)&sc->sc_ro.ro_dst; + dst4 = (struct sockaddr_in *)(void *)&sc->sc_ro.ro_dst; if (dst4->sin_family != AF_INET || bcmp(&dst4->sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)) != 0) { /* cache route doesn't match: always the case during the first use */ @@ -616,7 +616,8 @@ stf_pre_output( } } - result = ip_output_list(m, 0, NULL, &sc->sc_ro, IP_OUTARGS, NULL, &ipoa); + result = ip_output_list(m, 0, NULL, &sc->sc_ro, IP_OUTARGS, NULL, + &ipoa); lck_mtx_unlock(&sc->sc_ro_mtx); /* Assumption: ip_output will free mbuf on errors */ @@ -865,7 +866,7 @@ stf_ioctl( error = EAFNOSUPPORT; break; } - sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; + sin6 = (struct sockaddr_in6 *)(void *)ifa->ifa_addr; if (IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) { if ( !(ifnet_flags( ifp ) & IFF_UP) ) { /* do this only if the interface is not already up */ diff --git a/bsd/net/if_utun.c b/bsd/net/if_utun.c index a8667845b..8f766ba3c 100644 --- a/bsd/net/if_utun.c +++ b/bsd/net/if_utun.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2010 Apple Inc. 
All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,7 +73,12 @@ static errno_t utun_demux(ifnet_t interface, mbuf_t data, char *frame_header, protocol_family_t *protocol); static errno_t utun_framer(ifnet_t interface, mbuf_t *packet, const struct sockaddr *dest, const char *desk_linkaddr, - const char *frame_type); + const char *frame_type +#if KPI_INTERFACE_EMBEDDED + , + u_int32_t *prepend_len, u_int32_t *postpend_len +#endif /* KPI_INTERFACE_EMBEDDED */ + ); static errno_t utun_add_proto(ifnet_t interface, protocol_family_t protocol, const struct ifnet_demux_desc *demux_array, u_int32_t demux_count); @@ -88,15 +93,7 @@ static errno_t utun_proto_input(ifnet_t interface, protocol_family_t protocol, static errno_t utun_proto_pre_output(ifnet_t interface, protocol_family_t protocol, mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type, char *link_layer_dest); - -/* Control block allocated for each kernel control connection */ -struct utun_pcb { - kern_ctl_ref utun_ctlref; - ifnet_t utun_ifp; - u_int32_t utun_unit; - u_int32_t utun_flags; - int utun_ext_ifdata_stats; -}; +__private_extern__ errno_t utun_pkt_input (struct utun_pcb *pcb, mbuf_t m); static kern_ctl_ref utun_kctlref; static u_int32_t utun_family; @@ -104,7 +101,7 @@ static OSMallocTag utun_malloc_tag; static SInt32 utun_ifcount = 0; /* Prepend length */ -static void* +void* utun_alloc(size_t size) { size_t *mem = OSMalloc(size + sizeof(size_t), utun_malloc_tag); @@ -117,7 +114,7 @@ utun_alloc(size_t size) return (void*)mem; } -static void +void utun_free(void *ptr) { size_t *size = ptr; @@ -423,7 +420,9 @@ utun_ctl_disconnect( struct utun_pcb *pcb = unitinfo; ifnet_t ifp = pcb->utun_ifp; errno_t result = 0; - + + utun_cleanup_crypto(pcb); + pcb->utun_ctlref = NULL; pcb->utun_unit = 0; @@ -455,37 +454,7 @@ utun_ctl_send( mbuf_t m, __unused int flags) { - struct utun_pcb *pcb = unitinfo; - errno_t result; - - mbuf_pkthdr_setrcvif(m, pcb->utun_ifp); - - bpf_tap_in(pcb->utun_ifp, DLT_NULL, m, 0, 0); - - if (pcb->utun_flags & UTUN_FLAGS_NO_INPUT) { - /* flush data */ - mbuf_freem(m); - return 0; - } - - if (!pcb->utun_ext_ifdata_stats) { - struct ifnet_stat_increment_param incs; - - bzero(&incs, sizeof(incs)); - incs.packets_in = 1; - incs.bytes_in = mbuf_pkthdr_len(m); - result = ifnet_input(pcb->utun_ifp, m, &incs); - } else { - result = ifnet_input(pcb->utun_ifp, m, NULL); - } - if (result != 0) { - ifnet_stat_increment_in(pcb->utun_ifp, 0, 0, 1); - - printf("utun_ctl_send - ifnet_input failed: %d\n", result); - mbuf_freem(m); - } - - return 0; + return utun_pkt_input((struct utun_pcb *)unitinfo, m); } static errno_t @@ -518,6 +487,30 @@ utun_ctl_setopt( pcb->utun_flags = *(u_int32_t *)data; break; + case UTUN_OPT_ENABLE_CRYPTO: + result = utun_ctl_enable_crypto(kctlref, unit, unitinfo, opt, data, len); + break; + + case UTUN_OPT_CONFIG_CRYPTO_KEYS: + result = utun_ctl_config_crypto_keys(kctlref, unit, unitinfo, opt, data, len); + break; + + case UTUN_OPT_UNCONFIG_CRYPTO_KEYS: + result = utun_ctl_unconfig_crypto_keys(kctlref, unit, unitinfo, opt, data, len); + break; + + case UTUN_OPT_DISABLE_CRYPTO: + result = utun_ctl_disable_crypto(kctlref, unit, unitinfo, opt, data, len); + break; + + case UTUN_OPT_STOP_CRYPTO_DATA_TRAFFIC: + result = utun_ctl_stop_crypto_data_traffic(kctlref, unit, unitinfo, opt, data, len); + break; + + case UTUN_OPT_START_CRYPTO_DATA_TRAFFIC: + result = utun_ctl_start_crypto_data_traffic(kctlref, unit, 
unitinfo, opt, data, len); + break; + case UTUN_OPT_EXT_IFDATA_STATS: if (len != sizeof(int)) { result = EMSGSIZE; @@ -586,6 +579,10 @@ utun_ctl_getopt( *len = snprintf(data, *len, "%s%d", ifnet_name(pcb->utun_ifp), ifnet_unit(pcb->utun_ifp)) + 1; break; + case UTUN_OPT_GENERATE_CRYPTO_KEYS_IDX: + result = utun_ctl_generate_crypto_keys_idx(kctlref, unit, unitinfo, opt, data, len); + break; + default: result = ENOPROTOOPT; break; @@ -611,8 +608,18 @@ utun_output( return 0; } + // otherwise, fall thru to ctl_enqueumbuf if (pcb->utun_ctlref) { - int length = mbuf_pkthdr_len(data); + int length; + + // only pass packets to utun-crypto if crypto is enabled and 'suspend data traffic' is not. + if ((pcb->utun_flags & (UTUN_FLAGS_CRYPTO | UTUN_FLAGS_CRYPTO_STOP_DATA_TRAFFIC)) == UTUN_FLAGS_CRYPTO) { + if (utun_pkt_crypto_output(pcb, &data) == 0) { + return 0; + } + } + + length = mbuf_pkthdr_len(data); result = ctl_enqueuembuf(pcb->utun_ctlref, pcb->utun_unit, data, CTL_DATA_EOR); if (result != 0) { mbuf_freem(data); @@ -657,7 +664,13 @@ utun_framer( mbuf_t *packet, __unused const struct sockaddr *dest, __unused const char *desk_linkaddr, - const char *frame_type) + const char *frame_type +#if KPI_INTERFACE_EMBEDDED + , + u_int32_t *prepend_len, + u_int32_t *postpend_len +#endif /* KPI_INTERFACE_EMBEDDED */ + ) { if (mbuf_prepend(packet, sizeof(protocol_family_t), MBUF_DONTWAIT) != 0) { printf("utun_framer - ifnet_output prepend failed\n"); @@ -667,6 +680,10 @@ utun_framer( // just return, because the buffer was freed in mbuf_prepend return EJUSTRETURN; } +#if KPI_INTERFACE_EMBEDDED + *prepend_len = sizeof(protocol_family_t); + *postpend_len = 0; +#endif /* KPI_INTERFACE_EMBEDDED */ // place protocol number at the beginning of the mbuf *(protocol_family_t *)mbuf_data(*packet) = htonl(*(protocol_family_t *)(uintptr_t)(size_t)frame_type); @@ -791,3 +808,48 @@ utun_attach_proto( return result; } +errno_t +utun_pkt_input (struct utun_pcb *pcb, mbuf_t m) +{ + errno_t result; + protocol_family_t protocol; + + mbuf_pkthdr_setrcvif(m, pcb->utun_ifp); + + bpf_tap_in(pcb->utun_ifp, DLT_NULL, m, 0, 0); + + if (pcb->utun_flags & UTUN_FLAGS_NO_INPUT) { + /* flush data */ + mbuf_freem(m); + return 0; + } + protocol = ntohl(*(u_int32_t *)mbuf_data(m)); + + // quick exit for keepalive packets + if (protocol == AF_UTUN && pcb->utun_flags & UTUN_FLAGS_CRYPTO) { + if (utun_pkt_crypto_output(pcb, &m) == 0) { + return 0; + } + printf("%s: utun_pkt_crypto_output failed, flags %x\n", __FUNCTION__, pcb->utun_flags); + return EINVAL; + } + + if (!pcb->utun_ext_ifdata_stats) { + struct ifnet_stat_increment_param incs; + + bzero(&incs, sizeof(incs)); + incs.packets_in = 1; + incs.bytes_in = mbuf_pkthdr_len(m); + result = ifnet_input(pcb->utun_ifp, m, &incs); + } else { + result = ifnet_input(pcb->utun_ifp, m, NULL); + } + if (result != 0) { + ifnet_stat_increment_in(pcb->utun_ifp, 0, 0, 1); + + printf("%s - ifnet_input failed: %d\n", __FUNCTION__, result); + mbuf_freem(m); + } + + return 0; +} diff --git a/bsd/net/if_utun.h b/bsd/net/if_utun.h index d1860e11a..32379a882 100644 --- a/bsd/net/if_utun.h +++ b/bsd/net/if_utun.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2009 Apple Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -30,8 +30,24 @@ #ifndef _NET_IF_UTUN_H_ #define _NET_IF_UTUN_H_ +#include + #ifdef KERNEL_PRIVATE +#include + +/* Control block allocated for each kernel control connection */ +struct utun_pcb { + kern_ctl_ref utun_ctlref; + ifnet_t utun_ifp; + u_int32_t utun_unit; + u_int32_t utun_flags; + int utun_ext_ifdata_stats; + utun_crypto_ctx_t utun_crypto_ctx[UTUN_CRYPTO_CTX_NUM_DIRS]; +}; + +void* utun_alloc(size_t size); +void utun_free(void *ptr); errno_t utun_register_control(void); #endif @@ -44,17 +60,34 @@ errno_t utun_register_control(void); /* * Socket option names to manage utun */ -#define UTUN_OPT_FLAGS 1 -#define UTUN_OPT_IFNAME 2 -#define UTUN_OPT_EXT_IFDATA_STATS 3 /* get|set (type int) */ -#define UTUN_OPT_INC_IFDATA_STATS_IN 4 /* set to increment stat counters (type struct utun_stats_param) */ -#define UTUN_OPT_INC_IFDATA_STATS_OUT 5 /* set to increment stat counters (type struct utun_stats_param) */ +#define UTUN_OPT_FLAGS 1 +#define UTUN_OPT_IFNAME 2 +#define UTUN_OPT_EXT_IFDATA_STATS 3 /* get|set (type int) */ +#define UTUN_OPT_INC_IFDATA_STATS_IN 4 /* set to increment stat counters (type struct utun_stats_param) */ +#define UTUN_OPT_INC_IFDATA_STATS_OUT 5 /* set to increment stat counters (type struct utun_stats_param) */ +#define UTUN_OPT_ENABLE_CRYPTO 6 +#define UTUN_OPT_CONFIG_CRYPTO_KEYS 7 +#define UTUN_OPT_UNCONFIG_CRYPTO_KEYS 8 +#define UTUN_OPT_GENERATE_CRYPTO_KEYS_IDX 9 +#define UTUN_OPT_DISABLE_CRYPTO 10 +#define UTUN_OPT_STOP_CRYPTO_DATA_TRAFFIC 11 +#define UTUN_OPT_START_CRYPTO_DATA_TRAFFIC 12 /* * Flags for by UTUN_OPT_FLAGS */ #define UTUN_FLAGS_NO_OUTPUT 0x0001 #define UTUN_FLAGS_NO_INPUT 0x0002 +#define UTUN_FLAGS_CRYPTO 0x0004 +#define UTUN_FLAGS_CRYPTO_STOP_DATA_TRAFFIC 0x0008 + +/* + * utun packet type flags + */ +#define UTUN_PKT_TYPE_KEEPALIVE 0x0001 +#define UTUN_PKT_TYPE_IPSEC 0x0002 +#define UTUN_PKT_TYPE_DTLS 0x0004 + /* * utun stats parameter structure diff --git a/bsd/net/if_utun_crypto.c b/bsd/net/if_utun_crypto.c new file mode 100644 index 000000000..176f4cd66 --- /dev/null +++ b/bsd/net/if_utun_crypto.c @@ -0,0 +1,532 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + + +#include +#include +#include +#include +#include +#include +#include + +void +utun_cleanup_crypto (struct utun_pcb *pcb) +{ + utun_cleanup_all_crypto_ipsec(pcb); + // utun_cleanup_all_crypto_dtls(pcb); + pcb->utun_flags &= ~UTUN_FLAGS_CRYPTO; +} + +errno_t +utun_ctl_enable_crypto (__unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + __unused void *unitinfo, + __unused int opt, + void *data, + size_t len) +{ + struct utun_pcb *pcb = unitinfo; + + /* + * - verify the crypto context args passed from user-land. + * - check the size of the argument buffer. + * - check the direction (IN or OUT) + * - check the type (IPSec or DTLS) + * - ensure that the crypto context is *not* already valid (don't recreate already valid context). + * - we have only one context per direction and type. + * - any error should be equivalent to noop. + */ + if (len < UTUN_CRYPTO_ARGS_HDR_SIZE) { + return EMSGSIZE; + } else { + int idx; + utun_crypto_args_t *crypto_args = (__typeof__(crypto_args))data; + utun_crypto_ctx_t *crypto_ctx; + + if (crypto_args->ver == 0 || crypto_args->ver >= UTUN_CRYPTO_ARGS_VER_MAX) { + printf("%s: ver check failed %d\n", __FUNCTION__, crypto_args->ver); + return EINVAL; + } + if (crypto_args->type == 0 || crypto_args->type >= UTUN_CRYPTO_TYPE_MAX) { + printf("%s: type check failed %d\n", __FUNCTION__, crypto_args->type); + return EINVAL; + } + if (len < UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)) { + printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, + (int)len, (int)UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)); + return EINVAL; + } + if (crypto_args->args_ulen != sizeof(crypto_args->u)) { + printf("%s: compatibility mode\n", __FUNCTION__); + } + if (crypto_args->type == UTUN_CRYPTO_TYPE_IPSEC) { + utun_ctl_enable_crypto_ipsec(pcb, crypto_args); + } else { + // unsupported + return EPROTONOSUPPORT; + } + for (idx = 0; idx < UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_MAX); idx++) { + crypto_ctx = &pcb->utun_crypto_ctx[idx]; + if (crypto_ctx->valid) { + return EBADF; + } + + crypto_ctx->type = crypto_args->type; + LIST_INIT(&crypto_ctx->keys_listhead); + crypto_ctx->valid = 1; + } + // data traffic is stopped by default + pcb->utun_flags |= (UTUN_FLAGS_CRYPTO | UTUN_FLAGS_CRYPTO_STOP_DATA_TRAFFIC); + return 0; + } +} + +errno_t +utun_ctl_disable_crypto (__unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + __unused void *unitinfo, + __unused int opt, + void *data, + size_t len) +{ + struct utun_pcb *pcb = unitinfo; + + /* + * - verify the crypto context args passed from user-land. + * - check the size of the argument buffer. + * - check the direction (IN or OUT) + * - check the type (IPSec or DTLS) + * - ensure that the crypto context *is* already valid (don't release invalid context). + * - we have only one context per direction and type. + * - ensure that the crypto context has no crypto material. + * - any error should be equivalent to noop. 
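+ * - on success, clears both UTUN_FLAGS_CRYPTO and UTUN_FLAGS_CRYPTO_STOP_DATA_TRAFFIC,
+ *   so subsequent packets bypass the crypto path entirely.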
+ */ + if (len < UTUN_CRYPTO_ARGS_HDR_SIZE) { + return EMSGSIZE; + } else { + utun_crypto_args_t *crypto_args = (__typeof__(crypto_args))data; + + if (crypto_args->ver == 0 || crypto_args->ver >= UTUN_CRYPTO_ARGS_VER_MAX) { + printf("%s: ver check failed %d\n", __FUNCTION__, crypto_args->ver); + return EINVAL; + } + if (crypto_args->type == 0 || crypto_args->type >= UTUN_CRYPTO_TYPE_MAX) { + printf("%s: type check failed %d\n", __FUNCTION__, crypto_args->type); + return EINVAL; + } + if (len < UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)) { + printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, + (int)len, (int)UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)); + return EINVAL; + } + if (crypto_args->args_ulen != sizeof(crypto_args->u)) { + printf("%s: compatibility mode\n", __FUNCTION__); + } + + if (crypto_args->type == UTUN_CRYPTO_TYPE_IPSEC) { + utun_ctl_disable_crypto_ipsec(pcb); + } else { + // unsupported + return EPROTONOSUPPORT; + } + } + pcb->utun_flags &= ~(UTUN_FLAGS_CRYPTO | UTUN_FLAGS_CRYPTO_STOP_DATA_TRAFFIC); + return 0; +} + +errno_t +utun_ctl_config_crypto_keys (__unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + __unused void *unitinfo, + __unused int opt, + void *data, + size_t len) +{ + struct utun_pcb *pcb = unitinfo; + + /* + * - verify the crypto material args passed from user-land. + * - check the size of the argument buffer. + * - check the direction (IN or OUT) + * - check the type (IPSec or DTLS) + * - crypto material direction and type must match the associated crypto context's. + * - we can have a list of crypto materials per context. + * - ensure that the crypto context is already valid (don't add crypto material to invalid context). + * - any error should be equivalent to noop. + */ + if (len < UTUN_CRYPTO_KEYS_ARGS_HDR_SIZE) { + return EMSGSIZE; + } else { + int idx; + utun_crypto_keys_args_t *crypto_keys_args = (__typeof__(crypto_keys_args))data; + utun_crypto_ctx_t *crypto_ctx; + utun_crypto_keys_t *crypto_keys = NULL; + + if (crypto_keys_args->ver == 0 || crypto_keys_args->ver >= UTUN_CRYPTO_KEYS_ARGS_VER_MAX) { + printf("%s: ver check failed %d\n", __FUNCTION__, crypto_keys_args->ver); + return EINVAL; + } + if (crypto_keys_args->dir == 0 || crypto_keys_args->dir >= UTUN_CRYPTO_DIR_MAX) { + printf("%s: dir check failed %d\n", __FUNCTION__, crypto_keys_args->dir); + return EINVAL; + } + if (crypto_keys_args->type == 0 || crypto_keys_args->type >= UTUN_CRYPTO_TYPE_MAX) { + printf("%s: type check failed %d\n", __FUNCTION__, crypto_keys_args->type); + return EINVAL; + } + if (len < UTUN_CRYPTO_KEYS_ARGS_TOTAL_SIZE(crypto_keys_args)) { + printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, + (int)len, (int)UTUN_CRYPTO_KEYS_ARGS_TOTAL_SIZE(crypto_keys_args)); + return EINVAL; + } + idx = UTUN_CRYPTO_DIR_TO_IDX(crypto_keys_args->dir); + crypto_ctx = &pcb->utun_crypto_ctx[idx]; + if (!crypto_ctx->valid) { + return EBADF; + } + if (crypto_keys_args->type != crypto_ctx->type) { + // can't add keymat to context with different crypto type + return ENOENT; + } + crypto_keys = utun_alloc(sizeof(*crypto_keys)); + if (!crypto_keys) { + return ENOBUFS; + } + bzero(crypto_keys, sizeof(*crypto_keys)); + if (crypto_keys_args->args_ulen != sizeof(crypto_keys_args->u)) { + printf("%s: compatibility mode\n", __FUNCTION__); + } + + // branch-off for ipsec vs. 
dtls + if (crypto_keys_args->type == UTUN_CRYPTO_TYPE_IPSEC) { + errno_t err; + if ((err = utun_ctl_config_crypto_keys_ipsec(pcb, crypto_keys_args, crypto_keys))) { + utun_free(crypto_keys); + return err; + } + } else { + // unsupported + utun_free(crypto_keys); + return EPROTONOSUPPORT; + } + crypto_keys->type = crypto_keys_args->type; + LIST_INSERT_HEAD(&crypto_ctx->keys_listhead, crypto_keys, chain); + crypto_keys->valid = 1; + } + + return 0; +} + +errno_t +utun_ctl_unconfig_crypto_keys (__unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + __unused void *unitinfo, + __unused int opt, + void *data, + size_t len) +{ + struct utun_pcb *pcb = unitinfo; + + /* + * - verify the crypto material args passed from user-land. + * - check the size of the argument buffer. + * - check the direction (IN or OUT) + * - check the type (IPSec or DTLS) + * - crypto material direction and type must match the associated crypto context's. + * - we can have a list of crypto materials per context. + * - ensure that the crypto context is already valid (don't add crypto material to invalid context). + * - any error should be equivalent to noop. + */ + if (len < UTUN_CRYPTO_KEYS_ARGS_HDR_SIZE) { + return EMSGSIZE; + } else { + int idx; + utun_crypto_keys_args_t *crypto_keys_args = (__typeof__(crypto_keys_args))data; + utun_crypto_ctx_t *crypto_ctx; + utun_crypto_keys_t *cur_crypto_keys, *nxt_crypto_keys; + + if (crypto_keys_args->ver == 0 || crypto_keys_args->ver >= UTUN_CRYPTO_KEYS_ARGS_VER_MAX) { + printf("%s: ver check failed %d\n", __FUNCTION__, crypto_keys_args->ver); + return EINVAL; + } + if (crypto_keys_args->dir == 0 || crypto_keys_args->dir >= UTUN_CRYPTO_DIR_MAX) { + printf("%s: dir check failed %d\n", __FUNCTION__, crypto_keys_args->dir); + return EINVAL; + } + if (crypto_keys_args->type == 0 || crypto_keys_args->type >= UTUN_CRYPTO_TYPE_MAX) { + printf("%s: type check failed %d\n", __FUNCTION__, crypto_keys_args->type); + return EINVAL; + } + if (len < UTUN_CRYPTO_KEYS_ARGS_TOTAL_SIZE(crypto_keys_args)) { + printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, + (int)len, (int)UTUN_CRYPTO_KEYS_ARGS_TOTAL_SIZE(crypto_keys_args)); + return EINVAL; + } + idx = UTUN_CRYPTO_DIR_TO_IDX(crypto_keys_args->dir); + crypto_ctx = &pcb->utun_crypto_ctx[idx]; + if (!crypto_ctx->valid) { + return EBADF; + } + if (crypto_keys_args->type != crypto_ctx->type) { + // can't add keymat to context with different crypto type + return ENOENT; + } + if (crypto_keys_args->args_ulen != sizeof(crypto_keys_args->u)) { + printf("%s: compatibility mode\n", __FUNCTION__); + } + + // traverse crypto materials looking for the right one + for (cur_crypto_keys = (__typeof__(cur_crypto_keys))LIST_FIRST(&crypto_ctx->keys_listhead); + cur_crypto_keys != NULL; + cur_crypto_keys = nxt_crypto_keys) { + nxt_crypto_keys = (__typeof__(nxt_crypto_keys))LIST_NEXT(cur_crypto_keys, chain); + // branch-off for ipsec vs. dtls + if (crypto_keys_args->type == UTUN_CRYPTO_TYPE_IPSEC) { + if (crypto_keys_args->u.ipsec_v1.spi == cur_crypto_keys->state.u.ipsec.spi) { + errno_t err; + if ((err = utun_ctl_unconfig_crypto_keys_ipsec(crypto_keys_args, cur_crypto_keys))) { + return err; + } + LIST_REMOVE(cur_crypto_keys, chain); + bzero(cur_crypto_keys, sizeof(*cur_crypto_keys)); + utun_free(cur_crypto_keys); + return 0; + } + } else { + // unsupported + return EPROTONOSUPPORT; + } + } + // TODO: if there is no SA left, ensure utun can't decrypt/encrypt packets directly. it should rely on the vpnplugin for that. 
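+		// Note: falling out of the loop without finding a matching SPI is a
+		// noop (returns 0), per the contract documented above.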
+ } + + return 0; +} + +errno_t +utun_ctl_generate_crypto_keys_idx (__unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + __unused void *unitinfo, + __unused int opt, + void *data, + size_t *len) +{ + struct utun_pcb *pcb = unitinfo; + + /* + * - verify the crypto material index args passed from user-land. + * - check the size of the argument buffer. + * - check the direction (IN or OUT) + * - check the type (IPSec or DTLS) + * - crypto material direction and type must match the associated crypto context's. + * - we can have a list of crypto materials per context. + * - any error should be equivalent to noop. + */ + if (*len < UTUN_CRYPTO_KEYS_IDX_ARGS_HDR_SIZE) { + return EMSGSIZE; + } else { + int idx; + utun_crypto_keys_idx_args_t *crypto_keys_idx_args = (__typeof__(crypto_keys_idx_args))data; + utun_crypto_ctx_t *crypto_ctx; + + if (crypto_keys_idx_args->ver == 0 || crypto_keys_idx_args->ver >= UTUN_CRYPTO_KEYS_ARGS_VER_MAX) { + printf("%s: ver check failed %d\n", __FUNCTION__, crypto_keys_idx_args->ver); + return EINVAL; + } + if (crypto_keys_idx_args->dir == 0 || crypto_keys_idx_args->dir >= UTUN_CRYPTO_DIR_MAX) { + printf("%s: dir check failed %d\n", __FUNCTION__, crypto_keys_idx_args->dir); + return EINVAL; + } + if (crypto_keys_idx_args->type == 0 || crypto_keys_idx_args->type >= UTUN_CRYPTO_TYPE_MAX) { + printf("%s: type check failed %d\n", __FUNCTION__, crypto_keys_idx_args->type); + return EINVAL; + } + if (*len < UTUN_CRYPTO_KEYS_IDX_ARGS_TOTAL_SIZE(crypto_keys_idx_args)) { + printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, + (int)*len, (int)UTUN_CRYPTO_KEYS_IDX_ARGS_TOTAL_SIZE(crypto_keys_idx_args)); + return EINVAL; + } + idx = UTUN_CRYPTO_DIR_TO_IDX(crypto_keys_idx_args->dir); + crypto_ctx = &pcb->utun_crypto_ctx[idx]; + if (!crypto_ctx->valid) { + return EBADF; + } + if (crypto_keys_idx_args->type != crypto_ctx->type) { + // can't add keymat to context with different crypto type + return ENOENT; + } + if (crypto_keys_idx_args->args_ulen != sizeof(crypto_keys_idx_args->u)) { + printf("%s: compatibility mode\n", __FUNCTION__); + } + + // traverse crypto materials looking for the right one + // branch-off for ipsec vs. dtls + if (crypto_keys_idx_args->type == UTUN_CRYPTO_TYPE_IPSEC) { + errno_t err; + if ((err = utun_ctl_generate_crypto_keys_idx_ipsec(crypto_keys_idx_args))) { + return err; + } + } else { + // unsupported + return EPROTONOSUPPORT; + } + } + + return 0; +} + +errno_t +utun_ctl_stop_crypto_data_traffic (__unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + __unused void *unitinfo, + __unused int opt, + void *data, + size_t len) +{ + struct utun_pcb *pcb = unitinfo; + + /* + * - verify the crypto context args passed from user-land. + * - check the size of the argument buffer. + * - check the direction (IN or OUT) + * - check the type (IPSec or DTLS) + * - ensure that the crypto context *is* already valid (don't release invalid context). + * - we have only one context per direction and type. + * - ensure that the crypto context has no crypto material. + * - any error should be equivalent to noop. 
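+ * - on success, sets UTUN_FLAGS_CRYPTO_STOP_DATA_TRAFFIC, which keeps utun_output()
+ *   from diverting data packets into utun_pkt_crypto_output().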
+ */ + if (len < UTUN_CRYPTO_ARGS_HDR_SIZE) { + return EMSGSIZE; + } else { + utun_crypto_args_t *crypto_args = (__typeof__(crypto_args))data; + + if (crypto_args->ver == 0 || crypto_args->ver >= UTUN_CRYPTO_ARGS_VER_MAX) { + printf("%s: ver check failed %d\n", __FUNCTION__, crypto_args->ver); + return EINVAL; + } + if (crypto_args->type == 0 || crypto_args->type >= UTUN_CRYPTO_TYPE_MAX) { + printf("%s: type check failed %d\n", __FUNCTION__, crypto_args->type); + return EINVAL; + } + if (len < UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)) { + printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, + (int)len, (int)UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)); + return EINVAL; + } + if (crypto_args->args_ulen != sizeof(crypto_args->u)) { + printf("%s: compatibility mode\n", __FUNCTION__); + } + + if ((pcb->utun_flags & UTUN_FLAGS_CRYPTO) == 0) { + printf("%s: crypto is already disabled\n", __FUNCTION__); + return EINVAL; + } + + if (crypto_args->type != UTUN_CRYPTO_TYPE_IPSEC) { + // unsupported + return EPROTONOSUPPORT; + } + } + pcb->utun_flags |= UTUN_FLAGS_CRYPTO_STOP_DATA_TRAFFIC; + return 0; +} + +errno_t +utun_ctl_start_crypto_data_traffic (__unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + __unused void *unitinfo, + __unused int opt, + void *data, + size_t len) +{ + struct utun_pcb *pcb = unitinfo; + + /* + * - verify the crypto context args passed from user-land. + * - check the size of the argument buffer. + * - check the direction (IN or OUT) + * - check the type (IPSec or DTLS) + * - ensure that the crypto context *is* already valid (don't release invalid context). + * - we have only one context per direction and type. + * - ensure that the crypto context has no crypto material. + * - any error should be equivalent to noop. + */ + if (len < UTUN_CRYPTO_ARGS_HDR_SIZE) { + return EMSGSIZE; + } else { + utun_crypto_args_t *crypto_args = (__typeof__(crypto_args))data; + + if (crypto_args->ver == 0 || crypto_args->ver >= UTUN_CRYPTO_ARGS_VER_MAX) { + printf("%s: ver check failed %d\n", __FUNCTION__, crypto_args->ver); + return EINVAL; + } + if (crypto_args->type == 0 || crypto_args->type >= UTUN_CRYPTO_TYPE_MAX) { + printf("%s: type check failed %d\n", __FUNCTION__, crypto_args->type); + return EINVAL; + } + if (len < UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)) { + printf("%s: vlen check failed (%d,%d)\n", __FUNCTION__, + (int)len, (int)UTUN_CRYPTO_ARGS_TOTAL_SIZE(crypto_args)); + return EINVAL; + } + if (crypto_args->args_ulen != sizeof(crypto_args->u)) { + printf("%s: compatibility mode\n", __FUNCTION__); + } + + if ((pcb->utun_flags & UTUN_FLAGS_CRYPTO) == 0) { + printf("%s: crypto is already disabled\n", __FUNCTION__); + return EINVAL; + } + + if (crypto_args->type != UTUN_CRYPTO_TYPE_IPSEC) { + // unsupported + return EPROTONOSUPPORT; + } + } + pcb->utun_flags &= ~UTUN_FLAGS_CRYPTO_STOP_DATA_TRAFFIC; + return 0; +} + +int +utun_pkt_crypto_output (struct utun_pcb *pcb, mbuf_t *m) +{ + int idx = UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_OUT); + if (!pcb->utun_crypto_ctx[idx].valid) { + printf("%s: context is invalid %d\n", __FUNCTION__, pcb->utun_crypto_ctx[idx].valid); + return -1; + } + if (pcb->utun_crypto_ctx[idx].type == UTUN_CRYPTO_TYPE_IPSEC) { + return(utun_pkt_ipsec_output(pcb, m)); + } else { + // unsupported + printf("%s: type is invalid %d\n", __FUNCTION__, pcb->utun_crypto_ctx[idx].type); + } + return -1; +} diff --git a/bsd/net/if_utun_crypto.h b/bsd/net/if_utun_crypto.h new file mode 100644 index 000000000..804ffa91e --- /dev/null +++ b/bsd/net/if_utun_crypto.h @@ -0,0 
+1,353 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_IF_UTUN_CRYPTO_H_ +#define _NET_IF_UTUN_CRYPTO_H_ + +// constants used in configuring the crypto context +typedef enum utun_crypto_ver { + UTUN_CRYPTO_VER_1 = 1, + UTUN_CRYPTO_VER_MAX, +} utun_crypto_ver_t; + +#define UTUN_CRYPTO_KEYS_IPSEC_VER_1 UTUN_CRYPTO_VER_1 +#define UTUN_CRYPTO_IPSEC_VER_1 UTUN_CRYPTO_VER_1 + +#define UTUN_CRYPTO_ARGS_VER_MAX UTUN_CRYPTO_VER_MAX +#define UTUN_CRYPTO_KEYS_ARGS_VER_MAX UTUN_CRYPTO_VER_MAX + +typedef enum utun_crypto_dir { + UTUN_CRYPTO_DIR_IN = 1, + UTUN_CRYPTO_DIR_OUT, + UTUN_CRYPTO_DIR_MAX, +} utun_crypto_dir_t; + +#define UTUN_CRYPTO_CTX_NUM_DIRS 2 + +#define BITSTOBYTES(n) (n >> 3) +#define BYTESTOBITS(n) (n << 3) + +#define MAX_KEY_AUTH_LEN_BITS 512 // corresponds to SHA512 +#define MAX_KEY_AUTH_LEN_BYTES (BITSTOBYTES(MAX_KEY_AUTH_LEN_BITS)) +#define MAX_KEY_ENC_LEN_BITS 256 // corresponds to AES256 +#define MAX_KEY_ENC_LEN_BYTES (BITSTOBYTES(MAX_KEY_ENC_LEN_BITS)) + +typedef enum utun_crypto_type { + UTUN_CRYPTO_TYPE_IPSEC = 1, + UTUN_CRYPTO_TYPE_DTLS, + UTUN_CRYPTO_TYPE_MAX, +} utun_crypto_type_t; + +typedef enum if_utun_crypto_ipsec_mode { + IF_UTUN_CRYPTO_IPSEC_MODE_NONE = 0, + IF_UTUN_CRYPTO_IPSEC_MODE_TRANSPORT, + IF_UTUN_CRYPTO_IPSEC_MODE_TUNNEL, + IF_UTUN_CRYPTO_IPSEC_MODE_MAX, +} if_utun_crypto_ipsec_mode_t; + +typedef enum if_utun_crypto_ipsec_proto { + IF_UTUN_CRYPTO_IPSEC_PROTO_NONE = 0, + IF_UTUN_CRYPTO_IPSEC_PROTO_ESP, + IF_UTUN_CRYPTO_IPSEC_PROTO_AH, + IF_UTUN_CRYPTO_IPSEC_PROTO_MAX, +} if_utun_crypto_ipsec_proto_t; + +typedef enum if_utun_crypto_ipsec_auth { + IF_UTUN_CRYPTO_IPSEC_AUTH_NONE = 0, + IF_UTUN_CRYPTO_IPSEC_AUTH_MD5, + IF_UTUN_CRYPTO_IPSEC_AUTH_SHA1, + IF_UTUN_CRYPTO_IPSEC_AUTH_SHA256, + IF_UTUN_CRYPTO_IPSEC_AUTH_SHA384, + IF_UTUN_CRYPTO_IPSEC_AUTH_SHA512, + IF_UTUN_CRYPTO_IPSEC_AUTH_MAX, +} if_utun_crypto_ipsec_auth_t; + +typedef enum if_utun_crypto_ipsec_enc { + IF_UTUN_CRYPTO_IPSEC_ENC_NONE = 0, + IF_UTUN_CRYPTO_IPSEC_ENC_DES, + IF_UTUN_CRYPTO_IPSEC_ENC_3DES, + IF_UTUN_CRYPTO_IPSEC_ENC_AES128, + IF_UTUN_CRYPTO_IPSEC_ENC_AES256, + IF_UTUN_CRYPTO_IPSEC_ENC_MAX, +} if_utun_crypto_ipsec_enc_t; + +typedef enum if_utun_crypto_ipsec_keepalive { + 
IF_UTUN_CRYPTO_IPSEC_KEEPALIVE_NONE = 0, + IF_UTUN_CRYPTO_IPSEC_KEEPALIVE_NATT, + IF_UTUN_CRYPTO_IPSEC_KEEPALIVE_ESP, + IF_UTUN_CRYPTO_IPSEC_KEEPALIVE_MAX, +} if_utun_crypto_ipsec_keepalive_t; + +typedef enum if_utun_crypto_ipsec_natd { + IF_UTUN_CRYPTO_IPSEC_NATD_NONE = 0, + IF_UTUN_CRYPTO_IPSEC_NATD_MINE, + IF_UTUN_CRYPTO_IPSEC_NATD_PEER, + IF_UTUN_CRYPTO_IPSEC_NATD_BOTH, + IF_UTUN_CRYPTO_IPSEC_NATD_MAX, +} if_utun_crypto_ipsec_natd_t; + +// structures used for storing the App's keying index arguments +typedef struct utun_crypto_keys_idx_ipsec_args_v1 { + struct sockaddr_storage src_addr; // v4 or v6 socket address (ignore port numbers) + struct sockaddr_storage dst_addr; // v4 or v6 socket address (ignore port numbers) + if_utun_crypto_ipsec_proto_t proto; + if_utun_crypto_ipsec_mode_t mode; + u_int32_t reqid; // policy's reqid, default to 0 for now since we are avoiding policies. + u_int32_t spi; // 0 when requesting the index, otherwise it contains the resulting index + u_int32_t spirange_min; // default to 0 + u_int32_t spirange_max; // default to 0xffffffff +} __attribute__((packed)) utun_crypto_keys_idx_ipsec_args_v1_t; + +typedef struct utun_crypto_keys_idx_dtls_args_v1 { + // stub for DTLS keying index arguments + u_int32_t unused; // place holder +} __attribute__((packed)) utun_crypto_keys_idx_dtls_args_v1_t; + +// App's parent structure for sending/storing keying index arguments +typedef struct utun_crypto_keys_idx_args { + utun_crypto_ver_t ver; + utun_crypto_type_t type; + utun_crypto_dir_t dir; + u_int32_t args_ulen; + u_int32_t varargs_buflen; + union { + // don't change the order, number, or size of elements above this line (in this struct). otherwise UTUN_CRYPTO_CTX_IDX_ARGS_HDR_SIZE breaks backwards compatibility + utun_crypto_keys_idx_ipsec_args_v1_t ipsec_v1; + utun_crypto_keys_idx_dtls_args_v1_t dtls_v1; + // future (additional) versions of the arguments may be placed here + } u; + u_int8_t varargs_buf[0]; +} __attribute__((aligned(4), packed)) utun_crypto_keys_idx_args_t; + +// structures used for storing the App's keying material arguments +typedef struct utun_crypto_keys_ipsec_args_v1 { + struct sockaddr_storage src_addr; // v4 or v6 socket address (ignore port numbers) + struct sockaddr_storage dst_addr; // v4 or v6 socket address (ignore port numbers) + if_utun_crypto_ipsec_proto_t proto; + if_utun_crypto_ipsec_mode_t mode; + if_utun_crypto_ipsec_auth_t alg_auth; + if_utun_crypto_ipsec_enc_t alg_enc; + if_utun_crypto_ipsec_keepalive_t keepalive; + if_utun_crypto_ipsec_natd_t natd; + u_int8_t replay; // window size default to 4 + u_int8_t punt_rx_keepalive; + u_int16_t interval_tx_keepalive; + u_int16_t key_auth_len; // 128 or 160 or 192 or 256 or 384 or 512 + u_int16_t key_enc_len; // 64 or 128 or 192 or 256 + u_int16_t natt_port; // if non-zero flags will be set to include SADB_X_EXT_NATT + u_int16_t unused; + u_int32_t seq; // default to 0 + u_int32_t spi; + u_int32_t pid; // vpnagent's process id + u_int32_t reqid; // policy's reqid, default to 0 for now since we are avoiding policies. 
+ u_int64_t lifetime_hard; // value in seconds + u_int64_t lifetime_soft; // value in seconds + // key_auth and key_enc will actually be stored in utun_crypto_KEYS_args_t.varargs_buf +} __attribute__((packed)) utun_crypto_keys_ipsec_args_v1_t; + +typedef struct utun_crypto_ctx_dtls_mat_args_v1 { + // stub for DTLS keying material arguments + u_int32_t unused; // place holder +} __attribute__((packed)) utun_crypto_keys_dtls_args_v1_t; + +// App's parent structure for sending/storing keying material arguments +typedef struct utun_crypto_keys_args { + utun_crypto_ver_t ver; + utun_crypto_type_t type; + utun_crypto_dir_t dir; + u_int32_t args_ulen; + u_int32_t varargs_buflen; + union { + // don't change the order, number, or size of elements above this line (in this struct). otherwise UTUN_CRYPTO_KEYS_ARGS_HDR_SIZE breaks backwards compatibility + utun_crypto_keys_ipsec_args_v1_t ipsec_v1; + utun_crypto_keys_dtls_args_v1_t dtls_v1; + // future (additional) versions of the arguments may be placed here + } u; + u_int8_t varargs_buf[0]; +} __attribute__((aligned(4), packed)) utun_crypto_keys_args_t; + +// structures used for storing the App's crypto arguments +typedef struct utun_crypto_ipsec_args_v1 { + // stub for IPSec crypto context arguments + u_int32_t unused; // place holder +} __attribute__((packed)) utun_crypto_ipsec_args_v1_t; + +typedef struct utun_crypto_dtls_args_v1 { + // stub for DTLS crypto context arguments + u_int32_t unused; // place holder +} __attribute__((packed)) utun_crypto_dtls_args_v1_t; + +// App's parent structure for starting/stopping crypto +typedef struct utun_crypto_args { + utun_crypto_ver_t ver; + utun_crypto_type_t type; + u_int32_t stop_data_traffic; + u_int32_t args_ulen; + u_int32_t varargs_buflen; + union { + // don't change the order, number, or size of elements above this line (in this struct). otherwise UTUN_CRYPTO_ARGS_HDR_SIZE breaks backwards compatibility + utun_crypto_ipsec_args_v1_t ipsec_v1; + utun_crypto_dtls_args_v1_t dtls_v1; + // future (additional) versions of the arguments may be placed here + } u; + u_int8_t varargs_buf[0]; // must be at the end of this struct +} __attribute__((aligned(4), packed)) utun_crypto_args_t; + +#ifdef KERNEL_PRIVATE + +#include +#include +#include +#include +#include +#include + +struct utun_pcb; + +// structures used for storing kernel's keying material runtime state +typedef struct utun_crypto_keys_ipsec_state { + // kernel's ipsec keying material state + u_int32_t spi; + struct secashead *sah; + struct secasvar *sav; + u_int8_t proto; + u_int8_t ifamily; + u_int8_t mode; + u_int8_t unused; +} __attribute__((packed)) utun_crypto_keys_ipsec_state_t; + +typedef struct utun_crypto_keys_dtls_state { + // stub for kernel's DTLS keying material state + u_int32_t unused; // place holder +} __attribute__((packed)) utun_crypto_keys_dtls_state_t; + +// kernel's parent structure for keying material state +typedef struct utun_crypto_keys_state { + union { + utun_crypto_keys_ipsec_state_t ipsec; + utun_crypto_keys_dtls_state_t dtls; + } u; +} __attribute__((aligned(4), packed)) utun_crypto_keys_state_t; + +// kernel's parent structure for keying material +typedef struct utun_crypto_keys { + int valid; // is valid? 
+ utun_crypto_type_t type; + u_int16_t unused; + utun_crypto_keys_state_t state; // runtime state + LIST_ENTRY(utun_crypto_keys) chain; +} __attribute__((aligned(4), packed)) utun_crypto_keys_t; + +// kernel's parent structure for all crypto stuff +typedef struct utun_crypto_ctx { + int valid; + utun_crypto_type_t type; + u_int16_t unused; + LIST_HEAD(chain, utun_crypto_keys) keys_listhead; +} __attribute__((aligned(4), packed)) utun_crypto_ctx_t; + +#define UTUN_CRYPTO_KEYS_IDX_ARGS_HDR_SIZE ((size_t)(&((utun_crypto_keys_idx_args_t *)0)->u)) +#define UTUN_CRYPTO_KEYS_IDX_ARGS_VARARGS_BUF(args) ((u_int8_t *)args + UTUN_CRYPTO_KEYS_IDX_ARGS_HDR_SIZE + args->args_ulen) +#define UTUN_CRYPTO_KEYS_IDX_ARGS_TOTAL_SIZE(args) ((size_t)(UTUN_CRYPTO_KEYS_IDX_ARGS_HDR_SIZE + args->args_ulen + args->varargs_buflen)) + +#define UTUN_CRYPTO_KEYS_ARGS_HDR_SIZE ((size_t)(&((utun_crypto_keys_args_t *)0)->u)) +#define UTUN_CRYPTO_KEYS_ARGS_VARARGS_BUF(args) ((u_int8_t *)args + UTUN_CRYPTO_KEYS_ARGS_HDR_SIZE + args->args_ulen) +#define UTUN_CRYPTO_KEYS_ARGS_TOTAL_SIZE(args) ((size_t)(UTUN_CRYPTO_KEYS_ARGS_HDR_SIZE + args->args_ulen + args->varargs_buflen)) + +#define UTUN_CRYPTO_ARGS_HDR_SIZE ((size_t)(&((utun_crypto_args_t *)0)->u)) +#define UTUN_CRYPTO_ARGS_VARARGS_BUF(args) ((u_int8_t *)args + UTUN_CRYPTO_ARGS_HDR_SIZE + args->args_ulen) +#define UTUN_CRYPTO_ARGS_TOTAL_SIZE(args) ((size_t)(UTUN_CRYPTO_ARGS_HDR_SIZE + args->args_ulen + args->varargs_buflen)) + +#define UTUN_CRYPTO_DIR_TO_IDX(dir) (dir - 1) +#define UTUN_CRYPTO_IDX_TO_DIR(idx) (idx + 1) + +void +utun_cleanup_crypto(struct utun_pcb *pcb); + +errno_t +utun_ctl_enable_crypto(__unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + __unused void *unitinfo, + __unused int opt, + void *data, + size_t len); + +errno_t +utun_ctl_disable_crypto(__unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + __unused void *unitinfo, + __unused int opt, + void *data, + size_t len); + +errno_t +utun_ctl_config_crypto_keys(__unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + __unused void *unitinfo, + __unused int opt, + void *data, + size_t len); + +errno_t +utun_ctl_unconfig_crypto_keys(__unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + __unused void *unitinfo, + __unused int opt, + void *data, + size_t len); + +errno_t +utun_ctl_generate_crypto_keys_idx(__unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + __unused void *unitinfo, + __unused int opt, + void *data, + size_t *len); + +errno_t +utun_ctl_stop_crypto_data_traffic(__unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + __unused void *unitinfo, + __unused int opt, + void *data, + size_t len); + +errno_t +utun_ctl_start_crypto_data_traffic(__unused kern_ctl_ref kctlref, + __unused u_int32_t unit, + __unused void *unitinfo, + __unused int opt, + void *data, + size_t len); + +int +utun_pkt_crypto_output(struct utun_pcb *pcb, mbuf_t *m); + +#endif // KERNEL_PRIVATE + +#endif // _NET_IF_UTUN_CRYPTO_H_ diff --git a/bsd/net/if_utun_crypto_ipsec.c b/bsd/net/if_utun_crypto_ipsec.c new file mode 100644 index 000000000..0166ba13c --- /dev/null +++ b/bsd/net/if_utun_crypto_ipsec.c @@ -0,0 +1,1088 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern lck_mtx_t *sadb_mutex; +extern int esp_udp_encap_port; // udp encap listening port +extern int ipsec_policy_count; +extern int ipsec_bypass; +extern int natt_keepalive_interval; + +static int utun_punt_rx_keepalive = 0; // optional global control + +extern errno_t utun_pkt_input (struct utun_pcb *pcb, mbuf_t m); + +static u_int8_t +utun_ipsec_mode_to_sadb_mode (if_utun_crypto_ipsec_mode_t mode) +{ + switch (mode) { + case IF_UTUN_CRYPTO_IPSEC_MODE_TRANSPORT: + return IPSEC_MODE_TRANSPORT; + case IF_UTUN_CRYPTO_IPSEC_MODE_TUNNEL: + return IPSEC_MODE_TUNNEL; + default: + return 0; + } +} + +static u_int16_t +utun_ipsec_proto_to_sadb_proto (if_utun_crypto_ipsec_proto_t proto) +{ + switch (proto) { + case IF_UTUN_CRYPTO_IPSEC_PROTO_ESP: + return IPPROTO_ESP; + case IF_UTUN_CRYPTO_IPSEC_PROTO_AH: + return IPPROTO_AH; + default: + return 0; + } +} + +static u_int8_t +utun_ipsec_proto_to_sadb_satype (if_utun_crypto_ipsec_proto_t proto) +{ + switch (proto) { + case IF_UTUN_CRYPTO_IPSEC_PROTO_ESP: + return SADB_SATYPE_ESP; + case IF_UTUN_CRYPTO_IPSEC_PROTO_AH: + return SADB_SATYPE_AH; + default: + return 0; + } +} + +static u_int8_t +utun_ipsec_auth_to_sadb_aalg (if_utun_crypto_ipsec_auth_t auth) +{ + switch (auth) { + case IF_UTUN_CRYPTO_IPSEC_AUTH_MD5: + return SADB_AALG_MD5HMAC; + case IF_UTUN_CRYPTO_IPSEC_AUTH_SHA1: + return SADB_AALG_SHA1HMAC; + case IF_UTUN_CRYPTO_IPSEC_AUTH_SHA256: + return SADB_X_AALG_SHA2_256; + case IF_UTUN_CRYPTO_IPSEC_AUTH_SHA384: + return SADB_X_AALG_SHA2_384; + case IF_UTUN_CRYPTO_IPSEC_AUTH_SHA512: + return SADB_X_AALG_SHA2_512; + default: + return 0; + } +} + +static u_int8_t +utun_ipsec_enc_to_sadb_ealg (if_utun_crypto_ipsec_enc_t enc) +{ + switch (enc) { + case IF_UTUN_CRYPTO_IPSEC_ENC_DES: + return SADB_EALG_DESCBC; + case IF_UTUN_CRYPTO_IPSEC_ENC_3DES: + return SADB_EALG_3DESCBC; + case IF_UTUN_CRYPTO_IPSEC_ENC_AES128: + case IF_UTUN_CRYPTO_IPSEC_ENC_AES256: + return SADB_X_EALG_AESCBC; + default: + return 0; + } +} + +static u_int32_t +utun_ipsec_keepalive_and_nat_info_to_sadb_flags (if_utun_crypto_ipsec_keepalive_t keepalive, + int punt_rx_keepalive, + if_utun_crypto_ipsec_natd_t natd, + u_int16_t natt_port) +{ + u_int32_t flags = 0; + + if (natt_port && natt_port != 500) { + flags |= SADB_X_EXT_NATT; + + switch (keepalive) { + case IF_UTUN_CRYPTO_IPSEC_KEEPALIVE_NATT: + flags |= 
SADB_X_EXT_NATT_KEEPALIVE; // normal keepalive packet + break; + case IF_UTUN_CRYPTO_IPSEC_KEEPALIVE_ESP: + flags |= (SADB_X_EXT_ESP_KEEPALIVE | SADB_X_EXT_PUNT_RX_KEEPALIVE); // use an EMPTY ESP as a keepalive + break; + default: + break; + } + + switch (natd) { + case IF_UTUN_CRYPTO_IPSEC_NATD_PEER: + flags |= SADB_X_EXT_NATT_DETECTED_PEER; + break; + default: + break; + } + } + + if (punt_rx_keepalive) { + flags |= SADB_X_EXT_PUNT_RX_KEEPALIVE; + } + + return flags; +} + +static errno_t +utun_ipsec_set_sah (struct secashead **sah, + u_int8_t dir, + u_int16_t proto, + u_int8_t mode, + u_int32_t reqid, + struct sockaddr_storage *src_addr, + struct sockaddr_storage *dst_addr) +{ + struct secasindex saidx; + + // currently only support tunnel mode and ESP + if (proto != IPPROTO_ESP || + mode != IPSEC_MODE_TUNNEL) { + return EINVAL; + } + if ((((struct sockaddr *)src_addr)->sa_family != AF_INET && + ((struct sockaddr *)src_addr)->sa_family != AF_INET6) || + (((struct sockaddr *)dst_addr)->sa_family != AF_INET && + ((struct sockaddr *)dst_addr)->sa_family != AF_INET6)) { + return EINVAL; + } + + bzero(&saidx, sizeof(saidx)); + saidx.proto = proto; + saidx.mode = mode; + saidx.reqid = reqid; + bcopy(src_addr, &saidx.src, sizeof(saidx.src)); + bcopy(dst_addr, &saidx.dst, sizeof(saidx.dst)); + + lck_mtx_lock(sadb_mutex); + // TODO: add sah and policy (collision) check and prevention. ensure that there is no conflicting policy. + // TDDO: ensure that key_spdaddxxx doesn't add a policy that's conflicting with any of our sahs. + *sah = key_newsah2(&saidx, dir); + lck_mtx_unlock(sadb_mutex); + return 0; +} + +static int +utun_ipsec_clr_sahs (struct secashead **sah) +{ + struct secasvar *sav; + struct secasvar *nextsav; + u_int state; + + lck_mtx_lock(sadb_mutex); + for (state = 0; state < SADB_SASTATE_MAX; state++) { + for (sav = LIST_FIRST(&(*sah)->savtree[state]); + sav != NULL; + sav = nextsav) { + nextsav = LIST_NEXT(sav, chain); + if (sav->state == SADB_SASTATE_LARVAL || + sav->state == SADB_SASTATE_DEAD) { + continue; + } + + if (sav->utun_pcb) { + sav->utun_pcb = NULL; + sav->utun_is_keepalive_fn = NULL; + sav->utun_in_fn = NULL; + sav->refcnt--; // unlinked from pcb + } else { + printf("%s: SAV inconsistency\n", __FUNCTION__); + } + + key_sa_chgstate(sav, SADB_SASTATE_DEAD); + key_freesav(sav, KEY_SADB_LOCKED); + } + } + + // clear the rest of the SAs + key_delsah(*sah); + lck_mtx_unlock(sadb_mutex); + return 0; +} + +static void +utun_ipsec_set_udp_encap_listen_port (utun_crypto_dir_t dir, + u_int16_t natt_port) +{ + if (dir == UTUN_CRYPTO_DIR_IN) { + if (natt_port && natt_port != 500) { + esp_udp_encap_port = natt_port; + } + } +} + +static void +utun_set_lifetime (struct sadb_lifetime *lfh, + int type, + u_int64_t l_time) +{ + lfh->sadb_lifetime_len = (sizeof(*lfh) >> 3); // convert to words + lfh->sadb_lifetime_exttype = type; + lfh->sadb_lifetime_allocations = 0; + lfh->sadb_lifetime_bytes = 0; + lfh->sadb_lifetime_addtime = l_time; + lfh->sadb_lifetime_usetime = l_time; +} + +static struct sadb_key * +utun_ipsec_set_keybuf (u_int16_t type, + u_int8_t *key, + u_int16_t key_len) +{ + struct sadb_key *new; + int len = sizeof(*new) + BITSTOBYTES(key_len); + + lck_mtx_lock(sadb_mutex); + new = utun_alloc(len); + if (new == NULL) { + return NULL; + } + lck_mtx_unlock(sadb_mutex); + bzero(new, len); + new->sadb_key_len = BITSTOBYTES(key_len); + new->sadb_key_exttype = type; + new->sadb_key_bits = key_len; + bcopy(key, &new[1], new->sadb_key_len); + return new; +} + +static errno_t 
+utun_ipsec_alloc_sav (struct secashead *sah, + struct secasvar **sav, + struct utun_pcb *pcb, + u_int8_t satype, + u_int8_t alg_auth, + u_int8_t alg_enc, + u_int32_t flags, + u_int8_t replay, + u_int8_t *key_auth, + u_int16_t key_auth_len, + u_int8_t *key_enc, + u_int16_t key_enc_len, + u_int16_t natt_port, + u_int32_t seq, + u_int32_t spi, + u_int32_t pid, + u_int64_t lifetime_hard, + u_int64_t lifetime_soft) +{ + struct sadb_key *keye, *keya; + struct sadb_lifetime lfh, lfs; + + if (*sav) { + return EINVAL; + } + + bzero(&lfh, sizeof(lfh)); + utun_set_lifetime(&lfh, SADB_EXT_LIFETIME_HARD, lifetime_hard); + bzero(&lfs, sizeof(lfs)); + utun_set_lifetime(&lfs, SADB_EXT_LIFETIME_SOFT, lifetime_soft); + + if ((keya = utun_ipsec_set_keybuf(SADB_EXT_KEY_AUTH, key_auth, key_auth_len)) == NULL) { + return ENOBUFS; + } + if ((keye = utun_ipsec_set_keybuf(SADB_EXT_KEY_ENCRYPT, key_enc, key_enc_len)) == NULL) { + utun_free(keya); + return ENOBUFS; + } + + lck_mtx_lock(sadb_mutex); + if ((*sav = key_newsav2(sah, + satype, + alg_auth, + alg_enc, + flags, + replay, + keya, + key_auth_len, + keye, + key_enc_len, + natt_port, + seq, + spi, + pid, + &lfh, + &lfs)) == NULL) { + lck_mtx_unlock(sadb_mutex); + utun_free(keya); + utun_free(keye); + return ENOBUFS; + } + (*sav)->utun_pcb = (__typeof__((*sav)->utun_pcb))pcb; + (*sav)->utun_is_keepalive_fn = (__typeof__((*sav)->utun_is_keepalive_fn))utun_pkt_is_ipsec_keepalive; + (*sav)->utun_in_fn = (__typeof__((*sav)->utun_in_fn))utun_pkt_ipsec_input; + (*sav)->refcnt++; // for the pcb + lck_mtx_unlock(sadb_mutex); + utun_free(keya); + utun_free(keye); + return 0; +} + +static int +utun_ipsec_free_sav (struct secasvar **sav) +{ + lck_mtx_lock(sadb_mutex); + if ((*sav)->utun_pcb) { + (*sav)->utun_pcb = NULL; + (*sav)->utun_is_keepalive_fn = NULL; + (*sav)->utun_in_fn = NULL; + } + (*sav)->refcnt--; // unlinked from pcb + key_sa_chgstate(*sav, SADB_SASTATE_DEAD); + key_freesav(*sav, KEY_SADB_LOCKED); + lck_mtx_unlock(sadb_mutex); + *sav = NULL; + return 0; +} + +static int +utun_ipsec_num_savs (struct secashead **sah) +{ + struct secasvar *sav; + struct secasvar *nextsav; + u_int state; + int n = 0; + + lck_mtx_lock(sadb_mutex); + for (state = 0; state < SADB_SASTATE_MAX; state++) { + for (sav = LIST_FIRST(&(*sah)->savtree[state]); + sav != NULL; + sav = nextsav) { + nextsav = LIST_NEXT(sav, chain); + if (sav->state == SADB_SASTATE_LARVAL || + sav->state == SADB_SASTATE_DYING || + sav->state == SADB_SASTATE_DEAD) { + continue; + } + + if (sav->utun_pcb) { + n++; + } else { + printf("%s: SAV inconsistency\n", __FUNCTION__); + } + } + } + lck_mtx_unlock(sadb_mutex); + + return n; +} + +static errno_t +utun_ctl_config_crypto_keys_ipsec_v1 (struct utun_pcb *pcb, + utun_crypto_keys_args_t *args, + utun_crypto_keys_t *crypto_keys) +{ + utun_crypto_keys_ipsec_args_v1_t *args_ipsec_v1 = &args->u.ipsec_v1; + u_int8_t *varargs_buf = UTUN_CRYPTO_KEYS_ARGS_VARARGS_BUF(args); + errno_t err; + struct secashead *sah; + u_int16_t proto; + u_int8_t mode; + u_int8_t satype, aalg, ealg; + u_int32_t flags; + + if (args_ipsec_v1->key_auth_len > MAX_KEY_AUTH_LEN_BITS) { + printf("%s: invalid auth key len %d, max %d\n", __FUNCTION__, + args_ipsec_v1->key_auth_len, MAX_KEY_AUTH_LEN_BITS); + return EINVAL; + } + if (args_ipsec_v1->key_enc_len > MAX_KEY_ENC_LEN_BITS) { + printf("%s: invalid enc key len %d, max %d\n", __FUNCTION__, + args_ipsec_v1->key_enc_len, MAX_KEY_ENC_LEN_BITS); + return EINVAL; + } + if (args->varargs_buflen != 
(__typeof__(args->varargs_buflen))((BITSTOBYTES(args_ipsec_v1->key_auth_len) + + BITSTOBYTES(args_ipsec_v1->key_enc_len)))) { + printf("%s: len check failed (%d,%d, %d)\n", __FUNCTION__, + args->varargs_buflen, args_ipsec_v1->key_auth_len, args_ipsec_v1->key_enc_len); + return EINVAL; + } + sah = IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(crypto_keys); + if (!sah) { + // TODO: make sure we pass through this once + proto = utun_ipsec_proto_to_sadb_proto(args_ipsec_v1->proto); + mode = utun_ipsec_mode_to_sadb_mode(args_ipsec_v1->mode); + + if ((err = utun_ipsec_set_sah(&IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(crypto_keys), + UTUN_CRYPTO_DIR_TO_IPSEC_DIR(args->dir), + proto, + mode, + args_ipsec_v1->reqid, + &args_ipsec_v1->src_addr, + &args_ipsec_v1->dst_addr))) { + return err; + } + sah = IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(crypto_keys); + if (!sah) { + return EBADF; + } + } + + satype = utun_ipsec_proto_to_sadb_satype(args_ipsec_v1->proto); + aalg = utun_ipsec_auth_to_sadb_aalg(args_ipsec_v1->alg_auth); + ealg = utun_ipsec_enc_to_sadb_ealg(args_ipsec_v1->alg_enc); + flags = utun_ipsec_keepalive_and_nat_info_to_sadb_flags(args_ipsec_v1->keepalive, + args_ipsec_v1->punt_rx_keepalive, + args_ipsec_v1->natd, + args_ipsec_v1->natt_port); + + if ((err = utun_ipsec_alloc_sav(sah, + &IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAV(crypto_keys), + pcb, + satype, + aalg, + ealg, + flags, + args_ipsec_v1->replay, + varargs_buf, + args_ipsec_v1->key_auth_len, + (varargs_buf + BITSTOBYTES(args_ipsec_v1->key_auth_len)), + args_ipsec_v1->key_enc_len, + args_ipsec_v1->natt_port, + args_ipsec_v1->seq, + args_ipsec_v1->spi, + args_ipsec_v1->pid, + args_ipsec_v1->lifetime_hard, + args_ipsec_v1->lifetime_soft))) { + return err; + } + crypto_keys->state.u.ipsec.proto = sah->saidx.proto; + crypto_keys->state.u.ipsec.mode = sah->saidx.mode; + if (((struct sockaddr *)&sah->saidx.src)->sa_family == AF_INET) { + crypto_keys->state.u.ipsec.ifamily = IPPROTO_IPV4; + } else { + crypto_keys->state.u.ipsec.ifamily = IPPROTO_IPV6; + } + crypto_keys->state.u.ipsec.spi = args_ipsec_v1->spi; + utun_ipsec_set_udp_encap_listen_port(args->dir, args_ipsec_v1->natt_port); + return 0; +} + +static errno_t +utun_ctl_unconfig_crypto_keys_ipsec_v1 (utun_crypto_keys_t *crypto_keys) +{ + if (!IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(crypto_keys)) { + return EBADF; + } + if (!IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAV(crypto_keys)) { + return EBADF; + } + if (utun_ipsec_free_sav(&IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAV(crypto_keys))) { + return EADDRNOTAVAIL; + } + if (!utun_ipsec_num_savs(&IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(crypto_keys))) { + (void)utun_ipsec_clr_sahs(&IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(crypto_keys)); + + // release sah + IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(crypto_keys) = NULL; + } + + return 0; +} + +static void +utun_set_spirange (struct sadb_spirange *spirange, + u_int32_t spirange_min, + u_int32_t spirange_max) +{ + spirange->sadb_spirange_min = spirange_min; + spirange->sadb_spirange_max = spirange_max; +} + +static u_int32_t +utun_ipsec_get_spi (struct sockaddr_storage *src_addr, + struct sockaddr_storage *dst_addr, + u_int16_t proto, + u_int8_t mode, + u_int32_t reqid, + u_int32_t spirange_min, + u_int32_t spirange_max) +{ + struct sadb_spirange spirange; + utun_set_spirange(&spirange, spirange_min, spirange_max); + // TODO: should this allocate an SAH? 
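+	// key_getspi2() is expected to reserve an SPI within [spirange_min, spirange_max]
+	// for this src/dst/proto/mode tuple; 0 means failure (the caller maps it to ENOBUFS).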
+ return key_getspi2((struct sockaddr *)src_addr, + (struct sockaddr *)dst_addr, + proto, + mode, + reqid, + &spirange); +} + +static errno_t +utun_ctl_generate_crypto_keys_idx_ipsec_v1 (utun_crypto_keys_idx_args_t *args) +{ + utun_crypto_keys_idx_ipsec_args_v1_t *args_ipsec_v1 = &args->u.ipsec_v1; + u_int16_t proto; + u_int8_t mode; + + proto = utun_ipsec_proto_to_sadb_proto(args_ipsec_v1->proto); + mode = utun_ipsec_mode_to_sadb_mode(args_ipsec_v1->mode); + + args_ipsec_v1->spi = 0; + if ((args_ipsec_v1->spi = utun_ipsec_get_spi(&args_ipsec_v1->src_addr, + &args_ipsec_v1->dst_addr, + proto, + mode, + args_ipsec_v1->reqid, + args_ipsec_v1->spirange_min, + args_ipsec_v1->spirange_max)) == 0) { + return ENOBUFS; + } + return 0; +} + +void +utun_cleanup_all_crypto_ipsec (struct utun_pcb *pcb) +{ + int idx; + utun_crypto_ctx_t *crypto_ctx; + utun_crypto_keys_t *cur_crypto_keys, *nxt_crypto_keys; + + for (idx = 0; idx < UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_MAX); idx++) { + crypto_ctx = &pcb->utun_crypto_ctx[idx]; + if (!crypto_ctx->valid || + crypto_ctx->type != UTUN_CRYPTO_TYPE_IPSEC) { + continue; + } + + // flush all crypto materials + for (cur_crypto_keys = (__typeof__(cur_crypto_keys))LIST_FIRST(&crypto_ctx->keys_listhead); + cur_crypto_keys != NULL; + cur_crypto_keys = nxt_crypto_keys) { + nxt_crypto_keys = (__typeof__(nxt_crypto_keys))LIST_NEXT(cur_crypto_keys, chain); + + if (!cur_crypto_keys->valid) { + continue; + } + + if (IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAV(cur_crypto_keys)) { + (void)utun_ipsec_free_sav(&IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAV(cur_crypto_keys)); + } + + if (IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(cur_crypto_keys)) { + (void)utun_ipsec_clr_sahs(&IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(cur_crypto_keys)); + } + + LIST_REMOVE(cur_crypto_keys, chain); + bzero(cur_crypto_keys, sizeof(*cur_crypto_keys)); + utun_free(cur_crypto_keys); + } + + bzero(crypto_ctx, sizeof(*crypto_ctx)); + } +} + +static errno_t +utun_ctl_enable_crypto_ipsec_v1 (__unused utun_crypto_args_t *args) +{ + return 0; +} + +/* + * Summary: enables ipsec crypto info for the specified utun. + */ +void +utun_ctl_enable_crypto_ipsec(__unused struct utun_pcb *pcb, + utun_crypto_args_t *args) +{ + lck_mtx_lock(sadb_mutex); + /* Turn off the ipsec bypass, if already on */ + if (ipsec_bypass) { + ipsec_bypass = 0; + } + if (args->ver == UTUN_CRYPTO_KEYS_IPSEC_VER_1) { + (void)utun_ctl_enable_crypto_ipsec_v1(args); + } + lck_mtx_unlock(sadb_mutex); +} + +/* + * Summary: disables ipsec crypto info for the specified utun. 
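+ * Tears down all of the pcb's IPSec SAs/SAHs, then re-enables the global
+ * ipsec bypass if no other IPSec policies remain in use.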
+ */ +void +utun_ctl_disable_crypto_ipsec(__unused struct utun_pcb *pcb) +{ + utun_cleanup_all_crypto_ipsec(pcb); + lck_mtx_lock(sadb_mutex); + /* Turn on the ipsec bypass, if there are no other policies */ + if (!ipsec_policy_count && !ipsec_bypass) // TODO: ipsec_policy_count may be 1 by default + ipsec_bypass = 1; + utun_punt_rx_keepalive = 0; + lck_mtx_unlock(sadb_mutex); +} + +errno_t +utun_ctl_config_crypto_keys_ipsec (struct utun_pcb *pcb, + utun_crypto_keys_args_t *args, + utun_crypto_keys_t *crypto_keys) +{ + if (args->ver == UTUN_CRYPTO_KEYS_IPSEC_VER_1) { + return(utun_ctl_config_crypto_keys_ipsec_v1(pcb, args, crypto_keys)); + } else { + printf("%s: ver unsupported (%d, %d)\n", __FUNCTION__, args->ver, UTUN_CRYPTO_KEYS_IPSEC_VER_1); + return EINVAL; + } +} + +errno_t +utun_ctl_unconfig_crypto_keys_ipsec (utun_crypto_keys_args_t *args, + utun_crypto_keys_t *crypto_keys) +{ + if (args->ver == UTUN_CRYPTO_KEYS_IPSEC_VER_1) { + return(utun_ctl_unconfig_crypto_keys_ipsec_v1(crypto_keys)); + } else { + printf("%s: ver unsupported (%d, %d)\n", __FUNCTION__, args->ver, UTUN_CRYPTO_KEYS_IPSEC_VER_1); + return EINVAL; + } +} + +errno_t +utun_ctl_generate_crypto_keys_idx_ipsec (utun_crypto_keys_idx_args_t *args) +{ + if (args->ver == UTUN_CRYPTO_KEYS_IPSEC_VER_1) { + return(utun_ctl_generate_crypto_keys_idx_ipsec_v1(args)); + } else { + printf("%s: ver unsupported (%d, %d)\n", __FUNCTION__, args->ver, UTUN_CRYPTO_KEYS_IPSEC_VER_1); + return EINVAL; + } +} + +int +utun_pkt_ipsec_output (struct utun_pcb *pcb, mbuf_t *pkt) +{ + utun_crypto_keys_t *crypto_keys = IF_UTUN_GET_TX_CRYPTO_KEYS(pcb); + struct secasvar *sav; + protocol_family_t proto; + mbuf_t new; + int err; + struct route *ro = NULL; + struct route ro_copy; + struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF }; + + if (crypto_keys && + crypto_keys->state.u.ipsec.proto == IPPROTO_ESP && + (sav = IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAV(crypto_keys)) && + sav->state == SADB_SASTATE_MATURE) { + // TODO: update stats to increment outgoing packets + // TODO: allow empty packets thru + + proto = ntohl(*(mtod(*pkt, protocol_family_t *))); + m_adj(*pkt, sizeof(protocol_family_t)); + + bzero(&ro_copy, sizeof(ro_copy)); + + if ((proto == AF_UTUN || proto == AF_INET) && crypto_keys->state.u.ipsec.ifamily == IPPROTO_IPV4) { + struct ip *ip; + struct sockaddr_in *dst4; + + if (proto == AF_INET) { + if ((*pkt)->m_len < (__typeof__((*pkt)->m_len))sizeof(*ip)) { + if (!(*pkt = m_pullup(*pkt, sizeof(*ip)))) { + printf("%s: m_pullup failed\n", __FUNCTION__); + return 0; + } + } + + // split the mbuf chain to put the ip header and payloads in separate mbufs + new = ipsec4_splithdr(*pkt); + if (!new) { + printf("%s: ipsec4_splithdr(1) failed\n", __FUNCTION__); + if (ro_copy.ro_rt != NULL) { + rtfree(ro_copy.ro_rt); + } + *pkt = NULL; + return 0; + } + *pkt = new; + + // encapsulate with the outer header + if ((err = ipsec4_encapsulate(new, sav))) { + printf("%s: ipsec4_encapsulate failed (%d)\n", __FUNCTION__, err); + *pkt = NULL; + return 0; + } + + } else { + // otherwise it's AF_UTUN which will be a keepalive packet to be encapsulated, encrypted and sent + // encapsulate with the outer header + if ((err = ipsec4_encapsulate_utun_esp_keepalive(pkt, sav))) { + printf("%s: ipsec4_encapsulate failed (%d)\n", __FUNCTION__, err); + return 0; + } + new = *pkt; + } + + ip = mtod(new, __typeof__(ip)); + // grab sadb_mutex, to update sah's route cache and get a local copy of it + lck_mtx_lock(sadb_mutex); + ro = &sav->sah->sa_route; + dst4 = 
(struct sockaddr_in *)(void *)&ro->ro_dst; + if (ro->ro_rt) { + RT_LOCK(ro->ro_rt); + } + if (ro->ro_rt != NULL && + (ro->ro_rt->generation_id != route_generation || + !(ro->ro_rt->rt_flags & RTF_UP) || + dst4->sin_addr.s_addr != ip->ip_dst.s_addr)) { + RT_UNLOCK(ro->ro_rt); + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } + if (ro->ro_rt == NULL) { + dst4->sin_family = AF_INET; + dst4->sin_len = sizeof(*dst4); + dst4->sin_addr = ip->ip_dst; + rtalloc(ro); + if (ro->ro_rt) { + RT_LOCK(ro->ro_rt); + } else { + printf("%s: rtalloc(1) failed\n", __FUNCTION__); + mbuf_freem(new); + *pkt = NULL; + return 0; + } + } + if (ro->ro_rt->rt_flags & RTF_GATEWAY) { + dst4 = (struct sockaddr_in *)(void *)ro->ro_rt->rt_gateway; + } + RT_UNLOCK(ro->ro_rt); + route_copyout(&ro_copy, ro, sizeof(ro_copy)); + // release sadb_mutex, after updating sah's route cache and getting a local copy + lck_mtx_unlock(sadb_mutex); + + // split the mbuf chain to put the ip header and payloads in separate mbufs + new = ipsec4_splithdr(*pkt); + if (!new) { + printf("%s: ipsec4_splithdr(2) failed\n", __FUNCTION__); + if (ro_copy.ro_rt != NULL) { + rtfree(ro_copy.ro_rt); + } + *pkt = NULL; + return 0; + } + *pkt = new; + + if ((err = esp4_output(new, sav))) { + printf("%s: esp4_output failed (%d)\n", __FUNCTION__, err); + if (ro_copy.ro_rt != NULL) { + rtfree(ro_copy.ro_rt); + } + *pkt = NULL; + return 0; // drop + } + + ip = mtod(new, __typeof__(ip)); + ip->ip_len = ntohs(ip->ip_len); /* flip len field before calling ip_output */ + } else if ((proto == AF_UTUN || proto == AF_INET6) && crypto_keys->state.u.ipsec.ifamily == IPPROTO_IPV6) { + int plen; + struct ip6_hdr *ip6; + struct sockaddr_in6 *dst6; + + if (proto == AF_INET6) { + // split the mbuf chain to put the ip header and payloads in separate mbufs + new = ipsec6_splithdr(*pkt); + if (!new) { + printf("%s: ipsec6_splithdr(1) failed\n", __FUNCTION__); + if (ro_copy.ro_rt != NULL) { + rtfree(ro_copy.ro_rt); + } + *pkt = NULL; + return 0; + } + *pkt = new; + + // encapsulate with the outer header + if ((err = ipsec6_encapsulate(new, sav))) { + printf("%s: ipsec6_encapsulate failed (%d)\n", __FUNCTION__, err); + *pkt = NULL; + return 0; + } + + } else { + // otherwise it's AF_UTUN which will be a keepalive packet to be encapsulated, encrypted and sent + // encapsulate with the outer header + if ((err = ipsec6_encapsulate_utun_esp_keepalive(pkt, sav))) { + printf("%s: ipsec6_encapsulate failed (%d)\n", __FUNCTION__, err); + return 0; + } + new = *pkt; + } + + ip6 = mtod(new, __typeof__(ip6)); + // grab sadb_mutex, before updating sah's route cache + lck_mtx_lock(sadb_mutex); + ro = &sav->sah->sa_route; + dst6 = (struct sockaddr_in6 *)(void *)&ro->ro_dst; + if (ro->ro_rt) { + RT_LOCK(ro->ro_rt); + } + if (ro->ro_rt != NULL && + (ro->ro_rt->generation_id != route_generation || + !(ro->ro_rt->rt_flags & RTF_UP) || + !IN6_ARE_ADDR_EQUAL(&dst6->sin6_addr, &ip6->ip6_dst))) { + RT_UNLOCK(ro->ro_rt); + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } + if (ro->ro_rt == NULL) { + bzero(dst6, sizeof(*dst6)); + dst6->sin6_family = AF_INET6; + dst6->sin6_len = sizeof(*dst6); + dst6->sin6_addr = ip6->ip6_dst; + rtalloc(ro); + if (ro->ro_rt) { + RT_LOCK(ro->ro_rt); + } else { + printf("%s: rtalloc(2) failed\n", __FUNCTION__); + mbuf_freem(new); + *pkt = NULL; + return 0; + } + } + if (ro->ro_rt->rt_flags & RTF_GATEWAY) { + dst6 = (struct sockaddr_in6 *)(void *)ro->ro_rt->rt_gateway; + } + RT_UNLOCK(ro->ro_rt); + route_copyout(&ro_copy, ro, sizeof(ro_copy)); + // release sadb_mutex, after updating 
sah's route cache and getting a local copy + lck_mtx_unlock(sadb_mutex); + + // split the mbuf chain to put the ip header and payloads in separate mbufs + new = ipsec6_splithdr(*pkt); + if (!new) { + printf("%s: ipsec6_splithdr failed\n", __FUNCTION__); + if (ro_copy.ro_rt != NULL) { + rtfree(ro_copy.ro_rt); + } + *pkt = NULL; + return 0; + } + *pkt = new; + + if ((err = esp6_output(new, mtod(new, u_char *), new->m_next, sav))) { + printf("%s: esp6_output failed (%d)\n", __FUNCTION__, err); + if (ro_copy.ro_rt != NULL) { + rtfree(ro_copy.ro_rt); + } + *pkt = NULL; + return 0; // drop + } + + plen = new->m_pkthdr.len - sizeof(struct ip6_hdr); + if (plen > IPV6_MAXPACKET) { + printf("%s: esp6_output failed due to invalid len (%d)\n", __FUNCTION__, plen); + if (ro_copy.ro_rt != NULL) { + rtfree(ro_copy.ro_rt); + } + mbuf_freem(new); + *pkt = NULL; + return 0; + } + ip6 = mtod(new, __typeof__(ip6)); + ip6->ip6_plen = ntohs(ip6->ip6_plen); /* flip len field before calling ip_output */ + } else { + printf("%s: packet's proto (%d) mismatched the context's proto (%d)\n", __FUNCTION__, + proto, crypto_keys->state.u.ipsec.ifamily); + mbuf_freem(*pkt); + *pkt = NULL; + return 0; + } + + if (pcb->utun_ifp) { + ifnet_stat_increment_out(pcb->utun_ifp, 1, mbuf_pkthdr_len(new), 0); + } + + if ((err = ip_output(new, NULL, &ro_copy, + (IP_OUTARGS | IP_NOIPSEC), NULL, &ipoa))) { + printf("%s: ip_output failed (%d)\n", __FUNCTION__, err); + } + lck_mtx_lock(sadb_mutex); + route_copyin(&ro_copy, ro, sizeof(*ro)); + lck_mtx_unlock(sadb_mutex); + return 0; + } else { + printf("%s: no suitable crypto-mat\n", __FUNCTION__); + } + return -1; +} + +// returns 0 if false, 1 if true, and -1 if there was a failure +int +utun_pkt_is_ipsec_keepalive (struct utun_pcb *pcb, mbuf_t *pkt, u_int16_t nxt, u_int32_t flags, size_t offs) +{ + int result; + u_int8_t *data; + int size_diff; + + if (!pcb->utun_ctlref) { + printf("%s - utun ctlref cleared\n", __FUNCTION__); + return 0; + } + + if (!(pcb->utun_flags & UTUN_FLAGS_CRYPTO)) { + printf("%s - crypto disabled\n", __FUNCTION__); + return 0; + } + + if ((*pkt)->m_pkthdr.len < 0) { + printf("%s - invalid hdr len, len %d, offs %lu\n", __FUNCTION__, (*pkt)->m_pkthdr.len, offs); + return 0; + } + + if ((size_t)(*pkt)->m_pkthdr.len <= offs) { + printf("%s - invalid offset, len %d, offs %lu\n", __FUNCTION__, (*pkt)->m_pkthdr.len, offs); + return 0; + } + + if ((*pkt)->m_len < 0) { + printf("%s - invalid len, len %d, offs %lu\n", __FUNCTION__, (*pkt)->m_len, offs); + return 0; + } + + // pullup offs + 1 bytes + if ((size_t)(*pkt)->m_len < (offs + 1)) { + if ((*pkt = m_pullup(*pkt, (offs + 1))) == NULL) { + printf("%s: m_pullup failed\n", __FUNCTION__); + return -1; + } + } + + if (pcb->utun_ifp) { + ifnet_stat_increment_in(pcb->utun_ifp, 1, mbuf_pkthdr_len(*pkt), 0); + } + + size_diff = (*pkt)->m_pkthdr.len - offs; + data = mtod(*pkt, __typeof(data)); + data += offs; + + // ESP keepalive meets all these conditions: ESP trailer's next proto indicates IP, the decrypted packet only has one zero'd byte in it. 
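+ // (Descriptive note, not in the original patch.) Concretely, per the test below: the SA was
+ // flagged with SADB_X_EXT_ESP_KEEPALIVE, the ESP trailer's next-protocol field is IPPROTO_IPV4,
+ // and exactly one decrypted byte remains past the payload offset, with value zero.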
+ if (flags & SADB_X_EXT_ESP_KEEPALIVE && + nxt == IPPROTO_IPV4 && + size_diff == 1 && + *data == 0) { + // TODO: update stats to increment keepalives and current timestamp + if (utun_punt_rx_keepalive || + flags & SADB_X_EXT_PUNT_RX_KEEPALIVE) { + + // strip all headers + if ((size_t)(*pkt)->m_len >= (offs + size_diff)) { + ovbcopy((caddr_t)data, (data + offs), size_diff); + (*pkt)->m_data += offs; + (*pkt)->m_len -= offs; + (*pkt)->m_pkthdr.len -= offs; + } else { + struct mbuf *n; + + n = m_split(*pkt, offs, M_DONTWAIT); + if (n == NULL) { + /* *pkt is retained by m_split */ + mbuf_freem(*pkt); + *pkt = NULL; + return -1; + } + m_adj(n, offs); + mbuf_freem(*pkt); + *pkt = n; + } + + // keepalive is being punted up to the control socket, prepend with a special packet type (PF_UTUN) + if (mbuf_prepend(pkt, sizeof(protocol_family_t), MBUF_DONTWAIT) != 0) { + printf("%s - ifnet_output prepend failed\n", __FUNCTION__); + return -1; + } + if ((size_t)(*pkt)->m_len < (sizeof(protocol_family_t) + size_diff)) { + if ((*pkt = m_pullup(*pkt, (sizeof(protocol_family_t) + size_diff))) == NULL) { + printf("%s: m_pullup failed\n", __FUNCTION__); + return -1; + } + } + + // mark UTUN/Keepalive packet + *(protocol_family_t *)mbuf_data(*pkt) = htonl(PF_UTUN); + + result = ctl_enqueuembuf(pcb->utun_ctlref, pcb->utun_unit, *pkt, CTL_DATA_EOR); + if (result != 0) { + printf("%s: - ctl_enqueuembuf failed: %d\n", __FUNCTION__, result); + mbuf_freem(*pkt); + return -1; + } + *pkt = NULL; + } + return 1; + } + return 0; +} + +int +utun_pkt_ipsec_input (struct utun_pcb *pcb, mbuf_t *pkt, protocol_family_t family) +{ + if (!m_tag_locate(*pkt, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPSEC, NULL)) { + return EINVAL; + } + + if (!(pcb->utun_flags & UTUN_FLAGS_CRYPTO)) { + printf("%s - crypto disabled\n", __FUNCTION__); + return EINVAL; + } + + if (!pcb->utun_ifp) { + printf("%s - utun ifp cleared\n", __FUNCTION__); + return EINVAL; + } + + // place protocol number at the beginning of the mbuf + if (mbuf_prepend(pkt, sizeof(protocol_family_t), MBUF_DONTWAIT) != 0) { + printf("%s - ifnet_output prepend failed\n", __FUNCTION__); + return ENOBUFS; + } + *(protocol_family_t *)mbuf_data(*pkt) = htonl(family); + + (void)utun_pkt_input(pcb, *pkt); + return 0; +} diff --git a/bsd/net/if_utun_crypto_ipsec.h b/bsd/net/if_utun_crypto_ipsec.h new file mode 100644 index 000000000..7a4c5f210 --- /dev/null +++ b/bsd/net/if_utun_crypto_ipsec.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_IF_UTUN_CRYPTO_IPSEC_H_ +#define _NET_IF_UTUN_CRYPTO_IPSEC_H_ + +#ifdef KERNEL_PRIVATE + +struct utun_pcb; + +#define UTUN_CRYPTO_DIR_TO_IPSEC_DIR(dir) (dir == UTUN_CRYPTO_DIR_IN)? IPSEC_DIR_INBOUND : IPSEC_DIR_OUTBOUND +#define IF_UTUN_GET_TX_CRYPTO_KEYS(pcb) LIST_FIRST(&pcb->utun_crypto_ctx[UTUN_CRYPTO_DIR_TO_IDX(UTUN_CRYPTO_DIR_OUT)].keys_listhead) +#define IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAH(keys) keys->state.u.ipsec.sah +#define IF_UTUN_GET_CRYPTO_KEYS_IPSEC_SAV(keys) keys->state.u.ipsec.sav + +/* + * Summary: cleans up all crypto info for the specified utun. + */ +void +utun_cleanup_all_crypto_ipsec(struct utun_pcb *pcb); + +/* + * Summary: enables ipsec crypto info for the specified utun. + */ +void +utun_ctl_enable_crypto_ipsec(struct utun_pcb *pcb, utun_crypto_args_t *args); + +/* + * Summary: disables ipsec crypto info for the specified utun. + */ +void +utun_ctl_disable_crypto_ipsec(struct utun_pcb *pcb); + +/* + * Summary: configures an ipsec crypto context for the specified utun, with keying material + * (needed for traffic encrypt/decrypt). + * Args: + * pcb - the specified utun state info + * args - the ipsec crypto context keying arguments as passed down from userland. + * crypto_ctx_mat - the ipsec crypto context's keying material to be filled. + * Returns: 0 if successful, otherwise returns an appropriate errno. + */ +errno_t +utun_ctl_config_crypto_keys_ipsec(struct utun_pcb *pcb, + utun_crypto_keys_args_t *args, + utun_crypto_keys_t *crypto_ctx_mat); + +/* + * Summary: unconfigures the keying material in an ipsec crypto context for the specified utun. + * Args: + * args - the ipsec crypto context keying arguments as passed down from userland. + * crypto_ctx_mat - the ipsec crypto context's keying material to be cleared. + * Returns: 0 if successful, otherwise returns an appropriate errno. + */ +errno_t +utun_ctl_unconfig_crypto_keys_ipsec(utun_crypto_keys_args_t *args, + utun_crypto_keys_t *crypto_ctx_mat); + +/* + * Summary: generates an SPI/index to be used by keying material in an ipsec crypto context + * for the specified utun. + * Args: + * args - the ipsec crypto context key index arguments as passed down from userland. + * Returns: 0 if successful, otherwise returns an appropriate errno.
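+ * Note (editorial, hedged): the generated SPI/index is presumably handed back by
+ * userland and installed along with the keying material via
+ * utun_ctl_config_crypto_keys_ipsec() above.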
+ */ +errno_t +utun_ctl_generate_crypto_keys_idx_ipsec(utun_crypto_keys_idx_args_t *args); + +int +utun_pkt_ipsec_output(struct utun_pcb *pcb, mbuf_t *pkt); + +int +utun_pkt_is_ipsec_keepalive(struct utun_pcb *pcb, mbuf_t *pkt, u_int16_t nxt, u_int32_t flags, size_t off); + +int +utun_pkt_ipsec_input(struct utun_pcb *pcb, mbuf_t *pkt, protocol_family_t family); + +#endif // KERNEL_PRIVATE + +#endif // _NET_IF_UTUN_CRYPTO_IPSEC_H_ diff --git a/bsd/net/if_var.h b/bsd/net/if_var.h index f3e64b0e3..426a78bb5 100644 --- a/bsd/net/if_var.h +++ b/bsd/net/if_var.h @@ -91,7 +91,7 @@ #define APPLE_IF_FAM_DISC 8 #define APPLE_IF_FAM_MDECAP 9 #define APPLE_IF_FAM_GIF 10 -#define APPLE_IF_FAM_FAITH 11 +#define APPLE_IF_FAM_FAITH 11 /* deprecated */ #define APPLE_IF_FAM_STF 12 #define APPLE_IF_FAM_FIREWIRE 13 #define APPLE_IF_FAM_BOND 14 @@ -225,6 +225,10 @@ struct if_data64 { #ifdef PRIVATE struct if_traffic_class { + u_int64_t ifi_ibepackets; /* TC_BE packets received on interface */ + u_int64_t ifi_ibebytes; /* TC_BE bytes received on interface */ + u_int64_t ifi_obepackets; /* TC_BE packet sent on interface */ + u_int64_t ifi_obebytes; /* TC_BE bytes sent on interface */ u_int64_t ifi_ibkpackets; /* TC_BK packets received on interface */ u_int64_t ifi_ibkbytes; /* TC_BK bytes received on interface */ u_int64_t ifi_obkpackets; /* TC_BK packet sent on interface */ @@ -237,6 +241,77 @@ struct if_traffic_class { u_int64_t ifi_ivobytes; /* TC_VO bytes received on interface */ u_int64_t ifi_ovopackets; /* TC_VO packets sent on interface */ u_int64_t ifi_ovobytes; /* TC_VO bytes sent on interface */ + u_int64_t ifi_ipvpackets; /* TC priv packets received on interface */ + u_int64_t ifi_ipvbytes; /* TC priv bytes received on interface */ + u_int64_t ifi_opvpackets; /* TC priv packets sent on interface */ + u_int64_t ifi_opvbytes; /* TC priv bytes sent on interface */ +}; + +struct if_data_extended { + u_int64_t ifi_alignerrs; /* unaligned (32-bit) input pkts */ +}; + +struct if_packet_stats { + /* TCP */ + u_int64_t ifi_tcp_badformat; + u_int64_t ifi_tcp_unspecv6; + u_int64_t ifi_tcp_synfin; + u_int64_t ifi_tcp_badformatipsec; + u_int64_t ifi_tcp_noconnnolist; + u_int64_t ifi_tcp_noconnlist; + u_int64_t ifi_tcp_listbadsyn; + u_int64_t ifi_tcp_icmp6unreach; + u_int64_t ifi_tcp_deprecate6; + u_int64_t ifi_tcp_rstinsynrcv; + u_int64_t ifi_tcp_ooopacket; + u_int64_t ifi_tcp_dospacket; + u_int64_t ifi_tcp_cleanup; + u_int64_t ifi_tcp_synwindow; + /* UDP */ + u_int64_t ifi_udp_port_unreach; + u_int64_t ifi_udp_faithprefix; + u_int64_t ifi_udp_port0; + u_int64_t ifi_udp_badlength; + u_int64_t ifi_udp_badchksum; + u_int64_t ifi_udp_badmcast; + u_int64_t ifi_udp_cleanup; + u_int64_t ifi_udp_badipsec; +}; + +struct if_description { + u_int32_t ifd_maxlen; /* must be IF_DESCSIZE */ + u_int32_t ifd_len; /* actual ifd_desc length */ + u_int8_t *ifd_desc; /* ptr to desc buffer */ +}; + +struct if_bandwidths { + u_int64_t eff_bw; /* effective bandwidth */ + u_int64_t max_bw; /* maximum theoretical bandwidth */ +}; + +struct if_rxpoll_stats { + u_int32_t ifi_poll_off_req; /* total # of POLL_OFF reqs */ + u_int32_t ifi_poll_off_err; /* total # of POLL_OFF errors */ + u_int32_t ifi_poll_on_req; /* total # of POLL_ON reqs */ + u_int32_t ifi_poll_on_err; /* total # of POLL_ON errors */ + + u_int32_t ifi_poll_wakeups_avg; /* avg # of wakeup reqs */ + u_int32_t ifi_poll_wakeups_lowat; /* wakeups low watermark */ + u_int32_t ifi_poll_wakeups_hiwat; /* wakeups high watermark */ + + u_int64_t ifi_poll_packets; /* total # of polled 
packets */ + u_int32_t ifi_poll_packets_avg; /* average polled packets */ + u_int32_t ifi_poll_packets_min; /* smallest polled packets */ + u_int32_t ifi_poll_packets_max; /* largest polled packets */ + u_int32_t ifi_poll_packets_lowat; /* packets low watermark */ + u_int32_t ifi_poll_packets_hiwat; /* packets high watermark */ + + u_int64_t ifi_poll_bytes; /* total # of polled bytes */ + u_int32_t ifi_poll_bytes_avg; /* average polled bytes */ + u_int32_t ifi_poll_bytes_min; /* smallest polled bytes */ + u_int32_t ifi_poll_bytes_max; /* largest polled bytes */ + u_int32_t ifi_poll_bytes_lowat; /* bytes low watermark */ + u_int32_t ifi_poll_bytes_hiwat; /* bytes high watermark */ }; #endif /* PRIVATE */ @@ -253,7 +328,7 @@ struct ifqueue { int ifq_drops; }; -#ifdef XNU_KERNEL_PRIVATE +#ifdef BSD_KERNEL_PRIVATE /* * Internal storage of if_data. This is bound to change. Various places in the * stack will translate this data structure in to the externally visible @@ -289,13 +364,33 @@ struct if_data_internal { u_int64_t ifi_noproto; /* destined for unsupported protocol */ u_int32_t ifi_recvtiming; /* usec spent receiving when timing */ u_int32_t ifi_xmittiming; /* usec spent xmitting when timing */ + u_int64_t ifi_alignerrs; /* unaligned (32-bit) input pkts */ #define IF_LASTCHANGEUPTIME 1 /* lastchange: 1-uptime 0-calendar time */ struct timeval ifi_lastchange; /* time of last administrative change */ u_int32_t ifi_hwassist; /* HW offload capabilities */ u_int32_t ifi_tso_v4_mtu; /* TCP Segment Offload IPv4 maximum segment size */ u_int32_t ifi_tso_v6_mtu; /* TCP Segment Offload IPv6 maximum segment size */ }; -#endif /* XNU_KERNEL_PRIVATE */ + +/* + * Fields per interface to measure perceived bandwidth. + */ + +struct if_measured_bw { + u_int64_t bw; /* measured bandwidth in bytes per ms */ + u_int64_t bytes; /* XXX not needed */ + u_int64_t ts; /* XXX not needed */ + u_int64_t cur_seq __attribute((aligned(8))); /* current sequence for marking a packet */ + u_int64_t start_ts; /* time at which a measurement started */ + u_int64_t start_seq; /* sequence at which a measurement should start */ + u_int64_t last_seq; /* last recorded seq */ + u_int64_t last_ts; /* last recorded ts */ + u_int32_t flags __attribute__((aligned(4))); /* flags */ +#define IF_MEASURED_BW_INPROGRESS 0x1 +#define IF_MEASURED_BW_CALCULATION 0x2 +}; + +#endif /* BSD_KERNEL_PRIVATE */ #ifdef PRIVATE #define if_mtu if_data.ifi_mtu @@ -322,12 +417,13 @@ struct if_data_internal { #define if_recvquota if_data.ifi_recvquota #define if_xmitquota if_data.ifi_xmitquota #endif /* PRIVATE */ -#ifdef XNU_KERNEL_PRIVATE +#ifdef BSD_KERNEL_PRIVATE #define if_tso_v4_mtu if_data.ifi_tso_v4_mtu #define if_tso_v6_mtu if_data.ifi_tso_v6_mtu -#endif /* XNU_KERNEL_PRIVATE */ +#define if_alignerrs if_data.ifi_alignerrs +#endif /* BSD_KERNEL_PRIVATE */ -#ifdef XNU_KERNEL_PRIVATE +#ifdef BSD_KERNEL_PRIVATE /* * Forward structure declarations for function prototypes [sic]. 
*/ @@ -340,6 +436,8 @@ struct ifaddr; struct tqdummy; struct proto_hash_entry; struct dlil_threading_info; +struct tcpstat_local; +struct udpstat_local; #if PF struct pfi_kif; #endif /* PF */ @@ -352,7 +450,7 @@ LIST_HEAD(ifmultihead, ifmultiaddr); TAILQ_HEAD(tailq_head, tqdummy); TAILQ_HEAD(ifnet_filter_head, ifnet_filter); TAILQ_HEAD(ddesc_head_name, dlil_demux_desc); -#endif /* XNU_KERNEL_PRIVATE */ +#endif /* BSD_KERNEL_PRIVATE */ #ifdef PRIVATE /* @@ -386,9 +484,13 @@ TAILQ_HEAD(ddesc_head_name, dlil_demux_desc); #define IF_HWASSIST_TSO_V6 0x00400000 /* will do TCP Segment offload for IPv6, IFNET_TSO_IPV6 */ #endif /* PRIVATE */ -#ifdef XNU_KERNEL_PRIVATE +#ifdef BSD_KERNEL_PRIVATE +/* + * ifnet is private to BSD portion of kernel + */ #include #include +#include RB_HEAD(ll_reach_tree, if_llreach); /* define struct ll_reach_tree */ @@ -404,6 +506,7 @@ struct ifnet { decl_lck_rw_data(, if_lock); void *if_softc; /* pointer to driver state */ const char *if_name; /* name, e.g. ``en'' or ``lo'' */ + struct if_description if_desc; /* extended description */ TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */ TAILQ_ENTRY(ifnet) if_detaching_link; /* list of detaching ifnets */ @@ -435,6 +538,11 @@ struct ifnet { ifnet_family_t if_family; /* value assigned by Apple */ uintptr_t if_family_cookie; ifnet_output_func if_output; + ifnet_pre_enqueue_func if_pre_enqueue; + ifnet_start_func if_start; + ifnet_ctl_func if_output_ctl; + ifnet_input_poll_func if_input_poll; + ifnet_ctl_func if_input_ctl; ifnet_ioctl_func if_ioctl; ifnet_set_bpf_tap if_set_bpf_tap; ifnet_detached_func if_free; @@ -447,6 +555,18 @@ struct ifnet { struct proto_hash_entry *if_proto_hash; void *if_kpi_storage; + decl_lck_mtx_data(, if_start_lock); + u_int32_t if_start_req; + u_int32_t if_start_active; /* output is active */ + struct timespec if_start_cycle; /* restart interval */ + struct thread *if_start_thread; + + struct ifclassq if_snd; /* transmit queue */ + u_int32_t if_output_sched_model; /* tx sched model */ + + struct if_bandwidths if_output_bw; + struct if_bandwidths if_input_bw; + decl_lck_mtx_data(, if_flt_lock) u_int32_t if_flt_busy; u_int32_t if_flt_waiters; @@ -458,9 +578,14 @@ struct ifnet { decl_lck_mtx_data(, if_addrconfig_lock); /* for serializing addr config */ struct in_multi *if_allhostsinm; /* store all-hosts inm for this ifp */ - struct dlil_threading_info *if_input_thread; + decl_lck_mtx_data(, if_poll_lock); + u_int16_t if_poll_req; + u_int16_t if_poll_update; /* link update */ + u_int32_t if_poll_active; /* polling is active */ + struct timespec if_poll_cycle; /* poll interval */ + struct thread *if_poll_thread; - struct ifqueue if_snd; + struct dlil_threading_info *if_inp; struct ifprefixhead if_prefixhead; /* list of prefixes per if */ struct { @@ -476,7 +601,6 @@ struct ifnet { u_int32_t if_wake_properties; #if PF - struct thread *if_pf_curthread; struct pfi_kif *if_pf_kif; #endif /* PF */ @@ -504,6 +628,11 @@ struct ifnet { #if INET6 struct mld_ifinfo *if_mli; /* for MLDv2 */ #endif /* INET6 */ + + int if_lqm; /* link quality metric */ + struct if_measured_bw if_bw; + struct tcpstat_local *if_tcp_stat; /* TCP specific stats */ + struct udpstat_local *if_udp_stat; /* UDP specific stats */ }; /* @@ -542,7 +671,8 @@ struct if_clone { */ #define IF_QFULL(ifq) ((ifq)->ifq_len >= (ifq)->ifq_maxlen) #define IF_DROP(ifq) ((ifq)->ifq_drops++) -#define IF_ENQUEUE(ifq, m) { \ + +#define IF_ENQUEUE(ifq, m) do { \ (m)->m_nextpkt = NULL; \ if ((ifq)->ifq_tail == NULL) \ (ifq)->ifq_head = m; \ @@ 
-550,15 +680,17 @@ struct if_clone { ((struct mbuf*)(ifq)->ifq_tail)->m_nextpkt = m; \ (ifq)->ifq_tail = m; \ (ifq)->ifq_len++; \ -} -#define IF_PREPEND(ifq, m) { \ +} while (0) + +#define IF_PREPEND(ifq, m) do { \ (m)->m_nextpkt = (ifq)->ifq_head; \ if ((ifq)->ifq_tail == NULL) \ (ifq)->ifq_tail = (m); \ (ifq)->ifq_head = (m); \ (ifq)->ifq_len++; \ -} -#define IF_DEQUEUE(ifq, m) { \ +} while (0) + +#define IF_DEQUEUE(ifq, m) do { \ (m) = (ifq)->ifq_head; \ if (m != NULL) { \ if (((ifq)->ifq_head = (m)->m_nextpkt) == NULL) \ @@ -566,8 +698,9 @@ struct if_clone { (m)->m_nextpkt = NULL; \ (ifq)->ifq_len--; \ } \ -} -#define IF_REMQUEUE(ifq, m) { \ +} while (0) + +#define IF_REMQUEUE(ifq, m) do { \ struct mbuf *_p = (ifq)->ifq_head; \ struct mbuf *_n = (m)->m_nextpkt; \ if ((m) == _p) \ @@ -588,14 +721,15 @@ struct if_clone { if (_p != NULL) \ _p->m_nextpkt = _n; \ (m)->m_nextpkt = NULL; \ -} +} while (0) + #define IF_DRAIN(ifq) do { \ - struct mbuf *m; \ + struct mbuf *_m; \ for (;;) { \ - IF_DEQUEUE(ifq, m); \ - if (m == NULL) \ + IF_DEQUEUE(ifq, _m); \ + if (_m == NULL) \ break; \ - m_freem(m); \ + m_freem(_m); \ } \ } while (0) @@ -749,7 +883,8 @@ struct ifmultiaddr { __private_extern__ struct ifnethead ifnet_head; __private_extern__ struct ifnet **ifindex2ifnet; -__private_extern__ int ifqmaxlen; +__private_extern__ u_int32_t if_sndq_maxlen; +__private_extern__ u_int32_t if_rcvq_maxlen; __private_extern__ int if_index; __private_extern__ struct ifaddr **ifnet_addrs; __private_extern__ lck_attr_t *ifa_mtx_attr; @@ -757,6 +892,8 @@ __private_extern__ lck_grp_t *ifa_mtx_grp; __private_extern__ lck_grp_t *ifnet_lock_group; __private_extern__ lck_attr_t *ifnet_lock_attr; extern ifnet_t lo_ifp; +extern uint32_t if_bw_measure_size; +extern u_int32_t if_bw_smoothing_val; extern int if_addmulti(struct ifnet *, const struct sockaddr *, struct ifmultiaddr **); @@ -774,6 +911,9 @@ extern int ifioctl(struct socket *, u_long, caddr_t, struct proc *); extern int ifioctllocked(struct socket *, u_long, caddr_t, struct proc *); extern struct ifnet *ifunit(const char *); extern struct ifnet *if_withname(struct sockaddr *); +extern void if_qflush(struct ifnet *, int); +extern void if_qflush_sc(struct ifnet *, mbuf_svc_class_t, u_int32_t, + u_int32_t *, u_int32_t *, int); extern struct if_clone *if_clone_lookup(const char *, u_int32_t *); extern int if_clone_attach(struct if_clone *); @@ -801,6 +941,10 @@ __private_extern__ errno_t ifnet_set_idle_flags_locked(ifnet_t, u_int32_t, u_int32_t); __private_extern__ int ifnet_is_attached(struct ifnet *, int refio); __private_extern__ void ifnet_decr_iorefcnt(struct ifnet *); +__private_extern__ void ifnet_set_start_cycle(struct ifnet *, + struct timespec *); +__private_extern__ void ifnet_set_poll_cycle(struct ifnet *, + struct timespec *); __private_extern__ void if_attach_ifa(struct ifnet *, struct ifaddr *); __private_extern__ void if_attach_link_ifa(struct ifnet *, struct ifaddr *); @@ -812,15 +956,18 @@ __private_extern__ void dlil_if_unlock(void); __private_extern__ void dlil_if_lock_assert(void); extern struct ifaddr *ifa_ifwithaddr(const struct sockaddr *); -extern struct ifaddr *ifa_ifwithaddr_scoped(const struct sockaddr *, unsigned int); +extern struct ifaddr *ifa_ifwithaddr_scoped(const struct sockaddr *, + unsigned int); extern struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *); extern struct ifaddr *ifa_ifwithnet(const struct sockaddr *); -extern struct ifaddr *ifa_ifwithnet_scoped(const struct sockaddr *, unsigned int); +extern struct ifaddr 
*ifa_ifwithnet_scoped(const struct sockaddr *, + unsigned int); extern struct ifaddr *ifa_ifwithroute(int, const struct sockaddr *, const struct sockaddr *); -extern struct ifaddr *ifa_ifwithroute_locked(int, const struct sockaddr *, const struct sockaddr *); -extern struct ifaddr *ifa_ifwithroute_scoped_locked(int, const struct sockaddr *, - const struct sockaddr *, unsigned int); +extern struct ifaddr *ifa_ifwithroute_locked(int, const struct sockaddr *, + const struct sockaddr *); +extern struct ifaddr *ifa_ifwithroute_scoped_locked(int, + const struct sockaddr *, const struct sockaddr *, unsigned int); extern struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *); __private_extern__ struct ifaddr *ifa_ifpgetprimary(struct ifnet *, int); extern void ifa_addref(struct ifaddr *, int); @@ -836,6 +983,16 @@ __private_extern__ struct in_ifaddr *ifa_foraddr(unsigned int); __private_extern__ struct in_ifaddr *ifa_foraddr_scoped(unsigned int, unsigned int); +extern void ifnet_fclist_append(struct sfb *sp, struct sfb_fc_list *fcl); +extern struct sfb_bin_fcentry* ifnet_fce_alloc(int how); +extern void ifnet_fce_free(struct sfb_bin_fcentry *); + +struct ifreq; +extern errno_t ifnet_getset_opportunistic(struct ifnet *, u_long, + struct ifreq *, struct proc *); +extern int ifnet_get_throttle(struct ifnet *, u_int32_t *); +extern int ifnet_set_throttle(struct ifnet *, u_int32_t); + #if INET6 struct in6_addr; __private_extern__ struct in6_ifaddr *ifa_foraddr6(struct in6_addr *); @@ -849,6 +1006,12 @@ __private_extern__ void if_data_internal_to_if_data64(struct ifnet *ifp, const struct if_data_internal *if_data_int, struct if_data64 *if_data64); __private_extern__ void if_copy_traffic_class(struct ifnet *ifp, struct if_traffic_class *if_tc); +__private_extern__ void if_copy_data_extended(struct ifnet *ifp, + struct if_data_extended *if_de); +__private_extern__ void if_copy_packet_stats(struct ifnet *ifp, + struct if_packet_stats *if_ps); +__private_extern__ void if_copy_rxpoll_stats(struct ifnet *ifp, + struct if_rxpoll_stats *if_rs); __private_extern__ struct rtentry *ifnet_cached_rtlookup_inet(struct ifnet *, struct in_addr); @@ -857,5 +1020,19 @@ __private_extern__ struct rtentry *ifnet_cached_rtlookup_inet6(struct ifnet *, struct in6_addr *); #endif /* INET6 */ +__private_extern__ void if_lqm_update(struct ifnet *, int32_t); +__private_extern__ void ifnet_update_sndq(struct ifclassq *, cqev_t); +__private_extern__ void ifnet_update_rcv(struct ifnet *, cqev_t); + +__private_extern__ errno_t ifnet_set_input_bandwidths(struct ifnet *, + struct if_bandwidths *); +__private_extern__ errno_t ifnet_set_output_bandwidths(struct ifnet *, + struct if_bandwidths *, boolean_t); +__private_extern__ u_int64_t ifnet_output_linkrate(struct ifnet *); +__private_extern__ u_int64_t ifnet_input_linkrate(struct ifnet *); +#endif /* BSD_KERNEL_PRIVATE */ +#ifdef XNU_KERNEL_PRIVATE +/* for uuid.c */ +__private_extern__ int uuid_get_ethernet(u_int8_t *); #endif /* XNU_KERNEL_PRIVATE */ #endif /* !_NET_IF_VAR_H_ */ diff --git a/bsd/net/if_vlan.c b/bsd/net/if_vlan.c index cf090602d..c1a5f0a20 100644 --- a/bsd/net/if_vlan.c +++ b/bsd/net/if_vlan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2011 Apple Inc. All rights reserved. + * Copyright (c) 2003-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1090,6 +1090,8 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) int soft_vlan; u_short tag; vlan_parent_ref vlp = NULL; + int err; + struct flowadv adv = { FADV_SUCCESS }; if (m == 0) { return (0); @@ -1167,7 +1169,18 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) evl->evl_encap_proto = htons(ETHERTYPE_VLAN); evl->evl_tag = htons(tag); } - return (ifnet_output_raw(p, PF_VLAN, m)); + + err = dlil_output(p, PF_VLAN, m, NULL, NULL, 1, &adv); + + if (err == 0) { + if (adv.code == FADV_FLOW_CONTROLLED) { + err = EQFULL; + } else if (adv.code == FADV_SUSPENDED) { + err = EQSUSPENDED; + } + } + + return (err); unlock_done: vlan_unlock(); @@ -1208,7 +1221,7 @@ vlan_input(ifnet_t p, __unused protocol_family_t protocol, m_freem(m); return 0; } - evl = (struct ether_vlan_header *)frame_header; + evl = (struct ether_vlan_header *)(void *)frame_header; if (ntohs(evl->evl_proto) == ETHERTYPE_VLAN) { /* don't allow VLAN within VLAN */ m_freem(m); diff --git a/bsd/net/iptap.c b/bsd/net/iptap.c new file mode 100644 index 000000000..c665af150 --- /dev/null +++ b/bsd/net/iptap.c @@ -0,0 +1,437 @@ +/* + * Copyright (c) 1999-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define IPTAP_IF_NAME "iptap" +#define IPTAP_PRINTF printf +#define IP_TAP_NOT_USED 0 + +#define VALID_PACKET(type, label)\ + if (iptap_clients == 0) \ + goto label; \ + \ + if (type != IFT_ETHER && \ + type != IFT_CELLULAR) \ + goto label + +static void *iptap_alloc(size_t); +static void iptap_free(void *); +static errno_t iptap_register_control(void); +static inline void iptap_lock_shared(void); +static inline void iptap_lock_exclusive(void); +static inline void iptap_lock_done(void); +static void iptap_alloc_lock(void); +static void iptap_free_lock(void); + +static void iptap_enqueue_mbuf(struct ifnet *, protocol_family_t, struct mbuf *, u_int32_t, u_int32_t, u_int8_t); + +/* kernctl callbacks */ +static errno_t iptap_ctl_connect(kern_ctl_ref, struct sockaddr_ctl *, void **); +static errno_t iptap_ctl_disconnect(kern_ctl_ref, u_int32_t, void *); + +#if IP_TAP_NOT_USED + +static errno_t iptap_deregister_control(void); + +static errno_t iptap_ctl_send(kern_ctl_ref, u_int32_t, void *, mbuf_t, int); +static errno_t iptap_ctl_setopt(kern_ctl_ref, u_int32_t, void *, int, void *, size_t); +static errno_t iptap_ctl_getopt(kern_ctl_ref, u_int32_t, void *, int, void *, size_t *); + +#endif /* IP_TAP_NOT_USED */ + +decl_lck_rw_data(static, iptap_mtx); +static lck_grp_t *iptap_grp; +static kern_ctl_ref iptap_kernctl; +static unsigned int iptap_clients; +static OSMallocTag iptap_malloc_tag; + +struct iptap_client_t { + LIST_ENTRY(iptap_client_t) _cle; + u_int32_t _unit; +}; + +static LIST_HEAD(, iptap_client_t) _s_iptap_clients; + + +__private_extern__ void +iptap_init(void) { + + iptap_alloc_lock(); + + iptap_malloc_tag = OSMalloc_Tagalloc(IPTAP_CONTROL_NAME, OSMT_DEFAULT); + if (iptap_malloc_tag == NULL) { + iptap_free_lock(); + IPTAP_PRINTF("iptap_init failed: unable to allocate malloc tag.\n"); + return; + } + + if (iptap_register_control() != 0) { + iptap_free_lock(); + OSMalloc_Tagfree(iptap_malloc_tag); + IPTAP_PRINTF("iptap_init failed: iptap_register_control failure.\n"); + return; + } + + iptap_clients = 0; +} + +__private_extern__ void +iptap_ipf_input(struct ifnet *ifp, protocol_family_t proto, struct mbuf *mp, char *frame_header) +{ + VALID_PACKET(ifp->if_type, done); + + do { + char *hdr = (char *)mbuf_data(mp); + size_t start = (size_t)((char*)mbuf_datastart(mp)); + size_t o_len = mp->m_len; + + if (frame_header != NULL && (size_t)frame_header >= start && (size_t)frame_header <= (size_t)hdr) { + if (mbuf_setdata(mp, frame_header, o_len + ((size_t)hdr - (size_t)frame_header)) == 0) { + iptap_enqueue_mbuf(ifp, proto, mp, ((size_t)hdr - (size_t)frame_header), 0, IPTAP_INPUT_TAG); + mbuf_setdata(mp, hdr, o_len); + } + } else { + iptap_enqueue_mbuf(ifp, proto, mp, 0, 0, IPTAP_INPUT_TAG); + } + + } while (0); + +done: + return; +} + +__private_extern__ void +iptap_ipf_output(struct ifnet *ifp, protocol_family_t proto, struct mbuf *mp, u_int32_t pre, u_int32_t post) +{ + VALID_PACKET(ifp->if_type, done); + + iptap_enqueue_mbuf(ifp, proto, mp, pre, post, IPTAP_OUTPUT_TAG); + +done: + return; +} + +static void +iptap_enqueue_mbuf(struct ifnet *ifp, protocol_family_t proto, struct mbuf *mp, u_int32_t pre, u_int32_t post, u_int8_t io) +{ + errno_t err = 0; + struct iptap_client_t *client = NULL; + mbuf_t 
copy, itr = (mbuf_t)mp; + iptap_hdr_t header; + u_int32_t len = 0; + + memset(&header, 0x0, sizeof(header)); + header.version = IPTAP_VERSION_1; + header.type = ifp->if_type; + header.unit = ifp->if_unit; + strlcpy(header.if_name, ifp->if_name, sizeof(header.if_name)); + header.hdr_length = sizeof(header); + header.protocol_family = proto; + header.frame_pre_length = pre; + header.frame_pst_length = post; + header.io = io; + + do { + len += mbuf_len(itr); + itr = mbuf_next(itr); + } while (itr != NULL); + + iptap_lock_shared(); + + LIST_FOREACH(client, &_s_iptap_clients, _cle) { + + mbuf_dup((mbuf_t)mp, MBUF_DONTWAIT, ©); + if (copy == NULL) + continue; + + err = mbuf_prepend(©, sizeof(header), MBUF_DONTWAIT); + if (err != 0) { + if (copy != NULL) { + mbuf_freem(copy); + copy = NULL; + } + continue; + } + + HTONS(header.unit); + HTONL(header.hdr_length); + HTONL(header.protocol_family); + HTONL(header.frame_pre_length); + HTONL(header.frame_pst_length); + header.length = htonl(len); + + memcpy(mbuf_data(copy), &header, sizeof(header)); + + err = ctl_enqueuembuf(iptap_kernctl, client->_unit, copy, CTL_DATA_EOR); + if (err != 0) { + mbuf_freem(copy); + copy = NULL; + IPTAP_PRINTF("iptap_enqueue_mbuf failed: %d\n", (err)); + continue; + } + } + + iptap_lock_done(); +} + +static void* +iptap_alloc(size_t size) +{ + size_t *mem = OSMalloc(size + sizeof(size_t), iptap_malloc_tag); + + if (mem) { + *mem = size + sizeof(size_t); + mem++; + memset(mem, 0x0, size); + } + + return (void*)mem; +} + +static void +iptap_free(void *ptr) +{ + size_t *size = ptr; + size--; + OSFree(size, *size, iptap_malloc_tag); + ptr = NULL; +} + +static void +iptap_alloc_lock(void) +{ + lck_grp_attr_t *grp_attr; + lck_attr_t *attr; + + grp_attr = lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(grp_attr); + iptap_grp = lck_grp_alloc_init(IPTAP_IF_NAME, grp_attr); + lck_grp_attr_free(grp_attr); + + attr = lck_attr_alloc_init(); + lck_attr_setdefault(attr); + + lck_rw_init(&iptap_mtx, iptap_grp, attr); + lck_attr_free(attr); +} + +static void +iptap_free_lock(void) +{ + lck_rw_destroy(&iptap_mtx, iptap_grp); + lck_grp_free(iptap_grp); + iptap_grp = NULL; +} + +static inline void +iptap_lock_shared(void) +{ + lck_rw_lock_shared(&iptap_mtx); +} + +static inline void +iptap_lock_exclusive(void) +{ + lck_rw_lock_exclusive(&iptap_mtx); +} + +static inline void +iptap_lock_done(void) +{ + lck_rw_done(&iptap_mtx); +} + +static errno_t +iptap_register_control(void) +{ + errno_t err = 0; + struct kern_ctl_reg kern_ctl; + + bzero(&kern_ctl, sizeof(kern_ctl)); + strlcpy(kern_ctl.ctl_name, IPTAP_CONTROL_NAME, sizeof(kern_ctl.ctl_name)); + kern_ctl.ctl_name[sizeof(kern_ctl.ctl_name) - 1] = 0; + kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED; + kern_ctl.ctl_recvsize = IPTAP_BUFFERSZ; + kern_ctl.ctl_connect = iptap_ctl_connect; + kern_ctl.ctl_disconnect = iptap_ctl_disconnect; + kern_ctl.ctl_send = NULL; + kern_ctl.ctl_setopt = NULL; + kern_ctl.ctl_getopt = NULL; + + err = ctl_register(&kern_ctl, &iptap_kernctl); + + return (err); +} + +static errno_t +iptap_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac, void **unitinfo) +{ +#pragma unused(kctlref) +#pragma unused(unitinfo) + errno_t err = 0; + struct iptap_client_t *client = NULL; + + client = (struct iptap_client_t *)iptap_alloc(sizeof(struct iptap_client_t)); + if (client != NULL) { + iptap_lock_exclusive(); + + iptap_clients++; + client->_unit = sac->sc_unit; + LIST_INSERT_HEAD(&_s_iptap_clients, client, _cle); + + iptap_lock_done(); + } else { + err = ENOMEM; + } + + 
return (err == 0) ? (0) : (err); +} + +static errno_t +iptap_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo) +{ +#pragma unused(kctlref) +#pragma unused(unitinfo) + errno_t err = 0; + struct iptap_client_t *client = NULL; + + iptap_lock_exclusive(); + + LIST_FOREACH(client, &_s_iptap_clients, _cle) { + if (client->_unit == unit) { + iptap_clients--; + LIST_REMOVE(client, _cle); + break; + } + } + + iptap_lock_done(); + + if (client == NULL) + panic("iptap_ctl_disconnect: received a disconnect notification without a cache entry.\n"); + + /* free the client entry after dropping the lock */ + iptap_free(client); + + return (err == 0) ? (0) : (err); +} + +#if IP_TAP_NOT_USED + +__private_extern__ void +iptap_destroy(void) { + + if (iptap_clients != 0) { + IPTAP_PRINTF("iptap_destroy failed: there are still outstanding clients.\n"); + return; + } + + if (iptap_deregister_control() != 0) { + IPTAP_PRINTF("iptap_destroy failed: iptap_deregister_control failed.\n"); + } + + OSMalloc_Tagfree(iptap_malloc_tag); + + iptap_free_lock(); +} + +static errno_t +iptap_deregister_control(void) +{ + errno_t err = 0; + + if (iptap_kernctl != NULL) { + err = ctl_deregister(iptap_kernctl); + } else { + err = EINVAL; + } + + return (err); +} + +static errno_t +iptap_ctl_send(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, mbuf_t m, int flags) +{ +#pragma unused(kctlref) +#pragma unused(unit) +#pragma unused(unitinfo) +#pragma unused(m) +#pragma unused(flags) + return (KERN_SUCCESS); +} + +static errno_t +iptap_ctl_setopt(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, int opt, void *data, size_t len) +{ +#pragma unused(kctlref) +#pragma unused(unit) +#pragma unused(unitinfo) +#pragma unused(opt) +#pragma unused(data) +#pragma unused(len) + return (KERN_SUCCESS); +} + +static errno_t +iptap_ctl_getopt(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, int opt, void *data, size_t *len) +{ +#pragma unused(kctlref) +#pragma unused(unit) +#pragma unused(unitinfo) +#pragma unused(opt) +#pragma unused(data) +#pragma unused(len) + return (KERN_SUCCESS); +} + +#endif /* IP_TAP_NOT_USED */ + diff --git a/osfmk/ddb/db_expr.h b/bsd/net/iptap.h similarity index 55% rename from osfmk/ddb/db_expr.h rename to bsd/net/iptap.h index 080e1a6c6..db8b0a22f 100644 --- a/osfmk/ddb/db_expr.h +++ b/bsd/net/iptap.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1999-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at + http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,51 +22,51 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License.
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.6.1 1994/09/23 01:19:18 ezf - * change marker to not FREE - * [1994/09/22 21:09:57 ezf] - * - * Revision 1.1.2.3 1993/09/17 21:34:35 robert - * change marker to OSF_FREE_COPYRIGHT - * [1993/09/17 21:27:14 robert] - * - * Revision 1.1.2.2 1993/07/27 18:27:21 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:11:42 elliston] - * - * $EndLog$ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#ifndef IPTAP_H +#define IPTAP_H -#ifndef _DDB_DB_EXPR_H_ -#define _DDB_DB_EXPR_H_ +#include -#include -#include +#define IPTAP_CONTROL_NAME "com.apple.net.iptap_control" +#define IPTAP_BUFFERSZ (128 * 1024) +#define IPTAP_VERSION_1 0x1 -/* Prototypes for functions exported by this module. - */ +enum { + IPTAP_OUTPUT_TAG = 0x01, + IPTAP_INPUT_TAG = 0x10, + IPTAP_UNKNOWN_TAG = 0x11 +}; + +#pragma pack(push) +#pragma pack(1) + +typedef struct iptap_hdr_t { + uint32_t hdr_length; + uint8_t version; + uint32_t length; + uint8_t type; + uint16_t unit; + uint8_t io; + uint32_t protocol_family; + uint32_t frame_pre_length; + uint32_t frame_pst_length; + char if_name[IFNAMSIZ]; +} __attribute__ ((__packed__)) iptap_hdr_t; + +#pragma pack(pop) -int db_size_option( - char *modif, - boolean_t *u_option, - boolean_t *t_option); +#ifdef KERNEL_PRIVATE -int db_expression(db_expr_t *valuep); +extern void iptap_init(void); +extern void iptap_ipf_input(struct ifnet *, protocol_family_t, struct mbuf *, char *); +extern void iptap_ipf_output(struct ifnet *, protocol_family_t, struct mbuf *, u_int32_t, u_int32_t); +#if 0 +extern void iptap_destroy(void); +#endif -#endif /* !_DDB_DB_EXPR_H_ */ +#endif /* KERNEL_PRIVATE */ +#endif /* IPTAP_H */ \ No newline at end of file diff --git a/bsd/net/kpi_interface.c b/bsd/net/kpi_interface.c index 82ba11b03..e613eec82 100644 --- a/bsd/net/kpi_interface.c +++ b/bsd/net/kpi_interface.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2004-2010 Apple Inc. All rights reserved. + * Copyright (c) 2004-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -38,147 +38,282 @@ #include #include #include +#include #include #include #include #include #include #include +#include +#include #include #include #include +#include +#include +#include +#include +#include +#ifdef INET +#include +#endif +#ifdef INET6 +#include +#endif #include "net/net_str_id.h" #if IF_LASTCHANGEUPTIME -#define TOUCHLASTCHANGE(__if_lastchange) microuptime(__if_lastchange) +#define TOUCHLASTCHANGE(__if_lastchange) { \ + (__if_lastchange)->tv_sec = net_uptime(); \ + (__if_lastchange)->tv_usec = 0; \ +} #else -#define TOUCHLASTCHANGE(__if_lastchange) microtime(__if_lastchange) +#define TOUCHLASTCHANGE(__if_lastchange) microtime(__if_lastchange) #endif -static errno_t -ifnet_list_get_common(ifnet_family_t, boolean_t, ifnet_t **, u_int32_t *); +#define _cast_non_const(p) ((void *)(uintptr_t)(p)) + +static errno_t ifnet_defrouter_llreachinfo(ifnet_t, int, + struct ifnet_llreach_info *); +static void ifnet_kpi_free(ifnet_t); +static errno_t ifnet_list_get_common(ifnet_family_t, boolean_t, ifnet_t **, + u_int32_t *); +static errno_t ifnet_set_lladdr_internal(ifnet_t, const void *, size_t, + u_char, int); +static errno_t ifnet_awdl_check_eflags(ifnet_t, u_int32_t *, u_int32_t *); /* - Temporary work around until we have real reference counting - - We keep the bits about calling dlil_if_release (which should be - called recycle) transparent by calling it from our if_free function - pointer. We have to keep the client's original detach function - somewhere so we can call it. + * Temporary work around until we have real reference counting + * + * We keep the bits about calling dlil_if_release (which should be + * called recycle) transparent by calling it from our if_free function + * pointer. We have to keep the client's original detach function + * somewhere so we can call it. 
*/ static void -ifnet_kpi_free( - ifnet_t ifp) +ifnet_kpi_free(ifnet_t ifp) { - ifnet_detached_func detach_func = ifp->if_kpi_storage; - - if (detach_func) + ifnet_detached_func detach_func = ifp->if_kpi_storage; + + if (detach_func != NULL) detach_func(ifp); - - if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) { + + if (ifp->if_broadcast.length > sizeof (ifp->if_broadcast.u.buffer)) { FREE(ifp->if_broadcast.u.ptr, M_IFADDR); ifp->if_broadcast.u.ptr = NULL; } - + dlil_if_release(ifp); } -static __inline__ void* -_cast_non_const(const void * ptr) { - union { - const void* cval; - void* val; - } ret; - - ret.cval = ptr; - return (ret.val); +errno_t +ifnet_allocate(const struct ifnet_init_params *init, ifnet_t *interface) +{ + struct ifnet_init_eparams einit; + + bzero(&einit, sizeof (einit)); + + einit.ver = IFNET_INIT_CURRENT_VERSION; + einit.len = sizeof (einit); + einit.flags = IFNET_INIT_LEGACY; + einit.uniqueid = init->uniqueid; + einit.uniqueid_len = init->uniqueid_len; + einit.name = init->name; + einit.unit = init->unit; + einit.family = init->family; + einit.type = init->type; + einit.output = init->output; + einit.demux = init->demux; + einit.add_proto = init->add_proto; + einit.del_proto = init->del_proto; + einit.check_multi = init->check_multi; + einit.framer = init->framer; + einit.softc = init->softc; + einit.ioctl = init->ioctl; + einit.set_bpf_tap = init->set_bpf_tap; + einit.detach = init->detach; + einit.event = init->event; + einit.broadcast_addr = init->broadcast_addr; + einit.broadcast_len = init->broadcast_len; + + return (ifnet_allocate_extended(&einit, interface)); } errno_t -ifnet_allocate( - const struct ifnet_init_params *init, - ifnet_t *interface) +ifnet_allocate_extended(const struct ifnet_init_eparams *einit0, + ifnet_t *interface) { - int error; + struct ifnet_init_eparams einit; struct ifnet *ifp = NULL; - - if (init->family == 0) - return EINVAL; - if (init->name == NULL || - init->output == NULL) - return EINVAL; - if (strlen(init->name) >= IFNAMSIZ) - return EINVAL; - if ((init->type & 0xFFFFFF00) != 0 || init->type == 0) - return EINVAL; - - error = dlil_if_acquire(init->family, init->uniqueid, init->uniqueid_len, &ifp); - if (error == 0) - { + int error; + + einit = *einit0; + + if (einit.ver != IFNET_INIT_CURRENT_VERSION || + einit.len < sizeof (einit)) + return (EINVAL); + + if (einit.family == 0 || einit.name == NULL || + strlen(einit.name) >= IFNAMSIZ || + (einit.type & 0xFFFFFF00) != 0 || einit.type == 0) + return (EINVAL); + + if (einit.flags & IFNET_INIT_LEGACY) { + if (einit.output == NULL || einit.flags != IFNET_INIT_LEGACY) + return (EINVAL); + + einit.pre_enqueue = NULL; + einit.start = NULL; + einit.output_ctl = NULL; + einit.output_sched_model = IFNET_SCHED_MODEL_NORMAL; + einit.input_poll = NULL; + einit.input_ctl = NULL; + } else { + if (einit.start == NULL) + return (EINVAL); + + einit.output = NULL; + if (einit.output_sched_model >= IFNET_SCHED_MODEL_MAX) + return (EINVAL); + + if (einit.flags & IFNET_INIT_INPUT_POLL) { + if (einit.input_poll == NULL || einit.input_ctl == NULL) + return (EINVAL); + } else { + einit.input_poll = NULL; + einit.input_ctl = NULL; + } + } + + error = dlil_if_acquire(einit.family, einit.uniqueid, + einit.uniqueid_len, &ifp); + + if (error == 0) { + u_int64_t br; + /* * Cast ifp->if_name as non const. dlil_if_acquire sets it up * to point to storage of at least IFNAMSIZ bytes. It is safe * to write to this. 
*/ - strncpy(_cast_non_const(ifp->if_name), init->name, IFNAMSIZ); - ifp->if_type = init->type; - ifp->if_family = init->family; - ifp->if_unit = init->unit; - ifp->if_output = init->output; - ifp->if_demux = init->demux; - ifp->if_add_proto = init->add_proto; - ifp->if_del_proto = init->del_proto; - ifp->if_check_multi = init->check_multi; - ifp->if_framer = init->framer; - ifp->if_softc = init->softc; - ifp->if_ioctl = init->ioctl; - ifp->if_set_bpf_tap = init->set_bpf_tap; - ifp->if_free = ifnet_kpi_free; - ifp->if_event = init->event; - ifp->if_kpi_storage = init->detach; - ifp->if_eflags |= IFEF_USEKPI; - - if (init->broadcast_len && init->broadcast_addr) { - if (init->broadcast_len > sizeof(ifp->if_broadcast.u.buffer)) { - MALLOC(ifp->if_broadcast.u.ptr, u_char*, init->broadcast_len, M_IFADDR, M_NOWAIT); + strncpy(_cast_non_const(ifp->if_name), einit.name, IFNAMSIZ); + ifp->if_type = einit.type; + ifp->if_family = einit.family; + ifp->if_unit = einit.unit; + ifp->if_output = einit.output; + ifp->if_pre_enqueue = einit.pre_enqueue; + ifp->if_start = einit.start; + ifp->if_output_ctl = einit.output_ctl; + ifp->if_output_sched_model = einit.output_sched_model; + ifp->if_output_bw.eff_bw = einit.output_bw; + ifp->if_output_bw.max_bw = einit.output_bw_max; + ifp->if_input_poll = einit.input_poll; + ifp->if_input_ctl = einit.input_ctl; + ifp->if_input_bw.eff_bw = einit.input_bw; + ifp->if_input_bw.max_bw = einit.input_bw_max; + ifp->if_demux = einit.demux; + ifp->if_add_proto = einit.add_proto; + ifp->if_del_proto = einit.del_proto; + ifp->if_check_multi = einit.check_multi; + ifp->if_framer = einit.framer; + ifp->if_softc = einit.softc; + ifp->if_ioctl = einit.ioctl; + ifp->if_set_bpf_tap = einit.set_bpf_tap; + ifp->if_free = ifnet_kpi_free; + ifp->if_event = einit.event; + ifp->if_kpi_storage = einit.detach; + + if (ifp->if_output_bw.eff_bw > ifp->if_output_bw.max_bw) + ifp->if_output_bw.max_bw = ifp->if_output_bw.eff_bw; + else if (ifp->if_output_bw.eff_bw == 0) + ifp->if_output_bw.eff_bw = ifp->if_output_bw.max_bw; + + if (ifp->if_input_bw.eff_bw > ifp->if_input_bw.max_bw) + ifp->if_input_bw.max_bw = ifp->if_input_bw.eff_bw; + else if (ifp->if_input_bw.eff_bw == 0) + ifp->if_input_bw.eff_bw = ifp->if_input_bw.max_bw; + + if (ifp->if_output_bw.max_bw == 0) + ifp->if_output_bw = ifp->if_input_bw; + else if (ifp->if_input_bw.max_bw == 0) + ifp->if_input_bw = ifp->if_output_bw; + + if (ifp->if_ioctl == NULL) + ifp->if_ioctl = ifp_if_ioctl; + + /* Pin if_baudrate to 32 bits */ + br = MAX(ifp->if_output_bw.max_bw, ifp->if_input_bw.max_bw); + if (br != 0) + ifp->if_baudrate = (br > 0xFFFFFFFF) ? 
0xFFFFFFFF : br; + + if (ifp->if_start != NULL) { + ifp->if_eflags |= IFEF_TXSTART; + if (ifp->if_pre_enqueue == NULL) + ifp->if_pre_enqueue = ifnet_enqueue; + ifp->if_output = ifp->if_pre_enqueue; + } else { + ifp->if_eflags &= ~IFEF_TXSTART; + } + + if (ifp->if_input_poll != NULL) + ifp->if_eflags |= IFEF_RXPOLL; + else + ifp->if_eflags &= ~IFEF_RXPOLL; + + VERIFY(!(einit.flags & IFNET_INIT_LEGACY) || + (ifp->if_pre_enqueue == NULL && ifp->if_start == NULL && + ifp->if_output_ctl == NULL && ifp->if_input_poll == NULL && + ifp->if_input_ctl == NULL)); + VERIFY(!(einit.flags & IFNET_INIT_INPUT_POLL) || + (ifp->if_input_poll != NULL && ifp->if_input_ctl != NULL)); + + if (einit.broadcast_len && einit.broadcast_addr) { + if (einit.broadcast_len > + sizeof (ifp->if_broadcast.u.buffer)) { + MALLOC(ifp->if_broadcast.u.ptr, u_char *, + einit.broadcast_len, M_IFADDR, M_NOWAIT); if (ifp->if_broadcast.u.ptr == NULL) { error = ENOMEM; + } else { + bcopy(einit.broadcast_addr, + ifp->if_broadcast.u.ptr, + einit.broadcast_len); } - else { - bcopy(init->broadcast_addr, ifp->if_broadcast.u.ptr, init->broadcast_len); - } + } else { + bcopy(einit.broadcast_addr, + ifp->if_broadcast.u.buffer, + einit.broadcast_len); } - else { - bcopy(init->broadcast_addr, ifp->if_broadcast.u.buffer, init->broadcast_len); - } - ifp->if_broadcast.length = init->broadcast_len; - } - else { - bzero(&ifp->if_broadcast, sizeof(ifp->if_broadcast)); + ifp->if_broadcast.length = einit.broadcast_len; + } else { + bzero(&ifp->if_broadcast, sizeof (ifp->if_broadcast)); } - + + IFCQ_MAXLEN(&ifp->if_snd) = einit.sndq_maxlen; + if (error == 0) { *interface = ifp; - ifnet_reference(ifp); // temporary - this should be done in dlil_if_acquire - } - else { + // temporary - this should be done in dlil_if_acquire + ifnet_reference(ifp); + } else { dlil_if_release(ifp); - *interface = 0; + *interface = NULL; } } - + /* - Note: We should do something here to indicate that we haven't been - attached yet. By doing so, we can catch the case in ifnet_release - where the reference count reaches zero and call the recycle - function. If the interface is attached, the interface will be - recycled when the interface's if_free function is called. If the - interface is never attached, the if_free function will never be - called and the interface will never be recycled. - */ - - return error; + * Note: We should do something here to indicate that we haven't been + * attached yet. By doing so, we can catch the case in ifnet_release + * where the reference count reaches zero and call the recycle + * function. If the interface is attached, the interface will be + * recycled when the interface's if_free function is called. If the + * interface is never attached, the if_free function will never be + * called and the interface will never be recycled. + */ + + return (error); } errno_t @@ -193,53 +328,54 @@ ifnet_release(ifnet_t ifp) return (dlil_if_free(ifp)); } -errno_t -ifnet_interface_family_find(const char *module_string, ifnet_family_t *family_id) +errno_t +ifnet_interface_family_find(const char *module_string, + ifnet_family_t *family_id) { if (module_string == NULL || family_id == NULL) - return EINVAL; - return net_str_id_find_internal(module_string, family_id, NSI_IF_FAM_ID, 1); - + return (EINVAL); + + return (net_str_id_find_internal(module_string, family_id, + NSI_IF_FAM_ID, 1)); } -void* -ifnet_softc( - ifnet_t interface) +void * +ifnet_softc(ifnet_t interface) { - return interface == NULL ? NULL : interface->if_softc; + return ((interface == NULL) ? 
NULL : interface->if_softc); } -const char* -ifnet_name( - ifnet_t interface) +const char * +ifnet_name(ifnet_t interface) { - return interface == NULL ? NULL : interface->if_name; + return ((interface == NULL) ? NULL : interface->if_name); } ifnet_family_t -ifnet_family( - ifnet_t interface) +ifnet_family(ifnet_t interface) { - return interface == NULL ? 0 : interface->if_family; + return ((interface == NULL) ? 0 : interface->if_family); } u_int32_t -ifnet_unit( - ifnet_t interface) +ifnet_unit(ifnet_t interface) { - return interface == NULL ? (u_int32_t)0xffffffff : (u_int32_t)interface->if_unit; + return ((interface == NULL) ? (u_int32_t)0xffffffff : + (u_int32_t)interface->if_unit); } u_int32_t -ifnet_index( - ifnet_t interface) +ifnet_index(ifnet_t interface) { - return interface == NULL ? (u_int32_t)0xffffffff : interface->if_index; + return ((interface == NULL) ? (u_int32_t)0xffffffff : + interface->if_index); } errno_t ifnet_set_flags(ifnet_t interface, u_int16_t new_flags, u_int16_t mask) { + uint16_t old_flags; + if (interface == NULL) return (EINVAL); @@ -250,17 +386,69 @@ ifnet_set_flags(ifnet_t interface, u_int16_t new_flags, u_int16_t mask) if_updown(interface, (new_flags & IFF_UP) == IFF_UP); } + old_flags = interface->if_flags; interface->if_flags = (new_flags & mask) | (interface->if_flags & ~mask); + /* If we are modifying the multicast flag, set/unset the silent flag */ + if ((old_flags & IFF_MULTICAST) != + (interface->if_flags & IFF_MULTICAST)) { +#if INET + if (IGMP_IFINFO(interface) != NULL) + igmp_initsilent(interface, IGMP_IFINFO(interface)); +#endif /* INET */ +#if INET6 + if (MLD_IFINFO(interface) != NULL) + mld6_initsilent(interface, MLD_IFINFO(interface)); +#endif /* INET6 */ + } + ifnet_lock_done(interface); return (0); } u_int16_t -ifnet_flags( - ifnet_t interface) +ifnet_flags(ifnet_t interface) { - return interface == NULL ? 0 : interface->if_flags; + return ((interface == NULL) ? 0 : interface->if_flags); +} + +/* + * This routine ensures the following: + * + * If IFEF_AWDL is set by the caller, also set the rest of flags as + * defined in IFEF_AWDL_MASK. + * + * If IFEF_AWDL has been set on the interface and the caller attempts + * to clear one or more of the associated flags in IFEF_AWDL_MASK, + * return failure. + * + * All other flags not associated with AWDL are not affected. + * + * See for current definition of IFEF_AWDL_MASK. + */ +static errno_t +ifnet_awdl_check_eflags(ifnet_t ifp, u_int32_t *new_eflags, u_int32_t *mask) +{ + u_int32_t eflags; + + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE); + + eflags = (*new_eflags & *mask) | (ifp->if_eflags & ~(*mask)); + + if (ifp->if_eflags & IFEF_AWDL) { + if (eflags & IFEF_AWDL) { + if ((eflags & IFEF_AWDL_MASK) != IFEF_AWDL_MASK) + return (1); + } else { + *new_eflags &= ~IFEF_AWDL_MASK; + *mask |= IFEF_AWDL_MASK; + } + } else if (eflags & IFEF_AWDL) { + *new_eflags |= IFEF_AWDL_MASK; + *mask |= IFEF_AWDL_MASK; + } + + return (0); } errno_t @@ -270,17 +458,24 @@ ifnet_set_eflags(ifnet_t interface, u_int32_t new_flags, u_int32_t mask) return (EINVAL); ifnet_lock_exclusive(interface); - interface->if_eflags = (new_flags & mask) | (interface->if_eflags & ~mask); + /* + * Sanity checks for IFEF_AWDL and its related flags. 
+ */ + if (ifnet_awdl_check_eflags(interface, &new_flags, &mask) != 0) { + ifnet_lock_done(interface); + return (EINVAL); + } + interface->if_eflags = + (new_flags & mask) | (interface->if_eflags & ~mask); ifnet_lock_done(interface); return (0); } u_int32_t -ifnet_eflags( - ifnet_t interface) +ifnet_eflags(ifnet_t interface) { - return interface == NULL ? 0 : interface->if_eflags; + return ((interface == NULL) ? 0 : interface->if_eflags); } errno_t @@ -349,15 +544,76 @@ ifnet_idle_flags(ifnet_t ifp) return ((ifp == NULL) ? 0 : ifp->if_idle_flags); } -errno_t ifnet_set_capabilities_supported(ifnet_t ifp, u_int32_t new_caps, +errno_t +ifnet_set_link_quality(ifnet_t ifp, int quality) +{ + errno_t err = 0; + + if (ifp == NULL || quality < IFNET_LQM_MIN || quality > IFNET_LQM_MAX) { + err = EINVAL; + goto done; + } + + if (!ifnet_is_attached(ifp, 0)) { + err = ENXIO; + goto done; + } + + if_lqm_update(ifp, quality); + +done: + return (err); +} + +int +ifnet_link_quality(ifnet_t ifp) +{ + int lqm; + + if (ifp == NULL) + return (IFNET_LQM_THRESH_OFF); + + ifnet_lock_shared(ifp); + lqm = ifp->if_lqm; + ifnet_lock_done(ifp); + + return (lqm); +} + +static errno_t +ifnet_defrouter_llreachinfo(ifnet_t ifp, int af, + struct ifnet_llreach_info *iflri) +{ + if (ifp == NULL || iflri == NULL) + return (EINVAL); + + VERIFY(af == AF_INET || af == AF_INET6); + + return (ifnet_llreach_get_defrouter(ifp, af, iflri)); +} + +errno_t +ifnet_inet_defrouter_llreachinfo(ifnet_t ifp, struct ifnet_llreach_info *iflri) +{ + return (ifnet_defrouter_llreachinfo(ifp, AF_INET, iflri)); +} + +errno_t +ifnet_inet6_defrouter_llreachinfo(ifnet_t ifp, struct ifnet_llreach_info *iflri) +{ + return (ifnet_defrouter_llreachinfo(ifp, AF_INET6, iflri)); +} + +errno_t +ifnet_set_capabilities_supported(ifnet_t ifp, u_int32_t new_caps, u_int32_t mask) { errno_t error = 0; int tmp; if (ifp == NULL) - return EINVAL; - + return (EINVAL); + ifnet_lock_exclusive(ifp); tmp = (new_caps & mask) | (ifp->if_capabilities & ~mask); if ((tmp & ~IFCAP_VALID)) @@ -365,27 +621,29 @@ errno_t ifnet_set_capabilities_supported(ifnet_t ifp, u_int32_t new_caps, else ifp->if_capabilities = tmp; ifnet_lock_done(ifp); - - return error; + + return (error); } -u_int32_t ifnet_capabilities_supported(ifnet_t ifp) +u_int32_t +ifnet_capabilities_supported(ifnet_t ifp) { return ((ifp == NULL) ? 
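
Given ifnet_awdl_check_eflags above, callers of ifnet_set_eflags get all-or-nothing AWDL semantics. A sketch of the three cases, assuming the IFEF_AWDL and IFEF_AWDL_MASK definitions from net/if.h referenced in the comment above:

	/* Setting IFEF_AWDL implicitly sets the rest of IFEF_AWDL_MASK. */
	(void) ifnet_set_eflags(ifp, IFEF_AWDL, IFEF_AWDL);

	/* Clearing IFEF_AWDL implicitly clears the whole mask as well. */
	(void) ifnet_set_eflags(ifp, 0, IFEF_AWDL);

	/* Attempting to clear an individual IFEF_AWDL_MASK bit while
	 * IFEF_AWDL stays set is rejected with EINVAL by the check above. */
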
0 : ifp->if_capabilities); } -errno_t ifnet_set_capabilities_enabled(ifnet_t ifp, u_int32_t new_caps, +errno_t +ifnet_set_capabilities_enabled(ifnet_t ifp, u_int32_t new_caps, u_int32_t mask) { errno_t error = 0; int tmp; - struct kev_msg ev_msg; + struct kev_msg ev_msg; struct net_event_data ev_data; if (ifp == NULL) - return EINVAL; - + return (EINVAL); + ifnet_lock_exclusive(ifp); tmp = (new_caps & mask) | (ifp->if_capenable & ~mask); if ((tmp & ~IFCAP_VALID) || (tmp & ~ifp->if_capabilities)) @@ -393,55 +651,52 @@ errno_t ifnet_set_capabilities_enabled(ifnet_t ifp, u_int32_t new_caps, else ifp->if_capenable = tmp; ifnet_lock_done(ifp); - + /* Notify application of the change */ - bzero(&ev_data, sizeof(struct net_event_data)); - bzero(&ev_msg, sizeof(struct kev_msg)); - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_NETWORK_CLASS; - ev_msg.kev_subclass = KEV_DL_SUBCLASS; + bzero(&ev_data, sizeof (struct net_event_data)); + bzero(&ev_msg, sizeof (struct kev_msg)); + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_DL_SUBCLASS; - ev_msg.event_code = KEV_DL_IFCAP_CHANGED; + ev_msg.event_code = KEV_DL_IFCAP_CHANGED; strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); - ev_data.if_family = ifp->if_family; - ev_data.if_unit = (u_int32_t) ifp->if_unit; - ev_msg.dv[0].data_length = sizeof(struct net_event_data); - ev_msg.dv[0].data_ptr = &ev_data; + ev_data.if_family = ifp->if_family; + ev_data.if_unit = (u_int32_t)ifp->if_unit; + ev_msg.dv[0].data_length = sizeof (struct net_event_data); + ev_msg.dv[0].data_ptr = &ev_data; ev_msg.dv[1].data_length = 0; kev_post_msg(&ev_msg); - return error; + return (error); } -u_int32_t ifnet_capabilities_enabled(ifnet_t ifp) +u_int32_t +ifnet_capabilities_enabled(ifnet_t ifp) { return ((ifp == NULL) ? 
0 : ifp->if_capenable); - - return 0; } -static const ifnet_offload_t offload_mask = IFNET_CSUM_IP | IFNET_CSUM_TCP | - IFNET_CSUM_UDP | IFNET_CSUM_FRAGMENT | IFNET_IP_FRAGMENT | - IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | IFNET_IPV6_FRAGMENT | - IFNET_CSUM_SUM16 | IFNET_VLAN_TAGGING | IFNET_VLAN_MTU | - IFNET_MULTIPAGES | IFNET_TSO_IPV4 | IFNET_TSO_IPV6; - -static const ifnet_offload_t any_offload_csum = IFNET_CSUM_IP | IFNET_CSUM_TCP | - IFNET_CSUM_UDP | IFNET_CSUM_FRAGMENT | - IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | - IFNET_CSUM_SUM16; +static const ifnet_offload_t offload_mask = + (IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | IFNET_CSUM_FRAGMENT | + IFNET_IP_FRAGMENT | IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | + IFNET_IPV6_FRAGMENT | IFNET_CSUM_SUM16 | IFNET_VLAN_TAGGING | + IFNET_VLAN_MTU | IFNET_MULTIPAGES | IFNET_TSO_IPV4 | IFNET_TSO_IPV6); +static const ifnet_offload_t any_offload_csum = + (IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | IFNET_CSUM_FRAGMENT | + IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | IFNET_CSUM_SUM16); errno_t ifnet_set_offload(ifnet_t interface, ifnet_offload_t offload) { u_int32_t ifcaps = 0; - + if (interface == NULL) return (EINVAL); ifnet_lock_exclusive(interface); - interface->if_hwassist = (offload & offload_mask); + interface->if_hwassist = (offload & offload_mask); ifnet_lock_done(interface); if ((offload & any_offload_csum)) @@ -455,133 +710,130 @@ ifnet_set_offload(ifnet_t interface, ifnet_offload_t offload) if ((offload & IFNET_VLAN_TAGGING)) ifcaps |= IFCAP_VLAN_HWTAGGING; if (ifcaps != 0) { - (void) ifnet_set_capabilities_supported(interface, ifcaps, IFCAP_VALID); - (void) ifnet_set_capabilities_enabled(interface, ifcaps, IFCAP_VALID); + (void) ifnet_set_capabilities_supported(interface, ifcaps, + IFCAP_VALID); + (void) ifnet_set_capabilities_enabled(interface, ifcaps, + IFCAP_VALID); } return (0); } ifnet_offload_t -ifnet_offload( - ifnet_t interface) +ifnet_offload(ifnet_t interface) { - return interface == NULL ? 0 : (interface->if_hwassist & offload_mask); + return ((interface == NULL) ? 
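
A driver typically advertises its hardware assists once, and ifnet_set_offload above derives and publishes the corresponding IFCAP bits automatically. A minimal sketch using only flags that appear in offload_mask:

	ifnet_offload_t offload = IFNET_CSUM_IP | IFNET_CSUM_TCP |
	    IFNET_CSUM_UDP | IFNET_TSO_IPV4 | IFNET_VLAN_MTU;

	(void) ifnet_set_offload(ifp, offload);
	/* ifnet_offload(ifp) now reports only bits inside offload_mask,
	 * and the checksum/TSO/VLAN IFCAP capabilities were enabled. */
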
+ 0 : (interface->if_hwassist & offload_mask)); } -errno_t -ifnet_set_tso_mtu( - ifnet_t interface, - sa_family_t family, - u_int32_t mtuLen) +errno_t +ifnet_set_tso_mtu(ifnet_t interface, sa_family_t family, u_int32_t mtuLen) { errno_t error = 0; - if (interface == NULL) return EINVAL; - - if (mtuLen < interface->if_mtu) - return EINVAL; - + if (interface == NULL || mtuLen < interface->if_mtu) + return (EINVAL); switch (family) { + case AF_INET: + if (interface->if_hwassist & IFNET_TSO_IPV4) + interface->if_tso_v4_mtu = mtuLen; + else + error = EINVAL; + break; - case AF_INET: - if (interface->if_hwassist & IFNET_TSO_IPV4) - interface->if_tso_v4_mtu = mtuLen; - else - error = EINVAL; - break; - - case AF_INET6: - if (interface->if_hwassist & IFNET_TSO_IPV6) - interface->if_tso_v6_mtu = mtuLen; - else - error = EINVAL; - break; + case AF_INET6: + if (interface->if_hwassist & IFNET_TSO_IPV6) + interface->if_tso_v6_mtu = mtuLen; + else + error = EINVAL; + break; - default: - error = EPROTONOSUPPORT; + default: + error = EPROTONOSUPPORT; + break; } - return error; + return (error); } - -errno_t -ifnet_get_tso_mtu( - ifnet_t interface, - sa_family_t family, - u_int32_t *mtuLen) + +errno_t +ifnet_get_tso_mtu(ifnet_t interface, sa_family_t family, u_int32_t *mtuLen) { errno_t error = 0; - if (interface == NULL || mtuLen == NULL) return EINVAL; - + if (interface == NULL || mtuLen == NULL) + return (EINVAL); + switch (family) { + case AF_INET: + if (interface->if_hwassist & IFNET_TSO_IPV4) + *mtuLen = interface->if_tso_v4_mtu; + else + error = EINVAL; + break; - case AF_INET: - if (interface->if_hwassist & IFNET_TSO_IPV4) - *mtuLen = interface->if_tso_v4_mtu; - else - error = EINVAL; - break; + case AF_INET6: + if (interface->if_hwassist & IFNET_TSO_IPV6) + *mtuLen = interface->if_tso_v6_mtu; + else + error = EINVAL; + break; - case AF_INET6: - if (interface->if_hwassist & IFNET_TSO_IPV6) - *mtuLen = interface->if_tso_v6_mtu; - else - error = EINVAL; - break; - default: - error = EPROTONOSUPPORT; + default: + error = EPROTONOSUPPORT; + break; } - return error; + return (error); } errno_t ifnet_set_wake_flags(ifnet_t interface, u_int32_t properties, u_int32_t mask) { - struct kev_msg ev_msg; + struct kev_msg ev_msg; struct net_event_data ev_data; - bzero(&ev_data, sizeof(struct net_event_data)); - bzero(&ev_msg, sizeof(struct kev_msg)); + bzero(&ev_data, sizeof (struct net_event_data)); + bzero(&ev_msg, sizeof (struct kev_msg)); + if (interface == NULL) - return EINVAL; + return (EINVAL); /* Do not accept wacky values */ if ((properties & mask) & ~IF_WAKE_VALID_FLAGS) - return EINVAL; + return (EINVAL); ifnet_lock_exclusive(interface); - interface->if_wake_properties = (properties & mask) | (interface->if_wake_properties & ~mask); + interface->if_wake_properties = + (properties & mask) | (interface->if_wake_properties & ~mask); ifnet_lock_done(interface); (void) ifnet_touch_lastchange(interface); /* Notify application of the change */ - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_NETWORK_CLASS; - ev_msg.kev_subclass = KEV_DL_SUBCLASS; + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_DL_SUBCLASS; - ev_msg.event_code = KEV_DL_WAKEFLAGS_CHANGED; + ev_msg.event_code = KEV_DL_WAKEFLAGS_CHANGED; strlcpy(&ev_data.if_name[0], interface->if_name, IFNAMSIZ); - ev_data.if_family = interface->if_family; - ev_data.if_unit = (u_int32_t) interface->if_unit; - ev_msg.dv[0].data_length = sizeof(struct net_event_data); - 
ev_msg.dv[0].data_ptr = &ev_data; + ev_data.if_family = interface->if_family; + ev_data.if_unit = (u_int32_t)interface->if_unit; + ev_msg.dv[0].data_length = sizeof (struct net_event_data); + ev_msg.dv[0].data_ptr = &ev_data; ev_msg.dv[1].data_length = 0; kev_post_msg(&ev_msg); - return 0; + return (0); } u_int32_t ifnet_get_wake_flags(ifnet_t interface) { - return interface == NULL ? 0 : interface->if_wake_properties; + return ((interface == NULL) ? 0 : interface->if_wake_properties); } /* @@ -624,269 +876,358 @@ ifnet_get_link_mib_data(ifnet_t interface, void *mibData, u_int32_t *mibLen) } u_int32_t -ifnet_get_link_mib_data_length( - ifnet_t interface) +ifnet_get_link_mib_data_length(ifnet_t interface) { - return interface == NULL ? 0 : interface->if_linkmiblen; + return ((interface == NULL) ? 0 : interface->if_linkmiblen); } errno_t -ifnet_output( - ifnet_t interface, - protocol_family_t protocol_family, - mbuf_t m, - void *route, - const struct sockaddr *dest) +ifnet_output(ifnet_t interface, protocol_family_t protocol_family, + mbuf_t m, void *route, const struct sockaddr *dest) { if (interface == NULL || protocol_family == 0 || m == NULL) { - if (m) + if (m != NULL) mbuf_freem_list(m); - return EINVAL; + return (EINVAL); } - return dlil_output(interface, protocol_family, m, route, dest, 0); + return (dlil_output(interface, protocol_family, m, route, dest, 0, NULL)); } errno_t -ifnet_output_raw( - ifnet_t interface, - protocol_family_t protocol_family, - mbuf_t m) +ifnet_output_raw(ifnet_t interface, protocol_family_t protocol_family, mbuf_t m) { if (interface == NULL || m == NULL) { - if (m) + if (m != NULL) mbuf_freem_list(m); - return EINVAL; + return (EINVAL); } - return dlil_output(interface, protocol_family, m, NULL, NULL, 1); + return (dlil_output(interface, protocol_family, m, NULL, NULL, 1, NULL)); } errno_t -ifnet_set_mtu( - ifnet_t interface, - u_int32_t mtu) +ifnet_set_mtu(ifnet_t interface, u_int32_t mtu) { - if (interface == NULL) return EINVAL; - interface->if_data.ifi_mtu = mtu; - return 0; + if (interface == NULL) + return (EINVAL); + + interface->if_mtu = mtu; + return (0); } u_int32_t -ifnet_mtu( - ifnet_t interface) +ifnet_mtu(ifnet_t interface) { - u_int32_t retval; - retval = interface == NULL ? 0 : interface->if_data.ifi_mtu; - return retval; + return ((interface == NULL) ? 0 : interface->if_mtu); } u_char -ifnet_type( - ifnet_t interface) +ifnet_type(ifnet_t interface) { - u_char retval; - - retval = interface == NULL ? 0 : interface->if_data.ifi_type; - return retval; + return ((interface == NULL) ? 0 : interface->if_data.ifi_type); } -#if 0 errno_t -ifnet_set_typelen(ifnet_t interface, u_char typelen) +ifnet_set_addrlen(ifnet_t interface, u_char addrlen) { - ifnet_lock_exclusive(interface); - interface->if_data.ifi_typelen = typelen; - ifnet_lock_done(interface); + if (interface == NULL) + return (EINVAL); + + interface->if_data.ifi_addrlen = addrlen; return (0); } u_char -ifnet_typelen( - ifnet_t interface) +ifnet_addrlen(ifnet_t interface) { - u_char retval; - retval = interface == NULL ? 0 : interface->if_data.ifi_typelen; - return retval; + return ((interface == NULL) ? 
0 : interface->if_data.ifi_addrlen); } -#endif errno_t -ifnet_set_addrlen( - ifnet_t interface, - u_char addrlen) +ifnet_set_hdrlen(ifnet_t interface, u_char hdrlen) { - if (interface == NULL) return EINVAL; - interface->if_data.ifi_addrlen = addrlen; - return 0; + if (interface == NULL) + return (EINVAL); + + interface->if_data.ifi_hdrlen = hdrlen; + return (0); } u_char -ifnet_addrlen( - ifnet_t interface) +ifnet_hdrlen(ifnet_t interface) { - u_char retval; - retval = interface == NULL ? 0 : interface->if_data.ifi_addrlen; - return retval; + return ((interface == NULL) ? 0 : interface->if_data.ifi_hdrlen); } errno_t -ifnet_set_hdrlen( - ifnet_t interface, - u_char hdrlen) +ifnet_set_metric(ifnet_t interface, u_int32_t metric) { - if (interface == NULL) return EINVAL; - interface->if_data.ifi_hdrlen = hdrlen; - return 0; + if (interface == NULL) + return (EINVAL); + + interface->if_data.ifi_metric = metric; + return (0); } -u_char -ifnet_hdrlen( - ifnet_t interface) +u_int32_t +ifnet_metric(ifnet_t interface) { - u_char retval; - retval = interface == NULL ? 0 : interface->if_data.ifi_hdrlen; - return retval; + return ((interface == NULL) ? 0 : interface->if_data.ifi_metric); } errno_t -ifnet_set_metric( - ifnet_t interface, - u_int32_t metric) +ifnet_set_baudrate(struct ifnet *ifp, u_int64_t baudrate) { - if (interface == NULL) return EINVAL; - interface->if_data.ifi_metric = metric; - return 0; + if (ifp == NULL) + return (EINVAL); + + ifp->if_output_bw.max_bw = ifp->if_input_bw.max_bw = + ifp->if_output_bw.eff_bw = ifp->if_input_bw.eff_bw = baudrate; + + /* Pin if_baudrate to 32 bits until we can change the storage size */ + ifp->if_baudrate = (baudrate > 0xFFFFFFFF) ? 0xFFFFFFFF : baudrate; + + return (0); } -u_int32_t -ifnet_metric( - ifnet_t interface) +u_int64_t +ifnet_baudrate(struct ifnet *ifp) +{ + return ((ifp == NULL) ? 0 : ifp->if_baudrate); +} + +errno_t +ifnet_set_bandwidths(struct ifnet *ifp, struct if_bandwidths *output_bw, + struct if_bandwidths *input_bw) { - u_int32_t retval; - retval = interface == NULL ? 0 : interface->if_data.ifi_metric; - return retval; + if (ifp == NULL) + return (EINVAL); + + if (input_bw != NULL) + (void) ifnet_set_input_bandwidths(ifp, input_bw); + + if (output_bw != NULL) + (void) ifnet_set_output_bandwidths(ifp, output_bw, FALSE); + + return (0); +} + +errno_t +ifnet_set_output_bandwidths(struct ifnet *ifp, struct if_bandwidths *bw, + boolean_t locked) +{ + struct if_bandwidths old_bw; + struct ifclassq *ifq; + u_int64_t br; + + ifq = &ifp->if_snd; + if (!locked) + IFCQ_LOCK(ifq); + IFCQ_LOCK_ASSERT_HELD(ifq); + + old_bw = ifp->if_output_bw; + if (bw != NULL) { + if (bw->eff_bw != 0) + ifp->if_output_bw.eff_bw = bw->eff_bw; + if (bw->max_bw != 0) + ifp->if_output_bw.max_bw = bw->max_bw; + if (ifp->if_output_bw.eff_bw > ifp->if_output_bw.max_bw) + ifp->if_output_bw.max_bw = ifp->if_output_bw.eff_bw; + else if (ifp->if_output_bw.eff_bw == 0) + ifp->if_output_bw.eff_bw = ifp->if_output_bw.max_bw; + } + + /* Pin if_baudrate to 32 bits */ + br = MAX(ifp->if_output_bw.max_bw, ifp->if_input_bw.max_bw); + if (br != 0) + ifp->if_baudrate = (br > 0xFFFFFFFF) ? 
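
The 32-bit pinning in ifnet_set_baudrate above means if_baudrate saturates for links faster than about 4.29 Gb/s, while the if_bandwidths fields keep the full 64-bit value. A quick sketch:

	u_int64_t ten_gbit = 10ULL * 1000 * 1000 * 1000;	/* 10 Gb/s */

	(void) ifnet_set_baudrate(ifp, ten_gbit);
	/* ifnet_baudrate(ifp) now returns the pinned 0xFFFFFFFF, while
	 * if_output_bw.max_bw and if_input_bw.max_bw hold the full 10e9. */
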
0xFFFFFFFF : br; + + /* Adjust queue parameters if needed */ + if (old_bw.eff_bw != ifp->if_output_bw.eff_bw || + old_bw.max_bw != ifp->if_output_bw.max_bw) + ifnet_update_sndq(ifq, CLASSQ_EV_LINK_SPEED); + + if (!locked) + IFCQ_UNLOCK(ifq); + + return (0); } errno_t -ifnet_set_baudrate( - ifnet_t interface, - u_int64_t baudrate) +ifnet_set_input_bandwidths(struct ifnet *ifp, struct if_bandwidths *bw) { - if (interface == NULL) return EINVAL; - /* Pin baudrate to 32 bits until we can change the storage size */ - interface->if_data.ifi_baudrate = baudrate > 0xFFFFFFFF ? 0xFFFFFFFF : baudrate; - return 0; + struct if_bandwidths old_bw; + + old_bw = ifp->if_input_bw; + if (bw->eff_bw != 0) + ifp->if_input_bw.eff_bw = bw->eff_bw; + if (bw->max_bw != 0) + ifp->if_input_bw.max_bw = bw->max_bw; + if (ifp->if_input_bw.eff_bw > ifp->if_input_bw.max_bw) + ifp->if_input_bw.max_bw = ifp->if_input_bw.eff_bw; + else if (ifp->if_input_bw.eff_bw == 0) + ifp->if_input_bw.eff_bw = ifp->if_input_bw.max_bw; + + if (old_bw.eff_bw != ifp->if_input_bw.eff_bw || + old_bw.max_bw != ifp->if_input_bw.max_bw) + ifnet_update_rcv(ifp, CLASSQ_EV_LINK_SPEED); + + return (0); } u_int64_t -ifnet_baudrate( - ifnet_t interface) +ifnet_output_linkrate(struct ifnet *ifp) { - u_int64_t retval; - retval = interface == NULL ? 0 : interface->if_data.ifi_baudrate; - return retval; + struct ifclassq *ifq = &ifp->if_snd; + u_int64_t rate; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + rate = ifp->if_output_bw.eff_bw; + if (IFCQ_TBR_IS_ENABLED(ifq)) { + u_int64_t tbr_rate = ifp->if_snd.ifcq_tbr.tbr_rate_raw; + VERIFY(tbr_rate > 0); + rate = MIN(rate, ifp->if_snd.ifcq_tbr.tbr_rate_raw); + } + + return (rate); +} + +u_int64_t +ifnet_input_linkrate(struct ifnet *ifp) +{ + return (ifp->if_input_bw.eff_bw); } errno_t -ifnet_stat_increment(ifnet_t interface, - const struct ifnet_stat_increment_param *counts) +ifnet_bandwidths(struct ifnet *ifp, struct if_bandwidths *output_bw, + struct if_bandwidths *input_bw) { - if (interface == NULL) + if (ifp == NULL) return (EINVAL); - atomic_add_64(&interface->if_data.ifi_ipackets, counts->packets_in); - atomic_add_64(&interface->if_data.ifi_ibytes, counts->bytes_in); - atomic_add_64(&interface->if_data.ifi_ierrors, counts->errors_in); + if (output_bw != NULL) + *output_bw = ifp->if_output_bw; + if (input_bw != NULL) + *input_bw = ifp->if_input_bw; + + return (0); +} - atomic_add_64(&interface->if_data.ifi_opackets, counts->packets_out); - atomic_add_64(&interface->if_data.ifi_obytes, counts->bytes_out); - atomic_add_64(&interface->if_data.ifi_oerrors, counts->errors_out); +errno_t +ifnet_stat_increment(struct ifnet *ifp, + const struct ifnet_stat_increment_param *s) +{ + if (ifp == NULL) + return (EINVAL); - atomic_add_64(&interface->if_data.ifi_collisions, counts->collisions); - atomic_add_64(&interface->if_data.ifi_iqdrops, counts->dropped); + if (s->packets_in != 0) + atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in); + if (s->bytes_in != 0) + atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in); + if (s->errors_in != 0) + atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in); + + if (s->packets_out != 0) + atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out); + if (s->bytes_out != 0) + atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out); + if (s->errors_out != 0) + atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out); + + if (s->collisions != 0) + atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions); + if (s->dropped != 0) + atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped); /* 
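
Drivers that know their link characteristics can feed both directions at once through ifnet_set_bandwidths above; a zero field leaves the current value untouched, per the eff_bw/max_bw checks. A sketch with made-up 300/450 Mb/s figures:

	struct if_bandwidths obw, ibw;

	bzero(&obw, sizeof (obw));
	bzero(&ibw, sizeof (ibw));
	obw.max_bw = 450 * 1000 * 1000;	/* theoretical uplink rate */
	obw.eff_bw = 300 * 1000 * 1000;	/* measured uplink rate */
	ibw.max_bw = 450 * 1000 * 1000;
	ibw.eff_bw = 0;			/* 0: keep the current value */

	(void) ifnet_set_bandwidths(ifp, &obw, &ibw);
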
Touch the last change time. */ - TOUCHLASTCHANGE(&interface->if_lastchange); + TOUCHLASTCHANGE(&ifp->if_lastchange); return (0); } errno_t -ifnet_stat_increment_in(ifnet_t interface, u_int32_t packets_in, +ifnet_stat_increment_in(struct ifnet *ifp, u_int32_t packets_in, u_int32_t bytes_in, u_int32_t errors_in) { - if (interface == NULL) + if (ifp == NULL) return (EINVAL); - atomic_add_64(&interface->if_data.ifi_ipackets, packets_in); - atomic_add_64(&interface->if_data.ifi_ibytes, bytes_in); - atomic_add_64(&interface->if_data.ifi_ierrors, errors_in); + if (packets_in != 0) + atomic_add_64(&ifp->if_data.ifi_ipackets, packets_in); + if (bytes_in != 0) + atomic_add_64(&ifp->if_data.ifi_ibytes, bytes_in); + if (errors_in != 0) + atomic_add_64(&ifp->if_data.ifi_ierrors, errors_in); - TOUCHLASTCHANGE(&interface->if_lastchange); + TOUCHLASTCHANGE(&ifp->if_lastchange); return (0); } errno_t -ifnet_stat_increment_out(ifnet_t interface, u_int32_t packets_out, +ifnet_stat_increment_out(struct ifnet *ifp, u_int32_t packets_out, u_int32_t bytes_out, u_int32_t errors_out) { - if (interface == NULL) + if (ifp == NULL) return (EINVAL); - atomic_add_64(&interface->if_data.ifi_opackets, packets_out); - atomic_add_64(&interface->if_data.ifi_obytes, bytes_out); - atomic_add_64(&interface->if_data.ifi_oerrors, errors_out); + if (packets_out != 0) + atomic_add_64(&ifp->if_data.ifi_opackets, packets_out); + if (bytes_out != 0) + atomic_add_64(&ifp->if_data.ifi_obytes, bytes_out); + if (errors_out != 0) + atomic_add_64(&ifp->if_data.ifi_oerrors, errors_out); - TOUCHLASTCHANGE(&interface->if_lastchange); + TOUCHLASTCHANGE(&ifp->if_lastchange); return (0); } errno_t -ifnet_set_stat(ifnet_t interface, const struct ifnet_stats_param *stats) +ifnet_set_stat(struct ifnet *ifp, const struct ifnet_stats_param *s) { - if (interface == NULL) + if (ifp == NULL) return (EINVAL); - atomic_set_64(&interface->if_data.ifi_ipackets, stats->packets_in); - atomic_set_64(&interface->if_data.ifi_ibytes, stats->bytes_in); - atomic_set_64(&interface->if_data.ifi_imcasts, stats->multicasts_in); - atomic_set_64(&interface->if_data.ifi_ierrors, stats->errors_in); + atomic_set_64(&ifp->if_data.ifi_ipackets, s->packets_in); + atomic_set_64(&ifp->if_data.ifi_ibytes, s->bytes_in); + atomic_set_64(&ifp->if_data.ifi_imcasts, s->multicasts_in); + atomic_set_64(&ifp->if_data.ifi_ierrors, s->errors_in); - atomic_set_64(&interface->if_data.ifi_opackets, stats->packets_out); - atomic_set_64(&interface->if_data.ifi_obytes, stats->bytes_out); - atomic_set_64(&interface->if_data.ifi_omcasts, stats->multicasts_out); - atomic_set_64(&interface->if_data.ifi_oerrors, stats->errors_out); + atomic_set_64(&ifp->if_data.ifi_opackets, s->packets_out); + atomic_set_64(&ifp->if_data.ifi_obytes, s->bytes_out); + atomic_set_64(&ifp->if_data.ifi_omcasts, s->multicasts_out); + atomic_set_64(&ifp->if_data.ifi_oerrors, s->errors_out); - atomic_set_64(&interface->if_data.ifi_collisions, stats->collisions); - atomic_set_64(&interface->if_data.ifi_iqdrops, stats->dropped); - atomic_set_64(&interface->if_data.ifi_noproto, stats->no_protocol); + atomic_set_64(&ifp->if_data.ifi_collisions, s->collisions); + atomic_set_64(&ifp->if_data.ifi_iqdrops, s->dropped); + atomic_set_64(&ifp->if_data.ifi_noproto, s->no_protocol); /* Touch the last change time. 
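
A driver's receive completion path typically batches its counters into a single call; note that with the code above, fields left at zero now skip the atomic update entirely. A sketch with hypothetical per-interrupt totals rx_pkts/rx_bytes/rx_errs:

	struct ifnet_stat_increment_param s;

	bzero(&s, sizeof (s));
	s.packets_in = rx_pkts;		/* totals gathered from the ring */
	s.bytes_in = rx_bytes;
	s.errors_in = rx_errs;

	(void) ifnet_stat_increment(ifp, &s);
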
*/ - TOUCHLASTCHANGE(&interface->if_lastchange); + TOUCHLASTCHANGE(&ifp->if_lastchange); - return 0; + return (0); } errno_t -ifnet_stat(ifnet_t interface, struct ifnet_stats_param *stats) +ifnet_stat(struct ifnet *ifp, struct ifnet_stats_param *s) { - if (interface == NULL) + if (ifp == NULL) return (EINVAL); - atomic_get_64(stats->packets_in, &interface->if_data.ifi_ipackets); - atomic_get_64(stats->bytes_in, &interface->if_data.ifi_ibytes); - atomic_get_64(stats->multicasts_in, &interface->if_data.ifi_imcasts); - atomic_get_64(stats->errors_in, &interface->if_data.ifi_ierrors); + atomic_get_64(s->packets_in, &ifp->if_data.ifi_ipackets); + atomic_get_64(s->bytes_in, &ifp->if_data.ifi_ibytes); + atomic_get_64(s->multicasts_in, &ifp->if_data.ifi_imcasts); + atomic_get_64(s->errors_in, &ifp->if_data.ifi_ierrors); - atomic_get_64(stats->packets_out, &interface->if_data.ifi_opackets); - atomic_get_64(stats->bytes_out, &interface->if_data.ifi_obytes); - atomic_get_64(stats->multicasts_out, &interface->if_data.ifi_omcasts); - atomic_get_64(stats->errors_out, &interface->if_data.ifi_oerrors); + atomic_get_64(s->packets_out, &ifp->if_data.ifi_opackets); + atomic_get_64(s->bytes_out, &ifp->if_data.ifi_obytes); + atomic_get_64(s->multicasts_out, &ifp->if_data.ifi_omcasts); + atomic_get_64(s->errors_out, &ifp->if_data.ifi_oerrors); - atomic_get_64(stats->collisions, &interface->if_data.ifi_collisions); - atomic_get_64(stats->dropped, &interface->if_data.ifi_iqdrops); - atomic_get_64(stats->no_protocol, &interface->if_data.ifi_noproto); + atomic_get_64(s->collisions, &ifp->if_data.ifi_collisions); + atomic_get_64(s->dropped, &ifp->if_data.ifi_iqdrops); + atomic_get_64(s->no_protocol, &ifp->if_data.ifi_noproto); return (0); } @@ -1060,38 +1401,37 @@ ifnet_lladdr(ifnet_t interface) */ ifa = interface->if_lladdr; IFA_LOCK_SPIN(ifa); - lladdr = LLADDR(SDL(ifa->ifa_addr)); + lladdr = LLADDR(SDL((void *)ifa->ifa_addr)); IFA_UNLOCK(ifa); return (lladdr); } errno_t -ifnet_llbroadcast_copy_bytes( - ifnet_t interface, - void *addr, - size_t buffer_len, - size_t *out_len) +ifnet_llbroadcast_copy_bytes(ifnet_t interface, void *addr, size_t buffer_len, + size_t *out_len) { - if (interface == NULL || addr == NULL || out_len == NULL) return EINVAL; - + if (interface == NULL || addr == NULL || out_len == NULL) + return (EINVAL); + *out_len = interface->if_broadcast.length; - - if (buffer_len < interface->if_broadcast.length) { - return EMSGSIZE; - } - + + if (buffer_len < interface->if_broadcast.length) + return (EMSGSIZE); + if (interface->if_broadcast.length == 0) - return ENXIO; - - if (interface->if_broadcast.length <= sizeof(interface->if_broadcast.u.buffer)) { - bcopy(interface->if_broadcast.u.buffer, addr, interface->if_broadcast.length); - } - else { - bcopy(interface->if_broadcast.u.ptr, addr, interface->if_broadcast.length); + return (ENXIO); + + if (interface->if_broadcast.length <= + sizeof (interface->if_broadcast.u.buffer)) { + bcopy(interface->if_broadcast.u.buffer, addr, + interface->if_broadcast.length); + } else { + bcopy(interface->if_broadcast.u.ptr, addr, + interface->if_broadcast.length); } - - return 0; + + return (0); } errno_t @@ -1109,7 +1449,7 @@ ifnet_lladdr_copy_bytes(ifnet_t interface, void *lladdr, size_t lladdr_len) */ ifa = interface->if_lladdr; IFA_LOCK_SPIN(ifa); - sdl = SDL(ifa->ifa_addr); + sdl = SDL((void *)ifa->ifa_addr); if (lladdr_len != sdl->sdl_alen) { bzero(lladdr, lladdr_len); IFA_UNLOCK(ifa); @@ -1144,7 +1484,7 @@ ifnet_set_lladdr_internal(ifnet_t interface, const void 
*lladdr, struct sockaddr_dl *sdl; IFA_LOCK_SPIN(ifa); - sdl = (struct sockaddr_dl*)ifa->ifa_addr; + sdl = (struct sockaddr_dl *)(void *)ifa->ifa_addr; if (lladdr_len != 0) { bcopy(lladdr, LLADDR(sdl), lladdr_len); } else { @@ -1165,29 +1505,24 @@ ifnet_set_lladdr_internal(ifnet_t interface, const void *lladdr, /* Generate a kernel event */ if (error == 0) { dlil_post_msg(interface, KEV_DL_SUBCLASS, - KEV_DL_LINK_ADDRESS_CHANGED, NULL, 0); + KEV_DL_LINK_ADDRESS_CHANGED, NULL, 0); } return (error); } errno_t -ifnet_set_lladdr( - ifnet_t interface, - const void* lladdr, - size_t lladdr_len) +ifnet_set_lladdr(ifnet_t interface, const void* lladdr, size_t lladdr_len) { - return ifnet_set_lladdr_internal(interface, lladdr, lladdr_len, 0, 0); + return (ifnet_set_lladdr_internal(interface, lladdr, lladdr_len, 0, 0)); } errno_t -ifnet_set_lladdr_and_type( - ifnet_t interface, - const void* lladdr, - size_t lladdr_len, - u_char type) +ifnet_set_lladdr_and_type(ifnet_t interface, const void* lladdr, + size_t lladdr_len, u_char type) { - return ifnet_set_lladdr_internal(interface, lladdr, lladdr_len, type, 1); + return (ifnet_set_lladdr_internal(interface, lladdr, + lladdr_len, type, 1)); } errno_t @@ -1256,18 +1591,16 @@ ifnet_get_multicast_list(ifnet_t ifp, ifmultiaddr_t **addresses) } void -ifnet_free_multicast_list( - ifmultiaddr_t *addresses) +ifnet_free_multicast_list(ifmultiaddr_t *addresses) { int i; - - if (addresses == NULL) return; - + + if (addresses == NULL) + return; + for (i = 0; addresses[i] != NULL; i++) - { ifmaddr_release(addresses[i]); - } - + FREE(addresses, M_TEMP); } @@ -1294,10 +1627,10 @@ ifnet_find_by_name(const char *ifname, ifnet_t *ifpp) continue; IFA_LOCK(ifa); - ll_addr = (struct sockaddr_dl *)ifa->ifa_addr; + ll_addr = (struct sockaddr_dl *)(void *)ifa->ifa_addr; - if (namelen == ll_addr->sdl_nlen && - !strncmp(ll_addr->sdl_data, ifname, ll_addr->sdl_nlen)) { + if (namelen == ll_addr->sdl_nlen && strncmp(ll_addr->sdl_data, + ifname, ll_addr->sdl_nlen) == 0) { IFA_UNLOCK(ifa); *ifpp = ifp; ifnet_reference(*ifpp); @@ -1406,8 +1739,91 @@ ifnet_list_free(ifnet_t *interfaces) FREE(interfaces, M_TEMP); } +void +ifnet_transmit_burst_start(ifnet_t ifp, mbuf_t pkt) +{ + uint32_t orig_flags; + + if (ifp == NULL || !(pkt->m_flags & M_PKTHDR)) + return; + + orig_flags = OSBitOrAtomic(IF_MEASURED_BW_INPROGRESS, + &ifp->if_bw.flags); + if (orig_flags & IF_MEASURED_BW_INPROGRESS) { + /* There is already a measurement in progress; skip this one */ + return; + } + + ifp->if_bw.start_seq = pkt->m_pkthdr.pf_mtag.pftag_pktseq; + ifp->if_bw.start_ts = mach_absolute_time(); +} + +void +ifnet_transmit_burst_end(ifnet_t ifp, mbuf_t pkt) +{ + uint64_t oseq, ots, bytes, ts, t; + uint32_t flags; + + if ( ifp == NULL || !(pkt->m_flags & M_PKTHDR)) + return; + + flags = OSBitOrAtomic(IF_MEASURED_BW_CALCULATION, &ifp->if_bw.flags); + + /* If a calculation is already in progress, just return */ + if (flags & IF_MEASURED_BW_CALCULATION) + return; + + /* Check if a measurement was started at all */ + if (!(flags & IF_MEASURED_BW_INPROGRESS)) { + /* + * It is an error to call burst_end before burst_start. + * Reset the calculation flag and return. 
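
Changing the link-layer address goes through ifnet_set_lladdr_internal above, which also posts KEV_DL_LINK_ADDRESS_CHANGED on success. A minimal caller sketch with a made-up, locally administered MAC:

	static const u_char new_mac[6] =
	    { 0x02, 0x00, 0x11, 0x22, 0x33, 0x44 };

	errno_t err = ifnet_set_lladdr(ifp, new_mac, sizeof (new_mac));
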
+ */ + goto done; + } + + oseq = pkt->m_pkthdr.pf_mtag.pftag_pktseq; + ots = mach_absolute_time(); + + if (ifp->if_bw.start_seq > 0 && oseq > ifp->if_bw.start_seq) { + ts = ots - ifp->if_bw.start_ts; + if (ts > 0 ) { + absolutetime_to_nanoseconds(ts, &t); + bytes = oseq - ifp->if_bw.start_seq; + ifp->if_bw.bytes = bytes; + ifp->if_bw.ts = ts; + + if (t > 0) { + uint64_t bw = 0; + + /* Compute bandwidth as bytes/ms */ + bw = (bytes * NSEC_PER_MSEC) / t; + if (bw > 0) { + if (ifp->if_bw.bw > 0) { + u_int32_t shft; + + shft = if_bw_smoothing_val; + /* Compute EWMA of bw */ + ifp->if_bw.bw = (bw + + ((ifp->if_bw.bw << shft) - + ifp->if_bw.bw)) >> shft; + } else { + ifp->if_bw.bw = bw; + } + } + } + ifp->if_bw.last_seq = oseq; + ifp->if_bw.last_ts = ots; + } + } + +done: + flags = ~(IF_MEASURED_BW_INPROGRESS | IF_MEASURED_BW_CALCULATION); + OSBitAndAtomic(flags, &ifp->if_bw.flags); +} + /****************************************************************************/ -/* ifaddr_t accessors */ +/* ifaddr_t accessors */ /****************************************************************************/ errno_t @@ -1540,46 +1956,49 @@ ifaddr_ifnet(ifaddr_t ifa) } ifaddr_t -ifaddr_withaddr( - const struct sockaddr* address) +ifaddr_withaddr(const struct sockaddr *address) { - if (address == NULL) return NULL; - return ifa_ifwithaddr(address); + if (address == NULL) + return (NULL); + + return (ifa_ifwithaddr(address)); } ifaddr_t -ifaddr_withdstaddr( - const struct sockaddr* address) +ifaddr_withdstaddr(const struct sockaddr *address) { - if (address == NULL) return NULL; - return ifa_ifwithdstaddr(address); + if (address == NULL) + return (NULL); + + return (ifa_ifwithdstaddr(address)); } ifaddr_t -ifaddr_withnet( - const struct sockaddr* net) +ifaddr_withnet(const struct sockaddr *net) { - if (net == NULL) return NULL; - return ifa_ifwithnet(net); + if (net == NULL) + return (NULL); + + return (ifa_ifwithnet(net)); } ifaddr_t -ifaddr_withroute( - int flags, - const struct sockaddr* destination, - const struct sockaddr* gateway) +ifaddr_withroute(int flags, const struct sockaddr *destination, + const struct sockaddr *gateway) { - if (destination == NULL || gateway == NULL) return NULL; - return ifa_ifwithroute(flags, destination, gateway); + if (destination == NULL || gateway == NULL) + return (NULL); + + return (ifa_ifwithroute(flags, destination, gateway)); } ifaddr_t -ifaddr_findbestforaddr( - const struct sockaddr *addr, - ifnet_t interface) +ifaddr_findbestforaddr(const struct sockaddr *addr, ifnet_t interface) { - if (addr == NULL || interface == NULL) return NULL; - return ifaof_ifpforaddr(addr, interface); + if (addr == NULL || interface == NULL) + return (NULL); + + return (ifaof_ifpforaddr(addr, interface)); } errno_t @@ -1646,37 +2065,42 @@ ifmaddr_lladdress(ifmultiaddr_t ifma, struct sockaddr *out_addr, ifnet_t ifmaddr_ifnet(ifmultiaddr_t ifma) { - return (ifma == NULL ? NULL : ifma->ifma_ifp); + return ((ifma == NULL) ? 
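
The smoothing step in ifnet_transmit_burst_end above is an exponentially weighted moving average in fixed point: with shft = if_bw_smoothing_val = s, the update is new = (sample + old * (2^s - 1)) >> s, so each sample contributes 1/2^s of its weight. For example, with s = 2, old = 800 and sample = 1200, the result is (1200 + 2400) >> 2 = 900. Restated as a standalone helper for clarity:

	/* Fixed-point EWMA, equivalent to the update in the patch above. */
	static uint64_t
	ewma(uint64_t old, uint64_t sample, u_int32_t shft)
	{
		return ((sample + ((old << shft) - old)) >> shft);
	}
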
NULL : ifma->ifma_ifp); } /******************************************************************************/ /* interface cloner */ /******************************************************************************/ -errno_t -ifnet_clone_attach(struct ifnet_clone_params *cloner_params, if_clone_t *ifcloner) +errno_t +ifnet_clone_attach(struct ifnet_clone_params *cloner_params, + if_clone_t *ifcloner) { errno_t error = 0; struct if_clone *ifc = NULL; size_t namelen; - - if (cloner_params == NULL || ifcloner == NULL || cloner_params->ifc_name == NULL || - cloner_params->ifc_create == NULL || cloner_params->ifc_destroy == NULL || - (namelen = strlen(cloner_params->ifc_name)) >= IFNAMSIZ) { + + if (cloner_params == NULL || ifcloner == NULL || + cloner_params->ifc_name == NULL || + cloner_params->ifc_create == NULL || + cloner_params->ifc_destroy == NULL || + (namelen = strlen(cloner_params->ifc_name)) >= IFNAMSIZ) { error = EINVAL; goto fail; } - + if (if_clone_lookup(cloner_params->ifc_name, NULL) != NULL) { - printf("ifnet_clone_attach: already a cloner for %s\n", cloner_params->ifc_name); + printf("%s: already a cloner for %s\n", __func__, + cloner_params->ifc_name); error = EEXIST; goto fail; } /* Make room for name string */ - ifc = _MALLOC(sizeof(struct if_clone) + IFNAMSIZ + 1, M_CLONE, M_WAITOK | M_ZERO); + ifc = _MALLOC(sizeof (struct if_clone) + IFNAMSIZ + 1, M_CLONE, + M_WAITOK | M_ZERO); if (ifc == NULL) { - printf("ifnet_clone_attach: _MALLOC failed\n"); + printf("%s: _MALLOC failed\n", __func__); error = ENOBUFS; goto fail; } @@ -1689,41 +2113,97 @@ ifnet_clone_attach(struct ifnet_clone_params *cloner_params, if_clone_t *ifclone error = if_clone_attach(ifc); if (error != 0) { - printf("ifnet_clone_attach: if_clone_attach failed %d\n", error); + printf("%s: if_clone_attach failed %d\n", __func__, error); goto fail; } *ifcloner = ifc; - - return 0; + + return (0); fail: if (ifc != NULL) FREE(ifc, M_CLONE); - return error; + return (error); } -errno_t +errno_t ifnet_clone_detach(if_clone_t ifcloner) { errno_t error = 0; struct if_clone *ifc = ifcloner; - + if (ifc == NULL || ifc->ifc_name == NULL) - return EINVAL; - + return (EINVAL); + if ((if_clone_lookup(ifc->ifc_name, NULL)) == NULL) { - printf("ifnet_clone_attach: no cloner for %s\n", ifc->ifc_name); + printf("%s: no cloner for %s\n", __func__, ifc->ifc_name); error = EINVAL; goto fail; } if_clone_detach(ifc); - + FREE(ifc, M_CLONE); - return 0; fail: - return error; + return (error); } +/******************************************************************************/ +/* misc */ +/******************************************************************************/ +extern void udp_get_ports_used(unsigned int ifindex, uint8_t *bitfield); +extern void tcp_get_ports_used(unsigned int ifindex, uint8_t *bitfield); +errno_t +ifnet_get_local_ports(ifnet_t ifp, uint8_t *bitfield) +{ + if (bitfield == NULL) + return (EINVAL); + + bzero(bitfield, 8192); + + udp_get_ports_used(ifp ? ifp->if_index : 0, bitfield); + tcp_get_ports_used(ifp ? 
ifp->if_index : 0, bitfield); + + return (0); +} + +errno_t +ifnet_notice_node_presence(ifnet_t ifp, struct sockaddr* sa, int32_t rssi, + int lqm, int npm, u_int8_t srvinfo[48]) +{ + if (ifp == NULL || sa == NULL || srvinfo == NULL) + return(EINVAL); + if (sa->sa_len > sizeof(struct sockaddr_storage)) + return(EINVAL); + if (sa->sa_family != AF_LINK && sa->sa_family != AF_INET6) + return(EINVAL); + + dlil_node_present(ifp, sa, rssi, lqm, npm, srvinfo); + return (0); +} + +errno_t +ifnet_notice_node_absence(ifnet_t ifp, struct sockaddr* sa) +{ + if (ifp == NULL || sa == NULL) + return(EINVAL); + if (sa->sa_len > sizeof(struct sockaddr_storage)) + return(EINVAL); + if (sa->sa_family != AF_LINK && sa->sa_family != AF_INET6) + return(EINVAL); + + dlil_node_absent(ifp, sa); + return (0); +} + +errno_t +ifnet_notice_master_elected(ifnet_t ifp) +{ + if (ifp == NULL) + return(EINVAL); + + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_MASTER_ELECTED, NULL, 0); + return (0); +} diff --git a/bsd/net/kpi_interface.h b/bsd/net/kpi_interface.h index e2fd084b6..d4df862da 100644 --- a/bsd/net/kpi_interface.h +++ b/bsd/net/kpi_interface.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2011 Apple Inc. All rights reserved. + * Copyright (c) 2004-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -34,13 +34,36 @@ #ifndef __KPI_INTERFACE__ #define __KPI_INTERFACE__ + +#ifndef XNU_KERNEL_PRIVATE +#include +#endif + #include +#ifdef KERNEL_PRIVATE +#include +#endif /* KERNEL_PRIVATE */ + #ifndef _SA_FAMILY_T #define _SA_FAMILY_T typedef __uint8_t sa_family_t; #endif +#ifdef XNU_KERNEL_PRIVATE +#if CONFIG_EMBEDDED + #define KPI_INTERFACE_EMBEDDED 1 +#else + #define KPI_INTERFACE_EMBEDDED 0 +#endif +#else +#if TARGET_OS_EMBEDDED + #define KPI_INTERFACE_EMBEDDED 1 +#else + #define KPI_INTERFACE_EMBEDDED 0 +#endif +#endif + struct timeval; struct sockaddr; struct sockaddr_dl; @@ -81,7 +104,7 @@ enum { IFNET_FAMILY_DISC = 8, IFNET_FAMILY_MDECAP = 9, IFNET_FAMILY_GIF = 10, - IFNET_FAMILY_FAITH = 11, + IFNET_FAMILY_FAITH = 11, /* deprecated */ IFNET_FAMILY_STF = 12, IFNET_FAMILY_FIREWIRE = 13, IFNET_FAMILY_BOND = 14, @@ -310,6 +333,10 @@ typedef void (*ifnet_event_func)(ifnet_t interface, const struct kev_msg *msg); protocol's pre-output function. @param frame_type The frame type as determined by the protocol's pre-output function. + @param prepend_len The length of prepended bytes to the mbuf. + (ONLY used if KPI_INTERFACE_EMBEDDED is defined to 1) + @param postpend_len The length of the postpended bytes to the mbuf. + (ONLY used if KPI_INTERFACE_EMBEDDED is defined to 1) @result If the result is zero, processing will continue normally. If the result is EJUSTRETURN, processing will stop but the @@ -318,8 +345,11 @@ typedef void (*ifnet_event_func)(ifnet_t interface, const struct kev_msg *msg); the packet will be freed. */ typedef errno_t (*ifnet_framer_func)(ifnet_t interface, mbuf_t *packet, - const struct sockaddr *dest, const char *desk_linkaddr, - const char *frame_type); + const struct sockaddr *dest, const char *desk_linkaddr, const char *frame_type +#if KPI_INTERFACE_EMBEDDED + , u_int32_t *prepend_len, u_int32_t *postpend_len +#endif /* KPI_INTERFACE_EMBEDDED */ + ); /*! @typedef ifnet_add_proto_func @@ -560,9 +590,9 @@ struct ifnet_stat_increment_param { /*! @struct ifnet_init_params @discussion This structure is used to define various properties of - the interface when calling ifnet_init. 
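
ifnet_get_local_ports above fills an 8192-byte bitfield, one bit per 16-bit port number (8192 * 8 = 65536). A sketch of allocating and testing it; the port value is hypothetical, and passing a NULL ifp appears to aggregate across all interfaces given the "ifp ? ifp->if_index : 0" above:

	uint8_t *bitfield;
	uint16_t port = 8080;

	MALLOC(bitfield, uint8_t *, 8192, M_TEMP, M_WAITOK | M_ZERO);
	if (bitfield != NULL) {
		if (ifnet_get_local_ports(NULL, bitfield) == 0 &&
		    (bitfield[port / 8] & (1 << (port % 8))))
			printf("port %u is in use locally\n", port);
		FREE(bitfield, M_TEMP);
	}
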
A copy of these values - will be stored in the ifnet and can not be modified while the - interface is attached. + the interface when calling ifnet_allocate. A copy of these + values will be stored in the ifnet and cannot be modified + while the interface is attached. @field uniqueid An identifier unique to this instance of the interface. @field uniqueid_len The length, in bytes, of the uniqueid. @@ -618,6 +648,245 @@ struct ifnet_init_params { u_int32_t broadcast_len; /* required for non point-to-point interfaces */ }; +#ifdef KERNEL_PRIVATE +/* Valid values for version */ +#define IFNET_INIT_VERSION_2 2 +#define IFNET_INIT_CURRENT_VERSION IFNET_INIT_VERSION_2 + +/* Valid values for flags */ +#define IFNET_INIT_LEGACY 0x1 /* legacy network interface model */ +#define IFNET_INIT_INPUT_POLL 0x2 /* opportunistic input polling model */ + +/* + @typedef ifnet_pre_enqueue_func + @discussion ifnet_pre_enqueue_func is called for each outgoing packet + for the interface. The driver may perform last-minute changes + on the (fully formed) packet, but it is responsible for calling + ifnet_enqueue() to enqueue the packet upon completion. + @param interface The interface being sent on. + @param data The packet to be sent. + */ +typedef errno_t (*ifnet_pre_enqueue_func)(ifnet_t interface, mbuf_t data); + +/* + @typedef ifnet_start_func + @discussion ifnet_start_func is used to indicate to the driver that + one or more packets may be dequeued by calling ifnet_dequeue() + or ifnet_dequeue_multi(). This routine gets invoked when + ifnet_start() is called; the ifnet_start_func callback will + be executed within the context of a dedicated kernel thread, + hence it is guaranteed to be single threaded. The driver must + employ additional serializations if this callback routine is + to be called directly from another context, in order to prevent + race condition related issues (e.g. out-of-order packets.) + The dequeued packets will be fully formed packets (including + frame headers). The packets must be freed by the driver. + @param interface The interface being sent on. + */ +typedef void (*ifnet_start_func)(ifnet_t interface); + +/* + @typedef ifnet_input_poll_func + @discussion ifnet_input_poll_func is called by the network stack to + retrieve one or more packets from the driver which implements + the new driver input model. + @param interface The interface to retrieve the packets from. + @param flags For future use. + @param max_count The maximum number of packets to be dequeued. + @param first_packet Pointer to the first packet being dequeued. + @param last_packet Pointer to the last packet being dequeued. + @param cnt Pointer to a storage for the number of packets dequeued. + @param len Pointer to a storage for the total length (in bytes) + of the dequeued packets. + */ +typedef void (*ifnet_input_poll_func)(ifnet_t interface, u_int32_t flags, + u_int32_t max_count, mbuf_t *first_packet, mbuf_t *last_packet, + u_int32_t *cnt, u_int32_t *len); + +/* + @enum Interface control commands + @abstract Constants defining control commands. + @constant IFNET_CTL_SET_INPUT_MODEL Set input model. + @constant IFNET_CTL_GET_INPUT_MODEL Get input model. + */ +enum { + IFNET_CTL_SET_INPUT_MODEL = 1, + IFNET_CTL_GET_INPUT_MODEL = 2, +}; + +/* + @typedef ifnet_ctl_cmd_t + @abstract Storage type for the interface control command. + */ +typedef u_int32_t ifnet_ctl_cmd_t; + +/* + @enum Interface model sub-commands + @abstract Constants defining model sub-commands. + @constant IFNET_MODEL_INPUT_POLL_OFF Polling is inactive. 
When set, + the network stack will no longer invoke the input_poll callback + until the next time polling is turned on; the driver should + proceed to pushing the packets up to the network stack as in + the legacy input model, and if applicable, the driver should + also enable receive interrupt for the hardware. During get, + this indicates that the driver is currently operating in + the legacy/push input model. + @constant IFNET_MODEL_INPUT_POLL_ON Polling is active. When set, the + network stack will begin to invoke the input_poll callback to + retrieve packets from the driver until the next time polling + is turned off; the driver should no longer be pushing packets + up to the network stack, and if applicable, the driver should + also disable receive interrupt for the hardware. During get, + this indicates that the driver is currently operating in + the new/pull input model. + */ +enum { + IFNET_MODEL_INPUT_POLL_OFF = 0, + IFNET_MODEL_INPUT_POLL_ON = 1, +}; + +/* + @typedef ifnet_model_t + @abstract Storage type for the interface model sub-command. + */ +typedef u_int32_t ifnet_model_t; + +/* + @struct ifnet_model_params + @discussion This structure is used as parameter to the ifnet model + sub-commands. + @field model The interface model. + */ +struct ifnet_model_params { + ifnet_model_t model; + u_int32_t reserved[3]; +}; + +/* + @typedef ifnet_ctl_func + @discussion ifnet_ctl_func is called by the network stack to inform + about changes in parameters, or retrieve the parameters + related to the output or input processing or capabilities. + @param interface The interface. + @param cmd The ifnet_ctl_cmd_t interface control command. + @param arglen The length of the command argument. + @param arg The command argument. + @result 0 upon success, otherwise errno error. + */ +typedef errno_t (*ifnet_ctl_func)(ifnet_t interface, ifnet_ctl_cmd_t cmd, + u_int32_t arglen, void *arg); + +/* + @struct ifnet_init_eparams + @discussion This structure is used to define various properties of + the interface when calling ifnet_allocate_extended. A copy of + these values will be stored in the ifnet and cannot be modified + while the interface is attached. + @field ver The current structure version (IFNET_INIT_CURRENT_VERSION) + @field len The length of this structure. + @field flags See above values for flags. + @field uniqueid An identifier unique to this instance of the + interface. + @field uniqueid_len The length, in bytes, of the uniqueid. + @field name The interface name (i.e. en). + @field unit The interface unit number (en0's unit number is 0). + @field family The interface family. + @field type The interface type (see sys/if_types.h). Must be less + than 256. For new types, use IFT_OTHER. + @field sndq_maxlen The maximum size of the output queue; valid only + if IFNET_INIT_LEGACY is not set. + @field output The output function for the interface. Every packet the + stack attempts to send through this interface will go out + through this function. + @field pre_enqueue The pre_enqueue function for the interface, valid + only if IFNET_INIT_LEGACY is not set, and optional if it is set. + @field start The start function for the interface, valid only if + IFNET_INIT_LEGACY is not set, and required if it is set. + @field output_ctl The output control function for the interface, valid + only if IFNET_INIT_LEGACY is not set. + @field output_sched_model The IFNET_SCHED_MODEL value for the output + queue, as defined in net/if.h + @field output_bw The effective output bandwidth (in bits per second.) 
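
A driver that sets IFNET_INIT_INPUT_POLL supplies an input_ctl callback to receive the model sub-commands described above. A minimal sketch; the foo_* names and the interrupt enable/disable helpers are hypothetical:

	static errno_t
	foo_input_ctl(ifnet_t ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen,
	    void *arg)
	{
		struct ifnet_model_params *p = arg;

		if (cmd != IFNET_CTL_SET_INPUT_MODEL || arglen < sizeof (*p))
			return (EINVAL);

		if (p->model == IFNET_MODEL_INPUT_POLL_ON)
			foo_disable_rx_interrupts(ifp);	/* stack will poll */
		else
			foo_enable_rx_interrupts(ifp);	/* back to push model */

		return (0);
	}
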
+ @field output_bw_max The maximum theoretical output bandwidth + (in bits per second.) + @field input_poll The poll function for the interface, valid only if + IFNET_INIT_LEGACY is not set and only if IFNET_INIT_INPUT_POLL + is set. + @field input_ctl The input control function for the interface, valid + only if IFNET_INIT_LEGACY is not set and only if opportunistic + input polling is enabled via IFNET_INIT_INPUT_POLL flag. + @field rcvq_maxlen The size of the driver's receive ring or the total + count of descriptors used in the receive path; valid only if + IFNET_INIT_INPUT_POLL is set. + @field input_bw The effective input bandwidth (in bits per second.) + @field input_bw_max The maximum theoretical input bandwidth + (in bits per second.) + @field demux The function used to determine the protocol family of an + incoming packet. + @field add_proto The function used to attach a protocol to this + interface. + @field del_proto The function used to remove a protocol from this + interface. + @field framer The function used to frame outbound packets, may be NULL. + @field softc Driver specific storage. This value can be retrieved from + the ifnet using the ifnet_softc function. + @field ioctl The function used to handle ioctls. + @field set_bpf_tap The function used to set the bpf_tap function. + @field detach The function called to let the driver know the interface + has been detached. + @field event The function to notify the interface of various interface + specific kernel events. + @field broadcast_addr The link-layer broadcast address for this + interface. + @field broadcast_len The length of the link-layer broadcast address. +*/ +struct ifnet_init_eparams { + u_int32_t ver; /* required */ + u_int32_t len; /* required */ + u_int32_t flags; /* optional */ + + /* used to match recycled interface */ + const void *uniqueid; /* optional */ + u_int32_t uniqueid_len; /* optional */ + + /* used to fill out initial values for interface */ + const char *name; /* required */ + u_int32_t unit; /* required */ + ifnet_family_t family; /* required */ + u_int32_t type; /* required */ + u_int32_t sndq_maxlen; /* optional, only for new model */ + ifnet_output_func output; /* required only for legacy model */ + ifnet_pre_enqueue_func pre_enqueue; /* optional, only for new model */ + ifnet_start_func start; /* required only for new model */ + ifnet_ctl_func output_ctl; /* optional, only for new model */ + u_int32_t output_sched_model; /* optional, only for new model */ + u_int32_t reserved; /* for future use */ + u_int64_t output_bw; /* optional */ + u_int64_t output_bw_max; /* optional */ + u_int64_t _reserved[4]; /* for future use */ + ifnet_input_poll_func input_poll; /* optional, ignored for legacy model */ + ifnet_ctl_func input_ctl; /* required for opportunistic polling */ + u_int32_t rcvq_maxlen; /* optional, only for opportunistic polling */ + u_int32_t __reserved; /* for future use */ + u_int64_t input_bw; /* optional */ + u_int64_t input_bw_max; /* optional */ + u_int64_t ___reserved[4]; /* for future use */ + ifnet_demux_func demux; /* required */ + ifnet_add_proto_func add_proto; /* required */ + ifnet_del_proto_func del_proto; /* required */ + ifnet_check_multi check_multi; /* required for non point-to-point interfaces */ + ifnet_framer_func framer; /* optional */ + void *softc; /* optional */ + ifnet_ioctl_func ioctl; /* optional */ + ifnet_set_bpf_tap set_bpf_tap; /* deprecated */ + ifnet_detached_func detach; /* optional */ + ifnet_event_func event; /* optional */ + const void 
*broadcast_addr; /* required for non point-to-point interfaces */ + u_int32_t broadcast_len; /* required for non point-to-point interfaces */ + u_int64_t ____reserved[4]; /* for future use */ +}; +#endif /* KERNEL_PRIVATE */ + /*! @struct ifnet_stats_param @discussion This structure is used get and set the interface @@ -744,6 +1013,250 @@ __BEGIN_DECLS extern errno_t ifnet_allocate(const struct ifnet_init_params *init, ifnet_t *interface); +#ifdef KERNEL_PRIVATE +/* + @function ifnet_allocate_extended + @discussion An extended/newer variant of ifnet_allocate, with additional + support for the new output and input driver models. + @param init The initial values for the interface. These values can + not be changed after the interface has been allocated. + @param interface The interface allocated upon success. + @result May return ENOMEM if there is insufficient memory or EEXIST + if an interface with the same uniqueid and family has already + been allocated and is in use. + */ +extern errno_t ifnet_allocate_extended(const struct ifnet_init_eparams *init, + ifnet_t *interface); + +/* + @function ifnet_purge + @discussion Purge the output queue of an interface which implements + the new driver output model. + @param interface The interface to purge. + */ +extern void ifnet_purge(ifnet_t interface); + +/* + @function ifnet_enqueue + @discussion Enqueue a packet to the output queue of an interface + which implements the new driver output model. + @param interface The interface to enqueue the packet to. + @param packet The packet being enqueued; only one packet is allowed + to be enqueued at a time. + @result May return EINVAL if the parameters are invalid; ENXIO if + the interface doesn't implement the new driver output model; + EQFULL if the output queue is flow-controlled; or EQSUSPENDED + if the output queue is suspended. This routine either frees + or consumes the packet; the caller must not modify or free + it after calling this routine. Any attempt to enqueue more + than one packet will cause the entire packet chain to be freed. + */ +extern errno_t ifnet_enqueue(ifnet_t interface, mbuf_t packet); + +/* + @function ifnet_dequeue + @discussion Dequeue a packet from the output queue of an interface + which implements the new driver output model, and that the + output scheduling model is set to IFNET_SCHED_MODEL_NORMAL. + @param interface The interface to dequeue the packet from. + @param packet Pointer to the packet being dequeued. + @result May return EINVAL if the parameters are invalid, ENXIO if + the interface doesn't implement the new driver output model + or the output scheduling model isn't IFNET_SCHED_MODEL_NORMAL, + or EAGAIN if there is currently no packet available to + be dequeued. + */ +extern errno_t ifnet_dequeue(ifnet_t interface, mbuf_t *packet); + +/* + @function ifnet_dequeue_service_class + @discussion Dequeue a packet of a particular service class from the + appropriate output queue of an interface which implements the + new driver output model, and that the output scheduling model + is set to IFNET_SCHED_MODEL_DRIVER_MANAGED. + @param interface The interface to dequeue the packet from. + @param tc The service class. + @param packet Pointer to the packet being dequeued. + @result May return EINVAL if the parameters are invalid, ENXIO if + the interface doesn't implement the new driver output model + or if the output scheduling model isn't configured to + IFNET_SCHED_MODEL_DRIVER_MANAGED, or EAGAIN if there + is currently no packet available to be dequeued. 
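
Putting the pieces together, a new-model driver fills ifnet_init_eparams and calls ifnet_allocate_extended. A minimal sketch; foo_start, foo_demux, foo_add_proto and foo_del_proto are hypothetical driver callbacks, and IFT_ETHER comes from net/if_types.h:

	struct ifnet_init_eparams ep;
	ifnet_t ifp;
	errno_t err;

	bzero(&ep, sizeof (ep));
	ep.ver = IFNET_INIT_CURRENT_VERSION;
	ep.len = sizeof (ep);
	ep.flags = 0;			/* new model: IFNET_INIT_LEGACY unset */
	ep.name = "foo";
	ep.unit = 0;
	ep.family = IFNET_FAMILY_ETHERNET;
	ep.type = IFT_ETHER;
	ep.sndq_maxlen = 512;
	ep.start = foo_start;		/* required for the new model */
	ep.demux = foo_demux;
	ep.add_proto = foo_add_proto;
	ep.del_proto = foo_del_proto;

	err = ifnet_allocate_extended(&ep, &ifp);
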
+ */ +extern errno_t ifnet_dequeue_service_class(ifnet_t interface, + mbuf_svc_class_t tc, mbuf_t *packet); + +/* + @function ifnet_dequeue_multi + @discussion Dequeue one or more packets from the output queue of an + interface which implements the new driver output model, and that + the output scheduling model is set to IFNET_SCHED_MODEL_NORMAL. + The returned packet chain is traversable with mbuf_nextpkt(). + @param interface The interface to dequeue the packets from. + @param first_packet Pointer to the first packet being dequeued. + @param last_packet Pointer to the last packet being dequeued. Caller + may supply NULL if not interested in value. + @param cnt Pointer to a storage for the number of packets dequeued. + Caller may supply NULL if not interested in value. + @param len Pointer to a storage for the total length (in bytes) + of the dequeued packets. Caller may supply NULL if not + interested in value. + @result May return EINVAL if the parameters are invalid, ENXIO if + the interface doesn't implement the new driver output model + or the output scheduling model isn't IFNET_SCHED_MODEL_NORMAL, + or EAGAIN if there is currently no packet available to + be dequeued. + */ +extern errno_t ifnet_dequeue_multi(ifnet_t interface, u_int32_t max, + mbuf_t *first_packet, mbuf_t *last_packet, u_int32_t *cnt, u_int32_t *len); + +/* + @function ifnet_dequeue_service_class_multi + @discussion Dequeue one or more packets of a particular service class + from the appropriate output queue of an interface which + implements the new driver output model, and that the output + scheduling model is set to IFNET_SCHED_MODEL_DRIVER_MANAGED. + The returned packet chain is traversable with mbuf_nextpkt(). + @param interface The interface to dequeue the packets from. + @param tc The service class. + @param first_packet Pointer to the first packet being dequeued. + @param last_packet Pointer to the last packet being dequeued. Caller + may supply NULL if not interested in value. + @param cnt Pointer to a storage for the number of packets dequeued. + Caller may supply NULL if not interested in value. + @param len Pointer to a storage for the total length (in bytes) + of the dequeued packets. Caller may supply NULL if not + interested in value. + @result May return EINVAL if the parameters are invalid, ENXIO if + the interface doesn't implement the new driver output model + or if the output scheduling model isn't configured to + IFNET_SCHED_MODEL_DRIVER_MANAGED, or EAGAIN if there + is currently no packet available to be dequeued. + */ +extern errno_t ifnet_dequeue_service_class_multi(ifnet_t interface, + mbuf_svc_class_t tc, u_int32_t max, mbuf_t *first_packet, + mbuf_t *last_packet, u_int32_t *cnt, u_int32_t *len); + +/* + @function ifnet_set_output_sched_model + @discussion Set the output scheduling model of an interface which + implements the new driver output model. + @param interface The interface to set scheduling model on. + @param model The IFNET_SCHED_MODEL value as defined in net/if.h + @result May return EINVAL if the parameters are invalid or ENXIO if + the interface doesn't implement the new driver output model. + */ +extern errno_t ifnet_set_output_sched_model(ifnet_t interface, + u_int32_t model); + +/* + @function ifnet_set_sndq_maxlen + @discussion Set the maximum length of the output queue of an + interface which implements the new driver output model. 
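
With IFNET_SCHED_MODEL_DRIVER_MANAGED, the start routine drains one service class at a time using the KPIs above. A sketch of the dequeue loop; tc and the transmit helper foo_tx_submit are hypothetical:

	mbuf_t pkt;

	while (ifnet_dequeue_service_class(ifp, tc, &pkt) == 0) {
		/* pkt is a fully formed frame; the driver owns it and
		 * must free it once transmission is done. */
		foo_tx_submit(ifp, pkt);
	}
	/* A non-zero return (EAGAIN) means the class queue is empty. */
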
+ This call may be issued post ifnet_allocate_extended in + order to modify the maximum output queue length previously + set at registration time. + @param interface The interface to set the max queue length on. + @param maxqlen The maximum number of packets in the output queue. + @result May return EINVAL if the parameters are invalid or ENXIO if + the interface doesn't implement the new driver output model. + */ +extern errno_t ifnet_set_sndq_maxlen(ifnet_t interface, u_int32_t maxqlen); + +/* + @function ifnet_get_sndq_maxlen + @discussion Get the maximum length of the output queue of an + interface which implements the new driver output model. + @param interface The interface to get the max queue length on. + @param maxqlen Pointer to a storage for the maximum number of packets + in the output queue. + @result May return EINVAL if the parameters are invalid or ENXIO if + the interface doesn't implement the new driver output model. + */ +extern errno_t ifnet_get_sndq_maxlen(ifnet_t interface, u_int32_t *maxqlen); + +/* + @function ifnet_get_sndq_len + @discussion Get the current length of the output queue of an + interface which implements the new driver output model. + @param interface The interface to get the current queue length on. + @param qlen Pointer to a storage for the current number of packets + in the output queue. + @result May return EINVAL if the parameters are invalid or ENXIO if + the interface doesn't implement the new driver output model. + */ +extern errno_t ifnet_get_sndq_len(ifnet_t interface, u_int32_t *qlen); + +/* + @function ifnet_set_rcvq_maxlen + @discussion Set the maximum length of the input queue of an + interface which implements the new driver input model. + This call may be issued post ifnet_allocate_extended in + order to modify the maximum input queue length previously + set at registration time. + @param interface The interface to set the max queue length on. + @param maxqlen The maximum number of packets in the input queue. + Drivers typically set this to the size of the receive ring + or the total number of descriptors used for the input path. + @result May return EINVAL if the parameters are invalid or ENXIO if + the interface doesn't implement the new driver input model. + */ +extern errno_t ifnet_set_rcvq_maxlen(ifnet_t interface, u_int32_t maxqlen); + +/* + @function ifnet_get_rcvq_maxlen + @discussion Get the maximum length of the input queue of an + interface which implements the new driver input model. + @param interface The interface to get the max queue length on. + @param maxqlen Pointer to a storage for the maximum number of packets + in the input queue. + @result May return EINVAL if the parameters are invalid or ENXIO if + the interface doesn't implement the new driver input model. + */ +extern errno_t ifnet_get_rcvq_maxlen(ifnet_t interface, u_int32_t *maxqlen); + +/* + @function ifnet_start + @discussion Trigger the transmission at the driver layer on an + interface which implements the new driver output model. + @param interface The interface to start the transmission on. + */ +extern void ifnet_start(ifnet_t interface); + +/* + @function ifnet_transmit_burst_start + @discussion Inform the kernel about the beginning of transmission + of a burst. This function should be called when a burst of + packets are scheduled to get transmitted over the link. The + callback will be used by the system to start measuring + bandwidth available on that link. 
+/*
+	@function ifnet_transmit_burst_start
+	@discussion Inform the kernel about the beginning of transmission
+		of a burst.  This function should be called when a burst of
+		packets is scheduled to be transmitted over the link.  The
+		kernel uses this notification to start measuring the
+		bandwidth available on that link.  The driver may adopt this
+		scheme for uplink bandwidth measurement when the information
+		can't be obtained from the hardware; alternatively, it may
+		report the information to the network stack directly using
+		ifnet_set_bandwidths().
+	@param interface The interface.
+	@param pkt The first packet in a burst of packets that has been
+		scheduled to transmit.
+*/
+extern void ifnet_transmit_burst_start(ifnet_t interface, mbuf_t pkt);
+
+/*
+	@function ifnet_transmit_burst_end
+	@discussion Inform the kernel about the end of transmission of a burst.
+		This function should be called when the transmission of a burst
+		of packets is done.  This information will be used by the
+		system to estimate the bandwidth available on that link.  The
+		driver may adopt this scheme for uplink bandwidth measurement
+		when the information can't be obtained from the hardware;
+		alternatively, it may report the information to the network
+		stack directly using ifnet_set_bandwidths().
+	@param interface The interface.
+	@param pkt The last packet in the burst that has been successfully
+		transmitted.
+*/
+extern void ifnet_transmit_burst_end(ifnet_t interface, mbuf_t pkt);
+#endif /* KERNEL_PRIVATE */
+
 /*!
 	@function ifnet_reference
 	@discussion Increment the reference count of the ifnet to assure
@@ -903,7 +1416,7 @@ extern u_int16_t ifnet_flags(ifnet_t interface);
 
 #ifdef KERNEL_PRIVATE
-/*!
+/*
 	@function ifnet_set_eflags
 	@discussion Sets the extended interface flags to new_flags. This
 		function lets you specify which flags you want to change using
@@ -919,7 +1432,7 @@ extern u_int16_t ifnet_flags(ifnet_t interface);
 extern errno_t ifnet_set_eflags(ifnet_t interface, u_int32_t new_flags,
     u_int32_t mask);
 
-/*!
+/*
 	@function ifnet_eflags
 	@discussion Returns the extended interface flags that are set.
 	@param interface Interface to retrieve the flags from.
@@ -927,7 +1440,7 @@ extern errno_t ifnet_set_eflags(ifnet_t interface, u_int32_t new_flags,
 */
 extern u_int32_t ifnet_eflags(ifnet_t interface);
 
-/*!
+/*
 	@function ifnet_set_idle_flags
 	@discussion Sets the if_idle_flags to new_flags. This function
 		lets you specify which flags you want to change using the
@@ -947,7 +1460,7 @@ extern u_int32_t ifnet_eflags(ifnet_t interface);
 extern errno_t ifnet_set_idle_flags(ifnet_t interface, u_int32_t new_flags,
     u_int32_t mask);
 
-/*!
+/*
 	@function ifnet_idle_flags
 	@discussion Returns the value of if_idle_flags.
 	@param interface Interface to retrieve the flags from.
@@ -955,6 +1468,101 @@ extern errno_t ifnet_set_idle_flags(ifnet_t interface, u_int32_t new_flags,
 */
 extern u_int32_t ifnet_idle_flags(ifnet_t interface);
 
+/*
+	@function ifnet_set_link_quality
+	@discussion Sets the Link Quality Metric for the ifnet.
+	@param interface Interface with which the Link Quality Metric should
+		be associated.
+	@param quality IFNET_LQM value as defined in net/if.h.
+	@result 0 on success otherwise the errno error.  EINVAL if quality
+		is not a valid value.  ENXIO if the interface is not attached.
+*/
+extern errno_t ifnet_set_link_quality(ifnet_t interface, int quality);
+
+/*
+	@function ifnet_link_quality
+	@discussion Returns the Link Quality Metric for the ifnet.
+	@param interface Interface to retrieve the value from.
+	@result IFNET_LQM as defined in net/if.h.
+*/
+extern int ifnet_link_quality(ifnet_t interface);
+
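One plausible way to use the burst and link-quality calls above (an editorial sketch assuming a driver without hardware bandwidth counters; mydrv_hw_send_chain() and the signal-change hook are invented for illustration, and the IFNET_LQM_THRESH_* constants are assumed to be those declared alongside these KPIs in net/if.h):

#include <net/if.h>
#include <net/kpi_interface.h>

extern void mydrv_hw_send_chain(mbuf_t head);	/* hypothetical DMA kick */

static void
mydrv_tx_burst(ifnet_t ifp, mbuf_t first, mbuf_t last)
{
	/* Bracket the burst so the stack can measure uplink bandwidth. */
	ifnet_transmit_burst_start(ifp, first);
	mydrv_hw_send_chain(first);
	/* In a real driver this would be called from the completion
	   path, once `last' has actually left the hardware. */
	ifnet_transmit_burst_end(ifp, last);
}

static void
mydrv_signal_changed(ifnet_t ifp, boolean_t good_signal)
{
	/* Report coarse link quality so the stack can adapt. */
	ifnet_set_link_quality(ifp, good_signal ?
	    IFNET_LQM_THRESH_GOOD : IFNET_LQM_THRESH_POOR);
}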
+/*
+	@struct ifnet_llreach_info
+	@discussion This structure is used to describe the link-layer
+		reachability information of an on-link node.
+	@field iflri_refcnt The number of network-layer objects referring
+		to this link-layer reachability record.
+	@field iflri_probes The total number of outstanding probes.
+	@field iflri_snd_expire The send expiration time.  This is calculated
+		based on the last time the system transmitted a packet to the
+		node.  A zero value indicates that a packet has not been sent
+		to the node.  A non-zero value indicates the time before the
+		record is determined to be invalid.  When the record is no
+		longer valid, the system will send probe(s) to resolve the
+		node again.  This value is relative to the current time
+		specified in iflri_curtime.
+	@field iflri_rcv_expire The receive expiration time.  This is
+		calculated based on the last time the system received a packet
+		from the node.  A zero value indicates that a packet has not
+		been received from the node.  A non-zero value indicates the
+		time before the record is determined to be invalid.  When the
+		record is no longer valid, the system will send probe(s) to
+		resolve the node again.  This value is relative to the current
+		time specified in iflri_curtime.
+	@field iflri_curtime The current time when this record was retrieved.
+	@field iflri_netproto The protocol number of the network-layer object.
+	@field iflri_addr The link-layer address of the node.
+	@field iflri_rssi The received signal strength indication (RSSI) of the
+		node in dBm.  The special value IFNET_RSSI_UNKNOWN is used when
+		the RSSI is either unknown or inapplicable for the interface.
+	@field iflri_lqm The link quality metric (LQM) to the node.  The
+		special value IFNET_LQM_UNKNOWN is used when the LQM is not
+		currently known.  The special value IFNET_LQM_OFF is used when
+		the link quality metric is inapplicable to nodes attached to
+		the network at this interface.
+	@field iflri_npm The node proximity metric (NPM) to the node.  The
+		special value IFNET_NPM_UNKNOWN is used when the NPM is not
+		currently known.
+ */
+#define	IFNET_LLREACHINFO_ADDRLEN	64	/* max ll addr len */
+
+struct ifnet_llreach_info {
+	u_int32_t iflri_refcnt;
+	u_int32_t iflri_probes;
+	u_int64_t iflri_snd_expire;
+	u_int64_t iflri_rcv_expire;
+	u_int64_t iflri_curtime;
+	u_int32_t iflri_netproto;
+	u_int8_t iflri_addr[IFNET_LLREACHINFO_ADDRLEN];
+	int32_t iflri_rssi;
+	int32_t iflri_lqm;
+	int32_t iflri_npm;
+};
+
+/*
+	@function ifnet_inet_defrouter_llreachinfo
+	@discussion Retrieve link-layer reachability information of the
+		default IPv4 router specific to the interface.
+	@param interface The interface associated with the default IPv4 router.
+	@param pinfo Pointer to the ifnet_llreach_info structure where the
+		information will be returned upon success.
+	@result 0 upon success, otherwise errno error.
+ */
+extern errno_t ifnet_inet_defrouter_llreachinfo(ifnet_t interface,
+    struct ifnet_llreach_info *pinfo);
+
+/*
+	@function ifnet_inet6_defrouter_llreachinfo
+	@discussion Retrieve link-layer reachability information of the
+		default IPv6 router specific to the interface.
+	@param interface The interface associated with the default IPv6 router.
+	@param pinfo Pointer to the ifnet_llreach_info structure where the
+		information will be returned upon success.
+	@result 0 upon success, otherwise errno error.
+ */
+extern errno_t ifnet_inet6_defrouter_llreachinfo(ifnet_t interface,
+    struct ifnet_llreach_info *pinfo);
 #endif /* KERNEL_PRIVATE */
 
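For illustration, a hedged sketch of querying the default IPv6 router's reachability record and inspecting a few of the fields documented above (the interpretation of the expiration fields follows the field descriptions; the logging function and its placement are invented):

#include <sys/systm.h>
#include <net/if.h>
#include <net/kpi_interface.h>

static void
mydrv_log_v6_router_reach(ifnet_t ifp)
{
	struct ifnet_llreach_info lri;

	if (ifnet_inet6_defrouter_llreachinfo(ifp, &lri) != 0)
		return;	/* no default router, or the lookup failed */

	/* Zero means no packet has been sent to the node yet;
	   otherwise compare against the snapshot in iflri_curtime. */
	if (lri.iflri_snd_expire != 0 &&
	    lri.iflri_snd_expire > lri.iflri_curtime)
		printf("v6 router record still valid for sending\n");

	if (lri.iflri_rssi != IFNET_RSSI_UNKNOWN)
		printf("v6 router RSSI: %d dBm\n", lri.iflri_rssi);
}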
 /*!
@@ -1258,6 +1866,28 @@ extern errno_t ifnet_output_raw(ifnet_t interface,
 extern errno_t ifnet_input(ifnet_t interface, mbuf_t first_packet,
     const struct ifnet_stat_increment_param *stats);
 
+#ifdef KERNEL_PRIVATE
+/*
+	@function ifnet_input_extended
+	@discussion Inputs packets from the interface.  The interface's demux
+		will be called to determine the protocol.  Once the protocol is
+		determined, the interface filters and protocol filters will be
+		called.  From there, the packet will be passed to the registered
+		protocol.  If there is an error, the mbuf chain will be freed.
+	@param interface The interface.
+	@param first_packet The first packet in a chain of packets.
+	@param last_packet The last packet in a chain of packets.  This may be
+		set to NULL if the driver does not have the information.
+	@param stats Counts to be integrated into the interface statistics.
+		The interface statistics will be incremented by the amounts
+		specified in stats.  Unlike ifnet_input(), this parameter is
+		required by this extended variant.
+	@result 0 on success otherwise the errno error.
+ */
+extern errno_t ifnet_input_extended(ifnet_t interface, mbuf_t first_packet,
+    mbuf_t last_packet, const struct ifnet_stat_increment_param *stats);
+#endif /* KERNEL_PRIVATE */
+
 /*!
 	@function ifnet_ioctl
 	@discussion Calls the interface's ioctl function with the parameters
@@ -1392,6 +2022,38 @@ extern errno_t ifnet_set_baudrate(ifnet_t interface, u_int64_t baudrate);
 */
 extern u_int64_t ifnet_baudrate(ifnet_t interface);
 
+#ifdef KERNEL_PRIVATE
+typedef struct if_bandwidths if_bandwidths_t;
+
+/*
+	@function ifnet_set_bandwidths
+	@param interface The interface.
+	@param output_bw The output bandwidth values (in bits per second).
+		May be set to NULL if the caller does not want to alter the
+		existing output bandwidth values.
+	@param input_bw The input bandwidth values (in bits per second).
+		May be set to NULL if the caller does not want to alter the
+		existing input bandwidth values.
+	@result 0 on success otherwise the errno error.
+ */
+extern errno_t ifnet_set_bandwidths(ifnet_t interface,
+    if_bandwidths_t *output_bw, if_bandwidths_t *input_bw);
+
+/*
+	@function ifnet_bandwidths
+	@param interface The interface.
+	@param output_bw The output bandwidth values (in bits per second).
+		May be set to NULL if the caller does not want to retrieve the
+		output bandwidth value.
+	@param input_bw The input bandwidth values (in bits per second).
+		May be set to NULL if the caller does not want to retrieve the
+		input bandwidth value.
+	@result 0 on success otherwise the errno error.
+ */
+extern errno_t ifnet_bandwidths(ifnet_t interface, if_bandwidths_t *output_bw,
+    if_bandwidths_t *input_bw);
+#endif /* KERNEL_PRIVATE */
+
 /*!
 	@function ifnet_stat_increment
 	@discussion
@@ -2010,6 +2672,71 @@ extern errno_t ifnet_clone_attach(struct ifnet_clone_params *cloner_params, if_c
 */
 extern errno_t ifnet_clone_detach(if_clone_t ifcloner);
 
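By way of example (an editorial sketch; the driver-side packet and byte accounting is assumed to have been done while the RX chain was built), handing a received chain to ifnet_input_extended() with the mandatory statistics block might look like:

#include <sys/systm.h>
#include <sys/kpi_mbuf.h>
#include <net/kpi_interface.h>

static void
mydrv_rx_deliver(ifnet_t ifp, mbuf_t head, mbuf_t tail,
    u_int32_t npkts, u_int32_t nbytes)
{
	struct ifnet_stat_increment_param stats;

	/* Unlike ifnet_input(), the stats argument is required here. */
	bzero(&stats, sizeof (stats));
	stats.packets_in = npkts;
	stats.bytes_in = nbytes;

	/* On failure the stack has already freed the mbuf chain, so
	   there is nothing for the driver to clean up. */
	(void) ifnet_input_extended(ifp, head, tail, &stats);
}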
+/******************************************************************************/
+/* misc */
+/******************************************************************************/
+
+/*
+	@function ifnet_get_local_ports
+	@discussion Returns a bitfield indicating which ports have sockets
+		open.  An interface that supports waking the host on unicast
+		traffic may use this information to discard incoming unicast
+		packets that don't have a corresponding bit set instead of
+		waking up the host.  For port 0x0001, bit 1 of the first byte
+		would be set.  For port n, bit 1 << (n % 8) of the (n / 8)'th
+		byte would be set.
+	@param ifp The interface in question.
+	@param bitfield A pointer to 8192 bytes.
+	@result Returns 0 on success.
+ */
+extern errno_t ifnet_get_local_ports(ifnet_t ifp, uint8_t *bitfield);
+/******************************************************************************/
+/* for interfaces that support dynamic node absence/presence events */
+/******************************************************************************/
+
+/*
+	@function ifnet_notice_node_presence
+	@discussion Provided for network interface drivers to notify the
+		system of a change detected in the presence of the specified
+		node.
+	@param ifp The interface attached to the link where the specified node
+		is present.
+	@param sa The AF_LINK family address of the node whose presence is
+		changing.
+	@param rssi The received signal strength indication as measured in
+		dBm by a radio receiver.
+	@param lqm A link quality metric associated with the specified node.
+	@param npm A node proximity metric associated with the specified node.
+	@param srvinfo A fixed-size array of octets containing opaque service
+		information data used by the mDNS responder subsystem.
+	@result Returns 0 on success, or EINVAL if arguments are invalid.
+ */
+extern errno_t
+ifnet_notice_node_presence(ifnet_t ifp, struct sockaddr* sa, int32_t rssi,
+    int lqm, int npm, u_int8_t srvinfo[48]);
+
+/*
+	@function ifnet_notice_node_absence
+	@discussion Provided for network interface drivers to notify the
+		system that the absence of the specified node has been detected.
+	@param ifp The interface attached to the link where the absence of the
+		specified node has been detected.
+	@param sa The AF_LINK family address of the node whose absence has been
+		detected.
+	@result Returns 0 on success, or EINVAL if arguments are invalid.
+ */
+extern errno_t ifnet_notice_node_absence(ifnet_t ifp, struct sockaddr* sa);
+
+/*
+	@function ifnet_notice_master_elected
+	@discussion Provided for network interface drivers to notify the system
+		that the nodes with a locally detected presence on the attached
+		link have elected a new master.
+	@param ifp The interface attached to the link where the new master has
+		been elected.
+	@result Returns 0 on success, or EINVAL if arguments are invalid.
+ */
+extern errno_t ifnet_notice_master_elected(ifnet_t ifp);
+
 #endif /* KERNEL_PRIVATE */
 
 __END_DECLS
diff --git a/bsd/net/kpi_protocol.c b/bsd/net/kpi_protocol.c
index 6c3043c94..982dc905a 100644
--- a/bsd/net/kpi_protocol.c
+++ b/bsd/net/kpi_protocol.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2012 Apple Inc. All rights reserved.
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -44,167 +44,156 @@ typedef int (*attach_t)(struct ifnet *ifp, uint32_t protocol_family); typedef int (*detach_t)(struct ifnet *ifp, uint32_t protocol_family); struct proto_input_entry { - struct proto_input_entry *next; - int detach; - struct domain *domain; - int hash; - int chain; - - protocol_family_t protocol; - proto_input_handler input; + struct proto_input_entry *next; + int detach; + struct domain *domain; + int hash; + int chain; + + protocol_family_t protocol; + proto_input_handler input; proto_input_detached_handler detached; - - mbuf_t inject_first; - mbuf_t inject_last; - - struct proto_input_entry *input_next; - mbuf_t input_first; - mbuf_t input_last; + + mbuf_t inject_first; + mbuf_t inject_last; + + struct proto_input_entry *input_next; + mbuf_t input_first; + mbuf_t input_last; }; struct proto_family_str { TAILQ_ENTRY(proto_family_str) proto_fam_next; - protocol_family_t proto_family; - ifnet_family_t if_family; - proto_plumb_handler attach_proto; - proto_unplumb_handler detach_proto; + protocol_family_t proto_family; + ifnet_family_t if_family; + proto_plumb_handler attach_proto; + proto_unplumb_handler detach_proto; }; -#define PROTO_HASH_SLOTS 5 +#define PROTO_HASH_SLOTS 5 -static struct proto_input_entry *proto_hash[PROTO_HASH_SLOTS]; -static int proto_total_waiting = 0; -static struct proto_input_entry *proto_input_add_list = NULL; -static lck_mtx_t *proto_family_mutex = 0; -static TAILQ_HEAD(, proto_family_str) proto_family_head = - TAILQ_HEAD_INITIALIZER(proto_family_head); - -extern lck_mtx_t *domain_proto_mtx; -extern struct dlil_threading_info *dlil_lo_thread_ptr; +static struct proto_input_entry *proto_hash[PROTO_HASH_SLOTS]; +static int proto_total_waiting = 0; +static struct proto_input_entry *proto_input_add_list = NULL; +decl_lck_mtx_data(static, proto_family_mutex_data); +static lck_mtx_t *proto_family_mutex = &proto_family_mutex_data; +static TAILQ_HEAD(, proto_family_str) proto_family_head = + TAILQ_HEAD_INITIALIZER(proto_family_head); static int -proto_hash_value( - protocol_family_t protocol) +proto_hash_value(protocol_family_t protocol) { - switch(protocol) { + switch (protocol) { case PF_INET: - return 0; + return (0); case PF_INET6: - return 1; + return (1); case PF_APPLETALK: - return 2; + return (2); case PF_VLAN: - return 3; + return (3); } - return 4; + return (4); } __private_extern__ void proto_kpi_init(void) { - lck_grp_attr_t *grp_attrib = 0; - lck_attr_t *lck_attrib = 0; - lck_grp_t *lck_group = 0; - + lck_grp_attr_t *grp_attrib = NULL; + lck_attr_t *lck_attrib = NULL; + lck_grp_t *lck_group = NULL; + /* Allocate a mtx lock */ grp_attrib = lck_grp_attr_alloc_init(); lck_group = lck_grp_alloc_init("protocol kpi", grp_attrib); lck_grp_attr_free(grp_attrib); lck_attrib = lck_attr_alloc_init(); - proto_family_mutex = lck_mtx_alloc_init(lck_group, lck_attrib); + lck_mtx_init(proto_family_mutex, lck_group, lck_attrib); lck_grp_free(lck_group); lck_attr_free(lck_attrib); - - bzero(proto_hash, sizeof(proto_hash)); + + bzero(proto_hash, sizeof (proto_hash)); } __private_extern__ errno_t -proto_register_input( - protocol_family_t protocol, - proto_input_handler input, - proto_input_detached_handler detached, - int chains) +proto_register_input(protocol_family_t protocol, proto_input_handler input, + proto_input_detached_handler detached, int chains) { - struct proto_input_entry *entry; - struct dlil_threading_info *thread = dlil_lo_thread_ptr; - - entry = _MALLOC(sizeof(*entry), M_IFADDR, M_WAITOK); - 
+ struct dlil_threading_info *inp = dlil_main_input_thread; + struct domain *dp = domains; + int do_unlock; + + entry = _MALLOC(sizeof (*entry), M_IFADDR, M_WAITOK); + if (entry == NULL) - return ENOMEM; - - bzero(entry, sizeof(*entry)); + return (ENOMEM); + + bzero(entry, sizeof (*entry)); entry->protocol = protocol; entry->input = input; entry->detached = detached; entry->hash = proto_hash_value(protocol); entry->chain = chains; - - { - struct domain *dp = domains; - - lck_mtx_assert(domain_proto_mtx, LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_lock(domain_proto_mtx); - while (dp && (protocol_family_t)dp->dom_family != protocol) - dp = dp->dom_next; - entry->domain = dp; - lck_mtx_unlock(domain_proto_mtx); - } - - lck_mtx_lock(&thread->input_lck); + do_unlock = domain_proto_mtx_lock(); + while (dp && (protocol_family_t)dp->dom_family != protocol) + dp = dp->dom_next; + entry->domain = dp; + domain_proto_mtx_unlock(do_unlock); + + lck_mtx_lock(&inp->input_lck); entry->next = proto_input_add_list; proto_input_add_list = entry; - - thread->input_waiting |= DLIL_PROTO_REGISTER; - if ((thread->input_waiting & DLIL_INPUT_RUNNING) == 0) - wakeup((caddr_t)&thread->input_waiting); - lck_mtx_unlock(&thread->input_lck); - - return 0; -} + inp->input_waiting |= DLIL_PROTO_REGISTER; + if ((inp->input_waiting & DLIL_INPUT_RUNNING) == 0) + wakeup((caddr_t)&inp->input_waiting); + lck_mtx_unlock(&inp->input_lck); + + return (0); +} __private_extern__ void -proto_unregister_input( - protocol_family_t protocol) +proto_unregister_input(protocol_family_t protocol) { struct proto_input_entry *entry = NULL; - - for (entry = proto_hash[proto_hash_value(protocol)]; entry; entry = entry->next) + + for (entry = proto_hash[proto_hash_value(protocol)]; entry != NULL; + entry = entry->next) { if (entry->protocol == protocol) break; - - if (entry) + } + + if (entry != NULL) entry->detach = 1; } - static void -proto_delayed_attach( - struct proto_input_entry *entry) +proto_delayed_attach(struct proto_input_entry *entry) { struct proto_input_entry *next_entry; - for (next_entry = entry->next; entry; entry = next_entry) { + + for (next_entry = entry->next; entry != NULL; entry = next_entry) { struct proto_input_entry *exist; int hash_slot; - + hash_slot = proto_hash_value(entry->protocol); next_entry = entry->next; - - for (exist = proto_hash[hash_slot]; exist; exist = exist->next) + + for (exist = proto_hash[hash_slot]; exist != NULL; + exist = exist->next) { if (exist->protocol == entry->protocol) break; - + } + /* If the entry already exists, call detached and dispose */ - if (exist) { + if (exist != NULL) { if (entry->detached) entry->detached(entry->protocol); FREE(entry, M_IFADDR); - } - else { + } else { entry->next = proto_hash[hash_slot]; proto_hash[hash_slot] = entry; } @@ -214,31 +203,32 @@ proto_delayed_attach( __private_extern__ void proto_input_run(void) { - struct proto_input_entry *entry; - struct dlil_threading_info *thread = dlil_lo_thread_ptr; + struct proto_input_entry *entry; + struct dlil_threading_info *inp = dlil_main_input_thread; mbuf_t packet_list; int i, locked = 0; - lck_mtx_assert(&thread->input_lck, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_assert(&inp->input_lck, LCK_MTX_ASSERT_NOTOWNED); - if ((thread->input_waiting & DLIL_PROTO_REGISTER) != 0) { - lck_mtx_lock_spin(&thread->input_lck); + if (inp->input_waiting & DLIL_PROTO_REGISTER) { + lck_mtx_lock_spin(&inp->input_lck); entry = proto_input_add_list; proto_input_add_list = NULL; - thread->input_waiting &= ~DLIL_PROTO_REGISTER; - 
lck_mtx_unlock(&thread->input_lck); + inp->input_waiting &= ~DLIL_PROTO_REGISTER; + lck_mtx_unlock(&inp->input_lck); proto_delayed_attach(entry); } + /* - Move everything from the lock protected list to the thread - specific list. + * Move everything from the lock protected list to the thread + * specific list. */ for (i = 0; proto_total_waiting != 0 && i < PROTO_HASH_SLOTS; i++) { - for (entry = proto_hash[i]; entry && proto_total_waiting; - entry = entry->next) { - if (entry->inject_first) { - lck_mtx_lock_spin(&thread->input_lck); - thread->input_waiting &= ~DLIL_PROTO_WAITING; + for (entry = proto_hash[i]; + entry != NULL && proto_total_waiting; entry = entry->next) { + if (entry->inject_first != NULL) { + lck_mtx_lock_spin(&inp->input_lck); + inp->input_waiting &= ~DLIL_PROTO_WAITING; packet_list = entry->inject_first; @@ -246,160 +236,156 @@ proto_input_run(void) entry->inject_last = NULL; proto_total_waiting--; - lck_mtx_unlock(&thread->input_lck); + lck_mtx_unlock(&inp->input_lck); - if (entry->domain && (entry->domain->dom_flags & DOM_REENTRANT) == 0) { + if (entry->domain != NULL && !(entry->domain-> + dom_flags & DOM_REENTRANT)) { lck_mtx_lock(entry->domain->dom_mtx); locked = 1; } - + if (entry->chain) { - entry->input(entry->protocol, packet_list); - } - else { + entry->input(entry->protocol, + packet_list); + } else { mbuf_t packet; - - for (packet = packet_list; packet; packet = packet_list) { - packet_list = mbuf_nextpkt(packet); + + for (packet = packet_list; + packet != NULL; + packet = packet_list) { + packet_list = + mbuf_nextpkt(packet); mbuf_setnextpkt(packet, NULL); - entry->input(entry->protocol, packet); + entry->input(entry->protocol, + packet); } } if (locked) { locked = 0; lck_mtx_unlock(entry->domain->dom_mtx); - } + } + } } } - } - } errno_t -proto_input( - protocol_family_t protocol, - mbuf_t packet_list) +proto_input(protocol_family_t protocol, mbuf_t packet_list) { - struct proto_input_entry *entry; - errno_t locked =0, result = 0; + struct proto_input_entry *entry; + errno_t locked = 0, result = 0; - for (entry = proto_hash[proto_hash_value(protocol)]; entry; - entry = entry->next) { + for (entry = proto_hash[proto_hash_value(protocol)]; entry != NULL; + entry = entry->next) { if (entry->protocol == protocol) break; } - if (entry->domain && (entry->domain->dom_flags & DOM_REENTRANT) == 0) { + if (entry->domain && !(entry->domain->dom_flags & DOM_REENTRANT)) { lck_mtx_lock(entry->domain->dom_mtx); locked = 1; } - + if (entry->chain) { entry->input(entry->protocol, packet_list); - } - else { + } else { mbuf_t packet; - - for (packet = packet_list; packet; packet = packet_list) { + + for (packet = packet_list; packet != NULL; + packet = packet_list) { packet_list = mbuf_nextpkt(packet); mbuf_setnextpkt(packet, NULL); entry->input(entry->protocol, packet); } } - + if (locked) { lck_mtx_unlock(entry->domain->dom_mtx); - } - return result; + } + return (result); } errno_t -proto_inject( - protocol_family_t protocol, - mbuf_t packet_list) +proto_inject(protocol_family_t protocol, mbuf_t packet_list) { - struct proto_input_entry *entry; - mbuf_t last_packet; - int hash_slot = proto_hash_value(protocol); - struct dlil_threading_info *thread = dlil_lo_thread_ptr; - - for (last_packet = packet_list; mbuf_nextpkt(last_packet); - last_packet = mbuf_nextpkt(last_packet)) + struct proto_input_entry *entry; + mbuf_t last_packet; + int hash_slot = proto_hash_value(protocol); + struct dlil_threading_info *inp = dlil_main_input_thread; + + for (last_packet = packet_list; 
mbuf_nextpkt(last_packet) != NULL; + last_packet = mbuf_nextpkt(last_packet)) /* find the last packet */; - - for (entry = proto_hash[hash_slot]; entry; entry = entry->next) { + + for (entry = proto_hash[hash_slot]; entry != NULL; + entry = entry->next) { if (entry->protocol == protocol) break; } - - if (entry) { - lck_mtx_lock(&thread->input_lck); + + if (entry != NULL) { + lck_mtx_lock(&inp->input_lck); if (entry->inject_first == NULL) { proto_total_waiting++; - thread->input_waiting |= DLIL_PROTO_WAITING; + inp->input_waiting |= DLIL_PROTO_WAITING; entry->inject_first = packet_list; - } - else { + } else { mbuf_setnextpkt(entry->inject_last, packet_list); } entry->inject_last = last_packet; - if ((thread->input_waiting & DLIL_INPUT_RUNNING) == 0) { - wakeup((caddr_t)&thread->input_waiting); + if ((inp->input_waiting & DLIL_INPUT_RUNNING) == 0) { + wakeup((caddr_t)&inp->input_waiting); } - lck_mtx_unlock(&thread->input_lck); - } - else - { - return ENOENT; + lck_mtx_unlock(&inp->input_lck); + } else { + return (ENOENT); } - return 0; + return (0); } -static struct proto_family_str* -proto_plumber_find( - protocol_family_t proto_family, - ifnet_family_t if_family) +static struct proto_family_str * +proto_plumber_find(protocol_family_t proto_family, ifnet_family_t if_family) { struct proto_family_str *mod = NULL; TAILQ_FOREACH(mod, &proto_family_head, proto_fam_next) { - if ((mod->proto_family == (proto_family & 0xffff)) - && (mod->if_family == (if_family & 0xffff))) + if ((mod->proto_family == (proto_family & 0xffff)) && + (mod->if_family == (if_family & 0xffff))) break; - } + } - return mod; + return (mod); } errno_t -proto_register_plumber( - protocol_family_t protocol_family, - ifnet_family_t interface_family, - proto_plumb_handler attach, - proto_unplumb_handler detach) +proto_register_plumber(protocol_family_t protocol_family, + ifnet_family_t interface_family, proto_plumb_handler attach, + proto_unplumb_handler detach) { struct proto_family_str *proto_family; - if (attach == NULL) return EINVAL; + if (attach == NULL) + return (EINVAL); lck_mtx_lock(proto_family_mutex); - + TAILQ_FOREACH(proto_family, &proto_family_head, proto_fam_next) { if (proto_family->proto_family == protocol_family && - proto_family->if_family == interface_family) { + proto_family->if_family == interface_family) { lck_mtx_unlock(proto_family_mutex); - return EEXIST; + return (EEXIST); } } - proto_family = (struct proto_family_str *) _MALLOC(sizeof(struct proto_family_str), M_IFADDR, M_WAITOK); + proto_family = (struct proto_family_str *) + _MALLOC(sizeof (struct proto_family_str), M_IFADDR, M_WAITOK); if (!proto_family) { lck_mtx_unlock(proto_family_mutex); - return ENOMEM; + return (ENOMEM); } - bzero(proto_family, sizeof(struct proto_family_str)); + bzero(proto_family, sizeof (struct proto_family_str)); proto_family->proto_family = protocol_family; proto_family->if_family = interface_family & 0xffff; proto_family->attach_proto = attach; @@ -407,57 +393,51 @@ proto_register_plumber( TAILQ_INSERT_TAIL(&proto_family_head, proto_family, proto_fam_next); lck_mtx_unlock(proto_family_mutex); - return 0; + return (0); } void -proto_unregister_plumber( - protocol_family_t protocol_family, - ifnet_family_t interface_family) +proto_unregister_plumber(protocol_family_t protocol_family, + ifnet_family_t interface_family) { struct proto_family_str *proto_family; lck_mtx_lock(proto_family_mutex); proto_family = proto_plumber_find(protocol_family, interface_family); - if (proto_family == 0) { + if (proto_family == NULL) { 
lck_mtx_unlock(proto_family_mutex); return; } TAILQ_REMOVE(&proto_family_head, proto_family, proto_fam_next); FREE(proto_family, M_IFADDR); - + lck_mtx_unlock(proto_family_mutex); - return; } __private_extern__ errno_t -proto_plumb( - protocol_family_t protocol_family, - ifnet_t ifp) +proto_plumb(protocol_family_t protocol_family, ifnet_t ifp) { struct proto_family_str *proto_family; int ret = 0; lck_mtx_lock(proto_family_mutex); proto_family = proto_plumber_find(protocol_family, ifp->if_family); - if (proto_family == 0) { + if (proto_family == NULL) { lck_mtx_unlock(proto_family_mutex); - return ENXIO; + return (ENXIO); } ret = proto_family->attach_proto(ifp, protocol_family); lck_mtx_unlock(proto_family_mutex); - return ret; + return (ret); } __private_extern__ errno_t -proto_unplumb( - protocol_family_t protocol_family, - ifnet_t ifp) +proto_unplumb(protocol_family_t protocol_family, ifnet_t ifp) { struct proto_family_str *proto_family; int ret = 0; @@ -465,11 +445,11 @@ proto_unplumb( lck_mtx_lock(proto_family_mutex); proto_family = proto_plumber_find(protocol_family, ifp->if_family); - if (proto_family && proto_family->detach_proto) + if (proto_family != NULL && proto_family->detach_proto) proto_family->detach_proto(ifp, protocol_family); else ret = ifnet_detach_protocol(ifp, protocol_family); - + lck_mtx_unlock(proto_family_mutex); - return ret; + return (ret); } diff --git a/bsd/net/lacp.h b/bsd/net/lacp.h index 73fb8a1ab..04c81c167 100644 --- a/bsd/net/lacp.h +++ b/bsd/net/lacp.h @@ -43,6 +43,7 @@ #define _NET_LACP_H_ #include +#include /** ** Link Aggregation Control Protocol (LACP) definitions @@ -248,6 +249,61 @@ lacp_actor_partner_state_expired(lacp_actor_partner_state state) return ((state & LACP_ACTOR_PARTNER_STATE_EXPIRED) != 0); } +/* + * Function: lacp_uint16_set + * Purpose: + * Set a field in a structure that's at least 16 bits to the given + * value, putting it into network byte order + */ +static __inline__ void +lacp_uint16_set(uint8_t * field, uint16_t value) +{ + uint16_t tmp_value = htons(value); + memcpy((void *)field, (void *)&tmp_value, sizeof(uint16_t)); + return; +} + +/* + * Function: lacp_uint16_get + * Purpose: + * Get a field in a structure that's at least 16 bits, converting + * to host byte order. + */ +static __inline__ uint16_t +lacp_uint16_get(const uint8_t * field) +{ + uint16_t tmp_field; + memcpy((void *)&tmp_field, (void *)field, sizeof(uint16_t)); + return (ntohs(tmp_field)); +} + +/* + * Function: lacp_uint32_set + * Purpose: + * Set a field in a structure that's at least 32 bits to the given + * value, putting it into network byte order + */ +static __inline__ void +lacp_uint32_set(uint8_t * field, uint32_t value) +{ + uint32_t tmp_value = htonl(value); + memcpy((void *)field, (void *)&tmp_value, sizeof(uint32_t)); + return; +} + +/* + * Function: lacp_uint32_get + * Purpose: + * Get a field in a structure that's at least 32 bits, converting + * to host byte order. 
+ */ +static __inline__ uint32_t +lacp_uint32_get(const uint8_t * field) +{ + uint32_t tmp_field; + memcpy((void *)&tmp_field, (void *)field, sizeof(uint32_t)); + return (ntohl(tmp_field)); +} /* * LACP Actor/Partner TLV access functions @@ -256,57 +312,54 @@ static __inline__ void lacp_actor_partner_tlv_set_system_priority(lacp_actor_partner_tlv_ref tlv, lacp_system_priority system_priority) { - *((lacp_system_priority *)tlv->lap_system_priority) - = (lacp_system_priority)htons(system_priority); + lacp_uint16_set(tlv->lap_system_priority, system_priority); return; } static __inline__ lacp_system_priority lacp_actor_partner_tlv_get_system_priority(const lacp_actor_partner_tlv_ref tlv) { - return ((lacp_system_priority) - ntohs(*((u_short *)tlv->lap_system_priority))); + return (lacp_system_priority)lacp_uint16_get(tlv->lap_system_priority); } static __inline__ void lacp_actor_partner_tlv_set_key(lacp_actor_partner_tlv_ref tlv, lacp_key key) { - *((lacp_key *)tlv->lap_key) = (lacp_key)htons(key); + lacp_uint16_set(tlv->lap_key, key); return; } static __inline__ lacp_key lacp_actor_partner_tlv_get_key(const lacp_actor_partner_tlv_ref tlv) { - return ((lacp_key)ntohs(*((u_short *)tlv->lap_key))); + return (lacp_key)lacp_uint16_get(tlv->lap_key); } static __inline__ void lacp_actor_partner_tlv_set_port_priority(lacp_actor_partner_tlv_ref tlv, lacp_port_priority port_priority) { - *((lacp_port_priority *)tlv->lap_port_priority) - = (lacp_port_priority)htons(port_priority); + lacp_uint16_set(tlv->lap_port_priority, port_priority); return; } static __inline__ lacp_port_priority lacp_actor_partner_tlv_get_port_priority(const lacp_actor_partner_tlv_ref tlv) { - return ((lacp_port_priority)ntohs(*((u_short *)tlv->lap_port_priority))); + return (lacp_port_priority)lacp_uint16_get(tlv->lap_port_priority); } static __inline__ void lacp_actor_partner_tlv_set_port(lacp_actor_partner_tlv_ref tlv, lacp_port port) { - *((lacp_port *)tlv->lap_port) = (lacp_port)htons(port); + lacp_uint16_set(tlv->lap_port, port); return; } static __inline__ lacp_port lacp_actor_partner_tlv_get_port(const lacp_actor_partner_tlv_ref tlv) { - return ((lacp_port)ntohs(*((u_short *)tlv->lap_port))); + return (lacp_port)lacp_uint16_get(tlv->lap_port); } /* @@ -316,15 +369,14 @@ static __inline__ void lacp_collector_tlv_set_max_delay(lacp_collector_tlv_ref tlv, lacp_collector_max_delay delay) { - *((lacp_collector_max_delay *)tlv->lac_max_delay) - = (lacp_collector_max_delay)htons(delay); + lacp_uint16_set(tlv->lac_max_delay, delay); return; } static __inline__ lacp_collector_max_delay lacp_collector_tlv_get_max_delay(const lacp_collector_tlv_ref tlv) { - return ((lacp_collector_max_delay)ntohs(*((u_short *)tlv->lac_max_delay))); + return (lacp_collector_max_delay)lacp_uint16_get(tlv->lac_max_delay); } typedef struct lacpdu_s { @@ -380,32 +432,28 @@ typedef struct la_marker_pdu_s { static __inline__ void la_marker_pdu_set_requestor_port(la_marker_pdu_ref lmpdu, lacp_port port) { - *((lacp_port *)lmpdu->lm_requestor_port) = (lacp_port)htons(port); + lacp_uint16_set(lmpdu->lm_requestor_port, port); return; } static __inline__ lacp_port la_marker_pdu_get_requestor_port(la_marker_pdu_ref lmpdu) { - return ((lacp_port)ntohs(*((lacp_port *)lmpdu->lm_requestor_port))); + return (lacp_port)lacp_uint16_get(lmpdu->lm_requestor_port); } static __inline__ void la_marker_pdu_set_requestor_transaction_id(la_marker_pdu_ref lmpdu, la_marker_transaction_id xid) { - *((la_marker_transaction_id *)lmpdu->lm_requestor_transaction_id) - = 
(la_marker_transaction_id)htonl(xid);
+	lacp_uint32_set(lmpdu->lm_requestor_transaction_id, xid);
 	return;
 }
 
 static __inline__ la_marker_transaction_id
 la_marker_pdu_get_requestor_transaction_id(la_marker_pdu_ref lmpdu)
 {
-	la_marker_transaction_id *	xid_p;
-
-	xid_p = (la_marker_transaction_id *)lmpdu->lm_requestor_transaction_id;
-	return ((la_marker_transaction_id)ntohl(*xid_p));
+	return (la_marker_transaction_id)lacp_uint32_get(lmpdu->lm_requestor_transaction_id);
 }
 
 static __inline__ void
diff --git a/bsd/net/ndrv.c b/bsd/net/ndrv.c
index 51c218910..dc49773ed 100644
--- a/bsd/net/ndrv.c
+++ b/bsd/net/ndrv.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 1997-2008, 2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -57,6 +57,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -79,6 +80,10 @@
 
 #include
 
+static unsigned int ndrv_multi_max_count = NDRV_DMUX_MAX_DESCR;
+SYSCTL_UINT(_net, OID_AUTO, ndrv_multi_max_count, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &ndrv_multi_max_count, 0, "Number of allowed multicast addresses per NDRV socket");
+
 static int ndrv_do_detach(struct ndrv_cb *);
 static int ndrv_do_disconnect(struct ndrv_cb *);
 static struct ndrv_cb *ndrv_find_inbound(struct ifnet *ifp, u_int32_t protocol_family);
@@ -98,7 +103,6 @@ TAILQ_HEAD(, ndrv_cb) ndrvl = TAILQ_HEAD_INITIALIZER(ndrvl);
 extern struct domain ndrvdomain;
 extern struct protosw ndrvsw;
-extern lck_mtx_t *domain_proto_mtx;
 
 #define NDRV_PROTODEMUX_COUNT	10
 
@@ -594,7 +598,7 @@ ndrv_do_disconnect(struct ndrv_cb *np)
 }
 
 /* Hackery - return a string version of a decimal number */
-static char *
+static void
 sprint_d(u_int n, char *buf, int buflen)
 {	char dbuf[IFNAMSIZ];
 	char *cp = dbuf+IFNAMSIZ-1;
@@ -606,7 +610,7 @@ sprint_d(u_int n, char *buf, int buflen)
 		n /= 10;
 	} while (n != 0 && buflen > 0);
 	strncpy(buf, cp, IFNAMSIZ-buflen);
-	return (cp);
+	return;
 }
 
 /*
@@ -622,7 +626,7 @@ static int name_cmp(struct ifnet *ifp, char *q)
 	len = strlen(ifnet_name(ifp));
 	strncpy(r, ifnet_name(ifp), IFNAMSIZ);
 	r += len;
-	(void)sprint_d(ifnet_unit(ifp), r, IFNAMSIZ-(r-buf));
+	sprint_d(ifnet_unit(ifp), r, IFNAMSIZ-(r-buf));
 #if NDRV_DEBUG
 	kprintf("Comparing %s, %s\n", buf, q);
 #endif
@@ -885,10 +889,12 @@ ndrv_do_add_multicast(struct ndrv_cb *np, struct sockopt *sopt)
     int result;
 
 	if (sopt->sopt_val == 0 || sopt->sopt_valsize < 2 ||
-		sopt->sopt_level != SOL_NDRVPROTO)
+		sopt->sopt_level != SOL_NDRVPROTO || sopt->sopt_valsize > SOCK_MAXADDRLEN)
 		return EINVAL;
 	if (np->nd_if == NULL)
 		return ENXIO;
+	if (!(np->nd_dlist_cnt < ndrv_multi_max_count))
+		return EPERM;
 
 	// Allocate storage
 	MALLOC(ndrv_multi, struct ndrv_multiaddr*, sizeof(struct ndrv_multiaddr) -
@@ -918,6 +924,7 @@ ndrv_do_add_multicast(struct ndrv_cb *np, struct sockopt *sopt)
         // Add to our linked list
         ndrv_multi->next = np->nd_multiaddrs;
         np->nd_multiaddrs = ndrv_multi;
+        np->nd_dlist_cnt++;
     }
     else
     {
@@ -938,7 +945,7 @@ ndrv_do_remove_multicast(struct ndrv_cb *np, struct sockopt *sopt)
 
     if (sopt->sopt_val == 0 || sopt->sopt_valsize < 2 ||
         sopt->sopt_level != SOL_NDRVPROTO)
         return EINVAL;
-    if (np->nd_if == NULL)
+    if (np->nd_if == NULL || np->nd_dlist_cnt == 0)
         return ENXIO;
 
     // Allocate storage
@@ -992,6 +999,8 @@ ndrv_do_remove_multicast(struct ndrv_cb *np, struct sockopt *sopt)
         }
     }
 
+    np->nd_dlist_cnt--;
+
     // Free the memory
     FREE(ndrv_entry, M_IFADDR);
 }
diff --git a/bsd/net/ndrv.h b/bsd/net/ndrv.h
index 7e9fc9700..a201a2fd7 100644
--- a/bsd/net/ndrv.h
+++ b/bsd/net/ndrv.h
@@ -169,4
+169,12 @@ struct ndrv_protocol_desc32 { * you a second or two. */ +/* Max number of descriptions allowed by default */ +#define NDRV_DMUX_MAX_DESCR 1024 + +/* + * sysctl MIB tags at the kern.ipc.nrdv level + */ +#define NRDV_MULTICAST_ADDRS_PER_SOCK 1 /* to toggle NDRV_DMUX_MAX_DESCR value */ + #endif /* _NET_NDRV_H */ diff --git a/bsd/net/ndrv_var.h b/bsd/net/ndrv_var.h index e12a0e0ef..c2c208595 100644 --- a/bsd/net/ndrv_var.h +++ b/bsd/net/ndrv_var.h @@ -62,11 +62,12 @@ struct ndrv_cb struct sockproto nd_proto; /* proto family, protocol */ int nd_descrcnt; /* # elements in nd_dlist - Obsolete */ TAILQ_HEAD(dlist, dlil_demux_desc) nd_dlist; /* Descr. list */ + u_int32_t nd_dlist_cnt; /* Descr. list count */ struct ifnet *nd_if; /* obsolete, maintained for binary compatibility */ - u_int32_t nd_proto_family; - u_int32_t nd_family; - struct ndrv_multiaddr* nd_multiaddrs; - short nd_unit; + u_int32_t nd_proto_family; + u_int32_t nd_family; + struct ndrv_multiaddr* nd_multiaddrs; + short nd_unit; }; #define sotondrvcb(so) ((struct ndrv_cb *)(so)->so_pcb) diff --git a/bsd/net/net_str_id.c b/bsd/net/net_str_id.c index bc28f03c4..e1ed7e907 100644 --- a/bsd/net/net_str_id.c +++ b/bsd/net/net_str_id.c @@ -47,7 +47,8 @@ #define FIRST_NET_STR_ID 1000 static SLIST_HEAD(,net_str_id_entry) net_str_id_list = {NULL}; -static lck_mtx_t *net_str_id_lock = NULL; +decl_lck_mtx_data(static, net_str_id_lock_data); +static lck_mtx_t *net_str_id_lock = &net_str_id_lock_data; static u_int32_t nsi_kind_next[NSI_MAX_KIND] = { FIRST_NET_STR_ID, FIRST_NET_STR_ID, FIRST_NET_STR_ID }; static u_int32_t nsi_next_id = FIRST_NET_STR_ID; @@ -71,7 +72,7 @@ net_str_id_init(void) lck_grp_attr_free(grp_attrib); lck_attrb = lck_attr_alloc_init(); - net_str_id_lock = lck_mtx_alloc_init(lck_group, lck_attrb); + lck_mtx_init(net_str_id_lock, lck_group, lck_attrb); lck_grp_free(lck_group); lck_attr_free(lck_attrb); diff --git a/bsd/net/netsrc.c b/bsd/net/netsrc.c index 2c1037c26..7501053e3 100644 --- a/bsd/net/netsrc.c +++ b/bsd/net/netsrc.c @@ -227,6 +227,8 @@ netsrc_ipv6(kern_ctl_ref kctl, uint32_t unit, struct netsrc_req *nrq) nrp.nrp_flags |= NETSRC_IP6_FLAG_TENTATIVE; if (ia->ia6_flags & IN6_IFF_DEPRECATED) nrp.nrp_flags |= NETSRC_IP6_FLAG_DEPRECATED; + if (ia->ia6_flags & IN6_IFF_OPTIMISTIC) + nrp.nrp_flags |= NETSRC_IP6_FLAG_OPTIMISTIC; sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); memcpy(&sin6.sin6_addr, in6, sizeof(*in6)); diff --git a/bsd/net/netsrc.h b/bsd/net/netsrc.h index 54ba8d8be..d93c4a014 100644 --- a/bsd/net/netsrc.h +++ b/bsd/net/netsrc.h @@ -53,6 +53,7 @@ struct netsrc_rep { #define NETSRC_IP6_FLAG_TENTATIVE 0x0001 #define NETSRC_IP6_FLAG_TEMPORARY 0x0002 #define NETSRC_IP6_FLAG_DEPRECATED 0x0004 +#define NETSRC_IP6_FLAG_OPTIMISTIC 0x0008 uint16_t nrp_flags; uint16_t nrp_label; uint16_t nrp_precedence; diff --git a/bsd/net/ntstat.c b/bsd/net/ntstat.c index 833b8ca34..eb83ac3c9 100644 --- a/bsd/net/ntstat.c +++ b/bsd/net/ntstat.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2011 Apple Inc. All rights reserved. + * Copyright (c) 2010-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -62,24 +62,62 @@ __private_extern__ int nstat_collect = 1; SYSCTL_INT(_net, OID_AUTO, statistics, CTLFLAG_RW | CTLFLAG_LOCKED, &nstat_collect, 0, "Collect detailed statistics"); +enum +{ + NSTAT_FLAG_CLEANUP = (0x1 << 0), + NSTAT_FLAG_REQCOUNTS = (0x1 << 1) +}; + typedef struct nstat_control_state { - struct nstat_control_state *next; - u_int32_t watching; + struct nstat_control_state *ncs_next; + u_int32_t ncs_watching; decl_lck_mtx_data(, mtx); - kern_ctl_ref kctl; - u_int32_t unit; - nstat_src_ref_t next_srcref; - struct nstat_src *srcs; - int cleanup; - int suser; + kern_ctl_ref ncs_kctl; + u_int32_t ncs_unit; + nstat_src_ref_t ncs_next_srcref; + struct nstat_src *ncs_srcs; + u_int32_t ncs_flags; } nstat_control_state; +typedef struct nstat_provider +{ + struct nstat_provider *next; + nstat_provider_id_t nstat_provider_id; + size_t nstat_descriptor_length; + errno_t (*nstat_lookup)(const void *data, u_int32_t length, nstat_provider_cookie_t *out_cookie); + int (*nstat_gone)(nstat_provider_cookie_t cookie); + errno_t (*nstat_counts)(nstat_provider_cookie_t cookie, struct nstat_counts *out_counts, int *out_gone); + errno_t (*nstat_watcher_add)(nstat_control_state *state); + void (*nstat_watcher_remove)(nstat_control_state *state); + errno_t (*nstat_copy_descriptor)(nstat_provider_cookie_t cookie, void *data, u_int32_t len); + void (*nstat_release)(nstat_provider_cookie_t cookie, boolean_t locked); +} nstat_provider; + + +typedef struct nstat_src +{ + struct nstat_src *next; + nstat_src_ref_t srcref; + nstat_provider *provider; + nstat_provider_cookie_t cookie; +} nstat_src; + +static errno_t nstat_control_send_counts(nstat_control_state *, + nstat_src *, unsigned long long, int *); +static int nstat_control_send_description(nstat_control_state *state, nstat_src *src, u_int64_t context); +static errno_t nstat_control_send_removed(nstat_control_state *, nstat_src *); +static void nstat_control_cleanup_source(nstat_control_state *state, nstat_src *src, + boolean_t); + +static u_int32_t nstat_udp_watchers = 0; +static u_int32_t nstat_tcp_watchers = 0; + static void nstat_control_register(void); static volatile OSMallocTag nstat_malloc_tag = NULL; static nstat_control_state *nstat_controls = NULL; -static uint64_t nstat_idle_time = 0ULL; +static uint64_t nstat_idle_time = 0; static decl_lck_mtx_data(, nstat_mtx); static void @@ -94,7 +132,7 @@ nstat_copy_sa_out( if (src->sa_family == AF_INET6 && src->sa_len >= sizeof(struct sockaddr_in6)) { - struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)dst; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)(void *)dst; if (IN6_IS_SCOPE_EMBED(&sin6->sin6_addr)) { if (sin6->sin6_scope_id == 0) @@ -143,20 +181,6 @@ nstat_ip6_to_sockaddr( #pragma mark -- Network Statistic Providers -- -typedef struct nstat_provider -{ - struct nstat_provider *next; - nstat_provider_id_t nstat_provider_id; - size_t nstat_descriptor_length; - errno_t (*nstat_lookup)(const void *data, u_int32_t length, nstat_provider_cookie_t *out_cookie); - int (*nstat_gone)(nstat_provider_cookie_t cookie); - errno_t (*nstat_counts)(nstat_provider_cookie_t cookie, struct nstat_counts *out_counts, int *out_gone); - errno_t (*nstat_watcher_add)(nstat_control_state *state); - void (*nstat_watcher_remove)(nstat_control_state *state); - errno_t (*nstat_copy_descriptor)(nstat_provider_cookie_t cookie, void *data, u_int32_t len); - void (*nstat_release)(nstat_provider_cookie_t cookie); -} nstat_provider; - static errno_t 
nstat_control_source_add(u_int64_t context, nstat_control_state *state, nstat_provider *provider, nstat_provider_cookie_t cookie); struct nstat_provider *nstat_providers = NULL; @@ -186,7 +210,6 @@ nstat_lookup_entry( *out_provider = nstat_find_provider_by_id(id); if (*out_provider == NULL) { - printf("%s:%d: provider %u not found\n", __FUNCTION__, __LINE__, id); return ENOENT; } @@ -197,7 +220,7 @@ static void nstat_init_route_provider(void); static void nstat_init_tcp_provider(void); static void nstat_init_udp_provider(void); -static void +__private_extern__ void nstat_init(void) { if (nstat_malloc_tag != NULL) return; @@ -241,7 +264,7 @@ nstat_malloc_aligned( u_int8_t *aligned = buffer + sizeof(*hdr); aligned = (u_int8_t*)P2ROUNDUP(aligned, alignment); - hdr = (struct align_header*)(aligned - sizeof(*hdr)); + hdr = (struct align_header*)(void *)(aligned - sizeof(*hdr)); hdr->offset = aligned - buffer; hdr->length = size; @@ -253,7 +276,7 @@ nstat_free_aligned( void *buffer, OSMallocTag tag) { - struct align_header *hdr = (struct align_header*)((u_int8_t*)buffer - sizeof(*hdr)); + struct align_header *hdr = (struct align_header*)(void *)((u_int8_t*)buffer - sizeof(*hdr)); OSFree(((char*)buffer) - hdr->offset, hdr->length, tag); } @@ -280,7 +303,6 @@ nstat_route_lookup( if (length < sizeof(*param)) { - printf("%s:%d: expected %lu byte param, received %u\n", __FUNCTION__, __LINE__, sizeof(*param), length); return EINVAL; } @@ -288,16 +310,13 @@ nstat_route_lookup( param->dst.v4.sin_family > AF_MAX || (param->mask.v4.sin_family != 0 && param->mask.v4.sin_family != param->dst.v4.sin_family)) { - printf("%s:%d invalid family (dst=%d, mask=%d)\n", __FUNCTION__, __LINE__, - param->dst.v4.sin_family, param->mask.v4.sin_family); return EINVAL; } if (param->dst.v4.sin_len > sizeof(param->dst) || (param->mask.v4.sin_family && param->mask.v4.sin_len > sizeof(param->mask.v4.sin_len))) { - printf("%s:%d invalid length (dst=%d, mask=%d)\n", __FUNCTION__, __LINE__, - param->dst.v4.sin_len, param->mask.v4.sin_len); + return EINVAL; } // TBD: Need to validate length of sockaddr for different families? 
@@ -360,7 +379,8 @@ nstat_route_counts( static void nstat_route_release( - nstat_provider_cookie_t cookie) + nstat_provider_cookie_t cookie, + __unused int locked) { rtfree((struct rtentry*)cookie); } @@ -421,7 +441,6 @@ nstat_route_add_watcher( result = rnh->rnh_walktree(rnh, nstat_route_walktree_add, state); if (result != 0) { - printf("%s:%d rnh_walktree failed: %d\n", __FUNCTION__, __LINE__, result); break; } } @@ -441,9 +460,9 @@ nstat_route_new_entry( if ((rt->rt_flags & RTF_UP) != 0) { nstat_control_state *state; - for (state = nstat_controls; state; state = state->next) + for (state = nstat_controls; state; state = state->ncs_next) { - if ((state->watching & (1 << NSTAT_PROVIDER_ROUTE)) != 0) + if ((state->ncs_watching & (1 << NSTAT_PROVIDER_ROUTE)) != 0) { // this client is watching routes // acquire a reference for the route @@ -474,7 +493,6 @@ nstat_route_copy_descriptor( nstat_route_descriptor *desc = (nstat_route_descriptor*)data; if (len < sizeof(*desc)) { - printf("%s:%d invalid length, wanted %lu, got %d\n", __FUNCTION__, __LINE__, sizeof(*desc), len); return EINVAL; } bzero(desc, sizeof(*desc)); @@ -710,6 +728,7 @@ nstat_route_rtt( } } + #pragma mark -- TCP Provider -- static nstat_provider nstat_tcp_provider; @@ -725,7 +744,6 @@ nstat_tcpudp_lookup( const nstat_tcp_add_param *param = (const nstat_tcp_add_param*)data; if (length < sizeof(*param)) { - printf("%s:%d expected %lu byte param, received %u\n", __FUNCTION__, __LINE__, sizeof(*param), length); return EINVAL; } @@ -733,8 +751,6 @@ nstat_tcpudp_lookup( if (param->remote.v4.sin_family != 0 && param->remote.v4.sin_family != param->local.v4.sin_family) { - printf("%s:%d src family (%d) and dst family (%d) don't match\n", - __FUNCTION__, __LINE__, param->local.v4.sin_family, param->remote.v4.sin_family); return EINVAL; } @@ -748,9 +764,6 @@ nstat_tcpudp_lookup( (param->remote.v4.sin_family != 0 && param->remote.v4.sin_len != sizeof(param->remote.v4))) { - printf("%s:%d invalid length for v4 src (%d) or dst (%d), should be %lu\n", - __FUNCTION__, __LINE__, param->local.v4.sin_len, param->remote.v4.sin_len, - sizeof(param->remote.v4)); return EINVAL; } @@ -772,9 +785,6 @@ nstat_tcpudp_lookup( (param->remote.v6.sin6_family != 0 && param->remote.v6.sin6_len != sizeof(param->remote.v6))) { - printf("%s:%d invalid length for v6 src (%d) or dst (%d), should be %lu\n", - __FUNCTION__, __LINE__, param->local.v6.sin6_len, param->remote.v6.sin6_len, - sizeof(param->remote.v6)); return EINVAL; } @@ -788,7 +798,6 @@ nstat_tcpudp_lookup( #endif default: - printf("%s:%d unsupported address family %d\n", __FUNCTION__, __LINE__, param->local.v4.sin_family); return EINVAL; } @@ -836,35 +845,33 @@ nstat_tcp_counts( *out_gone = 1; } - if (tp->t_state > TCPS_LISTEN) - { - atomic_get_64(out_counts->nstat_rxpackets, &inp->inp_stat->rxpackets); - atomic_get_64(out_counts->nstat_rxbytes, &inp->inp_stat->rxbytes); - atomic_get_64(out_counts->nstat_txpackets, &inp->inp_stat->txpackets); - atomic_get_64(out_counts->nstat_txbytes, &inp->inp_stat->txbytes); - out_counts->nstat_rxduplicatebytes = tp->t_stat.rxduplicatebytes; - out_counts->nstat_rxoutoforderbytes = tp->t_stat.rxoutoforderbytes; - out_counts->nstat_txretransmit = tp->t_stat.txretransmitbytes; - out_counts->nstat_connectattempts = tp->t_state >= TCPS_SYN_SENT ? 1 : 0; - out_counts->nstat_connectsuccesses = tp->t_state >= TCPS_ESTABLISHED ? 
1 : 0; - out_counts->nstat_avg_rtt = tp->t_srtt; - out_counts->nstat_min_rtt = tp->t_rttbest; - out_counts->nstat_var_rtt = tp->t_rttvar; - } + atomic_get_64(out_counts->nstat_rxpackets, &inp->inp_stat->rxpackets); + atomic_get_64(out_counts->nstat_rxbytes, &inp->inp_stat->rxbytes); + atomic_get_64(out_counts->nstat_txpackets, &inp->inp_stat->txpackets); + atomic_get_64(out_counts->nstat_txbytes, &inp->inp_stat->txbytes); + out_counts->nstat_rxduplicatebytes = tp->t_stat.rxduplicatebytes; + out_counts->nstat_rxoutoforderbytes = tp->t_stat.rxoutoforderbytes; + out_counts->nstat_txretransmit = tp->t_stat.txretransmitbytes; + out_counts->nstat_connectattempts = tp->t_state >= TCPS_SYN_SENT ? 1 : 0; + out_counts->nstat_connectsuccesses = tp->t_state >= TCPS_ESTABLISHED ? 1 : 0; + out_counts->nstat_avg_rtt = tp->t_srtt; + out_counts->nstat_min_rtt = tp->t_rttbest; + out_counts->nstat_var_rtt = tp->t_rttvar; + if (out_counts->nstat_avg_rtt < out_counts->nstat_min_rtt) + out_counts->nstat_min_rtt = out_counts->nstat_avg_rtt; return 0; } static void nstat_tcp_release( - nstat_provider_cookie_t cookie) + nstat_provider_cookie_t cookie, + int locked) { struct inpcb *inp = (struct inpcb*)cookie; - in_pcb_checkstate(inp, WNT_RELEASE, 0); + in_pcb_checkstate(inp, WNT_RELEASE, locked); } -static u_int32_t nstat_tcp_watchers = 0; - static errno_t nstat_tcp_add_watcher( nstat_control_state *state) @@ -908,9 +915,9 @@ nstat_tcp_new_pcb( lck_mtx_lock(&nstat_mtx); nstat_control_state *state; - for (state = nstat_controls; state; state = state->next) + for (state = nstat_controls; state; state = state->ncs_next) { - if ((state->watching & (1 << NSTAT_PROVIDER_TCP)) != 0) + if ((state->ncs_watching & (1 << NSTAT_PROVIDER_TCP)) != 0) { // this client is watching tcp // acquire a reference for it @@ -928,6 +935,54 @@ nstat_tcp_new_pcb( lck_mtx_unlock(&nstat_mtx); } +__private_extern__ void +nstat_pcb_detach(struct inpcb *inp) +{ + nstat_control_state *state; + nstat_src *src, *prevsrc; + nstat_src *dead_list = NULL; + + if (inp == NULL || (nstat_tcp_watchers == 0 && nstat_udp_watchers == 0)) + return; + + lck_mtx_lock(&nstat_mtx); + for (state = nstat_controls; state; state = state->ncs_next) { + lck_mtx_lock(&state->mtx); + for (prevsrc = NULL, src = state->ncs_srcs; src; + prevsrc = src, src = src->next) + if (src->cookie == inp) + break; + + if (src) { + // send one last counts notification + nstat_control_send_counts(state, src, 0, NULL); + + // send a last description + nstat_control_send_description(state, src, 0); + + // send the source removed notification + nstat_control_send_removed(state, src); + + if (prevsrc) + prevsrc->next = src->next; + else + state->ncs_srcs = src->next; + + src->next = dead_list; + dead_list = src; + } + lck_mtx_unlock(&state->mtx); + } + lck_mtx_unlock(&nstat_mtx); + + while (dead_list) { + src = dead_list; + dead_list = src->next; + + nstat_control_cleanup_source(NULL, src, TRUE); + } +} + static errno_t nstat_tcp_copy_descriptor( nstat_provider_cookie_t cookie, @@ -936,13 +991,15 @@ nstat_tcp_copy_descriptor( { if (len < sizeof(nstat_tcp_descriptor)) { - printf("%s:%d invalid length, wanted %lu, got %d\n", __FUNCTION__, __LINE__, sizeof(nstat_tcp_descriptor), len); return EINVAL; } nstat_tcp_descriptor *desc = (nstat_tcp_descriptor*)data; struct inpcb *inp = (struct inpcb*)cookie; struct tcpcb *tp = intotcpcb(inp); + + if (inp->inp_state == INPCB_STATE_DEAD) + return EINVAL; bzero(desc, sizeof(*desc)); @@ -962,8 +1019,8 @@ nstat_tcp_copy_descriptor( } desc->state = 
intotcpcb(inp)->t_state; - if (inp->inp_route.ro_rt && inp->inp_route.ro_rt->rt_ifp) - desc->ifindex = inp->inp_route.ro_rt->rt_ifp->if_index; + desc->ifindex = (inp->inp_last_outifp == NULL) ? 0 : + inp->inp_last_outifp->if_index; // danger - not locked, values could be bogus desc->txunacked = tp->snd_max - tp->snd_una; @@ -977,6 +1034,7 @@ nstat_tcp_copy_descriptor( // they're in sync? desc->upid = so->last_upid; desc->pid = so->last_pid; + desc->traffic_class = so->so_traffic_class; proc_name(desc->pid, desc->pname, sizeof(desc->pname)); desc->pname[sizeof(desc->pname) - 1] = 0; @@ -1054,14 +1112,13 @@ nstat_udp_counts( static void nstat_udp_release( - nstat_provider_cookie_t cookie) + nstat_provider_cookie_t cookie, + int locked) { struct inpcb *inp = (struct inpcb*)cookie; - in_pcb_checkstate(inp, WNT_RELEASE, 0); + in_pcb_checkstate(inp, WNT_RELEASE, locked); } -static u_int32_t nstat_udp_watchers = 0; - static errno_t nstat_udp_add_watcher( nstat_control_state *state) @@ -1105,9 +1162,9 @@ nstat_udp_new_pcb( lck_mtx_lock(&nstat_mtx); nstat_control_state *state; - for (state = nstat_controls; state; state = state->next) + for (state = nstat_controls; state; state = state->ncs_next) { - if ((state->watching & (1 << NSTAT_PROVIDER_UDP)) != 0) + if ((state->ncs_watching & (1 << NSTAT_PROVIDER_UDP)) != 0) { // this client is watching tcp // acquire a reference for it @@ -1133,13 +1190,15 @@ nstat_udp_copy_descriptor( { if (len < sizeof(nstat_udp_descriptor)) { - printf("%s:%d invalid length, wanted %lu, got %d\n", __FUNCTION__, __LINE__, sizeof(nstat_tcp_descriptor), len); return EINVAL; } nstat_udp_descriptor *desc = (nstat_udp_descriptor*)data; struct inpcb *inp = (struct inpcb*)cookie; + if (inp->inp_state == INPCB_STATE_DEAD) + return EINVAL; + bzero(desc, sizeof(*desc)); if (inp->inp_vflag & INP_IPV6) @@ -1157,9 +1216,9 @@ nstat_udp_copy_descriptor( &desc->remote.v4, sizeof(desc->remote)); } - if (inp->inp_route.ro_rt && inp->inp_route.ro_rt->rt_ifp) - desc->ifindex = inp->inp_route.ro_rt->rt_ifp->if_index; - + desc->ifindex = (inp->inp_last_outifp == NULL) ? 
0 : + inp->inp_last_outifp->if_index; + struct socket *so = inp->inp_socket; if (so) { @@ -1170,6 +1229,7 @@ nstat_udp_copy_descriptor( desc->rcvbufsize = so->so_rcv.sb_hiwat; desc->rcvbufused = so->so_rcv.sb_cc; + desc->traffic_class = so->so_traffic_class; proc_name(desc->pid, desc->pname, sizeof(desc->pname)); desc->pname[sizeof(desc->pname) - 1] = 0; @@ -1197,22 +1257,12 @@ nstat_init_udp_provider(void) #pragma mark -- Kernel Control Socket -- -typedef struct nstat_src -{ - struct nstat_src *next; - nstat_src_ref_t srcref; - nstat_provider *provider; - nstat_provider_cookie_t cookie; -} nstat_src; - static kern_ctl_ref nstat_ctlref = NULL; static lck_grp_t *nstat_lck_grp = NULL; static errno_t nstat_control_connect(kern_ctl_ref kctl, struct sockaddr_ctl *sac, void **uinfo); static errno_t nstat_control_disconnect(kern_ctl_ref kctl, u_int32_t unit, void *uinfo); static errno_t nstat_control_send(kern_ctl_ref kctl, u_int32_t unit, void *uinfo, mbuf_t m, int flags); -static int nstat_control_send_description(nstat_control_state *state, nstat_src *src, u_int64_t context); -static void nstat_control_cleanup_source(nstat_control_state *state, struct nstat_src *src); static void* @@ -1222,43 +1272,47 @@ nstat_idle_check( { lck_mtx_lock(&nstat_mtx); - nstat_idle_time = 0ULL; + nstat_idle_time = 0; nstat_control_state *control; nstat_src *dead = NULL; nstat_src *dead_list = NULL; - for (control = nstat_controls; control; control = control->next) + for (control = nstat_controls; control; control = control->ncs_next) { lck_mtx_lock(&control->mtx); - nstat_src **srcpp = &control->srcs; + nstat_src **srcpp = &control->ncs_srcs; - while(*srcpp != NULL) + if (!(control->ncs_flags & NSTAT_FLAG_REQCOUNTS)) { - if ((*srcpp)->provider->nstat_gone((*srcpp)->cookie)) + while(*srcpp != NULL) { - // Pull it off the list - dead = *srcpp; - *srcpp = (*srcpp)->next; - - // send a last description - nstat_control_send_description(control, dead, 0ULL); - - // send the source removed notification - nstat_msg_src_removed removed; - removed.hdr.type = NSTAT_MSG_TYPE_SRC_REMOVED; - removed.hdr.context = 0; - removed.srcref = dead->srcref; - (void)ctl_enqueuedata(control->kctl, control->unit, &removed, sizeof(removed), CTL_DATA_EOR); - - // Put this on the list to release later - dead->next = dead_list; - dead_list = dead; - } - else - { - srcpp = &(*srcpp)->next; + if ((*srcpp)->provider->nstat_gone((*srcpp)->cookie)) + { + // Pull it off the list + dead = *srcpp; + *srcpp = (*srcpp)->next; + + // send one last counts notification + nstat_control_send_counts(control, dead, + 0, NULL); + + // send a last description + nstat_control_send_description(control, dead, 0); + + // send the source removed notification + nstat_control_send_removed(control, dead); + + // Put this on the list to release later + dead->next = dead_list; + dead_list = dead; + } + else + { + srcpp = &(*srcpp)->next; + } } } + control->ncs_flags &= ~NSTAT_FLAG_REQCOUNTS; lck_mtx_unlock(&control->mtx); } @@ -1276,7 +1330,7 @@ nstat_idle_check( dead = dead_list; dead_list = dead->next; - nstat_control_cleanup_source(NULL, dead); + nstat_control_cleanup_source(NULL, dead, FALSE); } return NULL; @@ -1301,27 +1355,20 @@ nstat_control_register(void) nstat_control.ctl_disconnect = nstat_control_disconnect; nstat_control.ctl_send = nstat_control_send; - errno_t result = ctl_register(&nstat_control, &nstat_ctlref); - if (result != 0) - printf("%s:%d ctl_register failed: %d", __FUNCTION__, __LINE__, result); + ctl_register(&nstat_control, &nstat_ctlref); } 
static void nstat_control_cleanup_source( nstat_control_state *state, - struct nstat_src *src) + struct nstat_src *src, + boolean_t locked) { if (state) - { - nstat_msg_src_removed removed; - removed.hdr.type = NSTAT_MSG_TYPE_SRC_REMOVED; - removed.hdr.context = 0; - removed.srcref = src->srcref; - (void)ctl_enqueuedata(state->kctl, state->unit, &removed, sizeof(removed), CTL_DATA_EOR); - } + nstat_control_send_removed(state, src); // Cleanup the source if we found it. - src->provider->nstat_release(src->cookie); + src->provider->nstat_release(src->cookie, locked); OSFree(src, sizeof(*src), nstat_malloc_tag); } @@ -1336,20 +1383,16 @@ nstat_control_connect( bzero(state, sizeof(*state)); lck_mtx_init(&state->mtx, nstat_lck_grp, NULL); - state->kctl = kctl; - state->unit = sac->sc_unit; + state->ncs_kctl = kctl; + state->ncs_unit = sac->sc_unit; + state->ncs_flags = NSTAT_FLAG_REQCOUNTS; *uinfo = state; - // check if we're super user - proc_t pself = proc_self(); - state->suser = proc_suser(pself) == 0; - proc_rele(pself); - lck_mtx_lock(&nstat_mtx); - state->next = nstat_controls; + state->ncs_next = nstat_controls; nstat_controls = state; - if (nstat_idle_time == 0ULL) + if (nstat_idle_time == 0) { clock_interval_to_deadline(60, NSEC_PER_SEC, &nstat_idle_time); thread_call_func_delayed((thread_call_func_t)nstat_idle_check, NULL, nstat_idle_time); @@ -1372,11 +1415,11 @@ nstat_control_disconnect( // pull it out of the global list of states lck_mtx_lock(&nstat_mtx); nstat_control_state **statepp; - for (statepp = &nstat_controls; *statepp; statepp = &(*statepp)->next) + for (statepp = &nstat_controls; *statepp; statepp = &(*statepp)->ncs_next) { if (*statepp == state) { - *statepp = state->next; + *statepp = state->ncs_next; break; } } @@ -1385,8 +1428,8 @@ nstat_control_disconnect( lck_mtx_lock(&state->mtx); // Stop watching for sources nstat_provider *provider; - watching = state->watching; - state->watching = 0; + watching = state->ncs_watching; + state->ncs_watching = 0; for (provider = nstat_providers; provider && watching; provider = provider->next) { if ((watching & (1 << provider->nstat_provider_id)) != 0) @@ -1397,11 +1440,11 @@ nstat_control_disconnect( } // set cleanup flags - state->cleanup = TRUE; + state->ncs_flags |= NSTAT_FLAG_CLEANUP; // Copy out the list of sources - nstat_src *srcs = state->srcs; - state->srcs = NULL; + nstat_src *srcs = state->ncs_srcs; + state->ncs_srcs = NULL; lck_mtx_unlock(&state->mtx); while (srcs) @@ -1413,7 +1456,7 @@ nstat_control_disconnect( srcs = src->next; // clean it up - nstat_control_cleanup_source(NULL, src); + nstat_control_cleanup_source(NULL, src, FALSE); } OSFree(state, sizeof(*state), nstat_malloc_tag); @@ -1430,26 +1473,51 @@ nstat_control_next_src_ref( for (i = 0; i < 1000 && toReturn == NSTAT_SRC_REF_INVALID; i++) { - if (state->next_srcref == NSTAT_SRC_REF_INVALID || - state->next_srcref == NSTAT_SRC_REF_ALL) + if (state->ncs_next_srcref == NSTAT_SRC_REF_INVALID || + state->ncs_next_srcref == NSTAT_SRC_REF_ALL) { - state->next_srcref = 1; + state->ncs_next_srcref = 1; } nstat_src *src; - for (src = state->srcs; src; src = src->next) + for (src = state->ncs_srcs; src; src = src->next) { - if (src->srcref == state->next_srcref) + if (src->srcref == state->ncs_next_srcref) break; } - if (src == NULL) toReturn = state->next_srcref; - state->next_srcref++; + if (src == NULL) toReturn = state->ncs_next_srcref; + state->ncs_next_srcref++; } return toReturn; } +static errno_t +nstat_control_send_counts( + nstat_control_state *state, + 
nstat_src *src, + unsigned long long context, + int *gone) +{ + nstat_msg_src_counts counts; + int localgone = 0; + errno_t result = 0; + + counts.hdr.type = NSTAT_MSG_TYPE_SRC_COUNTS; + counts.hdr.context = context; + counts.srcref = src->srcref; + bzero(&counts.counts, sizeof(counts.counts)); + if (src->provider->nstat_counts(src->cookie, &counts.counts, + &localgone) == 0) { + result = ctl_enqueuedata(state->ncs_kctl, state->ncs_unit, &counts, + sizeof(counts), CTL_DATA_EOR); + } + if (gone) + *gone = localgone; + return result; +} + static int nstat_control_send_description( nstat_control_state *state, @@ -1460,8 +1528,6 @@ nstat_control_send_description( if (src->provider->nstat_descriptor_length == 0 || src->provider->nstat_copy_descriptor == NULL) { - lck_mtx_unlock(&state->mtx); - printf("%s:%d - provider doesn't support descriptions\n", __FUNCTION__, __LINE__); return EOPNOTSUPP; } @@ -1471,8 +1537,6 @@ nstat_control_send_description( u_int32_t size = offsetof(nstat_msg_src_description, data) + src->provider->nstat_descriptor_length; if (mbuf_allocpacket(MBUF_WAITOK, size, &one, &msg) != 0) { - lck_mtx_unlock(&state->mtx); - printf("%s:%d - failed to allocate response\n", __FUNCTION__, __LINE__); return ENOMEM; } @@ -1486,7 +1550,6 @@ nstat_control_send_description( if (result != 0) { mbuf_freem(msg); - printf("%s:%d - provider failed to copy descriptor %d\n", __FUNCTION__, __LINE__, result); return result; } @@ -1495,16 +1558,32 @@ nstat_control_send_description( desc->srcref = src->srcref; desc->provider = src->provider->nstat_provider_id; - result = ctl_enqueuembuf(state->kctl, state->unit, msg, CTL_DATA_EOR); + result = ctl_enqueuembuf(state->ncs_kctl, state->ncs_unit, msg, CTL_DATA_EOR); if (result != 0) { - printf("%s:%d ctl_enqueuembuf returned error %d\n", __FUNCTION__, __LINE__, result); mbuf_freem(msg); } return result; } +static errno_t +nstat_control_send_removed( + nstat_control_state *state, + nstat_src *src) +{ + nstat_msg_src_removed removed; + errno_t result; + + removed.hdr.type = NSTAT_MSG_TYPE_SRC_REMOVED; + removed.hdr.context = 0; + removed.srcref = src->srcref; + result = ctl_enqueuedata(state->ncs_kctl, state->ncs_unit, &removed, + sizeof(removed), CTL_DATA_EOR); + + return result; +} + static errno_t nstat_control_handle_add_request( nstat_control_state *state, @@ -1515,8 +1594,6 @@ nstat_control_handle_add_request( // Verify the header fits in the first mbuf if (mbuf_len(m) < offsetof(nstat_msg_add_src_req, param)) { - printf("mbuf_len(m)=%lu, offsetof(nstat_msg_add_src_req*, param)=%lu\n", - mbuf_len(m), offsetof(nstat_msg_add_src_req, param)); return EINVAL; } @@ -1524,7 +1601,6 @@ nstat_control_handle_add_request( int32_t paramlength = mbuf_pkthdr_len(m) - offsetof(nstat_msg_add_src_req, param); if (paramlength < 0 || paramlength > 2 * 1024) { - printf("invalid paramlength=%d\n", paramlength); return EINVAL; } @@ -1554,33 +1630,11 @@ nstat_control_handle_add_request( result = nstat_control_source_add(req->hdr.context, state, provider, cookie); if (result != 0) - provider->nstat_release(cookie); + provider->nstat_release(cookie, 0); return result; } -static int -nstat_perm_check( - __unused nstat_control_state *state) -{ - int allow = 0; -#if !REQUIRE_ROOT_FOR_STATS - allow = 1; -#else - // If the socket was created by a priv process, allow - if (state->suser) return 1; - - // If the current process is priv, allow - proc_t self = proc_self(); - allow = proc_suser(self) == 0; - proc_rele(self); - - // TBD: check for entitlement, root check is too coarse 
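[editor's note] The three senders introduced in this area (counts, description, removed) all frame replies as fixed-size structs behind a common header and enqueue them with CTL_DATA_EOR, so each record arrives whole. A hedged client-side sketch: the header layout below (64-bit context, 32-bit type, 32-bit pad) is inferred from the success-reply code and its field order is an assumption; only SUCCESS = 0 and ERROR = 1 are confirmed by the enum hunk in ntstat.h further down.

#include <sys/types.h>
#include <sys/socket.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Assumed mirror of nstat_msg_hdr; field order is a guess. */
struct msg_hdr {
	uint64_t	context;
	uint32_t	type;
	uint32_t	pad;
};

static void
drain_events(int fd)
{
	char buf[2048];
	ssize_t n;

	/* one recv() per record, thanks to CTL_DATA_EOR on enqueue */
	while ((n = recv(fd, buf, sizeof (buf), 0)) > 0) {
		struct msg_hdr hdr;

		if ((size_t)n < sizeof (hdr))
			continue;		/* runt; ignore */
		memcpy(&hdr, buf, sizeof (hdr));	/* alignment-safe */
		switch (hdr.type) {
		case 0:		/* NSTAT_MSG_TYPE_SUCCESS */
		case 1:		/* NSTAT_MSG_TYPE_ERROR */
			printf("reply for request %llu, type %u\n",
			    (unsigned long long)hdr.context, hdr.type);
			break;
		default:	/* SRC_ADDED/REMOVED/COUNTS/DESC etc. */
			printf("async event, type %u, %zd bytes\n",
			    hdr.type, n);
			break;
		}
	}
}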
-#endif /* REQUIRE_ROOT_FOR_STATS */ - - return allow; -} - static errno_t nstat_control_handle_add_all( nstat_control_state *state, @@ -1588,16 +1642,9 @@ nstat_control_handle_add_all( { errno_t result = 0; - if (!nstat_perm_check(state)) - { - return EPERM; - } - // Verify the header fits in the first mbuf if (mbuf_len(m) < sizeof(nstat_msg_add_all_srcs)) { - printf("mbuf_len(m)=%lu, sizeof(nstat_msg_add_all_srcs)=%lu\n", - mbuf_len(m), sizeof(nstat_msg_add_all_srcs)); return EINVAL; } @@ -1609,9 +1656,9 @@ nstat_control_handle_add_all( // Make sure we don't add the provider twice lck_mtx_lock(&state->mtx); - if ((state->watching & (1 << provider->nstat_provider_id)) != 0) + if ((state->ncs_watching & (1 << provider->nstat_provider_id)) != 0) result = EALREADY; - state->watching |= (1 << provider->nstat_provider_id); + state->ncs_watching |= (1 << provider->nstat_provider_id); lck_mtx_unlock(&state->mtx); if (result != 0) return result; @@ -1619,7 +1666,7 @@ nstat_control_handle_add_all( if (result != 0) { lck_mtx_lock(&state->mtx); - state->watching &= ~(1 << provider->nstat_provider_id); + state->ncs_watching &= ~(1 << provider->nstat_provider_id); lck_mtx_unlock(&state->mtx); } @@ -1630,8 +1677,7 @@ nstat_control_handle_add_all( success.context = req->hdr.context; success.type = NSTAT_MSG_TYPE_SUCCESS; success.pad = 0; - if (ctl_enqueuedata(state->kctl, state->unit, &success, sizeof(success), CTL_DATA_EOR) != 0) - printf("%s:%d - failed to enqueue success message\n", __FUNCTION__, __LINE__); + ctl_enqueuedata(state->ncs_kctl, state->ncs_unit, &success, sizeof(success), CTL_DATA_EOR); } return result; @@ -1671,7 +1717,7 @@ nstat_control_source_add( lck_mtx_lock(&state->mtx); add->srcref = src->srcref = nstat_control_next_src_ref(state); - if (state->cleanup || src->srcref == NSTAT_SRC_REF_INVALID) + if (state->ncs_flags & NSTAT_FLAG_CLEANUP || src->srcref == NSTAT_SRC_REF_INVALID) { lck_mtx_unlock(&state->mtx); OSFree(src, sizeof(*src), nstat_malloc_tag); @@ -1682,23 +1728,22 @@ nstat_control_source_add( src->cookie = cookie; // send the source added message - errno_t result = ctl_enqueuembuf(state->kctl, state->unit, msg, CTL_DATA_EOR); + errno_t result = ctl_enqueuembuf(state->ncs_kctl, state->ncs_unit, msg, CTL_DATA_EOR); if (result != 0) { lck_mtx_unlock(&state->mtx); - printf("%s:%d ctl_enqueuembuf failed: %d\n", __FUNCTION__, __LINE__, result); OSFree(src, sizeof(*src), nstat_malloc_tag); mbuf_freem(msg); return result; } // Put the source in the list - src->next = state->srcs; - state->srcs = src; + src->next = state->ncs_srcs; + state->ncs_srcs = src; // send the description message // not useful as the source is often not complete -// nstat_control_send_description(state, src, 0ULL); +// nstat_control_send_description(state, src, 0); lck_mtx_unlock(&state->mtx); @@ -1714,7 +1759,6 @@ nstat_control_handle_remove_request( if (mbuf_copydata(m, offsetof(nstat_msg_rem_src_req, srcref), sizeof(srcref), &srcref) != 0) { - printf("%s:%d - invalid length %u, expected %lu\n", __FUNCTION__, __LINE__, (u_int32_t)mbuf_pkthdr_len(m), sizeof(nstat_msg_rem_src_req)); return EINVAL; } @@ -1723,7 +1767,7 @@ nstat_control_handle_remove_request( // Remove this source as we look for it nstat_src **nextp; nstat_src *src = NULL; - for (nextp = &state->srcs; *nextp; nextp = &(*nextp)->next) + for (nextp = &state->ncs_srcs; *nextp; nextp = &(*nextp)->next) { if ((*nextp)->srcref == srcref) { @@ -1735,7 +1779,7 @@ nstat_control_handle_remove_request( lck_mtx_unlock(&state->mtx); - if (src) 
nstat_control_cleanup_source(state, src); + if (src) nstat_control_cleanup_source(state, src, FALSE); return src ? 0 : ENOENT; } @@ -1758,12 +1802,13 @@ nstat_control_handle_query_request( nstat_msg_query_src_req req; if (mbuf_copydata(m, 0, sizeof(req), &req) != 0) { - printf("%s:%d - invalid length %u, expected %lu\n", __FUNCTION__, __LINE__, (u_int32_t)mbuf_pkthdr_len(m), sizeof(req)); return EINVAL; } lck_mtx_lock(&state->mtx); - nstat_src **srcpp = &state->srcs; + if (req.srcref == NSTAT_SRC_REF_ALL) + state->ncs_flags |= NSTAT_FLAG_REQCOUNTS; + nstat_src **srcpp = &state->ncs_srcs; while (*srcpp != NULL) { int gone; @@ -1772,26 +1817,20 @@ nstat_control_handle_query_request( if (req.srcref == NSTAT_SRC_REF_ALL || (*srcpp)->srcref == req.srcref) { - nstat_msg_src_counts counts; - counts.hdr.type = NSTAT_MSG_TYPE_SRC_COUNTS; - counts.hdr.context = req.hdr.context; - counts.srcref = (*srcpp)->srcref; - bzero(&counts.counts, sizeof(counts.counts)); - result = (*srcpp)->provider->nstat_counts((*srcpp)->cookie, &counts.counts, &gone); + result = nstat_control_send_counts(state, *srcpp, + req.hdr.context, &gone); - if (result == 0) - { - result = ctl_enqueuedata(state->kctl, state->unit, &counts, sizeof(counts), CTL_DATA_EOR); - } - else - { - printf("%s:%d provider->nstat_counts failed: %d\n", __FUNCTION__, __LINE__, result); - } + // If the counts message failed to enqueue then we should clear our flag so + // that a client doesn't miss anything on idle cleanup. + if (result != 0) + state->ncs_flags &= ~NSTAT_FLAG_REQCOUNTS; if (gone) { // send one last descriptor message so client may see last state - nstat_control_send_description(state, *srcpp, 0ULL); + + nstat_control_send_description(state, *srcpp, + 0); // pull src out of the list nstat_src *src = *srcpp; @@ -1818,7 +1857,7 @@ nstat_control_handle_query_request( dead_srcs = src->next; // release src and send notification - nstat_control_cleanup_source(state, src); + nstat_control_cleanup_source(state, src, FALSE); } if (req.srcref == NSTAT_SRC_REF_ALL) @@ -1827,8 +1866,7 @@ nstat_control_handle_query_request( success.context = req.hdr.context; success.type = NSTAT_MSG_TYPE_SUCCESS; success.pad = 0; - if (ctl_enqueuedata(state->kctl, state->unit, &success, sizeof(success), CTL_DATA_EOR) != 0) - printf("%s:%d - failed to enqueue success message\n", __FUNCTION__, __LINE__); + ctl_enqueuedata(state->ncs_kctl, state->ncs_unit, &success, sizeof(success), CTL_DATA_EOR); result = 0; } @@ -1843,14 +1881,13 @@ nstat_control_handle_get_src_description( nstat_msg_get_src_description req; if (mbuf_copydata(m, 0, sizeof(req), &req) != 0) { - printf("%s:%d - invalid length %u, expected %lu\n", __FUNCTION__, __LINE__, (u_int32_t)mbuf_pkthdr_len(m), sizeof(req)); return EINVAL; } // Find the source lck_mtx_lock(&state->mtx); nstat_src *src; - for (src = state->srcs; src; src = src->next) + for (src = state->ncs_srcs; src; src = src->next) { if (src->srcref == req.srcref) break; @@ -1860,7 +1897,6 @@ nstat_control_handle_get_src_description( if (!src) { lck_mtx_unlock(&state->mtx); - printf("%s:%d - no matching source\n", __FUNCTION__, __LINE__); return ENOENT; } @@ -1886,8 +1922,6 @@ nstat_control_send( if (mbuf_pkthdr_len(m) < sizeof(hdr)) { // Is this the right thing to do? 
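[editor's note] Both the remove handler above and the idle check earlier walk the source list through a pointer-to-pointer (nstat_src **nextp), which lets a match be spliced out without tracking a previous node and handles the head case uniformly. A standalone sketch of the idiom, with hypothetical node and key names:

#include <stddef.h>

struct node {
	struct node	*next;
	int		key;
};

/*
 * Unlink the first node matching key and hand it to the caller.
 * Walking via "struct node **" means the head and interior cases
 * are identical and no previous-node bookkeeping is needed.
 */
static struct node *
unlink_key(struct node **headp, int key)
{
	struct node **nextp;

	for (nextp = headp; *nextp != NULL; nextp = &(*nextp)->next) {
		if ((*nextp)->key == key) {
			struct node *found = *nextp;
			*nextp = found->next;	/* splice out of the list */
			found->next = NULL;
			return (found);		/* caller releases it */
		}
	}
	return (NULL);
}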
- printf("%s:%d - message too short, was %ld expected %lu\n", __FUNCTION__, __LINE__, - mbuf_pkthdr_len(m), sizeof(*hdr)); mbuf_freem(m); return EINVAL; } @@ -1925,7 +1959,6 @@ nstat_control_send( break; default: - printf("%s:%d - unknown message type %d\n", __FUNCTION__, __LINE__, hdr->type); result = EINVAL; break; } diff --git a/bsd/net/ntstat.h b/bsd/net/ntstat.h index 4bbb3dc1b..4696d89ff 100644 --- a/bsd/net/ntstat.h +++ b/bsd/net/ntstat.h @@ -121,6 +121,7 @@ typedef struct nstat_tcp_descriptor u_int32_t txunacked; u_int32_t txwindow; u_int32_t txcwindow; + u_int32_t traffic_class; u_int64_t upid; u_int32_t pid; @@ -147,6 +148,7 @@ typedef struct nstat_udp_descriptor u_int32_t rcvbufsize; u_int32_t rcvbufused; + u_int32_t traffic_class; u_int64_t upid; u_int32_t pid; @@ -191,7 +193,7 @@ typedef struct nstat_route_descriptor enum { - // generice respnse messages + // generic response messages NSTAT_MSG_TYPE_SUCCESS = 0 ,NSTAT_MSG_TYPE_ERROR = 1 @@ -315,6 +317,8 @@ enum // indicates whether or not collection of statistics is enabled extern int nstat_collect; +void nstat_init(void); + // Route collection routines void nstat_route_connect_attempt(struct rtentry *rte); void nstat_route_connect_success(struct rtentry *rte); @@ -328,6 +332,7 @@ struct inpcb; void nstat_tcp_new_pcb(struct inpcb *inp); void nstat_udp_new_pcb(struct inpcb *inp); void nstat_route_new_entry(struct rtentry *rt); +void nstat_pcb_detach(struct inpcb *inp); // locked_add_64 uses atomic operations on 32bit so the 64bit // value can be properly read. The values are only ever incremented diff --git a/bsd/net/pf.c b/bsd/net/pf.c index 0597ffd4d..b13db985e 100644 --- a/bsd/net/pf.c +++ b/bsd/net/pf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2011 Apple Inc. All rights reserved. + * Copyright (c) 2007-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -104,7 +104,7 @@ #include #include #include - +#include #include #include @@ -120,15 +120,11 @@ #include #endif /* INET6 */ -#ifndef NO_APPLE_EXTENSIONS -#define DPFPRINTF(n, x) (pf_status.debug >= (n) ? printf x : ((void)0)) -#else -#define DPFPRINTF(n, x) if (pf_status.debug >= (n)) printf x -#endif +#if DUMMYNET +#include +#endif /* DUMMYNET */ -/* XXX: should be in header somewhere */ -#define satosin(sa) ((struct sockaddr_in *)(sa)) -#define sintosa(sin) ((struct sockaddr *)(sin)) +#define DPFPRINTF(n, x) (pf_status.debug >= (n) ? 
printf x : ((void)0)) /* * On Mac OS X, the rtableid value is treated as the interface scope @@ -145,8 +141,10 @@ /* * Global variables */ -lck_mtx_t *pf_lock; -lck_rw_t *pf_perim_lock; +decl_lck_mtx_data(,pf_lock_data); +decl_lck_rw_data(,pf_perim_lock_data); +lck_mtx_t *pf_lock = &pf_lock_data; +lck_rw_t *pf_perim_lock = &pf_perim_lock_data; /* state tables */ struct pf_state_tree_lan_ext pf_statetbl_lan_ext; @@ -155,14 +153,14 @@ struct pf_state_tree_ext_gwy pf_statetbl_ext_gwy; struct pf_palist pf_pabuf; struct pf_status pf_status; -#if ALTQ +#if PF_ALTQ struct pf_altqqueue pf_altqs[2]; struct pf_altqqueue *pf_altqs_active; struct pf_altqqueue *pf_altqs_inactive; u_int32_t ticket_altqs_active; u_int32_t ticket_altqs_inactive; int altqs_inactive_open; -#endif /* ALTQ */ +#endif /* PF_ALTQ */ u_int32_t ticket_pabuf; static MD5_CTX pf_tcp_secret_ctx; @@ -179,11 +177,10 @@ static struct pf_anchor_stackframe { struct pool pf_src_tree_pl, pf_rule_pl, pf_pooladdr_pl; struct pool pf_state_pl, pf_state_key_pl; -#if ALTQ +#if PF_ALTQ struct pool pf_altq_pl; -#endif /* ALTQ */ +#endif /* PF_ALTQ */ -#ifndef NO_APPLE_EXTENSIONS typedef void (*hook_fn_t)(void *); struct hook_desc { @@ -204,7 +201,6 @@ struct pool pf_app_state_pl; static void pf_print_addr(struct pf_addr *addr, sa_family_t af); static void pf_print_sk_host(struct pf_state_host *, u_int8_t, int, u_int8_t); -#endif static void pf_print_host(struct pf_addr *, u_int16_t, u_int8_t); @@ -233,7 +229,6 @@ static void pf_send_tcp(const struct pf_rule *, sa_family_t, u_int16_t, struct ether_header *, struct ifnet *); static void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t, sa_family_t, struct pf_rule *); -#ifndef NO_APPLE_EXTENSIONS static struct pf_rule *pf_match_translation(struct pf_pdesc *, struct mbuf *, int, int, struct pfi_kif *, struct pf_addr *, union pf_state_xport *, struct pf_addr *, @@ -244,17 +239,6 @@ static struct pf_rule *pf_get_translation_aux(struct pf_pdesc *, union pf_state_xport *, struct pf_addr *, union pf_state_xport *, struct pf_addr *, union pf_state_xport *); -#else -struct pf_rule *pf_match_translation(struct pf_pdesc *, struct mbuf *, - int, int, struct pfi_kif *, - struct pf_addr *, u_int16_t, struct pf_addr *, - u_int16_t, int); -struct pf_rule *pf_get_translation(struct pf_pdesc *, struct mbuf *, - int, int, struct pfi_kif *, struct pf_src_node **, - struct pf_addr *, u_int16_t, - struct pf_addr *, u_int16_t, - struct pf_addr *, u_int16_t *); -#endif static void pf_attach_state(struct pf_state_key *, struct pf_state *, int); static void pf_detach_state(struct pf_state *, int); @@ -263,6 +247,11 @@ static int pf_test_rule(struct pf_rule **, struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *, struct pf_rule **, struct pf_ruleset **, struct ifqueue *); +#if DUMMYNET +static int pf_test_dummynet(struct pf_rule **, int, + struct pfi_kif *, struct mbuf **, + struct pf_pdesc *, struct ip_fw_args *); +#endif /* DUMMYNET */ static int pf_test_fragment(struct pf_rule **, int, struct pfi_kif *, struct mbuf *, void *, struct pf_pdesc *, struct pf_rule **, @@ -270,15 +259,9 @@ static int pf_test_fragment(struct pf_rule **, int, static int pf_test_state_tcp(struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *, u_short *); -#ifndef NO_APPLE_EXTENSIONS static int pf_test_state_udp(struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *, u_short *); -#else -static int pf_test_state_udp(struct pf_state **, int, - struct 
pfi_kif *, struct mbuf *, int, - void *, struct pf_pdesc *); -#endif static int pf_test_state_icmp(struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *, u_short *); @@ -291,18 +274,11 @@ static void pf_hash(struct pf_addr *, struct pf_addr *, static int pf_map_addr(u_int8_t, struct pf_rule *, struct pf_addr *, struct pf_addr *, struct pf_addr *, struct pf_src_node **); -#ifndef NO_APPLE_EXTENSIONS static int pf_get_sport(struct pf_pdesc *, struct pfi_kif *, struct pf_rule *, struct pf_addr *, union pf_state_xport *, struct pf_addr *, union pf_state_xport *, struct pf_addr *, union pf_state_xport *, struct pf_src_node **); -#else -int pf_get_sport(sa_family_t, u_int8_t, struct pf_rule *, - struct pf_addr *, struct pf_addr *, u_int16_t, - struct pf_addr *, u_int16_t *, u_int16_t, u_int16_t, - struct pf_src_node **); -#endif static void pf_route(struct mbuf **, struct pf_rule *, int, struct ifnet *, struct pf_state *, struct pf_pdesc *); @@ -330,7 +306,6 @@ static void pf_stateins_err(const char *, struct pf_state *, struct pfi_kif *); static int pf_check_congestion(struct ifqueue *); -#ifndef NO_APPLE_EXTENSIONS #if 0 static const char *pf_pptp_ctrl_type_name(u_int16_t code); #endif @@ -344,7 +319,6 @@ static int pf_ike_compare(struct pf_app_state *, struct pf_app_state *); static int pf_test_state_esp(struct pf_state **, int, struct pfi_kif *, int, struct pf_pdesc *); -#endif extern struct pool pfr_ktable_pl; extern struct pool pfr_kentry_pl; @@ -356,10 +330,9 @@ struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX] = { { &pf_src_tree_pl, PFSNODE_HIWAT }, { &pf_frent_pl, PFFRAG_FRENT_HIWAT }, { &pfr_ktable_pl, PFR_KTABLE_HIWAT }, - { &pfr_kentry_pl, PFR_KENTRY_HIWAT } + { &pfr_kentry_pl, PFR_KENTRY_HIWAT }, }; -#ifndef NO_APPLE_EXTENSIONS struct mbuf * pf_lazy_makewritable(struct pf_pdesc *pd, struct mbuf *m, int len) { @@ -425,6 +398,10 @@ pf_state_lookup_aux(struct pf_state **state, struct pfi_kif *kif, do { \ int action; \ *state = pf_find_state(kif, &key, direction); \ + if (*state != NULL && pd != NULL && \ + pd->flowhash == 0) { \ + pd->flowhash = (*state)->state_key->flowhash; \ + } \ if (pf_state_lookup_aux(state, kif, direction, &action)) \ return (action); \ } while (0) @@ -444,31 +421,6 @@ pf_state_lookup_aux(struct pf_state **state, struct pfi_kif *kif, (STATE_ADDR_TRANSLATE(sk) || \ (sk)->lan.xport.call_id != (sk)->gwy.xport.call_id) -#else -#define STATE_LOOKUP() \ - do { \ - *state = pf_find_state(kif, &key, direction); \ - if (*state == NULL || (*state)->timeout == PFTM_PURGE) \ - return (PF_DROP); \ - if (direction == PF_OUT && \ - (((*state)->rule.ptr->rt == PF_ROUTETO && \ - (*state)->rule.ptr->direction == PF_OUT) || \ - ((*state)->rule.ptr->rt == PF_REPLYTO && \ - (*state)->rule.ptr->direction == PF_IN)) && \ - (*state)->rt_kif != NULL && \ - (*state)->rt_kif != kif) \ - return (PF_PASS); \ - } while (0) - -#define STATE_TRANSLATE(sk) \ - (sk)->lan.addr.addr32[0] != (sk)->gwy.addr.addr32[0] || \ - ((sk)->af == AF_INET6 && \ - ((sk)->lan.addr.addr32[1] != (sk)->gwy.addr.addr32[1] || \ - (sk)->lan.addr.addr32[2] != (sk)->gwy.addr.addr32[2] || \ - (sk)->lan.addr.addr32[3] != (sk)->gwy.addr.addr32[3])) || \ - (sk)->lan.port != (sk)->gwy.port -#endif - #define BOUND_IFACE(r, k) \ ((r)->rule_flag & PFRULE_IFBOUND) ? 
(k) : pfi_all @@ -524,7 +476,6 @@ RB_GENERATE(pf_state_tree_id, pf_state, #define PF_DT_SKIP_LANEXT 0x01 #define PF_DT_SKIP_EXTGWY 0x02 -#ifndef NO_APPLE_EXTENSIONS static const u_int16_t PF_PPTP_PORT = 1723; static const u_int32_t PF_PPTP_MAGIC_NUMBER = 0x1A2B3C4D; @@ -807,7 +758,6 @@ struct pf_esp_hdr { u_int32_t seqno; u_int8_t payload[]; }; -#endif static __inline int pf_src_compare(struct pf_src_node *a, struct pf_src_node *b) @@ -857,16 +807,13 @@ static __inline int pf_state_compare_lan_ext(struct pf_state_key *a, struct pf_state_key *b) { int diff; -#ifndef NO_APPLE_EXTENSIONS - int extfilter; -#endif + int extfilter; if ((diff = a->proto - b->proto) != 0) return (diff); if ((diff = a->af - b->af) != 0) return (diff); -#ifndef NO_APPLE_EXTENSIONS extfilter = PF_EXTFILTER_APD; switch (a->proto) { @@ -911,7 +858,6 @@ pf_state_compare_lan_ext(struct pf_state_key *a, struct pf_state_key *b) default: break; } -#endif switch (a->af) { #if INET @@ -920,24 +866,16 @@ pf_state_compare_lan_ext(struct pf_state_key *a, struct pf_state_key *b) return (1); if (a->lan.addr.addr32[0] < b->lan.addr.addr32[0]) return (-1); -#ifndef NO_APPLE_EXTENSIONS if (extfilter < PF_EXTFILTER_EI) { if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0]) return (1); if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) return (-1); } -#else - if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0]) - return (1); - if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) - return (-1); -#endif break; #endif /* INET */ #if INET6 case AF_INET6: -#ifndef NO_APPLE_EXTENSIONS if (a->lan.addr.addr32[3] > b->lan.addr.addr32[3]) return (1); if (a->lan.addr.addr32[3] < b->lan.addr.addr32[3]) @@ -973,45 +911,10 @@ pf_state_compare_lan_ext(struct pf_state_key *a, struct pf_state_key *b) if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) return (-1); } -#else - if (a->lan.addr.addr32[3] > b->lan.addr.addr32[3]) - return (1); - if (a->lan.addr.addr32[3] < b->lan.addr.addr32[3]) - return (-1); - if (a->ext.addr.addr32[3] > b->ext.addr.addr32[3]) - return (1); - if (a->ext.addr.addr32[3] < b->ext.addr.addr32[3]) - return (-1); - if (a->lan.addr.addr32[2] > b->lan.addr.addr32[2]) - return (1); - if (a->lan.addr.addr32[2] < b->lan.addr.addr32[2]) - return (-1); - if (a->ext.addr.addr32[2] > b->ext.addr.addr32[2]) - return (1); - if (a->ext.addr.addr32[2] < b->ext.addr.addr32[2]) - return (-1); - if (a->lan.addr.addr32[1] > b->lan.addr.addr32[1]) - return (1); - if (a->lan.addr.addr32[1] < b->lan.addr.addr32[1]) - return (-1); - if (a->ext.addr.addr32[1] > b->ext.addr.addr32[1]) - return (1); - if (a->ext.addr.addr32[1] < b->ext.addr.addr32[1]) - return (-1); - if (a->lan.addr.addr32[0] > b->lan.addr.addr32[0]) - return (1); - if (a->lan.addr.addr32[0] < b->lan.addr.addr32[0]) - return (-1); - if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0]) - return (1); - if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) - return (-1); -#endif break; #endif /* INET6 */ } -#ifndef NO_APPLE_EXTENSIONS if (a->app_state && b->app_state) { if (a->app_state->compare_lan_ext && b->app_state->compare_lan_ext) { @@ -1025,12 +928,6 @@ pf_state_compare_lan_ext(struct pf_state_key *a, struct pf_state_key *b) return (diff); } } -#else - if ((diff = a->lan.port - b->lan.port) != 0) - return (diff); - if ((diff = a->ext.port - b->ext.port) != 0) - return (diff); -#endif return (0); } @@ -1039,9 +936,7 @@ static __inline int pf_state_compare_ext_gwy(struct pf_state_key *a, struct pf_state_key *b) { int diff; -#ifndef NO_APPLE_EXTENSIONS - int extfilter; -#endif + int extfilter; 
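[editor's note] pf_state_compare_lan_ext() and pf_state_compare_ext_gwy(), reworked in the hunks around here, are lexicographic comparators for RB_GENERATE(): fields are compared most-significant first and the first nonzero difference is returned, with explicit >/< tests on the 32-bit address words so unsigned subtraction cannot wrap. A reduced sketch of that shape, using illustrative fields that are not pf's:

#include <stdint.h>

/* Illustrative key; the fields here are hypothetical. */
struct flow_key {
	uint8_t		proto;
	uint8_t		af;
	uint32_t	addr;
	uint16_t	port;
};

static inline int
flow_key_cmp(const struct flow_key *a, const struct flow_key *b)
{
	int diff;

	/* most significant fields first; first nonzero diff wins */
	if ((diff = a->proto - b->proto) != 0)
		return (diff);
	if ((diff = a->af - b->af) != 0)
		return (diff);
	/* explicit tests: (a->addr - b->addr) could wrap as unsigned */
	if (a->addr > b->addr)
		return (1);
	if (a->addr < b->addr)
		return (-1);
	return (a->port - b->port);
}

The tie-break of 0 is what lets pf layer the app_state compare hooks on top: only keys equal through all fixed fields ever reach them.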
if ((diff = a->proto - b->proto) != 0) return (diff); @@ -1049,7 +944,6 @@ pf_state_compare_ext_gwy(struct pf_state_key *a, struct pf_state_key *b) if ((diff = a->af - b->af) != 0) return (diff); -#ifndef NO_APPLE_EXTENSIONS extfilter = PF_EXTFILTER_APD; switch (a->proto) { @@ -1094,12 +988,10 @@ pf_state_compare_ext_gwy(struct pf_state_key *a, struct pf_state_key *b) default: break; } -#endif switch (a->af) { #if INET case AF_INET: -#ifndef NO_APPLE_EXTENSIONS if (a->gwy.addr.addr32[0] > b->gwy.addr.addr32[0]) return (1); if (a->gwy.addr.addr32[0] < b->gwy.addr.addr32[0]) @@ -1110,21 +1002,10 @@ pf_state_compare_ext_gwy(struct pf_state_key *a, struct pf_state_key *b) if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) return (-1); } -#else - if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0]) - return (1); - if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) - return (-1); - if (a->gwy.addr.addr32[0] > b->gwy.addr.addr32[0]) - return (1); - if (a->gwy.addr.addr32[0] < b->gwy.addr.addr32[0]) - return (-1); -#endif break; #endif /* INET */ #if INET6 case AF_INET6: -#ifndef NO_APPLE_EXTENSIONS if (a->gwy.addr.addr32[3] > b->gwy.addr.addr32[3]) return (1); if (a->gwy.addr.addr32[3] < b->gwy.addr.addr32[3]) @@ -1160,45 +1041,10 @@ pf_state_compare_ext_gwy(struct pf_state_key *a, struct pf_state_key *b) if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) return (-1); } -#else - if (a->ext.addr.addr32[3] > b->ext.addr.addr32[3]) - return (1); - if (a->ext.addr.addr32[3] < b->ext.addr.addr32[3]) - return (-1); - if (a->gwy.addr.addr32[3] > b->gwy.addr.addr32[3]) - return (1); - if (a->gwy.addr.addr32[3] < b->gwy.addr.addr32[3]) - return (-1); - if (a->ext.addr.addr32[2] > b->ext.addr.addr32[2]) - return (1); - if (a->ext.addr.addr32[2] < b->ext.addr.addr32[2]) - return (-1); - if (a->gwy.addr.addr32[2] > b->gwy.addr.addr32[2]) - return (1); - if (a->gwy.addr.addr32[2] < b->gwy.addr.addr32[2]) - return (-1); - if (a->ext.addr.addr32[1] > b->ext.addr.addr32[1]) - return (1); - if (a->ext.addr.addr32[1] < b->ext.addr.addr32[1]) - return (-1); - if (a->gwy.addr.addr32[1] > b->gwy.addr.addr32[1]) - return (1); - if (a->gwy.addr.addr32[1] < b->gwy.addr.addr32[1]) - return (-1); - if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0]) - return (1); - if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0]) - return (-1); - if (a->gwy.addr.addr32[0] > b->gwy.addr.addr32[0]) - return (1); - if (a->gwy.addr.addr32[0] < b->gwy.addr.addr32[0]) - return (-1); -#endif break; #endif /* INET6 */ } -#ifndef NO_APPLE_EXTENSIONS if (a->app_state && b->app_state) { if (a->app_state->compare_ext_gwy && b->app_state->compare_ext_gwy) { @@ -1212,12 +1058,6 @@ pf_state_compare_ext_gwy(struct pf_state_key *a, struct pf_state_key *b) return (diff); } } -#else - if ((diff = a->ext.port - b->ext.port) != 0) - return (diff); - if ((diff = a->gwy.port - b->gwy.port) != 0) - return (diff); -#endif return (0); } @@ -1262,7 +1102,8 @@ pf_find_state_byid(struct pf_state_cmp *key) { pf_status.fcounters[FCNT_STATE_SEARCH]++; - return (RB_FIND(pf_state_tree_id, &tree_id, (struct pf_state *)key)); + return (RB_FIND(pf_state_tree_id, &tree_id, + (struct pf_state *)(void *)key)); } static struct pf_state * @@ -1529,7 +1370,6 @@ pf_stateins_err(const char *tree, struct pf_state *s, struct pfi_kif *kif) struct pf_state_key *sk = s->state_key; if (pf_status.debug >= PF_DEBUG_MISC) { -#ifndef NO_APPLE_EXTENSIONS printf("pf: state insert failed: %s %s ", tree, kif->pfik_name); switch (sk->proto) { case IPPROTO_TCP: @@ -1557,18 +1397,6 @@ 
pf_stateins_err(const char *tree, struct pf_state *s, struct pfi_kif *kif) printf(" ext: "); pf_print_sk_host(&sk->ext, sk->af, sk->proto, sk->proto_variant); -#else - printf("pf: state insert failed: %s %s", tree, kif->pfik_name); - printf(" lan: "); - pf_print_host(&sk->lan.addr, sk->lan.port, - sk->af); - printf(" gwy: "); - pf_print_host(&sk->gwy.addr, sk->gwy.port, - sk->af); - printf(" ext: "); - pf_print_host(&sk->ext.addr, sk->ext.port, - sk->af); -#endif if (s->sync_flags & PFSTATE_FROMSYNC) printf(" (from sync)"); printf("\n"); @@ -1634,64 +1462,76 @@ pf_insert_state(struct pfi_kif *kif, struct pf_state *s) return (0); } -void -pf_purge_thread_fn(void *v, wait_result_t w) +static int +pf_purge_thread_cont(int err) { -#pragma unused(v, w) - u_int32_t nloops = 0; - int t = 0; - - for (;;) { - (void) tsleep(pf_purge_thread_fn, PWAIT, "pftm", t * hz); - - lck_rw_lock_shared(pf_perim_lock); - lck_mtx_lock(pf_lock); - - /* purge everything if not running */ - if (!pf_status.running) { - pf_purge_expired_states(pf_status.states); - pf_purge_expired_fragments(); - pf_purge_expired_src_nodes(); - - /* terminate thread (we don't currently do this) */ - if (pf_purge_thread == NULL) { - lck_mtx_unlock(pf_lock); - lck_rw_done(pf_perim_lock); - - thread_deallocate(current_thread()); - thread_terminate(current_thread()); - /* NOTREACHED */ - return; - } else { - /* if there's nothing left, sleep w/o timeout */ - if (pf_status.states == 0 && - pf_normalize_isempty() && - RB_EMPTY(&tree_src_tracking)) - t = 0; - - lck_mtx_unlock(pf_lock); - lck_rw_done(pf_perim_lock); - continue; +#pragma unused(err) + static u_int32_t nloops = 0; + int t = 1; /* 1 second */ + + lck_rw_lock_shared(pf_perim_lock); + lck_mtx_lock(pf_lock); + + /* purge everything if not running */ + if (!pf_status.running) { + pf_purge_expired_states(pf_status.states); + pf_purge_expired_fragments(); + pf_purge_expired_src_nodes(); + + /* terminate thread (we don't currently do this) */ + if (pf_purge_thread == NULL) { + lck_mtx_unlock(pf_lock); + lck_rw_done(pf_perim_lock); + + thread_deallocate(current_thread()); + thread_terminate(current_thread()); + /* NOTREACHED */ + return (0); + } else { + /* if there's nothing left, sleep w/o timeout */ + if (pf_status.states == 0 && + pf_normalize_isempty() && + RB_EMPTY(&tree_src_tracking)) { + nloops = 0; + t = 0; } - } else if (t == 0) { - /* Set timeout to 1 second */ - t = 1; + goto done; } + } - /* process a fraction of the state table every second */ - pf_purge_expired_states(1 + (pf_status.states - / pf_default_rule.timeout[PFTM_INTERVAL])); - - /* purge other expired types every PFTM_INTERVAL seconds */ - if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) { - pf_purge_expired_fragments(); - pf_purge_expired_src_nodes(); - nloops = 0; - } + /* process a fraction of the state table every second */ + pf_purge_expired_states(1 + (pf_status.states + / pf_default_rule.timeout[PFTM_INTERVAL])); - lck_mtx_unlock(pf_lock); - lck_rw_done(pf_perim_lock); + /* purge other expired types every PFTM_INTERVAL seconds */ + if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) { + pf_purge_expired_fragments(); + pf_purge_expired_src_nodes(); + nloops = 0; } +done: + lck_mtx_unlock(pf_lock); + lck_rw_done(pf_perim_lock); + + (void) tsleep0(pf_purge_thread_fn, PWAIT, "pf_purge_cont", + t * hz, pf_purge_thread_cont); + /* NOTREACHED */ + VERIFY(0); + + return (0); +} + +void +pf_purge_thread_fn(void *v, wait_result_t w) +{ +#pragma unused(v, w) + (void) tsleep0(pf_purge_thread_fn, PWAIT, 
"pf_purge", 0, + pf_purge_thread_cont); + /* + * tsleep0() shouldn't have returned as PCATCH was not set; + * therefore assert in this case. + */ + VERIFY(0); } u_int64_t @@ -1795,7 +1635,6 @@ pf_unlink_state(struct pf_state *cur) { lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); -#ifndef NO_APPLE_EXTENSIONS if (cur->src.state == PF_TCPS_PROXY_DST) { pf_send_tcp(cur->rule.ptr, cur->state_key->af, &cur->state_key->ext.addr, &cur->state_key->lan.addr, @@ -1806,15 +1645,6 @@ pf_unlink_state(struct pf_state *cur) } hook_runloop(&cur->unlink_hooks, HOOK_REMOVE|HOOK_FREE); -#else - if (cur->src.state == PF_TCPS_PROXY_DST) { - pf_send_tcp(cur->rule.ptr, cur->state_key->af, - &cur->state_key->ext.addr, &cur->state_key->lan.addr, - cur->state_key->ext.port, cur->state_key->lan.port, - cur->src.seqhi, cur->src.seqlo + 1, - TH_RST|TH_ACK, 0, 0, 0, 1, cur->tag, NULL, NULL); - } -#endif RB_REMOVE(pf_state_tree_id, &tree_id, cur); #if NPFSYNC if (cur->creatorid == pf_status.hostid) @@ -1933,7 +1763,6 @@ pf_tbladdr_copyout(struct pf_addr_wrap *aw) kt->pfrkt_cnt : -1; } -#ifndef NO_APPLE_EXTENSIONS static void pf_print_addr(struct pf_addr *addr, sa_family_t af) { @@ -2016,76 +1845,13 @@ pf_print_sk_host(struct pf_state_host *sh, sa_family_t af, int proto, break; } } -#endif static void pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af) { -#ifndef NO_APPLE_EXTENSIONS pf_print_addr(addr, af); if (p) printf("[%u]", ntohs(p)); -#else - switch (af) { -#if INET - case AF_INET: { - u_int32_t a = ntohl(addr->addr32[0]); - printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255, - (a>>8)&255, a&255); - if (p) { - p = ntohs(p); - printf(":%u", p); - } - break; - } -#endif /* INET */ -#if INET6 - case AF_INET6: { - u_int16_t b; - u_int8_t i, curstart = 255, curend = 0, - maxstart = 0, maxend = 0; - for (i = 0; i < 8; i++) { - if (!addr->addr16[i]) { - if (curstart == 255) - curstart = i; - else - curend = i; - } else { - if (curstart) { - if ((curend - curstart) > - (maxend - maxstart)) { - maxstart = curstart; - maxend = curend; - curstart = 255; - } - } - } - } - for (i = 0; i < 8; i++) { - if (i >= maxstart && i <= maxend) { - if (maxend != 7) { - if (i == maxstart) - printf(":"); - } else { - if (i == maxend) - printf(":"); - } - } else { - b = ntohs(addr->addr16[i]); - printf("%x", b); - if (i < 7) - printf(":"); - } - } - if (p) { - p = ntohs(p); - printf("[%u]", p); - } - break; - } -#endif /* INET6 */ - } -#endif } void @@ -2093,14 +1859,12 @@ pf_print_state(struct pf_state *s) { struct pf_state_key *sk = s->state_key; switch (sk->proto) { -#ifndef NO_APPLE_EXTENSIONS case IPPROTO_ESP: printf("ESP "); break; case IPPROTO_GRE: printf("GRE%u ", sk->proto_variant); break; -#endif case IPPROTO_TCP: printf("TCP "); break; @@ -2117,19 +1881,11 @@ pf_print_state(struct pf_state *s) printf("%u ", sk->proto); break; } -#ifndef NO_APPLE_EXTENSIONS pf_print_sk_host(&sk->lan, sk->af, sk->proto, sk->proto_variant); printf(" "); pf_print_sk_host(&sk->gwy, sk->af, sk->proto, sk->proto_variant); printf(" "); pf_print_sk_host(&sk->ext, sk->af, sk->proto, sk->proto_variant); -#else - pf_print_host(&sk->lan.addr, sk->lan.port, sk->af); - printf(" "); - pf_print_host(&sk->gwy.addr, sk->gwy.port, sk->af); - printf(" "); - pf_print_host(&sk->ext.addr, sk->ext.port, sk->af); -#endif printf(" [lo=%u high=%u win=%u modulator=%u", s->src.seqlo, s->src.seqhi, s->src.max_win, s->src.seqdiff); if (s->src.wscale && s->dst.wscale) @@ -2197,7 +1953,6 @@ pf_calc_skip_steps(struct pf_rulequeue *rules) if (cur->src.neg != prev->src.neg || 
pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr)) PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR); -#ifndef NO_APPLE_EXTENSIONS { union pf_rule_xport *cx = &cur->src.xport; union pf_rule_xport *px = &prev->src.xport; @@ -2217,16 +1972,9 @@ pf_calc_skip_steps(struct pf_rulequeue *rules) break; } } -#else - if (cur->src.port[0] != prev->src.port[0] || - cur->src.port[1] != prev->src.port[1] || - cur->src.port_op != prev->src.port_op) - PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT); -#endif if (cur->dst.neg != prev->dst.neg || pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr)) PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR); -#ifndef NO_APPLE_EXTENSIONS { union pf_rule_xport *cx = &cur->dst.xport; union pf_rule_xport *px = &prev->dst.xport; @@ -2252,12 +2000,6 @@ pf_calc_skip_steps(struct pf_rulequeue *rules) break; } } -#else - if (cur->dst.port[0] != prev->dst.port[0] || - cur->dst.port[1] != prev->dst.port[1] || - cur->dst.port_op != prev->dst.port_op) - PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT); -#endif prev = cur; cur = TAILQ_NEXT(cur, entries); @@ -2266,6 +2008,32 @@ pf_calc_skip_steps(struct pf_rulequeue *rules) PF_SET_SKIP_STEPS(i); } +u_int32_t +pf_calc_state_key_flowhash(struct pf_state_key *sk) +{ + struct pf_flowhash_key fh __attribute__((aligned(8))); + + bzero(&fh, sizeof (fh)); + if (PF_ALEQ(&sk->lan.addr, &sk->ext.addr, sk->af)) { + bcopy(&sk->lan.addr, &fh.ap1.addr, sizeof (fh.ap1.addr)); + bcopy(&sk->ext.addr, &fh.ap2.addr, sizeof (fh.ap2.addr)); + } else { + bcopy(&sk->ext.addr, &fh.ap1.addr, sizeof (fh.ap1.addr)); + bcopy(&sk->lan.addr, &fh.ap2.addr, sizeof (fh.ap2.addr)); + } + if (sk->lan.xport.spi <= sk->ext.xport.spi) { + fh.ap1.xport.spi = sk->lan.xport.spi; + fh.ap2.xport.spi = sk->ext.xport.spi; + } else { + fh.ap1.xport.spi = sk->ext.xport.spi; + fh.ap2.xport.spi = sk->lan.xport.spi; + } + fh.af = sk->af; + fh.proto = sk->proto; + + return (net_flowhash(&fh, sizeof (fh), pf_hash_seed)); +} + static int pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2) { @@ -2556,11 +2324,7 @@ pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd, dst->seqdiff), 0); memcpy(&opt[i], &sack, sizeof (sack)); } -#ifndef NO_APPLE_EXTENSIONS copyback = off + sizeof (*th) + thoptlen; -#else - copyback = 1; -#endif } /* FALLTHROUGH */ default: @@ -2571,17 +2335,12 @@ pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd, } } -#ifndef NO_APPLE_EXTENSIONS if (copyback) { m = pf_lazy_makewritable(pd, m, copyback); if (!m) return (-1); m_copyback(m, off + sizeof (*th), thoptlen, opts); } -#else - if (copyback) - m_copyback(m, off + sizeof (*th), thoptlen, opts); -#endif return (copyback); } @@ -2637,19 +2396,37 @@ pf_send_tcp(const struct pf_rule *r, sa_family_t af, } if (tag) - pf_mtag->flags |= PF_TAG_GENERATED; - pf_mtag->tag = rtag; + pf_mtag->pftag_flags |= PF_TAG_GENERATED; + pf_mtag->pftag_tag = rtag; if (r != NULL && PF_RTABLEID_IS_VALID(r->rtableid)) - pf_mtag->rtableid = r->rtableid; + pf_mtag->pftag_rtableid = r->rtableid; -#if ALTQ - if (r != NULL && r->qid) { - pf_mtag->qid = r->qid; - /* add hints for ecn */ - pf_mtag->hdr = mtod(m, struct ip *); +#if PF_ALTQ + if (altq_allowed && r != NULL && r->qid) + pf_mtag->pftag_qid = r->qid; +#endif /* PF_ALTQ */ + + /* add hints for ecn */ + pf_mtag->pftag_hdr = mtod(m, struct ip *); + /* record address family */ + pf_mtag->pftag_flags &= ~(PF_TAG_HDR_INET | PF_TAG_HDR_INET6); + switch (af) { +#if INET + case AF_INET: + pf_mtag->pftag_flags |= PF_TAG_HDR_INET; + break; +#endif /* INET */ +#if INET6 + case AF_INET6: + pf_mtag->pftag_flags |= 
PF_TAG_HDR_INET6; + break; +#endif /* INET6 */ } -#endif /* ALTQ */ + /* indicate this is TCP */ + pf_mtag->pftag_flags |= PF_TAG_TCP; + + /* Make sure headers are 32-bit aligned */ m->m_data += max_linkhdr; m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; @@ -2665,7 +2442,7 @@ pf_send_tcp(const struct pf_rule *r, sa_family_t af, h->ip_src.s_addr = saddr->v4.s_addr; h->ip_dst.s_addr = daddr->v4.s_addr; - th = (struct tcphdr *)((caddr_t)h + sizeof (struct ip)); + th = (struct tcphdr *)(void *)((caddr_t)h + sizeof (struct ip)); break; #endif /* INET */ #if INET6 @@ -2678,7 +2455,8 @@ pf_send_tcp(const struct pf_rule *r, sa_family_t af, memcpy(&h6->ip6_src, &saddr->v6, sizeof (struct in6_addr)); memcpy(&h6->ip6_dst, &daddr->v6, sizeof (struct in6_addr)); - th = (struct tcphdr *)((caddr_t)h6 + sizeof (struct ip6_hdr)); + th = (struct tcphdr *)(void *) + ((caddr_t)h6 + sizeof (struct ip6_hdr)); break; #endif /* INET6 */ } @@ -2764,18 +2542,34 @@ pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af, if ((pf_mtag = pf_get_mtag(m0)) == NULL) return; - pf_mtag->flags |= PF_TAG_GENERATED; + pf_mtag->pftag_flags |= PF_TAG_GENERATED; if (PF_RTABLEID_IS_VALID(r->rtableid)) - pf_mtag->rtableid = r->rtableid; - -#if ALTQ - if (r->qid) { - pf_mtag->qid = r->qid; - /* add hints for ecn */ - pf_mtag->hdr = mtod(m0, struct ip *); + pf_mtag->pftag_rtableid = r->rtableid; + +#if PF_ALTQ + if (altq_allowed && r->qid) + pf_mtag->pftag_qid = r->qid; +#endif /* PF_ALTQ */ + + /* add hints for ecn */ + pf_mtag->pftag_hdr = mtod(m0, struct ip *); + /* record address family */ + pf_mtag->pftag_flags &= + ~(PF_TAG_HDR_INET | PF_TAG_HDR_INET6 | PF_TAG_TCP); + switch (af) { +#if INET + case AF_INET: + pf_mtag->pftag_flags |= PF_TAG_HDR_INET; + break; +#endif /* INET */ +#if INET6 + case AF_INET6: + pf_mtag->pftag_flags |= PF_TAG_HDR_INET6; + break; +#endif /* INET6 */ } -#endif /* ALTQ */ + switch (af) { #if INET case AF_INET: @@ -2911,7 +2705,6 @@ pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p) return (pf_match(op, a1, a2, p)); } -#ifndef NO_APPLE_EXTENSIONS int pf_match_xport(u_int8_t proto, u_int8_t proto_variant, union pf_rule_xport *rx, union pf_state_xport *sx) @@ -2946,7 +2739,6 @@ pf_match_xport(u_int8_t proto, u_int8_t proto_variant, union pf_rule_xport *rx, return (d); } -#endif int pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u) @@ -2970,7 +2762,7 @@ pf_match_tag(struct mbuf *m, struct pf_rule *r, struct pf_mtag *pf_mtag, { #pragma unused(m) if (*tag == -1) - *tag = pf_mtag->tag; + *tag = pf_mtag->pftag_tag; return ((!r->match_tag_not && r->match_tag == *tag) || (r->match_tag_not && r->match_tag != *tag)); @@ -2978,18 +2770,25 @@ pf_match_tag(struct mbuf *m, struct pf_rule *r, struct pf_mtag *pf_mtag, int pf_tag_packet(struct mbuf *m, struct pf_mtag *pf_mtag, int tag, - unsigned int rtableid) + unsigned int rtableid, struct pf_pdesc *pd) { - if (tag <= 0 && !PF_RTABLEID_IS_VALID(rtableid)) + if (tag <= 0 && !PF_RTABLEID_IS_VALID(rtableid) && + (pd == NULL || pd->flowhash == 0)) return (0); if (pf_mtag == NULL && (pf_mtag = pf_get_mtag(m)) == NULL) return (1); if (tag > 0) - pf_mtag->tag = tag; + pf_mtag->pftag_tag = tag; if (PF_RTABLEID_IS_VALID(rtableid)) - pf_mtag->rtableid = rtableid; + pf_mtag->pftag_rtableid = rtableid; + if (pd != NULL && pd->flowhash != 0) { + pf_mtag->pftag_flags |= PF_TAG_FLOWHASH; + pf_mtag->pftag_flowhash = pd->flowhash; + pf_mtag->pftag_flags |= (pd->flags & PFDESC_FLOW_ADV) ? 
+ PF_TAG_FLOWADV : 0; + } return (0); } @@ -3297,8 +3096,10 @@ pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr, } break; case PF_POOL_SRCHASH: - pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af); - PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af); + pf_hash(saddr, (struct pf_addr *)(void *)&hash, + &rpool->key, af); + PF_POOLMASK(naddr, raddr, rmask, + (struct pf_addr *)(void *)&hash, af); break; case PF_POOL_ROUNDROBIN: if (rpool->cur->addr.type == PF_ADDR_TABLE) { @@ -3363,32 +3164,20 @@ pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr, return (0); } -#ifndef NO_APPLE_EXTENSIONS static int pf_get_sport(struct pf_pdesc *pd, struct pfi_kif *kif, struct pf_rule *r, struct pf_addr *saddr, union pf_state_xport *sxport, struct pf_addr *daddr, union pf_state_xport *dxport, struct pf_addr *naddr, union pf_state_xport *nxport, struct pf_src_node **sn) -#else -int -pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r, - struct pf_addr *saddr, struct pf_addr *daddr, u_int16_t dport, - struct pf_addr *naddr, u_int16_t *nport, u_int16_t low, u_int16_t high, - struct pf_src_node **sn) -#endif { #pragma unused(kif) struct pf_state_key_cmp key; struct pf_addr init_addr; -#ifndef NO_APPLE_EXTENSIONS unsigned int cut; sa_family_t af = pd->af; u_int8_t proto = pd->proto; unsigned int low = r->rpool.proxy_port[0]; unsigned int high = r->rpool.proxy_port[1]; -#else - u_int16_t cut; -#endif bzero(&init_addr, sizeof (init_addr)); if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn)) @@ -3399,7 +3188,6 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r, high = 65535; } -#ifndef NO_APPLE_EXTENSIONS if (!nxport) return (0); /* No output necessary. */ @@ -3459,13 +3247,11 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r, return (0); } } -#endif do { key.af = af; key.proto = proto; PF_ACPY(&key.ext.addr, daddr, key.af); PF_ACPY(&key.gwy.addr, naddr, key.af); -#ifndef NO_APPLE_EXTENSIONS switch (proto) { case IPPROTO_UDP: key.proto_variant = r->extfilter; @@ -3478,54 +3264,31 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r, key.ext.xport = *dxport; else memset(&key.ext.xport, 0, sizeof (key.ext.xport)); -#else - key.ext.port = dport; -#endif /* * port search; start random, step; * similar 2 portloop in in_pcbbind */ if (!(proto == IPPROTO_TCP || proto == IPPROTO_UDP || proto == IPPROTO_ICMP)) { -#ifndef NO_APPLE_EXTENSIONS if (dxport) key.gwy.xport = *dxport; else memset(&key.gwy.xport, 0, sizeof (key.ext.xport)); -#else - key.gwy.port = dport; -#endif if (pf_find_state_all(&key, PF_IN, NULL) == NULL) return (0); } else if (low == 0 && high == 0) { -#ifndef NO_APPLE_EXTENSIONS key.gwy.xport = *nxport; -#else - key.gwy.port = *nport; -#endif if (pf_find_state_all(&key, PF_IN, NULL) == NULL) return (0); } else if (low == high) { -#ifndef NO_APPLE_EXTENSIONS key.gwy.xport.port = htons(low); if (pf_find_state_all(&key, PF_IN, NULL) == NULL) { nxport->port = htons(low); return (0); } -#else - key.gwy.port = htons(low); - if (pf_find_state_all(&key, PF_IN, NULL) == NULL) { - *nport = htons(low); - return (0); - } -#endif } else { -#ifndef NO_APPLE_EXTENSIONS unsigned int tmp; -#else - u_int16_t tmp; -#endif if (low > high) { tmp = low; low = high; @@ -3535,38 +3298,20 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r, cut = htonl(random()) % (1 + high - low) + low; /* low <= cut <= high */ for (tmp = cut; tmp <= high; ++(tmp)) { -#ifndef NO_APPLE_EXTENSIONS key.gwy.xport.port = 
htons(tmp); if (pf_find_state_all(&key, PF_IN, NULL) == NULL) { nxport->port = htons(tmp); return (0); } -#else - key.gwy.port = htons(tmp); - if (pf_find_state_all(&key, PF_IN, NULL) == - NULL) { - *nport = htons(tmp); - return (0); - } -#endif } for (tmp = cut - 1; tmp >= low; --(tmp)) { -#ifndef NO_APPLE_EXTENSIONS key.gwy.xport.port = htons(tmp); if (pf_find_state_all(&key, PF_IN, NULL) == NULL) { nxport->port = htons(tmp); return (0); } -#else - key.gwy.port = htons(tmp); - if (pf_find_state_all(&key, PF_IN, NULL) == - NULL) { - *nport = htons(tmp); - return (0); - } -#endif } } @@ -3587,18 +3332,11 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r, return (1); /* none available */ } -#ifndef NO_APPLE_EXTENSIONS static struct pf_rule * pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction, struct pfi_kif *kif, struct pf_addr *saddr, union pf_state_xport *sxport, struct pf_addr *daddr, union pf_state_xport *dxport, int rs_num) -#else -struct pf_rule * -pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, - int direction, struct pfi_kif *kif, struct pf_addr *saddr, u_int16_t sport, - struct pf_addr *daddr, u_int16_t dport, int rs_num) -#endif { struct pf_rule *r, *rm = NULL; struct pf_ruleset *ruleset = NULL; @@ -3610,16 +3348,13 @@ pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, while (r && rm == NULL) { struct pf_rule_addr *src = NULL, *dst = NULL; struct pf_addr_wrap *xdst = NULL; -#ifndef NO_APPLE_EXTENSIONS struct pf_addr_wrap *xsrc = NULL; union pf_rule_xport rdrxport; -#endif if (r->action == PF_BINAT && direction == PF_IN) { src = &r->dst; if (r->rpool.cur != NULL) xdst = &r->rpool.cur->addr; -#ifndef NO_APPLE_EXTENSIONS } else if (r->action == PF_RDR && direction == PF_OUT) { dst = &r->src; src = &r->dst; @@ -3629,7 +3364,6 @@ pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, htons(r->rpool.proxy_port[0]); xsrc = &r->rpool.cur->addr; } -#endif } else { src = &r->src; dst = &r->dst; @@ -3644,7 +3378,6 @@ pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; -#ifndef NO_APPLE_EXTENSIONS else if (xsrc && PF_MISMATCHAW(xsrc, saddr, pd->af, 0, NULL)) r = TAILQ_NEXT(r, entries); else if (!xsrc && PF_MISMATCHAW(&src->addr, saddr, pd->af, @@ -3656,14 +3389,6 @@ pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, r = TAILQ_NEXT(r, entries); else if (!xsrc && !pf_match_xport(r->proto, r->proto_variant, &src->xport, sxport)) -#else - else if (PF_MISMATCHAW(&src->addr, saddr, pd->af, - src->neg, kif)) - r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR : - PF_SKIP_DST_ADDR].ptr; - else if (src->port_op && !pf_match_port(src->port_op, - src->port[0], src->port[1], sport)) -#endif r = r->skip[src == &r->src ? 
PF_SKIP_SRC_PORT : PF_SKIP_DST_PORT].ptr; else if (dst != NULL && @@ -3672,14 +3397,8 @@ pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af, 0, NULL)) r = TAILQ_NEXT(r, entries); -#ifndef NO_APPLE_EXTENSIONS else if (dst && !pf_match_xport(r->proto, r->proto_variant, &dst->xport, dxport)) -#else - else if (dst != NULL && dst->port_op && - !pf_match_port(dst->port_op, dst->port[0], - dst->port[1], dport)) -#endif r = r->skip[PF_SKIP_DST_PORT].ptr; else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag)) r = TAILQ_NEXT(r, entries); @@ -3702,7 +3421,7 @@ pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, pf_step_out_of_anchor(&asd, &ruleset, rs_num, &r, NULL, NULL); } - if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) + if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid, NULL)) return (NULL); if (rm != NULL && (rm->action == PF_NONAT || rm->action == PF_NORDR || rm->action == PF_NOBINAT)) @@ -3710,25 +3429,15 @@ pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, return (rm); } -#ifndef NO_APPLE_EXTENSIONS static struct pf_rule * pf_get_translation_aux(struct pf_pdesc *pd, struct mbuf *m, int off, int direction, struct pfi_kif *kif, struct pf_src_node **sn, struct pf_addr *saddr, union pf_state_xport *sxport, struct pf_addr *daddr, union pf_state_xport *dxport, struct pf_addr *naddr, union pf_state_xport *nxport) -#else -struct pf_rule * -pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction, - struct pfi_kif *kif, struct pf_src_node **sn, - struct pf_addr *saddr, u_int16_t sport, - struct pf_addr *daddr, u_int16_t dport, - struct pf_addr *naddr, u_int16_t *nport) -#endif { struct pf_rule *r = NULL; -#ifndef NO_APPLE_EXTENSIONS if (direction == PF_OUT) { r = pf_match_translation(pd, m, off, direction, kif, saddr, sxport, daddr, dxport, PF_RULESET_BINAT); @@ -3745,21 +3454,6 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction, r = pf_match_translation(pd, m, off, direction, kif, saddr, sxport, daddr, dxport, PF_RULESET_BINAT); } -#else - if (direction == PF_OUT) { - r = pf_match_translation(pd, m, off, direction, kif, saddr, - sport, daddr, dport, PF_RULESET_BINAT); - if (r == NULL) - r = pf_match_translation(pd, m, off, direction, kif, - saddr, sport, daddr, dport, PF_RULESET_NAT); - } else { - r = pf_match_translation(pd, m, off, direction, kif, saddr, - sport, daddr, dport, PF_RULESET_RDR); - if (r == NULL) - r = pf_match_translation(pd, m, off, direction, kif, - saddr, sport, daddr, dport, PF_RULESET_BINAT); - } -#endif if (r != NULL) { switch (r->action) { @@ -3768,14 +3462,8 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction, case PF_NORDR: return (NULL); case PF_NAT: -#ifndef NO_APPLE_EXTENSIONS if (pf_get_sport(pd, kif, r, saddr, sxport, daddr, dxport, naddr, nxport, sn)) { -#else - if (pf_get_sport(pd->af, pd->proto, r, saddr, - daddr, dport, naddr, nport, r->rpool.proxy_port[0], - r->rpool.proxy_port[1], sn)) { -#endif DPFPRINTF(PF_DEBUG_MISC, ("pf: NAT proxy port allocation " "(%u-%u) failed\n", @@ -3863,7 +3551,6 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction, } break; case PF_RDR: { -#ifndef NO_APPLE_EXTENSIONS switch (direction) { case PF_OUT: if (r->dst.addr.type == PF_ADDR_DYNIFTL) { @@ -3939,31 +3626,6 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction, } break; } -#else - if (pf_map_addr(pd->af, r, saddr, naddr, 
NULL, sn)) - return (NULL); - if ((r->rpool.opts & PF_POOL_TYPEMASK) == - PF_POOL_BITMASK) - PF_POOLMASK(naddr, naddr, - &r->rpool.cur->addr.v.a.mask, daddr, - pd->af); - - if (r->rpool.proxy_port[1]) { - u_int32_t tmp_nport; - - tmp_nport = ((ntohs(dport) - - ntohs(r->dst.port[0])) % - (r->rpool.proxy_port[1] - - r->rpool.proxy_port[0] + 1)) + - r->rpool.proxy_port[0]; - - /* wrap around if necessary */ - if (tmp_nport > 65535) - tmp_nport -= 65535; - *nport = htons((u_int16_t)tmp_nport); - } else if (r->rpool.proxy_port[0]) - *nport = htons(r->rpool.proxy_port[0]); -#endif break; } default: @@ -4183,7 +3845,7 @@ pf_calc_mss(struct pf_addr *addr, sa_family_t af, u_int16_t offer) case AF_INET: hlen = sizeof (struct ip); bzero(&ro, sizeof (ro)); - dst = (struct sockaddr_in *)&ro.ro_dst; + dst = (struct sockaddr_in *)(void *)&ro.ro_dst; dst->sin_family = AF_INET; dst->sin_len = sizeof (*dst); dst->sin_addr = addr->v4; @@ -4195,7 +3857,7 @@ pf_calc_mss(struct pf_addr *addr, sa_family_t af, u_int16_t offer) case AF_INET6: hlen = sizeof (struct ip6_hdr); bzero(&ro6, sizeof (ro6)); - dst6 = (struct sockaddr_in6 *)&ro6.ro_dst; + dst6 = (struct sockaddr_in6 *)(void *)&ro6.ro_dst; dst6->sin6_family = AF_INET6; dst6->sin6_len = sizeof (*dst6); dst6->sin6_addr = addr->v6; @@ -4274,16 +3936,14 @@ pf_detach_state(struct pf_state *s, int flags) if (!(flags & PF_DT_SKIP_LANEXT)) RB_REMOVE(pf_state_tree_lan_ext, &pf_statetbl_lan_ext, sk); -#ifndef NO_APPLE_EXTENSIONS if (sk->app_state) pool_put(&pf_app_state_pl, sk->app_state); -#endif pool_put(&pf_state_key_pl, sk); } } struct pf_state_key * -pf_alloc_state_key(struct pf_state *s) +pf_alloc_state_key(struct pf_state *s, struct pf_state_key *psk) { struct pf_state_key *sk; @@ -4293,6 +3953,20 @@ pf_alloc_state_key(struct pf_state *s) TAILQ_INIT(&sk->states); pf_attach_state(sk, s, 0); + /* initialize state key from psk, if provided */ + if (psk != NULL) { + bcopy(&psk->lan, &sk->lan, sizeof (sk->lan)); + bcopy(&psk->gwy, &sk->gwy, sizeof (sk->gwy)); + bcopy(&psk->ext, &sk->ext, sizeof (sk->ext)); + sk->af = psk->af; + sk->proto = psk->proto; + sk->direction = psk->direction; + sk->proto_variant = psk->proto_variant; + VERIFY(psk->app_state == NULL); + sk->flowhash = psk->flowhash; + /* don't touch tree entries, states and refcnt on sk */ + } + return (sk); } @@ -4334,9 +4008,6 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, #pragma unused(h) struct pf_rule *nr = NULL; struct pf_addr *saddr = pd->src, *daddr = pd->dst; -#ifdef NO_APPLE_EXTENSIONS - u_int16_t bport, nport = 0; -#endif sa_family_t af = pd->af; struct pf_rule *r, *a = NULL; struct pf_ruleset *ruleset = NULL; @@ -4350,15 +4021,11 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, int match = 0; int state_icmp = 0; u_int16_t mss = tcp_mssdflt; -#ifdef NO_APPLE_EXTENSIONS - u_int16_t sport, dport; -#endif u_int8_t icmptype = 0, icmpcode = 0; -#ifndef NO_APPLE_EXTENSIONS struct pf_grev1_hdr *grev1 = pd->hdr.grev1; union pf_state_xport bxport, nxport, sxport, dxport; -#endif + struct pf_state_key psk; lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); @@ -4367,46 +4034,28 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, return (PF_DROP); } -#ifndef NO_APPLE_EXTENSIONS hdrlen = 0; sxport.spi = 0; dxport.spi = 0; nxport.spi = 0; -#else - sport = dport = hdrlen = 0; -#endif switch (pd->proto) { case IPPROTO_TCP: -#ifndef NO_APPLE_EXTENSIONS sxport.port = th->th_sport; dxport.port = th->th_dport; -#else - sport = th->th_sport; - 
dport = th->th_dport; -#endif hdrlen = sizeof (*th); break; case IPPROTO_UDP: -#ifndef NO_APPLE_EXTENSIONS sxport.port = pd->hdr.udp->uh_sport; dxport.port = pd->hdr.udp->uh_dport; -#else - sport = pd->hdr.udp->uh_sport; - dport = pd->hdr.udp->uh_dport; -#endif hdrlen = sizeof (*pd->hdr.udp); break; #if INET case IPPROTO_ICMP: if (pd->af != AF_INET) break; -#ifndef NO_APPLE_EXTENSIONS sxport.port = dxport.port = pd->hdr.icmp->icmp_id; hdrlen = ICMP_MINLEN; -#else - sport = dport = pd->hdr.icmp->icmp_id; -#endif icmptype = pd->hdr.icmp->icmp_type; icmpcode = pd->hdr.icmp->icmp_code; @@ -4422,11 +4071,7 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, case IPPROTO_ICMPV6: if (pd->af != AF_INET6) break; -#ifndef NO_APPLE_EXTENSIONS sxport.port = dxport.port = pd->hdr.icmp6->icmp6_id; -#else - sport = dport = pd->hdr.icmp6->icmp6_id; -#endif hdrlen = sizeof (*pd->hdr.icmp6); icmptype = pd->hdr.icmp6->icmp6_type; icmpcode = pd->hdr.icmp6->icmp6_code; @@ -4438,7 +4083,6 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, state_icmp++; break; #endif /* INET6 */ -#ifndef NO_APPLE_EXTENSIONS case IPPROTO_GRE: if (pd->proto_variant == PF_GRE_PPTP_VARIANT) { sxport.call_id = dxport.call_id = @@ -4451,80 +4095,55 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, dxport.spi = pd->hdr.esp->spi; hdrlen = sizeof (*pd->hdr.esp); break; -#endif } r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); if (direction == PF_OUT) { -#ifndef NO_APPLE_EXTENSIONS bxport = nxport = sxport; /* check outgoing packet for BINAT/NAT */ if ((nr = pf_get_translation_aux(pd, m, off, PF_OUT, kif, &nsn, saddr, &sxport, daddr, &dxport, &pd->naddr, &nxport)) != NULL) { -#else - bport = nport = sport; - /* check outgoing packet for BINAT/NAT */ - if ((nr = pf_get_translation(pd, m, off, PF_OUT, kif, &nsn, - saddr, sport, daddr, dport, &pd->naddr, &nport)) != NULL) { -#endif PF_ACPY(&pd->baddr, saddr, af); switch (pd->proto) { case IPPROTO_TCP: -#ifndef NO_APPLE_EXTENSIONS pf_change_ap(direction, pd->mp, saddr, &th->th_sport, pd->ip_sum, &th->th_sum, &pd->naddr, nxport.port, 0, af); sxport.port = th->th_sport; -#else - pf_change_ap(saddr, &th->th_sport, pd->ip_sum, - &th->th_sum, &pd->naddr, nport, 0, af); - sport = th->th_sport; -#endif rewrite++; break; case IPPROTO_UDP: -#ifndef NO_APPLE_EXTENSIONS pf_change_ap(direction, pd->mp, saddr, &pd->hdr.udp->uh_sport, pd->ip_sum, &pd->hdr.udp->uh_sum, &pd->naddr, nxport.port, 1, af); sxport.port = pd->hdr.udp->uh_sport; -#else - pf_change_ap(saddr, &pd->hdr.udp->uh_sport, - pd->ip_sum, &pd->hdr.udp->uh_sum, - &pd->naddr, nport, 1, af); - sport = pd->hdr.udp->uh_sport; -#endif rewrite++; break; #if INET case IPPROTO_ICMP: - pf_change_a(&saddr->v4.s_addr, pd->ip_sum, - pd->naddr.v4.s_addr, 0); -#ifndef NO_APPLE_EXTENSIONS - pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( - pd->hdr.icmp->icmp_cksum, sxport.port, - nxport.port, 0); - pd->hdr.icmp->icmp_id = nxport.port; - ++rewrite; -#else - pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( - pd->hdr.icmp->icmp_cksum, sport, nport, 0); - pd->hdr.icmp->icmp_id = nport; - m_copyback(m, off, ICMP_MINLEN, pd->hdr.icmp); -#endif + if (pd->af == AF_INET) { + pf_change_a(&saddr->v4.s_addr, pd->ip_sum, + pd->naddr.v4.s_addr, 0); + pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( + pd->hdr.icmp->icmp_cksum, sxport.port, + nxport.port, 0); + pd->hdr.icmp->icmp_id = nxport.port; + ++rewrite; + } break; #endif /* INET */ #if INET6 case IPPROTO_ICMPV6: - pf_change_a6(saddr, 
&pd->hdr.icmp6->icmp6_cksum, - &pd->naddr, 0); - rewrite++; + if (pd->af == AF_INET6) { + pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum, + &pd->naddr, 0); + rewrite++; + } break; #endif /* INET */ -#ifndef NO_APPLE_EXTENSIONS case IPPROTO_GRE: switch (af) { #if INET @@ -4557,7 +4176,6 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, #endif /* INET6 */ } break; -#endif default: switch (af) { #if INET @@ -4580,63 +4198,45 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, pd->nat_rule = nr; } } else { -#ifndef NO_APPLE_EXTENSIONS bxport.port = nxport.port = dxport.port; /* check incoming packet for BINAT/RDR */ if ((nr = pf_get_translation_aux(pd, m, off, PF_IN, kif, &nsn, saddr, &sxport, daddr, &dxport, &pd->naddr, &nxport)) != NULL) { -#else - bport = nport = dport; - /* check incoming packet for BINAT/RDR */ - if ((nr = pf_get_translation(pd, m, off, PF_IN, kif, &nsn, - saddr, sport, daddr, dport, &pd->naddr, &nport)) != NULL) { -#endif PF_ACPY(&pd->baddr, daddr, af); switch (pd->proto) { case IPPROTO_TCP: -#ifndef NO_APPLE_EXTENSIONS pf_change_ap(direction, pd->mp, daddr, &th->th_dport, pd->ip_sum, &th->th_sum, &pd->naddr, nxport.port, 0, af); dxport.port = th->th_dport; -#else - pf_change_ap(daddr, &th->th_dport, pd->ip_sum, - &th->th_sum, &pd->naddr, nport, 0, af); - dport = th->th_dport; -#endif rewrite++; break; case IPPROTO_UDP: -#ifndef NO_APPLE_EXTENSIONS pf_change_ap(direction, pd->mp, daddr, &pd->hdr.udp->uh_dport, pd->ip_sum, &pd->hdr.udp->uh_sum, &pd->naddr, nxport.port, 1, af); dxport.port = pd->hdr.udp->uh_dport; -#else - pf_change_ap(direction, daddr, - &pd->hdr.udp->uh_dport, - pd->ip_sum, &pd->hdr.udp->uh_sum, - &pd->naddr, nport, 1, af); - dport = pd->hdr.udp->uh_dport; -#endif rewrite++; break; #if INET case IPPROTO_ICMP: - pf_change_a(&daddr->v4.s_addr, pd->ip_sum, - pd->naddr.v4.s_addr, 0); + if (pd->af == AF_INET) { + pf_change_a(&daddr->v4.s_addr, pd->ip_sum, + pd->naddr.v4.s_addr, 0); + } break; #endif /* INET */ #if INET6 case IPPROTO_ICMPV6: - pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum, - &pd->naddr, 0); - rewrite++; + if (pd->af == AF_INET6) { + pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum, + &pd->naddr, 0); + rewrite++; + } break; #endif /* INET6 */ -#ifndef NO_APPLE_EXTENSIONS case IPPROTO_GRE: if (pd->proto_variant == PF_GRE_PPTP_VARIANT) grev1->call_id = nxport.call_id; @@ -4671,7 +4271,6 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, #endif /* INET6 */ } break; -#endif default: switch (af) { #if INET @@ -4695,10 +4294,8 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, } } -#ifndef NO_APPLE_EXTENSIONS if (nr && nr->tag > 0) tag = nr->tag; -#endif while (r != NULL) { r->evaluations++; @@ -4714,33 +4311,23 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, r->src.neg, kif)) r = r->skip[PF_SKIP_SRC_ADDR].ptr; /* tcp/udp only. port_op always 0 in other cases */ -#ifndef NO_APPLE_EXTENSIONS else if (r->proto == pd->proto && (r->proto == IPPROTO_TCP || r->proto == IPPROTO_UDP) && r->src.xport.range.op && !pf_match_port(r->src.xport.range.op, r->src.xport.range.port[0], r->src.xport.range.port[1], th->th_sport)) -#else - else if (r->src.port_op && !pf_match_port(r->src.port_op, - r->src.port[0], r->src.port[1], th->th_sport)) -#endif r = r->skip[PF_SKIP_SRC_PORT].ptr; else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, r->dst.neg, NULL)) r = r->skip[PF_SKIP_DST_ADDR].ptr; /* tcp/udp only. 
port_op always 0 in other cases */ -#ifndef NO_APPLE_EXTENSIONS else if (r->proto == pd->proto && (r->proto == IPPROTO_TCP || r->proto == IPPROTO_UDP) && r->dst.xport.range.op && !pf_match_port(r->dst.xport.range.op, r->dst.xport.range.port[0], r->dst.xport.range.port[1], th->th_dport)) -#else - else if (r->dst.port_op && !pf_match_port(r->dst.port_op, - r->dst.port[0], r->dst.port[1], th->th_dport)) -#endif r = r->skip[PF_SKIP_DST_PORT].ptr; /* icmp only. type always 0 in other cases */ else if (r->type && r->type != icmptype + 1) @@ -4748,7 +4335,14 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, /* icmp only. type always 0 in other cases */ else if (r->code && r->code != icmpcode + 1) r = TAILQ_NEXT(r, entries); - else if (r->tos && !(r->tos == pd->tos)) + else if ((r->rule_flag & PFRULE_TOS) && r->tos && + !(r->tos & pd->tos)) + r = TAILQ_NEXT(r, entries); + else if ((r->rule_flag & PFRULE_DSCP) && r->tos && + !(r->tos & (pd->tos & DSCP_MASK))) + r = TAILQ_NEXT(r, entries); + else if ((r->rule_flag & PFRULE_SC) && r->tos && + ((r->tos & SCIDX_MASK) != pd->sc)) r = TAILQ_NEXT(r, entries); else if (r->rule_flag & PFRULE_FRAGMENT) r = TAILQ_NEXT(r, entries); @@ -4804,7 +4398,6 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, REASON_SET(&reason, PFRES_MATCH); if (r->log || (nr != NULL && nr->log)) { -#ifndef NO_APPLE_EXTENSIONS if (rewrite > 0) { if (rewrite < off + hdrlen) rewrite = off + hdrlen; @@ -4817,10 +4410,6 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, m_copyback(m, off, hdrlen, pd->hdr.any); } -#else - if (rewrite) - m_copyback(m, off, hdrlen, pd->hdr.any); -#endif PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? r : nr, a, ruleset, pd); } @@ -4834,34 +4423,19 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, if (direction == PF_OUT) { switch (pd->proto) { case IPPROTO_TCP: -#ifndef NO_APPLE_EXTENSIONS pf_change_ap(direction, pd->mp, saddr, &th->th_sport, pd->ip_sum, &th->th_sum, &pd->baddr, bxport.port, 0, af); sxport.port = th->th_sport; -#else - pf_change_ap(saddr, &th->th_sport, - pd->ip_sum, &th->th_sum, - &pd->baddr, bport, 0, af); - sport = th->th_sport; -#endif rewrite++; break; case IPPROTO_UDP: -#ifndef NO_APPLE_EXTENSIONS pf_change_ap(direction, pd->mp, saddr, &pd->hdr.udp->uh_sport, pd->ip_sum, &pd->hdr.udp->uh_sum, &pd->baddr, bxport.port, 1, af); sxport.port = pd->hdr.udp->uh_sport; -#else - pf_change_ap(saddr, - &pd->hdr.udp->uh_sport, pd->ip_sum, - &pd->hdr.udp->uh_sum, &pd->baddr, - bport, 1, af); - sport = pd->hdr.udp->uh_sport; -#endif rewrite++; break; case IPPROTO_ICMP: @@ -4870,7 +4444,6 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, #endif /* nothing! 
*/ break; -#ifndef NO_APPLE_EXTENSIONS case IPPROTO_GRE: PF_ACPY(&pd->baddr, saddr, af); ++rewrite; @@ -4908,7 +4481,6 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, #endif /* INET6 */ } break; -#endif default: switch (af) { case AF_INET: @@ -4924,34 +4496,19 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, } else { switch (pd->proto) { case IPPROTO_TCP: -#ifndef NO_APPLE_EXTENSIONS pf_change_ap(direction, pd->mp, daddr, &th->th_dport, pd->ip_sum, &th->th_sum, &pd->baddr, bxport.port, 0, af); dxport.port = th->th_dport; -#else - pf_change_ap(daddr, &th->th_dport, - pd->ip_sum, &th->th_sum, - &pd->baddr, bport, 0, af); - dport = th->th_dport; -#endif rewrite++; break; case IPPROTO_UDP: -#ifndef NO_APPLE_EXTENSIONS pf_change_ap(direction, pd->mp, daddr, &pd->hdr.udp->uh_dport, pd->ip_sum, &pd->hdr.udp->uh_sum, &pd->baddr, bxport.port, 1, af); dxport.port = pd->hdr.udp->uh_dport; -#else - pf_change_ap(daddr, - &pd->hdr.udp->uh_dport, pd->ip_sum, - &pd->hdr.udp->uh_sum, &pd->baddr, - bport, 1, af); - dport = pd->hdr.udp->uh_dport; -#endif rewrite++; break; case IPPROTO_ICMP: @@ -4960,7 +4517,6 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, #endif /* nothing! */ break; -#ifndef NO_APPLE_EXTENSIONS case IPPROTO_GRE: if (pd->proto_variant == PF_GRE_PPTP_VARIANT) @@ -4999,7 +4555,6 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, #endif /* INET6 */ } break; -#endif default: switch (af) { case AF_INET: @@ -5054,16 +4609,12 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, r->return_ttl, 1, 0, pd->eh, kif->pfik_ifp); } } else if (pd->proto != IPPROTO_ICMP && af == AF_INET && -#ifndef NO_APPLE_EXTENSIONS pd->proto != IPPROTO_ESP && pd->proto != IPPROTO_AH && -#endif r->return_icmp) pf_send_icmp(m, r->return_icmp >> 8, r->return_icmp & 255, af, r); else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 && -#ifndef NO_APPLE_EXTENSIONS pd->proto != IPPROTO_ESP && pd->proto != IPPROTO_AH && -#endif r->return_icmp6) pf_send_icmp(m, r->return_icmp6 >> 8, r->return_icmp6 & 255, af, r); @@ -5072,7 +4623,91 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, if (r->action == PF_DROP) return (PF_DROP); - if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) { + /* prepare state key, for flowhash and/or the state (if created) */ + bzero(&psk, sizeof (psk)); + psk.proto = pd->proto; + psk.direction = direction; + psk.af = af; + if (pd->proto == IPPROTO_UDP) { + if (ntohs(pd->hdr.udp->uh_sport) == PF_IKE_PORT && + ntohs(pd->hdr.udp->uh_dport) == PF_IKE_PORT) { + psk.proto_variant = PF_EXTFILTER_APD; + } else { + psk.proto_variant = nr ? 
nr->extfilter : r->extfilter; + if (psk.proto_variant < PF_EXTFILTER_APD) + psk.proto_variant = PF_EXTFILTER_APD; + } + } else if (pd->proto == IPPROTO_GRE) { + psk.proto_variant = pd->proto_variant; + } + if (direction == PF_OUT) { + PF_ACPY(&psk.gwy.addr, saddr, af); + PF_ACPY(&psk.ext.addr, daddr, af); + switch (pd->proto) { + case IPPROTO_UDP: + psk.gwy.xport = sxport; + psk.ext.xport = dxport; + break; + case IPPROTO_ESP: + psk.gwy.xport.spi = 0; + psk.ext.xport.spi = pd->hdr.esp->spi; + break; + case IPPROTO_ICMP: +#if INET6 + case IPPROTO_ICMPV6: +#endif + psk.gwy.xport.port = nxport.port; + psk.ext.xport.spi = 0; + break; + default: + psk.gwy.xport = sxport; + psk.ext.xport = dxport; + break; + } + if (nr != NULL) { + PF_ACPY(&psk.lan.addr, &pd->baddr, af); + psk.lan.xport = bxport; + } else { + PF_ACPY(&psk.lan.addr, &psk.gwy.addr, af); + psk.lan.xport = psk.gwy.xport; + } + } else { + PF_ACPY(&psk.lan.addr, daddr, af); + PF_ACPY(&psk.ext.addr, saddr, af); + switch (pd->proto) { + case IPPROTO_ICMP: +#if INET6 + case IPPROTO_ICMPV6: +#endif + psk.lan.xport = nxport; + psk.ext.xport.spi = 0; + break; + case IPPROTO_ESP: + psk.ext.xport.spi = 0; + psk.lan.xport.spi = pd->hdr.esp->spi; + break; + default: + psk.lan.xport = dxport; + psk.ext.xport = sxport; + break; + } + if (nr != NULL) { + PF_ACPY(&psk.gwy.addr, &pd->baddr, af); + psk.gwy.xport = bxport; + } else { + PF_ACPY(&psk.gwy.addr, &psk.lan.addr, af); + psk.gwy.xport = psk.lan.xport; + } + } + if (pd->flowhash != 0) { + /* flowhash was already computed by upper layers */ + psk.flowhash = pd->flowhash; + } else { + psk.flowhash = pf_calc_state_key_flowhash(&psk); + pd->flowhash = psk.flowhash; + } + + if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid, pd)) { REASON_SET(&reason, PFRES_MEMORY); return (PF_DROP); } @@ -5083,7 +4718,6 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, struct pf_state *s = NULL; struct pf_state_key *sk = NULL; struct pf_src_node *sn = NULL; -#ifndef NO_APPLE_EXTENSIONS struct pf_ike_hdr ike; if (pd->proto == IPPROTO_UDP) { @@ -5124,7 +4758,6 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, goto cleanup; } } -#endif /* check maximums */ if (r->max_states && (r->states >= r->max_states)) { @@ -5142,9 +4775,7 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, /* src node for translation rule */ if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && ((direction == PF_OUT && -#ifndef NO_APPLE_EXTENSIONS nr->action != PF_RDR && -#endif pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) || (pf_insert_src_node(&nsn, nr, saddr, af) != 0))) { REASON_SET(&reason, PFRES_SRCLIMIT); @@ -5168,19 +4799,15 @@ cleanup: pool_put(&pf_src_tree_pl, nsn); } if (sk != NULL) { -#ifndef NO_APPLE_EXTENSIONS if (sk->app_state) pool_put(&pf_app_state_pl, sk->app_state); -#endif pool_put(&pf_state_key_pl, sk); } return (PF_DROP); } bzero(s, sizeof (*s)); -#ifndef NO_APPLE_EXTENSIONS TAILQ_INIT(&s->unlink_hooks); -#endif s->rule.ptr = r; s->nat_rule.ptr = nr; s->anchor.ptr = a; @@ -5236,7 +4863,6 @@ cleanup: #endif s->timeout = PFTM_ICMP_FIRST_PACKET; break; -#ifndef NO_APPLE_EXTENSIONS case IPPROTO_GRE: s->src.state = PFGRE1S_INITIATING; s->dst.state = PFGRE1S_NO_TRAFFIC; @@ -5247,7 +4873,6 @@ cleanup: s->dst.state = PFESPS_NO_TRAFFIC; s->timeout = PFTM_ESP_FIRST_PACKET; break; -#endif default: s->src.state = PFOTHERS_SINGLE; s->dst.state = PFOTHERS_NO_TRAFFIC; @@ -5293,136 +4918,14 @@ cleanup: } } - if ((sk = pf_alloc_state_key(s)) == NULL) { + /* 
allocate state key and import values from psk */ + if ((sk = pf_alloc_state_key(s, &psk)) == NULL) { REASON_SET(&reason, PFRES_MEMORY); goto cleanup; } - sk->proto = pd->proto; - sk->direction = direction; - sk->af = af; -#ifndef NO_APPLE_EXTENSIONS - if (pd->proto == IPPROTO_UDP) { - if (ntohs(pd->hdr.udp->uh_sport) == PF_IKE_PORT && - ntohs(pd->hdr.udp->uh_dport) == PF_IKE_PORT) { - sk->proto_variant = PF_EXTFILTER_APD; - } else { - sk->proto_variant = nr ? nr->extfilter : - r->extfilter; - if (sk->proto_variant < PF_EXTFILTER_APD) - sk->proto_variant = PF_EXTFILTER_APD; - } - } else if (pd->proto == IPPROTO_GRE) { - sk->proto_variant = pd->proto_variant; - } -#endif - if (direction == PF_OUT) { - PF_ACPY(&sk->gwy.addr, saddr, af); - PF_ACPY(&sk->ext.addr, daddr, af); - switch (pd->proto) { -#ifndef NO_APPLE_EXTENSIONS - case IPPROTO_UDP: - sk->gwy.xport = sxport; - sk->ext.xport = dxport; - break; - case IPPROTO_ESP: - sk->gwy.xport.spi = 0; - sk->ext.xport.spi = pd->hdr.esp->spi; - break; -#endif - case IPPROTO_ICMP: -#if INET6 - case IPPROTO_ICMPV6: -#endif -#ifndef NO_APPLE_EXTENSIONS - sk->gwy.xport.port = nxport.port; - sk->ext.xport.spi = 0; -#else - sk->gwy.port = nport; - sk->ext.port = 0; -#endif - break; - default: -#ifndef NO_APPLE_EXTENSIONS - sk->gwy.xport = sxport; - sk->ext.xport = dxport; - break; -#else - sk->gwy.port = sport; - sk->ext.port = dport; -#endif - } -#ifndef NO_APPLE_EXTENSIONS - if (nr != NULL) { - PF_ACPY(&sk->lan.addr, &pd->baddr, af); - sk->lan.xport = bxport; - } else { - PF_ACPY(&sk->lan.addr, &sk->gwy.addr, af); - sk->lan.xport = sk->gwy.xport; - } -#else - if (nr != NULL) { - PF_ACPY(&sk->lan.addr, &pd->baddr, af); - sk->lan.port = bport; - } else { - PF_ACPY(&sk->lan.addr, &sk->gwy.addr, af); - sk->lan.port = sk->gwy.port; - } -#endif - } else { - PF_ACPY(&sk->lan.addr, daddr, af); - PF_ACPY(&sk->ext.addr, saddr, af); - switch (pd->proto) { - case IPPROTO_ICMP: -#if INET6 - case IPPROTO_ICMPV6: -#endif -#ifndef NO_APPLE_EXTENSIONS - sk->lan.xport = nxport; - sk->ext.xport.spi = 0; -#else - sk->lan.port = nport; - sk->ext.port = 0; -#endif - break; -#ifndef NO_APPLE_EXTENSIONS - case IPPROTO_ESP: - sk->ext.xport.spi = 0; - sk->lan.xport.spi = pd->hdr.esp->spi; - break; - default: - sk->lan.xport = dxport; - sk->ext.xport = sxport; - break; -#else - default: - sk->lan.port = dport; - sk->ext.port = sport; -#endif - } -#ifndef NO_APPLE_EXTENSIONS - if (nr != NULL) { - PF_ACPY(&sk->gwy.addr, &pd->baddr, af); - sk->gwy.xport = bxport; - } else { - PF_ACPY(&sk->gwy.addr, &sk->lan.addr, af); - sk->gwy.xport = sk->lan.xport; - } - } -#else - if (nr != NULL) { - PF_ACPY(&sk->gwy.addr, &pd->baddr, af); - sk->gwy.port = bport; - } else { - PF_ACPY(&sk->gwy.addr, &sk->lan.addr, af); - sk->gwy.port = sk->lan.port; - } - } -#endif - pf_set_rt_ifp(s, saddr); /* needs s->state_key set */ -#ifndef NO_APPLE_EXTENSIONS m = pd->mp; if (sk->app_state == 0) { @@ -5484,7 +4987,6 @@ cleanup: break; } } -#endif if (pf_insert_state(BOUND_IFACE(r, kif), s)) { if (pd->proto == IPPROTO_TCP) @@ -5505,7 +5007,6 @@ cleanup: r->keep_state == PF_STATE_SYNPROXY) { s->src.state = PF_TCPS_PROXY_SRC; if (nr != NULL) { -#ifndef NO_APPLE_EXTENSIONS if (direction == PF_OUT) { pf_change_ap(direction, pd->mp, saddr, &th->th_sport, pd->ip_sum, @@ -5519,19 +5020,6 @@ cleanup: bxport.port, 0, af); sxport.port = th->th_dport; } -#else - if (direction == PF_OUT) { - pf_change_ap(saddr, &th->th_sport, - pd->ip_sum, &th->th_sum, &pd->baddr, - bport, 0, af); - sport = th->th_sport; - } else { - 
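/*
 * (Illustrative sketch, not part of the patch.)  The hunks above build a
 * state-key template (psk), derive a flowhash from it at most once, and
 * cache the value in the packet descriptor so later lookups reuse it;
 * pf_alloc_state_key() then imports the template into the real key.  The
 * standalone program below mimics the compute-once/cache pattern with a
 * toy 32-bit mixer; flow_key and flow_hash() are hypothetical stand-ins,
 * not the kernel's pf_calc_state_key_flowhash().
 */
#include <stdint.h>
#include <stdio.h>

struct flow_key {
	uint32_t src, dst;	/* IPv4 addresses, for brevity */
	uint16_t sport, dport;
	uint8_t proto;
};

static uint32_t
flow_hash(const struct flow_key *k, uint32_t seed)
{
	uint32_t h = seed;

	h ^= k->src;	h *= 0x9e3779b1u;
	h ^= k->dst;	h *= 0x9e3779b1u;
	h ^= ((uint32_t)k->sport << 16) | k->dport;
	h *= 0x9e3779b1u;
	h ^= k->proto;
	return (h ? h : 1);	/* 0 is reserved to mean "not computed" */
}

int
main(void)
{
	struct flow_key k = { 0x0a000001, 0x0a000002, 12345, 80, 6 };
	uint32_t cached = 0;	/* analogous to pd->flowhash */

	if (cached == 0)	/* compute once, then reuse */
		cached = flow_hash(&k, 0x5f375a86u);
	printf("flowhash = 0x%08x\n", cached);
	return (0);
}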
pf_change_ap(daddr, &th->th_dport, - pd->ip_sum, &th->th_sum, &pd->baddr, - bport, 0, af); - sport = th->th_dport; - } -#endif } s->src.seqhi = htonl(random()); /* Find mss option */ @@ -5546,7 +5034,6 @@ cleanup: return (PF_SYNPROXY_DROP); } -#ifndef NO_APPLE_EXTENSIONS if (sk->app_state && sk->app_state->handler) { int offx = off; @@ -5572,11 +5059,9 @@ cleanup: m = pd->mp; } } -#endif } /* copy back packet headers if we performed NAT operations */ -#ifndef NO_APPLE_EXTENSIONS if (rewrite) { if (rewrite < off + hdrlen) rewrite = off + hdrlen; @@ -5589,14 +5074,282 @@ cleanup: m_copyback(m, off, hdrlen, pd->hdr.any); } -#else - if (rewrite) - m_copyback(m, off, hdrlen, pd->hdr.any); -#endif return (PF_PASS); } +#if DUMMYNET +/* + * When pf_test_dummynet() returns PF_PASS, the rule matching parameter "rm" + * remains unchanged, meaning the packet did not match a dummynet rule. + * when the packet does match a dummynet rule, pf_test_dummynet() returns + * PF_PASS and zero out the mbuf rule as the packet is effectively siphoned + * out by dummynet. + */ +static int +pf_test_dummynet(struct pf_rule **rm, int direction, struct pfi_kif *kif, + struct mbuf **m0, struct pf_pdesc *pd, struct ip_fw_args *fwa) +{ + struct mbuf *m = *m0; + struct pf_rule *am = NULL; + struct pf_ruleset *rsm = NULL; + struct pf_addr *saddr = pd->src, *daddr = pd->dst; + sa_family_t af = pd->af; + struct pf_rule *r, *a = NULL; + struct pf_ruleset *ruleset = NULL; + struct tcphdr *th = pd->hdr.tcp; + u_short reason; + int hdrlen = 0; + int tag = -1; + unsigned int rtableid = IFSCOPE_NONE; + int asd = 0; + int match = 0; + u_int8_t icmptype = 0, icmpcode = 0; + union pf_state_xport nxport, sxport, dxport; + struct ip_fw_args dnflow; + struct pf_rule *prev_matching_rule = fwa ? fwa->fwa_pf_rule : NULL; + int found_prev_rule = (prev_matching_rule) ? 
0 : 1; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if (!DUMMYNET_LOADED) + return (PF_PASS); + + if (TAILQ_EMPTY(pf_main_ruleset.rules[PF_RULESET_DUMMYNET].active.ptr)) { + return (PF_PASS); + } + bzero(&dnflow, sizeof(dnflow)); + + hdrlen = 0; + sxport.spi = 0; + dxport.spi = 0; + nxport.spi = 0; + + /* Fragments don't gave protocol headers */ + if (!(pd->flags & PFDESC_IP_FRAG)) + switch (pd->proto) { + case IPPROTO_TCP: + dnflow.fwa_id.flags = pd->hdr.tcp->th_flags; + dnflow.fwa_id.dst_port = pd->hdr.tcp->th_dport; + dnflow.fwa_id.src_port = pd->hdr.tcp->th_sport; + sxport.port = pd->hdr.tcp->th_sport; + dxport.port = pd->hdr.tcp->th_dport; + hdrlen = sizeof (*th); + break; + case IPPROTO_UDP: + dnflow.fwa_id.dst_port = pd->hdr.udp->uh_dport; + dnflow.fwa_id.src_port = pd->hdr.udp->uh_sport; + sxport.port = pd->hdr.udp->uh_sport; + dxport.port = pd->hdr.udp->uh_dport; + hdrlen = sizeof (*pd->hdr.udp); + break; +#if INET + case IPPROTO_ICMP: + if (pd->af != AF_INET) + break; + sxport.port = dxport.port = pd->hdr.icmp->icmp_id; + hdrlen = ICMP_MINLEN; + icmptype = pd->hdr.icmp->icmp_type; + icmpcode = pd->hdr.icmp->icmp_code; + break; +#endif /* INET */ +#if INET6 + case IPPROTO_ICMPV6: + if (pd->af != AF_INET6) + break; + sxport.port = dxport.port = pd->hdr.icmp6->icmp6_id; + hdrlen = sizeof (*pd->hdr.icmp6); + icmptype = pd->hdr.icmp6->icmp6_type; + icmpcode = pd->hdr.icmp6->icmp6_code; + break; +#endif /* INET6 */ + case IPPROTO_GRE: + if (pd->proto_variant == PF_GRE_PPTP_VARIANT) { + sxport.call_id = dxport.call_id = + pd->hdr.grev1->call_id; + hdrlen = sizeof (*pd->hdr.grev1); + } + break; + case IPPROTO_ESP: + sxport.spi = 0; + dxport.spi = pd->hdr.esp->spi; + hdrlen = sizeof (*pd->hdr.esp); + break; + } + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_DUMMYNET].active.ptr); + + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != direction) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != pd->proto) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, saddr, af, + r->src.neg, kif)) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + /* tcp/udp only. port_op always 0 in other cases */ + else if (r->proto == pd->proto && + (r->proto == IPPROTO_TCP || r->proto == IPPROTO_UDP) && + ((pd->flags & PFDESC_IP_FRAG) || + ((r->src.xport.range.op && + !pf_match_port(r->src.xport.range.op, + r->src.xport.range.port[0], r->src.xport.range.port[1], + th->th_sport))))) + r = r->skip[PF_SKIP_SRC_PORT].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, + r->dst.neg, NULL)) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + /* tcp/udp only. port_op always 0 in other cases */ + else if (r->proto == pd->proto && + (r->proto == IPPROTO_TCP || r->proto == IPPROTO_UDP) && + r->dst.xport.range.op && + ((pd->flags & PFDESC_IP_FRAG) || + !pf_match_port(r->dst.xport.range.op, + r->dst.xport.range.port[0], r->dst.xport.range.port[1], + th->th_dport))) + r = r->skip[PF_SKIP_DST_PORT].ptr; + /* icmp only. type always 0 in other cases */ + else if (r->type && + ((pd->flags & PFDESC_IP_FRAG) || + r->type != icmptype + 1)) + r = TAILQ_NEXT(r, entries); + /* icmp only. 
type always 0 in other cases */ + else if (r->code && + ((pd->flags & PFDESC_IP_FRAG) || + r->code != icmpcode + 1)) + r = TAILQ_NEXT(r, entries); + else if (r->tos && !(r->tos == pd->tos)) + r = TAILQ_NEXT(r, entries); + else if (r->rule_flag & PFRULE_FRAGMENT) + r = TAILQ_NEXT(r, entries); + else if (pd->proto == IPPROTO_TCP && + ((pd->flags & PFDESC_IP_FRAG) || + (r->flagset & th->th_flags) != r->flags)) + r = TAILQ_NEXT(r, entries); + else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag)) + r = TAILQ_NEXT(r, entries); + else { + /* + * Need to go past the previous dummynet matching rule + */ + if (r->anchor == NULL) { + if (found_prev_rule) { + if (r->tag) + tag = r->tag; + if (PF_RTABLEID_IS_VALID(r->rtableid)) + rtableid = r->rtableid; + match = 1; + *rm = r; + am = a; + rsm = ruleset; + if ((*rm)->quick) + break; + } else if (r == prev_matching_rule) { + found_prev_rule = 1; + } + r = TAILQ_NEXT(r, entries); + } else { + pf_step_into_anchor(&asd, &ruleset, + PF_RULESET_DUMMYNET, &r, &a, &match); + } + } + if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset, + PF_RULESET_DUMMYNET, &r, &a, &match)) + break; + } + r = *rm; + a = am; + ruleset = rsm; + + if (!match) + return (PF_PASS); + + REASON_SET(&reason, PFRES_DUMMYNET); + + if (r->log) { + PFLOG_PACKET(kif, h, m, af, direction, reason, r, + a, ruleset, pd); + } + + if (r->action == PF_NODUMMYNET) { + int dirndx = (direction == PF_OUT); + + r->packets[dirndx]++; + r->bytes[dirndx] += pd->tot_len; + + return (PF_PASS); + } + if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid, pd)) { + REASON_SET(&reason, PFRES_MEMORY); + + return (PF_DROP); + } + + if (r->dnpipe && ip_dn_io_ptr != NULL) { + int dirndx = (direction == PF_OUT); + + r->packets[dirndx]++; + r->bytes[dirndx] += pd->tot_len; + + dnflow.fwa_cookie = r->dnpipe; + dnflow.fwa_pf_rule = r; + dnflow.fwa_id.addr_type = (af == AF_INET) ? 4 : 6; + dnflow.fwa_id.proto = pd->proto; + dnflow.fwa_flags = r->dntype; + + if (fwa != NULL) { + dnflow.fwa_oif = fwa->fwa_oif; + dnflow.fwa_oflags = fwa->fwa_oflags; + /* + * Note that fwa_ro, fwa_dst and fwa_ipoa are + * actually in a union so the following does work + * for both IPv4 and IPv6 + */ + dnflow.fwa_ro = fwa->fwa_ro; + dnflow.fwa_dst = fwa->fwa_dst; + dnflow.fwa_ipoa = fwa->fwa_ipoa; + dnflow.fwa_ro6_pmtu = fwa->fwa_ro6_pmtu; + dnflow.fwa_origifp = fwa->fwa_origifp; + dnflow.fwa_mtu = fwa->fwa_mtu; + dnflow.fwa_alwaysfrag = fwa->fwa_alwaysfrag; + dnflow.fwa_unfragpartlen = fwa->fwa_unfragpartlen; + dnflow.fwa_exthdrs = fwa->fwa_exthdrs; + } + + if (af == AF_INET) { + struct ip *iphdr = mtod(m, struct ip *); + NTOHS(iphdr->ip_len); + NTOHS(iphdr->ip_off); + } + /* + * Don't need to unlock pf_lock as NET_THREAD_HELD_PF + * allows for recursive behavior + */ + ip_dn_io_ptr(m, + dnflow.fwa_cookie, + af == AF_INET ? + direction == PF_IN ? DN_TO_IP_IN : DN_TO_IP_OUT : + direction == PF_IN ? DN_TO_IP6_IN : DN_TO_IP6_OUT, + &dnflow, DN_CLIENT_PF); + + /* + * The packet is siphoned out by dummynet so return a NULL + * mbuf so the caller can still return success. 
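+ * Callers must treat PF_PASS with a NULL mbuf as "packet consumed":
+ * pf_test(), for instance, checks for a NULL mbuf right after each
+ * call to pf_test_dummynet() and returns without touching the packet
+ * again.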
+ */ + *m0 = NULL; + + return (PF_PASS); + } + + return (PF_PASS); +} +#endif /* DUMMYNET */ + static int pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif, struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am, @@ -5628,11 +5381,17 @@ pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif, else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, r->dst.neg, NULL)) r = r->skip[PF_SKIP_DST_ADDR].ptr; - else if (r->tos && !(r->tos == pd->tos)) + else if ((r->rule_flag & PFRULE_TOS) && r->tos && + !(r->tos & pd->tos)) + r = TAILQ_NEXT(r, entries); + else if ((r->rule_flag & PFRULE_DSCP) && r->tos && + !(r->tos & (pd->tos & DSCP_MASK))) + r = TAILQ_NEXT(r, entries); + else if ((r->rule_flag & PFRULE_SC) && r->tos && + ((r->tos & SCIDX_MASK) != pd->sc)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY) r = TAILQ_NEXT(r, entries); -#ifndef NO_APPLE_EXTENSIONS else if (pd->proto == IPPROTO_UDP && (r->src.xport.range.op || r->dst.xport.range.op)) r = TAILQ_NEXT(r, entries); @@ -5640,14 +5399,6 @@ pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif, (r->src.xport.range.op || r->dst.xport.range.op || r->flagset)) r = TAILQ_NEXT(r, entries); -#else - else if (pd->proto == IPPROTO_UDP && - (r->src.port_op || r->dst.port_op)) - r = TAILQ_NEXT(r, entries); - else if (pd->proto == IPPROTO_TCP && - (r->src.port_op || r->dst.port_op || r->flagset)) - r = TAILQ_NEXT(r, entries); -#endif else if ((pd->proto == IPPROTO_ICMP || pd->proto == IPPROTO_ICMPV6) && (r->type || r->code)) @@ -5686,7 +5437,7 @@ pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif, if (r->action != PF_PASS) return (PF_DROP); - if (pf_tag_packet(m, pd->pf_mtag, tag, -1)) { + if (pf_tag_packet(m, pd->pf_mtag, tag, -1, NULL)) { REASON_SET(&reason, PFRES_MEMORY); return (PF_DROP); } @@ -5694,7 +5445,6 @@ pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif, return (PF_PASS); } -#ifndef NO_APPLE_EXTENSIONS static void pf_pptp_handler(struct pf_state *s, int direction, int off, struct pf_pdesc *pd, struct pfi_kif *kif) @@ -5763,7 +5513,7 @@ pf_pptp_handler(struct pf_state *s, int direction, int off, return; } - gsk = pf_alloc_state_key(gs); + gsk = pf_alloc_state_key(gs, NULL); if (!gsk) { pool_put(&pf_app_state_pl, gas); pool_put(&pf_state_pl, gs); @@ -5780,6 +5530,7 @@ pf_pptp_handler(struct pf_state *s, int direction, int off, gsk->lan.xport.call_id = 0; gsk->gwy.xport.call_id = 0; gsk->ext.xport.call_id = 0; + gsk->flowhash = pf_calc_state_key_flowhash(gsk); memset(gas, 0, sizeof (*gas)); gas->u.grev1.pptp_state = s; STATE_INC_COUNTERS(gs); @@ -6038,7 +5789,6 @@ pf_ike_compare(struct pf_app_state *a, struct pf_app_state *b) int64_t d = a->u.ike.cookie - b->u.ike.cookie; return ((d > 0) ? 1 : ((d < 0) ? 
-1 : 0)); } -#endif static int pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, @@ -6055,31 +5805,19 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, int copyback = 0; struct pf_state_peer *src, *dst; -#ifndef NO_APPLE_EXTENSIONS key.app_state = 0; -#endif key.af = pd->af; key.proto = IPPROTO_TCP; if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd->src, key.af); PF_ACPY(&key.gwy.addr, pd->dst, key.af); -#ifndef NO_APPLE_EXTENSIONS key.ext.xport.port = th->th_sport; key.gwy.xport.port = th->th_dport; -#else - key.ext.port = th->th_sport; - key.gwy.port = th->th_dport; -#endif } else { PF_ACPY(&key.lan.addr, pd->src, key.af); PF_ACPY(&key.ext.addr, pd->dst, key.af); -#ifndef NO_APPLE_EXTENSIONS key.lan.xport.port = th->th_sport; key.ext.xport.port = th->th_dport; -#else - key.lan.port = th->th_sport; - key.ext.port = th->th_dport; -#endif } STATE_LOOKUP(); @@ -6142,11 +5880,7 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, if ((*state)->dst.seqhi == 1) (*state)->dst.seqhi = htonl(random()); pf_send_tcp((*state)->rule.ptr, pd->af, &psrc->addr, -#ifndef NO_APPLE_EXTENSIONS &pdst->addr, psrc->xport.port, pdst->xport.port, -#else - &pdst->addr, psrc->port, pdst->port, -#endif (*state)->dst.seqhi, 0, TH_SYN, 0, (*state)->src.mss, 0, 0, (*state)->tag, NULL, NULL); REASON_SET(reason, PFRES_SYNPROXY); @@ -6165,11 +5899,7 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, TH_ACK, (*state)->src.max_win, 0, 0, 0, (*state)->tag, NULL, NULL); pf_send_tcp((*state)->rule.ptr, pd->af, &psrc->addr, -#ifndef NO_APPLE_EXTENSIONS &pdst->addr, psrc->xport.port, pdst->xport.port, -#else - &pdst->addr, psrc->port, pdst->port, -#endif (*state)->src.seqhi + 1, (*state)->src.seqlo + 1, TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL, NULL); @@ -6259,10 +5989,7 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, >> sws; dws = dst->wscale & PF_WSCALE_MASK; } else { -#ifndef NO_APPLE_MODIFICATION /* - * - * * Window scale negotiation has failed, * therefore we must restore the window * scale in the state record that we @@ -6270,17 +5997,12 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, * pf_test_rule(). Care is required to * prevent arithmetic overflow from * zeroing the window when it's - * truncated down to 16-bits. --jhw + * truncated down to 16-bits. */ u_int32_t max_win = dst->max_win; max_win <<= dst->wscale & PF_WSCALE_MASK; dst->max_win = MIN(0xffff, max_win); -#else - /* fixup other window */ - dst->max_win <<= dst->wscale & - PF_WSCALE_MASK; -#endif /* in case of a retrans SYN|ACK */ dst->wscale = 0; } @@ -6298,16 +6020,10 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, * the crappy stack check or if we picked up the connection * after establishment) */ -#ifndef NO_APPLE_MODIFICATIONS if (src->seqhi == 1 || SEQ_GEQ(end + MAX(1, (u_int32_t)dst->max_win << dws), src->seqhi)) src->seqhi = end + MAX(1, (u_int32_t)dst->max_win << dws); -#else - if (src->seqhi == 1 || - SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) - src->seqhi = end + MAX(1, dst->max_win << dws); -#endif if (win > src->max_win) src->max_win = win; @@ -6363,7 +6079,6 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, * options anyway. 
*/ if (dst->seqdiff && (th->th_off << 2) > (int)sizeof (struct tcphdr)) { -#ifndef NO_APPLE_EXTENSIONS copyback = pf_modulate_sack(m, off, pd, th, dst); if (copyback == -1) { REASON_SET(reason, PFRES_MEMORY); @@ -6371,21 +6086,13 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, } m = pd->mp; -#else - if (pf_modulate_sack(m, off, pd, th, dst)) - copyback = 1; -#endif } #define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */ if (SEQ_GEQ(src->seqhi, end) && /* Last octet inside other's window space */ -#ifndef NO_APPLE_MODIFICATIONS SEQ_GEQ(seq, src->seqlo - ((u_int32_t)dst->max_win << dws)) && -#else - SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) && -#endif /* Retrans: not more than one window back */ (ackskew >= -MAXACKWINDOW) && /* Acking not more than one reassembled fragment backwards */ @@ -6401,9 +6108,7 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, *state, src, dst, ©back)) return (PF_DROP); -#ifndef NO_APPLE_EXTENSIONS m = pd->mp; -#endif } /* update max window */ @@ -6413,13 +6118,8 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, if (SEQ_GT(end, src->seqlo)) src->seqlo = end; /* slide the window of what the other end can send */ -#ifndef NO_APPLE_MODIFICATIONS if (SEQ_GEQ(ack + ((u_int32_t)win << sws), dst->seqhi)) dst->seqhi = ack + MAX(((u_int32_t)win << sws), 1); -#else - if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) - dst->seqhi = ack + MAX((win << sws), 1); -#endif /* update states */ if (th->th_flags & TH_SYN) @@ -6507,9 +6207,7 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, if (pf_normalize_tcp_stateful(m, off, pd, reason, th, *state, src, dst, ©back)) return (PF_DROP); -#ifndef NO_APPLE_EXTENSIONS m = pd->mp; -#endif } /* update max window */ @@ -6519,13 +6217,8 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, if (SEQ_GT(end, src->seqlo)) src->seqlo = end; /* slide the window of what the other end can send */ -#ifndef NO_APPLE_MODIFICATIONS if (SEQ_GEQ(ack + ((u_int32_t)win << sws), dst->seqhi)) dst->seqhi = ack + MAX(((u_int32_t)win << sws), 1); -#else - if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) - dst->seqhi = ack + MAX((win << sws), 1); -#endif /* * Cannot set dst->seqhi here since this could be a shotgunned @@ -6567,12 +6260,8 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, "fwd" : "rev"); printf("pf: State failure on: %c %c %c %c | %c %c\n", SEQ_GEQ(src->seqhi, end) ? ' ' : '1', -#ifndef NO_APPLE_MODIFICATIONS SEQ_GEQ(seq, src->seqlo - ((u_int32_t)dst->max_win << dws)) ? -#else - SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ? -#endif ' ': '2', (ackskew >= -MAXACKWINDOW) ? ' ' : '3', (ackskew <= (MAXACKWINDOW << sws)) ? 
' ' : '4', @@ -6585,7 +6274,6 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, /* Any packets which have gotten here are to be passed */ -#ifndef NO_APPLE_EXTENSIONS if ((*state)->state_key->app_state && (*state)->state_key->app_state->handler) { (*state)->state_key->app_state->handler(*state, direction, @@ -6622,74 +6310,39 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, /* Copyback sequence modulation or stateful scrub changes */ m_copyback(m, off, sizeof (*th), th); } -#else - /* translate source/destination address, if necessary */ - if (STATE_TRANSLATE((*state)->state_key)) { - if (direction == PF_OUT) - pf_change_ap(pd->src, pd->mp, &th->th_sport, pd->ip_sum, - &th->th_sum, &(*state)->state_key->gwy.addr, - (*state)->state_key->gwy.port, 0, pd->af); - else - pf_change_ap(pd->dst, pd->mp, &th->th_dport, pd->ip_sum, - &th->th_sum, &(*state)->state_key->lan.addr, - (*state)->state_key->lan.port, 0, pd->af); - m_copyback(m, off, sizeof (*th), th); - } else if (copyback) { - /* Copyback sequence modulation or stateful scrub changes */ - m_copyback(m, off, sizeof (*th), th); - } -#endif return (PF_PASS); } -#ifndef NO_APPLE_EXTENSIONS static int pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason) -#else -pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif, - struct mbuf *m, int off, void *h, struct pf_pdesc *pd) -#endif { #pragma unused(h) struct pf_state_peer *src, *dst; struct pf_state_key_cmp key; struct udphdr *uh = pd->hdr.udp; -#ifndef NO_APPLE_EXTENSIONS struct pf_app_state as; int dx, action, extfilter; key.app_state = 0; key.proto_variant = PF_EXTFILTER_APD; -#endif key.af = pd->af; key.proto = IPPROTO_UDP; if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd->src, key.af); PF_ACPY(&key.gwy.addr, pd->dst, key.af); -#ifndef NO_APPLE_EXTENSIONS key.ext.xport.port = uh->uh_sport; key.gwy.xport.port = uh->uh_dport; dx = PF_IN; -#else - key.ext.port = uh->uh_sport; - key.gwy.port = uh->uh_dport; -#endif } else { PF_ACPY(&key.lan.addr, pd->src, key.af); PF_ACPY(&key.ext.addr, pd->dst, key.af); -#ifndef NO_APPLE_EXTENSIONS key.lan.xport.port = uh->uh_sport; key.ext.xport.port = uh->uh_dport; dx = PF_OUT; -#else - key.lan.port = uh->uh_sport; - key.ext.port = uh->uh_dport; -#endif } -#ifndef NO_APPLE_EXTENSIONS if (ntohs(uh->uh_sport) == PF_IKE_PORT && ntohs(uh->uh_dport) == PF_IKE_PORT) { struct pf_ike_hdr ike; @@ -6734,11 +6387,12 @@ pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif, *state = pf_find_state(kif, &key, dx); } + if ((*state) != NULL && pd != NULL && + pd->flowhash == 0) + pd->flowhash = (*state)->state_key->flowhash; + if (pf_state_lookup_aux(state, kif, direction, &action)) return (action); -#else - STATE_LOOKUP(); -#endif if (direction == (*state)->state_key->direction) { src = &(*state)->src; @@ -6761,7 +6415,6 @@ pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif, else (*state)->timeout = PFTM_UDP_SINGLE; -#ifndef NO_APPLE_EXTENSIONS extfilter = (*state)->state_key->proto_variant; if (extfilter > PF_EXTFILTER_APD) { (*state)->state_key->ext.xport.port = key.ext.xport.port; @@ -6801,20 +6454,6 @@ pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif, (*state)->state_key->lan.xport.port, 1, pd->af); m_copyback(m, off, sizeof (*uh), uh); } -#else - /* translate source/destination address, if necessary */ - 
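/*
 * (Illustrative sketch, not part of the patch.)  The NO_APPLE_MODIFICATION
 * hunks above widen window computations to u_int32_t before shifting by
 * the negotiated window scale.  Shifting a 16-bit max_win in place can
 * overflow and zero the window; widening and clamping preserves it, as
 * this standalone program demonstrates.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint16_t max_win = 0x4000;	/* 16 KB window from the handshake */
	unsigned wscale = 4;		/* negotiated shift */

	/* naive in-place shift truncates to 16 bits: 0x40000 -> 0 */
	uint16_t naive = (uint16_t)(max_win << wscale);

	/* patched approach: widen, shift, then clamp to the field's range */
	uint32_t wide = (uint32_t)max_win << wscale;
	uint16_t clamped = (wide > 0xffff) ? 0xffff : (uint16_t)wide;

	printf("naive=%u clamped=%u\n", (unsigned)naive, (unsigned)clamped);
	return (0);
}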
if (STATE_TRANSLATE((*state)->state_key)) { - if (direction == PF_OUT) - pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum, - &uh->uh_sum, &(*state)->state_key->gwy.addr, - (*state)->state_key->gwy.port, 1, pd->af); - else - pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum, - &uh->uh_sum, &(*state)->state_key->lan.addr, - (*state)->state_key->lan.port, 1, pd->af); - m_copyback(m, off, sizeof (*uh), uh); - } -#endif return (PF_PASS); } @@ -6830,10 +6469,8 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, int state_icmp = 0; struct pf_state_key_cmp key; -#ifndef NO_APPLE_EXTENSIONS struct pf_app_state as; key.app_state = 0; -#endif switch (pd->proto) { #if INET @@ -6876,23 +6513,13 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd->src, key.af); PF_ACPY(&key.gwy.addr, pd->dst, key.af); -#ifndef NO_APPLE_EXTENSIONS key.ext.xport.port = 0; key.gwy.xport.port = icmpid; -#else - key.ext.port = 0; - key.gwy.port = icmpid; -#endif } else { PF_ACPY(&key.lan.addr, pd->src, key.af); PF_ACPY(&key.ext.addr, pd->dst, key.af); -#ifndef NO_APPLE_EXTENSIONS key.lan.xport.port = icmpid; key.ext.xport.port = 0; -#else - key.lan.port = icmpid; - key.ext.port = 0; -#endif } STATE_LOOKUP(); @@ -6909,7 +6536,6 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, pf_change_a(&saddr->v4.s_addr, pd->ip_sum, (*state)->state_key->gwy.addr.v4.s_addr, 0); -#ifndef NO_APPLE_EXTENSIONS pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( pd->hdr.icmp->icmp_cksum, icmpid, @@ -6920,14 +6546,6 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, off + ICMP_MINLEN); if (!m) return (PF_DROP); -#else - pd->hdr.icmp->icmp_cksum = - pf_cksum_fixup( - pd->hdr.icmp->icmp_cksum, icmpid, - (*state)->state_key->gwy.port, 0); - pd->hdr.icmp->icmp_id = - (*state)->state_key->gwy.port; -#endif m_copyback(m, off, ICMP_MINLEN, pd->hdr.icmp); break; @@ -6937,12 +6555,10 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum, &(*state)->state_key->gwy.addr, 0); -#ifndef NO_APPLE_EXTENSIONS m = pf_lazy_makewritable(pd, m, off + sizeof (struct icmp6_hdr)); if (!m) return (PF_DROP); -#endif m_copyback(m, off, sizeof (struct icmp6_hdr), pd->hdr.icmp6); @@ -6956,7 +6572,6 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, pf_change_a(&daddr->v4.s_addr, pd->ip_sum, (*state)->state_key->lan.addr.v4.s_addr, 0); -#ifndef NO_APPLE_EXTENSIONS pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( pd->hdr.icmp->icmp_cksum, icmpid, @@ -6967,14 +6582,6 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, off + ICMP_MINLEN); if (!m) return (PF_DROP); -#else - pd->hdr.icmp->icmp_cksum = - pf_cksum_fixup( - pd->hdr.icmp->icmp_cksum, icmpid, - (*state)->state_key->lan.port, 0); - pd->hdr.icmp->icmp_id = - (*state)->state_key->lan.port; -#endif m_copyback(m, off, ICMP_MINLEN, pd->hdr.icmp); break; @@ -6984,12 +6591,10 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum, &(*state)->state_key->lan.addr, 0); -#ifndef NO_APPLE_EXTENSIONS m = pf_lazy_makewritable(pd, m, off + sizeof (struct icmp6_hdr)); if (!m) return (PF_DROP); -#endif m_copyback(m, off, sizeof (struct icmp6_hdr), pd->hdr.icmp6); @@ -7134,23 +6739,13 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, 
if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd2.dst, key.af); PF_ACPY(&key.gwy.addr, pd2.src, key.af); -#ifndef NO_APPLE_EXTENSIONS key.ext.xport.port = th.th_dport; key.gwy.xport.port = th.th_sport; -#else - key.ext.port = th.th_dport; - key.gwy.port = th.th_sport; -#endif } else { PF_ACPY(&key.lan.addr, pd2.dst, key.af); PF_ACPY(&key.ext.addr, pd2.src, key.af); -#ifndef NO_APPLE_EXTENSIONS key.lan.xport.port = th.th_dport; key.ext.xport.port = th.th_sport; -#else - key.lan.port = th.th_dport; - key.ext.port = th.th_sport; -#endif } STATE_LOOKUP(); @@ -7177,12 +6772,8 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, } if (!SEQ_GEQ(src->seqhi, seq) || -#ifndef NO_APPLE_MODIFICATION !SEQ_GEQ(seq, src->seqlo - ((u_int32_t)dst->max_win << dws))) { -#else - !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))) { -#endif if (pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD ICMP %d:%d ", icmptype, pd->hdr.icmp->icmp_code); @@ -7201,21 +6792,13 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, if (direction == PF_IN) { pf_change_icmp(pd2.src, &th.th_sport, daddr, &(*state)->state_key->lan.addr, -#ifndef NO_APPLE_EXTENSIONS (*state)->state_key->lan.xport.port, NULL, -#else - (*state)->state_key->lan.port, NULL, -#endif pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); } else { pf_change_icmp(pd2.dst, &th.th_dport, saddr, &(*state)->state_key->gwy.addr, -#ifndef NO_APPLE_EXTENSIONS (*state)->state_key->gwy.xport.port, NULL, -#else - (*state)->state_key->gwy.port, NULL, -#endif pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); } @@ -7223,11 +6806,9 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, } if (copyback) { -#ifndef NO_APPLE_EXTENSIONS m = pf_lazy_makewritable(pd, m, off2 + 8); if (!m) return (PF_DROP); -#endif switch (pd2.af) { #if INET case AF_INET: @@ -7255,9 +6836,7 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, } case IPPROTO_UDP: { struct udphdr uh; -#ifndef NO_APPLE_EXTENSIONS int dx, action; -#endif if (!pf_pull_hdr(m, off2, &uh, sizeof (uh), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, @@ -7271,28 +6850,17 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd2.dst, key.af); PF_ACPY(&key.gwy.addr, pd2.src, key.af); -#ifndef NO_APPLE_EXTENSIONS key.ext.xport.port = uh.uh_dport; key.gwy.xport.port = uh.uh_sport; dx = PF_IN; -#else - key.ext.port = uh.uh_dport; - key.gwy.port = uh.uh_sport; -#endif } else { PF_ACPY(&key.lan.addr, pd2.dst, key.af); PF_ACPY(&key.ext.addr, pd2.src, key.af); -#ifndef NO_APPLE_EXTENSIONS key.lan.xport.port = uh.uh_dport; key.ext.xport.port = uh.uh_sport; dx = PF_OUT; -#else - key.lan.port = uh.uh_dport; - key.ext.port = uh.uh_sport; -#endif } -#ifndef NO_APPLE_EXTENSIONS key.proto_variant = PF_EXTFILTER_APD; if (ntohs(uh.uh_sport) == PF_IKE_PORT && @@ -7335,40 +6903,31 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, *state = pf_find_state(kif, &key, dx); } + if (*state != NULL && pd != NULL && + pd->flowhash == 0) + pd->flowhash = (*state)->state_key->flowhash; + if (pf_state_lookup_aux(state, kif, direction, &action)) return (action); -#else - STATE_LOOKUP(); -#endif if (STATE_TRANSLATE((*state)->state_key)) { if (direction == PF_IN) { pf_change_icmp(pd2.src, &uh.uh_sport, daddr, &(*state)->state_key->lan.addr, -#ifndef NO_APPLE_EXTENSIONS (*state)->state_key->lan.xport.port, &uh.uh_sum, -#else - 
(*state)->state_key->lan.port, &uh.uh_sum, -#endif pd2.ip_sum, icmpsum, pd->ip_sum, 1, pd2.af); } else { pf_change_icmp(pd2.dst, &uh.uh_dport, saddr, &(*state)->state_key->gwy.addr, -#ifndef NO_APPLE_EXTENSIONS (*state)->state_key->gwy.xport.port, &uh.uh_sum, -#else - (*state)->state_key->gwy.port, &uh.uh_sum, -#endif pd2.ip_sum, icmpsum, pd->ip_sum, 1, pd2.af); } -#ifndef NO_APPLE_EXTENSIONS m = pf_lazy_makewritable(pd, m, off2 + sizeof (uh)); if (!m) return (PF_DROP); -#endif switch (pd2.af) { #if INET case AF_INET: @@ -7410,23 +6969,13 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd2.dst, key.af); PF_ACPY(&key.gwy.addr, pd2.src, key.af); -#ifndef NO_APPLE_EXTENSIONS key.ext.xport.port = 0; key.gwy.xport.port = iih.icmp_id; -#else - key.ext.port = 0; - key.gwy.port = iih.icmp_id; -#endif } else { PF_ACPY(&key.lan.addr, pd2.dst, key.af); PF_ACPY(&key.ext.addr, pd2.src, key.af); -#ifndef NO_APPLE_EXTENSIONS key.lan.xport.port = iih.icmp_id; key.ext.xport.port = 0; -#else - key.lan.port = iih.icmp_id; - key.ext.port = 0; -#endif } STATE_LOOKUP(); @@ -7435,29 +6984,19 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, if (direction == PF_IN) { pf_change_icmp(pd2.src, &iih.icmp_id, daddr, &(*state)->state_key->lan.addr, -#ifndef NO_APPLE_EXTENSIONS (*state)->state_key->lan.xport.port, NULL, -#else - (*state)->state_key->lan.port, NULL, -#endif pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET); } else { pf_change_icmp(pd2.dst, &iih.icmp_id, saddr, &(*state)->state_key->gwy.addr, -#ifndef NO_APPLE_EXTENSIONS (*state)->state_key->gwy.xport.port, NULL, -#else - (*state)->state_key->gwy.port, NULL, -#endif pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET); } -#ifndef NO_APPLE_EXTENSIONS m = pf_lazy_makewritable(pd, m, off2 + ICMP_MINLEN); if (!m) return (PF_DROP); -#endif m_copyback(m, off, ICMP_MINLEN, pd->hdr.icmp); m_copyback(m, ipoff2, sizeof (h2), &h2); m_copyback(m, off2, ICMP_MINLEN, &iih); @@ -7484,23 +7023,13 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd2.dst, key.af); PF_ACPY(&key.gwy.addr, pd2.src, key.af); -#ifndef NO_APPLE_EXTENSIONS key.ext.xport.port = 0; key.gwy.xport.port = iih.icmp6_id; -#else - key.ext.port = 0; - key.gwy.port = iih.icmp6_id; -#endif } else { PF_ACPY(&key.lan.addr, pd2.dst, key.af); PF_ACPY(&key.ext.addr, pd2.src, key.af); -#ifndef NO_APPLE_EXTENSIONS key.lan.xport.port = iih.icmp6_id; key.ext.xport.port = 0; -#else - key.lan.port = iih.icmp6_id; - key.ext.port = 0; -#endif } STATE_LOOKUP(); @@ -7509,30 +7038,20 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, if (direction == PF_IN) { pf_change_icmp(pd2.src, &iih.icmp6_id, daddr, &(*state)->state_key->lan.addr, -#ifndef NO_APPLE_EXTENSIONS (*state)->state_key->lan.xport.port, NULL, -#else - (*state)->state_key->lan.port, NULL, -#endif pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET6); } else { pf_change_icmp(pd2.dst, &iih.icmp6_id, saddr, &(*state)->state_key->gwy.addr, -#ifndef NO_APPLE_EXTENSIONS (*state)->state_key->gwy.xport.port, NULL, -#else - (*state)->state_key->gwy.port, NULL, -#endif pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET6); } -#ifndef NO_APPLE_EXTENSIONS m = pf_lazy_makewritable(pd, m, off2 + sizeof (struct icmp6_hdr)); if (!m) return (PF_DROP); -#endif m_copyback(m, off, sizeof (struct icmp6_hdr), pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof (h2_6), &h2_6); @@ -7550,23 
+7069,13 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd2.dst, key.af); PF_ACPY(&key.gwy.addr, pd2.src, key.af); -#ifndef NO_APPLE_EXTENSIONS key.ext.xport.port = 0; key.gwy.xport.port = 0; -#else - key.ext.port = 0; - key.gwy.port = 0; -#endif } else { PF_ACPY(&key.lan.addr, pd2.dst, key.af); PF_ACPY(&key.ext.addr, pd2.src, key.af); -#ifndef NO_APPLE_EXTENSIONS key.lan.xport.port = 0; key.ext.xport.port = 0; -#else - key.lan.port = 0; - key.ext.port = 0; -#endif } STATE_LOOKUP(); @@ -7588,25 +7097,17 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, switch (pd2.af) { #if INET case AF_INET: -#ifndef NO_APPLE_EXTENSIONS m = pf_lazy_makewritable(pd, m, ipoff2 + sizeof (h2)); if (!m) return (PF_DROP); -#endif - m_copyback(m, off, ICMP_MINLEN, - pd->hdr.icmp); - m_copyback(m, ipoff2, sizeof (h2), &h2); - break; #endif /* INET */ #if INET6 case AF_INET6: -#ifndef NO_APPLE_EXTENSIONS m = pf_lazy_makewritable(pd, m, ipoff2 + sizeof (h2_6)); if (!m) return (PF_DROP); -#endif m_copyback(m, off, sizeof (struct icmp6_hdr), pd->hdr.icmp6); @@ -7624,7 +7125,6 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, } } -#ifndef NO_APPLE_EXTENSIONS static int pf_test_state_grev1(struct pf_state **state, int direction, struct pfi_kif *kif, int off, struct pf_pdesc *pd) @@ -7726,7 +7226,7 @@ pf_test_state_grev1(struct pf_state **state, int direction, return (PF_PASS); } -int +static int pf_test_state_esp(struct pf_state **state, int direction, struct pfi_kif *kif, int off, struct pf_pdesc *pd) { @@ -7814,6 +7314,11 @@ pf_test_state_esp(struct pf_state **state, int direction, struct pfi_kif *kif, } } + if (*state != NULL && pd != NULL && + pd->flowhash == 0) { + pd->flowhash = (*state)->state_key->flowhash; + } + if (pf_state_lookup_aux(state, kif, direction, &action)) return (action); @@ -7878,7 +7383,6 @@ pf_test_state_esp(struct pf_state **state, int direction, struct pfi_kif *kif, return (PF_PASS); } -#endif static int pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif, @@ -7887,31 +7391,19 @@ pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif, struct pf_state_peer *src, *dst; struct pf_state_key_cmp key; -#ifndef NO_APPLE_EXTENSIONS key.app_state = 0; -#endif key.af = pd->af; key.proto = pd->proto; if (direction == PF_IN) { PF_ACPY(&key.ext.addr, pd->src, key.af); PF_ACPY(&key.gwy.addr, pd->dst, key.af); -#ifndef NO_APPLE_EXTENSIONS key.ext.xport.port = 0; key.gwy.xport.port = 0; -#else - key.ext.port = 0; - key.gwy.port = 0; -#endif } else { PF_ACPY(&key.lan.addr, pd->src, key.af); PF_ACPY(&key.ext.addr, pd->dst, key.af); -#ifndef NO_APPLE_EXTENSIONS key.lan.xport.port = 0; key.ext.xport.port = 0; -#else - key.lan.port = 0; - key.ext.port = 0; -#endif } STATE_LOOKUP(); @@ -7938,11 +7430,7 @@ pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif, (*state)->timeout = PFTM_OTHER_SINGLE; /* translate source/destination address, if necessary */ -#ifndef NO_APPLE_EXTENSIONS if (STATE_ADDR_TRANSLATE((*state)->state_key)) { -#else - if (STATE_TRANSLATE((*state)->state_key)) { -#endif if (direction == PF_OUT) { switch (pd->af) { #if INET @@ -8143,7 +7631,7 @@ pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, (dir != PF_IN && dir != PF_OUT) || oifp == NULL) panic("pf_route: invalid parameters"); - if (pd->pf_mtag->routed++ > 3) { + if 
(pd->pf_mtag->pftag_routed++ > 3) { m0 = *m; *m = NULL; goto bad; @@ -8168,7 +7656,7 @@ pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, ro = &iproute; bzero((caddr_t)ro, sizeof (*ro)); - dst = satosin(&ro->ro_dst); + dst = satosin((void *)&ro->ro_dst); dst->sin_family = AF_INET; dst->sin_len = sizeof (*dst); dst->sin_addr = ip->ip_dst; @@ -8185,7 +7673,7 @@ pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, ro->ro_rt->rt_use++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) - dst = satosin(ro->ro_rt->rt_gateway); + dst = satosin((void *)ro->ro_rt->rt_gateway); RT_UNLOCK(ro->ro_rt); } else { if (TAILQ_EMPTY(&r->rpool.list)) { @@ -8211,7 +7699,7 @@ pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, goto bad; if (oifp != ifp) { - if (pf_test(PF_OUT, ifp, &m0, NULL) != PF_PASS) + if (pf_test(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) goto bad; else if (m0 == NULL) goto done; @@ -8279,7 +7767,7 @@ pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) ip->ip_sum = in_cksum(m0, ip->ip_hl << 2); - error = ifnet_output(ifp, PF_INET, m0, ro, sintosa(dst)); + error = ifnet_output(ifp, PF_INET, m0, ro->ro_rt, sintosa(dst)); goto done; } @@ -8300,12 +7788,12 @@ pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, m1 = m0; /* PR-8933605: send ip_len,ip_off to ip_fragment in host byte order */ -#if BYTE_ORDER != BIG_ENDIAN +#if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_off); NTOHS(ip->ip_len); #endif error = ip_fragment(m0, ifp, ifp->if_mtu, sw_csum); - + if (error) { m0 = NULL; goto bad; @@ -8315,7 +7803,7 @@ pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, m1 = m0->m_nextpkt; m0->m_nextpkt = 0; if (error == 0) - error = ifnet_output(ifp, PF_INET, m0, ro, + error = ifnet_output(ifp, PF_INET, m0, ro->ro_rt, sintosa(dst)); else m_freem(m0); @@ -8357,7 +7845,7 @@ pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, (dir != PF_IN && dir != PF_OUT) || oifp == NULL) panic("pf_route6: invalid parameters"); - if (pd->pf_mtag->routed++ > 3) { + if (pd->pf_mtag->pftag_routed++ > 3) { m0 = *m; *m = NULL; goto bad; @@ -8392,7 +7880,7 @@ pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, if ((pf_mtag = pf_get_mtag(m0)) == NULL) goto bad; - pf_mtag->flags |= PF_TAG_GENERATED; + pf_mtag->pftag_flags |= PF_TAG_GENERATED; ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); return; } @@ -8419,7 +7907,7 @@ pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, goto bad; if (oifp != ifp) { - if (pf_test6(PF_OUT, ifp, &m0, NULL) != PF_PASS) + if (pf_test6(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) goto bad; else if (m0 == NULL) goto done; @@ -8438,7 +7926,7 @@ pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, if (IN6_IS_SCOPE_EMBED(&dst->sin6_addr)) dst->sin6_addr.s6_addr16[1] = htons(ifp->if_index); if ((unsigned)m0->m_pkthdr.len <= ifp->if_mtu) { - error = nd6_output(ifp, ifp, m0, dst, NULL); + error = nd6_output(ifp, ifp, m0, dst, NULL, NULL); } else { in6_ifstat_inc(ifp, ifs6_in_toobig); if (r->rt != PF_DUPTO) @@ -8549,22 +8037,24 @@ pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, } #if INET -#ifndef NO_APPLE_EXTENSIONS #define PF_APPLE_UPDATE_PDESC_IPv4() \ do { \ if (m && pd.mp && m != pd.mp) { \ m = pd.mp; \ h = mtod(m, struct ip *); \ + pd.pf_mtag = pf_get_mtag(m); \ } \ } while (0) -#endif int pf_test(int dir, struct ifnet *ifp, struct mbuf 
**m0, - struct ether_header *eh) + struct ether_header *eh, struct ip_fw_args *fwa) { +#if !DUMMYNET +#pragma unused(fwa) +#endif struct pfi_kif *kif; - u_short action, reason = 0, log = 0; + u_short action = PF_PASS, reason = 0, log = 0; struct mbuf *m = *m0; struct ip *h = 0; struct pf_rule *a = NULL, *r = &pf_default_rule, *tr, *nr; @@ -8587,7 +8077,7 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, return (PF_DROP); } - if (pd.pf_mtag->flags & PF_TAG_GENERATED) + if (pd.pf_mtag->pftag_flags & PF_TAG_GENERATED) return (PF_PASS); kif = (struct pfi_kif *)ifp->if_pf_kif; @@ -8605,6 +8095,22 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, panic("non-M_PKTHDR is passed to pf_test"); #endif /* DIAGNOSTIC */ + /* initialize enough of pd for the done label */ + h = mtod(m, struct ip *); + pd.mp = m; + pd.lmw = 0; + pd.pf_mtag = pf_get_mtag(m); + pd.src = (struct pf_addr *)&h->ip_src; + pd.dst = (struct pf_addr *)&h->ip_dst; + PF_ACPY(&pd.baddr, dir == PF_OUT ? pd.src : pd.dst, AF_INET); + pd.ip_sum = &h->ip_sum; + pd.proto = h->ip_p; + pd.proto_variant = 0; + pd.af = AF_INET; + pd.tos = h->ip_tos; + pd.tot_len = ntohs(h->ip_len); + pd.eh = eh; + if (m->m_pkthdr.len < (int)sizeof (*h)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); @@ -8612,11 +8118,22 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, goto done; } +#if DUMMYNET + if (fwa != NULL && fwa->fwa_pf_rule != NULL) + goto nonormalize; +#endif /* DUMMYNET */ + /* We do IP header normalization and packet reassembly here */ - if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) { + action = pf_normalize_ip(m0, dir, kif, &reason, &pd); + pd.mp = m = *m0; + if (action != PF_PASS || pd.lmw < 0) { action = PF_DROP; goto done; } + +#if DUMMYNET +nonormalize: +#endif /* DUMMYNET */ m = *m0; /* pf_normalize messes with m0 */ h = mtod(m, struct ip *); @@ -8633,18 +8150,32 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, PF_ACPY(&pd.baddr, dir == PF_OUT ? pd.src : pd.dst, AF_INET); pd.ip_sum = &h->ip_sum; pd.proto = h->ip_p; -#ifndef NO_APPLE_EXTENSIONS pd.proto_variant = 0; pd.mp = m; pd.lmw = 0; -#endif + pd.pf_mtag = pf_get_mtag(m); pd.af = AF_INET; pd.tos = h->ip_tos; + pd.sc = MBUF_SCIDX(mbuf_get_service_class(m)); pd.tot_len = ntohs(h->ip_len); pd.eh = eh; + if (pd.pf_mtag != NULL && pd.pf_mtag->pftag_flowhash != 0) { + pd.flowhash = pd.pf_mtag->pftag_flowhash; + pd.flags |= (m->m_pkthdr.m_fhflags & PF_TAG_FLOWADV) ? 
+ PFDESC_FLOW_ADV : 0; + } /* handle fragments that didn't get reassembled by normalization */ if (h->ip_off & htons(IP_MF | IP_OFFMASK)) { + pd.flags |= PFDESC_IP_FRAG; +#if DUMMYNET + /* Traffic goes through dummynet first */ + action = pf_test_dummynet(&r, dir, kif, &m, &pd, fwa); + if (action == PF_DROP || m == NULL) { + *m0 = NULL; + return (action); + } +#endif /* DUMMYNET */ action = pf_test_fragment(&r, dir, kif, m, h, &pd, &a, &ruleset); goto done; @@ -8663,21 +8194,25 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, pd.p_len = pd.tot_len - off - (th.th_off << 2); if ((th.th_flags & TH_ACK) && pd.p_len == 0) pqid = 1; +#if DUMMYNET + /* Traffic goes through dummynet first */ + action = pf_test_dummynet(&r, dir, kif, &m, &pd, fwa); + if (action == PF_DROP || m == NULL) { + *m0 = NULL; + return (action); + } +#endif /* DUMMYNET */ action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); -#ifndef NO_APPLE_EXTENSIONS if (pd.lmw < 0) goto done; PF_APPLE_UPDATE_PDESC_IPv4(); -#endif if (action == PF_DROP) goto done; action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, &reason); -#ifndef NO_APPLE_EXTENSIONS if (pd.lmw < 0) goto done; PF_APPLE_UPDATE_PDESC_IPv4(); -#endif if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); @@ -8707,15 +8242,19 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, REASON_SET(&reason, PFRES_SHORT); goto done; } -#ifndef NO_APPLE_EXTENSIONS +#if DUMMYNET + /* Traffic goes through dummynet first */ + action = pf_test_dummynet(&r, dir, kif, &m, &pd, fwa); + if (action == PF_DROP || m == NULL) { + *m0 = NULL; + return (action); + } +#endif /* DUMMYNET */ action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd, &reason); if (pd.lmw < 0) goto done; PF_APPLE_UPDATE_PDESC_IPv4(); -#else - action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); -#endif if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); @@ -8738,13 +8277,19 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, log = action != PF_PASS; goto done; } +#if DUMMYNET + /* Traffic goes through dummynet first */ + action = pf_test_dummynet(&r, dir, kif, &m, &pd, fwa); + if (action == PF_DROP || m == NULL) { + *m0 = NULL; + return (action); + } +#endif /* DUMMYNET */ action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd, &reason); -#ifndef NO_APPLE_EXTENSIONS if (pd.lmw < 0) goto done; PF_APPLE_UPDATE_PDESC_IPv4(); -#endif if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); @@ -8758,7 +8303,6 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, break; } -#ifndef NO_APPLE_EXTENSIONS case IPPROTO_ESP: { struct pf_esp_hdr esp; @@ -8768,6 +8312,14 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, log = action != PF_PASS; goto done; } +#if DUMMYNET + /* Traffic goes through dummynet first */ + action = pf_test_dummynet(&r, dir, kif, &m, &pd, fwa); + if (action == PF_DROP || m == NULL) { + *m0 = NULL; + return (action); + } +#endif /* DUMMYNET */ action = pf_test_state_esp(&s, dir, kif, off, &pd); if (pd.lmw < 0) goto done; @@ -8793,6 +8345,14 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, log = (action != PF_PASS); goto done; } +#if DUMMYNET + /* Traffic goes through dummynet first */ + action = pf_test_dummynet(&r, dir, kif, &m, &pd, fwa); + if (action == PF_DROP || m == NULL) { + *m0 = NULL; + return (action); + } +#endif /* DUMMYNET */ if ((ntohs(grev1.flags) & PF_GRE_FLAG_VERSION_MASK) == 1 && ntohs(grev1.protocol_type) == PF_GRE_PPP_ETHERTYPE) { if (ntohs(grev1.payload_length) > @@ -8823,15 +8383,20 @@ pf_test(int dir, struct ifnet *ifp, 
struct mbuf **m0, /* not GREv1/PPTP, so treat as ordinary GRE... */ } -#endif default: +#if DUMMYNET + /* Traffic goes through dummynet first */ + action = pf_test_dummynet(&r, dir, kif, &m, &pd, fwa); + if (action == PF_DROP || m == NULL) { + *m0 = NULL; + return (action); + } +#endif /* DUMMYNET */ action = pf_test_state_other(&s, dir, kif, &pd); -#ifndef NO_APPLE_EXTENSIONS if (pd.lmw < 0) goto done; PF_APPLE_UPDATE_PDESC_IPv4(); -#endif if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); @@ -8846,10 +8411,8 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, } done: -#ifndef NO_APPLE_EXTENSIONS *m0 = pd.mp; PF_APPLE_UPDATE_PDESC_IPv4(); -#endif if (action == PF_PASS && h->ip_hl > 5 && !((s && s->allow_opts) || r->allow_opts)) { @@ -8861,20 +8424,31 @@ done: (unsigned int) h->ip_hl)); } - if ((s && s->tag) || PF_RTABLEID_IS_VALID(r->rtableid)) + if ((s && s->tag) || PF_RTABLEID_IS_VALID(r->rtableid) || + pd.flowhash != 0) (void) pf_tag_packet(m, pd.pf_mtag, s ? s->tag : 0, - r->rtableid); + r->rtableid, &pd); -#if ALTQ - if (action == PF_PASS && r->qid) { - if (pqid || (pd.tos & IPTOS_LOWDELAY)) - pd.pf_mtag->qid = r->pqid; - else - pd.pf_mtag->qid = r->qid; + if (action == PF_PASS) { +#if PF_ALTQ + if (altq_allowed && r->qid) { + if (pqid || (pd.tos & IPTOS_LOWDELAY)) + pd.pf_mtag->pftag_qid = r->pqid; + else + pd.pf_mtag->pftag_qid = r->qid; + } +#endif /* PF_ALTQ */ /* add hints for ecn */ - pd.pf_mtag->hdr = h; + pd.pf_mtag->pftag_hdr = h; + /* record address family */ + pd.pf_mtag->pftag_flags &= ~PF_TAG_HDR_INET6; + pd.pf_mtag->pftag_flags |= PF_TAG_HDR_INET; + /* record TCP vs. non-TCP */ + if (pd.proto == IPPROTO_TCP) + pd.pf_mtag->pftag_flags |= PF_TAG_TCP; + else + pd.pf_mtag->pftag_flags &= ~PF_TAG_TCP; } -#endif /* ALTQ */ /* * connections redirected to loopback should not match sockets @@ -8886,7 +8460,7 @@ done: (s->nat_rule.ptr->action == PF_RDR || s->nat_rule.ptr->action == PF_BINAT) && (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) - pd.pf_mtag->flags |= PF_TAG_TRANSLATE_LOCALHOST; + pd.pf_mtag->pftag_flags |= PF_TAG_TRANSLATE_LOCALHOST; if (log) { struct pf_rule *lr; @@ -8966,7 +8540,6 @@ done: tr->dst.neg); } -#ifndef NO_APPLE_EXTENSIONS VERIFY(m == NULL || pd.mp == NULL || pd.mp == m); if (*m0) { @@ -8983,7 +8556,6 @@ done: *m0 = m; } -#endif if (action == PF_SYNPROXY_DROP) { m_freem(*m0); @@ -8998,7 +8570,6 @@ done: #endif /* INET */ #if INET6 -#ifndef NO_APPLE_EXTENSIONS #define PF_APPLE_UPDATE_PDESC_IPv6() \ do { \ if (m && pd.mp && m != pd.mp) { \ @@ -9008,14 +8579,16 @@ done: h = mtod(m, struct ip6_hdr *); \ } \ } while (0) -#endif int pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, - struct ether_header *eh) + struct ether_header *eh, struct ip_fw_args *fwa) { +#if !DUMMYNET +#pragma unused(fwa) +#endif struct pfi_kif *kif; - u_short action, reason = 0, log = 0; + u_short action = PF_PASS, reason = 0, log = 0; struct mbuf *m = *m0, *n = NULL; struct ip6_hdr *h; struct pf_rule *a = NULL, *r = &pf_default_rule, *tr, *nr; @@ -9024,6 +8597,7 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, struct pf_ruleset *ruleset = NULL; struct pf_pdesc pd; int off, terminal = 0, dirndx, rh_cnt = 0; + u_int8_t nxt; lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); @@ -9038,7 +8612,7 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, return (PF_DROP); } - if (pd.pf_mtag->flags & PF_TAG_GENERATED) + if (pd.pf_mtag->pftag_flags & PF_TAG_GENERATED) return (PF_PASS); kif = (struct pfi_kif *)ifp->if_pf_kif; @@ -9058,6 +8632,29 @@ 
pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, h = mtod(m, struct ip6_hdr *); + nxt = h->ip6_nxt; + off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr); + pd.mp = m; + pd.lmw = 0; + pd.pf_mtag = pf_get_mtag(m); + pd.src = (struct pf_addr *)&h->ip6_src; + pd.dst = (struct pf_addr *)&h->ip6_dst; + PF_ACPY(&pd.baddr, dir == PF_OUT ? pd.src : pd.dst, AF_INET6); + pd.ip_sum = NULL; + pd.af = AF_INET6; + pd.proto = nxt; + pd.proto_variant = 0; + pd.tos = 0; + pd.sc = MBUF_SCIDX(mbuf_get_service_class(m)); + pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr); + pd.eh = eh; + + if (pd.pf_mtag->pftag_flowhash != 0) { + pd.flowhash = pd.pf_mtag->pftag_flowhash; + pd.flags |= (m->m_pkthdr.m_fhflags & PF_TAG_FLOWADV) ? + PFDESC_FLOW_ADV : 0; + } + if (m->m_pkthdr.len < (int)sizeof (*h)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); @@ -9065,12 +8662,22 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, goto done; } +#if DUMMYNET + if (fwa != NULL && fwa->fwa_pf_rule != NULL) + goto nonormalize; +#endif /* DUMMYNET */ + /* We do IP header normalization and packet reassembly here */ - if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) { + action = pf_normalize_ip6(m0, dir, kif, &reason, &pd); + pd.mp = m = *m0; + if (action != PF_PASS || pd.lmw < 0) { action = PF_DROP; goto done; } - m = *m0; /* pf_normalize messes with m0 */ + +#if DUMMYNET +nonormalize: +#endif /* DUMMYNET */ h = mtod(m, struct ip6_hdr *); #if 1 @@ -9096,56 +8703,54 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, off = ((caddr_t)h - m->m_data) + sizeof (struct ip6_hdr); pd.proto = h->ip6_nxt; -#ifndef NO_APPLE_EXTENSIONS pd.proto_variant = 0; pd.mp = m; pd.lmw = 0; -#endif - do { - switch (pd.proto) { - case IPPROTO_FRAGMENT: - action = pf_test_fragment(&r, dir, kif, m, h, - &pd, &a, &ruleset); - if (action == PF_DROP) - REASON_SET(&reason, PFRES_FRAG); - goto done; - case IPPROTO_ROUTING: { - struct ip6_rthdr rthdr; + pd.pf_mtag = pf_get_mtag(m); - if (rh_cnt++) { - DPFPRINTF(PF_DEBUG_MISC, - ("pf: IPv6 more than one rthdr\n")); - action = PF_DROP; - REASON_SET(&reason, PFRES_IPOPTIONS); - log = 1; - goto done; - } - if (!pf_pull_hdr(m, off, &rthdr, sizeof (rthdr), NULL, + do { + switch (nxt) { + case IPPROTO_FRAGMENT: { + struct ip6_frag ip6f; + + pd.flags |= PFDESC_IP_FRAG; + if (!pf_pull_hdr(m, off, &ip6f, sizeof ip6f, NULL, &reason, pd.af)) { DPFPRINTF(PF_DEBUG_MISC, - ("pf: IPv6 short rthdr\n")); + ("pf: IPv6 short fragment header\n")); action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); log = 1; goto done; } - if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) { - DPFPRINTF(PF_DEBUG_MISC, - ("pf: IPv6 rthdr0\n")); - action = PF_DROP; - REASON_SET(&reason, PFRES_IPOPTIONS); + pd.proto = nxt = ip6f.ip6f_nxt; +#if DUMMYNET + /* Traffic goes through dummynet first */ + action = pf_test_dummynet(&r, dir, kif, &m, &pd, fwa); + if (action == PF_DROP || m == NULL) { + *m0 = NULL; + return (action); + } +#endif /* DUMMYNET */ + action = pf_test_fragment(&r, dir, kif, m, h, &pd, &a, + &ruleset); + if (action == PF_DROP) { + REASON_SET(&reason, PFRES_FRAG); log = 1; - goto done; } - /* FALLTHROUGH */ + goto done; } + case IPPROTO_ROUTING: + ++rh_cnt; + /* FALL THROUGH */ + case IPPROTO_AH: case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: { /* get next header and header length */ struct ip6_ext opt6; - if (!pf_pull_hdr(m, off, &opt6, sizeof (opt6), + if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6), NULL, &reason, pd.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 short opt\n")); @@ -9157,7 +8762,7 @@ 
pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, off += (opt6.ip6e_len + 2) * 4; else off += (opt6.ip6e_len + 1) * 8; - pd.proto = opt6.ip6e_nxt; + nxt = opt6.ip6e_nxt; /* goto the next header */ break; } @@ -9183,21 +8788,25 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, goto done; } pd.p_len = pd.tot_len - off - (th.th_off << 2); +#if DUMMYNET + /* Traffic goes through dummynet first */ + action = pf_test_dummynet(&r, dir, kif, &m, &pd, fwa); + if (action == PF_DROP || m == NULL) { + *m0 = NULL; + return (action); + } +#endif /* DUMMYNET */ action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); -#ifndef NO_APPLE_EXTENSIONS if (pd.lmw < 0) goto done; PF_APPLE_UPDATE_PDESC_IPv6(); -#endif if (action == PF_DROP) goto done; action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, &reason); -#ifndef NO_APPLE_EXTENSIONS if (pd.lmw < 0) goto done; PF_APPLE_UPDATE_PDESC_IPv6(); -#endif if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); @@ -9227,15 +8836,19 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, REASON_SET(&reason, PFRES_SHORT); goto done; } -#ifndef NO_APPLE_EXTENSIONS +#if DUMMYNET + /* Traffic goes through dummynet first */ + action = pf_test_dummynet(&r, dir, kif, &m, &pd, fwa); + if (action == PF_DROP || m == NULL) { + *m0 = NULL; + return (action); + } +#endif /* DUMMYNET */ action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd, &reason); if (pd.lmw < 0) goto done; PF_APPLE_UPDATE_PDESC_IPv6(); -#else - action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); -#endif if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); @@ -9258,13 +8871,19 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, log = action != PF_PASS; goto done; } +#if DUMMYNET + /* Traffic goes through dummynet first */ + action = pf_test_dummynet(&r, dir, kif, &m, &pd, fwa); + if (action == PF_DROP || m == NULL) { + *m0 = NULL; + return (action); + } +#endif /* DUMMYNET */ action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd, &reason); -#ifndef NO_APPLE_EXTENSIONS if (pd.lmw < 0) goto done; PF_APPLE_UPDATE_PDESC_IPv6(); -#endif if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); @@ -9278,7 +8897,6 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, break; } -#ifndef NO_APPLE_EXTENSIONS case IPPROTO_ESP: { struct pf_esp_hdr esp; @@ -9288,6 +8906,14 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, log = action != PF_PASS; goto done; } +#if DUMMYNET + /* Traffic goes through dummynet first */ + action = pf_test_dummynet(&r, dir, kif, &m, &pd, fwa); + if (action == PF_DROP || m == NULL) { + *m0 = NULL; + return (action); + } +#endif /* DUMMYNET */ action = pf_test_state_esp(&s, dir, kif, off, &pd); if (pd.lmw < 0) goto done; @@ -9314,6 +8940,14 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, log = (action != PF_PASS); goto done; } +#if DUMMYNET + /* Traffic goes through dummynet first */ + action = pf_test_dummynet(&r, dir, kif, &m, &pd, fwa); + if (action == PF_DROP || m == NULL) { + *m0 = NULL; + return (action); + } +#endif /* DUMMYNET */ if ((ntohs(grev1.flags) & PF_GRE_FLAG_VERSION_MASK) == 1 && ntohs(grev1.protocol_type) == PF_GRE_PPP_ETHERTYPE) { if (ntohs(grev1.payload_length) > @@ -9344,15 +8978,20 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, /* not GREv1/PPTP, so treat as ordinary GRE... 
*/ } -#endif default: +#if DUMMYNET + /* Traffic goes through dummynet first */ + action = pf_test_dummynet(&r, dir, kif, &m, &pd, fwa); + if (action == PF_DROP || m == NULL) { + *m0 = NULL; + return (action); + } +#endif /* DUMMYNET */ action = pf_test_state_other(&s, dir, kif, &pd); -#ifndef NO_APPLE_EXTENSIONS if (pd.lmw < 0) goto done; PF_APPLE_UPDATE_PDESC_IPv6(); -#endif if (action == PF_PASS) { #if NPFSYNC pfsync_update_state(s); @@ -9367,10 +9006,8 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, } done: -#ifndef NO_APPLE_EXTENSIONS *m0 = pd.mp; PF_APPLE_UPDATE_PDESC_IPv6(); -#endif if (n != m) { m_freem(n); @@ -9387,27 +9024,37 @@ done: ("pf: dropping packet with dangerous v6 headers\n")); } - if ((s && s->tag) || PF_RTABLEID_IS_VALID(r->rtableid)) + if ((s && s->tag) || PF_RTABLEID_IS_VALID(r->rtableid) || pd.flowhash != 0) (void) pf_tag_packet(m, pd.pf_mtag, s ? s->tag : 0, - r->rtableid); + r->rtableid, &pd); -#if ALTQ - if (action == PF_PASS && r->qid) { - if (pd.tos & IPTOS_LOWDELAY) - pd.pf_mtag->qid = r->pqid; - else - pd.pf_mtag->qid = r->qid; + if (action == PF_PASS) { +#if PF_ALTQ + if (altq_allowed && r->qid) { + if (pd.tos & IPTOS_LOWDELAY) + pd.pf_mtag->pftag_qid = r->pqid; + else + pd.pf_mtag->pftag_qid = r->qid; + } +#endif /* PF_ALTQ */ /* add hints for ecn */ - pd.pf_mtag->hdr = h; + pd.pf_mtag->pftag_hdr = h; + /* record address family */ + pd.pf_mtag->pftag_flags &= ~PF_TAG_HDR_INET; + pd.pf_mtag->pftag_flags |= PF_TAG_HDR_INET6; + /* record TCP vs. non-TCP */ + if (pd.proto == IPPROTO_TCP) + pd.pf_mtag->pftag_flags |= PF_TAG_TCP; + else + pd.pf_mtag->pftag_flags &= ~PF_TAG_TCP; } -#endif /* ALTQ */ if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && (s->nat_rule.ptr->action == PF_RDR || s->nat_rule.ptr->action == PF_BINAT) && IN6_IS_ADDR_LOOPBACK(&pd.dst->v6)) - pd.pf_mtag->flags |= PF_TAG_TRANSLATE_LOCALHOST; + pd.pf_mtag->pftag_flags |= PF_TAG_TRANSLATE_LOCALHOST; if (log) { struct pf_rule *lr; @@ -9495,7 +9142,6 @@ done: /* pf_route6 can free the mbuf causing *m0 to become NULL */ pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd); #else -#ifndef NO_APPLE_EXTENSIONS VERIFY(m == NULL || pd.mp == NULL || pd.mp == m); if (*m0) { @@ -9526,23 +9172,7 @@ done: /* pf_route6 can free the mbuf causing *m0 to become NULL */ pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd); } -#else - if (action != PF_SYNPROXY_DROP && r->rt) - /* pf_route6 can free the mbuf causing *m0 to become NULL */ - pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd); - - if (action == PF_PASS) { - m = *m0; - h = mtod(m, struct ip6_hdr *); - } - - if (action == PF_SYNPROXY_DROP) { - m_freem(*m0); - *m0 = NULL; - action = PF_PASS; - } -#endif -#endif +#endif /* 0 */ return (action); } @@ -9627,41 +9257,16 @@ pool_put(struct pool *pp, void *v) struct pf_mtag * pf_find_mtag(struct mbuf *m) { -#if !PF_PKTHDR - struct m_tag *mtag; - - if ((mtag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, - KERNEL_TAG_TYPE_PF, NULL)) == NULL) - return (NULL); - - return ((struct pf_mtag *)(mtag + 1)); -#else if (!(m->m_flags & M_PKTHDR)) return (NULL); - return (&m->m_pkthdr.pf_mtag); -#endif /* PF_PKTHDR */ + return (m_pftag(m)); } struct pf_mtag * pf_get_mtag(struct mbuf *m) { -#if !PF_PKTHDR - struct m_tag *mtag; - - if ((mtag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF, - NULL)) == NULL) { - mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF, - sizeof (struct pf_mtag), M_NOWAIT, m); - if (mtag == NULL) - return (NULL); 
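/*
 * Illustrative sketch, not from the patch: the surrounding hunk retires
 * the m_tag_locate()/m_tag_create() lookup in favor of a pf_mtag kept
 * inline in the packet header and returned by m_pftag().  A minimal,
 * self-contained model of that inline-tag pattern follows; struct pkt,
 * struct my_mtag and pkt_find_mtag() are stand-ins, not xnu types.
 */
#include <stdio.h>

struct my_mtag {			/* stand-in for struct pf_mtag */
	unsigned int	flags;
	unsigned int	flowhash;
};

struct pkt {				/* stand-in for an mbuf with M_PKTHDR */
	int		has_pkthdr;
	struct my_mtag	tag;		/* metadata lives inline: no allocation */
};

static struct my_mtag *
pkt_find_mtag(struct pkt *p)
{
	if (!p->has_pkthdr)
		return (NULL);		/* only packet headers carry metadata */
	return (&p->tag);
}

int
main(void)
{
	struct pkt p = { 1, { 0, 0 } };
	struct my_mtag *t = pkt_find_mtag(&p);

	if (t != NULL)			/* unlike the m_tag_create() path, this */
		t->flowhash = 0x5F00D;	/* lookup cannot fail under memory pressure */
	printf("flowhash=%#x\n", p.tag.flowhash);
	return (0);
}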
- bzero(mtag + 1, sizeof (struct pf_mtag)); - m_tag_prepend(m, mtag); - } - return ((struct pf_mtag *)(mtag + 1)); -#else return (pf_find_mtag(m)); -#endif /* PF_PKTHDR */ } uint64_t diff --git a/bsd/net/pf_if.c b/bsd/net/pf_if.c index 4c05205ba..66d939f92 100644 --- a/bsd/net/pf_if.c +++ b/bsd/net/pf_if.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2010 Apple Inc. All rights reserved. + * Copyright (c) 2007-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -148,7 +148,8 @@ pfi_kif_get(const char *kif_name) bzero(&s, sizeof (s)); strlcpy(s.pfik_name, kif_name, sizeof (s.pfik_name)); - if ((kif = RB_FIND(pfi_ifhead, &pfi_ifs, (struct pfi_kif *)&s)) != NULL) + if ((kif = RB_FIND(pfi_ifhead, &pfi_ifs, + (struct pfi_kif *)(void *)&s)) != NULL) return (kif); /* create new one */ @@ -461,8 +462,8 @@ pfi_instance_add(struct ifnet *ifp, int net, int flags) continue; } if ((flags & PFI_AFLAG_NETWORK) && af == AF_INET6 && - IN6_IS_ADDR_LINKLOCAL( - &((struct sockaddr_in6 *)ia->ifa_addr)->sin6_addr)) { + IN6_IS_ADDR_LINKLOCAL(&((struct sockaddr_in6 *) + (void *)ia->ifa_addr)->sin6_addr)) { IFA_UNLOCK(ia); continue; } @@ -484,10 +485,10 @@ pfi_instance_add(struct ifnet *ifp, int net, int flags) if (net2 == 128 && (flags & PFI_AFLAG_NETWORK)) { if (af == AF_INET) net2 = pfi_unmask(&((struct sockaddr_in *) - ia->ifa_netmask)->sin_addr); + (void *)ia->ifa_netmask)->sin_addr); else if (af == AF_INET6) net2 = pfi_unmask(&((struct sockaddr_in6 *) - ia->ifa_netmask)->sin6_addr); + (void *)ia->ifa_netmask)->sin6_addr); } if (af == AF_INET && net2 > 32) net2 = 32; @@ -536,9 +537,10 @@ pfi_address_add(struct sockaddr *sa, int af, int net) p->pfra_af = af; p->pfra_net = net; if (af == AF_INET) - p->pfra_ip4addr = ((struct sockaddr_in *)sa)->sin_addr; + p->pfra_ip4addr = ((struct sockaddr_in *)(void *)sa)->sin_addr; else if (af == AF_INET6) { - p->pfra_ip6addr = ((struct sockaddr_in6 *)sa)->sin6_addr; + p->pfra_ip6addr = + ((struct sockaddr_in6 *)(void *)sa)->sin6_addr; if (IN6_IS_SCOPE_EMBED(&p->pfra_ip6addr)) p->pfra_ip6addr.s6_addr16[1] = 0; } @@ -601,7 +603,7 @@ pfi_update_status(const char *name, struct pf_status *pfs) lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); strlcpy(key.pfik_name, name, sizeof (key.pfik_name)); - p = RB_FIND(pfi_ifhead, &pfi_ifs, (struct pfi_kif *)&key); + p = RB_FIND(pfi_ifhead, &pfi_ifs, (struct pfi_kif *)(void *)&key); if (p == NULL) return; diff --git a/bsd/net/pf_ioctl.c b/bsd/net/pf_ioctl.c index 25763d8f5..6e76775a7 100644 --- a/bsd/net/pf_ioctl.c +++ b/bsd/net/pf_ioctl.c @@ -83,6 +83,7 @@ #include +#include #include #include #include @@ -95,8 +96,16 @@ #include #include +#if DUMMYNET +#include +#else +struct ip_fw_args; +#endif /* DUMMYNET */ + #include +#include + #include #include @@ -114,9 +123,14 @@ #include #endif /* INET6 */ -#if ALTQ -#include -#endif /* ALTQ */ +#if PF_ALTQ +#include +#include +#include +#include +#include +#include +#endif /* PF_ALTQ */ #if 0 static void pfdetach(void); @@ -124,12 +138,34 @@ static void pfdetach(void); static int pfopen(dev_t, int, int, struct proc *); static int pfclose(dev_t, int, int, struct proc *); static int pfioctl(dev_t, u_long, caddr_t, int, struct proc *); +static int pfioctl_ioc_table(u_long, struct pfioc_table_32 *, + struct pfioc_table_64 *, struct proc *); +static int pfioctl_ioc_tokens(u_long, struct pfioc_tokens_32 *, + struct pfioc_tokens_64 *, struct proc *); +static int pfioctl_ioc_rule(u_long, int, struct pfioc_rule *, struct proc *); +static int pfioctl_ioc_state_kill(u_long, 
struct pfioc_state_kill *, + struct proc *); +static int pfioctl_ioc_state(u_long, struct pfioc_state *, struct proc *); +static int pfioctl_ioc_states(u_long, struct pfioc_states_32 *, + struct pfioc_states_64 *, struct proc *); +static int pfioctl_ioc_natlook(u_long, struct pfioc_natlook *, struct proc *); +static int pfioctl_ioc_tm(u_long, struct pfioc_tm *, struct proc *); +static int pfioctl_ioc_limit(u_long, struct pfioc_limit *, struct proc *); +static int pfioctl_ioc_pooladdr(u_long, struct pfioc_pooladdr *, struct proc *); +static int pfioctl_ioc_ruleset(u_long, struct pfioc_ruleset *, struct proc *); +static int pfioctl_ioc_trans(u_long, struct pfioc_trans_32 *, + struct pfioc_trans_64 *, struct proc *); +static int pfioctl_ioc_src_nodes(u_long, struct pfioc_src_nodes_32 *, + struct pfioc_src_nodes_64 *, struct proc *); +static int pfioctl_ioc_src_node_kill(u_long, struct pfioc_src_node_kill *, + struct proc *); +static int pfioctl_ioc_iface(u_long, struct pfioc_iface_32 *, + struct pfioc_iface_64 *, struct proc *); static struct pf_pool *pf_get_pool(char *, u_int32_t, u_int8_t, u_int32_t, u_int8_t, u_int8_t, u_int8_t); - static void pf_mv_pool(struct pf_palist *, struct pf_palist *); static void pf_empty_pool(struct pf_palist *); -#if ALTQ +#if PF_ALTQ static int pf_begin_altq(u_int32_t *); static int pf_rollback_altq(u_int32_t); static int pf_commit_altq(u_int32_t); @@ -137,18 +173,15 @@ static int pf_enable_altq(struct pf_altq *); static int pf_disable_altq(struct pf_altq *); static void pf_altq_copyin(struct pf_altq *, struct pf_altq *); static void pf_altq_copyout(struct pf_altq *, struct pf_altq *); -#endif /* ALTQ */ +#endif /* PF_ALTQ */ static int pf_begin_rules(u_int32_t *, int, const char *); static int pf_rollback_rules(u_int32_t, int, char *); static int pf_setup_pfsync_matching(struct pf_ruleset *); static void pf_hash_rule(MD5_CTX *, struct pf_rule *); -#ifndef NO_APPLE_EXTENSIONS static void pf_hash_rule_addr(MD5_CTX *, struct pf_rule_addr *, u_int8_t); -#else -static void pf_hash_rule_addr(MD5_CTX *, struct pf_rule_addr *); -#endif static int pf_commit_rules(u_int32_t, int, char *); -static void pf_rule_copyin(struct pf_rule *, struct pf_rule *, struct proc *); +static void pf_rule_copyin(struct pf_rule *, struct pf_rule *, struct proc *, + int); static void pf_rule_copyout(struct pf_rule *, struct pf_rule *); static void pf_state_export(struct pfsync_state *, struct pf_state_key *, struct pf_state *); @@ -156,6 +189,16 @@ static void pf_state_import(struct pfsync_state *, struct pf_state_key *, struct pf_state *); static void pf_pooladdr_copyin(struct pf_pooladdr *, struct pf_pooladdr *); static void pf_pooladdr_copyout(struct pf_pooladdr *, struct pf_pooladdr *); +static void pf_expire_states_and_src_nodes(struct pf_rule *); +static void pf_delete_rule_from_ruleset(struct pf_ruleset *, + int, struct pf_rule *); +static int pf_rule_setup(struct pfioc_rule *, struct pf_rule *, + struct pf_ruleset *); +static void pf_delete_rule_by_owner(char *); +static int pf_delete_rule_by_ticket(struct pfioc_rule *); +static void pf_ruleset_cleanup(struct pf_ruleset *, int); +static void pf_deleterule_anchor_step_out(struct pf_ruleset **, + int, struct pf_rule **); #define PF_CDEV_MAJOR (-1) @@ -189,27 +232,37 @@ static void pf_detach_hooks(void); */ int pf_is_enabled = 0; +#if PF_ALTQ +u_int32_t altq_allowed = 0; +#endif /* PF_ALTQ */ + +u_int32_t pf_hash_seed; + /* * These are the pf enabled reference counting variables */ static u_int64_t pf_enabled_ref_count; static 
u_int32_t nr_tokens = 0; +static u_int64_t pffwrules; +static u_int32_t pfdevcnt; SLIST_HEAD(list_head, pfioc_kernel_token); static struct list_head token_list_head; struct pf_rule pf_default_rule; -#if ALTQ +#if PF_ALTQ static int pf_altq_running; -#endif /* ALTQ */ +#endif /* PF_ALTQ */ #define TAGID_MAX 50000 +#if !PF_ALTQ static TAILQ_HEAD(pf_tags, pf_tagname) pf_tags = TAILQ_HEAD_INITIALIZER(pf_tags); -#if ALTQ -static TAILQ_HEAD(pf_tags, pf_tagname) pf_qids = - TAILQ_HEAD_INITIALIZER(pf_qids); -#endif /* ALTQ */ +#else /* PF_ALTQ */ +static TAILQ_HEAD(pf_tags, pf_tagname) + pf_tags = TAILQ_HEAD_INITIALIZER(pf_tags), + pf_qids = TAILQ_HEAD_INITIALIZER(pf_qids); +#endif /* PF_ALTQ */ #if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE) #error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE @@ -222,17 +275,77 @@ static void pf_rtlabel_remove(struct pf_addr_wrap *); static void pf_rtlabel_copyout(struct pf_addr_wrap *); #if INET -static int pf_inet_hook(struct ifnet *, struct mbuf **, int); +static int pf_inet_hook(struct ifnet *, struct mbuf **, int, + struct ip_fw_args *); #endif /* INET */ #if INET6 -static int pf_inet6_hook(struct ifnet *, struct mbuf **, int); +static int pf_inet6_hook(struct ifnet *, struct mbuf **, int, + struct ip_fw_args *); #endif /* INET6 */ -#define DPFPRINTF(n, x) if (pf_status.debug >= (n)) printf x +#define DPFPRINTF(n, x) if (pf_status.debug >= (n)) printf x + +/* + * Helper macros for ioctl structures which vary in size (32-bit vs. 64-bit) + */ +#define PFIOCX_STRUCT_DECL(s) \ +struct { \ + union { \ + struct s##_32 _s##_32; \ + struct s##_64 _s##_64; \ + } _u; \ +} *s##_un = NULL \ + +#define PFIOCX_STRUCT_BEGIN(a, s, _action) { \ + VERIFY(s##_un == NULL); \ + s##_un = _MALLOC(sizeof (*s##_un), M_TEMP, M_WAITOK|M_ZERO); \ + if (s##_un == NULL) { \ + _action \ + } else { \ + if (p64) \ + bcopy(a, &s##_un->_u._s##_64, \ + sizeof (struct s##_64)); \ + else \ + bcopy(a, &s##_un->_u._s##_32, \ + sizeof (struct s##_32)); \ + } \ +} + +#define PFIOCX_STRUCT_END(s, a) { \ + VERIFY(s##_un != NULL); \ + if (p64) \ + bcopy(&s##_un->_u._s##_64, a, sizeof (struct s##_64)); \ + else \ + bcopy(&s##_un->_u._s##_32, a, sizeof (struct s##_32)); \ + _FREE(s##_un, M_TEMP); \ + s##_un = NULL; \ +} + +#define PFIOCX_STRUCT_ADDR32(s) (&s##_un->_u._s##_32) +#define PFIOCX_STRUCT_ADDR64(s) (&s##_un->_u._s##_64) + +/* + * Helper macros for regular ioctl structures. + */ +#define PFIOC_STRUCT_BEGIN(a, v, _action) { \ + VERIFY((v) == NULL); \ + (v) = _MALLOC(sizeof (*(v)), M_TEMP, M_WAITOK|M_ZERO); \ + if ((v) == NULL) { \ + _action \ + } else { \ + bcopy(a, v, sizeof (*(v))); \ + } \ +} + +#define PFIOC_STRUCT_END(v, a) { \ + VERIFY((v) != NULL); \ + bcopy(v, a, sizeof (*(v))); \ + _FREE(v, M_TEMP); \ + (v) = NULL; \ +} -#define PF_USER_ADDR(a, s, f) \ - (proc_is64bit(current_proc()) ? 
\ - ((struct s##_64 *)a)->f : ((struct s##_32 *)a)->f) +#define PFIOC_STRUCT_ADDR32(s) (&s##_un->_u._s##_32) +#define PFIOC_STRUCT_ADDR64(s) (&s##_un->_u._s##_64) static lck_attr_t *pf_perim_lock_attr; static lck_grp_t *pf_perim_lock_grp; @@ -247,38 +360,39 @@ struct thread *pf_purge_thread; extern void pfi_kifaddr_update(void *); /* pf enable ref-counting helper functions */ -static u_int64_t generate_token(void); -static int remove_token(struct pfioc_remove_token *); -static void invalidate_all_tokens(void); +static u_int64_t generate_token(struct proc *); +static int remove_token(struct pfioc_remove_token *); +static void invalidate_all_tokens(void); static u_int64_t -generate_token(void) +generate_token(struct proc *p) { u_int64_t token_value; struct pfioc_kernel_token *new_token; - new_token = _MALLOC(sizeof (struct pfioc_kernel_token), M_TEMP, M_WAITOK|M_ZERO); + new_token = _MALLOC(sizeof (struct pfioc_kernel_token), M_TEMP, + M_WAITOK|M_ZERO); lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); if (new_token == NULL) { /* malloc failed! bail! */ printf("%s: unable to allocate pf token structure!", __func__); - return 0; + return (0); } token_value = (u_int64_t)(uintptr_t)new_token; new_token->token.token_value = token_value; - new_token->token.pid = proc_pid(current_proc()); + new_token->token.pid = proc_pid(p); proc_name(new_token->token.pid, new_token->token.proc_name, - sizeof (new_token->token.proc_name)); + sizeof (new_token->token.proc_name)); new_token->token.timestamp = pf_calendar_time_second(); SLIST_INSERT_HEAD(&token_list_head, new_token, next); nr_tokens++; - return token_value; + return (token_value); } static int @@ -290,15 +404,16 @@ remove_token(struct pfioc_remove_token *tok) SLIST_FOREACH_SAFE(entry, &token_list_head, next, tmp) { if (tok->token_value == entry->token.token_value) { - SLIST_REMOVE(&token_list_head, entry, pfioc_kernel_token, next); + SLIST_REMOVE(&token_list_head, entry, + pfioc_kernel_token, next); _FREE(entry, M_TEMP); nr_tokens--; - return 0; /* success */ + return (0); /* success */ } } printf("pf : remove failure\n"); - return ESRCH; /* failure */ + return (ESRCH); /* failure */ } static void @@ -314,8 +429,6 @@ invalidate_all_tokens(void) } nr_tokens = 0; - - return; } void @@ -328,13 +441,12 @@ pfinit(void) pf_perim_lock_grp = lck_grp_alloc_init("pf_perim", pf_perim_lock_grp_attr); pf_perim_lock_attr = lck_attr_alloc_init(); - pf_perim_lock = lck_rw_alloc_init(pf_perim_lock_grp, - pf_perim_lock_attr); + lck_rw_init(pf_perim_lock, pf_perim_lock_grp, pf_perim_lock_attr); pf_lock_grp_attr = lck_grp_attr_alloc_init(); pf_lock_grp = lck_grp_alloc_init("pf", pf_lock_grp_attr); pf_lock_attr = lck_attr_alloc_init(); - pf_lock = lck_mtx_alloc_init(pf_lock_grp, pf_lock_attr); + lck_mtx_init(pf_lock, pf_lock_grp, pf_lock_attr); pool_init(&pf_rule_pl, sizeof (struct pf_rule), 0, 0, 0, "pfrulepl", NULL); @@ -344,14 +456,12 @@ pfinit(void) NULL); pool_init(&pf_state_key_pl, sizeof (struct pf_state_key), 0, 0, 0, "pfstatekeypl", NULL); -#ifndef NO_APPLE_EXTENSIONS pool_init(&pf_app_state_pl, sizeof (struct pf_app_state), 0, 0, 0, "pfappstatepl", NULL); -#endif -#if ALTQ +#if PF_ALTQ pool_init(&pf_altq_pl, sizeof (struct pf_altq), 0, 0, 0, "pfaltqpl", NULL); -#endif /* ALTQ */ +#endif /* PF_ALTQ */ pool_init(&pf_pooladdr_pl, sizeof (struct pf_pooladdr), 0, 0, 0, "pfpooladdrpl", NULL); pfr_initialize(); @@ -370,12 +480,32 @@ pfinit(void) pf_init_ruleset(&pf_main_ruleset); TAILQ_INIT(&pf_pabuf); TAILQ_INIT(&state_list); -#if ALTQ +#if PF_ALTQ 
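/*
 * Illustrative sketch, not from the patch: the _CASSERT() lines added a
 * few statements below check at compile time that each mbuf service
 * class constant masks down to its matching scheduler index.  A
 * self-contained model of that compile-time assertion idiom follows;
 * MY_CASSERT and the example constants are hypothetical stand-ins.
 */
#define MY_CASSERT(x) \
	typedef char my_cassert_failed[(x) ? 1 : -1]

#define IDX_MASK	0xff	/* low byte selects the queue index */
#define SC_EXAMPLE	0x0203	/* class value: group byte + index byte */
#define IDX_EXAMPLE	0x03

/* Negative array size => compile error whenever the identity breaks. */
MY_CASSERT((SC_EXAMPLE & IDX_MASK) == IDX_EXAMPLE);

int
main(void)
{
	return (0);
}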
TAILQ_INIT(&pf_altqs[0]); TAILQ_INIT(&pf_altqs[1]); pf_altqs_active = &pf_altqs[0]; pf_altqs_inactive = &pf_altqs[1]; -#endif /* ALTQ */ + + PE_parse_boot_argn("altq", &altq_allowed, sizeof (altq_allowed)); + + _CASSERT(ALTRQ_PURGE == CLASSQRQ_PURGE); + _CASSERT(ALTRQ_PURGE_SC == CLASSQRQ_PURGE_SC); + _CASSERT(ALTRQ_EVENT == CLASSQRQ_EVENT); + + _CASSERT(ALTDQ_REMOVE == CLASSQDQ_REMOVE); + _CASSERT(ALTDQ_POLL == CLASSQDQ_POLL); +#endif /* PF_ALTQ */ + + _CASSERT((SC_BE & SCIDX_MASK) == SCIDX_BE); + _CASSERT((SC_BK_SYS & SCIDX_MASK) == SCIDX_BK_SYS); + _CASSERT((SC_BK & SCIDX_MASK) == SCIDX_BK); + _CASSERT((SC_RD & SCIDX_MASK) == SCIDX_RD); + _CASSERT((SC_OAM & SCIDX_MASK) == SCIDX_OAM); + _CASSERT((SC_AV & SCIDX_MASK) == SCIDX_AV); + _CASSERT((SC_RV & SCIDX_MASK) == SCIDX_RV); + _CASSERT((SC_VI & SCIDX_MASK) == SCIDX_VI); + _CASSERT((SC_VO & SCIDX_MASK) == SCIDX_VO); + _CASSERT((SC_CTL & SCIDX_MASK) == SCIDX_CTL); /* default rule should never be garbage collected */ pf_default_rule.entries.tqe_prev = &pf_default_rule.entries.tqe_next; @@ -395,14 +525,12 @@ pfinit(void) t[PFTM_UDP_MULTIPLE] = PFTM_UDP_MULTIPLE_VAL; t[PFTM_ICMP_FIRST_PACKET] = PFTM_ICMP_FIRST_PACKET_VAL; t[PFTM_ICMP_ERROR_REPLY] = PFTM_ICMP_ERROR_REPLY_VAL; -#ifndef NO_APPLE_EXTENSIONS t[PFTM_GREv1_FIRST_PACKET] = PFTM_GREv1_FIRST_PACKET_VAL; t[PFTM_GREv1_INITIATING] = PFTM_GREv1_INITIATING_VAL; t[PFTM_GREv1_ESTABLISHED] = PFTM_GREv1_ESTABLISHED_VAL; t[PFTM_ESP_FIRST_PACKET] = PFTM_ESP_FIRST_PACKET_VAL; t[PFTM_ESP_INITIATING] = PFTM_ESP_INITIATING_VAL; t[PFTM_ESP_ESTABLISHED] = PFTM_ESP_ESTABLISHED_VAL; -#endif t[PFTM_OTHER_FIRST_PACKET] = PFTM_OTHER_FIRST_PACKET_VAL; t[PFTM_OTHER_SINGLE] = PFTM_OTHER_SINGLE_VAL; t[PFTM_OTHER_MULTIPLE] = PFTM_OTHER_MULTIPLE_VAL; @@ -416,6 +544,7 @@ pfinit(void) pf_normalize_init(); bzero(&pf_status, sizeof (pf_status)); pf_status.debug = PF_DEBUG_URGENT; + pf_hash_seed = random(); /* XXX do our best to avoid a conflict */ pf_status.hostid = random(); @@ -431,9 +560,12 @@ pfinit(void) printf("%s: failed to allocate major number!\n", __func__); return; } - (void) devfs_make_node(makedev(maj, 0), DEVFS_CHAR, + (void) devfs_make_node(makedev(maj, PFDEV_PF), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0600, "pf", 0); + (void) devfs_make_node(makedev(maj, PFDEV_PFM), DEVFS_CHAR, + UID_ROOT, GID_WHEEL, 0600, "pfm", 0); + pf_attach_hooks(); } @@ -458,10 +590,10 @@ pfdetach(void) for (i = 0; i < PF_RULESET_MAX; i++) if (pf_begin_rules(&ticket, i, &r) == 0) pf_commit_rules(ticket, i, &r); -#if ALTQ +#if PF_ALTQ if (pf_begin_altq(&ticket) == 0) pf_commit_altq(ticket); -#endif /* ALTQ */ +#endif /* PF_ALTQ */ /* clear states */ RB_FOREACH(state, pf_state_tree_id, &tree_id) { @@ -503,9 +635,9 @@ pfdetach(void) /* destroy the pools */ pool_destroy(&pf_pooladdr_pl); -#if ALTQ +#if PF_ALTQ pool_destroy(&pf_altq_pl); -#endif /* ALTQ */ +#endif /* PF_ALTQ */ pool_destroy(&pf_state_pl); pool_destroy(&pf_rule_pl); pool_destroy(&pf_src_tree_pl); @@ -522,8 +654,18 @@ static int pfopen(dev_t dev, int flags, int fmt, struct proc *p) { #pragma unused(flags, fmt, p) - if (minor(dev) >= 1) + if (minor(dev) >= PFDEV_MAX) return (ENXIO); + + if (minor(dev) == PFDEV_PFM) { + lck_mtx_lock(pf_lock); + if (pfdevcnt != 0) { + lck_mtx_unlock(pf_lock); + return (EBUSY); + } + pfdevcnt++; + lck_mtx_unlock(pf_lock); + } return (0); } @@ -531,8 +673,15 @@ static int pfclose(dev_t dev, int flags, int fmt, struct proc *p) { #pragma unused(flags, fmt, p) - if (minor(dev) >= 1) + if (minor(dev) >= PFDEV_MAX) return (ENXIO); + + if (minor(dev) 
== PFDEV_PFM) { + lck_mtx_lock(pf_lock); + VERIFY(pfdevcnt > 0); + pfdevcnt--; + lck_mtx_unlock(pf_lock); + } return (0); } @@ -630,11 +779,13 @@ pf_rm_rule(struct pf_rulequeue *rulequeue, struct pf_rule *rule) return; pf_tag_unref(rule->tag); pf_tag_unref(rule->match_tag); -#if ALTQ - if (rule->pqid != rule->qid) - pf_qid_unref(rule->pqid); - pf_qid_unref(rule->qid); -#endif /* ALTQ */ +#if PF_ALTQ + if (altq_allowed) { + if (rule->pqid != rule->qid) + pf_qid_unref(rule->pqid); + pf_qid_unref(rule->qid); + } +#endif /* PF_ALTQ */ pf_rtlabel_remove(&rule->src.addr); pf_rtlabel_remove(&rule->dst.addr); pfi_dynaddr_remove(&rule->src.addr); @@ -775,22 +926,28 @@ pf_rtlabel_copyout(struct pf_addr_wrap *a) #pragma unused(a) } -#if ALTQ +#if PF_ALTQ u_int32_t pf_qname2qid(char *qname) { + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + return ((u_int32_t)tagname2tag(&pf_qids, qname)); } void pf_qid2qname(u_int32_t qid, char *p) { + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + tag2tagname(&pf_qids, (u_int16_t)qid, p); } void pf_qid_unref(u_int32_t qid) { + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + tag_unref(&pf_qids, (u_int16_t)qid); } @@ -800,10 +957,12 @@ pf_begin_altq(u_int32_t *ticket) struct pf_altq *altq; int error = 0; + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + /* Purge the old altq list */ while ((altq = TAILQ_FIRST(pf_altqs_inactive)) != NULL) { TAILQ_REMOVE(pf_altqs_inactive, altq, entries); - if (altq->qname[0] == 0) { + if (altq->qname[0] == '\0') { /* detach and destroy the discipline */ error = altq_remove(altq); } else @@ -823,12 +982,14 @@ pf_rollback_altq(u_int32_t ticket) struct pf_altq *altq; int error = 0; + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + if (!altqs_inactive_open || ticket != ticket_altqs_inactive) return (0); /* Purge the old altq list */ while ((altq = TAILQ_FIRST(pf_altqs_inactive)) != NULL) { TAILQ_REMOVE(pf_altqs_inactive, altq, entries); - if (altq->qname[0] == 0) { + if (altq->qname[0] == '\0') { /* detach and destroy the discipline */ error = altq_remove(altq); } else @@ -844,13 +1005,14 @@ pf_commit_altq(u_int32_t ticket) { struct pf_altqqueue *old_altqs; struct pf_altq *altq; - int s, err, error = 0; + int err, error = 0; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); if (!altqs_inactive_open || ticket != ticket_altqs_inactive) return (EBUSY); /* swap altqs, keep the old. 
*/ - s = splnet(); old_altqs = pf_altqs_active; pf_altqs_active = pf_altqs_inactive; pf_altqs_inactive = old_altqs; @@ -858,13 +1020,12 @@ pf_commit_altq(u_int32_t ticket) /* Attach new disciplines */ TAILQ_FOREACH(altq, pf_altqs_active, entries) { - if (altq->qname[0] == 0) { + if (altq->qname[0] == '\0') { /* attach the discipline */ error = altq_pfattach(altq); if (error == 0 && pf_altq_running) error = pf_enable_altq(altq); if (error != 0) { - splx(s); return (error); } } @@ -873,7 +1034,7 @@ pf_commit_altq(u_int32_t ticket) /* Purge the old altq list */ while ((altq = TAILQ_FIRST(pf_altqs_inactive)) != NULL) { TAILQ_REMOVE(pf_altqs_inactive, altq, entries); - if (altq->qname[0] == 0) { + if (altq->qname[0] == '\0') { /* detach and destroy the discipline */ if (pf_altq_running) error = pf_disable_altq(altq); @@ -887,7 +1048,6 @@ pf_commit_altq(u_int32_t ticket) pf_qid_unref(altq->qid); pool_put(&pf_altq_pl, altq); } - splx(s); altqs_inactive_open = 0; return (error); @@ -897,23 +1057,40 @@ static int pf_enable_altq(struct pf_altq *altq) { struct ifnet *ifp; - struct tb_profile tb; - int s, error = 0; + struct ifclassq *ifq; + int error = 0; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); if ((ifp = ifunit(altq->ifname)) == NULL) return (EINVAL); - if (ifp->if_snd.altq_type != ALTQT_NONE) - error = altq_enable(&ifp->if_snd); + ifq = &ifp->if_snd; + IFCQ_LOCK(ifq); + if (IFCQ_ALTQ(ifq)->altq_type != ALTQT_NONE) + error = altq_enable(IFCQ_ALTQ(ifq)); + + /* set or clear tokenbucket regulator */ + if (error == 0 && ifp != NULL && ALTQ_IS_ENABLED(IFCQ_ALTQ(ifq))) { + struct tb_profile tb = { 0, 0, 0 }; - /* set tokenbucket regulator */ - if (error == 0 && ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) { - tb.rate = altq->ifbandwidth; - tb.depth = altq->tbrsize; - s = splnet(); - error = tbr_set(&ifp->if_snd, &tb); - splx(s); + if (altq->aflags & PF_ALTQF_TBR) { + if (altq->bwtype != PF_ALTQ_BW_ABSOLUTE && + altq->bwtype != PF_ALTQ_BW_PERCENT) { + error = EINVAL; + } else { + if (altq->bwtype == PF_ALTQ_BW_ABSOLUTE) + tb.rate = altq->ifbandwidth; + else + tb.percent = altq->ifbandwidth; + tb.depth = altq->tbrsize; + error = ifclassq_tbr_set(ifq, &tb, TRUE); + } + } else if (IFCQ_TBR_IS_ENABLED(ifq)) { + error = ifclassq_tbr_set(ifq, &tb, TRUE); + } } + IFCQ_UNLOCK(ifq); return (error); } @@ -922,8 +1099,10 @@ static int pf_disable_altq(struct pf_altq *altq) { struct ifnet *ifp; - struct tb_profile tb; - int s, error; + struct ifclassq *ifq; + int error; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); if ((ifp = ifunit(altq->ifname)) == NULL) return (EINVAL); @@ -932,18 +1111,21 @@ pf_disable_altq(struct pf_altq *altq) * when the discipline is no longer referenced, it was overridden * by a new one. if so, just return. 
*/ - if (altq->altq_disc != ifp->if_snd.altq_disc) + ifq = &ifp->if_snd; + IFCQ_LOCK(ifq); + if (altq->altq_disc != IFCQ_ALTQ(ifq)->altq_disc) { + IFCQ_UNLOCK(ifq); return (0); + } - error = altq_disable(&ifp->if_snd); + error = altq_disable(IFCQ_ALTQ(ifq)); - if (error == 0) { + if (error == 0 && IFCQ_TBR_IS_ENABLED(ifq)) { /* clear tokenbucket regulator */ - tb.rate = 0; - s = splnet(); - error = tbr_set(&ifp->if_snd, &tb); - splx(s); + struct tb_profile tb = { 0, 0, 0 }; + error = ifclassq_tbr_set(ifq, &tb, TRUE); } + IFCQ_UNLOCK(ifq); return (error); } @@ -957,18 +1139,22 @@ pf_altq_copyin(struct pf_altq *src, struct pf_altq *dst) dst->qname[sizeof (dst->qname) - 1] = '\0'; dst->parent[sizeof (dst->parent) - 1] = '\0'; dst->altq_disc = NULL; - TAILQ_INIT(&dst->entries); + dst->entries.tqe_next = NULL; + dst->entries.tqe_prev = NULL; } static void pf_altq_copyout(struct pf_altq *src, struct pf_altq *dst) { - bcopy(src, dst, sizeof (struct pf_altq)); + struct pf_altq pa; - dst->altq_disc = NULL; - TAILQ_INIT(&dst->entries); + bcopy(src, &pa, sizeof (struct pf_altq)); + pa.altq_disc = NULL; + pa.entries.tqe_next = NULL; + pa.entries.tqe_prev = NULL; + bcopy(&pa, dst, sizeof (struct pf_altq)); } -#endif /* ALTQ */ +#endif /* PF_ALTQ */ static int pf_begin_rules(u_int32_t *ticket, int rs_num, const char *anchor) @@ -1010,29 +1196,24 @@ pf_rollback_rules(u_int32_t ticket, int rs_num, char *anchor) return (0); } -#define PF_MD5_UPD(st, elm) \ +#define PF_MD5_UPD(st, elm) \ MD5Update(ctx, (u_int8_t *)&(st)->elm, sizeof ((st)->elm)) -#define PF_MD5_UPD_STR(st, elm) \ +#define PF_MD5_UPD_STR(st, elm) \ MD5Update(ctx, (u_int8_t *)(st)->elm, strlen((st)->elm)) -#define PF_MD5_UPD_HTONL(st, elm, stor) do { \ +#define PF_MD5_UPD_HTONL(st, elm, stor) do { \ (stor) = htonl((st)->elm); \ MD5Update(ctx, (u_int8_t *)&(stor), sizeof (u_int32_t)); \ } while (0) -#define PF_MD5_UPD_HTONS(st, elm, stor) do { \ +#define PF_MD5_UPD_HTONS(st, elm, stor) do { \ (stor) = htons((st)->elm); \ MD5Update(ctx, (u_int8_t *)&(stor), sizeof (u_int16_t)); \ } while (0) -#ifndef NO_APPLE_EXTENSIONS static void pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr, u_int8_t proto) -#else -static void -pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr) -#endif { PF_MD5_UPD(pfr, addr.type); switch (pfr->addr.type) { @@ -1053,7 +1234,6 @@ pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr) break; } -#ifndef NO_APPLE_EXTENSIONS switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: @@ -1067,12 +1247,6 @@ pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr) } PF_MD5_UPD(pfr, neg); -#else - PF_MD5_UPD(pfr, port[0]); - PF_MD5_UPD(pfr, port[1]); - PF_MD5_UPD(pfr, neg); - PF_MD5_UPD(pfr, port_op); -#endif } static void @@ -1081,13 +1255,8 @@ pf_hash_rule(MD5_CTX *ctx, struct pf_rule *rule) u_int16_t x; u_int32_t y; -#ifndef NO_APPLE_EXTENSIONS pf_hash_rule_addr(ctx, &rule->src, rule->proto); pf_hash_rule_addr(ctx, &rule->dst, rule->proto); -#else - pf_hash_rule_addr(ctx, &rule->src); - pf_hash_rule_addr(ctx, &rule->dst); -#endif PF_MD5_UPD_STR(rule, label); PF_MD5_UPD_STR(rule, ifname); PF_MD5_UPD_STR(rule, match_tagname); @@ -1177,7 +1346,8 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor) } static void -pf_rule_copyin(struct pf_rule *src, struct pf_rule *dst, struct proc *p) +pf_rule_copyin(struct pf_rule *src, struct pf_rule *dst, struct proc *p, + int minordev) { bcopy(src, dst, sizeof (struct pf_rule)); @@ -1205,6 +1375,8 @@ pf_rule_copyin(struct pf_rule *src, struct pf_rule *dst, struct proc *p) 
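/*
 * Illustrative sketch, not from the patch: the pf_rule_copyin() change
 * just below stamps PFRULE_PFM on rules submitted through the new
 * /dev/pfm minor device, so such rules can later be told apart from
 * ordinary /dev/pf rules.  A minimal model of tagging by device minor
 * follows; DEV_MGMT, RULE_FROM_MGMT and rule_copyin() are stand-ins.
 */
#include <stdio.h>

#define DEV_PF		0	/* stand-in for PFDEV_PF */
#define DEV_MGMT	1	/* stand-in for PFDEV_PFM */
#define RULE_FROM_MGMT	0x0001	/* stand-in for PFRULE_PFM */

struct rule {
	unsigned int	rule_flag;
};

static void
rule_copyin(struct rule *dst, int minordev)
{
	dst->rule_flag = 0;
	if (minordev == DEV_MGMT)		/* remember which device */
		dst->rule_flag |= RULE_FROM_MGMT; /* node submitted the rule */
}

int
main(void)
{
	struct rule r;

	rule_copyin(&r, DEV_MGMT);
	printf("management rule? %s\n",
	    (r.rule_flag & RULE_FROM_MGMT) ? "yes" : "no");
	return (0);
}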
dst->entries.tqe_prev = NULL; dst->entries.tqe_next = NULL; + if ((uint8_t)minordev == PFDEV_PFM) + dst->rule_flag |= PFRULE_PFM; } static void @@ -1231,7 +1403,6 @@ pf_state_export(struct pfsync_state *sp, struct pf_state_key *sk, bzero(sp, sizeof (struct pfsync_state)); /* copy from state key */ -#ifndef NO_APPLE_EXTENSIONS sp->lan.addr = sk->lan.addr; sp->lan.xport = sk->lan.xport; sp->gwy.addr = sk->gwy.addr; @@ -1240,17 +1411,10 @@ pf_state_export(struct pfsync_state *sp, struct pf_state_key *sk, sp->ext.xport = sk->ext.xport; sp->proto_variant = sk->proto_variant; sp->tag = s->tag; -#else - sp->lan.addr = sk->lan.addr; - sp->lan.port = sk->lan.port; - sp->gwy.addr = sk->gwy.addr; - sp->gwy.port = sk->gwy.port; - sp->ext.addr = sk->ext.addr; - sp->ext.port = sk->ext.port; -#endif sp->proto = sk->proto; sp->af = sk->af; sp->direction = sk->direction; + sp->flowhash = sk->flowhash; /* copy from state */ memcpy(&sp->id, &s->id, sizeof (sp->id)); @@ -1292,7 +1456,6 @@ pf_state_import(struct pfsync_state *sp, struct pf_state_key *sk, struct pf_state *s) { /* copy to state key */ -#ifndef NO_APPLE_EXTENSIONS sk->lan.addr = sp->lan.addr; sk->lan.xport = sp->lan.xport; sk->gwy.addr = sp->gwy.addr; @@ -1301,17 +1464,10 @@ pf_state_import(struct pfsync_state *sp, struct pf_state_key *sk, sk->ext.xport = sp->ext.xport; sk->proto_variant = sp->proto_variant; s->tag = sp->tag; -#else - sk->lan.addr = sp->lan.addr; - sk->lan.port = sp->lan.port; - sk->gwy.addr = sp->gwy.addr; - sk->gwy.port = sp->gwy.port; - sk->ext.addr = sp->ext.addr; - sk->ext.port = sp->ext.port; -#endif sk->proto = sp->proto; sk->af = sp->af; sk->direction = sp->direction; + sk->flowhash = pf_calc_state_key_flowhash(sk); /* copy to state */ memcpy(&s->id, &sp->id, sizeof (sp->id)); @@ -1429,9 +1585,9 @@ static int pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) { #pragma unused(dev) - struct pf_pooladdr *pa = NULL; - struct pf_pool *pool = NULL; - int error = 0; + int p64 = proc_is64bit(p); + int error = 0; + int minordev = minor(dev); if (kauth_cred_issuser(kauth_cred_get()) == 0) return (EPERM); @@ -1450,6 +1606,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) case DIOCNATLOOK: case DIOCSETDEBUG: case DIOCGETSTATES: + case DIOCINSERTRULE: + case DIOCDELETERULE: case DIOCGETTIMEOUT: case DIOCCLRRULECTRS: case DIOCGETLIMIT: @@ -1473,17 +1631,23 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) case DIOCGETSRCNODES: case DIOCCLRSRCNODES: case DIOCIGETIFACES: + case DIOCGIFSPEED: case DIOCSETIFFLAG: case DIOCCLRIFFLAG: break; case DIOCRCLRTABLES: case DIOCRADDTABLES: case DIOCRDELTABLES: - case DIOCRSETTFLAGS: - if (((struct pfioc_table *)addr)->pfrio_flags & - PFR_FLAG_DUMMY) + case DIOCRSETTFLAGS: { + int pfrio_flags; + + bcopy(&((struct pfioc_table *)(void *)addr)-> + pfrio_flags, &pfrio_flags, sizeof (pfrio_flags)); + + if (pfrio_flags & PFR_FLAG_DUMMY) break; /* dummy operation ok */ return (EPERM); + } default: return (EPERM); } @@ -1501,6 +1665,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) case DIOCGETSTATE: case DIOCGETSTATUS: case DIOCGETSTATES: + case DIOCINSERTRULE: + case DIOCDELETERULE: case DIOCGETTIMEOUT: case DIOCGETLIMIT: case DIOCGETALTQS: @@ -1517,6 +1683,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) case DIOCOSFPGET: case DIOCGETSRCNODES: case DIOCIGETIFACES: + case DIOCGIFSPEED: break; case DIOCRCLRTABLES: case DIOCRADDTABLES: @@ -1526,22 +1693,48 @@ pfioctl(dev_t dev, 
u_long cmd, caddr_t addr, int flags, struct proc *p) case DIOCRADDADDRS: case DIOCRDELADDRS: case DIOCRSETADDRS: - case DIOCRSETTFLAGS: - if (((struct pfioc_table *)addr)->pfrio_flags & - PFR_FLAG_DUMMY) { + case DIOCRSETTFLAGS: { + int pfrio_flags; + + bcopy(&((struct pfioc_table *)(void *)addr)-> + pfrio_flags, &pfrio_flags, sizeof (pfrio_flags)); + + if (pfrio_flags & PFR_FLAG_DUMMY) { flags |= FWRITE; /* need write lock for dummy */ break; /* dummy operation ok */ } return (EACCES); - case DIOCGETRULE: - if (((struct pfioc_rule *)addr)->action == - PF_GET_CLR_CNTR) + } + case DIOCGETRULE: { + u_int32_t action; + + bcopy(&((struct pfioc_rule *)(void *)addr)->action, + &action, sizeof (action)); + + if (action == PF_GET_CLR_CNTR) return (EACCES); break; + } default: return (EACCES); } +#if PF_ALTQ + switch (cmd) { + case DIOCSTARTALTQ: + case DIOCSTOPALTQ: + case DIOCADDALTQ: + case DIOCGETALTQS: + case DIOCGETALTQ: + case DIOCCHANGEALTQ: + case DIOCGETQSTATS: + /* fail if ALTQ is disabled */ + if (!altq_allowed) + return (ENODEV); + break; + } +#endif /* PF_ALTQ */ + if (flags & FWRITE) lck_rw_lock_exclusive(pf_perim_lock); else @@ -1573,11 +1766,14 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } break; - case DIOCSTARTREF: /* returns a token */ + case DIOCSTARTREF: /* u_int64_t */ if (pf_purge_thread == NULL) { error = ENOMEM; } else { - if ((*(u_int64_t *)addr = generate_token()) != 0) { + u_int64_t token; + + /* small enough to be on stack */ + if ((token = generate_token(p)) != 0) { if (pf_is_enabled == 0) { pf_start(); } @@ -1586,8 +1782,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } else { error = ENOMEM; DPFPRINTF(PF_DEBUG_URGENT, - ("pf: unable to generate token\n")); + ("pf: unable to generate token\n")); } + bcopy(&token, addr, sizeof (token)); } break; @@ -1601,230 +1798,1332 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } break; - case DIOCSTOPREF: + case DIOCSTOPREF: /* struct pfioc_remove_token */ if (!pf_status.running) { error = ENOENT; } else { - if ((error = remove_token( - (struct pfioc_remove_token*)addr))==0) { + struct pfioc_remove_token pfrt; + + /* small enough to be on stack */ + bcopy(addr, &pfrt, sizeof (pfrt)); + if ((error = remove_token(&pfrt)) == 0) { VERIFY(pf_enabled_ref_count != 0); pf_enabled_ref_count--; - // return currently held references - ((struct pfioc_remove_token *)addr)->refcount - = pf_enabled_ref_count; + /* return currently held references */ + pfrt.refcount = pf_enabled_ref_count; DPFPRINTF(PF_DEBUG_MISC, - ("pf: enabled refcount decremented\n")); + ("pf: enabled refcount decremented\n")); } else { error = EINVAL; DPFPRINTF(PF_DEBUG_URGENT, - ("pf: token mismatch\n")); - break; + ("pf: token mismatch\n")); } + bcopy(&pfrt, addr, sizeof (pfrt)); - if (pf_enabled_ref_count == 0) + if (error == 0 && pf_enabled_ref_count == 0) pf_stop(); } break; - case DIOCGETSTARTERS: { - struct pfioc_tokens *g_token = (struct pfioc_tokens *)addr; - struct pfioc_token *tokens; - struct pfioc_kernel_token *entry, *tmp; - user_addr_t token_buf; - int g_token_size_copy; - char *ptr; + case DIOCGETSTARTERS: { /* struct pfioc_tokens */ + PFIOCX_STRUCT_DECL(pfioc_tokens); - if (nr_tokens == 0) { - error = ENOENT; - break; - } + PFIOCX_STRUCT_BEGIN(addr, pfioc_tokens, error = ENOMEM; break;); + error = pfioctl_ioc_tokens(cmd, + PFIOCX_STRUCT_ADDR32(pfioc_tokens), + PFIOCX_STRUCT_ADDR64(pfioc_tokens), p); + PFIOCX_STRUCT_END(pfioc_tokens, addr); + break; + } + + case 
DIOCADDRULE: /* struct pfioc_rule */ + case DIOCGETRULES: /* struct pfioc_rule */ + case DIOCGETRULE: /* struct pfioc_rule */ + case DIOCCHANGERULE: /* struct pfioc_rule */ + case DIOCINSERTRULE: /* struct pfioc_rule */ + case DIOCDELETERULE: { /* struct pfioc_rule */ + struct pfioc_rule *pr = NULL; + + PFIOC_STRUCT_BEGIN(addr, pr, error = ENOMEM; break;); + error = pfioctl_ioc_rule(cmd, minordev, pr, p); + PFIOC_STRUCT_END(pr, addr); + break; + } - g_token_size_copy = g_token->size; + case DIOCCLRSTATES: /* struct pfioc_state_kill */ + case DIOCKILLSTATES: { /* struct pfioc_state_kill */ + struct pfioc_state_kill *psk = NULL; - if (g_token->size == 0) { - g_token->size = sizeof (struct pfioc_token) * nr_tokens; - break; - } + PFIOC_STRUCT_BEGIN(addr, psk, error = ENOMEM; break;); + error = pfioctl_ioc_state_kill(cmd, psk, p); + PFIOC_STRUCT_END(psk, addr); + break; + } - token_buf = PF_USER_ADDR(addr, pfioc_tokens, pgt_buf); - tokens = _MALLOC(sizeof(struct pfioc_token) * nr_tokens, - M_TEMP, M_WAITOK); + case DIOCADDSTATE: /* struct pfioc_state */ + case DIOCGETSTATE: { /* struct pfioc_state */ + struct pfioc_state *ps = NULL; - if (tokens == NULL) { - error = ENOMEM; + PFIOC_STRUCT_BEGIN(addr, ps, error = ENOMEM; break;); + error = pfioctl_ioc_state(cmd, ps, p); + PFIOC_STRUCT_END(ps, addr); + break; + } + + case DIOCGETSTATES: { /* struct pfioc_states */ + PFIOCX_STRUCT_DECL(pfioc_states); + + PFIOCX_STRUCT_BEGIN(addr, pfioc_states, error = ENOMEM; break;); + error = pfioctl_ioc_states(cmd, + PFIOCX_STRUCT_ADDR32(pfioc_states), + PFIOCX_STRUCT_ADDR64(pfioc_states), p); + PFIOCX_STRUCT_END(pfioc_states, addr); + break; + } + + case DIOCGETSTATUS: { /* struct pf_status */ + struct pf_status *s = NULL; + + PFIOC_STRUCT_BEGIN(&pf_status, s, error = ENOMEM; break;); + pfi_update_status(s->ifname, s); + PFIOC_STRUCT_END(s, addr); + break; + } + + case DIOCSETSTATUSIF: { /* struct pfioc_if */ + struct pfioc_if *pi = (struct pfioc_if *)(void *)addr; + + /* OK for unaligned accesses */ + if (pi->ifname[0] == 0) { + bzero(pf_status.ifname, IFNAMSIZ); break; } + strlcpy(pf_status.ifname, pi->ifname, IFNAMSIZ); + break; + } - ptr = (void *)tokens; - SLIST_FOREACH_SAFE(entry, &token_list_head, next, tmp) { - if ((unsigned)g_token_size_copy - < sizeof(struct pfioc_token)) - break; /* no more buffer space left */ + case DIOCCLRSTATUS: { + bzero(pf_status.counters, sizeof (pf_status.counters)); + bzero(pf_status.fcounters, sizeof (pf_status.fcounters)); + bzero(pf_status.scounters, sizeof (pf_status.scounters)); + pf_status.since = pf_calendar_time_second(); + if (*pf_status.ifname) + pfi_update_status(pf_status.ifname, NULL); + break; + } - ((struct pfioc_token *)(ptr))->token_value = entry->token.token_value; - ((struct pfioc_token *)(ptr))->timestamp = entry->token.timestamp; - ((struct pfioc_token *)(ptr))->pid = entry->token.pid; - memcpy(((struct pfioc_token *)(ptr))->proc_name, entry->token.proc_name, - PFTOK_PROCNAME_LEN); - ptr += sizeof(struct pfioc_token); + case DIOCNATLOOK: { /* struct pfioc_natlook */ + struct pfioc_natlook *pnl = NULL; - g_token_size_copy -= sizeof(struct pfioc_token); - } + PFIOC_STRUCT_BEGIN(addr, pnl, error = ENOMEM; break;); + error = pfioctl_ioc_natlook(cmd, pnl, p); + PFIOC_STRUCT_END(pnl, addr); + break; + } - if (g_token_size_copy < g_token->size) { - error = copyout(tokens, token_buf, - g_token->size - g_token_size_copy); - } + case DIOCSETTIMEOUT: /* struct pfioc_tm */ + case DIOCGETTIMEOUT: { /* struct pfioc_tm */ + struct pfioc_tm pt; - g_token->size -= 
g_token_size_copy; - _FREE(tokens, M_TEMP); + /* small enough to be on stack */ + bcopy(addr, &pt, sizeof (pt)); + error = pfioctl_ioc_tm(cmd, &pt, p); + bcopy(&pt, addr, sizeof (pt)); + break; + } + + case DIOCGETLIMIT: /* struct pfioc_limit */ + case DIOCSETLIMIT: { /* struct pfioc_limit */ + struct pfioc_limit pl; + /* small enough to be on stack */ + bcopy(addr, &pl, sizeof (pl)); + error = pfioctl_ioc_limit(cmd, &pl, p); + bcopy(&pl, addr, sizeof (pl)); break; - } + } - case DIOCADDRULE: { - struct pfioc_rule *pr = (struct pfioc_rule *)addr; - struct pf_ruleset *ruleset; - struct pf_rule *rule, *tail; - struct pf_pooladdr *apa; - int rs_num; + case DIOCSETDEBUG: { /* u_int32_t */ + bcopy(addr, &pf_status.debug, sizeof (u_int32_t)); + break; + } - pr->anchor[sizeof (pr->anchor) - 1] = '\0'; - pr->anchor_call[sizeof (pr->anchor_call) - 1] = '\0'; - ruleset = pf_find_ruleset(pr->anchor); - if (ruleset == NULL) { - error = EINVAL; - break; + case DIOCCLRRULECTRS: { + /* obsoleted by DIOCGETRULE with action=PF_GET_CLR_CNTR */ + struct pf_ruleset *ruleset = &pf_main_ruleset; + struct pf_rule *rule; + + TAILQ_FOREACH(rule, + ruleset->rules[PF_RULESET_FILTER].active.ptr, entries) { + rule->evaluations = 0; + rule->packets[0] = rule->packets[1] = 0; + rule->bytes[0] = rule->bytes[1] = 0; } - rs_num = pf_get_ruleset_number(pr->rule.action); - if (rs_num >= PF_RULESET_MAX) { + break; + } + + case DIOCGIFSPEED: { + struct pf_ifspeed *psp = (struct pf_ifspeed *)(void *)addr; + struct pf_ifspeed ps; + struct ifnet *ifp; + u_int64_t baudrate; + + if (psp->ifname[0] != '\0') { + /* Can we completely trust user-land? */ + strlcpy(ps.ifname, psp->ifname, IFNAMSIZ); + ps.ifname[IFNAMSIZ - 1] = '\0'; + ifp = ifunit(ps.ifname); + if (ifp != NULL) { + baudrate = ifp->if_output_bw.max_bw; + bcopy(&baudrate, &psp->baudrate, + sizeof (baudrate)); + } else { + error = EINVAL; + } + } else { error = EINVAL; - break; } - if (pr->rule.return_icmp >> 8 > ICMP_MAXTYPE) { - error = EINVAL; - break; + break; + } + +#if PF_ALTQ + case DIOCSTARTALTQ: { + struct pf_altq *altq; + + VERIFY(altq_allowed); + /* enable all altq interfaces on active list */ + TAILQ_FOREACH(altq, pf_altqs_active, entries) { + if (altq->qname[0] == '\0') { + error = pf_enable_altq(altq); + if (error != 0) + break; + } } - if (pr->ticket != ruleset->rules[rs_num].inactive.ticket) { - error = EBUSY; - break; + if (error == 0) + pf_altq_running = 1; + DPFPRINTF(PF_DEBUG_MISC, ("altq: started\n")); + break; + } + + case DIOCSTOPALTQ: { + struct pf_altq *altq; + + VERIFY(altq_allowed); + /* disable all altq interfaces on active list */ + TAILQ_FOREACH(altq, pf_altqs_active, entries) { + if (altq->qname[0] == '\0') { + error = pf_disable_altq(altq); + if (error != 0) + break; + } } - if (pr->pool_ticket != ticket_pabuf) { + if (error == 0) + pf_altq_running = 0; + DPFPRINTF(PF_DEBUG_MISC, ("altq: stopped\n")); + break; + } + + case DIOCADDALTQ: { /* struct pfioc_altq */ + struct pfioc_altq *pa = (struct pfioc_altq *)(void *)addr; + struct pf_altq *altq, *a; + u_int32_t ticket; + + VERIFY(altq_allowed); + bcopy(&pa->ticket, &ticket, sizeof (ticket)); + if (ticket != ticket_altqs_inactive) { error = EBUSY; break; } - rule = pool_get(&pf_rule_pl, PR_WAITOK); - if (rule == NULL) { + altq = pool_get(&pf_altq_pl, PR_WAITOK); + if (altq == NULL) { error = ENOMEM; break; } - pf_rule_copyin(&pr->rule, rule, p); -#if !INET - if (rule->af == AF_INET) { - pool_put(&pf_rule_pl, rule); - error = EAFNOSUPPORT; - break; - } -#endif /* INET */ -#if !INET6 - if 
(rule->af == AF_INET6) { - pool_put(&pf_rule_pl, rule); - error = EAFNOSUPPORT; - break; - } -#endif /* INET6 */ - tail = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr, - pf_rulequeue); - if (tail) - rule->nr = tail->nr + 1; - else - rule->nr = 0; - if (rule->ifname[0]) { - rule->kif = pfi_kif_get(rule->ifname); - if (rule->kif == NULL) { - pool_put(&pf_rule_pl, rule); - error = EINVAL; + pf_altq_copyin(&pa->altq, altq); + + /* + * if this is for a queue, find the discipline and + * copy the necessary fields + */ + if (altq->qname[0] != '\0') { + if ((altq->qid = pf_qname2qid(altq->qname)) == 0) { + error = EBUSY; + pool_put(&pf_altq_pl, altq); break; } - pfi_kif_ref(rule->kif, PFI_KIF_REF_RULE); + altq->altq_disc = NULL; + TAILQ_FOREACH(a, pf_altqs_inactive, entries) { + if (strncmp(a->ifname, altq->ifname, + IFNAMSIZ) == 0 && a->qname[0] == '\0') { + altq->altq_disc = a->altq_disc; + break; + } + } + } + + error = altq_add(altq); + if (error) { + pool_put(&pf_altq_pl, altq); + break; + } + + TAILQ_INSERT_TAIL(pf_altqs_inactive, altq, entries); + pf_altq_copyout(altq, &pa->altq); + break; + } + + case DIOCGETALTQS: { + struct pfioc_altq *pa = (struct pfioc_altq *)(void *)addr; + struct pf_altq *altq; + u_int32_t nr; + + VERIFY(altq_allowed); + nr = 0; + TAILQ_FOREACH(altq, pf_altqs_active, entries) + nr++; + bcopy(&nr, &pa->nr, sizeof (nr)); + bcopy(&ticket_altqs_active, &pa->ticket, sizeof (pa->ticket)); + break; + } + + case DIOCGETALTQ: { + struct pfioc_altq *pa = (struct pfioc_altq *)(void *)addr; + struct pf_altq *altq; + u_int32_t nr, pa_nr, ticket; + + VERIFY(altq_allowed); + bcopy(&pa->ticket, &ticket, sizeof (ticket)); + if (ticket != ticket_altqs_active) { + error = EBUSY; + break; + } + bcopy(&pa->nr, &pa_nr, sizeof (pa_nr)); + nr = 0; + altq = TAILQ_FIRST(pf_altqs_active); + while ((altq != NULL) && (nr < pa_nr)) { + altq = TAILQ_NEXT(altq, entries); + nr++; + } + if (altq == NULL) { + error = EBUSY; + break; + } + pf_altq_copyout(altq, &pa->altq); + break; + } + + case DIOCCHANGEALTQ: + VERIFY(altq_allowed); + /* CHANGEALTQ not supported yet! 
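/*
 * Illustrative sketch (not from the patch): DIOCGETALTQ and
 * DIOCGETQSTATS above resolve a queue by ordinal -- walk the active
 * list pa->nr steps and report EBUSY if the list is shorter.  The
 * same walk in generic form over <sys/queue.h> (the node and head
 * types are invented for the sketch):
 */
#include <stddef.h>
#include <sys/queue.h>

struct sketch_node {
	TAILQ_ENTRY(sketch_node) entries;
};
TAILQ_HEAD(sketch_head, sketch_node);

static struct sketch_node *
sketch_nth(struct sketch_head *head, unsigned int n)
{
	struct sketch_node *it = TAILQ_FIRST(head);

	while (it != NULL && n > 0) {
		it = TAILQ_NEXT(it, entries);
		n--;
	}
	return (it);	/* NULL if the list has fewer than n+1 nodes */
}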
*/ + error = ENODEV; + break; + + case DIOCGETQSTATS: { + struct pfioc_qstats *pq = (struct pfioc_qstats *)(void *)addr; + struct pf_altq *altq; + u_int32_t nr, pq_nr, ticket; + int nbytes; + + VERIFY(altq_allowed); + bcopy(&pq->ticket, &ticket, sizeof (ticket)); + if (ticket != ticket_altqs_active) { + error = EBUSY; + break; + } + bcopy(&pq->nr, &pq_nr, sizeof (pq_nr)); + nr = 0; + altq = TAILQ_FIRST(pf_altqs_active); + while ((altq != NULL) && (nr < pq_nr)) { + altq = TAILQ_NEXT(altq, entries); + nr++; + } + if (altq == NULL) { + error = EBUSY; + break; + } + bcopy(&pq->nbytes, &nbytes, sizeof (nbytes)); + error = altq_getqstats(altq, pq->buf, &nbytes); + if (error == 0) { + pq->scheduler = altq->scheduler; + bcopy(&nbytes, &pq->nbytes, sizeof (nbytes)); + } + break; + } +#endif /* PF_ALTQ */ + + case DIOCBEGINADDRS: /* struct pfioc_pooladdr */ + case DIOCADDADDR: /* struct pfioc_pooladdr */ + case DIOCGETADDRS: /* struct pfioc_pooladdr */ + case DIOCGETADDR: /* struct pfioc_pooladdr */ + case DIOCCHANGEADDR: { /* struct pfioc_pooladdr */ + struct pfioc_pooladdr *pp = NULL; + + PFIOC_STRUCT_BEGIN(addr, pp, error = ENOMEM; break;) + error = pfioctl_ioc_pooladdr(cmd, pp, p); + PFIOC_STRUCT_END(pp, addr); + break; + } + + case DIOCGETRULESETS: /* struct pfioc_ruleset */ + case DIOCGETRULESET: { /* struct pfioc_ruleset */ + struct pfioc_ruleset *pr = NULL; + + PFIOC_STRUCT_BEGIN(addr, pr, error = ENOMEM; break;); + error = pfioctl_ioc_ruleset(cmd, pr, p); + PFIOC_STRUCT_END(pr, addr); + break; + } + + case DIOCRCLRTABLES: /* struct pfioc_table */ + case DIOCRADDTABLES: /* struct pfioc_table */ + case DIOCRDELTABLES: /* struct pfioc_table */ + case DIOCRGETTABLES: /* struct pfioc_table */ + case DIOCRGETTSTATS: /* struct pfioc_table */ + case DIOCRCLRTSTATS: /* struct pfioc_table */ + case DIOCRSETTFLAGS: /* struct pfioc_table */ + case DIOCRCLRADDRS: /* struct pfioc_table */ + case DIOCRADDADDRS: /* struct pfioc_table */ + case DIOCRDELADDRS: /* struct pfioc_table */ + case DIOCRSETADDRS: /* struct pfioc_table */ + case DIOCRGETADDRS: /* struct pfioc_table */ + case DIOCRGETASTATS: /* struct pfioc_table */ + case DIOCRCLRASTATS: /* struct pfioc_table */ + case DIOCRTSTADDRS: /* struct pfioc_table */ + case DIOCRINADEFINE: { /* struct pfioc_table */ + PFIOCX_STRUCT_DECL(pfioc_table); + + PFIOCX_STRUCT_BEGIN(addr, pfioc_table, error = ENOMEM; break;); + error = pfioctl_ioc_table(cmd, + PFIOCX_STRUCT_ADDR32(pfioc_table), + PFIOCX_STRUCT_ADDR64(pfioc_table), p); + PFIOCX_STRUCT_END(pfioc_table, addr); + break; + } + + case DIOCOSFPADD: /* struct pf_osfp_ioctl */ + case DIOCOSFPGET: { /* struct pf_osfp_ioctl */ + struct pf_osfp_ioctl *io = NULL; + + PFIOC_STRUCT_BEGIN(addr, io, error = ENOMEM; break;); + if (cmd == DIOCOSFPADD) { + error = pf_osfp_add(io); + } else { + VERIFY(cmd == DIOCOSFPGET); + error = pf_osfp_get(io); + } + PFIOC_STRUCT_END(io, addr); + break; + } + + case DIOCXBEGIN: /* struct pfioc_trans */ + case DIOCXROLLBACK: /* struct pfioc_trans */ + case DIOCXCOMMIT: { /* struct pfioc_trans */ + PFIOCX_STRUCT_DECL(pfioc_trans); + + PFIOCX_STRUCT_BEGIN(addr, pfioc_trans, error = ENOMEM; break;); + error = pfioctl_ioc_trans(cmd, + PFIOCX_STRUCT_ADDR32(pfioc_trans), + PFIOCX_STRUCT_ADDR64(pfioc_trans), p); + PFIOCX_STRUCT_END(pfioc_trans, addr); + break; + } + + case DIOCGETSRCNODES: { /* struct pfioc_src_nodes */ + PFIOCX_STRUCT_DECL(pfioc_src_nodes); + + PFIOCX_STRUCT_BEGIN(addr, pfioc_src_nodes, + error = ENOMEM; break;); + error = pfioctl_ioc_src_nodes(cmd, + 
PFIOCX_STRUCT_ADDR32(pfioc_src_nodes), + PFIOCX_STRUCT_ADDR64(pfioc_src_nodes), p); + PFIOCX_STRUCT_END(pfioc_src_nodes, addr); + break; + } + + case DIOCCLRSRCNODES: { + struct pf_src_node *n; + struct pf_state *state; + + RB_FOREACH(state, pf_state_tree_id, &tree_id) { + state->src_node = NULL; + state->nat_src_node = NULL; + } + RB_FOREACH(n, pf_src_tree, &tree_src_tracking) { + n->expire = 1; + n->states = 0; + } + pf_purge_expired_src_nodes(); + pf_status.src_nodes = 0; + break; + } + + case DIOCKILLSRCNODES: { /* struct pfioc_src_node_kill */ + struct pfioc_src_node_kill *psnk = NULL; + + PFIOC_STRUCT_BEGIN(addr, psnk, error = ENOMEM; break;); + error = pfioctl_ioc_src_node_kill(cmd, psnk, p); + PFIOC_STRUCT_END(psnk, addr); + break; + } + + case DIOCSETHOSTID: { /* u_int32_t */ + u_int32_t hid; + + /* small enough to be on stack */ + bcopy(addr, &hid, sizeof (hid)); + if (hid == 0) + pf_status.hostid = random(); + else + pf_status.hostid = hid; + break; + } + + case DIOCOSFPFLUSH: + pf_osfp_flush(); + break; + + case DIOCIGETIFACES: /* struct pfioc_iface */ + case DIOCSETIFFLAG: /* struct pfioc_iface */ + case DIOCCLRIFFLAG: { /* struct pfioc_iface */ + PFIOCX_STRUCT_DECL(pfioc_iface); + + PFIOCX_STRUCT_BEGIN(addr, pfioc_iface, error = ENOMEM; break;); + error = pfioctl_ioc_iface(cmd, + PFIOCX_STRUCT_ADDR32(pfioc_iface), + PFIOCX_STRUCT_ADDR64(pfioc_iface), p); + PFIOCX_STRUCT_END(pfioc_iface, addr); + break; + } + + default: + error = ENODEV; + break; + } + + lck_mtx_unlock(pf_lock); + lck_rw_done(pf_perim_lock); + + return (error); +} + +static int +pfioctl_ioc_table(u_long cmd, struct pfioc_table_32 *io32, + struct pfioc_table_64 *io64, struct proc *p) +{ + int p64 = proc_is64bit(p); + int error = 0; + + if (!p64) + goto struct32; + + /* + * 64-bit structure processing + */ + switch (cmd) { + case DIOCRCLRTABLES: + if (io64->pfrio_esize != 0) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io64->pfrio_table); + error = pfr_clr_tables(&io64->pfrio_table, &io64->pfrio_ndel, + io64->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRADDTABLES: + if (io64->pfrio_esize != sizeof (struct pfr_table)) { + error = ENODEV; + break; + } + error = pfr_add_tables(io64->pfrio_buffer, io64->pfrio_size, + &io64->pfrio_nadd, io64->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRDELTABLES: + if (io64->pfrio_esize != sizeof (struct pfr_table)) { + error = ENODEV; + break; + } + error = pfr_del_tables(io64->pfrio_buffer, io64->pfrio_size, + &io64->pfrio_ndel, io64->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRGETTABLES: + if (io64->pfrio_esize != sizeof (struct pfr_table)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io64->pfrio_table); + error = pfr_get_tables(&io64->pfrio_table, io64->pfrio_buffer, + &io64->pfrio_size, io64->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRGETTSTATS: + if (io64->pfrio_esize != sizeof (struct pfr_tstats)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io64->pfrio_table); + error = pfr_get_tstats(&io64->pfrio_table, io64->pfrio_buffer, + &io64->pfrio_size, io64->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRCLRTSTATS: + if (io64->pfrio_esize != sizeof (struct pfr_table)) { + error = ENODEV; + break; + } + error = pfr_clr_tstats(io64->pfrio_buffer, io64->pfrio_size, + &io64->pfrio_nzero, io64->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRSETTFLAGS: + if (io64->pfrio_esize != sizeof (struct pfr_table)) { + error = ENODEV; + break; + } + error = 
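/*
 * Illustrative sketch (not from the patch): pfioctl_ioc_table() below
 * receives both the ILP32 and the LP64 view of the same user structure
 * and branches on proc_is64bit().  The embedded user buffer pointer
 * changes width between the two ABIs, so one kernel struct cannot
 * describe both layouts.  Shape of the pattern -- the layouts and the
 * user_addr_t stand-in here are invented:
 */
#include <stdint.h>

typedef uint64_t sk_user_addr_t;	/* stand-in for user_addr_t */

struct sk_ioc_32 { uint32_t size; uint32_t buffer; };
struct sk_ioc_64 { uint32_t size; uint64_t buffer; };

static void
sketch_pick_view(int p64, struct sk_ioc_32 *io32, struct sk_ioc_64 *io64,
    uint32_t *size, sk_user_addr_t *buf)
{
	*size = p64 ? io64->size : io32->size;
	/* Widen the 32-bit user pointer; both fit in user_addr_t. */
	*buf = p64 ? io64->buffer : (sk_user_addr_t)io32->buffer;
}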
pfr_set_tflags(io64->pfrio_buffer, io64->pfrio_size, + io64->pfrio_setflag, io64->pfrio_clrflag, + &io64->pfrio_nchange, &io64->pfrio_ndel, + io64->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRCLRADDRS: + if (io64->pfrio_esize != 0) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io64->pfrio_table); + error = pfr_clr_addrs(&io64->pfrio_table, &io64->pfrio_ndel, + io64->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRADDADDRS: + if (io64->pfrio_esize != sizeof (struct pfr_addr)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io64->pfrio_table); + error = pfr_add_addrs(&io64->pfrio_table, io64->pfrio_buffer, + io64->pfrio_size, &io64->pfrio_nadd, io64->pfrio_flags | + PFR_FLAG_USERIOCTL); + break; + + case DIOCRDELADDRS: + if (io64->pfrio_esize != sizeof (struct pfr_addr)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io64->pfrio_table); + error = pfr_del_addrs(&io64->pfrio_table, io64->pfrio_buffer, + io64->pfrio_size, &io64->pfrio_ndel, io64->pfrio_flags | + PFR_FLAG_USERIOCTL); + break; + + case DIOCRSETADDRS: + if (io64->pfrio_esize != sizeof (struct pfr_addr)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io64->pfrio_table); + error = pfr_set_addrs(&io64->pfrio_table, io64->pfrio_buffer, + io64->pfrio_size, &io64->pfrio_size2, &io64->pfrio_nadd, + &io64->pfrio_ndel, &io64->pfrio_nchange, io64->pfrio_flags | + PFR_FLAG_USERIOCTL, 0); + break; + + case DIOCRGETADDRS: + if (io64->pfrio_esize != sizeof (struct pfr_addr)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io64->pfrio_table); + error = pfr_get_addrs(&io64->pfrio_table, io64->pfrio_buffer, + &io64->pfrio_size, io64->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRGETASTATS: + if (io64->pfrio_esize != sizeof (struct pfr_astats)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io64->pfrio_table); + error = pfr_get_astats(&io64->pfrio_table, io64->pfrio_buffer, + &io64->pfrio_size, io64->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRCLRASTATS: + if (io64->pfrio_esize != sizeof (struct pfr_addr)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io64->pfrio_table); + error = pfr_clr_astats(&io64->pfrio_table, io64->pfrio_buffer, + io64->pfrio_size, &io64->pfrio_nzero, io64->pfrio_flags | + PFR_FLAG_USERIOCTL); + break; + + case DIOCRTSTADDRS: + if (io64->pfrio_esize != sizeof (struct pfr_addr)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io64->pfrio_table); + error = pfr_tst_addrs(&io64->pfrio_table, io64->pfrio_buffer, + io64->pfrio_size, &io64->pfrio_nmatch, io64->pfrio_flags | + PFR_FLAG_USERIOCTL); + break; + + case DIOCRINADEFINE: + if (io64->pfrio_esize != sizeof (struct pfr_addr)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io64->pfrio_table); + error = pfr_ina_define(&io64->pfrio_table, io64->pfrio_buffer, + io64->pfrio_size, &io64->pfrio_nadd, &io64->pfrio_naddr, + io64->pfrio_ticket, io64->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + default: + VERIFY(0); + /* NOTREACHED */ + } + goto done; + +struct32: + /* + * 32-bit structure processing + */ + switch (cmd) { + case DIOCRCLRTABLES: + if (io32->pfrio_esize != 0) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io32->pfrio_table); + error = pfr_clr_tables(&io32->pfrio_table, &io32->pfrio_ndel, + io32->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRADDTABLES: + if (io32->pfrio_esize != sizeof (struct pfr_table)) { + error = ENODEV; + break; + } + error = 
pfr_add_tables(io32->pfrio_buffer, io32->pfrio_size, + &io32->pfrio_nadd, io32->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRDELTABLES: + if (io32->pfrio_esize != sizeof (struct pfr_table)) { + error = ENODEV; + break; + } + error = pfr_del_tables(io32->pfrio_buffer, io32->pfrio_size, + &io32->pfrio_ndel, io32->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRGETTABLES: + if (io32->pfrio_esize != sizeof (struct pfr_table)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io32->pfrio_table); + error = pfr_get_tables(&io32->pfrio_table, io32->pfrio_buffer, + &io32->pfrio_size, io32->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRGETTSTATS: + if (io32->pfrio_esize != sizeof (struct pfr_tstats)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io32->pfrio_table); + error = pfr_get_tstats(&io32->pfrio_table, io32->pfrio_buffer, + &io32->pfrio_size, io32->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRCLRTSTATS: + if (io32->pfrio_esize != sizeof (struct pfr_table)) { + error = ENODEV; + break; + } + error = pfr_clr_tstats(io32->pfrio_buffer, io32->pfrio_size, + &io32->pfrio_nzero, io32->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRSETTFLAGS: + if (io32->pfrio_esize != sizeof (struct pfr_table)) { + error = ENODEV; + break; + } + error = pfr_set_tflags(io32->pfrio_buffer, io32->pfrio_size, + io32->pfrio_setflag, io32->pfrio_clrflag, + &io32->pfrio_nchange, &io32->pfrio_ndel, + io32->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRCLRADDRS: + if (io32->pfrio_esize != 0) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io32->pfrio_table); + error = pfr_clr_addrs(&io32->pfrio_table, &io32->pfrio_ndel, + io32->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRADDADDRS: + if (io32->pfrio_esize != sizeof (struct pfr_addr)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io32->pfrio_table); + error = pfr_add_addrs(&io32->pfrio_table, io32->pfrio_buffer, + io32->pfrio_size, &io32->pfrio_nadd, io32->pfrio_flags | + PFR_FLAG_USERIOCTL); + break; + + case DIOCRDELADDRS: + if (io32->pfrio_esize != sizeof (struct pfr_addr)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io32->pfrio_table); + error = pfr_del_addrs(&io32->pfrio_table, io32->pfrio_buffer, + io32->pfrio_size, &io32->pfrio_ndel, io32->pfrio_flags | + PFR_FLAG_USERIOCTL); + break; + + case DIOCRSETADDRS: + if (io32->pfrio_esize != sizeof (struct pfr_addr)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io32->pfrio_table); + error = pfr_set_addrs(&io32->pfrio_table, io32->pfrio_buffer, + io32->pfrio_size, &io32->pfrio_size2, &io32->pfrio_nadd, + &io32->pfrio_ndel, &io32->pfrio_nchange, io32->pfrio_flags | + PFR_FLAG_USERIOCTL, 0); + break; + + case DIOCRGETADDRS: + if (io32->pfrio_esize != sizeof (struct pfr_addr)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io32->pfrio_table); + error = pfr_get_addrs(&io32->pfrio_table, io32->pfrio_buffer, + &io32->pfrio_size, io32->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRGETASTATS: + if (io32->pfrio_esize != sizeof (struct pfr_astats)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io32->pfrio_table); + error = pfr_get_astats(&io32->pfrio_table, io32->pfrio_buffer, + &io32->pfrio_size, io32->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + case DIOCRCLRASTATS: + if (io32->pfrio_esize != sizeof (struct pfr_addr)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io32->pfrio_table); + error = 
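/*
 * Illustrative sketch (not from the patch): the struct32 arm here
 * repeats the 64-bit arm case-for-case; only the io32/io64 layout
 * differs.  Where such duplication is unwanted, one common
 * alternative is a macro instantiated once per ABI.  Purely
 * illustrative, reusing identifiers that appear in this file;
 * used as:  case DIOCRCLRTABLES: SKETCH_CLR_TABLES(io32); break;
 * (the inner "break" only leaves the do/while, skipping the call).
 */
#define SKETCH_CLR_TABLES(io)						\
	do {								\
		if ((io)->pfrio_esize != 0) {				\
			error = ENODEV;					\
			break;						\
		}							\
		pfr_table_copyin_cleanup(&(io)->pfrio_table);		\
		error = pfr_clr_tables(&(io)->pfrio_table,		\
		    &(io)->pfrio_ndel,					\
		    (io)->pfrio_flags | PFR_FLAG_USERIOCTL);		\
	} while (0)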
pfr_clr_astats(&io32->pfrio_table, io32->pfrio_buffer, + io32->pfrio_size, &io32->pfrio_nzero, io32->pfrio_flags | + PFR_FLAG_USERIOCTL); + break; + + case DIOCRTSTADDRS: + if (io32->pfrio_esize != sizeof (struct pfr_addr)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io32->pfrio_table); + error = pfr_tst_addrs(&io32->pfrio_table, io32->pfrio_buffer, + io32->pfrio_size, &io32->pfrio_nmatch, io32->pfrio_flags | + PFR_FLAG_USERIOCTL); + break; + + case DIOCRINADEFINE: + if (io32->pfrio_esize != sizeof (struct pfr_addr)) { + error = ENODEV; + break; + } + pfr_table_copyin_cleanup(&io32->pfrio_table); + error = pfr_ina_define(&io32->pfrio_table, io32->pfrio_buffer, + io32->pfrio_size, &io32->pfrio_nadd, &io32->pfrio_naddr, + io32->pfrio_ticket, io32->pfrio_flags | PFR_FLAG_USERIOCTL); + break; + + default: + VERIFY(0); + /* NOTREACHED */ + } + +done: + return (error); +} + +static int +pfioctl_ioc_tokens(u_long cmd, struct pfioc_tokens_32 *tok32, + struct pfioc_tokens_64 *tok64, struct proc *p) +{ + struct pfioc_token *tokens; + struct pfioc_kernel_token *entry, *tmp; + user_addr_t token_buf; + int ocnt, cnt, error = 0, p64 = proc_is64bit(p); + char *ptr; + + switch (cmd) { + case DIOCGETSTARTERS: { + int size; + + if (nr_tokens == 0) { + error = ENOENT; + break; + } + + size = sizeof (struct pfioc_token) * nr_tokens; + ocnt = cnt = (p64 ? tok64->size : tok32->size); + if (cnt == 0) { + if (p64) + tok64->size = size; + else + tok32->size = size; + break; + } + + token_buf = (p64 ? tok64->pgt_buf : tok32->pgt_buf); + tokens = _MALLOC(size, M_TEMP, M_WAITOK|M_ZERO); + if (tokens == NULL) { + error = ENOMEM; + break; + } + + ptr = (void *)tokens; + SLIST_FOREACH_SAFE(entry, &token_list_head, next, tmp) { + struct pfioc_token *t; + + if ((unsigned)cnt < sizeof (*tokens)) + break; /* no more buffer space left */ + + t = (struct pfioc_token *)(void *)ptr; + t->token_value = entry->token.token_value; + t->timestamp = entry->token.timestamp; + t->pid = entry->token.pid; + bcopy(entry->token.proc_name, t->proc_name, + PFTOK_PROCNAME_LEN); + ptr += sizeof (struct pfioc_token); + + cnt -= sizeof (struct pfioc_token); + } + + if (cnt < ocnt) + error = copyout(tokens, token_buf, ocnt - cnt); + + if (p64) + tok64->size = ocnt - cnt; + else + tok32->size = ocnt - cnt; + + _FREE(tokens, M_TEMP); + break; + } + + default: + VERIFY(0); + /* NOTREACHED */ + } + + return (error); +} + +static void +pf_expire_states_and_src_nodes(struct pf_rule *rule) +{ + struct pf_state *state; + struct pf_src_node *sn; + int killed = 0; + + /* expire the states */ + state = TAILQ_FIRST(&state_list); + while (state) { + if (state->rule.ptr == rule) + state->timeout = PFTM_PURGE; + state = TAILQ_NEXT(state, entry_list); + } + pf_purge_expired_states(pf_status.states); + + /* expire the src_nodes */ + RB_FOREACH(sn, pf_src_tree, &tree_src_tracking) { + if (sn->rule.ptr != rule) + continue; + if (sn->states != 0) { + RB_FOREACH(state, pf_state_tree_id, + &tree_id) { + if (state->src_node == sn) + state->src_node = NULL; + if (state->nat_src_node == sn) + state->nat_src_node = NULL; + } + sn->states = 0; } + sn->expire = 1; + killed++; + } + if (killed) + pf_purge_expired_src_nodes(); +} -#if ALTQ - /* set queue IDs */ - if (rule->qname[0] != 0) { - if ((rule->qid = pf_qname2qid(rule->qname)) == 0) - error = EBUSY; - else if (rule->pqname[0] != 0) { - if ((rule->pqid = - pf_qname2qid(rule->pqname)) == 0) - error = EBUSY; - } else - rule->pqid = rule->qid; +static void +pf_delete_rule_from_ruleset(struct pf_ruleset 
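/*
 * Illustrative sketch (not from the patch): DIOCGETSTARTERS below
 * implements the usual two-call sizing protocol -- a request with
 * size == 0 only reports the space required, and a second call with
 * a buffer retrieves the records, with size rewritten to the bytes
 * actually copied out.  How a userland caller would plausibly drive
 * it; the fd, request code, and sk_tokens layout are assumptions,
 * and error handling is elided:
 */
#include <stdlib.h>
#include <sys/ioctl.h>

/* invented userland view of the token request */
struct sk_tokens {
	int	size;		/* in: buffer bytes; out: bytes used */
	void	*buf;		/* token array */
};

static int
sketch_get_starters(int fd, unsigned long req, struct sk_tokens *t)
{
	t->size = 0;
	t->buf = NULL;
	if (ioctl(fd, req, t) == -1)	/* first call: learn the size */
		return (-1);
	if ((t->buf = malloc(t->size)) == NULL)
		return (-1);
	return (ioctl(fd, req, t));	/* second call: fetch records */
}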
*ruleset, int rs_num, + struct pf_rule *rule) +{ + struct pf_rule *r; + int nr = 0; + + pf_expire_states_and_src_nodes(rule); + + pf_rm_rule(ruleset->rules[rs_num].active.ptr, rule); + if (ruleset->rules[rs_num].active.rcount-- == 0) + panic("%s: rcount value broken!", __func__); + r = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr); + + while (r) { + r->nr = nr++; + r = TAILQ_NEXT(r, entries); + } +} + + +static void +pf_ruleset_cleanup(struct pf_ruleset *ruleset, int rs) +{ + pf_calc_skip_steps(ruleset->rules[rs].active.ptr); + ruleset->rules[rs].active.ticket = + ++ruleset->rules[rs].inactive.ticket; +} + +static int +pf_delete_rule_by_ticket(struct pfioc_rule *pr) +{ + struct pf_ruleset *ruleset; + struct pf_rule *rule; + int rs_num; + int is_anchor; + int error; + + is_anchor = (pr->anchor_call[0] != '\0'); + if ((ruleset = pf_find_ruleset_with_owner(pr->anchor, + pr->rule.owner, is_anchor, &error)) == NULL) + return (error); + + rs_num = pf_get_ruleset_number(pr->rule.action); + if (rs_num >= PF_RULESET_MAX) { + return (EINVAL); + } + + if (pr->rule.ticket) { + rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr); + while (rule && (rule->ticket != pr->rule.ticket)) + rule = TAILQ_NEXT(rule, entries); + if (rule == NULL) + return (ENOENT); + + if (strcmp(rule->owner, pr->rule.owner)) + return (EACCES); + +delete_rule: + if (rule->anchor && (ruleset != &pf_main_ruleset) && + ((strcmp(ruleset->anchor->owner, "")) == 0) && + ((ruleset->rules[rs_num].active.rcount - 1) == 0)) { + /* set rule & ruleset to parent and repeat */ + struct pf_rule *delete_rule = rule; + struct pf_ruleset *delete_ruleset = ruleset; + +#define parent_ruleset ruleset->anchor->parent->ruleset + if (ruleset->anchor->parent == NULL) + ruleset = &pf_main_ruleset; + else + ruleset = &parent_ruleset; + + rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr); + while (rule && + (rule->anchor != delete_ruleset->anchor)) + rule = TAILQ_NEXT(rule, entries); + if (rule == NULL) + panic("%s: rule not found!", __func__); + + if (delete_rule->rule_flag & PFRULE_PFM) + pffwrules--; + + pf_delete_rule_from_ruleset(delete_ruleset, + rs_num, delete_rule); + delete_ruleset->rules[rs_num].active.ticket = + ++delete_ruleset->rules[rs_num].inactive.ticket; + + goto delete_rule; + } else { + if (rule->rule_flag & PFRULE_PFM) + pffwrules--; + pf_delete_rule_from_ruleset(ruleset, rs_num, + rule); + pf_ruleset_cleanup(ruleset, rs_num); } -#endif /* ALTQ */ - if (rule->tagname[0]) - if ((rule->tag = pf_tagname2tag(rule->tagname)) == 0) - error = EBUSY; - if (rule->match_tagname[0]) - if ((rule->match_tag = - pf_tagname2tag(rule->match_tagname)) == 0) + } + + return (0); +} + +static void +pf_delete_rule_by_owner(char *owner) +{ + struct pf_ruleset *ruleset; + struct pf_rule *rule, *next; + int deleted = 0; + + for (int rs = 0; rs < PF_RULESET_MAX; rs++) { + rule = TAILQ_FIRST(pf_main_ruleset.rules[rs].active.ptr); + ruleset = &pf_main_ruleset; + while (rule) { + next = TAILQ_NEXT(rule, entries); + if (rule->anchor) { + if (((strcmp(rule->owner, owner)) == 0) || + ((strcmp(rule->owner, "")) == 0)) { + if (rule->anchor->ruleset.rules[rs].active.rcount > 0) { + if (deleted) { + pf_ruleset_cleanup(ruleset, rs); + deleted = 0; + } + /* step into anchor */ + ruleset = + &rule->anchor->ruleset; + rule = TAILQ_FIRST(ruleset->rules[rs].active.ptr); + continue; + } else { + if (rule->rule_flag & + PFRULE_PFM) + pffwrules--; + pf_delete_rule_from_ruleset(ruleset, rs, rule); + deleted = 1; + rule = next; + } + } else + rule = next; + } else { + if 
(((strcmp(rule->owner, owner)) == 0)) { + /* delete rule */ + if (rule->rule_flag & PFRULE_PFM) + pffwrules--; + pf_delete_rule_from_ruleset(ruleset, + rs, rule); + deleted = 1; + } + rule = next; + } + if (rule == NULL) { + if (deleted) { + pf_ruleset_cleanup(ruleset, rs); + deleted = 0; + } + if (ruleset != &pf_main_ruleset) + pf_deleterule_anchor_step_out(&ruleset, + rs, &rule); + } + } + } +} + +static void +pf_deleterule_anchor_step_out(struct pf_ruleset **ruleset_ptr, + int rs, struct pf_rule **rule_ptr) +{ + struct pf_ruleset *ruleset = *ruleset_ptr; + struct pf_rule *rule = *rule_ptr; + + /* step out of anchor */ + struct pf_ruleset *rs_copy = ruleset; + ruleset = ruleset->anchor->parent? + &ruleset->anchor->parent->ruleset:&pf_main_ruleset; + + rule = TAILQ_FIRST(ruleset->rules[rs].active.ptr); + while (rule && (rule->anchor != rs_copy->anchor)) + rule = TAILQ_NEXT(rule, entries); + if (rule == NULL) + panic("%s: parent rule of anchor not found!", __func__); + if (rule->anchor->ruleset.rules[rs].active.rcount > 0) + rule = TAILQ_NEXT(rule, entries); + + *ruleset_ptr = ruleset; + *rule_ptr = rule; +} + +static int +pf_rule_setup(struct pfioc_rule *pr, struct pf_rule *rule, + struct pf_ruleset *ruleset) { + struct pf_pooladdr *apa; + int error = 0; + + if (rule->ifname[0]) { + rule->kif = pfi_kif_get(rule->ifname); + if (rule->kif == NULL) { + pool_put(&pf_rule_pl, rule); + return (EINVAL); + } + pfi_kif_ref(rule->kif, PFI_KIF_REF_RULE); + } +#if PF_ALTQ + /* set queue IDs */ + if (altq_allowed && rule->qname[0] != '\0') { + if ((rule->qid = pf_qname2qid(rule->qname)) == 0) + error = EBUSY; + else if (rule->pqname[0] != '\0') { + if ((rule->pqid = + pf_qname2qid(rule->pqname)) == 0) error = EBUSY; - if (rule->rt && !rule->direction) - error = EINVAL; + } else + rule->pqid = rule->qid; + } +#endif /* PF_ALTQ */ + if (rule->tagname[0]) + if ((rule->tag = pf_tagname2tag(rule->tagname)) == 0) + error = EBUSY; + if (rule->match_tagname[0]) + if ((rule->match_tag = + pf_tagname2tag(rule->match_tagname)) == 0) + error = EBUSY; + if (rule->rt && !rule->direction) + error = EINVAL; #if PFLOG - if (!rule->log) - rule->logif = 0; - if (rule->logif >= PFLOGIFS_MAX) - error = EINVAL; + if (!rule->log) + rule->logif = 0; + if (rule->logif >= PFLOGIFS_MAX) + error = EINVAL; #endif /* PFLOG */ - if (pf_rtlabel_add(&rule->src.addr) || - pf_rtlabel_add(&rule->dst.addr)) - error = EBUSY; - if (pfi_dynaddr_setup(&rule->src.addr, rule->af)) - error = EINVAL; - if (pfi_dynaddr_setup(&rule->dst.addr, rule->af)) + if (pf_rtlabel_add(&rule->src.addr) || + pf_rtlabel_add(&rule->dst.addr)) + error = EBUSY; + if (pfi_dynaddr_setup(&rule->src.addr, rule->af)) + error = EINVAL; + if (pfi_dynaddr_setup(&rule->dst.addr, rule->af)) + error = EINVAL; + if (pf_tbladdr_setup(ruleset, &rule->src.addr)) + error = EINVAL; + if (pf_tbladdr_setup(ruleset, &rule->dst.addr)) + error = EINVAL; + if (pf_anchor_setup(rule, ruleset, pr->anchor_call)) + error = EINVAL; + TAILQ_FOREACH(apa, &pf_pabuf, entries) + if (pf_tbladdr_setup(ruleset, &apa->addr)) error = EINVAL; - if (pf_tbladdr_setup(ruleset, &rule->src.addr)) + + if (rule->overload_tblname[0]) { + if ((rule->overload_tbl = pfr_attach_table(ruleset, + rule->overload_tblname)) == NULL) error = EINVAL; - if (pf_tbladdr_setup(ruleset, &rule->dst.addr)) + else + rule->overload_tbl->pfrkt_flags |= + PFR_TFLAG_ACTIVE; + } + + pf_mv_pool(&pf_pabuf, &rule->rpool.list); + if (((((rule->action == PF_NAT) || (rule->action == PF_RDR) || + (rule->action == PF_BINAT)) && rule->anchor 
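/*
 * Illustrative sketch (not from the patch): pf_deleterule_anchor_step_out()
 * above resumes a depth-first walk -- climb from a child ruleset to its
 * parent, locate the anchor rule that points back at the child, and
 * continue from the rule after it.  The same "step out of an exhausted
 * subtree" move in generic preorder terms, over an invented node type:
 */
#include <stddef.h>

struct sk_tree_node {
	struct sk_tree_node *parent;
	struct sk_tree_node *first_child;
	struct sk_tree_node *next_sibling;
};

/* Next node in preorder once the subtree rooted at "done" is finished. */
static struct sk_tree_node *
sketch_step_out(struct sk_tree_node *done)
{
	struct sk_tree_node *up = done->parent;

	while (up != NULL && done->next_sibling == NULL) {
		done = up;
		up = up->parent;
	}
	return (done->next_sibling);	/* NULL when the walk is over */
}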
== NULL) || + (rule->rt > PF_FASTROUTE)) && + (TAILQ_FIRST(&rule->rpool.list) == NULL)) + error = EINVAL; + + if (error) { + pf_rm_rule(NULL, rule); + return (error); + } + rule->rpool.cur = TAILQ_FIRST(&rule->rpool.list); + rule->evaluations = rule->packets[0] = rule->packets[1] = + rule->bytes[0] = rule->bytes[1] = 0; + + return (0); +} + +static int +pfioctl_ioc_rule(u_long cmd, int minordev, struct pfioc_rule *pr, struct proc *p) +{ + int error = 0; + + switch (cmd) { + case DIOCADDRULE: { + struct pf_ruleset *ruleset; + struct pf_rule *rule, *tail; + int rs_num; + + pr->anchor[sizeof (pr->anchor) - 1] = '\0'; + pr->anchor_call[sizeof (pr->anchor_call) - 1] = '\0'; + ruleset = pf_find_ruleset(pr->anchor); + if (ruleset == NULL) { error = EINVAL; - if (pf_anchor_setup(rule, ruleset, pr->anchor_call)) + break; + } + rs_num = pf_get_ruleset_number(pr->rule.action); + if (rs_num >= PF_RULESET_MAX) { error = EINVAL; - TAILQ_FOREACH(apa, &pf_pabuf, entries) - if (pf_tbladdr_setup(ruleset, &apa->addr)) - error = EINVAL; - - if (rule->overload_tblname[0]) { - if ((rule->overload_tbl = pfr_attach_table(ruleset, - rule->overload_tblname)) == NULL) - error = EINVAL; - else - rule->overload_tbl->pfrkt_flags |= - PFR_TFLAG_ACTIVE; + break; } - - pf_mv_pool(&pf_pabuf, &rule->rpool.list); - if (((((rule->action == PF_NAT) || (rule->action == PF_RDR) || - (rule->action == PF_BINAT)) && rule->anchor == NULL) || - (rule->rt > PF_FASTROUTE)) && - (TAILQ_FIRST(&rule->rpool.list) == NULL)) + if (pr->rule.return_icmp >> 8 > ICMP_MAXTYPE) { error = EINVAL; - - if (error) { - pf_rm_rule(NULL, rule); break; } - rule->rpool.cur = TAILQ_FIRST(&rule->rpool.list); - rule->evaluations = rule->packets[0] = rule->packets[1] = - rule->bytes[0] = rule->bytes[1] = 0; + if (pr->ticket != ruleset->rules[rs_num].inactive.ticket) { + error = EBUSY; + break; + } + if (pr->pool_ticket != ticket_pabuf) { + error = EBUSY; + break; + } + rule = pool_get(&pf_rule_pl, PR_WAITOK); + if (rule == NULL) { + error = ENOMEM; + break; + } + pf_rule_copyin(&pr->rule, rule, p, minordev); +#if !INET + if (rule->af == AF_INET) { + pool_put(&pf_rule_pl, rule); + error = EAFNOSUPPORT; + break; + } +#endif /* INET */ +#if !INET6 + if (rule->af == AF_INET6) { + pool_put(&pf_rule_pl, rule); + error = EAFNOSUPPORT; + break; + } +#endif /* INET6 */ + tail = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr, + pf_rulequeue); + if (tail) + rule->nr = tail->nr + 1; + else + rule->nr = 0; + + if ((error = pf_rule_setup(pr, rule, ruleset))) + break; + TAILQ_INSERT_TAIL(ruleset->rules[rs_num].inactive.ptr, rule, entries); ruleset->rules[rs_num].inactive.rcount++; + if (rule->rule_flag & PFRULE_PFM) + pffwrules++; break; } case DIOCGETRULES: { - struct pfioc_rule *pr = (struct pfioc_rule *)addr; struct pf_ruleset *ruleset; struct pf_rule *tail; int rs_num; @@ -1852,7 +3151,6 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } case DIOCGETRULE: { - struct pfioc_rule *pr = (struct pfioc_rule *)addr; struct pf_ruleset *ruleset; struct pf_rule *rule; int rs_num, i; @@ -1907,9 +3205,10 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } case DIOCCHANGERULE: { - struct pfioc_rule *pcr = (struct pfioc_rule *)addr; + struct pfioc_rule *pcr = pr; struct pf_ruleset *ruleset; struct pf_rule *oldrule = NULL, *newrule = NULL; + struct pf_pooladdr *pa; u_int32_t nr = 0; int rs_num; @@ -1959,7 +3258,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENOMEM; break; } - 
pf_rule_copyin(&pcr->rule, newrule, p); + pf_rule_copyin(&pcr->rule, newrule, p, minordev); #if !INET if (newrule->af == AF_INET) { pool_put(&pf_rule_pl, newrule); @@ -1985,20 +3284,20 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } else newrule->kif = NULL; -#if ALTQ +#if PF_ALTQ /* set queue IDs */ - if (newrule->qname[0] != 0) { + if (altq_allowed && newrule->qname[0] != '\0') { if ((newrule->qid = pf_qname2qid(newrule->qname)) == 0) error = EBUSY; - else if (newrule->pqname[0] != 0) { + else if (newrule->pqname[0] != '\0') { if ((newrule->pqid = pf_qname2qid(newrule->pqname)) == 0) error = EBUSY; } else newrule->pqid = newrule->qid; } -#endif /* ALTQ */ +#endif /* PF_ALTQ */ if (newrule->tagname[0]) if ((newrule->tag = pf_tagname2tag(newrule->tagname)) == 0) @@ -2112,9 +3411,149 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; } + case DIOCINSERTRULE: { + struct pf_ruleset *ruleset; + struct pf_rule *rule, *tail, *r; + int rs_num; + int is_anchor; + + pr->anchor[sizeof (pr->anchor) - 1] = '\0'; + pr->anchor_call[sizeof (pr->anchor_call) - 1] = '\0'; + is_anchor = (pr->anchor_call[0] != '\0'); + + if ((ruleset = pf_find_ruleset_with_owner(pr->anchor, + pr->rule.owner, is_anchor, &error)) == NULL) + break; + + rs_num = pf_get_ruleset_number(pr->rule.action); + if (rs_num >= PF_RULESET_MAX) { + error = EINVAL; + break; + } + if (pr->rule.return_icmp >> 8 > ICMP_MAXTYPE) { + error = EINVAL; + break; + } + + /* make sure this anchor rule doesn't exist already */ + if (is_anchor) { + r = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr); + while (r) { + if (r->anchor && + ((strcmp(r->anchor->name, + pr->anchor_call)) == 0)) { + if (((strcmp(pr->rule.owner, + r->owner)) == 0) || + ((strcmp(r->owner, "")) == 0)) + error = EEXIST; + else + error = EPERM; + break; + } + r = TAILQ_NEXT(r, entries); + } + } + + rule = pool_get(&pf_rule_pl, PR_WAITOK); + if (rule == NULL) { + error = ENOMEM; + break; + } + pf_rule_copyin(&pr->rule, rule, p, minordev); +#if !INET + if (rule->af == AF_INET) { + pool_put(&pf_rule_pl, rule); + error = EAFNOSUPPORT; + break; + } +#endif /* INET */ +#if !INET6 + if (rule->af == AF_INET6) { + pool_put(&pf_rule_pl, rule); + error = EAFNOSUPPORT; + break; + } + +#endif /* INET6 */ + r = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr); + while ((r != NULL) && (rule->priority >= (unsigned)r->priority)) + r = TAILQ_NEXT(r, entries); + if (r == NULL) { + if ((tail = + TAILQ_LAST(ruleset->rules[rs_num].active.ptr, + pf_rulequeue)) != NULL) + rule->nr = tail->nr + 1; + else + rule->nr = 0; + } else { + rule->nr = r->nr; + } + + if ((error = pf_rule_setup(pr, rule, ruleset))) + break; + + if (rule->anchor != NULL) + strncpy(rule->anchor->owner, rule->owner, + PF_OWNER_NAME_SIZE); + + if (r) { + TAILQ_INSERT_BEFORE(r, rule, entries); + while (r && ++r->nr) + r = TAILQ_NEXT(r, entries); + } else + TAILQ_INSERT_TAIL(ruleset->rules[rs_num].active.ptr, + rule, entries); + ruleset->rules[rs_num].active.rcount++; + + /* Calculate checksum for the main ruleset */ + if (ruleset == &pf_main_ruleset) + error = pf_setup_pfsync_matching(ruleset); + + pf_ruleset_cleanup(ruleset, rs_num); + rule->ticket = ruleset->rules[rs_num].active.ticket; + + pr->rule.ticket = rule->ticket; + pf_rule_copyout(rule, &pr->rule); + if (rule->rule_flag & PFRULE_PFM) + pffwrules++; + break; + } + + case DIOCDELETERULE: { + pr->anchor[sizeof (pr->anchor) - 1] = '\0'; + pr->anchor_call[sizeof (pr->anchor_call) - 1] = '\0'; + + if (pr->rule.return_icmp >> 8 > 
ICMP_MAXTYPE) { + error = EINVAL; + break; + } + + if (pr->rule.ticket) { + if ((error = pf_delete_rule_by_ticket(pr))) + break; + } else + pf_delete_rule_by_owner(pr->rule.owner); + pr->nr = pffwrules; + break; + } + + default: + VERIFY(0); + /* NOTREACHED */ + } + + return (error); +} + +static int +pfioctl_ioc_state_kill(u_long cmd, struct pfioc_state_kill *psk, struct proc *p) +{ +#pragma unused(p) + int error = 0; + + switch (cmd) { case DIOCCLRSTATES: { struct pf_state *s, *nexts; - struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr; int killed = 0; psk->psk_ifname[sizeof (psk->psk_ifname) - 1] = '\0'; @@ -2142,7 +3581,6 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) struct pf_state *s, *nexts; struct pf_state_key *sk; struct pf_state_host *src, *dst; - struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr; int killed = 0; for (s = RB_MIN(pf_state_tree_id, &tree_id); s; @@ -2167,23 +3605,12 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) &psk->psk_dst.addr.v.a.addr, &psk->psk_dst.addr.v.a.mask, &dst->addr, sk->af) && -#ifndef NO_APPLE_EXTENSIONS (pf_match_xport(psk->psk_proto, psk->psk_proto_variant, &psk->psk_src.xport, &src->xport)) && (pf_match_xport(psk->psk_proto, psk->psk_proto_variant, &psk->psk_dst.xport, &dst->xport)) && -#else - (psk->psk_src.port_op == 0 || - pf_match_port(psk->psk_src.port_op, - psk->psk_src.port[0], psk->psk_src.port[1], - src->port)) && - (psk->psk_dst.port_op == 0 || - pf_match_port(psk->psk_dst.port_op, - psk->psk_dst.port[0], psk->psk_dst.port[1], - dst->port)) && -#endif (!psk->psk_ifname[0] || strcmp(psk->psk_ifname, s->kif->pfik_name) == 0)) { #if NPFSYNC @@ -2199,9 +3626,23 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; } + default: + VERIFY(0); + /* NOTREACHED */ + } + + return (error); +} + +static int +pfioctl_ioc_state(u_long cmd, struct pfioc_state *ps, struct proc *p) +{ +#pragma unused(p) + int error = 0; + + switch (cmd) { case DIOCADDSTATE: { - struct pfioc_state *ps = (struct pfioc_state *)addr; - struct pfsync_state *sp = &ps->state; + struct pfsync_state *sp = &ps->state; struct pf_state *s; struct pf_state_key *sk; struct pfi_kif *kif; @@ -2217,7 +3658,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; } bzero(s, sizeof (struct pf_state)); - if ((sk = pf_alloc_state_key(s)) == NULL) { + if ((sk = pf_alloc_state_key(s, NULL)) == NULL) { pool_put(&pf_state_pl, s); error = ENOMEM; break; @@ -2230,10 +3671,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENOENT; break; } -#ifndef NO_APPLE_EXTENSIONS TAILQ_INIT(&s->unlink_hooks); s->state_key->app_state = 0; -#endif if (pf_insert_state(kif, s)) { pfi_kif_unref(kif, PFI_KIF_REF_NONE); pool_put(&pf_state_pl, s); @@ -2246,7 +3685,6 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } case DIOCGETSTATE: { - struct pfioc_state *ps = (struct pfioc_state *)addr; struct pf_state *s; struct pf_state_cmp id_key; @@ -2263,27 +3701,50 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; } - case DIOCGETSTATES: { - struct pfioc_states *ps = (struct pfioc_states *)addr; + default: + VERIFY(0); + /* NOTREACHED */ + } + + return (error); +} + +static int +pfioctl_ioc_states(u_long cmd, struct pfioc_states_32 *ps32, + struct pfioc_states_64 *ps64, struct proc *p) +{ + int p64 = proc_is64bit(p); + int error = 0; + + switch (cmd) { + case DIOCGETSTATES: { /* struct pfioc_states 
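/*
 * Illustrative sketch (not from the patch): DIOCINSERTRULE above keeps
 * the active queue sorted by priority -- skip every rule whose priority
 * is <= the new one, insert before the first strictly larger entry (or
 * at the tail), and renumber the displaced suffix.  Generic form of
 * that sorted insert (types invented):
 */
#include <stddef.h>
#include <sys/queue.h>

struct sk_rule {
	unsigned int priority;
	unsigned int nr;
	TAILQ_ENTRY(sk_rule) entries;
};
TAILQ_HEAD(sk_rulehead, sk_rule);

static void
sketch_sorted_insert(struct sk_rulehead *head, struct sk_rule *nrule)
{
	struct sk_rule *r = TAILQ_FIRST(head);

	while (r != NULL && nrule->priority >= r->priority)
		r = TAILQ_NEXT(r, entries);
	if (r != NULL) {
		nrule->nr = r->nr;	/* take over the ordinal */
		TAILQ_INSERT_BEFORE(r, nrule, entries);
		/* bump the ordinal of everything that now follows */
		for (r = TAILQ_NEXT(nrule, entries); r != NULL;
		    r = TAILQ_NEXT(r, entries))
			r->nr++;
	} else {
		struct sk_rule *tail = TAILQ_LAST(head, sk_rulehead);

		nrule->nr = (tail != NULL) ? tail->nr + 1 : 0;
		TAILQ_INSERT_TAIL(head, nrule, entries);
	}
}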
*/ struct pf_state *state; struct pfsync_state *pstore; user_addr_t buf; u_int32_t nr = 0; + int len, size; - if (ps->ps_len == 0) { - nr = pf_status.states; - ps->ps_len = sizeof (struct pfsync_state) * nr; + len = (p64 ? ps64->ps_len : ps32->ps_len); + if (len == 0) { + size = sizeof (struct pfsync_state) * pf_status.states; + if (p64) + ps64->ps_len = size; + else + ps32->ps_len = size; break; } pstore = _MALLOC(sizeof (*pstore), M_TEMP, M_WAITOK); - buf = PF_USER_ADDR(addr, pfioc_states, ps_buf); + if (pstore == NULL) { + error = ENOMEM; + break; + } + buf = (p64 ? ps64->ps_buf : ps32->ps_buf); state = TAILQ_FIRST(&state_list); while (state) { if (state->timeout != PFTM_UNLINKED) { - if ((nr + 1) * sizeof (*pstore) > - (unsigned)ps->ps_len) + if ((nr + 1) * sizeof (*pstore) > (unsigned)len) break; pf_state_export(pstore, @@ -2299,42 +3760,32 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) state = TAILQ_NEXT(state, entry_list); } - ps->ps_len = sizeof (struct pfsync_state) * nr; + size = sizeof (struct pfsync_state) * nr; + if (p64) + ps64->ps_len = size; + else + ps32->ps_len = size; _FREE(pstore, M_TEMP); break; } - case DIOCGETSTATUS: { - struct pf_status *s = (struct pf_status *)addr; - bcopy(&pf_status, s, sizeof (struct pf_status)); - pfi_update_status(s->ifname, s); - break; - } - - case DIOCSETSTATUSIF: { - struct pfioc_if *pi = (struct pfioc_if *)addr; - - if (pi->ifname[0] == 0) { - bzero(pf_status.ifname, IFNAMSIZ); - break; - } - strlcpy(pf_status.ifname, pi->ifname, IFNAMSIZ); - break; + default: + VERIFY(0); + /* NOTREACHED */ } +fail: + return (error); +} - case DIOCCLRSTATUS: { - bzero(pf_status.counters, sizeof (pf_status.counters)); - bzero(pf_status.fcounters, sizeof (pf_status.fcounters)); - bzero(pf_status.scounters, sizeof (pf_status.scounters)); - pf_status.since = pf_calendar_time_second(); - if (*pf_status.ifname) - pfi_update_status(pf_status.ifname, NULL); - break; - } +static int +pfioctl_ioc_natlook(u_long cmd, struct pfioc_natlook *pnl, struct proc *p) +{ +#pragma unused(p) + int error = 0; + switch (cmd) { case DIOCNATLOOK: { - struct pfioc_natlook *pnl = (struct pfioc_natlook *)addr; struct pf_state_key *sk; struct pf_state *state; struct pf_state_key_cmp key; @@ -2342,21 +3793,14 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) key.af = pnl->af; key.proto = pnl->proto; - -#ifndef NO_APPLE_EXTENSIONS key.proto_variant = pnl->proto_variant; -#endif if (!pnl->proto || PF_AZERO(&pnl->saddr, pnl->af) || PF_AZERO(&pnl->daddr, pnl->af) || ((pnl->proto == IPPROTO_TCP || pnl->proto == IPPROTO_UDP) && -#ifndef NO_APPLE_EXTENSIONS (!pnl->dxport.port || !pnl->sxport.port))) -#else - (!pnl->dport || !pnl->sport))) -#endif error = EINVAL; else { /* @@ -2367,35 +3811,19 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) */ if (direction == PF_IN) { PF_ACPY(&key.ext.addr, &pnl->daddr, pnl->af); -#ifndef NO_APPLE_EXTENSIONS memcpy(&key.ext.xport, &pnl->dxport, sizeof (key.ext.xport)); -#else - key.ext.port = pnl->dport; -#endif PF_ACPY(&key.gwy.addr, &pnl->saddr, pnl->af); -#ifndef NO_APPLE_EXTENSIONS memcpy(&key.gwy.xport, &pnl->sxport, sizeof (key.gwy.xport)); -#else - key.gwy.port = pnl->sport; -#endif state = pf_find_state_all(&key, PF_IN, &m); } else { PF_ACPY(&key.lan.addr, &pnl->daddr, pnl->af); -#ifndef NO_APPLE_EXTENSIONS memcpy(&key.lan.xport, &pnl->dxport, sizeof (key.lan.xport)); -#else - key.lan.port = pnl->dport; -#endif PF_ACPY(&key.ext.addr, &pnl->saddr, pnl->af); -#ifndef 
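/*
 * Illustrative sketch (not from the patch): the reworked DIOCGETSTATES
 * above exports one record at a time and stops as soon as the next
 * record would overflow the caller's buffer -- the guard is
 * "(nr + 1) * sizeof (*pstore) > len".  ps_len is then rewritten to
 * the bytes actually produced, so the caller can detect truncation.
 * The capacity check in isolation (function name invented):
 */
#include <stddef.h>

/* Does one more fixed-size record still fit in the buffer? */
static int
sketch_record_fits(size_t nr_done, size_t rec_size, size_t buf_len)
{
	return ((nr_done + 1) * rec_size <= buf_len);
}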
NO_APPLE_EXTENSIONS memcpy(&key.ext.xport, &pnl->sxport, sizeof (key.ext.xport)); -#else - key.ext.port = pnl->sport; -#endif state = pf_find_state_all(&key, PF_OUT, &m); } if (m > 1) @@ -2405,37 +3833,21 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) if (direction == PF_IN) { PF_ACPY(&pnl->rsaddr, &sk->lan.addr, sk->af); -#ifndef NO_APPLE_EXTENSIONS memcpy(&pnl->rsxport, &sk->lan.xport, sizeof (pnl->rsxport)); -#else - pnl->rsport = sk->lan.port; -#endif PF_ACPY(&pnl->rdaddr, &pnl->daddr, pnl->af); -#ifndef NO_APPLE_EXTENSIONS memcpy(&pnl->rdxport, &pnl->dxport, sizeof (pnl->rdxport)); -#else - pnl->rdport = pnl->dport; -#endif } else { PF_ACPY(&pnl->rdaddr, &sk->gwy.addr, sk->af); -#ifndef NO_APPLE_EXTENSIONS memcpy(&pnl->rdxport, &sk->gwy.xport, sizeof (pnl->rdxport)); -#else - pnl->rdport = sk->gwy.port; -#endif PF_ACPY(&pnl->rsaddr, &pnl->saddr, pnl->af); -#ifndef NO_APPLE_EXTENSIONS memcpy(&pnl->rsxport, &pnl->sxport, sizeof (pnl->rsxport)); -#else - pnl->rsport = pnl->sport; -#endif } } else error = ENOENT; @@ -2443,9 +3855,23 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; } + default: + VERIFY(0); + /* NOTREACHED */ + } + + return (error); +} + +static int +pfioctl_ioc_tm(u_long cmd, struct pfioc_tm *pt, struct proc *p) +{ +#pragma unused(p) + int error = 0; + + switch (cmd) { case DIOCSETTIMEOUT: { - struct pfioc_tm *pt = (struct pfioc_tm *)addr; - int old; + int old; if (pt->timeout < 0 || pt->timeout >= PFTM_MAX || pt->seconds < 0) { @@ -2463,8 +3889,6 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } case DIOCGETTIMEOUT: { - struct pfioc_tm *pt = (struct pfioc_tm *)addr; - if (pt->timeout < 0 || pt->timeout >= PFTM_MAX) { error = EINVAL; goto fail; @@ -2473,8 +3897,22 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; } + default: + VERIFY(0); + /* NOTREACHED */ + } +fail: + return (error); +} + +static int +pfioctl_ioc_limit(u_long cmd, struct pfioc_limit *pl, struct proc *p) +{ +#pragma unused(p) + int error = 0; + + switch (cmd) { case DIOCGETLIMIT: { - struct pfioc_limit *pl = (struct pfioc_limit *)addr; if (pl->index < 0 || pl->index >= PF_LIMIT_MAX) { error = EINVAL; @@ -2485,8 +3923,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } case DIOCSETLIMIT: { - struct pfioc_limit *pl = (struct pfioc_limit *)addr; - int old_limit; + int old_limit; if (pl->index < 0 || pl->index >= PF_LIMIT_MAX || pf_pool_limits[pl->index].pp == NULL) { @@ -2501,188 +3938,30 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; } - case DIOCSETDEBUG: { - u_int32_t *level = (u_int32_t *)addr; - - pf_status.debug = *level; - break; - } - - case DIOCCLRRULECTRS: { - /* obsoleted by DIOCGETRULE with action=PF_GET_CLR_CNTR */ - struct pf_ruleset *ruleset = &pf_main_ruleset; - struct pf_rule *rule; - - TAILQ_FOREACH(rule, - ruleset->rules[PF_RULESET_FILTER].active.ptr, entries) { - rule->evaluations = 0; - rule->packets[0] = rule->packets[1] = 0; - rule->bytes[0] = rule->bytes[1] = 0; - } - break; - } - -#if ALTQ - case DIOCSTARTALTQ: { - struct pf_altq *altq; - - /* enable all altq interfaces on active list */ - TAILQ_FOREACH(altq, pf_altqs_active, entries) { - if (altq->qname[0] == 0) { - error = pf_enable_altq(altq); - if (error != 0) - break; - } - } - if (error == 0) - pf_altq_running = 1; - DPFPRINTF(PF_DEBUG_MISC, ("altq: started\n")); - break; - } - - case DIOCSTOPALTQ: { - struct pf_altq *altq; - - /* disable all altq 
interfaces on active list */ - TAILQ_FOREACH(altq, pf_altqs_active, entries) { - if (altq->qname[0] == 0) { - error = pf_disable_altq(altq); - if (error != 0) - break; - } - } - if (error == 0) - pf_altq_running = 0; - DPFPRINTF(PF_DEBUG_MISC, ("altq: stopped\n")); - break; - } - - case DIOCADDALTQ: { - struct pfioc_altq *pa = (struct pfioc_altq *)addr; - struct pf_altq *altq, *a; - - if (pa->ticket != ticket_altqs_inactive) { - error = EBUSY; - break; - } - altq = pool_get(&pf_altq_pl, PR_WAITOK); - if (altq == NULL) { - error = ENOMEM; - break; - } - pf_altq_copyin(&pa->altq, altq); - - /* - * if this is for a queue, find the discipline and - * copy the necessary fields - */ - if (altq->qname[0] != 0) { - if ((altq->qid = pf_qname2qid(altq->qname)) == 0) { - error = EBUSY; - pool_put(&pf_altq_pl, altq); - break; - } - altq->altq_disc = NULL; - TAILQ_FOREACH(a, pf_altqs_inactive, entries) { - if (strncmp(a->ifname, altq->ifname, - IFNAMSIZ) == 0 && a->qname[0] == 0) { - altq->altq_disc = a->altq_disc; - break; - } - } - } - - error = altq_add(altq); - if (error) { - pool_put(&pf_altq_pl, altq); - break; - } - - TAILQ_INSERT_TAIL(pf_altqs_inactive, altq, entries); - pf_altq_copyout(altq, &pa->altq); - break; - } - - case DIOCGETALTQS: { - struct pfioc_altq *pa = (struct pfioc_altq *)addr; - struct pf_altq *altq; - - pa->nr = 0; - TAILQ_FOREACH(altq, pf_altqs_active, entries) - pa->nr++; - pa->ticket = ticket_altqs_active; - break; - } - - case DIOCGETALTQ: { - struct pfioc_altq *pa = (struct pfioc_altq *)addr; - struct pf_altq *altq; - u_int32_t nr; - - if (pa->ticket != ticket_altqs_active) { - error = EBUSY; - break; - } - nr = 0; - altq = TAILQ_FIRST(pf_altqs_active); - while ((altq != NULL) && (nr < pa->nr)) { - altq = TAILQ_NEXT(altq, entries); - nr++; - } - if (altq == NULL) { - error = EBUSY; - break; - } - pf_altq_copyout(altq, &pa->altq); - break; + default: + VERIFY(0); + /* NOTREACHED */ } +fail: + return (error); +} - case DIOCCHANGEALTQ: - /* CHANGEALTQ not supported yet! 
*/ - error = ENODEV; - break; - - case DIOCGETQSTATS: { - struct pfioc_qstats *pq = (struct pfioc_qstats *)addr; - struct pf_altq *altq; - u_int32_t nr; - int nbytes; - - if (pq->ticket != ticket_altqs_active) { - error = EBUSY; - break; - } - nbytes = pq->nbytes; - nr = 0; - altq = TAILQ_FIRST(pf_altqs_active); - while ((altq != NULL) && (nr < pq->nr)) { - altq = TAILQ_NEXT(altq, entries); - nr++; - } - if (altq == NULL) { - error = EBUSY; - break; - } - error = altq_getqstats(altq, pq->buf, &nbytes); - if (error == 0) { - pq->scheduler = altq->scheduler; - pq->nbytes = nbytes; - } - break; - } -#endif /* ALTQ */ +static int +pfioctl_ioc_pooladdr(u_long cmd, struct pfioc_pooladdr *pp, struct proc *p) +{ +#pragma unused(p) + struct pf_pooladdr *pa = NULL; + struct pf_pool *pool = NULL; + int error = 0; + switch (cmd) { case DIOCBEGINADDRS: { - struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; - pf_empty_pool(&pf_pabuf); pp->ticket = ++ticket_pabuf; break; } case DIOCADDADDR: { - struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; - pp->anchor[sizeof (pp->anchor) - 1] = '\0'; if (pp->ticket != ticket_pabuf) { error = EBUSY; @@ -2733,8 +4012,6 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } case DIOCGETADDRS: { - struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; - pp->nr = 0; pp->anchor[sizeof (pp->anchor) - 1] = '\0'; pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action, @@ -2749,7 +4026,6 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } case DIOCGETADDR: { - struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; u_int32_t nr = 0; pp->anchor[sizeof (pp->anchor) - 1] = '\0'; @@ -2776,7 +4052,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } case DIOCCHANGEADDR: { - struct pfioc_pooladdr *pca = (struct pfioc_pooladdr *)addr; + struct pfioc_pooladdr *pca = pp; struct pf_pooladdr *oldpa = NULL, *newpa = NULL; struct pf_ruleset *ruleset; @@ -2886,8 +4162,22 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; } + default: + VERIFY(0); + /* NOTREACHED */ + } + + return (error); +} + +static int +pfioctl_ioc_ruleset(u_long cmd, struct pfioc_ruleset *pr, struct proc *p) +{ +#pragma unused(p) + int error = 0; + + switch (cmd) { case DIOCGETRULESETS: { - struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr; struct pf_ruleset *ruleset; struct pf_anchor *anchor; @@ -2912,7 +4202,6 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } case DIOCGETRULESET: { - struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr; struct pf_ruleset *ruleset; struct pf_anchor *anchor; u_int32_t nr = 0; @@ -2945,259 +4234,39 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; } - case DIOCRCLRTABLES: { - struct pfioc_table *io = (struct pfioc_table *)addr; - - if (io->pfrio_esize != 0) { - error = ENODEV; - break; - } - pfr_table_copyin_cleanup(&io->pfrio_table); - error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel, - io->pfrio_flags | PFR_FLAG_USERIOCTL); - break; - } - - case DIOCRADDTABLES: { - struct pfioc_table *io = (struct pfioc_table *)addr; - user_addr_t buf = PF_USER_ADDR(addr, pfioc_table, pfrio_buffer); - - if (io->pfrio_esize != sizeof (struct pfr_table)) { - error = ENODEV; - break; - } - error = pfr_add_tables(buf, io->pfrio_size, - &io->pfrio_nadd, io->pfrio_flags | PFR_FLAG_USERIOCTL); - break; - } - - case DIOCRDELTABLES: { - struct pfioc_table *io = (struct pfioc_table *)addr; - user_addr_t buf = 
PF_USER_ADDR(addr, pfioc_table, pfrio_buffer); - - if (io->pfrio_esize != sizeof (struct pfr_table)) { - error = ENODEV; - break; - } - error = pfr_del_tables(buf, io->pfrio_size, - &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); - break; - } - - case DIOCRGETTABLES: { - struct pfioc_table *io = (struct pfioc_table *)addr; - user_addr_t buf = PF_USER_ADDR(addr, pfioc_table, pfrio_buffer); - - if (io->pfrio_esize != sizeof (struct pfr_table)) { - error = ENODEV; - break; - } - pfr_table_copyin_cleanup(&io->pfrio_table); - error = pfr_get_tables(&io->pfrio_table, buf, - &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); - break; - } - - case DIOCRGETTSTATS: { - struct pfioc_table *io = (struct pfioc_table *)addr; - user_addr_t buf = PF_USER_ADDR(addr, pfioc_table, pfrio_buffer); - - if (io->pfrio_esize != sizeof (struct pfr_tstats)) { - error = ENODEV; - break; - } - pfr_table_copyin_cleanup(&io->pfrio_table); - error = pfr_get_tstats(&io->pfrio_table, buf, - &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); - break; - } - - case DIOCRCLRTSTATS: { - struct pfioc_table *io = (struct pfioc_table *)addr; - user_addr_t buf = PF_USER_ADDR(addr, pfioc_table, pfrio_buffer); - - if (io->pfrio_esize != sizeof (struct pfr_table)) { - error = ENODEV; - break; - } - error = pfr_clr_tstats(buf, io->pfrio_size, - &io->pfrio_nzero, io->pfrio_flags | PFR_FLAG_USERIOCTL); - break; - } - - case DIOCRSETTFLAGS: { - struct pfioc_table *io = (struct pfioc_table *)addr; - user_addr_t buf = PF_USER_ADDR(addr, pfioc_table, pfrio_buffer); - - if (io->pfrio_esize != sizeof (struct pfr_table)) { - error = ENODEV; - break; - } - error = pfr_set_tflags(buf, io->pfrio_size, - io->pfrio_setflag, io->pfrio_clrflag, &io->pfrio_nchange, - &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); - break; - } - - case DIOCRCLRADDRS: { - struct pfioc_table *io = (struct pfioc_table *)addr; - - if (io->pfrio_esize != 0) { - error = ENODEV; - break; - } - pfr_table_copyin_cleanup(&io->pfrio_table); - error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel, - io->pfrio_flags | PFR_FLAG_USERIOCTL); - break; - } - - case DIOCRADDADDRS: { - struct pfioc_table *io = (struct pfioc_table *)addr; - user_addr_t buf = PF_USER_ADDR(addr, pfioc_table, pfrio_buffer); - - if (io->pfrio_esize != sizeof (struct pfr_addr)) { - error = ENODEV; - break; - } - pfr_table_copyin_cleanup(&io->pfrio_table); - error = pfr_add_addrs(&io->pfrio_table, buf, - io->pfrio_size, &io->pfrio_nadd, io->pfrio_flags | - PFR_FLAG_USERIOCTL); - break; - } - - case DIOCRDELADDRS: { - struct pfioc_table *io = (struct pfioc_table *)addr; - user_addr_t buf = PF_USER_ADDR(addr, pfioc_table, pfrio_buffer); - - if (io->pfrio_esize != sizeof (struct pfr_addr)) { - error = ENODEV; - break; - } - pfr_table_copyin_cleanup(&io->pfrio_table); - error = pfr_del_addrs(&io->pfrio_table, buf, - io->pfrio_size, &io->pfrio_ndel, io->pfrio_flags | - PFR_FLAG_USERIOCTL); - break; - } - - case DIOCRSETADDRS: { - struct pfioc_table *io = (struct pfioc_table *)addr; - user_addr_t buf = PF_USER_ADDR(addr, pfioc_table, pfrio_buffer); - - if (io->pfrio_esize != sizeof (struct pfr_addr)) { - error = ENODEV; - break; - } - pfr_table_copyin_cleanup(&io->pfrio_table); - error = pfr_set_addrs(&io->pfrio_table, buf, - io->pfrio_size, &io->pfrio_size2, &io->pfrio_nadd, - &io->pfrio_ndel, &io->pfrio_nchange, io->pfrio_flags | - PFR_FLAG_USERIOCTL, 0); - break; - } - - case DIOCRGETADDRS: { - struct pfioc_table *io = (struct pfioc_table *)addr; - user_addr_t buf = PF_USER_ADDR(addr, 
pfioc_table, pfrio_buffer); - - if (io->pfrio_esize != sizeof (struct pfr_addr)) { - error = ENODEV; - break; - } - pfr_table_copyin_cleanup(&io->pfrio_table); - error = pfr_get_addrs(&io->pfrio_table, buf, - &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); - break; - } - - case DIOCRGETASTATS: { - struct pfioc_table *io = (struct pfioc_table *)addr; - user_addr_t buf = PF_USER_ADDR(addr, pfioc_table, pfrio_buffer); - - if (io->pfrio_esize != sizeof (struct pfr_astats)) { - error = ENODEV; - break; - } - pfr_table_copyin_cleanup(&io->pfrio_table); - error = pfr_get_astats(&io->pfrio_table, buf, - &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); - break; - } - - case DIOCRCLRASTATS: { - struct pfioc_table *io = (struct pfioc_table *)addr; - user_addr_t buf = PF_USER_ADDR(addr, pfioc_table, pfrio_buffer); - - if (io->pfrio_esize != sizeof (struct pfr_addr)) { - error = ENODEV; - break; - } - pfr_table_copyin_cleanup(&io->pfrio_table); - error = pfr_clr_astats(&io->pfrio_table, buf, - io->pfrio_size, &io->pfrio_nzero, io->pfrio_flags | - PFR_FLAG_USERIOCTL); - break; - } - - case DIOCRTSTADDRS: { - struct pfioc_table *io = (struct pfioc_table *)addr; - user_addr_t buf = PF_USER_ADDR(addr, pfioc_table, pfrio_buffer); - - if (io->pfrio_esize != sizeof (struct pfr_addr)) { - error = ENODEV; - break; - } - pfr_table_copyin_cleanup(&io->pfrio_table); - error = pfr_tst_addrs(&io->pfrio_table, buf, - io->pfrio_size, &io->pfrio_nmatch, io->pfrio_flags | - PFR_FLAG_USERIOCTL); - break; + default: + VERIFY(0); + /* NOTREACHED */ } - case DIOCRINADEFINE: { - struct pfioc_table *io = (struct pfioc_table *)addr; - user_addr_t buf = PF_USER_ADDR(addr, pfioc_table, pfrio_buffer); - - if (io->pfrio_esize != sizeof (struct pfr_addr)) { - error = ENODEV; - break; - } - pfr_table_copyin_cleanup(&io->pfrio_table); - error = pfr_ina_define(&io->pfrio_table, buf, - io->pfrio_size, &io->pfrio_nadd, &io->pfrio_naddr, - io->pfrio_ticket, io->pfrio_flags | PFR_FLAG_USERIOCTL); - break; - } + return (error); +} - case DIOCOSFPADD: { - struct pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr; - error = pf_osfp_add(io); - break; - } +static int +pfioctl_ioc_trans(u_long cmd, struct pfioc_trans_32 *io32, + struct pfioc_trans_64 *io64, struct proc *p) +{ + int p64 = proc_is64bit(p); + int error = 0, esize, size; + user_addr_t buf; - case DIOCOSFPGET: { - struct pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr; - error = pf_osfp_get(io); - break; - } + esize = (p64 ? io64->esize : io32->esize); + size = (p64 ? io64->size : io32->size); + buf = (p64 ? 
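/*
 * Illustrative sketch (not from the patch): the transaction ioctls
 * below walk a user-space array by advancing a user_addr_t cursor one
 * element per copyin(), validating each element before anything is
 * applied.  The loop shape; sk_user_addr_t and sketch_copyin() are
 * stand-ins for the kernel primitives, and the direct memcpy is only
 * a placeholder for a real user-to-kernel copy:
 */
#include <stdint.h>
#include <string.h>

typedef uint64_t sk_user_addr_t;

/* stand-in for copyin(): returns 0 on success */
static int
sketch_copyin(sk_user_addr_t uaddr, void *kaddr, size_t len)
{
	memcpy(kaddr, (void *)(uintptr_t)uaddr, len);	/* sketch only */
	return (0);
}

static int
sketch_walk_user_array(sk_user_addr_t buf, int count, size_t elem_size,
    int (*visit)(void *elem), void *scratch)
{
	int i, error;

	for (i = 0; i < count; i++, buf += elem_size) {
		if ((error = sketch_copyin(buf, scratch, elem_size)) != 0)
			return (error);
		if ((error = visit(scratch)) != 0)
			return (error);
	}
	return (0);
}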
io64->array : io32->array); + switch (cmd) { case DIOCXBEGIN: { - struct pfioc_trans *io = (struct pfioc_trans *)addr; struct pfioc_trans_e *ioe; struct pfr_table *table; - user_addr_t buf; int i; - if (io->esize != sizeof (*ioe)) { + if (esize != sizeof (*ioe)) { error = ENODEV; goto fail; } ioe = _MALLOC(sizeof (*ioe), M_TEMP, M_WAITOK); table = _MALLOC(sizeof (*table), M_TEMP, M_WAITOK); - buf = PF_USER_ADDR(addr, pfioc_trans, array); - for (i = 0; i < io->size; i++, buf += sizeof (*ioe)) { + for (i = 0; i < size; i++, buf += sizeof (*ioe)) { if (copyin(buf, ioe, sizeof (*ioe))) { _FREE(table, M_TEMP); _FREE(ioe, M_TEMP); @@ -3207,19 +4276,22 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) ioe->anchor[sizeof (ioe->anchor) - 1] = '\0'; switch (ioe->rs_num) { case PF_RULESET_ALTQ: -#if ALTQ - if (ioe->anchor[0]) { - _FREE(table, M_TEMP); - _FREE(ioe, M_TEMP); - error = EINVAL; - goto fail; - } - if ((error = pf_begin_altq(&ioe->ticket))) { - _FREE(table, M_TEMP); - _FREE(ioe, M_TEMP); - goto fail; +#if PF_ALTQ + if (altq_allowed) { + if (ioe->anchor[0]) { + _FREE(table, M_TEMP); + _FREE(ioe, M_TEMP); + error = EINVAL; + goto fail; + } + error = pf_begin_altq(&ioe->ticket); + if (error != 0) { + _FREE(table, M_TEMP); + _FREE(ioe, M_TEMP); + goto fail; + } } -#endif /* ALTQ */ +#endif /* PF_ALTQ */ break; case PF_RULESET_TABLE: bzero(table, sizeof (*table)); @@ -3254,20 +4326,17 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } case DIOCXROLLBACK: { - struct pfioc_trans *io = (struct pfioc_trans *)addr; struct pfioc_trans_e *ioe; struct pfr_table *table; - user_addr_t buf; int i; - if (io->esize != sizeof (*ioe)) { + if (esize != sizeof (*ioe)) { error = ENODEV; goto fail; } ioe = _MALLOC(sizeof (*ioe), M_TEMP, M_WAITOK); table = _MALLOC(sizeof (*table), M_TEMP, M_WAITOK); - buf = PF_USER_ADDR(addr, pfioc_trans, array); - for (i = 0; i < io->size; i++, buf += sizeof (*ioe)) { + for (i = 0; i < size; i++, buf += sizeof (*ioe)) { if (copyin(buf, ioe, sizeof (*ioe))) { _FREE(table, M_TEMP); _FREE(ioe, M_TEMP); @@ -3277,19 +4346,22 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) ioe->anchor[sizeof (ioe->anchor) - 1] = '\0'; switch (ioe->rs_num) { case PF_RULESET_ALTQ: -#if ALTQ - if (ioe->anchor[0]) { - _FREE(table, M_TEMP); - _FREE(ioe, M_TEMP); - error = EINVAL; - goto fail; - } - if ((error = pf_rollback_altq(ioe->ticket))) { - _FREE(table, M_TEMP); - _FREE(ioe, M_TEMP); - goto fail; /* really bad */ +#if PF_ALTQ + if (altq_allowed) { + if (ioe->anchor[0]) { + _FREE(table, M_TEMP); + _FREE(ioe, M_TEMP); + error = EINVAL; + goto fail; + } + error = pf_rollback_altq(ioe->ticket); + if (error != 0) { + _FREE(table, M_TEMP); + _FREE(ioe, M_TEMP); + goto fail; /* really bad */ + } } -#endif /* ALTQ */ +#endif /* PF_ALTQ */ break; case PF_RULESET_TABLE: bzero(table, sizeof (*table)); @@ -3318,22 +4390,20 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } case DIOCXCOMMIT: { - struct pfioc_trans *io = (struct pfioc_trans *)addr; struct pfioc_trans_e *ioe; struct pfr_table *table; struct pf_ruleset *rs; - user_addr_t _buf, buf; + user_addr_t _buf = buf; int i; - if (io->esize != sizeof (*ioe)) { + if (esize != sizeof (*ioe)) { error = ENODEV; goto fail; } ioe = _MALLOC(sizeof (*ioe), M_TEMP, M_WAITOK); table = _MALLOC(sizeof (*table), M_TEMP, M_WAITOK); - buf = _buf = PF_USER_ADDR(addr, pfioc_trans, array); /* first makes sure everything will succeed */ - for (i = 0; i < io->size; i++, buf += 
sizeof (*ioe)) { + for (i = 0; i < size; i++, buf += sizeof (*ioe)) { if (copyin(buf, ioe, sizeof (*ioe))) { _FREE(table, M_TEMP); _FREE(ioe, M_TEMP); @@ -3343,21 +4413,24 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) ioe->anchor[sizeof (ioe->anchor) - 1] = '\0'; switch (ioe->rs_num) { case PF_RULESET_ALTQ: -#if ALTQ - if (ioe->anchor[0]) { - _FREE(table, M_TEMP); - _FREE(ioe, M_TEMP); - error = EINVAL; - goto fail; - } - if (!altqs_inactive_open || ioe->ticket != - ticket_altqs_inactive) { - _FREE(table, M_TEMP); - _FREE(ioe, M_TEMP); - error = EBUSY; - goto fail; +#if PF_ALTQ + if (altq_allowed) { + if (ioe->anchor[0]) { + _FREE(table, M_TEMP); + _FREE(ioe, M_TEMP); + error = EINVAL; + goto fail; + } + if (!altqs_inactive_open || + ioe->ticket != + ticket_altqs_inactive) { + _FREE(table, M_TEMP); + _FREE(ioe, M_TEMP); + error = EBUSY; + goto fail; + } } -#endif /* ALTQ */ +#endif /* PF_ALTQ */ break; case PF_RULESET_TABLE: rs = pf_find_ruleset(ioe->anchor); @@ -3392,7 +4465,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } buf = _buf; /* now do the commit - no errors should happen here */ - for (i = 0; i < io->size; i++, buf += sizeof (*ioe)) { + for (i = 0; i < size; i++, buf += sizeof (*ioe)) { if (copyin(buf, ioe, sizeof (*ioe))) { _FREE(table, M_TEMP); _FREE(ioe, M_TEMP); @@ -3402,13 +4475,14 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) ioe->anchor[sizeof (ioe->anchor) - 1] = '\0'; switch (ioe->rs_num) { case PF_RULESET_ALTQ: -#if ALTQ - if ((error = pf_commit_altq(ioe->ticket))) { +#if PF_ALTQ + if (altq_allowed && + (error = pf_commit_altq(ioe->ticket))) { _FREE(table, M_TEMP); _FREE(ioe, M_TEMP); goto fail; /* really bad */ } -#endif /* ALTQ */ +#endif /* PF_ALTQ */ break; case PF_RULESET_TABLE: bzero(table, sizeof (*table)); @@ -3436,28 +4510,52 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; } + default: + VERIFY(0); + /* NOTREACHED */ + } +fail: + return (error); +} + +static int +pfioctl_ioc_src_nodes(u_long cmd, struct pfioc_src_nodes_32 *psn32, + struct pfioc_src_nodes_64 *psn64, struct proc *p) +{ + int p64 = proc_is64bit(p); + int error = 0; + + switch (cmd) { case DIOCGETSRCNODES: { - struct pfioc_src_nodes *psn = (struct pfioc_src_nodes *)addr; struct pf_src_node *n, *pstore; user_addr_t buf; u_int32_t nr = 0; - int space = psn->psn_len; + int space, size; + space = (p64 ? psn64->psn_len : psn32->psn_len); if (space == 0) { RB_FOREACH(n, pf_src_tree, &tree_src_tracking) nr++; - psn->psn_len = sizeof (struct pf_src_node) * nr; + + size = sizeof (struct pf_src_node) * nr; + if (p64) + psn64->psn_len = size; + else + psn32->psn_len = size; break; } pstore = _MALLOC(sizeof (*pstore), M_TEMP, M_WAITOK); - buf = PF_USER_ADDR(addr, pfioc_src_nodes, psn_buf); + if (pstore == NULL) { + error = ENOMEM; + break; + } + buf = (p64 ? 
psn64->psn_buf : psn32->psn_buf); RB_FOREACH(n, pf_src_tree, &tree_src_tracking) { uint64_t secs = pf_time_second(), diff; - if ((nr + 1) * sizeof (*pstore) > - (unsigned)psn->psn_len) + if ((nr + 1) * sizeof (*pstore) > (unsigned)space) break; bcopy(n, pstore, sizeof (*pstore)); @@ -3490,34 +4588,37 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) buf += sizeof (*pstore); nr++; } - psn->psn_len = sizeof (struct pf_src_node) * nr; + + size = sizeof (struct pf_src_node) * nr; + if (p64) + psn64->psn_len = size; + else + psn32->psn_len = size; _FREE(pstore, M_TEMP); break; } - case DIOCCLRSRCNODES: { - struct pf_src_node *n; - struct pf_state *state; - - RB_FOREACH(state, pf_state_tree_id, &tree_id) { - state->src_node = NULL; - state->nat_src_node = NULL; - } - RB_FOREACH(n, pf_src_tree, &tree_src_tracking) { - n->expire = 1; - n->states = 0; - } - pf_purge_expired_src_nodes(); - pf_status.src_nodes = 0; - break; + default: + VERIFY(0); + /* NOTREACHED */ } +fail: + return (error); +} + +static int +pfioctl_ioc_src_node_kill(u_long cmd, struct pfioc_src_node_kill *psnk, + struct proc *p) +{ +#pragma unused(p) + int error = 0; + + switch (cmd) { case DIOCKILLSRCNODES: { struct pf_src_node *sn; struct pf_state *s; - struct pfioc_src_node_kill *psnk = - (struct pfioc_src_node_kill *)addr; int killed = 0; RB_FOREACH(sn, pf_src_tree, &tree_src_tracking) { @@ -3552,77 +4653,91 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; } - case DIOCSETHOSTID: { - u_int32_t *hid = (u_int32_t *)addr; - - if (*hid == 0) - pf_status.hostid = random(); - else - pf_status.hostid = *hid; - break; + default: + VERIFY(0); + /* NOTREACHED */ } - case DIOCOSFPFLUSH: - pf_osfp_flush(); - break; + return (error); +} + +static int +pfioctl_ioc_iface(u_long cmd, struct pfioc_iface_32 *io32, + struct pfioc_iface_64 *io64, struct proc *p) +{ + int p64 = proc_is64bit(p); + int error = 0; + switch (cmd) { case DIOCIGETIFACES: { - struct pfioc_iface *io = (struct pfioc_iface *)addr; - user_addr_t buf = PF_USER_ADDR(addr, pfioc_iface, pfiio_buffer); + user_addr_t buf; + int esize; + + buf = (p64 ? io64->pfiio_buffer : io32->pfiio_buffer); + esize = (p64 ? io64->pfiio_esize : io32->pfiio_esize); /* esize must be that of the user space version of pfi_kif */ - if (io->pfiio_esize != sizeof (struct pfi_uif)) { + if (esize != sizeof (struct pfi_uif)) { error = ENODEV; break; } - io->pfiio_name[sizeof (io->pfiio_name) - 1] = '\0'; - error = pfi_get_ifaces(io->pfiio_name, buf, &io->pfiio_size); + if (p64) + io64->pfiio_name[sizeof (io64->pfiio_name) - 1] = '\0'; + else + io32->pfiio_name[sizeof (io32->pfiio_name) - 1] = '\0'; + error = pfi_get_ifaces( + p64 ? io64->pfiio_name : io32->pfiio_name, buf, + p64 ? &io64->pfiio_size : &io32->pfiio_size); break; } case DIOCSETIFFLAG: { - struct pfioc_iface *io = (struct pfioc_iface *)addr; + if (p64) + io64->pfiio_name[sizeof (io64->pfiio_name) - 1] = '\0'; + else + io32->pfiio_name[sizeof (io32->pfiio_name) - 1] = '\0'; - io->pfiio_name[sizeof (io->pfiio_name) - 1] = '\0'; - error = pfi_set_flags(io->pfiio_name, io->pfiio_flags); + error = pfi_set_flags( + p64 ? io64->pfiio_name : io32->pfiio_name, + p64 ? 
io64->pfiio_flags : io32->pfiio_flags); break; } case DIOCCLRIFFLAG: { - struct pfioc_iface *io = (struct pfioc_iface *)addr; + if (p64) + io64->pfiio_name[sizeof (io64->pfiio_name) - 1] = '\0'; + else + io32->pfiio_name[sizeof (io32->pfiio_name) - 1] = '\0'; - io->pfiio_name[sizeof (io->pfiio_name) - 1] = '\0'; - error = pfi_clear_flags(io->pfiio_name, io->pfiio_flags); + error = pfi_clear_flags( + p64 ? io64->pfiio_name : io32->pfiio_name, + p64 ? io64->pfiio_flags : io32->pfiio_flags); break; } default: - error = ENODEV; - break; + VERIFY(0); + /* NOTREACHED */ } -fail: - lck_mtx_unlock(pf_lock); - lck_rw_done(pf_perim_lock); return (error); } int pf_af_hook(struct ifnet *ifp, struct mbuf **mppn, struct mbuf **mp, - unsigned int af, int input) + unsigned int af, int input, struct ip_fw_args *fwa) { int error = 0, reentry; - struct thread *curthread = current_thread(); struct mbuf *nextpkt; - reentry = (ifp->if_pf_curthread == curthread); + reentry = net_thread_check_lock(NET_THREAD_HELD_PF); if (!reentry) { lck_rw_lock_shared(pf_perim_lock); if (!pf_is_enabled) goto done; lck_mtx_lock(pf_lock); - ifp->if_pf_curthread = curthread; + net_thread_set_lock(NET_THREAD_HELD_PF); } if (mppn != NULL && *mppn != NULL) @@ -3633,34 +4748,35 @@ pf_af_hook(struct ifnet *ifp, struct mbuf **mppn, struct mbuf **mp, switch (af) { #if INET case AF_INET: { - error = pf_inet_hook(ifp, mp, input); + error = pf_inet_hook(ifp, mp, input, fwa); break; } #endif /* INET */ #if INET6 case AF_INET6: - error = pf_inet6_hook(ifp, mp, input); + error = pf_inet6_hook(ifp, mp, input, fwa); break; #endif /* INET6 */ default: break; } - if (nextpkt != NULL) { - if (*mp != NULL) { - struct mbuf *m = *mp; - while (m->m_nextpkt != NULL) - m = m->m_nextpkt; - m->m_nextpkt = nextpkt; - } else { - *mp = nextpkt; - } + /* When packet valid, link to the next packet */ + if (*mp != NULL && nextpkt != NULL) { + struct mbuf *m = *mp; + while (m->m_nextpkt != NULL) + m = m->m_nextpkt; + m->m_nextpkt = nextpkt; + } + /* Fix up linkage of previous packet in the chain */ + if (mppn != NULL) { + if (*mp != NULL) + *mppn = *mp; + else + *mppn = nextpkt; } - if (mppn != NULL && *mppn != NULL) - *mppn = *mp; - if (!reentry) { - ifp->if_pf_curthread = NULL; + net_thread_unset_lock(NET_THREAD_HELD_PF); lck_mtx_unlock(pf_lock); } done: @@ -3673,7 +4789,8 @@ done: #if INET static int -pf_inet_hook(struct ifnet *ifp, struct mbuf **mp, int input) +pf_inet_hook(struct ifnet *ifp, struct mbuf **mp, int input, + struct ip_fw_args *fwa) { struct mbuf *m = *mp; #if BYTE_ORDER != BIG_ENDIAN @@ -3703,7 +4820,7 @@ pf_inet_hook(struct ifnet *ifp, struct mbuf **mp, int input) HTONS(ip->ip_len); HTONS(ip->ip_off); #endif - if (pf_test(input ? PF_IN : PF_OUT, ifp, mp, NULL) != PF_PASS) { + if (pf_test(input ? PF_IN : PF_OUT, ifp, mp, NULL, fwa) != PF_PASS) { if (*mp != NULL) { m_freem(*mp); *mp = NULL; @@ -3727,7 +4844,8 @@ pf_inet_hook(struct ifnet *ifp, struct mbuf **mp, int input) #if INET6 int -pf_inet6_hook(struct ifnet *ifp, struct mbuf **mp, int input) +pf_inet6_hook(struct ifnet *ifp, struct mbuf **mp, int input, + struct ip_fw_args *fwa) { int error = 0; @@ -3749,7 +4867,7 @@ pf_inet6_hook(struct ifnet *ifp, struct mbuf **mp, int input) } } - if (pf_test6(input ? PF_IN : PF_OUT, ifp, mp, NULL) != PF_PASS) { + if (pf_test6(input ? 
PF_IN : PF_OUT, ifp, mp, NULL, fwa) != PF_PASS) { if (*mp != NULL) { m_freem(*mp); *mp = NULL; diff --git a/bsd/net/pf_norm.c b/bsd/net/pf_norm.c index 053cdc13c..69283ce6e 100644 --- a/bsd/net/pf_norm.c +++ b/bsd/net/pf_norm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2008 Apple Inc. All rights reserved. + * Copyright (c) 2007-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -89,9 +89,16 @@ #include struct pf_frent { - LIST_ENTRY(pf_frent) fr_next; - struct ip *fr_ip; - struct mbuf *fr_m; + LIST_ENTRY(pf_frent) fr_next; + struct mbuf *fr_m; +#define fr_ip fr_u.fru_ipv4 +#define fr_ip6 fr_u.fru_ipv6 + union { + struct ip *fru_ipv4; + struct ip6_hdr *fru_ipv6; + } fr_u; + struct ip6_frag fr_ip6f_opt; + int fr_ip6f_hlen; }; struct pf_frcache { @@ -108,12 +115,18 @@ struct pf_frcache { struct pf_fragment { RB_ENTRY(pf_fragment) fr_entry; TAILQ_ENTRY(pf_fragment) frag_next; - struct in_addr fr_src; - struct in_addr fr_dst; + struct pf_addr fr_srcx; + struct pf_addr fr_dstx; u_int8_t fr_p; /* protocol of this fragment */ u_int8_t fr_flags; /* status flags */ - u_int16_t fr_id; /* fragment id for reassemble */ u_int16_t fr_max; /* fragment data max */ +#define fr_id fr_uid.fru_id4 +#define fr_id6 fr_uid.fru_id6 + union { + u_int16_t fru_id4; + u_int32_t fru_id6; + } fr_uid; + int fr_af; u_int32_t fr_timeout; #define fr_queue fr_u.fru_queue #define fr_cache fr_u.fru_cache @@ -134,22 +147,29 @@ RB_PROTOTYPE_SC(__private_extern__, pf_frag_tree, pf_fragment, fr_entry, RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); /* Private prototypes */ +static void pf_ip6hdr2key(struct pf_fragment *, struct ip6_hdr *, + struct ip6_frag *); static void pf_ip2key(struct pf_fragment *, struct ip *); static void pf_remove_fragment(struct pf_fragment *); static void pf_flush_fragments(void); static void pf_free_fragment(struct pf_fragment *); -static struct pf_fragment *pf_find_fragment(struct ip *, struct pf_frag_tree *); +static struct pf_fragment *pf_find_fragment_by_key(struct pf_fragment *, + struct pf_frag_tree *); +static __inline struct pf_fragment * + pf_find_fragment_by_ipv4_header(struct ip *, struct pf_frag_tree *); +static __inline struct pf_fragment * + pf_find_fragment_by_ipv6_header(struct ip6_hdr *, struct ip6_frag *, + struct pf_frag_tree *); static struct mbuf *pf_reassemble(struct mbuf **, struct pf_fragment **, struct pf_frent *, int); static struct mbuf *pf_fragcache(struct mbuf **, struct ip *, struct pf_fragment **, int, int, int *); -#ifndef NO_APPLE_MODIFICATIONS +static struct mbuf *pf_reassemble6(struct mbuf **, struct pf_fragment **, + struct pf_frent *, int); +static struct mbuf *pf_frag6cache(struct mbuf **, struct ip6_hdr*, + struct ip6_frag *, struct pf_fragment **, int, int, int, int *); static int pf_normalize_tcpopt(struct pf_rule *, int, struct pfi_kif *, struct pf_pdesc *, struct mbuf *, struct tcphdr *, int, int *); -#else -static int pf_normalize_tcpopt(struct pf_rule *, struct mbuf *, - struct tcphdr *, int, sa_family_t); -#endif #define DPFPRINTF(x) do { \ if (pf_status.debug >= PF_DEBUG_MISC) { \ @@ -211,18 +231,74 @@ pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b) { int diff; - if ((diff = a->fr_id - b->fr_id)) + if ((diff = a->fr_af - b->fr_af)) return (diff); else if ((diff = a->fr_p - b->fr_p)) return (diff); - else if (a->fr_src.s_addr < b->fr_src.s_addr) - return (-1); - else if (a->fr_src.s_addr > b->fr_src.s_addr) - return (1); - else if (a->fr_dst.s_addr < b->fr_dst.s_addr) - return (-1); - else 
if (a->fr_dst.s_addr > b->fr_dst.s_addr) - return (1); + else { + struct pf_addr *sa = &a->fr_srcx; + struct pf_addr *sb = &b->fr_srcx; + struct pf_addr *da = &a->fr_dstx; + struct pf_addr *db = &b->fr_dstx; + + switch (a->fr_af) { +#ifdef INET + case AF_INET: + if ((diff = a->fr_id - b->fr_id)) + return (diff); + else if (sa->v4.s_addr < sb->v4.s_addr) + return (-1); + else if (sa->v4.s_addr > sb->v4.s_addr) + return (1); + else if (da->v4.s_addr < db->v4.s_addr) + return (-1); + else if (da->v4.s_addr > db->v4.s_addr) + return (1); + break; +#endif +#ifdef INET6 + case AF_INET6: + if ((diff = a->fr_id6 - b->fr_id6)) + return (diff); + else if (sa->addr32[3] < sb->addr32[3]) + return (-1); + else if (sa->addr32[3] > sb->addr32[3]) + return (1); + else if (sa->addr32[2] < sb->addr32[2]) + return (-1); + else if (sa->addr32[2] > sb->addr32[2]) + return (1); + else if (sa->addr32[1] < sb->addr32[1]) + return (-1); + else if (sa->addr32[1] > sb->addr32[1]) + return (1); + else if (sa->addr32[0] < sb->addr32[0]) + return (-1); + else if (sa->addr32[0] > sb->addr32[0]) + return (1); + else if (da->addr32[3] < db->addr32[3]) + return (-1); + else if (da->addr32[3] > db->addr32[3]) + return (1); + else if (da->addr32[2] < db->addr32[2]) + return (-1); + else if (da->addr32[2] > db->addr32[2]) + return (1); + else if (da->addr32[1] < db->addr32[1]) + return (-1); + else if (da->addr32[1] > db->addr32[1]) + return (1); + else if (da->addr32[0] < db->addr32[0]) + return (-1); + else if (da->addr32[0] > db->addr32[0]) + return (1); + break; +#endif + default: + VERIFY(!0 && "only IPv4 and IPv6 supported!"); + break; + } + } return (0); } @@ -238,7 +314,19 @@ pf_purge_expired_fragments(void) if (frag->fr_timeout > expire) break; - DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag)); + switch (frag->fr_af) { + case AF_INET: + DPFPRINTF(("expiring IPv4 %d(%p) from queue.\n", + ntohs(frag->fr_id), frag)); + break; + case AF_INET6: + DPFPRINTF(("expiring IPv6 %d(%p) from queue.\n", + ntohl(frag->fr_id6), frag)); + break; + default: + VERIFY(0 && "only IPv4 and IPv6 supported"); + break; + } pf_free_fragment(frag); } @@ -247,7 +335,19 @@ pf_purge_expired_fragments(void) if (frag->fr_timeout > expire) break; - DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag)); + switch (frag->fr_af) { + case AF_INET: + DPFPRINTF(("expiring IPv4 %d(%p) from cache.\n", + ntohs(frag->fr_id), frag)); + break; + case AF_INET6: + DPFPRINTF(("expiring IPv6 %d(%p) from cache.\n", + ntohl(frag->fr_id6), frag)); + break; + default: + VERIFY(0 && "only IPv4 and IPv6 supported"); + break; + } pf_free_fragment(frag); VERIFY(TAILQ_EMPTY(&pf_cachequeue) || TAILQ_LAST(&pf_cachequeue, pf_cachequeue) != frag); @@ -321,24 +421,33 @@ pf_free_fragment(struct pf_fragment *frag) pf_remove_fragment(frag); } +static void +pf_ip6hdr2key(struct pf_fragment *key, struct ip6_hdr *ip6, + struct ip6_frag *fh) +{ + key->fr_p = fh->ip6f_nxt; + key->fr_id6 = fh->ip6f_ident; + key->fr_af = AF_INET6; + key->fr_srcx.v6 = ip6->ip6_src; + key->fr_dstx.v6 = ip6->ip6_dst; +} + static void pf_ip2key(struct pf_fragment *key, struct ip *ip) { key->fr_p = ip->ip_p; key->fr_id = ip->ip_id; - key->fr_src.s_addr = ip->ip_src.s_addr; - key->fr_dst.s_addr = ip->ip_dst.s_addr; + key->fr_af = AF_INET; + key->fr_srcx.v4.s_addr = ip->ip_src.s_addr; + key->fr_dstx.v4.s_addr = ip->ip_dst.s_addr; } static struct pf_fragment * -pf_find_fragment(struct ip *ip, struct pf_frag_tree *tree) +pf_find_fragment_by_key(struct pf_fragment *key, struct pf_frag_tree *tree) { - struct 
pf_fragment key; - struct pf_fragment *frag; - - pf_ip2key(&key, ip); - - frag = RB_FIND(pf_frag_tree, tree, &key); + struct pf_fragment *frag; + + frag = RB_FIND(pf_frag_tree, tree, key); if (frag != NULL) { /* XXX Are we sure we want to update the timeout? */ frag->fr_timeout = pf_time_second(); @@ -350,9 +459,26 @@ pf_find_fragment(struct ip *ip, struct pf_frag_tree *tree) TAILQ_INSERT_HEAD(&pf_cachequeue, frag, frag_next); } } - + return (frag); } + +static __inline struct pf_fragment * +pf_find_fragment_by_ipv4_header(struct ip *ip, struct pf_frag_tree *tree) +{ + struct pf_fragment key; + pf_ip2key(&key, ip); + return pf_find_fragment_by_key(&key, tree); +} + +static __inline struct pf_fragment * +pf_find_fragment_by_ipv6_header(struct ip6_hdr *ip6, struct ip6_frag *fh, + struct pf_frag_tree *tree) +{ + struct pf_fragment key; + pf_ip6hdr2key(&key, ip6, fh); + return pf_find_fragment_by_key(&key, tree); +} /* Removes a fragment from the fragment queue and frees the fragment */ @@ -402,8 +528,9 @@ pf_reassemble(struct mbuf **m0, struct pf_fragment **frag, (*frag)->fr_flags = 0; (*frag)->fr_max = 0; - (*frag)->fr_src = frent->fr_ip->ip_src; - (*frag)->fr_dst = frent->fr_ip->ip_dst; + (*frag)->fr_af = AF_INET; + (*frag)->fr_srcx.v4 = frent->fr_ip->ip_src; + (*frag)->fr_dstx.v4 = frent->fr_ip->ip_dst; (*frag)->fr_p = frent->fr_ip->ip_p; (*frag)->fr_id = frent->fr_ip->ip_id; (*frag)->fr_timeout = pf_time_second(); @@ -534,8 +661,8 @@ insert: m_cat(m, m2); } - ip->ip_src = (*frag)->fr_src; - ip->ip_dst = (*frag)->fr_dst; + ip->ip_src = (*frag)->fr_srcx.v4; + ip->ip_dst = (*frag)->fr_dstx.v4; /* Remove from fragment queue */ pf_remove_fragment(*frag); @@ -600,8 +727,9 @@ pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff, (*frag)->fr_flags = PFFRAG_NOBUFFER; (*frag)->fr_max = 0; - (*frag)->fr_src = h->ip_src; - (*frag)->fr_dst = h->ip_dst; + (*frag)->fr_af = AF_INET; + (*frag)->fr_srcx.v4 = h->ip_src; + (*frag)->fr_dstx.v4 = h->ip_dst; (*frag)->fr_p = h->ip_p; (*frag)->fr_id = h->ip_id; (*frag)->fr_timeout = pf_time_second(); @@ -865,6 +993,535 @@ drop_fragment: return (NULL); } +#define FR_IP6_OFF(fr) \ + (ntohs((fr)->fr_ip6f_opt.ip6f_offlg & IP6F_OFF_MASK)) +#define FR_IP6_PLEN(fr) (ntohs((fr)->fr_ip6->ip6_plen)) +struct mbuf * +pf_reassemble6(struct mbuf **m0, struct pf_fragment **frag, + struct pf_frent *frent, int mff) +{ + struct mbuf *m, *m2; + struct pf_frent *frea, *frep, *next; + struct ip6_hdr *ip6; + int plen, off, fr_max; + + VERIFY(*frag == NULL || BUFFER_FRAGMENTS(*frag)); + m = *m0; + frep = NULL; + ip6 = frent->fr_ip6; + off = FR_IP6_OFF(frent); + plen = FR_IP6_PLEN(frent); + fr_max = off + plen - (frent->fr_ip6f_hlen - sizeof *ip6); + + DPFPRINTF(("%p IPv6 frag plen %u off %u fr_ip6f_hlen %u fr_max %u m_len %u\n", m, + plen, off, frent->fr_ip6f_hlen, fr_max, m->m_len)); + + /* strip off headers up to the fragment payload */ + m->m_data += frent->fr_ip6f_hlen; + m->m_len -= frent->fr_ip6f_hlen; + + /* Create a new reassembly queue for this packet */ + if (*frag == NULL) { + *frag = pool_get(&pf_frag_pl, PR_NOWAIT); + if (*frag == NULL) { + pf_flush_fragments(); + *frag = pool_get(&pf_frag_pl, PR_NOWAIT); + if (*frag == NULL) + goto drop_fragment; + } + + (*frag)->fr_flags = 0; + (*frag)->fr_max = 0; + (*frag)->fr_af = AF_INET6; + (*frag)->fr_srcx.v6 = frent->fr_ip6->ip6_src; + (*frag)->fr_dstx.v6 = frent->fr_ip6->ip6_dst; + (*frag)->fr_p = frent->fr_ip6f_opt.ip6f_nxt; + (*frag)->fr_id6 = frent->fr_ip6f_opt.ip6f_ident; + (*frag)->fr_timeout = 
pf_time_second(); + LIST_INIT(&(*frag)->fr_queue); + + RB_INSERT(pf_frag_tree, &pf_frag_tree, *frag); + TAILQ_INSERT_HEAD(&pf_fragqueue, *frag, frag_next); + + /* We do not have a previous fragment */ + frep = NULL; + goto insert; + } + + /* + * Find a fragment after the current one: + * - off contains the real shifted offset. + */ + LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) { + if (FR_IP6_OFF(frea) > off) + break; + frep = frea; + } + + VERIFY(frep != NULL || frea != NULL); + + if (frep != NULL && + FR_IP6_OFF(frep) + FR_IP6_PLEN(frep) - frep->fr_ip6f_hlen > off) + { + u_int16_t precut; + + precut = FR_IP6_OFF(frep) + FR_IP6_PLEN(frep) - + frep->fr_ip6f_hlen - off; + if (precut >= plen) + goto drop_fragment; + m_adj(frent->fr_m, precut); + DPFPRINTF(("overlap -%d\n", precut)); + /* Enforce 8 byte boundaries */ + frent->fr_ip6f_opt.ip6f_offlg = + htons(ntohs(frent->fr_ip6f_opt.ip6f_offlg) + + (precut >> 3)); + off = FR_IP6_OFF(frent); + plen -= precut; + ip6->ip6_plen = htons(plen); + } + + for (; frea != NULL && plen + off > FR_IP6_OFF(frea); frea = next) { + u_int16_t aftercut; + + aftercut = plen + off - FR_IP6_OFF(frea); + DPFPRINTF(("adjust overlap %d\n", aftercut)); + if (aftercut < FR_IP6_PLEN(frea) - frea->fr_ip6f_hlen) { + frea->fr_ip6->ip6_plen = htons(FR_IP6_PLEN(frea) - + aftercut); + frea->fr_ip6f_opt.ip6f_offlg = + htons(ntohs(frea->fr_ip6f_opt.ip6f_offlg) + + (aftercut >> 3)); + m_adj(frea->fr_m, aftercut); + break; + } + + /* This fragment is completely overlapped, lose it */ + next = LIST_NEXT(frea, fr_next); + m_freem(frea->fr_m); + LIST_REMOVE(frea, fr_next); + pool_put(&pf_frent_pl, frea); + pf_nfrents--; + } + + insert: + /* Update maximum data size */ + if ((*frag)->fr_max < fr_max) + (*frag)->fr_max = fr_max; + /* This is the last segment */ + if (!mff) + (*frag)->fr_flags |= PFFRAG_SEENLAST; + + if (frep == NULL) + LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next); + else + LIST_INSERT_AFTER(frep, frent, fr_next); + + /* Check if we are completely reassembled */ + if (!((*frag)->fr_flags & PFFRAG_SEENLAST)) + return (NULL); + + /* Check if we have all the data */ + off = 0; + for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) { + next = LIST_NEXT(frep, fr_next); + off += FR_IP6_PLEN(frep) - (frent->fr_ip6f_hlen - sizeof *ip6); + DPFPRINTF(("frep at %d, next %d, max %d\n", + off, next == NULL ? -1 : FR_IP6_OFF(next), + (*frag)->fr_max)); + if (off < (*frag)->fr_max && + (next == NULL || FR_IP6_OFF(next) != off)) { + DPFPRINTF(("missing fragment at %d, next %d, max %d\n", + off, next == NULL ? 
-1 : FR_IP6_OFF(next), + (*frag)->fr_max)); + return (NULL); + } + } + DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max)); + if (off < (*frag)->fr_max) + return (NULL); + + /* We have all the data */ + frent = LIST_FIRST(&(*frag)->fr_queue); + VERIFY(frent != NULL); + if (frent->fr_ip6f_hlen + off > IP_MAXPACKET) { + DPFPRINTF(("drop: too big: %d\n", off)); + pf_free_fragment(*frag); + *frag = NULL; + return (NULL); + } + + ip6 = frent->fr_ip6; + ip6->ip6_nxt = (*frag)->fr_p; + ip6->ip6_plen = htons(off); + ip6->ip6_src = (*frag)->fr_srcx.v6; + ip6->ip6_dst = (*frag)->fr_dstx.v6; + + /* Remove from fragment queue */ + pf_remove_fragment(*frag); + *frag = NULL; + + m = frent->fr_m; + m->m_len += sizeof(struct ip6_hdr); + m->m_data -= sizeof(struct ip6_hdr); + memmove(m->m_data, ip6, sizeof(struct ip6_hdr)); + + next = LIST_NEXT(frent, fr_next); + pool_put(&pf_frent_pl, frent); + pf_nfrents--; + for (frent = next; next != NULL; frent = next) { + m2 = frent->fr_m; + + m_cat(m, m2); + next = LIST_NEXT(frent, fr_next); + pool_put(&pf_frent_pl, frent); + pf_nfrents--; + } + + /* XXX this should be done elsewhere */ + if (m->m_flags & M_PKTHDR) { + int pktlen = 0; + for (m2 = m; m2; m2 = m2->m_next) + pktlen += m2->m_len; + m->m_pkthdr.len = pktlen; + } + + DPFPRINTF(("complete: %p ip6_plen %d m_pkthdr.len %d\n", + m, ntohs(ip6->ip6_plen), m->m_pkthdr.len)); + + return m; + + drop_fragment: + /* Oops - fail safe - drop packet */ + pool_put(&pf_frent_pl, frent); + --pf_nfrents; + m_freem(m); + return NULL; +} + +static struct mbuf * +pf_frag6cache(struct mbuf **m0, struct ip6_hdr *h, struct ip6_frag *fh, + struct pf_fragment **frag, int hlen, int mff, int drop, int *nomem) +{ + struct mbuf *m = *m0; + u_int16_t plen, off, fr_max; + struct pf_frcache *frp, *fra, *cur = NULL; + int hosed = 0; + + VERIFY(*frag == NULL || !BUFFER_FRAGMENTS(*frag)); + m = *m0; + off = ntohs(fh->ip6f_offlg & IP6F_OFF_MASK); + plen = ntohs(h->ip6_plen) - (hlen - sizeof *h); + + /* + * Apple Modification: dimambro@apple.com. The hlen being passed + into this function includes all the headers associated with + the packet, and may include routing headers, so to get to + the data payload as stored in the original IPv6 header we need + to subtract all those headers and the IP header. + * + * The 'fr_max' local variable should also contain the offset from the start + of the reassembled packet to the octet just past the end of the octets + in the current fragment where: + - 'off' is the offset from the start of the reassembled packet to the + first octet in the fragment, + - 'plen' is the length of the "payload data length" excluding all the + IPv6 headers of the fragment. + - 'hlen' is computed in pf_normalize_ip6() as the offset from the start + of the IPv6 packet to the beginning of the data.
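+ * + * For example (illustrative numbers only): a fragment whose fragment + * header offset field decodes to off == 1232 and whose payload works + * out to plen == 408 octets covers octets [1232, 1640) of the + * reassembled packet, so fr_max == off + plen == 1640 here.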
+ */ + fr_max = off + plen; + + DPFPRINTF(("%p plen %u off %u fr_max %u\n", m, + plen, off, fr_max)); + + /* Create a new range queue for this packet */ + if (*frag == NULL) { + *frag = pool_get(&pf_cache_pl, PR_NOWAIT); + if (*frag == NULL) { + pf_flush_fragments(); + *frag = pool_get(&pf_cache_pl, PR_NOWAIT); + if (*frag == NULL) + goto no_mem; + } + + /* Get an entry for the queue */ + cur = pool_get(&pf_cent_pl, PR_NOWAIT); + if (cur == NULL) { + pool_put(&pf_cache_pl, *frag); + *frag = NULL; + goto no_mem; + } + pf_ncache++; + + (*frag)->fr_flags = PFFRAG_NOBUFFER; + (*frag)->fr_max = 0; + (*frag)->fr_af = AF_INET6; + (*frag)->fr_srcx.v6 = h->ip6_src; + (*frag)->fr_dstx.v6 = h->ip6_dst; + (*frag)->fr_p = fh->ip6f_nxt; + (*frag)->fr_id6 = fh->ip6f_ident; + (*frag)->fr_timeout = pf_time_second(); + + cur->fr_off = off; + cur->fr_end = fr_max; + LIST_INIT(&(*frag)->fr_cache); + LIST_INSERT_HEAD(&(*frag)->fr_cache, cur, fr_next); + + RB_INSERT(pf_frag_tree, &pf_cache_tree, *frag); + TAILQ_INSERT_HEAD(&pf_cachequeue, *frag, frag_next); + + DPFPRINTF(("frag6cache[%d]: new %d-%d\n", ntohl(fh->ip6f_ident), + off, fr_max)); + + goto pass; + } + + /* + * Find a fragment after the current one: + * - off contains the real shifted offset. + */ + frp = NULL; + LIST_FOREACH(fra, &(*frag)->fr_cache, fr_next) { + if (fra->fr_off > off) + break; + frp = fra; + } + + VERIFY(frp != NULL || fra != NULL); + + if (frp != NULL) { + int precut; + + precut = frp->fr_end - off; + if (precut >= plen) { + /* Fragment is entirely a duplicate */ + DPFPRINTF(("frag6cache[%u]: dead (%d-%d) %d-%d\n", + ntohl(fh->ip6f_ident), frp->fr_off, frp->fr_end, + off, fr_max)); + goto drop_fragment; + } + if (precut == 0) { + /* They are adjacent. Fix up cache entry */ + DPFPRINTF(("frag6cache[%u]: adjacent (%d-%d) %d-%d\n", + ntohl(fh->ip6f_ident), frp->fr_off, frp->fr_end, + off, fr_max)); + frp->fr_end = fr_max; + } else if (precut > 0) { + /* The first part of this payload overlaps with a + * fragment that has already been passed. + * Need to trim off the first part of the payload. + * But to do so easily, we need to create another + * mbuf to throw the original header into. + */ + + DPFPRINTF(("frag6cache[%u]: chop %d (%d-%d) %d-%d\n", + ntohl(fh->ip6f_ident), precut, frp->fr_off, + frp->fr_end, off, fr_max)); + + off += precut; + fr_max -= precut; + /* Update the previous frag to encompass this one */ + frp->fr_end = fr_max; + + if (!drop) { + /* XXX Optimization opportunity + * This is a very heavy way to trim the payload. + * We could do it much faster by diddling mbuf + * internals but that would be even less legible + * than this mbuf magic. For my next trick, + * I'll pull a rabbit out of my laptop. 
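+ * + * Concretely, the code below copies the first hlen octets (the + * headers) into a fresh chain with m_copym(), trims precut + hlen + * octets off the front of the original chain with m_adj(), and then + * m_cat()s the remaining payload back onto the copied headers.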
+ */ + *m0 = m_copym(m, 0, hlen, M_NOWAIT); + if (*m0 == NULL) + goto no_mem; + VERIFY((*m0)->m_next == NULL); + m_adj(m, precut + hlen); + m_cat(*m0, m); + m = *m0; + if (m->m_flags & M_PKTHDR) { + int pktlen = 0; + struct mbuf *t; + for (t = m; t; t = t->m_next) + pktlen += t->m_len; + m->m_pkthdr.len = pktlen; + } + + h = mtod(m, struct ip6_hdr *); + + VERIFY((int)m->m_len == + ntohs(h->ip6_plen) - precut); + fh->ip6f_offlg &= ~IP6F_OFF_MASK; + fh->ip6f_offlg |= + htons(ntohs(fh->ip6f_offlg & IP6F_OFF_MASK) + + (precut >> 3)); + h->ip6_plen = htons(ntohs(h->ip6_plen) - + precut); + } else { + hosed++; + } + } else { + /* There is a gap between fragments */ + + DPFPRINTF(("frag6cache[%u]: gap %d (%d-%d) %d-%d\n", + ntohl(fh->ip6f_ident), -precut, frp->fr_off, + frp->fr_end, off, fr_max)); + + cur = pool_get(&pf_cent_pl, PR_NOWAIT); + if (cur == NULL) + goto no_mem; + pf_ncache++; + + cur->fr_off = off; + cur->fr_end = fr_max; + LIST_INSERT_AFTER(frp, cur, fr_next); + } + } + + if (fra != NULL) { + int aftercut; + int merge = 0; + + aftercut = fr_max - fra->fr_off; + if (aftercut == 0) { + /* Adjacent fragments */ + DPFPRINTF(("frag6cache[%u]: adjacent %d-%d (%d-%d)\n", + ntohl(fh->ip6f_ident), off, fr_max, fra->fr_off, + fra->fr_end)); + fra->fr_off = off; + merge = 1; + } else if (aftercut > 0) { + /* Need to chop off the tail of this fragment */ + DPFPRINTF(("frag6cache[%u]: chop %d %d-%d (%d-%d)\n", + ntohl(fh->ip6f_ident), aftercut, off, fr_max, + fra->fr_off, fra->fr_end)); + fra->fr_off = off; + fr_max -= aftercut; + + merge = 1; + + if (!drop) { + m_adj(m, -aftercut); + if (m->m_flags & M_PKTHDR) { + int pktlen = 0; + struct mbuf *t; + for (t = m; t; t = t->m_next) + pktlen += t->m_len; + m->m_pkthdr.len = pktlen; + } + h = mtod(m, struct ip6_hdr *); + VERIFY((int)m->m_len == + ntohs(h->ip6_plen) - aftercut); + h->ip6_plen = + htons(ntohs(h->ip6_plen) - aftercut); + } else { + hosed++; + } + } else if (frp == NULL) { + /* There is a gap between fragments */ + DPFPRINTF(("frag6cache[%u]: gap %d %d-%d (%d-%d)\n", + ntohl(fh->ip6f_ident), -aftercut, off, fr_max, + fra->fr_off, fra->fr_end)); + + cur = pool_get(&pf_cent_pl, PR_NOWAIT); + if (cur == NULL) + goto no_mem; + pf_ncache++; + + cur->fr_off = off; + cur->fr_end = fr_max; + LIST_INSERT_BEFORE(fra, cur, fr_next); + } + + /* Need to glue together two separate fragment descriptors */ + if (merge) { + if (cur && fra->fr_off <= cur->fr_end) { + /* Need to merge in a previous 'cur' */ + DPFPRINTF(("frag6cache[%u]: adjacent(merge " + "%d-%d) %d-%d (%d-%d)\n", + ntohl(fh->ip6f_ident), cur->fr_off, + cur->fr_end, off, fr_max, fra->fr_off, + fra->fr_end)); + fra->fr_off = cur->fr_off; + LIST_REMOVE(cur, fr_next); + pool_put(&pf_cent_pl, cur); + pf_ncache--; + cur = NULL; + } else if (frp && fra->fr_off <= frp->fr_end) { + /* Need to merge in a modified 'frp' */ + VERIFY(cur == NULL); + DPFPRINTF(("frag6cache[%u]: adjacent(merge " + "%d-%d) %d-%d (%d-%d)\n", + ntohl(fh->ip6f_ident), frp->fr_off, + frp->fr_end, off, fr_max, fra->fr_off, + fra->fr_end)); + fra->fr_off = frp->fr_off; + LIST_REMOVE(frp, fr_next); + pool_put(&pf_cent_pl, frp); + pf_ncache--; + frp = NULL; + } + } + } + + if (hosed) { + /* + * We must keep tracking the overall fragment even when + * we're going to drop it anyway so that we know when to + * free the overall descriptor. Thus we drop the frag late. 
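+ * The drop_fragment path still records PFFRAG_SEENLAST when this was + * the last fragment and, when 'drop' is set, marks the descriptor + * PFFRAG_DROP before freeing the mbuf.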
+ */ + goto drop_fragment; + } + + pass: + /* Update maximum data size */ + if ((*frag)->fr_max < fr_max) + (*frag)->fr_max = fr_max; + + /* This is the last segment */ + if (!mff) + (*frag)->fr_flags |= PFFRAG_SEENLAST; + + /* Check if we are completely reassembled */ + if (((*frag)->fr_flags & PFFRAG_SEENLAST) && + LIST_FIRST(&(*frag)->fr_cache)->fr_off == 0 && + LIST_FIRST(&(*frag)->fr_cache)->fr_end == (*frag)->fr_max) { + /* Remove from fragment queue */ + DPFPRINTF(("frag6cache[%u]: done 0-%d\n", + ntohl(fh->ip6f_ident), (*frag)->fr_max)); + pf_free_fragment(*frag); + *frag = NULL; + } + + return (m); + + no_mem: + *nomem = 1; + + /* Still need to pay attention to !IP_MF */ + if (!mff && *frag != NULL) + (*frag)->fr_flags |= PFFRAG_SEENLAST; + + m_freem(m); + return (NULL); + + drop_fragment: + + /* Still need to pay attention to !IP_MF */ + if (!mff && *frag != NULL) + (*frag)->fr_flags |= PFFRAG_SEENLAST; + + if (drop) { + /* This fragment has been deemed bad. Don't reass */ + if (((*frag)->fr_flags & PFFRAG_DROP) == 0) + DPFPRINTF(("frag6cache[%u]: dropping overall fragment\n", + ntohl(fh->ip6f_ident))); + (*frag)->fr_flags |= PFFRAG_DROP; + } + + m_freem(m); + return (NULL); +} + int pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason, struct pf_pdesc *pd) @@ -969,8 +1626,7 @@ pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason, if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) { /* Fully buffer all of the fragments */ - frag = pf_find_fragment(h, &pf_frag_tree); - + frag = pf_find_fragment_by_ipv4_header(h, &pf_frag_tree); /* Check if we saw the last fragment already */ if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) && fr_max > frag->fr_max) @@ -987,8 +1643,8 @@ pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason, frent->fr_m = m; /* Might return a completely reassembled mbuf, or NULL */ - DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, - fr_max)); + DPFPRINTF(("reass IPv4 frag %d @ %d-%d\n", ntohs(h->ip_id), + fragoff, fr_max)); *m0 = m = pf_reassemble(m0, &frag, frent, mff); if (m == NULL) @@ -1014,7 +1670,7 @@ pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason, /* non-buffering fragment cache (drops or masks overlaps) */ int nomem = 0; - if (dir == PF_OUT && (pd->pf_mtag->flags & PF_TAG_FRAGCACHE)) { + if (dir == PF_OUT && (pd->pf_mtag->pftag_flags & PF_TAG_FRAGCACHE)) { /* * Already passed the fragment cache in the * input direction. 
If we continued, it would @@ -1023,7 +1679,7 @@ pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason, goto fragment_pass; } - frag = pf_find_fragment(h, &pf_cache_tree); + frag = pf_find_fragment_by_ipv4_header(h, &pf_cache_tree); /* Check if we saw the last fragment already */ if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) && @@ -1054,7 +1710,7 @@ pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason, } #endif if (dir == PF_IN) - pd->pf_mtag->flags |= PF_TAG_FRAGCACHE; + pd->pf_mtag->pftag_flags |= PF_TAG_FRAGCACHE; if (frag != NULL && (frag->fr_flags & PFFRAG_DROP)) goto drop; @@ -1117,7 +1773,7 @@ drop: return (PF_DROP); bad: - DPFPRINTF(("dropping bad fragment\n")); + DPFPRINTF(("dropping bad IPv4 fragment\n")); /* Free associated fragments */ if (frag != NULL) @@ -1152,6 +1808,10 @@ pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif, u_int16_t fragoff = 0; u_int8_t proto; int terminal; + struct pf_frent *frent; + struct pf_fragment *pff = NULL; + int mff = 0, rh_cnt = 0; + u_int16_t fr_max; int asd = 0; struct pf_ruleset *ruleset = NULL; @@ -1203,6 +1863,7 @@ pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif, proto = h->ip6_nxt; terminal = 0; do { + pd->proto = proto; switch (proto) { case IPPROTO_FRAGMENT: goto fragment; @@ -1213,19 +1874,20 @@ pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif, if (!pf_pull_hdr(m, off, &ext, sizeof (ext), NULL, NULL, AF_INET6)) goto shortpkt; -#ifndef NO_APPLE_EXTENSIONS /* * + * Multiple routing headers not allowed. * Routing header type zero considered harmful. */ if (proto == IPPROTO_ROUTING) { const struct ip6_rthdr *rh = (const struct ip6_rthdr *)&ext; + if (rh_cnt++) + goto drop; if (rh->ip6r_type == IPV6_RTHDR_TYPE_0) goto drop; } else -#endif if (proto == IPPROTO_AH) off += (ext.ip6e_len + 2) * 4; else @@ -1311,32 +1973,110 @@ fragment: if (!pf_pull_hdr(m, off, &frag, sizeof (frag), NULL, NULL, AF_INET6)) goto shortpkt; fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK); - if (fragoff + (plen - off - sizeof (frag)) > IPV6_MAXPACKET) - goto badfrag; - - /* do something about it */ - /* remember to set pd->flags |= PFDESC_IP_REAS */ + pd->proto = frag.ip6f_nxt; + mff = ntohs(frag.ip6f_offlg & IP6F_MORE_FRAG); + off += sizeof frag; + if (fragoff + (plen - off) > IPV6_MAXPACKET) + goto badfrag; + + fr_max = fragoff + plen - (off - sizeof(struct ip6_hdr)); + DPFPRINTF(("%p IPv6 frag plen %u mff %d off %u fragoff %u fr_max %u\n", m, + plen, mff, off, fragoff, fr_max)); + + if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) { + /* Fully buffer all of the fragments */ + pd->flags |= PFDESC_IP_REAS; + + pff = pf_find_fragment_by_ipv6_header(h, &frag, + &pf_frag_tree); + + /* Check if we saw the last fragment already */ + if (pff != NULL && (pff->fr_flags & PFFRAG_SEENLAST) && + fr_max > pff->fr_max) + goto badfrag; + + /* Get an entry for the fragment queue */ + frent = pool_get(&pf_frent_pl, PR_NOWAIT); + if (frent == NULL) { + REASON_SET(reason, PFRES_MEMORY); + return (PF_DROP); + } + pf_nfrents++; + frent->fr_ip6 = h; + frent->fr_m = m; + frent->fr_ip6f_opt = frag; + frent->fr_ip6f_hlen = off; + + /* Might return a completely reassembled mbuf, or NULL */ + DPFPRINTF(("reass IPv6 frag %d @ %d-%d\n", + ntohl(frag.ip6f_ident), fragoff, fr_max)); + *m0 = m = pf_reassemble6(m0, &pff, frent, mff); + + if (m == NULL) + return (PF_DROP); + + if (pff != NULL && (pff->fr_flags & PFFRAG_DROP)) + goto drop; + + h = mtod(m, struct ip6_hdr *); + } + 
else if (dir == PF_IN || !(pd->pf_mtag->pftag_flags & PF_TAG_FRAGCACHE)) { + /* non-buffering fragment cache (overlaps: see RFC 5722) */ + int nomem = 0; + + pff = pf_find_fragment_by_ipv6_header(h, &frag, + &pf_cache_tree); + + /* Check if we saw the last fragment already */ + if (pff != NULL && (pff->fr_flags & PFFRAG_SEENLAST) && + fr_max > pff->fr_max) { + if (r->rule_flag & PFRULE_FRAGDROP) + pff->fr_flags |= PFFRAG_DROP; + goto badfrag; + } + + *m0 = m = pf_frag6cache(m0, h, &frag, &pff, off, mff, + (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem); + if (m == NULL) { + if (nomem) + goto no_mem; + goto drop; + } + + if (dir == PF_IN) + pd->pf_mtag->pftag_flags |= PF_TAG_FRAGCACHE; + + if (pff != NULL && (pff->fr_flags & PFFRAG_DROP)) + goto drop; + } + + /* Enforce a minimum ttl, may cause endless packet loops */ + if (r->min_ttl && h->ip6_hlim < r->min_ttl) + h->ip6_hlim = r->min_ttl; return (PF_PASS); -shortpkt: + no_mem: + REASON_SET(reason, PFRES_MEMORY); + goto dropout; + + shortpkt: REASON_SET(reason, PFRES_SHORT); - if (r != NULL && r->log) - PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, - NULL, NULL, pd); - return (PF_DROP); - -drop: + goto dropout; + + drop: REASON_SET(reason, PFRES_NORM); - if (r != NULL && r->log) - PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, - NULL, NULL, pd); - return (PF_DROP); - -badfrag: + goto dropout; + + badfrag: + DPFPRINTF(("dropping bad IPv6 fragment\n")); REASON_SET(reason, PFRES_FRAG); + goto dropout; + + dropout: + if (pff != NULL) + pf_free_fragment(pff); if (r != NULL && r->log) - PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, - NULL, NULL, pd); + PFLOG_PACKET(kif, h, m, AF_INET6, dir, *reason, r, NULL, NULL, pd); return (PF_DROP); } #endif /* INET6 */ @@ -1354,12 +2094,10 @@ pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff, u_int8_t flags; sa_family_t af = pd->af; struct pf_ruleset *ruleset = NULL; -#ifndef NO_APPLE_EXTENSIONS union pf_state_xport sxport, dxport; sxport.port = th->th_sport; dxport.port = th->th_dport; -#endif r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); while (r != NULL) { @@ -1375,26 +2113,16 @@ pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff, else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, r->src.neg, kif)) r = r->skip[PF_SKIP_SRC_ADDR].ptr; -#ifndef NO_APPLE_EXTENSIONS else if (r->src.xport.range.op && !pf_match_xport(r->src.xport.range.op, r->proto_variant, &r->src.xport, &sxport)) -#else - else if (r->src.port_op && !pf_match_port(r->src.port_op, - r->src.port[0], r->src.port[1], th->th_sport)) -#endif r = r->skip[PF_SKIP_SRC_PORT].ptr; else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, r->dst.neg, NULL)) r = r->skip[PF_SKIP_DST_ADDR].ptr; -#ifndef NO_APPLE_EXTENSIONS else if (r->dst.xport.range.op && !pf_match_xport(r->dst.xport.range.op, r->proto_variant, &r->dst.xport, &dxport)) -#else - else if (r->dst.port_op && !pf_match_port(r->dst.port_op, - r->dst.port[0], r->dst.port[1], th->th_dport)) -#endif r = r->skip[PF_SKIP_DST_PORT].ptr; else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match(pf_osfp_fingerprint(pd, m, off, th), @@ -1469,7 +2197,6 @@ pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff, } /* copy back packet headers if we sanitized */ -#ifndef NO_APPLE_EXTENSIONS /* Process options */ if (r->max_mss) { int rv = pf_normalize_tcpopt(r, dir, kif, pd, m, th, off, @@ -1492,14 +2219,6 @@ pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff, m_copyback(mw, off, sizeof (*th), 
th); } -#else - /* Process options */ - if (r->max_mss && pf_normalize_tcpopt(r, m, th, off, pd->af)) - rewrite = 1; - - if (rewrite) - m_copyback(m, off, sizeof (*th), th); -#endif return (PF_PASS); @@ -1721,7 +2440,6 @@ pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd, } if (copyback) { /* Copyback the options, caller copies back header */ -#ifndef NO_APPLE_EXTENSIONS int optoff = off + sizeof (*th); int optlen = (th->th_off << 2) - sizeof (*th); m = pf_lazy_makewritable(pd, m, optoff + optlen); @@ -1731,12 +2449,6 @@ pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd, } *writeback = optoff + optlen; m_copyback(m, optoff, optlen, hdr + sizeof (*th)); -#else - *writeback = 1; - m_copyback(m, off + sizeof (struct tcphdr), - (th->th_off << 2) - sizeof (struct tcphdr), hdr + - sizeof (struct tcphdr)); -#endif } } @@ -2012,7 +2724,6 @@ pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd, return (0); } -#ifndef NO_APPLE_EXTENSIONS static int pf_normalize_tcpopt(struct pf_rule *r, int dir, struct pfi_kif *kif, struct pf_pdesc *pd, struct mbuf *m, struct tcphdr *th, int off, @@ -2020,12 +2731,6 @@ pf_normalize_tcpopt(struct pf_rule *r, int dir, struct pfi_kif *kif, { #pragma unused(dir, kif) sa_family_t af = pd->af; -#else -static int -pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th, - int off, sa_family_t af) -{ -#endif u_int16_t *mss; int thoff; int opt, cnt, optlen = 0; @@ -2036,15 +2741,9 @@ pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th, thoff = th->th_off << 2; cnt = thoff - sizeof (struct tcphdr); -#ifndef NO_APPLE_MODIFICATIONS if (cnt > 0 && !pf_pull_hdr(m, off + sizeof (*th), opts, cnt, NULL, NULL, af)) return PF_DROP; -#else - if (cnt > 0 && !pf_pull_hdr(m, off + sizeof (*th), opts, cnt, - NULL, NULL, af)) - return (rewrite); -#endif for (; cnt > 0; cnt -= optlen, optp += optlen) { opt = optp[0]; @@ -2061,9 +2760,8 @@ pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th, } switch (opt) { case TCPOPT_MAXSEG: - mss = (u_int16_t *)(optp + 2); + mss = (u_int16_t *)(void *)(optp + 2); if ((ntohs(*mss)) > r->max_mss) { -#ifndef NO_APPLE_MODIFICATIONS /* * * Only do the TCP checksum fixup if delayed @@ -2073,10 +2771,6 @@ pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th, !(m->m_pkthdr.csum_flags & CSUM_TCP)) th->th_sum = pf_cksum_fixup(th->th_sum, *mss, htons(r->max_mss), 0); -#else - th->th_sum = pf_cksum_fixup(th->th_sum, - *mss, htons(r->max_mss), 0); -#endif *mss = htons(r->max_mss); rewrite = 1; } @@ -2086,7 +2780,6 @@ pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th, } } -#ifndef NO_APPLE_MODIFICATIONS if (rewrite) { struct mbuf *mw; u_short reason; @@ -2106,10 +2799,4 @@ pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th, } return PF_PASS; -#else - if (rewrite) - m_copyback(m, off + sizeof (*th), thoff - sizeof (*th), opts); - - return (rewrite); -#endif } diff --git a/bsd/net/pf_osfp.c b/bsd/net/pf_osfp.c index 89d71e889..e04a94e0f 100644 --- a/bsd/net/pf_osfp.c +++ b/bsd/net/pf_osfp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2010 Apple Inc. All rights reserved. + * Copyright (c) 2007-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -109,7 +109,7 @@ pf_osfp_fingerprint(struct pf_pdesc *pd, struct mbuf *m, int off, pd->af)) return (NULL); - return (pf_osfp_fingerprint_hdr(ip, ip6, (struct tcphdr *)hdr)); + return (pf_osfp_fingerprint_hdr(ip, ip6, (struct tcphdr *)(void *)hdr)); } struct pf_osfp_enlist * diff --git a/bsd/net/pf_ruleset.c b/bsd/net/pf_ruleset.c index 2ac10962e..27121f779 100644 --- a/bsd/net/pf_ruleset.c +++ b/bsd/net/pf_ruleset.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2008 Apple Inc. All rights reserved. + * Copyright (c) 2007-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -168,6 +168,11 @@ pf_get_ruleset_number(u_int8_t action) case PF_NORDR: return (PF_RULESET_RDR); break; +#if DUMMYNET + case PF_DUMMYNET: + case PF_NODUMMYNET: + return (PF_RULESET_DUMMYNET); +#endif /* DUMMYNET */ default: return (PF_RULESET_MAX); break; @@ -217,6 +222,29 @@ pf_find_ruleset(const char *path) return (&anchor->ruleset); } +struct pf_ruleset * +pf_find_ruleset_with_owner(const char *path, const char *owner, int is_anchor, + int *error) +{ + struct pf_anchor *anchor; + + while (*path == '/') + path++; + if (!*path) + return (&pf_main_ruleset); + anchor = pf_find_anchor(path); + if (anchor == NULL) { + *error = EINVAL; + return (NULL); + } else { + if ((owner && anchor->owner && (!strcmp(owner, anchor->owner))) + || (is_anchor && !strcmp(anchor->owner, ""))) + return (&anchor->ruleset); + *error = EPERM; + return NULL; + } +} + struct pf_ruleset * pf_find_or_create_ruleset(const char *path) { diff --git a/bsd/net/pf_table.c b/bsd/net/pf_table.c index ea3b529f5..427cc6567 100644 --- a/bsd/net/pf_table.c +++ b/bsd/net/pf_table.c @@ -1517,7 +1517,7 @@ pfr_ina_define(struct pfr_table *tbl, user_addr_t addr, int size, return (EBUSY); tbl->pfrt_flags |= PFR_TFLAG_INACTIVE; SLIST_INIT(&tableq); - kt = RB_FIND(pfr_ktablehead, &pfr_ktables, (struct pfr_ktable *)tbl); + kt = RB_FIND(pfr_ktablehead, &pfr_ktables, (struct pfr_ktable *)(void *)tbl); if (kt == NULL) { kt = pfr_create_ktable(tbl, 0, 1); if (kt == NULL) @@ -2005,7 +2005,7 @@ pfr_lookup_table(struct pfr_table *tbl) /* struct pfr_ktable start like a struct pfr_table */ return (RB_FIND(pfr_ktablehead, &pfr_ktables, - (struct pfr_ktable *)tbl)); + (struct pfr_ktable *)(void *)tbl)); } int diff --git a/bsd/net/pfkeyv2.h b/bsd/net/pfkeyv2.h index e452e1d2e..3f8089c8f 100644 --- a/bsd/net/pfkeyv2.h +++ b/bsd/net/pfkeyv2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -412,7 +412,6 @@ struct sadb_sastat { #define SADB_X_EXT_NATT_KEEPALIVE 0x0004 /* Local node is behind NAT, send keepalives */ /* Should only be set for outbound SAs */ #define SADB_X_EXT_NATT_MULTIPLEUSERS 0x0008 /* For use on VPN server - support multiple users */ -#define SADB_X_EXT_NATT_DETECTED_PEER 0x0010 #endif /* PRIVATE */ @@ -426,6 +425,12 @@ struct sadb_sastat { #define SADB_X_EXT_PZERO 0x0200 /* zero padding for ESP */ #define SADB_X_EXT_PMASK 0x0300 /* mask for padding flag */ +#ifdef PRIVATE +#define SADB_X_EXT_NATT_DETECTED_PEER 0x1000 +#define SADB_X_EXT_ESP_KEEPALIVE 0x2000 +#define SADB_X_EXT_PUNT_RX_KEEPALIVE 0x4000 +#endif /* PRIVATE */ + #if 1 #define SADB_X_EXT_RAWCPI 0x0080 /* use well known CPI (IPComp) */ #endif diff --git a/bsd/net/pfvar.h b/bsd/net/pfvar.h index 4483dc867..a1f77f8a0 100644 --- a/bsd/net/pfvar.h +++ b/bsd/net/pfvar.h @@ -74,6 +74,7 @@ extern "C" { #endif +#include #include #include #include @@ -89,7 +90,6 @@ extern "C" { #include #include -#include #if BYTE_ORDER == BIG_ENDIAN #define htobe64(x) (x) @@ -146,17 +146,19 @@ union sockaddr_union { struct ip; struct ip6_hdr; struct tcphdr; -#ifndef NO_APPLE_EXTENSIONS struct pf_grev1_hdr; struct pf_esp_hdr; -#endif /* !NO_APPLE_EXTENSIONS */ #endif /* KERNEL */ +#define PF_GRE_PPTP_VARIANT 0x01 + enum { PF_INOUT, PF_IN, PF_OUT }; enum { PF_PASS, PF_DROP, PF_SCRUB, PF_NOSCRUB, PF_NAT, PF_NONAT, - PF_BINAT, PF_NOBINAT, PF_RDR, PF_NORDR, PF_SYNPROXY_DROP }; + PF_BINAT, PF_NOBINAT, PF_RDR, PF_NORDR, PF_SYNPROXY_DROP, + PF_DUMMYNET, PF_NODUMMYNET }; enum { PF_RULESET_SCRUB, PF_RULESET_FILTER, PF_RULESET_NAT, - PF_RULESET_BINAT, PF_RULESET_RDR, PF_RULESET_MAX }; + PF_RULESET_BINAT, PF_RULESET_RDR, PF_RULESET_DUMMYNET, + PF_RULESET_MAX }; enum { PF_OP_NONE, PF_OP_IRG, PF_OP_EQ, PF_OP_NE, PF_OP_LT, PF_OP_LE, PF_OP_GT, PF_OP_GE, PF_OP_XRG, PF_OP_RRG }; enum { PF_DEBUG_NONE, PF_DEBUG_URGENT, PF_DEBUG_MISC, PF_DEBUG_NOISY }; @@ -173,11 +175,9 @@ enum { PFTM_TCP_FIRST_PACKET, PFTM_TCP_OPENING, PFTM_TCP_ESTABLISHED, PFTM_TCP_CLOSING, PFTM_TCP_FIN_WAIT, PFTM_TCP_CLOSED, PFTM_UDP_FIRST_PACKET, PFTM_UDP_SINGLE, PFTM_UDP_MULTIPLE, PFTM_ICMP_FIRST_PACKET, PFTM_ICMP_ERROR_REPLY, -#ifndef NO_APPLE_EXTENSIONS - PFTM_GREv1_FIRST_PACKET, PFTM_GREv1_INITIATING, PFTM_GREv1_ESTABLISHED, - PFTM_ESP_FIRST_PACKET, PFTM_ESP_INITIATING, PFTM_ESP_ESTABLISHED, -#endif /* !NO_APPLE_EXTENSIONS */ - PFTM_OTHER_FIRST_PACKET, PFTM_OTHER_SINGLE, + PFTM_GREv1_FIRST_PACKET, PFTM_GREv1_INITIATING, + PFTM_GREv1_ESTABLISHED, PFTM_ESP_FIRST_PACKET, PFTM_ESP_INITIATING, + PFTM_ESP_ESTABLISHED, PFTM_OTHER_FIRST_PACKET, PFTM_OTHER_SINGLE, PFTM_OTHER_MULTIPLE, PFTM_FRAG, PFTM_INTERVAL, PFTM_ADAPTIVE_START, PFTM_ADAPTIVE_END, PFTM_SRC_NODE, PFTM_TS_DIFF, PFTM_MAX, PFTM_PURGE, PFTM_UNLINKED, @@ -195,14 +195,12 @@ enum { PFTM_TCP_FIRST_PACKET, PFTM_TCP_OPENING, PFTM_TCP_ESTABLISHED, #define PFTM_UDP_MULTIPLE_VAL 60 /* Bidirectional */ #define PFTM_ICMP_FIRST_PACKET_VAL 20 /* First ICMP packet */ #define PFTM_ICMP_ERROR_REPLY_VAL 10 /* Got error response */ -#ifndef NO_APPLE_EXTENSIONS #define PFTM_GREv1_FIRST_PACKET_VAL 120 #define PFTM_GREv1_INITIATING_VAL 30 #define PFTM_GREv1_ESTABLISHED_VAL 1800 #define PFTM_ESP_FIRST_PACKET_VAL 120 #define PFTM_ESP_INITIATING_VAL 30 #define PFTM_ESP_ESTABLISHED_VAL 900 -#endif /* !NO_APPLE_EXTENSIONS */ #define PFTM_OTHER_FIRST_PACKET_VAL 60 /* First packet */ #define PFTM_OTHER_SINGLE_VAL 30 /* Unidirectional */ #define PFTM_OTHER_MULTIPLE_VAL 60 /* Bidirectional 
*/ @@ -213,9 +211,7 @@ enum { PFTM_TCP_FIRST_PACKET, PFTM_TCP_OPENING, PFTM_TCP_ESTABLISHED, enum { PF_NOPFROUTE, PF_FASTROUTE, PF_ROUTETO, PF_DUPTO, PF_REPLYTO }; enum { PF_LIMIT_STATES, -#ifndef NO_APPLE_EXTENSIONS PF_LIMIT_APP_STATES, -#endif /* !NO_APPLE_EXTENSIONS */ PF_LIMIT_SRC_NODES, PF_LIMIT_FRAGS, PF_LIMIT_TABLES, PF_LIMIT_TABLE_ENTRIES, PF_LIMIT_MAX }; #define PF_POOL_IDMASK 0x0f @@ -286,7 +282,6 @@ struct pf_addr_wrap { u_int8_t iflags; /* PFI_AFLAG_* */ }; -#ifndef NO_APPLE_EXTENSIONS struct pf_port_range { u_int16_t port[2]; u_int8_t op; @@ -297,7 +292,6 @@ union pf_rule_xport { u_int16_t call_id; u_int32_t spi; }; -#endif /* !NO_APPLE_EXTENSIONS */ #ifdef KERNEL struct pfi_dynaddr { @@ -361,6 +355,13 @@ struct pfi_dynaddr { (a)->addr32[1] != (b)->addr32[1] || \ (a)->addr32[0] != (b)->addr32[0])) \ +#define PF_ALEQ(a, b, c) \ + ((c == AF_INET && (a)->addr32[0] <= (b)->addr32[0]) || \ + ((a)->addr32[3] <= (b)->addr32[3] && \ + (a)->addr32[2] <= (b)->addr32[2] && \ + (a)->addr32[1] <= (b)->addr32[1] && \ + (a)->addr32[0] <= (b)->addr32[0])) \ + #define PF_AZERO(a, c) \ ((c == AF_INET && !(a)->addr32[0]) || \ (!(a)->addr32[0] && !(a)->addr32[1] && \ @@ -396,6 +397,12 @@ struct pfi_dynaddr { (a)->addr32[1] != (b)->addr32[1] || \ (a)->addr32[0] != (b)->addr32[0]) \ +#define PF_ALEQ(a, b, c) \ + ((a)->addr32[3] <= (b)->addr32[3] && \ + (a)->addr32[2] <= (b)->addr32[2] && \ + (a)->addr32[1] <= (b)->addr32[1] && \ + (a)->addr32[0] <= (b)->addr32[0]) \ + #define PF_AZERO(a, c) \ (!(a)->addr32[0] && \ !(a)->addr32[1] && \ @@ -425,6 +432,9 @@ struct pfi_dynaddr { #define PF_ANEQ(a, b, c) \ ((a)->addr32[0] != (b)->addr32[0]) +#define PF_ALEQ(a, b, c) \ + ((a)->addr32[0] <= (b)->addr32[0]) + #define PF_AZERO(a, c) \ (!(a)->addr32[0]) @@ -487,14 +497,8 @@ struct pf_rule_gid { struct pf_rule_addr { struct pf_addr_wrap addr; -#ifndef NO_APPLE_EXTENSIONS union pf_rule_xport xport; u_int8_t neg; -#else /* NO_APPLE_EXTENSIONS */ - u_int16_t port[2]; - u_int8_t neg; - u_int8_t port_op; -#endif /* NO_APPLE_EXTENSIONS */ }; struct pf_pooladdr { @@ -690,6 +694,11 @@ struct pf_rule { u_int64_t packets[2]; u_int64_t bytes[2]; + u_int32_t ticket; +#define PF_OWNER_NAME_SIZE 64 + char owner[PF_OWNER_NAME_SIZE]; + u_int32_t priority; + #ifdef KERNEL struct pfi_kif *kif __attribute__((aligned(8))); #else /* !KERNEL */ @@ -757,6 +766,38 @@ struct pf_rule { u_int8_t allow_opts; u_int8_t rt; u_int8_t return_ttl; + +/* service class categories */ +#define SCIDX_MASK 0x0f +#define SC_BE 0x10 +#define SC_BK_SYS 0x11 +#define SC_BK 0x12 +#define SC_RD 0x13 +#define SC_OAM 0x14 +#define SC_AV 0x15 +#define SC_RV 0x16 +#define SC_VI 0x17 +#define SC_VO 0x18 +#define SC_CTL 0x19 + +/* diffserv code points */ +#define DSCP_MASK 0xfc +#define DSCP_CUMASK 0x03 +#define DSCP_EF 0xb8 +#define DSCP_AF11 0x28 +#define DSCP_AF12 0x30 +#define DSCP_AF13 0x38 +#define DSCP_AF21 0x48 +#define DSCP_AF22 0x50 +#define DSCP_AF23 0x58 +#define DSCP_AF31 0x68 +#define DSCP_AF32 0x70 +#define DSCP_AF33 0x78 +#define DSCP_AF41 0x88 +#define DSCP_AF42 0x90 +#define DSCP_AF43 0x98 +#define AF_CLASSMASK 0xe0 +#define AF_DROPPRECMASK 0x18 u_int8_t tos; u_int8_t anchor_relative; u_int8_t anchor_wildcard; @@ -765,13 +806,18 @@ struct pf_rule { #define PF_FLUSH_GLOBAL 0x02 u_int8_t flush; -#ifndef NO_APPLE_EXTENSIONS u_int8_t proto_variant; u_int8_t extfilter; /* Filter mode [PF_EXTFILTER_xxx] */ - u_int8_t extmap; /* Mapping mode [PF_EXTMAP_xxx] */ -#endif /* !NO_APPLE_EXTENSIONS */ + u_int8_t extmap; /* Mapping mode [PF_EXTMAP_xxx] */ 
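+ /* dummynet binding, used by the PF_DUMMYNET/PF_NODUMMYNET actions */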
+ u_int32_t dnpipe; + u_int32_t dntype; }; +/* pf device identifiers */ +#define PFDEV_PF 0 +#define PFDEV_PFM 1 +#define PFDEV_MAX 2 + /* rule flags */ #define PFRULE_DROP 0x0000 #define PFRULE_RETURNRST 0x0001 @@ -789,28 +835,32 @@ struct pf_rule { #define PFRULE_RANDOMID 0x0800 #define PFRULE_REASSEMBLE_TCP 0x1000 +/* rule flags for TOS/DSCP/service class differentiation */ +#define PFRULE_TOS 0x2000 +#define PFRULE_DSCP 0x4000 +#define PFRULE_SC 0x8000 + /* rule flags again */ -#define PFRULE_IFBOUND 0x00010000 /* if-bound */ +#define PFRULE_IFBOUND 0x00010000 /* if-bound */ +#define PFRULE_PFM 0x00020000 /* created by pfm device */ -#define PFSTATE_HIWAT 10000 /* default state table size */ -#define PFSTATE_ADAPT_START 6000 /* default adaptive timeout start */ -#define PFSTATE_ADAPT_END 12000 /* default adaptive timeout end */ +#define PFSTATE_HIWAT 10000 /* default state table size */ +#define PFSTATE_ADAPT_START 6000 /* default adaptive timeout start */ +#define PFSTATE_ADAPT_END 12000 /* default adaptive timeout end */ -#ifndef NO_APPLE_EXTENSIONS -#define PFAPPSTATE_HIWAT 10000 /* default same as state table */ +#define PFAPPSTATE_HIWAT 10000 /* default same as state table */ enum pf_extmap { PF_EXTMAP_APD = 1, /* Address-port-dependent mapping */ - PF_EXTMAP_AD, /* Address-dependent mapping */ - PF_EXTMAP_EI /* Endpoint-independent mapping */ + PF_EXTMAP_AD, /* Address-dependent mapping */ + PF_EXTMAP_EI /* Endpoint-independent mapping */ }; enum pf_extfilter { PF_EXTFILTER_APD = 1, /* Address-port-dependent filtering */ - PF_EXTFILTER_AD, /* Address-dependent filtering */ - PF_EXTFILTER_EI /* Endpoint-independent filtering */ + PF_EXTFILTER_AD, /* Address-dependent filtering */ + PF_EXTFILTER_EI /* Endpoint-independent filtering */ }; -#endif /* !NO_APPLE_EXTENSIONS */ struct pf_threshold { u_int32_t limit; @@ -862,7 +912,6 @@ struct pf_state_scrub { }; #endif /* KERNEL */ -#ifndef NO_APPLE_EXTENSIONS union pf_state_xport { u_int16_t port; u_int16_t call_id; @@ -870,16 +919,9 @@ union pf_state_xport { }; struct pf_state_host { - struct pf_addr addr; + struct pf_addr addr; union pf_state_xport xport; }; -#else /* NO_APPLE_EXTENSIONS */ -struct pf_state_host { - struct pf_addr addr; - u_int16_t port; - u_int16_t pad; -}; -#endif /* NO_APPLE_EXTENSIONS */ #ifdef KERNEL struct pf_state_peer { @@ -896,10 +938,7 @@ struct pf_state_peer { }; TAILQ_HEAD(pf_state_queue, pf_state); -#endif /* KERNEL */ -#ifndef NO_APPLE_EXTENSIONS -#ifdef KERNEL struct pf_state; struct pf_pdesc; struct pf_app_state; @@ -931,11 +970,7 @@ struct pf_app_state { struct pf_ike_state ike; } u; }; -#endif /* KERNEL */ -#define PF_GRE_PPTP_VARIANT 0x01 -#endif /* !NO_APPLE_EXTENSIONS */ -#ifdef KERNEL /* keep synced with struct pf_state, used in RB_FIND */ struct pf_state_key_cmp { struct pf_state_host lan; @@ -944,12 +979,8 @@ struct pf_state_key_cmp { sa_family_t af; u_int8_t proto; u_int8_t direction; -#ifndef NO_APPLE_EXTENSIONS u_int8_t proto_variant; struct pf_app_state *app_state; -#else /* NO_APPLE_EXTENSIONS */ - u_int8_t pad; -#endif /* NO_APPLE_EXTENSIONS */ }; TAILQ_HEAD(pf_statelist, pf_state); @@ -961,17 +992,14 @@ struct pf_state_key { sa_family_t af; u_int8_t proto; u_int8_t direction; -#ifndef NO_APPLE_EXTENSIONS u_int8_t proto_variant; struct pf_app_state *app_state; -#else /* NO_APPLE_EXTENSIONS */ - u_int8_t pad; -#endif /* NO_APPLE_EXTENSIONS */ + u_int32_t flowhash; RB_ENTRY(pf_state_key) entry_lan_ext; RB_ENTRY(pf_state_key) entry_ext_gwy; struct pf_statelist states; - u_short refcnt; /* 
same size as if_index */ + u_int32_t refcnt; }; @@ -981,6 +1009,14 @@ struct pf_state_cmp { u_int32_t creatorid; u_int32_t pad; }; + +/* flowhash key (12-bytes multiple for performance) */ +struct pf_flowhash_key { + struct pf_state_host ap1; /* address+port blob 1 */ + struct pf_state_host ap2; /* address+port blob 2 */ + u_int32_t af; + u_int32_t proto; +}; #endif /* KERNEL */ struct hook_desc; @@ -1001,9 +1037,7 @@ struct pf_state { union pf_rule_ptr anchor; union pf_rule_ptr nat_rule; struct pf_addr rt_addr; -#ifndef NO_APPLE_EXTENSIONS - struct hook_desc_head unlink_hooks; -#endif /* !NO_APPLE_EXTENSIONS */ + struct hook_desc_head unlink_hooks; struct pf_state_key *state_key; struct pfi_kif *kif; struct pfi_kif *rt_kif; @@ -1041,14 +1075,9 @@ struct pfsync_state_scrub { } __packed; struct pfsync_state_host { - struct pf_addr addr; -#ifndef NO_APPLE_EXTENSIONS + struct pf_addr addr; union pf_state_xport xport; - u_int16_t pad[2]; -#else /* NO_APPLE_EXTENSIONS */ - u_int16_t port; - u_int16_t pad[3]; -#endif /* NO_APPLE_EXTENSIONS */ + u_int16_t pad[2]; } __packed; struct pfsync_state_peer { @@ -1072,12 +1101,10 @@ struct pfsync_state { struct pfsync_state_peer src; struct pfsync_state_peer dst; struct pf_addr rt_addr; -#ifndef NO_APPLE_EXTENSIONS struct hook_desc_head unlink_hooks; #if !defined(__LP64__) u_int32_t _pad[2]; #endif /* !__LP64__ */ -#endif /* !NO_APPLE_EXTENSIONS */ u_int32_t rule; u_int32_t anchor; u_int32_t nat_rule; @@ -1086,9 +1113,7 @@ struct pfsync_state { u_int32_t packets[2][2]; u_int32_t bytes[2][2]; u_int32_t creatorid; -#ifndef NO_APPLE_EXTENSIONS - u_int16_t tag; -#endif /* !NO_APPLE_EXTENSIONS */ + u_int16_t tag; sa_family_t af; u_int8_t proto; u_int8_t direction; @@ -1097,9 +1122,9 @@ struct pfsync_state { u_int8_t timeout; u_int8_t sync_flags; u_int8_t updates; -#ifndef NO_APPLE_EXTENSIONS u_int8_t proto_variant; -#endif /* !NO_APPLE_EXTENSIONS */ + u_int8_t __pad; + u_int32_t flowhash; } __packed; #define PFSYNC_FLAG_COMPRESS 0x01 @@ -1187,6 +1212,7 @@ struct pf_anchor { struct pf_ruleset ruleset; int refcnt; /* anchor rules */ int match; + char owner[PF_OWNER_NAME_SIZE]; }; #ifdef KERNEL RB_PROTOTYPE_SC(__private_extern__, pf_anchor_global, pf_anchor, entry_global, @@ -1382,10 +1408,8 @@ struct pf_pdesc { #if INET6 struct icmp6_hdr *icmp6; #endif /* INET6 */ -#ifndef NO_APPLE_EXTENSIONS - struct pf_grev1_hdr *grev1; - struct pf_esp_hdr *esp; -#endif /* !NO_APPLE_EXTENSIONS */ + struct pf_grev1_hdr *grev1; + struct pf_esp_hdr *esp; void *any; } hdr; struct pf_addr baddr; /* address before translation */ @@ -1395,10 +1419,8 @@ struct pf_pdesc { struct pf_addr *dst; struct ether_header *eh; -#ifndef NO_APPLE_EXTENSIONS struct mbuf *mp; - int lmw; /* lazy writable offset */ -#endif /* !NO_APPLE_EXTENSIONS */ + int lmw; /* lazy writable offset */ struct pf_mtag *pf_mtag; u_int16_t *ip_sum; u_int32_t p_len; /* total length of payload */ @@ -1406,12 +1428,14 @@ struct pf_pdesc { /* state code. 
Easier than tags */ #define PFDESC_TCP_NORM 0x0001 /* TCP shall be statefully scrubbed */ #define PFDESC_IP_REAS 0x0002 /* IP frags would've been reassembled */ +#define PFDESC_FLOW_ADV 0x0004 /* sender can use flow advisory */ +#define PFDESC_IP_FRAG 0x0008 /* This is a fragment */ sa_family_t af; u_int8_t proto; u_int8_t tos; -#ifndef NO_APPLE_EXTENSIONS u_int8_t proto_variant; -#endif /* !NO_APPLE_EXTENSIONS */ + mbuf_svc_class_t sc; + u_int32_t flowhash; /* flow hash to identify the sender */ }; #endif /* KERNEL */ @@ -1435,7 +1459,8 @@ struct pf_pdesc { #define PFRES_MAXSTATES 12 /* State limit */ #define PFRES_SRCLIMIT 13 /* Source node/conn limit */ #define PFRES_SYNPROXY 14 /* SYN proxy */ -#define PFRES_MAX 15 /* total+1 */ +#define PFRES_DUMMYNET 15 /* Dummynet */ +#define PFRES_MAX 16 /* total+1 */ #define PFRES_NAMES { \ "match", \ @@ -1453,6 +1478,7 @@ struct pf_pdesc { "state-limit", \ "src-limit", \ "synproxy", \ + "dummynet", \ NULL \ } @@ -1491,7 +1517,6 @@ struct pf_pdesc { NULL \ } -#ifndef NO_APPLE_EXTENSIONS /* GREv1 protocol state enumeration */ #define PFGRE1S_NO_TRAFFIC 0 #define PFGRE1S_INITIATING 1 @@ -1513,7 +1538,6 @@ struct pf_pdesc { #define PFESPS_NSTATES 3 /* number of state levels */ #define PFESPS_NAMES { "NO_TRAFFIC", "INITIATING", "ESTABLISHED", NULL } -#endif /* !NO_APPLE_EXTENSIONS */ /* Other protocol state enumeration */ #define PFOTHERS_NO_TRAFFIC 0 @@ -1574,66 +1598,111 @@ struct pf_status { }; struct cbq_opts { - u_int minburst; - u_int maxburst; - u_int pktsize; - u_int maxpktsize; - u_int ns_per_byte; - u_int maxidle; - int minidle; - u_int offtime; - int flags; + u_int32_t minburst; + u_int32_t maxburst; + u_int32_t pktsize; + u_int32_t maxpktsize; + u_int32_t ns_per_byte; + u_int32_t maxidle; + int32_t minidle; + u_int32_t offtime; + u_int32_t flags; }; struct priq_opts { - int flags; + u_int32_t flags; +}; + +struct qfq_opts { + u_int32_t flags; + u_int32_t lmax; }; struct hfsc_opts { /* real-time service curve */ - u_int rtsc_m1; /* slope of the 1st segment in bps */ - u_int rtsc_d; /* the x-projection of m1 in msec */ - u_int rtsc_m2; /* slope of the 2nd segment in bps */ + u_int64_t rtsc_m1; /* slope of the 1st segment in bps */ + u_int64_t rtsc_d; /* the x-projection of m1 in msec */ + u_int64_t rtsc_m2; /* slope of the 2nd segment in bps */ + u_int32_t rtsc_fl; /* service curve flags */ +#if !defined(__LP64__) + u_int32_t _pad; +#endif /* !__LP64__ */ /* link-sharing service curve */ - u_int lssc_m1; - u_int lssc_d; - u_int lssc_m2; + u_int64_t lssc_m1; + u_int64_t lssc_d; + u_int64_t lssc_m2; + u_int32_t lssc_fl; +#if !defined(__LP64__) + u_int32_t __pad; +#endif /* !__LP64__ */ /* upper-limit service curve */ - u_int ulsc_m1; - u_int ulsc_d; - u_int ulsc_m2; - int flags; + u_int64_t ulsc_m1; + u_int64_t ulsc_d; + u_int64_t ulsc_m2; + u_int32_t ulsc_fl; + u_int32_t flags; /* scheduler flags */ }; +struct fairq_opts { + u_int32_t nbuckets; /* hash buckets */ + u_int32_t flags; + u_int64_t hogs_m1; /* hog detection bandwidth */ + + /* link-sharing service curve */ + u_int64_t lssc_m1; + u_int64_t lssc_d; + u_int64_t lssc_m2; +}; + +/* bandwidth types */ +#define PF_ALTQ_BW_ABSOLUTE 1 /* bw in absolute value (bps) */ +#define PF_ALTQ_BW_PERCENT 2 /* bandwidth in percentage */ + +/* ALTQ rule flags */ +#define PF_ALTQF_TBR 0x1 /* enable Token Bucket Regulator */ + +/* queue rule flags */ +#define PF_ALTQ_QRF_WEIGHT 0x1 /* weight instead of priority */ + struct pf_altq { char ifname[IFNAMSIZ]; - void *altq_disc; /* discipline-specific state 
*/ + /* discipline-specific state */ + void *altq_disc __attribute__((aligned(8))); + TAILQ_ENTRY(pf_altq) entries __attribute__((aligned(8))); #if !defined(__LP64__) - u_int32_t _pad; -#endif /* !__LP64__ */ - TAILQ_ENTRY(pf_altq) entries; -#if !defined(__LP64__) - u_int32_t __pad[2]; + u_int32_t _pad[2]; #endif /* !__LP64__ */ + u_int32_t aflags; /* ALTQ rule flags */ + u_int32_t bwtype; /* bandwidth type */ + /* scheduler spec */ - u_int8_t scheduler; /* scheduler type */ - u_int16_t tbrsize; /* tokenbucket regulator size */ - u_int32_t ifbandwidth; /* interface bandwidth */ + u_int32_t scheduler; /* scheduler type */ + u_int32_t tbrsize; /* tokenbucket regulator size */ + u_int64_t ifbandwidth; /* interface bandwidth */ /* queue spec */ char qname[PF_QNAME_SIZE]; /* queue name */ char parent[PF_QNAME_SIZE]; /* parent name */ u_int32_t parent_qid; /* parent queue id */ - u_int32_t bandwidth; /* queue bandwidth */ - u_int8_t priority; /* priority */ - u_int16_t qlimit; /* queue size limit */ - u_int16_t flags; /* misc flags */ + u_int32_t qrflags; /* queue rule flags */ + union { + u_int32_t priority; /* priority */ + u_int32_t weight; /* weight */ + }; + u_int32_t qlimit; /* queue size limit */ + u_int32_t flags; /* misc flags */ +#if !defined(__LP64__) + u_int32_t __pad; +#endif /* !__LP64__ */ + u_int64_t bandwidth; /* queue bandwidth */ union { struct cbq_opts cbq_opts; struct priq_opts priq_opts; struct hfsc_opts hfsc_opts; + struct fairq_opts fairq_opts; + struct qfq_opts qfq_opts; } pq_u; u_int32_t qid; /* return value */ @@ -1686,7 +1755,6 @@ struct pfioc_natlook { struct pf_addr daddr; struct pf_addr rsaddr; struct pf_addr rdaddr; -#ifndef NO_APPLE_EXTENSIONS union pf_state_xport sxport; union pf_state_xport dxport; union pf_state_xport rsxport; @@ -1695,15 +1763,6 @@ struct pfioc_natlook { u_int8_t proto; u_int8_t proto_variant; u_int8_t direction; -#else /* NO_APPLE_EXTENSIONS */ - u_int16_t sport; - u_int16_t dport; - u_int16_t rsport; - u_int16_t rdport; - sa_family_t af; - u_int8_t proto; - u_int8_t direction; -#endif /* NO_APPLE_EXTENSIONS */ }; struct pfioc_state { @@ -1717,29 +1776,21 @@ struct pfioc_src_node_kill { struct pf_rule_addr psnk_dst; }; -#ifndef NO_APPLE_EXTENSIONS struct pfioc_state_addr_kill { struct pf_addr_wrap addr; u_int8_t reserved_[3]; u_int8_t neg; union pf_rule_xport xport; }; -#endif /* !NO_APPLE_EXTENSIONS */ struct pfioc_state_kill { /* XXX returns the number of states killed in psk_af */ sa_family_t psk_af; -#ifndef NO_APPLE_EXTENSIONS u_int8_t psk_proto; u_int8_t psk_proto_variant; u_int8_t _pad; struct pfioc_state_addr_kill psk_src; struct pfioc_state_addr_kill psk_dst; -#else /* NO_APPLE_EXTENSIONS */ - int psk_proto; - struct pf_rule_addr psk_src; - struct pf_rule_addr psk_dst; -#endif /* NO_APPLE_EXTENSIONS */ char psk_ifname[IFNAMSIZ]; }; @@ -1998,6 +2049,11 @@ struct pfioc_iface_64 { }; #endif /* KERNEL */ +struct pf_ifspeed { + char ifname[IFNAMSIZ]; + u_int64_t baudrate; +}; + /* * ioctl operations */ @@ -2020,7 +2076,8 @@ struct pfioc_iface_64 { #define DIOCSETDEBUG _IOWR('D', 24, u_int32_t) #define DIOCGETSTATES _IOWR('D', 25, struct pfioc_states) #define DIOCCHANGERULE _IOWR('D', 26, struct pfioc_rule) -/* XXX cut 26 - 28 */ +#define DIOCINSERTRULE _IOWR('D', 27, struct pfioc_rule) +#define DIOCDELETERULE _IOWR('D', 28, struct pfioc_rule) #define DIOCSETTIMEOUT _IOWR('D', 29, struct pfioc_tm) #define DIOCGETTIMEOUT _IOWR('D', 30, struct pfioc_tm) #define DIOCADDSTATE _IOWR('D', 37, struct pfioc_state) @@ -2071,7 +2128,8 @@ struct 
pfioc_iface_64 { #define DIOCIGETIFACES _IOWR('D', 87, struct pfioc_iface) #define DIOCSETIFFLAG _IOWR('D', 89, struct pfioc_iface) #define DIOCCLRIFFLAG _IOWR('D', 90, struct pfioc_iface) -#define DIOCKILLSRCNODES _IOWR('D', 91, struct pfioc_src_node_kill) +#define DIOCKILLSRCNODES _IOWR('D', 91, struct pfioc_src_node_kill) +#define DIOCGIFSPEED _IOWR('D', 92, struct pf_ifspeed) #ifdef KERNEL RB_HEAD(pf_src_tree, pf_src_node); @@ -2089,7 +2147,7 @@ TAILQ_HEAD(pf_poolqueue, pf_pool); __private_extern__ struct pf_poolqueue pf_pools[2]; __private_extern__ struct pf_palist pf_pabuf; __private_extern__ u_int32_t ticket_pabuf; -#if ALTQ +#if PF_ALTQ TAILQ_HEAD(pf_altqqueue, pf_altq); __private_extern__ struct pf_altqqueue pf_altqs[2]; __private_extern__ u_int32_t ticket_altqs_active; @@ -2097,7 +2155,7 @@ __private_extern__ u_int32_t ticket_altqs_inactive; __private_extern__ int altqs_inactive_open; __private_extern__ struct pf_altqqueue *pf_altqs_active; __private_extern__ struct pf_altqqueue *pf_altqs_inactive; -#endif /* ALTQ */ +#endif /* PF_ALTQ */ __private_extern__ struct pf_poolqueue *pf_pools_active; __private_extern__ struct pf_poolqueue *pf_pools_inactive; @@ -2106,16 +2164,15 @@ __private_extern__ int pf_tbladdr_setup(struct pf_ruleset *, __private_extern__ void pf_tbladdr_remove(struct pf_addr_wrap *); __private_extern__ void pf_tbladdr_copyout(struct pf_addr_wrap *); __private_extern__ void pf_calc_skip_steps(struct pf_rulequeue *); +__private_extern__ u_int32_t pf_calc_state_key_flowhash(struct pf_state_key *); __private_extern__ struct pool pf_src_tree_pl, pf_rule_pl; __private_extern__ struct pool pf_state_pl, pf_state_key_pl, pf_pooladdr_pl; __private_extern__ struct pool pf_state_scrub_pl; -#if ALTQ +#if PF_ALTQ __private_extern__ struct pool pf_altq_pl; -#endif /* ALTQ */ -#ifndef NO_APPLE_EXTENSIONS +#endif /* PF_ALTQ */ __private_extern__ struct pool pf_app_state_pl; -#endif /* !NO_APPLE_EXTENSIONS */ __private_extern__ struct thread *pf_purge_thread; @@ -2143,23 +2200,22 @@ __private_extern__ void pf_addrcpy(struct pf_addr *, struct pf_addr *, u_int8_t); __private_extern__ void pf_rm_rule(struct pf_rulequeue *, struct pf_rule *); +struct ip_fw_args; #if INET __private_extern__ int pf_test(int, struct ifnet *, struct mbuf **, - struct ether_header *); + struct ether_header *, struct ip_fw_args *); #endif /* INET */ #if INET6 __private_extern__ int pf_test6(int, struct ifnet *, struct mbuf **, - struct ether_header *); + struct ether_header *, struct ip_fw_args *); __private_extern__ void pf_poolmask(struct pf_addr *, struct pf_addr *, struct pf_addr *, struct pf_addr *, u_int8_t); __private_extern__ void pf_addr_inc(struct pf_addr *, sa_family_t); #endif /* INET6 */ -#ifndef NO_APPLE_EXTENSIONS __private_extern__ struct mbuf *pf_lazy_makewritable(struct pf_pdesc *, struct mbuf *, int); -#endif /* !NO_APPLE_EXTENSIONS */ __private_extern__ void *pf_pull_hdr(struct mbuf *, int, void *, int, u_short *, u_short *, sa_family_t); __private_extern__ void pf_change_a(void *, u_int16_t *, u_int32_t, u_int8_t); @@ -2172,10 +2228,8 @@ __private_extern__ int pf_match_addr_range(struct pf_addr *, struct pf_addr *, struct pf_addr *, sa_family_t); __private_extern__ int pf_match(u_int8_t, u_int32_t, u_int32_t, u_int32_t); __private_extern__ int pf_match_port(u_int8_t, u_int16_t, u_int16_t, u_int16_t); -#ifndef NO_APPLE_EXTENSIONS __private_extern__ int pf_match_xport(u_int8_t, u_int8_t, union pf_rule_xport *, union pf_state_xport *); -#endif /* !NO_APPLE_EXTENSIONS */ __private_extern__ 
int pf_match_uid(u_int8_t, uid_t, uid_t, uid_t); __private_extern__ int pf_match_gid(u_int8_t, gid_t, gid_t, gid_t); @@ -2201,7 +2255,8 @@ __private_extern__ int pf_routable(struct pf_addr *addr, sa_family_t af, __private_extern__ int pf_rtlabel_match(struct pf_addr *, sa_family_t, struct pf_addr_wrap *); __private_extern__ int pf_socket_lookup(int, struct pf_pdesc *); -__private_extern__ struct pf_state_key *pf_alloc_state_key(struct pf_state *); +__private_extern__ struct pf_state_key *pf_alloc_state_key(struct pf_state *, + struct pf_state_key *); __private_extern__ void pfr_initialize(void); __private_extern__ int pfr_match_addr(struct pfr_ktable *, struct pf_addr *, sa_family_t); @@ -2275,7 +2330,7 @@ __private_extern__ void pf_tag2tagname(u_int16_t, char *); __private_extern__ void pf_tag_ref(u_int16_t); __private_extern__ void pf_tag_unref(u_int16_t); __private_extern__ int pf_tag_packet(struct mbuf *, struct pf_mtag *, - int, unsigned int); + int, unsigned int, struct pf_pdesc *); __private_extern__ void pf_step_into_anchor(int *, struct pf_ruleset **, int, struct pf_rule **, struct pf_rule **, int *); __private_extern__ int pf_step_out_of_anchor(int *, struct pf_ruleset **, int, @@ -2294,7 +2349,7 @@ struct pf_pool_limit { __private_extern__ struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX]; __private_extern__ int pf_af_hook(struct ifnet *, struct mbuf **, - struct mbuf **, unsigned int, int); + struct mbuf **, unsigned int, int, struct ip_fw_args *); __private_extern__ int pf_ifaddr_hook(struct ifnet *, unsigned long); __private_extern__ void pf_ifnet_hook(struct ifnet *, int); @@ -2308,6 +2363,11 @@ __private_extern__ struct pf_anchor pf_main_anchor; __private_extern__ int pf_is_enabled; #define PF_IS_ENABLED (pf_is_enabled != 0) +__private_extern__ u_int32_t pf_hash_seed; + +#if PF_ALTQ +__private_extern__ u_int32_t altq_allowed; +#endif /* PF_ALTQ */ /* these ruleset functions can be linked into userland programs (pfctl) */ __private_extern__ int pf_get_ruleset_number(u_int8_t); @@ -2320,6 +2380,8 @@ __private_extern__ void pf_anchor_remove(struct pf_rule *); __private_extern__ void pf_remove_if_empty_ruleset(struct pf_ruleset *); __private_extern__ struct pf_anchor *pf_find_anchor(const char *); __private_extern__ struct pf_ruleset *pf_find_ruleset(const char *); +__private_extern__ struct pf_ruleset *pf_find_ruleset_with_owner(const char *, + const char *, int, int *); __private_extern__ struct pf_ruleset *pf_find_or_create_ruleset(const char *); __private_extern__ void pf_rs_initialize(void); @@ -2333,6 +2395,8 @@ __private_extern__ int pf_osfp_get(struct pf_osfp_ioctl *); __private_extern__ void pf_osfp_initialize(void); __private_extern__ int pf_osfp_match(struct pf_osfp_enlist *, pf_osfp_t); __private_extern__ struct pf_os_fingerprint *pf_osfp_validate(void); +__private_extern__ struct pf_mtag *pf_find_mtag(struct mbuf *); +__private_extern__ struct pf_mtag *pf_get_mtag(struct mbuf *); #else /* !KERNEL */ extern struct pf_anchor_global pf_anchors; extern struct pf_anchor pf_main_anchor; @@ -2349,6 +2413,8 @@ extern void pf_anchor_remove(struct pf_rule *); extern void pf_remove_if_empty_ruleset(struct pf_ruleset *); extern struct pf_anchor *pf_find_anchor(const char *); extern struct pf_ruleset *pf_find_ruleset(const char *); +extern struct pf_ruleset *pf_find_ruleset_with_owner(const char *, + const char *, int, int *); extern struct pf_ruleset *pf_find_or_create_ruleset(const char *); extern void pf_rs_initialize(void); #endif /* !KERNEL */ diff --git 
a/bsd/net/pktsched/Makefile b/bsd/net/pktsched/Makefile new file mode 100644 index 000000000..ad824436a --- /dev/null +++ b/bsd/net/pktsched/Makefile @@ -0,0 +1,44 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + +INSTINC_SUBDIRS = \ + +INSTINC_SUBDIRS_PPC = \ + +INSTINC_SUBDIRS_I386 = \ + +EXPINC_SUBDIRS = \ + +EXPINC_SUBDIRS_PPC = \ + +EXPINC_SUBDIRS_I386 = \ + +DATAFILES= \ + +KERNELFILES= \ + +PRIVATE_DATAFILES = \ + pktsched.h pktsched_cbq.h pktsched_fairq.h pktsched_hfsc.h \ + pktsched_priq.h pktsched_tcq.h pktsched_rmclass.h pktsched_qfq.h + +PRIVATE_KERNELFILES = ${KERNELFILES} + +INSTALL_MI_LIST = ${DATAFILES} + +INSTALL_MI_DIR = net/pktsched + +EXPORT_MI_LIST = ${INSTALL_MI_LIST} ${KERNELFILES} + +EXPORT_MI_DIR = ${INSTALL_MI_DIR} + +INSTALL_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} + +INSTALL_KF_MI_LCL_LIST = ${INSTALL_MI_LCL_LIST} ${PRIVATE_KERNELFILES} + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/bsd/net/pktsched/pktsched.c b/bsd/net/pktsched/pktsched.c new file mode 100644 index 000000000..eda1ae420 --- /dev/null +++ b/bsd/net/pktsched/pktsched.c @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#if PKTSCHED_PRIQ +#include +#endif /* PKTSCHED_PRIQ */ +#if PKTSCHED_FAIRQ +#include +#endif /* PKTSCHED_FAIRQ */ +#if PKTSCHED_CBQ +#include +#endif /* PKTSCHED_CBQ */ +#if PKTSCHED_HFSC +#include +#endif /* PKTSCHED_HFSC */ + +#include + +u_int32_t machclk_freq = 0; +u_int64_t machclk_per_sec = 0; +u_int32_t pktsched_verbose; /* more noise if greater than 1 */ + +static void init_machclk(void); + +SYSCTL_NODE(_net, OID_AUTO, pktsched, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "pktsched"); + +SYSCTL_UINT(_net_pktsched, OID_AUTO, verbose, CTLFLAG_RW|CTLFLAG_LOCKED, + &pktsched_verbose, 0, "Packet scheduler verbosity level"); + +void +pktsched_init(void) +{ + init_machclk(); + if (machclk_freq == 0) { + panic("%s: no CPU clock available!\n", __func__); + /* NOTREACHED */ + } + + tcq_init(); + qfq_init(); +#if PKTSCHED_PRIQ + priq_init(); +#endif /* PKTSCHED_PRIQ */ +#if PKTSCHED_FAIRQ + fairq_init(); +#endif /* PKTSCHED_FAIRQ */ +#if PKTSCHED_CBQ + cbq_init(); +#endif /* PKTSCHED_CBQ */ +#if PKTSCHED_HFSC + hfsc_init(); +#endif /* PKTSCHED_HFSC */ +} + +static void +init_machclk(void) +{ + /* + * Initialize machclk_freq using the timerbase frequency + * value from device specific info. + */ + machclk_freq = gPEClockFrequencyInfo.timebase_frequency_hz; + + clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, + &machclk_per_sec); +} + +u_int64_t +pktsched_abs_to_nsecs(u_int64_t abstime) +{ + u_int64_t nsecs; + + absolutetime_to_nanoseconds(abstime, &nsecs); + return (nsecs); +} + +u_int64_t +pktsched_nsecs_to_abstime(u_int64_t nsecs) +{ + u_int64_t abstime; + + nanoseconds_to_absolutetime(nsecs, &abstime); + return (abstime); +} + +int +pktsched_setup(struct ifclassq *ifq, u_int32_t scheduler, u_int32_t sflags) +{ + int error = 0; + u_int32_t qflags = sflags; + u_int32_t rflags; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + VERIFY(machclk_freq != 0); + + /* Nothing to do unless the scheduler type changes */ + if (ifq->ifcq_type == scheduler) + return (0); + + qflags &= (PKTSCHEDF_QALG_RED | PKTSCHEDF_QALG_RIO | + PKTSCHEDF_QALG_BLUE | PKTSCHEDF_QALG_SFB); + + /* These are mutually exclusive */ + if (qflags != 0 && + qflags != PKTSCHEDF_QALG_RED && qflags != PKTSCHEDF_QALG_RIO && + qflags != PKTSCHEDF_QALG_BLUE && qflags != PKTSCHEDF_QALG_SFB) { + panic("%s: RED|RIO|BLUE|SFB mutually exclusive\n", __func__); + /* NOTREACHED */ + } + + /* + * Remember the flags that need to be restored upon success, as + * they may be cleared when we tear down existing scheduler. 
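+ * (Only IFCQF_ENABLED is captured here and OR'ed back on success; an
+ * interface that was enabled under, say, TCQ therefore comes out of a
+ * switch to QFQ still enabled.)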
+ */ + rflags = (ifq->ifcq_flags & IFCQF_ENABLED); + + if (ifq->ifcq_type != PKTSCHEDT_NONE) { + (void) pktsched_teardown(ifq); + + /* Teardown should have succeeded */ + VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE); + VERIFY(ifq->ifcq_disc == NULL); + VERIFY(ifq->ifcq_enqueue == NULL); + VERIFY(ifq->ifcq_dequeue == NULL); + VERIFY(ifq->ifcq_dequeue_sc == NULL); + VERIFY(ifq->ifcq_request == NULL); + } + + switch (scheduler) { +#if PKTSCHED_PRIQ + case PKTSCHEDT_PRIQ: + error = priq_setup_ifclassq(ifq, sflags); + break; +#endif /* PKTSCHED_PRIQ */ + + case PKTSCHEDT_TCQ: + error = tcq_setup_ifclassq(ifq, sflags); + break; + + case PKTSCHEDT_QFQ: + error = qfq_setup_ifclassq(ifq, sflags); + break; + + default: + error = ENXIO; + break; + } + + if (error == 0) + ifq->ifcq_flags |= rflags; + + return (error); +} + +int +pktsched_teardown(struct ifclassq *ifq) +{ + int error = 0; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if_qflush(ifq->ifcq_ifp, 1); + VERIFY(IFCQ_IS_EMPTY(ifq)); + + ifq->ifcq_flags &= ~IFCQF_ENABLED; + + switch (ifq->ifcq_type) { + case PKTSCHEDT_NONE: + break; + +#if PKTSCHED_PRIQ + case PKTSCHEDT_PRIQ: + error = priq_teardown_ifclassq(ifq); + break; +#endif /* PKTSCHED_PRIQ */ + + case PKTSCHEDT_TCQ: + error = tcq_teardown_ifclassq(ifq); + break; + + case PKTSCHEDT_QFQ: + error = qfq_teardown_ifclassq(ifq); + break; + + default: + error = ENXIO; + break; + } + + return (error); +} + +int +pktsched_getqstats(struct ifclassq *ifq, u_int32_t qid, + struct if_ifclassq_stats *ifqs) +{ + int error; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + switch (ifq->ifcq_type) { +#if PKTSCHED_PRIQ + case PKTSCHEDT_PRIQ: + error = priq_getqstats_ifclassq(ifq, qid, ifqs); + break; +#endif /* PKTSCHED_PRIQ */ + + case PKTSCHEDT_TCQ: + error = tcq_getqstats_ifclassq(ifq, qid, ifqs); + break; + + case PKTSCHEDT_QFQ: + error = qfq_getqstats_ifclassq(ifq, qid, ifqs); + break; + + default: + error = ENXIO; + break; + } + + return (error); +} diff --git a/bsd/net/pktsched/pktsched.h b/bsd/net/pktsched/pktsched.h new file mode 100644 index 000000000..aa3361b37 --- /dev/null +++ b/bsd/net/pktsched/pktsched.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _PKTSCHED_PKTSCHED_H_ +#define _PKTSCHED_PKTSCHED_H_ + +#ifdef PRIVATE +#ifdef __cplusplus +extern "C" { +#endif + +/* packet scheduler type */ +#define PKTSCHEDT_NONE 0 /* reserved */ +#define PKTSCHEDT_CBQ 1 /* cbq */ +#define PKTSCHEDT_HFSC 2 /* hfsc */ +#define PKTSCHEDT_PRIQ 3 /* priority queue */ +#define PKTSCHEDT_FAIRQ 4 /* fairq */ +#define PKTSCHEDT_TCQ 5 /* traffic class queue */ +#define PKTSCHEDT_QFQ 6 /* quick fair queueing */ +#define PKTSCHEDT_MAX 7 /* should be max sched type + 1 */ + +#ifdef BSD_KERNEL_PRIVATE +#include +#include +#include + +/* flags for pktsched_setup */ +#define PKTSCHEDF_QALG_RED 0x1 /* use RED */ +#define PKTSCHEDF_QALG_RIO 0x2 /* use RIO */ +#define PKTSCHEDF_QALG_BLUE 0x4 /* use BLUE */ +#define PKTSCHEDF_QALG_SFB 0x8 /* use SFB */ +#define PKTSCHEDF_QALG_ECN 0x10 /* enable ECN */ +#define PKTSCHEDF_QALG_FLOWCTL 0x20 /* enable flow control advisories */ + +/* macro for timeout/untimeout */ +/* use old-style timeout/untimeout */ +/* dummy callout structure */ +struct callout { + void *c_arg; /* function argument */ + void (*c_func)(void *); /* function to call */ +}; + +#define CALLOUT_INIT(c) do { \ + (void) memset((c), 0, sizeof (*(c))); \ +} while (/*CONSTCOND*/ 0) + +#define CALLOUT_RESET(c, t, f, a) do { \ + (c)->c_arg = (a); \ + (c)->c_func = (f); \ + timeout((f), (a), (t)); \ +} while (/*CONSTCOND*/ 0) + +#define CALLOUT_STOP(c) untimeout((c)->c_func, (c)->c_arg) +#define CALLOUT_INITIALIZER { NULL, NULL } + +typedef void (timeout_t)(void *); + +/* + * Bitmap operations + */ +typedef u_int32_t pktsched_bitmap_t; + +static inline boolean_t +pktsched_bit_tst(u_int32_t ix, pktsched_bitmap_t *pData) +{ + return (*pData & (1 << ix)); +} + +static inline void +pktsched_bit_set(u_int32_t ix, pktsched_bitmap_t *pData) +{ + *pData |= (1 << ix); +} + +static inline void +pktsched_bit_clr(u_int32_t ix, pktsched_bitmap_t *pData) +{ + *pData &= ~(1 << ix); +} + +static inline pktsched_bitmap_t +pktsched_ffs(pktsched_bitmap_t pData) +{ + return (ffs(pData)); +} + +static inline pktsched_bitmap_t +pktsched_fls(pktsched_bitmap_t pData) +{ + return ((sizeof (pktsched_bitmap_t) << 3) - clz(pData)); +} + +static inline pktsched_bitmap_t +__fls(pktsched_bitmap_t word) +{ + VERIFY(word != 0); + return (pktsched_fls(word) - 1); +} + +/* + * We can use mach_absolute_time which returns a 64-bit value with + * granularity less than a microsecond even on the slowest processor. + */ +#define read_machclk() mach_absolute_time() + +/* + * machine dependent clock + * a 64bit high resolution time counter. + */ +extern u_int32_t machclk_freq; +extern u_int64_t machclk_per_sec; +extern u_int32_t pktsched_verbose; + +SYSCTL_DECL(_net_pktsched); + +struct if_ifclassq_stats; + +extern void pktsched_init(void); +extern int pktsched_setup(struct ifclassq *, u_int32_t, u_int32_t); +extern int pktsched_teardown(struct ifclassq *); +extern int pktsched_getqstats(struct ifclassq *, u_int32_t, + struct if_ifclassq_stats *); +extern u_int64_t pktsched_abs_to_nsecs(u_int64_t); +extern u_int64_t pktsched_nsecs_to_abstime(u_int64_t); +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef __cplusplus +} +#endif +#endif /* PRIVATE */ +#endif /* _PKTSCHED_PKTSCHED_H_ */ diff --git a/bsd/net/pktsched/pktsched_cbq.c b/bsd/net/pktsched/pktsched_cbq.c new file mode 100644 index 000000000..a923f6c87 --- /dev/null +++ b/bsd/net/pktsched/pktsched_cbq.c @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2007-2012 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $OpenBSD: altq_cbq.c,v 1.23 2007/09/13 20:40:02 chl Exp $ */ +/* $KAME: altq_cbq.c,v 1.9 2000/12/14 08:12:45 thorpej Exp $ */ + +/* + * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. + */ + +#if PKTSCHED_CBQ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +/* + * Forward Declarations. 
+ */ +#if 0 +static int cbq_enqueue_ifclassq(struct ifclassq *, struct mbuf *); +static struct mbuf *cbq_dequeue_ifclassq(struct ifclassq *, cqdq_op_t); +static int cbq_request_ifclassq(struct ifclassq *, cqrq_t, void *); +#endif +static int cbq_class_destroy(cbq_state_t *, struct rm_class *); +static int cbq_destroy_locked(cbq_state_t *); +static struct rm_class *cbq_clh_to_clp(cbq_state_t *, u_int32_t); +static const char *cbq_style(cbq_state_t *); +static int cbq_clear_interface(cbq_state_t *); +static void cbqrestart(struct ifclassq *); + +#define CBQ_ZONE_MAX 32 /* maximum elements in zone */ +#define CBQ_ZONE_NAME "pktsched_cbq" /* zone name */ + +static unsigned int cbq_size; /* size of zone element */ +static struct zone *cbq_zone; /* zone for cbq */ + +void +cbq_init(void) +{ + _CASSERT(CBQCLF_RED == RMCF_RED); + _CASSERT(CBQCLF_ECN == RMCF_ECN); + _CASSERT(CBQCLF_RIO == RMCF_RIO); + _CASSERT(CBQCLF_FLOWVALVE == RMCF_FLOWVALVE); + _CASSERT(CBQCLF_CLEARDSCP == RMCF_CLEARDSCP); + _CASSERT(CBQCLF_WRR == RMCF_WRR); + _CASSERT(CBQCLF_EFFICIENT == RMCF_EFFICIENT); + _CASSERT(CBQCLF_BLUE == RMCF_BLUE); + _CASSERT(CBQCLF_SFB == RMCF_SFB); + _CASSERT(CBQCLF_FLOWCTL == RMCF_FLOWCTL); + _CASSERT(CBQCLF_LAZY == RMCF_LAZY); + + cbq_size = sizeof (cbq_state_t); + cbq_zone = zinit(cbq_size, CBQ_ZONE_MAX * cbq_size, 0, CBQ_ZONE_NAME); + if (cbq_zone == NULL) { + panic("%s: failed allocating %s", __func__, CBQ_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(cbq_zone, Z_EXPAND, TRUE); + zone_change(cbq_zone, Z_CALLERACCT, TRUE); + + rmclass_init(); +} + +cbq_state_t * +cbq_alloc(struct ifnet *ifp, int how, boolean_t altq) +{ + cbq_state_t *cbqp; + + /* allocate and initialize cbq_state_t */ + cbqp = (how == M_WAITOK) ? zalloc(cbq_zone) : zalloc_noblock(cbq_zone); + if (cbqp == NULL) + return (NULL); + + bzero(cbqp, cbq_size); + CALLOUT_INIT(&cbqp->cbq_callout); + cbqp->cbq_qlen = 0; + cbqp->ifnp.ifq_ = &ifp->if_snd; /* keep the ifclassq */ + if (altq) + cbqp->cbq_flags |= CBQSF_ALTQ; + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s scheduler allocated\n", + if_name(ifp), cbq_style(cbqp)); + } + + return (cbqp); +} + +int +cbq_destroy(cbq_state_t *cbqp) +{ + struct ifclassq *ifq = cbqp->ifnp.ifq_; + int err; + + IFCQ_LOCK(ifq); + err = cbq_destroy_locked(cbqp); + IFCQ_UNLOCK(ifq); + + return (err); +} + +static int +cbq_destroy_locked(cbq_state_t *cbqp) +{ + IFCQ_LOCK_ASSERT_HELD(cbqp->ifnp.ifq_); + + (void) cbq_clear_interface(cbqp); + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s scheduler destroyed\n", + if_name(CBQS_IFP(cbqp)), cbq_style(cbqp)); + } + + if (cbqp->ifnp.default_) + cbq_class_destroy(cbqp, cbqp->ifnp.default_); + if (cbqp->ifnp.root_) + cbq_class_destroy(cbqp, cbqp->ifnp.root_); + + /* deallocate cbq_state_t */ + zfree(cbq_zone, cbqp); + + return (0); +} + +int +cbq_add_queue(cbq_state_t *cbqp, u_int32_t qlimit, u_int32_t priority, + u_int32_t minburst, u_int32_t maxburst, u_int32_t pktsize, + u_int32_t maxpktsize, u_int32_t ns_per_byte, u_int32_t maxidle, int minidle, + u_int32_t offtime, u_int32_t flags, u_int32_t parent_qid, u_int32_t qid, + struct rm_class **clp) +{ +#pragma unused(minburst, maxburst, maxpktsize) + struct rm_class *borrow, *parent; + struct rm_class *cl; + int i, error; + + IFCQ_LOCK_ASSERT_HELD(cbqp->ifnp.ifq_); + + /* Sanitize flags unless internally configured */ + if (cbqp->cbq_flags & CBQSF_ALTQ) + flags &= CBQCLF_USERFLAGS; + + /* + * find a free slot in the class table. if the slot matching + * the lower bits of qid is free, use this slot. 
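+ * (for example, with CBQ_MAX_CLASSES at 256, qid 260 hashes to
+ * slot 260 % 256 == 4.)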
otherwise, + * use the first free slot. + */ + i = qid % CBQ_MAX_CLASSES; + if (cbqp->cbq_class_tbl[i] != NULL) { + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == NULL) + break; + if (i == CBQ_MAX_CLASSES) + return (EINVAL); + } + + /* check parameters */ + if (priority >= CBQ_MAXPRI) + return (EINVAL); + + if (ns_per_byte == 0) { + log(LOG_ERR, "%s: %s invalid inverse data rate\n", + if_name(CBQS_IFP(cbqp)), cbq_style(cbqp)); + return (EINVAL); + } + + /* Get pointers to parent and borrow classes. */ + parent = cbq_clh_to_clp(cbqp, parent_qid); + if (flags & CBQCLF_BORROW) + borrow = parent; + else + borrow = NULL; + + /* + * A class must borrow from its parent or it can not + * borrow at all. Hence, borrow can be null. + */ + if (parent == NULL && (flags & CBQCLF_ROOTCLASS) == 0) { + log(LOG_ERR, "%s: %s no parent class!\n", + if_name(CBQS_IFP(cbqp)), cbq_style(cbqp)); + return (EINVAL); + } + + if ((borrow != parent) && (borrow != NULL)) { + log(LOG_ERR, "%s: %s borrow class != parent\n", + if_name(CBQS_IFP(cbqp)), cbq_style(cbqp)); + return (EINVAL); + } + + /* + * check parameters + */ + switch (flags & CBQCLF_CLASSMASK) { + case CBQCLF_ROOTCLASS: + if (parent != NULL) { + log(LOG_ERR, "%s: %s parent exists\n", + if_name(CBQS_IFP(cbqp)), cbq_style(cbqp)); + return (EINVAL); + } + if (cbqp->ifnp.root_) { + log(LOG_ERR, "%s: %s root class exists\n", + if_name(CBQS_IFP(cbqp)), cbq_style(cbqp)); + return (EINVAL); + } + break; + case CBQCLF_DEFCLASS: + if (cbqp->ifnp.default_) { + log(LOG_ERR, "%s: %s default class exists\n", + if_name(CBQS_IFP(cbqp)), cbq_style(cbqp)); + return (EINVAL); + } + break; + case 0: + break; + default: + /* more than two flags bits set */ + log(LOG_ERR, "%s: %s invalid class flags 0x%x\n", + if_name(CBQS_IFP(cbqp)), cbq_style(cbqp), + (flags & CBQCLF_CLASSMASK)); + return (EINVAL); + } + + /* + * create a class. if this is a root class, initialize the + * interface. + */ + if ((flags & CBQCLF_CLASSMASK) == CBQCLF_ROOTCLASS) { + error = rmc_init(cbqp->ifnp.ifq_, &cbqp->ifnp, ns_per_byte, + cbqrestart, qid, qlimit, RM_MAXQUEUED, maxidle, minidle, + offtime, flags); + if (error != 0) + return (error); + cl = cbqp->ifnp.root_; + } else { + cl = rmc_newclass(priority, &cbqp->ifnp, ns_per_byte, + rmc_delay_action, qid, qlimit, parent, borrow, maxidle, + minidle, offtime, pktsize, flags); + } + if (cl == NULL) + return (ENOMEM); + + /* return handle to user space. */ + cl->stats_.handle = qid; + cl->stats_.depth = cl->depth_; + + /* save the allocated class */ + cbqp->cbq_class_tbl[i] = cl; + + if ((flags & CBQCLF_CLASSMASK) == CBQCLF_DEFCLASS) + cbqp->ifnp.default_ = cl; + + if (clp != NULL) + *clp = cl; + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s created qid=%d pri=%d qlimit=%d " + "flags=%b\n", if_name(CBQS_IFP(cbqp)), cbq_style(cbqp), + qid, priority, qlimit, flags, CBQCLF_BITS); + } + + return (0); +} + +int +cbq_remove_queue(cbq_state_t *cbqp, u_int32_t qid) +{ + struct rm_class *cl; + int i; + + IFCQ_LOCK_ASSERT_HELD(cbqp->ifnp.ifq_); + + if ((cl = cbq_clh_to_clp(cbqp, qid)) == NULL) + return (EINVAL); + + /* if we are a parent class, then return an error. 
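+ * (a parent class still has children attached; callers must delete the
+ * leaf classes first, the way cbq_clear_interface() below keeps
+ * iterating until no parents remain.)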
*/ + if (RMC_IS_A_PARENT_CLASS(cl)) + return (EINVAL); + + /* delete the class */ + rmc_delete_class(&cbqp->ifnp, cl); + + /* + * free the class handle + */ + for (i = 0; i < CBQ_MAX_CLASSES; i++) { + if (cbqp->cbq_class_tbl[i] == cl) { + cbqp->cbq_class_tbl[i] = NULL; + if (cl == cbqp->ifnp.root_) + cbqp->ifnp.root_ = NULL; + if (cl == cbqp->ifnp.default_) + cbqp->ifnp.default_ = NULL; + break; + } + } + return (0); +} + +/* + * int + * cbq_class_destroy(cbq_mod_state_t *, struct rm_class *) - This + * function destroys a given traffic class. Before destroying + * the class, all traffic for that class is released. + */ +static int +cbq_class_destroy(cbq_state_t *cbqp, struct rm_class *cl) +{ + int i; + + IFCQ_LOCK_ASSERT_HELD(cbqp->ifnp.ifq_); + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s destroyed qid=%d pri=%d\n", + if_name(CBQS_IFP(cbqp)), cbq_style(cbqp), + cl->stats_.handle, cl->pri_); + } + + /* delete the class */ + rmc_delete_class(&cbqp->ifnp, cl); + + /* + * free the class handle + */ + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == cl) + cbqp->cbq_class_tbl[i] = NULL; + + if (cl == cbqp->ifnp.root_) + cbqp->ifnp.root_ = NULL; + if (cl == cbqp->ifnp.default_) + cbqp->ifnp.default_ = NULL; + + return (0); +} + +/* convert class handle to class pointer */ +static struct rm_class * +cbq_clh_to_clp(cbq_state_t *cbqp, u_int32_t chandle) +{ + int i; + struct rm_class *cl; + + IFCQ_LOCK_ASSERT_HELD(cbqp->ifnp.ifq_); + + /* + * first, try optimistically the slot matching the lower bits of + * the handle. if it fails, do the linear table search. + */ + i = chandle % CBQ_MAX_CLASSES; + if ((cl = cbqp->cbq_class_tbl[i]) != NULL && + cl->stats_.handle == chandle) + return (cl); + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if ((cl = cbqp->cbq_class_tbl[i]) != NULL && + cl->stats_.handle == chandle) + return (cl); + return (NULL); +} + +static const char * +cbq_style(cbq_state_t *cbqp) +{ + return ((cbqp->cbq_flags & CBQSF_ALTQ) ? 
"ALTQ_CBQ" : "CBQ"); +} + +static int +cbq_clear_interface(cbq_state_t *cbqp) +{ + int again, i; + struct rm_class *cl; + + IFCQ_LOCK_ASSERT_HELD(cbqp->ifnp.ifq_); + + /* clear out the classes now */ + do { + again = 0; + for (i = 0; i < CBQ_MAX_CLASSES; i++) { + if ((cl = cbqp->cbq_class_tbl[i]) != NULL) { + if (RMC_IS_A_PARENT_CLASS(cl)) + again++; + else { + cbq_class_destroy(cbqp, cl); + cbqp->cbq_class_tbl[i] = NULL; + if (cl == cbqp->ifnp.root_) + cbqp->ifnp.root_ = NULL; + if (cl == cbqp->ifnp.default_) + cbqp->ifnp.default_ = NULL; + } + } + } + } while (again); + + return (0); +} + +/* copy the stats info in rm_class to class_states_t */ +int +cbq_get_class_stats(cbq_state_t *cbqp, u_int32_t qid, class_stats_t *statsp) +{ + struct rm_class *cl; + + IFCQ_LOCK_ASSERT_HELD(cbqp->ifnp.ifq_); + + if ((cl = cbq_clh_to_clp(cbqp, qid)) == NULL) + return (EINVAL); + + statsp->xmit_cnt = cl->stats_.xmit_cnt; + statsp->drop_cnt = cl->stats_.drop_cnt; + statsp->over = cl->stats_.over; + statsp->borrows = cl->stats_.borrows; + statsp->overactions = cl->stats_.overactions; + statsp->delays = cl->stats_.delays; + + statsp->depth = cl->depth_; + statsp->priority = cl->pri_; + statsp->maxidle = cl->maxidle_; + statsp->minidle = cl->minidle_; + statsp->offtime = cl->offtime_; + statsp->qmax = qlimit(&cl->q_); + statsp->ns_per_byte = cl->ns_per_byte_; + statsp->wrr_allot = cl->w_allotment_; + statsp->qcnt = qlen(&cl->q_); + statsp->avgidle = cl->avgidle_; + + statsp->qtype = qtype(&cl->q_); + statsp->qstate = qstate(&cl->q_); +#if CLASSQ_RED + if (q_is_red(&cl->q_)) + red_getstats(cl->red_, &statsp->red[0]); +#endif /* CLASSQ_RED */ +#if CLASSQ_RIO + if (q_is_rio(&cl->q_)) + rio_getstats(cl->rio_, &statsp->red[0]); +#endif /* CLASSQ_RIO */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->q_)) + blue_getstats(cl->blue_, &statsp->blue); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->q_) && cl->sfb_ != NULL) + sfb_getstats(cl->sfb_, &statsp->sfb); + + return (0); +} + +int +cbq_enqueue(cbq_state_t *cbqp, struct rm_class *cl, struct mbuf *m, + struct pf_mtag *t) +{ + struct ifclassq *ifq = cbqp->ifnp.ifq_; + int len, ret; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + /* grab class set by classifier */ + if (!(m->m_flags & M_PKTHDR)) { + /* should not happen */ + log(LOG_ERR, "%s: packet for %s does not have pkthdr\n", + if_name(ifq->ifcq_ifp)); + IFCQ_CONVERT_LOCK(ifq); + m_freem(m); + return (ENOBUFS); + } + + if (cl == NULL) { + cl = cbq_clh_to_clp(cbqp, t->pftag_qid); + if (cl == NULL) { + cl = cbqp->ifnp.default_; + if (cl == NULL) { + IFCQ_CONVERT_LOCK(ifq); + m_freem(m); + return (ENOBUFS); + } + } + } + + len = m_pktlen(m); + + ret = rmc_queue_packet(cl, m, t); + if (ret != 0) { + if (ret == CLASSQEQ_SUCCESS_FC) { + /* packet enqueued, return advisory feedback */ + ret = EQFULL; + } else { + VERIFY(ret == CLASSQEQ_DROPPED || + ret == CLASSQEQ_DROPPED_FC || + ret == CLASSQEQ_DROPPED_SP); + /* packet has been freed in rmc_queue_packet */ + PKTCNTR_ADD(&cl->stats_.drop_cnt, 1, len); + IFCQ_DROP_ADD(ifq, 1, len); + switch (ret) { + case CLASSQEQ_DROPPED: + return (ENOBUFS); + case CLASSQEQ_DROPPED_FC: + return (EQFULL); + case CLASSQEQ_DROPPED_SP: + return (EQSUSPENDED); + } + /* NOT REACHED */ + } + } + + /* successfully queued. 
*/ + ++cbqp->cbq_qlen; + IFCQ_INC_LEN(ifq); + + return (ret); +} + +struct mbuf * +cbq_dequeue(cbq_state_t *cbqp, cqdq_op_t op) +{ + struct ifclassq *ifq = cbqp->ifnp.ifq_; + struct mbuf *m; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + m = rmc_dequeue_next(&cbqp->ifnp, op); + + if (m && op == CLASSQDQ_REMOVE) { + --cbqp->cbq_qlen; /* decrement # of packets in cbq */ + IFCQ_DEC_LEN(ifq); + IFCQ_XMIT_ADD(ifq, 1, m_pktlen(m)); + + /* Update the class. */ + rmc_update_class_util(&cbqp->ifnp); + } + return (m); +} + +/* + * void + * cbqrestart(queue_t *) - Restart sending of data. + * called from rmc_restart via timeout after waking up + * a suspended class. + * Returns: NONE + */ + +static void +cbqrestart(struct ifclassq *ifq) +{ + u_int32_t qlen; + + IFCQ_LOCK(ifq); + qlen = IFCQ_LEN(ifq); + IFCQ_UNLOCK(ifq); + + if (qlen > 0) + ifnet_start(ifq->ifcq_ifp); +} + +void +cbq_purge(cbq_state_t *cbqp) +{ + struct rm_class *cl; + int i; + + IFCQ_LOCK_ASSERT_HELD(cbqp->ifnp.ifq_); + + for (i = 0; i < CBQ_MAX_CLASSES; i++) { + if ((cl = cbqp->cbq_class_tbl[i]) != NULL) { + if (!qempty(&cl->q_) && pktsched_verbose) { + log(LOG_DEBUG, "%s: %s purge qid=%d pri=%d " + "qlen=%d\n", if_name(CBQS_IFP(cbqp)), + cbq_style(cbqp), cl->stats_.handle, + cl->pri_, qlen(&cl->q_)); + } + rmc_dropall(cl); + } + } +} + +void +cbq_event(cbq_state_t *cbqp, cqev_t ev) +{ + struct rm_class *cl; + int i; + + IFCQ_LOCK_ASSERT_HELD(cbqp->ifnp.ifq_); + + for (i = 0; i < CBQ_MAX_CLASSES; i++) { + if ((cl = cbqp->cbq_class_tbl[i]) != NULL) { + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s update qid=%d pri=%d " + "event=%s\n", if_name(CBQS_IFP(cbqp)), + cbq_style(cbqp), cl->stats_.handle, + cl->pri_, ifclassq_ev2str(ev)); + } + rmc_updateq(cl, ev); + } + } +} + +int +cqb_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags) +{ +#pragma unused(ifq, flags) + return (ENXIO); /* not yet */ +} + +int +cbq_teardown_ifclassq(struct ifclassq *ifq) +{ + cbq_state_t *cbqp = ifq->ifcq_disc; + int i; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(cbqp != NULL && ifq->ifcq_type == PKTSCHEDT_CBQ); + + (void) cbq_destroy_locked(cbqp); + + ifq->ifcq_disc = NULL; + for (i = 0; i < IFCQ_SC_MAX; i++) { + ifq->ifcq_disc_slots[i].qid = 0; + ifq->ifcq_disc_slots[i].cl = NULL; + } + + return (ifclassq_detach(ifq)); +} + +int +cbq_getqstats_ifclassq(struct ifclassq *ifq, u_int32_t slot, + struct if_ifclassq_stats *ifqs) +{ + cbq_state_t *cbqp = ifq->ifcq_disc; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(ifq->ifcq_type == PKTSCHEDT_CBQ); + + if (slot >= IFCQ_SC_MAX) + return (EINVAL); + + return (cbq_get_class_stats(cbqp, ifq->ifcq_disc_slots[slot].qid, + &ifqs->ifqs_cbq_stats)); +} +#endif /* PKTSCHED_CBQ */ diff --git a/bsd/net/pktsched/pktsched_cbq.h b/bsd/net/pktsched/pktsched_cbq.h new file mode 100644 index 000000000..15fe1b0b3 --- /dev/null +++ b/bsd/net/pktsched/pktsched_cbq.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $NetBSD: altq_cbq.h,v 1.8 2006/10/12 19:59:08 peter Exp $ */ +/* $KAME: altq_cbq.h,v 1.12 2003/10/03 05:05:15 kjc Exp $ */ + +/* + * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. 
+ */ + +#ifndef _NET_PKTSCHED_PKTSCHED_CBQ_H_ +#define _NET_PKTSCHED_PKTSCHED_CBQ_H_ + +#ifdef PRIVATE +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* class flags should be same as class flags in rm_class.h */ +#define CBQCLF_RED RMCF_RED /* use RED */ +#define CBQCLF_ECN RMCF_ECN /* use ECN with RED/BLUE/SFB */ +#define CBQCLF_RIO RMCF_RIO /* use RIO */ +#define CBQCLF_FLOWVALVE RMCF_FLOWVALVE /* use flowvalve/penalty-box */ +#define CBQCLF_CLEARDSCP RMCF_CLEARDSCP /* clear diffserv codepoint */ +#define CBQCLF_BORROW 0x0020 /* borrow from parent */ + +/* class flags only for root class */ +#define CBQCLF_WRR RMCF_WRR /* weighted-round robin */ +#define CBQCLF_EFFICIENT RMCF_EFFICIENT /* work-conserving */ + +/* class flags for special classes */ +#define CBQCLF_ROOTCLASS 0x1000 /* root class */ +#define CBQCLF_DEFCLASS 0x2000 /* default class */ +#define CBQCLF_CLASSMASK 0xf000 /* class mask */ + +#define CBQCLF_BLUE RMCF_BLUE /* use BLUE */ +#define CBQCLF_SFB RMCF_SFB /* use SFB */ +#define CBQCLF_FLOWCTL RMCF_FLOWCTL /* enable flow ctl advisories */ + +#ifdef BSD_KERNEL_PRIVATE +#define CBQCLF_LAZY 0x10000000 /* on-demand resource allocation */ +#endif /* BSD_KERNEL_PRIVATE */ + +#define CBQCLF_USERFLAGS \ + (CBQCLF_RED | CBQCLF_ECN | CBQCLF_RIO | CBQCLF_FLOWVALVE | \ + CBQCLF_CLEARDSCP | CBQCLF_BORROW | CBQCLF_WRR | CBQCLF_EFFICIENT | \ + CBQCLF_ROOTCLASS | CBQCLF_DEFCLASS | CBQCLF_BLUE | CBQCLF_SFB | \ + CBQCLF_FLOWCTL) + +#ifdef BSD_KERNEL_PRIVATE +#define CBQCLF_BITS \ + "\020\1RED\2ECN\3RIO\4FLOWVALVE\5CLEARDSCP\6BORROW" \ + "\11WRR\12EFFICIENT\15ROOT\16DEFAULT\21BLUE\22SFB\23FLOWCTL\35LAZY" +#else +#define CBQCLF_BITS \ + "\020\1RED\2ECN\3RIO\4FLOWVALVE\5CLEARDSCP\6BORROW" \ + "\11WRR\12EFFICIENT\15ROOT\16DEFAULT\21BLUE\22SFB\23FLOWCTL" +#endif /* !BSD_KERNEL_PRIVATE */ + +#define CBQ_MAXQSIZE 200 +#define CBQ_MAXPRI RM_MAXPRIO + +typedef struct cbq_classstats { + u_int32_t handle; + u_int32_t depth; + + struct pktcntr xmit_cnt; /* packets sent in this class */ + struct pktcntr drop_cnt; /* dropped packets */ + u_int32_t over; /* # times went over limit */ + u_int32_t borrows; /* # times tried to borrow */ + u_int32_t overactions; /* # times invoked overlimit action */ + u_int32_t delays; /* # times invoked delay actions */ + + /* other static class parameters useful for debugging */ + int priority; + int maxidle; + int minidle; + int offtime; + int qmax; + int ns_per_byte; + int wrr_allot; + + int qcnt; /* # packets in queue */ + int avgidle; + + /* RED, RIO, BLUE, SFB related info */ + classq_type_t qtype; + union { + /* RIO has 3 red stats */ + struct red_stats red[RIO_NDROPPREC]; + struct blue_stats blue; + struct sfb_stats sfb; + }; + classq_state_t qstate; +} class_stats_t; + +#ifdef BSD_KERNEL_PRIVATE +/* + * Define macros only good for kernel drivers and modules. + */ +#define CBQ_WATCHDOG (hz / 20) +#define CBQ_TIMEOUT 10 +#define CBQ_LS_TIMEOUT (20 * hz / 1000) + +#define CBQ_MAX_CLASSES 256 + +/* cbqstate flags */ +#define CBQSF_ALTQ 0x1 /* configured via PF/ALTQ */ + +/* + * Define State structures. 
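+ * A typical (hypothetical) call flow, going by the externs below:
+ * cbq_alloc(), then cbq_add_queue() per class, cbq_enqueue() and
+ * cbq_dequeue() per packet, and finally cbq_destroy().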
+ */ +typedef struct cbqstate { + int cbq_qlen; /* # of packets in cbq */ + u_int32_t cbq_flags; /* flags */ + struct rm_class *cbq_class_tbl[CBQ_MAX_CLASSES]; + + struct rm_ifdat ifnp; + struct callout cbq_callout; /* for timeouts */ +} cbq_state_t; + +#define CBQS_IFP(_cs) ((_cs)->ifnp.ifq_->ifcq_ifp) + +extern void cbq_init(void); +extern cbq_state_t *cbq_alloc(struct ifnet *, int, boolean_t); +extern int cbq_destroy(cbq_state_t *); +extern void cbq_purge(cbq_state_t *); +extern void cbq_event(cbq_state_t *, cqev_t); +extern int cbq_add_queue(cbq_state_t *, u_int32_t, u_int32_t, u_int32_t, + u_int32_t, u_int32_t, u_int32_t, u_int32_t, u_int32_t, int, u_int32_t, + u_int32_t, u_int32_t, u_int32_t, struct rm_class **); +extern int cbq_remove_queue(cbq_state_t *, u_int32_t); +extern int cbq_get_class_stats(cbq_state_t *, u_int32_t, class_stats_t *); +extern int cbq_enqueue(cbq_state_t *, struct rm_class *, struct mbuf *, + struct pf_mtag *); +extern struct mbuf *cbq_dequeue(cbq_state_t *, cqdq_op_t); +extern int cqb_setup_ifclassq(struct ifclassq *, u_int32_t); +extern int cbq_teardown_ifclassq(struct ifclassq *); +extern int cbq_getqstats_ifclassq(struct ifclassq *, u_int32_t, + struct if_ifclassq_stats *); +#endif /* BSD_KERNEL_PRIVATE */ +#ifdef __cplusplus +} +#endif +#endif /* PRIVATE */ +#endif /* !_NET_PKTSCHED_PKTSCHED_CBQ_H_ */ diff --git a/bsd/net/pktsched/pktsched_fairq.c b/bsd/net/pktsched/pktsched_fairq.c new file mode 100644 index 000000000..f5fc7356b --- /dev/null +++ b/bsd/net/pktsched/pktsched_fairq.c @@ -0,0 +1,1290 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 2008 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $DragonFly: src/sys/net/altq/altq_fairq.c,v 1.2 2008/05/14 11:59:23 sephe Exp $ + */ +/* + * Matt: I gutted altq_priq.c and used it as a skeleton on which to build + * fairq. The fairq algorithm is completely different then priq, of course, + * but because I used priq's skeleton I believe I should include priq's + * copyright. + * + * Copyright (C) 2000-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * FAIRQ - take traffic classified by keep state (hashed into + * pf->pftag_flowhash) and bucketize it. Fairly extract + * the first packet from each bucket in a round-robin fashion. + * + * TODO - better overall qlimit support (right now it is per-bucket). + * - NOTE: red etc is per bucket, not overall. + * - better service curve support. 
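+ *
+ * A simplified sketch of the mechanism (field names illustrative; the
+ * real fairq_selectq() below also applies per-bucket rate accounting):
+ *
+ *	enqueue: b = &cl->cl_buckets[pftag_flowhash & (nbuckets - 1)];
+ *	         append the packet to b's queue;
+ *	dequeue: starting at the last serviced bucket, advance
+ *	         round-robin to the first non-empty bucket and
+ *	         emit its head packet.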
+ *
+ * EXAMPLE:
+ *
+ *  altq on em0 fairq bandwidth 650Kb queue { std, bulk }
+ *  queue std priority 3 bandwidth 200Kb \
+ *	fairq (buckets 64, default, hogs 1Kb) qlimit 50
+ *  queue bulk priority 2 bandwidth 100Kb \
+ *	fairq (buckets 64, hogs 1Kb) qlimit 50
+ *
+ * NOTE: When the aggregate bandwidth is less than the link bandwidth
+ *	 any remaining bandwidth is dynamically assigned using the
+ *	 existing bandwidth specs as weightings.
+ *
+ *  pass out on em0 from any to any keep state queue std
+ *  pass out on em0 inet proto tcp ..... port ... keep state queue bulk
+ */
+
+#if PKTSCHED_FAIRQ
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/kernel.h>
+#include <sys/syslog.h>
+
+#include <kern/zalloc.h>
+
+#include <net/if.h>
+#include <net/net_osdep.h>
+
+#include <net/pktsched/pktsched_fairq.h>
+#include <netinet/in.h>
+
+/*
+ * function prototypes
+ */
+#if 0
+static int fairq_enqueue_ifclassq(struct ifclassq *, struct mbuf *);
+static struct mbuf *fairq_dequeue_ifclassq(struct ifclassq *, cqdq_op_t);
+static int fairq_request_ifclassq(struct ifclassq *, cqrq_t, void *);
+#endif
+static int fairq_clear_interface(struct fairq_if *);
+static inline int fairq_addq(struct fairq_class *, struct mbuf *,
+    struct pf_mtag *);
+static inline struct mbuf *fairq_getq(struct fairq_class *, u_int64_t);
+static inline struct mbuf *fairq_pollq(struct fairq_class *, u_int64_t, int *);
+static fairq_bucket_t *fairq_selectq(struct fairq_class *, int);
+static void fairq_purgeq(struct fairq_if *, struct fairq_class *, u_int32_t,
+    u_int32_t *, u_int32_t *);
+static void fairq_updateq(struct fairq_if *, struct fairq_class *, cqev_t);
+static struct fairq_class *fairq_class_create(struct fairq_if *, int, u_int32_t,
+    u_int64_t, u_int32_t, int, u_int64_t, u_int64_t, u_int64_t, u_int64_t,
+    u_int32_t);
+static int fairq_class_destroy(struct fairq_if *, struct fairq_class *);
+static int fairq_destroy_locked(struct fairq_if *);
+static inline struct fairq_class *fairq_clh_to_clp(struct fairq_if *,
+    u_int32_t);
+static const char *fairq_style(struct fairq_if *);
+
+#define	FAIRQ_ZONE_MAX	32		/* maximum elements in zone */
+#define	FAIRQ_ZONE_NAME	"pktsched_fairq" /* zone name */
+
+static unsigned int fairq_size;		/* size of zone element */
+static struct zone *fairq_zone;		/* zone for fairq */
+
+#define	FAIRQ_CL_ZONE_MAX	32	/* maximum elements in zone */
+#define	FAIRQ_CL_ZONE_NAME	"pktsched_fairq_cl" /* zone name */
+
+static unsigned int fairq_cl_size;	/* size of zone element */
+static struct zone *fairq_cl_zone;	/* zone for fairq_class */
+
+void
+fairq_init(void)
+{
+	fairq_size = sizeof (struct fairq_if);
+	fairq_zone = zinit(fairq_size, FAIRQ_ZONE_MAX * fairq_size,
+	    0, FAIRQ_ZONE_NAME);
+	if (fairq_zone == NULL) {
+		panic("%s: failed allocating %s", __func__, FAIRQ_ZONE_NAME);
+		/* NOTREACHED */
+	}
+	zone_change(fairq_zone, Z_EXPAND, TRUE);
+	zone_change(fairq_zone, Z_CALLERACCT, TRUE);
+
+	fairq_cl_size = sizeof (struct fairq_class);
+	fairq_cl_zone = zinit(fairq_cl_size, FAIRQ_CL_ZONE_MAX * fairq_cl_size,
+	    0, FAIRQ_CL_ZONE_NAME);
+	if (fairq_cl_zone == NULL) {
+		panic("%s: failed allocating %s", __func__, FAIRQ_CL_ZONE_NAME);
+		/* NOTREACHED */
+	}
+	zone_change(fairq_cl_zone, Z_EXPAND, TRUE);
+	zone_change(fairq_cl_zone, Z_CALLERACCT, TRUE);
+}
+
+struct fairq_if *
+fairq_alloc(struct ifnet *ifp, int how, boolean_t altq)
+{
+	struct fairq_if *fif;
+
+	fif = (how == M_WAITOK) ?
+ zalloc(fairq_zone) : zalloc_noblock(fairq_zone); + if (fif == NULL) + return (NULL); + + bzero(fif, fairq_size); + fif->fif_maxpri = -1; + fif->fif_ifq = &ifp->if_snd; + if (altq) + fif->fif_flags |= FAIRQIFF_ALTQ; + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s scheduler allocated\n", + if_name(ifp), fairq_style(fif)); + } + + return (fif); +} + +int +fairq_destroy(struct fairq_if *fif) +{ + struct ifclassq *ifq = fif->fif_ifq; + int err; + + IFCQ_LOCK(ifq); + err = fairq_destroy_locked(fif); + IFCQ_UNLOCK(ifq); + + return (err); +} + +static int +fairq_destroy_locked(struct fairq_if *fif) +{ + IFCQ_LOCK_ASSERT_HELD(fif->fif_ifq); + + (void) fairq_clear_interface(fif); + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s scheduler destroyed\n", + if_name(FAIRQIF_IFP(fif)), fairq_style(fif)); + } + + zfree(fairq_zone, fif); + + return (0); +} + +/* + * bring the interface back to the initial state by discarding + * all the filters and classes. + */ +static int +fairq_clear_interface(struct fairq_if *fif) +{ + struct fairq_class *cl; + int pri; + + IFCQ_LOCK_ASSERT_HELD(fif->fif_ifq); + + /* clear out the classes */ + for (pri = 0; pri <= fif->fif_maxpri; pri++) + if ((cl = fif->fif_classes[pri]) != NULL) + fairq_class_destroy(fif, cl); + + return (0); +} + +/* discard all the queued packets on the interface */ +void +fairq_purge(struct fairq_if *fif) +{ + struct fairq_class *cl; + int pri; + + IFCQ_LOCK_ASSERT_HELD(fif->fif_ifq); + + for (pri = 0; pri <= fif->fif_maxpri; pri++) { + if ((cl = fif->fif_classes[pri]) != NULL && cl->cl_head) + fairq_purgeq(fif, cl, 0, NULL, NULL); + } +#if !PF_ALTQ + /* + * This assertion is safe to be made only when PF_ALTQ is not + * configured; otherwise, IFCQ_LEN represents the sum of the + * packets managed by ifcq_disc and altq_disc instances, which + * is possible when transitioning between the two. 
+ */ + VERIFY(IFCQ_LEN(fif->fif_ifq) == 0); +#endif /* !PF_ALTQ */ +} + +void +fairq_event(struct fairq_if *fif, cqev_t ev) +{ + struct fairq_class *cl; + int pri; + + IFCQ_LOCK_ASSERT_HELD(fif->fif_ifq); + + for (pri = 0; pri <= fif->fif_maxpri; pri++) + if ((cl = fif->fif_classes[pri]) != NULL) + fairq_updateq(fif, cl, ev); +} + +int +fairq_add_queue(struct fairq_if *fif, int priority, u_int32_t qlimit, + u_int64_t bandwidth, u_int32_t nbuckets, int flags, u_int64_t hogs_m1, + u_int64_t lssc_m1, u_int64_t lssc_d, u_int64_t lssc_m2, u_int32_t qid, + struct fairq_class **clp) +{ + struct fairq_class *cl; + + IFCQ_LOCK_ASSERT_HELD(fif->fif_ifq); + + /* check parameters */ + if (priority >= FAIRQ_MAXPRI) + return (EINVAL); + if (bandwidth == 0 || (bandwidth / 8) == 0) + return (EINVAL); + if (fif->fif_classes[priority] != NULL) + return (EBUSY); + if (fairq_clh_to_clp(fif, qid) != NULL) + return (EBUSY); + + cl = fairq_class_create(fif, priority, qlimit, bandwidth, + nbuckets, flags, hogs_m1, lssc_m1, lssc_d, lssc_m2, qid); + if (cl == NULL) + return (ENOMEM); + + if (clp != NULL) + *clp = cl; + + return (0); +} + +static struct fairq_class * +fairq_class_create(struct fairq_if *fif, int pri, u_int32_t qlimit, + u_int64_t bandwidth, u_int32_t nbuckets, int flags, u_int64_t hogs_m1, + u_int64_t lssc_m1, u_int64_t lssc_d, u_int64_t lssc_m2, u_int32_t qid) +{ +#pragma unused(lssc_d, lssc_m2) + struct ifnet *ifp; + struct ifclassq *ifq; + struct fairq_class *cl; + u_int32_t i; + + IFCQ_LOCK_ASSERT_HELD(fif->fif_ifq); + + /* Sanitize flags unless internally configured */ + if (fif->fif_flags & FAIRQIFF_ALTQ) + flags &= FARF_USERFLAGS; + +#if !CLASSQ_RED + if (flags & FARF_RED) { + log(LOG_ERR, "%s: %s RED not available!\n", + if_name(FAIRQIF_IFP(fif)), fairq_style(fif)); + return (NULL); + } +#endif /* !CLASSQ_RED */ + +#if !CLASSQ_RIO + if (flags & FARF_RIO) { + log(LOG_ERR, "%s: %s RIO not available!\n", + if_name(FAIRQIF_IFP(fif)), fairq_style(fif)); + return (NULL); + } +#endif /* CLASSQ_RIO */ + +#if !CLASSQ_BLUE + if (flags & FARF_BLUE) { + log(LOG_ERR, "%s: %s BLUE not available!\n", + if_name(FAIRQIF_IFP(fif)), fairq_style(fif)); + return (NULL); + } +#endif /* CLASSQ_BLUE */ + + /* These are mutually exclusive */ + if ((flags & (FARF_RED|FARF_RIO|FARF_BLUE|FARF_SFB)) && + (flags & (FARF_RED|FARF_RIO|FARF_BLUE|FARF_SFB)) != FARF_RED && + (flags & (FARF_RED|FARF_RIO|FARF_BLUE|FARF_SFB)) != FARF_RIO && + (flags & (FARF_RED|FARF_RIO|FARF_BLUE|FARF_SFB)) != FARF_BLUE && + (flags & (FARF_RED|FARF_RIO|FARF_BLUE|FARF_SFB)) != FARF_SFB) { + log(LOG_ERR, "%s: %s more than one RED|RIO|BLUE|SFB\n", + if_name(FAIRQIF_IFP(fif)), fairq_style(fif)); + return (NULL); + } + + if (bandwidth == 0 || (bandwidth / 8) == 0) { + log(LOG_ERR, "%s: %s invalid data rate %llu\n", + if_name(FAIRQIF_IFP(fif)), fairq_style(fif), bandwidth); + return (NULL); + } + + if (nbuckets == 0) + nbuckets = 256; + if (nbuckets > FAIRQ_MAX_BUCKETS) + nbuckets = FAIRQ_MAX_BUCKETS; + /* enforce power-of-2 size */ + while ((nbuckets ^ (nbuckets - 1)) != ((nbuckets << 1) - 1)) + ++nbuckets; + + ifq = fif->fif_ifq; + ifp = FAIRQIF_IFP(fif); + + if ((cl = fif->fif_classes[pri]) != NULL) { + /* modify the class instead of creating a new one */ + if (cl->cl_head) + fairq_purgeq(fif, cl, 0, NULL, NULL); +#if CLASSQ_RIO + if (cl->cl_qtype == Q_RIO) + rio_destroy(cl->cl_rio); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (cl->cl_qtype == Q_RED) + red_destroy(cl->cl_red); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (cl->cl_qtype == 
Q_BLUE) + blue_destroy(cl->cl_blue); +#endif /* CLASSQ_BLUE */ + if (cl->cl_qtype == Q_SFB && cl->cl_sfb != NULL) + sfb_destroy(cl->cl_sfb); + cl->cl_qalg.ptr = NULL; + cl->cl_qtype = Q_DROPTAIL; + cl->cl_qstate = QS_RUNNING; + } else { + cl = zalloc(fairq_cl_zone); + if (cl == NULL) + goto err_ret; + bzero(cl, fairq_cl_size); + cl->cl_nbuckets = nbuckets; + cl->cl_nbucket_mask = nbuckets - 1; + + cl->cl_buckets = _MALLOC(sizeof (struct fairq_bucket) * + cl->cl_nbuckets, M_DEVBUF, M_WAITOK|M_ZERO); + if (cl->cl_buckets == NULL) + goto err_buckets; + cl->cl_head = NULL; + } + + fif->fif_classes[pri] = cl; + if (flags & FARF_DEFAULTCLASS) + fif->fif_default = cl; + if (qlimit == 0 || qlimit > IFCQ_MAXLEN(ifq)) { + qlimit = IFCQ_MAXLEN(ifq); + if (qlimit == 0) + qlimit = DEFAULT_QLIMIT; /* use default */ + } + cl->cl_qlimit = qlimit; + for (i = 0; i < cl->cl_nbuckets; ++i) { + _qinit(&cl->cl_buckets[i].queue, Q_DROPTAIL, qlimit); + } + cl->cl_bandwidth = bandwidth / 8; /* cvt to bytes per second */ + cl->cl_qtype = Q_DROPTAIL; + cl->cl_qstate = QS_RUNNING; + cl->cl_flags = flags; + cl->cl_pri = pri; + if (pri > fif->fif_maxpri) + fif->fif_maxpri = pri; + cl->cl_fif = fif; + cl->cl_handle = qid; + cl->cl_hogs_m1 = hogs_m1 / 8; + cl->cl_lssc_m1 = lssc_m1 / 8; /* NOT YET USED */ + cl->cl_bw_current = 0; + + if (flags & (FARF_RED|FARF_RIO|FARF_BLUE|FARF_SFB)) { +#if CLASSQ_RED || CLASSQ_RIO + u_int64_t ifbandwidth = ifnet_output_linkrate(ifp); + int pkttime; +#endif /* CLASSQ_RED || CLASSQ_RIO */ + + cl->cl_qflags = 0; + if (flags & FARF_ECN) { + if (flags & FARF_BLUE) + cl->cl_qflags |= BLUEF_ECN; + else if (flags & FARF_SFB) + cl->cl_qflags |= SFBF_ECN; + else if (flags & FARF_RED) + cl->cl_qflags |= REDF_ECN; + else if (flags & FARF_RIO) + cl->cl_qflags |= RIOF_ECN; + } + if (flags & FARF_FLOWCTL) { + if (flags & FARF_SFB) + cl->cl_qflags |= SFBF_FLOWCTL; + } + if (flags & FARF_CLEARDSCP) { + if (flags & FARF_RIO) + cl->cl_qflags |= RIOF_CLEARDSCP; + } +#if CLASSQ_RED || CLASSQ_RIO + /* + * XXX: RED & RIO should be watching link speed and MTU + * events and recompute pkttime accordingly. 
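+		 *
+		 * For a rough feel of the numbers (illustrative only): a
+		 * 1500-byte MTU on a 1Gbps link (ifbandwidth / 8 ==
+		 * 125000000 bytes/sec) gives pkttime =
+		 * 1500 * 10^9 / 125000000 = 12000 nsec per packet.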
+ */ + if (ifbandwidth < 8) + pkttime = 1000 * 1000 * 1000; /* 1 sec */ + else + pkttime = (int64_t)ifp->if_mtu * 1000 * 1000 * 1000 / + (ifbandwidth / 8); + + /* Test for exclusivity {RED,RIO,BLUE,SFB} was done above */ +#if CLASSQ_RIO + if (flags & FARF_RIO) { + cl->cl_rio = + rio_alloc(ifp, 0, NULL, cl->cl_qflags, pkttime); + if (cl->cl_rio != NULL) + cl->cl_qtype = Q_RIO; + } +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (flags & FARF_RED) { + cl->cl_red = red_alloc(ifp, 0, 0, + cl->cl_qlimit * 10/100, + cl->cl_qlimit * 30/100, + cl->cl_qflags, pkttime); + if (cl->cl_red != NULL) + cl->cl_qtype = Q_RED; + } +#endif /* CLASSQ_RED */ +#endif /* CLASSQ_RED || CLASSQ_RIO */ +#if CLASSQ_BLUE + if (flags & FARF_BLUE) { + cl->cl_blue = blue_alloc(ifp, 0, 0, cl->cl_qflags); + if (cl->cl_blue != NULL) + cl->cl_qtype = Q_BLUE; + } +#endif /* CLASSQ_BLUE */ + if (flags & FARF_SFB) { + if (!(cl->cl_flags & FARF_LAZY)) + cl->cl_sfb = sfb_alloc(ifp, cl->cl_handle, + cl->cl_qlimit, cl->cl_qflags); + if (cl->cl_sfb != NULL || (cl->cl_flags & FARF_LAZY)) + cl->cl_qtype = Q_SFB; + } + } + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s created qid=%d pri=%d qlimit=%d " + "flags=%b\n", if_name(ifp), fairq_style(fif), + cl->cl_handle, cl->cl_pri, cl->cl_qlimit, flags, FARF_BITS); + } + + return (cl); + +err_buckets: + if (cl->cl_buckets != NULL) + _FREE(cl->cl_buckets, M_DEVBUF); +err_ret: + if (cl != NULL) { + if (cl->cl_qalg.ptr != NULL) { +#if CLASSQ_RIO + if (cl->cl_qtype == Q_RIO) + rio_destroy(cl->cl_rio); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (cl->cl_qtype == Q_RED) + red_destroy(cl->cl_red); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (cl->cl_qtype == Q_BLUE) + blue_destroy(cl->cl_blue); +#endif /* CLASSQ_BLUE */ + if (cl->cl_qtype == Q_SFB && cl->cl_sfb != NULL) + sfb_destroy(cl->cl_sfb); + cl->cl_qalg.ptr = NULL; + cl->cl_qtype = Q_DROPTAIL; + cl->cl_qstate = QS_RUNNING; + } + zfree(fairq_cl_zone, cl); + } + return (NULL); +} + +int +fairq_remove_queue(struct fairq_if *fif, u_int32_t qid) +{ + struct fairq_class *cl; + + IFCQ_LOCK_ASSERT_HELD(fif->fif_ifq); + + if ((cl = fairq_clh_to_clp(fif, qid)) == NULL) + return (EINVAL); + + return (fairq_class_destroy(fif, cl)); +} + +static int +fairq_class_destroy(struct fairq_if *fif, struct fairq_class *cl) +{ + struct ifclassq *ifq = fif->fif_ifq; + int pri; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if (cl->cl_head) + fairq_purgeq(fif, cl, 0, NULL, NULL); + + fif->fif_classes[cl->cl_pri] = NULL; + if (fif->fif_poll_cache == cl) + fif->fif_poll_cache = NULL; + if (fif->fif_maxpri == cl->cl_pri) { + for (pri = cl->cl_pri; pri >= 0; pri--) + if (fif->fif_classes[pri] != NULL) { + fif->fif_maxpri = pri; + break; + } + if (pri < 0) + fif->fif_maxpri = -1; + } + + if (cl->cl_qalg.ptr != NULL) { +#if CLASSQ_RIO + if (cl->cl_qtype == Q_RIO) + rio_destroy(cl->cl_rio); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (cl->cl_qtype == Q_RED) + red_destroy(cl->cl_red); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (cl->cl_qtype == Q_BLUE) + blue_destroy(cl->cl_blue); +#endif /* CLASSQ_BLUE */ + if (cl->cl_qtype == Q_SFB && cl->cl_sfb != NULL) + sfb_destroy(cl->cl_sfb); + cl->cl_qalg.ptr = NULL; + cl->cl_qtype = Q_DROPTAIL; + cl->cl_qstate = QS_RUNNING; + } + + if (fif->fif_default == cl) + fif->fif_default = NULL; + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s destroyed qid=%d pri=%d\n", + if_name(FAIRQIF_IFP(fif)), fairq_style(fif), + cl->cl_handle, cl->cl_pri); + } + + _FREE(cl->cl_buckets, M_DEVBUF); + cl->cl_head = NULL; /* sanity */ + 
cl->cl_polled = NULL; /* sanity */ + cl->cl_buckets = NULL; /* sanity */ + + zfree(fairq_cl_zone, cl); + + return (0); +} + +int +fairq_enqueue(struct fairq_if *fif, struct fairq_class *cl, struct mbuf *m, + struct pf_mtag *t) +{ + struct ifclassq *ifq = fif->fif_ifq; + int len, ret; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(cl == NULL || cl->cl_fif == fif); + + if (cl == NULL) { + cl = fairq_clh_to_clp(fif, t->pftag_qid); + if (cl == NULL) { + cl = fif->fif_default; + if (cl == NULL) { + IFCQ_CONVERT_LOCK(ifq); + m_freem(m); + return (ENOBUFS); + } + } + } + + cl->cl_flags |= FARF_HAS_PACKETS; + len = m_pktlen(m); + + ret = fairq_addq(cl, m, t); + if (ret != 0) { + if (ret == CLASSQEQ_SUCCESS_FC) { + /* packet enqueued, return advisory feedback */ + ret = EQFULL; + } else { + VERIFY(ret == CLASSQEQ_DROPPED || + ret == CLASSQEQ_DROPPED_FC || + ret == CLASSQEQ_DROPPED_SP); + + /* packet has been freed in fairq_addq */ + PKTCNTR_ADD(&cl->cl_dropcnt, 1, len); + IFCQ_DROP_ADD(ifq, 1, len); + switch (ret) { + case CLASSQEQ_DROPPED: + return (ENOBUFS); + case CLASSQEQ_DROPPED_FC: + return (EQFULL); + case CLASSQEQ_DROPPED_SP: + return (EQSUSPENDED); + } + /* NOT REACHED */ + } + } + IFCQ_INC_LEN(ifq); + + /* successfully queued. */ + return (ret); +} + +/* + * note: CLASSQDQ_POLL returns the next packet without removing the packet + * from the queue. CLASSQDQ_REMOVE is a normal dequeue operation. + * CLASSQDQ_REMOVE must return the same packet if called immediately + * after CLASSQDQ_POLL. + */ +struct mbuf * +fairq_dequeue(struct fairq_if *fif, cqdq_op_t op) +{ + struct ifclassq *ifq = fif->fif_ifq; + struct fairq_class *cl; + struct fairq_class *best_cl; + struct mbuf *best_m; + struct mbuf *m; + u_int64_t cur_time = read_machclk(); + u_int32_t best_scale; + u_int32_t scale; + int pri; + int hit_limit; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if (IFCQ_IS_EMPTY(ifq)) { + /* no packet in the queue */ + return (NULL); + } + + if (fif->fif_poll_cache && op == CLASSQDQ_REMOVE) { + best_cl = fif->fif_poll_cache; + m = fairq_getq(best_cl, cur_time); + fif->fif_poll_cache = NULL; + if (m != NULL) { + IFCQ_DEC_LEN(ifq); + IFCQ_XMIT_ADD(ifq, 1, m_pktlen(m)); + PKTCNTR_ADD(&best_cl->cl_xmitcnt, 1, m_pktlen(m)); + } + } else { + best_cl = NULL; + best_m = NULL; + best_scale = 0xFFFFFFFFU; + + for (pri = fif->fif_maxpri; pri >= 0; pri--) { + if ((cl = fif->fif_classes[pri]) == NULL) + continue; + if ((cl->cl_flags & FARF_HAS_PACKETS) == 0) + continue; + m = fairq_pollq(cl, cur_time, &hit_limit); + if (m == NULL) { + cl->cl_flags &= ~FARF_HAS_PACKETS; + continue; + } + + /* + * We can halt the search immediately if the queue + * did not hit its bandwidth limit. + */ + if (hit_limit == 0) { + best_cl = cl; + best_m = m; + break; + } + + /* + * Otherwise calculate the scale factor and select + * the queue with the lowest scale factor. This + * apportions any unused bandwidth weighted by + * the relative bandwidth specification. 
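+			 *
+			 * E.g. (made-up numbers): a class moving 50KB/s
+			 * against a 200KB/s spec scores 50 * 100 / 200 = 25
+			 * and wins over a class moving 90KB/s against a
+			 * 100KB/s spec, which scores 90.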
+			 */
+			scale = cl->cl_bw_current * 100 / cl->cl_bandwidth;
+			if (scale < best_scale) {
+				best_cl = cl;
+				best_m = m;
+				best_scale = scale;
+			}
+		}
+
+		if (op == CLASSQDQ_POLL) {
+			fif->fif_poll_cache = best_cl;
+			m = best_m;
+		} else if (best_cl != NULL) {
+			m = fairq_getq(best_cl, cur_time);
+			if (m != NULL) {
+				IFCQ_DEC_LEN(ifq);
+				IFCQ_XMIT_ADD(ifq, 1, m_pktlen(m));
+				PKTCNTR_ADD(&best_cl->cl_xmitcnt, 1,
+				    m_pktlen(m));
+			}
+		} else {
+			m = NULL;
+		}
+	}
+	return (m);
+}
+
+static inline int
+fairq_addq(struct fairq_class *cl, struct mbuf *m, struct pf_mtag *t)
+{
+	struct ifclassq *ifq = cl->cl_fif->fif_ifq;
+	fairq_bucket_t *b;
+	u_int32_t hash = t->pftag_flowhash;
+	u_int32_t hindex;
+	u_int64_t bw;
+
+	IFCQ_LOCK_ASSERT_HELD(ifq);
+
+	/*
+	 * If the packet doesn't have any keep state put it on the end of
+	 * our queue.  XXX this can result in out of order delivery.
+	 */
+	if (hash == 0) {
+		if (cl->cl_head)
+			b = cl->cl_head->prev;
+		else
+			b = &cl->cl_buckets[0];
+	} else {
+		hindex = (hash & cl->cl_nbucket_mask);
+		b = &cl->cl_buckets[hindex];
+	}
+
+	/*
+	 * Add the bucket to the end of the circular list of active buckets.
+	 *
+	 * As a special case we add the bucket to the beginning of the list
+	 * instead of the end if it was not previously on the list and if
+	 * its traffic is less than the hog level.
+	 */
+	if (b->in_use == 0) {
+		b->in_use = 1;
+		if (cl->cl_head == NULL) {
+			cl->cl_head = b;
+			b->next = b;
+			b->prev = b;
+		} else {
+			b->next = cl->cl_head;
+			b->prev = cl->cl_head->prev;
+			b->prev->next = b;
+			b->next->prev = b;
+
+			if (b->bw_delta && cl->cl_hogs_m1) {
+				bw = b->bw_bytes * machclk_freq / b->bw_delta;
+				if (bw < cl->cl_hogs_m1)
+					cl->cl_head = b;
+			}
+		}
+	}
+
+#if CLASSQ_RIO
+	if (cl->cl_qtype == Q_RIO)
+		return (rio_addq(cl->cl_rio, &b->queue, m, t));
+	else
+#endif /* CLASSQ_RIO */
+#if CLASSQ_RED
+	if (cl->cl_qtype == Q_RED)
+		return (red_addq(cl->cl_red, &b->queue, m, t));
+	else
+#endif /* CLASSQ_RED */
+#if CLASSQ_BLUE
+	if (cl->cl_qtype == Q_BLUE)
+		return (blue_addq(cl->cl_blue, &b->queue, m, t));
+	else
+#endif /* CLASSQ_BLUE */
+	if (cl->cl_qtype == Q_SFB) {
+		if (cl->cl_sfb == NULL) {
+			struct ifnet *ifp = FAIRQIF_IFP(cl->cl_fif);
+
+			VERIFY(cl->cl_flags & FARF_LAZY);
+			IFCQ_CONVERT_LOCK(ifq);
+
+			cl->cl_sfb = sfb_alloc(ifp, cl->cl_handle,
+			    cl->cl_qlimit, cl->cl_qflags);
+			if (cl->cl_sfb == NULL) {
+				/* fall back to droptail */
+				cl->cl_qtype = Q_DROPTAIL;
+				cl->cl_flags &= ~FARF_SFB;
+				cl->cl_qflags &= ~(SFBF_ECN | SFBF_FLOWCTL);
+
+				log(LOG_ERR, "%s: %s SFB lazy allocation "
+				    "failed for qid=%d pri=%d, falling back "
+				    "to DROPTAIL\n", if_name(ifp),
+				    fairq_style(cl->cl_fif), cl->cl_handle,
+				    cl->cl_pri);
+			}
+		}
+		if (cl->cl_sfb != NULL)
+			return (sfb_addq(cl->cl_sfb, &b->queue, m, t));
+	} else if (qlen(&b->queue) >= qlimit(&b->queue)) {
+		IFCQ_CONVERT_LOCK(ifq);
+		m_freem(m);
+		return (CLASSQEQ_DROPPED);
+	}
+
+	if (cl->cl_flags & FARF_CLEARDSCP)
+		write_dsfield(m, t, 0);
+
+	_addq(&b->queue, m);
+
+	return (0);
+}
+
+static inline struct mbuf *
+fairq_getq(struct fairq_class *cl, u_int64_t cur_time)
+{
+	fairq_bucket_t *b;
+	struct mbuf *m;
+
+	IFCQ_LOCK_ASSERT_HELD(cl->cl_fif->fif_ifq);
+
+	b = fairq_selectq(cl, 0);
+	if (b == NULL)
+		m = NULL;
+#if CLASSQ_RIO
+	else if (cl->cl_qtype == Q_RIO)
+		m = rio_getq(cl->cl_rio, &b->queue);
+#endif /* CLASSQ_RIO */
+#if CLASSQ_RED
+	else if (cl->cl_qtype == Q_RED)
+		m = red_getq(cl->cl_red, &b->queue);
+#endif /* CLASSQ_RED */
+#if CLASSQ_BLUE
+	else if (cl->cl_qtype == Q_BLUE)
+		m = blue_getq(cl->cl_blue, &b->queue);
+#endif /* CLASSQ_BLUE */
+	else if (cl->cl_qtype == Q_SFB && cl->cl_sfb != NULL)
+		m = sfb_getq(cl->cl_sfb, &b->queue);
+	else
+		m = _getq(&b->queue);
+
+	/*
+	 * Calculate the BW change
+	 */
+	if (m != NULL) {
+		u_int64_t delta;
+
+		/*
+		 * Per-class bandwidth calculation
+		 */
+		delta = (cur_time - cl->cl_last_time);
+		if (delta > machclk_freq * 8)
+			delta = machclk_freq * 8;
+		cl->cl_bw_delta += delta;
+		cl->cl_bw_bytes += m->m_pkthdr.len;
+		cl->cl_last_time = cur_time;
+		if (cl->cl_bw_delta > machclk_freq) {
+			cl->cl_bw_delta -= cl->cl_bw_delta >> 2;
+			cl->cl_bw_bytes -= cl->cl_bw_bytes >> 2;
+		}
+
+		/*
+		 * Per-bucket bandwidth calculation
+		 */
+		delta = (cur_time - b->last_time);
+		if (delta > machclk_freq * 8)
+			delta = machclk_freq * 8;
+		b->bw_delta += delta;
+		b->bw_bytes += m->m_pkthdr.len;
+		b->last_time = cur_time;
+		if (b->bw_delta > machclk_freq) {
+			b->bw_delta -= b->bw_delta >> 2;
+			b->bw_bytes -= b->bw_bytes >> 2;
+		}
+	}
+	return (m);
+}
+
+/*
+ * Figure out what the next packet would be if there were no limits.  If
+ * this class hits its bandwidth limit *hit_limit is set to non-zero,
+ * otherwise it is set to 0.  A non-NULL mbuf is returned either way.
+ */
+static inline struct mbuf *
+fairq_pollq(struct fairq_class *cl, u_int64_t cur_time, int *hit_limit)
+{
+	fairq_bucket_t *b;
+	struct mbuf *m;
+	u_int64_t delta;
+	u_int64_t bw;
+
+	IFCQ_LOCK_ASSERT_HELD(cl->cl_fif->fif_ifq);
+
+	*hit_limit = 0;
+	b = fairq_selectq(cl, 1);
+	if (b == NULL)
+		return (NULL);
+	m = qhead(&b->queue);
+
+	/*
+	 * Did this packet exceed the class bandwidth?  Calculate the
+	 * bandwidth component of the packet.
+	 *
+	 * - Calculate bytes per second
+	 */
+	delta = cur_time - cl->cl_last_time;
+	if (delta > machclk_freq * 8)
+		delta = machclk_freq * 8;
+	cl->cl_bw_delta += delta;
+	cl->cl_last_time = cur_time;
+	if (cl->cl_bw_delta) {
+		bw = cl->cl_bw_bytes * machclk_freq / cl->cl_bw_delta;
+
+		if (bw > cl->cl_bandwidth)
+			*hit_limit = 1;
+		cl->cl_bw_current = bw;
+#if 0
+		printf("BW %6lld relative to %6u %d queue %p\n",
+		    bw, cl->cl_bandwidth, *hit_limit, b);
+#endif
+	}
+	return (m);
+}
+
+/*
+ * Locate the next queue we want to pull a packet out of.  This code
+ * is also responsible for removing empty buckets from the circular list.
+ */
+static fairq_bucket_t *
+fairq_selectq(struct fairq_class *cl, int ispoll)
+{
+	fairq_bucket_t *b;
+	u_int64_t bw;
+
+	IFCQ_LOCK_ASSERT_HELD(cl->cl_fif->fif_ifq);
+
+	if (ispoll == 0 && cl->cl_polled) {
+		b = cl->cl_polled;
+		cl->cl_polled = NULL;
+		return (b);
+	}
+
+	while ((b = cl->cl_head) != NULL) {
+		/*
+		 * Remove empty queues from consideration
+		 */
+		if (qempty(&b->queue)) {
+			b->in_use = 0;
+			cl->cl_head = b->next;
+			if (cl->cl_head == b) {
+				cl->cl_head = NULL;
+			} else {
+				b->next->prev = b->prev;
+				b->prev->next = b->next;
+			}
+			continue;
+		}
+
+		/*
+		 * Advance the round robin.  Queues with bandwidths less
+		 * than the hog bandwidth are allowed to burst.
+		 */
+		if (cl->cl_hogs_m1 == 0) {
+			cl->cl_head = b->next;
+		} else if (b->bw_delta) {
+			bw = b->bw_bytes * machclk_freq / b->bw_delta;
+			if (bw >= cl->cl_hogs_m1) {
+				cl->cl_head = b->next;
+			}
+			/*
+			 * XXX TODO -
+			 */
+		}
+
+		/*
+		 * Return bucket b.
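+		 * (If the round robin did not advance above, b keeps the
+		 * head slot and may burst on the next dequeue.)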
+ */ + break; + } + if (ispoll) + cl->cl_polled = b; + return (b); +} + +static void +fairq_purgeq(struct fairq_if *fif, struct fairq_class *cl, u_int32_t flow, + u_int32_t *packets, u_int32_t *bytes) +{ + struct ifclassq *ifq = fif->fif_ifq; + u_int32_t _cnt = 0, _len = 0; + fairq_bucket_t *b; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + /* become regular mutex before freeing mbufs */ + IFCQ_CONVERT_LOCK(ifq); + + while ((b = fairq_selectq(cl, 0)) != NULL) { + u_int32_t cnt, len, qlen; + + if ((qlen = qlen(&b->queue)) == 0) + continue; + +#if CLASSQ_RIO + if (cl->cl_qtype == Q_RIO) + rio_purgeq(cl->cl_rio, &b->queue, flow, &cnt, &len); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (cl->cl_qtype == Q_RED) + red_purgeq(cl->cl_red, &b->queue, flow, &cnt, &len); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (cl->cl_qtype == Q_BLUE) + blue_purgeq(cl->cl_blue, &b->queue, flow, &cnt, &len); + else +#endif /* CLASSQ_BLUE */ + if (cl->cl_qtype == Q_SFB && cl->cl_sfb != NULL) + sfb_purgeq(cl->cl_sfb, &b->queue, flow, &cnt, &len); + else + _flushq_flow(&b->queue, flow, &cnt, &len); + + if (cnt == 0) + continue; + + VERIFY(qlen(&b->queue) == (qlen - cnt)); + + PKTCNTR_ADD(&cl->cl_dropcnt, cnt, len); + IFCQ_DROP_ADD(ifq, cnt, len); + + VERIFY(((signed)IFCQ_LEN(ifq) - cnt) >= 0); + IFCQ_LEN(ifq) -= cnt; + + _cnt += cnt; + _len += len; + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s purge qid=%d pri=%d " + "qlen=[%d,%d] cnt=%d len=%d flow=0x%x\n", + if_name(FAIRQIF_IFP(fif)), fairq_style(fif), + cl->cl_handle, cl->cl_pri, qlen, qlen(&b->queue), + cnt, len, flow); + } + } + + if (packets != NULL) + *packets = _cnt; + if (bytes != NULL) + *bytes = _len; +} + +static void +fairq_updateq(struct fairq_if *fif, struct fairq_class *cl, cqev_t ev) +{ + IFCQ_LOCK_ASSERT_HELD(fif->fif_ifq); + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s update qid=%d pri=%d event=%s\n", + if_name(FAIRQIF_IFP(fif)), fairq_style(fif), + cl->cl_handle, cl->cl_pri, ifclassq_ev2str(ev)); + } + +#if CLASSQ_RIO + if (cl->cl_qtype == Q_RIO) + return (rio_updateq(cl->cl_rio, ev)); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (cl->cl_qtype == Q_RED) + return (red_updateq(cl->cl_red, ev)); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (cl->cl_qtype == Q_BLUE) + return (blue_updateq(cl->cl_blue, ev)); +#endif /* CLASSQ_BLUE */ + if (cl->cl_qtype == Q_SFB && cl->cl_sfb != NULL) + return (sfb_updateq(cl->cl_sfb, ev)); +} + +int +fairq_get_class_stats(struct fairq_if *fif, u_int32_t qid, + struct fairq_classstats *sp) +{ + struct fairq_class *cl; + fairq_bucket_t *b; + + IFCQ_LOCK_ASSERT_HELD(fif->fif_ifq); + + if ((cl = fairq_clh_to_clp(fif, qid)) == NULL) + return (EINVAL); + + sp->class_handle = cl->cl_handle; + sp->priority = cl->cl_pri; + sp->qlimit = cl->cl_qlimit; + sp->xmit_cnt = cl->cl_xmitcnt; + sp->drop_cnt = cl->cl_dropcnt; + sp->qtype = cl->cl_qtype; + sp->qstate = cl->cl_qstate; + sp->qlength = 0; + + if (cl->cl_head) { + b = cl->cl_head; + do { + sp->qlength += qlen(&b->queue); + b = b->next; + } while (b != cl->cl_head); + } + +#if CLASSQ_RED + if (cl->cl_qtype == Q_RED) + red_getstats(cl->cl_red, &sp->red[0]); +#endif /* CLASSQ_RED */ +#if CLASSQ_RIO + if (cl->cl_qtype == Q_RIO) + rio_getstats(cl->cl_rio, &sp->red[0]); +#endif /* CLASSQ_RIO */ +#if CLASSQ_BLUE + if (cl->cl_qtype == Q_BLUE) + blue_getstats(cl->cl_blue, &sp->blue); +#endif /* CLASSQ_BLUE */ + if (cl->cl_qtype == Q_SFB && cl->cl_sfb != NULL) + sfb_getstats(cl->cl_sfb, &sp->sfb); + + return (0); +} + +/* convert a class handle to the 
corresponding class pointer */ +static inline struct fairq_class * +fairq_clh_to_clp(struct fairq_if *fif, u_int32_t chandle) +{ + struct fairq_class *cl; + int idx; + + IFCQ_LOCK_ASSERT_HELD(fif->fif_ifq); + + for (idx = fif->fif_maxpri; idx >= 0; idx--) + if ((cl = fif->fif_classes[idx]) != NULL && + cl->cl_handle == chandle) + return (cl); + + return (NULL); +} + +static const char * +fairq_style(struct fairq_if *fif) +{ + return ((fif->fif_flags & FAIRQIFF_ALTQ) ? "ALTQ_FAIRQ" : "FAIRQ"); +} + +int +fairq_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags) +{ +#pragma unused(ifq, flags) + return (ENXIO); /* not yet */ +} + +int +fairq_teardown_ifclassq(struct ifclassq *ifq) +{ + struct fairq_if *fif = ifq->ifcq_disc; + int i; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(fif != NULL && ifq->ifcq_type == PKTSCHEDT_FAIRQ); + + (void) fairq_destroy_locked(fif); + + ifq->ifcq_disc = NULL; + for (i = 0; i < IFCQ_SC_MAX; i++) { + ifq->ifcq_disc_slots[i].qid = 0; + ifq->ifcq_disc_slots[i].cl = NULL; + } + + return (ifclassq_detach(ifq)); +} + +int +fairq_getqstats_ifclassq(struct ifclassq *ifq, u_int32_t slot, + struct if_ifclassq_stats *ifqs) +{ + struct fairq_if *fif = ifq->ifcq_disc; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(ifq->ifcq_type == PKTSCHEDT_FAIRQ); + + if (slot >= IFCQ_SC_MAX) + return (EINVAL); + + return (fairq_get_class_stats(fif, ifq->ifcq_disc_slots[slot].qid, + &ifqs->ifqs_fairq_stats)); +} +#endif /* PKTSCHED_FAIRQ */ diff --git a/bsd/net/pktsched/pktsched_fairq.h b/bsd/net/pktsched/pktsched_fairq.h new file mode 100644 index 000000000..910172950 --- /dev/null +++ b/bsd/net/pktsched/pktsched_fairq.h @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 2008 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $DragonFly: src/sys/net/altq/altq_fairq.h,v 1.1 2008/04/06 18:58:15 dillon Exp $
+ */
+
+#ifndef _NET_PKTSCHED_PKTSCHED_FAIRQ_H_
+#define	_NET_PKTSCHED_PKTSCHED_FAIRQ_H_
+
+#ifdef PRIVATE
+#include <net/pktsched/pktsched.h>
+#include <net/pktsched/pktsched_rmclass.h>
+#include <net/classq/classq.h>
+#include <net/classq/classq_red.h>
+#include <net/classq/classq_rio.h>
+#include <net/classq/classq_blue.h>
+#include <net/classq/classq_sfb.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	FAIRQ_MAX_BUCKETS	2048	/* maximum number of sorting buckets */
+#define	FAIRQ_MAXPRI		RM_MAXPRIO
+#define	FAIRQ_BITMAP_WIDTH	(sizeof (fairq_bitmap_t) * 8)
+#define	FAIRQ_BITMAP_MASK	(FAIRQ_BITMAP_WIDTH - 1)
+
+/* fairq class flags */
+#define	FARF_RED		0x0001	/* use RED */
+#define	FARF_ECN		0x0002	/* use ECN with RED/BLUE/SFB */
+#define	FARF_RIO		0x0004	/* use RIO */
+#define	FARF_CLEARDSCP		0x0010	/* clear diffserv codepoint */
+#define	FARF_BLUE		0x0100	/* use BLUE */
+#define	FARF_SFB		0x0200	/* use SFB */
+#define	FARF_FLOWCTL		0x0400	/* enable flow control advisories */
+#define	FARF_DEFAULTCLASS	0x1000	/* default class */
+#ifdef BSD_KERNEL_PRIVATE
+#define	FARF_HAS_PACKETS	0x2000	/* might have queued packets */
+#define	FARF_LAZY		0x10000000 /* on-demand resource allocation */
+#endif /* BSD_KERNEL_PRIVATE */
+
+#define	FARF_USERFLAGS							\
+	(FARF_RED | FARF_ECN | FARF_RIO | FARF_CLEARDSCP |		\
+	FARF_BLUE | FARF_SFB | FARF_FLOWCTL | FARF_DEFAULTCLASS)
+
+#ifdef BSD_KERNEL_PRIVATE
+#define	FARF_BITS \
+	"\020\1RED\2ECN\3RIO\5CLEARDSCP\11BLUE\12SFB\13FLOWCTL\15DEFAULT" \
+	"\16HASPKTS\35LAZY"
+#else
+#define	FARF_BITS \
+	"\020\1RED\2ECN\3RIO\5CLEARDSCP\11BLUE\12SFB\13FLOWCTL\15DEFAULT" \
+	"\16HASPKTS"
+#endif /* !BSD_KERNEL_PRIVATE */
+
+typedef u_int32_t fairq_bitmap_t;
+
+struct fairq_classstats {
+	u_int32_t		class_handle;
+	u_int32_t		priority;
+
+	u_int32_t		qlength;
+	u_int32_t		qlimit;
+	struct pktcntr		xmit_cnt;	/* transmitted packet counter */
+	struct pktcntr		drop_cnt;	/* dropped packet counter */
+
+	/* RED, RIO, BLUE, SFB related info */
+	classq_type_t		qtype;
+	union {
+		/* RIO has 3 red stats */
+		struct red_stats	red[RIO_NDROPPREC];
+		struct blue_stats	blue;
+		struct sfb_stats	sfb;
+	};
+	classq_state_t		qstate;
+};
+
+#ifdef BSD_KERNEL_PRIVATE
+
+typedef struct fairq_bucket {
+	struct fairq_bucket *next;	/* circular list */
+	struct fairq_bucket *prev;	/* circular list */
+	class_queue_t	queue;		/* the actual queue */
+	u_int64_t	bw_bytes;	/* statistics used to
calculate bw */ + u_int64_t bw_delta; /* statistics used to calculate bw */ + u_int64_t last_time; + int in_use; +} fairq_bucket_t; + +struct fairq_class { + u_int32_t cl_handle; /* class handle */ + u_int32_t cl_nbuckets; /* (power of 2) */ + u_int32_t cl_nbucket_mask; /* bucket mask */ + u_int32_t cl_qflags; /* class queue flags */ + fairq_bucket_t *cl_buckets; + fairq_bucket_t *cl_head; /* head of circular bucket list */ + fairq_bucket_t *cl_polled; + union { + void *ptr; + struct red *red; /* RED state */ + struct rio *rio; /* RIO state */ + struct blue *blue; /* BLUE state */ + struct sfb *sfb; /* SFB state */ + } cl_qalg; + u_int64_t cl_hogs_m1; + u_int64_t cl_lssc_m1; + u_int64_t cl_bandwidth; + u_int64_t cl_bw_current; + u_int64_t cl_bw_bytes; + u_int64_t cl_bw_delta; + u_int64_t cl_last_time; + classq_type_t cl_qtype; /* rollup */ + classq_state_t cl_qstate; /* state */ + int cl_qlimit; + int cl_pri; /* priority */ + int cl_flags; /* class flags */ + struct fairq_if *cl_fif; /* back pointer to fif */ + + /* round robin index */ + + /* statistics */ + struct pktcntr cl_xmitcnt; /* transmitted packet counter */ + struct pktcntr cl_dropcnt; /* dropped packet counter */ +}; + +#define cl_red cl_qalg.red +#define cl_rio cl_qalg.rio +#define cl_blue cl_qalg.blue +#define cl_sfb cl_qalg.sfb + +/* fairq_if flags */ +#define FAIRQIFF_ALTQ 0x1 /* configured via PF/ALTQ */ + +/* + * fairq interface state + */ +struct fairq_if { + struct ifclassq *fif_ifq; /* backpointer to ifclassq */ + int fif_maxpri; /* max priority in use */ + u_int32_t fif_flags; /* flags */ + struct fairq_class *fif_poll_cache; /* cached poll */ + struct fairq_class *fif_default; /* default class */ + struct fairq_class *fif_classes[FAIRQ_MAXPRI]; /* classes */ +}; + +#define FAIRQIF_IFP(_fif) ((_fif)->fif_ifq->ifcq_ifp) + +struct if_ifclassq_stats; + +extern void fairq_init(void); +extern struct fairq_if *fairq_alloc(struct ifnet *, int, boolean_t); +extern int fairq_destroy(struct fairq_if *); +extern void fairq_purge(struct fairq_if *); +extern void fairq_event(struct fairq_if *, cqev_t); +extern int fairq_add_queue(struct fairq_if *, int, u_int32_t, u_int64_t, + u_int32_t, int, u_int64_t, u_int64_t, u_int64_t, u_int64_t, u_int32_t, + struct fairq_class **); +extern int fairq_remove_queue(struct fairq_if *, u_int32_t); +extern int fairq_get_class_stats(struct fairq_if *, u_int32_t, + struct fairq_classstats *); +extern int fairq_enqueue(struct fairq_if *, struct fairq_class *, + struct mbuf *, struct pf_mtag *); +extern struct mbuf *fairq_dequeue(struct fairq_if *, cqdq_op_t); +extern int fairq_setup_ifclassq(struct ifclassq *, u_int32_t); +extern int fairq_teardown_ifclassq(struct ifclassq *ifq); +extern int fairq_getqstats_ifclassq(struct ifclassq *, u_int32_t, + struct if_ifclassq_stats *); +#endif /* BSD_KERNEL_PRIVATE */ +#ifdef __cplusplus +} +#endif +#endif /* PRIVATE */ +#endif /* _NET_PKTSCHED_PKTSCHED_FAIRQ_H_ */ diff --git a/bsd/net/pktsched/pktsched_hfsc.c b/bsd/net/pktsched/pktsched_hfsc.c new file mode 100644 index 000000000..c7b405380 --- /dev/null +++ b/bsd/net/pktsched/pktsched_hfsc.c @@ -0,0 +1,2057 @@ +/* + * Copyright (c) 2007-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $OpenBSD: altq_hfsc.c,v 1.25 2007/09/13 20:40:02 chl Exp $ */ +/* $KAME: altq_hfsc.c,v 1.17 2002/11/29 07:48:33 kjc Exp $ */ + +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + */ +/* + * H-FSC is described in Proceedings of SIGCOMM'97, + * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing, + * Real-Time and Priority Service" + * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng. + * + * Oleg Cherevko added the upperlimit for link-sharing. + * when a class has an upperlimit, the fit-time is computed from the + * upperlimit service curve. the link-sharing scheduler does not schedule + * a class whose fit-time exceeds the current time. 
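+ *
+ * (In the code below, cl_e and cl_d are the eligible and deadline times
+ * derived from the real-time service curve, and cl_vt is the virtual
+ * time used by the link-sharing criterion.)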
+ */
+
+#if PKTSCHED_HFSC
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/kernel.h>
+#include <sys/syslog.h>
+
+#include <kern/zalloc.h>
+
+#include <net/if.h>
+#include <net/net_osdep.h>
+
+#include <net/pktsched/pktsched_hfsc.h>
+#include <netinet/in.h>
+
+/*
+ * function prototypes
+ */
+#if 0
+static int hfsc_enqueue_ifclassq(struct ifclassq *, struct mbuf *);
+static struct mbuf *hfsc_dequeue_ifclassq(struct ifclassq *, cqdq_op_t);
+static int hfsc_request_ifclassq(struct ifclassq *, cqrq_t, void *);
+#endif
+static int hfsc_addq(struct hfsc_class *, struct mbuf *, struct pf_mtag *);
+static struct mbuf *hfsc_getq(struct hfsc_class *);
+static struct mbuf *hfsc_pollq(struct hfsc_class *);
+static void hfsc_purgeq(struct hfsc_if *, struct hfsc_class *, u_int32_t,
+    u_int32_t *, u_int32_t *);
+static void hfsc_print_sc(struct hfsc_if *, u_int32_t, u_int64_t,
+    struct service_curve *, struct internal_sc *, const char *);
+static void hfsc_updateq_linkrate(struct hfsc_if *, struct hfsc_class *);
+static void hfsc_updateq(struct hfsc_if *, struct hfsc_class *, cqev_t);
+
+static int hfsc_clear_interface(struct hfsc_if *);
+static struct hfsc_class *hfsc_class_create(struct hfsc_if *,
+    struct service_curve *, struct service_curve *, struct service_curve *,
+    struct hfsc_class *, u_int32_t, int, u_int32_t);
+static int hfsc_class_destroy(struct hfsc_if *, struct hfsc_class *);
+static int hfsc_destroy_locked(struct hfsc_if *);
+static struct hfsc_class *hfsc_nextclass(struct hfsc_class *);
+static struct hfsc_class *hfsc_clh_to_clp(struct hfsc_if *, u_int32_t);
+static const char *hfsc_style(struct hfsc_if *);
+
+static void set_active(struct hfsc_class *, u_int32_t);
+static void set_passive(struct hfsc_class *);
+
+static void init_ed(struct hfsc_class *, u_int32_t);
+static void update_ed(struct hfsc_class *, u_int32_t);
+static void update_d(struct hfsc_class *, u_int32_t);
+static void init_vf(struct hfsc_class *, u_int32_t);
+static void update_vf(struct hfsc_class *, u_int32_t, u_int64_t);
+static void update_cfmin(struct hfsc_class *);
+static void ellist_insert(struct hfsc_class *);
+static void ellist_remove(struct hfsc_class *);
+static void ellist_update(struct hfsc_class *);
+static struct hfsc_class *ellist_get_mindl(ellist_t *, u_int64_t);
+static void actlist_insert(struct hfsc_class *);
+static void actlist_remove(struct hfsc_class *);
+static void actlist_update(struct hfsc_class *);
+static struct hfsc_class *actlist_firstfit(struct hfsc_class *, u_int64_t);
+
+static inline u_int64_t seg_x2y(u_int64_t, u_int64_t);
+static inline u_int64_t seg_y2x(u_int64_t, u_int64_t);
+static inline u_int64_t m2sm(u_int64_t);
+static inline u_int64_t m2ism(u_int64_t);
+static inline u_int64_t d2dx(u_int64_t);
+static u_int64_t sm2m(u_int64_t);
+static u_int64_t dx2d(u_int64_t);
+
+static boolean_t sc2isc(struct hfsc_class *, struct service_curve *,
+    struct internal_sc *, u_int64_t);
+static void rtsc_init(struct runtime_sc *, struct internal_sc *,
+    u_int64_t, u_int64_t);
+static u_int64_t rtsc_y2x(struct runtime_sc *, u_int64_t);
+static u_int64_t rtsc_x2y(struct runtime_sc *, u_int64_t);
+static void rtsc_min(struct runtime_sc *, struct internal_sc *,
+    u_int64_t, u_int64_t);
+
+#define	HFSC_ZONE_MAX	32		/* maximum elements in zone */
+#define	HFSC_ZONE_NAME	"pktsched_hfsc"	/* zone name */
+
+static unsigned int hfsc_size;		/* size of zone element */
+static struct zone *hfsc_zone;		/* zone for hfsc_if */
+
+#define	HFSC_CL_ZONE_MAX	32	/* maximum elements in zone */
+#define	HFSC_CL_ZONE_NAME	"pktsched_hfsc_cl" /* zone name */
+
+static unsigned int hfsc_cl_size;	/* size of
zone element */ +static struct zone *hfsc_cl_zone; /* zone for hfsc_class */ + +/* + * macros + */ +#define HFSC_IS_A_PARENT_CLASS(cl) ((cl)->cl_children != NULL) + +#define HT_INFINITY 0xffffffffffffffffLL /* infinite time value */ + +void +hfsc_init(void) +{ + hfsc_size = sizeof (struct hfsc_if); + hfsc_zone = zinit(hfsc_size, HFSC_ZONE_MAX * hfsc_size, + 0, HFSC_ZONE_NAME); + if (hfsc_zone == NULL) { + panic("%s: failed allocating %s", __func__, HFSC_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(hfsc_zone, Z_EXPAND, TRUE); + zone_change(hfsc_zone, Z_CALLERACCT, TRUE); + + hfsc_cl_size = sizeof (struct hfsc_class); + hfsc_cl_zone = zinit(hfsc_cl_size, HFSC_CL_ZONE_MAX * hfsc_cl_size, + 0, HFSC_CL_ZONE_NAME); + if (hfsc_cl_zone == NULL) { + panic("%s: failed allocating %s", __func__, HFSC_CL_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(hfsc_cl_zone, Z_EXPAND, TRUE); + zone_change(hfsc_cl_zone, Z_CALLERACCT, TRUE); +} + +struct hfsc_if * +hfsc_alloc(struct ifnet *ifp, int how, boolean_t altq) +{ + struct hfsc_if *hif; + + hif = (how == M_WAITOK) ? zalloc(hfsc_zone) : zalloc_noblock(hfsc_zone); + if (hif == NULL) + return (NULL); + + bzero(hif, hfsc_size); + TAILQ_INIT(&hif->hif_eligible); + hif->hif_ifq = &ifp->if_snd; + if (altq) { + hif->hif_maxclasses = HFSC_MAX_CLASSES; + hif->hif_flags |= HFSCIFF_ALTQ; + } else { + hif->hif_maxclasses = IFCQ_SC_MAX + 1; /* incl. root class */ + } + + if ((hif->hif_class_tbl = _MALLOC(sizeof (struct hfsc_class *) * + hif->hif_maxclasses, M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) { + log(LOG_ERR, "%s: %s unable to allocate class table array\n", + if_name(ifp), hfsc_style(hif)); + goto error; + } + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s scheduler allocated\n", + if_name(ifp), hfsc_style(hif)); + } + + return (hif); + +error: + if (hif->hif_class_tbl != NULL) { + _FREE(hif->hif_class_tbl, M_DEVBUF); + hif->hif_class_tbl = NULL; + } + zfree(hfsc_zone, hif); + + return (NULL); +} + +int +hfsc_destroy(struct hfsc_if *hif) +{ + struct ifclassq *ifq = hif->hif_ifq; + int err; + + IFCQ_LOCK(ifq); + err = hfsc_destroy_locked(hif); + IFCQ_UNLOCK(ifq); + + return (err); +} + +static int +hfsc_destroy_locked(struct hfsc_if *hif) +{ + IFCQ_LOCK_ASSERT_HELD(hif->hif_ifq); + + (void) hfsc_clear_interface(hif); + (void) hfsc_class_destroy(hif, hif->hif_rootclass); + + VERIFY(hif->hif_class_tbl != NULL); + _FREE(hif->hif_class_tbl, M_DEVBUF); + hif->hif_class_tbl = NULL; + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s scheduler destroyed\n", + if_name(HFSCIF_IFP(hif)), hfsc_style(hif)); + } + + zfree(hfsc_zone, hif); + + return (0); +} + +/* + * bring the interface back to the initial state by discarding + * all the filters and classes except the root class. 
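+ * (The root class itself is torn down separately by
+ * hfsc_destroy_locked() via hfsc_class_destroy().)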
+ */ +static int +hfsc_clear_interface(struct hfsc_if *hif) +{ + struct hfsc_class *cl; + + IFCQ_LOCK_ASSERT_HELD(hif->hif_ifq); + + /* clear out the classes */ + while (hif->hif_rootclass != NULL && + (cl = hif->hif_rootclass->cl_children) != NULL) { + /* + * remove the first leaf class found in the hierarchy + * then start over + */ + for (; cl != NULL; cl = hfsc_nextclass(cl)) { + if (!HFSC_IS_A_PARENT_CLASS(cl)) { + (void) hfsc_class_destroy(hif, cl); + break; + } + } + } + + return (0); +} + +/* discard all the queued packets on the interface */ +void +hfsc_purge(struct hfsc_if *hif) +{ + struct hfsc_class *cl; + + IFCQ_LOCK_ASSERT_HELD(hif->hif_ifq); + + for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl)) { + if (!qempty(&cl->cl_q)) + hfsc_purgeq(hif, cl, 0, NULL, NULL); + } +#if !PF_ALTQ + /* + * This assertion is safe to be made only when PF_ALTQ is not + * configured; otherwise, IFCQ_LEN represents the sum of the + * packets managed by ifcq_disc and altq_disc instances, which + * is possible when transitioning between the two. + */ + VERIFY(IFCQ_LEN(hif->hif_ifq) == 0); +#endif /* !PF_ALTQ */ +} + +void +hfsc_event(struct hfsc_if *hif, cqev_t ev) +{ + struct hfsc_class *cl; + + IFCQ_LOCK_ASSERT_HELD(hif->hif_ifq); + + for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl)) + hfsc_updateq(hif, cl, ev); +} + +int +hfsc_add_queue(struct hfsc_if *hif, struct service_curve *rtsc, + struct service_curve *lssc, struct service_curve *ulsc, + u_int32_t qlimit, int flags, u_int32_t parent_qid, u_int32_t qid, + struct hfsc_class **clp) +{ + struct hfsc_class *cl = NULL, *parent; + + IFCQ_LOCK_ASSERT_HELD(hif->hif_ifq); + + if (parent_qid == HFSC_NULLCLASS_HANDLE && hif->hif_rootclass == NULL) + parent = NULL; + else if ((parent = hfsc_clh_to_clp(hif, parent_qid)) == NULL) + return (EINVAL); + + if (hfsc_clh_to_clp(hif, qid) != NULL) + return (EBUSY); + + cl = hfsc_class_create(hif, rtsc, lssc, ulsc, parent, + qlimit, flags, qid); + if (cl == NULL) + return (ENOMEM); + + if (clp != NULL) + *clp = cl; + + return (0); +} + +static struct hfsc_class * +hfsc_class_create(struct hfsc_if *hif, struct service_curve *rsc, + struct service_curve *fsc, struct service_curve *usc, + struct hfsc_class *parent, u_int32_t qlimit, int flags, u_int32_t qid) +{ + struct ifnet *ifp; + struct ifclassq *ifq; + struct hfsc_class *cl, *p; + u_int64_t eff_rate; + u_int32_t i; + + IFCQ_LOCK_ASSERT_HELD(hif->hif_ifq); + + /* Sanitize flags unless internally configured */ + if (hif->hif_flags & HFSCIFF_ALTQ) + flags &= HFCF_USERFLAGS; + + if (hif->hif_classes >= hif->hif_maxclasses) { + log(LOG_ERR, "%s: %s out of classes! 
(max %d)\n", + if_name(HFSCIF_IFP(hif)), hfsc_style(hif), + hif->hif_maxclasses); + return (NULL); + } + +#if !CLASSQ_RED + if (flags & HFCF_RED) { + log(LOG_ERR, "%s: %s RED not available!\n", + if_name(HFSCIF_IFP(hif)), hfsc_style(hif)); + return (NULL); + } +#endif /* !CLASSQ_RED */ + +#if !CLASSQ_RIO + if (flags & HFCF_RIO) { + log(LOG_ERR, "%s: %s RIO not available!\n", + if_name(HFSCIF_IFP(hif)), hfsc_style(hif)); + return (NULL); + } +#endif /* CLASSQ_RIO */ + +#if !CLASSQ_BLUE + if (flags & HFCF_BLUE) { + log(LOG_ERR, "%s: %s BLUE not available!\n", + if_name(HFSCIF_IFP(hif)), hfsc_style(hif)); + return (NULL); + } +#endif /* CLASSQ_BLUE */ + + /* These are mutually exclusive */ + if ((flags & (HFCF_RED|HFCF_RIO|HFCF_BLUE|HFCF_SFB)) && + (flags & (HFCF_RED|HFCF_RIO|HFCF_BLUE|HFCF_SFB)) != HFCF_RED && + (flags & (HFCF_RED|HFCF_RIO|HFCF_BLUE|HFCF_SFB)) != HFCF_RIO && + (flags & (HFCF_RED|HFCF_RIO|HFCF_BLUE|HFCF_SFB)) != HFCF_BLUE && + (flags & (HFCF_RED|HFCF_RIO|HFCF_BLUE|HFCF_SFB)) != HFCF_SFB) { + log(LOG_ERR, "%s: %s more than one RED|RIO|BLUE|SFB\n", + if_name(HFSCIF_IFP(hif)), hfsc_style(hif)); + return (NULL); + } + + cl = zalloc(hfsc_cl_zone); + if (cl == NULL) + return (NULL); + + bzero(cl, hfsc_cl_size); + TAILQ_INIT(&cl->cl_actc); + ifq = hif->hif_ifq; + ifp = HFSCIF_IFP(hif); + + if (qlimit == 0 || qlimit > IFCQ_MAXLEN(ifq)) { + qlimit = IFCQ_MAXLEN(ifq); + if (qlimit == 0) + qlimit = DEFAULT_QLIMIT; /* use default */ + } + _qinit(&cl->cl_q, Q_DROPTAIL, qlimit); + + cl->cl_flags = flags; + if (flags & (HFCF_RED|HFCF_RIO|HFCF_BLUE|HFCF_SFB)) { +#if CLASSQ_RED || CLASSQ_RIO + int pkttime; +#endif /* CLASSQ_RED || CLASSQ_RIO */ + u_int64_t m2; + + m2 = 0; + if (rsc != NULL && rsc->m2 > m2) + m2 = rsc->m2; + if (fsc != NULL && fsc->m2 > m2) + m2 = fsc->m2; + if (usc != NULL && usc->m2 > m2) + m2 = usc->m2; + + cl->cl_qflags = 0; + if (flags & HFCF_ECN) { + if (flags & HFCF_BLUE) + cl->cl_qflags |= BLUEF_ECN; + else if (flags & HFCF_SFB) + cl->cl_qflags |= SFBF_ECN; + else if (flags & HFCF_RED) + cl->cl_qflags |= REDF_ECN; + else if (flags & HFCF_RIO) + cl->cl_qflags |= RIOF_ECN; + } + if (flags & HFCF_FLOWCTL) { + if (flags & HFCF_SFB) + cl->cl_qflags |= SFBF_FLOWCTL; + } + if (flags & HFCF_CLEARDSCP) { + if (flags & HFCF_RIO) + cl->cl_qflags |= RIOF_CLEARDSCP; + } +#if CLASSQ_RED || CLASSQ_RIO + /* + * XXX: RED & RIO should be watching link speed and MTU + * events and recompute pkttime accordingly. 
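+		 *
+		 * (Here m2 is the largest second-segment slope, in bits
+		 * per second, among this class's rsc/fsc/usc service
+		 * curves; it stands in for the class's long-term rate.)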
+ */ + if (m2 < 8) + pkttime = 1000 * 1000 * 1000; /* 1 sec */ + else + pkttime = (int64_t)ifp->if_mtu * 1000 * 1000 * 1000 / + (m2 / 8); + + /* Test for exclusivity {RED,RIO,BLUE,SFB} was done above */ +#if CLASSQ_RED + if (flags & HFCF_RED) { + cl->cl_red = red_alloc(ifp, 0, 0, + qlimit(&cl->cl_q) * 10/100, + qlimit(&cl->cl_q) * 30/100, + cl->cl_qflags, pkttime); + if (cl->cl_red != NULL) + qtype(&cl->cl_q) = Q_RED; + } +#endif /* CLASSQ_RED */ +#if CLASSQ_RIO + if (flags & HFCF_RIO) { + cl->cl_rio = + rio_alloc(ifp, 0, NULL, cl->cl_qflags, pkttime); + if (cl->cl_rio != NULL) + qtype(&cl->cl_q) = Q_RIO; + } +#endif /* CLASSQ_RIO */ +#endif /* CLASSQ_RED || CLASSQ_RIO */ +#if CLASSQ_BLUE + if (flags & HFCF_BLUE) { + cl->cl_blue = blue_alloc(ifp, 0, 0, cl->cl_qflags); + if (cl->cl_blue != NULL) + qtype(&cl->cl_q) = Q_BLUE; + } +#endif /* CLASSQ_BLUE */ + if (flags & HFCF_SFB) { + if (!(cl->cl_flags & HFCF_LAZY)) + cl->cl_sfb = sfb_alloc(ifp, qid, + qlimit(&cl->cl_q), cl->cl_qflags); + if (cl->cl_sfb != NULL || (cl->cl_flags & HFCF_LAZY)) + qtype(&cl->cl_q) = Q_SFB; + } + } + + cl->cl_id = hif->hif_classid++; + cl->cl_handle = qid; + cl->cl_hif = hif; + cl->cl_parent = parent; + + eff_rate = ifnet_output_linkrate(HFSCIF_IFP(hif)); + hif->hif_eff_rate = eff_rate; + + if (rsc != NULL && (rsc->m1 != 0 || rsc->m2 != 0) && + (!(rsc->fl & HFSCF_M1_PCT) || (rsc->m1 > 0 && rsc->m1 <= 100)) && + (!(rsc->fl & HFSCF_M2_PCT) || (rsc->m2 > 0 && rsc->m2 <= 100))) { + rsc->fl &= HFSCF_USERFLAGS; + cl->cl_flags |= HFCF_RSC; + cl->cl_rsc0 = *rsc; + (void) sc2isc(cl, &cl->cl_rsc0, &cl->cl_rsc, eff_rate); + rtsc_init(&cl->cl_deadline, &cl->cl_rsc, 0, 0); + rtsc_init(&cl->cl_eligible, &cl->cl_rsc, 0, 0); + } + if (fsc != NULL && (fsc->m1 != 0 || fsc->m2 != 0) && + (!(fsc->fl & HFSCF_M1_PCT) || (fsc->m1 > 0 && fsc->m1 <= 100)) && + (!(fsc->fl & HFSCF_M2_PCT) || (fsc->m2 > 0 && fsc->m2 <= 100))) { + fsc->fl &= HFSCF_USERFLAGS; + cl->cl_flags |= HFCF_FSC; + cl->cl_fsc0 = *fsc; + (void) sc2isc(cl, &cl->cl_fsc0, &cl->cl_fsc, eff_rate); + rtsc_init(&cl->cl_virtual, &cl->cl_fsc, 0, 0); + } + if (usc != NULL && (usc->m1 != 0 || usc->m2 != 0) && + (!(usc->fl & HFSCF_M1_PCT) || (usc->m1 > 0 && usc->m1 <= 100)) && + (!(usc->fl & HFSCF_M2_PCT) || (usc->m2 > 0 && usc->m2 <= 100))) { + usc->fl &= HFSCF_USERFLAGS; + cl->cl_flags |= HFCF_USC; + cl->cl_usc0 = *usc; + (void) sc2isc(cl, &cl->cl_usc0, &cl->cl_usc, eff_rate); + rtsc_init(&cl->cl_ulimit, &cl->cl_usc, 0, 0); + } + + /* + * find a free slot in the class table. if the slot matching + * the lower bits of qid is free, use this slot. otherwise, + * use the first free slot. + */ + i = qid % hif->hif_maxclasses; + if (hif->hif_class_tbl[i] == NULL) { + hif->hif_class_tbl[i] = cl; + } else { + for (i = 0; i < hif->hif_maxclasses; i++) + if (hif->hif_class_tbl[i] == NULL) { + hif->hif_class_tbl[i] = cl; + break; + } + if (i == hif->hif_maxclasses) { + goto err_ret; + } + } + hif->hif_classes++; + + if (flags & HFCF_DEFAULTCLASS) + hif->hif_defaultclass = cl; + + if (parent == NULL) { + /* this is root class */ + hif->hif_rootclass = cl; + } else { + /* add this class to the children list of the parent */ + if ((p = parent->cl_children) == NULL) + parent->cl_children = cl; + else { + while (p->cl_siblings != NULL) + p = p->cl_siblings; + p->cl_siblings = cl; + } + } + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s created qid=%d pqid=%d qlimit=%d " + "flags=%b\n", if_name(ifp), hfsc_style(hif), cl->cl_handle, + (cl->cl_parent != NULL) ? 
cl->cl_parent->cl_handle : 0, + qlimit(&cl->cl_q), cl->cl_flags, HFCF_BITS); + if (cl->cl_flags & HFCF_RSC) { + hfsc_print_sc(hif, cl->cl_handle, eff_rate, + &cl->cl_rsc0, &cl->cl_rsc, "rsc"); + } + if (cl->cl_flags & HFCF_FSC) { + hfsc_print_sc(hif, cl->cl_handle, eff_rate, + &cl->cl_fsc0, &cl->cl_fsc, "fsc"); + } + if (cl->cl_flags & HFCF_USC) { + hfsc_print_sc(hif, cl->cl_handle, eff_rate, + &cl->cl_usc0, &cl->cl_usc, "usc"); + } + } + + return (cl); + +err_ret: + if (cl->cl_qalg.ptr != NULL) { +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + rio_destroy(cl->cl_rio); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + red_destroy(cl->cl_red); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + blue_destroy(cl->cl_blue); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + sfb_destroy(cl->cl_sfb); + cl->cl_qalg.ptr = NULL; + qtype(&cl->cl_q) = Q_DROPTAIL; + qstate(&cl->cl_q) = QS_RUNNING; + } + zfree(hfsc_cl_zone, cl); + return (NULL); +} + +int +hfsc_remove_queue(struct hfsc_if *hif, u_int32_t qid) +{ + struct hfsc_class *cl; + + IFCQ_LOCK_ASSERT_HELD(hif->hif_ifq); + + if ((cl = hfsc_clh_to_clp(hif, qid)) == NULL) + return (EINVAL); + + return (hfsc_class_destroy(hif, cl)); +} + +static int +hfsc_class_destroy(struct hfsc_if *hif, struct hfsc_class *cl) +{ + u_int32_t i; + + if (cl == NULL) + return (0); + + if (HFSC_IS_A_PARENT_CLASS(cl)) + return (EBUSY); + + IFCQ_LOCK_ASSERT_HELD(hif->hif_ifq); + + if (!qempty(&cl->cl_q)) + hfsc_purgeq(hif, cl, 0, NULL, NULL); + + if (cl->cl_parent == NULL) { + /* this is root class */ + } else { + struct hfsc_class *p = cl->cl_parent->cl_children; + + if (p == cl) + cl->cl_parent->cl_children = cl->cl_siblings; + else do { + if (p->cl_siblings == cl) { + p->cl_siblings = cl->cl_siblings; + break; + } + } while ((p = p->cl_siblings) != NULL); + VERIFY(p != NULL); + } + + for (i = 0; i < hif->hif_maxclasses; i++) + if (hif->hif_class_tbl[i] == cl) { + hif->hif_class_tbl[i] = NULL; + break; + } + + hif->hif_classes--; + + if (cl->cl_qalg.ptr != NULL) { +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + rio_destroy(cl->cl_rio); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + red_destroy(cl->cl_red); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + blue_destroy(cl->cl_blue); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + sfb_destroy(cl->cl_sfb); + cl->cl_qalg.ptr = NULL; + qtype(&cl->cl_q) = Q_DROPTAIL; + qstate(&cl->cl_q) = QS_RUNNING; + } + + if (cl == hif->hif_rootclass) + hif->hif_rootclass = NULL; + if (cl == hif->hif_defaultclass) + hif->hif_defaultclass = NULL; + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s destroyed qid=%d slot=%d\n", + if_name(HFSCIF_IFP(hif)), hfsc_style(hif), + cl->cl_handle, cl->cl_id); + } + + zfree(hfsc_cl_zone, cl); + + return (0); +} + +/* + * hfsc_nextclass returns the next class in the tree. 
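+ * (This is a pre-order walk: descend to a child if one exists, else
+ * move to a sibling, else climb until a parent with an unvisited
+ * sibling is found.)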
+ * usage: + * for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl)) + * do_something; + */ +static struct hfsc_class * +hfsc_nextclass(struct hfsc_class *cl) +{ + IFCQ_LOCK_ASSERT_HELD(cl->cl_hif->hif_ifq); + + if (cl->cl_children != NULL) + cl = cl->cl_children; + else if (cl->cl_siblings != NULL) + cl = cl->cl_siblings; + else { + while ((cl = cl->cl_parent) != NULL) + if (cl->cl_siblings) { + cl = cl->cl_siblings; + break; + } + } + + return (cl); +} + +int +hfsc_enqueue(struct hfsc_if *hif, struct hfsc_class *cl, struct mbuf *m, + struct pf_mtag *t) +{ + struct ifclassq *ifq = hif->hif_ifq; + u_int32_t len; + int ret; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(cl == NULL || cl->cl_hif == hif); + + if (cl == NULL) { + cl = hfsc_clh_to_clp(hif, t->pftag_qid); + if (cl == NULL || HFSC_IS_A_PARENT_CLASS(cl)) { + cl = hif->hif_defaultclass; + if (cl == NULL) { + IFCQ_CONVERT_LOCK(ifq); + m_freem(m); + return (ENOBUFS); + } + } + } + + len = m_pktlen(m); + + ret = hfsc_addq(cl, m, t); + if (ret != 0) { + if (ret == CLASSQEQ_SUCCESS_FC) { + /* packet enqueued, return advisory feedback */ + ret = EQFULL; + } else { + VERIFY(ret == CLASSQEQ_DROPPED || + ret == CLASSQEQ_DROPPED_FC || + ret == CLASSQEQ_DROPPED_SP); + /* packet has been freed in hfsc_addq */ + PKTCNTR_ADD(&cl->cl_stats.drop_cnt, 1, len); + IFCQ_DROP_ADD(ifq, 1, len); + switch (ret) { + case CLASSQEQ_DROPPED: + return (ENOBUFS); + case CLASSQEQ_DROPPED_FC: + return (EQFULL); + case CLASSQEQ_DROPPED_SP: + return (EQSUSPENDED); + } + /* NOT_REACHED */ + } + } + IFCQ_INC_LEN(ifq); + cl->cl_hif->hif_packets++; + + /* successfully queued. */ + if (qlen(&cl->cl_q) == 1) + set_active(cl, len); + + return (ret); +} + +/* + * note: CLASSQDQ_POLL returns the next packet without removing the packet + * from the queue. CLASSQDQ_REMOVE is a normal dequeue operation. + * CLASSQDQ_REMOVE must return the same packet if called immediately + * after CLASSQDQ_POLL. + */ +struct mbuf * +hfsc_dequeue(struct hfsc_if *hif, cqdq_op_t op) +{ + struct ifclassq *ifq = hif->hif_ifq; + struct hfsc_class *cl; + struct mbuf *m; + u_int32_t len, next_len; + int realtime = 0; + u_int64_t cur_time; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if (hif->hif_packets == 0) + /* no packet in the tree */ + return (NULL); + + cur_time = read_machclk(); + + if (op == CLASSQDQ_REMOVE && hif->hif_pollcache != NULL) { + + cl = hif->hif_pollcache; + hif->hif_pollcache = NULL; + /* check if the class was scheduled by real-time criteria */ + if (cl->cl_flags & HFCF_RSC) + realtime = (cl->cl_e <= cur_time); + } else { + /* + * if there are eligible classes, use real-time criteria. + * find the class with the minimum deadline among + * the eligible classes. + */ + if ((cl = ellist_get_mindl(&hif->hif_eligible, cur_time)) + != NULL) { + realtime = 1; + } else { + int fits = 0; + /* + * use link-sharing criteria + * get the class with the minimum vt in the hierarchy + */ + cl = hif->hif_rootclass; + while (HFSC_IS_A_PARENT_CLASS(cl)) { + + cl = actlist_firstfit(cl, cur_time); + if (cl == NULL) { + if (fits > 0) + log(LOG_ERR, "%s: %s " + "%d fit but none found\n", + if_name(HFSCIF_IFP(hif)), + hfsc_style(hif), fits); + return (NULL); + } + /* + * update parent's cl_cvtmin. + * don't update if the new vt is smaller. 
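+ * (cl_cvtmin tracks the minimal virtual time among the children
+ * fit for link-sharing and is monotonic within a backlog period)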
+ */ + if (cl->cl_parent->cl_cvtmin < cl->cl_vt) + cl->cl_parent->cl_cvtmin = cl->cl_vt; + fits++; + } + } + + if (op == CLASSQDQ_POLL) { + hif->hif_pollcache = cl; + m = hfsc_pollq(cl); + return (m); + } + } + + m = hfsc_getq(cl); + VERIFY(m != NULL); + len = m_pktlen(m); + cl->cl_hif->hif_packets--; + IFCQ_DEC_LEN(ifq); + IFCQ_XMIT_ADD(ifq, 1, len); + PKTCNTR_ADD(&cl->cl_stats.xmit_cnt, 1, len); + + update_vf(cl, len, cur_time); + if (realtime) + cl->cl_cumul += len; + + if (!qempty(&cl->cl_q)) { + if (cl->cl_flags & HFCF_RSC) { + /* update ed */ + next_len = m_pktlen(qhead(&cl->cl_q)); + + if (realtime) + update_ed(cl, next_len); + else + update_d(cl, next_len); + } + } else { + /* the class becomes passive */ + set_passive(cl); + } + + return (m); + +} + +static int +hfsc_addq(struct hfsc_class *cl, struct mbuf *m, struct pf_mtag *t) +{ + struct ifclassq *ifq = cl->cl_hif->hif_ifq; + + IFCQ_LOCK_ASSERT_HELD(ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + return (rio_addq(cl->cl_rio, &cl->cl_q, m, t)); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + return (red_addq(cl->cl_red, &cl->cl_q, m, t)); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + return (blue_addq(cl->cl_blue, &cl->cl_q, m, t)); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q)) { + if (cl->cl_sfb == NULL) { + struct ifnet *ifp = HFSCIF_IFP(cl->cl_hif); + + VERIFY(cl->cl_flags & HFCF_LAZY); + IFCQ_CONVERT_LOCK(ifq); + + cl->cl_sfb = sfb_alloc(ifp, cl->cl_handle, + qlimit(&cl->cl_q), cl->cl_qflags); + if (cl->cl_sfb == NULL) { + /* fall back to droptail */ + qtype(&cl->cl_q) = Q_DROPTAIL; + cl->cl_flags &= ~HFCF_SFB; + cl->cl_qflags &= ~(SFBF_ECN | SFBF_FLOWCTL); + + log(LOG_ERR, "%s: %s SFB lazy allocation " + "failed for qid=%d slot=%d, falling back " + "to DROPTAIL\n", if_name(ifp), + hfsc_style(cl->cl_hif), cl->cl_handle, + cl->cl_id); + } + } + if (cl->cl_sfb != NULL) + return (sfb_addq(cl->cl_sfb, &cl->cl_q, m, t)); + } else if (qlen(&cl->cl_q) >= qlimit(&cl->cl_q)) { + IFCQ_CONVERT_LOCK(ifq); + m_freem(m); + return (CLASSQEQ_DROPPED); + } + + if (cl->cl_flags & HFCF_CLEARDSCP) + write_dsfield(m, t, 0); + + _addq(&cl->cl_q, m); + + return (0); +} + +static struct mbuf * +hfsc_getq(struct hfsc_class *cl) +{ + IFCQ_LOCK_ASSERT_HELD(cl->cl_hif->hif_ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + return (rio_getq(cl->cl_rio, &cl->cl_q)); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + return (red_getq(cl->cl_red, &cl->cl_q)); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + return (blue_getq(cl->cl_blue, &cl->cl_q)); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + return (sfb_getq(cl->cl_sfb, &cl->cl_q)); + + return (_getq(&cl->cl_q)); +} + +static struct mbuf * +hfsc_pollq(struct hfsc_class *cl) +{ + IFCQ_LOCK_ASSERT_HELD(cl->cl_hif->hif_ifq); + + return (qhead(&cl->cl_q)); +} + +static void +hfsc_purgeq(struct hfsc_if *hif, struct hfsc_class *cl, u_int32_t flow, + u_int32_t *packets, u_int32_t *bytes) +{ + struct ifclassq *ifq = hif->hif_ifq; + u_int32_t cnt = 0, len = 0, qlen; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if ((qlen = qlen(&cl->cl_q)) == 0) { + VERIFY(hif->hif_packets == 0); + goto done; + } + + /* become regular mutex before freeing mbufs */ + IFCQ_CONVERT_LOCK(ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + rio_purgeq(cl->cl_rio, &cl->cl_q, flow, &cnt, &len); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + 
red_purgeq(cl->cl_red, &cl->cl_q, flow, &cnt, &len); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + blue_purgeq(cl->cl_blue, &cl->cl_q, flow, &cnt, &len); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + sfb_purgeq(cl->cl_sfb, &cl->cl_q, flow, &cnt, &len); + else + _flushq_flow(&cl->cl_q, flow, &cnt, &len); + + if (cnt > 0) { + VERIFY(qlen(&cl->cl_q) == (qlen - cnt)); + + PKTCNTR_ADD(&cl->cl_stats.drop_cnt, cnt, len); + IFCQ_DROP_ADD(ifq, cnt, len); + + VERIFY(hif->hif_packets >= cnt); + hif->hif_packets -= cnt; + + VERIFY(((signed)IFCQ_LEN(ifq) - cnt) >= 0); + IFCQ_LEN(ifq) -= cnt; + + if (qempty(&cl->cl_q)) { + update_vf(cl, 0, 0); /* remove cl from the actlist */ + set_passive(cl); + } + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s purge qid=%d slot=%d " + "qlen=[%d,%d] cnt=%d len=%d flow=0x%x\n", + if_name(HFSCIF_IFP(hif)), hfsc_style(hif), + cl->cl_handle, cl->cl_id, qlen, qlen(&cl->cl_q), + cnt, len, flow); + } + } +done: + if (packets != NULL) + *packets = cnt; + if (bytes != NULL) + *bytes = len; +} + +static void +hfsc_print_sc(struct hfsc_if *hif, u_int32_t qid, u_int64_t eff_rate, + struct service_curve *sc, struct internal_sc *isc, const char *which) +{ + struct ifnet *ifp = HFSCIF_IFP(hif); + + log(LOG_DEBUG, "%s: %s qid=%d {%s_m1=%llu%s [%llu], " + "%s_d=%u msec, %s_m2=%llu%s [%llu]} linkrate=%llu bps\n", + if_name(ifp), hfsc_style(hif), qid, + which, sc->m1, (sc->fl & HFSCF_M1_PCT) ? "%" : " bps", isc->sm1, + which, sc->d, + which, sc->m2, (sc->fl & HFSCF_M2_PCT) ? "%" : " bps", isc->sm2, + eff_rate); +} + +static void +hfsc_updateq_linkrate(struct hfsc_if *hif, struct hfsc_class *cl) +{ + u_int64_t eff_rate = ifnet_output_linkrate(HFSCIF_IFP(hif)); + struct service_curve *sc; + struct internal_sc *isc; + + /* Update parameters only if rate has changed */ + if (eff_rate == hif->hif_eff_rate) + return; + + sc = &cl->cl_rsc0; + isc = &cl->cl_rsc; + if ((cl->cl_flags & HFCF_RSC) && sc2isc(cl, sc, isc, eff_rate)) { + rtsc_init(&cl->cl_deadline, isc, 0, 0); + rtsc_init(&cl->cl_eligible, isc, 0, 0); + if (pktsched_verbose) { + hfsc_print_sc(hif, cl->cl_handle, eff_rate, + sc, isc, "rsc"); + } + } + sc = &cl->cl_fsc0; + isc = &cl->cl_fsc; + if ((cl->cl_flags & HFCF_FSC) && sc2isc(cl, sc, isc, eff_rate)) { + rtsc_init(&cl->cl_virtual, isc, 0, 0); + if (pktsched_verbose) { + hfsc_print_sc(hif, cl->cl_handle, eff_rate, + sc, isc, "fsc"); + } + } + sc = &cl->cl_usc0; + isc = &cl->cl_usc; + if ((cl->cl_flags & HFCF_USC) && sc2isc(cl, sc, isc, eff_rate)) { + rtsc_init(&cl->cl_ulimit, isc, 0, 0); + if (pktsched_verbose) { + hfsc_print_sc(hif, cl->cl_handle, eff_rate, + sc, isc, "usc"); + } + } +} + +static void +hfsc_updateq(struct hfsc_if *hif, struct hfsc_class *cl, cqev_t ev) +{ + IFCQ_LOCK_ASSERT_HELD(hif->hif_ifq); + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s update qid=%d slot=%d event=%s\n", + if_name(HFSCIF_IFP(hif)), hfsc_style(hif), + cl->cl_handle, cl->cl_id, ifclassq_ev2str(ev)); + } + + if (ev == CLASSQ_EV_LINK_SPEED) + hfsc_updateq_linkrate(hif, cl); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + return (rio_updateq(cl->cl_rio, ev)); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + return (red_updateq(cl->cl_red, ev)); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + return (blue_updateq(cl->cl_blue, ev)); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + return (sfb_updateq(cl->cl_sfb, ev)); +} + +static void 
+set_active(struct hfsc_class *cl, u_int32_t len) +{ + if (cl->cl_flags & HFCF_RSC) + init_ed(cl, len); + if (cl->cl_flags & HFCF_FSC) + init_vf(cl, len); + + cl->cl_stats.period++; +} + +static void +set_passive(struct hfsc_class *cl) +{ + if (cl->cl_flags & HFCF_RSC) + ellist_remove(cl); + + /* + * actlist is now handled in update_vf() so that update_vf(cl, 0, 0) + * needs to be called explicitly to remove a class from actlist + */ +} + +static void +init_ed(struct hfsc_class *cl, u_int32_t next_len) +{ + u_int64_t cur_time; + + cur_time = read_machclk(); + + /* update the deadline curve */ + rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); + + /* + * update the eligible curve. + * for concave, it is equal to the deadline curve. + * for convex, it is a linear curve with slope m2. + */ + cl->cl_eligible = cl->cl_deadline; + if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) { + cl->cl_eligible.dx = 0; + cl->cl_eligible.dy = 0; + } + + /* compute e and d */ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + ellist_insert(cl); +} + +static void +update_ed(struct hfsc_class *cl, u_int32_t next_len) +{ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + ellist_update(cl); +} + +static void +update_d(struct hfsc_class *cl, u_int32_t next_len) +{ + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); +} + +static void +init_vf(struct hfsc_class *cl, u_int32_t len) +{ +#pragma unused(len) + struct hfsc_class *max_cl, *p; + u_int64_t vt, f, cur_time; + int go_active; + + cur_time = 0; + go_active = 1; + for (; cl->cl_parent != NULL; cl = cl->cl_parent) { + + if (go_active && cl->cl_nactive++ == 0) + go_active = 1; + else + go_active = 0; + + if (go_active) { + max_cl = actlist_last(&cl->cl_parent->cl_actc); + if (max_cl != NULL) { + /* + * set vt to the average of the min and max + * classes. if the parent's period didn't + * change, don't decrease vt of the class. + */ + vt = max_cl->cl_vt; + if (cl->cl_parent->cl_cvtmin != 0) + vt = (cl->cl_parent->cl_cvtmin + vt)/2; + + if (cl->cl_parent->cl_vtperiod != + cl->cl_parentperiod || vt > cl->cl_vt) + cl->cl_vt = vt; + } else { + /* + * first child for a new parent backlog period. + * add parent's cvtmax to vtoff of children + * to make a new vt (vtoff + vt) larger than + * the vt in the last period for all children. 
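+ * (cl_vtoff is the inter-period cumulative vt offset; cl_cvtmax
+ * is the max child's vt in the last period)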
+ */ + vt = cl->cl_parent->cl_cvtmax; + for (p = cl->cl_parent->cl_children; p != NULL; + p = p->cl_siblings) + p->cl_vtoff += vt; + cl->cl_vt = 0; + cl->cl_parent->cl_cvtmax = 0; + cl->cl_parent->cl_cvtmin = 0; + } + cl->cl_initvt = cl->cl_vt; + + /* update the virtual curve */ + vt = cl->cl_vt + cl->cl_vtoff; + rtsc_min(&cl->cl_virtual, &cl->cl_fsc, + vt, cl->cl_total); + if (cl->cl_virtual.x == vt) { + cl->cl_virtual.x -= cl->cl_vtoff; + cl->cl_vtoff = 0; + } + cl->cl_vtadj = 0; + + cl->cl_vtperiod++; /* increment vt period */ + cl->cl_parentperiod = cl->cl_parent->cl_vtperiod; + if (cl->cl_parent->cl_nactive == 0) + cl->cl_parentperiod++; + cl->cl_f = 0; + + actlist_insert(cl); + + if (cl->cl_flags & HFCF_USC) { + /* class has upper limit curve */ + if (cur_time == 0) + cur_time = read_machclk(); + + /* update the ulimit curve */ + rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time, + cl->cl_total); + /* compute myf */ + cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, + cl->cl_total); + cl->cl_myfadj = 0; + } + } + + if (cl->cl_myf > cl->cl_cfmin) + f = cl->cl_myf; + else + f = cl->cl_cfmin; + if (f != cl->cl_f) { + cl->cl_f = f; + update_cfmin(cl->cl_parent); + } + } +} + +static void +update_vf(struct hfsc_class *cl, u_int32_t len, u_int64_t cur_time) +{ +#pragma unused(cur_time) +#if 0 + u_int64_t myf_bound, delta; +#endif + u_int64_t f; + int go_passive; + + go_passive = (qempty(&cl->cl_q) && (cl->cl_flags & HFCF_FSC)); + + for (; cl->cl_parent != NULL; cl = cl->cl_parent) { + + cl->cl_total += len; + + if (!(cl->cl_flags & HFCF_FSC) || cl->cl_nactive == 0) + continue; + + if (go_passive && --cl->cl_nactive == 0) + go_passive = 1; + else + go_passive = 0; + + if (go_passive) { + /* no more active child, going passive */ + + /* update cvtmax of the parent class */ + if (cl->cl_vt > cl->cl_parent->cl_cvtmax) + cl->cl_parent->cl_cvtmax = cl->cl_vt; + + /* remove this class from the vt list */ + actlist_remove(cl); + + update_cfmin(cl->cl_parent); + + continue; + } + + /* + * update vt and f + */ + cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) + - cl->cl_vtoff + cl->cl_vtadj; + + /* + * if vt of the class is smaller than cvtmin, + * the class was skipped in the past due to non-fit. + * if so, we need to adjust vtadj. + */ + if (cl->cl_vt < cl->cl_parent->cl_cvtmin) { + cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt; + cl->cl_vt = cl->cl_parent->cl_cvtmin; + } + + /* update the vt list */ + actlist_update(cl); + + if (cl->cl_flags & HFCF_USC) { + cl->cl_myf = cl->cl_myfadj + + rtsc_y2x(&cl->cl_ulimit, cl->cl_total); +#if 0 + /* + * if myf lags behind by more than one clock tick + * from the current time, adjust myfadj to prevent + * a rate-limited class from going greedy. + * in a steady state under rate-limiting, myf + * fluctuates within one clock tick. 
+ */ + myf_bound = cur_time - machclk_per_tick; + if (cl->cl_myf < myf_bound) { + delta = cur_time - cl->cl_myf; + cl->cl_myfadj += delta; + cl->cl_myf += delta; + } +#endif + } + + /* cl_f is max(cl_myf, cl_cfmin) */ + if (cl->cl_myf > cl->cl_cfmin) + f = cl->cl_myf; + else + f = cl->cl_cfmin; + if (f != cl->cl_f) { + cl->cl_f = f; + update_cfmin(cl->cl_parent); + } + } +} + +static void +update_cfmin(struct hfsc_class *cl) +{ + struct hfsc_class *p; + u_int64_t cfmin; + + if (TAILQ_EMPTY(&cl->cl_actc)) { + cl->cl_cfmin = 0; + return; + } + cfmin = HT_INFINITY; + TAILQ_FOREACH(p, &cl->cl_actc, cl_actlist) { + if (p->cl_f == 0) { + cl->cl_cfmin = 0; + return; + } + if (p->cl_f < cfmin) + cfmin = p->cl_f; + } + cl->cl_cfmin = cfmin; +} + +/* + * TAILQ based ellist and actlist implementation + * (ion wanted to make a calendar queue based implementation) + */ +/* + * eligible list holds backlogged classes being sorted by their eligible times. + * there is one eligible list per interface. + */ + +static void +ellist_insert(struct hfsc_class *cl) +{ + struct hfsc_if *hif = cl->cl_hif; + struct hfsc_class *p; + + /* check the last entry first */ + if ((p = TAILQ_LAST(&hif->hif_eligible, _eligible)) == NULL || + p->cl_e <= cl->cl_e) { + TAILQ_INSERT_TAIL(&hif->hif_eligible, cl, cl_ellist); + return; + } + + TAILQ_FOREACH(p, &hif->hif_eligible, cl_ellist) { + if (cl->cl_e < p->cl_e) { + TAILQ_INSERT_BEFORE(p, cl, cl_ellist); + return; + } + } + VERIFY(0); /* should not reach here */ +} + +static void +ellist_remove(struct hfsc_class *cl) +{ + struct hfsc_if *hif = cl->cl_hif; + + TAILQ_REMOVE(&hif->hif_eligible, cl, cl_ellist); +} + +static void +ellist_update(struct hfsc_class *cl) +{ + struct hfsc_if *hif = cl->cl_hif; + struct hfsc_class *p, *last; + + /* + * the eligible time of a class increases monotonically. + * if the next entry has a larger eligible time, nothing to do. + */ + p = TAILQ_NEXT(cl, cl_ellist); + if (p == NULL || cl->cl_e <= p->cl_e) + return; + + /* check the last entry */ + last = TAILQ_LAST(&hif->hif_eligible, _eligible); + VERIFY(last != NULL); + if (last->cl_e <= cl->cl_e) { + TAILQ_REMOVE(&hif->hif_eligible, cl, cl_ellist); + TAILQ_INSERT_TAIL(&hif->hif_eligible, cl, cl_ellist); + return; + } + + /* + * the new position must be between the next entry + * and the last entry + */ + while ((p = TAILQ_NEXT(p, cl_ellist)) != NULL) { + if (cl->cl_e < p->cl_e) { + TAILQ_REMOVE(&hif->hif_eligible, cl, cl_ellist); + TAILQ_INSERT_BEFORE(p, cl, cl_ellist); + return; + } + } + VERIFY(0); /* should not reach here */ +} + +/* find the class with the minimum deadline among the eligible classes */ +static struct hfsc_class * +ellist_get_mindl(ellist_t *head, u_int64_t cur_time) +{ + struct hfsc_class *p, *cl = NULL; + + TAILQ_FOREACH(p, head, cl_ellist) { + if (p->cl_e > cur_time) + break; + if (cl == NULL || p->cl_d < cl->cl_d) + cl = p; + } + return (cl); +} + +/* + * active children list holds backlogged child classes being sorted + * by their virtual time. + * each intermediate class has one active children list. 
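+ * keeping the list vt-sorted lets actlist_firstfit() below return
+ * the minimum-vt class whose fit-time has passed in a single scan.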
+ */
+
+static void
+actlist_insert(struct hfsc_class *cl)
+{
+ struct hfsc_class *p;
+
+ /* check the last entry first */
+ if ((p = TAILQ_LAST(&cl->cl_parent->cl_actc, _active)) == NULL ||
+ p->cl_vt <= cl->cl_vt) {
+ TAILQ_INSERT_TAIL(&cl->cl_parent->cl_actc, cl, cl_actlist);
+ return;
+ }
+
+ TAILQ_FOREACH(p, &cl->cl_parent->cl_actc, cl_actlist) {
+ if (cl->cl_vt < p->cl_vt) {
+ TAILQ_INSERT_BEFORE(p, cl, cl_actlist);
+ return;
+ }
+ }
+ VERIFY(0); /* should not reach here */
+}
+
+static void
+actlist_remove(struct hfsc_class *cl)
+{
+ TAILQ_REMOVE(&cl->cl_parent->cl_actc, cl, cl_actlist);
+}
+
+static void
+actlist_update(struct hfsc_class *cl)
+{
+ struct hfsc_class *p, *last;
+
+ /*
+ * the virtual time of a class increases monotonically during its
+ * backlogged period.
+ * if the next entry has a larger virtual time, nothing to do.
+ */
+ p = TAILQ_NEXT(cl, cl_actlist);
+ if (p == NULL || cl->cl_vt < p->cl_vt)
+ return;
+
+ /* check the last entry */
+ last = TAILQ_LAST(&cl->cl_parent->cl_actc, _active);
+ VERIFY(last != NULL);
+ if (last->cl_vt <= cl->cl_vt) {
+ TAILQ_REMOVE(&cl->cl_parent->cl_actc, cl, cl_actlist);
+ TAILQ_INSERT_TAIL(&cl->cl_parent->cl_actc, cl, cl_actlist);
+ return;
+ }
+
+ /*
+ * the new position must be between the next entry
+ * and the last entry
+ */
+ while ((p = TAILQ_NEXT(p, cl_actlist)) != NULL) {
+ if (cl->cl_vt < p->cl_vt) {
+ TAILQ_REMOVE(&cl->cl_parent->cl_actc, cl, cl_actlist);
+ TAILQ_INSERT_BEFORE(p, cl, cl_actlist);
+ return;
+ }
+ }
+ VERIFY(0); /* should not reach here */
+}
+
+static struct hfsc_class *
+actlist_firstfit(struct hfsc_class *cl, u_int64_t cur_time)
+{
+ struct hfsc_class *p;
+
+ TAILQ_FOREACH(p, &cl->cl_actc, cl_actlist) {
+ if (p->cl_f <= cur_time)
+ return (p);
+ }
+ return (NULL);
+}
+
+/*
+ * service curve support functions
+ *
+ * external service curve parameters
+ * m: bits/sec
+ * d: msec
+ * internal service curve parameters
+ * sm: (bytes/tsc_interval) << SM_SHIFT
+ * ism: (tsc_count/byte) << ISM_SHIFT
+ * dx: tsc_count
+ *
+ * SM_SHIFT and ISM_SHIFT are scaled in order to keep effective digits.
+ * we should be able to handle 100K-1Gbps linkspeed with 200MHz-1GHz CPU
+ * speed. SM_SHIFT and ISM_SHIFT are selected to have at least 3 effective
+ * digits in decimal using the following table.
+ * + * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps + * ----------+------------------------------------------------------- + * bytes/nsec 12.5e-6 125e-6 1250e-6 12500e-6 125000e-6 + * sm(500MHz) 25.0e-6 250e-6 2500e-6 25000e-6 250000e-6 + * sm(200MHz) 62.5e-6 625e-6 6250e-6 62500e-6 625000e-6 + * + * nsec/byte 80000 8000 800 80 8 + * ism(500MHz) 40000 4000 400 40 4 + * ism(200MHz) 16000 1600 160 16 1.6 + */ +#define SM_SHIFT 24 +#define ISM_SHIFT 10 + +#define SM_MASK ((1LL << SM_SHIFT) - 1) +#define ISM_MASK ((1LL << ISM_SHIFT) - 1) + +static inline u_int64_t +seg_x2y(u_int64_t x, u_int64_t sm) +{ + u_int64_t y; + + /* + * compute + * y = x * sm >> SM_SHIFT + * but divide it for the upper and lower bits to avoid overflow + */ + y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT); + return (y); +} + +static inline u_int64_t +seg_y2x(u_int64_t y, u_int64_t ism) +{ + u_int64_t x; + + if (y == 0) + x = 0; + else if (ism == HT_INFINITY) + x = HT_INFINITY; + else { + x = (y >> ISM_SHIFT) * ism + + (((y & ISM_MASK) * ism) >> ISM_SHIFT); + } + return (x); +} + +static inline u_int64_t +m2sm(u_int64_t m) +{ + u_int64_t sm; + + sm = (m << SM_SHIFT) / 8 / machclk_freq; + return (sm); +} + +static inline u_int64_t +m2ism(u_int64_t m) +{ + u_int64_t ism; + + if (m == 0) + ism = HT_INFINITY; + else + ism = ((u_int64_t)machclk_freq << ISM_SHIFT) * 8 / m; + return (ism); +} + +static inline u_int64_t +d2dx(u_int64_t d) +{ + u_int64_t dx; + + dx = (d * machclk_freq) / 1000; + return (dx); +} + +static u_int64_t +sm2m(u_int64_t sm) +{ + u_int64_t m; + + m = (sm * 8 * machclk_freq) >> SM_SHIFT; + return (m); +} + +static u_int64_t +dx2d(u_int64_t dx) +{ + u_int64_t d; + + d = dx * 1000 / machclk_freq; + return (d); +} + +static boolean_t +sc2isc(struct hfsc_class *cl, struct service_curve *sc, struct internal_sc *isc, + u_int64_t eff_rate) +{ + struct hfsc_if *hif = cl->cl_hif; + struct internal_sc oisc = *isc; + u_int64_t m1, m2; + + if (eff_rate == 0 && (sc->fl & (HFSCF_M1_PCT | HFSCF_M2_PCT))) { + /* + * If service curve is configured with percentage and the + * effective uplink rate is not known, assume this is a + * transient case, and that the rate will be updated in + * the near future via CLASSQ_EV_LINK_SPEED. Pick a + * reasonable number for now, e.g. 10 Mbps. + */ + eff_rate = (10 * 1000 * 1000); + + log(LOG_WARNING, "%s: %s qid=%d slot=%d eff_rate unknown; " + "using temporary rate %llu bps\n", if_name(HFSCIF_IFP(hif)), + hfsc_style(hif), cl->cl_handle, cl->cl_id, eff_rate); + } + + m1 = sc->m1; + if (sc->fl & HFSCF_M1_PCT) { + VERIFY(m1 > 0 && m1 <= 100); + m1 = (eff_rate * m1) / 100; + } + + m2 = sc->m2; + if (sc->fl & HFSCF_M2_PCT) { + VERIFY(m2 > 0 && m2 <= 100); + m2 = (eff_rate * m2) / 100; + } + + isc->sm1 = m2sm(m1); + isc->ism1 = m2ism(m1); + isc->dx = d2dx(sc->d); + isc->dy = seg_x2y(isc->dx, isc->sm1); + isc->sm2 = m2sm(m2); + isc->ism2 = m2ism(m2); + + /* return non-zero if there's any change */ + return (bcmp(&oisc, isc, sizeof (*isc))); +} + +/* + * initialize the runtime service curve with the given internal + * service curve starting at (x, y). 
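+ * (x is in machclk ticks and y in bytes; see the internal
+ * representation notes in pktsched_hfsc.h)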
+ */
+static void
+rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, u_int64_t x,
+ u_int64_t y)
+{
+ rtsc->x = x;
+ rtsc->y = y;
+ rtsc->sm1 = isc->sm1;
+ rtsc->ism1 = isc->ism1;
+ rtsc->dx = isc->dx;
+ rtsc->dy = isc->dy;
+ rtsc->sm2 = isc->sm2;
+ rtsc->ism2 = isc->ism2;
+}
+
+/*
+ * calculate the x-projection of the runtime service curve by the
+ * given y-projection value
+ */
+static u_int64_t
+rtsc_y2x(struct runtime_sc *rtsc, u_int64_t y)
+{
+ u_int64_t x;
+
+ if (y < rtsc->y)
+ x = rtsc->x;
+ else if (y <= rtsc->y + rtsc->dy) {
+ /* x belongs to the 1st segment */
+ if (rtsc->dy == 0)
+ x = rtsc->x + rtsc->dx;
+ else
+ x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1);
+ } else {
+ /* x belongs to the 2nd segment */
+ x = rtsc->x + rtsc->dx +
+ seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2);
+ }
+ return (x);
+}
+
+/*
+ * calculate the y-projection of the runtime service curve by the
+ * given x-projection value
+ */
+static u_int64_t
+rtsc_x2y(struct runtime_sc *rtsc, u_int64_t x)
+{
+ u_int64_t y;
+
+ if (x <= rtsc->x)
+ y = rtsc->y;
+ else if (x <= rtsc->x + rtsc->dx)
+ /* y belongs to the 1st segment */
+ y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1);
+ else
+ /* y belongs to the 2nd segment */
+ y = rtsc->y + rtsc->dy +
+ seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2);
+ return (y);
+}
+
+/*
+ * update the runtime service curve by taking the minimum of the current
+ * runtime service curve and the service curve starting at (x, y).
+ */
+static void
+rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u_int64_t x,
+ u_int64_t y)
+{
+ u_int64_t y1, y2, dx, dy;
+
+ if (isc->sm1 <= isc->sm2) {
+ /* service curve is convex */
+ y1 = rtsc_x2y(rtsc, x);
+ if (y1 < y)
+ /* the current rtsc is smaller */
+ return;
+ rtsc->x = x;
+ rtsc->y = y;
+ return;
+ }
+
+ /*
+ * service curve is concave
+ * compute the two y values of the current rtsc
+ * y1: at x
+ * y2: at (x + dx)
+ */
+ y1 = rtsc_x2y(rtsc, x);
+ if (y1 <= y) {
+ /* rtsc is below isc, no change to rtsc */
+ return;
+ }
+
+ y2 = rtsc_x2y(rtsc, x + isc->dx);
+ if (y2 >= y + isc->dy) {
+ /* rtsc is above isc, replace rtsc by isc */
+ rtsc->x = x;
+ rtsc->y = y;
+ rtsc->dx = isc->dx;
+ rtsc->dy = isc->dy;
+ return;
+ }
+
+ /*
+ * the two curves intersect
+ * compute the offsets (dx, dy) using the reverse
+ * function of seg_x2y()
+ * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y)
+ * since seg_x2y(dx, sm) is (dx * sm) >> SM_SHIFT, solving for dx
+ * gives dx == ((y1 - y) << SM_SHIFT) / (sm1 - sm2)
+ */
+ dx = ((y1 - y) << SM_SHIFT) / (isc->sm1 - isc->sm2);
+ /*
+ * check if (x, y1) belongs to the 1st segment of rtsc.
+ * if so, add the offset.
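+ * (rtsc->x + rtsc->dx is where the 2nd segment of rtsc begins)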
+ */ + if (rtsc->x + rtsc->dx > x) + dx += rtsc->x + rtsc->dx - x; + dy = seg_x2y(dx, isc->sm1); + + rtsc->x = x; + rtsc->y = y; + rtsc->dx = dx; + rtsc->dy = dy; +} + +int +hfsc_get_class_stats(struct hfsc_if *hif, u_int32_t qid, + struct hfsc_classstats *sp) +{ + struct hfsc_class *cl; + + IFCQ_LOCK_ASSERT_HELD(hif->hif_ifq); + + if ((cl = hfsc_clh_to_clp(hif, qid)) == NULL) + return (EINVAL); + + sp->class_id = cl->cl_id; + sp->class_handle = cl->cl_handle; + + if (cl->cl_flags & HFCF_RSC) { + sp->rsc.m1 = sm2m(cl->cl_rsc.sm1); + sp->rsc.d = dx2d(cl->cl_rsc.dx); + sp->rsc.m2 = sm2m(cl->cl_rsc.sm2); + } else { + sp->rsc.m1 = 0; + sp->rsc.d = 0; + sp->rsc.m2 = 0; + } + if (cl->cl_flags & HFCF_FSC) { + sp->fsc.m1 = sm2m(cl->cl_fsc.sm1); + sp->fsc.d = dx2d(cl->cl_fsc.dx); + sp->fsc.m2 = sm2m(cl->cl_fsc.sm2); + } else { + sp->fsc.m1 = 0; + sp->fsc.d = 0; + sp->fsc.m2 = 0; + } + if (cl->cl_flags & HFCF_USC) { + sp->usc.m1 = sm2m(cl->cl_usc.sm1); + sp->usc.d = dx2d(cl->cl_usc.dx); + sp->usc.m2 = sm2m(cl->cl_usc.sm2); + } else { + sp->usc.m1 = 0; + sp->usc.d = 0; + sp->usc.m2 = 0; + } + + sp->total = cl->cl_total; + sp->cumul = cl->cl_cumul; + + sp->d = cl->cl_d; + sp->e = cl->cl_e; + sp->vt = cl->cl_vt; + sp->f = cl->cl_f; + + sp->initvt = cl->cl_initvt; + sp->vtperiod = cl->cl_vtperiod; + sp->parentperiod = cl->cl_parentperiod; + sp->nactive = cl->cl_nactive; + sp->vtoff = cl->cl_vtoff; + sp->cvtmax = cl->cl_cvtmax; + sp->myf = cl->cl_myf; + sp->cfmin = cl->cl_cfmin; + sp->cvtmin = cl->cl_cvtmin; + sp->myfadj = cl->cl_myfadj; + sp->vtadj = cl->cl_vtadj; + + sp->cur_time = read_machclk(); + sp->machclk_freq = machclk_freq; + + sp->qlength = qlen(&cl->cl_q); + sp->qlimit = qlimit(&cl->cl_q); + sp->xmit_cnt = cl->cl_stats.xmit_cnt; + sp->drop_cnt = cl->cl_stats.drop_cnt; + sp->period = cl->cl_stats.period; + + sp->qtype = qtype(&cl->cl_q); + sp->qstate = qstate(&cl->cl_q); +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + red_getstats(cl->cl_red, &sp->red[0]); +#endif /* CLASSQ_RED */ +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + rio_getstats(cl->cl_rio, &sp->red[0]); +#endif /* CLASSQ_RIO */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + blue_getstats(cl->cl_blue, &sp->blue); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + sfb_getstats(cl->cl_sfb, &sp->sfb); + + return (0); +} + +/* convert a class handle to the corresponding class pointer */ +static struct hfsc_class * +hfsc_clh_to_clp(struct hfsc_if *hif, u_int32_t chandle) +{ + u_int32_t i; + struct hfsc_class *cl; + + IFCQ_LOCK_ASSERT_HELD(hif->hif_ifq); + + /* + * first, try optimistically the slot matching the lower bits of + * the handle. if it fails, do the linear table search. + */ + i = chandle % hif->hif_maxclasses; + if ((cl = hif->hif_class_tbl[i]) != NULL && cl->cl_handle == chandle) + return (cl); + for (i = 0; i < hif->hif_maxclasses; i++) + if ((cl = hif->hif_class_tbl[i]) != NULL && + cl->cl_handle == chandle) + return (cl); + return (NULL); +} + +static const char * +hfsc_style(struct hfsc_if *hif) +{ + return ((hif->hif_flags & HFSCIFF_ALTQ) ? 
"ALTQ_HFSC" : "HFSC"); +} + +int +hfsc_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags) +{ +#pragma unused(ifq, flags) + return (ENXIO); /* not yet */ +} + +int +hfsc_teardown_ifclassq(struct ifclassq *ifq) +{ + struct hfsc_if *hif = ifq->ifcq_disc; + int i; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(hif != NULL && ifq->ifcq_type == PKTSCHEDT_HFSC); + + (void) hfsc_destroy_locked(hif); + + ifq->ifcq_disc = NULL; + for (i = 0; i < IFCQ_SC_MAX; i++) { + ifq->ifcq_disc_slots[i].qid = 0; + ifq->ifcq_disc_slots[i].cl = NULL; + } + + return (ifclassq_detach(ifq)); +} + +int +hfsc_getqstats_ifclassq(struct ifclassq *ifq, u_int32_t slot, + struct if_ifclassq_stats *ifqs) +{ + struct hfsc_if *hif = ifq->ifcq_disc; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(ifq->ifcq_type == PKTSCHEDT_HFSC); + + if (slot >= IFCQ_SC_MAX) + return (EINVAL); + + return (hfsc_get_class_stats(hif, ifq->ifcq_disc_slots[slot].qid, + &ifqs->ifqs_hfsc_stats)); +} +#endif /* PKTSCHED_HFSC */ diff --git a/bsd/net/pktsched/pktsched_hfsc.h b/bsd/net/pktsched/pktsched_hfsc.h new file mode 100644 index 000000000..d22b95380 --- /dev/null +++ b/bsd/net/pktsched/pktsched_hfsc.h @@ -0,0 +1,355 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $NetBSD: altq_hfsc.h,v 1.8 2006/10/12 19:59:08 peter Exp $ */ +/* $KAME: altq_hfsc.h,v 1.12 2003/12/05 05:40:46 kjc Exp $ */ + +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + */ +#ifndef _NET_PKTSCHED_PKTSCHED_HFSC_H_ +#define _NET_PKTSCHED_PKTSCHED_HFSC_H_ + +#ifdef PRIVATE +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct service_curve { + u_int32_t fl; /* service curve flags */ + u_int64_t m1; /* slope of the first segment in bits/sec */ + u_int32_t d; /* the x-projection of the first segment in msec */ + u_int64_t m2; /* slope of the second segment in bits/sec */ +}; + +/* valid values for service curve flags */ +#define HFSCF_M1_PCT 0x1 /* m1 is in percentage */ +#define HFSCF_M2_PCT 0x10 /* m2 is in percentage */ + +#define HFSCF_USERFLAGS (HFSCF_M1_PCT | HFSCF_M2_PCT) + +/* special class handles */ +#define HFSC_NULLCLASS_HANDLE 0 +#define HFSC_MAX_CLASSES 64 + +/* hfsc class flags */ +#define HFCF_RED 0x0001 /* use RED */ +#define HFCF_ECN 0x0002 /* use ECN with RED/BLUE/SFB */ +#define HFCF_RIO 0x0004 /* use RIO */ +#define HFCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define HFCF_BLUE 0x0100 /* use BLUE */ +#define HFCF_SFB 0x0200 /* use SFB */ +#define HFCF_FLOWCTL 0x0400 /* enable flow control advisories */ +#define HFCF_DEFAULTCLASS 0x1000 /* default class */ +#ifdef BSD_KERNEL_PRIVATE +#define HFCF_RSC 0x10000 /* has realtime sc */ +#define HFCF_FSC 0x20000 /* has linkshare sc */ +#define HFCF_USC 0x40000 /* has upperlimit sc */ +#define HFCF_LAZY 0x10000000 /* on-demand resource allocation */ +#endif /* BSD_KERNEL_PRIVATE */ + +#define HFCF_USERFLAGS \ + (HFCF_RED | HFCF_ECN | HFCF_RIO | HFCF_CLEARDSCP | HFCF_BLUE | \ + HFCF_SFB | HFCF_FLOWCTL | HFCF_DEFAULTCLASS) + +#ifdef BSD_KERNEL_PRIVATE +#define HFCF_BITS \ + "\020\1RED\2ECN\3RIO\5CLEARDSCP\11BLUE\12SFB\13FLOWCTL\15DEFAULT" \ + "\21RSC\22FSC\23USC\35LAZY" +#else +#define HFCF_BITS \ + "\020\1RED\2ECN\3RIO\5CLEARDSCP\11BLUE\12SFB\13FLOWCTL\15DEFAULT" +#endif /* !BSD_KERNEL_PRIVATE */ + +/* service curve types */ +#define HFSC_REALTIMESC 1 +#define HFSC_LINKSHARINGSC 2 +#define HFSC_UPPERLIMITSC 4 +#define HFSC_DEFAULTSC (HFSC_REALTIMESC|HFSC_LINKSHARINGSC) + +struct hfsc_classstats { + u_int32_t class_id; + u_int32_t class_handle; + struct service_curve rsc; + struct service_curve fsc; + struct service_curve usc; /* upper limit service curve */ + + u_int64_t total; /* total work in bytes */ + u_int64_t cumul; /* cumulative work in bytes */ + /* done by real-time criteria */ + u_int64_t d; /* deadline */ + u_int64_t e; /* eligible time */ + u_int64_t vt; /* virtual time */ + u_int64_t f; /* fit time for upper-limit */ + + /* info helpful for debugging */ + u_int64_t initvt; /* init virtual time */ + u_int64_t vtoff; /* cl_vt_ipoff */ + u_int64_t cvtmax; /* cl_maxvt */ + u_int64_t myf; /* cl_myf */ + u_int64_t cfmin; /* cl_mincf */ + u_int64_t cvtmin; /* 
cl_mincvt */
+ u_int64_t myfadj; /* cl_myfadj */
+ u_int64_t vtadj; /* cl_vtadj */
+ u_int64_t cur_time;
+ u_int32_t machclk_freq;
+
+ u_int32_t qlength;
+ u_int32_t qlimit;
+ struct pktcntr xmit_cnt;
+ struct pktcntr drop_cnt;
+ u_int32_t period;
+
+ u_int32_t vtperiod; /* vt period sequence no */
+ u_int32_t parentperiod; /* parent's vt period seqno */
+ int nactive; /* number of active children */
+
+ /* RED, RIO, BLUE, SFB related info */
+ classq_type_t qtype;
+ union {
+ /* RIO has 3 red stats */
+ struct red_stats red[RIO_NDROPPREC];
+ struct blue_stats blue;
+ struct sfb_stats sfb;
+ };
+ classq_state_t qstate;
+};
+
+#ifdef BSD_KERNEL_PRIVATE
+#include
+/*
+ * kernel internal service curve representation
+ * coordinates are given by 64 bit unsigned integers.
+ * x-axis: unit is clock count. for the intel x86 architecture,
+ * the raw Pentium TSC (Timestamp Counter) value is used.
+ * virtual time is also calculated in this time scale.
+ * y-axis: unit is byte.
+ *
+ * the service curve parameters are converted to the internal
+ * representation.
+ * the slope values are scaled to avoid overflow.
+ * the inverse slope values as well as the y-projection of the 1st
+ * segment are kept in order to avoid 64-bit divide operations
+ * that are expensive on 32-bit architectures.
+ *
+ * note: Intel Pentium TSC never wraps around in several thousands of years.
+ * x-axis doesn't wrap around for 1089 years with 1GHz clock.
+ * y-axis doesn't wrap around for 4358 years with 1Gbps bandwidth.
+ */
+
+/* kernel internal representation of a service curve */
+struct internal_sc {
+ u_int64_t sm1; /* scaled slope of the 1st segment */
+ u_int64_t ism1; /* scaled inverse-slope of the 1st segment */
+ u_int64_t dx; /* the x-projection of the 1st segment */
+ u_int64_t dy; /* the y-projection of the 1st segment */
+ u_int64_t sm2; /* scaled slope of the 2nd segment */
+ u_int64_t ism2; /* scaled inverse-slope of the 2nd segment */
+};
+
+/* runtime service curve */
+struct runtime_sc {
+ u_int64_t x; /* current starting position on x-axis */
+ u_int64_t y; /* current starting position on y-axis */
+ u_int64_t sm1; /* scaled slope of the 1st segment */
+ u_int64_t ism1; /* scaled inverse-slope of the 1st segment */
+ u_int64_t dx; /* the x-projection of the 1st segment */
+ u_int64_t dy; /* the y-projection of the 1st segment */
+ u_int64_t sm2; /* scaled slope of the 2nd segment */
+ u_int64_t ism2; /* scaled inverse-slope of the 2nd segment */
+};
+
+/* for TAILQ based ellist and actlist implementation */
+struct hfsc_class;
+typedef TAILQ_HEAD(_eligible, hfsc_class) ellist_t;
+typedef TAILQ_ENTRY(hfsc_class) elentry_t;
+typedef TAILQ_HEAD(_active, hfsc_class) actlist_t;
+typedef TAILQ_ENTRY(hfsc_class) actentry_t;
+#define ellist_first(s) TAILQ_FIRST(s)
+#define actlist_first(s) TAILQ_FIRST(s)
+#define actlist_last(s) TAILQ_LAST(s, _active)
+
+struct hfsc_class {
+ u_int32_t cl_id; /* class id (just for debug) */
+ u_int32_t cl_handle; /* class handle */
+ struct hfsc_if *cl_hif; /* back pointer to struct hfsc_if */
+ u_int32_t cl_flags; /* misc flags */
+
+ struct hfsc_class *cl_parent; /* parent class */
+ struct hfsc_class *cl_siblings; /* sibling classes */
+ struct hfsc_class *cl_children; /* child classes */
+
+ class_queue_t cl_q; /* class queue structure */
+ u_int32_t cl_qflags; /* class queue flags */
+ union {
+ void *ptr;
+ struct red *red; /* RED state */
+ struct rio *rio; /* RIO state */
+ struct blue *blue; /* BLUE state */
+ struct sfb *sfb; /* SFB state */
+ } cl_qalg;
+
+ u_int64_t cl_total; /* total work in bytes */
+ u_int64_t cl_cumul; /* cumulative work in bytes */
+ /* done by real-time criteria */
+ u_int64_t cl_d; /* deadline */
+ u_int64_t cl_e; /* eligible time */
+ u_int64_t cl_vt; /* virtual time */
+ u_int64_t cl_f; /* time when this class will fit for */
+ /* link-sharing, max(myf, cfmin) */
+ u_int64_t cl_myf; /* my fit-time (as calculated from */
+ /* this class's own upperlimit */
+ /* curve) */
+ u_int64_t cl_myfadj; /* my fit-time adjustment */
+ /* (to cancel history dependence) */
+ u_int64_t cl_cfmin; /* earliest children's fit-time (used */
+ /* with cl_myf to obtain cl_f) */
+ u_int64_t cl_cvtmin; /* minimal virtual time among the */
+ /* children fit for link-sharing */
+ /* (monotonic within a period) */
+ u_int64_t cl_vtadj; /* intra-period cumulative vt */
+ /* adjustment */
+ u_int64_t cl_vtoff; /* inter-period cumulative vt offset */
+ u_int64_t cl_cvtmax; /* max child's vt in the last period */
+
+ u_int64_t cl_initvt; /* init virtual time (for debugging) */
+
+ struct service_curve cl_rsc0; /* external real-time service curve */
+ struct service_curve cl_fsc0; /* external fair service curve */
+ struct service_curve cl_usc0; /* external upperlimit service curve */
+ struct internal_sc cl_rsc; /* internal real-time service curve */
+ struct internal_sc cl_fsc; /* internal fair service curve */
+ struct internal_sc cl_usc; /* internal upperlimit service curve */
+ struct runtime_sc cl_deadline; /* deadline curve */
+ struct runtime_sc cl_eligible; /* eligible curve */
+ struct runtime_sc cl_virtual; /* virtual curve */
+ struct runtime_sc cl_ulimit; /* upperlimit curve */
+
+ u_int32_t cl_vtperiod; /* vt period sequence no */
+ u_int32_t cl_parentperiod; /* parent's vt period seqno */
+ u_int32_t cl_nactive; /* number of active children */
+ actlist_t cl_actc; /* active children list */
+
+ actentry_t cl_actlist; /* active children list entry */
+ elentry_t cl_ellist; /* eligible list entry */
+
+ struct {
+ struct pktcntr xmit_cnt;
+ struct pktcntr drop_cnt;
+ u_int32_t period;
+ } cl_stats;
+};
+
+#define cl_red cl_qalg.red
+#define cl_rio cl_qalg.rio
+#define cl_blue cl_qalg.blue
+#define cl_sfb cl_qalg.sfb
+
+/* hfsc_if flags */
+#define HFSCIFF_ALTQ 0x1 /* configured via PF/ALTQ */
+
+/*
+ * hfsc interface state
+ */
+struct hfsc_if {
+ struct ifclassq *hif_ifq; /* backpointer to ifclassq */
+ struct hfsc_class *hif_rootclass; /* root class */
+ struct hfsc_class *hif_defaultclass; /* default class */
+ struct hfsc_class **hif_class_tbl;
+ struct hfsc_class *hif_pollcache; /* cache for poll operation */
+
+ u_int32_t hif_flags; /* flags */
+ u_int32_t hif_maxclasses; /* max # of classes in table */
+ u_int32_t hif_classes; /* # of classes in the tree */
+ u_int32_t hif_packets; /* # of packets in the tree */
+ u_int32_t hif_classid; /* class id sequence number */
+ u_int64_t hif_eff_rate; /* last known effective rate */
+
+ ellist_t hif_eligible; /* eligible list */
+};
+
+#define HFSCIF_IFP(_hif) ((_hif)->hif_ifq->ifcq_ifp)
+
+extern void hfsc_init(void);
+extern struct hfsc_if *hfsc_alloc(struct ifnet *, int, boolean_t);
+extern int hfsc_destroy(struct hfsc_if *);
+extern void hfsc_purge(struct hfsc_if *);
+extern void hfsc_event(struct hfsc_if *, cqev_t);
+extern int hfsc_add_queue(struct hfsc_if *, struct service_curve *,
+ struct service_curve *, struct service_curve *, u_int32_t, int,
+ u_int32_t, u_int32_t, struct hfsc_class **);
+extern int hfsc_remove_queue(struct hfsc_if *, u_int32_t);
+extern int
hfsc_get_class_stats(struct hfsc_if *, u_int32_t, + struct hfsc_classstats *); +extern int hfsc_enqueue(struct hfsc_if *, struct hfsc_class *, + struct mbuf *, struct pf_mtag *); +extern struct mbuf *hfsc_dequeue(struct hfsc_if *, cqdq_op_t); +extern int hfsc_setup_ifclassq(struct ifclassq *, u_int32_t); +extern int hfsc_teardown_ifclassq(struct ifclassq *); +extern int hfsc_getqstats_ifclassq(struct ifclassq *, u_int32_t, + struct if_ifclassq_stats *); +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef __cplusplus +} +#endif +#endif /* PRIVATE */ +#endif /* _NET_PKTSCHED_PKTSCHED_HFSC_H_ */ diff --git a/bsd/net/pktsched/pktsched_priq.c b/bsd/net/pktsched/pktsched_priq.c new file mode 100644 index 000000000..c3a6f5e56 --- /dev/null +++ b/bsd/net/pktsched/pktsched_priq.c @@ -0,0 +1,1275 @@ +/* + * Copyright (c) 2007-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $OpenBSD: altq_priq.c,v 1.21 2007/09/13 20:40:02 chl Exp $ */ +/* $KAME: altq_priq.c,v 1.1 2000/10/18 09:15:23 kjc Exp $ */ + +/* + * Copyright (C) 2000-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * priority queue + */ + +#if PKTSCHED_PRIQ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +/* + * function prototypes + */ +static int priq_enqueue_ifclassq(struct ifclassq *, struct mbuf *); +static struct mbuf *priq_dequeue_ifclassq(struct ifclassq *, cqdq_op_t); +static int priq_request_ifclassq(struct ifclassq *, cqrq_t, void *); +static int priq_clear_interface(struct priq_if *); +static struct priq_class *priq_class_create(struct priq_if *, int, u_int32_t, + int, u_int32_t); +static int priq_class_destroy(struct priq_if *, struct priq_class *); +static int priq_destroy_locked(struct priq_if *); +static inline int priq_addq(struct priq_class *, struct mbuf *, + struct pf_mtag *); +static inline struct mbuf *priq_getq(struct priq_class *); +static inline struct mbuf *priq_pollq(struct priq_class *); +static void priq_purgeq(struct priq_if *, struct priq_class *, u_int32_t, + u_int32_t *, u_int32_t *); +static void priq_purge_sc(struct priq_if *, cqrq_purge_sc_t *); +static void priq_updateq(struct priq_if *, struct priq_class *, cqev_t); +static int priq_throttle(struct priq_if *, cqrq_throttle_t *); +static int priq_resumeq(struct priq_if *, struct priq_class *); +static int priq_suspendq(struct priq_if *, struct priq_class *); +static inline struct priq_class *priq_clh_to_clp(struct priq_if *, u_int32_t); +static const char *priq_style(struct priq_if *); + +#define PRIQ_ZONE_MAX 32 /* maximum elements in zone */ +#define PRIQ_ZONE_NAME "pktsched_priq" /* zone name */ + +static unsigned int priq_size; /* size of zone element */ +static struct zone *priq_zone; /* zone for priq */ + +#define PRIQ_CL_ZONE_MAX 32 /* maximum elements in zone */ +#define PRIQ_CL_ZONE_NAME "pktsched_priq_cl" /* zone name */ + +static unsigned int priq_cl_size; /* size of zone element */ +static struct zone *priq_cl_zone; /* zone for priq_class */ + +void +priq_init(void) +{ + priq_size = sizeof (struct priq_if); + priq_zone = zinit(priq_size, PRIQ_ZONE_MAX * priq_size, + 0, PRIQ_ZONE_NAME); + if (priq_zone == NULL) { + panic("%s: failed allocating %s", __func__, PRIQ_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(priq_zone, Z_EXPAND, TRUE); + zone_change(priq_zone, Z_CALLERACCT, TRUE); + + priq_cl_size = sizeof (struct priq_class); + priq_cl_zone = zinit(priq_cl_size, PRIQ_CL_ZONE_MAX * priq_cl_size, + 0, PRIQ_CL_ZONE_NAME); + if (priq_cl_zone == NULL) { + panic("%s: failed allocating %s", __func__, PRIQ_CL_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(priq_cl_zone, Z_EXPAND, TRUE); + zone_change(priq_cl_zone, Z_CALLERACCT, TRUE); +} + +struct priq_if * +priq_alloc(struct ifnet *ifp, int how, boolean_t altq) +{ + struct priq_if *pif; + + pif = (how == M_WAITOK) ? 
zalloc(priq_zone) : zalloc_noblock(priq_zone); + if (pif == NULL) + return (NULL); + + bzero(pif, priq_size); + pif->pif_maxpri = -1; + pif->pif_ifq = &ifp->if_snd; + if (altq) + pif->pif_flags |= PRIQIFF_ALTQ; + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s scheduler allocated\n", + if_name(ifp), priq_style(pif)); + } + + return (pif); +} + +int +priq_destroy(struct priq_if *pif) +{ + struct ifclassq *ifq = pif->pif_ifq; + int err; + + IFCQ_LOCK(ifq); + err = priq_destroy_locked(pif); + IFCQ_UNLOCK(ifq); + + return (err); +} + +static int +priq_destroy_locked(struct priq_if *pif) +{ + IFCQ_LOCK_ASSERT_HELD(pif->pif_ifq); + + (void) priq_clear_interface(pif); + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s scheduler destroyed\n", + if_name(PRIQIF_IFP(pif)), priq_style(pif)); + } + + zfree(priq_zone, pif); + + return (0); +} + +/* + * bring the interface back to the initial state by discarding + * all the filters and classes. + */ +static int +priq_clear_interface(struct priq_if *pif) +{ + struct priq_class *cl; + int pri; + + IFCQ_LOCK_ASSERT_HELD(pif->pif_ifq); + + /* clear out the classes */ + for (pri = 0; pri <= pif->pif_maxpri; pri++) + if ((cl = pif->pif_classes[pri]) != NULL) + priq_class_destroy(pif, cl); + + return (0); +} + +/* discard all the queued packets on the interface */ +void +priq_purge(struct priq_if *pif) +{ + struct priq_class *cl; + int pri; + + IFCQ_LOCK_ASSERT_HELD(pif->pif_ifq); + + for (pri = 0; pri <= pif->pif_maxpri; pri++) { + if ((cl = pif->pif_classes[pri]) != NULL && !qempty(&cl->cl_q)) + priq_purgeq(pif, cl, 0, NULL, NULL); + } +#if !PF_ALTQ + /* + * This assertion is safe to be made only when PF_ALTQ is not + * configured; otherwise, IFCQ_LEN represents the sum of the + * packets managed by ifcq_disc and altq_disc instances, which + * is possible when transitioning between the two. 
+ */ + VERIFY(IFCQ_LEN(pif->pif_ifq) == 0); +#endif /* !PF_ALTQ */ +} + +static void +priq_purge_sc(struct priq_if *pif, cqrq_purge_sc_t *pr) +{ + struct ifclassq *ifq = pif->pif_ifq; + u_int32_t i; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + VERIFY(pr->sc == MBUF_SC_UNSPEC || MBUF_VALID_SC(pr->sc)); + VERIFY(pr->flow != 0); + + if (pr->sc != MBUF_SC_UNSPEC) { + i = MBUF_SCIDX(pr->sc); + VERIFY(i < IFCQ_SC_MAX); + + priq_purgeq(pif, ifq->ifcq_disc_slots[i].cl, + pr->flow, &pr->packets, &pr->bytes); + } else { + u_int32_t cnt, len; + + pr->packets = 0; + pr->bytes = 0; + + for (i = 0; i < IFCQ_SC_MAX; i++) { + priq_purgeq(pif, ifq->ifcq_disc_slots[i].cl, + pr->flow, &cnt, &len); + pr->packets += cnt; + pr->bytes += len; + } + } +} + +void +priq_event(struct priq_if *pif, cqev_t ev) +{ + struct priq_class *cl; + int pri; + + IFCQ_LOCK_ASSERT_HELD(pif->pif_ifq); + + for (pri = 0; pri <= pif->pif_maxpri; pri++) + if ((cl = pif->pif_classes[pri]) != NULL) + priq_updateq(pif, cl, ev); +} + +int +priq_add_queue(struct priq_if *pif, int priority, u_int32_t qlimit, + int flags, u_int32_t qid, struct priq_class **clp) +{ + struct priq_class *cl; + + IFCQ_LOCK_ASSERT_HELD(pif->pif_ifq); + + /* check parameters */ + if (priority >= PRIQ_MAXPRI) + return (EINVAL); + if (pif->pif_classes[priority] != NULL) + return (EBUSY); + if (priq_clh_to_clp(pif, qid) != NULL) + return (EBUSY); + + cl = priq_class_create(pif, priority, qlimit, flags, qid); + if (cl == NULL) + return (ENOMEM); + + if (clp != NULL) + *clp = cl; + + return (0); +} + +static struct priq_class * +priq_class_create(struct priq_if *pif, int pri, u_int32_t qlimit, + int flags, u_int32_t qid) +{ + struct ifnet *ifp; + struct ifclassq *ifq; + struct priq_class *cl; + + IFCQ_LOCK_ASSERT_HELD(pif->pif_ifq); + + /* Sanitize flags unless internally configured */ + if (pif->pif_flags & PRIQIFF_ALTQ) + flags &= PRCF_USERFLAGS; + +#if !CLASSQ_RED + if (flags & PRCF_RED) { + log(LOG_ERR, "%s: %s RED not available!\n", + if_name(PRIQIF_IFP(pif)), priq_style(pif)); + return (NULL); + } +#endif /* !CLASSQ_RED */ + +#if !CLASSQ_RIO + if (flags & PRCF_RIO) { + log(LOG_ERR, "%s: %s RIO not available!\n", + if_name(PRIQIF_IFP(pif)), priq_style(pif)); + return (NULL); + } +#endif /* CLASSQ_RIO */ + +#if !CLASSQ_BLUE + if (flags & PRCF_BLUE) { + log(LOG_ERR, "%s: %s BLUE not available!\n", + if_name(PRIQIF_IFP(pif)), priq_style(pif)); + return (NULL); + } +#endif /* CLASSQ_BLUE */ + + /* These are mutually exclusive */ + if ((flags & (PRCF_RED|PRCF_RIO|PRCF_BLUE|PRCF_SFB)) && + (flags & (PRCF_RED|PRCF_RIO|PRCF_BLUE|PRCF_SFB)) != PRCF_RED && + (flags & (PRCF_RED|PRCF_RIO|PRCF_BLUE|PRCF_SFB)) != PRCF_RIO && + (flags & (PRCF_RED|PRCF_RIO|PRCF_BLUE|PRCF_SFB)) != PRCF_BLUE && + (flags & (PRCF_RED|PRCF_RIO|PRCF_BLUE|PRCF_SFB)) != PRCF_SFB) { + log(LOG_ERR, "%s: %s more than one RED|RIO|BLUE|SFB\n", + if_name(PRIQIF_IFP(pif)), priq_style(pif)); + return (NULL); + } + + ifq = pif->pif_ifq; + ifp = PRIQIF_IFP(pif); + + if ((cl = pif->pif_classes[pri]) != NULL) { + /* modify the class instead of creating a new one */ + if (!qempty(&cl->cl_q)) + priq_purgeq(pif, cl, 0, NULL, NULL); +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + rio_destroy(cl->cl_rio); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + red_destroy(cl->cl_red); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + blue_destroy(cl->cl_blue); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + sfb_destroy(cl->cl_sfb); + cl->cl_qalg.ptr = NULL; + 
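+ /* revert to plain drop-tail until the new parameters are applied below */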
qtype(&cl->cl_q) = Q_DROPTAIL; + qstate(&cl->cl_q) = QS_RUNNING; + } else { + cl = zalloc(priq_cl_zone); + if (cl == NULL) + return (NULL); + + bzero(cl, priq_cl_size); + } + + pif->pif_classes[pri] = cl; + if (flags & PRCF_DEFAULTCLASS) + pif->pif_default = cl; + if (qlimit == 0 || qlimit > IFCQ_MAXLEN(ifq)) { + qlimit = IFCQ_MAXLEN(ifq); + if (qlimit == 0) + qlimit = DEFAULT_QLIMIT; /* use default */ + } + _qinit(&cl->cl_q, Q_DROPTAIL, qlimit); + cl->cl_flags = flags; + cl->cl_pri = pri; + if (pri > pif->pif_maxpri) + pif->pif_maxpri = pri; + cl->cl_pif = pif; + cl->cl_handle = qid; + + if (flags & (PRCF_RED|PRCF_RIO|PRCF_BLUE|PRCF_SFB)) { +#if CLASSQ_RED || CLASSQ_RIO + u_int64_t ifbandwidth = ifnet_output_linkrate(ifp); + int pkttime; +#endif /* CLASSQ_RED || CLASSQ_RIO */ + + cl->cl_qflags = 0; + if (flags & PRCF_ECN) { + if (flags & PRCF_BLUE) + cl->cl_qflags |= BLUEF_ECN; + else if (flags & PRCF_SFB) + cl->cl_qflags |= SFBF_ECN; + else if (flags & PRCF_RED) + cl->cl_qflags |= REDF_ECN; + else if (flags & PRCF_RIO) + cl->cl_qflags |= RIOF_ECN; + } + if (flags & PRCF_FLOWCTL) { + if (flags & PRCF_SFB) + cl->cl_qflags |= SFBF_FLOWCTL; + } + if (flags & PRCF_CLEARDSCP) { + if (flags & PRCF_RIO) + cl->cl_qflags |= RIOF_CLEARDSCP; + } +#if CLASSQ_RED || CLASSQ_RIO + /* + * XXX: RED & RIO should be watching link speed and MTU + * events and recompute pkttime accordingly. + */ + if (ifbandwidth < 8) + pkttime = 1000 * 1000 * 1000; /* 1 sec */ + else + pkttime = (int64_t)ifp->if_mtu * 1000 * 1000 * 1000 / + (ifbandwidth / 8); + + /* Test for exclusivity {RED,RIO,BLUE,SFB} was done above */ +#if CLASSQ_RED + if (flags & PRCF_RED) { + cl->cl_red = red_alloc(ifp, 0, 0, + qlimit(&cl->cl_q) * 10/100, + qlimit(&cl->cl_q) * 30/100, + cl->cl_qflags, pkttime); + if (cl->cl_red != NULL) + qtype(&cl->cl_q) = Q_RED; + } +#endif /* CLASSQ_RED */ +#if CLASSQ_RIO + if (flags & PRCF_RIO) { + cl->cl_rio = + rio_alloc(ifp, 0, NULL, cl->cl_qflags, pkttime); + if (cl->cl_rio != NULL) + qtype(&cl->cl_q) = Q_RIO; + } +#endif /* CLASSQ_RIO */ +#endif /* CLASSQ_RED || CLASSQ_RIO */ +#if CLASSQ_BLUE + if (flags & PRCF_BLUE) { + cl->cl_blue = blue_alloc(ifp, 0, 0, cl->cl_qflags); + if (cl->cl_blue != NULL) + qtype(&cl->cl_q) = Q_BLUE; + } +#endif /* CLASSQ_BLUE */ + if (flags & PRCF_SFB) { + if (!(cl->cl_flags & PRCF_LAZY)) + cl->cl_sfb = sfb_alloc(ifp, cl->cl_handle, + qlimit(&cl->cl_q), cl->cl_qflags); + if (cl->cl_sfb != NULL || (cl->cl_flags & PRCF_LAZY)) + qtype(&cl->cl_q) = Q_SFB; + } + } + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s created qid=%d pri=%d qlimit=%d " + "flags=%b\n", if_name(ifp), priq_style(pif), + cl->cl_handle, cl->cl_pri, qlimit, flags, PRCF_BITS); + } + + return (cl); +} + +int +priq_remove_queue(struct priq_if *pif, u_int32_t qid) +{ + struct priq_class *cl; + + IFCQ_LOCK_ASSERT_HELD(pif->pif_ifq); + + if ((cl = priq_clh_to_clp(pif, qid)) == NULL) + return (EINVAL); + + return (priq_class_destroy(pif, cl)); +} + +static int +priq_class_destroy(struct priq_if *pif, struct priq_class *cl) +{ + struct ifclassq *ifq = pif->pif_ifq; + int pri; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if (!qempty(&cl->cl_q)) + priq_purgeq(pif, cl, 0, NULL, NULL); + + VERIFY(cl->cl_pri < PRIQ_MAXPRI); + VERIFY(!pktsched_bit_tst(cl->cl_pri, &pif->pif_bitmap)); + + pif->pif_classes[cl->cl_pri] = NULL; + if (pif->pif_maxpri == cl->cl_pri) { + for (pri = cl->cl_pri; pri >= 0; pri--) + if (pif->pif_classes[pri] != NULL) { + pif->pif_maxpri = pri; + break; + } + if (pri < 0) + pif->pif_maxpri = -1; + } + + if 
(pif->pif_default == cl) + pif->pif_default = NULL; + + if (cl->cl_qalg.ptr != NULL) { +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + rio_destroy(cl->cl_rio); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + red_destroy(cl->cl_red); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + blue_destroy(cl->cl_blue); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + sfb_destroy(cl->cl_sfb); + cl->cl_qalg.ptr = NULL; + qtype(&cl->cl_q) = Q_DROPTAIL; + qstate(&cl->cl_q) = QS_RUNNING; + } + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s destroyed qid=%d pri=%d\n", + if_name(PRIQIF_IFP(pif)), priq_style(pif), + cl->cl_handle, cl->cl_pri); + } + + zfree(priq_cl_zone, cl); + + return (0); +} + +int +priq_enqueue(struct priq_if *pif, struct priq_class *cl, struct mbuf *m, + struct pf_mtag *t) +{ + struct ifclassq *ifq = pif->pif_ifq; + u_int32_t pri; + int len, ret; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(cl == NULL || cl->cl_pif == pif); + + if (cl == NULL) { + cl = priq_clh_to_clp(pif, t->pftag_qid); + if (cl == NULL) { + cl = pif->pif_default; + if (cl == NULL) { + IFCQ_CONVERT_LOCK(ifq); + m_freem(m); + return (ENOBUFS); + } + } + } + pri = cl->cl_pri; + VERIFY(pri < PRIQ_MAXPRI); + + len = m_pktlen(m); + + ret = priq_addq(cl, m, t); + if (ret != 0) { + if (ret == CLASSQEQ_SUCCESS_FC) { + /* packet enqueued, return advisory feedback */ + ret = EQFULL; + } else { + VERIFY(ret == CLASSQEQ_DROPPED || + ret == CLASSQEQ_DROPPED_FC || + ret == CLASSQEQ_DROPPED_SP); + /* packet has been freed in priq_addq */ + PKTCNTR_ADD(&cl->cl_dropcnt, 1, len); + IFCQ_DROP_ADD(ifq, 1, len); + switch (ret) { + case CLASSQEQ_DROPPED: + return (ENOBUFS); + case CLASSQEQ_DROPPED_FC: + return (EQFULL); + case CLASSQEQ_DROPPED_SP: + return (EQSUSPENDED); + } + /* NOT REACHED */ + } + } + IFCQ_INC_LEN(ifq); + + /* class is now active; indicate it as such */ + if (!pktsched_bit_tst(pri, &pif->pif_bitmap)) + pktsched_bit_set(pri, &pif->pif_bitmap); + + /* successfully queued. */ + return (ret); +} + +/* + * note: CLASSQDQ_POLL returns the next packet without removing the packet + * from the queue. CLASSQDQ_REMOVE is a normal dequeue operation. + * CLASSQDQ_REMOVE must return the same packet if called immediately + * after CLASSQDQ_POLL. 
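+ *
+ * A minimal caller sketch relying on this contract (the
+ * driver_can_accept() check is hypothetical, standing in for any
+ * driver-side admission test):
+ *
+ *	m = priq_dequeue(pif, CLASSQDQ_POLL);
+ *	if (m != NULL && driver_can_accept(m))
+ *		m = priq_dequeue(pif, CLASSQDQ_REMOVE);
+ *
+ * where both calls must resolve to the same mbuf.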
+ */ +struct mbuf * +priq_dequeue(struct priq_if *pif, cqdq_op_t op) +{ + struct ifclassq *ifq = pif->pif_ifq; + struct priq_class *cl; + struct mbuf *m; + u_int32_t pri, len; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if (pif->pif_bitmap == 0) { + /* no active class; nothing to dequeue */ + return (NULL); + } + VERIFY(!IFCQ_IS_EMPTY(ifq)); + + pri = pktsched_fls(pif->pif_bitmap) - 1; /* zero based */ + VERIFY(pri < PRIQ_MAXPRI); + cl = pif->pif_classes[pri]; + VERIFY(cl != NULL && !qempty(&cl->cl_q)); + + if (op == CLASSQDQ_POLL) + return (priq_pollq(cl)); + + m = priq_getq(cl); + VERIFY(m != NULL); /* qalg must be work conserving */ + len = m_pktlen(m); + + IFCQ_DEC_LEN(ifq); + if (qempty(&cl->cl_q)) { + cl->cl_period++; + /* class is now inactive; indicate it as such */ + pktsched_bit_clr(pri, &pif->pif_bitmap); + } + PKTCNTR_ADD(&cl->cl_xmitcnt, 1, len); + IFCQ_XMIT_ADD(ifq, 1, len); + + return (m); +} + +static inline int +priq_addq(struct priq_class *cl, struct mbuf *m, struct pf_mtag *t) +{ + struct priq_if *pif = cl->cl_pif; + struct ifclassq *ifq = pif->pif_ifq; + + IFCQ_LOCK_ASSERT_HELD(ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + return (rio_addq(cl->cl_rio, &cl->cl_q, m, t)); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + return (red_addq(cl->cl_red, &cl->cl_q, m, t)); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + return (blue_addq(cl->cl_blue, &cl->cl_q, m, t)); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q)) { + if (cl->cl_sfb == NULL) { + struct ifnet *ifp = PRIQIF_IFP(pif); + + VERIFY(cl->cl_flags & PRCF_LAZY); + cl->cl_flags &= ~PRCF_LAZY; + IFCQ_CONVERT_LOCK(ifq); + + cl->cl_sfb = sfb_alloc(ifp, cl->cl_handle, + qlimit(&cl->cl_q), cl->cl_qflags); + if (cl->cl_sfb == NULL) { + /* fall back to droptail */ + qtype(&cl->cl_q) = Q_DROPTAIL; + cl->cl_flags &= ~PRCF_SFB; + cl->cl_qflags &= ~(SFBF_ECN | SFBF_FLOWCTL); + + log(LOG_ERR, "%s: %s SFB lazy allocation " + "failed for qid=%d pri=%d, falling back " + "to DROPTAIL\n", if_name(ifp), + priq_style(pif), cl->cl_handle, + cl->cl_pri); + } else if (pif->pif_throttle != IFNET_THROTTLE_OFF) { + /* if there's pending throttling, set it */ + cqrq_throttle_t tr = { 1, pif->pif_throttle }; + int err = priq_throttle(pif, &tr); + + if (err == EALREADY) + err = 0; + if (err != 0) { + tr.level = IFNET_THROTTLE_OFF; + (void) priq_throttle(pif, &tr); + } + } + } + if (cl->cl_sfb != NULL) + return (sfb_addq(cl->cl_sfb, &cl->cl_q, m, t)); + } else if (qlen(&cl->cl_q) >= qlimit(&cl->cl_q)) { + IFCQ_CONVERT_LOCK(ifq); + m_freem(m); + return (CLASSQEQ_DROPPED); + } + + if (cl->cl_flags & PRCF_CLEARDSCP) + write_dsfield(m, t, 0); + + _addq(&cl->cl_q, m); + + return (0); +} + +static inline struct mbuf * +priq_getq(struct priq_class *cl) +{ + IFCQ_LOCK_ASSERT_HELD(cl->cl_pif->pif_ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + return (rio_getq(cl->cl_rio, &cl->cl_q)); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + return (red_getq(cl->cl_red, &cl->cl_q)); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + return (blue_getq(cl->cl_blue, &cl->cl_q)); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + return (sfb_getq(cl->cl_sfb, &cl->cl_q)); + + return (_getq(&cl->cl_q)); +} + +static inline struct mbuf * +priq_pollq(struct priq_class *cl) +{ + IFCQ_LOCK_ASSERT_HELD(cl->cl_pif->pif_ifq); + + return (qhead(&cl->cl_q)); +} + +static void +priq_purgeq(struct priq_if *pif, struct 
priq_class *cl, u_int32_t flow, + u_int32_t *packets, u_int32_t *bytes) +{ + struct ifclassq *ifq = pif->pif_ifq; + u_int32_t cnt = 0, len = 0, qlen; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if ((qlen = qlen(&cl->cl_q)) == 0) { + VERIFY(!pktsched_bit_tst(cl->cl_pri, &pif->pif_bitmap)); + goto done; + } + + /* become regular mutex before freeing mbufs */ + IFCQ_CONVERT_LOCK(ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + rio_purgeq(cl->cl_rio, &cl->cl_q, flow, &cnt, &len); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + red_purgeq(cl->cl_red, &cl->cl_q, flow, &cnt, &len); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + blue_purgeq(cl->cl_blue, &cl->cl_q, flow, &cnt, &len); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + sfb_purgeq(cl->cl_sfb, &cl->cl_q, flow, &cnt, &len); + else + _flushq_flow(&cl->cl_q, flow, &cnt, &len); + + if (cnt > 0) { + VERIFY(qlen(&cl->cl_q) == (qlen - cnt)); + + PKTCNTR_ADD(&cl->cl_dropcnt, cnt, len); + IFCQ_DROP_ADD(ifq, cnt, len); + + VERIFY(((signed)IFCQ_LEN(ifq) - cnt) >= 0); + IFCQ_LEN(ifq) -= cnt; + + if (qempty(&cl->cl_q)) + pktsched_bit_clr(cl->cl_pri, &pif->pif_bitmap); + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s purge qid=%d pri=%d " + "qlen=[%d,%d] cnt=%d len=%d flow=0x%x\n", + if_name(PRIQIF_IFP(pif)), priq_style(pif), + cl->cl_handle, cl->cl_pri, qlen, qlen(&cl->cl_q), + cnt, len, flow); + } + } +done: + if (packets != NULL) + *packets = cnt; + if (bytes != NULL) + *bytes = len; +} + +static void +priq_updateq(struct priq_if *pif, struct priq_class *cl, cqev_t ev) +{ + IFCQ_LOCK_ASSERT_HELD(pif->pif_ifq); + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s update qid=%d pri=%d event=%s\n", + if_name(PRIQIF_IFP(pif)), priq_style(pif), + cl->cl_handle, cl->cl_pri, ifclassq_ev2str(ev)); + } + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + return (rio_updateq(cl->cl_rio, ev)); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + return (red_updateq(cl->cl_red, ev)); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + return (blue_updateq(cl->cl_blue, ev)); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + return (sfb_updateq(cl->cl_sfb, ev)); +} + +int +priq_get_class_stats(struct priq_if *pif, u_int32_t qid, + struct priq_classstats *sp) +{ + struct priq_class *cl; + + IFCQ_LOCK_ASSERT_HELD(pif->pif_ifq); + + if ((cl = priq_clh_to_clp(pif, qid)) == NULL) + return (EINVAL); + + sp->class_handle = cl->cl_handle; + sp->priority = cl->cl_pri; + sp->qlength = qlen(&cl->cl_q); + sp->qlimit = qlimit(&cl->cl_q); + sp->period = cl->cl_period; + sp->xmitcnt = cl->cl_xmitcnt; + sp->dropcnt = cl->cl_dropcnt; + + sp->qtype = qtype(&cl->cl_q); + sp->qstate = qstate(&cl->cl_q); +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + red_getstats(cl->cl_red, &sp->red[0]); +#endif /* CLASSQ_RED */ +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + rio_getstats(cl->cl_rio, &sp->red[0]); +#endif /* CLASSQ_RIO */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + blue_getstats(cl->cl_blue, &sp->blue); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + sfb_getstats(cl->cl_sfb, &sp->sfb); + + return (0); +} + +/* convert a class handle to the corresponding class pointer */ +static inline struct priq_class * +priq_clh_to_clp(struct priq_if *pif, u_int32_t chandle) +{ + struct priq_class *cl; + int idx; + + IFCQ_LOCK_ASSERT_HELD(pif->pif_ifq); + + for (idx = pif->pif_maxpri; idx >= 0; idx--) + if ((cl = 
pif->pif_classes[idx]) != NULL &&
+		    cl->cl_handle == chandle)
+			return (cl);
+
+	return (NULL);
+}
+
+static const char *
+priq_style(struct priq_if *pif)
+{
+	return ((pif->pif_flags & PRIQIFF_ALTQ) ? "ALTQ_PRIQ" : "PRIQ");
+}
+
+/*
+ * priq_enqueue_ifclassq is an enqueue function to be registered to
+ * (*ifcq_enqueue) in struct ifclassq.
+ */
+static int
+priq_enqueue_ifclassq(struct ifclassq *ifq, struct mbuf *m)
+{
+	u_int32_t i;
+
+	IFCQ_LOCK_ASSERT_HELD(ifq);
+
+	if (!(m->m_flags & M_PKTHDR)) {
+		/* should not happen */
+		log(LOG_ERR, "%s: packet does not have pkthdr\n",
+		    if_name(ifq->ifcq_ifp));
+		IFCQ_CONVERT_LOCK(ifq);
+		m_freem(m);
+		return (ENOBUFS);
+	}
+
+	i = MBUF_SCIDX(mbuf_get_service_class(m));
+	VERIFY((u_int32_t)i < IFCQ_SC_MAX);
+
+	return (priq_enqueue(ifq->ifcq_disc,
+	    ifq->ifcq_disc_slots[i].cl, m, m_pftag(m)));
+}
+
+/*
+ * priq_dequeue_ifclassq is a dequeue function to be registered to
+ * (*ifcq_dequeue) in struct ifclassq.
+ *
+ * note: CLASSQDQ_POLL returns the next packet without removing the packet
+ * from the queue. CLASSQDQ_REMOVE is a normal dequeue operation.
+ * CLASSQDQ_REMOVE must return the same packet if called immediately
+ * after CLASSQDQ_POLL.
+ */
+static struct mbuf *
+priq_dequeue_ifclassq(struct ifclassq *ifq, cqdq_op_t op)
+{
+	return (priq_dequeue(ifq->ifcq_disc, op));
+}
+
+static int
+priq_request_ifclassq(struct ifclassq *ifq, cqrq_t req, void *arg)
+{
+	struct priq_if *pif = (struct priq_if *)ifq->ifcq_disc;
+	int err = 0;
+
+	IFCQ_LOCK_ASSERT_HELD(ifq);
+
+	switch (req) {
+	case CLASSQRQ_PURGE:
+		priq_purge(pif);
+		break;
+
+	case CLASSQRQ_PURGE_SC:
+		priq_purge_sc(pif, (cqrq_purge_sc_t *)arg);
+		break;
+
+	case CLASSQRQ_EVENT:
+		priq_event(pif, (cqev_t)arg);
+		break;
+
+	case CLASSQRQ_THROTTLE:
+		err = priq_throttle(pif, (cqrq_throttle_t *)arg);
+		break;
+	}
+	return (err);
+}
+
+int
+priq_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags)
+{
+	struct ifnet *ifp = ifq->ifcq_ifp;
+	struct priq_class *cl0, *cl1, *cl2, *cl3, *cl4;
+	struct priq_class *cl5, *cl6, *cl7, *cl8, *cl9;
+	struct priq_if *pif;
+	u_int32_t maxlen = 0, qflags = 0;
+	int err = 0;
+
+	IFCQ_LOCK_ASSERT_HELD(ifq);
+	VERIFY(ifq->ifcq_disc == NULL);
+	VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);
+
+	if (flags & PKTSCHEDF_QALG_RED)
+		qflags |= PRCF_RED;
+	if (flags & PKTSCHEDF_QALG_RIO)
+		qflags |= PRCF_RIO;
+	if (flags & PKTSCHEDF_QALG_BLUE)
+		qflags |= PRCF_BLUE;
+	if (flags & PKTSCHEDF_QALG_SFB)
+		qflags |= PRCF_SFB;
+	if (flags & PKTSCHEDF_QALG_ECN)
+		qflags |= PRCF_ECN;
+	if (flags & PKTSCHEDF_QALG_FLOWCTL)
+		qflags |= PRCF_FLOWCTL;
+
+	pif = priq_alloc(ifp, M_WAITOK, FALSE);
+	if (pif == NULL)
+		return (ENOMEM);
+
+	if ((maxlen = IFCQ_MAXLEN(ifq)) == 0)
+		maxlen = if_sndq_maxlen;
+
+	if ((err = priq_add_queue(pif, 0, maxlen,
+	    qflags | PRCF_LAZY, SCIDX_BK_SYS, &cl0)) != 0)
+		goto cleanup;
+
+	if ((err = priq_add_queue(pif, 1, maxlen,
+	    qflags | PRCF_LAZY, SCIDX_BK, &cl1)) != 0)
+		goto cleanup;
+
+	if ((err = priq_add_queue(pif, 2, maxlen,
+	    qflags | PRCF_DEFAULTCLASS, SCIDX_BE, &cl2)) != 0)
+		goto cleanup;
+
+	if ((err = priq_add_queue(pif, 3, maxlen,
+	    qflags | PRCF_LAZY, SCIDX_RD, &cl3)) != 0)
+		goto cleanup;
+
+	if ((err = priq_add_queue(pif, 4, maxlen,
+	    qflags | PRCF_LAZY, SCIDX_OAM, &cl4)) != 0)
+		goto cleanup;
+
+	if ((err = priq_add_queue(pif, 5, maxlen,
+	    qflags | PRCF_LAZY, SCIDX_AV, &cl5)) != 0)
+		goto cleanup;
+
+	if ((err = priq_add_queue(pif, 6, maxlen,
+	    qflags | PRCF_LAZY, SCIDX_RV, &cl6)) != 0)
+		goto cleanup;
+
+	if ((err = priq_add_queue(pif, 7,
maxlen, + qflags | PRCF_LAZY, SCIDX_VI, &cl7)) != 0) + goto cleanup; + + if ((err = priq_add_queue(pif, 8, maxlen, + qflags | PRCF_LAZY, SCIDX_VO, &cl8)) != 0) + goto cleanup; + + if ((err = priq_add_queue(pif, 9, maxlen, + qflags, SCIDX_CTL, &cl9)) != 0) + goto cleanup; + + err = ifclassq_attach(ifq, PKTSCHEDT_PRIQ, pif, + priq_enqueue_ifclassq, priq_dequeue_ifclassq, NULL, + priq_request_ifclassq); + + /* cache these for faster lookup */ + if (err == 0) { + ifq->ifcq_disc_slots[SCIDX_BK_SYS].qid = SCIDX_BK_SYS; + ifq->ifcq_disc_slots[SCIDX_BK_SYS].cl = cl0; + + ifq->ifcq_disc_slots[SCIDX_BK].qid = SCIDX_BK; + ifq->ifcq_disc_slots[SCIDX_BK].cl = cl1; + + ifq->ifcq_disc_slots[SCIDX_BE].qid = SCIDX_BE; + ifq->ifcq_disc_slots[SCIDX_BE].cl = cl2; + + ifq->ifcq_disc_slots[SCIDX_RD].qid = SCIDX_RD; + ifq->ifcq_disc_slots[SCIDX_RD].cl = cl3; + + ifq->ifcq_disc_slots[SCIDX_OAM].qid = SCIDX_OAM; + ifq->ifcq_disc_slots[SCIDX_OAM].cl = cl4; + + ifq->ifcq_disc_slots[SCIDX_AV].qid = SCIDX_AV; + ifq->ifcq_disc_slots[SCIDX_AV].cl = cl5; + + ifq->ifcq_disc_slots[SCIDX_RV].qid = SCIDX_RV; + ifq->ifcq_disc_slots[SCIDX_RV].cl = cl6; + + ifq->ifcq_disc_slots[SCIDX_VI].qid = SCIDX_VI; + ifq->ifcq_disc_slots[SCIDX_VI].cl = cl7; + + ifq->ifcq_disc_slots[SCIDX_VO].qid = SCIDX_VO; + ifq->ifcq_disc_slots[SCIDX_VO].cl = cl8; + + ifq->ifcq_disc_slots[SCIDX_CTL].qid = SCIDX_CTL; + ifq->ifcq_disc_slots[SCIDX_CTL].cl = cl9; + } + +cleanup: + if (err != 0) + (void) priq_destroy_locked(pif); + + return (err); +} + +int +priq_teardown_ifclassq(struct ifclassq *ifq) +{ + struct priq_if *pif = ifq->ifcq_disc; + int i; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(pif != NULL && ifq->ifcq_type == PKTSCHEDT_PRIQ); + + (void) priq_destroy_locked(pif); + + ifq->ifcq_disc = NULL; + for (i = 0; i < IFCQ_SC_MAX; i++) { + ifq->ifcq_disc_slots[i].qid = 0; + ifq->ifcq_disc_slots[i].cl = NULL; + } + + return (ifclassq_detach(ifq)); +} + +int +priq_getqstats_ifclassq(struct ifclassq *ifq, u_int32_t slot, + struct if_ifclassq_stats *ifqs) +{ + struct priq_if *pif = ifq->ifcq_disc; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(ifq->ifcq_type == PKTSCHEDT_PRIQ); + + if (slot >= IFCQ_SC_MAX) + return (EINVAL); + + return (priq_get_class_stats(pif, ifq->ifcq_disc_slots[slot].qid, + &ifqs->ifqs_priq_stats)); +} + +static int +priq_throttle(struct priq_if *pif, cqrq_throttle_t *tr) +{ + struct ifclassq *ifq = pif->pif_ifq; + struct priq_class *cl; + int err; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(!(pif->pif_flags & PRIQIFF_ALTQ)); + + if (!tr->set) { + tr->level = pif->pif_throttle; + return (0); + } + + if (tr->level == pif->pif_throttle) + return (EALREADY); + + /* Current throttling levels only involve BK_SYS class */ + cl = ifq->ifcq_disc_slots[SCIDX_BK_SYS].cl; + + switch (tr->level) { + case IFNET_THROTTLE_OFF: + err = priq_resumeq(pif, cl); + break; + + case IFNET_THROTTLE_OPPORTUNISTIC: + err = priq_suspendq(pif, cl); + break; + + default: + VERIFY(0); + /* NOTREACHED */ + } + + if (err == 0 || err == ENXIO) { + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s throttling level %sset %d->%d\n", + if_name(PRIQIF_IFP(pif)), priq_style(pif), + (err == 0) ? 
"" : "lazy ", pif->pif_throttle, + tr->level); + } + pif->pif_throttle = tr->level; + if (err != 0) + err = 0; + else + priq_purgeq(pif, cl, 0, NULL, NULL); + } else { + log(LOG_ERR, "%s: %s unable to set throttling level " + "%d->%d [error=%d]\n", if_name(PRIQIF_IFP(pif)), + priq_style(pif), pif->pif_throttle, tr->level, err); + } + + return (err); +} + +static int +priq_resumeq(struct priq_if *pif, struct priq_class *cl) +{ + struct ifclassq *ifq = pif->pif_ifq; + int err = 0; + + IFCQ_LOCK_ASSERT_HELD(ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + err = rio_suspendq(cl->cl_rio, &cl->cl_q, FALSE); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + err = red_suspendq(cl->cl_red, &cl->cl_q, FALSE); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + err = blue_suspendq(cl->cl_blue, &cl->cl_q, FALSE); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + err = sfb_suspendq(cl->cl_sfb, &cl->cl_q, FALSE); + + if (err == 0) + qstate(&cl->cl_q) = QS_RUNNING; + + return (err); +} + +static int +priq_suspendq(struct priq_if *pif, struct priq_class *cl) +{ + struct ifclassq *ifq = pif->pif_ifq; + int err = 0; + + IFCQ_LOCK_ASSERT_HELD(ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + err = rio_suspendq(cl->cl_rio, &cl->cl_q, TRUE); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + err = red_suspendq(cl->cl_red, &cl->cl_q, TRUE); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + err = blue_suspendq(cl->cl_blue, &cl->cl_q, TRUE); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q)) { + if (cl->cl_sfb != NULL) { + err = sfb_suspendq(cl->cl_sfb, &cl->cl_q, TRUE); + } else { + VERIFY(cl->cl_flags & PRCF_LAZY); + err = ENXIO; /* delayed throttling */ + } + } + + if (err == 0 || err == ENXIO) + qstate(&cl->cl_q) = QS_SUSPENDED; + + return (err); +} +#endif /* PKTSCHED_PRIQ */ diff --git a/bsd/net/pktsched/pktsched_priq.h b/bsd/net/pktsched/pktsched_priq.h new file mode 100644 index 000000000..4dc9b74bc --- /dev/null +++ b/bsd/net/pktsched/pktsched_priq.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $NetBSD: altq_priq.h,v 1.7 2006/10/12 19:59:08 peter Exp $ */ +/* $KAME: altq_priq.h,v 1.7 2003/10/03 05:05:15 kjc Exp $ */ +/* + * Copyright (C) 2000-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NET_PKTSCHED_PKTSCHED_PRIQ_H_ +#define _NET_PKTSCHED_PKTSCHED_PRIQ_H_ + +#ifdef PRIVATE +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define PRIQ_MAXPRI 16 /* upper limit of the number of priorities */ + +/* priq class flags */ +#define PRCF_RED 0x0001 /* use RED */ +#define PRCF_ECN 0x0002 /* use ECN with RED/BLUE/SFB */ +#define PRCF_RIO 0x0004 /* use RIO */ +#define PRCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define PRCF_BLUE 0x0100 /* use BLUE */ +#define PRCF_SFB 0x0200 /* use SFB */ +#define PRCF_FLOWCTL 0x0400 /* enable flow control advisories */ +#define PRCF_DEFAULTCLASS 0x1000 /* default class */ +#ifdef BSD_KERNEL_PRIVATE +#define PRCF_LAZY 0x10000000 /* on-demand resource allocation */ +#endif /* BSD_KERNEL_PRIVATE */ + +#define PRCF_USERFLAGS \ + (PRCF_RED | PRCF_ECN | PRCF_RIO | PRCF_CLEARDSCP | PRCF_BLUE | \ + PRCF_SFB | PRCF_FLOWCTL | PRCF_DEFAULTCLASS) + +#ifdef BSD_KERNEL_PRIVATE +#define PRCF_BITS \ + "\020\1RED\2ECN\3RIO\5CLEARDSCP\11BLUE\12SFB\13FLOWCTL\15DEFAULT" \ + "\35LAZY" +#else +#define PRCF_BITS \ + "\020\1RED\2ECN\3RIO\5CLEARDSCP\11BLUE\12SFB\13FLOWCTL\15DEFAULT" +#endif /* !BSD_KERNEL_PRIVATE */ + +struct priq_classstats { + u_int32_t class_handle; + u_int32_t priority; + + u_int32_t qlength; + u_int32_t qlimit; + u_int32_t period; + struct pktcntr xmitcnt; /* transmitted packet counter */ + struct pktcntr dropcnt; /* dropped packet counter */ + + /* RED, RIO, BLUE, SFB related info */ + classq_type_t qtype; + union { + /* RIO has 3 red stats */ + struct red_stats red[RIO_NDROPPREC]; + struct blue_stats blue; + struct sfb_stats sfb; + }; + classq_state_t qstate; +}; + +#ifdef BSD_KERNEL_PRIVATE +struct priq_class { + u_int32_t cl_handle; /* class handle */ + class_queue_t cl_q; /* class queue structure */ + u_int32_t cl_qflags; /* class queue flags */ + union { + void *ptr; + struct red *red; /* RED state */ + struct rio *rio; 
/* RIO state */ + struct blue *blue; /* BLUE state */ + struct sfb *sfb; /* SFB state */ + } cl_qalg; + int32_t cl_pri; /* priority */ + u_int32_t cl_flags; /* class flags */ + struct priq_if *cl_pif; /* back pointer to pif */ + + /* statistics */ + u_int32_t cl_period; /* backlog period */ + struct pktcntr cl_xmitcnt; /* transmitted packet counter */ + struct pktcntr cl_dropcnt; /* dropped packet counter */ +}; + +#define cl_red cl_qalg.red +#define cl_rio cl_qalg.rio +#define cl_blue cl_qalg.blue +#define cl_sfb cl_qalg.sfb + +/* priq_if flags */ +#define PRIQIFF_ALTQ 0x1 /* configured via PF/ALTQ */ + +/* + * priq interface state + */ +struct priq_if { + struct ifclassq *pif_ifq; /* backpointer to ifclassq */ + int pif_maxpri; /* max priority in use */ + u_int32_t pif_flags; /* flags */ + u_int32_t pif_throttle; /* throttling level */ + pktsched_bitmap_t pif_bitmap; /* active class bitmap */ + struct priq_class *pif_default; /* default class */ + struct priq_class *pif_classes[PRIQ_MAXPRI]; /* classes */ +}; + +#define PRIQIF_IFP(_pif) ((_pif)->pif_ifq->ifcq_ifp) + +struct if_ifclassq_stats; + +extern void priq_init(void); +extern struct priq_if *priq_alloc(struct ifnet *, int, boolean_t); +extern int priq_destroy(struct priq_if *); +extern void priq_purge(struct priq_if *); +extern void priq_event(struct priq_if *, cqev_t); +extern int priq_add_queue(struct priq_if *, int, u_int32_t, int, u_int32_t, + struct priq_class **); +extern int priq_remove_queue(struct priq_if *, u_int32_t); +extern int priq_get_class_stats(struct priq_if *, u_int32_t, + struct priq_classstats *); +extern int priq_enqueue(struct priq_if *, struct priq_class *, struct mbuf *, + struct pf_mtag *); +extern struct mbuf *priq_dequeue(struct priq_if *, cqdq_op_t); +extern int priq_setup_ifclassq(struct ifclassq *, u_int32_t); +extern int priq_teardown_ifclassq(struct ifclassq *ifq); +extern int priq_getqstats_ifclassq(struct ifclassq *, u_int32_t, + struct if_ifclassq_stats *); +extern int priq_set_throttle(struct ifclassq *, u_int32_t); +extern int priq_get_throttle(struct ifclassq *, u_int32_t *); +#endif /* BSD_KERNEL_PRIVATE */ +#ifdef __cplusplus +} +#endif +#endif /* PRIVATE */ +#endif /* _NET_PKTSCHED_PKTSCHED_PRIQ_H_ */ diff --git a/bsd/net/pktsched/pktsched_qfq.c b/bsd/net/pktsched/pktsched_qfq.c new file mode 100644 index 000000000..d7cca36da --- /dev/null +++ b/bsd/net/pktsched/pktsched_qfq.c @@ -0,0 +1,2034 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Quick Fair Queueing is described in + * "QFQ: Efficient Packet Scheduling with Tight Bandwidth Distribution + * Guarantees" by Fabio Checconi, Paolo Valente, and Luigi Rizzo. + * + * This code is ported from the dummynet(4) QFQ implementation. 
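+ * In short, QFQ aggregates classes into groups indexed by weight and
+ * maximum packet size, and keeps approximated virtual start/finish
+ * times per group in four state bitmaps (ER/IR/EB/IB), so that
+ * enqueue and dequeue reduce to a constant number of bitmap
+ * operations.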
+ * See also http://info.iet.unipi.it/~luigi/qfq/ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +/* + * function prototypes + */ +static int qfq_enqueue_ifclassq(struct ifclassq *, struct mbuf *); +static struct mbuf *qfq_dequeue_ifclassq(struct ifclassq *, cqdq_op_t); +static int qfq_request_ifclassq(struct ifclassq *, cqrq_t, void *); +static int qfq_clear_interface(struct qfq_if *); +static struct qfq_class *qfq_class_create(struct qfq_if *, u_int32_t, + u_int32_t, u_int32_t, u_int32_t, u_int32_t); +static int qfq_class_destroy(struct qfq_if *, struct qfq_class *); +static int qfq_destroy_locked(struct qfq_if *); +static inline int qfq_addq(struct qfq_class *, struct mbuf *, struct pf_mtag *); +static inline struct mbuf *qfq_getq(struct qfq_class *); +static inline struct mbuf *qfq_pollq(struct qfq_class *); +static void qfq_purgeq(struct qfq_if *, struct qfq_class *, u_int32_t, + u_int32_t *, u_int32_t *); +static void qfq_purge_sc(struct qfq_if *, cqrq_purge_sc_t *); +static void qfq_updateq(struct qfq_if *, struct qfq_class *, cqev_t); +static int qfq_throttle(struct qfq_if *, cqrq_throttle_t *); +static int qfq_resumeq(struct qfq_if *, struct qfq_class *); +static int qfq_suspendq(struct qfq_if *, struct qfq_class *); +static inline struct qfq_class *qfq_clh_to_clp(struct qfq_if *, u_int32_t); +static const char *qfq_style(struct qfq_if *); + +static inline int qfq_gt(u_int64_t, u_int64_t); +static inline u_int64_t qfq_round_down(u_int64_t, u_int32_t); +static inline struct qfq_group *qfq_ffs(struct qfq_if *, pktsched_bitmap_t); +static int qfq_calc_index(struct qfq_class *, u_int32_t, u_int32_t); +static inline pktsched_bitmap_t mask_from(pktsched_bitmap_t, int); +static inline u_int32_t qfq_calc_state(struct qfq_if *, struct qfq_group *); +static inline void qfq_move_groups(struct qfq_if *, pktsched_bitmap_t, + int, int); +static inline void qfq_unblock_groups(struct qfq_if *, int, u_int64_t); +static inline void qfq_make_eligible(struct qfq_if *, u_int64_t); +static inline void qfq_slot_insert(struct qfq_if *, struct qfq_group *, + struct qfq_class *, u_int64_t); +static inline void qfq_front_slot_remove(struct qfq_group *); +static inline struct qfq_class *qfq_slot_scan(struct qfq_if *, + struct qfq_group *); +static inline void qfq_slot_rotate(struct qfq_if *, struct qfq_group *, + u_int64_t); +static inline void qfq_update_eligible(struct qfq_if *, u_int64_t); +static inline int qfq_update_class(struct qfq_if *, struct qfq_group *, + struct qfq_class *); +static inline void qfq_update_start(struct qfq_if *, struct qfq_class *); +static inline void qfq_slot_remove(struct qfq_if *, struct qfq_group *, + struct qfq_class *); +static void qfq_deactivate_class(struct qfq_if *, struct qfq_class *); +static const char *qfq_state2str(int); +#if QFQ_DEBUG +static void qfq_dump_groups(struct qfq_if *, u_int32_t); +static void qfq_dump_sched(struct qfq_if *, const char *); +#endif /* QFQ_DEBUG */ + +#define QFQ_ZONE_MAX 32 /* maximum elements in zone */ +#define QFQ_ZONE_NAME "pktsched_qfq" /* zone name */ + +static unsigned int qfq_size; /* size of zone element */ +static struct zone *qfq_zone; /* zone for qfq */ + +#define QFQ_CL_ZONE_MAX 32 /* maximum elements in zone */ +#define QFQ_CL_ZONE_NAME "pktsched_qfq_cl" /* zone name */ + +static unsigned int qfq_cl_size; /* size of zone element */ +static struct zone *qfq_cl_zone; /* zone for qfq_class */ + +/* + * Maximum number of 
consecutive slots occupied by backlogged classes + * inside a group. This is approx lmax/lmin + 5. Used when ALTQ is + * available. + * + * XXX check because it poses constraints on MAX_INDEX + */ +#define QFQ_MAX_SLOTS 32 /* default when ALTQ is available */ + +void +qfq_init(void) +{ + qfq_size = sizeof (struct qfq_if); + qfq_zone = zinit(qfq_size, QFQ_ZONE_MAX * qfq_size, + 0, QFQ_ZONE_NAME); + if (qfq_zone == NULL) { + panic("%s: failed allocating %s", __func__, QFQ_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(qfq_zone, Z_EXPAND, TRUE); + zone_change(qfq_zone, Z_CALLERACCT, TRUE); + + qfq_cl_size = sizeof (struct qfq_class); + qfq_cl_zone = zinit(qfq_cl_size, QFQ_CL_ZONE_MAX * qfq_cl_size, + 0, QFQ_CL_ZONE_NAME); + if (qfq_cl_zone == NULL) { + panic("%s: failed allocating %s", __func__, QFQ_CL_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(qfq_cl_zone, Z_EXPAND, TRUE); + zone_change(qfq_cl_zone, Z_CALLERACCT, TRUE); +} + +struct qfq_if * +qfq_alloc(struct ifnet *ifp, int how, boolean_t altq) +{ + struct qfq_if *qif; + + qif = (how == M_WAITOK) ? zalloc(qfq_zone) : zalloc_noblock(qfq_zone); + if (qif == NULL) + return (NULL); + + bzero(qif, qfq_size); + qif->qif_ifq = &ifp->if_snd; + if (altq) { + qif->qif_maxclasses = QFQ_MAX_CLASSES; + qif->qif_maxslots = QFQ_MAX_SLOTS; + qif->qif_flags |= QFQIFF_ALTQ; + } else { + qif->qif_maxclasses = IFCQ_SC_MAX; + /* + * TODO: adi@apple.com + * + * Ideally I would like to have the following + * but QFQ needs further modifications. + * + * qif->qif_maxslots = IFCQ_SC_MAX; + */ + qif->qif_maxslots = QFQ_MAX_SLOTS; + } + + if ((qif->qif_class_tbl = _MALLOC(sizeof (struct qfq_class *) * + qif->qif_maxclasses, M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) { + log(LOG_ERR, "%s: %s unable to allocate class table array\n", + if_name(ifp), qfq_style(qif)); + goto error; + } + + if ((qif->qif_groups = _MALLOC(sizeof (struct qfq_group *) * + (QFQ_MAX_INDEX + 1), M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) { + log(LOG_ERR, "%s: %s unable to allocate group array\n", + if_name(ifp), qfq_style(qif)); + goto error; + } + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s scheduler allocated\n", + if_name(ifp), qfq_style(qif)); + } + + return (qif); + +error: + if (qif->qif_class_tbl != NULL) { + _FREE(qif->qif_class_tbl, M_DEVBUF); + qif->qif_class_tbl = NULL; + } + if (qif->qif_groups != NULL) { + _FREE(qif->qif_groups, M_DEVBUF); + qif->qif_groups = NULL; + } + zfree(qfq_zone, qif); + + return (NULL); +} + +int +qfq_destroy(struct qfq_if *qif) +{ + struct ifclassq *ifq = qif->qif_ifq; + int err; + + IFCQ_LOCK(ifq); + err = qfq_destroy_locked(qif); + IFCQ_UNLOCK(ifq); + + return (err); +} + +static int +qfq_destroy_locked(struct qfq_if *qif) +{ + int i; + + IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq); + + (void) qfq_clear_interface(qif); + + VERIFY(qif->qif_class_tbl != NULL); + _FREE(qif->qif_class_tbl, M_DEVBUF); + qif->qif_class_tbl = NULL; + + VERIFY(qif->qif_groups != NULL); + for (i = 0; i <= QFQ_MAX_INDEX; i++) { + struct qfq_group *grp = qif->qif_groups[i]; + + if (grp != NULL) { + VERIFY(grp->qfg_slots != NULL); + _FREE(grp->qfg_slots, M_DEVBUF); + grp->qfg_slots = NULL; + _FREE(grp, M_DEVBUF); + qif->qif_groups[i] = NULL; + } + } + _FREE(qif->qif_groups, M_DEVBUF); + qif->qif_groups = NULL; + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s scheduler destroyed\n", + if_name(QFQIF_IFP(qif)), qfq_style(qif)); + } + + zfree(qfq_zone, qif); + + return (0); +} + +/* + * bring the interface back to the initial state by discarding + * all the filters and classes. 
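+ * Any packets still queued in a class are freed along the way,
+ * since qfq_class_destroy() purges the class queue first.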
+ */ +static int +qfq_clear_interface(struct qfq_if *qif) +{ + struct qfq_class *cl; + int i; + + IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq); + + /* clear out the classes */ + for (i = 0; i < qif->qif_maxclasses; i++) + if ((cl = qif->qif_class_tbl[i]) != NULL) + qfq_class_destroy(qif, cl); + + return (0); +} + +/* discard all the queued packets on the interface */ +void +qfq_purge(struct qfq_if *qif) +{ + struct qfq_class *cl; + int i; + + IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq); + + for (i = 0; i < qif->qif_maxclasses; i++) { + if ((cl = qif->qif_class_tbl[i]) != NULL) + qfq_purgeq(qif, cl, 0, NULL, NULL); + } +#if !PF_ALTQ + /* + * This assertion is safe to be made only when PF_ALTQ is not + * configured; otherwise, IFCQ_LEN represents the sum of the + * packets managed by ifcq_disc and altq_disc instances, which + * is possible when transitioning between the two. + */ + VERIFY(IFCQ_LEN(qif->qif_ifq) == 0); +#endif /* !PF_ALTQ */ +} + +static void +qfq_purge_sc(struct qfq_if *qif, cqrq_purge_sc_t *pr) +{ + struct ifclassq *ifq = qif->qif_ifq; + u_int32_t i; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + VERIFY(pr->sc == MBUF_SC_UNSPEC || MBUF_VALID_SC(pr->sc)); + VERIFY(pr->flow != 0); + + if (pr->sc != MBUF_SC_UNSPEC) { + i = MBUF_SCIDX(pr->sc); + VERIFY(i < IFCQ_SC_MAX); + + qfq_purgeq(qif, ifq->ifcq_disc_slots[i].cl, + pr->flow, &pr->packets, &pr->bytes); + } else { + u_int32_t cnt, len; + + pr->packets = 0; + pr->bytes = 0; + + for (i = 0; i < IFCQ_SC_MAX; i++) { + qfq_purgeq(qif, ifq->ifcq_disc_slots[i].cl, + pr->flow, &cnt, &len); + pr->packets += cnt; + pr->bytes += len; + } + } +} + +void +qfq_event(struct qfq_if *qif, cqev_t ev) +{ + struct qfq_class *cl; + int i; + + IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq); + + for (i = 0; i < qif->qif_maxclasses; i++) + if ((cl = qif->qif_class_tbl[i]) != NULL) + qfq_updateq(qif, cl, ev); +} + +int +qfq_add_queue(struct qfq_if *qif, u_int32_t qlimit, u_int32_t weight, + u_int32_t maxsz, u_int32_t flags, u_int32_t qid, struct qfq_class **clp) +{ + struct qfq_class *cl; + u_int32_t w; + + IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq); + + if (qfq_clh_to_clp(qif, qid) != NULL) + return (EBUSY); + + /* check parameters */ + if (weight == 0 || weight > QFQ_MAX_WEIGHT) + return (EINVAL); + + w = (QFQ_ONE_FP / (QFQ_ONE_FP / weight)); + if (qif->qif_wsum + w > QFQ_MAX_WSUM) + return (EINVAL); + + if (maxsz == 0 || maxsz > (1 << QFQ_MTU_SHIFT)) + return (EINVAL); + + cl = qfq_class_create(qif, weight, qlimit, flags, maxsz, qid); + if (cl == NULL) + return (ENOMEM); + + if (clp != NULL) + *clp = cl; + + return (0); +} + +static struct qfq_class * +qfq_class_create(struct qfq_if *qif, u_int32_t weight, u_int32_t qlimit, + u_int32_t flags, u_int32_t maxsz, u_int32_t qid) +{ + struct ifnet *ifp; + struct ifclassq *ifq; + struct qfq_group *grp; + struct qfq_class *cl; + u_int32_t w; /* approximated weight */ + int i; + + IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq); + + /* Sanitize flags unless internally configured */ + if (qif->qif_flags & QFQIFF_ALTQ) + flags &= QFCF_USERFLAGS; + + if (qif->qif_classes >= qif->qif_maxclasses) { + log(LOG_ERR, "%s: %s out of classes! 
(max %d)\n",
+		    if_name(QFQIF_IFP(qif)), qfq_style(qif),
+		    qif->qif_maxclasses);
+		return (NULL);
+	}
+
+#if !CLASSQ_RED
+	if (flags & QFCF_RED) {
+		log(LOG_ERR, "%s: %s RED not available!\n",
+		    if_name(QFQIF_IFP(qif)), qfq_style(qif));
+		return (NULL);
+	}
+#endif /* !CLASSQ_RED */
+
+#if !CLASSQ_RIO
+	if (flags & QFCF_RIO) {
+		log(LOG_ERR, "%s: %s RIO not available!\n",
+		    if_name(QFQIF_IFP(qif)), qfq_style(qif));
+		return (NULL);
+	}
+#endif /* !CLASSQ_RIO */
+
+#if !CLASSQ_BLUE
+	if (flags & QFCF_BLUE) {
+		log(LOG_ERR, "%s: %s BLUE not available!\n",
+		    if_name(QFQIF_IFP(qif)), qfq_style(qif));
+		return (NULL);
+	}
+#endif /* !CLASSQ_BLUE */
+
+	/* These are mutually exclusive */
+	if ((flags & (QFCF_RED|QFCF_RIO|QFCF_BLUE|QFCF_SFB)) &&
+	    (flags & (QFCF_RED|QFCF_RIO|QFCF_BLUE|QFCF_SFB)) != QFCF_RED &&
+	    (flags & (QFCF_RED|QFCF_RIO|QFCF_BLUE|QFCF_SFB)) != QFCF_RIO &&
+	    (flags & (QFCF_RED|QFCF_RIO|QFCF_BLUE|QFCF_SFB)) != QFCF_BLUE &&
+	    (flags & (QFCF_RED|QFCF_RIO|QFCF_BLUE|QFCF_SFB)) != QFCF_SFB) {
+		log(LOG_ERR, "%s: %s more than one RED|RIO|BLUE|SFB\n",
+		    if_name(QFQIF_IFP(qif)), qfq_style(qif));
+		return (NULL);
+	}
+
+	ifq = qif->qif_ifq;
+	ifp = QFQIF_IFP(qif);
+
+	cl = zalloc(qfq_cl_zone);
+	if (cl == NULL)
+		return (NULL);
+
+	bzero(cl, qfq_cl_size);
+
+	if (qlimit == 0 || qlimit > IFCQ_MAXLEN(ifq)) {
+		qlimit = IFCQ_MAXLEN(ifq);
+		if (qlimit == 0)
+			qlimit = DEFAULT_QLIMIT;	/* use default */
+	}
+	_qinit(&cl->cl_q, Q_DROPTAIL, qlimit);
+	cl->cl_qif = qif;
+	cl->cl_flags = flags;
+	cl->cl_handle = qid;
+
+	/*
+	 * Find a free slot in the class table. If the slot matching
+	 * the lower bits of qid is free, use this slot. Otherwise,
+	 * use the first free slot.
+	 */
+	i = qid % qif->qif_maxclasses;
+	if (qif->qif_class_tbl[i] == NULL) {
+		qif->qif_class_tbl[i] = cl;
+	} else {
+		for (i = 0; i < qif->qif_maxclasses; i++) {
+			if (qif->qif_class_tbl[i] == NULL) {
+				qif->qif_class_tbl[i] = cl;
+				break;
+			}
+		}
+		if (i == qif->qif_maxclasses) {
+			zfree(qfq_cl_zone, cl);
+			return (NULL);
+		}
+	}
+
+	w = weight;
+	VERIFY(w > 0 && w <= QFQ_MAX_WEIGHT);
+	cl->cl_lmax = maxsz;
+	cl->cl_inv_w = (QFQ_ONE_FP / w);
+	w = (QFQ_ONE_FP / cl->cl_inv_w);
+	VERIFY(qif->qif_wsum + w <= QFQ_MAX_WSUM);
+
+	i = qfq_calc_index(cl, cl->cl_inv_w, cl->cl_lmax);
+	VERIFY(i <= QFQ_MAX_INDEX);
+	grp = qif->qif_groups[i];
+	if (grp == NULL) {
+		grp = _MALLOC(sizeof (*grp), M_DEVBUF, M_WAITOK|M_ZERO);
+		if (grp != NULL) {
+			grp->qfg_index = i;
+			grp->qfg_slot_shift =
+			    QFQ_MTU_SHIFT + QFQ_FRAC_BITS - (QFQ_MAX_INDEX - i);
+			grp->qfg_slots = _MALLOC(sizeof (struct qfq_class *) *
+			    qif->qif_maxslots, M_DEVBUF, M_WAITOK|M_ZERO);
+			if (grp->qfg_slots == NULL) {
+				log(LOG_ERR, "%s: %s unable to allocate group "
+				    "slots for index %d\n", if_name(ifp),
+				    qfq_style(qif), i);
+			}
+		} else {
+			log(LOG_ERR, "%s: %s unable to allocate group for "
+			    "qid=%d\n", if_name(ifp), qfq_style(qif),
+			    cl->cl_handle);
+		}
+		if (grp == NULL || grp->qfg_slots == NULL) {
+			qif->qif_class_tbl[qid % qif->qif_maxclasses] = NULL;
+			if (grp != NULL)
+				_FREE(grp, M_DEVBUF);
+			zfree(qfq_cl_zone, cl);
+			return (NULL);
+		} else {
+			qif->qif_groups[i] = grp;
+		}
+	}
+	cl->cl_grp = grp;
+	qif->qif_wsum += w;
+	/* XXX cl->cl_S = qif->qif_V; ?
*/ + /* XXX compute qif->qif_i_wsum */ + + qif->qif_classes++; + + if (flags & QFCF_DEFAULTCLASS) + qif->qif_default = cl; + + if (flags & (QFCF_RED|QFCF_RIO|QFCF_BLUE|QFCF_SFB)) { +#if CLASSQ_RED || CLASSQ_RIO + u_int64_t ifbandwidth = ifnet_output_linkrate(ifp); + int pkttime; +#endif /* CLASSQ_RED || CLASSQ_RIO */ + + cl->cl_qflags = 0; + if (flags & QFCF_ECN) { + if (flags & QFCF_BLUE) + cl->cl_qflags |= BLUEF_ECN; + else if (flags & QFCF_SFB) + cl->cl_qflags |= SFBF_ECN; + else if (flags & QFCF_RED) + cl->cl_qflags |= REDF_ECN; + else if (flags & QFCF_RIO) + cl->cl_qflags |= RIOF_ECN; + } + if (flags & QFCF_FLOWCTL) { + if (flags & QFCF_SFB) + cl->cl_qflags |= SFBF_FLOWCTL; + } + if (flags & QFCF_CLEARDSCP) { + if (flags & QFCF_RIO) + cl->cl_qflags |= RIOF_CLEARDSCP; + } +#if CLASSQ_RED || CLASSQ_RIO + /* + * XXX: RED & RIO should be watching link speed and MTU + * events and recompute pkttime accordingly. + */ + if (ifbandwidth < 8) + pkttime = 1000 * 1000 * 1000; /* 1 sec */ + else + pkttime = (int64_t)ifp->if_mtu * 1000 * 1000 * 1000 / + (ifbandwidth / 8); + + /* Test for exclusivity {RED,RIO,BLUE,SFB} was done above */ +#if CLASSQ_RED + if (flags & QFCF_RED) { + cl->cl_red = red_alloc(ifp, 0, 0, + qlimit(&cl->cl_q) * 10/100, + qlimit(&cl->cl_q) * 30/100, + cl->cl_qflags, pkttime); + if (cl->cl_red != NULL) + qtype(&cl->cl_q) = Q_RED; + } +#endif /* CLASSQ_RED */ +#if CLASSQ_RIO + if (flags & QFCF_RIO) { + cl->cl_rio = + rio_alloc(ifp, 0, NULL, cl->cl_qflags, pkttime); + if (cl->cl_rio != NULL) + qtype(&cl->cl_q) = Q_RIO; + } +#endif /* CLASSQ_RIO */ +#endif /* CLASSQ_RED || CLASSQ_RIO */ +#if CLASSQ_BLUE + if (flags & QFCF_BLUE) { + cl->cl_blue = blue_alloc(ifp, 0, 0, cl->cl_qflags); + if (cl->cl_blue != NULL) + qtype(&cl->cl_q) = Q_BLUE; + } +#endif /* CLASSQ_BLUE */ + if (flags & QFCF_SFB) { + if (!(cl->cl_flags & QFCF_LAZY)) + cl->cl_sfb = sfb_alloc(ifp, cl->cl_handle, + qlimit(&cl->cl_q), cl->cl_qflags); + if (cl->cl_sfb != NULL || (cl->cl_flags & QFCF_LAZY)) + qtype(&cl->cl_q) = Q_SFB; + } + } + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s created qid=%d grp=%d weight=%d " + "qlimit=%d flags=%b\n", if_name(ifp), qfq_style(qif), + cl->cl_handle, cl->cl_grp->qfg_index, weight, qlimit, + flags, QFCF_BITS); + } + + return (cl); +} + +int +qfq_remove_queue(struct qfq_if *qif, u_int32_t qid) +{ + struct qfq_class *cl; + + IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq); + + if ((cl = qfq_clh_to_clp(qif, qid)) == NULL) + return (EINVAL); + + return (qfq_class_destroy(qif, cl)); +} + +static int +qfq_class_destroy(struct qfq_if *qif, struct qfq_class *cl) +{ + struct ifclassq *ifq = qif->qif_ifq; + int i; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + qfq_purgeq(qif, cl, 0, NULL, NULL); + + if (cl->cl_inv_w != 0) { + qif->qif_wsum -= (QFQ_ONE_FP / cl->cl_inv_w); + cl->cl_inv_w = 0; /* reset weight to avoid run twice */ + } + + for (i = 0; i < qif->qif_maxclasses; i++) { + if (qif->qif_class_tbl[i] == cl) { + qif->qif_class_tbl[i] = NULL; + break; + } + } + qif->qif_classes--; + + if (cl->cl_qalg.ptr != NULL) { +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + rio_destroy(cl->cl_rio); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + red_destroy(cl->cl_red); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + blue_destroy(cl->cl_blue); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + sfb_destroy(cl->cl_sfb); + cl->cl_qalg.ptr = NULL; + qtype(&cl->cl_q) = Q_DROPTAIL; + qstate(&cl->cl_q) = QS_RUNNING; + } + + if (qif->qif_default == cl) + 
qif->qif_default = NULL; + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s destroyed qid=%d\n", + if_name(QFQIF_IFP(qif)), qfq_style(qif), cl->cl_handle); + } + + zfree(qfq_cl_zone, cl); + + return (0); +} + +/* + * Calculate a mask to mimic what would be ffs_from() + */ +static inline pktsched_bitmap_t +mask_from(pktsched_bitmap_t bitmap, int from) +{ + return (bitmap & ~((1UL << from) - 1)); +} + +/* + * The state computation relies on ER=0, IR=1, EB=2, IB=3 + * First compute eligibility comparing grp->qfg_S, qif->qif_V, + * then check if someone is blocking us and possibly add EB + */ +static inline u_int32_t +qfq_calc_state(struct qfq_if *qif, struct qfq_group *grp) +{ + /* if S > V we are not eligible */ + u_int32_t state = qfq_gt(grp->qfg_S, qif->qif_V); + pktsched_bitmap_t mask = mask_from(qif->qif_bitmaps[ER], + grp->qfg_index); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(qif, mask); + if (qfq_gt(grp->qfg_F, next->qfg_F)) + state |= EB; + } + + return (state); +} + +/* + * In principle + * qif->qif_bitmaps[dst] |= qif->qif_bitmaps[src] & mask; + * qif->qif_bitmaps[src] &= ~mask; + * but we should make sure that src != dst + */ +static inline void +qfq_move_groups(struct qfq_if *qif, pktsched_bitmap_t mask, int src, int dst) +{ + qif->qif_bitmaps[dst] |= qif->qif_bitmaps[src] & mask; + qif->qif_bitmaps[src] &= ~mask; +} + +static inline void +qfq_unblock_groups(struct qfq_if *qif, int index, u_int64_t old_finish) +{ + pktsched_bitmap_t mask = mask_from(qif->qif_bitmaps[ER], index + 1); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(qif, mask); + if (!qfq_gt(next->qfg_F, old_finish)) + return; + } + + mask = (1UL << index) - 1; + qfq_move_groups(qif, mask, EB, ER); + qfq_move_groups(qif, mask, IB, IR); +} + +/* + * perhaps + * + * old_V ^= qif->qif_V; + * old_V >>= QFQ_MIN_SLOT_SHIFT; + * if (old_V) { + * ... + * } + */ +static inline void +qfq_make_eligible(struct qfq_if *qif, u_int64_t old_V) +{ + pktsched_bitmap_t mask, vslot, old_vslot; + + vslot = qif->qif_V >> QFQ_MIN_SLOT_SHIFT; + old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; + + if (vslot != old_vslot) { + mask = (2UL << (__fls(vslot ^ old_vslot))) - 1; + qfq_move_groups(qif, mask, IR, ER); + qfq_move_groups(qif, mask, IB, EB); + } +} + +/* + * XXX we should make sure that slot becomes less than 32. + * This is guaranteed by the input values. + * roundedS is always cl->qfg_S rounded on grp->qfg_slot_shift bits. + */ +static inline void +qfq_slot_insert(struct qfq_if *qif, struct qfq_group *grp, + struct qfq_class *cl, u_int64_t roundedS) +{ + u_int64_t slot = (roundedS - grp->qfg_S) >> grp->qfg_slot_shift; + u_int32_t i = (grp->qfg_front + slot) % qif->qif_maxslots; + + cl->cl_next = grp->qfg_slots[i]; + grp->qfg_slots[i] = cl; + pktsched_bit_set(slot, &grp->qfg_full_slots); +} + +/* + * remove the entry from the slot + */ +static inline void +qfq_front_slot_remove(struct qfq_group *grp) +{ + struct qfq_class **h = &grp->qfg_slots[grp->qfg_front]; + + *h = (*h)->cl_next; + if (!*h) + pktsched_bit_clr(0, &grp->qfg_full_slots); +} + +/* + * Returns the first full queue in a group. As a side effect, + * adjust the bucket list so the first non-empty bucket is at + * position 0 in qfg_full_slots. 
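+ *
+ * For example, with qfg_full_slots == 0x34 (binary 110100) the first
+ * set bit is at zero-based position 2, so qfg_front advances by 2
+ * (modulo qif_maxslots) and the mask is shifted down to binary 1101.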
+ */ +static inline struct qfq_class * +qfq_slot_scan(struct qfq_if *qif, struct qfq_group *grp) +{ + int i; + + if (pktsched_verbose > 2) { + log(LOG_DEBUG, "%s: %s grp=%d full_slots=0x%x\n", + if_name(QFQIF_IFP(qif)), qfq_style(qif), grp->qfg_index, + grp->qfg_full_slots); + } + + if (grp->qfg_full_slots == 0) + return (NULL); + + i = pktsched_ffs(grp->qfg_full_slots) - 1; /* zero-based */ + if (i > 0) { + grp->qfg_front = (grp->qfg_front + i) % qif->qif_maxslots; + grp->qfg_full_slots >>= i; + } + + return (grp->qfg_slots[grp->qfg_front]); +} + +/* + * adjust the bucket list. When the start time of a group decreases, + * we move the index down (modulo qif->qif_maxslots) so we don't need to + * move the objects. The mask of occupied slots must be shifted + * because we use ffs() to find the first non-empty slot. + * This covers decreases in the group's start time, but what about + * increases of the start time ? + * Here too we should make sure that i is less than 32 + */ +static inline void +qfq_slot_rotate(struct qfq_if *qif, struct qfq_group *grp, u_int64_t roundedS) +{ +#pragma unused(qif) + u_int32_t i = (grp->qfg_S - roundedS) >> grp->qfg_slot_shift; + + grp->qfg_full_slots <<= i; + grp->qfg_front = (grp->qfg_front - i) % qif->qif_maxslots; +} + +static inline void +qfq_update_eligible(struct qfq_if *qif, u_int64_t old_V) +{ + pktsched_bitmap_t ineligible; + + ineligible = qif->qif_bitmaps[IR] | qif->qif_bitmaps[IB]; + if (ineligible) { + if (!qif->qif_bitmaps[ER]) { + struct qfq_group *grp; + grp = qfq_ffs(qif, ineligible); + if (qfq_gt(grp->qfg_S, qif->qif_V)) + qif->qif_V = grp->qfg_S; + } + qfq_make_eligible(qif, old_V); + } +} + +/* + * Updates the class, returns true if also the group needs to be updated. + */ +static inline int +qfq_update_class(struct qfq_if *qif, struct qfq_group *grp, + struct qfq_class *cl) +{ +#pragma unused(qif) + cl->cl_S = cl->cl_F; + if (qempty(&cl->cl_q)) { + qfq_front_slot_remove(grp); + } else { + u_int32_t len; + u_int64_t roundedS; + + len = m_pktlen(qhead(&cl->cl_q)); + cl->cl_F = cl->cl_S + (u_int64_t)len * cl->cl_inv_w; + roundedS = qfq_round_down(cl->cl_S, grp->qfg_slot_shift); + if (roundedS == grp->qfg_S) + return (0); + + qfq_front_slot_remove(grp); + qfq_slot_insert(qif, grp, cl, roundedS); + } + return (1); +} + +/* + * note: CLASSQDQ_POLL returns the next packet without removing the packet + * from the queue. CLASSQDQ_REMOVE is a normal dequeue operation. + * CLASSQDQ_REMOVE must return the same packet if called immediately + * after CLASSQDQ_POLL. 
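+ *
+ * The dequeue loop below works on a local copy of the ER (eligible,
+ * ready) bitmap: it takes the lowest-indexed group with a set bit,
+ * and a set bit whose group has no backlogged slots is skipped in
+ * the copy (and counted under QFQ_DEBUG) rather than cleared from
+ * the real bitmap.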
+ */
+struct mbuf *
+qfq_dequeue(struct qfq_if *qif, cqdq_op_t op)
+{
+	pktsched_bitmap_t er_bits = qif->qif_bitmaps[ER];
+	struct ifclassq *ifq = qif->qif_ifq;
+	struct qfq_group *grp;
+	struct qfq_class *cl;
+	struct mbuf *m;
+	u_int64_t old_V;
+	u_int32_t len;
+
+	IFCQ_LOCK_ASSERT_HELD(ifq);
+
+	for (;;) {
+		if (er_bits == 0) {
+#if QFQ_DEBUG
+			if (qif->qif_queued && pktsched_verbose > 1)
+				qfq_dump_sched(qif, "start dequeue");
+#endif /* QFQ_DEBUG */
+			/* no eligible and ready packet */
+			return (NULL);
+		}
+		grp = qfq_ffs(qif, er_bits);
+		/* if group is non-empty, use it */
+		if (grp->qfg_full_slots != 0)
+			break;
+		pktsched_bit_clr(grp->qfg_index, &er_bits);
+#if QFQ_DEBUG
+		qif->qif_emptygrp++;
+#endif /* QFQ_DEBUG */
+	}
+	VERIFY(!IFCQ_IS_EMPTY(ifq));
+
+	cl = grp->qfg_slots[grp->qfg_front];
+	VERIFY(cl != NULL && !qempty(&cl->cl_q));
+
+	if (op == CLASSQDQ_POLL)
+		return (qfq_pollq(cl));
+
+	m = qfq_getq(cl);
+	VERIFY(m != NULL);	/* qalg must be work conserving */
+	len = m_pktlen(m);
+
+#if QFQ_DEBUG
+	qif->qif_queued--;
+#endif /* QFQ_DEBUG */
+
+	IFCQ_DEC_LEN(ifq);
+	if (qempty(&cl->cl_q))
+		cl->cl_period++;
+	PKTCNTR_ADD(&cl->cl_xmitcnt, 1, len);
+	IFCQ_XMIT_ADD(ifq, 1, len);
+
+	old_V = qif->qif_V;
+	qif->qif_V += (u_int64_t)len * QFQ_IWSUM;
+
+	if (pktsched_verbose > 2) {
+		log(LOG_DEBUG, "%s: %s qid=%d dequeue m=%p F=0x%llx V=0x%llx\n",
+		    if_name(QFQIF_IFP(qif)), qfq_style(qif), cl->cl_handle,
+		    m, cl->cl_F, qif->qif_V);
+	}
+
+	if (qfq_update_class(qif, grp, cl)) {
+		u_int64_t old_F = grp->qfg_F;
+
+		cl = qfq_slot_scan(qif, grp);
+		if (!cl) {	/* group gone, remove from ER */
+			pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[ER]);
+		} else {
+			u_int32_t s;
+			u_int64_t roundedS =
+			    qfq_round_down(cl->cl_S, grp->qfg_slot_shift);
+
+			if (grp->qfg_S == roundedS)
+				goto skip_unblock;
+
+			grp->qfg_S = roundedS;
+			grp->qfg_F = roundedS + (2ULL << grp->qfg_slot_shift);
+
+			/* remove from ER and put in the new set */
+			pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[ER]);
+			s = qfq_calc_state(qif, grp);
+			pktsched_bit_set(grp->qfg_index, &qif->qif_bitmaps[s]);
+		}
+		/* we need to unblock even if the group has gone away */
+		qfq_unblock_groups(qif, grp->qfg_index, old_F);
+	}
+
+skip_unblock:
+	qfq_update_eligible(qif, old_V);
+
+#if QFQ_DEBUG
+	if (!qif->qif_bitmaps[ER] && qif->qif_queued && pktsched_verbose > 1)
+		qfq_dump_sched(qif, "end dequeue");
+#endif /* QFQ_DEBUG */
+
+	return (m);
+}
+
+/*
+ * Assign a reasonable start time for a new flow k in group i.
+ * Admissible values for hat(F) are multiples of sigma_i
+ * no greater than V+sigma_i. Larger values mean that
+ * we had a wraparound so we consider the timestamp to be stale.
+ *
+ * If F is not stale and F >= V then we set S = F.
+ * Otherwise we should assign S = V, but this may violate
+ * the ordering in ER. So, if we have groups in ER, set S to
+ * the F_j of the first group j which would be blocking us.
+ * We are guaranteed not to move S backward because
+ * otherwise our group i would still be blocked.
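+ *
+ * Illustrative numbers (assumed here, not taken from the paper):
+ * with qfg_slot_shift = 3 (i.e. sigma_i = 8), V = 64 and a stored
+ * F = 200, roundedF = 200 and limit = 64 + 8 = 72; roundedF > limit,
+ * so the timestamp is treated as stale and S falls back to V, or to
+ * the finish time of the first blocking group found in ER.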
+ */ +static inline void +qfq_update_start(struct qfq_if *qif, struct qfq_class *cl) +{ + pktsched_bitmap_t mask; + u_int64_t limit, roundedF; + int slot_shift = cl->cl_grp->qfg_slot_shift; + + roundedF = qfq_round_down(cl->cl_F, slot_shift); + limit = qfq_round_down(qif->qif_V, slot_shift) + (1UL << slot_shift); + + if (!qfq_gt(cl->cl_F, qif->qif_V) || qfq_gt(roundedF, limit)) { + /* timestamp was stale */ + mask = mask_from(qif->qif_bitmaps[ER], cl->cl_grp->qfg_index); + if (mask) { + struct qfq_group *next = qfq_ffs(qif, mask); + if (qfq_gt(roundedF, next->qfg_F)) { + cl->cl_S = next->qfg_F; + return; + } + } + cl->cl_S = qif->qif_V; + } else { /* timestamp is not stale */ + cl->cl_S = cl->cl_F; + } +} + +int +qfq_enqueue(struct qfq_if *qif, struct qfq_class *cl, struct mbuf *m, + struct pf_mtag *t) +{ + struct ifclassq *ifq = qif->qif_ifq; + struct qfq_group *grp; + u_int64_t roundedS; + int len, ret, s; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(cl == NULL || cl->cl_qif == qif); + + if (cl == NULL) { + cl = qfq_clh_to_clp(qif, t->pftag_qid); + if (cl == NULL) { + cl = qif->qif_default; + if (cl == NULL) { + IFCQ_CONVERT_LOCK(ifq); + m_freem(m); + return (ENOBUFS); + } + } + } + + len = m_pktlen(m); + + ret = qfq_addq(cl, m, t); + if (ret != 0) { + if (ret == CLASSQEQ_SUCCESS_FC) { + /* packet enqueued, return advisory feedback */ + ret = EQFULL; + } else { + VERIFY(ret == CLASSQEQ_DROPPED || + ret == CLASSQEQ_DROPPED_FC || + ret == CLASSQEQ_DROPPED_SP); + /* packet has been freed in qfq_addq */ + PKTCNTR_ADD(&cl->cl_dropcnt, 1, len); + IFCQ_DROP_ADD(ifq, 1, len); + switch (ret) { + case CLASSQEQ_DROPPED: + return (ENOBUFS); + case CLASSQEQ_DROPPED_FC: + return (EQFULL); + case CLASSQEQ_DROPPED_SP: + return (EQSUSPENDED); + } + /* NOT REACHED */ + } + } + IFCQ_INC_LEN(ifq); + +#if QFQ_DEBUG + qif->qif_queued++; +#endif /* QFQ_DEBUG */ + + /* queue was not idle, we're done */ + if (qlen(&cl->cl_q) > 1) + goto done; + + /* queue was idle */ + grp = cl->cl_grp; + qfq_update_start(qif, cl); /* adjust start time */ + + /* compute new finish time and rounded start */ + cl->cl_F = cl->cl_S + (u_int64_t)len * cl->cl_inv_w; + roundedS = qfq_round_down(cl->cl_S, grp->qfg_slot_shift); + + /* + * Insert cl in the correct bucket. + * + * If cl->cl_S >= grp->qfg_S we don't need to adjust the bucket list + * and simply go to the insertion phase. Otherwise grp->qfg_S is + * decreasing, we must make room in the bucket list, and also + * recompute the group state. Finally, if there were no flows + * in this group and nobody was in ER make sure to adjust V. + */ + if (grp->qfg_full_slots != 0) { + if (!qfq_gt(grp->qfg_S, cl->cl_S)) + goto skip_update; + + /* create a slot for this cl->cl_S */ + qfq_slot_rotate(qif, grp, roundedS); + + /* group was surely ineligible, remove */ + pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[IR]); + pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[IB]); + } else if (!qif->qif_bitmaps[ER] && qfq_gt(roundedS, qif->qif_V)) { + qif->qif_V = roundedS; + } + + grp->qfg_S = roundedS; + grp->qfg_F = + roundedS + (2ULL << grp->qfg_slot_shift); /* i.e. 
2 sigma_i */ + s = qfq_calc_state(qif, grp); + pktsched_bit_set(grp->qfg_index, &qif->qif_bitmaps[s]); + + if (pktsched_verbose > 2) { + log(LOG_DEBUG, "%s: %s qid=%d enqueue m=%p state=%s 0x%x " + "S=0x%llx F=0x%llx V=0x%llx\n", if_name(QFQIF_IFP(qif)), + qfq_style(qif), cl->cl_handle, m, qfq_state2str(s), + qif->qif_bitmaps[s], cl->cl_S, cl->cl_F, qif->qif_V); + } + +skip_update: + qfq_slot_insert(qif, grp, cl, roundedS); + +done: + /* successfully queued. */ + return (ret); +} + +static inline void +qfq_slot_remove(struct qfq_if *qif, struct qfq_group *grp, + struct qfq_class *cl) +{ +#pragma unused(qif) + struct qfq_class **pprev; + u_int32_t i, offset; + u_int64_t roundedS; + + roundedS = qfq_round_down(cl->cl_S, grp->qfg_slot_shift); + offset = (roundedS - grp->qfg_S) >> grp->qfg_slot_shift; + i = (grp->qfg_front + offset) % qif->qif_maxslots; + + pprev = &grp->qfg_slots[i]; + while (*pprev && *pprev != cl) + pprev = &(*pprev)->cl_next; + + *pprev = cl->cl_next; + if (!grp->qfg_slots[i]) + pktsched_bit_clr(offset, &grp->qfg_full_slots); +} + +/* + * Called to forcibly destroy a queue. + * If the queue is not in the front bucket, or if it has + * other queues in the front bucket, we can simply remove + * the queue with no other side effects. + * Otherwise we must propagate the event up. + * XXX description to be completed. + */ +static void +qfq_deactivate_class(struct qfq_if *qif, struct qfq_class *cl) +{ + struct qfq_group *grp = cl->cl_grp; + pktsched_bitmap_t mask; + u_int64_t roundedS; + int s; + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s deactivate qid=%d grp=%d " + "full_slots=0x%x front=%d bitmaps={ER=0x%x,EB=0x%x," + "IR=0x%x,IB=0x%x}\n", + if_name(QFQIF_IFP(cl->cl_qif)), qfq_style(cl->cl_qif), + cl->cl_handle, grp->qfg_index, grp->qfg_full_slots, + grp->qfg_front, qif->qif_bitmaps[ER], qif->qif_bitmaps[EB], + qif->qif_bitmaps[IR], qif->qif_bitmaps[IB]); +#if QFQ_DEBUG + if (pktsched_verbose > 1) + qfq_dump_sched(qif, "start deactivate"); +#endif /* QFQ_DEBUG */ + } + + cl->cl_F = cl->cl_S; /* not needed if the class goes away */ + qfq_slot_remove(qif, grp, cl); + + if (grp->qfg_full_slots == 0) { + /* + * Nothing left in the group, remove from all sets. + * Do ER last because if we were blocking other groups + * we must unblock them. 
+ */ + pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[IR]); + pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[EB]); + pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[IB]); + + if (pktsched_bit_tst(grp->qfg_index, &qif->qif_bitmaps[ER]) && + !(qif->qif_bitmaps[ER] & ~((1UL << grp->qfg_index) - 1))) { + mask = qif->qif_bitmaps[ER] & + ((1UL << grp->qfg_index) - 1); + if (mask) + mask = ~((1UL << __fls(mask)) - 1); + else + mask = (pktsched_bitmap_t)~0UL; + qfq_move_groups(qif, mask, EB, ER); + qfq_move_groups(qif, mask, IB, IR); + } + pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[ER]); + } else if (!grp->qfg_slots[grp->qfg_front]) { + cl = qfq_slot_scan(qif, grp); + roundedS = qfq_round_down(cl->cl_S, grp->qfg_slot_shift); + if (grp->qfg_S != roundedS) { + pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[ER]); + pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[IR]); + pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[EB]); + pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[IB]); + grp->qfg_S = roundedS; + grp->qfg_F = roundedS + (2ULL << grp->qfg_slot_shift); + s = qfq_calc_state(qif, grp); + pktsched_bit_set(grp->qfg_index, &qif->qif_bitmaps[s]); + } + } + qfq_update_eligible(qif, qif->qif_V); + +#if QFQ_DEBUG + if (pktsched_verbose > 1) + qfq_dump_sched(qif, "end deactivate"); +#endif /* QFQ_DEBUG */ +} + +static const char * +qfq_state2str(int s) +{ + const char *c; + + switch (s) { + case ER: + c = "ER"; + break; + case IR: + c = "IR"; + break; + case EB: + c = "EB"; + break; + case IB: + c = "IB"; + break; + default: + c = "?"; + break; + } + return (c); +} + +static inline int +qfq_addq(struct qfq_class *cl, struct mbuf *m, struct pf_mtag *t) +{ + struct qfq_if *qif = cl->cl_qif; + struct ifclassq *ifq = qif->qif_ifq; + + IFCQ_LOCK_ASSERT_HELD(ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + return (rio_addq(cl->cl_rio, &cl->cl_q, m, t)); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + return (red_addq(cl->cl_red, &cl->cl_q, m, t)); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + return (blue_addq(cl->cl_blue, &cl->cl_q, m, t)); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q)) { + if (cl->cl_sfb == NULL) { + struct ifnet *ifp = QFQIF_IFP(qif); + + VERIFY(cl->cl_flags & QFCF_LAZY); + cl->cl_flags &= ~QFCF_LAZY; + IFCQ_CONVERT_LOCK(ifq); + + cl->cl_sfb = sfb_alloc(ifp, cl->cl_handle, + qlimit(&cl->cl_q), cl->cl_qflags); + if (cl->cl_sfb == NULL) { + /* fall back to droptail */ + qtype(&cl->cl_q) = Q_DROPTAIL; + cl->cl_flags &= ~QFCF_SFB; + cl->cl_qflags &= ~(SFBF_ECN | SFBF_FLOWCTL); + + log(LOG_ERR, "%s: %s SFB lazy allocation " + "failed for qid=%d grp=%d, falling back " + "to DROPTAIL\n", if_name(ifp), + qfq_style(qif), cl->cl_handle, + cl->cl_grp->qfg_index); + } else if (qif->qif_throttle != IFNET_THROTTLE_OFF) { + /* if there's pending throttling, set it */ + cqrq_throttle_t tr = { 1, qif->qif_throttle }; + int err = qfq_throttle(qif, &tr); + + if (err == EALREADY) + err = 0; + if (err != 0) { + tr.level = IFNET_THROTTLE_OFF; + (void) qfq_throttle(qif, &tr); + } + } + } + if (cl->cl_sfb != NULL) + return (sfb_addq(cl->cl_sfb, &cl->cl_q, m, t)); + } else if (qlen(&cl->cl_q) >= qlimit(&cl->cl_q)) { + IFCQ_CONVERT_LOCK(ifq); + m_freem(m); + return (CLASSQEQ_DROPPED); + } + + if (cl->cl_flags & QFCF_CLEARDSCP) + write_dsfield(m, t, 0); + + _addq(&cl->cl_q, m); + + return (0); +} + +static inline struct mbuf * +qfq_getq(struct qfq_class *cl) +{ + IFCQ_LOCK_ASSERT_HELD(cl->cl_qif->qif_ifq); + 
+#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + return (rio_getq(cl->cl_rio, &cl->cl_q)); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + return (red_getq(cl->cl_red, &cl->cl_q)); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + return (blue_getq(cl->cl_blue, &cl->cl_q)); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + return (sfb_getq(cl->cl_sfb, &cl->cl_q)); + + return (_getq(&cl->cl_q)); +} + +static inline struct mbuf * +qfq_pollq(struct qfq_class *cl) +{ + IFCQ_LOCK_ASSERT_HELD(cl->cl_qif->qif_ifq); + + return (qhead(&cl->cl_q)); +} + +static void +qfq_purgeq(struct qfq_if *qif, struct qfq_class *cl, u_int32_t flow, + u_int32_t *packets, u_int32_t *bytes) +{ + struct ifclassq *ifq = qif->qif_ifq; + u_int32_t cnt = 0, len = 0, qlen; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if ((qlen = qlen(&cl->cl_q)) == 0) + goto done; + + /* become regular mutex before freeing mbufs */ + IFCQ_CONVERT_LOCK(ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + rio_purgeq(cl->cl_rio, &cl->cl_q, flow, &cnt, &len); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + red_purgeq(cl->cl_red, &cl->cl_q, flow, &cnt, &len); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + blue_purgeq(cl->cl_blue, &cl->cl_q, flow, &cnt, &len); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + sfb_purgeq(cl->cl_sfb, &cl->cl_q, flow, &cnt, &len); + else + _flushq_flow(&cl->cl_q, flow, &cnt, &len); + + if (cnt > 0) { + VERIFY(qlen(&cl->cl_q) == (qlen - cnt)); +#if QFQ_DEBUG + VERIFY(qif->qif_queued >= cnt); + qif->qif_queued -= cnt; +#endif /* QFQ_DEBUG */ + + PKTCNTR_ADD(&cl->cl_dropcnt, cnt, len); + IFCQ_DROP_ADD(ifq, cnt, len); + + VERIFY(((signed)IFCQ_LEN(ifq) - cnt) >= 0); + IFCQ_LEN(ifq) -= cnt; + + if (qempty(&cl->cl_q)) + qfq_deactivate_class(qif, cl); + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s purge qid=%d weight=%d " + "qlen=[%d,%d] cnt=%d len=%d flow=0x%x\n", + if_name(QFQIF_IFP(qif)), + qfq_style(qif), cl->cl_handle, + (u_int32_t)(QFQ_ONE_FP / cl->cl_inv_w), qlen, + qlen(&cl->cl_q), cnt, len, flow); + } + } +done: + if (packets != NULL) + *packets = cnt; + if (bytes != NULL) + *bytes = len; +} + +static void +qfq_updateq(struct qfq_if *qif, struct qfq_class *cl, cqev_t ev) +{ + IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq); + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s update qid=%d weight=%d event=%s\n", + if_name(QFQIF_IFP(qif)), qfq_style(qif), + cl->cl_handle, (u_int32_t)(QFQ_ONE_FP / cl->cl_inv_w), + ifclassq_ev2str(ev)); + } + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + return (rio_updateq(cl->cl_rio, ev)); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + return (red_updateq(cl->cl_red, ev)); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + return (blue_updateq(cl->cl_blue, ev)); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + return (sfb_updateq(cl->cl_sfb, ev)); +} + +int +qfq_get_class_stats(struct qfq_if *qif, u_int32_t qid, + struct qfq_classstats *sp) +{ + struct qfq_class *cl; + + IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq); + + if ((cl = qfq_clh_to_clp(qif, qid)) == NULL) + return (EINVAL); + + sp->class_handle = cl->cl_handle; + sp->index = cl->cl_grp->qfg_index; + sp->weight = (QFQ_ONE_FP / cl->cl_inv_w); + sp->lmax = cl->cl_lmax; + sp->qlength = qlen(&cl->cl_q); + sp->qlimit = qlimit(&cl->cl_q); + sp->period = cl->cl_period; + sp->xmitcnt = cl->cl_xmitcnt; + 
sp->dropcnt = cl->cl_dropcnt; + + sp->qtype = qtype(&cl->cl_q); + sp->qstate = qstate(&cl->cl_q); +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + red_getstats(cl->cl_red, &sp->red[0]); +#endif /* CLASSQ_RED */ +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + rio_getstats(cl->cl_rio, &sp->red[0]); +#endif /* CLASSQ_RIO */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + blue_getstats(cl->cl_blue, &sp->blue); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + sfb_getstats(cl->cl_sfb, &sp->sfb); + + return (0); +} + +/* convert a class handle to the corresponding class pointer */ +static inline struct qfq_class * +qfq_clh_to_clp(struct qfq_if *qif, u_int32_t chandle) +{ + struct qfq_class *cl; + int i; + + IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq); + + /* + * First, try optimistically the slot matching the lower bits of + * the handle. If it fails, do the linear table search. + */ + i = chandle % qif->qif_maxclasses; + if ((cl = qif->qif_class_tbl[i]) != NULL && cl->cl_handle == chandle) + return (cl); + for (i = 0; i < qif->qif_maxclasses; i++) + if ((cl = qif->qif_class_tbl[i]) != NULL && + cl->cl_handle == chandle) + return (cl); + + return (NULL); +} + +static const char * +qfq_style(struct qfq_if *qif) +{ + return ((qif->qif_flags & QFQIFF_ALTQ) ? "ALTQ_QFQ" : "QFQ"); +} + +/* + * Generic comparison function, handling wraparound + */ +static inline int +qfq_gt(u_int64_t a, u_int64_t b) +{ + return ((int64_t)(a - b) > 0); +} + +/* + * Round a precise timestamp to its slotted value + */ +static inline u_int64_t +qfq_round_down(u_int64_t ts, u_int32_t shift) +{ + return (ts & ~((1ULL << shift) - 1)); +} + +/* + * Return the pointer to the group with lowest index in the bitmap + */ +static inline struct qfq_group * +qfq_ffs(struct qfq_if *qif, pktsched_bitmap_t bitmap) +{ + int index = pktsched_ffs(bitmap) - 1; /* zero-based */ + VERIFY(index >= 0 && index <= QFQ_MAX_INDEX && + qif->qif_groups[index] != NULL); + return (qif->qif_groups[index]); +} + +/* + * Calculate a flow index, given its weight and maximum packet length. + * index = log_2(maxlen/weight) but we need to apply the scaling. + * This is used only once at flow creation. + */ +static int +qfq_calc_index(struct qfq_class *cl, u_int32_t inv_w, u_int32_t maxlen) +{ + u_int64_t slot_size = (u_int64_t)maxlen *inv_w; + pktsched_bitmap_t size_map; + int index = 0; + + size_map = (pktsched_bitmap_t)(slot_size >> QFQ_MIN_SLOT_SHIFT); + if (!size_map) + goto out; + + index = __fls(size_map) + 1; /* basically a log_2() */ + index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); + + if (index < 0) + index = 0; +out: + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s qid=%d grp=%d W=%u, L=%u, I=%d\n", + if_name(QFQIF_IFP(cl->cl_qif)), qfq_style(cl->cl_qif), + cl->cl_handle, index, (u_int32_t)(QFQ_ONE_FP/inv_w), + maxlen, index); + } + return (index); +} + +#if QFQ_DEBUG +static void +qfq_dump_groups(struct qfq_if *qif, u_int32_t mask) +{ + int i, j; + + for (i = 0; i < QFQ_MAX_INDEX + 1; i++) { + struct qfq_group *g = qif->qif_groups[i]; + + if (0 == (mask & (1 << i))) + continue; + if (g == NULL) + continue; + + log(LOG_DEBUG, "%s: %s [%2d] full_slots 0x%x\n", + if_name(QFQIF_IFP(qif)), qfq_style(qif), i, + g->qfg_full_slots); + log(LOG_DEBUG, "%s: %s S 0x%20llx F 0x%llx %c\n", + if_name(QFQIF_IFP(qif)), qfq_style(qif), + g->qfg_S, g->qfg_F, mask & (1 << i) ? 
'1' : '0');
+
+		for (j = 0; j < qif->qif_maxslots; j++) {
+			if (g->qfg_slots[j]) {
+				log(LOG_DEBUG, "%s: %s bucket %d %p "
+				    "qid %d\n", if_name(QFQIF_IFP(qif)),
+				    qfq_style(qif), j, g->qfg_slots[j],
+				    g->qfg_slots[j]->cl_handle);
+			}
+		}
+	}
+}
+
+static void
+qfq_dump_sched(struct qfq_if *qif, const char *msg)
+{
+	log(LOG_DEBUG, "%s: %s --- in %s: ---\n",
+	    if_name(QFQIF_IFP(qif)), qfq_style(qif), msg);
+	log(LOG_DEBUG, "%s: %s emptygrp %d queued %d V 0x%llx\n",
+	    if_name(QFQIF_IFP(qif)), qfq_style(qif), qif->qif_emptygrp,
+	    qif->qif_queued, qif->qif_V);
+	log(LOG_DEBUG, "%s: %s ER 0x%08x\n",
+	    if_name(QFQIF_IFP(qif)), qfq_style(qif), qif->qif_bitmaps[ER]);
+	log(LOG_DEBUG, "%s: %s EB 0x%08x\n",
+	    if_name(QFQIF_IFP(qif)), qfq_style(qif), qif->qif_bitmaps[EB]);
+	log(LOG_DEBUG, "%s: %s IR 0x%08x\n",
+	    if_name(QFQIF_IFP(qif)), qfq_style(qif), qif->qif_bitmaps[IR]);
+	log(LOG_DEBUG, "%s: %s IB 0x%08x\n",
+	    if_name(QFQIF_IFP(qif)), qfq_style(qif), qif->qif_bitmaps[IB]);
+	qfq_dump_groups(qif, 0xffffffff);
+}
+#endif /* QFQ_DEBUG */
+
+/*
+ * qfq_enqueue_ifclassq is an enqueue function to be registered to
+ * (*ifcq_enqueue) in struct ifclassq.
+ */
+static int
+qfq_enqueue_ifclassq(struct ifclassq *ifq, struct mbuf *m)
+{
+	u_int32_t i;
+
+	IFCQ_LOCK_ASSERT_HELD(ifq);
+
+	if (!(m->m_flags & M_PKTHDR)) {
+		/* should not happen */
+		log(LOG_ERR, "%s: packet does not have pkthdr\n",
+		    if_name(ifq->ifcq_ifp));
+		IFCQ_CONVERT_LOCK(ifq);
+		m_freem(m);
+		return (ENOBUFS);
+	}
+
+	i = MBUF_SCIDX(mbuf_get_service_class(m));
+	VERIFY((u_int32_t)i < IFCQ_SC_MAX);
+
+	return (qfq_enqueue(ifq->ifcq_disc,
+	    ifq->ifcq_disc_slots[i].cl, m, m_pftag(m)));
+}
+
+/*
+ * qfq_dequeue_ifclassq is a dequeue function to be registered to
+ * (*ifcq_dequeue) in struct ifclassq.
+ *
+ * note: CLASSQDQ_POLL returns the next packet without removing the packet
+ * from the queue.  CLASSQDQ_REMOVE is a normal dequeue operation.
+ * CLASSQDQ_REMOVE must return the same packet if called immediately
+ * after CLASSQDQ_POLL.
+ */ +static struct mbuf * +qfq_dequeue_ifclassq(struct ifclassq *ifq, cqdq_op_t op) +{ + return (qfq_dequeue(ifq->ifcq_disc, op)); +} + +static int +qfq_request_ifclassq(struct ifclassq *ifq, cqrq_t req, void *arg) +{ + struct qfq_if *qif = (struct qfq_if *)ifq->ifcq_disc; + int err = 0; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + switch (req) { + case CLASSQRQ_PURGE: + qfq_purge(qif); + break; + + case CLASSQRQ_PURGE_SC: + qfq_purge_sc(qif, (cqrq_purge_sc_t *)arg); + break; + + case CLASSQRQ_EVENT: + qfq_event(qif, (cqev_t)arg); + break; + + case CLASSQRQ_THROTTLE: + err = qfq_throttle(qif, (cqrq_throttle_t *)arg); + break; + } + return (err); +} + +int +qfq_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags) +{ + struct ifnet *ifp = ifq->ifcq_ifp; + struct qfq_class *cl0, *cl1, *cl2, *cl3, *cl4; + struct qfq_class *cl5, *cl6, *cl7, *cl8, *cl9; + struct qfq_if *qif; + u_int32_t maxlen = 0, qflags = 0; + int err = 0; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(ifq->ifcq_disc == NULL); + VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE); + + if (flags & PKTSCHEDF_QALG_RED) + qflags |= QFCF_RED; + if (flags & PKTSCHEDF_QALG_RIO) + qflags |= QFCF_RIO; + if (flags & PKTSCHEDF_QALG_BLUE) + qflags |= QFCF_BLUE; + if (flags & PKTSCHEDF_QALG_SFB) + qflags |= QFCF_SFB; + if (flags & PKTSCHEDF_QALG_ECN) + qflags |= QFCF_ECN; + if (flags & PKTSCHEDF_QALG_FLOWCTL) + qflags |= QFCF_FLOWCTL; + + qif = qfq_alloc(ifp, M_WAITOK, FALSE); + if (qif == NULL) + return (ENOMEM); + + if ((maxlen = IFCQ_MAXLEN(ifq)) == 0) + maxlen = if_sndq_maxlen; + + if ((err = qfq_add_queue(qif, maxlen, 300, 1200, + qflags | QFCF_LAZY, SCIDX_BK_SYS, &cl0)) != 0) + goto cleanup; + + if ((err = qfq_add_queue(qif, maxlen, 600, 1400, + qflags | QFCF_LAZY, SCIDX_BK, &cl1)) != 0) + goto cleanup; + + if ((err = qfq_add_queue(qif, maxlen, 2400, 600, + qflags | QFCF_DEFAULTCLASS, SCIDX_BE, &cl2)) != 0) + goto cleanup; + + if ((err = qfq_add_queue(qif, maxlen, 2700, 600, + qflags | QFCF_LAZY, SCIDX_RD, &cl3)) != 0) + goto cleanup; + + if ((err = qfq_add_queue(qif, maxlen, 3000, 400, + qflags | QFCF_LAZY, SCIDX_OAM, &cl4)) != 0) + goto cleanup; + + if ((err = qfq_add_queue(qif, maxlen, 8000, 1000, + qflags | QFCF_LAZY, SCIDX_AV, &cl5)) != 0) + goto cleanup; + + if ((err = qfq_add_queue(qif, maxlen, 15000, 1200, + qflags | QFCF_LAZY, SCIDX_RV, &cl6)) != 0) + goto cleanup; + + if ((err = qfq_add_queue(qif, maxlen, 20000, 1400, + qflags | QFCF_LAZY, SCIDX_VI, &cl7)) != 0) + goto cleanup; + + if ((err = qfq_add_queue(qif, maxlen, 23000, 200, + qflags | QFCF_LAZY, SCIDX_VO, &cl8)) != 0) + goto cleanup; + + if ((err = qfq_add_queue(qif, maxlen, 25000, 200, + qflags, SCIDX_CTL, &cl9)) != 0) + goto cleanup; + + err = ifclassq_attach(ifq, PKTSCHEDT_QFQ, qif, + qfq_enqueue_ifclassq, qfq_dequeue_ifclassq, NULL, + qfq_request_ifclassq); + + /* cache these for faster lookup */ + if (err == 0) { + ifq->ifcq_disc_slots[SCIDX_BK_SYS].qid = SCIDX_BK_SYS; + ifq->ifcq_disc_slots[SCIDX_BK_SYS].cl = cl0; + + ifq->ifcq_disc_slots[SCIDX_BK].qid = SCIDX_BK; + ifq->ifcq_disc_slots[SCIDX_BK].cl = cl1; + + ifq->ifcq_disc_slots[SCIDX_BE].qid = SCIDX_BE; + ifq->ifcq_disc_slots[SCIDX_BE].cl = cl2; + + ifq->ifcq_disc_slots[SCIDX_RD].qid = SCIDX_RD; + ifq->ifcq_disc_slots[SCIDX_RD].cl = cl3; + + ifq->ifcq_disc_slots[SCIDX_OAM].qid = SCIDX_OAM; + ifq->ifcq_disc_slots[SCIDX_OAM].cl = cl4; + + ifq->ifcq_disc_slots[SCIDX_AV].qid = SCIDX_AV; + ifq->ifcq_disc_slots[SCIDX_AV].cl = cl5; + + ifq->ifcq_disc_slots[SCIDX_RV].qid = SCIDX_RV; + ifq->ifcq_disc_slots[SCIDX_RV].cl = cl6; + + 
ifq->ifcq_disc_slots[SCIDX_VI].qid = SCIDX_VI; + ifq->ifcq_disc_slots[SCIDX_VI].cl = cl7; + + ifq->ifcq_disc_slots[SCIDX_VO].qid = SCIDX_VO; + ifq->ifcq_disc_slots[SCIDX_VO].cl = cl8; + + ifq->ifcq_disc_slots[SCIDX_CTL].qid = SCIDX_CTL; + ifq->ifcq_disc_slots[SCIDX_CTL].cl = cl9; + } + +cleanup: + if (err != 0) + (void) qfq_destroy_locked(qif); + + return (err); +} + +int +qfq_teardown_ifclassq(struct ifclassq *ifq) +{ + struct qfq_if *qif = ifq->ifcq_disc; + int i; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(qif != NULL && ifq->ifcq_type == PKTSCHEDT_QFQ); + + (void) qfq_destroy_locked(qif); + + ifq->ifcq_disc = NULL; + for (i = 0; i < IFCQ_SC_MAX; i++) { + ifq->ifcq_disc_slots[i].qid = 0; + ifq->ifcq_disc_slots[i].cl = NULL; + } + + return (ifclassq_detach(ifq)); +} + +int +qfq_getqstats_ifclassq(struct ifclassq *ifq, u_int32_t slot, + struct if_ifclassq_stats *ifqs) +{ + struct qfq_if *qif = ifq->ifcq_disc; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(ifq->ifcq_type == PKTSCHEDT_QFQ); + + if (slot >= IFCQ_SC_MAX) + return (EINVAL); + + return (qfq_get_class_stats(qif, ifq->ifcq_disc_slots[slot].qid, + &ifqs->ifqs_qfq_stats)); +} + +static int +qfq_throttle(struct qfq_if *qif, cqrq_throttle_t *tr) +{ + struct ifclassq *ifq = qif->qif_ifq; + struct qfq_class *cl; + int err; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(!(qif->qif_flags & QFQIFF_ALTQ)); + + if (!tr->set) { + tr->level = qif->qif_throttle; + return (0); + } + + if (tr->level == qif->qif_throttle) + return (EALREADY); + + /* Current throttling levels only involve BK_SYS class */ + cl = ifq->ifcq_disc_slots[SCIDX_BK_SYS].cl; + + switch (tr->level) { + case IFNET_THROTTLE_OFF: + err = qfq_resumeq(qif, cl); + break; + + case IFNET_THROTTLE_OPPORTUNISTIC: + err = qfq_suspendq(qif, cl); + break; + + default: + VERIFY(0); + /* NOTREACHED */ + } + + if (err == 0 || err == ENXIO) { + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s throttling level %sset %d->%d\n", + if_name(QFQIF_IFP(qif)), qfq_style(qif), + (err == 0) ? 
"" : "lazy ", qif->qif_throttle, + tr->level); + } + qif->qif_throttle = tr->level; + if (err != 0) + err = 0; + else + qfq_purgeq(qif, cl, 0, NULL, NULL); + } else { + log(LOG_ERR, "%s: %s unable to set throttling level " + "%d->%d [error=%d]\n", if_name(QFQIF_IFP(qif)), + qfq_style(qif), qif->qif_throttle, tr->level, err); + } + + return (err); +} + +static int +qfq_resumeq(struct qfq_if *qif, struct qfq_class *cl) +{ + struct ifclassq *ifq = qif->qif_ifq; + int err = 0; + + IFCQ_LOCK_ASSERT_HELD(ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + err = rio_suspendq(cl->cl_rio, &cl->cl_q, FALSE); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + err = red_suspendq(cl->cl_red, &cl->cl_q, FALSE); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + err = blue_suspendq(cl->cl_blue, &cl->cl_q, FALSE); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + err = sfb_suspendq(cl->cl_sfb, &cl->cl_q, FALSE); + + if (err == 0) + qstate(&cl->cl_q) = QS_RUNNING; + + return (err); +} + +static int +qfq_suspendq(struct qfq_if *qif, struct qfq_class *cl) +{ + struct ifclassq *ifq = qif->qif_ifq; + int err = 0; + + IFCQ_LOCK_ASSERT_HELD(ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + err = rio_suspendq(cl->cl_rio, &cl->cl_q, TRUE); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + err = red_suspendq(cl->cl_red, &cl->cl_q, TRUE); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + err = blue_suspendq(cl->cl_blue, &cl->cl_q, TRUE); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q)) { + if (cl->cl_sfb != NULL) { + err = sfb_suspendq(cl->cl_sfb, &cl->cl_q, TRUE); + } else { + VERIFY(cl->cl_flags & QFCF_LAZY); + err = ENXIO; /* delayed throttling */ + } + } + + if (err == 0 || err == ENXIO) + qstate(&cl->cl_q) = QS_SUSPENDED; + + return (err); +} diff --git a/bsd/net/pktsched/pktsched_qfq.h b/bsd/net/pktsched/pktsched_qfq.h new file mode 100644 index 000000000..825cc9215 --- /dev/null +++ b/bsd/net/pktsched/pktsched_qfq.h @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NET_PKTSCHED_PKTSCHED_QFQ_H_ +#define _NET_PKTSCHED_PKTSCHED_QFQ_H_ + +#ifdef PRIVATE +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* qfq class flags */ +#define QFCF_RED 0x0001 /* use RED */ +#define QFCF_ECN 0x0002 /* use ECN with RED/BLUE/SFB */ +#define QFCF_RIO 0x0004 /* use RIO */ +#define QFCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define QFCF_BLUE 0x0100 /* use BLUE */ +#define QFCF_SFB 0x0200 /* use SFB */ +#define QFCF_FLOWCTL 0x0400 /* enable flow control advisories */ +#define QFCF_DEFAULTCLASS 0x1000 /* default class */ +#ifdef BSD_KERNEL_PRIVATE +#define QFCF_LAZY 0x10000000 /* on-demand resource allocation */ +#endif /* BSD_KERNEL_PRIVATE */ + +#define QFCF_USERFLAGS \ + (QFCF_RED | QFCF_ECN | QFCF_RIO | QFCF_CLEARDSCP | QFCF_BLUE | \ + QFCF_SFB | QFCF_FLOWCTL | QFCF_DEFAULTCLASS) + +#ifdef BSD_KERNEL_PRIVATE +#define QFCF_BITS \ + "\020\1RED\2ECN\3RIO\5CLEARDSCP\11BLUE\12SFB\13FLOWCTL\15DEFAULT" \ + "\35LAZY" +#else +#define QFCF_BITS \ + "\020\1RED\2ECN\3RIO\5CLEARDSCP\11BLUE\12SFB\13FLOWCTL\15DEFAULT" +#endif /* !BSD_KERNEL_PRIVATE */ + +#define QFQ_MAX_CLASSES 32 +#define QFQ_MAX_WSHIFT 16 /* log2(max_weight) */ +#define QFQ_MAX_WEIGHT (1 << QFQ_MAX_WSHIFT) + +struct qfq_classstats { + u_int32_t class_handle; + u_int32_t index; + u_int32_t weight; + u_int32_t lmax; + + u_int32_t qlength; + u_int32_t qlimit; + u_int32_t period; + struct pktcntr xmitcnt; /* transmitted packet counter */ + struct pktcntr dropcnt; /* dropped packet counter */ + + /* RED, RIO, BLUE, SFB related info */ + classq_type_t qtype; + union { + /* RIO has 3 red stats */ + struct red_stats red[RIO_NDROPPREC]; + struct blue_stats blue; + struct sfb_stats sfb; + }; + classq_state_t qstate; +}; + +#ifdef BSD_KERNEL_PRIVATE +#define QFQ_DEBUG 1 /* enable extra debugging */ + +/* + * Virtual time computations. + * + * S, F and V are all computed in fixed point arithmetic with + * FRAC_BITS decimal bits. + * + * QFQ_MAX_INDEX is the maximum index allowed for a group. We need + * one bit per index. 
+ *
+ * QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
+ * The layout of the bits is as below:
+ *
+ *                 [ MTU_SHIFT ][      FRAC_BITS    ]
+ *                 [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
+ *                                 ^.__grp->index = 0
+ *                                 *.__grp->slot_shift
+ *
+ * where MIN_SLOT_SHIFT is derived by difference from the others.
+ *
+ * The max group index corresponds to Lmax/w_min, where
+ * Lmax=1<<MTU_SHIFT, w_min = 1.
+ * From this, and knowing how many groups (MAX_INDEX) we want,
+ * we can derive the shift corresponding to each group.
+ */
+
+/*
+ * Shifts used for class<->group mapping. Class weights are in the
+ * range [1, QFQ_MAX_WEIGHT], we need to map each class i to the
+ * group with the smallest index that can support the L_i / r_i
+ * configured for the class.
+ *
+ * grp->qfg_index is the index of the group; and grp->qfg_slot_shift
+ * is the shift for the corresponding (scaled) sigma_i.
+ *
+ * When computing the group index, we do (len<<FRAC_BITS)/w.
+ */
+
+#define	QFQIF_IFP(_qif)		((_qif)->qif_ifq->ifcq_ifp)
+
+struct if_ifclassq_stats;
+
+extern void qfq_init(void);
+extern struct qfq_if *qfq_alloc(struct ifnet *, int, boolean_t);
+extern int qfq_destroy(struct qfq_if *);
+extern void qfq_purge(struct qfq_if *);
+extern void qfq_event(struct qfq_if *, cqev_t);
+extern int qfq_add_queue(struct qfq_if *, u_int32_t, u_int32_t, u_int32_t,
+    u_int32_t, u_int32_t, struct qfq_class **);
+extern int qfq_remove_queue(struct qfq_if *, u_int32_t);
+extern int qfq_get_class_stats(struct qfq_if *, u_int32_t,
+    struct qfq_classstats *);
+extern int qfq_enqueue(struct qfq_if *, struct qfq_class *, struct mbuf *,
+    struct pf_mtag *);
+extern struct mbuf *qfq_dequeue(struct qfq_if *, cqdq_op_t);
+extern int qfq_setup_ifclassq(struct ifclassq *, u_int32_t);
+extern int qfq_teardown_ifclassq(struct ifclassq *ifq);
+extern int qfq_getqstats_ifclassq(struct ifclassq *, u_int32_t,
+    struct if_ifclassq_stats *);
+#endif /* BSD_KERNEL_PRIVATE */
+#ifdef __cplusplus
+}
+#endif
+#endif /* PRIVATE */
+#endif /* _NET_PKTSCHED_PKTSCHED_QFQ_H_ */
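The fixed-point bookkeeping above is easiest to see with concrete numbers. Below is a minimal standalone sketch (not part of the patch): FRAC_BITS = 19 is an assumed value, and ONE_FP and iwsum stand in for the kernel's QFQ_ONE_FP and QFQ_IWSUM; the F and V updates mirror qfq_enqueue() and qfq_dequeue() earlier in this patch.

#include <stdio.h>
#include <stdint.h>

#define FRAC_BITS	19			/* assumed fixed-point width */
#define ONE_FP		(1ULL << FRAC_BITS)	/* stand-in for QFQ_ONE_FP */

int
main(void)
{
	uint64_t V = 0;				/* system virtual time */
	uint32_t w = 4, wsum = 8;		/* class weight, total weight */
	uint64_t inv_w = ONE_FP / w;		/* stored instead of w */
	uint64_t iwsum = ONE_FP / wsum;		/* stand-in for QFQ_IWSUM */
	uint64_t S = V;				/* start time of a newly backlogged class */
	uint32_t len = 1500;			/* packet length in bytes */

	/* F = S + len/w, computed as S + len * inv_w (cf. qfq_enqueue) */
	uint64_t F = S + (uint64_t)len * inv_w;
	/* V advances by len/wsum on every dequeue (cf. qfq_dequeue) */
	V += (uint64_t)len * iwsum;

	printf("S=%llu F=%llu V=%llu (all scaled by 2^%d)\n",
	    (unsigned long long)S, (unsigned long long)F,
	    (unsigned long long)V, FRAC_BITS);
	return (0);
}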
diff --git a/bsd/net/pktsched/pktsched_rmclass.c b/bsd/net/pktsched/pktsched_rmclass.c
new file mode 100644
index 000000000..a5f8e5a84
--- /dev/null
+++ b/bsd/net/pktsched/pktsched_rmclass.c
@@ -0,0 +1,1849 @@
+/*
+ * Copyright (c) 2007-2012 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/* $OpenBSD: altq_rmclass.c,v 1.13 2007/09/13 20:40:02 chl Exp $ */
+/* $KAME: altq_rmclass.c,v 1.10 2001/02/09 07:20:40 kjc Exp $ */
+
+/*
+ * Copyright (c) 1991-1997 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by the Network Research
+ *      Group at Lawrence Berkeley Laboratory.
+ * 4. Neither the name of the University nor of the Laboratory may be used
+ *    to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * LBL code modified by speer@eng.sun.com, May 1997.
+ * For questions and/or comments, please send mail to cbq@ee.lbl.gov
+ */
+
+#include
+
+#ident "@(#)rm_class.c  1.48     97/12/05 SMI"
+
+#if PKTSCHED_CBQ
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Local Macros
+ */
+
+#define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }
+
+/*
+ * Local routines.
+ */
+
+static int	rmc_satisfied(struct rm_class *, struct timeval *);
+static void	rmc_wrr_set_weights(struct rm_ifdat *);
+static void	rmc_depth_compute(struct rm_class *);
+static void	rmc_depth_recompute(rm_class_t *);
+
+static struct mbuf *_rmc_wrr_dequeue_next(struct rm_ifdat *, cqdq_op_t);
+static struct mbuf *_rmc_prr_dequeue_next(struct rm_ifdat *, cqdq_op_t);
+
+static int	_rmc_addq(rm_class_t *, struct mbuf *, struct pf_mtag *);
+static void	_rmc_dropq(rm_class_t *);
+static struct mbuf *_rmc_getq(rm_class_t *);
+static struct mbuf *_rmc_pollq(rm_class_t *);
+
+static int	rmc_under_limit(struct rm_class *, struct timeval *);
+static void	rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
+static void	rmc_drop_action(struct rm_class *);
+static void	rmc_restart(struct rm_class *);
+static void	rmc_root_overlimit(rm_class_t *, rm_class_t *);
+
+#define	RMC_ZONE_MAX	32		/* maximum elements in zone */
+#define	RMC_ZONE_NAME	"pktsched_cbq_cl" /* zone name (CBQ for now) */
+
+static unsigned int rmc_size;		/* size of zone element */
+static struct zone *rmc_zone;		/* zone for rm_class */
+
+void
+rmclass_init(void)
+{
+	if (rmc_zone != NULL)
+		return;
+
+	rmc_size = sizeof (struct rm_class);
+	rmc_zone = zinit(rmc_size, RMC_ZONE_MAX * rmc_size, 0, RMC_ZONE_NAME);
+	if (rmc_zone == NULL) {
+		panic("%s: failed allocating %s", __func__, RMC_ZONE_NAME);
+		/* NOTREACHED */
+	}
+	zone_change(rmc_zone, Z_EXPAND, TRUE);
+	zone_change(rmc_zone, Z_CALLERACCT, TRUE);
+}
+
+#define	BORROW_OFFTIME
+/*
+ * BORROW_OFFTIME (experimental):
+ * borrow the offtime of the class borrowing from.
+ * the reason is that when its own offtime is set, the class is unable
+ * to borrow much, especially when cutoff is taking effect.
+ * but when the borrowed class is overloaded (avgidle is close to minidle),
+ * use the borrowing class's offtime to avoid overload.
+ */
+#define	ADJUST_CUTOFF
+/*
+ * ADJUST_CUTOFF (experimental):
+ * if no underlimit class is found due to cutoff, increase cutoff and
+ * retry the scheduling loop.
+ * also, don't invoke delay_actions while cutoff is taking effect,
+ * since a sleeping class won't have a chance to be scheduled in the
+ * next loop.
+ *
+ * now the heuristics for setting the top-level variable (cutoff_) become:
+ *	1. if a packet arrives for a not-overlimit class, set cutoff
+ *	   to the depth of the class.
+ *	2. if cutoff is i, and a packet arrives for an overlimit class
+ *	   with an underlimit ancestor at a lower level than i (say j),
+ *	   then set cutoff to j.
+ *	3. at scheduling a packet, if there is no underlimit class
+ *	   due to the current cutoff level, increase cutoff by 1 and
+ *	   then try to schedule again.
+ */
+
+/*
+ * rm_class_t *
+ * rmc_newclass(...) - Create a new resource management class at priority
+ * 'pri' on the interface given by 'ifd'.
+ *
+ * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
+ *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
+ *              than 100% of the bandwidth, this number should be the
+ *              'effective' rate for the class.  Let f be the
+ *              bandwidth fraction allocated to this class, and let
+ *              nsPerByte be the data rate of the output link in
+ *              nanoseconds/byte.  Then nsecPerByte is set to
+ *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
+ *              for a class that gets 50% of an ethernet's bandwidth.
+ *
+ * action       the routine to call when the class is over limit.
+ *
+ * maxq         max allowable queue size for class (in packets).
+ *
+ * parent       parent class pointer.
+ *
+ * borrow       class to borrow from (should be either 'parent' or null).
+ *
+ * maxidle      max value allowed for class 'idle' time estimate (this
+ *              parameter determines how large an initial burst of packets
+ *              can be before overlimit action is invoked).
+ *
+ * offtime      how long 'delay' action will delay when class goes over
+ *              limit (this parameter determines the steady-state burst
+ *              size when a class is running over its limit).
+ *
+ * Maxidle and offtime have to be computed from the following:  If the
+ * average packet size is s, the bandwidth fraction allocated to this
+ * class is f, we want to allow b packet bursts, and the gain of the
+ * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
+ *
+ *      ptime = s * nsPerByte * (1 - f) / f
+ *      maxidle = ptime * (1 - g^b) / g^b
+ *      minidle = -ptime * (1 / (f - 1))
+ *      offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1))
+ *
+ * Operationally, it's convenient to specify maxidle & offtime in units
+ * independent of the link bandwidth so the maxidle & offtime passed to
+ * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
+ * (The constant factor is a scale factor needed to make the parameters
+ * integers.  This scaling also means that the 'unscaled' values of
+ * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
+ * not nanoseconds.)  Also note that the 'idle' filter computation keeps
+ * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
+ * maxidle also must be scaled upward by this value.  Thus, the passed
+ * values for maxidle and offtime can be computed as follows:
+ *
+ * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
+ * offtime = offtime * 8 / (1000 * nsecPerByte)
+ *
+ * When USE_HRTIME is employed, then maxidle and offtime become:
+ *	maxidle = maxidle * (8.0 / nsecPerByte);
+ *	offtime = offtime * (8.0 / nsecPerByte);
+ */
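As a sanity check on the scaling rules just described, the standalone sketch below computes the maxidle and offtime arguments a caller would pass for made-up inputs: a 10 Mb/s link (nsPerByte = 800), f = 0.5, average packet size s = 1000 bytes, burst b = 16, and an assumed RM_FILTER_GAIN of 5.

#include <stdio.h>
#include <math.h>

int
main(void)
{
	double s = 1000.0, f = 0.5, b = 16.0;
	double nsPerByte = 800.0;			/* link rate */
	double nsecPerByte = nsPerByte / f;		/* class effective rate */
	double gain = 5.0;				/* assumed RM_FILTER_GAIN */
	double g = 1.0 - pow(2.0, -gain);

	/* formulas from the rmc_newclass() comment above */
	double ptime = s * nsPerByte * (1.0 - f) / f;
	double maxidle = ptime * (1.0 - pow(g, b)) / pow(g, b);
	double offtime = ptime * (1.0 + 1.0 / (1.0 - g) *
	    (1.0 - pow(g, b - 1.0)) / pow(g, b - 1.0));

	/* scaled values actually passed in */
	printf("maxidle arg = %.0f\n",
	    maxidle * pow(2.0, gain) * 8.0 / (1000.0 * nsecPerByte));
	printf("offtime arg = %.0f\n",
	    offtime * 8.0 / (1000.0 * nsecPerByte));
	return (0);
}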
+struct rm_class *
+rmc_newclass(int pri, struct rm_ifdat *ifd, u_int32_t nsecPerByte,
+    void (*action)(rm_class_t *, rm_class_t *), u_int32_t qid, u_int32_t maxq,
+    struct rm_class *parent, struct rm_class *borrow, u_int32_t maxidle,
+    int minidle, u_int32_t offtime, int pktsize, int flags)
+{
+	struct ifnet *ifp;
+	struct ifclassq *ifq;
+	struct rm_class *cl;
+	struct rm_class *peer;
+
+	if (nsecPerByte == 0) {
+		log(LOG_ERR, "%s: invalid inverse data rate\n", __func__);
+		return (NULL);
+	}
+
+	if (pri >= RM_MAXPRIO) {
+		log(LOG_ERR, "%s: priority %d out of range! (max %d)\n",
+		    __func__, pri, RM_MAXPRIO - 1);
+		return (NULL);
+	}
+
+#if !CLASSQ_RED
+	if (flags & RMCF_RED) {
+		log(LOG_ERR, "%s: RED not configured for CBQ!\n", __func__);
+		return (NULL);
+	}
+#endif /* !CLASSQ_RED */
+
+#if !CLASSQ_RIO
+	if (flags & RMCF_RIO) {
+		log(LOG_ERR, "%s: RIO not configured for CBQ!\n", __func__);
+		return (NULL);
+	}
+#endif /* !CLASSQ_RIO */
+
+#if !CLASSQ_BLUE
+	if (flags & RMCF_BLUE) {
+		log(LOG_ERR, "%s: BLUE not configured for CBQ!\n", __func__);
+		return (NULL);
+	}
+#endif /* !CLASSQ_BLUE */
+
+	/* These are mutually exclusive */
+	if ((flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) &&
+	    (flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) != RMCF_RED &&
+	    (flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) != RMCF_RIO &&
+	    (flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) != RMCF_BLUE &&
+	    (flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) != RMCF_SFB) {
+		log(LOG_ERR, "%s: RED|RIO|BLUE|SFB mutually exclusive\n",
+		    __func__);
+		return (NULL);
+	}
+
+	cl = zalloc(rmc_zone);
+	if (cl == NULL)
+		return (NULL);
+
+	bzero(cl, rmc_size);
+	CALLOUT_INIT(&cl->callout_);
+
+	/*
+	 * Class initialization.
+	 */
+	cl->children_ = NULL;
+	cl->parent_ = parent;
+	cl->borrow_ = borrow;
+	cl->leaf_ = 1;
+	cl->ifdat_ = ifd;
+	cl->pri_ = pri;
+	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
+	cl->depth_ = 0;
+	cl->qthresh_ = 0;
+	cl->ns_per_byte_ = nsecPerByte;
+
+	ifq = ifd->ifq_;
+	ifp = ifq->ifcq_ifp;
+
+	if (maxq == 0 || maxq > IFCQ_MAXLEN(ifq)) {
+		maxq = IFCQ_MAXLEN(ifq);
+		if (maxq == 0)
+			maxq = DEFAULT_QLIMIT;	/* use default */
+	}
+	_qinit(&cl->q_, Q_DROPHEAD, maxq);
+
+	cl->flags_ = flags;
+
+	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
+	if (cl->minidle_ > 0)
+		cl->minidle_ = 0;
+
+	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
+	if (cl->maxidle_ == 0)
+		cl->maxidle_ = 1;
+
+	cl->avgidle_ = cl->maxidle_;
+	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
+	if (cl->offtime_ == 0)
+		cl->offtime_ = 1;
+
+	cl->overlimit = action;
+
+	if (flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) {
+		int pkttime;
+
+		cl->qflags_ = 0;
+		if (flags & RMCF_ECN) {
+			if (flags & RMCF_BLUE)
+				cl->qflags_ |= BLUEF_ECN;
+			else if (flags & RMCF_SFB)
+				cl->qflags_ |= SFBF_ECN;
+			else if (flags & RMCF_RED)
+				cl->qflags_ |= REDF_ECN;
+			else if (flags & RMCF_RIO)
+				cl->qflags_ |= RIOF_ECN;
+		}
+		if (flags & RMCF_FLOWCTL) {
+			if (flags & RMCF_SFB)
+				cl->qflags_ |= SFBF_FLOWCTL;
+		}
+		if (flags & RMCF_FLOWVALVE) {
+			if (flags & RMCF_RED)
+				cl->qflags_ |= REDF_FLOWVALVE;
+		}
+		if (flags & RMCF_CLEARDSCP) {
+			if (flags & RMCF_RIO)
+				cl->qflags_ |= RIOF_CLEARDSCP;
+		}
+		pkttime = nsecPerByte * pktsize / 1000;
+
+		/* Test for exclusivity {RED,RIO,BLUE,SFB} was done above */
+#if CLASSQ_RED
+		if (flags & RMCF_RED) {
+			cl->red_ = red_alloc(ifp, 0, 0,
+			    qlimit(&cl->q_) * 10/100,
+			    qlimit(&cl->q_) * 30/100,
+			    cl->qflags_, pkttime);
+			if (cl->red_ != NULL)
+				qtype(&cl->q_) = Q_RED;
+		}
+#endif /* CLASSQ_RED */
+#if CLASSQ_RIO
+		if (flags & RMCF_RIO) {
+			cl->rio_ =
+			    rio_alloc(ifp, 0, NULL, cl->qflags_, pkttime);
+			if (cl->rio_ != NULL)
+				qtype(&cl->q_) = Q_RIO;
+		}
+#endif /* CLASSQ_RIO */
+#if CLASSQ_BLUE
+		if (flags & RMCF_BLUE) {
+			cl->blue_ = blue_alloc(ifp, 0, 0, cl->qflags_);
+			if (cl->blue_ != NULL)
+				qtype(&cl->q_) = Q_BLUE;
+		}
+#endif /* CLASSQ_BLUE */
+		if (flags & RMCF_SFB) {
+			if (!(cl->flags_ & RMCF_LAZY))
+				cl->sfb_ = sfb_alloc(ifp, qid,
+				    qlimit(&cl->q_), cl->qflags_);
+			if (cl->sfb_ != NULL || (cl->flags_ & RMCF_LAZY))
+				qtype(&cl->q_) = Q_SFB;
+		}
+	}
+ /* + * put the class into the class tree + */ + if ((peer = ifd->active_[pri]) != NULL) { + /* find the last class at this pri */ + cl->peer_ = peer; + while (peer->peer_ != ifd->active_[pri]) + peer = peer->peer_; + peer->peer_ = cl; + } else { + ifd->active_[pri] = cl; + cl->peer_ = cl; + } + + if (cl->parent_) { + cl->next_ = parent->children_; + parent->children_ = cl; + parent->leaf_ = 0; + } + + /* + * Compute the depth of this class and its ancestors in the class + * hierarchy. + */ + rmc_depth_compute(cl); + + /* + * If CBQ's WRR is enabled, then initialize the class WRR state. + */ + if (ifd->wrr_) { + ifd->num_[pri]++; + ifd->alloc_[pri] += cl->allotment_; + rmc_wrr_set_weights(ifd); + } + return (cl); +} + +int +rmc_modclass(struct rm_class *cl, u_int32_t nsecPerByte, int maxq, + u_int32_t maxidle, int minidle, u_int32_t offtime, int pktsize) +{ +#pragma unused(pktsize) + struct rm_ifdat *ifd; + u_int32_t old_allotment; + + ifd = cl->ifdat_; + old_allotment = cl->allotment_; + + cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ + cl->qthresh_ = 0; + cl->ns_per_byte_ = nsecPerByte; + + qlimit(&cl->q_) = maxq; + + cl->minidle_ = (minidle * nsecPerByte) / 8; + if (cl->minidle_ > 0) + cl->minidle_ = 0; + + cl->maxidle_ = (maxidle * nsecPerByte) / 8; + if (cl->maxidle_ == 0) + cl->maxidle_ = 1; + + cl->avgidle_ = cl->maxidle_; + cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; + if (cl->offtime_ == 0) + cl->offtime_ = 1; + + /* + * If CBQ's WRR is enabled, then initialize the class WRR state. + */ + if (ifd->wrr_) { + ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment; + rmc_wrr_set_weights(ifd); + } + return (0); +} + +/* + * static void + * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes + * the appropriate run robin weights for the CBQ weighted round robin + * algorithm. + * + * Returns: NONE + */ + +static void +rmc_wrr_set_weights(struct rm_ifdat *ifd) +{ + int i; + struct rm_class *cl, *clh; + + for (i = 0; i < RM_MAXPRIO; i++) { + /* + * This is inverted from that of the simulator to + * maintain precision. + */ + if (ifd->num_[i] == 0) { + ifd->M_[i] = 0; + } else { + ifd->M_[i] = + ifd->alloc_[i] / (ifd->num_[i] * ifd->maxpkt_); + } + /* + * Compute the weighted allotment for each class. + * This takes the expensive div instruction out + * of the main loop for the wrr scheduling path. + * These only get recomputed when a class comes or + * goes. + */ + if (ifd->active_[i] != NULL) { + clh = cl = ifd->active_[i]; + do { + /* safe-guard for slow link or alloc_ == 0 */ + if (ifd->M_[i] == 0) { + cl->w_allotment_ = 0; + } else { + cl->w_allotment_ = + cl->allotment_ / ifd->M_[i]; + } + cl = cl->peer_; + } while ((cl != NULL) && (cl != clh)); + } + } +} + +int +rmc_get_weight(struct rm_ifdat *ifd, int pri) +{ + if ((pri >= 0) && (pri < RM_MAXPRIO)) + return (ifd->M_[pri]); + else + return (0); +} + +/* + * static void + * rmc_depth_compute(struct rm_class *cl) - This function computes the + * appropriate depth of class 'cl' and its ancestors. + * + * Returns: NONE + */ + +static void +rmc_depth_compute(struct rm_class *cl) +{ + rm_class_t *t = cl, *p; + + /* + * Recompute the depth for the branch of the tree. + */ + while (t != NULL) { + p = t->parent_; + if (p && (t->depth_ >= p->depth_)) { + p->depth_ = t->depth_ + 1; + t = p; + } else + t = NULL; + } +} + +/* + * static void + * rmc_depth_recompute(struct rm_class *cl) - This function re-computes + * the depth of the tree after a class has been deleted. 
+
+int
+rmc_get_weight(struct rm_ifdat *ifd, int pri)
+{
+	if ((pri >= 0) && (pri < RM_MAXPRIO))
+		return (ifd->M_[pri]);
+	else
+		return (0);
+}
+
+/*
+ * static void
+ * rmc_depth_compute(struct rm_class *cl) - This function computes the
+ * appropriate depth of class 'cl' and its ancestors.
+ *
+ * Returns: NONE
+ */
+
+static void
+rmc_depth_compute(struct rm_class *cl)
+{
+	rm_class_t *t = cl, *p;
+
+	/*
+	 * Recompute the depth for the branch of the tree.
+	 */
+	while (t != NULL) {
+		p = t->parent_;
+		if (p && (t->depth_ >= p->depth_)) {
+			p->depth_ = t->depth_ + 1;
+			t = p;
+		} else
+			t = NULL;
+	}
+}
+
+/*
+ * static void
+ * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
+ * the depth of the tree after a class has been deleted.
+ *
+ * Returns: NONE
+ */
+
+static void
+rmc_depth_recompute(rm_class_t *cl)
+{
+	rm_class_t *p, *t;
+
+	p = cl;
+	while (p != NULL) {
+		if ((t = p->children_) == NULL) {
+			p->depth_ = 0;
+		} else {
+			int cdepth = 0;
+
+			while (t != NULL) {
+				if (t->depth_ > cdepth)
+					cdepth = t->depth_;
+				t = t->next_;
+			}
+
+			if (p->depth_ == cdepth + 1)
+				/* no change to this parent */
+				return;
+
+			p->depth_ = cdepth + 1;
+		}
+
+		p = p->parent_;
+	}
+}
+
+/*
+ * void
+ * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
+ * function deletes a class from the link-sharing structure and frees
+ * all resources associated with the class.
+ *
+ * Returns: NONE
+ */
+
+void
+rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
+{
+	struct rm_class *p, *head, *previous;
+
+	VERIFY(cl->children_ == NULL);
+
+	if (cl->sleeping_)
+		CALLOUT_STOP(&cl->callout_);
+
+	/*
+	 * Free packets in the packet queue.
+	 * XXX - this may not be a desired behavior.  Packets should be
+	 *	re-queued.
+	 */
+	rmc_dropall(cl);
+
+	/*
+	 * If the class has a parent, then remove the class from the
+	 * parent's children chain.
+	 */
+	if (cl->parent_ != NULL) {
+		head = cl->parent_->children_;
+		p = previous = head;
+		if (head->next_ == NULL) {
+			VERIFY(head == cl);
+			cl->parent_->children_ = NULL;
+			cl->parent_->leaf_ = 1;
+		} else while (p != NULL) {
+			if (p == cl) {
+				if (cl == head)
+					cl->parent_->children_ = cl->next_;
+				else
+					previous->next_ = cl->next_;
+				cl->next_ = NULL;
+				p = NULL;
+			} else {
+				previous = p;
+				p = p->next_;
+			}
+		}
+	}
+
+	/*
+	 * Delete class from class priority peer list.
+	 */
+	if ((p = ifd->active_[cl->pri_]) != NULL) {
+		/*
+		 * If there is more than one member of this priority
+		 * level, then look for class(cl) in the priority level.
+		 */
+		if (p != p->peer_) {
+			while (p->peer_ != cl)
+				p = p->peer_;
+			p->peer_ = cl->peer_;
+
+			if (ifd->active_[cl->pri_] == cl)
+				ifd->active_[cl->pri_] = cl->peer_;
+		} else {
+			VERIFY(p == cl);
+			ifd->active_[cl->pri_] = NULL;
+		}
+	}
+
+	/*
+	 * Recompute the WRR weights.
+	 */
+	if (ifd->wrr_) {
+		ifd->alloc_[cl->pri_] -= cl->allotment_;
+		ifd->num_[cl->pri_]--;
+		rmc_wrr_set_weights(ifd);
+	}
+
+	/*
+	 * Re-compute the depth of the tree.
+	 */
+	rmc_depth_recompute(cl->parent_);
+
+	/*
+	 * Free the class structure.
+	 */
+	if (cl->qalg_.ptr != NULL) {
+#if CLASSQ_RIO
+		if (q_is_rio(&cl->q_))
+			rio_destroy(cl->rio_);
+#endif /* CLASSQ_RIO */
+#if CLASSQ_RED
+		if (q_is_red(&cl->q_))
+			red_destroy(cl->red_);
+#endif /* CLASSQ_RED */
+#if CLASSQ_BLUE
+		if (q_is_blue(&cl->q_))
+			blue_destroy(cl->blue_);
+#endif /* CLASSQ_BLUE */
+		if (q_is_sfb(&cl->q_) && cl->sfb_ != NULL)
+			sfb_destroy(cl->sfb_);
+		cl->qalg_.ptr = NULL;
+		qtype(&cl->q_) = Q_DROPTAIL;
+		qstate(&cl->q_) = QS_RUNNING;
+	}
+	zfree(rmc_zone, cl);
+}
+
+
+/*
+ * int
+ * rmc_init(...) - Initialize the resource management data structures
+ * associated with the output portion of interface 'ifp'.  'ifd' is
+ * where the structures will be built (for backwards compatibility, the
+ * structures aren't kept in the ifnet struct).  'nsecPerByte'
+ * gives the link speed (inverse of bandwidth) in nanoseconds/byte.
+ * 'restart' is the driver-specific routine that the generic 'delay
+ * until under limit' action will call to restart output.  `maxq'
+ * is the queue size of the 'link' & 'default' classes.  'maxqueued'
+ * is the maximum number of packets that the resource management
+ * code will allow to be queued 'downstream' (this is typically 1).
+ *
+ * Returns:	0 on success
+ */
+
+int
+rmc_init(struct ifclassq *ifq, struct rm_ifdat *ifd, u_int32_t nsecPerByte,
+    void (*restart)(struct ifclassq *), u_int32_t qid, int maxq, int maxqueued,
+    u_int32_t maxidle, int minidle, u_int32_t offtime, int flags)
+{
+	struct ifnet *ifp = ifq->ifcq_ifp;
+	int i, mtu;
+
+	/*
+	 * Initialize the CBQ tracing/debug facility.
+	 */
+	CBQTRACEINIT();
+
+	if (nsecPerByte == 0) {
+		log(LOG_ERR, "%s: %s: invalid inverse data rate\n",
+		    __func__, if_name(ifp));
+		return (EINVAL);
+	}
+
+	mtu = ifp->if_mtu;
+	if (mtu < 1) {
+		log(LOG_ERR, "%s: %s: invalid MTU (interface not "
+		    "initialized?)\n", __func__, if_name(ifp));
+		return (EINVAL);
+	}
+	bzero((char *)ifd, sizeof (*ifd));
+
+	ifd->ifq_ = ifq;
+	ifd->restart = restart;
+	ifd->maxqueued_ = maxqueued;
+	ifd->ns_per_byte_ = nsecPerByte;
+	ifd->maxpkt_ = mtu;
+	ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
+	ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
+#if 1
+	ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
+	if (mtu * nsecPerByte > 10 * 1000000)
+		ifd->maxiftime_ /= 4;
+#endif
+
+	reset_cutoff(ifd);
+	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);
+
+	/*
+	 * Initialize the CBQ's WRR state.
+	 */
+	for (i = 0; i < RM_MAXPRIO; i++) {
+		ifd->alloc_[i] = 0;
+		ifd->M_[i] = 0;
+		ifd->num_[i] = 0;
+		ifd->na_[i] = 0;
+		ifd->active_[i] = NULL;
+	}
+
+	/*
+	 * Initialize current packet state.
+	 */
+	ifd->qi_ = 0;
+	ifd->qo_ = 0;
+	for (i = 0; i < RM_MAXQUEUED; i++) {
+		ifd->class_[i] = NULL;
+		ifd->curlen_[i] = 0;
+		ifd->borrowed_[i] = NULL;
+	}
+
+	/*
+	 * Create the root class of the link-sharing structure.
+	 */
+	if ((ifd->root_ = rmc_newclass(0, ifd, nsecPerByte,
+	    rmc_root_overlimit, qid, maxq, 0, 0, maxidle, minidle, offtime,
+	    0, 0)) == NULL) {
+		log(LOG_ERR, "rmc_init: root class not allocated\n");
+		return (ENOMEM);
+	}
+	ifd->root_->depth_ = 0;
+
+	return (0);
+}
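A hypothetical caller sketch for rmc_init(), setting up the root of a CBQ link-sharing tree on a 10 Mb/s link (800 ns/byte, matching the example in the rmc_newclass() comment). The names cbq_restart, ROOT_QID, CBQ_MAXIDLE, CBQ_MINIDLE and CBQ_OFFTIME are made up for illustration and are not symbols from this patch:

	struct rm_ifdat ifd;
	int err;

	err = rmc_init(ifq, &ifd, 800 /* nsecPerByte */, cbq_restart,
	    ROOT_QID, IFCQ_MAXLEN(ifq), 1 /* maxqueued */, CBQ_MAXIDLE,
	    CBQ_MINIDLE, CBQ_OFFTIME, RMCF_WRR);
	if (err != 0)
		return (err);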
+
+/*
+ * int
+ * rmc_queue_packet(struct rm_class *cl, struct mbuf *m) - Add packet given by
+ * mbuf 'm' to queue for resource class 'cl'.  This routine is called
+ * by a driver's if_output routine.  This routine must be called with
+ * output packet completion interrupts locked out (to avoid racing with
+ * rmc_dequeue_next).
+ *
+ * Returns:	0 on successful queueing
+ *		CLASSQEQ_DROPPED when packet drop occurs
+ */
+int
+rmc_queue_packet(struct rm_class *cl, struct mbuf *m, struct pf_mtag *t)
+{
+	struct timeval now;
+	struct rm_ifdat *ifd = cl->ifdat_;
+	int cpri = cl->pri_;
+	int is_empty = qempty(&cl->q_);
+	int ret = 0;
+
+	RM_GETTIME(now);
+	if (ifd->cutoff_ > 0) {
+		if (TV_LT(&cl->undertime_, &now)) {
+			if (ifd->cutoff_ > cl->depth_)
+				ifd->cutoff_ = cl->depth_;
+			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
+		} else {
+			/*
+			 * the class is overlimit.  if the class has
+			 * underlimit ancestors, set cutoff to the lowest
+			 * depth among them.
+			 */
+			struct rm_class *borrow = cl->borrow_;
+
+			while (borrow != NULL &&
+			    borrow->depth_ < ifd->cutoff_) {
+				if (TV_LT(&borrow->undertime_, &now)) {
+					ifd->cutoff_ = borrow->depth_;
+					CBQTRACE(rmc_queue_packet, 'ffob',
+					    ifd->cutoff_);
+					break;
+				}
+				borrow = borrow->borrow_;
+			}
+		}
+	}
+
+	ret = _rmc_addq(cl, m, t);
+	if (ret != 0 &&
+	    (ret == CLASSQEQ_DROPPED || ret == CLASSQEQ_DROPPED_FC ||
+	    ret == CLASSQEQ_DROPPED_SP)) {
+		/* failed */
+		return (ret);
+	}
+	VERIFY(ret == 0 || ret == CLASSQEQ_SUCCESS_FC);
+	if (is_empty) {
+		CBQTRACE(rmc_queue_packet, 'type', cl->stats_.handle);
+		ifd->na_[cpri]++;
+	}
+
+	if (qlen(&cl->q_) > qlimit(&cl->q_)) {
+		/* note: qlimit can be set to 0 or 1 */
+		rmc_drop_action(cl);
+		return (CLASSQEQ_DROPPED);
+	}
+	return (ret);
+}
+
+/*
+ * void
+ * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
+ * classes to see if they are satisfied.
+ */
+
+static void
+rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
+{
+	int i;
+	rm_class_t *p, *bp;
+
+	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
+		if ((bp = ifd->active_[i]) != NULL) {
+			p = bp;
+			do {
+				if (!rmc_satisfied(p, now)) {
+					ifd->cutoff_ = p->depth_;
+					return;
+				}
+				p = p->peer_;
+			} while (p != bp);
+		}
+	}
+
+	reset_cutoff(ifd);
+}
+
+/*
+ * rmc_satisfied - Return 1 if the class is satisfied, 0 otherwise.
+ */
+
+static int
+rmc_satisfied(struct rm_class *cl, struct timeval *now)
+{
+	rm_class_t *p;
+
+	if (cl == NULL)
+		return (1);
+	if (TV_LT(now, &cl->undertime_))
+		return (1);
+	if (cl->depth_ == 0) {
+		if (!cl->sleeping_ && (qlen(&cl->q_) > cl->qthresh_))
+			return (0);
+		else
+			return (1);
+	}
+	if (cl->children_ != NULL) {
+		p = cl->children_;
+		while (p != NULL) {
+			if (!rmc_satisfied(p, now))
+				return (0);
+			p = p->next_;
+		}
+	}
+
+	return (1);
+}
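The cutoff_ heuristic used in rmc_queue_packet() above (and enumerated in the ADJUST_CUTOFF comment earlier) can be traced with a toy standalone model; the depths and the RM_MAXDEPTH value of 32 are assumed for illustration:

#include <stdio.h>

#define RM_MAXDEPTH	32	/* assumed value */

int
main(void)
{
	int cutoff = RM_MAXDEPTH;	/* reset_cutoff() */

	/* 1. packet arrives for an underlimit leaf (depth 0) */
	int leaf_depth = 0;
	if (cutoff > leaf_depth)
		cutoff = leaf_depth;
	printf("after underlimit arrival: cutoff=%d\n", cutoff);	/* 0 */

	/* 2. overlimit class with an underlimit ancestor at depth 1:
	 * only lowers cutoff if the ancestor is below the current level */
	int ancestor_depth = 1;
	if (ancestor_depth < cutoff)
		cutoff = ancestor_depth;
	printf("after overlimit arrival: cutoff=%d\n", cutoff);	/* 0 */

	/* 3. scheduler finds no underlimit class: raise cutoff and retry */
	cutoff++;
	printf("after failed schedule: cutoff=%d\n", cutoff);		/* 1 */
	return (0);
}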
+
+/*
+ * Return 1 if class 'cl' is under limit or can borrow from a parent,
+ * 0 if overlimit.  As a side-effect, this routine will invoke the
+ * class overlimit action if the class is overlimit.
+ */
+
+static int
+rmc_under_limit(struct rm_class *cl, struct timeval *now)
+{
+	rm_class_t *p = cl;
+	rm_class_t *top;
+	struct rm_ifdat *ifd = cl->ifdat_;
+
+	ifd->borrowed_[ifd->qi_] = NULL;
+	/*
+	 * If cl is the root class, then always return that it is
+	 * underlimit.  Otherwise, check to see if the class is underlimit.
+	 */
+	if (cl->parent_ == NULL)
+		return (1);
+
+	if (cl->sleeping_) {
+		if (TV_LT(now, &cl->undertime_))
+			return (0);
+
+		CALLOUT_STOP(&cl->callout_);
+		cl->sleeping_ = 0;
+		cl->undertime_.tv_sec = 0;
+		return (1);
+	}
+
+	top = NULL;
+	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
+		if (((cl = cl->borrow_) == NULL) ||
+		    (cl->depth_ > ifd->cutoff_)) {
+#ifdef ADJUST_CUTOFF
+			if (cl != NULL)
+				/*
+				 * cutoff is taking effect, just
+				 * return false without calling
+				 * the delay action.
+				 */
+				return (0);
+#endif
+#ifdef BORROW_OFFTIME
+			/*
+			 * check if the class can borrow offtime too.
+			 * borrow offtime from the top of the borrow
+			 * chain if the top class is not overloaded.
+			 */
+			if (cl != NULL) {
+				/*
+				 * cutoff is taking effect, use this
+				 * class as top.
+				 */
+				top = cl;
+				CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
+			}
+			if (top != NULL && top->avgidle_ == top->minidle_)
+				top = NULL;
+			p->overtime_ = *now;
+			(p->overlimit)(p, top);
+#else
+			p->overtime_ = *now;
+			(p->overlimit)(p, NULL);
+#endif
+			return (0);
+		}
+		top = cl;
+	}
+
+	if (cl != p)
+		ifd->borrowed_[ifd->qi_] = cl;
+	return (1);
+}
+
+/*
+ * _rmc_wrr_dequeue_next() - This is the scheduler for WRR, as opposed to
+ * packet-by-packet round robin.
+ *
+ * The heart of the weighted round-robin scheduler, which decides which
+ * class next gets to send a packet.  Highest priority first, then
+ * weighted round-robin within priorities.
+ *
+ * Each able-to-send class gets to send until its byte allocation is
+ * exhausted.  Thus, the active pointer is only changed after a class has
+ * exhausted its allocation.
+ *
+ * If the scheduler finds no class that is underlimit or able to borrow,
+ * then the first class found that had a nonzero queue and is allowed to
+ * borrow gets to send.
+ */
+
+static struct mbuf *
+_rmc_wrr_dequeue_next(struct rm_ifdat *ifd, cqdq_op_t op)
+{
+	struct rm_class *cl = NULL, *first = NULL;
+	u_int32_t deficit;
+	int cpri;
+	struct mbuf *m;
+	struct timeval now;
+
+	RM_GETTIME(now);
+
+	/*
+	 * if the driver polls the top of the queue and then removes
+	 * the polled packet, we must return the same packet.
+	 */
+	if (op == CLASSQDQ_REMOVE && ifd->pollcache_) {
+		cl = ifd->pollcache_;
+		cpri = cl->pri_;
+		if (ifd->efficient_) {
+			/* check if this class is overlimit */
+			if (cl->undertime_.tv_sec != 0 &&
+			    rmc_under_limit(cl, &now) == 0)
+				first = cl;
+		}
+		ifd->pollcache_ = NULL;
+		goto _wrr_out;
+	} else {
+		/* mode == CLASSQDQ_POLL || pollcache == NULL */
+		ifd->pollcache_ = NULL;
+		ifd->borrowed_[ifd->qi_] = NULL;
+	}
+#ifdef ADJUST_CUTOFF
+_again:
+#endif
+	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
+		if (ifd->na_[cpri] == 0)
+			continue;
+		deficit = 0;
+		/*
+		 * Loop through twice for a priority level, if some class
+		 * was unable to send a packet the first round because
+		 * of the weighted round-robin mechanism.
+		 * During the second loop at this level, deficit==2.
+		 * (This second loop is not needed if for every class,
+		 * "M[cl->pri_]" times "cl->allotment" is greater than
+		 * the byte size for the largest packet in the class.)
+		 */
+_wrr_loop:
+		cl = ifd->active_[cpri];
+		VERIFY(cl != NULL);
+		do {
+			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
+				cl->bytes_alloc_ += cl->w_allotment_;
+			if (!qempty(&cl->q_)) {
+				if ((cl->undertime_.tv_sec == 0) ||
+				    rmc_under_limit(cl, &now)) {
+					if (cl->bytes_alloc_ > 0 || deficit > 1)
+						goto _wrr_out;
+
+					/* underlimit but no alloc */
+					deficit = 1;
+#if 1
+					ifd->borrowed_[ifd->qi_] = NULL;
+#endif
+				} else if (first == NULL && cl->borrow_ != NULL)
+					first = cl; /* borrowing candidate */
+			}
+
+			cl->bytes_alloc_ = 0;
+			cl = cl->peer_;
+		} while (cl != ifd->active_[cpri]);
+
+		if (deficit == 1) {
+			/* first loop found an underlimit class with deficit */
+			/* Loop on same priority level, with new deficit.  */
+			deficit = 2;
+			goto _wrr_loop;
+		}
+	}
+
+#ifdef ADJUST_CUTOFF
+	/*
+	 * no underlimit class found.  if cutoff is taking effect,
+	 * increase cutoff and try again.
+	 */
+	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
+		ifd->cutoff_++;
+		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
+		goto _again;
+	}
+#endif /* ADJUST_CUTOFF */
+	/*
+	 * If LINK_EFFICIENCY is turned on, then the first overlimit
+	 * class we encounter will send a packet if all the classes
+	 * of the link-sharing structure are overlimit.
+	 */
+	reset_cutoff(ifd);
+	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);
+
+	if (!ifd->efficient_ || first == NULL)
+		return (NULL);
+
+	cl = first;
+	cpri = cl->pri_;
+#if 0 /* too time-consuming for nothing */
+	if (cl->sleeping_)
+		CALLOUT_STOP(&cl->callout_);
+	cl->sleeping_ = 0;
+	cl->undertime_.tv_sec = 0;
+#endif
+	ifd->borrowed_[ifd->qi_] = cl->borrow_;
+	ifd->cutoff_ = cl->borrow_->depth_;
+
+	/*
+	 * Dequeue the packet and do the bookkeeping...
+	 */
+_wrr_out:
+	if (op == CLASSQDQ_REMOVE) {
+		m = _rmc_getq(cl);
+		if (m == NULL)
+			return (NULL);
+
+		if (qempty(&cl->q_))
+			ifd->na_[cpri]--;
+
+		/*
+		 * Update class statistics and link data.
+		 */
+		if (cl->bytes_alloc_ > 0)
+			cl->bytes_alloc_ -= m_pktlen(m);
+
+		if ((cl->bytes_alloc_ <= 0) || first == cl)
+			ifd->active_[cl->pri_] = cl->peer_;
+		else
+			ifd->active_[cl->pri_] = cl;
+
+		ifd->class_[ifd->qi_] = cl;
+		ifd->curlen_[ifd->qi_] = m_pktlen(m);
+		ifd->now_[ifd->qi_] = now;
+		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
+		ifd->queued_++;
+	} else {
+		/* mode == CLASSQDQ_POLL */
+		m = _rmc_pollq(cl);
+		ifd->pollcache_ = cl;
+	}
+	return (m);
+}
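A loose, single-class model of the allotment/deficit bookkeeping above (made-up numbers; the real code runs the two passes across all classes of a priority level, not one class in isolation):

#include <stdio.h>

int
main(void)
{
	int bytes_alloc = -200;		/* class overdrew on a previous round */
	int w_allotment = 100;		/* from rmc_wrr_set_weights() */
	int pktlen = 1514;
	int deficit = 0;

	for (;;) {
		if (deficit < 2 && bytes_alloc <= 0)
			bytes_alloc += w_allotment;	/* refill allotment */
		if (bytes_alloc > 0 || deficit > 1) {
			/* the deficit pass lets the class send anyway */
			printf("send %d bytes (alloc=%d, deficit=%d)\n",
			    pktlen, bytes_alloc, deficit);
			bytes_alloc -= pktlen;
			break;
		}
		deficit = (deficit == 0) ? 1 : 2;	/* retry the level */
	}
	return (0);
}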
+ */ + if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { + ifd->cutoff_++; + CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_); + goto _again; + } +#endif /* ADJUST_CUTOFF */ + /* + * If LINK_EFFICIENCY is turned on, then the first overlimit + * class we encounter will send a packet if all the classes + * of the link-sharing structure are overlimit. + */ + reset_cutoff(ifd); + CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_); + + if (!ifd->efficient_ || first == NULL) + return (NULL); + + cl = first; + cpri = cl->pri_; +#if 0 /* too time-consuming for nothing */ + if (cl->sleeping_) + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; +#endif + ifd->borrowed_[ifd->qi_] = cl->borrow_; + ifd->cutoff_ = cl->borrow_->depth_; + + /* + * Dequeue the packet and do the bookkeeping... + */ +_wrr_out: + if (op == CLASSQDQ_REMOVE) { + m = _rmc_getq(cl); + if (m == NULL) + return (NULL); + + if (qempty(&cl->q_)) + ifd->na_[cpri]--; + + /* + * Update class statistics and link data. + */ + if (cl->bytes_alloc_ > 0) + cl->bytes_alloc_ -= m_pktlen(m); + + if ((cl->bytes_alloc_ <= 0) || first == cl) + ifd->active_[cl->pri_] = cl->peer_; + else + ifd->active_[cl->pri_] = cl; + + ifd->class_[ifd->qi_] = cl; + ifd->curlen_[ifd->qi_] = m_pktlen(m); + ifd->now_[ifd->qi_] = now; + ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; + ifd->queued_++; + } else { + /* mode == CLASSQDQ_POLL */ + m = _rmc_pollq(cl); + ifd->pollcache_ = cl; + } + return (m); +} + +/* + * Dequeue & return next packet from the highest priority class that + * has a packet to send & has enough allocation to send it. This + * routine is called by a driver whenever it needs a new packet to + * output. + */ +static struct mbuf * +_rmc_prr_dequeue_next(struct rm_ifdat *ifd, cqdq_op_t op) +{ + struct mbuf *m; + int cpri; + struct rm_class *cl, *first = NULL; + struct timeval now; + + RM_GETTIME(now); + + /* + * if the driver polls the top of the queue and then removes + * the polled packet, we must return the same packet. + */ + if (op == CLASSQDQ_REMOVE && ifd->pollcache_) { + cl = ifd->pollcache_; + cpri = cl->pri_; + ifd->pollcache_ = NULL; + goto _prr_out; + } else { + /* mode == CLASSQDQ_POLL || pollcache == NULL */ + ifd->pollcache_ = NULL; + ifd->borrowed_[ifd->qi_] = NULL; + } +#ifdef ADJUST_CUTOFF +_again: +#endif + for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { + if (ifd->na_[cpri] == 0) + continue; + cl = ifd->active_[cpri]; + VERIFY(cl != NULL); + do { + if (!qempty(&cl->q_)) { + if ((cl->undertime_.tv_sec == 0) || + rmc_under_limit(cl, &now)) + goto _prr_out; + if (first == NULL && cl->borrow_ != NULL) + first = cl; + } + cl = cl->peer_; + } while (cl != ifd->active_[cpri]); + } + +#ifdef ADJUST_CUTOFF + /* + * no underlimit class found. if cutoff is taking effect, increase + * cutoff and try again. + */ + if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { + ifd->cutoff_++; + goto _again; + } +#endif /* ADJUST_CUTOFF */ + /* + * If LINK_EFFICIENCY is turned on, then the first overlimit + * class we encounter will send a packet if all the classes + * of the link-sharing structure are overlimit.
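+ * ('first' was recorded during the scan above as the first + * class that had packets queued and a borrow_ parent; it is + * consulted below only when the efficient_ flag is set.)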
+ */ + reset_cutoff(ifd); + if (!ifd->efficient_ || first == NULL) + return (NULL); + + cl = first; + cpri = cl->pri_; +#if 0 /* too time-consuming for nothing */ + if (cl->sleeping_) + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; +#endif + ifd->borrowed_[ifd->qi_] = cl->borrow_; + ifd->cutoff_ = cl->borrow_->depth_; + + /* + * Dequeue the packet and do the bookkeeping... + */ +_prr_out: + if (op == CLASSQDQ_REMOVE) { + m = _rmc_getq(cl); + if (m == NULL) + return (NULL); + + if (qempty(&cl->q_)) + ifd->na_[cpri]--; + + ifd->active_[cpri] = cl->peer_; + + ifd->class_[ifd->qi_] = cl; + ifd->curlen_[ifd->qi_] = m_pktlen(m); + ifd->now_[ifd->qi_] = now; + ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; + ifd->queued_++; + } else { + /* mode == CLASSQDQ_POLL */ + m = _rmc_pollq(cl); + ifd->pollcache_ = cl; + } + return (m); +} + +/* + * struct mbuf * + * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function + * is invoked by the packet driver to get the next packet to be + * dequeued and output on the link. If WRR is enabled, then the + * WRR dequeue next routine will determine the next packet to be sent. + * Otherwise, packet-by-packet round robin is invoked. + * + * Returns: NULL, if a packet is not available or if all + * classes are overlimit. + * + * Otherwise, a pointer to the next packet. + */ + +struct mbuf * +rmc_dequeue_next(struct rm_ifdat *ifd, cqdq_op_t mode) +{ + if (ifd->queued_ >= ifd->maxqueued_) + return (NULL); + else if (ifd->wrr_) + return (_rmc_wrr_dequeue_next(ifd, mode)); + else + return (_rmc_prr_dequeue_next(ifd, mode)); +} + +/* + * Update the utilization estimate for the packet that just completed. + * The packet's class & the parent(s) of that class all get their + * estimators updated. This routine is called by the driver's output- + * packet-completion interrupt service routine. + */ + +/* + * a macro to approximate "divide by 1000" that gives 0.000999 + * (1/1024 + 1/65536 + 1/131072 == 0.0009994), if a value has + * enough effective digits. + * (on pentium, mul takes 9 cycles but div takes 46!) + */ +#define NSEC_TO_USEC(t) (((t) >> 10) + ((t) >> 16) + ((t) >> 17)) +void +rmc_update_class_util(struct rm_ifdat *ifd) +{ + int idle, avgidle, pktlen; + int pkt_time, tidle; + rm_class_t *cl, *borrowed; + rm_class_t *borrows; + struct timeval *nowp; + + /* + * Get the most recent completed class. + */ + if ((cl = ifd->class_[ifd->qo_]) == NULL) + return; + + pktlen = ifd->curlen_[ifd->qo_]; + borrowed = ifd->borrowed_[ifd->qo_]; + borrows = borrowed; + + PKTCNTR_ADD(&cl->stats_.xmit_cnt, 1, pktlen); + + /* + * Run estimator on class and its ancestors. + */ + /* + * rm_update_class_util is designed to be called when the + * transfer is completed from an xmit complete interrupt, + * but most drivers don't implement an upcall for that. + * so, just use estimated completion time. + * as a result, ifd->qi_ and ifd->qo_ are always synced. + */ + nowp = &ifd->now_[ifd->qo_]; + /* get pkt_time (for link) in usec */ +#if 1 /* use approximation */ + pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_; + pkt_time = NSEC_TO_USEC(pkt_time); +#else + pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000; +#endif +#if 1 /* ALTQ4PPP */ + if (TV_LT(nowp, &ifd->ifnow_)) { + int iftime; + + /* + * make sure the estimated completion time does not go + * too far. it can happen when the link layer supports + * data compression or the interface speed is set to + * a much lower value.
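+ * (without this clamp ifnow_ could drift arbitrarily far + * ahead of real time; maxiftime_ bounds how far the + * estimated completion time may run ahead of *nowp.)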
+ */ + TV_DELTA(&ifd->ifnow_, nowp, iftime); + if (iftime+pkt_time < ifd->maxiftime_) { + TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); + } else { + TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_); + } + } else { + TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); + } +#else + if (TV_LT(nowp, &ifd->ifnow_)) { + TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); + } else { + TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); + } +#endif + + while (cl != NULL) { + TV_DELTA(&ifd->ifnow_, &cl->last_, idle); + if (idle >= 2000000) + /* + * this class is idle enough, reset avgidle. + * (TV_DELTA returns 2000000 us when delta is large.) + */ + cl->avgidle_ = cl->maxidle_; + + /* get pkt_time (for class) in usec */ +#if 1 /* use approximation */ + pkt_time = pktlen * cl->ns_per_byte_; + pkt_time = NSEC_TO_USEC(pkt_time); +#else + pkt_time = pktlen * cl->ns_per_byte_ / 1000; +#endif + idle -= pkt_time; + + avgidle = cl->avgidle_; + avgidle += idle - (avgidle >> RM_FILTER_GAIN); + cl->avgidle_ = avgidle; + + /* Are we overlimit? */ + if (avgidle <= 0) { + CBQTRACE(rmc_update_class_util, 'milo', + cl->stats_.handle); + /* + * need some lower bound for avgidle, otherwise + * a borrowing class gets unbounded penalty. + */ + if (avgidle < cl->minidle_) + avgidle = cl->avgidle_ = cl->minidle_; + + /* set next idle to make avgidle 0 */ + tidle = pkt_time + + (((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN); + TV_ADD_DELTA(nowp, tidle, &cl->undertime_); + ++cl->stats_.over; + } else { + cl->avgidle_ = + (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle; + cl->undertime_.tv_sec = 0; + if (cl->sleeping_) { + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + } + } + + if (borrows != NULL) { + if (borrows != cl) + ++cl->stats_.borrows; + else + borrows = NULL; + } + cl->last_ = ifd->ifnow_; + cl->last_pkttime_ = pkt_time; + +#if 1 + if (cl->parent_ == NULL) { + /* take stats of root class */ + PKTCNTR_ADD(&cl->stats_.xmit_cnt, 1, pktlen); + } +#endif + + cl = cl->parent_; + } + + /* + * Check to see if cutoff needs to be set to a new level. + */ + cl = ifd->class_[ifd->qo_]; + if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) { + if ((qlen(&cl->q_) <= 0) || + TV_LT(nowp, &borrowed->undertime_)) { + rmc_tl_satisfied(ifd, nowp); + CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_); + } else { + ifd->cutoff_ = borrowed->depth_; + CBQTRACE(rmc_update_class_util, 'ffob', + borrowed->depth_); + } + } + + /* + * Release class slot + */ + ifd->borrowed_[ifd->qo_] = NULL; + ifd->class_[ifd->qo_] = NULL; + ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_; + ifd->queued_--; +} + +/* + * void + * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific) + * over-limit action routines. These get invoked by rmc_under_limit() + * if a class with packets to send is over its bandwidth limit & can't + * borrow from a parent class.
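+ * The drop action sheds a single packet from the head of the + * class queue (via _rmc_dropq) and decrements the count of + * active classes at that priority if the queue becomes empty.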
+ * + * Returns: NONE + */ + +static void +rmc_drop_action(struct rm_class *cl) +{ + struct rm_ifdat *ifd = cl->ifdat_; + + VERIFY(qlen(&cl->q_) > 0); + IFCQ_CONVERT_LOCK(ifd->ifq_); + _rmc_dropq(cl); + if (qempty(&cl->q_)) + ifd->na_[cl->pri_]--; +} + +void +rmc_drop(struct rm_class *cl, u_int32_t flow, u_int32_t *packets, + u_int32_t *bytes) +{ + struct rm_ifdat *ifd = cl->ifdat_; + struct ifclassq *ifq = ifd->ifq_; + u_int32_t pkt = 0, len = 0, qlen; + + if ((qlen = qlen(&cl->q_)) != 0) { + IFCQ_CONVERT_LOCK(ifq); +#if CLASSQ_RIO + if (q_is_rio(&cl->q_)) + rio_purgeq(cl->rio_, &cl->q_, flow, &pkt, &len); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->q_)) + red_purgeq(cl->red_, &cl->q_, flow, &pkt, &len); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->q_)) + blue_purgeq(cl->blue_, &cl->q_, flow, &pkt, &len); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->q_) && cl->sfb_ != NULL) + sfb_purgeq(cl->sfb_, &cl->q_, flow, &pkt, &len); + else + _flushq_flow(&cl->q_, flow, &pkt, &len); + + if (pkt > 0) { + VERIFY(qlen(&cl->q_) == (qlen - pkt)); + + PKTCNTR_ADD(&cl->stats_.drop_cnt, pkt, len); + IFCQ_DROP_ADD(ifq, pkt, len); + + VERIFY(((signed)IFCQ_LEN(ifq) - pkt) >= 0); + IFCQ_LEN(ifq) -= pkt; + + if (qempty(&cl->q_)) + ifd->na_[cl->pri_]--; + } + } + if (packets != NULL) + *packets = pkt; + if (bytes != NULL) + *bytes = len; +} + +void +rmc_dropall(struct rm_class *cl) +{ + rmc_drop(cl, 0, NULL, NULL); +} + +/* + * void + * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ + * delay action routine. It is invoked via rmc_under_limit when the + * packet is discovered to be overlimit. + * + * If the delay action is the result of a borrow class being overlimit, + * then delay for the offtime of the borrowing class that is overlimit. + * + * Returns: NONE + */ + +void +rmc_delay_action(struct rm_class *cl, struct rm_class *borrow) +{ + int ndelay, t, extradelay; + + cl->stats_.overactions++; + TV_DELTA(&cl->undertime_, &cl->overtime_, ndelay); +#ifndef BORROW_OFFTIME + ndelay += cl->offtime_; +#endif + + if (!cl->sleeping_) { + CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle); +#ifdef BORROW_OFFTIME + if (borrow != NULL) + extradelay = borrow->offtime_; + else +#endif + extradelay = cl->offtime_; + + /* + * XXX recalculate suspend time: + * current undertime is (tidle + pkt_time) calculated + * from the last transmission. + * tidle: time required to bring avgidle back to 0 + * pkt_time: target waiting time for this class + * we need to replace pkt_time by offtime + */ + extradelay -= cl->last_pkttime_; + if (extradelay > 0) { + TV_ADD_DELTA(&cl->undertime_, extradelay, + &cl->undertime_); + ndelay += extradelay; + } + + cl->sleeping_ = 1; + cl->stats_.delays++; + + /* + * Since packets are phased randomly with respect to the + * clock, 1 tick (the next clock tick) can be an arbitrarily + * short time so we have to wait for at least two ticks. + * NOTE: If there's no other traffic, we need the timer as + * a 'backstop' to restart this class. + */ + if (ndelay > tick * 2) { + /* + * FreeBSD rounds up the tick; + * other BSDs round down the tick.
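+ * adding 1 below keeps the timeout on the safe side of + * that difference, so the class is not restarted before + * undertime_ has actually passed.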
+ */ + t = hzto(&cl->undertime_) + 1; + } else { + t = 2; + } + CALLOUT_RESET(&cl->callout_, t, + (timeout_t *)rmc_restart, (caddr_t)cl); + } +} + +/* + * void + * rmc_restart() - is just a helper routine for rmc_delay_action -- it is + * called by the system timer code & is responsible for checking if the + * class is still sleeping (it might have been restarted as a side + * effect of the queue scan on a packet arrival) and, if so, restarting + * output for the class. Inspecting the class state & restarting output + * require locking the class structure. In general the driver is + * responsible for locking but this is the only routine that is not + * called directly or indirectly from the interface driver so it has to + * know about system locking conventions. + * + * Returns: NONE + */ + +static void +rmc_restart(struct rm_class *cl) +{ + struct rm_ifdat *ifd = cl->ifdat_; + + if (cl->sleeping_) { + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; + + if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) { + CBQTRACE(rmc_restart, 'trts', cl->stats_.handle); + (ifd->restart)(ifd->ifq_); + } + } +} + +/* + * void + * rmc_root_overlimit(struct rm_class *cl) - This is the generic overlimit + * handling routine for the root class of the link sharing structure. + * + * Returns: NONE + */ +static void +rmc_root_overlimit(struct rm_class *cl, + struct rm_class *borrow) +{ +#pragma unused(cl, borrow) + panic("rmc_root_overlimit"); +} + +/* + * Packet Queue handling routines. Eventually, this is to localize the + * effects on the code of whether queues are red queues or droptail + * queues. + */ + +static int +_rmc_addq(rm_class_t *cl, struct mbuf *m, struct pf_mtag *t) +{ +#if CLASSQ_RIO + if (q_is_rio(&cl->q_)) + return (rio_addq(cl->rio_, &cl->q_, m, t)); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->q_)) + return (red_addq(cl->red_, &cl->q_, m, t)); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->q_)) + return (blue_addq(cl->blue_, &cl->q_, m, t)); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->q_)) { + if (cl->sfb_ == NULL) { + struct ifclassq *ifq = cl->ifdat_->ifq_; + struct ifnet *ifp = ifq->ifcq_ifp; + + VERIFY(cl->flags_ & RMCF_LAZY); + IFCQ_CONVERT_LOCK(ifq); + + cl->sfb_ = sfb_alloc(ifp, cl->stats_.handle, + qlimit(&cl->q_), cl->qflags_); + if (cl->sfb_ == NULL) { + /* fall back to droptail */ + qtype(&cl->q_) = Q_DROPTAIL; + cl->flags_ &= ~RMCF_SFB; + cl->qflags_ &= ~(SFBF_ECN | SFBF_FLOWCTL); + + log(LOG_ERR, "%s: CBQ SFB lazy allocation " + "failed for qid=%d pri=%d, falling back " + "to DROPTAIL\n", if_name(ifp), + cl->stats_.handle, cl->pri_); + } + } + if (cl->sfb_ != NULL) + return (sfb_addq(cl->sfb_, &cl->q_, m, t)); + } else if (cl->flags_ & RMCF_CLEARDSCP) + write_dsfield(m, t, 0); + + /* test for qlen > qlimit is done by caller */ + _addq(&cl->q_, m); + return (0); +} + +/* note: _rmc_dropq is not called for red */ +static void +_rmc_dropq(rm_class_t *cl) +{ + struct mbuf *m; + + if ((m = _rmc_getq(cl)) != NULL) + m_freem(m); +} + +static struct mbuf * +_rmc_getq(rm_class_t *cl) +{ +#if CLASSQ_RIO + if (q_is_rio(&cl->q_)) + return (rio_getq(cl->rio_, &cl->q_)); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->q_)) + return (red_getq(cl->red_, &cl->q_)); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->q_)) + return (blue_getq(cl->blue_, &cl->q_)); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->q_) && cl->sfb_ != NULL) + return (sfb_getq(cl->sfb_, &cl->q_)); + + return (_getq(&cl->q_));
+} + +static struct mbuf * +_rmc_pollq(rm_class_t *cl) +{ + return (qhead(&cl->q_)); +} + +void +rmc_updateq(rm_class_t *cl, cqev_t ev) +{ +#if CLASSQ_RIO + if (q_is_rio(&cl->q_)) + return (rio_updateq(cl->rio_, ev)); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->q_)) + return (red_updateq(cl->red_, ev)); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->q_)) + return (blue_updateq(cl->blue_, ev)); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->q_) && cl->sfb_ != NULL) + return (sfb_updateq(cl->sfb_, ev)); +} + +#ifdef CBQ_TRACE + +struct cbqtrace cbqtrace_buffer[NCBQTRACE+1]; +struct cbqtrace *cbqtrace_ptr = NULL; +int cbqtrace_count; + +/* + * DDB hook to trace cbq events: + * the last 1024 events are held in a circular buffer. + * use "call cbqtrace_dump(N)" to display 20 events from the Nth event. + */ +void cbqtrace_dump(int); +static char *rmc_funcname(void *); + +static struct rmc_funcs { + void *func; + char *name; +} rmc_funcs[] = +{ + rmc_init, "rmc_init", + rmc_queue_packet, "rmc_queue_packet", + rmc_under_limit, "rmc_under_limit", + rmc_update_class_util, "rmc_update_class_util", + rmc_delay_action, "rmc_delay_action", + rmc_restart, "rmc_restart", + _rmc_wrr_dequeue_next, "_rmc_wrr_dequeue_next", + NULL, NULL +}; + +static char * +rmc_funcname(void *func) +{ + struct rmc_funcs *fp; + + for (fp = rmc_funcs; fp->func != NULL; fp++) + if (fp->func == func) + return (fp->name); + return ("unknown"); +} + +void +cbqtrace_dump(int counter) +{ + int i, *p; + char *cp; + + counter = counter % NCBQTRACE; + p = (int *)&cbqtrace_buffer[counter]; + + for (i = 0; i < 20; i++) { + log(LOG_DEBUG, "[0x%x] ", *p++); + log(LOG_DEBUG, "%s: ", rmc_funcname((void *)*p++)); + cp = (char *)p++; + log(LOG_DEBUG, "%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]); + log(LOG_DEBUG, "%d\n", *p++); + + if (p >= (int *)&cbqtrace_buffer[NCBQTRACE]) + p = (int *)cbqtrace_buffer; + } +} +#endif /* CBQ_TRACE */ +#endif /* PKTSCHED_CBQ */ diff --git a/bsd/net/pktsched/pktsched_rmclass.h b/bsd/net/pktsched/pktsched_rmclass.h new file mode 100644 index 000000000..d5f6b13b2 --- /dev/null +++ b/bsd/net/pktsched/pktsched_rmclass.h @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License.
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $OpenBSD: altq_rmclass.h,v 1.10 2007/06/17 19:58:58 jasper Exp $ */ +/* $KAME: altq_rmclass.h,v 1.6 2000/12/09 09:22:44 kjc Exp $ */ + +/* + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NET_PKTSCHED_PKTSCHED_RMCLASS_H_ +#define _NET_PKTSCHED_PKTSCHED_RMCLASS_H_ + +#ifdef PRIVATE +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define RM_MAXPRIO 8 /* Max priority */ + +/* flags for rmc_init and rmc_newclass */ +/* class flags */ +#define RMCF_RED 0x0001 /* use RED */ +#define RMCF_ECN 0x0002 /* use ECN with RED/BLUE/SFB */ +#define RMCF_RIO 0x0004 /* use RIO */ +#define RMCF_FLOWVALVE 0x0008 /* use flowvalve (aka penalty-box) */ +#define RMCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ + +/* flags for rmc_init */ +#define RMCF_WRR 0x0100 +#define RMCF_EFFICIENT 0x0200 + +#define RMCF_BLUE 0x10000 /* use BLUE */ +#define RMCF_SFB 0x20000 /* use SFB */ +#define RMCF_FLOWCTL 0x40000 /* enable flow control advisories */ +#ifdef BSD_KERNEL_PRIVATE +#define RMCF_LAZY 0x10000000 /* on-demand resource allocation */ + +typedef struct rm_ifdat rm_ifdat_t; +typedef struct rm_class rm_class_t; + +struct red; +struct rio; +struct blue; +struct sfb; + +/* + * Macros for dealing with time values. We assume all times are + * 'timevals'. `microuptime' is used to get the best available clock + * resolution. If `microuptime' *doesn't* return a value that's about + * ten times smaller than the average packet time on the fastest + * link that will use these routines, a slightly different clock + * scheme than this one should be used. + * (Bias due to truncation error in this scheme will overestimate utilization + * and discriminate against high bandwidth classes. 
To remove this bias an + * integrator needs to be added. The simplest integrator uses a history of + * 10 * avg.packet.time / min.tick.time packet completion entries. This is + * straightforward to add but we don't want to pay the extra memory + * traffic to maintain it if it's not necessary (occasionally a vendor + * accidentally builds a workstation with a decent clock - e.g., Sun & HP).) + */ + +#define RM_GETTIME(now) microuptime(&now) + +#define TV_LT(a, b) (((a)->tv_sec < (b)->tv_sec) || \ + (((a)->tv_usec < (b)->tv_usec) && ((a)->tv_sec <= (b)->tv_sec))) + +#define TV_DELTA(a, b, delta) { \ + int xxs; \ + \ + delta = (a)->tv_usec - (b)->tv_usec; \ + if ((xxs = (a)->tv_sec - (b)->tv_sec)) { \ + switch (xxs) { \ + default: \ + /* \ + * if (xxs < 0) \ + * printf("rm_class: bogus time values\n"); \ + */ \ + delta = 0; \ + /* fall through */ \ + case 2: \ + delta += 1000000; \ + /* fall through */ \ + case 1: \ + delta += 1000000; \ + break; \ + } \ + } \ +} + +#define TV_ADD_DELTA(a, delta, res) { \ + int xxus = (a)->tv_usec + (delta); \ + \ + (res)->tv_sec = (a)->tv_sec; \ + while (xxus >= 1000000) { \ + ++((res)->tv_sec); \ + xxus -= 1000000; \ + } \ + (res)->tv_usec = xxus; \ +} + +#define RM_TIMEOUT 2 /* 1 Clock tick. */ + +#if 1 +#define RM_MAXQUEUED 1 /* this isn't used in ALTQ/CBQ */ +#else +#define RM_MAXQUEUED 16 /* Max number of packets downstream of CBQ */ +#endif +#define RM_MAXQUEUE 64 /* Max queue length */ +#define RM_FILTER_GAIN 5 /* log2 of gain, e.g., 5 => 31/32 */ +#define RM_POWER (1 << RM_FILTER_GAIN) +#define RM_MAXDEPTH 32 +#define RM_NS_PER_SEC (1000000000) + +typedef struct _rm_class_stats_ { + u_int32_t handle; + u_int32_t depth; + + struct pktcntr xmit_cnt; /* packets sent in this class */ + struct pktcntr drop_cnt; /* dropped packets */ + u_int32_t over; /* # times went over limit */ + u_int32_t borrows; /* # times tried to borrow */ + u_int32_t overactions; /* # times invoked overlimit action */ + u_int32_t delays; /* # times invoked delay actions */ +} rm_class_stats_t; + +/* + * CBQ Class state structure + */ +struct rm_class { + class_queue_t q_; /* Queue of packets */ + rm_ifdat_t *ifdat_; + int pri_; /* Class priority. */ + int depth_; /* Class depth */ + u_int32_t ns_per_byte_; /* NanoSeconds per byte. */ + u_int32_t maxrate_; /* Bytes per second for this class. */ + u_int32_t allotment_; /* Fraction of link bandwidth. */ + u_int32_t w_allotment_; /* Weighted allotment for WRR */ + int bytes_alloc_; /* Allocation for round of WRR */ + + int avgidle_; + int maxidle_; + int minidle_; + int offtime_; + int sleeping_; /* != 0 if delaying */ + u_int32_t qthresh_; /* Threshold for formal link sharing */ + int leaf_; /* Note whether leaf class or not */ + + rm_class_t *children_; /* Children of this class */ + rm_class_t *next_; /* Next pointer, used if child */ + + rm_class_t *peer_; /* Peer class */ + rm_class_t *borrow_; /* Borrow class */ + rm_class_t *parent_; /* Parent class */ + + void (*overlimit)(struct rm_class *, struct rm_class *); + void (*drop)(struct rm_class *); /* Class drop action.
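See rmc_drop_action() and + * rmc_dropall() in pktsched_rmclass.c.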
*/ + + union { + void *ptr; + struct red *red; /* RED state */ + struct rio *rio; /* RIO state */ + struct blue *blue; /* BLUE state */ + struct sfb *sfb; /* SFB state */ + } qalg_; + int flags_; + u_int32_t qflags_; + + int last_pkttime_; /* saved pkt_time */ + struct timeval undertime_; /* time can next send */ + struct timeval last_; /* time last packet sent */ + struct timeval overtime_; + struct callout callout_; /* for timeout() calls */ + + rm_class_stats_t stats_; /* Class Statistics */ +}; + +#define red_ qalg_.red +#define rio_ qalg_.rio +#define blue_ qalg_.blue +#define sfb_ qalg_.sfb + +/* + * CBQ Interface state + */ +struct rm_ifdat { + int queued_; /* # pkts queued downstream */ + int efficient_; /* Link Efficiency bit */ + int wrr_; /* Enable Weighted Round-Robin */ + u_long ns_per_byte_; /* Link byte speed. */ + int maxqueued_; /* Max packets to queue */ + int maxpkt_; /* Max packet size. */ + int qi_; /* In/out pointers for downstream */ + int qo_; /* packets */ + + /* + * Active class state and WRR state. + */ + rm_class_t *active_[RM_MAXPRIO]; /* Active cl's in each pri */ + int na_[RM_MAXPRIO]; /* # of active cl's in a pri */ + int num_[RM_MAXPRIO]; /* # of cl's per pri */ + int alloc_[RM_MAXPRIO]; /* Byte Allocation */ + u_long M_[RM_MAXPRIO]; /* WRR weights. */ + + /* + * Network Interface/Solaris Queue state pointer. + */ + struct ifclassq *ifq_; + rm_class_t *default_; /* Default Pkt class, BE */ + rm_class_t *root_; /* Root Link class. */ + rm_class_t *ctl_; /* Control Traffic class. */ + void (*restart)(struct ifclassq *); /* Restart routine. */ + + /* + * Current downstream packet state and dynamic state. + */ + rm_class_t *borrowed_[RM_MAXQUEUED]; /* Class borrowed last */ + rm_class_t *class_[RM_MAXQUEUED]; /* class sending */ + int curlen_[RM_MAXQUEUED]; /* Current pktlen */ + struct timeval now_[RM_MAXQUEUED]; /* Current packet time */ + int is_overlimit_[RM_MAXQUEUED]; /* Current packet overlimit flag */ + + int cutoff_; /* Cut-off depth for borrowing */ + + struct timeval ifnow_; /* expected xmit completion time */ +#if 1 /* ALTQ4PPP */ + int maxiftime_; /* max delay inside interface */ +#endif + rm_class_t *pollcache_; /* cached rm_class by poll operation */ +}; + +#define RMC_IS_A_PARENT_CLASS(cl) ((cl)->children_ != NULL) + +extern void rmclass_init(void); +extern rm_class_t *rmc_newclass(int, struct rm_ifdat *, u_int32_t, + void (*)(struct rm_class *, struct rm_class *), u_int32_t, + u_int32_t, struct rm_class *, struct rm_class *, + u_int32_t, int, u_int32_t, int, int); +extern void rmc_delete_class(struct rm_ifdat *, struct rm_class *); +extern int rmc_modclass(struct rm_class *, u_int32_t, int, u_int32_t, + int, u_int32_t, int); +extern int rmc_init(struct ifclassq *, struct rm_ifdat *, u_int32_t, + void (*)(struct ifclassq *), u_int32_t, int, int, u_int32_t, + int, u_int32_t, int); +extern int rmc_queue_packet(struct rm_class *, struct mbuf *, struct pf_mtag *); +extern struct mbuf *rmc_dequeue_next(struct rm_ifdat *, cqdq_op_t); +extern void rmc_update_class_util(struct rm_ifdat *); +extern void rmc_delay_action(struct rm_class *, struct rm_class *); +extern void rmc_drop(struct rm_class *, u_int32_t, u_int32_t *, u_int32_t *); +extern void rmc_dropall(struct rm_class *); +extern int rmc_get_weight(struct rm_ifdat *, int); +extern void rmc_updateq(struct rm_class *, cqev_t); + +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef __cplusplus +} +#endif +#endif /* PRIVATE */ +#endif /* _NET_PKTSCHED_PKTSCHED_RMCLASS_H_ */ diff --git
a/bsd/net/pktsched/pktsched_rmclass_debug.h b/bsd/net/pktsched/pktsched_rmclass_debug.h new file mode 100644 index 000000000..dd3f364f5 --- /dev/null +++ b/bsd/net/pktsched/pktsched_rmclass_debug.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* $NetBSD: altq_rmclass_debug.h,v 1.7 2006/10/12 19:59:08 peter Exp $ */ +/* $KAME: altq_rmclass_debug.h,v 1.3 2002/11/29 04:36:24 kjc Exp $ */ + +/* + * Copyright (c) Sun Microsystems, Inc. 1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. 
+ */ + +#ifndef _NET_PKTSCHED_PKTSCHED_RMCLASS_DEBUG_H_ +#define _NET_PKTSCHED_PKTSCHED_RMCLASS_DEBUG_H_ + +/* #pragma ident "@(#)rm_class_debug.h 1.7 98/05/04 SMI" */ + +/* + * Cbq debugging macros + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef BSD_KERNEL_PRIVATE + +#ifdef CBQ_TRACE +#ifndef NCBQTRACE +#define NCBQTRACE (16 * 1024) +#endif + +/* + * To view the trace output, using adb, type: + * adb -k /dev/ksyms /dev/mem , then type + * cbqtrace_count/D to get the count, then type + * cbqtrace_buffer,0tcount/Dp4C" "Xn + * This will dump the trace buffer from 0 to count. + */ +/* + * in ALTQ, "call cbqtrace_dump(N)" from DDB to display 20 events + * from Nth event in the circular buffer. + */ + +struct cbqtrace { + int count; + int function; /* address of function */ + int trace_action; /* descriptive 4 characters */ + int object; /* object operated on */ +}; + +extern struct cbqtrace cbqtrace_buffer[]; +extern struct cbqtrace *cbqtrace_ptr; +extern int cbqtrace_count; + +#define CBQTRACEINIT() { \ + if (cbqtrace_ptr == NULL) \ + cbqtrace_ptr = cbqtrace_buffer; \ + else { \ + cbqtrace_ptr = cbqtrace_buffer; \ + bzero((void *)cbqtrace_ptr, sizeof (cbqtrace_buffer)); \ + cbqtrace_count = 0; \ + } \ +} + +#define CBQTRACE(func, act, obj) { \ + int *_p = &cbqtrace_ptr->count; \ + *_p++ = ++cbqtrace_count; \ + *_p++ = (int)(func); \ + *_p++ = (int)(act); \ + *_p++ = (int)(obj); \ + if ((struct cbqtrace *)(void *)_p >= &cbqtrace_buffer[NCBQTRACE]) \ + cbqtrace_ptr = cbqtrace_buffer; \ + else \ + cbqtrace_ptr = (struct cbqtrace *)(void *)_p; \ + } +#else + +/* If no tracing, define no-ops */ +#define CBQTRACEINIT() +#define CBQTRACE(a, b, c) + +#endif /* !CBQ_TRACE */ + +#endif /* BSD_KERNEL_PRIVATE */ + +#ifdef __cplusplus +} +#endif + +#endif /* _NET_PKTSCHED_PKTSCHED_RMCLASS_DEBUG_H_ */ diff --git a/bsd/net/pktsched/pktsched_tcq.c b/bsd/net/pktsched/pktsched_tcq.c new file mode 100644 index 000000000..fb66ee785 --- /dev/null +++ b/bsd/net/pktsched/pktsched_tcq.c @@ -0,0 +1,1215 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * traffic class queue + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +/* + * function prototypes + */ +static int tcq_enqueue_ifclassq(struct ifclassq *, struct mbuf *); +static struct mbuf *tcq_dequeue_tc_ifclassq(struct ifclassq *, + mbuf_svc_class_t, cqdq_op_t); +static int tcq_request_ifclassq(struct ifclassq *, cqrq_t, void *); +static int tcq_clear_interface(struct tcq_if *); +static struct tcq_class *tcq_class_create(struct tcq_if *, int, u_int32_t, + int, u_int32_t); +static int tcq_class_destroy(struct tcq_if *, struct tcq_class *); +static int tcq_destroy_locked(struct tcq_if *); +static inline int tcq_addq(struct tcq_class *, struct mbuf *, + struct pf_mtag *); +static inline struct mbuf *tcq_getq(struct tcq_class *); +static inline struct mbuf *tcq_pollq(struct tcq_class *); +static void tcq_purgeq(struct tcq_if *, struct tcq_class *, u_int32_t, + u_int32_t *, u_int32_t *); +static void tcq_purge_sc(struct tcq_if *, cqrq_purge_sc_t *); +static void tcq_updateq(struct tcq_if *, struct tcq_class *, cqev_t); +static int tcq_throttle(struct tcq_if *, cqrq_throttle_t *); +static int tcq_resumeq(struct tcq_if *, struct tcq_class *); +static int tcq_suspendq(struct tcq_if *, struct tcq_class *); +static struct mbuf *tcq_dequeue_cl(struct tcq_if *, struct tcq_class *, + mbuf_svc_class_t, cqdq_op_t); +static inline struct tcq_class *tcq_clh_to_clp(struct tcq_if *, u_int32_t); +static const char *tcq_style(struct tcq_if *); + +#define TCQ_ZONE_MAX 32 /* maximum elements in zone */ +#define TCQ_ZONE_NAME "pktsched_tcq" /* zone name */ + +static unsigned int tcq_size; /* size of zone element */ +static struct zone *tcq_zone; /* zone for tcq */ + +#define TCQ_CL_ZONE_MAX 32 /* maximum elements in zone */ +#define TCQ_CL_ZONE_NAME "pktsched_tcq_cl" /* zone name */ + +static unsigned int tcq_cl_size; /* size of zone element */ +static struct zone *tcq_cl_zone; /* zone for tcq_class */ + +void +tcq_init(void) +{ + tcq_size = sizeof (struct tcq_if); + tcq_zone = zinit(tcq_size, TCQ_ZONE_MAX * tcq_size, + 0, TCQ_ZONE_NAME); + if (tcq_zone == NULL) { + panic("%s: failed allocating %s", __func__, TCQ_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(tcq_zone, Z_EXPAND, TRUE); + zone_change(tcq_zone, Z_CALLERACCT, TRUE); + + tcq_cl_size = sizeof (struct tcq_class); + tcq_cl_zone = zinit(tcq_cl_size, TCQ_CL_ZONE_MAX * tcq_cl_size, + 0, TCQ_CL_ZONE_NAME); + if (tcq_cl_zone == NULL) { + panic("%s: failed allocating %s", __func__, TCQ_CL_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(tcq_cl_zone, Z_EXPAND, TRUE); + zone_change(tcq_cl_zone, Z_CALLERACCT, TRUE); +} + +struct tcq_if * +tcq_alloc(struct ifnet *ifp, int how, boolean_t altq) +{ + struct tcq_if *tif; + + tif = (how == M_WAITOK) ? 
zalloc(tcq_zone) : zalloc_noblock(tcq_zone); + if (tif == NULL) + return (NULL); + + bzero(tif, tcq_size); + tif->tif_maxpri = -1; + tif->tif_ifq = &ifp->if_snd; + if (altq) + tif->tif_flags |= TCQIFF_ALTQ; + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s scheduler allocated\n", + if_name(ifp), tcq_style(tif)); + } + + return (tif); +} + +int +tcq_destroy(struct tcq_if *tif) +{ + struct ifclassq *ifq = tif->tif_ifq; + int err; + + IFCQ_LOCK(ifq); + err = tcq_destroy_locked(tif); + IFCQ_UNLOCK(ifq); + + return (err); +} + +static int +tcq_destroy_locked(struct tcq_if *tif) +{ + IFCQ_LOCK_ASSERT_HELD(tif->tif_ifq); + + (void) tcq_clear_interface(tif); + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s scheduler destroyed\n", + if_name(TCQIF_IFP(tif)), tcq_style(tif)); + } + + zfree(tcq_zone, tif); + + return (0); +} + +/* + * bring the interface back to the initial state by discarding + * all the filters and classes. + */ +static int +tcq_clear_interface(struct tcq_if *tif) +{ + struct tcq_class *cl; + int pri; + + IFCQ_LOCK_ASSERT_HELD(tif->tif_ifq); + + /* clear out the classes */ + for (pri = 0; pri <= tif->tif_maxpri; pri++) + if ((cl = tif->tif_classes[pri]) != NULL) + tcq_class_destroy(tif, cl); + + return (0); +} + +/* discard all the queued packets on the interface */ +void +tcq_purge(struct tcq_if *tif) +{ + struct tcq_class *cl; + int pri; + + IFCQ_LOCK_ASSERT_HELD(tif->tif_ifq); + + for (pri = 0; pri <= tif->tif_maxpri; pri++) { + if ((cl = tif->tif_classes[pri]) != NULL && !qempty(&cl->cl_q)) + tcq_purgeq(tif, cl, 0, NULL, NULL); + } +#if !PF_ALTQ + /* + * This assertion is safe to be made only when PF_ALTQ is not + * configured; otherwise, IFCQ_LEN represents the sum of the + * packets managed by ifcq_disc and altq_disc instances, which + * is possible when transitioning between the two. 
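+ * tcq_purgeq() above subtracts every purged packet from + * IFCQ_LEN, so with a single ifcq_disc instance the queue + * length must be exactly zero by this point.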
+ */ + VERIFY(IFCQ_LEN(tif->tif_ifq) == 0); +#endif /* !PF_ALTQ */ +} + +static void +tcq_purge_sc(struct tcq_if *tif, cqrq_purge_sc_t *pr) +{ + struct ifclassq *ifq = tif->tif_ifq; + u_int32_t i; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + VERIFY(pr->sc == MBUF_SC_UNSPEC || MBUF_VALID_SC(pr->sc)); + VERIFY(pr->flow != 0); + + if (pr->sc != MBUF_SC_UNSPEC) { + i = MBUF_SCIDX(pr->sc); + VERIFY(i < IFCQ_SC_MAX); + + tcq_purgeq(tif, ifq->ifcq_disc_slots[i].cl, + pr->flow, &pr->packets, &pr->bytes); + } else { + u_int32_t cnt, len; + + pr->packets = 0; + pr->bytes = 0; + + for (i = 0; i < IFCQ_SC_MAX; i++) { + tcq_purgeq(tif, ifq->ifcq_disc_slots[i].cl, + pr->flow, &cnt, &len); + pr->packets += cnt; + pr->bytes += len; + } + } +} + +void +tcq_event(struct tcq_if *tif, cqev_t ev) +{ + struct tcq_class *cl; + int pri; + + IFCQ_LOCK_ASSERT_HELD(tif->tif_ifq); + + for (pri = 0; pri <= tif->tif_maxpri; pri++) + if ((cl = tif->tif_classes[pri]) != NULL) + tcq_updateq(tif, cl, ev); +} + +int +tcq_add_queue(struct tcq_if *tif, int priority, u_int32_t qlimit, + int flags, u_int32_t qid, struct tcq_class **clp) +{ + struct tcq_class *cl; + + IFCQ_LOCK_ASSERT_HELD(tif->tif_ifq); + + /* check parameters */ + if (priority >= TCQ_MAXPRI) + return (EINVAL); + if (tif->tif_classes[priority] != NULL) + return (EBUSY); + if (tcq_clh_to_clp(tif, qid) != NULL) + return (EBUSY); + + cl = tcq_class_create(tif, priority, qlimit, flags, qid); + if (cl == NULL) + return (ENOMEM); + + if (clp != NULL) + *clp = cl; + + return (0); +} + +static struct tcq_class * +tcq_class_create(struct tcq_if *tif, int pri, u_int32_t qlimit, + int flags, u_int32_t qid) +{ + struct ifnet *ifp; + struct ifclassq *ifq; + struct tcq_class *cl; + + IFCQ_LOCK_ASSERT_HELD(tif->tif_ifq); + + /* Sanitize flags unless internally configured */ + if (tif->tif_flags & TCQIFF_ALTQ) + flags &= TQCF_USERFLAGS; + +#if !CLASSQ_RED + if (flags & TQCF_RED) { + log(LOG_ERR, "%s: %s RED not available!\n", + if_name(TCQIF_IFP(tif)), tcq_style(tif)); + return (NULL); + } +#endif /* !CLASSQ_RED */ + +#if !CLASSQ_RIO + if (flags & TQCF_RIO) { + log(LOG_ERR, "%s: %s RIO not available!\n", + if_name(TCQIF_IFP(tif)), tcq_style(tif)); + return (NULL); + } +#endif /* CLASSQ_RIO */ + +#if !CLASSQ_BLUE + if (flags & TQCF_BLUE) { + log(LOG_ERR, "%s: %s BLUE not available!\n", + if_name(TCQIF_IFP(tif)), tcq_style(tif)); + return (NULL); + } +#endif /* CLASSQ_BLUE */ + + /* These are mutually exclusive */ + if ((flags & (TQCF_RED|TQCF_RIO|TQCF_BLUE|TQCF_SFB)) && + (flags & (TQCF_RED|TQCF_RIO|TQCF_BLUE|TQCF_SFB)) != TQCF_RED && + (flags & (TQCF_RED|TQCF_RIO|TQCF_BLUE|TQCF_SFB)) != TQCF_RIO && + (flags & (TQCF_RED|TQCF_RIO|TQCF_BLUE|TQCF_SFB)) != TQCF_BLUE && + (flags & (TQCF_RED|TQCF_RIO|TQCF_BLUE|TQCF_SFB)) != TQCF_SFB) { + log(LOG_ERR, "%s: %s more than one RED|RIO|BLUE|SFB\n", + if_name(TCQIF_IFP(tif)), tcq_style(tif)); + return (NULL); + } + + ifq = tif->tif_ifq; + ifp = TCQIF_IFP(tif); + + if ((cl = tif->tif_classes[pri]) != NULL) { + /* modify the class instead of creating a new one */ + if (!qempty(&cl->cl_q)) + tcq_purgeq(tif, cl, 0, NULL, NULL); +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + rio_destroy(cl->cl_rio); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + red_destroy(cl->cl_red); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + blue_destroy(cl->cl_blue); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + sfb_destroy(cl->cl_sfb); + cl->cl_qalg.ptr = NULL; + qtype(&cl->cl_q) = Q_DROPTAIL; + 
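/* the class is now a plain droptail queue; a new discipline may be attached below */ +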
qstate(&cl->cl_q) = QS_RUNNING; + } else { + cl = zalloc(tcq_cl_zone); + if (cl == NULL) + return (NULL); + + bzero(cl, tcq_cl_size); + } + + tif->tif_classes[pri] = cl; + if (flags & TQCF_DEFAULTCLASS) + tif->tif_default = cl; + if (qlimit == 0 || qlimit > IFCQ_MAXLEN(ifq)) { + qlimit = IFCQ_MAXLEN(ifq); + if (qlimit == 0) + qlimit = DEFAULT_QLIMIT; /* use default */ + } + _qinit(&cl->cl_q, Q_DROPTAIL, qlimit); + cl->cl_flags = flags; + cl->cl_pri = pri; + if (pri > tif->tif_maxpri) + tif->tif_maxpri = pri; + cl->cl_tif = tif; + cl->cl_handle = qid; + + if (flags & (TQCF_RED|TQCF_RIO|TQCF_BLUE|TQCF_SFB)) { +#if CLASSQ_RED || CLASSQ_RIO + u_int64_t ifbandwidth = ifnet_output_linkrate(ifp); + int pkttime; +#endif /* CLASSQ_RED || CLASSQ_RIO */ + + cl->cl_qflags = 0; + if (flags & TQCF_ECN) { + if (flags & TQCF_BLUE) + cl->cl_qflags |= BLUEF_ECN; + else if (flags & TQCF_SFB) + cl->cl_qflags |= SFBF_ECN; + else if (flags & TQCF_RED) + cl->cl_qflags |= REDF_ECN; + else if (flags & TQCF_RIO) + cl->cl_qflags |= RIOF_ECN; + } + if (flags & TQCF_FLOWCTL) { + if (flags & TQCF_SFB) + cl->cl_qflags |= SFBF_FLOWCTL; + } + if (flags & TQCF_CLEARDSCP) { + if (flags & TQCF_RIO) + cl->cl_qflags |= RIOF_CLEARDSCP; + } +#if CLASSQ_RED || CLASSQ_RIO + /* + * XXX: RED & RIO should be watching link speed and MTU + * events and recompute pkttime accordingly. + */ + if (ifbandwidth < 8) + pkttime = 1000 * 1000 * 1000; /* 1 sec */ + else + pkttime = (int64_t)ifp->if_mtu * 1000 * 1000 * 1000 / + (ifbandwidth / 8); + + /* Test for exclusivity {RED,RIO,BLUE,SFB} was done above */ +#if CLASSQ_RED + if (flags & TQCF_RED) { + cl->cl_red = red_alloc(ifp, 0, 0, + qlimit(&cl->cl_q) * 10/100, + qlimit(&cl->cl_q) * 30/100, + cl->cl_qflags, pkttime); + if (cl->cl_red != NULL) + qtype(&cl->cl_q) = Q_RED; + } +#endif /* CLASSQ_RED */ +#if CLASSQ_RIO + if (flags & TQCF_RIO) { + cl->cl_rio = + rio_alloc(ifp, 0, NULL, cl->cl_qflags, pkttime); + if (cl->cl_rio != NULL) + qtype(&cl->cl_q) = Q_RIO; + } +#endif /* CLASSQ_RIO */ +#endif /* CLASSQ_RED || CLASSQ_RIO */ +#if CLASSQ_BLUE + if (flags & TQCF_BLUE) { + cl->cl_blue = blue_alloc(ifp, 0, 0, cl->cl_qflags); + if (cl->cl_blue != NULL) + qtype(&cl->cl_q) = Q_BLUE; + } +#endif /* CLASSQ_BLUE */ + if (flags & TQCF_SFB) { + if (!(cl->cl_flags & TQCF_LAZY)) + cl->cl_sfb = sfb_alloc(ifp, cl->cl_handle, + qlimit(&cl->cl_q), cl->cl_qflags); + if (cl->cl_sfb != NULL || (cl->cl_flags & TQCF_LAZY)) + qtype(&cl->cl_q) = Q_SFB; + } + } + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s created qid=%d pri=%d qlimit=%d " + "flags=%b\n", if_name(ifp), tcq_style(tif), + cl->cl_handle, cl->cl_pri, qlimit, flags, TQCF_BITS); + } + + return (cl); +} + +int +tcq_remove_queue(struct tcq_if *tif, u_int32_t qid) +{ + struct tcq_class *cl; + + IFCQ_LOCK_ASSERT_HELD(tif->tif_ifq); + + if ((cl = tcq_clh_to_clp(tif, qid)) == NULL) + return (EINVAL); + + return (tcq_class_destroy(tif, cl)); +} + +static int +tcq_class_destroy(struct tcq_if *tif, struct tcq_class *cl) +{ + struct ifclassq *ifq = tif->tif_ifq; + int pri; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if (!qempty(&cl->cl_q)) + tcq_purgeq(tif, cl, 0, NULL, NULL); + + tif->tif_classes[cl->cl_pri] = NULL; + if (tif->tif_maxpri == cl->cl_pri) { + for (pri = cl->cl_pri; pri >= 0; pri--) + if (tif->tif_classes[pri] != NULL) { + tif->tif_maxpri = pri; + break; + } + if (pri < 0) + tif->tif_maxpri = -1; + } + + if (tif->tif_default == cl) + tif->tif_default = NULL; + + if (cl->cl_qalg.ptr != NULL) { +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + 
rio_destroy(cl->cl_rio); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + red_destroy(cl->cl_red); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + blue_destroy(cl->cl_blue); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + sfb_destroy(cl->cl_sfb); + cl->cl_qalg.ptr = NULL; + qtype(&cl->cl_q) = Q_DROPTAIL; + qstate(&cl->cl_q) = QS_RUNNING; + } + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s destroyed qid=%d pri=%d\n", + if_name(TCQIF_IFP(tif)), tcq_style(tif), + cl->cl_handle, cl->cl_pri); + } + + zfree(tcq_cl_zone, cl); + return (0); +} + +int +tcq_enqueue(struct tcq_if *tif, struct tcq_class *cl, struct mbuf *m, + struct pf_mtag *t) +{ + struct ifclassq *ifq = tif->tif_ifq; + int len, ret; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(cl == NULL || cl->cl_tif == tif); + + if (cl == NULL) { + cl = tcq_clh_to_clp(tif, t->pftag_qid); + if (cl == NULL) { + cl = tif->tif_default; + if (cl == NULL) { + IFCQ_CONVERT_LOCK(ifq); + m_freem(m); + return (ENOBUFS); + } + } + } + + len = m_pktlen(m); + + ret = tcq_addq(cl, m, t); + if (ret != 0) { + if (ret == CLASSQEQ_SUCCESS_FC) { + /* packet enqueued, return advisory feedback */ + ret = EQFULL; + } else { + VERIFY(ret == CLASSQEQ_DROPPED || + ret == CLASSQEQ_DROPPED_FC || + ret == CLASSQEQ_DROPPED_SP); + /* packet has been freed in tcq_addq */ + PKTCNTR_ADD(&cl->cl_dropcnt, 1, len); + IFCQ_DROP_ADD(ifq, 1, len); + switch (ret) { + case CLASSQEQ_DROPPED: + return (ENOBUFS); + case CLASSQEQ_DROPPED_FC: + return (EQFULL); + case CLASSQEQ_DROPPED_SP: + return (EQSUSPENDED); + } + /* NOT REACHED */ + } + } + IFCQ_INC_LEN(ifq); + + /* successfully queued. */ + return (ret); +} + +/* + * note: CLASSQDQ_POLL returns the next packet without removing the packet + * from the queue. CLASSQDQ_REMOVE is a normal dequeue operation. + * CLASSQDQ_REMOVE must return the same packet if called immediately + * after CLASSQDQ_POLL. 
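+ * (tcq needs no poll cache for this: CLASSQDQ_POLL simply + * returns the queue head without dequeueing it, so an + * immediately following CLASSQDQ_REMOVE dequeues that same + * packet.)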
+ */ +struct mbuf * +tcq_dequeue_tc(struct tcq_if *tif, mbuf_svc_class_t sc, cqdq_op_t op) +{ + return (tcq_dequeue_cl(tif, NULL, sc, op)); +} + +static struct mbuf * +tcq_dequeue_cl(struct tcq_if *tif, struct tcq_class *cl, + mbuf_svc_class_t sc, cqdq_op_t op) +{ + struct ifclassq *ifq = tif->tif_ifq; + struct mbuf *m; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if (cl == NULL) { + cl = tcq_clh_to_clp(tif, MBUF_SCIDX(sc)); + if (cl == NULL) + return (NULL); + } + + if (qempty(&cl->cl_q)) + return (NULL); + + VERIFY(!IFCQ_IS_EMPTY(ifq)); + + if (op == CLASSQDQ_POLL) + return (tcq_pollq(cl)); + + m = tcq_getq(cl); + if (m != NULL) { + IFCQ_DEC_LEN(ifq); + if (qempty(&cl->cl_q)) + cl->cl_period++; + PKTCNTR_ADD(&cl->cl_xmitcnt, 1, m_pktlen(m)); + IFCQ_XMIT_ADD(ifq, 1, m_pktlen(m)); + } + return (m); +} + +static inline int +tcq_addq(struct tcq_class *cl, struct mbuf *m, struct pf_mtag *t) +{ + struct tcq_if *tif = cl->cl_tif; + struct ifclassq *ifq = tif->tif_ifq; + + IFCQ_LOCK_ASSERT_HELD(ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + return (rio_addq(cl->cl_rio, &cl->cl_q, m, t)); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + return (red_addq(cl->cl_red, &cl->cl_q, m, t)); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + return (blue_addq(cl->cl_blue, &cl->cl_q, m, t)); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q)) { + if (cl->cl_sfb == NULL) { + struct ifnet *ifp = TCQIF_IFP(tif); + + VERIFY(cl->cl_flags & TQCF_LAZY); + cl->cl_flags &= ~TQCF_LAZY; + IFCQ_CONVERT_LOCK(ifq); + + cl->cl_sfb = sfb_alloc(ifp, cl->cl_handle, + qlimit(&cl->cl_q), cl->cl_qflags); + if (cl->cl_sfb == NULL) { + /* fall back to droptail */ + qtype(&cl->cl_q) = Q_DROPTAIL; + cl->cl_flags &= ~TQCF_SFB; + cl->cl_qflags &= ~(SFBF_ECN | SFBF_FLOWCTL); + + log(LOG_ERR, "%s: %s SFB lazy allocation " + "failed for qid=%d pri=%d, falling back " + "to DROPTAIL\n", if_name(ifp), + tcq_style(tif), cl->cl_handle, + cl->cl_pri); + } else if (tif->tif_throttle != IFNET_THROTTLE_OFF) { + /* if there's pending throttling, set it */ + cqrq_throttle_t tr = { 1, tif->tif_throttle }; + int err = tcq_throttle(tif, &tr); + + if (err == EALREADY) + err = 0; + if (err != 0) { + tr.level = IFNET_THROTTLE_OFF; + (void) tcq_throttle(tif, &tr); + } + } + } + if (cl->cl_sfb != NULL) + return (sfb_addq(cl->cl_sfb, &cl->cl_q, m, t)); + } else if (qlen(&cl->cl_q) >= qlimit(&cl->cl_q)) { + IFCQ_CONVERT_LOCK(ifq); + m_freem(m); + return (CLASSQEQ_DROPPED); + } + + if (cl->cl_flags & TQCF_CLEARDSCP) + write_dsfield(m, t, 0); + + _addq(&cl->cl_q, m); + + return (0); +} + +static inline struct mbuf * +tcq_getq(struct tcq_class *cl) +{ + IFCQ_LOCK_ASSERT_HELD(cl->cl_tif->tif_ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + return (rio_getq(cl->cl_rio, &cl->cl_q)); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + return (red_getq(cl->cl_red, &cl->cl_q)); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + return (blue_getq(cl->cl_blue, &cl->cl_q)); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + return (sfb_getq(cl->cl_sfb, &cl->cl_q)); + + return (_getq(&cl->cl_q)); +} + +static inline struct mbuf * +tcq_pollq(struct tcq_class *cl) +{ + IFCQ_LOCK_ASSERT_HELD(cl->cl_tif->tif_ifq); + + return (qhead(&cl->cl_q)); +} + +static void +tcq_purgeq(struct tcq_if *tif, struct tcq_class *cl, u_int32_t flow, + u_int32_t *packets, u_int32_t *bytes) +{ + struct ifclassq *ifq = tif->tif_ifq; + u_int32_t 
cnt = 0, len = 0, qlen; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if ((qlen = qlen(&cl->cl_q)) == 0) + goto done; + + /* become regular mutex before freeing mbufs */ + IFCQ_CONVERT_LOCK(ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + rio_purgeq(cl->cl_rio, &cl->cl_q, flow, &cnt, &len); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + red_purgeq(cl->cl_red, &cl->cl_q, flow, &cnt, &len); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + blue_purgeq(cl->cl_blue, &cl->cl_q, flow, &cnt, &len); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + sfb_purgeq(cl->cl_sfb, &cl->cl_q, flow, &cnt, &len); + else + _flushq_flow(&cl->cl_q, flow, &cnt, &len); + + if (cnt > 0) { + VERIFY(qlen(&cl->cl_q) == (qlen - cnt)); + + PKTCNTR_ADD(&cl->cl_dropcnt, cnt, len); + IFCQ_DROP_ADD(ifq, cnt, len); + + VERIFY(((signed)IFCQ_LEN(ifq) - cnt) >= 0); + IFCQ_LEN(ifq) -= cnt; + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s purge qid=%d pri=%d " + "qlen=[%d,%d] cnt=%d len=%d flow=0x%x\n", + if_name(TCQIF_IFP(tif)), tcq_style(tif), + cl->cl_handle, cl->cl_pri, qlen, qlen(&cl->cl_q), + cnt, len, flow); + } + } +done: + if (packets != NULL) + *packets = cnt; + if (bytes != NULL) + *bytes = len; +} + +static void +tcq_updateq(struct tcq_if *tif, struct tcq_class *cl, cqev_t ev) +{ + IFCQ_LOCK_ASSERT_HELD(tif->tif_ifq); + + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s update qid=%d pri=%d event=%s\n", + if_name(TCQIF_IFP(tif)), tcq_style(tif), + cl->cl_handle, cl->cl_pri, ifclassq_ev2str(ev)); + } + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + return (rio_updateq(cl->cl_rio, ev)); +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + return (red_updateq(cl->cl_red, ev)); +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + return (blue_updateq(cl->cl_blue, ev)); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + return (sfb_updateq(cl->cl_sfb, ev)); +} + +int +tcq_get_class_stats(struct tcq_if *tif, u_int32_t qid, + struct tcq_classstats *sp) +{ + struct tcq_class *cl; + + IFCQ_LOCK_ASSERT_HELD(tif->tif_ifq); + + if ((cl = tcq_clh_to_clp(tif, qid)) == NULL) + return (EINVAL); + + sp->class_handle = cl->cl_handle; + sp->priority = cl->cl_pri; + sp->qlength = qlen(&cl->cl_q); + sp->qlimit = qlimit(&cl->cl_q); + sp->period = cl->cl_period; + sp->xmitcnt = cl->cl_xmitcnt; + sp->dropcnt = cl->cl_dropcnt; + + sp->qtype = qtype(&cl->cl_q); + sp->qstate = qstate(&cl->cl_q); +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + red_getstats(cl->cl_red, &sp->red[0]); +#endif /* CLASSQ_RED */ +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + rio_getstats(cl->cl_rio, &sp->red[0]); +#endif /* CLASSQ_RIO */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + blue_getstats(cl->cl_blue, &sp->blue); +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + sfb_getstats(cl->cl_sfb, &sp->sfb); + + return (0); +} + +/* convert a class handle to the corresponding class pointer */ +static inline struct tcq_class * +tcq_clh_to_clp(struct tcq_if *tif, u_int32_t chandle) +{ + struct tcq_class *cl; + int idx; + + IFCQ_LOCK_ASSERT_HELD(tif->tif_ifq); + + for (idx = tif->tif_maxpri; idx >= 0; idx--) + if ((cl = tif->tif_classes[idx]) != NULL && + cl->cl_handle == chandle) + return (cl); + + return (NULL); +} + +static const char * +tcq_style(struct tcq_if *tif) +{ + return ((tif->tif_flags & TCQIFF_ALTQ) ? 
"ALTQ_TCQ" : "TCQ"); +} + +/* + * tcq_enqueue_ifclassq is an enqueue function to be registered to + * (*ifcq_enqueue) in struct ifclassq. + */ +static int +tcq_enqueue_ifclassq(struct ifclassq *ifq, struct mbuf *m) +{ + u_int32_t i; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + if (!(m->m_flags & M_PKTHDR)) { + /* should not happen */ + log(LOG_ERR, "%s: packet does not have pkthdr\n", + if_name(ifq->ifcq_ifp)); + IFCQ_CONVERT_LOCK(ifq); + m_freem(m); + return (ENOBUFS); + } + + i = MBUF_SCIDX(mbuf_get_service_class(m)); + VERIFY((u_int32_t)i < IFCQ_SC_MAX); + + return (tcq_enqueue(ifq->ifcq_disc, + ifq->ifcq_disc_slots[i].cl, m, m_pftag(m))); +} + +/* + * tcq_dequeue_tc_ifclassq is a dequeue function to be registered to + * (*ifcq_dequeue) in struct ifclass. + * + * note: CLASSQDQ_POLL returns the next packet without removing the packet + * from the queue. CLASSQDQ_REMOVE is a normal dequeue operation. + * CLASSQDQ_REMOVE must return the same packet if called immediately + * after CLASSQDQ_POLL. + */ +static struct mbuf * +tcq_dequeue_tc_ifclassq(struct ifclassq *ifq, mbuf_svc_class_t sc, + cqdq_op_t op) +{ + u_int32_t i = MBUF_SCIDX(sc); + + VERIFY((u_int32_t)i < IFCQ_SC_MAX); + + return (tcq_dequeue_cl(ifq->ifcq_disc, + ifq->ifcq_disc_slots[i].cl, sc, op)); +} + +static int +tcq_request_ifclassq(struct ifclassq *ifq, cqrq_t req, void *arg) +{ + struct tcq_if *tif = (struct tcq_if *)ifq->ifcq_disc; + int err = 0; + + IFCQ_LOCK_ASSERT_HELD(ifq); + + switch (req) { + case CLASSQRQ_PURGE: + tcq_purge(tif); + break; + + case CLASSQRQ_PURGE_SC: + tcq_purge_sc(tif, (cqrq_purge_sc_t *)arg); + break; + + case CLASSQRQ_EVENT: + tcq_event(tif, (cqev_t)arg); + break; + + case CLASSQRQ_THROTTLE: + err = tcq_throttle(tif, (cqrq_throttle_t *)arg); + break; + } + return (err); +} + +int +tcq_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags) +{ + struct ifnet *ifp = ifq->ifcq_ifp; + struct tcq_class *cl0, *cl1, *cl2, *cl3; + struct tcq_if *tif; + u_int32_t maxlen = 0, qflags = 0; + int err = 0; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(ifq->ifcq_disc == NULL); + VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE); + + if (flags & PKTSCHEDF_QALG_RED) + qflags |= TQCF_RED; + if (flags & PKTSCHEDF_QALG_RIO) + qflags |= TQCF_RIO; + if (flags & PKTSCHEDF_QALG_BLUE) + qflags |= TQCF_BLUE; + if (flags & PKTSCHEDF_QALG_SFB) + qflags |= TQCF_SFB; + if (flags & PKTSCHEDF_QALG_ECN) + qflags |= TQCF_ECN; + if (flags & PKTSCHEDF_QALG_FLOWCTL) + qflags |= TQCF_FLOWCTL; + + tif = tcq_alloc(ifp, M_WAITOK, FALSE); + if (tif == NULL) + return (ENOMEM); + + if ((maxlen = IFCQ_MAXLEN(ifq)) == 0) + maxlen = if_sndq_maxlen; + + if ((err = tcq_add_queue(tif, 0, maxlen, + qflags | PRCF_LAZY, SCIDX_BK, &cl0)) != 0) + goto cleanup; + + if ((err = tcq_add_queue(tif, 1, maxlen, + qflags | TQCF_DEFAULTCLASS, SCIDX_BE, &cl1)) != 0) + goto cleanup; + + if ((err = tcq_add_queue(tif, 2, maxlen, + qflags | PRCF_LAZY, SCIDX_VI, &cl2)) != 0) + goto cleanup; + + if ((err = tcq_add_queue(tif, 3, maxlen, + qflags, SCIDX_VO, &cl3)) != 0) + goto cleanup; + + err = ifclassq_attach(ifq, PKTSCHEDT_TCQ, tif, + tcq_enqueue_ifclassq, NULL, tcq_dequeue_tc_ifclassq, + tcq_request_ifclassq); + + /* cache these for faster lookup */ + if (err == 0) { + /* Map {BK_SYS,BK} to TC_BK */ + ifq->ifcq_disc_slots[SCIDX_BK_SYS].qid = SCIDX_BK; + ifq->ifcq_disc_slots[SCIDX_BK_SYS].cl = cl0; + + ifq->ifcq_disc_slots[SCIDX_BK].qid = SCIDX_BK; + ifq->ifcq_disc_slots[SCIDX_BK].cl = cl0; + + /* Map {BE,RD,OAM} to TC_BE */ + ifq->ifcq_disc_slots[SCIDX_BE].qid = SCIDX_BE; + 
ifq->ifcq_disc_slots[SCIDX_BE].cl = cl1; + + ifq->ifcq_disc_slots[SCIDX_RD].qid = SCIDX_BE; + ifq->ifcq_disc_slots[SCIDX_RD].cl = cl1; + + ifq->ifcq_disc_slots[SCIDX_OAM].qid = SCIDX_BE; + ifq->ifcq_disc_slots[SCIDX_OAM].cl = cl1; + + /* Map {AV,RV,VI} to TC_VI */ + ifq->ifcq_disc_slots[SCIDX_AV].qid = SCIDX_VI; + ifq->ifcq_disc_slots[SCIDX_AV].cl = cl2; + + ifq->ifcq_disc_slots[SCIDX_RV].qid = SCIDX_VI; + ifq->ifcq_disc_slots[SCIDX_RV].cl = cl2; + + ifq->ifcq_disc_slots[SCIDX_VI].qid = SCIDX_VI; + ifq->ifcq_disc_slots[SCIDX_VI].cl = cl2; + + /* Map {VO,CTL} to TC_VO */ + ifq->ifcq_disc_slots[SCIDX_VO].qid = SCIDX_VO; + ifq->ifcq_disc_slots[SCIDX_VO].cl = cl3; + + ifq->ifcq_disc_slots[SCIDX_CTL].qid = SCIDX_VO; + ifq->ifcq_disc_slots[SCIDX_CTL].cl = cl3; + } + +cleanup: + if (err != 0) + (void) tcq_destroy_locked(tif); + + return (err); +} + +int +tcq_teardown_ifclassq(struct ifclassq *ifq) +{ + struct tcq_if *tif = ifq->ifcq_disc; + int i; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(tif != NULL && ifq->ifcq_type == PKTSCHEDT_TCQ); + + (void) tcq_destroy_locked(tif); + + ifq->ifcq_disc = NULL; + for (i = 0; i < IFCQ_SC_MAX; i++) { + ifq->ifcq_disc_slots[i].qid = 0; + ifq->ifcq_disc_slots[i].cl = NULL; + } + + return (ifclassq_detach(ifq)); +} + +int +tcq_getqstats_ifclassq(struct ifclassq *ifq, u_int32_t slot, + struct if_ifclassq_stats *ifqs) +{ + struct tcq_if *tif = ifq->ifcq_disc; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(ifq->ifcq_type == PKTSCHEDT_TCQ); + + if (slot >= IFCQ_SC_MAX) + return (EINVAL); + + return (tcq_get_class_stats(tif, ifq->ifcq_disc_slots[slot].qid, + &ifqs->ifqs_tcq_stats)); +} + +static int +tcq_throttle(struct tcq_if *tif, cqrq_throttle_t *tr) +{ + struct ifclassq *ifq = tif->tif_ifq; + struct tcq_class *cl; + int err; + + IFCQ_LOCK_ASSERT_HELD(ifq); + VERIFY(!(tif->tif_flags & TCQIFF_ALTQ)); + + if (!tr->set) { + tr->level = tif->tif_throttle; + return (0); + } + + if (tr->level == tif->tif_throttle) + return (EALREADY); + + /* Current throttling levels only involve BK_SYS class */ + cl = ifq->ifcq_disc_slots[SCIDX_BK_SYS].cl; + + switch (tr->level) { + case IFNET_THROTTLE_OFF: + err = tcq_resumeq(tif, cl); + break; + + case IFNET_THROTTLE_OPPORTUNISTIC: + err = tcq_suspendq(tif, cl); + break; + + default: + VERIFY(0); + /* NOTREACHED */ + } + + if (err == 0 || err == ENXIO) { + if (pktsched_verbose) { + log(LOG_DEBUG, "%s: %s throttling %slevel set %d->%d\n", + if_name(TCQIF_IFP(tif)), tcq_style(tif), + (err == 0) ? 
"" : "lazy ", tif->tif_throttle, + tr->level); + } + tif->tif_throttle = tr->level; + if (err != 0) + err = 0; + else + tcq_purgeq(tif, cl, 0, NULL, NULL); + } else { + log(LOG_ERR, "%s: %s unable to set throttling level " + "%d->%d [error=%d]\n", if_name(TCQIF_IFP(tif)), + tcq_style(tif), tif->tif_throttle, tr->level, err); + } + + return (err); +} + +static int +tcq_resumeq(struct tcq_if *tif, struct tcq_class *cl) +{ + struct ifclassq *ifq = tif->tif_ifq; + int err = 0; + + IFCQ_LOCK_ASSERT_HELD(ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + err = rio_suspendq(cl->cl_rio, &cl->cl_q, FALSE); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + err = red_suspendq(cl->cl_red, &cl->cl_q, FALSE); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + err = blue_suspendq(cl->cl_blue, &cl->cl_q, FALSE); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL) + err = sfb_suspendq(cl->cl_sfb, &cl->cl_q, FALSE); + + if (err == 0) + qstate(&cl->cl_q) = QS_RUNNING; + + return (err); +} + +static int +tcq_suspendq(struct tcq_if *tif, struct tcq_class *cl) +{ + struct ifclassq *ifq = tif->tif_ifq; + int err = 0; + + IFCQ_LOCK_ASSERT_HELD(ifq); + +#if CLASSQ_RIO + if (q_is_rio(&cl->cl_q)) + err = rio_suspendq(cl->cl_rio, &cl->cl_q, TRUE); + else +#endif /* CLASSQ_RIO */ +#if CLASSQ_RED + if (q_is_red(&cl->cl_q)) + err = red_suspendq(cl->cl_red, &cl->cl_q, TRUE); + else +#endif /* CLASSQ_RED */ +#if CLASSQ_BLUE + if (q_is_blue(&cl->cl_q)) + err = blue_suspendq(cl->cl_blue, &cl->cl_q, TRUE); + else +#endif /* CLASSQ_BLUE */ + if (q_is_sfb(&cl->cl_q)) { + if (cl->cl_sfb != NULL) { + err = sfb_suspendq(cl->cl_sfb, &cl->cl_q, TRUE); + } else { + VERIFY(cl->cl_flags & TQCF_LAZY); + err = ENXIO; /* delayed throttling */ + } + } + + if (err == 0 || err == ENXIO) + qstate(&cl->cl_q) = QS_SUSPENDED; + + return (err); +} diff --git a/bsd/net/pktsched/pktsched_tcq.h b/bsd/net/pktsched/pktsched_tcq.h new file mode 100644 index 000000000..8b85caace --- /dev/null +++ b/bsd/net/pktsched/pktsched_tcq.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_PKTSCHED_PKTSCHED_TCQ_H_ +#define _NET_PKTSCHED_PKTSCHED_TCQ_H_ + +#ifdef PRIVATE +#include <net/pktsched/pktsched.h> +#include <net/classq/classq.h> +#include <net/classq/classq_red.h> +#include <net/classq/classq_rio.h> +#include <net/classq/classq_blue.h> +#include <net/classq/classq_sfb.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define TCQ_MAXPRI 4 /* upper limit of the number of priorities */ + +/* tcq class flags */ +#define TQCF_RED 0x0001 /* use RED */ +#define TQCF_ECN 0x0002 /* use ECN with RED/BLUE/SFB */ +#define TQCF_RIO 0x0004 /* use RIO */ +#define TQCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define TQCF_BLUE 0x0100 /* use BLUE */ +#define TQCF_SFB 0x0200 /* use SFB */ +#define TQCF_FLOWCTL 0x0400 /* enable flow control advisories */ +#define TQCF_DEFAULTCLASS 0x1000 /* default class */ +#ifdef BSD_KERNEL_PRIVATE +#define TQCF_LAZY 0x10000000 /* on-demand resource allocation */ +#endif /* BSD_KERNEL_PRIVATE */ + +#define TQCF_USERFLAGS \ + (TQCF_RED | TQCF_ECN | TQCF_RIO | TQCF_CLEARDSCP | TQCF_BLUE | \ + TQCF_SFB | TQCF_FLOWCTL | TQCF_DEFAULTCLASS) + +#ifdef BSD_KERNEL_PRIVATE +#define TQCF_BITS \ + "\020\1RED\2ECN\3RIO\5CLEARDSCP\11BLUE\12SFB\13FLOWCTL\15DEFAULT" \ + "\35LAZY" +#else +#define TQCF_BITS \ + "\020\1RED\2ECN\3RIO\5CLEARDSCP\11BLUE\12SFB\13FLOWCTL" +#endif /* !BSD_KERNEL_PRIVATE */ + +struct tcq_classstats { + u_int32_t class_handle; + u_int32_t priority; + + u_int32_t qlength; + u_int32_t qlimit; + u_int32_t period; + struct pktcntr xmitcnt; /* transmitted packet counter */ + struct pktcntr dropcnt; /* dropped packet counter */ + + /* RED, RIO, BLUE, SFB related info */ + classq_type_t qtype; + union { + /* RIO has 3 red stats */ + struct red_stats red[RIO_NDROPPREC]; + struct blue_stats blue; + struct sfb_stats sfb; + }; + classq_state_t qstate; +}; + +#ifdef BSD_KERNEL_PRIVATE +struct tcq_class { + u_int32_t cl_handle; /* class handle */ + class_queue_t cl_q; /* class queue structure */ + u_int32_t cl_qflags; /* class queue flags */ + union { + void *ptr; + struct red *red; /* RED state */ + struct rio *rio; /* RIO state */ + struct blue *blue; /* BLUE state */ + struct sfb *sfb; /* SFB state */ + } cl_qalg; + int32_t cl_pri; /* priority */ + u_int32_t cl_flags; /* class flags */ + struct tcq_if *cl_tif; /* back pointer to tif */ + + /* statistics */ + u_int32_t cl_period; /* backlog period */ + struct pktcntr cl_xmitcnt; /* transmitted packet counter */ + struct pktcntr cl_dropcnt; /* dropped packet counter */ +}; + +#define cl_red cl_qalg.red +#define cl_rio cl_qalg.rio +#define cl_blue cl_qalg.blue +#define cl_sfb cl_qalg.sfb + +/* tcq_if flags */ +#define TCQIFF_ALTQ 0x1 /* configured via PF/ALTQ */ + +/* + * tcq interface state + */ +struct tcq_if { + struct ifclassq *tif_ifq; /* backpointer to ifclassq */ + int tif_maxpri; /* max priority in use */ + u_int32_t tif_flags; /* flags */ + u_int32_t tif_throttle; /* throttling level */ + struct tcq_class *tif_default; /* default class */ + struct tcq_class *tif_classes[TCQ_MAXPRI]; /* classes */ +}; + +#define TCQIF_IFP(_tif) ((_tif)->tif_ifq->ifcq_ifp) + +struct if_ifclassq_stats; + +extern void tcq_init(void); +extern struct tcq_if *tcq_alloc(struct ifnet *, int, boolean_t); +extern int tcq_destroy(struct tcq_if *); +extern void tcq_purge(struct tcq_if *); +extern void tcq_event(struct tcq_if *, cqev_t); +extern int tcq_add_queue(struct tcq_if *, int, u_int32_t, int, u_int32_t, + struct tcq_class **); +extern int tcq_remove_queue(struct tcq_if *, u_int32_t); +extern int tcq_get_class_stats(struct tcq_if *, u_int32_t, + struct tcq_classstats *); +extern int 
tcq_enqueue(struct tcq_if *, struct tcq_class *, struct mbuf *, + struct pf_mtag *); +extern struct mbuf *tcq_dequeue_tc(struct tcq_if *, mbuf_svc_class_t, + cqdq_op_t); +extern int tcq_setup_ifclassq(struct ifclassq *, u_int32_t); +extern int tcq_teardown_ifclassq(struct ifclassq *ifq); +extern int tcq_getqstats_ifclassq(struct ifclassq *, u_int32_t qid, + struct if_ifclassq_stats *); +#endif /* BSD_KERNEL_PRIVATE */ +#ifdef __cplusplus +} +#endif +#endif /* PRIVATE */ +#endif /* _NET_PKTSCHED_PKTSCHED_TCQ_H_ */ diff --git a/bsd/net/radix.c b/bsd/net/radix.c index 1213828f7..51c90586a 100644 --- a/bsd/net/radix.c +++ b/bsd/net/radix.c @@ -101,7 +101,6 @@ static char *rn_zeros, *rn_ones; extern lck_grp_t *domain_proto_mtx_grp; extern lck_attr_t *domain_proto_mtx_attr; -lck_mtx_t *rn_mutex; #define rn_masktop (mask_rnhead->rnh_treetop) #undef Bcmp @@ -1173,6 +1172,4 @@ rn_init(void) *cp++ = -1; if (rn_inithead((void **)&mask_rnhead, 0) == 0) panic("rn_init 2"); - - rn_mutex = lck_mtx_alloc_init(domain_proto_mtx_grp, domain_proto_mtx_attr); } diff --git a/bsd/net/raw_usrreq.c b/bsd/net/raw_usrreq.c index 035c50926..1284ca8c7 100644 --- a/bsd/net/raw_usrreq.c +++ b/bsd/net/raw_usrreq.c @@ -73,7 +73,8 @@ #include -lck_mtx_t *raw_mtx; /*### global raw cb mutex for now */ +decl_lck_mtx_data(,raw_mtx_data); /*### global raw cb mutex for now */ +lck_mtx_t *raw_mtx = &raw_mtx_data; lck_attr_t *raw_mtx_attr; lck_grp_t *raw_mtx_grp; lck_grp_attr_t *raw_mtx_grp_attr; @@ -89,10 +90,7 @@ raw_init(void) raw_mtx_attr = lck_attr_alloc_init(); - if ((raw_mtx = lck_mtx_alloc_init(raw_mtx_grp, raw_mtx_attr)) == NULL) { - printf("raw_init: can't alloc raw_mtx\n"); - return; - } + lck_mtx_init(raw_mtx, raw_mtx_grp, raw_mtx_attr); LIST_INIT(&rawcb_list); } diff --git a/bsd/net/route.c b/bsd/net/route.c index 5ed681a0f..132768cfe 100644 --- a/bsd/net/route.c +++ b/bsd/net/route.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -60,7 +60,7 @@ * @(#)route.c 8.2 (Berkeley) 11/15/93 * $FreeBSD: src/sys/net/route.c,v 1.59.2.3 2001/07/29 19:18:02 ume Exp $ */ - + #include #include #include @@ -204,7 +204,8 @@ struct route_cb route_cb; __private_extern__ struct rtstat rtstat = { 0, 0, 0, 0, 0 }; struct radix_node_head *rt_tables[AF_MAX+1]; -lck_mtx_t *rnh_lock; /* global routing tables mutex */ +decl_lck_mtx_data(,rnh_lock_data); /* global routing tables mutex */ +lck_mtx_t *rnh_lock = &rnh_lock_data; static lck_attr_t *rnh_lock_attr; static lck_grp_t *rnh_lock_grp; static lck_grp_attr_t *rnh_lock_grp_attr; @@ -214,7 +215,6 @@ static lck_attr_t *rte_mtx_attr; static lck_grp_t *rte_mtx_grp; static lck_grp_attr_t *rte_mtx_grp_attr; -lck_mtx_t *route_domain_mtx; /*### global routing tables mutex for now */ int rttrash = 0; /* routes not in table but not freed */ unsigned int rte_debug; @@ -337,9 +337,6 @@ struct sockaddr_inifscope { #define sin_scope_id un._in_index.ifscope }; -#define SA(sa) ((struct sockaddr *)(size_t)(sa)) -#define SIN(sa) ((struct sockaddr_in *)(size_t)(sa)) -#define SIN6(sa) ((struct sockaddr_in6 *)(size_t)(sa)) #define SINIFSCOPE(sa) ((struct sockaddr_inifscope *)(size_t)(sa)) #define SIN6IFSCOPE(sa) SIN6(sa) @@ -398,7 +395,7 @@ static unsigned int primary6_ifscope = IFSCOPE_NONE; SYSCTL_DECL(_net_idle_route); static int rt_if_idle_expire_timeout = RT_IF_IDLE_EXPIRE_TIMEOUT; -SYSCTL_INT(_net_idle_route, OID_AUTO, expire_timeout, CTLFLAG_RW, +SYSCTL_INT(_net_idle_route, OID_AUTO, expire_timeout, CTLFLAG_RW|CTLFLAG_LOCKED, &rt_if_idle_expire_timeout, 0, "Default expiration time on routes for " "interface idle reference counting"); @@ -749,11 +746,7 @@ route_init(void) rnh_lock_grp_attr = lck_grp_attr_alloc_init(); rnh_lock_grp = lck_grp_alloc_init("route", rnh_lock_grp_attr); rnh_lock_attr = lck_attr_alloc_init(); - if ((rnh_lock = lck_mtx_alloc_init(rnh_lock_grp, - rnh_lock_attr)) == NULL) { - printf("route_init: can't alloc rnh_lock\n"); - return; - } + lck_mtx_init(rnh_lock, rnh_lock_grp, rnh_lock_attr); rte_mtx_grp_attr = lck_grp_attr_alloc_init(); rte_mtx_grp = lck_grp_alloc_init(RTE_NAME, rte_mtx_grp_attr); @@ -763,7 +756,6 @@ route_init(void) rn_init(); /* initialize all zeroes, all ones, mask table */ lck_mtx_unlock(rnh_lock); rtable_init((void **)rt_tables); - route_domain_mtx = routedomain.dom_mtx; if (rte_debug & RTD_DEBUG) size = sizeof (struct rtentry_dbg); @@ -1453,7 +1445,7 @@ ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst, #else if (dst != NULL && dst->sa_family == AF_INET && ip_doscopedroute) #endif /* !INET6 */ - dst = sa_copy(SA(dst), &dst_ss, NULL); + dst = sa_copy(SA((uintptr_t)dst), &dst_ss, NULL); #if INET6 if (gw != NULL && @@ -1462,7 +1454,7 @@ ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst, #else if (gw != NULL && gw->sa_family == AF_INET && ip_doscopedroute) #endif /* !INET6 */ - gw = sa_copy(SA(gw), &gw_ss, NULL); + gw = sa_copy(SA((uintptr_t)gw), &gw_ss, NULL); if (!(flags & RTF_GATEWAY)) { /* @@ -1708,6 +1700,14 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, */ rt->rt_flags |= RTF_CONDEMNED; + /* + * Clear RTF_ROUTER if it's set. + */ + if (rt->rt_flags & RTF_ROUTER) { + VERIFY(rt->rt_flags & RTF_HOST); + rt->rt_flags &= ~RTF_ROUTER; + } + /* * Now search what's left of the subtree for any cloned * routes which might have been formed from this node. 
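(A brief aside on the RTF_ROUTER handling added in the hunk above: rt_set_gwroute(), introduced later in this patch, only ever sets RTF_ROUTER on host routes, which is why the delete path can VERIFY the RTF_HOST bit before clearing the flag. Below is a minimal user-space sketch of that invariant; the helper name and main() are illustrative only, with RTF_HOST carrying its usual BSD value and RTF_ROUTER the value defined in this patch.)

#include <assert.h>
#include <stdint.h>

#define RTF_HOST	0x4		/* host entry (net otherwise) */
#define RTF_ROUTER	0x10000000	/* host is a router */

/*
 * Mirrors the invariant asserted in rtrequest_common_locked():
 * RTF_ROUTER may only appear on RTF_HOST entries, so it can be
 * cleared on deletion after checking the host bit.
 */
static void
clear_router_flag(uint32_t *rt_flags)
{
	if (*rt_flags & RTF_ROUTER) {
		assert(*rt_flags & RTF_HOST);	/* VERIFY() in the kernel */
		*rt_flags &= ~RTF_ROUTER;
	}
}

int
main(void)
{
	uint32_t flags = RTF_HOST | RTF_ROUTER;

	clear_router_flag(&flags);
	assert(flags == RTF_HOST);
	return (0);
}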
@@ -1819,10 +1819,13 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, * When scoped routing is enabled, cloned entries are * always scoped according to the interface portion of * the parent route. The exception to this are IPv4 - * link local addresses. + * link local addresses, or those routes that are cloned + * from a RTF_PROXY route. For the latter, the clone + * gets to keep the RTF_PROXY flag. */ - if (af == AF_INET && - IN_LINKLOCAL(ntohl(SIN(dst)->sin_addr.s_addr))) { + if ((af == AF_INET && + IN_LINKLOCAL(ntohl(SIN(dst)->sin_addr.s_addr))) || + (rt->rt_flags & RTF_PROXY)) { ifscope = IFSCOPE_NONE; flags &= ~RTF_IFSCOPE; } else { @@ -1878,11 +1881,12 @@ makeroute: * also add the rt_gwroute if possible. */ if ((error = rt_setgate(rt, dst, gateway)) != 0) { + int tmp = error; RT_UNLOCK(rt); nstat_route_detach(rt); rte_lock_destroy(rt); rte_free(rt); - senderr(error); + senderr(tmp); } /* @@ -1949,10 +1953,8 @@ makeroute: * then un-make it (this should be a function) */ if (rn == NULL) { - if (rt->rt_gwroute) { - rtfree_locked(rt->rt_gwroute); - rt->rt_gwroute = NULL; - } + /* Clear gateway route */ + rt_set_gwroute(rt, rt_key(rt), NULL); if (rt->rt_ifa) { IFA_REMREF(rt->rt_ifa); rt->rt_ifa = NULL; @@ -1978,8 +1980,10 @@ makeroute: */ if (req == RTM_RESOLVE) { RT_LOCK_SPIN(*ret_nrt); - VERIFY((*ret_nrt)->rt_expire == 0 || (*ret_nrt)->rt_rmx.rmx_expire != 0); - VERIFY((*ret_nrt)->rt_expire != 0 || (*ret_nrt)->rt_rmx.rmx_expire == 0); + VERIFY((*ret_nrt)->rt_expire == 0 || + (*ret_nrt)->rt_rmx.rmx_expire != 0); + VERIFY((*ret_nrt)->rt_expire != 0 || + (*ret_nrt)->rt_rmx.rmx_expire == 0); rt->rt_rmx = (*ret_nrt)->rt_rmx; rt_setexpire(rt, (*ret_nrt)->rt_expire); if ((*ret_nrt)->rt_flags & (RTF_CLONING | RTF_PRCLONING)) { @@ -2029,10 +2033,13 @@ makeroute: } /* - * We repeat the same procedure from rt_setgate() here because - * it doesn't fire when we call it there because the node - * hasn't been added to the tree yet. + * We repeat the same procedures from rt_setgate() here + * because they weren't completed when we called it earlier, + * since the node was embryonic. 
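+ * (Specifically, neither the gateway-route marking nor the + * rt_fixchange() walk below could take effect while the node + * was still outside the tree.)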
*/ + if ((rt->rt_flags & RTF_GATEWAY) && rt->rt_gwroute != NULL) + rt_set_gwroute(rt, rt_key(rt), rt->rt_gwroute); + if (req == RTM_ADD && !(rt->rt_flags & RTF_HOST) && rt_mask(rt) != NULL) { struct rtfc_arg arg; @@ -2044,7 +2051,7 @@ makeroute: } else { RT_UNLOCK(rt); } - + nstat_route_new_entry(rt); break; } @@ -2053,6 +2060,7 @@ bad: IFA_REMREF(ifa); return (error); } +#undef senderr int rtrequest(int req, struct sockaddr *dst, struct sockaddr *gateway, @@ -2221,6 +2229,7 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { int dlen = SA_SIZE(dst->sa_len), glen = SA_SIZE(gate->sa_len); struct radix_node_head *rnh = rt_tables[dst->sa_family]; + boolean_t loop = FALSE; lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); RT_LOCK_ASSERT_HELD(rt); @@ -2235,14 +2244,39 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) /* Add an extra ref for ourselves */ RT_ADDREF_LOCKED(rt); + if (rt->rt_flags & RTF_GATEWAY) { + if ((dst->sa_len == gate->sa_len) && + (dst->sa_family == AF_INET || dst->sa_family == AF_INET6)) { + struct sockaddr_storage dst_ss, gate_ss; + + (void) sa_copy(dst, &dst_ss, NULL); + (void) sa_copy(gate, &gate_ss, NULL); + + loop = equal(SA(&dst_ss), SA(&gate_ss)); + } else { + loop = (dst->sa_len == gate->sa_len && + equal(dst, gate)); + } + } + + /* + * A (cloning) network route with the destination equal to the gateway + * will create an endless loop (see notes below), so disallow it. + */ + if (((rt->rt_flags & (RTF_HOST|RTF_GATEWAY|RTF_LLINFO)) == + RTF_GATEWAY) && loop) { + /* Release extra ref */ + RT_REMREF_LOCKED(rt); + return (EADDRNOTAVAIL); + } + /* * A host route with the destination equal to the gateway * will interfere with keeping LLINFO in the routing * table, so disallow it. */ if (((rt->rt_flags & (RTF_HOST|RTF_GATEWAY|RTF_LLINFO)) == - (RTF_HOST|RTF_GATEWAY)) && (dst->sa_len == gate->sa_len) && - (bcmp(dst, gate, dst->sa_len) == 0)) { + (RTF_HOST|RTF_GATEWAY)) && loop) { /* * The route might already exist if this is an RTM_CHANGE * or a routing redirect, so try to delete it. @@ -2279,8 +2313,12 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) ifscope = IFSCOPE_NONE; RT_UNLOCK(rt); - gwrt = rtalloc1_scoped_locked(gate, 1, - RTF_CLONING | RTF_PRCLONING, ifscope); + /* + * Don't ignore RTF_CLONING, since we prefer that rt_gwroute + * points to a clone rather than a cloning route; see above + * check for cloning loop avoidance (dst == gate). 
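+ * (Taking a clone also keeps rt_gwroute from pointing back at + * a cloning route that covers the gateway, which is the + * endless-loop case disallowed above.)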
+ */ + gwrt = rtalloc1_scoped_locked(gate, 1, RTF_PRCLONING, ifscope); if (gwrt != NULL) RT_LOCK_ASSERT_NOTHELD(gwrt); RT_LOCK(rt); @@ -2330,9 +2368,8 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) return (EBUSY); } - if (rt->rt_gwroute != NULL) - rtfree_locked(rt->rt_gwroute); - rt->rt_gwroute = gwrt; + /* Set gateway route; callee adds ref to gwrt if non-NULL */ + rt_set_gwroute(rt, dst, gwrt); /* * In case the (non-scoped) default route gets modified via @@ -2356,8 +2393,14 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) if ((dst->sa_family == AF_INET) && gwrt != NULL && gwrt->rt_gateway->sa_family == AF_LINK && (gwrt->rt_ifp->if_index == get_primary_ifscope(AF_INET) || - get_primary_ifscope(AF_INET) == IFSCOPE_NONE)) - kdp_set_gateway_mac(SDL(gwrt->rt_gateway)->sdl_data); + get_primary_ifscope(AF_INET) == IFSCOPE_NONE)) { + kdp_set_gateway_mac(SDL((void *)gwrt->rt_gateway)-> + sdl_data); + } + + /* Release extra ref from rtalloc1() */ + if (gwrt != NULL) + RT_REMREF(gwrt); } /* @@ -2373,9 +2416,8 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) /* The underlying allocation is done with M_WAITOK set */ R_Malloc(new, caddr_t, dlen + glen); if (new == NULL) { - if (rt->rt_gwroute != NULL) - rtfree_locked(rt->rt_gwroute); - rt->rt_gwroute = NULL; + /* Clear gateway route */ + rt_set_gwroute(rt, dst, NULL); /* Release extra ref */ RT_REMREF_LOCKED(rt); return (ENOBUFS); @@ -2436,6 +2478,60 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) #undef SA_SIZE +void +rt_set_gwroute(struct rtentry *rt, struct sockaddr *dst, struct rtentry *gwrt) +{ + boolean_t gwrt_isrouter; + + lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); + RT_LOCK_ASSERT_HELD(rt); + + if (gwrt != NULL) + RT_ADDREF(gwrt); /* for this routine */ + + /* + * Get rid of existing gateway route; if rt_gwroute is already + * set to gwrt, this is slightly redundant (though safe since + * we held an extra ref above) but makes the code simpler. + */ + if (rt->rt_gwroute != NULL) { + struct rtentry *ogwrt = rt->rt_gwroute; + + VERIFY(rt != ogwrt); /* sanity check */ + rt->rt_gwroute = NULL; + RT_UNLOCK(rt); + rtfree_locked(ogwrt); + RT_LOCK(rt); + VERIFY(rt->rt_gwroute == NULL); + } + + /* + * And associate the new gateway route. + */ + if ((rt->rt_gwroute = gwrt) != NULL) { + RT_ADDREF(gwrt); /* for rt */ + + if (rt->rt_flags & RTF_WASCLONED) { + /* rt_parent might be NULL if rt is embryonic */ + gwrt_isrouter = (rt->rt_parent != NULL && + SA_DEFAULT(rt_key(rt->rt_parent)) && + !RT_HOST(rt->rt_parent)); + } else { + gwrt_isrouter = (SA_DEFAULT(dst) && !RT_HOST(rt)); + } + + /* If gwrt points to a default router, mark it accordingly */ + if (gwrt_isrouter && RT_HOST(gwrt) && + !(gwrt->rt_flags & RTF_ROUTER)) { + RT_LOCK(gwrt); + gwrt->rt_flags |= RTF_ROUTER; + RT_UNLOCK(gwrt); + } + + RT_REMREF(gwrt); /* for this routine */ + } +} + static void rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) @@ -2706,7 +2802,7 @@ rt_validate(struct rtentry *rt) { RT_LOCK_ASSERT_HELD(rt); - if (!(rt->rt_flags & RTF_CONDEMNED)) { + if ((rt->rt_flags & (RTF_UP | RTF_CONDEMNED)) == RTF_UP) { int af = rt_key(rt)->sa_family; if (af == AF_INET) @@ -2970,6 +3066,34 @@ rt_clear_idleref(struct rtentry *rt) } } +void +rt_set_proxy(struct rtentry *rt, boolean_t set) +{ + lck_mtx_lock(rnh_lock); + RT_LOCK(rt); + /* + * Search for any cloned routes which might have + * been formed from this node, and delete them. 
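+ * (The clones inherited their flags from this node before + * RTF_PROXY changed; deleting them lets fresh clones pick up + * the new setting on demand.)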
+ */ + if (rt->rt_flags & (RTF_CLONING | RTF_PRCLONING)) { + struct radix_node_head *rnh = rt_tables[rt_key(rt)->sa_family]; + + if (set) + rt->rt_flags |= RTF_PROXY; + else + rt->rt_flags &= ~RTF_PROXY; + + RT_UNLOCK(rt); + if (rnh != NULL && rt_mask(rt)) { + rnh->rnh_walktree_from(rnh, rt_key(rt), rt_mask(rt), + rt_fixdelete, rt); + } + } else { + RT_UNLOCK(rt); + } + lck_mtx_unlock(rnh_lock); +} + static void rte_lock_init(struct rtentry *rt) { @@ -3189,3 +3313,255 @@ route_copyin( /* This function consumes the reference */ src->ro_rt = NULL; } + +/* + * route_to_gwroute will find the gateway route for a given route. + * + * If the route is down, look the route up again. + * If the route goes through a gateway, get the route to the gateway. + * If the gateway route is down, look it up again. + * If the route is set to reject, verify it hasn't expired. + * + * If the returned route is non-NULL, the caller is responsible for + * releasing the reference and unlocking the route. + */ +#define senderr(e) { error = (e); goto bad; } +errno_t +route_to_gwroute(const struct sockaddr *net_dest, struct rtentry *hint0, + struct rtentry **out_route) +{ + uint64_t timenow; + struct rtentry *rt = hint0, *hint = hint0; + errno_t error = 0; + unsigned int ifindex; + boolean_t gwroute; + + *out_route = NULL; + + if (rt == NULL) + return (0); + + /* + * Next hop determination. Because we may involve the gateway route + * in addition to the original route, locking is rather complicated. + * The general concept is that regardless of whether the route points + * to the original route or to the gateway route, this routine takes + * an extra reference on such a route. This extra reference will be + * released at the end. + * + * Care must be taken to ensure that the "hint0" route never gets freed + * via rtfree(), since the caller may have stored it inside a struct + * route with a reference held for that placeholder. + */ + RT_LOCK_SPIN(rt); + ifindex = rt->rt_ifp->if_index; + RT_ADDREF_LOCKED(rt); + if (!(rt->rt_flags & RTF_UP)) { + RT_REMREF_LOCKED(rt); + RT_UNLOCK(rt); + /* route is down, find a new one */ + hint = rt = rtalloc1_scoped((struct sockaddr *) + (size_t)net_dest, 1, 0, ifindex); + if (hint != NULL) { + RT_LOCK_SPIN(rt); + ifindex = rt->rt_ifp->if_index; + } else { + senderr(EHOSTUNREACH); + } + } + + /* + * We have a reference to "rt" by now; it will either + * be released or freed at the end of this routine. + */ + RT_LOCK_ASSERT_HELD(rt); + if ((gwroute = (rt->rt_flags & RTF_GATEWAY))) { + struct rtentry *gwrt = rt->rt_gwroute; + struct sockaddr_storage ss; + struct sockaddr *gw = (struct sockaddr *)&ss; + + VERIFY(rt == hint); + RT_ADDREF_LOCKED(hint); + + /* If there's no gateway rt, look it up */ + if (gwrt == NULL) { + bcopy(rt->rt_gateway, gw, MIN(sizeof (ss), + rt->rt_gateway->sa_len)); + RT_UNLOCK(rt); + goto lookup; + } + /* Become a regular mutex */ + RT_CONVERT_LOCK(rt); + + /* + * Take gwrt's lock while holding route's lock; + * this is okay since gwrt never points back + * to "rt", so no lock ordering issues. + */ + RT_LOCK_SPIN(gwrt); + if (!(gwrt->rt_flags & RTF_UP)) { + rt->rt_gwroute = NULL; + RT_UNLOCK(gwrt); + bcopy(rt->rt_gateway, gw, MIN(sizeof (ss), + rt->rt_gateway->sa_len)); + RT_UNLOCK(rt); + rtfree(gwrt); +lookup: + lck_mtx_lock(rnh_lock); + gwrt = rtalloc1_scoped_locked(gw, 1, 0, ifindex); + + RT_LOCK(rt); + /* + * Bail out if the route is down, no route + * to gateway, circular route, or if the + * gateway portion of "rt" has changed. 
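+ * (The lock on "rt" was dropped for the lookup above, so + * another thread may have changed its gateway in the interim.)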
+ */ + if (!(rt->rt_flags & RTF_UP) || gwrt == NULL || + gwrt == rt || !equal(gw, rt->rt_gateway)) { + if (gwrt == rt) { + RT_REMREF_LOCKED(gwrt); + gwrt = NULL; + } + VERIFY(rt == hint); + RT_REMREF_LOCKED(hint); + hint = NULL; + RT_UNLOCK(rt); + if (gwrt != NULL) + rtfree_locked(gwrt); + lck_mtx_unlock(rnh_lock); + senderr(EHOSTUNREACH); + } + VERIFY(gwrt != NULL); + /* + * Set gateway route; callee adds ref to gwrt; + * gwrt has an extra ref from rtalloc1() for + * this routine. + */ + rt_set_gwroute(rt, rt_key(rt), gwrt); + VERIFY(rt == hint); + RT_REMREF_LOCKED(rt); /* hint still holds a refcnt */ + RT_UNLOCK(rt); + lck_mtx_unlock(rnh_lock); + rt = gwrt; + } else { + RT_ADDREF_LOCKED(gwrt); + RT_UNLOCK(gwrt); + VERIFY(rt == hint); + RT_REMREF_LOCKED(rt); /* hint still holds a refcnt */ + RT_UNLOCK(rt); + rt = gwrt; + } + VERIFY(rt == gwrt && rt != hint); + + /* + * This is an opportunity to revalidate the parent route's + * rt_gwroute, in case it now points to a dead route entry. + * Parent route won't go away since the clone (hint) holds + * a reference to it. rt == gwrt. + */ + RT_LOCK_SPIN(hint); + if ((hint->rt_flags & (RTF_WASCLONED | RTF_UP)) == + (RTF_WASCLONED | RTF_UP)) { + struct rtentry *prt = hint->rt_parent; + VERIFY(prt != NULL); + + RT_CONVERT_LOCK(hint); + RT_ADDREF(prt); + RT_UNLOCK(hint); + rt_revalidate_gwroute(prt, rt); + RT_REMREF(prt); + } else { + RT_UNLOCK(hint); + } + + /* Clean up "hint" now; see notes above regarding hint0 */ + if (hint == hint0) + RT_REMREF(hint); + else + rtfree(hint); + hint = NULL; + + /* rt == gwrt; if it is now down, give up */ + RT_LOCK_SPIN(rt); + if (!(rt->rt_flags & RTF_UP)) { + RT_UNLOCK(rt); + senderr(EHOSTUNREACH); + } + } + + if (rt->rt_flags & RTF_REJECT) { + VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0); + VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0); + timenow = net_uptime(); + if (rt->rt_expire == 0 || timenow < rt->rt_expire) { + RT_UNLOCK(rt); + senderr(!gwroute ? EHOSTDOWN : EHOSTUNREACH); + } + } + + /* Become a regular mutex */ + RT_CONVERT_LOCK(rt); + + /* Caller is responsible for cleaning up "rt" */ + *out_route = rt; + return (0); + +bad: + /* Clean up route (either it is "rt" or "gwrt") */ + if (rt != NULL) { + RT_LOCK_SPIN(rt); + if (rt == hint0) { + RT_REMREF_LOCKED(rt); + RT_UNLOCK(rt); + } else { + RT_UNLOCK(rt); + rtfree(rt); + } + } + return (error); +} +#undef senderr + +void +rt_revalidate_gwroute(struct rtentry *rt, struct rtentry *gwrt) +{ + VERIFY(rt->rt_flags & (RTF_CLONING | RTF_PRCLONING)); + VERIFY(gwrt != NULL); + + RT_LOCK_SPIN(rt); + if ((rt->rt_flags & (RTF_GATEWAY | RTF_UP)) == (RTF_GATEWAY | RTF_UP) && + rt->rt_ifp == gwrt->rt_ifp && rt->rt_gateway->sa_family == + rt_key(gwrt)->sa_family && (rt->rt_gwroute == NULL || + !(rt->rt_gwroute->rt_flags & RTF_UP))) { + boolean_t isequal; + + if (rt->rt_gateway->sa_family == AF_INET || + rt->rt_gateway->sa_family == AF_INET6) { + struct sockaddr_storage key_ss, gw_ss; + /* + * We need to compare rt_key and rt_gateway; create + * local copies to get rid of any ifscope association. 
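+ * (The copies have any embedded scope ID cleared, so only + * the addresses themselves are compared.)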
+ */ + (void) sa_copy(rt_key(gwrt), &key_ss, NULL); + (void) sa_copy(rt->rt_gateway, &gw_ss, NULL); + + isequal = equal(SA(&key_ss), SA(&gw_ss)); + } else { + isequal = equal(rt_key(gwrt), rt->rt_gateway); + } + + /* If they are the same, update gwrt */ + if (isequal) { + RT_UNLOCK(rt); + lck_mtx_lock(rnh_lock); + RT_LOCK(rt); + rt_set_gwroute(rt, rt_key(rt), gwrt); + RT_UNLOCK(rt); + lck_mtx_unlock(rnh_lock); + } else { + RT_UNLOCK(rt); + } + } else { + RT_UNLOCK(rt); + } +} diff --git a/bsd/net/route.h b/bsd/net/route.h index 47aa3f902..c5fe155d6 100644 --- a/bsd/net/route.h +++ b/bsd/net/route.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -103,6 +103,9 @@ struct rt_reach_info { u_int32_t ri_probes; /* total # of probes */ u_int64_t ri_snd_expire; /* transmit expiration (calendar) time */ u_int64_t ri_rcv_expire; /* receive expiration (calendar) time */ + int32_t ri_rssi; /* received signal strength */ + int32_t ri_lqm; /* link quality metric */ + int32_t ri_npm; /* node proximity metric */ }; #else struct route; @@ -154,13 +157,14 @@ struct rt_metrics { #ifndef RNF_NORMAL #include #endif +struct ifnet_llreach_info; /* forward declaration */ /* * Kernel routing entry structure (private). */ struct rtentry { struct radix_node rt_nodes[2]; /* tree glue, and other values */ -#define rt_key(r) ((struct sockaddr *)((r)->rt_nodes->rn_key)) -#define rt_mask(r) ((struct sockaddr *)((r)->rt_nodes->rn_mask)) +#define rt_key(r) ((struct sockaddr *)(void *)((r)->rt_nodes->rn_key)) +#define rt_mask(r) ((struct sockaddr *)(void *)((r)->rt_nodes->rn_mask)) struct sockaddr *rt_gateway; /* value */ int32_t rt_refcnt; /* # held references */ uint32_t rt_flags; /* up/down?, host/net */ @@ -170,6 +174,8 @@ struct rtentry { void *rt_llinfo; /* pointer to link level info cache */ void (*rt_llinfo_get_ri) /* llinfo get reachability info fn */ (struct rtentry *, struct rt_reach_info *); + void (*rt_llinfo_get_iflri) /* ifnet llinfo get reach. info fn */ + (struct rtentry *, struct ifnet_llreach_info *); void (*rt_llinfo_purge)(struct rtentry *); /* llinfo purge fn */ void (*rt_llinfo_free)(void *); /* link level info free function */ struct rt_metrics rt_rmx; /* metrics used by rx'ing protocols */ @@ -222,7 +228,9 @@ extern void rt_setexpire(struct rtentry *, uint64_t); #define RTF_IFSCOPE 0x1000000 /* has valid interface scope */ #define RTF_CONDEMNED 0x2000000 /* defunct; no longer modifiable */ #define RTF_IFREF 0x4000000 /* route holds a ref to interface */ - /* 0x8000000 and up unassigned */ +#define RTF_PROXY 0x8000000 /* proxying, no interface scope */ +#define RTF_ROUTER 0x10000000 /* host is a router */ + /* 0x20000000 and up unassigned */ /* * Routing statistics. 
@@ -535,6 +543,12 @@ extern void rt_set_idleref(struct rtentry *); extern void rt_clear_idleref(struct rtentry *); extern void rt_aggdrain(int); extern boolean_t rt_validate(struct rtentry *); +extern void rt_set_proxy(struct rtentry *, boolean_t); +extern void rt_set_gwroute(struct rtentry *, struct sockaddr *, + struct rtentry *); +extern void rt_revalidate_gwroute(struct rtentry *, struct rtentry *); +extern errno_t route_to_gwroute(const struct sockaddr *, struct rtentry *, + struct rtentry **); #ifdef XNU_KERNEL_PRIVATE extern void route_copyin(struct route *src, struct route *dst, size_t length); diff --git a/bsd/net/rtsock.c b/bsd/net/rtsock.c index 42b20064a..d8a1b60b2 100644 --- a/bsd/net/rtsock.c +++ b/bsd/net/rtsock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -118,14 +118,17 @@ static void rt_setif(struct rtentry *, struct sockaddr *, struct sockaddr *, struct sockaddr *, unsigned int); static void rt_drainall(void); +#ifndef SIN #define SIN(sa) ((struct sockaddr_in *)(size_t)(sa)) +#endif - -SYSCTL_NODE(_net, OID_AUTO, idle, CTLFLAG_RW, 0, "idle network monitoring"); +SYSCTL_NODE(_net, OID_AUTO, idle, CTLFLAG_RW|CTLFLAG_LOCKED, 0, + "idle network monitoring"); static struct timeval last_ts; -SYSCTL_NODE(_net_idle, OID_AUTO, route, CTLFLAG_RW, 0, "idle route monitoring"); +SYSCTL_NODE(_net_idle, OID_AUTO, route, CTLFLAG_RW|CTLFLAG_LOCKED, 0, + "idle route monitoring"); static int rt_if_idle_drain_interval = RT_IF_IDLE_DRAIN_INTERVAL; SYSCTL_INT(_net_idle_route, OID_AUTO, drain_interval, CTLFLAG_RW, @@ -330,7 +333,7 @@ route_output(struct mbuf *m, struct socket *so) int sendonlytoself = 0; unsigned int ifscope = IFSCOPE_NONE; -#define senderr(e) { error = e; goto flush;} +#define senderr(e) { error = (e); goto flush;} if (m == NULL || ((m->m_len < sizeof(intptr_t)) && (m = m_pullup(m, sizeof(intptr_t))) == 0)) return (ENOBUFS); @@ -433,6 +436,12 @@ route_output(struct mbuf *m, struct socket *so) ifscope = rtm->rtm_index; } + /* + * RTF_PROXY can only be set internally from within the kernel. + */ + if (rtm->rtm_flags & RTF_PROXY) + senderr(EINVAL); + /* * For AF_INET, always zero out the embedded scope ID. 
If this is * a scoped request, it must be done explicitly by setting RTF_IFSCOPE @@ -464,7 +473,7 @@ route_output(struct mbuf *m, struct socket *so) * confusing the routing table with a wrong route to the previous default gateway */ { -#define satosinaddr(sa) (((struct sockaddr_in *)sa)->sin_addr.s_addr) +#define satosinaddr(sa) (((struct sockaddr_in *)(void *)sa)->sin_addr.s_addr) if (check_routeselfref && (info.rti_info[RTAX_DST] && info.rti_info[RTAX_DST]->sa_family == AF_INET) && (info.rti_info[RTAX_NETMASK] && satosinaddr(info.rti_info[RTAX_NETMASK]) == INADDR_BROADCAST) && @@ -620,8 +629,9 @@ route_output(struct mbuf *m, struct socket *so) case RTM_CHANGE: if (info.rti_info[RTAX_GATEWAY] && (error = rt_setgate(rt, rt_key(rt), info.rti_info[RTAX_GATEWAY]))) { + int tmp = error; RT_UNLOCK(rt); - senderr(error); + senderr(tmp); } /* * If they tried to change things but didn't specify @@ -1162,8 +1172,7 @@ again: if (rw->w_tmemsize < len) { if (rw->w_tmem) FREE(rw->w_tmem, M_RTABLE); - rw->w_tmem = (caddr_t) - _MALLOC(len, M_RTABLE, M_WAITOK); /*###LD0412 was NOWAIT */ + rw->w_tmem = _MALLOC(len, M_RTABLE, M_WAITOK); if (rw->w_tmem) rw->w_tmemsize = len; } @@ -1175,7 +1184,7 @@ again: } } if (cp) { - struct rt_msghdr *rtm = (struct rt_msghdr *)cp0; + struct rt_msghdr *rtm = (struct rt_msghdr *)(void *)cp0; rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; @@ -1392,7 +1401,8 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) if (w->w_op != NET_RT_DUMP2) { size = rt_msg2(RTM_GET, &info, 0, w); if (w->w_req && w->w_tmem) { - struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; + struct rt_msghdr *rtm = + (struct rt_msghdr *)(void *)w->w_tmem; rtm->rtm_flags = rt->rt_flags; rtm->rtm_use = rt->rt_use; @@ -1409,7 +1419,8 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) } else { size = rt_msg2(RTM_GET2, &info, 0, w); if (w->w_req && w->w_tmem) { - struct rt_msghdr2 *rtm = (struct rt_msghdr2 *)w->w_tmem; + struct rt_msghdr2 *rtm = + (struct rt_msghdr2 *)(void *)w->w_tmem; rtm->rtm_flags = rt->rt_flags; rtm->rtm_use = rt->rt_use; @@ -1455,7 +1466,8 @@ sysctl_dumpentry_ext(struct radix_node *rn, void *vw) size = rt_msg2(RTM_GET_EXT, &info, 0, w); if (w->w_req && w->w_tmem) { - struct rt_msghdr_ext *ertm = (struct rt_msghdr_ext *)w->w_tmem; + struct rt_msghdr_ext *ertm = + (struct rt_msghdr_ext *)(void *)w->w_tmem; ertm->rtm_flags = rt->rt_flags; ertm->rtm_use = rt->rt_use; @@ -1465,8 +1477,12 @@ sysctl_dumpentry_ext(struct radix_node *rn, void *vw) ertm->rtm_seq = 0; ertm->rtm_errno = 0; ertm->rtm_addrs = info.rti_addrs; - if (rt->rt_llinfo_get_ri == NULL) + if (rt->rt_llinfo_get_ri == NULL) { bzero(&ertm->rtm_ri, sizeof (ertm->rtm_ri)); + ertm->rtm_ri.ri_rssi = IFNET_RSSI_UNKNOWN; + ertm->rtm_ri.ri_lqm = IFNET_LQM_THRESH_OFF; + ertm->rtm_ri.ri_npm = IFNET_NPM_THRESH_UNKNOWN; + } else rt->rt_llinfo_get_ri(rt, &ertm->rtm_ri); @@ -1538,7 +1554,7 @@ sysctl_iflist(int af, struct walkarg *w) len = rt_msg2(RTM_IFINFO, &info, (caddr_t)cp, NULL); info.rti_info[RTAX_IFP] = NULL; - ifm = (struct if_msghdr *)cp; + ifm = (struct if_msghdr *)(void *)cp; ifm->ifm_index = ifp->if_index; ifm->ifm_flags = (u_short)ifp->if_flags; if_data_internal_to_if_data(ifp, &ifp->if_data, @@ -1573,7 +1589,7 @@ sysctl_iflist(int af, struct walkarg *w) } len = rt_msg2(RTM_NEWADDR, &info, (caddr_t)cp, NULL); - ifam = (struct ifa_msghdr *)cp; + ifam = (struct ifa_msghdr *)(void *)cp; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_metric = ifa->ifa_metric; @@ -1667,13 +1683,14 
@@ sysctl_iflist2(int af, struct walkarg *w) len = rt_msg2(RTM_IFINFO2, &info, (caddr_t)cp, NULL); info.rti_info[RTAX_IFP] = NULL; - ifm = (struct if_msghdr2 *)cp; + ifm = (struct if_msghdr2 *)(void *)cp; ifm->ifm_addrs = info.rti_addrs; ifm->ifm_flags = (u_short)ifp->if_flags; ifm->ifm_index = ifp->if_index; - ifm->ifm_snd_len = ifp->if_snd.ifq_len; - ifm->ifm_snd_maxlen = ifp->if_snd.ifq_maxlen; - ifm->ifm_snd_drops = ifp->if_snd.ifq_drops; + ifm->ifm_snd_len = IFCQ_LEN(&ifp->if_snd); + ifm->ifm_snd_maxlen = IFCQ_MAXLEN(&ifp->if_snd); + ifm->ifm_snd_drops = + ifp->if_snd.ifcq_dropcnt.packets; ifm->ifm_timer = ifp->if_timer; if_data_internal_to_if_data64(ifp, &ifp->if_data, &ifm->ifm_data); @@ -1706,7 +1723,7 @@ sysctl_iflist2(int af, struct walkarg *w) } len = rt_msg2(RTM_NEWADDR, &info, (caddr_t)cp, 0); - ifam = (struct ifa_msghdr *)cp; + ifam = (struct ifa_msghdr *)(void *)cp; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_metric = ifa->ifa_metric; @@ -1761,7 +1778,7 @@ sysctl_iflist2(int af, struct walkarg *w) } len = rt_msg2(RTM_NEWMADDR2, &info, (caddr_t)cp, 0); - ifmam = (struct ifma_msghdr2 *)cp; + ifmam = (struct ifma_msghdr2 *)(void *)cp; ifmam->ifmam_addrs = info.rti_addrs; ifmam->ifmam_flags = 0; ifmam->ifmam_index = diff --git a/bsd/netat/drv_dep.c b/bsd/netat/drv_dep.c index 01ba010ca..6add8260f 100644 --- a/bsd/netat/drv_dep.c +++ b/bsd/netat/drv_dep.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -84,8 +84,6 @@ short appletalk_inited = 0; void atalk_load(void); void atalk_unload(void); -extern lck_mtx_t *domain_proto_mtx; - extern int pktsIn, pktsOut; @@ -99,9 +97,9 @@ void atalk_load() for 2225395 this happens in adsp_open and is undone on ADSP_UNLINK */ - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(TRUE); proto_register_input(PF_APPLETALK, at_input_packet, NULL, 0); - lck_mtx_lock(domain_proto_mtx); + domain_proto_mtx_lock(); } /* atalk_load */ /* Undo everything atalk_load() did. */ @@ -190,7 +188,7 @@ int pat_output(patp, mlist, dst_addr, type) (m->m_next)->m_len); #endif atalk_unlock(); - dlil_output(patp->aa_ifp, PF_APPLETALK, m, NULL, &dst, 0); + dlil_output(patp->aa_ifp, PF_APPLETALK, m, NULL, &dst, 0, NULL); atalk_lock(); pktsOut++; diff --git a/bsd/netinet/Makefile b/bsd/netinet/Makefile index 91973125c..3a98cd8d6 100644 --- a/bsd/netinet/Makefile +++ b/bsd/netinet/Makefile @@ -19,7 +19,6 @@ DATAFILES = \ bootp.h icmp6.h if_ether.h icmp_var.h \ igmp.h igmp_var.h in.h in_pcb.h \ in_systm.h in_var.h ip.h ip6.h \ - ip_fw.h ip_fw2.h \ ip_icmp.h ip_mroute.h ip_var.h tcp.h \ tcp_fsm.h tcp_seq.h tcp_timer.h tcp_var.h \ tcpip.h udp.h udp_var.h @@ -29,8 +28,10 @@ KERNELFILES = \ PRIVATE_DATAFILES = \ ip_dummynet.h \ + ip_flowid.h \ + ip_fw.h ip_fw2.h \ tcp_debug.h \ - in_gif.h ip_compat.h + in_gif.h ip_compat.h PRIVATE_KERNELFILES = ${KERNELFILES} \ ip_ecn.h ip_encap.h diff --git a/bsd/netinet/icmp6.h b/bsd/netinet/icmp6.h index aab7c4ffd..841d3b390 100644 --- a/bsd/netinet/icmp6.h +++ b/bsd/netinet/icmp6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000,2008-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000,2008-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,10 +95,6 @@ #define _NETINET_ICMP6_H_ #include -#ifdef XNU_KERNEL_PRIVATE -#include -#endif - #define ICMPV6_PLD_MAXLEN 1232 /* IPV6_MMTU - sizeof(struct ip6_hdr) - sizeof(struct icmp6_hdr) */ @@ -333,6 +329,7 @@ struct nd_opt_hdr { /* Neighbor discovery option header */ #define ND_OPT_REDIRECTED_HEADER 4 #define ND_OPT_MTU 5 #define ND_OPT_RDNSS 25 /* RFC 5006 */ +#define ND_OPT_DNSSL 31 /* RFC 6106 */ #define ND_OPT_ROUTE_INFO 200 /* draft-ietf-ipngwg-router-preference, not officially assigned yet */ @@ -382,6 +379,14 @@ struct nd_opt_rdnss { /* recursive domain name system servers */ struct in6_addr nd_opt_rdnss_addr[1]; } __attribute__((__packed__)); +struct nd_opt_dnssl { /* domain name search list */ + u_int8_t nd_opt_dnssl_type; + u_int8_t nd_opt_dnssl_len; + u_int16_t nd_opt_dnssl_reserved; + u_int32_t nd_opt_dnssl_lifetime; + u_int8_t nd_opt_dnssl_domains[8]; +} __attribute__((__packed__)); + /* * icmp6 namelookup */ @@ -648,26 +653,27 @@ struct icmp6stat { #if 0 /*obsoleted*/ #define ICMPV6CTL_ERRRATELIMIT 5 /* ICMPv6 error rate limitation */ #endif -#define ICMPV6CTL_ND6_PRUNE 6 -#define ICMPV6CTL_ND6_DELAY 8 +#define ICMPV6CTL_ND6_PRUNE 6 +#define ICMPV6CTL_ND6_DELAY 8 #define ICMPV6CTL_ND6_UMAXTRIES 9 #define ICMPV6CTL_ND6_MMAXTRIES 10 #define ICMPV6CTL_ND6_USELOOPBACK 11 /*#define ICMPV6CTL_ND6_PROXYALL 12 obsoleted, do not reuse here */ -#define ICMPV6CTL_NODEINFO 13 +#define ICMPV6CTL_NODEINFO 13 #define ICMPV6CTL_ERRPPSLIMIT 14 /* ICMPv6 error pps limitation */ #define ICMPV6CTL_ND6_MAXNUDHINT 15 #define ICMPV6CTL_MTUDISC_HIWAT 16 #define ICMPV6CTL_MTUDISC_LOWAT 17 -#define ICMPV6CTL_ND6_DEBUG 18 +#define ICMPV6CTL_ND6_DEBUG 18 #define ICMPV6CTL_ND6_DRLIST 19 #define ICMPV6CTL_ND6_PRLIST 20 #define ICMPV6CTL_MLD_MAXSRCFILTER 21 #define ICMPV6CTL_MLD_SOMAXSRC 22 #define ICMPV6CTL_MLD_VERSION 23 #define ICMPV6CTL_ND6_MAXQLEN 24 -#define ICMPV6CTL_ND6_ACCEPT_6TO4 25 -#define ICMPV6CTL_MAXID 26 +#define ICMPV6CTL_ND6_ACCEPT_6TO4 25 +#define ICMPV6CTL_ND6_OPTIMISTIC_DAD 26 /* RFC 4429 */ +#define ICMPV6CTL_MAXID 27 #ifdef KERNEL_PRIVATE #define ICMPV6CTL_NAMES { \ @@ -697,6 +703,7 @@ struct icmp6stat { { 0, 0 }, \ { 0, 0 }, \ { "nd6_accept_6to4", CTLTYPE_INT }, \ + { "nd6_optimistic_dad", CTLTYPE_INT }, \ } #define RTF_PROBEMTU RTF_PROTO1 @@ -722,14 +729,14 @@ void icmp6_mtudisc_update(struct ip6ctlparam *, int); extern lck_rw_t icmp6_ifs_rwlock; /* XXX: is this the right place for these macros? */ #define icmp6_ifstat_inc(ifp, tag) \ -do { \ - lck_rw_lock_shared(&icmp6_ifs_rwlock); \ - if ((ifp) && (ifp)->if_index <= if_index \ - && (ifp)->if_index < icmp6_ifstatmax \ - && icmp6_ifstat && icmp6_ifstat[(ifp)->if_index]) { \ - atomic_add_64(&icmp6_ifstat[(ifp)->if_index]->tag, 1); \ - } \ - lck_rw_done(&icmp6_ifs_rwlock); \ +do { \ + lck_rw_lock_shared(&icmp6_ifs_rwlock); \ + if ((ifp) && (ifp)->if_index <= if_index \ + && (ifp)->if_index < icmp6_ifstatmax \ + && icmp6_ifstat && icmp6_ifstat[(ifp)->if_index]) { \ + icmp6_ifstat[(ifp)->if_index]->tag++; \ + } \ + lck_rw_done(&icmp6_ifs_rwlock); \ } while (0) #define icmp6_ifoutstat_inc(ifp, type, code) \ diff --git a/bsd/netinet/if_ether.h b/bsd/netinet/if_ether.h index 1705a1413..e796437dd 100644 --- a/bsd/netinet/if_ether.h +++ b/bsd/netinet/if_ether.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -132,7 +132,8 @@ struct sockaddr_inarp { struct in_addr sin_srcaddr; u_short sin_tos; u_short sin_other; -#define SIN_PROXY 1 +#define SIN_PROXY 0x1 +#define SIN_ROUTER 0x2 }; /* * IP and ethernet specific routing flags diff --git a/bsd/netinet/igmp.c b/bsd/netinet/igmp.c index 1142c99e2..7e0cd82e9 100644 --- a/bsd/netinet/igmp.c +++ b/bsd/netinet/igmp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -556,6 +556,9 @@ igmp_domifattach(struct ifnet *ifp, int how) IGI_ADDREF_LOCKED(igi); /* hold a reference for igi_head */ IGI_ADDREF_LOCKED(igi); /* hold a reference for caller */ IGI_UNLOCK(igi); + ifnet_lock_shared(ifp); + igmp_initsilent(ifp, igi); + ifnet_lock_done(ifp); LIST_INSERT_HEAD(&igi_head, igi, igi_link); @@ -586,6 +589,9 @@ igmp_domifreattach(struct igmp_ifinfo *igi) igi->igi_debug |= IFD_ATTACHED; IGI_ADDREF_LOCKED(igi); /* hold a reference for igi_head */ IGI_UNLOCK(igi); + ifnet_lock_shared(ifp); + igmp_initsilent(ifp, igi); + ifnet_lock_done(ifp); LIST_INSERT_HEAD(&igi_head, igi, igi_link); @@ -651,6 +657,20 @@ igi_delete(const struct ifnet *ifp, struct igmp_inm_relhead *inm_dthead) panic("%s: igmp_ifinfo not found for ifp %p\n", __func__, ifp); } +__private_extern__ void +igmp_initsilent(struct ifnet *ifp, struct igmp_ifinfo *igi) +{ + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED); + + IGI_LOCK_ASSERT_NOTHELD(igi); + IGI_LOCK(igi); + if (!(ifp->if_flags & IFF_MULTICAST)) + igi->igi_flags |= IGIF_SILENT; + else + igi->igi_flags &= ~IGIF_SILENT; + IGI_UNLOCK(igi); +} + static void igi_initvar(struct igmp_ifinfo *igi, struct ifnet *ifp, int reattach) { @@ -664,10 +684,6 @@ igi_initvar(struct igmp_ifinfo *igi, struct ifnet *ifp, int reattach) igi->igi_qri = IGMP_QRI_INIT; igi->igi_uri = IGMP_URI_INIT; - /* ifnet is not yet attached; no need to hold ifnet lock */ - if (!(ifp->if_flags & IFF_MULTICAST)) - igi->igi_flags |= IGIF_SILENT; - if (!reattach) SLIST_INIT(&igi->igi_relinmhead); @@ -1553,6 +1569,9 @@ igmp_input(struct mbuf *m, int off) IGMPSTAT_INC(igps_rcv_total); OIGMPSTAT_INC(igps_rcv_total); + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + ip = mtod(m, struct ip *); iphlen = off; @@ -1578,12 +1597,14 @@ igmp_input(struct mbuf *m, int off) else minlen = IGMP_MINLEN; - M_STRUCT_GET(igmp, struct igmp *, m, off, minlen); + /* A bit more expensive than M_STRUCT_GET, but ensures alignment */ + M_STRUCT_GET0(igmp, struct igmp *, m, off, minlen); if (igmp == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); OIGMPSTAT_INC(igps_rcv_tooshort); return; } + VERIFY(IS_P2ALIGNED(igmp, sizeof (u_int32_t))); /* * Validate checksum. @@ -1669,13 +1690,19 @@ igmp_input(struct mbuf *m, int off) return; } igmpv3len = IGMP_V3_QUERY_MINLEN + srclen; - M_STRUCT_GET(igmpv3, struct igmpv3 *, m, + /* + * A bit more expensive than M_STRUCT_GET, + * but ensures alignment. 
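+ * (M_STRUCT_GET0 arranges for the structure to begin at the + * front of an mbuf, which is suitably aligned; the VERIFY + * below checks the resulting pointer.)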
+ */ + M_STRUCT_GET0(igmpv3, struct igmpv3 *, m, + off, igmpv3len); + if (igmpv3 == NULL) { + IGMPSTAT_INC(igps_rcv_tooshort); + OIGMPSTAT_INC(igps_rcv_tooshort); + return; + } + VERIFY(IS_P2ALIGNED(igmpv3, + sizeof (u_int32_t))); + if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { + m_freem(m); + return; + } @@ -2857,6 +2884,7 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, int type; in_addr_t naddr; uint8_t mode; + u_int16_t ig_numsrc; INM_LOCK_ASSERT_HELD(inm); IGI_LOCK_ASSERT_HELD(inm->inm_igi); @@ -3026,12 +3054,12 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, if (record_has_sources) { if (m == m0) { md = m_last(m); - pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + - md->m_len - nbytes); + pig = (struct igmp_grouprec *)(void *) + (mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); - pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + - off); + pig = (struct igmp_grouprec *)(void *) + (mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) { @@ -3065,7 +3093,8 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, } IGMP_PRINTF(("%s: msrcs is %d this packet\n", __func__, msrcs)); - pig->ig_numsrc = htons(msrcs); + ig_numsrc = htons(msrcs); + bcopy(&ig_numsrc, &pig->ig_numsrc, sizeof (ig_numsrc)); nbytes += (msrcs * sizeof(in_addr_t)); } @@ -3114,7 +3143,8 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, if (m == NULL) return (-ENOMEM); md = m_getptr(m, 0, &off); - pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); + pig = (struct igmp_grouprec *)(void *) + (mtod(md, uint8_t *) + off); IGMP_PRINTF(("%s: allocated next packet\n", __func__)); if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { @@ -3157,7 +3187,8 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, if (msrcs == m0srcs) break; } - pig->ig_numsrc = htons(msrcs); + ig_numsrc = htons(msrcs); + bcopy(&ig_numsrc, &pig->ig_numsrc, sizeof (ig_numsrc)); nbytes += (msrcs * sizeof(in_addr_t)); IGMP_PRINTF(("%s: enqueueing next packet\n", __func__)); @@ -3216,6 +3247,7 @@ igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm) int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; + u_int16_t ig_numsrc; INM_LOCK_ASSERT_HELD(inm); @@ -3301,12 +3333,12 @@ igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm) /* new packet; offset in chain */ md = m_getptr(m, npbytes - sizeof(struct igmp_grouprec), &off); - pig = (struct igmp_grouprec *)(mtod(md, + pig = (struct igmp_grouprec *)(void *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); - pig = (struct igmp_grouprec *)(mtod(md, + pig = (struct igmp_grouprec *)(void *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct igmp_grouprec)); } @@ -3384,7 +3416,8 @@ igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm) pig->ig_type = IGMP_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pig->ig_type = IGMP_BLOCK_OLD_SOURCES; - pig->ig_numsrc = htons(rsrcs); + ig_numsrc = htons(rsrcs); + bcopy(&ig_numsrc, &pig->ig_numsrc, sizeof (ig_numsrc)); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. @@ -3658,6 +3691,13 @@ igmp_sendpkt(struct mbuf *m, struct ifnet *ifp) #ifdef MAC mac_netinet_igmp_send(ifp, m0); #endif + + if (ifp->if_eflags & IFEF_TXSTART) { + /* Use control service class if the interface supports * transmit-start model. 
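+ * (MBUF_SC_CTL keeps these reports out of the lower service + * classes, which may be suspended while the interface is + * throttled.)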
+ */ + (void) m_set_service_class(m0, MBUF_SC_CTL); + } bzero(&ro, sizeof (ro)); error = ip_output(m0, ipopts, &ro, 0, imo, NULL); if (ro.ro_rt != NULL) { diff --git a/bsd/netinet/igmp_var.h b/bsd/netinet/igmp_var.h index 8fdaab868..30a0cacb3 100644 --- a/bsd/netinet/igmp_var.h +++ b/bsd/netinet/igmp_var.h @@ -310,6 +310,7 @@ extern void igmp_leavegroup(struct in_multi *); extern void igmp_slowtimo(void); extern void igi_addref(struct igmp_ifinfo *, int); extern void igi_remref(struct igmp_ifinfo *); +__private_extern__ void igmp_initsilent(struct ifnet *, struct igmp_ifinfo *); SYSCTL_DECL(_net_inet_igmp); diff --git a/bsd/netinet/in.c b/bsd/netinet/in.c index 85b9d38af..1df980df6 100644 --- a/bsd/netinet/in.c +++ b/bsd/netinet/in.c @@ -103,8 +103,9 @@ static int in_mask2len(struct in_addr *); static void in_len2mask(struct in_addr *, int); -static int in_lifaddr_ioctl(struct socket *, u_long, caddr_t, - struct ifnet *, struct proc *); +static int in_lifaddr_ioctl(struct socket *, u_long, struct if_laddrreq *, + struct ifnet *, struct proc *); +static int in_setrouter(struct ifnet *, int); static void in_socktrim(struct sockaddr_in *); static int in_ifinit(struct ifnet *, @@ -366,35 +367,37 @@ in_domifattach(struct ifnet *ifp) */ /* ARGSUSED */ int -in_control( - struct socket *so, - u_long cmd, - caddr_t data, - struct ifnet *ifp, - struct proc *p) +in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, + struct proc *p) { - struct ifreq *ifr = (struct ifreq *)data; - struct in_ifaddr *ia = NULL, *iap; + struct in_ifaddr *ia = NULL; struct ifaddr *ifa; - struct in_aliasreq *ifra = (struct in_aliasreq *)data; struct sockaddr_in oldaddr; int error = 0; int hostIsNew, maskIsNew; - struct kev_msg ev_msg; - struct kev_in_data in_event_data; + struct kev_msg ev_msg; + struct kev_in_data in_event_data; + + bzero(&in_event_data, sizeof (struct kev_in_data)); + bzero(&ev_msg, sizeof (struct kev_msg)); - bzero(&in_event_data, sizeof(struct kev_in_data)); - bzero(&ev_msg, sizeof(struct kev_msg)); switch (cmd) { - case SIOCALIFADDR: - case SIOCDLIFADDR: + case SIOCALIFADDR: /* struct if_laddrreq */ + case SIOCDLIFADDR: /* struct if_laddrreq */ if ((error = proc_suser(p)) != 0) - return error; - /*fall through*/ - case SIOCGLIFADDR: - if (!ifp) - return EINVAL; - return in_lifaddr_ioctl(so, cmd, data, ifp, p); + return (error); + /* FALLTHRU */ + case SIOCGLIFADDR: { /* struct if_laddrreq */ + struct if_laddrreq iflr; + + if (ifp == NULL) + return (EINVAL); + + bcopy(data, &iflr, sizeof (iflr)); + error = in_lifaddr_ioctl(so, cmd, &iflr, ifp, p); + bcopy(&iflr, data, sizeof (iflr)); + return (error); + } } /* @@ -403,51 +406,75 @@ in_control( * If an alias address was specified, find that one instead of * the first one on the interface. 
*/ - if (ifp) { + if (ifp != NULL) { + struct in_ifaddr *iap; + struct sockaddr_in sin; + + bcopy(&((struct ifreq *)(void *)data)->ifr_addr, + &sin, sizeof (sin)); + lck_rw_lock_shared(in_ifaddr_rwlock); - for (iap = in_ifaddrhead.tqh_first; iap; - iap = iap->ia_link.tqe_next) - if (iap->ia_ifp == ifp) { - IFA_LOCK(&iap->ia_ifa); - if (((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr.s_addr == - iap->ia_addr.sin_addr.s_addr) { - ia = iap; + for (iap = in_ifaddrhead.tqh_first; iap != NULL; + iap = iap->ia_link.tqe_next) { + if (iap->ia_ifp != ifp) + continue; + + IFA_LOCK(&iap->ia_ifa); + if (sin.sin_addr.s_addr == + iap->ia_addr.sin_addr.s_addr) { + ia = iap; + IFA_UNLOCK(&iap->ia_ifa); + break; + } else if (ia == NULL) { + ia = iap; + if (sin.sin_family != AF_INET) { IFA_UNLOCK(&iap->ia_ifa); break; - } else if (ia == NULL) { - ia = iap; - if (ifr->ifr_addr.sa_family != AF_INET) { - IFA_UNLOCK(&iap->ia_ifa); - break; - } } - IFA_UNLOCK(&iap->ia_ifa); } + IFA_UNLOCK(&iap->ia_ifa); + } /* take a reference on ia before releasing lock */ - if (ia != NULL) { + if (ia != NULL) IFA_ADDREF(&ia->ia_ifa); - } lck_rw_done(in_ifaddr_rwlock); } + switch (cmd) { - case SIOCAUTOADDR: - case SIOCARPIPLL: + case SIOCAUTOADDR: /* struct ifreq */ + case SIOCARPIPLL: /* struct ifreq */ + case SIOCSETROUTERMODE: /* struct ifreq */ if ((error = proc_suser(p)) != 0) { goto done; } - if (ifp == 0) { + if (ifp == NULL) { error = EADDRNOTAVAIL; goto done; } break; - case SIOCAIFADDR: - case SIOCDIFADDR: - if (ifp == 0) { + case SIOCAIFADDR: /* struct ifaliasreq */ + case SIOCDIFADDR: { /* struct ifreq */ + struct sockaddr_in addr, dstaddr; + + if (ifp == NULL) { error = EADDRNOTAVAIL; goto done; } - if (ifra->ifra_addr.sin_family == AF_INET) { + + if (cmd == SIOCAIFADDR) { + bcopy(&((struct in_aliasreq *)(void *)data)-> + ifra_addr, &addr, sizeof (addr)); + bcopy(&((struct in_aliasreq *)(void *)data)-> + ifra_dstaddr, &dstaddr, sizeof (dstaddr)); + } else { + VERIFY(cmd == SIOCDIFADDR); + bcopy(&((struct ifreq *)(void *)data)->ifr_addr, + &addr, sizeof (addr)); + bzero(&dstaddr, sizeof (dstaddr)); + } + + if (addr.sin_family == AF_INET) { struct in_ifaddr *oia; lck_rw_lock_shared(in_ifaddr_rwlock); @@ -455,7 +482,7 @@ in_control( IFA_LOCK(&ia->ia_ifa); if (ia->ia_ifp == ifp && ia->ia_addr.sin_addr.s_addr == - ifra->ifra_addr.sin_addr.s_addr) { + addr.sin_addr.s_addr) { IFA_ADDREF_LOCKED(&ia->ia_ifa); IFA_UNLOCK(&ia->ia_ifa); break; @@ -465,26 +492,38 @@ in_control( lck_rw_done(in_ifaddr_rwlock); if (oia != NULL) IFA_REMREF(&oia->ia_ifa); - if ((ifp->if_flags & IFF_POINTOPOINT) - && (cmd == SIOCAIFADDR) - && (ifra->ifra_dstaddr.sin_addr.s_addr - == INADDR_ANY)) { + if ((ifp->if_flags & IFF_POINTOPOINT) && + (cmd == SIOCAIFADDR) && + (dstaddr.sin_addr.s_addr == INADDR_ANY)) { error = EDESTADDRREQ; goto done; } - } - else if (cmd == SIOCAIFADDR) { + } else if (cmd == SIOCAIFADDR) { error = EINVAL; goto done; } - if (cmd == SIOCDIFADDR && ia == 0) { + if (cmd == SIOCDIFADDR && ia == NULL) { error = EADDRNOTAVAIL; goto done; } /* FALLTHROUGH */ - case SIOCSIFADDR: - case SIOCSIFNETMASK: - case SIOCSIFDSTADDR: + } + case SIOCSIFADDR: /* struct ifreq */ + case SIOCSIFNETMASK: /* struct ifreq */ + case SIOCSIFDSTADDR: { /* struct ifreq */ + struct sockaddr_in addr; + + if (cmd == SIOCAIFADDR) { + /* fell thru from above; just repeat it */ + bcopy(&((struct in_aliasreq *)(void *)data)-> + ifra_addr, &addr, sizeof (addr)); + } else { + VERIFY(cmd == SIOCDIFADDR || cmd == SIOCSIFADDR || + cmd == SIOCSIFNETMASK || cmd == 
SIOCSIFDSTADDR); + bcopy(&((struct ifreq *)(void *)data)->ifr_addr, + &addr, sizeof (addr)); + } + /* socket is NULL if called from in_purgeaddrs() */ if (so != NULL && (so->so_state & SS_PRIV) == 0) { error = EPERM; @@ -495,12 +534,11 @@ in_control( error = EPERM; goto done; } - if (ifp == 0) { + if (ifp == NULL) { error = EADDRNOTAVAIL; goto done; } - if (ifra->ifra_addr.sin_family != AF_INET - && cmd == SIOCSIFADDR) { + if (addr.sin_family != AF_INET && cmd == SIOCSIFADDR) { error = EINVAL; goto done; } @@ -521,7 +559,7 @@ in_control( ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; ia->ia_sockmask.sin_len = 8; if (ifp->if_flags & IFF_BROADCAST) { - ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr); + ia->ia_broadaddr.sin_len = sizeof (ia->ia_addr); ia->ia_broadaddr.sin_family = AF_INET; } ia->ia_ifp = ifp; @@ -544,104 +582,161 @@ in_control( IFA_ADDREF(ifa); TAILQ_INSERT_TAIL(&in_ifaddrhead, ia, ia_link); lck_rw_done(in_ifaddr_rwlock); - error = in_domifattach(ifp); - /* discard error,can be cold with unsupported interfaces */ - if (error) - error = 0; + /* discard error */ + (void) in_domifattach(ifp); + error = 0; } break; + } - case SIOCPROTOATTACH: - case SIOCPROTODETACH: + case SIOCPROTOATTACH: /* struct ifreq */ + case SIOCPROTODETACH: /* struct ifreq */ if ((error = proc_suser(p)) != 0) { goto done; } - if (ifp == 0) { + if (ifp == NULL) { error = EADDRNOTAVAIL; goto done; } break; - case SIOCSIFBRDADDR: + case SIOCSIFBRDADDR: /* struct ifreq */ if ((so->so_state & SS_PRIV) == 0) { error = EPERM; goto done; } /* FALLTHROUGH */ - - case SIOCGIFADDR: - case SIOCGIFNETMASK: - case SIOCGIFDSTADDR: - case SIOCGIFBRDADDR: - if (ia == (struct in_ifaddr *)0) { + case SIOCGIFADDR: /* struct ifreq */ + case SIOCGIFNETMASK: /* struct ifreq */ + case SIOCGIFDSTADDR: /* struct ifreq */ + case SIOCGIFBRDADDR: /* struct ifreq */ + if (ia == NULL) { error = EADDRNOTAVAIL; goto done; } break; } + switch (cmd) { - case SIOCAUTOADDR: + case SIOCAUTOADDR: { /* struct ifreq */ + int intval; + + VERIFY(ifp != NULL); + bcopy(&((struct ifreq *)(void *)data)->ifr_intval, + &intval, sizeof (intval)); + ifnet_lock_exclusive(ifp); - if (ifr->ifr_intval) - ifp->if_eflags |= IFEF_AUTOCONFIGURING; - else + if (intval) { + /* + * An interface in IPv4 router mode implies that it + * is configured with a static IP address and should + * not act as a DHCP client; prevent SIOCAUTOADDR from + * being set in that mode. + */ + if (ifp->if_eflags & IFEF_IPV4_ROUTER) { + intval = 0; /* be safe; clear flag if set */ + error = EBUSY; + } else { + ifp->if_eflags |= IFEF_AUTOCONFIGURING; + } + } + if (!intval) ifp->if_eflags &= ~IFEF_AUTOCONFIGURING; ifnet_lock_done(ifp); break; - - case SIOCARPIPLL: + } + + case SIOCARPIPLL: { /* struct ifreq */ + int intval; + + VERIFY(ifp != NULL); + bcopy(&((struct ifreq *)(void *)data)->ifr_intval, + &intval, sizeof (intval)); ipv4_ll_arp_aware = 1; + ifnet_lock_exclusive(ifp); - if (ifr->ifr_data) - ifp->if_eflags |= IFEF_ARPLL; - else + if (intval) { + /* + * An interface in IPv4 router mode implies that it + * is configured with a static IP address and should + * not have to deal with IPv4 Link-Local Address; + * prevent SIOCARPIPLL from being set in that mode. 
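Both SIOCAUTOADDR and SIOCARPIPLL now refuse to set their flag while IFEF_IPV4_ROUTER is set, returning EBUSY: a router-mode interface is statically configured, so it must neither run the DHCP client nor claim IPv4 link-local addresses. Conversely, in_setrouter() (added later in this file) clears both flags on entry to router mode. A sketch of the invariant, using stand-in flag bits rather than the real IFEF_* values:

    #include <stdio.h>
    #include <errno.h>
    #include <stdint.h>

    #define EF_AUTOCONF   0x1        /* stand-ins for IFEF_AUTOCONFIGURING, */
    #define EF_ARPLL      0x2        /* IFEF_ARPLL and IFEF_IPV4_ROUTER     */
    #define EF_V4_ROUTER  0x4

    static int
    set_autoconf(uint32_t *eflags, int enable)
    {
        if (enable) {
            /* router mode excludes DHCP/link-local autoconfiguration */
            if (*eflags & EF_V4_ROUTER)
                return (EBUSY);
            *eflags |= EF_AUTOCONF;
        } else {
            *eflags &= ~EF_AUTOCONF;
        }
        return (0);
    }

    static void
    set_router(uint32_t *eflags, int enable)
    {
        if (enable) {
            *eflags |= EF_V4_ROUTER;
            *eflags &= ~(EF_AUTOCONF | EF_ARPLL);  /* as in_setrouter does */
        } else {
            *eflags &= ~EF_V4_ROUTER;
        }
    }

    int
    main(void)
    {
        uint32_t eflags = 0;

        set_router(&eflags, 1);
        printf("autoconf -> %d\n", set_autoconf(&eflags, 1)); /* EBUSY */
        return (0);
    }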
+ */ + if (ifp->if_eflags & IFEF_IPV4_ROUTER) { + intval = 0; /* be safe; clear flag if set */ + error = EBUSY; + } else { + ifp->if_eflags |= IFEF_ARPLL; + } + } + if (!intval) ifp->if_eflags &= ~IFEF_ARPLL; ifnet_lock_done(ifp); break; + } - case SIOCGIFADDR: + case SIOCGIFADDR: /* struct ifreq */ + VERIFY(ia != NULL); IFA_LOCK(&ia->ia_ifa); - *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr; + bcopy(&ia->ia_addr, &((struct ifreq *)(void *)data)->ifr_addr, + sizeof (struct sockaddr_in)); IFA_UNLOCK(&ia->ia_ifa); break; - case SIOCGIFBRDADDR: + case SIOCGIFBRDADDR: /* struct ifreq */ + VERIFY(ia != NULL); if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; break; } IFA_LOCK(&ia->ia_ifa); - *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr; + bcopy(&ia->ia_broadaddr, + &((struct ifreq *)(void *)data)->ifr_broadaddr, + sizeof (struct sockaddr_in)); IFA_UNLOCK(&ia->ia_ifa); break; - case SIOCGIFDSTADDR: + case SIOCGIFDSTADDR: /* struct ifreq */ + VERIFY(ia != NULL); if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } IFA_LOCK(&ia->ia_ifa); - *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr; + bcopy(&ia->ia_dstaddr, + &((struct ifreq *)(void *)data)->ifr_dstaddr, + sizeof (struct sockaddr_in)); IFA_UNLOCK(&ia->ia_ifa); break; - case SIOCGIFNETMASK: + case SIOCGIFNETMASK: /* struct ifreq */ + VERIFY(ia != NULL); IFA_LOCK(&ia->ia_ifa); - *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask; + bcopy(&ia->ia_sockmask, + &((struct ifreq *)(void *)data)->ifr_addr, + sizeof (struct sockaddr_in)); IFA_UNLOCK(&ia->ia_ifa); break; - case SIOCSIFDSTADDR: + case SIOCSIFDSTADDR: /* struct ifreq */ + VERIFY(ifp != NULL && ia != NULL); if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } IFA_LOCK(&ia->ia_ifa); oldaddr = ia->ia_dstaddr; - ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr; + bcopy(&((struct ifreq *)(void *)data)->ifr_dstaddr, + &ia->ia_dstaddr, sizeof (struct sockaddr_in)); if (ia->ia_dstaddr.sin_family == AF_INET) ia->ia_dstaddr.sin_len = sizeof (struct sockaddr_in); IFA_UNLOCK(&ia->ia_ifa); + /* + * NOTE: SIOCSIFDSTADDR is defined with struct ifreq + * as parameter, but here we are sending it down + * to the interface with a pointer to struct ifaddr, + * for legacy reasons. 
+ */ error = ifnet_ioctl(ifp, PF_INET, SIOCSIFDSTADDR, ia); IFA_LOCK(&ia->ia_ifa); if (error == EOPNOTSUPP) { @@ -660,11 +755,12 @@ in_control( ev_msg.event_code = KEV_INET_SIFDSTADDR; - if (ia->ia_ifa.ifa_dstaddr) - in_event_data.ia_dstaddr = - ((struct sockaddr_in *)ia->ia_ifa.ifa_dstaddr)->sin_addr; - else - in_event_data.ia_dstaddr.s_addr = 0; + if (ia->ia_ifa.ifa_dstaddr) { + in_event_data.ia_dstaddr = ((struct sockaddr_in *) + (void *)ia->ia_ifa.ifa_dstaddr)->sin_addr; + } else { + in_event_data.ia_dstaddr.s_addr = INADDR_ANY; + } in_event_data.ia_addr = ia->ia_addr.sin_addr; in_event_data.ia_net = ia->ia_net; @@ -673,12 +769,13 @@ in_control( in_event_data.ia_subnetmask = ia->ia_subnetmask; in_event_data.ia_netbroadcast = ia->ia_netbroadcast; IFA_UNLOCK(&ia->ia_ifa); - strncpy(&in_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); + (void) strncpy(&in_event_data.link_data.if_name[0], + ifp->if_name, IFNAMSIZ); in_event_data.link_data.if_family = ifp->if_family; in_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; ev_msg.dv[0].data_ptr = &in_event_data; - ev_msg.dv[0].data_length = sizeof(struct kev_in_data); + ev_msg.dv[0].data_length = sizeof (struct kev_in_data); ev_msg.dv[1].data_length = 0; kev_post_msg(&ev_msg); @@ -701,13 +798,15 @@ in_control( lck_mtx_unlock(rnh_lock); break; - case SIOCSIFBRDADDR: + case SIOCSIFBRDADDR: /* struct ifreq */ + VERIFY(ia != NULL); if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; break; } IFA_LOCK(&ia->ia_ifa); - ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr; + bcopy(&((struct ifreq *)(void *)data)->ifr_broadaddr, + &ia->ia_broadaddr, sizeof (struct sockaddr_in)); ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; @@ -715,12 +814,12 @@ in_control( ev_msg.event_code = KEV_INET_SIFBRDADDR; - if (ia->ia_ifa.ifa_dstaddr) - in_event_data.ia_dstaddr = - ((struct sockaddr_in *)ia->ia_ifa.ifa_dstaddr)->sin_addr; - else - in_event_data.ia_dstaddr.s_addr = 0; - + if (ia->ia_ifa.ifa_dstaddr) { + in_event_data.ia_dstaddr = ((struct sockaddr_in *) + (void *)ia->ia_ifa.ifa_dstaddr)->sin_addr; + } else { + in_event_data.ia_dstaddr.s_addr = INADDR_ANY; + } in_event_data.ia_addr = ia->ia_addr.sin_addr; in_event_data.ia_net = ia->ia_net; in_event_data.ia_netmask = ia->ia_netmask; @@ -728,36 +827,43 @@ in_control( in_event_data.ia_subnetmask = ia->ia_subnetmask; in_event_data.ia_netbroadcast = ia->ia_netbroadcast; IFA_UNLOCK(&ia->ia_ifa); - strncpy(&in_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); + (void) strncpy(&in_event_data.link_data.if_name[0], + ifp->if_name, IFNAMSIZ); in_event_data.link_data.if_family = ifp->if_family; in_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; ev_msg.dv[0].data_ptr = &in_event_data; - ev_msg.dv[0].data_length = sizeof(struct kev_in_data); + ev_msg.dv[0].data_length = sizeof (struct kev_in_data); ev_msg.dv[1].data_length = 0; kev_post_msg(&ev_msg); - break; - case SIOCSIFADDR: + case SIOCSIFADDR: { /* struct ifreq */ + struct sockaddr_in addr; + + VERIFY(ifp != NULL && ia != NULL); + bcopy(&((struct ifreq *)(void *)data)->ifr_addr, + &addr, sizeof (addr)); /* * If this is a new address, the reference count for the * hash table has been taken at creation time above. 
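Each address-changing ioctl in this function ends the same way: snapshot the address fields into a kev_in_data payload under IFA_LOCK, drop the lock, fill in the interface identity, and post a KEV_INET_* message (note the cleanup above also switches the no-destination case from a bare 0 to the self-documenting INADDR_ANY). A compressed userland sketch of that shape — the struct layouts are trimmed stand-ins, post_msg models kev_post_msg, and the event-code value is illustrative only:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>

    #define MY_SIFDSTADDR 4          /* illustrative event code value */

    struct my_in_event {             /* trimmed kev_in_data stand-in */
        struct in_addr ia_addr;
        struct in_addr ia_dstaddr;
        char           if_name[16];
        uint32_t       if_unit;
    };

    struct my_kev_msg {              /* trimmed kev_msg stand-in */
        uint32_t    event_code;
        const void *data_ptr;
        size_t      data_length;
    };

    static void
    post_msg(const struct my_kev_msg *ev)    /* models kev_post_msg() */
    {
        printf("event %u, %zu bytes\n", ev->event_code, ev->data_length);
    }

    int
    main(void)
    {
        struct my_in_event ev_data;
        struct my_kev_msg ev_msg;

        memset(&ev_data, 0, sizeof (ev_data));
        memset(&ev_msg, 0, sizeof (ev_msg));

        /* in the kernel these fields are snapshotted under IFA_LOCK */
        ev_data.ia_addr.s_addr = htonl(INADDR_LOOPBACK);
        ev_data.ia_dstaddr.s_addr = INADDR_ANY;  /* no peer: explicit ANY */
        strncpy(ev_data.if_name, "en0", sizeof (ev_data.if_name));
        ev_data.if_unit = 0;

        ev_msg.event_code = MY_SIFDSTADDR;
        ev_msg.data_ptr = &ev_data;
        ev_msg.data_length = sizeof (ev_data);
        post_msg(&ev_msg);
        return (0);
    }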
*/ - error = in_ifinit(ifp, ia, - (struct sockaddr_in *)&ifr->ifr_addr, 1); + error = in_ifinit(ifp, ia, &addr, 1); #if PF if (!error) (void) pf_ifaddr_hook(ifp, cmd); #endif /* PF */ break; + } - case SIOCPROTOATTACH: + case SIOCPROTOATTACH: /* struct ifreq */ + VERIFY(ifp != NULL); error = in_domifattach(ifp); break; - case SIOCPROTODETACH: + case SIOCPROTODETACH: /* struct ifreq */ + VERIFY(ifp != NULL); /* * If an IPv4 address is still present, refuse to detach. */ @@ -779,10 +885,26 @@ in_control( error = proto_unplumb(PF_INET, ifp); break; - case SIOCSIFNETMASK: { - u_long i; + case SIOCSETROUTERMODE: { /* struct ifreq */ + int intval; + + VERIFY(ifp != NULL); + bcopy(&((struct ifreq *)(void *)data)->ifr_intval, + &intval, sizeof (intval)); + + error = in_setrouter(ifp, intval); + break; + } + + case SIOCSIFNETMASK: { /* struct ifreq */ + struct sockaddr_in addr; + in_addr_t i; + + VERIFY(ifp != NULL && ia != NULL); + bcopy(&((struct ifreq *)(void *)data)->ifr_addr, + &addr, sizeof (addr)); + i = addr.sin_addr.s_addr; - i = ifra->ifra_addr.sin_addr.s_addr; IFA_LOCK(&ia->ia_ifa); ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr = i); ev_msg.vendor_code = KEV_VENDOR_APPLE; @@ -791,12 +913,12 @@ in_control( ev_msg.event_code = KEV_INET_SIFNETMASK; - if (ia->ia_ifa.ifa_dstaddr) - in_event_data.ia_dstaddr = - ((struct sockaddr_in *)ia->ia_ifa.ifa_dstaddr)->sin_addr; - else - in_event_data.ia_dstaddr.s_addr = 0; - + if (ia->ia_ifa.ifa_dstaddr) { + in_event_data.ia_dstaddr = ((struct sockaddr_in *) + (void *)ia->ia_ifa.ifa_dstaddr)->sin_addr; + } else { + in_event_data.ia_dstaddr.s_addr = INADDR_ANY; + } in_event_data.ia_addr = ia->ia_addr.sin_addr; in_event_data.ia_net = ia->ia_net; in_event_data.ia_netmask = ia->ia_netmask; @@ -804,54 +926,65 @@ in_control( in_event_data.ia_subnetmask = ia->ia_subnetmask; in_event_data.ia_netbroadcast = ia->ia_netbroadcast; IFA_UNLOCK(&ia->ia_ifa); - strncpy(&in_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); + (void) strncpy(&in_event_data.link_data.if_name[0], + ifp->if_name, IFNAMSIZ); in_event_data.link_data.if_family = ifp->if_family; in_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; ev_msg.dv[0].data_ptr = &in_event_data; - ev_msg.dv[0].data_length = sizeof(struct kev_in_data); + ev_msg.dv[0].data_length = sizeof (struct kev_in_data); ev_msg.dv[1].data_length = 0; kev_post_msg(&ev_msg); - break; } - case SIOCAIFADDR: + + case SIOCAIFADDR: { /* struct ifaliasreq */ + struct sockaddr_in addr, broadaddr, mask; + + VERIFY(ifp != NULL && ia != NULL); + bcopy(&((struct ifaliasreq *)(void *)data)->ifra_addr, + &addr, sizeof (addr)); + bcopy(&((struct ifaliasreq *)(void *)data)->ifra_broadaddr, + &broadaddr, sizeof (broadaddr)); + bcopy(&((struct ifaliasreq *)(void *)data)->ifra_mask, + &mask, sizeof (mask)); + maskIsNew = 0; hostIsNew = 1; error = 0; IFA_LOCK(&ia->ia_ifa); if (ia->ia_addr.sin_family == AF_INET) { - if (ifra->ifra_addr.sin_len == 0) { - ifra->ifra_addr = ia->ia_addr; + if (addr.sin_len == 0) { + addr = ia->ia_addr; hostIsNew = 0; - } else if (ifra->ifra_addr.sin_addr.s_addr == - ia->ia_addr.sin_addr.s_addr) + } else if (addr.sin_addr.s_addr == + ia->ia_addr.sin_addr.s_addr) { hostIsNew = 0; + } } - if (ifra->ifra_mask.sin_len) { + if (mask.sin_len) { IFA_UNLOCK(&ia->ia_ifa); in_ifscrub(ifp, ia, 0); IFA_LOCK(&ia->ia_ifa); - ia->ia_sockmask = ifra->ifra_mask; + ia->ia_sockmask = mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); maskIsNew = 1; } if ((ifp->if_flags & IFF_POINTOPOINT) && - 
(ifra->ifra_dstaddr.sin_family == AF_INET)) { + (broadaddr.sin_family == AF_INET)) { IFA_UNLOCK(&ia->ia_ifa); in_ifscrub(ifp, ia, 0); IFA_LOCK(&ia->ia_ifa); - ia->ia_dstaddr = ifra->ifra_dstaddr; + ia->ia_dstaddr = broadaddr; ia->ia_dstaddr.sin_len = sizeof (struct sockaddr_in); maskIsNew = 1; /* We lie; but the effect's the same */ } - if (ifra->ifra_addr.sin_family == AF_INET && - (hostIsNew || maskIsNew)) { + if (addr.sin_family == AF_INET && (hostIsNew || maskIsNew)) { IFA_UNLOCK(&ia->ia_ifa); - error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0); + error = in_ifinit(ifp, ia, &addr, 0); } else { IFA_UNLOCK(&ia->ia_ifa); } @@ -861,51 +994,54 @@ in_control( #endif /* PF */ IFA_LOCK(&ia->ia_ifa); if ((ifp->if_flags & IFF_BROADCAST) && - (ifra->ifra_broadaddr.sin_family == AF_INET)) - ia->ia_broadaddr = ifra->ifra_broadaddr; + (broadaddr.sin_family == AF_INET)) + ia->ia_broadaddr = broadaddr; /* * Report event. */ - if ((error == 0) || (error == EEXIST)) { - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_NETWORK_CLASS; - ev_msg.kev_subclass = KEV_INET_SUBCLASS; - - if (hostIsNew) - ev_msg.event_code = KEV_INET_NEW_ADDR; - else - ev_msg.event_code = KEV_INET_CHANGED_ADDR; - - if (ia->ia_ifa.ifa_dstaddr) - in_event_data.ia_dstaddr = - ((struct sockaddr_in *)ia->ia_ifa.ifa_dstaddr)->sin_addr; - else - in_event_data.ia_dstaddr.s_addr = 0; - - in_event_data.ia_addr = ia->ia_addr.sin_addr; - in_event_data.ia_net = ia->ia_net; - in_event_data.ia_netmask = ia->ia_netmask; - in_event_data.ia_subnet = ia->ia_subnet; - in_event_data.ia_subnetmask = ia->ia_subnetmask; - in_event_data.ia_netbroadcast = ia->ia_netbroadcast; - IFA_UNLOCK(&ia->ia_ifa); - strncpy(&in_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); - in_event_data.link_data.if_family = ifp->if_family; - in_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; - - ev_msg.dv[0].data_ptr = &in_event_data; - ev_msg.dv[0].data_length = sizeof(struct kev_in_data); - ev_msg.dv[1].data_length = 0; - - kev_post_msg(&ev_msg); + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_INET_SUBCLASS; + + if (hostIsNew) + ev_msg.event_code = KEV_INET_NEW_ADDR; + else + ev_msg.event_code = KEV_INET_CHANGED_ADDR; + + if (ia->ia_ifa.ifa_dstaddr) { + in_event_data.ia_dstaddr = + ((struct sockaddr_in *)(void *)ia-> + ia_ifa.ifa_dstaddr)->sin_addr; + } else { + in_event_data.ia_dstaddr.s_addr = INADDR_ANY; + } + in_event_data.ia_addr = ia->ia_addr.sin_addr; + in_event_data.ia_net = ia->ia_net; + in_event_data.ia_netmask = ia->ia_netmask; + in_event_data.ia_subnet = ia->ia_subnet; + in_event_data.ia_subnetmask = ia->ia_subnetmask; + in_event_data.ia_netbroadcast = ia->ia_netbroadcast; + IFA_UNLOCK(&ia->ia_ifa); + (void) strncpy(&in_event_data.link_data.if_name[0], + ifp->if_name, IFNAMSIZ); + in_event_data.link_data.if_family = ifp->if_family; + in_event_data.link_data.if_unit = ifp->if_unit; + + ev_msg.dv[0].data_ptr = &in_event_data; + ev_msg.dv[0].data_length = sizeof (struct kev_in_data); + ev_msg.dv[1].data_length = 0; + + kev_post_msg(&ev_msg); } else { - IFA_UNLOCK(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); } break; + } - case SIOCDIFADDR: + case SIOCDIFADDR: /* struct ifreq */ + VERIFY(ifp != NULL && ia != NULL); error = ifnet_ioctl(ifp, PF_INET, SIOCDIFADDR, ia); if (error == EOPNOTSUPP) error = 0; @@ -921,12 +1057,12 @@ in_control( ev_msg.event_code = KEV_INET_ADDR_DELETED; IFA_LOCK(&ia->ia_ifa); - if (ia->ia_ifa.ifa_dstaddr) - in_event_data.ia_dstaddr = - ((struct sockaddr_in 
*)ia->ia_ifa.ifa_dstaddr)->sin_addr; - else - in_event_data.ia_dstaddr.s_addr = 0; - + if (ia->ia_ifa.ifa_dstaddr) { + in_event_data.ia_dstaddr = ((struct sockaddr_in *) + (void *)ia->ia_ifa.ifa_dstaddr)->sin_addr; + } else { + in_event_data.ia_dstaddr.s_addr = INADDR_ANY; + } in_event_data.ia_addr = ia->ia_addr.sin_addr; in_event_data.ia_net = ia->ia_net; in_event_data.ia_netmask = ia->ia_netmask; @@ -934,7 +1070,8 @@ in_control( in_event_data.ia_subnetmask = ia->ia_subnetmask; in_event_data.ia_netbroadcast = ia->ia_netbroadcast; IFA_UNLOCK(&ia->ia_ifa); - strncpy(&in_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); + (void) strncpy(&in_event_data.link_data.if_name[0], + ifp->if_name, IFNAMSIZ); in_event_data.link_data.if_family = ifp->if_family; in_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; @@ -987,8 +1124,8 @@ in_control( ifp->if_allhostsinm = NULL; in_delmulti(inm); - /* release the reference for allhostsinm pointer */ - INM_REMREF(inm); + /* release the reference for allhostsinm */ + INM_REMREF(inm); } lck_mtx_unlock(&ifp->if_addrconfig_lock); } else { @@ -1004,6 +1141,12 @@ in_control( */ ifa = ifa_ifpgetprimary(ifp, AF_INET); if (ifa != NULL) { + /* + * NOTE: SIOCSIFADDR is defined with struct ifreq + * as parameter, but here we are sending it down + * to the interface with a pointer to struct ifaddr, + * for legacy reasons. + */ error = ifnet_ioctl(ifp, PF_INET, SIOCSIFADDR, ifa); if (error == EOPNOTSUPP) error = 0; @@ -1017,71 +1160,59 @@ in_control( break; #ifdef __APPLE__ - case SIOCSETOT: { - /* - * Inspiration from tcp_ctloutput() and ip_ctloutput() - * Special ioctl for OpenTransport sockets - */ - struct inpcb *inp, *cloned_inp; - int error2 = 0; - int cloned_fd = *(int *)data; - - inp = sotoinpcb(so); - if (inp == NULL) { - break; - } - - /* let's make sure it's either -1 or a valid file descriptor */ - if (cloned_fd != -1) { - struct socket *cloned_so; - error2 = file_socket(cloned_fd, &cloned_so); - if (error2){ - break; - } - cloned_inp = sotoinpcb(cloned_so); + case SIOCSETOT: { /* int */ + /* + * Inspiration from tcp_ctloutput() and ip_ctloutput() + * Special ioctl for OpenTransport sockets + */ + struct inpcb *inp, *cloned_inp; + int error2 = 0; + int cloned_fd; + + bcopy(data, &cloned_fd, sizeof (cloned_fd)); + + inp = sotoinpcb(so); + if (inp == NULL) { + break; + } + + /* let's make sure it's either -1 or a valid file descriptor */ + if (cloned_fd != -1) { + struct socket *cloned_so; + error2 = file_socket(cloned_fd, &cloned_so); + if (error2) { + break; + } + cloned_inp = sotoinpcb(cloned_so); file_drop(cloned_fd); - } else { - cloned_inp = NULL; - } - - if (cloned_inp == NULL) { - /* OT always uses IP_PORTRANGE_HIGH */ - inp->inp_flags &= ~(INP_LOWPORT); - inp->inp_flags |= INP_HIGHPORT; - /* For UDP, OT allows broadcast by default */ - if (so->so_type == SOCK_DGRAM) - so->so_options |= SO_BROADCAST; - /* For TCP we want to see MSG_OOB when receive urgent data */ - else if (so->so_type == SOCK_STREAM) - so->so_options |= SO_WANTOOBFLAG; - } else { - inp->inp_ip_tos = cloned_inp->inp_ip_tos; - inp->inp_ip_ttl = cloned_inp->inp_ip_ttl; - inp->inp_flags = cloned_inp->inp_flags; - - /* Multicast options */ - if (cloned_inp->inp_moptions != NULL) { - struct ip_moptions *cloned_imo = cloned_inp->inp_moptions; - struct ip_moptions *imo = inp->inp_moptions; - - if (imo == NULL) { - /* - * No multicast option buffer attached to the pcb; - * allocate one. 
- */ - imo = ip_allocmoptions(M_WAITOK); - if (imo == NULL) { - error2 = ENOBUFS; - break; - } - inp->inp_moptions = imo; - } - - error2 = imo_clone(cloned_imo, imo); - } - } - break; - } + } else { + cloned_inp = NULL; + } + + if (cloned_inp == NULL) { + /* OT always uses IP_PORTRANGE_HIGH */ + inp->inp_flags &= ~(INP_LOWPORT); + inp->inp_flags |= INP_HIGHPORT; + /* + * For UDP, OT allows broadcast by default; + * for TCP we want to see MSG_OOB when we + * receive urgent data. + */ + if (so->so_type == SOCK_DGRAM) + so->so_options |= SO_BROADCAST; + else if (so->so_type == SOCK_STREAM) + so->so_options |= SO_WANTOOBFLAG; + } else { + inp->inp_ip_tos = cloned_inp->inp_ip_tos; + inp->inp_ip_ttl = cloned_inp->inp_ip_ttl; + inp->inp_flags = cloned_inp->inp_flags; + + /* Multicast options */ + if (cloned_inp->inp_moptions != NULL) + error2 = imo_clone(cloned_inp, inp); + } + break; + } #endif /* __APPLE__ */ default: @@ -1111,21 +1242,12 @@ in_control( * other values may be returned from in_ioctl() */ static int -in_lifaddr_ioctl( - struct socket *so, - u_long cmd, - caddr_t data, - struct ifnet *ifp, - struct proc *p) +in_lifaddr_ioctl(struct socket *so, u_long cmd, struct if_laddrreq *iflr, + struct ifnet *ifp, struct proc *p) { - struct if_laddrreq *iflr = (struct if_laddrreq *)data; struct ifaddr *ifa; - /* sanity checks */ - if (!data || !ifp) { - panic("invalid argument to in_lifaddr_ioctl"); - /*NOTREACHED*/ - } + VERIFY(ifp != NULL); switch (cmd) { case SIOCGLIFADDR: @@ -1288,6 +1410,35 @@ in_lifaddr_ioctl( return EOPNOTSUPP; /*just for safety*/ } +/* + * Handle SIOCSETROUTERMODE to set or clear the IPv4 router mode flag on + * the interface. When in this mode, IPv4 Link-Local Address support is + * disabled in ARP, and DHCP client support is disabled in IP input; turning + * any of them on would cause an error to be returned. Entering or exiting + * this mode will result in the removal of IPv4 addresses currently configured + * on the interface. + */ +static int +in_setrouter(struct ifnet *ifp, int enable) +{ + if (ifp->if_flags & IFF_LOOPBACK) + return (ENODEV); + + ifnet_lock_exclusive(ifp); + if (enable) { + ifp->if_eflags |= IFEF_IPV4_ROUTER; + ifp->if_eflags &= ~(IFEF_ARPLL | IFEF_AUTOCONFIGURING); + } else { + ifp->if_eflags &= ~IFEF_IPV4_ROUTER; + } + ifnet_lock_done(ifp); + + /* purge all IPv4 addresses configured on this interface */ + in_purgeaddrs(ifp); + + return (0); +} + /* * Delete any existing route for an interface. */ @@ -1463,6 +1614,12 @@ in_ifinit( * be reconfigured with the current primary IPV4 address. */ if (error == 0 && cmd == SIOCAIFADDR) { + /* + * NOTE: SIOCSIFADDR is defined with struct ifreq + * as parameter, but here we are sending it down + * to the interface with a pointer to struct ifaddr, + * for legacy reasons. + */ error = ifnet_ioctl(ifp, PF_INET, SIOCSIFADDR, ifa0); if (error == EOPNOTSUPP) error = 0; @@ -1674,9 +1831,9 @@ in_purgeaddrs(struct ifnet *ifp) IFA_LOCK(ifa); s = &((struct sockaddr_in *) - ifa->ifa_addr)->sin_addr; + (void *)ifa->ifa_addr)->sin_addr; d = &((struct sockaddr_in *) - ifa->ifa_dstaddr)->sin_addr; + (void *)ifa->ifa_dstaddr)->sin_addr; (void) inet_ntop(AF_INET, &s->s_addr, s_addr, sizeof (s_addr)); (void) inet_ntop(AF_INET, &d->s_addr, s_dstaddr, diff --git a/bsd/netinet/in.h b/bsd/netinet/in.h index 4e66c26c7..4e0d49b9e 100644 --- a/bsd/netinet/in.h +++ b/bsd/netinet/in.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -430,7 +430,7 @@ struct ip_opts { #define IP_RECVIF 20 /* bool; receive reception if w/dgram */ /* for IPSEC */ #define IP_IPSEC_POLICY 21 /* int; set/get security policy */ -#define IP_FAITH 22 /* bool; accept FAITH'ed connections */ +#define IP_FAITH 22 /* deprecated */ #ifdef __APPLE__ #define IP_STRIPHDR 23 /* bool: drop receive of raw IP header */ #endif @@ -601,13 +601,13 @@ struct sockaddr; * We use uint32_t here to be consistent. */ int setipv4sourcefilter(int, struct in_addr, struct in_addr, uint32_t, - uint32_t, struct in_addr *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_NA); + uint32_t, struct in_addr *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3); int getipv4sourcefilter(int, struct in_addr, struct in_addr, uint32_t *, - uint32_t *, struct in_addr *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_NA); + uint32_t *, struct in_addr *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3); int setsourcefilter(int, uint32_t, struct sockaddr *, socklen_t, - uint32_t, uint32_t, struct sockaddr_storage *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_NA); + uint32_t, uint32_t, struct sockaddr_storage *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3); int getsourcefilter(int, uint32_t, struct sockaddr *, socklen_t, - uint32_t *, uint32_t *, struct sockaddr_storage *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_NA); + uint32_t *, uint32_t *, struct sockaddr_storage *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3); #endif /* @@ -737,7 +737,7 @@ struct in_pktinfo { #define IPCTL_STATS 12 /* ipstat structure */ #define IPCTL_ACCEPTSOURCEROUTE 13 /* may accept source routed packets */ #define IPCTL_FASTFORWARDING 14 /* use fast IP forwarding code */ -#define IPCTL_KEEPFAITH 15 /* FAITH IPv4->IPv6 translater ctl */ +#define IPCTL_KEEPFAITH 15 /* deprecated */ #define IPCTL_GIF_TTL 16 /* default TTL for gif encap packet */ #define IPCTL_MAXID 17 @@ -796,6 +796,9 @@ extern int inaddr_local(struct in_addr); #define in_nullhost(x) ((x).s_addr == INADDR_ANY) #define in_allhosts(x) ((x).s_addr == htonl(INADDR_ALLHOSTS_GROUP)) +#define SIN(s) ((struct sockaddr_in *)(void *)s) +#define satosin(sa) SIN(sa) +#define sintosa(sin) ((struct sockaddr *)(void *)(sin)) #endif /* KERNEL_PRIVATE */ #define MAX_IPv4_STR_LEN 16 #define MAX_IPv6_STR_LEN 64 diff --git a/bsd/netinet/in_arp.c b/bsd/netinet/in_arp.c index 8a4dfcd14..7dd09e904 100644 --- a/bsd/netinet/in_arp.c +++ b/bsd/netinet/in_arp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2011 Apple Inc. All rights reserved. + * Copyright (c) 2004-2012 Apple Inc. All rights reserved. 
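in.h now centralizes the sockaddr conversions as SIN()/satosin()/sintosa(), and the next hunk removes in_arp.c's private SA()/SIN() copies in favor of them. Routing the cast through `(void *)` tells the compiler the alignment change is deliberate, so -Wcast-align stays quiet where a plain `(struct sockaddr_in *)sa` would warn. A minimal usage sketch with the macros as added:

    #include <stdio.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>

    /* as added to bsd/netinet/in.h */
    #define SIN(s) ((struct sockaddr_in *)(void *)s)
    #define satosin(sa) SIN(sa)
    #define sintosa(sin) ((struct sockaddr *)(void *)(sin))

    int
    main(void)
    {
        struct sockaddr_in sin = {
            .sin_family = AF_INET,
            .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
        };
        struct sockaddr *sa = sintosa(&sin);

        /* round-trip through the generic type and back */
        printf("family %d, addr 0x%x\n", satosin(sa)->sin_family,
            (unsigned)ntohl(satosin(sa)->sin_addr.s_addr));
        return (0);
    }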
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -83,8 +83,6 @@ #include #include -#define SA(p) ((struct sockaddr *)(p)) -#define SIN(s) ((struct sockaddr_in *)s) #define CONST_LLADDR(s) ((const u_char*)((s)->sdl_data + (s)->sdl_nlen)) #define equal(a1, a2) (bcmp((caddr_t)(a1), (caddr_t)(a2), (a1)->sa_len) == 0) @@ -208,6 +206,7 @@ static struct llinfo_arp *arp_llinfo_alloc(void); static void arp_llinfo_free(void *); static void arp_llinfo_purge(struct rtentry *); static void arp_llinfo_get_ri(struct rtentry *, struct rt_reach_info *); +static void arp_llinfo_get_iflri(struct rtentry *, struct ifnet_llreach_info *); static __inline void arp_llreach_use(struct llinfo_arp *); static __inline int arp_llreach_reachable(struct llinfo_arp *); @@ -301,12 +300,38 @@ arp_llinfo_get_ri(struct rtentry *rt, struct rt_reach_info *ri) if (lr == NULL) { bzero(ri, sizeof (*ri)); + ri->ri_rssi = IFNET_RSSI_UNKNOWN; + ri->ri_lqm = IFNET_LQM_THRESH_OFF; + ri->ri_npm = IFNET_NPM_THRESH_UNKNOWN; } else { IFLR_LOCK(lr); /* Export to rt_reach_info structure */ ifnet_lr2ri(lr, ri); - /* Export ARP send expiration time */ - ri->ri_snd_expire = ifnet_llreach_up2cal(lr, la->la_lastused); + /* Export ARP send expiration (calendar) time */ + ri->ri_snd_expire = + ifnet_llreach_up2calexp(lr, la->la_lastused); + IFLR_UNLOCK(lr); + } +} + +static void +arp_llinfo_get_iflri(struct rtentry *rt, struct ifnet_llreach_info *iflri) +{ + struct llinfo_arp *la = rt->rt_llinfo; + struct if_llreach *lr = la->la_llreach; + + if (lr == NULL) { + bzero(iflri, sizeof (*iflri)); + iflri->iflri_rssi = IFNET_RSSI_UNKNOWN; + iflri->iflri_lqm = IFNET_LQM_THRESH_OFF; + iflri->iflri_npm = IFNET_NPM_THRESH_UNKNOWN; + } else { + IFLR_LOCK(lr); + /* Export to ifnet_llreach_info structure */ + ifnet_lr2iflri(lr, iflri); + /* Export ARP send expiration (uptime) time */ + iflri->iflri_snd_expire = + ifnet_llreach_up2upexp(lr, la->la_lastused); IFLR_UNLOCK(lr); } } @@ -579,7 +604,7 @@ arp_rtrequest( * such as older version of routed or gated might provide, * restore cloning bit. */ - if ((rt->rt_flags & RTF_HOST) == 0 && + if ((rt->rt_flags & RTF_HOST) == 0 && rt_mask(rt) != NULL && SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff) rt->rt_flags |= RTF_CLONING; if (rt->rt_flags & RTF_CLONING) { @@ -605,7 +630,7 @@ arp_rtrequest( arp_llreach_use(la); /* Mark use timestamp */ RT_UNLOCK(rt); dlil_send_arp(rt->rt_ifp, ARPOP_REQUEST, - SDL(gate), rt_key(rt), NULL, rt_key(rt)); + SDL(gate), rt_key(rt), NULL, rt_key(rt), 0); RT_LOCK(rt); } /*FALLTHROUGH*/ @@ -631,6 +656,7 @@ arp_rtrequest( break; } rt->rt_llinfo_get_ri = arp_llinfo_get_ri; + rt->rt_llinfo_get_iflri = arp_llinfo_get_iflri; rt->rt_llinfo_purge = arp_llinfo_purge; rt->rt_llinfo_free = arp_llinfo_free; @@ -859,192 +885,6 @@ arp_lookup_route(const struct in_addr *addr, int create, int proxy, return (0); } -/* - * arp_route_to_gateway_route will find the gateway route for a given route. - * - * If the route is down, look the route up again. - * If the route goes through a gateway, get the route to the gateway. - * If the gateway route is down, look it up again. - * If the route is set to reject, verify it hasn't expired. - * - * If the returned route is non-NULL, the caller is responsible for - * releasing the reference and unlocking the route. 
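The new arp_llinfo_get_iflri() above (and the updated arp_llinfo_get_ri) fill explicit "unknown" sentinels — IFNET_RSSI_UNKNOWN, IFNET_LQM_THRESH_OFF, IFNET_NPM_THRESH_UNKNOWN — instead of handing back an all-zero struct when no link-layer reachability record exists, so a caller can never mistake "no data" for a valid zero reading. A sketch of the fill-or-default shape; the types and sentinel values below are stand-ins, not the kernel's definitions:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    #define RSSI_UNKNOWN  INT32_MIN  /* stand-in sentinel values */
    #define LQM_OFF       (-2)

    struct reach_rec {               /* stand-in for struct if_llreach */
        int32_t lr_rssi;
        int32_t lr_lqm;
    };

    struct reach_info {              /* stand-in for ifnet_llreach_info */
        int32_t ri_rssi;
        int32_t ri_lqm;
    };

    static void
    get_reach_info(const struct reach_rec *lr, struct reach_info *ri)
    {
        if (lr == NULL) {
            /* no record: report explicit "unknown", not zeroes */
            memset(ri, 0, sizeof (*ri));
            ri->ri_rssi = RSSI_UNKNOWN;
            ri->ri_lqm = LQM_OFF;
        } else {
            ri->ri_rssi = lr->lr_rssi;
            ri->ri_lqm = lr->lr_lqm;
        }
    }

    int
    main(void)
    {
        struct reach_info ri;

        get_reach_info(NULL, &ri);
        printf("rssi=%d lqm=%d\n", ri.ri_rssi, ri.ri_lqm);
        return (0);
    }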
- */ -#define senderr(e) { error = (e); goto bad; } -__private_extern__ errno_t -arp_route_to_gateway_route(const struct sockaddr *net_dest, route_t hint0, - route_t *out_route) -{ - uint64_t timenow; - route_t rt = hint0, hint = hint0; - errno_t error = 0; - - *out_route = NULL; - - /* - * Next hop determination. Because we may involve the gateway route - * in addition to the original route, locking is rather complicated. - * The general concept is that regardless of whether the route points - * to the original route or to the gateway route, this routine takes - * an extra reference on such a route. This extra reference will be - * released at the end. - * - * Care must be taken to ensure that the "hint0" route never gets freed - * via rtfree(), since the caller may have stored it inside a struct - * route with a reference held for that placeholder. - */ - if (rt != NULL) { - unsigned int ifindex; - - RT_LOCK_SPIN(rt); - ifindex = rt->rt_ifp->if_index; - RT_ADDREF_LOCKED(rt); - if (!(rt->rt_flags & RTF_UP)) { - RT_REMREF_LOCKED(rt); - RT_UNLOCK(rt); - /* route is down, find a new one */ - hint = rt = rtalloc1_scoped((struct sockaddr *) - (size_t)net_dest, 1, 0, ifindex); - if (hint != NULL) { - RT_LOCK_SPIN(rt); - ifindex = rt->rt_ifp->if_index; - } else { - senderr(EHOSTUNREACH); - } - } - - /* - * We have a reference to "rt" by now; it will either - * be released or freed at the end of this routine. - */ - RT_LOCK_ASSERT_HELD(rt); - if (rt->rt_flags & RTF_GATEWAY) { - struct rtentry *gwrt = rt->rt_gwroute; - struct sockaddr_in gw; - - /* If there's no gateway rt, look it up */ - if (gwrt == NULL) { - gw = *((struct sockaddr_in *)rt->rt_gateway); - RT_UNLOCK(rt); - goto lookup; - } - /* Become a regular mutex */ - RT_CONVERT_LOCK(rt); - - /* - * Take gwrt's lock while holding route's lock; - * this is okay since gwrt never points back - * to "rt", so no lock ordering issues. - */ - RT_LOCK_SPIN(gwrt); - if (!(gwrt->rt_flags & RTF_UP)) { - struct rtentry *ogwrt; - - rt->rt_gwroute = NULL; - RT_UNLOCK(gwrt); - gw = *((struct sockaddr_in *)rt->rt_gateway); - RT_UNLOCK(rt); - rtfree(gwrt); -lookup: - gwrt = rtalloc1_scoped( - (struct sockaddr *)&gw, 1, 0, ifindex); - - RT_LOCK(rt); - /* - * Bail out if the route is down, no route - * to gateway, circular route, or if the - * gateway portion of "rt" has changed. 
- */ - if (!(rt->rt_flags & RTF_UP) || - gwrt == NULL || gwrt == rt || - !equal(SA(&gw), rt->rt_gateway)) { - if (gwrt == rt) { - RT_REMREF_LOCKED(gwrt); - gwrt = NULL; - } - RT_UNLOCK(rt); - if (gwrt != NULL) - rtfree(gwrt); - senderr(EHOSTUNREACH); - } - - /* Remove any existing gwrt */ - ogwrt = rt->rt_gwroute; - if ((rt->rt_gwroute = gwrt) != NULL) - RT_ADDREF(gwrt); - - /* Clean up "rt" now while we can */ - if (rt == hint0) { - RT_REMREF_LOCKED(rt); - RT_UNLOCK(rt); - } else { - RT_UNLOCK(rt); - rtfree(rt); - } - rt = gwrt; - /* Now free the replaced gwrt */ - if (ogwrt != NULL) - rtfree(ogwrt); - /* If still no route to gateway, bail out */ - if (rt == NULL) - senderr(EHOSTUNREACH); - } else { - RT_ADDREF_LOCKED(gwrt); - RT_UNLOCK(gwrt); - /* Clean up "rt" now while we can */ - if (rt == hint0) { - RT_REMREF_LOCKED(rt); - RT_UNLOCK(rt); - } else { - RT_UNLOCK(rt); - rtfree(rt); - } - rt = gwrt; - } - - /* rt == gwrt; if it is now down, give up */ - RT_LOCK_SPIN(rt); - if (!(rt->rt_flags & RTF_UP)) { - RT_UNLOCK(rt); - senderr(EHOSTUNREACH); - } - } - - if (rt->rt_flags & RTF_REJECT) { - VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0); - VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0); - timenow = net_uptime(); - if (rt->rt_expire == 0 || - timenow < rt->rt_expire) { - RT_UNLOCK(rt); - senderr(rt == hint ? EHOSTDOWN : EHOSTUNREACH); - } - } - - /* Become a regular mutex */ - RT_CONVERT_LOCK(rt); - - /* Caller is responsible for cleaning up "rt" */ - *out_route = rt; - } - return (0); - -bad: - /* Clean up route (either it is "rt" or "gwrt") */ - if (rt != NULL) { - RT_LOCK_SPIN(rt); - if (rt == hint0) { - RT_REMREF_LOCKED(rt); - RT_UNLOCK(rt); - } else { - RT_UNLOCK(rt); - rtfree(rt); - } - } - return (error); -} -#undef senderr - /* * This is the ARP pre-output routine; care must be taken to ensure that * the "hint" route never gets freed via rtfree(), since the caller may @@ -1077,7 +917,7 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, * Callee holds a reference on the route and returns * with the route entry locked, upon success. 
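The function being deleted here (its logic survives as route_to_gwroute, per the caller change that follows) is built around the classic senderr() goto-cleanup macro: every failure path funnels through one block that drops whatever route references were taken. The idiom in isolation, applied to a hypothetical lookup() with an invented resource:

    #include <stdio.h>
    #include <stdlib.h>
    #include <errno.h>

    #define senderr(e) { error = (e); goto bad; }

    static int
    lookup(int key, char **out)
    {
        int error = 0;
        char *buf = NULL;

        *out = NULL;
        if (key < 0)
            senderr(EINVAL);
        if ((buf = malloc(64)) == NULL)
            senderr(ENOMEM);
        if (key == 0)
            senderr(EHOSTUNREACH);   /* cleanup below still runs */

        *out = buf;
        return (0);

    bad:
        /* single exit path releases whatever was acquired */
        free(buf);
        return (error);
    }

    int
    main(void)
    {
        char *p;

        printf("lookup(0) = %d\n", lookup(0, &p));
        printf("lookup(1) = %d\n", lookup(1, &p));
        free(p);
        return (0);
    }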
*/ - result = arp_route_to_gateway_route((const struct sockaddr*) + result = route_to_gwroute((const struct sockaddr *) net_dest, hint, &route); if (result != 0) return (result); @@ -1194,6 +1034,7 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, if (llinfo->la_asked++ < arp_maxtries) { struct ifaddr *rt_ifa = route->rt_ifa; struct sockaddr *sa; + u_int32_t rtflags; /* Become a regular mutex, just in case */ RT_CONVERT_LOCK(route); @@ -1208,9 +1049,11 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, sa = rt_ifa->ifa_addr; IFA_UNLOCK(rt_ifa); arp_llreach_use(llinfo); /* Mark use timestamp */ + rtflags = route->rt_flags; RT_UNLOCK(route); dlil_send_arp(ifp, ARPOP_REQUEST, NULL, - sa, NULL, (const struct sockaddr*)net_dest); + sa, NULL, (const struct sockaddr*)net_dest, + rtflags); IFA_REMREF(rt_ifa); RT_LOCK(route); result = EJUSTRETURN; @@ -1385,7 +1228,7 @@ match: u_char storage[sizeof(struct kev_in_collision) + MAX_HW_LEN]; bzero(&ev_msg, sizeof(struct kev_msg)); bzero(storage, (sizeof(struct kev_in_collision) + MAX_HW_LEN)); - in_collision = (struct kev_in_collision*)storage; + in_collision = (struct kev_in_collision*)(void *)storage; log(LOG_ERR, "%s%d duplicate IP address %s sent from address %s\n", ifp->if_name, ifp->if_unit, inet_ntop(AF_INET, &sender_ip->sin_addr, ipv4str, sizeof(ipv4str)), @@ -1646,7 +1489,7 @@ match: /* Update the expire time for the route and clear the reject flag */ if (route->rt_expire) { uint64_t timenow; - + timenow = net_uptime(); rt_setexpire(route, rt_expiry(route, timenow, arpt_keep)); @@ -1666,7 +1509,7 @@ match: llinfo->la_hold = NULL; RT_UNLOCK(route); - dlil_output(ifp, PF_INET, m0, (caddr_t)route, rt_key(route), 0); + dlil_output(ifp, PF_INET, m0, (caddr_t)route, rt_key(route), 0, NULL); RT_REMREF(route); route = NULL; } @@ -1748,7 +1591,7 @@ respond: dlil_send_arp(ifp, ARPOP_REPLY, target_hw, (const struct sockaddr*)target_ip, - sender_hw, (const struct sockaddr*)sender_ip); + sender_hw, (const struct sockaddr*)sender_ip, 0); done: if (best_ia != NULL) @@ -1766,5 +1609,5 @@ arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) ifa->ifa_flags |= RTF_CLONING; sa = ifa->ifa_addr; IFA_UNLOCK(ifa); - dlil_send_arp(ifp, ARPOP_REQUEST, NULL, sa, NULL, sa); + dlil_send_arp(ifp, ARPOP_REQUEST, NULL, sa, NULL, sa, 0); } diff --git a/bsd/netinet/in_arp.h b/bsd/netinet/in_arp.h index 99a106572..56573aad9 100644 --- a/bsd/netinet/in_arp.h +++ b/bsd/netinet/in_arp.h @@ -76,8 +76,6 @@ extern void arp_llreach_set_reachable(struct ifnet *, void *, unsigned int); extern errno_t arp_lookup_ip(ifnet_t interface, const struct sockaddr_in *ip_dest, struct sockaddr_dl *ll_dest, size_t ll_dest_len, route_t hint, mbuf_t packet); -__private_extern__ errno_t arp_route_to_gateway_route(const struct sockaddr *, - route_t, route_t *); #endif /* KERNEL_PRIVATE */ /*! 
diff --git a/bsd/netinet/in_cksum.c b/bsd/netinet/in_cksum.c index f32cef303..594e27577 100644 --- a/bsd/netinet/in_cksum.c +++ b/bsd/netinet/in_cksum.c @@ -177,7 +177,7 @@ inet_cksum(struct mbuf *m, unsigned int nxt, unsigned int skip, for (; skip && m; m = m->m_next) { if (m->m_len > skip) { mlen = m->m_len - skip; - w = (u_short *)(m->m_data+skip); + w = (u_short *)(void *)(m->m_data+skip); goto skip_start; } else { skip -= m->m_len; @@ -200,7 +200,7 @@ inet_cksum(struct mbuf *m, unsigned int nxt, unsigned int skip, */ s_util.c[1] = *(char *)w; sum += s_util.s; - w = (u_short *)((char *)w + 1); + w = (u_short *)(void *)((char *)w + 1); mlen = m->m_len - 1; len--; } else { @@ -218,7 +218,7 @@ skip_start: REDUCE; sum <<= 8; s_util.c[0] = *(u_char *)w; - w = (u_short *)((char *)w + 1); + w = (u_short *)(void *)((char *)w + 1); mlen--; byte_swapped = 1; } diff --git a/bsd/netinet/in_dhcp.c b/bsd/netinet/in_dhcp.c index 90fc06ae5..b0bad24cb 100644 --- a/bsd/netinet/in_dhcp.c +++ b/bsd/netinet/in_dhcp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1988-2010 Apple Inc. All rights reserved. + * Copyright (c) 1988-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -105,16 +105,16 @@ inet_aifaddr(struct socket * so, const char * name, bzero(&ifra, sizeof(ifra)); strlcpy(ifra.ifra_name, name, sizeof(ifra.ifra_name)); if (addr) { - *((struct sockaddr_in *)&ifra.ifra_addr) = blank_sin; - ((struct sockaddr_in *)&ifra.ifra_addr)->sin_addr = *addr; + *((struct sockaddr_in *)(void *)&ifra.ifra_addr) = blank_sin; + ((struct sockaddr_in *)(void *)&ifra.ifra_addr)->sin_addr = *addr; } if (mask) { - *((struct sockaddr_in *)&ifra.ifra_mask) = blank_sin; - ((struct sockaddr_in *)&ifra.ifra_mask)->sin_addr = *mask; + *((struct sockaddr_in *)(void *)&ifra.ifra_mask) = blank_sin; + ((struct sockaddr_in *)(void *)&ifra.ifra_mask)->sin_addr = *mask; } if (broadcast) { - *((struct sockaddr_in *)&ifra.ifra_broadaddr) = blank_sin; - ((struct sockaddr_in *)&ifra.ifra_broadaddr)->sin_addr = *broadcast; + *((struct sockaddr_in *)(void *)&ifra.ifra_broadaddr) = blank_sin; + ((struct sockaddr_in *)(void *)&ifra.ifra_broadaddr)->sin_addr = *broadcast; } return (ifioctl(so, SIOCAIFADDR, (caddr_t)&ifra, current_proc())); } @@ -140,13 +140,13 @@ struct dhcp_context { static __inline__ struct dhcp_packet * dhcp_context_request(struct dhcp_context * context) { - return ((struct dhcp_packet *)context->request); + return ((struct dhcp_packet *)(void *)context->request); } static __inline__ struct dhcp * dhcp_context_reply(struct dhcp_context * context) { - return ((struct dhcp *)context->reply); + return ((struct dhcp *)(void *)context->reply); } struct mbuf * ip_pkt_to_mbuf(caddr_t pkt, int pktsize); @@ -291,7 +291,7 @@ link_print(struct sockaddr_dl * dl_p) static struct sockaddr_dl * link_from_ifnet(struct ifnet * ifp) { - return ((struct sockaddr_dl *)ifp->if_lladdr->ifa_addr); + return ((struct sockaddr_dl *)(void *)ifp->if_lladdr->ifa_addr); } /* @@ -309,7 +309,7 @@ send_packet(struct ifnet * ifp, struct dhcp_packet * pkt, int pkt_size) dest.sin_port = htons(IPPORT_BOOTPS); dest.sin_addr.s_addr = INADDR_BROADCAST; m = ip_pkt_to_mbuf((caddr_t)pkt, pkt_size); - return dlil_output(ifp, PF_INET, m, 0, (struct sockaddr *)&dest, 0); + return dlil_output(ifp, PF_INET, m, 0, (struct sockaddr *)&dest, 0, NULL); } /* diff --git a/bsd/netinet/in_gif.c b/bsd/netinet/in_gif.c index 9a6cb3db6..c65eceb68 100644 --- a/bsd/netinet/in_gif.c +++ b/bsd/netinet/in_gif.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 
Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -105,13 +105,13 @@ in_gif_output( __unused struct rtentry *rt) { struct gif_softc *sc = ifnet_softc(ifp); - struct sockaddr_in *dst = (struct sockaddr_in *)&sc->gif_ro.ro_dst; - struct sockaddr_in *sin_src = (struct sockaddr_in *)sc->gif_psrc; - struct sockaddr_in *sin_dst = (struct sockaddr_in *)sc->gif_pdst; + struct sockaddr_in *dst = (struct sockaddr_in *)(void *)&sc->gif_ro.ro_dst; + struct sockaddr_in *sin_src = (struct sockaddr_in *)(void *)sc->gif_psrc; + struct sockaddr_in *sin_dst = (struct sockaddr_in *)(void *)sc->gif_pdst; struct ip iphdr; /* capsule IP header, host byte ordered */ int proto, error; u_int8_t tos; - struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; + struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF }; if (sin_src == NULL || sin_dst == NULL || sin_src->sin_family != AF_INET || @@ -343,8 +343,8 @@ gif_encapcheck4( /* sanity check done in caller */ sc = (struct gif_softc *)arg; - src = (struct sockaddr_in *)sc->gif_psrc; - dst = (struct sockaddr_in *)sc->gif_pdst; + src = (struct sockaddr_in *)(void *)sc->gif_psrc; + dst = (struct sockaddr_in *)(void *)sc->gif_pdst; mbuf_copydata((struct mbuf *)(size_t)m, 0, sizeof(ip), &ip); diff --git a/bsd/netinet/in_mcast.c b/bsd/netinet/in_mcast.c index 1854fd26e..deded6a55 100644 --- a/bsd/netinet/in_mcast.c +++ b/bsd/netinet/in_mcast.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2011 Apple Inc. All rights reserved. + * Copyright (c) 2010-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -347,7 +347,7 @@ imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, IMO_LOCK_ASSERT_HELD(IMO_CAST_TO_NONCONST(imo)); - gsin = (const struct sockaddr_in *)group; + gsin = (struct sockaddr_in *)(uintptr_t)(size_t)group; /* The imo_membership array may be lazy allocated. */ if (imo->imo_membership == NULL || imo->imo_num_memberships == 0) @@ -399,7 +399,7 @@ imo_match_source(const struct ip_moptions *imo, const size_t gidx, imf = &imo->imo_mfilters[gidx]; /* Source trees are keyed in host byte order. */ - psa = (const sockunion_t *)src; + psa = (sockunion_t *)(uintptr_t)(size_t)src; find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); @@ -448,9 +448,21 @@ imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp, } int -imo_clone(struct ip_moptions *from, struct ip_moptions *to) +imo_clone(struct inpcb *from_inp, struct inpcb *to_inp) { int i, err = 0; + struct ip_moptions *from; + struct ip_moptions *to; + + from = inp_findmoptions(from_inp); + if (from == NULL) + return (ENOMEM); + + to = inp_findmoptions(to_inp); + if (to == NULL) { + IMO_REMREF(from); + return (ENOMEM); + } IMO_LOCK(from); IMO_LOCK(to); @@ -497,16 +509,21 @@ imo_clone(struct ip_moptions *from, struct ip_moptions *to) * Source filtering doesn't apply to OpenTransport socket, * so simply hold additional reference count per membership. 
*/ - for (i = 0; i < from->imo_num_memberships; i++) { - to->imo_membership[i] = from->imo_membership[i]; - INM_ADDREF(from->imo_membership[i]); + for (i = 0; i < from->imo_num_memberships; i++) { + to->imo_membership[i] = + in_addmulti(&from->imo_membership[i]->inm_addr, + from->imo_membership[i]->inm_ifp); + if (to->imo_membership[i] == NULL) + break; to->imo_num_memberships++; } VERIFY(to->imo_num_memberships == from->imo_num_memberships); done: IMO_UNLOCK(to); + IMO_REMREF(to); IMO_UNLOCK(from); + IMO_REMREF(from); return (err); } @@ -1710,7 +1727,11 @@ inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) if (ifp == NULL) return (EADDRNOTAVAIL); - + + if ((size_t) msfr.msfr_nsrcs > + SIZE_MAX / sizeof(struct sockaddr_storage)) + msfr.msfr_nsrcs = SIZE_MAX / sizeof(struct sockaddr_storage); + if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) msfr.msfr_nsrcs = in_mcast_maxsocksrc; @@ -1750,12 +1771,13 @@ inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) tss = NULL; if (tmp_ptr != USER_ADDR_NULL && msfr.msfr_nsrcs > 0) { - tss = _MALLOC(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, + tss = _MALLOC((size_t) msfr.msfr_nsrcs * sizeof(*tss), M_TEMP, M_WAITOK | M_ZERO); if (tss == NULL) { IMO_UNLOCK(imo); return (ENOBUFS); } + bzero(tss, (size_t) msfr.msfr_nsrcs * sizeof(*tss)); } /* @@ -1785,8 +1807,7 @@ inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) IMO_UNLOCK(imo); if (tss != NULL) { - error = copyout(tss, tmp_ptr, - sizeof(struct sockaddr_storage) * ncsrcs); + error = copyout(tss, tmp_ptr, ncsrcs * sizeof(*tss)); FREE(tss, M_TEMP); if (error) return (error); @@ -1980,7 +2001,7 @@ inp_lookup_mcast_ifp(const struct inpcb *inp, unsigned int ifscope = IFSCOPE_NONE; if (inp != NULL && (inp->inp_flags & INP_BOUND_IF)) - ifscope = inp->inp_boundif; + ifscope = inp->inp_boundifp->if_index; bzero(&ro, sizeof (ro)); memcpy(&ro.ro_dst, gsin, sizeof(struct sockaddr_in)); @@ -2673,6 +2694,10 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) memcpy(&msfr, &msfr32, sizeof(msfr)); } + if ((size_t) msfr.msfr_nsrcs > + SIZE_MAX / sizeof(struct sockaddr_storage)) + msfr.msfr_nsrcs = SIZE_MAX / sizeof(struct sockaddr_storage); + if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) return (ENOBUFS); @@ -2742,14 +2767,14 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) IGMP_PRINTF(("%s: loading %lu source list entries\n", __func__, (unsigned long)msfr.msfr_nsrcs)); - kss = _MALLOC(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, + kss = _MALLOC((size_t) msfr.msfr_nsrcs * sizeof(*kss), M_TEMP, M_WAITOK); if (kss == NULL) { error = ENOMEM; goto out_imo_locked; } error = copyin(tmp_ptr, kss, - sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); + (size_t) msfr.msfr_nsrcs * sizeof(*kss)); if (error) { FREE(kss, M_TEMP); goto out_imo_locked; diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c index 51eeca0b3..043057a59 100644 --- a/bsd/netinet/in_pcb.c +++ b/bsd/netinet/in_pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
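Both source-filter paths in in_mcast.c now clamp msfr_nsrcs so that the later `msfr_nsrcs * sizeof (struct sockaddr_storage)` cannot wrap before reaching _MALLOC and copyin/copyout; a wrapped size would allocate a short buffer that the subsequent copy overruns. (A second clamp to in_mcast_maxsocksrc still applies afterward.) The guard in isolation — alloc_array is an invented wrapper for illustration:

    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* clamp a caller-supplied element count so count * size cannot wrap */
    static void *
    alloc_array(uint64_t count, size_t size, size_t *nbytes)
    {
        if ((size_t)count > SIZE_MAX / size)
            count = SIZE_MAX / size; /* as the diff clamps msfr_nsrcs */
        *nbytes = (size_t)count * size;
        return (calloc(1, *nbytes));
    }

    int
    main(void)
    {
        size_t nbytes;
        void *p = alloc_array(UINT64_MAX, 128, &nbytes);

        /* huge request: size is clamped, allocation fails cleanly */
        printf("asked for %zu bytes -> %s\n", nbytes, p ? "ok" : "fail");
        free(p);
        return (0);
    }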
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,6 +79,7 @@ #include #include #include +#include #include @@ -89,6 +90,8 @@ #include #include #include +#include +#include #include #include @@ -99,8 +102,6 @@ #include #endif /* INET6 */ -#include "faith.h" - #if IPSEC #include #include @@ -108,6 +109,7 @@ #include #include +#include #if IPSEC extern int ipsec_bypass; @@ -175,14 +177,79 @@ SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW | C extern int udp_use_randomport; extern int tcp_use_randomport; +/* Structs used for flowhash computation */ +struct inp_flowhash_key_addr { + union { + struct in_addr v4; + struct in6_addr v6; + u_int8_t addr8[16]; + u_int16_t addr16[8]; + u_int32_t addr32[4]; + } infha; +}; + +struct inp_flowhash_key { + struct inp_flowhash_key_addr infh_laddr; + struct inp_flowhash_key_addr infh_faddr; + u_int32_t infh_lport; + u_int32_t infh_fport; + u_int32_t infh_af; + u_int32_t infh_proto; + u_int32_t infh_rand1; + u_int32_t infh_rand2; +}; + +u_int32_t inp_hash_seed = 0; + +static __inline int infc_cmp(const struct inp_fc_entry *, + const struct inp_fc_entry *); +lck_grp_t *inp_lck_grp; +lck_grp_attr_t *inp_lck_grp_attr; +lck_attr_t *inp_lck_attr; +decl_lck_mtx_data(, inp_fc_lck); + +RB_HEAD(inp_fc_tree, inp_fc_entry) inp_fc_tree; +RB_PROTOTYPE(inp_fc_tree, inp_fc_entry, infc_link, infc_cmp); + +RB_GENERATE(inp_fc_tree, inp_fc_entry, infc_link, infc_cmp); + +static unsigned int inp_fcezone_size; +static struct zone *inp_fcezone; +#define INP_FCEZONE_NAME "inp_fcezone" +#define INP_FCEZONE_MAX 32 + /* * in_pcb.c: manage the Protocol Control Blocks. - * - * NOTE: It is assumed that most of these functions will be called at - * splnet(). XXX - There are, unfortunately, a few exceptions to this - * rule that should be fixed. */ +/* + * Initialize data structures required to deliver + * flow advisories. + */ +void +socket_flowadv_init(void) +{ + inp_lck_grp_attr = lck_grp_attr_alloc_init(); + inp_lck_grp = lck_grp_alloc_init("inp_lck_grp", inp_lck_grp_attr); + + inp_lck_attr = lck_attr_alloc_init(); + lck_mtx_init(&inp_fc_lck, inp_lck_grp, inp_lck_attr); + + RB_INIT(&inp_fc_tree); + + inp_fcezone_size = P2ROUNDUP(sizeof (struct inp_fc_entry), + sizeof (u_int64_t)); + inp_fcezone = zinit(inp_fcezone_size, + INP_FCEZONE_MAX * inp_fcezone_size, 0, INP_FCEZONE_NAME); + if (inp_fcezone == NULL) { + panic("%s: failed allocating %s", __func__, + INP_FCEZONE_NAME); + /* NOTREACHED */ + } + zone_change(inp_fcezone, Z_EXPAND, TRUE); + zone_change(inp_fcezone, Z_CALLERACCT, FALSE); +} + /* * Allocate a PCB and associate it with the socket. 
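in_pcb.c grows an inp_flowhash_key built from the local/foreign addresses and ports, address family, protocol, and two random seeds, plus a red-black tree of flow-control entries keyed by the resulting hash. The hash itself is computed by a separate routine in this release, so the sketch below substitutes 64-bit FNV-1a purely to show how such a key struct feeds a hash — that choice is an assumption for illustration, not the kernel's actual function:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>

    struct flowhash_key {            /* shape of struct inp_flowhash_key */
        struct in_addr laddr, faddr;
        uint32_t lport, fport;
        uint32_t af, proto;
        uint32_t rand1, rand2;
    };

    /* stand-in hash: 64-bit FNV-1a over the key bytes */
    static uint64_t
    fnv1a(const void *buf, size_t len)
    {
        const uint8_t *p = buf;
        uint64_t h = 0xcbf29ce484222325ULL;

        while (len-- != 0) {
            h ^= *p++;
            h *= 0x100000001b3ULL;
        }
        return (h);
    }

    int
    main(void)
    {
        struct flowhash_key k;

        memset(&k, 0, sizeof (k));   /* zero padding so hashing is stable */
        k.laddr.s_addr = htonl(INADDR_LOOPBACK);
        k.lport = 80;
        k.fport = 54321;
        k.af = AF_INET;
        k.proto = IPPROTO_TCP;
        k.rand1 = 0x5eed;            /* per-boot random seeds in the kernel */

        printf("flowhash = 0x%016llx\n",
            (unsigned long long)fnv1a(&k, sizeof (k)));
        return (0);
    }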
* @@ -218,7 +285,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, __unused struct proc * #if TEMPDEBUG printf("PCBALLOC reusing PCB for socket %x\n", so); #endif - inp = (struct inpcb *) so->so_saved_pcb; + inp = (struct inpcb *)(void *)so->so_saved_pcb; temp = inp->inp_saved_ppcb; bzero((caddr_t) inp, sizeof(*inp)); inp->inp_saved_ppcb = temp; @@ -375,9 +442,9 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) socket_unlock(so, 0); /* keep reference on socket */ lck_rw_lock_exclusive(pcbinfo->mtx); if (nam) { - unsigned int outif = 0; + struct ifnet *outif = NULL; - sin = (struct sockaddr_in *)nam; + sin = (struct sockaddr_in *)(void *)nam; if (nam->sa_len != sizeof (*sin)) { lck_rw_done(pcbinfo->mtx); socket_lock(so, 0); @@ -415,7 +482,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) } else { IFA_LOCK(ifa); - outif = ifa->ifa_ifp->if_index; + outif = ifa->ifa_ifp; IFA_UNLOCK(ifa); IFA_REMREF(ifa); } @@ -436,7 +503,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) } } #endif - if (so->so_uid && + if (kauth_cred_getuid(so->so_cred) && !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { t = in_pcblookup_local_and_cleanup(inp->inp_pcbinfo, sin->sin_addr, lport, INPLOOKUP_WILDCARD); @@ -445,17 +512,12 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_socket->so_options & SO_REUSEPORT) == 0) && - (so->so_uid != t->inp_socket->so_uid) && - ((t->inp_socket->so_flags & SOF_REUSESHAREUID) == 0)) { -#if INET6 - if (ntohl(sin->sin_addr.s_addr) != - INADDR_ANY || - ntohl(t->inp_laddr.s_addr) != - INADDR_ANY || - INP_SOCKAF(so) == - INP_SOCKAF(t->inp_socket)) -#endif /* INET6 */ - { + (kauth_cred_getuid(so->so_cred) != + kauth_cred_getuid(t->inp_socket->so_cred)) && + ((t->inp_socket->so_flags & SOF_REUSESHAREUID) == 0) && + (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || + ntohl(t->inp_laddr.s_addr) != INADDR_ANY)) + { #ifdef __APPLE_API_PRIVATE if ((t->inp_socket->so_flags & SOF_NOTIFYCONFLICT) && ((so->so_flags & SOF_NOTIFYCONFLICT) == 0)) @@ -471,7 +533,6 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) socket_lock(so, 0); return (EADDRINUSE); - } } } t = in_pcblookup_local_and_cleanup(pcbinfo, sin->sin_addr, @@ -479,13 +540,12 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) if (t && (reuseport & t->inp_socket->so_options) == 0) { #if INET6 - if (ip6_mapped_addr_on == 0 || - ntohl(sin->sin_addr.s_addr) != + if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || - INP_SOCKAF(so) == - INP_SOCKAF(t->inp_socket)) + INP_SOCKAF(so) != AF_INET6 || + INP_SOCKAF(t->inp_socket) != AF_INET6) #endif /* INET6 */ { #ifdef __APPLE_API_PRIVATE @@ -506,7 +566,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) } } inp->inp_laddr = sin->sin_addr; - inp->inp_last_outif = outif; + inp->inp_last_outifp = outif; } if (lport == 0) { u_short first, last; @@ -564,7 +624,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) lck_rw_done(pcbinfo->mtx); socket_lock(so, 0); inp->inp_laddr.s_addr = INADDR_ANY; - inp->inp_last_outif = 0; + inp->inp_last_outifp = NULL; return (EADDRNOTAVAIL); } --*lastport; @@ -588,7 +648,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) lck_rw_done(pcbinfo->mtx); socket_lock(so, 0); inp->inp_laddr.s_addr = INADDR_ANY; - inp->inp_last_outif = 0; + inp->inp_last_outifp = NULL; return (EADDRNOTAVAIL); } 
++*lastport; @@ -604,7 +664,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) if (in_pcbinshash(inp, 1) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; - inp->inp_last_outif = 0; + inp->inp_last_outifp = NULL; lck_rw_done(pcbinfo->mtx); return (EAGAIN); } @@ -631,10 +691,10 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) */ int in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, - struct sockaddr_in *plocal_sin, unsigned int *out_ifscope) + struct sockaddr_in *plocal_sin, struct ifnet **outif) { struct in_ifaddr *ia; - struct sockaddr_in *sin = (struct sockaddr_in *)nam; + struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); @@ -653,9 +713,6 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, * and the primary interface supports broadcast, * choose the broadcast address for that interface. */ -#define satosin(sa) ((struct sockaddr_in *)(sa)) -#define sintosa(sin) ((struct sockaddr *)(sin)) -#define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) IFA_LOCK_SPIN(&ia->ia_ifa); if (sin->sin_addr.s_addr == INADDR_ANY) sin->sin_addr = IA_SIN(ia)->sin_addr; @@ -678,10 +735,10 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, */ ia = (struct in_ifaddr *)0; - if (out_ifscope != NULL && *out_ifscope != IFSCOPE_NONE) - ifscope = *out_ifscope; + if (outif != NULL && *outif != NULL) + ifscope = (*outif)->if_index; else if (inp->inp_flags & INP_BOUND_IF) - ifscope = inp->inp_boundif; + ifscope = inp->inp_boundifp->if_index; nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; /* @@ -710,7 +767,7 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, bzero(&ro->ro_dst, sizeof(struct sockaddr_in)); ro->ro_dst.sa_family = AF_INET; ro->ro_dst.sa_len = sizeof(struct sockaddr_in); - ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = + ((struct sockaddr_in *)(void *)&ro->ro_dst)->sin_addr = sin->sin_addr; rtalloc_scoped(ro, ifscope); if (ro->ro_rt != NULL) @@ -727,6 +784,9 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, RT_UNLOCK(ro->ro_rt); rtfree(ro->ro_rt); ro->ro_rt = NULL; + soevent(inp->inp_socket, + (SO_FILT_HINT_LOCKED | + SO_FILT_HINT_IFDENIED)); } } /* @@ -772,6 +832,9 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, ia->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR) { IFA_REMREF(&ia->ia_ifa); ia = NULL; + soevent(inp->inp_socket, + (SO_FILT_HINT_LOCKED | + SO_FILT_HINT_IFDENIED)); } if (ia == 0) return (EADDRNOTAVAIL); @@ -814,8 +877,8 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, */ IFA_LOCK_SPIN(&ia->ia_ifa); *plocal_sin = ia->ia_addr; - if (out_ifscope != NULL) - *out_ifscope = ia->ia_ifp->if_index; + if (outif != NULL) + *outif = ia->ia_ifp; IFA_UNLOCK(&ia->ia_ifa); IFA_REMREF(&ia->ia_ifa); } @@ -830,17 +893,18 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, * then pick one. */ int -in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p, unsigned int *ifscope) +in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p, + struct ifnet **outif) { struct sockaddr_in ifaddr; - struct sockaddr_in *sin = (struct sockaddr_in *)nam; + struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam; struct inpcb *pcb; int error; /* * Call inner routine, to assign local interface address. 
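The bind-conflict checks above now fetch the socket owner with kauth_cred_getuid(so->so_cred) instead of reading a cached so_uid field, so the comparison always reflects the socket's live credential. A userland sketch of the comparison — kauth_cred_getuid is the real KPI name, but the credential and socket types here are stand-ins:

    #include <stdio.h>
    #include <errno.h>
    #include <sys/types.h>

    struct cred { uid_t cr_uid; };               /* stand-in credential */
    struct sock { struct cred *so_cred; int reuse_share_uid; };

    static uid_t
    cred_getuid(const struct cred *c)            /* models kauth_cred_getuid */
    {
        return (c->cr_uid);
    }

    /* may the new socket bind over the existing one's address/port? */
    static int
    bind_conflict(const struct sock *new_so, const struct sock *old_so)
    {
        if (cred_getuid(new_so->so_cred) != cred_getuid(old_so->so_cred) &&
            !old_so->reuse_share_uid)
            return (EADDRINUSE);
        return (0);
    }

    int
    main(void)
    {
        struct cred a = { 501 }, b = { 502 };
        struct sock s1 = { &a, 0 }, s2 = { &b, 0 };

        printf("conflict = %d\n", bind_conflict(&s2, &s1)); /* EADDRINUSE */
        return (0);
    }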
*/ - if ((error = in_pcbladdr(inp, nam, &ifaddr, ifscope)) != 0) + if ((error = in_pcbladdr(inp, nam, &ifaddr, outif)) != 0) return(error); socket_unlock(inp->inp_socket, 0); @@ -874,7 +938,7 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p, unsigned socket_lock(inp->inp_socket, 0); } inp->inp_laddr = ifaddr.sin_addr; - inp->inp_last_outif = ifscope ? *ifscope : IFSCOPE_NONE; + inp->inp_last_outifp = (outif != NULL) ? *outif : NULL; inp->inp_flags |= INP_INADDR_ANY; } else { @@ -980,9 +1044,9 @@ in_pcbdispose(struct inpcb *inp) lck_rw_assert(ipi->mtx, LCK_RW_ASSERT_EXCLUSIVE); inp->inp_gencnt = ++ipi->ipi_gencnt; - /*### access ipi in in_pcbremlists */ + /* access ipi in in_pcbremlists */ in_pcbremlists(inp); - + if (so) { if (so->so_proto->pr_flags & PR_PCBLOCK) { sofreelastref(so, 0); @@ -1300,7 +1364,7 @@ in_pcblookup_hash_exists( int wildcard, uid_t *uid, gid_t *gid, - __unused struct ifnet *ifp) + struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp; @@ -1309,7 +1373,7 @@ in_pcblookup_hash_exists( *uid = UID_MAX; *gid = GID_MAX; - + /* * We may have found the pcb in the last lookup - check this first. */ @@ -1326,6 +1390,11 @@ in_pcblookup_hash_exists( if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif + if (ip_restrictrecvif && ifp != NULL && + (ifp->if_eflags & IFEF_RESTRICTED_RECV) && + !(inp->inp_flags & INP_RECV_ANYIF)) + continue; + if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && @@ -1334,8 +1403,10 @@ in_pcblookup_hash_exists( /* * Found. */ - *uid = inp->inp_socket->so_uid; - *gid = inp->inp_socket->so_gid; + *uid = kauth_cred_getuid( + inp->inp_socket->so_cred); + *gid = kauth_cred_getgid( + inp->inp_socket->so_cred); } lck_rw_done(pcbinfo->mtx); return (found); @@ -1354,17 +1425,19 @@ in_pcblookup_hash_exists( if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif + if (ip_restrictrecvif && ifp != NULL && + (ifp->if_eflags & IFEF_RESTRICTED_RECV) && + !(inp->inp_flags & INP_RECV_ANYIF)) + continue; + if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_lport == lport) { -#if defined(NFAITH) && NFAITH > 0 - if (ifp && ifp->if_type == IFT_FAITH && - (inp->inp_flags & INP_FAITH) == 0) - continue; -#endif if (inp->inp_laddr.s_addr == laddr.s_addr) { if ((found = (inp->inp_socket != NULL))) { - *uid = inp->inp_socket->so_uid; - *gid = inp->inp_socket->so_gid; + *uid = kauth_cred_getuid( + inp->inp_socket->so_cred); + *gid = kauth_cred_getgid( + inp->inp_socket->so_cred); } lck_rw_done(pcbinfo->mtx); return (found); @@ -1385,8 +1458,10 @@ in_pcblookup_hash_exists( #if INET6 if (local_wild_mapped != NULL) { if ((found = (local_wild_mapped->inp_socket != NULL))) { - *uid = local_wild_mapped->inp_socket->so_uid; - *gid = local_wild_mapped->inp_socket->so_gid; + *uid = kauth_cred_getuid( + local_wild_mapped->inp_socket->so_cred); + *gid = kauth_cred_getgid( + local_wild_mapped->inp_socket->so_cred); } lck_rw_done(pcbinfo->mtx); return (found); @@ -1397,8 +1472,10 @@ in_pcblookup_hash_exists( } if (local_wild != NULL) { if ((found = (local_wild->inp_socket != NULL))) { - *uid = local_wild->inp_socket->so_uid; - *gid = local_wild->inp_socket->so_gid; + *uid = kauth_cred_getuid( + local_wild->inp_socket->so_cred); + *gid = kauth_cred_getgid( + local_wild->inp_socket->so_cred); } lck_rw_done(pcbinfo->mtx); return (found); @@ -1423,7 +1500,7 @@ in_pcblookup_hash( struct in_addr laddr, u_int lport_arg, int wildcard, - __unused struct ifnet *ifp) + struct ifnet *ifp) { struct inpcbhead 
*head; struct inpcb *inp; @@ -1444,6 +1521,11 @@ in_pcblookup_hash( if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif + if (ip_restrictrecvif && ifp != NULL && + (ifp->if_eflags & IFEF_RESTRICTED_RECV) && + !(inp->inp_flags & INP_RECV_ANYIF)) + continue; + if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && @@ -1456,8 +1538,8 @@ in_pcblookup_hash( return (inp); } else { /* it's there but dead, say it isn't found */ - lck_rw_done(pcbinfo->mtx); - return(NULL); + lck_rw_done(pcbinfo->mtx); + return (NULL); } } } @@ -1473,21 +1555,21 @@ in_pcblookup_hash( if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif + if (ip_restrictrecvif && ifp != NULL && + (ifp->if_eflags & IFEF_RESTRICTED_RECV) && + !(inp->inp_flags & INP_RECV_ANYIF)) + continue; + if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_lport == lport) { -#if defined(NFAITH) && NFAITH > 0 - if (ifp && ifp->if_type == IFT_FAITH && - (inp->inp_flags & INP_FAITH) == 0) - continue; -#endif if (inp->inp_laddr.s_addr == laddr.s_addr) { if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) { lck_rw_done(pcbinfo->mtx); return (inp); } else { /* it's there but dead, say it isn't found */ - lck_rw_done(pcbinfo->mtx); - return(NULL); + lck_rw_done(pcbinfo->mtx); + return (NULL); } } else if (inp->inp_laddr.s_addr == INADDR_ANY) { @@ -1509,8 +1591,8 @@ in_pcblookup_hash( return (local_wild_mapped); } else { /* it's there but dead, say it isn't found */ - lck_rw_done(pcbinfo->mtx); - return(NULL); + lck_rw_done(pcbinfo->mtx); + return (NULL); } } #endif /* INET6 */ @@ -1522,8 +1604,8 @@ in_pcblookup_hash( return (local_wild); } else { /* it's there but dead, say it isn't found */ - lck_rw_done(pcbinfo->mtx); - return(NULL); + lck_rw_done(pcbinfo->mtx); + return (NULL); } } @@ -1581,6 +1663,9 @@ in_pcbinshash(struct inpcb *inp, int locked) if (phd->phd_port == inp->inp_lport) break; } + + VERIFY(inp->inp_state != INPCB_STATE_DEAD); + /* * If none exists, malloc one and tack it on. */ @@ -1631,11 +1716,12 @@ in_pcbrehash(struct inpcb *inp) /* * Remove PCB from various lists. + * Must be called pcbinfo lock is held in exclusive mode. */ -//###LOCK must be called with list lock held void in_pcbremlists(struct inpcb *inp) { + struct inp_fc_entry *infce; inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt; if (inp->inp_lport) { @@ -1649,6 +1735,11 @@ in_pcbremlists(struct inpcb *inp) } } LIST_REMOVE(inp, inp_list); + + infce = inp_fc_getinp(inp->inp_flowhash); + if (infce != NULL) + inp_fc_entry_free(infce); + inp->inp_pcbinfo->ipi_count--; } @@ -1763,7 +1854,7 @@ inpcb_to_compat( bzero(inp_compat, sizeof(*inp_compat)); inp_compat->inp_fport = inp->inp_fport; inp_compat->inp_lport = inp->inp_lport; - inp_compat->nat_owner = inp->nat_owner; + inp_compat->nat_owner = 0; inp_compat->nat_cookie = inp->nat_cookie; inp_compat->inp_gencnt = inp->inp_gencnt; inp_compat->inp_flags = inp->inp_flags; @@ -1848,7 +1939,7 @@ inp_route_copyout(struct inpcb *inp, struct route *dst) rtfree(src->ro_rt); src->ro_rt = NULL; } - + route_copyout(dst, src, sizeof(*dst)); } @@ -1869,9 +1960,21 @@ inp_route_copyin(struct inpcb *inp, struct route *src) /* * Handler for setting IP_FORCE_OUT_IFP/IP_BOUND_IF/IPV6_BOUND_IF socket option. 
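[Editor's note] Both lookup paths above gain the same new guard in place of the removed NFAITH block: when the ip_restrictrecvif sysctl is enabled and the packet arrived on an interface flagged IFEF_RESTRICTED_RECV, only PCBs that opted in with INP_RECV_ANYIF may match. A sketch of the predicate, with local stand-in flag values (the real bit values live in the kernel headers):

/*
 * Editor's sketch of the restricted-receive guard added to
 * in_pcblookup_hash() and in_pcblookup_hash_exists(). Flag values
 * are illustrative only.
 */
#include <stdbool.h>
#include <stdint.h>

#define IFEF_RESTRICTED_RECV_F  0x00000001  /* stand-in */
#define INP_RECV_ANYIF_F        0x00000002  /* stand-in */

static bool
skip_pcb_on_restricted_ifp(bool ip_restrictrecvif, bool have_ifp,
    uint32_t if_eflags, uint32_t inp_flags)
{
    return (ip_restrictrecvif && have_ifp &&
        (if_eflags & IFEF_RESTRICTED_RECV_F) &&
        !(inp_flags & INP_RECV_ANYIF_F));
}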
*/ -void +int inp_bindif(struct inpcb *inp, unsigned int ifscope) { + struct ifnet *ifp = NULL; + + ifnet_head_lock_shared(); + if ((ifscope > (unsigned)if_index) || (ifscope != IFSCOPE_NONE && + (ifp = ifindex2ifnet[ifscope]) == NULL)) { + ifnet_head_done(); + return (ENXIO); + } + ifnet_head_done(); + + VERIFY(ifp != NULL || ifscope == IFSCOPE_NONE); + /* * A zero interface scope value indicates an "unbind". * Otherwise, take in whatever value the app desires; @@ -1881,8 +1984,8 @@ inp_bindif(struct inpcb *inp, unsigned int ifscope) * route lookup from this point on will require an * exact match for the embedded interface scope. */ - inp->inp_boundif = ifscope; - if (inp->inp_boundif == IFSCOPE_NONE) + inp->inp_boundifp = ifp; + if (inp->inp_boundifp == NULL) inp->inp_flags &= ~INP_BOUND_IF; else inp->inp_flags |= INP_BOUND_IF; @@ -1892,6 +1995,8 @@ inp_bindif(struct inpcb *inp, unsigned int ifscope) rtfree(inp->inp_route.ro_rt); inp->inp_route.ro_rt = NULL; } + + return (0); } /* @@ -1915,3 +2020,262 @@ inp_nocellular(struct inpcb *inp, unsigned int val) return (0); } + +/* + * Calculate flow hash for an inp, used by an interface to identify a + * flow. When an interface provides flow control advisory, this flow + * hash is used as an identifier. + */ +u_int32_t +inp_calc_flowhash(struct inpcb *inp) +{ + struct inp_flowhash_key fh __attribute__((aligned(8))); + u_int32_t flowhash = 0; + + if (inp_hash_seed == 0) + inp_hash_seed = RandomULong(); + + bzero(&fh, sizeof (fh)); + + bcopy(&inp->inp_dependladdr, &fh.infh_laddr, sizeof (fh.infh_laddr)); + bcopy(&inp->inp_dependfaddr, &fh.infh_faddr, sizeof (fh.infh_faddr)); + + fh.infh_lport = inp->inp_lport; + fh.infh_fport = inp->inp_fport; + fh.infh_af = (inp->inp_vflag & INP_IPV6) ? AF_INET6 : AF_INET; + fh.infh_proto = inp->inp_ip_p; + fh.infh_rand1 = RandomULong(); + fh.infh_rand2 = RandomULong(); + +try_again: + flowhash = net_flowhash(&fh, sizeof (fh), inp_hash_seed); + if (flowhash == 0) { + /* try to get a non-zero flowhash */ + inp_hash_seed = RandomULong(); + goto try_again; + } + + return flowhash; +} + +/* + * Function to compare inp_fc_entries in inp flow control tree + */ +static inline int +infc_cmp(const struct inp_fc_entry *fc1, const struct inp_fc_entry *fc2) +{ + return (fc1->infc_flowhash - fc2->infc_flowhash); +} + +int +inp_fc_addinp(struct inpcb *inp) +{ + struct inp_fc_entry keyfc, *infc; + u_int32_t flowhash = inp->inp_flowhash; + + keyfc.infc_flowhash = flowhash; + + lck_mtx_lock_spin(&inp_fc_lck); + infc = RB_FIND(inp_fc_tree, &inp_fc_tree, &keyfc); + if (infc != NULL && infc->infc_inp == inp) { + /* Entry is already in inp_fc_tree, return */ + lck_mtx_unlock(&inp_fc_lck); + return (1); + } + + if (infc != NULL) { + /* + * There is a different fc entry with the same + * flow hash but different inp pointer. There + * can be a collision on flow hash but the + * probability is low. 
Let's just avoid + * adding a second one when there is a collision + */ + lck_mtx_unlock(&inp_fc_lck); + return (0); + } + + /* become regular mutex */ + lck_mtx_convert_spin(&inp_fc_lck); + + infc = zalloc_noblock(inp_fcezone); + if (infc == NULL) { + /* memory allocation failed */ + lck_mtx_unlock(&inp_fc_lck); + return (0); + } + bzero(infc, sizeof (*infc)); + + infc->infc_flowhash = flowhash; + infc->infc_inp = inp; + + RB_INSERT(inp_fc_tree, &inp_fc_tree, infc); + lck_mtx_unlock(&inp_fc_lck); + return (1); +} + +struct inp_fc_entry* +inp_fc_getinp(u_int32_t flowhash) +{ + struct inp_fc_entry keyfc, *infc; + + keyfc.infc_flowhash = flowhash; + + lck_mtx_lock_spin(&inp_fc_lck); + infc = RB_FIND(inp_fc_tree, &inp_fc_tree, &keyfc); + if (infc == NULL) { + /* inp is not present, return */ + lck_mtx_unlock(&inp_fc_lck); + return (NULL); + } + + RB_REMOVE(inp_fc_tree, &inp_fc_tree, infc); + + if (in_pcb_checkstate(infc->infc_inp, WNT_ACQUIRE, 0) == + WNT_STOPUSING) { + /* become regular mutex */ + lck_mtx_convert_spin(&inp_fc_lck); + + /* + * This inp is going away, just don't process it. + */ + inp_fc_entry_free(infc); + infc = NULL; + } + lck_mtx_unlock(&inp_fc_lck); + + return (infc); +} + +void +inp_fc_entry_free(struct inp_fc_entry *infc) +{ + zfree(inp_fcezone, infc); +} + +void +inp_fc_feedback(struct inpcb *inp) +{ + struct socket *so = inp->inp_socket; + + /* we already hold a want_cnt on this inp, socket can't be null */ + VERIFY (so != NULL); + socket_lock(so, 1); + + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + socket_unlock(so, 1); + return; + } + + /* + * Return if the connection is not in flow-controlled state. + * This can happen if the connection experienced + * loss while it was in flow controlled state + */ + if (!INP_WAIT_FOR_IF_FEEDBACK(inp)) { + socket_unlock(so, 1); + return; + } + inp_reset_fc_state(inp); + + if (so->so_proto->pr_type == SOCK_STREAM) + inp_fc_unthrottle_tcp(inp); + + socket_unlock(so, 1); +} + +void +inp_reset_fc_state(struct inpcb *inp) +{ + struct socket *so = inp->inp_socket; + int suspended = (INP_IS_FLOW_SUSPENDED(inp)) ? 1 : 0; + int needwakeup = (INP_WAIT_FOR_IF_FEEDBACK(inp)) ? 1 : 0; + + inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED); + + if (suspended) { + so->so_flags &= ~(SOF_SUSPENDED); + soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_RESUME)); + } + + if (inp->inp_sndinprog_cnt > 0) + inp->inp_flags |= INP_FC_FEEDBACK; + + /* Give a write wakeup to unblock the socket */ + if (needwakeup) + sowwakeup(so); +} + +int +inp_set_fc_state(struct inpcb *inp, int advcode) +{ + /* + * If there was a feedback from the interface when + * send operation was in progress, we should ignore + * this flow advisory to avoid a race between setting + * flow controlled state and receiving feedback from + * the interface + */ + if (inp->inp_flags & INP_FC_FEEDBACK) + return(0); + + inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED); + if (inp_fc_addinp(inp)) { + switch (advcode) { + case FADV_FLOW_CONTROLLED: + inp->inp_flags |= INP_FLOW_CONTROLLED; + break; + case FADV_SUSPENDED: + inp->inp_flags |= INP_FLOW_SUSPENDED; + soevent(inp->inp_socket, + (SO_FILT_HINT_LOCKED | SO_FILT_HINT_SUSPEND)); + + /* Record the fact that suspend event was sent */ + inp->inp_socket->so_flags |= SOF_SUSPENDED; + break; + } + } + return(1); +} + +/* + * Handler for SO_FLUSH socket option. 
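[Editor's note] inp_calc_flowhash() above hashes a fixed-layout key (addresses, ports, address family, protocol, two random words) under a lazily initialized global seed, then reseeds and retries until the result is nonzero, because zero is reserved to mean "no flowhash". A self-contained model of that pattern, with FNV-1a standing in for the kernel's net_flowhash() (an assumption; the real hash differs):

/*
 * Editor's sketch of the flowhash pattern in inp_calc_flowhash().
 * fnv1a32() is a stand-in for net_flowhash(); what matters here is
 * the lazily seeded, retry-until-nonzero structure.
 */
#include <stdint.h>
#include <stdlib.h>

struct flow_key {
    uint32_t laddr, faddr;      /* local/foreign addresses */
    uint16_t lport, fport;      /* local/foreign ports */
    uint8_t  af, proto;         /* address family, IP protocol */
    uint32_t rand1, rand2;      /* per-call random salt */
};

static uint32_t hash_seed;      /* analog of inp_hash_seed */

static uint32_t
fnv1a32(const void *buf, size_t len, uint32_t seed)
{
    const uint8_t *p = buf;
    uint32_t h = 2166136261u ^ seed;

    while (len-- != 0) {
        h ^= *p++;
        h *= 16777619u;
    }
    return (h);
}

static uint32_t
calc_flowhash(const struct flow_key *key)
{
    uint32_t h;

    if (hash_seed == 0)
        hash_seed = arc4random();
    /* zero means "no flowhash", so reseed until nonzero */
    while ((h = fnv1a32(key, sizeof (*key), hash_seed)) == 0)
        hash_seed = arc4random();
    return (h);
}

One caveat worth flagging in the tree code above: infc_cmp() orders nodes by u_int32_t subtraction cast to int, which is not a transitive ordering once hashes span the full 32-bit range; a conventional three-way compare ((a < b) ? -1 : (a > b)) avoids that pitfall.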
+ */ +int +inp_flush(struct inpcb *inp, int optval) +{ + u_int32_t flowhash = inp->inp_flowhash; + struct rtentry *rt; + + /* Either all classes or one of the valid ones */ + if (optval != SO_TC_ALL && !SO_VALID_TC(optval)) + return (EINVAL); + + /* We need a flow hash for identification */ + if (flowhash == 0) + return (0); + + /* We need a cached route for the interface */ + if ((rt = inp->inp_route.ro_rt) != NULL) { + struct ifnet *ifp = rt->rt_ifp; + if_qflush_sc(ifp, so_tc2msc(optval), flowhash, NULL, NULL, 0); + } + + return (0); +} + +/* + * Clear the INP_INADDR_ANY flag (special case for PPP only) + */ +void inp_clear_INP_INADDR_ANY(struct socket *so) +{ + struct inpcb *inp = NULL; + + socket_lock(so, 1); + inp = sotoinpcb(so); + if (inp) { + inp->inp_flags &= ~INP_INADDR_ANY; + } + socket_unlock(so, 1); +} + diff --git a/bsd/netinet/in_pcb.h b/bsd/netinet/in_pcb.h index 728b93e33..63dddb8fd 100644 --- a/bsd/netinet/in_pcb.h +++ b/bsd/netinet/in_pcb.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -74,9 +74,10 @@ #include #include #ifdef KERNEL_PRIVATE -#ifdef KERNEL +#ifdef BSD_KERNEL_PRIVATE +#include +#endif /* BSD_KERNEL_PRIVATE */ #include -#endif #endif /* KERNEL_PRIVATE */ #include /* for IPSEC */ @@ -85,7 +86,9 @@ #define in6pcb inpcb /* for KAME src sync over BSD*'s */ #define in6p_sp inp_sp /* for KAME src sync over BSD*'s */ +#endif /* KERNEL_PRIVATE */ +#ifdef BSD_KERNEL_PRIVATE /* * Common structure pcb for internet protocol implementation. * Here are stored pointers to local and foreign host table @@ -95,9 +98,9 @@ */ LIST_HEAD(inpcbhead, inpcb); LIST_HEAD(inpcbporthead, inpcbport); -#endif /* KERNEL_PRIVATE */ -typedef u_quad_t inp_gen_t; +#endif /* BSD_KERNEL_PRIVATE */ +typedef u_quad_t inp_gen_t; /* * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet. * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing @@ -108,7 +111,7 @@ struct in_addr_4in6 { struct in_addr ia46_addr4; }; -#ifdef KERNEL_PRIVATE +#ifdef KERNEL_PRIVATE /* * NB: the zone allocator is type-stable EXCEPT FOR THE FIRST TWO LONGS * of the structure. Therefore, it is important that the members in @@ -119,34 +122,44 @@ struct icmp6_filter; #if CONFIG_MACF_NET struct label; #endif +struct ifnet; + +#ifdef BSD_KERNEL_PRIVATE +/* Flow control entry per socket */ +struct inp_fc_entry { + RB_ENTRY(inp_fc_entry) infc_link; + u_int32_t infc_flowhash; + struct inpcb *infc_inp; +}; +#endif /* BSD_KERNEL_PRIVATE */ -struct inp_stat -{ +struct inp_stat { u_int64_t rxpackets; u_int64_t rxbytes; u_int64_t txpackets; u_int64_t txbytes; }; + struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* hash list */ - int inp_wantcnt; /* pcb wanted count. protected by pcb list lock */ - int inp_state; /* state of this pcb, in use, recycled, ready for recycling... */ + int inp_wantcnt; /* pcb wanted count. protected by pcb list lock */ + int inp_state; /* state of this pcb, in use, recycled, ready for recycling... 
*/ u_short inp_fport; /* foreign port */ u_short inp_lport; /* local port */ LIST_ENTRY(inpcb) inp_list; /* list for all PCBs of this proto */ void *inp_ppcb; /* pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* PCB list info */ struct socket *inp_socket; /* back pointer to socket */ - u_char nat_owner; /* Used to NAT TCP/UDP traffic */ u_int32_t nat_cookie; /* Cookie stored and returned to NAT */ LIST_ENTRY(inpcb) inp_portlist; /* list for this PCB's local port */ struct inpcbport *inp_phd; /* head of this list */ inp_gen_t inp_gencnt; /* generation count of this instance */ - int inp_flags; /* generic IP/datagram flags */ + u_int32_t inp_flags; /* generic IP/datagram flags */ u_int32_t inp_flow; - u_char inp_vflag; /* INP_IPV4 or INP_IPV6 */ + u_char inp_sndinprog_cnt; /* outstanding send operations */ + u_char inp_vflag; /* INP_IPV4 or INP_IPV6 */ u_char inp_ip_ttl; /* time to live proto */ u_char inp_ip_p; /* protocol proto */ @@ -196,14 +209,16 @@ struct inpcb { caddr_t inp_saved_ppcb; /* place to save pointer while cached */ struct inpcbpolicy *inp_sp; decl_lck_mtx_data( ,inpcb_mtx); /* inpcb per-socket mutex */ - unsigned int inp_boundif; /* interface scope for INP_BOUND_IF */ - unsigned int inp_last_outif; /* last known outgoing interface */ + struct ifnet *inp_boundifp; /* interface for INP_BOUND_IF */ + struct ifnet *inp_last_outifp; /* last known outgoing interface */ u_int32_t inp_reserved[2]; /* reserved for future use */ + u_int32_t inp_flowhash; /* flow hash */ + #if CONFIG_MACF_NET struct label *inp_label; /* MAC label */ #endif struct inp_stat *inp_stat; - u_int8_t inp_stat_store[sizeof(struct inp_stat) + sizeof(u_int64_t)]; + u_int8_t inp_stat_store[sizeof(struct inp_stat) + sizeof(u_int64_t)]; }; #endif /* KERNEL_PRIVATE */ @@ -422,6 +437,7 @@ struct xinpcb_n { u_short inp6_ifindex; short inp6_hops; } inp_depend6; + u_int32_t inp_flowhash; }; #endif /* PRIVATE */ @@ -442,12 +458,14 @@ struct xinpgen { #define INP_IPV6 0x2 #define inp_faddr inp_dependfaddr.inp46_foreign.ia46_addr4 #define inp_laddr inp_dependladdr.inp46_local.ia46_addr4 +#define in6p_faddr inp_dependfaddr.inp6_foreign +#define in6p_laddr inp_dependladdr.inp6_local + +#ifdef BSD_KERNEL_PRIVATE #define inp_route inp_dependroute.inp4_route #define inp_ip_tos inp_depend4.inp4_ip_tos #define inp_options inp_depend4.inp4_options #define inp_moptions inp_depend4.inp4_moptions -#define in6p_faddr inp_dependfaddr.inp6_foreign -#define in6p_laddr inp_dependladdr.inp6_local #define in6p_route inp_dependroute.inp6_route #define in6p_ip6_hlim inp_depend6.inp6_hlim #define in6p_hops inp_depend6.inp6_hops /* default hop limit */ @@ -462,14 +480,19 @@ struct xinpgen { #define in6p_ifindex inp_depend6.inp6_ifindex #define in6p_flags inp_flags /* for KAME src sync over BSD*'s */ #define in6p_socket inp_socket /* for KAME src sync over BSD*'s */ +#endif /* BSD_KERNEL_PRIVATE */ + #define in6p_lport inp_lport /* for KAME src sync over BSD*'s */ #define in6p_fport inp_fport /* for KAME src sync over BSD*'s */ #define in6p_ppcb inp_ppcb /* for KAME src sync over BSD*'s */ + +#ifdef BSD_KERNEL_PRIVATE #define in6p_state inp_state #define in6p_wantcnt inp_wantcnt -#define in6p_last_outif inp_last_outif +#define in6p_last_outifp inp_last_outifp +#endif /* BSD_KERNEL_PRIVATE */ -#ifdef KERNEL_PRIVATE +#ifdef BSD_KERNEL_PRIVATE struct inpcbport { LIST_ENTRY(inpcbport) phd_hash; struct inpcbhead phd_pcblist; @@ -479,18 +502,18 @@ struct inpcbport { struct inpcbinfo { /* XXX documentation, prefixes */ struct inpcbhead 
*hashbase; #ifdef __APPLE__ - u_int32_t hashsize; /* in elements */ + u_int32_t hashsize; /* in elements */ #endif - u_long hashmask; /* needs to be u_long as expected by hash functions */ + u_long hashmask; /* u_long as expected by hash functions */ struct inpcbporthead *porthashbase; - u_long porthashmask; /* needs to be u_long as expected by hash functions */ + u_long porthashmask; /* u_long as expected by hash functions */ struct inpcbhead *listhead; u_short lastport; u_short lastlow; u_short lasthi; - void *ipi_zone; /* zone to allocate pcbs from */ - u_int ipi_count; /* number of pcbs in this list */ - u_quad_t ipi_gencnt; /* current generation count */ + void *ipi_zone; /* zone to allocate pcbs from */ + u_int ipi_count; /* number of pcbs in this list */ + u_quad_t ipi_gencnt; /* current generation count */ #ifdef __APPLE__ #ifdef _KERN_LOCKS_H_ lck_attr_t *mtx_attr; /* mutex attributes */ @@ -511,29 +534,44 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #define INP_PCBPORTHASH(lport, mask) \ (ntohs((lport)) & (mask)) -#endif /* KERNEL_PRIVATE */ +#define INP_IS_FLOW_CONTROLLED(_inp_) ((_inp_)->inp_flags & INP_FLOW_CONTROLLED) +#define INP_IS_FLOW_SUSPENDED(_inp_) \ + (((_inp_)->inp_flags & INP_FLOW_SUSPENDED) || \ + ((_inp_)->inp_socket->so_flags & SOF_SUSPENDED)) +#define INP_WAIT_FOR_IF_FEEDBACK(_inp_) \ + (((_inp_)->inp_flags & (INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED)) != 0) + +#endif /* BSD_KERNEL_PRIVATE */ /* flags in inp_flags: */ +#ifdef BSD_KERNEL_PRIVATE #define INP_RECVOPTS 0x01 /* receive incoming IP options */ #define INP_RECVRETOPTS 0x02 /* receive IP options for reply */ #define INP_RECVDSTADDR 0x04 /* receive IP dst address */ #define INP_HDRINCL 0x08 /* user supplies entire IP header */ #define INP_HIGHPORT 0x10 /* user wants "high" port binding */ #define INP_LOWPORT 0x20 /* user wants "low" port binding */ +#endif /* BSD_KERNEL_PRIVATE */ #define INP_ANONPORT 0x40 /* port chosen for user */ +#ifdef BSD_KERNEL_PRIVATE #define INP_RECVIF 0x80 /* receive incoming interface */ #define INP_MTUDISC 0x100 /* user can do MTU discovery */ #ifdef __APPLE__ #define INP_STRIPHDR 0x200 /* Strip headers in raw_ip, for OT support */ #endif -#define INP_FAITH 0x400 /* accept FAITH'ed connections */ +#define INP_RECV_ANYIF 0x400 /* don't restrict inbound interface */ +#endif /* BSD_KERNEL_PRIVATE */ #define INP_INADDR_ANY 0x800 /* local address wasn't specified */ +#ifdef BSD_KERNEL_PRIVATE #define INP_RECVTTL 0x1000 #define INP_UDP_NOCKSUM 0x2000 /* Turn off outbound UDP checksum */ #define INP_BOUND_IF 0x4000 /* bind socket to an ifindex */ +#endif /* BSD_KERNEL_PRIVATE */ #define IN6P_IPV6_V6ONLY 0x8000 /* restrict AF_INET6 socket for v6 */ + +#ifdef BSD_KERNEL_PRIVATE #define IN6P_PKTINFO 0x10000 /* receive IP6 dst and I/F */ #define IN6P_HOPLIMIT 0x20000 /* receive hoplimit */ #define IN6P_HOPOPTS 0x40000 /* receive hop-by-hop options */ @@ -542,14 +580,19 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #define IN6P_RTHDRDSTOPTS 0x200000 /* receive dstoptions before rthdr */ #define IN6P_TCLASS 0x400000 /* receive traffic class value */ #define IN6P_AUTOFLOWLABEL 0x800000 /* attach flowlabel automatically */ +#endif /* BSD_KERNEL_PRIVATE */ + #define IN6P_BINDV6ONLY 0x1000000 /* do not grab IPv4 traffic */ + +#ifdef BSD_KERNEL_PRIVATE #define IN6P_RFC2292 0x2000000 /* used RFC2292 API on the socket */ #define IN6P_MTU 0x4000000 /* receive path MTU */ #define INP_PKTINFO 0x8000000 /* receive and send PKTINFO for IPv4 */ - +#define INP_FLOW_SUSPENDED 
0x10000000 /* flow suspended */ #define INP_NO_IFT_CELLULAR 0x20000000 /* do not use IFT_CELLULAR route */ +#define INP_FLOW_CONTROLLED 0x40000000 /* flow controlled */ +#define INP_FC_FEEDBACK 0x80000000 /* got interface flow adv feedback */ -#ifdef KERNEL_PRIVATE #define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ INP_RECVIF|INP_RECVTTL|INP_PKTINFO|\ IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\ @@ -565,7 +608,7 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #define IN6P_ANONPORT INP_ANONPORT #define IN6P_RECVIF INP_RECVIF #define IN6P_MTUDISC INP_MTUDISC -#define IN6P_FAITH INP_FAITH +#define IN6P_RECV_ANYIF INP_RECV_ANYIF #define IN6P_CONTROLOPTS INP_CONTROLOPTS #define IN6P_NO_IFT_CELLULAR INP_NO_IFT_CELLULAR /* @@ -580,25 +623,31 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #define INPCB_OWNED_BY_X 0x80 #define INPCB_MAX_IDS 7 #endif /* __APPLE__ */ +#endif /* BSD_KERNEL_PRIVATE */ #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) #define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ +#ifdef BSD_KERNEL_PRIVATE #define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family #define INP_SOCKTYPE(so) so->so_proto->pr_type #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) +#define INP_CHECK_SOCKTYPE(so, type) (INP_SOCKTYPE(so) == type) -#ifdef KERNEL extern int ipport_lowfirstauto; extern int ipport_lowlastauto; extern int ipport_firstauto; extern int ipport_lastauto; +#endif /* BSD_KERNEL_PRIVATE */ + extern int ipport_hifirstauto; extern int ipport_hilastauto; struct sysctl_req; +#ifdef BSD_KERNEL_PRIVATE + #define INPCB_STATE_INUSE 0x1 /* freshly allocated PCB, it's in use */ #define INPCB_STATE_CACHED 0x2 /* this pcb is sitting in a a cache */ #define INPCB_STATE_DEAD 0x3 /* should treat as gone, will be garbage collected and freed */ @@ -611,13 +660,14 @@ extern void in_losing(struct inpcb *); extern void in_rtchange(struct inpcb *, int); extern int in_pcballoc(struct socket *, struct inpcbinfo *, struct proc *); extern int in_pcbbind(struct inpcb *, struct sockaddr *, struct proc *); -extern int in_pcbconnect(struct inpcb *, struct sockaddr *, struct proc *, unsigned int *); +extern int in_pcbconnect(struct inpcb *, struct sockaddr *, struct proc *, + struct ifnet **); extern void in_pcbdetach(struct inpcb *); extern void in_pcbdispose (struct inpcb *); extern void in_pcbdisconnect(struct inpcb *); extern int in_pcbinshash(struct inpcb *, int); extern int in_pcbladdr(struct inpcb *, struct sockaddr *, - struct sockaddr_in *, unsigned int *); + struct sockaddr_in *, struct ifnet **); extern struct inpcb *in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_int, int); extern struct inpcb *in_pcblookup_local_and_cleanup(struct inpcbinfo *, @@ -641,12 +691,29 @@ extern void inpcb_to_xinpcb64(struct inpcb *inp, struct xinpcb64 *xinp); #endif extern int get_pcblist_n(short , struct sysctl_req *, struct inpcbinfo *); +extern void inpcb_get_ports_used(unsigned int , uint8_t *, struct inpcbinfo *); + +#define INPCB_OPPORTUNISTIC_THROTTLEON 0x0001 +#define INPCB_OPPORTUNISTIC_SETCMD 0x0002 +extern uint32_t inpcb_count_opportunistic(unsigned int , struct inpcbinfo *, u_int32_t); extern void inp_route_copyout(struct inpcb *, struct route *); extern void inp_route_copyin(struct inpcb *, struct route *); -extern void inp_bindif(struct inpcb *, unsigned int); +extern int inp_bindif(struct inpcb *, unsigned int); extern int inp_nocellular(struct inpcb *, unsigned int); +extern u_int32_t inp_calc_flowhash(struct inpcb *); +extern void 
socket_flowadv_init(void); +extern int inp_fc_addinp(struct inpcb *); +extern struct inp_fc_entry *inp_fc_getinp(u_int32_t); +extern void inp_fc_entry_free(struct inp_fc_entry *); +extern void inp_fc_feedback(struct inpcb *); +extern void inp_reset_fc_state(struct inpcb *); +extern int inp_set_fc_state(struct inpcb *, int advcode); +extern void inp_fc_unthrottle_tcp(struct inpcb *); +extern int inp_flush(struct inpcb *, int); +#endif /* BSD_KERNEL_PRIVATE */ -#endif /* KERNEL */ +#ifdef KERNEL_PRIVATE +extern void inp_clear_INP_INADDR_ANY(struct socket *so); #endif /* KERNEL_PRIVATE */ #endif /* !_NETINET_IN_PCB_H_ */ diff --git a/bsd/netinet/in_pcblist.c b/bsd/netinet/in_pcblist.c index 9ff8839b5..00ef00324 100644 --- a/bsd/netinet/in_pcblist.c +++ b/bsd/netinet/in_pcblist.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 Apple Inc. All rights reserved. + * Copyright (c) 2010-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -67,8 +67,10 @@ #include #include #include +#include #include +#include #include #include @@ -124,7 +126,7 @@ sotoxsocket_n(struct socket *so, struct xsocket_n *xso) xso->so_error = so->so_error; xso->so_pgid = so->so_pgid; xso->so_oobmark = so->so_oobmark; - xso->so_uid = so->so_uid; + xso->so_uid = kauth_cred_getuid(so->so_cred); } } @@ -186,6 +188,7 @@ inpcb_to_xinpcb_n(struct inpcb *inp, struct xinpcb_n *xinp) xinp->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum; xinp->inp_depend6.inp6_ifindex = inp->inp_depend6.inp6_ifindex; xinp->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops; + xinp->inp_flowhash = inp->inp_flowhash; } __private_extern__ void @@ -381,3 +384,69 @@ done: return error; } +__private_extern__ void +inpcb_get_ports_used(unsigned int ifindex, uint8_t *bitfield, struct inpcbinfo *pcbinfo) +{ + lck_rw_lock_shared(pcbinfo->mtx); + + struct inpcb *inp; + inp_gen_t gencnt = pcbinfo->ipi_gencnt; + for (inp = LIST_FIRST(pcbinfo->listhead); inp; inp = LIST_NEXT(inp, inp_list)) { + if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD && + (ifindex == 0 || inp->inp_last_outifp == NULL || ifindex == inp->inp_last_outifp->if_index)) { + uint16_t port = ntohs(inp->inp_lport); + bitfield[port / 8] |= 1 << (port & 0x7); + } + } + + lck_rw_done(pcbinfo->mtx); +} + +__private_extern__ uint32_t +inpcb_count_opportunistic(unsigned int ifindex, struct inpcbinfo *pcbinfo, + u_int32_t flags) +{ + uint32_t opportunistic = 0; + + lck_rw_lock_shared(pcbinfo->mtx); + + struct inpcb *inp; + inp_gen_t gencnt = pcbinfo->ipi_gencnt; + for (inp = LIST_FIRST(pcbinfo->listhead); + inp; inp = LIST_NEXT(inp, inp_list)) { + if (inp->inp_gencnt <= gencnt && + inp->inp_state != INPCB_STATE_DEAD && + inp->inp_socket != NULL && + so_get_opportunistic(inp->inp_socket) && + inp->inp_last_outifp != NULL && + ifindex == inp->inp_last_outifp->if_index) { + opportunistic++; + struct socket *so = inp->inp_socket; + if ((flags & INPCB_OPPORTUNISTIC_SETCMD) && + (so->so_state & SS_ISCONNECTED)) { + socket_lock(so, 1); + if (flags & INPCB_OPPORTUNISTIC_THROTTLEON) { + so->so_flags |= SOF_SUSPENDED; + soevent(so, + (SO_FILT_HINT_LOCKED | + SO_FILT_HINT_SUSPEND)); + } else { + so->so_flags &= ~(SOF_SUSPENDED); + soevent(so, + (SO_FILT_HINT_LOCKED | + SO_FILT_HINT_RESUME)); + } + SOTHROTTLELOG(("throttle[%d]: so %p [%d,%d] " + "%s\n", so->last_pid, so, INP_SOCKAF(so), + INP_SOCKTYPE(so), + (so->so_flags & SOF_SUSPENDED) ? 
+ "SUSPENDED" : "RESUMED")); + socket_unlock(so, 1); + } + } + } + + lck_rw_done(pcbinfo->mtx); + + return (opportunistic); +} diff --git a/bsd/netinet/in_proto.c b/bsd/netinet/in_proto.c index 7c979683a..f3f3be8e7 100644 --- a/bsd/netinet/in_proto.c +++ b/bsd/netinet/in_proto.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -213,7 +213,7 @@ struct protosw inetsw[] = { encap_init, 0, 0, 0, 0, &rip_usrreqs, - 0, 0, 0, { 0, 0 }, 0, { 0 } + 0, rip_unlock, 0, { 0, 0 }, 0, { 0 } }, # if INET6 { SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR, @@ -222,7 +222,7 @@ struct protosw inetsw[] = { encap_init, 0, 0, 0, 0, &rip_usrreqs, - 0, 0, 0, { 0, 0 }, 0, { 0 } + 0, rip_unlock, 0, { 0, 0 }, 0, { 0 } }, #endif #if IPDIVERT @@ -242,7 +242,7 @@ struct protosw inetsw[] = { 0, 0, 0, 0, 0, &rip_usrreqs, - 0, 0, 0, { 0, 0 }, 0, { 0 } + 0, rip_unlock, 0, { 0, 0 }, 0, { 0 } }, #endif #if NSIP @@ -252,7 +252,7 @@ struct protosw inetsw[] = { 0, 0, 0, 0, 0, &rip_usrreqs, - 0, 0, 0, { 0, 0 }, 0, { 0 } + 0, rip_unlock, 0, { 0, 0 }, 0, { 0 } }, #endif /* raw wildcard */ diff --git a/bsd/netinet/in_rmx.c b/bsd/netinet/in_rmx.c index 2d0c2735d..ca9a4247e 100644 --- a/bsd/netinet/in_rmx.c +++ b/bsd/netinet/in_rmx.c @@ -106,7 +106,7 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, struct radix_node *treenodes) { struct rtentry *rt = (struct rtentry *)treenodes; - struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt); + struct sockaddr_in *sin = (struct sockaddr_in *)(void *)rt_key(rt); struct radix_node *ret; lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); @@ -145,11 +145,9 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, /* Become a regular mutex */ RT_CONVERT_LOCK(rt); IFA_LOCK_SPIN(rt->rt_ifa); -#define satosin(sa) ((struct sockaddr_in *)sa) if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr == sin->sin_addr.s_addr) rt->rt_flags |= RTF_LOCAL; -#undef satosin IFA_UNLOCK(rt->rt_ifa); } } @@ -211,8 +209,9 @@ in_validate(struct radix_node *rn) /* It's one of ours; unexpire it */ rt->rt_flags &= ~RTPRF_OURS; rt_setexpire(rt, 0); - } else if ((rt->rt_flags & RTF_LLINFO) && - (rt->rt_flags & RTF_HOST) && rt->rt_gateway != NULL && + } else if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) == + (RTF_LLINFO | RTF_HOST) && rt->rt_llinfo != NULL && + rt->rt_gateway != NULL && rt->rt_gateway->sa_family == AF_LINK) { /* It's ARP; let it be handled there */ arp_validate(rt); diff --git a/bsd/netinet/in_tclass.c b/bsd/netinet/in_tclass.c index 54b5fcc1d..02d9ccc86 100644 --- a/bsd/netinet/in_tclass.c +++ b/bsd/netinet/in_tclass.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2011 Apple Inc. All rights reserved. + * Copyright (c) 2009-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -60,32 +60,32 @@ extern char *proc_name_address(void *p); static int tfp_count = 0; -static TAILQ_HEAD(, tclass_for_proc) tfp_head = TAILQ_HEAD_INITIALIZER(tfp_head); +static TAILQ_HEAD(, tclass_for_proc) tfp_head = + TAILQ_HEAD_INITIALIZER(tfp_head); struct tclass_for_proc { TAILQ_ENTRY(tclass_for_proc) tfp_link; - int tfp_class; - pid_t tfp_pid; - char tfp_pname[MAXCOMLEN + 1]; + int tfp_class; + pid_t tfp_pid; + char tfp_pname[MAXCOMLEN + 1]; }; -extern void tcp_set_background_cc(struct socket *); -extern void tcp_set_foreground_cc(struct socket *); - -int dscp_code_from_mbuf_tclass(int ); - -static int get_pid_tclass(pid_t , int *); -static int get_pname_tclass(const char * , int *); -static int set_pid_tclass(pid_t , int ); -static int set_pname_tclass(const char * , int ); +static int dscp_code_from_mbuf_tclass(mbuf_traffic_class_t); +static int get_pid_tclass(struct so_tcdbg *); +static int get_pname_tclass(struct so_tcdbg *); +static int set_pid_tclass(struct so_tcdbg *); +static int set_pname_tclass(struct so_tcdbg *); +static int flush_pid_tclass(struct so_tcdbg *); static int purge_tclass_for_proc(void); static int flush_tclass_for_proc(void); +static void so_set_lro(struct socket*, int); +int get_tclass_for_curr_proc(int *); - -static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */ -static lck_grp_t *tclass_lck_grp = NULL; /* mutex group definition */ -static lck_attr_t *tclass_lck_attr = NULL; /* mutex attributes */ -static lck_mtx_t *tclass_lock = NULL; +static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */ +static lck_grp_t *tclass_lck_grp = NULL; /* mutex group definition */ +static lck_attr_t *tclass_lck_attr = NULL; /* mutex attributes */ +decl_lck_mtx_data(static, tclass_lock_data); +static lck_mtx_t *tclass_lock = &tclass_lock_data; /* * Must be called with tclass_lock held @@ -94,12 +94,12 @@ static struct tclass_for_proc * find_tfp_by_pid(pid_t pid) { struct tclass_for_proc *tfp; - + TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { if (tfp->tfp_pid == pid) break; } - return tfp; + return (tfp); } /* @@ -109,36 +109,39 @@ static struct tclass_for_proc * find_tfp_by_pname(const char *pname) { struct tclass_for_proc *tfp; - + TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { - if (strncmp(pname, tfp->tfp_pname, sizeof(tfp->tfp_pname)) == 0) + if (strncmp(pname, tfp->tfp_pname, + sizeof (tfp->tfp_pname)) == 0) break; } - return tfp; + return (tfp); } -static int -get_tclass_for_curr_proc(void) +__private_extern__ int +get_tclass_for_curr_proc(int *sotc) { - struct tclass_for_proc *tfp; - int sotc = SO_TC_BE; + struct tclass_for_proc *tfp = NULL; proc_t p = current_proc(); /* Not ref counted */ pid_t pid = proc_pid(p); char *pname = proc_name_address(p); - + + *sotc = -1; + lck_mtx_lock(tclass_lock); - + TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { - if ((tfp->tfp_pid == pid) || - (tfp->tfp_pid == -1 && strncmp(pname, tfp->tfp_pname, sizeof(tfp->tfp_pname)) == 0)) { - sotc = tfp->tfp_class; + if ((tfp->tfp_pid == pid) || (tfp->tfp_pid == -1 && + strncmp(pname, tfp->tfp_pname, + sizeof (tfp->tfp_pname)) == 0)) { + *sotc = tfp->tfp_class; break; - } + } } lck_mtx_unlock(tclass_lock); - return sotc; + return ((tfp == NULL) ? 
0 : 1); } /* @@ -154,13 +157,13 @@ purge_tclass_for_proc(void) TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) { proc_t p; - + if (tfp->tfp_pid == -1) continue; if ((p = proc_find(tfp->tfp_pid)) == NULL) { tfp_count--; TAILQ_REMOVE(&tfp_head, tfp, tfp_link); - + _FREE(tfp, M_TEMP); } else { proc_rele(p); @@ -168,8 +171,8 @@ purge_tclass_for_proc(void) } lck_mtx_unlock(tclass_lock); - - return error; + + return (error); } /* @@ -200,10 +203,10 @@ flush_tclass_for_proc(void) TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) { free_tclass_for_proc(tfp); } - + lck_mtx_unlock(tclass_lock); - - return error; + + return (error); } @@ -211,40 +214,39 @@ flush_tclass_for_proc(void) * Must be called with tclass_lock held */ static struct tclass_for_proc * -alloc_tclass_for_proc(pid_t pid, const char *pname, int tclass) +alloc_tclass_for_proc(pid_t pid, const char *pname) { struct tclass_for_proc *tfp; - + if (pid == -1 && pname == NULL) - return NULL; + return (NULL); - tfp = _MALLOC(sizeof(struct tclass_for_proc), M_TEMP, M_NOWAIT | M_ZERO); + tfp = _MALLOC(sizeof (struct tclass_for_proc), M_TEMP, M_NOWAIT|M_ZERO); if (tfp == NULL) - return NULL; - + return (NULL); + tfp->tfp_pid = pid; - tfp->tfp_class = tclass; /* - * Add per pid entries before per proc name so we can find + * Add per pid entries before per proc name so we can find * a specific instance of a process before the general name base entry. */ if (pid != -1) { TAILQ_INSERT_HEAD(&tfp_head, tfp, tfp_link); } else { - strlcpy(tfp->tfp_pname, pname, sizeof(tfp->tfp_pname)); + strlcpy(tfp->tfp_pname, pname, sizeof (tfp->tfp_pname)); TAILQ_INSERT_TAIL(&tfp_head, tfp, tfp_link); } - + tfp_count++; - return tfp; + return (tfp); } /* * -1 for tclass means to remove the entry */ -int -set_pid_tclass(pid_t pid, int tclass) +int +set_pid_tclass(struct so_tcdbg *so_tcdbg) { int error = EINVAL; proc_t p = NULL; @@ -252,205 +254,279 @@ set_pid_tclass(pid_t pid, int tclass) struct fileproc *fp; struct tclass_for_proc *tfp; int i; + pid_t pid = so_tcdbg->so_tcdbg_pid; + int tclass = so_tcdbg->so_tcdbg_tclass; p = proc_find(pid); if (p == NULL) { - printf("set_pid_tclass proc_find(%d) \n", pid); + printf("%s proc_find(%d) failed\n", __func__, pid); goto done; } - + /* Need a tfp */ lck_mtx_lock(tclass_lock); - + tfp = find_tfp_by_pid(pid); - if (tclass == -1) { - if (tfp != NULL) { - free_tclass_for_proc(tfp); - error = 0; - } - lck_mtx_unlock(tclass_lock); - goto done; - } else { + if (tfp == NULL) { + tfp = alloc_tclass_for_proc(pid, NULL); if (tfp == NULL) { - tfp = alloc_tclass_for_proc(pid, NULL, tclass); - if (tfp == NULL) { - lck_mtx_unlock(tclass_lock); - error = ENOBUFS; - goto done; - } - } else { - tfp->tfp_class = tclass; + lck_mtx_unlock(tclass_lock); + error = ENOBUFS; + goto done; } } + tfp->tfp_class = tclass; + lck_mtx_unlock(tclass_lock); if (tfp != NULL) { proc_fdlock(p); - + fdp = p->p_fd; for (i = 0; i < fdp->fd_nfiles; i++) { struct socket *so; - + fp = fdp->fd_ofiles[i]; - if (fp == NULL || (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 || - fp->f_fglob->fg_type != DTYPE_SOCKET) + if (fp == NULL || + (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 || + fp->f_fglob->fg_type != DTYPE_SOCKET) continue; - + so = (struct socket *)fp->f_fglob->fg_data; - if (so->so_proto->pr_domain->dom_family != AF_INET && - so->so_proto->pr_domain->dom_family != AF_INET6) + if (so->so_proto->pr_domain->dom_family != AF_INET && + so->so_proto->pr_domain->dom_family != AF_INET6) continue; socket_lock(so, 1); - error = so_set_traffic_class(so, tclass != -1 ? 
tclass : SO_TC_BE); - socket_unlock(so, 1); - if (error != 0) { - printf("set_pid_tclass so_set_traffic_class(%p, %d) failed %d\n", so, tclass, error); - error = 0; + if (tclass != -1) { + error = so_set_traffic_class(so, tclass); + if (error != 0) { + printf("%s: so_set_traffic_class" + "(so=%p, fd=%d, tclass=%d) " + "failed %d\n", __func__, + so, i, tclass, error); + error = 0; + } } + socket_unlock(so, 1); } - + proc_fdunlock(p); } - - error = 0; + + error = 0; done: if (p != NULL) proc_rele(p); - - return error; + + return (error); } -int -set_pname_tclass(const char *pname, int tclass) +int +set_pname_tclass(struct so_tcdbg *so_tcdbg) { int error = EINVAL; struct tclass_for_proc *tfp; lck_mtx_lock(tclass_lock); - - tfp = find_tfp_by_pname(pname); - if (tclass == -1) { - if (tfp != NULL) - free_tclass_for_proc(tfp); - } else { + + tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname); + if (tfp == NULL) { + tfp = alloc_tclass_for_proc(-1, so_tcdbg->so_tcdbg_pname); if (tfp == NULL) { - tfp = alloc_tclass_for_proc(-1, pname, tclass); - if (tfp == NULL) { - lck_mtx_unlock(tclass_lock); - error = ENOBUFS; - goto done; - } - } else { - tfp->tfp_class = tclass; + lck_mtx_unlock(tclass_lock); + error = ENOBUFS; + goto done; } } + tfp->tfp_class = so_tcdbg->so_tcdbg_tclass; + lck_mtx_unlock(tclass_lock); - - error = 0; + + error = 0; done: - - return error; + + return (error); } -int -get_pid_tclass(pid_t pid, int *tclass) +static int +flush_pid_tclass(struct so_tcdbg *so_tcdbg) +{ + pid_t pid = so_tcdbg->so_tcdbg_pid; + int tclass = so_tcdbg->so_tcdbg_tclass; + struct filedesc *fdp; + int error = EINVAL; + proc_t p; + int i; + + p = proc_find(pid); + if (p == PROC_NULL) { + printf("%s proc_find(%d) failed\n", __func__, pid); + goto done; + } + + proc_fdlock(p); + fdp = p->p_fd; + for (i = 0; i < fdp->fd_nfiles; i++) { + struct socket *so; + struct fileproc *fp; + + fp = fdp->fd_ofiles[i]; + if (fp == NULL || + (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 || + fp->f_fglob->fg_type != DTYPE_SOCKET) + continue; + + so = (struct socket *)fp->f_fglob->fg_data; + error = sock_setsockopt(so, SOL_SOCKET, SO_FLUSH, &tclass, + sizeof (tclass)); + if (error != 0) { + printf("%s: setsockopt(SO_FLUSH) (so=%p, fd=%d, " + "tclass=%d) failed %d\n", __func__, so, i, tclass, + error); + error = 0; + } + } + proc_fdunlock(p); + + error = 0; +done: + if (p != PROC_NULL) + proc_rele(p); + + return (error); +} + +int +get_pid_tclass(struct so_tcdbg *so_tcdbg) { int error = EINVAL; proc_t p = NULL; struct tclass_for_proc *tfp; - - *tclass = -1; /* Means not set */ + pid_t pid = so_tcdbg->so_tcdbg_pid; + + so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */ + so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */ p = proc_find(pid); if (p == NULL) { - printf("get_pid_tclass proc_find(%d) \n", pid); + printf("%s proc_find(%d) failed\n", __func__, pid); goto done; } - + /* Need a tfp */ lck_mtx_lock(tclass_lock); - + tfp = find_tfp_by_pid(pid); if (tfp != NULL) { - *tclass = tfp->tfp_class ; + so_tcdbg->so_tcdbg_tclass = tfp->tfp_class; error = 0; } lck_mtx_unlock(tclass_lock); done: if (p != NULL) proc_rele(p); - - return error; + + return (error); } -int -get_pname_tclass(const char *pname, int *tclass) +int +get_pname_tclass(struct so_tcdbg *so_tcdbg) { int error = EINVAL; struct tclass_for_proc *tfp; - - *tclass = -1; /* Means not set */ + + so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */ + so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */ /* Need a tfp */ lck_mtx_lock(tclass_lock); - - tfp = 
find_tfp_by_pname(pname); + + tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname); if (tfp != NULL) { - *tclass = tfp->tfp_class ; + so_tcdbg->so_tcdbg_tclass = tfp->tfp_class; error = 0; } lck_mtx_unlock(tclass_lock); - - return error; + + return (error); } +static int +delete_tclass_for_pid_pname(struct so_tcdbg *so_tcdbg) +{ + int error = EINVAL; + pid_t pid = so_tcdbg->so_tcdbg_pid; + struct tclass_for_proc *tfp = NULL; + + lck_mtx_lock(tclass_lock); + if (pid != -1) + tfp = find_tfp_by_pid(pid); + else + tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname); + + if (tfp != NULL) { + free_tclass_for_proc(tfp); + error = 0; + } + + lck_mtx_unlock(tclass_lock); + + return (error); +} /* * Setting options requires privileges */ -__private_extern__ int +__private_extern__ int so_set_tcdbg(struct socket *so, struct so_tcdbg *so_tcdbg) { int error = 0; - + if ((so->so_state & SS_PRIV) == 0) - return EPERM; + return (EPERM); socket_unlock(so, 0); switch (so_tcdbg->so_tcdbg_cmd) { case SO_TCDBG_PID: - error = set_pid_tclass(so_tcdbg->so_tcdbg_pid, so_tcdbg->so_tcdbg_tclass); + error = set_pid_tclass(so_tcdbg); break; - + case SO_TCDBG_PNAME: - error = set_pname_tclass(so_tcdbg->so_tcdbg_pname, so_tcdbg->so_tcdbg_tclass); + error = set_pname_tclass(so_tcdbg); break; - + case SO_TCDBG_PURGE: error = purge_tclass_for_proc(); break; - + case SO_TCDBG_FLUSH: error = flush_tclass_for_proc(); break; - + + case SO_TCDBG_DELETE: + error = delete_tclass_for_pid_pname(so_tcdbg); + break; + + case SO_TCDBG_TCFLUSH_PID: + error = flush_pid_tclass(so_tcdbg); + break; + default: error = EINVAL; break; - } socket_lock(so, 0); - return error; + return (error); } /* * Not required to be privileged to get */ -__private_extern__ int +__private_extern__ int sogetopt_tcdbg(struct socket *so, struct sockopt *sopt) { int error = 0; @@ -458,23 +534,24 @@ sogetopt_tcdbg(struct socket *so, struct sockopt *sopt) void *buf = NULL; size_t len = sopt->sopt_valsize; - error = sooptcopyin(sopt, &so_tcdbg, sizeof(struct so_tcdbg), sizeof(struct so_tcdbg)); + error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg), + sizeof (struct so_tcdbg)); if (error != 0) - return error; - + return (error); + sopt->sopt_valsize = len; - + socket_unlock(so, 0); switch (so_tcdbg.so_tcdbg_cmd) { case SO_TCDBG_PID: - error = get_pid_tclass(so_tcdbg.so_tcdbg_pid, &so_tcdbg.so_tcdbg_tclass); + error = get_pid_tclass(&so_tcdbg); break; - + case SO_TCDBG_PNAME: - error = get_pname_tclass(so_tcdbg.so_tcdbg_pname, &so_tcdbg.so_tcdbg_tclass); + error = get_pname_tclass(&so_tcdbg); break; - + case SO_TCDBG_COUNT: lck_mtx_lock(tclass_lock); so_tcdbg.so_tcdbg_count = tfp_count; @@ -492,7 +569,7 @@ sogetopt_tcdbg(struct socket *so, struct sockopt *sopt) error = EINVAL; break; } - len = alloc_count * sizeof(struct so_tcdbg); + len = alloc_count * sizeof (struct so_tcdbg); lck_mtx_unlock(tclass_lock); buf = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO); @@ -513,33 +590,35 @@ sogetopt_tcdbg(struct socket *so, struct sockopt *sopt) } else { ptr->so_tcdbg_cmd = SO_TCDBG_PNAME; ptr->so_tcdbg_pid = -1; - strlcpy(ptr->so_tcdbg_pname, tfp->tfp_pname, sizeof(ptr->so_tcdbg_pname)); + strlcpy(ptr->so_tcdbg_pname, + tfp->tfp_pname, + sizeof (ptr->so_tcdbg_pname)); } ptr->so_tcdbg_tclass = tfp->tfp_class; ptr++; } - + lck_mtx_unlock(tclass_lock); } break; - + default: error = EINVAL; break; - } socket_lock(so, 0); if (error == 0) { if (buf == NULL) { - error = sooptcopyout(sopt, &so_tcdbg, sizeof(struct so_tcdbg)); + error = sooptcopyout(sopt, &so_tcdbg, + sizeof (struct 
so_tcdbg)); } else { error = sooptcopyout(sopt, buf, len); _FREE(buf, M_TEMP); } } - return error; + return (error); } @@ -547,78 +626,121 @@ __private_extern__ int so_set_traffic_class(struct socket *so, int optval) { int error = 0; - - if (optval < SO_TC_BE || optval > SO_TC_VO) { + + if (optval < SO_TC_BE || optval > SO_TC_CTL) { error = EINVAL; } else { - so->so_traffic_class = optval; - - if ((INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6) && - INP_SOCKTYPE(so) == SOCK_STREAM) { - set_tcp_stream_priority(so); + switch (optval) { + case _SO_TC_BK: + optval = SO_TC_BK; + break; + case _SO_TC_VI: + optval = SO_TC_VI; + break; + case _SO_TC_VO: + optval = SO_TC_VO; + break; + default: + if (!SO_VALID_TC(optval)) + error = EINVAL; + break; + } + + if (error == 0) { + int oldval = so->so_traffic_class; + + VERIFY(SO_VALID_TC(optval)); + so->so_traffic_class = optval; + + if ((INP_SOCKAF(so) == AF_INET || + INP_SOCKAF(so) == AF_INET6) && + INP_SOCKTYPE(so) == SOCK_STREAM) { + set_tcp_stream_priority(so); + + /* Set/unset use of Large Receive Offload */ + so_set_lro(so, optval); + } + + if ((INP_SOCKAF(so) == AF_INET || + INP_SOCKAF(so) == AF_INET6) && + optval != oldval && (optval == SO_TC_BK_SYS || + oldval == SO_TC_BK_SYS)) { + /* + * If the app switches from BK_SYS to something + * else, resume the socket if it was suspended. + */ + if (oldval == SO_TC_BK_SYS) + inp_reset_fc_state(so->so_pcb); + + SOTHROTTLELOG(("throttle[%d]: so %p [%d,%d] " + "opportunistic %s\n", so->last_pid, + so, INP_SOCKAF(so), INP_SOCKTYPE(so), + (optval == SO_TC_BK_SYS) ? "ON" : "OFF")); + } } } - return error; + return (error); } __private_extern__ void so_set_default_traffic_class(struct socket *so) { - int sotc = SO_TC_BE; + int sotc = -1; - if (tfp_count > 0 && (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6)) { - sotc = get_tclass_for_curr_proc(); + if (tfp_count > 0 && + (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6)) { + get_tclass_for_curr_proc(&sotc); } - - so->so_traffic_class = sotc; - - return; + + so->so_traffic_class = (sotc != -1) ? sotc : SO_TC_BE; } +__private_extern__ int +so_set_opportunistic(struct socket *so, int optval) +{ + return (so_set_traffic_class(so, (optval == 0) ? 
+ SO_TC_BE : SO_TC_BK_SYS)); +} __private_extern__ int -mbuf_traffic_class_from_control(struct mbuf *control) +so_get_opportunistic(struct socket *so) +{ + return (so->so_traffic_class == SO_TC_BK_SYS); +} + +__private_extern__ mbuf_svc_class_t +mbuf_service_class_from_control(struct mbuf *control) { struct cmsghdr *cm; - - for (cm = M_FIRST_CMSGHDR(control); - cm != NULL; - cm = M_NXT_CMSGHDR(control, cm)) { + mbuf_svc_class_t msc = MBUF_SC_UNSPEC; + + for (cm = M_FIRST_CMSGHDR(control); cm != NULL; + cm = M_NXT_CMSGHDR(control, cm)) { int tc; - if (cm->cmsg_len < sizeof(struct cmsghdr)) + if (cm->cmsg_len < sizeof (struct cmsghdr)) break; - + if (cm->cmsg_level != SOL_SOCKET || - cm->cmsg_type != SO_TRAFFIC_CLASS) + cm->cmsg_type != SO_TRAFFIC_CLASS) continue; - if (cm->cmsg_len != CMSG_LEN(sizeof(int))) + if (cm->cmsg_len != CMSG_LEN(sizeof (int))) continue; - - tc = *(int *)CMSG_DATA(cm); - - switch (tc) { - case SO_TC_BE: - return MBUF_TC_BE; - case SO_TC_BK: - return MBUF_TC_BK; - case SO_TC_VI: - return MBUF_TC_VI; - case SO_TC_VO: - return MBUF_TC_VO; - default: - break; - } + + tc = *(int *)(void *)CMSG_DATA(cm); + msc = so_tc2msc(tc); + if (MBUF_VALID_SC(msc)) + break; } - - return MBUF_TC_UNSPEC; + + return (msc); } __private_extern__ int -dscp_code_from_mbuf_tclass(int mtc) +dscp_code_from_mbuf_tclass(mbuf_traffic_class_t mtc) { int dscp_code; - + switch (mtc) { default: case MBUF_TC_BE: @@ -634,56 +756,65 @@ dscp_code_from_mbuf_tclass(int mtc) dscp_code = 0x30; break; } - - return dscp_code; + + return (dscp_code); } __private_extern__ void so_recv_data_stat(struct socket *so, struct mbuf *m, size_t off) { - uint32_t sotc = m->m_pkthdr.prio; + uint32_t sotc = m_get_traffic_class(m); if (sotc >= SO_TC_STATS_MAX) sotc = SO_TC_BE; - - so->so_tc_stats[sotc].rxpackets += 1; - so->so_tc_stats[sotc].rxbytes += ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off; - return; + so->so_tc_stats[sotc].rxpackets += 1; + so->so_tc_stats[sotc].rxbytes += + ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off; } __private_extern__ void set_tcp_stream_priority(struct socket *so) { struct tcpcb *tp = intotcpcb(sotoinpcb(so)); + int old_cc = tp->tcp_cc_index; + int recvbg = IS_TCP_RECV_BG(so); - /* If the socket was marked as a background socket or if the - * traffic class is set to background with traffic class socket - * option then make both send and recv side of the stream to be - * background. The variable sotcdb which can be set with sysctl + /* + * If the socket was marked as a background socket or if the + * traffic class is set to background with traffic class socket + * option then make both send and recv side of the stream to be + * background. The variable sotcdb which can be set with sysctl * is used to disable these settings for testing. 
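[Editor's note] mbuf_service_class_from_control() above scans the ancillary-data chain for a well-formed SOL_SOCKET/SO_TRAFFIC_CLASS item -- exactly CMSG_LEN(sizeof (int)) long -- and keeps the first value that maps to a valid service class via so_tc2msc(). A userland analog using the standard CMSG macros (SO_TRAFFIC_CLASS itself is Darwin-specific):

/*
 * Editor's sketch: pull the traffic class out of a msghdr the way
 * mbuf_service_class_from_control() walks a control mbuf.
 */
#include <string.h>
#include <sys/socket.h>

static int
traffic_class_from_msghdr(struct msghdr *msg, int *tc_out)
{
    struct cmsghdr *cm;

    for (cm = CMSG_FIRSTHDR(msg); cm != NULL;
        cm = CMSG_NXTHDR(msg, cm)) {
        if (cm->cmsg_len < sizeof (struct cmsghdr))
            break;                      /* malformed chain, stop */
        if (cm->cmsg_level != SOL_SOCKET ||
            cm->cmsg_type != SO_TRAFFIC_CLASS)
            continue;
        if (cm->cmsg_len != CMSG_LEN(sizeof (int)))
            continue;                   /* wrong payload size */
        memcpy(tc_out, CMSG_DATA(cm), sizeof (int));
        return (0);
    }
    return (-1);                        /* no usable item found */
}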
*/ - if (soisbackground(so) || so->so_traffic_class == SO_TC_BK) { + if (soisthrottled(so) || IS_SO_TC_BACKGROUND(so->so_traffic_class)) { if ((sotcdb & SOTCDB_NO_SENDTCPBG) != 0) { - if (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) + if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX) tcp_set_foreground_cc(so); } else { - if (tp->tcp_cc_index != TCP_CC_ALGO_BACKGROUND_INDEX) + if (old_cc != TCP_CC_ALGO_BACKGROUND_INDEX) tcp_set_background_cc(so); } - + /* Set receive side background flags */ - if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0) { - so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG); - } else { - so->so_traffic_mgt_flags |= TRAFFIC_MGT_TCP_RECVBG; - } + if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0) + tcp_clear_recv_bg(so); + else + tcp_set_recv_bg(so); } else { - so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG); - if (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) + tcp_clear_recv_bg(so); + if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX) tcp_set_foreground_cc(so); } - return; + + if (old_cc != tp->tcp_cc_index || recvbg != IS_TCP_RECV_BG(so)) { + SOTHROTTLELOG(("throttle[%d]: so %p [%d,%d] TCP %s send; " + "%s recv\n", so->last_pid, so, INP_SOCKAF(so), + INP_SOCKTYPE(so), + (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) ? + "background" : "foreground", + IS_TCP_RECV_BG(so) ? "background" : "foreground")); + } } /* @@ -692,119 +823,126 @@ set_tcp_stream_priority(struct socket *so) * - set the DSCP code following the WMM mapping */ __private_extern__ void -set_packet_tclass(struct mbuf *m, struct socket *so, int in_mtc, int isipv6) +set_packet_service_class(struct mbuf *m, struct socket *so, + mbuf_svc_class_t in_msc, u_int32_t flags) { - int mtc = MBUF_TC_BE; /* Best effort by default */ - struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */ + mbuf_svc_class_t msc = MBUF_SC_BE; /* Best effort by default */ + struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */ struct ip *ip = mtod(m, struct ip *); #if INET6 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); #endif /* INET6 */ - + int isipv6 = ((flags & PKT_SCF_IPV6) != 0) ? 1 : 0; + if (!(m->m_flags & M_PKTHDR)) return; - - /* + + /* * Here is the precedence: * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all * 2) Traffic class passed via ancillary data to sendmsdg(2) * 3) Traffic class socket option last */ - if (soisbackground(so)) { - mtc = MBUF_TC_BK; - } else if (in_mtc != MBUF_TC_UNSPEC) { - if (in_mtc >= MBUF_TC_BE && in_mtc <= MBUF_TC_VO) - mtc = in_mtc; + if (in_msc != MBUF_SC_UNSPEC) { + if (in_msc >= MBUF_SC_BE && in_msc <= MBUF_SC_CTL) + msc = in_msc; } else { - switch (so->so_traffic_class) { - case SO_TC_BE: - mtc = MBUF_TC_BE; - break; - case SO_TC_BK: - mtc = MBUF_TC_BK; - break; - case SO_TC_VI: - mtc = MBUF_TC_VI; - break; - case SO_TC_VO: - mtc = MBUF_TC_VO; - break; - default: - break; - } + VERIFY(SO_VALID_TC(so->so_traffic_class)); + msc = so_tc2msc(so->so_traffic_class); + /* Assert because tc must have been valid */ + VERIFY(MBUF_VALID_SC(msc)); } - + + /* + * If TRAFFIC_MGT_SO_BACKGROUND is set, depress the priority. + */ + if (soisthrottled(so) && !IS_MBUF_SC_BACKGROUND(msc)) + msc = MBUF_SC_BK; + /* - * Set the traffic class in the mbuf packet header prio field + * Set the traffic class in the mbuf packet header svc field */ - if ((sotcdb & SOTCDB_NO_MTC)) + if (sotcdb & SOTCDB_NO_MTC) goto no_mbtc; - m->m_pkthdr.prio = mtc; - + + /* Elevate service class if the packet is a pure TCP ACK. 
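[Editor's note] set_packet_service_class() resolves the outgoing class with an explicit precedence: a valid class passed as ancillary data wins, otherwise the socket's own class is used; a throttled socket is then depressed to background, and (per the comment this hunk is completing) a pure TCP ACK on a non-background flow is elevated to MBUF_SC_CTL so acknowledgments are not queued behind bulk traffic. A compressed model with stand-in enum values:

/*
 * Editor's sketch of the precedence in set_packet_service_class().
 * The enum is illustrative; the ordering only encodes "BK is the
 * background class, CTL is the highest".
 */
typedef enum { SC_UNSPEC = 0, SC_BK, SC_BE, SC_VI, SC_CTL } svc_t;

static svc_t
resolve_service_class(svc_t in_msc, svc_t so_msc,
    int throttled, int pure_tcp_ack)
{
    svc_t msc = (in_msc != SC_UNSPEC) ? in_msc : so_msc;

    if (throttled && msc != SC_BK)
        msc = SC_BK;            /* depress throttled flows */
    else if (pure_tcp_ack && msc != SC_BK)
        msc = SC_CTL;           /* elevate ACKs on foreground flows */
    return (msc);
}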
+ * We can do this only when the flow is not a background + * flow and the outgoing interface supports + * transmit-start model. + */ + if (!IS_MBUF_SC_BACKGROUND(msc) && (flags & PKT_SCF_TCP_ACK)) + msc = MBUF_SC_CTL; + + (void) m_set_service_class(m, msc); + + /* + * Set the privileged traffic auxiliary flag if applicable, or clear it. + */ + if (!(sotcdb & SOTCDB_NO_PRIVILEGED) && soisprivilegedtraffic(so) && + msc != MBUF_SC_UNSPEC) + m->m_pkthdr.aux_flags |= MAUXF_PRIO_PRIVILEGED; + else + m->m_pkthdr.aux_flags &= ~MAUXF_PRIO_PRIVILEGED; + no_mbtc: /* - * Quick exit when best effort + * Quick exit when best effort */ - if (mtc == MBUF_TC_BE) + if (msc == MBUF_SC_BE) goto no_dscp; + /* - * Now let set the DSCP code in IPv4 or IPv6 header - * By default do this only for local traffic if a code is not already set + * The default behavior is for the networking stack to not set the + * DSCP code, based on SOTCDB_NO_DSCP being set. If the flag is + * cleared, set the DSCP code in IPv4 or IPv6 header only for local + * traffic, if it is not already set. */ - if ((sotcdb & SOTCDB_NO_DSCP)) + if (sotcdb & SOTCDB_NO_DSCP) goto no_dscp; - + /* - * Test if a IP TOS or IPV6 TCLASS has already been set on the socket or the raw packet + * Test if a IP TOS or IPV6 TCLASS has already been set + * on the socket or the raw packet. */ - if ((sotcdb & SOTCDB_NO_DSCPTST) == 0) { + if (!(sotcdb & SOTCDB_NO_DSCPTST)) { #if INET6 - if (isipv6) - { - if ((so->so_type == SOCK_RAW && (ip6->ip6_flow & htonl(0xff << 20)) != 0) || - (inp->in6p_outputopts && inp->in6p_outputopts->ip6po_tclass != -1)) + if (isipv6) { + if ((so->so_type == SOCK_RAW && + (ip6->ip6_flow & htonl(0xff << 20)) != 0) || + (inp->in6p_outputopts && + inp->in6p_outputopts->ip6po_tclass != -1)) goto no_dscp; - } - else + } else #endif /* INET6 */ - { - if ((so->so_type == SOCK_RAW && (inp->inp_flags & INP_HDRINCL)) || - inp->inp_ip_tos != 0) - goto no_dscp; - } + if ((so->so_type == SOCK_RAW && + (inp->inp_flags & INP_HDRINCL)) || + inp->inp_ip_tos != 0) + goto no_dscp; } - + /* * Test if destination is local */ - if ((sotcdb & SOTCDB_NO_LCLTST) == 0) { + if (!(sotcdb & SOTCDB_NO_LCLTST)) { int islocal = 0; - struct route *ro = &inp->inp_route; + struct rtentry *rt = inp->inp_route.ro_rt; if (so->so_type == SOCK_STREAM) { - struct tcpcb *tp = intotcpcb(inp); - - if ((tp->t_flags & TF_LOCAL)) + if (intotcpcb(inp)->t_flags & TF_LOCAL) islocal = 1; - } - else -#if INET6 - if (isipv6) - { - if ((ro != NULL && ro->ro_rt != NULL && - (ro->ro_rt->rt_gateway->sa_family == AF_LINK || - (ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))) || - in6addr_local(&ip6->ip6_dst)) + } else if (rt != NULL && + (rt->rt_gateway->sa_family == AF_LINK || + (rt->rt_ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)))) { + if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT)) islocal = 1; - } - else + } else +#if INET6 + if (isipv6 && in6addr_local(&ip6->ip6_dst)) { + islocal = 1; + } else #endif /* INET6 */ - { - if ((ro != NULL && ro->ro_rt != NULL && - (ro->ro_rt->rt_gateway->sa_family == AF_LINK || - (ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))) || - inaddr_local(ip->ip_dst)) - islocal = 1; + if (inaddr_local(ip->ip_dst)) { + islocal = 1; } if (islocal == 0) goto no_dscp; @@ -812,28 +950,38 @@ no_mbtc: #if INET6 if (isipv6) - ip6->ip6_flow |= - htonl(dscp_code_from_mbuf_tclass(m->m_pkthdr.prio) << 20); + ip6->ip6_flow |= htonl(dscp_code_from_mbuf_tclass( + m_get_traffic_class(m)) << 20); else #endif /* INET6 */ - ip->ip_tos |= dscp_code_from_mbuf_tclass(m->m_pkthdr.prio) << 2; - + 
ip->ip_tos |= dscp_code_from_mbuf_tclass( + m_get_traffic_class(m)) << 2; + no_dscp: /* * For TCP with background traffic class switch CC algo based on sysctl */ - if (so->so_type == SOCK_STREAM) { + if (so->so_type == SOCK_STREAM) set_tcp_stream_priority(so); - } - + + so_tc_update_stats(m, so, msc); +} + +__private_extern__ void +so_tc_update_stats(struct mbuf *m, struct socket *so, mbuf_svc_class_t msc) +{ + mbuf_traffic_class_t mtc; + /* * Assume socket and mbuf traffic class values are the same - * Also assume the socket lock is held + * Also assume the socket lock is held. Note that the stats + * at the socket layer are reduced down to the legacy traffic + * classes; we could/should potentially expand so_tc_stats[]. */ + mtc = MBUF_SC2TC(msc); + VERIFY(mtc < SO_TC_STATS_MAX); so->so_tc_stats[mtc].txpackets += 1; so->so_tc_stats[mtc].txbytes += m->m_pkthdr.len; - - return; } __private_extern__ void @@ -842,9 +990,100 @@ socket_tclass_init(void) tclass_lck_grp_attr = lck_grp_attr_alloc_init(); tclass_lck_grp = lck_grp_alloc_init("tclass", tclass_lck_grp_attr); tclass_lck_attr = lck_attr_alloc_init(); - if ((tclass_lock = lck_mtx_alloc_init(tclass_lck_grp, tclass_lck_attr)) == NULL) { - panic("failed to allocate memory for tclass\n"); + lck_mtx_init(tclass_lock, tclass_lck_grp, tclass_lck_attr); +} + +__private_extern__ mbuf_svc_class_t +so_tc2msc(int tc) +{ + mbuf_svc_class_t msc; + + switch (tc) { + case SO_TC_BK_SYS: + msc = MBUF_SC_BK_SYS; + break; + case SO_TC_BK: + case _SO_TC_BK: + msc = MBUF_SC_BK; + break; + case SO_TC_BE: + msc = MBUF_SC_BE; + break; + case SO_TC_RD: + msc = MBUF_SC_RD; + break; + case SO_TC_OAM: + msc = MBUF_SC_OAM; + break; + case SO_TC_AV: + msc = MBUF_SC_AV; + break; + case SO_TC_RV: + msc = MBUF_SC_RV; + break; + case SO_TC_VI: + case _SO_TC_VI: + msc = MBUF_SC_VI; + break; + case SO_TC_VO: + case _SO_TC_VO: + msc = MBUF_SC_VO; + break; + case SO_TC_CTL: + msc = MBUF_SC_CTL; + break; + case SO_TC_ALL: + default: + msc = MBUF_SC_UNSPEC; + break; } + + return (msc); } +__private_extern__ int +so_svc2tc(mbuf_svc_class_t svc) +{ + switch (svc) { + case MBUF_SC_UNSPEC: + return SO_TC_BE; + case MBUF_SC_BK_SYS: + return SO_TC_BK_SYS; + case MBUF_SC_BK: + return SO_TC_BK; + case MBUF_SC_BE: + return SO_TC_BE; + case MBUF_SC_RD: + return SO_TC_RD; + case MBUF_SC_OAM: + return SO_TC_OAM; + case MBUF_SC_AV: + return SO_TC_AV; + case MBUF_SC_RV: + return SO_TC_RV; + case MBUF_SC_VI: + return SO_TC_VI; + case MBUF_SC_VO: + return SO_TC_VO; + case MBUF_SC_CTL: + return SO_TC_CTL; + default: + return SO_TC_BE; + } +} + +/* + * LRO is turned on for AV streaming and background classes. + */ +static void +so_set_lro(struct socket *so, int optval) +{ + if ((optval == SO_TC_BK) || + (optval == SO_TC_BK_SYS) || + (optval == SO_TC_AV)) { + so->so_flags |= SOF_USELRO; + } else { + so->so_flags &= ~SOF_USELRO; + } +} diff --git a/bsd/netinet/in_var.h b/bsd/netinet/in_var.h index 70a0370ed..74100b4a3 100644 --- a/bsd/netinet/in_var.h +++ b/bsd/netinet/in_var.h @@ -96,6 +96,8 @@ struct in_ifaddr { struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ TAILQ_ENTRY(in_ifaddr) ia_hash; /* hash bucket entry */ }; + +#define ifatoia(ifa) ((struct in_ifaddr *)(void *)(ifa)) #endif /* XNU_KERNEL_PRIVATE */ struct in_aliasreq { @@ -458,6 +460,7 @@ do { \ struct route; struct ip_moptions; +struct inpcb; /* * Return values for imo_multi_filter(). 
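Stepping back to the traffic-class plumbing earlier in this change: so_tc2msc() folds the socket-level SO_TC_* values (including the legacy _SO_TC_BK/_SO_TC_VI/_SO_TC_VO aliases) down to mbuf service classes, and so_svc2tc() maps them back for reporting. As a minimal sketch of the consumer side -- assuming a build that can see the private SO_TRAFFIC_CLASS option from sys/socket.h -- a client opts a socket into the background class like this:

#include <sys/socket.h>
#include <stdio.h>

/*
 * Sketch only, not part of the patch: tag all traffic on `sock' as
 * background.  The kernel maps SO_TC_BK through so_tc2msc() to
 * MBUF_SC_BK when it stamps outgoing mbufs.
 */
static int
mark_background(int sock)
{
	int tc = SO_TC_BK;

	if (setsockopt(sock, SOL_SOCKET, SO_TRAFFIC_CLASS,
	    &tc, sizeof (tc)) == -1) {
		perror("setsockopt(SO_TRAFFIC_CLASS)");
		return (-1);
	}
	return (0);
}

Note that so_set_lro() above keys LRO eligibility off the same option values (BK, BK_SYS and AV).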
@@ -470,7 +473,7 @@ struct ip_moptions; extern void in_ifaddr_init(void); extern int imo_multi_filter(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *, const struct sockaddr *); -extern int imo_clone(struct ip_moptions *, struct ip_moptions *); +extern int imo_clone(struct inpcb *, struct inpcb *); extern void inm_commit(struct in_multi *); extern void inm_clear_recorded(struct in_multi *); extern void inm_print(const struct in_multi *); @@ -498,8 +501,6 @@ extern void in_purgeaddrs(struct ifnet *); extern void imf_leave(struct in_mfilter *); extern void imf_purge(struct in_mfilter *); -struct inpcb; - __private_extern__ int inp_join_group(struct inpcb *, struct sockopt *); __private_extern__ int inp_leave_group(struct inpcb *, struct sockopt *); __private_extern__ void in_multihead_lock_exclusive(void); diff --git a/bsd/netinet/ip_divert.c b/bsd/netinet/ip_divert.c index 600a796d1..1af24caec 100644 --- a/bsd/netinet/ip_divert.c +++ b/bsd/netinet/ip_divert.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -242,7 +242,7 @@ divert_packet(struct mbuf *m, int incoming, int port, int rule) continue; } divsrc.sin_addr = - ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; + ((struct sockaddr_in *)(void *) ifa->ifa_addr)->sin_addr; IFA_UNLOCK(ifa); break; } @@ -314,12 +314,12 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, { struct inpcb *const inp = sotoinpcb(so); struct ip *const ip = mtod(m, struct ip *); - struct sockaddr_in *sin = (struct sockaddr_in *)addr; + struct sockaddr_in *sin = (struct sockaddr_in *)(void *)addr; int error = 0; - mbuf_traffic_class_t mtc = MBUF_TC_UNSPEC; + mbuf_svc_class_t msc = MBUF_SC_UNSPEC; if (control != NULL) { - mtc = mbuf_traffic_class_from_control(control); + msc = mbuf_service_class_from_control(control); m_freem(control); /* XXX */ } @@ -357,7 +357,8 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, /* Reinject packet into the system as incoming or outgoing */ if (!sin || sin->sin_addr.s_addr == 0) { - struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; + struct ip_out_args ipoa = + { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF }; struct route ro; struct ip_moptions *imo; @@ -381,7 +382,7 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, /* Copy the cached route and take an extra reference */ inp_route_copyout(inp, &ro); - set_packet_tclass(m, so, mtc, 0); + set_packet_service_class(m, so, msc, 0); imo = inp->inp_moptions; if (imo != NULL) @@ -518,7 +519,7 @@ div_bind(struct socket *so, struct sockaddr *nam, struct proc *p) if (nam->sa_family != AF_INET) { error = EAFNOSUPPORT; } else { - ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; + ((struct sockaddr_in *)(void *)nam)->sin_addr.s_addr = INADDR_ANY; error = in_pcbbind(inp, nam, p); } return error; diff --git a/bsd/netinet/ip_dummynet.c b/bsd/netinet/ip_dummynet.c index 048cff004..5ebcb2a51 100644 --- a/bsd/netinet/ip_dummynet.c +++ b/bsd/netinet/ip_dummynet.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -89,10 +89,12 @@ #include #include #include -//#include #include #include #include +#if DUMMYNET +#include +#endif /* DUMMYNET */ #include #include #include @@ -101,6 +103,11 @@ #include #include +#include /* for ip6_input, ip6_output prototypes */ +#include + +static struct ip_fw default_rule; + /* * We keep a private variable for the simulation time, but we could * probably use an existing one ("softticks" in sys/kern/kern_timer.c) @@ -211,7 +218,8 @@ SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, static lck_grp_t *dn_mutex_grp; static lck_grp_attr_t *dn_mutex_grp_attr; static lck_attr_t *dn_mutex_attr; -static lck_mtx_t *dn_mutex; +decl_lck_mtx_data(static, dn_mutex_data); +static lck_mtx_t *dn_mutex = &dn_mutex_data; static int config_pipe(struct dn_pipe *p); static int ip_dn_ctl(struct sockopt *sopt); @@ -220,7 +228,6 @@ static void dummynet(void *); static void dummynet_flush(void); void dummynet_drain(void); static ip_dn_io_t dummynet_io; -static void dn_rule_delete(void *); int if_tx_rdy(struct ifnet *ifp); @@ -687,11 +694,12 @@ static struct dn_pkt_tag * dn_tag_get(struct mbuf *m) { struct m_tag *mtag = m_tag_first(m); -/* KASSERT(mtag != NULL && - mtag->m_tag_id == KERNEL_MODULE_TAG_ID && - mtag->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET, - ("packet on dummynet queue w/o dummynet tag!")); -*/ + + if (!(mtag != NULL && + mtag->m_tag_id == KERNEL_MODULE_TAG_ID && + mtag->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET)) + panic("packet on dummynet queue w/o dummynet tag: %p", m); + return (struct dn_pkt_tag *)(mtag+1); } @@ -716,16 +724,16 @@ dn_tag_get(struct mbuf *m) static void transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail) { - struct mbuf *m ; - struct dn_pkt_tag *pkt ; - u_int64_t schedule_time; + struct mbuf *m ; + struct dn_pkt_tag *pkt = NULL; + u_int64_t schedule_time; lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED); - ASSERT(serialize >= 0); + ASSERT(serialize >= 0); if (serialize == 0) { while ((m = pipe->head) != NULL) { pkt = dn_tag_get(m); - if (!DN_KEY_LEQ(pkt->output_time, curr_time)) + if (!DN_KEY_LEQ(pkt->dn_output_time, curr_time)) break; pipe->head = m->m_nextpkt; @@ -738,19 +746,19 @@ transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail) if (*tail != NULL) (*tail)->m_nextpkt = NULL; - } + } - schedule_time = DN_KEY_LEQ(pkt->output_time, curr_time) ? - curr_time+1 : pkt->output_time; + schedule_time = pkt == NULL || DN_KEY_LEQ(pkt->dn_output_time, curr_time) ? 
+ curr_time + 1 : pkt->dn_output_time; - /* if there are leftover packets, put the pipe into the heap for next ready event */ - if ((m = pipe->head) != NULL) { + /* if there are leftover packets, put the pipe into the heap for next ready event */ + if ((m = pipe->head) != NULL) { pkt = dn_tag_get(m); /* XXX should check errors on heap_insert, by draining the * whole pipe p and hoping in the future we are more successful */ heap_insert(&extract_heap, schedule_time, pipe); - } + } } /* @@ -783,7 +791,7 @@ move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, q->len-- ; q->len_bytes -= len ; - dt->output_time = curr_time + p->delay ; + dt->dn_output_time = curr_time + p->delay ; if (p->head == NULL) p->head = pkt; @@ -875,11 +883,11 @@ ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail) int64_t p_numbytes = p->numbytes; lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED); - + if (p->if_name[0] == 0) /* tx clock is simulated */ p_numbytes += ( curr_time - p->sched_time ) * p->bandwidth; else { /* tx clock is for real, the ifq must be empty or this is a NOP */ - if (p->ifp && p->ifp->if_snd.ifq_head != NULL) + if (p->ifp && !IFCQ_IS_EMPTY(&p->ifp->if_snd)) return ; else { DPRINTF(("dummynet: pipe %d ready from %s --\n", @@ -968,7 +976,7 @@ ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail) if (p->bandwidth > 0) t = ( p->bandwidth -1 - p_numbytes) / p->bandwidth ; - dn_tag_get(p->tail)->output_time += t ; + dn_tag_get(p->tail)->dn_output_time += t ; p->sched_time = curr_time ; heap_insert(&wfq_ready_heap, curr_time + t, (void *)p); /* XXX should check errors on heap_insert, and drain the whole @@ -1055,7 +1063,7 @@ dummynet(__unused void * unused) q->S = q->F + 1 ; /* mark timestamp as invalid */ pe->sum -= q->fs->weight ; } - + /* check the heaps to see if there's still stuff in there, and * only set the timer if there are packets to process */ @@ -1097,10 +1105,15 @@ dummynet_send(struct mbuf *m) m->m_nextpkt = NULL; pkt = dn_tag_get(m); + DPRINTF(("dummynet_send m: %p dn_dir: %d dn_flags: 0x%x\n", + m, pkt->dn_dir, pkt->dn_flags)); + switch (pkt->dn_dir) { case DN_TO_IP_OUT: { - struct route tmp_rt = pkt->ro; - (void)ip_output(m, NULL, &tmp_rt, pkt->flags, NULL, NULL); + struct route tmp_rt = pkt->dn_ro; + /* Force IP_RAWOUTPUT as the IP header is fully formed */ + pkt->dn_flags |= IP_RAWOUTPUT | IP_FORWARDING; + (void)ip_output(m, NULL, &tmp_rt, pkt->dn_flags, NULL, NULL); if (tmp_rt.ro_rt) { rtfree(tmp_rt.ro_rt); tmp_rt.ro_rt = NULL; @@ -1110,7 +1123,22 @@ dummynet_send(struct mbuf *m) case DN_TO_IP_IN : proto_inject(PF_INET, m); break ; - +#ifdef INET6 + case DN_TO_IP6_OUT: { + struct route_in6 ro6; + + ro6 = pkt->dn_ro6; + + ip6_output(m, NULL, &ro6, IPV6_FORWARDING, NULL, NULL, NULL); + + if (ro6.ro_rt) + rtfree(ro6.ro_rt); + break; + } + case DN_TO_IP6_IN: + proto_inject(PF_INET6, m); + break; +#endif /* INET6 */ default: printf("dummynet: bad switch %d!\n", pkt->dn_dir); m_freem(m); @@ -1150,7 +1178,7 @@ if_tx_rdy(struct ifnet *ifp) } if (p != NULL) { DPRINTF(("dummynet: ++ tx rdy from %s%d - qlen %d\n", ifp->if_name, - ifp->if_unit, ifp->if_snd.ifq_len)); + ifp->if_unit, IFCQ_LEN(&ifp->if_snd))); p->numbytes = 0 ; /* mark ready for I/O */ ready_event_wfq(p, &head, &tail); } @@ -1161,11 +1189,12 @@ if_tx_rdy(struct ifnet *ifp) lck_mtx_unlock(dn_mutex); - /* Send out the de-queued list of ready-to-send packets */ if (head != NULL) { dummynet_send(head); + lck_mtx_lock(dn_mutex); serialize--; + lck_mtx_unlock(dn_mutex); } return 0; } @@ -1243,41 
+1272,84 @@ create_queue(struct dn_flow_set *fs, int i) * so that further searches take less time. */ static struct dn_flow_queue * -find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id) +find_queue(struct dn_flow_set *fs, struct ip_flow_id *id) { int i = 0 ; /* we need i and q for new allocations */ struct dn_flow_queue *q, *prev; + int is_v6 = IS_IP6_FLOW_ID(id); if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) ) q = fs->rq[0] ; else { - /* first, do the masking */ - id->dst_ip &= fs->flow_mask.dst_ip ; - id->src_ip &= fs->flow_mask.src_ip ; + /* first, do the masking, then hash */ id->dst_port &= fs->flow_mask.dst_port ; id->src_port &= fs->flow_mask.src_port ; id->proto &= fs->flow_mask.proto ; id->flags = 0 ; /* we don't care about this one */ - /* then, hash function */ - i = ( (id->dst_ip) & 0xffff ) ^ - ( (id->dst_ip >> 15) & 0xffff ) ^ - ( (id->src_ip << 1) & 0xffff ) ^ - ( (id->src_ip >> 16 ) & 0xffff ) ^ - (id->dst_port << 1) ^ (id->src_port) ^ - (id->proto ); + if (is_v6) { + APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6); + APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6); + id->flow_id6 &= fs->flow_mask.flow_id6; + + i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff)^ + ((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff)^ + ((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff)^ + ((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff)^ + + ((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff)^ + ((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff)^ + ((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff)^ + ((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff)^ + + ((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff)^ + ((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff)^ + ((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff)^ + ((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff)^ + + ((id->src_ip6.__u6_addr.__u6_addr32[0] << 16) & 0xffff)^ + ((id->src_ip6.__u6_addr.__u6_addr32[1] << 16) & 0xffff)^ + ((id->src_ip6.__u6_addr.__u6_addr32[2] << 16) & 0xffff)^ + ((id->src_ip6.__u6_addr.__u6_addr32[3] << 16) & 0xffff)^ + + (id->dst_port << 1) ^ (id->src_port) ^ + (id->proto ) ^ + (id->flow_id6); + } else { + id->dst_ip &= fs->flow_mask.dst_ip ; + id->src_ip &= fs->flow_mask.src_ip ; + + i = ( (id->dst_ip) & 0xffff ) ^ + ( (id->dst_ip >> 15) & 0xffff ) ^ + ( (id->src_ip << 1) & 0xffff ) ^ + ( (id->src_ip >> 16 ) & 0xffff ) ^ + (id->dst_port << 1) ^ (id->src_port) ^ + (id->proto ); + } i = i % fs->rq_size ; /* finally, scan the current list for a match */ searches++ ; for (prev=NULL, q = fs->rq[i] ; q ; ) { search_steps++; - if (id->dst_ip == q->id.dst_ip && - id->src_ip == q->id.src_ip && - id->dst_port == q->id.dst_port && - id->src_port == q->id.src_port && - id->proto == q->id.proto && - id->flags == q->id.flags) - break ; /* found */ - else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) { + if (is_v6 && + IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) && + IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) && + id->dst_port == q->id.dst_port && + id->src_port == q->id.src_port && + id->proto == q->id.proto && + id->flags == q->id.flags && + id->flow_id6 == q->id.flow_id6) + break ; /* found */ + + if (!is_v6 && id->dst_ip == q->id.dst_ip && + id->src_ip == q->id.src_ip && + id->dst_port == q->id.dst_port && + id->src_port == q->id.src_port && + id->proto == q->id.proto && + id->flags == q->id.flags) + break ; /* found */ + + /* No match. 
Check if we can expire the entry */ + if (pipe_expire && q->head == NULL && q->S == q->F+1 ) { /* entry is idle and not in any heap, expire it */ struct dn_flow_queue *old_q = q ; @@ -1451,9 +1523,9 @@ locate_pipe(int pipe_nr) * */ static int -dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) +dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa, int client) { - struct mbuf *head = NULL, *tail = NULL; + struct mbuf *head = NULL, *tail = NULL; struct dn_pkt_tag *pkt; struct m_tag *mtag; struct dn_flow_set *fs = NULL; @@ -1464,15 +1536,28 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) struct timespec ts; struct timeval tv; + DPRINTF(("dummynet_io m: %p pipe: %d dir: %d client: %d\n", + m, pipe_nr, dir, client)); + +#if IPFIREWALL #if IPFW2 - ipfw_insn *cmd = fwa->rule->cmd + fwa->rule->act_ofs; + if (client == DN_CLIENT_IPFW) { + ipfw_insn *cmd = fwa->fwa_ipfw_rule->cmd + fwa->fwa_ipfw_rule->act_ofs; - if (cmd->opcode == O_LOG) - cmd += F_LEN(cmd); - is_pipe = (cmd->opcode == O_PIPE); + if (cmd->opcode == O_LOG) + cmd += F_LEN(cmd); + is_pipe = (cmd->opcode == O_PIPE); + } #else - is_pipe = (fwa->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE; + if (client == DN_CLIENT_IPFW) + is_pipe = (fwa->fwa_ipfw_rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE; #endif +#endif /* IPFIREWALL */ + +#if DUMMYNET + if (client == DN_CLIENT_PF) + is_pipe = fwa->fwa_flags == DN_IS_PIPE ? 1 : 0; +#endif /* DUMMYNET */ pipe_nr &= 0xffff ; @@ -1482,7 +1567,7 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) * here we convert secs and usecs to msecs (just divide the * usecs and take the closest whole number). */ - microuptime(&tv); + microuptime(&tv); curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000); /* @@ -1511,7 +1596,7 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) goto dropit ; } } - q = find_queue(fs, &(fwa->f_id)); + q = find_queue(fs, &(fwa->fwa_id)); if ( q == NULL ) goto dropit ; /* cannot allocate queue */ /* @@ -1542,28 +1627,70 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) bzero(pkt, sizeof(struct dn_pkt_tag)); /* ok, i can handle the pkt now... */ /* build and enqueue packet + parameters */ - pkt->rule = fwa->rule ; + /* + * PF is checked before ipfw so remember ipfw rule only when + * the caller is ipfw. When the caller is PF, fwa_ipfw_rule + * is a fake rule just used for convenience + */ + if (client == DN_CLIENT_IPFW) + pkt->dn_ipfw_rule = fwa->fwa_ipfw_rule; + pkt->dn_pf_rule = fwa->fwa_pf_rule; pkt->dn_dir = dir ; + pkt->dn_client = client; - pkt->ifp = fwa->oif; + pkt->dn_ifp = fwa->fwa_oif; if (dir == DN_TO_IP_OUT) { - /* - * We need to copy *ro because for ICMP pkts (and maybe others) - * the caller passed a pointer into the stack; dst might also be - * a pointer into *ro so it needs to be updated. - */ - pkt->ro = *(fwa->ro); - if (fwa->ro->ro_rt) - RT_ADDREF(fwa->ro->ro_rt); - - if (fwa->dst == (struct sockaddr_in *)&fwa->ro->ro_dst) /* dst points into ro */ - fwa->dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ; - - bcopy (fwa->dst, &pkt->dn_dst, sizeof(pkt->dn_dst)); - pkt->flags = fwa->flags; - if (fwa->ipoa != NULL) - pkt->ipoa = *(fwa->ipoa); - } + /* + * We need to copy *ro because for ICMP pkts (and maybe others) + * the caller passed a pointer into the stack; dst might also be + * a pointer into *ro so it needs to be updated. 
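+	 * Taking the copy also means taking an extra reference on any
+	 * cached route (the RT_ADDREF calls below); the reference is
+	 * dropped again when the dummynet packet tag is freed.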
+ */ + if (fwa->fwa_ro) { + pkt->dn_ro = *(fwa->fwa_ro); + if (fwa->fwa_ro->ro_rt) + RT_ADDREF(fwa->fwa_ro->ro_rt); + } + if (fwa->fwa_dst) { + if (fwa->fwa_dst == (struct sockaddr_in *)&fwa->fwa_ro->ro_dst) /* dst points into ro */ + fwa->fwa_dst = (struct sockaddr_in *)&(pkt->dn_ro.ro_dst) ; + + bcopy (fwa->fwa_dst, &pkt->dn_dst, sizeof(pkt->dn_dst)); + } + } else if (dir == DN_TO_IP6_OUT) { + if (fwa->fwa_ro6) { + pkt->dn_ro6 = *(fwa->fwa_ro6); + if (fwa->fwa_ro6->ro_rt) + RT_ADDREF(fwa->fwa_ro6->ro_rt); + } + if (fwa->fwa_ro6_pmtu) { + pkt->dn_ro6_pmtu = *(fwa->fwa_ro6_pmtu); + if (fwa->fwa_ro6_pmtu->ro_rt) + RT_ADDREF(fwa->fwa_ro6_pmtu->ro_rt); + } + if (fwa->fwa_dst6) { + if (fwa->fwa_dst6 == (struct sockaddr_in6 *)&fwa->fwa_ro6->ro_dst) /* dst points into ro */ + fwa->fwa_dst6 = (struct sockaddr_in6 *)&(pkt->dn_ro6.ro_dst) ; + + bcopy (fwa->fwa_dst6, &pkt->dn_dst6, sizeof(pkt->dn_dst6)); + } + pkt->dn_origifp = fwa->fwa_origifp; + pkt->dn_mtu = fwa->fwa_mtu; + pkt->dn_alwaysfrag = fwa->fwa_alwaysfrag; + pkt->dn_unfragpartlen = fwa->fwa_unfragpartlen; + if (fwa->fwa_exthdrs) { + bcopy (fwa->fwa_exthdrs, &pkt->dn_exthdrs, sizeof(pkt->dn_exthdrs)); + /* + * Need to zero out the source structure so the mbufs + * won't be freed by ip6_output() + */ + bzero(fwa->fwa_exthdrs, sizeof(struct ip6_exthdrs)); + } + } + if (dir == DN_TO_IP_OUT || dir == DN_TO_IP6_OUT) { + pkt->dn_flags = fwa->fwa_oflags; + if (fwa->fwa_ipoa != NULL) + pkt->dn_ipoa = *(fwa->fwa_ipoa); + } if (q->head == NULL) q->head = m; else @@ -1587,7 +1714,7 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) if (pipe->bandwidth) t = SET_TICKS(m, q, pipe); q->sched_time = curr_time ; - if (t == 0) /* must process it now */ + if (t == 0) /* must process it now */ ready_event( q , &head, &tail ); else heap_insert(&ready_heap, curr_time + t , q ); @@ -1653,9 +1780,10 @@ done: } lck_mtx_unlock(dn_mutex); - - if (head != NULL) + + if (head != NULL) { dummynet_send(head); + } return 0; @@ -1675,9 +1803,9 @@ dropit: struct m_tag *tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL); \ if (tag) { \ struct dn_pkt_tag *n = (struct dn_pkt_tag *)(tag+1); \ - if (n->ro.ro_rt != NULL) { \ - rtfree(n->ro.ro_rt); \ - n->ro.ro_rt = NULL; \ + if (n->dn_ro.ro_rt != NULL) { \ + rtfree(n->dn_ro.ro_rt); \ + n->dn_ro.ro_rt = NULL; \ } \ } \ m_tag_delete(_m, tag); \ @@ -1761,9 +1889,11 @@ dummynet_flush(void) lck_mtx_lock(dn_mutex); - /* remove all references to pipes ...*/ - flush_pipe_ptrs(NULL); - +#if IPFW2 + /* remove all references to pipes ...*/ + flush_pipe_ptrs(NULL); +#endif /* IPFW2 */ + /* Free heaps so we don't have unwanted events. */ heap_free(&ready_heap); heap_free(&wfq_ready_heap); @@ -1789,9 +1919,8 @@ dummynet_flush(void) } -extern struct ip_fw *ip_fw_default_rule ; static void -dn_rule_delete_fs(struct dn_flow_set *fs, void *r) +dn_ipfw_rule_delete_fs(struct dn_flow_set *fs, void *r) { int i ; struct dn_flow_queue *q ; @@ -1801,8 +1930,8 @@ dn_rule_delete_fs(struct dn_flow_set *fs, void *r) for (q = fs->rq[i] ; q ; q = q->next ) for (m = q->head ; m ; m = m->m_nextpkt ) { struct dn_pkt_tag *pkt = dn_tag_get(m) ; - if (pkt->rule == r) - pkt->rule = ip_fw_default_rule ; + if (pkt->dn_ipfw_rule == r) + pkt->dn_ipfw_rule = &default_rule ; } } /* @@ -1810,7 +1939,7 @@ dn_rule_delete_fs(struct dn_flow_set *fs, void *r) * from packets matching this rule. 
*/ void -dn_rule_delete(void *r) +dn_ipfw_rule_delete(void *r) { struct dn_pipe *p ; struct dn_flow_set *fs ; @@ -1827,16 +1956,16 @@ dn_rule_delete(void *r) */ for (i = 0; i < HASHSIZE; i++) SLIST_FOREACH(fs, &flowsethash[i], next) - dn_rule_delete_fs(fs, r); + dn_ipfw_rule_delete_fs(fs, r); for (i = 0; i < HASHSIZE; i++) SLIST_FOREACH(p, &pipehash[i], next) { fs = &(p->fs); - dn_rule_delete_fs(fs, r); + dn_ipfw_rule_delete_fs(fs, r); for (m = p->head ; m ; m = m->m_nextpkt ) { pkt = dn_tag_get(m); - if (pkt->rule == r) - pkt->rule = ip_fw_default_rule; + if (pkt->dn_ipfw_rule == r) + pkt->dn_ipfw_rule = &default_rule; } } lck_mtx_unlock(dn_mutex); @@ -1933,9 +2062,9 @@ set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src) x->qsize = 1024*1024 ; } else { if (x->qsize == 0) - x->qsize = 50; + x->qsize = 50 ; if (x->qsize > 100) - x->qsize = 50; + x->qsize = 50 ; } /* configuring RED */ if ( x->flags_fs & DN_IS_RED ) @@ -2161,8 +2290,10 @@ delete_pipe(struct dn_pipe *p) /* Unlink from list of pipes. */ SLIST_REMOVE(&pipehash[HASH(b->pipe_nr)], b, dn_pipe, next); +#if IPFW2 /* remove references to this pipe from the ip_fw rules. */ flush_pipe_ptrs(&(b->fs)); +#endif /* IPFW2 */ /* Remove all references to this pipe from flow_sets. */ for (i = 0; i < HASHSIZE; i++) @@ -2193,8 +2324,10 @@ delete_pipe(struct dn_pipe *p) return EINVAL ; /* not found */ } +#if IPFW2 /* remove references to this flow_set from the ip_fw rules. */ flush_pipe_ptrs(b); +#endif /* IPFW2 */ /* Unlink from list of flowsets. */ SLIST_REMOVE( &flowsethash[HASH(b->fs_nr)], b, dn_flow_set, next); @@ -2443,21 +2576,30 @@ ip_dn_init(void) dn_mutex_grp_attr = lck_grp_attr_alloc_init(); dn_mutex_grp = lck_grp_alloc_init("dn", dn_mutex_grp_attr); dn_mutex_attr = lck_attr_alloc_init(); - - if ((dn_mutex = lck_mtx_alloc_init(dn_mutex_grp, dn_mutex_attr)) == NULL) { - printf("ip_dn_init: can't alloc dn_mutex\n"); - return; - } + lck_mtx_init(dn_mutex, dn_mutex_grp, dn_mutex_attr); ready_heap.size = ready_heap.elements = 0 ; - ready_heap.offset = 0 ; + ready_heap.offset = 0 ; + + wfq_ready_heap.size = wfq_ready_heap.elements = 0 ; + wfq_ready_heap.offset = 0 ; - wfq_ready_heap.size = wfq_ready_heap.elements = 0 ; - wfq_ready_heap.offset = 0 ; + extract_heap.size = extract_heap.elements = 0 ; + extract_heap.offset = 0 ; + ip_dn_ctl_ptr = ip_dn_ctl; + ip_dn_io_ptr = dummynet_io; - extract_heap.size = extract_heap.elements = 0 ; - extract_heap.offset = 0 ; - ip_dn_ctl_ptr = ip_dn_ctl; - ip_dn_io_ptr = dummynet_io; - ip_dn_ruledel_ptr = dn_rule_delete; + bzero(&default_rule, sizeof default_rule); + + default_rule.act_ofs = 0; + default_rule.rulenum = IPFW_DEFAULT_RULE; + default_rule.cmd_len = 1; + default_rule.set = RESVD_SET; + + default_rule.cmd[0].len = 1; + default_rule.cmd[0].opcode = +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT + 1 ? O_ACCEPT : +#endif + O_DENY; } diff --git a/bsd/netinet/ip_dummynet.h b/bsd/netinet/ip_dummynet.h index e5dd1f337..b55a36b93 100644 --- a/bsd/netinet/ip_dummynet.h +++ b/bsd/netinet/ip_dummynet.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -60,6 +60,16 @@ #include #ifdef PRIVATE + +#include + +/* Apply ipv6 mask on ipv6 addr */ +#define APPLY_MASK(addr,mask) \ + (addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \ + (addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \ + (addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \ + (addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3]; + /* * Definition of dummynet data structures. In the structures, I decided * not to use the macros in in the hope of making the code @@ -147,20 +157,48 @@ struct dn_heap { */ #ifdef KERNEL #include /* for ip_out_args */ +#include /* for ip6_out_args */ +#include /* for ip6_out_args */ struct dn_pkt_tag { - struct ip_fw *rule; /* matching rule */ - int dn_dir; /* action when packet comes out. */ + struct ip_fw *dn_ipfw_rule; /* matching IPFW rule */ + void *dn_pf_rule; /* matching PF rule */ + int dn_dir; /* action when packet comes out. */ #define DN_TO_IP_OUT 1 #define DN_TO_IP_IN 2 #define DN_TO_BDG_FWD 3 - - dn_key output_time; /* when the pkt is due for delivery */ - struct ifnet *ifp; /* interface, for ip_output */ - struct sockaddr_in dn_dst ; - struct route ro; /* route, for ip_output. MUST COPY */ - int flags ; /* flags, for ip_output (IPv6 ?) */ - struct ip_out_args ipoa; /* output args, for ip_output. MUST COPY */ +#define DN_TO_IP6_IN 4 +#define DN_TO_IP6_OUT 5 + dn_key dn_output_time; /* when the pkt is due for delivery */ + struct ifnet *dn_ifp; /* interface, for ip[6]_output */ + union { + struct sockaddr_in _dn_dst; + struct sockaddr_in6 _dn_dst6 ; + } dn_dst_; +#define dn_dst dn_dst_._dn_dst +#define dn_dst6 dn_dst_._dn_dst6 + union { + struct route _dn_ro; /* route, for ip_output. MUST COPY */ + struct route_in6 _dn_ro6; /* route, for ip6_output. MUST COPY */ + } dn_ro_; +#define dn_ro dn_ro_._dn_ro +#define dn_ro6 dn_ro_._dn_ro6 + struct route_in6 dn_ro6_pmtu; /* for ip6_output */ + struct ifnet *dn_origifp; /* for ip6_output */ + u_int32_t dn_mtu; /* for ip6_output */ + int dn_alwaysfrag; /* for ip6_output */ + u_int32_t dn_unfragpartlen; /* for ip6_output */ + struct ip6_exthdrs dn_exthdrs; /* for ip6_output */ + int dn_flags ; /* flags, for ip[6]_output */ + int dn_client; +#define DN_CLIENT_IPFW 1 +#define DN_CLIENT_PF 2 + union { + struct ip_out_args _dn_ipoa; /* output args, for ip_output. MUST COPY */ + struct ip6_out_args _dn_ip6oa; /* output args, for ip_output. MUST COPY */ + } dn_ipoa_; +#define dn_ipoa dn_ipoa_._dn_ipoa +#define dn_ip6oa dn_ipoa_._dn_ip6oa }; #else struct dn_pkt; @@ -236,7 +274,7 @@ flow using a number of heaps defined into the pipe itself. 
*/ struct dn_flow_queue { struct dn_flow_queue *next ; - struct ipfw_flow_id id ; + struct ip_flow_id id ; struct mbuf *head, *tail ; /* queue of packets */ u_int len ; @@ -299,7 +337,7 @@ struct dn_flow_set { int qsize ; /* queue size in slots or bytes */ int plr ; /* pkt loss rate (2^31-1 means 100%) */ - struct ipfw_flow_id flow_mask ; + struct ip_flow_id flow_mask ; /* hash table of queues onto this flow_set */ int rq_size ; /* number of slots */ @@ -384,12 +422,11 @@ SLIST_HEAD(dn_pipe_head, dn_pipe); void ip_dn_init(void); /* called from raw_ip.c:load_ipfw() */ typedef int ip_dn_ctl_t(struct sockopt *); /* raw_ip.c */ -typedef void ip_dn_ruledel_t(void *); /* ip_fw.c */ typedef int ip_dn_io_t(struct mbuf *m, int pipe_nr, int dir, - struct ip_fw_args *fwa); + struct ip_fw_args *fwa, int ); extern ip_dn_ctl_t *ip_dn_ctl_ptr; -extern ip_dn_ruledel_t *ip_dn_ruledel_ptr; extern ip_dn_io_t *ip_dn_io_ptr; +void dn_ipfw_rule_delete(void *); #define DUMMYNET_LOADED (ip_dn_io_ptr != NULL) #pragma pack(4) @@ -403,7 +440,7 @@ struct dn_heap_32 { struct dn_flow_queue_32 { user32_addr_t next ; - struct ipfw_flow_id id ; + struct ip_flow_id id ; user32_addr_t head, tail ; /* queue of packets */ u_int len ; @@ -454,7 +491,7 @@ struct dn_flow_set_32 { int qsize ; /* queue size in slots or bytes */ int plr ; /* pkt loss rate (2^31-1 means 100%) */ - struct ipfw_flow_id flow_mask ; + struct ip_flow_id flow_mask ; /* hash table of queues onto this flow_set */ int rq_size ; /* number of slots */ @@ -528,7 +565,7 @@ struct dn_heap_64 { struct dn_flow_queue_64 { user64_addr_t next ; - struct ipfw_flow_id id ; + struct ip_flow_id id ; user64_addr_t head, tail ; /* queue of packets */ u_int len ; @@ -579,7 +616,7 @@ struct dn_flow_set_64 { int qsize ; /* queue size in slots or bytes */ int plr ; /* pkt loss rate (2^31-1 means 100%) */ - struct ipfw_flow_id flow_mask ; + struct ip_flow_id flow_mask ; /* hash table of queues onto this flow_set */ int rq_size ; /* number of slots */ @@ -654,7 +691,7 @@ ip_dn_claim_rule(struct mbuf *m) KERNEL_TAG_TYPE_DUMMYNET, NULL); if (mtag != NULL) { mtag->m_tag_type = KERNEL_TAG_TYPE_NONE; - return (((struct dn_pkt_tag *)(mtag+1))->rule); + return (((struct dn_pkt_tag *)(mtag+1))->dn_ipfw_rule); } else return (NULL); } diff --git a/bsd/netinet/ip_encap.c b/bsd/netinet/ip_encap.c index 0d487326a..db393938f 100644 --- a/bsd/netinet/ip_encap.c +++ b/bsd/netinet/ip_encap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000,2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -88,6 +88,7 @@ #include #include #include +#include #include #include #include @@ -171,6 +172,9 @@ encap4_input(m, off) va_end(ap); #endif + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + ip = mtod(m, struct ip *); #ifdef __APPLE__ proto = ip->ip_p; @@ -268,8 +272,10 @@ encap6_input(struct mbuf **mp, int *offp, int proto) struct encaptab *ep, *match; int prio, matchprio; - ip6 = mtod(m, struct ip6_hdr *); + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + ip6 = mtod(m, struct ip6_hdr *); bzero(&s, sizeof(s)); s.sin6_family = AF_INET6; s.sin6_len = sizeof(struct sockaddr_in6); diff --git a/bsd/netinet/ip_flowid.h b/bsd/netinet/ip_flowid.h new file mode 100644 index 000000000..1fe210311 --- /dev/null +++ b/bsd/netinet/ip_flowid.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. 
All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#ifndef __IP_FLOWID_H__ +#define __IP_FLOWID_H__ + +#include +#include + +/* + * This structure is used as a flow mask and a flow id for various + * parts of the code. + */ +struct ip_flow_id { + u_int32_t dst_ip; + u_int32_t src_ip; + u_int16_t dst_port; + u_int16_t src_port; + u_int8_t proto; + u_int8_t flags; /* protocol-specific flags */ + u_int8_t addr_type; /* 4 = ipv4, 6 = ipv6, 1=ether ? */ + struct in6_addr dst_ip6; /* could also store MAC addr! */ + struct in6_addr src_ip6; + u_int32_t flow_id6; + u_int32_t frag_id6; +}; + +#define IS_IP6_FLOW_ID(id) ((id)->addr_type == 6) + +#ifdef KERNEL +struct route_in6; +struct sockaddr_in6; +struct pf_rule; +struct ip_fw; + +/* + * Arguments for calling ipfw_chk() and dummynet_io(). 
We put them + * all into a structure because this way it is easier and more + * efficient to pass variables around and extend the interface. + */ +struct ip_fw_args { + struct mbuf *fwa_m; /* the mbuf chain */ + struct ifnet *fwa_oif; /* output interface */ + struct sockaddr_in *fwa_next_hop; /* forward address */ + struct ip_fw *fwa_ipfw_rule; /* matching IPFW rule */ + struct pf_rule *fwa_pf_rule; /* matching PF rule */ + struct ether_header *fwa_eh; /* for bridged packets */ + int fwa_flags; /* for dummynet */ + int fwa_oflags; /* for dummynet */ + union { + struct ip_out_args *_fwa_ipoa; /* for dummynet */ + struct ip6_out_args *_fwa_ip6oa; /* for dummynet */ + } fwa_ipoa_; + union { + struct route *_fwa_ro; /* for dummynet */ + struct route_in6 *_fwa_ro6; /* for dummynet */ + } fwa_ro_; + union { + struct sockaddr_in *_fwa_dst; /* for dummynet */ + struct sockaddr_in6 *_fwa_dst6; /* for IPv6 dummynet */ + } fwa_dst_; + struct route_in6 *fwa_ro6_pmtu; /* for IPv6 output */ + struct ifnet *fwa_origifp; /* for IPv6 output */ + u_int32_t fwa_mtu; /* for IPv6 output */ + int fwa_alwaysfrag; /* for IPv6 output */ + u_int32_t fwa_unfragpartlen; /* for IPv6 output */ + struct ip6_exthdrs *fwa_exthdrs; /* for IPv6 output */ + struct ip_flow_id fwa_id; /* grabbed from IP header */ + u_int16_t fwa_divert_rule;/* divert cookie */ + u_int32_t fwa_cookie; +}; +#define fwa_ipoa fwa_ipoa_._fwa_ipoa +#define fwa_ip6oa fwa_ipoa_._fwa_ip6oa +#define fwa_ro fwa_ro_._fwa_ro +#define fwa_ro6 fwa_ro_._fwa_ro6 +#define fwa_dst fwa_dst_._fwa_dst +#define fwa_dst6 fwa_dst_._fwa_dst6 + +#endif /* KERNEL */ + +#endif /* __IP_FLOWID_H__ */ diff --git a/bsd/netinet/ip_fw2.c b/bsd/netinet/ip_fw2.c index bb66be5d7..a8422fa31 100644 --- a/bsd/netinet/ip_fw2.c +++ b/bsd/netinet/ip_fw2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2010 Apple Inc. All rights reserved. + * Copyright (c) 2004-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -71,6 +71,7 @@ #include #include #include +#include #include #include #include @@ -79,6 +80,7 @@ #include #include #include +#include #include #include @@ -140,8 +142,6 @@ int fw_verbose; static int verbose_limit; extern int fw_bypass; -#define IPFW_DEFAULT_RULE 65535 - #define IPFW_RULE_INACTIVE 1 /* @@ -301,14 +301,11 @@ static ip_fw_chk_t ipfw_chk; lck_grp_t *ipfw_mutex_grp; lck_grp_attr_t *ipfw_mutex_grp_attr; lck_attr_t *ipfw_mutex_attr; -lck_mtx_t *ipfw_mutex; +decl_lck_mtx_data(,ipfw_mutex_data); +lck_mtx_t *ipfw_mutex = &ipfw_mutex_data; extern void ipfwsyslog( int level, const char *format,...); -#if DUMMYNET -ip_dn_ruledel_t *ip_dn_ruledel_ptr = NULL; /* hook into dummynet */ -#endif /* DUMMYNET */ - #define KEV_LOG_SUBCLASS 10 #define IPFWLOGEVENT 0 @@ -350,7 +347,10 @@ void ipfwsyslog( int level, const char *format,...) 
ev_msg.event_code = IPFWLOGEVENT; /* get rid of the trailing \n */ - dptr[loglen-1] = 0; + if (loglen < msgsize) + dptr[loglen-1] = 0; + else + dptr[msgsize-1] = 0; pri = LOG_PRI(level); @@ -693,6 +693,19 @@ copyfrom64fw( struct ip_fw_64 *fw64, struct ip_fw *user_ip_fw, size_t copysize) return( sizeof(struct ip_fw) + cmdsize - 4); } +void +externalize_flow_id(struct ipfw_flow_id *dst, struct ip_flow_id *src); +void +externalize_flow_id(struct ipfw_flow_id *dst, struct ip_flow_id *src) +{ + dst->dst_ip = src->dst_ip; + dst->src_ip = src->src_ip; + dst->dst_port = src->dst_port; + dst->src_port = src->src_port; + dst->proto = src->proto; + dst->flags = src->flags; +} + static void cp_dyn_to_comp_32( struct ipfw_dyn_rule_compat_32 *dyn_rule_vers1, int *len) { @@ -704,8 +717,8 @@ void cp_dyn_to_comp_32( struct ipfw_dyn_rule_compat_32 *dyn_rule_vers1, int *len for (i = 0; i < curr_dyn_buckets; i++) { for ( p = ipfw_dyn_v[i] ; p != NULL ; p = p->next) { dyn_rule_vers1->chain = (user32_addr_t)(p->rule->rulenum); - dyn_rule_vers1->id = p->id; - dyn_rule_vers1->mask = p->id; + externalize_flow_id(&dyn_rule_vers1->id, &p->id); + externalize_flow_id(&dyn_rule_vers1->mask, &p->id); dyn_rule_vers1->type = p->dyn_type; dyn_rule_vers1->expire = p->expire; dyn_rule_vers1->pcnt = p->pcnt; @@ -739,8 +752,8 @@ void cp_dyn_to_comp_64( struct ipfw_dyn_rule_compat_64 *dyn_rule_vers1, int *len for (i = 0; i < curr_dyn_buckets; i++) { for ( p = ipfw_dyn_v[i] ; p != NULL ; p = p->next) { dyn_rule_vers1->chain = (user64_addr_t) p->rule->rulenum; - dyn_rule_vers1->id = p->id; - dyn_rule_vers1->mask = p->id; + externalize_flow_id(&dyn_rule_vers1->id, &p->id); + externalize_flow_id(&dyn_rule_vers1->mask, &p->id); dyn_rule_vers1->type = p->dyn_type; dyn_rule_vers1->expire = p->expire; dyn_rule_vers1->pcnt = p->pcnt; @@ -1239,7 +1252,7 @@ ipfw_log(struct ip_fw *f, u_int hlen, struct ether_header *eh, * and we want to find both in the same bucket. */ static __inline int -hash_packet(struct ipfw_flow_id *id) +hash_packet(struct ip_flow_id *id) { u_int32_t i; @@ -1355,7 +1368,7 @@ next: * lookup a dynamic rule. */ static ipfw_dyn_rule * -lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, +lookup_dyn_rule(struct ip_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { /* @@ -1527,7 +1540,7 @@ realloc_dynamic_table(void) * - "parent" rules for the above (O_LIMIT_PARENT). */ static ipfw_dyn_rule * -add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) +add_dyn_rule(struct ip_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) { ipfw_dyn_rule *r; int i; @@ -1585,7 +1598,7 @@ add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) * If the lookup fails, then install one. 
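 * Parents are the O_LIMIT_PARENT entries created on behalf of O_LIMIT
 * rules; install_state() links each O_LIMIT child to its parent (see
 * the add_dyn_rule() call above) so the session limit is shared.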
*/ static ipfw_dyn_rule * -lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) +lookup_dyn_parent(struct ip_flow_id *pkt, struct ip_fw *rule) { ipfw_dyn_rule *q; int i; @@ -1629,10 +1642,10 @@ install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, DEB(printf("ipfw: install state type %d 0x%08x %u -> 0x%08x %u\n", cmd->o.opcode, - (args->f_id.src_ip), (args->f_id.src_port), - (args->f_id.dst_ip), (args->f_id.dst_port) );) + (args->fwa_id.src_ip), (args->fwa_id.src_port), + (args->fwa_id.dst_ip), (args->fwa_id.dst_port) );) - q = lookup_dyn_rule(&args->f_id, NULL, NULL); + q = lookup_dyn_rule(&args->fwa_id, NULL, NULL); if (q != NULL) { /* should never occur */ if (last_log != timenow.tv_sec) { @@ -1658,13 +1671,13 @@ install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, switch (cmd->o.opcode) { case O_KEEP_STATE: /* bidir rule */ - add_dyn_rule(&args->f_id, O_KEEP_STATE, rule); + add_dyn_rule(&args->fwa_id, O_KEEP_STATE, rule); break; case O_LIMIT: /* limit number of sessions */ { u_int16_t limit_mask = cmd->limit_mask; - struct ipfw_flow_id id; + struct ip_flow_id id; ipfw_dyn_rule *parent; DEB(printf("ipfw: installing dyn-limit rule %d\n", @@ -1672,16 +1685,16 @@ install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, id.dst_ip = id.src_ip = 0; id.dst_port = id.src_port = 0; - id.proto = args->f_id.proto; + id.proto = args->fwa_id.proto; if (limit_mask & DYN_SRC_ADDR) - id.src_ip = args->f_id.src_ip; + id.src_ip = args->fwa_id.src_ip; if (limit_mask & DYN_DST_ADDR) - id.dst_ip = args->f_id.dst_ip; + id.dst_ip = args->fwa_id.dst_ip; if (limit_mask & DYN_SRC_PORT) - id.src_port = args->f_id.src_port; + id.src_port = args->fwa_id.src_port; if (limit_mask & DYN_DST_PORT) - id.dst_port = args->f_id.dst_port; + id.dst_port = args->fwa_id.dst_port; parent = lookup_dyn_parent(&id, rule); if (parent == NULL) { printf("ipfw: add parent failed\n"); @@ -1701,14 +1714,14 @@ install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, return 1; } } - add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent); + add_dyn_rule(&args->fwa_id, O_LIMIT, (struct ip_fw *)parent); } break; default: printf("ipfw: unknown dynamic rule type %u\n", cmd->o.opcode); return 1; } - lookup_dyn_rule(&args->f_id, NULL, NULL); /* XXX just set lifetime */ + lookup_dyn_rule(&args->fwa_id, NULL, NULL); /* XXX just set lifetime */ return 0; } @@ -1719,7 +1732,7 @@ install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, * Otherwise we are sending a keepalive, and flags & TH_ */ static struct mbuf * -send_pkt(struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) +send_pkt(struct ip_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { struct mbuf *m; struct ip *ip; @@ -1803,35 +1816,35 @@ send_reject(struct ip_fw_args *args, int code, int offset, __unused int ip_len) if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ /* We need the IP header in host order for icmp_error(). 
*/ - if (args->eh != NULL) { - struct ip *ip = mtod(args->m, struct ip *); + if (args->fwa_eh != NULL) { + struct ip *ip = mtod(args->fwa_m, struct ip *); ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); } - args->m->m_flags |= M_SKIP_FIREWALL; - icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); - } else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) { + args->fwa_m->m_flags |= M_SKIP_FIREWALL; + icmp_error(args->fwa_m, ICMP_UNREACH, code, 0L, 0); + } else if (offset == 0 && args->fwa_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = - L3HDR(struct tcphdr, mtod(args->m, struct ip *)); + L3HDR(struct tcphdr, mtod(args->fwa_m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) { struct mbuf *m; - - m = send_pkt(&(args->f_id), ntohl(tcp->th_seq), + + m = send_pkt(&(args->fwa_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m != NULL) { struct route sro; /* fake route */ - + bzero (&sro, sizeof (sro)); ip_output_list(m, 0, NULL, &sro, 0, NULL, NULL); if (sro.ro_rt) RTFREE(sro.ro_rt); } } - m_freem(args->m); + m_freem(args->fwa_m); } else - m_freem(args->m); - args->m = NULL; + m_freem(args->fwa_m); + args->fwa_m = NULL; } /** @@ -1877,18 +1890,18 @@ lookup_next_rule(struct ip_fw *me) * * Parameters: * - * args->m (in/out) The packet; we set to NULL when/if we nuke it. + * args->fwa_m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. - * args->eh (in) Mac header if present, or NULL for layer3 packet. - * args->oif Outgoing interface, or NULL if packet is incoming. + * args->fwa_eh (in) Mac header if present, or NULL for layer3 packet. + * args->fwa_oif Outgoing interface, or NULL if packet is incoming. * The incoming interface is in the mbuf. (in) - * args->divert_rule (in/out) + * args->fwa_divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * - * args->rule Pointer to the last matching rule (in/out) - * args->next_hop Socket we are forwarding to (out). - * args->f_id Addresses grabbed from the packet (out) + * args->fwa_ipfw_rule Pointer to the last matching rule (in/out) + * args->fwa_next_hop Socket we are forwarding to (out). + * args->fwa_id Addresses grabbed from the packet (out) * * Return value: * @@ -1917,10 +1930,10 @@ ipfw_chk(struct ip_fw_args *args) * the implementation of the various instructions to make sure * that they still work. * - * args->eh The MAC header. It is non-null for a layer2 + * args->fwa_eh The MAC header. It is non-null for a layer2 * packet, it is NULL for a layer-3 packet. * - * m | args->m Pointer to the mbuf, as received from the caller. + * m | args->fwa_m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies @@ -1929,16 +1942,16 @@ ipfw_chk(struct ip_fw_args *args) * in sync with it (the packet is supposed to start with * the ip header). */ - struct mbuf *m = args->m; + struct mbuf *m = args->fwa_m; struct ip *ip = mtod(m, struct ip *); /* - * oif | args->oif If NULL, ipfw_chk has been called on the + * oif | args->fwa_oif If NULL, ipfw_chk has been called on the * inbound path (ether_input, bdg_forward, ip_input). * If non-NULL, ipfw_chk has been called on the outbound path * (ether_output, ip_output). 
*/ - struct ifnet *oif = args->oif; + struct ifnet *oif = args->fwa_oif; struct ip_fw *f = NULL; /* matching rule */ int retval = 0; @@ -2003,23 +2016,23 @@ ipfw_chk(struct ip_fw_args *args) */ pktlen = m->m_pkthdr.len; - if (args->eh == NULL || /* layer 3 packet */ + if (args->fwa_eh == NULL || /* layer 3 packet */ ( m->m_pkthdr.len >= sizeof(struct ip) && - ntohs(args->eh->ether_type) == ETHERTYPE_IP)) + ntohs(args->fwa_eh->ether_type) == ETHERTYPE_IP)) hlen = ip->ip_hl << 2; /* * Collect parameters into local variables for faster matching. */ if (hlen == 0) { /* do not grab addresses for non-ip pkts */ - proto = args->f_id.proto = 0; /* mark f_id invalid */ + proto = args->fwa_id.proto = 0; /* mark f_id invalid */ goto after_ip_checks; } - proto = args->f_id.proto = ip->ip_p; + proto = args->fwa_id.proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; - if (args->eh != NULL) { /* layer 2 packets are as on the wire */ + if (args->fwa_eh != NULL) { /* layer 2 packets are as on the wire */ offset = ntohs(ip->ip_off) & IP_OFFMASK; ip_len = ntohs(ip->ip_len); } else { @@ -2031,7 +2044,7 @@ ipfw_chk(struct ip_fw_args *args) #define PULLUP_TO(len) \ do { \ if ((m)->m_len < (len)) { \ - args->m = m = m_pullup(m, (len)); \ + args->fwa_m = m = m_pullup(m, (len)); \ if (m == 0) \ goto pullup_failed; \ ip = mtod(m, struct ip *); \ @@ -2048,7 +2061,7 @@ ipfw_chk(struct ip_fw_args *args) tcp = L3HDR(struct tcphdr, ip); dst_port = tcp->th_dport; src_port = tcp->th_sport; - args->f_id.flags = tcp->th_flags; + args->fwa_id.flags = tcp->th_flags; } break; @@ -2065,7 +2078,7 @@ ipfw_chk(struct ip_fw_args *args) case IPPROTO_ICMP: PULLUP_TO(hlen + 4); /* type, code and checksum. */ - args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type; + args->fwa_id.flags = L3HDR(struct icmp, ip)->icmp_type; break; default: @@ -2074,13 +2087,13 @@ ipfw_chk(struct ip_fw_args *args) #undef PULLUP_TO } - args->f_id.src_ip = ntohl(src_ip.s_addr); - args->f_id.dst_ip = ntohl(dst_ip.s_addr); - args->f_id.src_port = src_port = ntohs(src_port); - args->f_id.dst_port = dst_port = ntohs(dst_port); + args->fwa_id.src_ip = ntohl(src_ip.s_addr); + args->fwa_id.dst_ip = ntohl(dst_ip.s_addr); + args->fwa_id.src_port = src_port = ntohs(src_port); + args->fwa_id.dst_port = dst_port = ntohs(dst_port); after_ip_checks: - if (args->rule) { + if (args->fwa_ipfw_rule) { /* * Packet has already been tagged. Look for the next rule * to restart processing. @@ -2094,18 +2107,18 @@ after_ip_checks: return 0; } - f = args->rule->next_rule; + f = args->fwa_ipfw_rule->next_rule; if (f == NULL) - f = lookup_next_rule(args->rule); + f = lookup_next_rule(args->fwa_ipfw_rule); } else { /* * Find the starting rule. It can be either the first * one, or the one after divert_rule if asked so. */ - int skipto = args->divert_rule; + int skipto = args->fwa_divert_rule; f = layer3_chain; - if (args->eh == NULL && skipto != 0) { + if (args->fwa_eh == NULL && skipto != 0) { if (skipto >= IPFW_DEFAULT_RULE) { lck_mtx_unlock(ipfw_mutex); return(IP_FW_PORT_DENY_FLAG); /* invalid */ @@ -2118,7 +2131,7 @@ after_ip_checks: } } } - args->divert_rule = 0; /* reset to avoid confusion later */ + args->fwa_divert_rule = 0; /* reset to avoid confusion later */ /* * Now scan the rules, and parse microinstructions for each rule. 
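Since the hunks that follow only rename fields used inside this scan loop (f_id to fwa_id, eh to fwa_eh, and so on), a reduced sketch of the loop's shape may help; evaluate() and take_action() here are hypothetical stand-ins for the real opcode switch and the done: handling:

	struct ip_fw *f;
	ipfw_insn *cmd;
	int l, match;

	for (f = layer3_chain; f != NULL; f = f->next) {
		match = 0;
		/* a rule matches only if every microinstruction does */
		for (l = f->cmd_len, cmd = f->cmd; l > 0;
		    l -= F_LEN(cmd), cmd += F_LEN(cmd)) {
			match = evaluate(cmd, args);	/* hypothetical */
			if (!match)
				break;		/* give up on this rule */
		}
		if (match)
			return (take_action(f, args));	/* hypothetical */
	}

Each microinstruction advances the cursor by its own length, which is why F_LEN() shows up wherever a rule body is walked.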
@@ -2224,7 +2237,7 @@ check_body: if (cmd->opcode == O_UID) { match = #ifdef __APPLE__ - (pcb->inp_socket->so_uid == (uid_t)((ipfw_insn_u32 *)cmd)->d[0]); + (kauth_cred_getuid(pcb->inp_socket->so_cred) == (uid_t)((ipfw_insn_u32 *)cmd)->d[0]); #else !socheckuid(pcb->inp_socket, (uid_t)((ipfw_insn_u32 *)cmd)->d[0]); @@ -2258,12 +2271,12 @@ check_body: break; case O_MACADDR2: - if (args->eh != NULL) { /* have MAC header */ + if (args->fwa_eh != NULL) { /* have MAC header */ u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; - u_int32_t *hdr = (u_int32_t *)args->eh; + u_int32_t *hdr = (u_int32_t *)args->fwa_eh; match = ( want[0] == (hdr[0] & mask[0]) && @@ -2273,9 +2286,9 @@ check_body: break; case O_MAC_TYPE: - if (args->eh != NULL) { + if (args->fwa_eh != NULL) { u_int16_t t = - ntohs(args->eh->ether_type); + ntohs(args->fwa_eh->ether_type); u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; @@ -2295,7 +2308,7 @@ check_body: break; case O_LAYER2: - match = (args->eh != NULL); + match = (args->fwa_eh != NULL); break; case O_PROTO: @@ -2341,8 +2354,8 @@ check_body: u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? - args->f_id.dst_ip : - args->f_id.src_ip; + args->fwa_id.dst_ip : + args->fwa_id.src_ip; if (addr < d[0]) break; @@ -2478,7 +2491,7 @@ check_body: case O_LOG: if (fw_verbose) - ipfw_log(f, hlen, args->eh, m, oif); + ipfw_log(f, hlen, args->fwa_eh, m, oif); match = 1; break; @@ -2564,7 +2577,7 @@ check_body: * to be run first). */ if (dyn_dir == MATCH_UNKNOWN && - (q = lookup_dyn_rule(&args->f_id, + (q = lookup_dyn_rule(&args->fwa_id, &dyn_dir, proto == IPPROTO_TCP ? L3HDR(struct tcphdr, ip) : NULL)) != NULL) { @@ -2596,15 +2609,15 @@ check_body: case O_PIPE: case O_QUEUE: - args->rule = f; /* report matching rule */ + args->fwa_ipfw_rule = f; /* report matching rule */ retval = cmd->arg1 | IP_FW_PORT_DYNT_FLAG; goto done; case O_DIVERT: case O_TEE: - if (args->eh) /* not on layer 2 */ + if (args->fwa_eh) /* not on layer 2 */ break; - args->divert_rule = f->rulenum; + args->fwa_divert_rule = f->rulenum; retval = (cmd->opcode == O_DIVERT) ? cmd->arg1 : cmd->arg1 | IP_FW_PORT_TEE_FLAG; @@ -2636,7 +2649,7 @@ check_body: !IN_MULTICAST(dst_ip.s_addr)) { send_reject(args, cmd->arg1, offset,ip_len); - m = args->m; + m = args->fwa_m; } /* FALLTHROUGH */ case O_DENY: @@ -2644,10 +2657,10 @@ check_body: goto done; case O_FORWARD_IP: - if (args->eh) /* not valid on layer2 pkts */ + if (args->fwa_eh) /* not valid on layer2 pkts */ break; if (!q || dyn_dir == MATCH_FORWARD) - args->next_hop = + args->fwa_next_hop = &((ipfw_insn_sa *)cmd)->sa; retval = 0; goto done; @@ -2843,7 +2856,7 @@ delete_rule(struct ip_fw **head, struct ip_fw *prev, struct ip_fw *rule) #if DUMMYNET if (DUMMYNET_LOADED) - ip_dn_ruledel_ptr(rule); + dn_ipfw_rule_delete(rule); #endif /* DUMMYNET */ _FREE(rule, M_IPFW); return n; @@ -3505,7 +3518,7 @@ ipfw_ctl(struct sockopt *sopt) ipfw_dyn_dst->parent = CAST_DOWN(user64_addr_t, p->parent); ipfw_dyn_dst->pcnt = p->pcnt; ipfw_dyn_dst->bcnt = p->bcnt; - ipfw_dyn_dst->id = p->id; + externalize_flow_id(&ipfw_dyn_dst->id, &p->id); ipfw_dyn_dst->expire = TIME_LEQ(p->expire, timenow.tv_sec) ? 
0 : p->expire - timenow.tv_sec; @@ -3531,7 +3544,7 @@ ipfw_ctl(struct sockopt *sopt) ipfw_dyn_dst->parent = CAST_DOWN_EXPLICIT(user32_addr_t, p->parent); ipfw_dyn_dst->pcnt = p->pcnt; ipfw_dyn_dst->bcnt = p->bcnt; - ipfw_dyn_dst->id = p->id; + externalize_flow_id(&ipfw_dyn_dst->id, &p->id); ipfw_dyn_dst->expire = TIME_LEQ(p->expire, timenow.tv_sec) ? 0 : p->expire - timenow.tv_sec; @@ -3915,7 +3928,7 @@ ipfw_tick(__unused void * unused) } } lck_mtx_unlock(ipfw_mutex); - + for (m = mnext = m0; m != NULL; m = mnext) { struct route sro; /* fake route */ @@ -3939,11 +3952,7 @@ ipfw_init(void) ipfw_mutex_grp_attr = lck_grp_attr_alloc_init(); ipfw_mutex_grp = lck_grp_alloc_init("ipfw", ipfw_mutex_grp_attr); ipfw_mutex_attr = lck_attr_alloc_init(); - - if ((ipfw_mutex = lck_mtx_alloc_init(ipfw_mutex_grp, ipfw_mutex_attr)) == NULL) { - printf("ipfw_init: can't alloc ipfw_mutex\n"); - return; - } + lck_mtx_init(ipfw_mutex, ipfw_mutex_grp, ipfw_mutex_attr); layer3_chain = NULL; diff --git a/bsd/netinet/ip_fw2.h b/bsd/netinet/ip_fw2.h index 5e093b170..10566531d 100644 --- a/bsd/netinet/ip_fw2.h +++ b/bsd/netinet/ip_fw2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -415,6 +415,40 @@ struct ipfw_flow_id { */ typedef struct _ipfw_dyn_rule ipfw_dyn_rule; +#ifdef XNU_KERNEL_PRIVATE + +#include + +/* + * Note: + * The internal version of "struct _ipfw_dyn_rule" differs from + * its external version because the field "id" is of type + * "struct ip_flow_id" in the internal version. The type of the + * field "id" for the external version is "ipfw_dyn_rule for + * backwards compatibility reasons. + */ + +struct _ipfw_dyn_rule { + ipfw_dyn_rule *next; /* linked list of rules. */ + struct ip_fw *rule; /* pointer to rule */ + /* 'rule' is used to pass up the rule number (from the parent) */ + + ipfw_dyn_rule *parent; /* pointer to parent rule */ + u_int64_t pcnt; /* packet match counter */ + u_int64_t bcnt; /* byte match counter */ + struct ip_flow_id id; /* (masked) flow id */ + u_int32_t expire; /* expire time */ + u_int32_t bucket; /* which bucket in hash table */ + u_int32_t state; /* state of this rule (typically a + * combination of TCP flags) + */ + u_int32_t ack_fwd; /* most recent ACKs in forward */ + u_int32_t ack_rev; /* and reverse directions (used */ + /* to generate keepalives) */ + u_int16_t dyn_type; /* rule type */ + u_int16_t count; /* refcount */ +}; +#else /* XNU_KERNEL_PRIVATE */ struct _ipfw_dyn_rule { ipfw_dyn_rule *next; /* linked list of rules. */ struct ip_fw *rule; /* pointer to rule */ @@ -435,6 +469,7 @@ struct _ipfw_dyn_rule { u_int16_t dyn_type; /* rule type */ u_int16_t count; /* refcount */ }; +#endif /* XNU_KERNEL_PRIVATE */ /* * Definitions for IP option names. @@ -585,35 +620,20 @@ typedef struct _ipfw_insn_pipe_32{ #endif /* KERNEL */ #ifdef KERNEL + +#define IPFW_DEFAULT_RULE 65535 + #if IPFIREWALL #define IP_FW_PORT_DYNT_FLAG 0x10000 #define IP_FW_PORT_TEE_FLAG 0x20000 #define IP_FW_PORT_DENY_FLAG 0x40000 -/* - * Arguments for calling ipfw_chk() and dummynet_io(). We put them - * all into a structure because this way it is easier and more - * efficient to pass variables around and extend the interface. 
- */ -struct ip_fw_args { - struct mbuf *m; /* the mbuf chain */ - struct ifnet *oif; /* output interface */ - struct sockaddr_in *next_hop; /* forward address */ - struct ip_fw *rule; /* matching rule */ - struct ether_header *eh; /* for bridged packets */ - - struct route *ro; /* for dummynet */ - struct sockaddr_in *dst; /* for dummynet */ - int flags; /* for dummynet */ - struct ip_out_args *ipoa; /* for dummynet */ - - struct ipfw_flow_id f_id; /* grabbed from IP header */ - u_int16_t divert_rule; /* divert cookie */ - u_int32_t retval; -}; -//struct ip_fw_args; - +#ifdef PRIVATE +#include +#else +struct ip_fw_args; +#endif /* * Function definitions. */ diff --git a/bsd/netinet/ip_fw2_compat.c b/bsd/netinet/ip_fw2_compat.c index 712f49241..1022e03f1 100644 --- a/bsd/netinet/ip_fw2_compat.c +++ b/bsd/netinet/ip_fw2_compat.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 Apple Inc. All rights reserved. + * Copyright (c) 2004-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1515,7 +1515,7 @@ ipfw_version_latest_to_one_32(struct ip_fw_32 *curr_rule, struct ip_fw_compat_32 if (!rule_vers1) return; - bzero(rule_vers1, sizeof(struct ip_fw_compat)); + bzero(rule_vers1, sizeof(struct ip_fw_compat_32)); rule_vers1->version = IP_FW_VERSION_1; rule_vers1->context = CAST_DOWN_EXPLICIT(user32_addr_t,curr_rule->context); @@ -1541,7 +1541,7 @@ ipfw_version_latest_to_one_64(struct ip_fw_64 *curr_rule, struct ip_fw_compat_64 if (!rule_vers1) return; - bzero(rule_vers1, sizeof(struct ip_fw_compat)); + bzero(rule_vers1, sizeof(struct ip_fw_compat_64)); rule_vers1->version = IP_FW_VERSION_1; rule_vers1->context = CAST_DOWN_EXPLICIT(__uint64_t, curr_rule->context); diff --git a/bsd/netinet/ip_icmp.c b/bsd/netinet/ip_icmp.c index 48ea2f0f5..576067247 100644 --- a/bsd/netinet/ip_icmp.c +++ b/bsd/netinet/ip_icmp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -98,11 +99,6 @@ #if IPSEC #include #include -#endif - -#if defined(NFAITH) && NFAITH > 0 -#include "faith.h" -#include #endif /* XXX This one should go in sys/mbuf.h. It is used to avoid that @@ -144,22 +140,32 @@ SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW | CTLFLAG_LOCKED, &log_redirect, 0, ""); #if ICMP_BANDLIM - + +/* Default values in case CONFIG_ICMP_BANDLIM is not defined in the MASTER file */ +#ifndef CONFIG_ICMP_BANDLIM +#if !CONFIG_EMBEDDED +#define CONFIG_ICMP_BANDLIM 250 +#else /* CONFIG_EMBEDDED */ +#define CONFIG_ICMP_BANDLIM 50 +#endif /* CONFIG_EMBEDDED */ +#endif /* CONFIG_ICMP_BANDLIM */ + /* * ICMP error-response bandwidth limiting sysctl. If not enabled, sysctl * variable content is -1 and read-only. 
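 */

/*
 * Illustrative sketch (not from xnu): the kind of limiter that icmplim
 * feeds.  At most icmplim ICMP error responses are generated per second;
 * the counter/stamp names below are inventions for the example.
 */
static int icmp_bandlim_count;
static u_int32_t icmp_bandlim_stamp;

static int
icmp_ratelimited(u_int32_t now)		/* now: current time, in seconds */
{
	if (now != icmp_bandlim_stamp) {	/* new one-second window */
		icmp_bandlim_stamp = now;
		icmp_bandlim_count = 0;
	}
	return (++icmp_bandlim_count > icmplim);	/* nonzero: suppress */
}

/*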
*/ -static int icmplim = 250; +static int icmplim = CONFIG_ICMP_BANDLIM; SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW | CTLFLAG_LOCKED, &icmplim, 0, ""); -#else + +#else /* ICMP_BANDLIM */ static int icmplim = -1; SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RD | CTLFLAG_LOCKED, &icmplim, 0, ""); -#endif +#endif /* ICMP_BANDLIM */ /* * ICMP broadcast echo sysctl @@ -192,11 +198,16 @@ icmp_error( u_int32_t nextmtu) { struct ip *oip = mtod(n, struct ip *), *nip; - unsigned oiplen = IP_VHL_HL(oip->ip_vhl) << 2; + unsigned oiplen; struct icmp *icp; struct mbuf *m; unsigned icmplen; + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(n); + + oiplen = IP_VHL_HL(oip->ip_vhl) << 2; + #if ICMPPRINTFS if (icmpprintfs) printf("icmp_error(%p, %x, %d)\n", oip, type, code); @@ -212,7 +223,8 @@ icmp_error( goto freeit; if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && n->m_len >= oiplen + ICMP_MINLEN && - !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiplen))->icmp_type)) { + !ICMP_INFOTYPE(((struct icmp *)(void *)((caddr_t)oip + oiplen))-> + icmp_type)) { icmpstat.icps_oldicmp++; goto freeit; } @@ -312,12 +324,17 @@ icmp_input(struct mbuf *m, int hlen) { struct icmp *icp; struct ip *ip = mtod(m, struct ip *); - int icmplen = ip->ip_len; + int icmplen; int i; struct in_ifaddr *ia; void (*ctlfunc)(int, struct sockaddr *, void *); int code; + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + + icmplen = ip->ip_len; + /* * Locate icmp structure in mbuf, and check * that not corrupted and of at least minimum length. @@ -353,21 +370,6 @@ icmp_input(struct mbuf *m, int hlen) m->m_len += hlen; m->m_data -= hlen; -#if defined(NFAITH) && 0 < NFAITH - if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { - /* - * Deliver very specific ICMP type only. 
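 */

/*
 * MBUF_STRICT_DATA_ALIGNMENT_CHECK_32() is defined outside this patch;
 * judging from the IP_HDR_STRICT_ALIGNMENT_CHECK() macro added to ip_var.h
 * further below, a plausible expansion (an assumption, not the actual
 * header) is a no-op on i386/x86_64 and a panic on strict-alignment
 * platforms:
 */
#if defined(__i386__) || defined(__x86_64__)
#define MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m) do { } while (0)
#else
#define MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m) do {			\
	if ((((uintptr_t)mtod((m), caddr_t)) & 3) != 0)			\
		panic("%s: mbuf %p data is not 32-bit aligned",		\
		    __func__, (m));					\
} while (0)
#endif

/*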
- */ - switch (icp->icmp_type) { - case ICMP_UNREACH: - case ICMP_TIMXCEED: - break; - default: - goto freeit; - } - } -#endif - #if ICMPPRINTFS if (icmpprintfs) printf("icmp_input, type %d code %d\n", icp->icmp_type, @@ -514,7 +516,6 @@ icmp_input(struct mbuf *m, int hlen) goto reflect; case ICMP_MASKREQ: -#define satosin(sa) ((struct sockaddr_in *)(sa)) if (icmpmaskrepl == 0) break; /* @@ -810,10 +811,13 @@ icmp_send(struct mbuf *m, struct mbuf *opts) int hlen; struct icmp *icp; struct route ro; - struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; + struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, + IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR }; - if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) + if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) { ipoa.ipoa_boundif = m->m_pkthdr.rcvif->if_index; + ipoa.ipoa_flags |= IPOAF_BOUND_IF; + } hlen = IP_VHL_HL(ip->ip_vhl) << 2; m->m_data += hlen; @@ -1059,9 +1063,6 @@ icmp_dgram_ctloutput(struct socket *so, struct sockopt *sopt) case IP_PORTRANGE: case IP_RECVIF: case IP_IPSEC_POLICY: -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: -#endif case IP_STRIPHDR: case IP_RECVTTL: case IP_BOUND_IF: @@ -1092,6 +1093,8 @@ icmp_dgram_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *n int icmplen; if ((inp->inp_flags & INP_HDRINCL) != 0) { + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); /* * This is not raw IP, we are liberal only for the TOS, id and TTL fields */ @@ -1141,8 +1144,8 @@ icmp_dgram_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *n ours: /* Do not trust we got a valid checksum */ ip->ip_sum = 0; - - icp = (struct icmp *)(((char *)m->m_data) + hlen); + + icp = (struct icmp *)(void *)(((char *)m->m_data) + hlen); icmplen = m->m_pkthdr.len - hlen; } else { if ((icmplen = m->m_pkthdr.len) < ICMP_MINLEN) { diff --git a/bsd/netinet/ip_input.c b/bsd/netinet/ip_input.c index 761b4b40c..6953044bd 100644 --- a/bsd/netinet/ip_input.c +++ b/bsd/netinet/ip_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
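 */

/*
 * Sketch of the caller-side pattern introduced above in icmp_send(): with
 * the reworked three-field struct ip_out_args, a bound interface index is
 * only honored when IPOAF_BOUND_IF accompanies it, so the two are always
 * set together.  m, ro and ifp below stand in for a real caller's state:
 */
struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF };

ipoa.ipoa_boundif = ifp->if_index;	/* restrict output to this interface */
ipoa.ipoa_flags |= IPOAF_BOUND_IF;
error = ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);

/*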
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -135,11 +135,6 @@ #include #endif -#include "faith.h" -#if defined(NFAITH) && NFAITH > 0 -#include -#endif - #if DUMMYNET #include #endif @@ -148,6 +143,8 @@ #include #endif /* PF */ +#include + #if IPSEC extern int ipsec_bypass; extern lck_mtx_t *sadb_mutex; @@ -155,7 +152,8 @@ extern lck_mtx_t *sadb_mutex; lck_grp_t *sadb_stat_mutex_grp; lck_grp_attr_t *sadb_stat_mutex_grp_attr; lck_attr_t *sadb_stat_mutex_attr; -lck_mtx_t *sadb_stat_mutex; +decl_lck_mtx_data(, sadb_stat_mutex_data); +lck_mtx_t *sadb_stat_mutex = &sadb_stat_mutex_data; #endif @@ -188,9 +186,8 @@ SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, "Enable accepting source routed IP packets"); static int ip_keepfaith = 0; -SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW | CTLFLAG_LOCKED, - &ip_keepfaith, 0, - "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); +SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RD | CTLFLAG_LOCKED, + &ip_keepfaith, 0, ""); static int nipq = 0; /* total # of reass queues */ static int maxnipq; @@ -213,6 +210,10 @@ int ip_doscopedroute = 1; SYSCTL_INT(_net_inet_ip, OID_AUTO, scopedroute, CTLFLAG_RD | CTLFLAG_LOCKED, &ip_doscopedroute, 0, "Enable IPv4 scoped routing"); +int ip_restrictrecvif = 1; +SYSCTL_INT(_net_inet_ip, OID_AUTO, restrictrecvif, CTLFLAG_RW | CTLFLAG_LOCKED, + &ip_restrictrecvif, 0, "Enable inbound interface restrictions"); + /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table @@ -230,7 +231,6 @@ static int ip_checkinterface = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_checkinterface, 0, "Verify packet arrives on correct interface"); - #if DIAGNOSTIC static int ipprintfs = 0; #endif @@ -244,7 +244,8 @@ static int ipqmaxlen = IFQ_MAXLEN; static lck_grp_attr_t *in_ifaddr_rwlock_grp_attr; static lck_grp_t *in_ifaddr_rwlock_grp; static lck_attr_t *in_ifaddr_rwlock_attr; -lck_rw_t *in_ifaddr_rwlock; +decl_lck_rw_data(, in_ifaddr_rwlock_data); +lck_rw_t *in_ifaddr_rwlock = &in_ifaddr_rwlock_data; /* Protected by in_ifaddr_rwlock */ struct in_ifaddrhead in_ifaddrhead; /* first inet address */ @@ -280,7 +281,6 @@ lck_attr_t *ip_mutex_attr; lck_grp_t *ip_mutex_grp; lck_grp_attr_t *ip_mutex_grp_attr; lck_mtx_t *inet_domain_mutex; -extern lck_mtx_t *domain_proto_mtx; #if IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW | CTLFLAG_LOCKED, @@ -300,14 +300,12 @@ ip_fw_chk_t *ip_fw_chk_ptr; int fw_enable = 1; int fw_bypass = 1; int fw_one_pass = 0; +#endif /* IPFIREWALL */ #if DUMMYNET ip_dn_io_t *ip_dn_io_ptr; #endif -int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **) = NULL; -#endif /* IPFIREWALL */ - SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local"); struct ip_linklocal_stat ip_linklocal_stat; @@ -358,6 +356,7 @@ static void ip_fwd_route_copyout(struct ifnet *, struct route *); static void ip_fwd_route_copyin(struct ifnet *, struct route *); void ipintr(void); void in_dinit(void); +static inline u_short ip_cksum(struct mbuf *, int); #if RANDOM_IP_ID extern u_short ip_id; @@ -367,8 +366,37 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_use_randomid, 0, "Randomize IP packets IDs"); #endif -#define satosin(sa) ((struct sockaddr_in *)(sa)) -#define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) +/* + * On platforms which require strict 
alignment (currently for anything but + * i386 or x86_64), check if the IP header pointer is 32-bit aligned; if not, + * copy the contents of the mbuf chain into a new chain, and free the original + * one. Create some head room in the first mbuf of the new chain, in case + * it's needed later on. + */ +#if defined(__i386__) || defined(__x86_64__) +#define IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do { } while (0) +#else /* !__i386__ && !__x86_64__ */ +#define IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do { \ + if (!IP_HDR_ALIGNED_P(mtod(_m, caddr_t))) { \ + struct mbuf *_n; \ + struct ifnet *__ifp = (_ifp); \ + atomic_add_64(&(__ifp)->if_alignerrs, 1); \ + if (((_m)->m_flags & M_PKTHDR) && \ + (_m)->m_pkthdr.header != NULL) \ + (_m)->m_pkthdr.header = NULL; \ + _n = m_defrag_offset(_m, max_linkhdr, M_NOWAIT); \ + if (_n == NULL) { \ + atomic_add_32(&ipstat.ips_toosmall, 1); \ + m_freem(_m); \ + (_m) = NULL; \ + _action \ + } else { \ + VERIFY(_n != (_m)); \ + (_m) = _n; \ + } \ + } \ +} while (0) +#endif /* !__i386__ && !__x86_64__ */ /* * IP initialization: fill in IP protocol switch table. @@ -392,7 +420,7 @@ ip_init(void) in_ifaddr_rwlock_grp = lck_grp_alloc_init("in_ifaddr_rwlock", in_ifaddr_rwlock_grp_attr); in_ifaddr_rwlock_attr = lck_attr_alloc_init(); - in_ifaddr_rwlock = lck_rw_alloc_init(in_ifaddr_rwlock_grp, + lck_rw_init(in_ifaddr_rwlock, in_ifaddr_rwlock_grp, in_ifaddr_rwlock_attr); TAILQ_INIT(&in_ifaddrhead); @@ -447,10 +475,7 @@ ip_init(void) sadb_stat_mutex_grp = lck_grp_alloc_init("sadb_stat", sadb_stat_mutex_grp_attr); sadb_stat_mutex_attr = lck_attr_alloc_init(); - if ((sadb_stat_mutex = lck_mtx_alloc_init(sadb_stat_mutex_grp, sadb_stat_mutex_attr)) == NULL) { - printf("ip_init: can't alloc sadb_stat_mutex\n"); - return; - } + lck_mtx_init(sadb_stat_mutex, sadb_stat_mutex_grp, sadb_stat_mutex_attr); #endif arp_init(); @@ -540,7 +565,6 @@ in_dinit(void) if (!inetdomain_initted) { - /* kprintf("Initing %d protosw entries\n", in_proto_count); */ dp = &inetdomain; dp->dom_flags = DOM_REENTRANT; @@ -549,18 +573,21 @@ in_dinit(void) inet_domain_mutex = dp->dom_mtx; inetdomain_initted = 1; - lck_mtx_unlock(domain_proto_mtx); + domain_proto_mtx_unlock(TRUE); proto_register_input(PF_INET, ip_proto_input, NULL, 1); - lck_mtx_lock(domain_proto_mtx); + domain_proto_mtx_lock(); } } +void +ip_proto_dispatch_in_wrapper(struct mbuf *m, int hlen, u_int8_t proto) +{ + ip_proto_dispatch_in(m, hlen, proto, 0); +} + __private_extern__ void -ip_proto_dispatch_in( - struct mbuf *m, - int hlen, - u_int8_t proto, - ipfilter_t inject_ipfref) +ip_proto_dispatch_in(struct mbuf *m, int hlen, u_int8_t proto, + ipfilter_t inject_ipfref) { struct ipfilter *filter; int seen = (inject_ipfref == 0); @@ -568,7 +595,7 @@ ip_proto_dispatch_in( struct ip *ip; void (*pr_input)(struct mbuf *, int len); - if (!TAILQ_EMPTY(&ipv4_filters)) { + if (!TAILQ_EMPTY(&ipv4_filters)) { ipf_ref(); TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { if (seen == 0) { @@ -576,8 +603,17 @@ ip_proto_dispatch_in( seen = 1; } else if (filter->ipf_filter.ipf_input) { errno_t result; - + if (changed_header == 0) { + /* + * Perform IP header alignment fixup, + * if needed, before passing packet + * into filter(s). 
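 */

/*
 * The macro's third argument is an arbitrary statement executed after the
 * unfixable mbuf has been freed, which lets each call site unwind its own
 * state.  The three uses in this file, all visible in this patch:
 */
IP_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, goto bad;);	/* ip_input() */
IP_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, return;);		/* post-filter */
IP_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, ipf_unref(); return;); /* in-filter */

/*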
+ */ + IP_HDR_ALIGNMENT_FIXUP(m, + m->m_pkthdr.rcvif, + ipf_unref(); return;); + changed_header = 1; ip = mtod(m, struct ip *); ip->ip_len = htons(ip->ip_len + hlen); @@ -586,7 +622,8 @@ ip_proto_dispatch_in( ip->ip_sum = in_cksum(m, hlen); } result = filter->ipf_filter.ipf_input( - filter->ipf_filter.cookie, (mbuf_t*)&m, hlen, proto); + filter->ipf_filter.cookie, (mbuf_t*)&m, + hlen, proto); if (result == EJUSTRETURN) { ipf_unref(); return; @@ -596,10 +633,14 @@ ip_proto_dispatch_in( m_freem(m); return; } - } + } } ipf_unref(); } + + /* Perform IP header alignment fixup (post-filters), if needed */ + IP_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, return;); + /* * If there isn't a specific lock for the protocol * we're about to call, use the generic lock for AF_INET. @@ -633,12 +674,14 @@ ip_input(struct mbuf *m) struct ip *ip; struct ipq *fp; struct in_ifaddr *ia = NULL; - int hlen, checkif; - u_short sum; + unsigned int hlen, checkif; + u_short sum = 0; struct in_addr pkt_dst; #if IPFIREWALL int i; u_int32_t div_info = 0; /* packet divert/tee info */ +#endif +#if IPFIREWALL || DUMMYNET struct ip_fw_args args; struct m_tag *tag; #endif @@ -647,12 +690,11 @@ ip_input(struct mbuf *m) /* Check if the mbuf is still valid after interface filter processing */ MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif); -#if IPFIREWALL - args.eh = NULL; - args.oif = NULL; - args.rule = NULL; - args.divert_rule = 0; /* divert cookie */ - args.next_hop = NULL; + /* Perform IP header alignment fixup, if needed */ + IP_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, goto bad;); + +#if IPFIREWALL || DUMMYNET + bzero(&args, sizeof(struct ip_fw_args)); /* * Don't bother searching for tag(s) if there's none. @@ -667,7 +709,8 @@ ip_input(struct mbuf *m) struct dn_pkt_tag *dn_tag; dn_tag = (struct dn_pkt_tag *)(tag+1); - args.rule = dn_tag->rule; + args.fwa_ipfw_rule = dn_tag->dn_ipfw_rule; + args.fwa_pf_rule = dn_tag->dn_pf_rule; m_tag_delete(m, tag); } @@ -679,7 +722,7 @@ ip_input(struct mbuf *m) struct divert_tag *div_tag; div_tag = (struct divert_tag *)(tag+1); - args.divert_rule = div_tag->cookie; + args.fwa_divert_rule = div_tag->cookie; m_tag_delete(m, tag); } @@ -690,7 +733,7 @@ ip_input(struct mbuf *m) struct ip_fwd_tag *ipfwd_tag; ipfwd_tag = (struct ip_fwd_tag *)(tag+1); - args.next_hop = ipfwd_tag->next_hop; + args.fwa_next_hop = ipfwd_tag->next_hop; m_tag_delete(m, tag); } @@ -700,17 +743,24 @@ ip_input(struct mbuf *m) panic("ip_input no HDR"); #endif - if (args.rule) { /* dummynet already filtered us */ +#if DUMMYNET + if (args.fwa_ipfw_rule || args.fwa_pf_rule) { /* dummynet already filtered us */ ip = mtod(m, struct ip *); hlen = IP_VHL_HL(ip->ip_vhl) << 2; inject_filter_ref = ipf_get_inject_filter(m); - goto iphack ; +#if IPFIREWALL + if (args.fwa_ipfw_rule) + goto iphack; +#endif /* IPFIREWALL */ + if (args.fwa_pf_rule) + goto check_with_pf; } +#endif /* DUMMYNET */ ipfw_tags_done: -#endif /* IPFIREWALL */ +#endif /* IPFIREWALL || DUMMYNET*/ /* - * No need to proccess packet twice if we've already seen it. + * No need to process packet twice if we've already seen it. 
*/ if (!SLIST_EMPTY(&m->m_pkthdr.tags)) inject_filter_ref = ipf_get_inject_filter(m); @@ -729,7 +779,6 @@ ipfw_tags_done: } OSAddAtomic(1, &ipstat.ips_total); - if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; @@ -781,38 +830,9 @@ ipfw_tags_done: goto bad; } } - if ((IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) == 0) - || (apple_hwcksum_rx == 0) || - ((m->m_pkthdr.csum_flags & CSUM_TCP_SUM16) && ip->ip_p != IPPROTO_TCP)) { - m->m_pkthdr.csum_flags = 0; /* invalidate HW generated checksum flags */ - } - if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { - sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); - } else if (!(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) || - apple_hwcksum_tx == 0) { - /* - * Either this is not loopback packet coming from an interface - * that does not support checksum offloading, or it is loopback - * packet that has undergone software checksumming at the send - * side because apple_hwcksum_tx was set to 0. In this case, - * calculate the checksum in software to validate the packet. - */ - sum = in_cksum(m, hlen); - } else { - /* - * This is a loopback packet without any valid checksum since - * the send side has bypassed it (apple_hwcksum_tx set to 1). - * We get here because apple_hwcksum_rx was set to 0, and so - * we pretend that all is well. - */ - sum = 0; - m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR | - CSUM_IP_CHECKED | CSUM_IP_VALID; - m->m_pkthdr.csum_data = 0xffff; - } + sum = ip_cksum(m, hlen); if (sum) { - OSAddAtomic(1, &ipstat.ips_badsum); goto bad; } @@ -878,19 +898,27 @@ tooshort: m_adj(m, ip->ip_len - m->m_pkthdr.len); } + +#if DUMMYNET +check_with_pf: +#endif #if PF /* Invoke inbound packet filter */ - if (PF_IS_ENABLED) { + if (PF_IS_ENABLED) { int error; - error = pf_af_hook(m->m_pkthdr.rcvif, NULL, &m, AF_INET, TRUE); - if (error != 0) { +#if DUMMYNET + error = pf_af_hook(m->m_pkthdr.rcvif, NULL, &m, AF_INET, TRUE, &args); +#else + error = pf_af_hook(m->m_pkthdr.rcvif, NULL, &m, AF_INET, TRUE, NULL); +#endif /* DUMMYNET */ + if (error != 0 || m == NULL) { if (m != NULL) { panic("%s: unexpected packet %p\n", __func__, m); /* NOTREACHED */ } /* Already freed by callee */ return; - } + } ip = mtod(m, struct ip *); hlen = IP_VHL_HL(ip->ip_vhl) << 2; } @@ -909,28 +937,20 @@ iphack: * Check if we want to allow this packet to be processed. * Consider it to be bad if not. 
*/ - if (fr_checkp) { - struct mbuf *m1 = m; - - if (fr_checkp(ip, hlen, m->m_pkthdr.rcvif, 0, &m1) || !m1) { - return; - } - ip = mtod(m = m1, struct ip *); - } if (fw_enable && IPFW_LOADED) { #if IPFIREWALL_FORWARD /* * If we've been forwarded from the output side, then * skip the firewall a second time */ - if (args.next_hop) + if (args.fwa_next_hop) goto ours; #endif /* IPFIREWALL_FORWARD */ - args.m = m; + args.fwa_m = m; i = ip_fw_chk_ptr(&args); - m = args.m; + m = args.fwa_m; if ( (i & IP_FW_PORT_DENY_FLAG) || m == NULL) { /* drop */ if (m) @@ -939,13 +959,13 @@ iphack: } ip = mtod(m, struct ip *); /* just in case m changed */ - if (i == 0 && args.next_hop == NULL) { /* common case */ + if (i == 0 && args.fwa_next_hop == NULL) { /* common case */ goto pass; } #if DUMMYNET if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG) != 0) { /* Send packet to the appropriate pipe */ - ip_dn_io_ptr(m, i&0xffff, DN_TO_IP_IN, &args); + ip_dn_io_ptr(m, i&0xffff, DN_TO_IP_IN, &args, DN_CLIENT_IPFW); return; } #endif /* DUMMYNET */ @@ -957,7 +977,7 @@ iphack: } #endif #if IPFIREWALL_FORWARD - if (i == 0 && args.next_hop != NULL) { + if (i == 0 && args.fwa_next_hop != NULL) { goto pass; } #endif @@ -978,7 +998,7 @@ pass: */ ip_nhops = 0; /* for source routed packets */ #if IPFIREWALL - if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, args.next_hop)) { + if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, args.fwa_next_hop)) { #else if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, NULL)) { #endif @@ -1009,8 +1029,8 @@ pass: * changed by use of 'ipfw fwd'. */ #if IPFIREWALL - pkt_dst = args.next_hop == NULL ? - ip->ip_dst : args.next_hop->sin_addr; + pkt_dst = args.fwa_next_hop == NULL ? + ip->ip_dst : args.fwa_next_hop->sin_addr; #else pkt_dst = ip->ip_dst; #endif @@ -1032,7 +1052,7 @@ pass: checkif = ip_checkinterface && (ipforwarding == 0) && ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) #if IPFIREWALL - && (args.next_hop == NULL); + && (args.fwa_next_hop == NULL); #else ; #endif @@ -1069,7 +1089,6 @@ pass: if (m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) { struct ifaddr *ifa; struct ifnet *ifp = m->m_pkthdr.rcvif; - ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { IFA_LOCK_SPIN(ifa); @@ -1159,19 +1178,6 @@ pass: ip = mtod(m, struct ip *); /* in case it changed */ } -#if defined(NFAITH) && 0 < NFAITH - /* - * FAITH(Firewall Aided Internet Translator) - */ - if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { - if (ip_keepfaith) { - if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) - goto ours; - } - m_freem(m); - return; - } -#endif /* * Not for us; forward if possible and desirable. */ @@ -1180,7 +1186,7 @@ pass: m_freem(m); } else { #if IPFIREWALL - ip_forward(m, 0, args.next_hop); + ip_forward(m, 0, args.fwa_next_hop); #else ip_forward(m, 0, NULL); #endif @@ -1288,13 +1294,13 @@ found: /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf, and update - * the divert info in div_info and args.divert_rule. + * the divert info in div_info and args.fwa_divert_rule. 
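 */

/*
 * ip_dn_io_ptr() now carries a client identifier as its last argument, so
 * that dummynet, which in this release can be fed by both ipfw and PF, can
 * hand the packet back to the right classifier.  Sketch (DN_CLIENT_IPFW is
 * the only constant visible in this patch; a PF counterpart is assumed):
 */
typedef enum {
	DN_CLIENT_IPFW = 1,	/* packet queued on behalf of ipfw */
	DN_CLIENT_PF		/* assumed: packet queued on behalf of PF */
} dn_client_t;

ip_dn_io_ptr(m, i & 0xffff, DN_TO_IP_IN, &args, DN_CLIENT_IPFW);

/*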
*/ OSAddAtomic(1, &ipstat.ips_fragments); m->m_pkthdr.header = ip; #if IPDIVERT m = ip_reass(m, fp, &ipq[sum], - (u_int16_t *)&div_info, &args.divert_rule); + (u_int16_t *)&div_info, &args.fwa_divert_rule); #else m = ip_reass(m, fp, &ipq[sum]); #endif @@ -1355,7 +1361,7 @@ found: #endif /* Deliver packet to divert input routine */ OSAddAtomic(1, &ipstat.ips_delivered); - divert_packet(m, 1, div_info & 0xffff, args.divert_rule); + divert_packet(m, 1, div_info & 0xffff, args.fwa_divert_rule); /* If 'tee', continue with original packet */ if (clone == NULL) { @@ -1386,7 +1392,7 @@ found: OSAddAtomic(1, &ipstat.ips_delivered); { #if IPFIREWALL - if (args.next_hop && ip->ip_p == IPPROTO_TCP) { + if (args.fwa_next_hop && ip->ip_p == IPPROTO_TCP) { /* TCP needs IPFORWARD info if available */ struct m_tag *fwd_tag; struct ip_fwd_tag *ipfwd_tag; @@ -1399,14 +1405,18 @@ found: } ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); - ipfwd_tag->next_hop = args.next_hop; + ipfwd_tag->next_hop = args.fwa_next_hop; m_tag_prepend(m, fwd_tag); KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); - + if (sw_lro) { + m = tcp_lro(m, hlen); + if (m == NULL) + return; + } /* TCP deals with its own locking */ ip_proto_dispatch_in(m, hlen, ip->ip_p, 0); } else { @@ -1416,6 +1426,11 @@ found: ip_proto_dispatch_in(m, hlen, ip->ip_p, 0); } #else + if ((sw_lro) && (ip->ip_p == IPPROTO_TCP)) { + m = tcp_lro(m, hlen); + if (m == NULL) + return; + } ip_proto_dispatch_in(m, hlen, ip->ip_p, 0); #endif @@ -1819,6 +1834,9 @@ ip_dooptions(struct mbuf *m, __unused int pass, struct sockaddr_in *next_hop) struct sockaddr_in ipaddr = { sizeof (ipaddr), AF_INET , 0 , { 0 }, { 0, } }; + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + dst = ip->ip_dst; cp = (u_char *)(ip + 1); cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); @@ -1928,9 +1946,10 @@ nosourcerouting: if (opt == IPOPT_SSRR) { #define INA struct in_ifaddr * -#define SA struct sockaddr * - if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == 0) { - ia = (INA)ifa_ifwithnet((SA)&ipaddr); + if ((ia = (INA)ifa_ifwithdstaddr( + (struct sockaddr *)&ipaddr)) == 0) { + ia = (INA)ifa_ifwithnet( + (struct sockaddr *)&ipaddr); } } else { ia = ip_rtaddr(ipaddr.sin_addr); @@ -1975,7 +1994,8 @@ nosourcerouting: * locate outgoing interface; if we're the destination, * use the incoming interface (should be same). 
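 */

/*
 * The software-LRO hook added above, isolated (sketch): tcp_lro() either
 * absorbs the segment into a pending flow and returns NULL, in which case
 * delivery happens later when the coalesced frame is flushed, or it hands
 * back a (possibly merged) chain that continues down the normal path.
 */
if (sw_lro && ip->ip_p == IPPROTO_TCP) {
	m = tcp_lro(m, hlen);
	if (m == NULL)
		return;		/* consumed by the LRO engine */
}
ip_proto_dispatch_in(m, hlen, ip->ip_p, 0);

/*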
*/ - if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == 0) { + if ((ia = (INA)ifa_ifwithaddr((struct sockaddr *) + &ipaddr)) == 0) { if ((ia = ip_rtaddr(ipaddr.sin_addr)) == 0) { type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; @@ -1993,7 +2013,7 @@ nosourcerouting: case IPOPT_TS: code = cp - (u_char *)ip; - ipt = (struct ip_timestamp *)cp; + ipt = (struct ip_timestamp *)(void *)cp; if (ipt->ipt_len < 4 || ipt->ipt_len > 40) { code = (u_char *)&ipt->ipt_len - (u_char *)ip; goto bad; @@ -2011,7 +2031,7 @@ nosourcerouting: } break; } - sin = (struct in_addr *)(cp + ipt->ipt_ptr - 1); + sin = (struct in_addr *)(void *)(cp + ipt->ipt_ptr - 1); switch (ipt->ipt_flg) { case IPOPT_TS_TSONLY: @@ -2025,8 +2045,8 @@ nosourcerouting: goto bad; } ipaddr.sin_addr = dst; - ia = (INA)ifaof_ifpforaddr((SA)&ipaddr, - m->m_pkthdr.rcvif); + ia = (INA)ifaof_ifpforaddr((struct sockaddr *) + &ipaddr, m->m_pkthdr.rcvif); if (ia == 0) continue; IFA_LOCK(&ia->ia_ifa); @@ -2047,7 +2067,8 @@ nosourcerouting: } (void)memcpy(&ipaddr.sin_addr, sin, sizeof(struct in_addr)); - if ((ia = (struct in_ifaddr*)ifa_ifwithaddr((SA)&ipaddr)) == 0) + if ((ia = (struct in_ifaddr*)ifa_ifwithaddr( + (struct sockaddr *)&ipaddr)) == 0) continue; IFA_REMREF(&ia->ia_ifa); ia = NULL; @@ -2090,7 +2111,7 @@ ip_rtaddr(struct in_addr dst) struct route ro; bzero(&ro, sizeof (ro)); - sin = (struct sockaddr_in *)&ro.ro_dst; + sin = (struct sockaddr_in *)(void *)&ro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof (*sin); sin->sin_addr = dst; @@ -2173,7 +2194,7 @@ ip_srcroute(void) ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &ip_srcrt.nop, OPTSIZ); - q = (struct in_addr *)(mtod(m, caddr_t) + + q = (struct in_addr *)(void *)(mtod(m, caddr_t) + sizeof(struct in_addr) + OPTSIZ); #undef OPTSIZ /* @@ -2213,6 +2234,9 @@ ip_stripoptions(struct mbuf *m, __unused struct mbuf *mopt) caddr_t opts; int olen; + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); opts = (caddr_t)(ip + 1); i = m->m_len - (sizeof (struct ip) + olen); @@ -2331,7 +2355,7 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) n_long dest; struct in_addr pkt_dst; u_int32_t nextmtu = 0; - struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; + struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, 0 }; struct ifnet *ifp = m->m_pkthdr.rcvif; #if PF struct pf_mtag *pf_mtag; @@ -2374,13 +2398,15 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) #if PF pf_mtag = pf_find_mtag(m); - if (pf_mtag != NULL && pf_mtag->rtableid != IFSCOPE_NONE) - ipoa.ipoa_boundif = pf_mtag->rtableid; + if (pf_mtag != NULL && pf_mtag->pftag_rtableid != IFSCOPE_NONE) { + ipoa.ipoa_boundif = pf_mtag->pftag_rtableid; + ipoa.ipoa_flags |= IPOAF_BOUND_IF; + } #endif /* PF */ ip_fwd_route_copyout(ifp, &fwd_rt); - sin = (struct sockaddr_in *)&fwd_rt.ro_dst; + sin = (struct sockaddr_in *)(void *)&fwd_rt.ro_dst; if (fwd_rt.ro_rt == NULL || fwd_rt.ro_rt->generation_id != route_generation || pkt_dst.s_addr != sin->sin_addr.s_addr) { @@ -2715,7 +2741,7 @@ ip_savecontrol( goto makedummy; IFA_LOCK_SPIN(ifa); - sdp = (struct sockaddr_dl *)ifa->ifa_addr; + sdp = (struct sockaddr_dl *)(void *)ifa->ifa_addr; /* * Change our mind and don't try copy. 
*/ @@ -2749,8 +2775,8 @@ makedummy: } } if ((inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) != 0) { - int tc = m->m_pkthdr.prio; - + int tc = m_get_traffic_class(m); + mp = sbcreatecontrol_mbuf((caddr_t) &tc, sizeof(tc), SO_TRAFFIC_CLASS, SOL_SOCKET, mp); if (*mp == NULL) { @@ -2814,3 +2840,51 @@ ip_rsvp_done(void) } return 0; } + +static inline u_short +ip_cksum(struct mbuf *m, int hlen) +{ + + u_short sum; + struct ip *ip; + + ip = mtod(m, struct ip *); + + if ((IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) == 0) + || (apple_hwcksum_rx == 0) || + ((m->m_pkthdr.csum_flags & CSUM_TCP_SUM16) && ip->ip_p != IPPROTO_TCP)) { + m->m_pkthdr.csum_flags = 0; /* invalidate HW generated checksum flags */ + + } + + if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { + sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); + } else if (!(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) || + apple_hwcksum_tx == 0) { + /* + * Either this is not loopback packet coming from an interface + * that does not support checksum offloading, or it is loopback + * packet that has undergone software checksumming at the send + * side because apple_hwcksum_tx was set to 0. In this case, + * calculate the checksum in software to validate the packet. + */ + sum = in_cksum(m, hlen); + } else { + /* + * This is a loopback packet without any valid checksum since + * the send side has bypassed it (apple_hwcksum_tx set to 1). + * We get here because apple_hwcksum_rx was set to 0, and so + * we pretend that all is well. + */ + sum = 0; + m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR | + CSUM_IP_CHECKED | CSUM_IP_VALID; + m->m_pkthdr.csum_data = 0xffff; + } + + if (sum) { + OSAddAtomic(1, &ipstat.ips_badsum); + } + + return sum; +} diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index c4530e994..aece80368 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -104,8 +104,6 @@ #include #endif -#include "faith.h" - #include #include #include @@ -146,7 +144,6 @@ (ntohl(a.s_addr))&0xFF); #endif - u_short ip_id; static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); @@ -162,8 +159,6 @@ int ip_optcopy(struct ip *, struct ip *); void in_delayed_cksum_offset(struct mbuf *, int ); void in_cksum_offset(struct mbuf* , size_t ); -extern int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **); - extern struct protosw inetsw[]; extern struct ip_linklocal_stat ip_linklocal_stat; @@ -252,7 +247,6 @@ ip_output( * ipsec4_getpolicybysock:??? [IPSEC 4th argument, contents modified] * key_spdacquire:??? [IPSEC] * ipsec4_output:??? [IPSEC] - * :??? [firewall] * ip_dn_io_ptr:??? [dummynet] * dlil_output:??? [DLIL] * dlil_output_list:??? 
[DLIL] @@ -269,12 +263,11 @@ ip_output_list( struct route *ro, int flags, struct ip_moptions *imo, - struct ip_out_args *ipoa - ) + struct ip_out_args *ipoa) { struct ip *ip; struct ifnet *ifp = NULL; - struct mbuf *m = m0, **mppn = NULL; + struct mbuf *m = m0, *prevnxt = NULL, **mppn = &prevnxt; int hlen = sizeof (struct ip); int len = 0, error = 0; struct sockaddr_in *dst = NULL; @@ -293,9 +286,11 @@ ip_output_list( #endif #if IPFIREWALL int off; + struct sockaddr_in *next_hop_from_ipfwd_tag = NULL; +#endif +#if IPFIREWALL || DUMMYNET struct ip_fw_args args; struct m_tag *tag; - struct sockaddr_in *next_hop_from_ipfwd_tag = NULL; #endif int didfilter = 0; ipfilter_t inject_filter_ref = 0; @@ -307,9 +302,11 @@ ip_output_list( struct mbuf * packetlist; int pktcnt = 0, tso = 0; u_int32_t bytecnt = 0; - unsigned int ifscope; - unsigned int nocell; - boolean_t select_srcif; + unsigned int ifscope = IFSCOPE_NONE; + unsigned int nocell = 0; + boolean_t select_srcif, srcbound; + struct flowadv *adv = NULL; + KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); #if IPSEC @@ -317,12 +314,8 @@ ip_output_list( #endif /* IPSEC */ packetlist = m0; -#if IPFIREWALL - args.next_hop = NULL; - args.eh = NULL; - args.rule = NULL; - args.divert_rule = 0; /* divert cookie */ - args.ipoa = NULL; +#if IPFIREWALL || DUMMYNET + bzero(&args, sizeof(struct ip_fw_args)); if (SLIST_EMPTY(&m0->m_pkthdr.tags)) goto ipfw_tags_done; @@ -334,18 +327,21 @@ ip_output_list( struct dn_pkt_tag *dn_tag; dn_tag = (struct dn_pkt_tag *)(tag+1); - args.rule = dn_tag->rule; + args.fwa_ipfw_rule = dn_tag->dn_ipfw_rule; + args.fwa_pf_rule = dn_tag->dn_pf_rule; opt = NULL; - saved_route = dn_tag->ro; + saved_route = dn_tag->dn_ro; ro = &saved_route; imo = NULL; bcopy(&dn_tag->dn_dst, &dst_buf, sizeof(dst_buf)); dst = &dst_buf; - ifp = dn_tag->ifp; - flags = dn_tag->flags; - saved_ipoa = dn_tag->ipoa; - ipoa = &saved_ipoa; + ifp = dn_tag->dn_ifp; + flags = dn_tag->dn_flags; + if ((dn_tag->dn_flags & IP_OUTARGS)) { + saved_ipoa = dn_tag->dn_ipoa; + ipoa = &saved_ipoa; + } m_tag_delete(m0, tag); } @@ -357,24 +353,27 @@ ip_output_list( struct divert_tag *div_tag; div_tag = (struct divert_tag *)(tag+1); - args.divert_rule = div_tag->cookie; + args.fwa_divert_rule = div_tag->cookie; m_tag_delete(m0, tag); } #endif /* IPDIVERT */ +#if IPFIREWALL if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { struct ip_fwd_tag *ipfwd_tag; ipfwd_tag = (struct ip_fwd_tag *)(tag+1); next_hop_from_ipfwd_tag = ipfwd_tag->next_hop; - + m_tag_delete(m0, tag); } -ipfw_tags_done: #endif /* IPFIREWALL */ +ipfw_tags_done: +#endif /* IPFIREWALL || DUMMYNET */ + m = m0; #if DIAGNOSTIC @@ -388,34 +387,47 @@ ipfw_tags_done: bzero(&ipf_pktopts, sizeof(struct ipf_pktopts)); ippo = &ipf_pktopts; - /* - * At present the IP_OUTARGS flag implies a request for IP to - * perform source interface selection. In the forwarding case, - * only the ifscope value is used, as source interface selection - * doesn't take place. - */ if (ip_doscopedroute && (flags & IP_OUTARGS)) { - select_srcif = !(flags & IP_FORWARDING); - ifscope = ipoa->ipoa_boundif; - ipf_pktopts.ippo_flags = IPPOF_BOUND_IF; - ipf_pktopts.ippo_flags |= (ifscope << IPPOF_SHIFT_IFSCOPE); + /* + * In the forwarding case, only the ifscope value is used, + * as source interface selection doesn't take place. 
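 */

/*
 * Sketch of the transmit side reading back the flow advisory that
 * ip_output_list() now seeds with FADV_SUCCESS.  FADV_FLOW_CONTROLLED is
 * an assumed companion code; only FADV_SUCCESS appears in this hunk:
 */
struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF };

error = ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
if (error == 0 && ipoa.ipoa_flowadv.code != FADV_SUCCESS) {
	/* interface queue is pushing back; pause this flow */
}

/*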
+ */ + if ((select_srcif = (!(flags & IP_FORWARDING) && + (ipoa->ipoa_flags & IPOAF_SELECT_SRCIF)))) { + ipf_pktopts.ippo_flags |= IPPOF_SELECT_SRCIF; + } + + if ((ipoa->ipoa_flags & IPOAF_BOUND_IF) && + ipoa->ipoa_boundif != IFSCOPE_NONE) { + ifscope = ipoa->ipoa_boundif; + ipf_pktopts.ippo_flags |= + (IPPOF_BOUND_IF | (ifscope << IPPOF_SHIFT_IFSCOPE)); + } + + if ((srcbound = (ipoa->ipoa_flags & IPOAF_BOUND_SRCADDR))) + ipf_pktopts.ippo_flags |= IPPOF_BOUND_SRCADDR; } else { select_srcif = FALSE; + srcbound = FALSE; ifscope = IFSCOPE_NONE; } + if ((flags & IP_OUTARGS) && (ipoa->ipoa_flags & IPOAF_NO_CELLULAR)) { + nocell = 1; + ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR; + } + if (flags & IP_OUTARGS) { - nocell = ipoa->ipoa_nocell; - if (nocell) - ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR; - } else { - nocell = 0; + adv = &ipoa->ipoa_flowadv; + adv->code = FADV_SUCCESS; } -#if IPFIREWALL - if (args.rule != NULL) { /* dummynet already saw us */ +#if DUMMYNET + if (args.fwa_ipfw_rule != NULL || args.fwa_pf_rule != NULL) { + /* dummynet already saw us */ ip = mtod(m, struct ip *); - hlen = IP_VHL_HL(ip->ip_vhl) << 2 ; + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + pkt_dst = ip->ip_dst; if (ro->ro_rt != NULL) { RT_LOCK_SPIN(ro->ro_rt); ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa; @@ -431,10 +443,15 @@ ipfw_tags_done: so = ipsec_getsocket(m); (void)ipsec_setsocket(m, NULL); } -#endif - goto sendit; +#endif /* IPSEC */ +#if IPFIREWALL + if (args.fwa_ipfw_rule != NULL) + goto skip_ipsec; +#endif /* #if IPFIREWALL */ + if (args.fwa_pf_rule != NULL) + goto sendit; } -#endif /* IPFIREWALL */ +#endif /* DUMMYNET */ #if IPSEC if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { @@ -455,6 +472,12 @@ loopit: if (opt) { m = ip_insertoptions(m, opt, &len); hlen = len; + /* Update the chain */ + if (m != m0) { + if (m0 == packetlist) + packetlist = m; + m0 = m; + } } ip = mtod(m, struct ip *); #if IPFIREWALL @@ -466,8 +489,8 @@ loopit: * packet of the chain. This could cause the route to be inadvertently changed * to the route to the gateway address (instead of the route to the destination). */ - args.next_hop = next_hop_from_ipfwd_tag; - pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst; + args.fwa_next_hop = next_hop_from_ipfwd_tag; + pkt_dst = args.fwa_next_hop ? args.fwa_next_hop->sin_addr : ip->ip_dst; #else pkt_dst = ip->ip_dst; #endif @@ -496,7 +519,7 @@ loopit: } else { hlen = IP_VHL_HL(ip->ip_vhl) << 2; } - + #if DEBUG /* For debugging, we let the stack forge congestion */ if (forge_ce != 0 && @@ -509,8 +532,8 @@ loopit: KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); - - dst = (struct sockaddr_in *)&ro->ro_dst; + + dst = (struct sockaddr_in *)(void *)&ro->ro_dst; /* * If there is a cached route, @@ -562,8 +585,6 @@ loopit: * If routing to interface only, * short circuit routing lookup. */ -#define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) -#define sintosa(sin) ((struct sockaddr *)(sin)) if (flags & IP_ROUTETOIF) { if (ia) IFA_REMREF(&ia->ia_ifa); @@ -621,14 +642,15 @@ loopit: } /* - * If the source address is spoofed (in the case - * of IP_RAWOUTPUT), or if this is destined for - * local/loopback, just let it go out using the - * interface of the route. Otherwise, there's no - * interface having such an address, so bail out. + * If the source address is spoofed (in the case of + * IP_RAWOUTPUT on an unbounded socket), or if this + * is destined for local/loopback, just let it go out + * using the interface of the route. 
Otherwise, + * there's no interface having such an address, + * so bail out. */ - if (ifa == NULL && !(flags & IP_RAWOUTPUT) && - ifscope != lo_ifp->if_index) { + if (ifa == NULL && (!(flags & IP_RAWOUTPUT) || + srcbound) && ifscope != lo_ifp->if_index) { error = EADDRNOTAVAIL; goto bad; } @@ -737,8 +759,10 @@ loopit: } ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_use++; - if (ro->ro_rt->rt_flags & RTF_GATEWAY) - dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; + if (ro->ro_rt->rt_flags & RTF_GATEWAY) { + dst = (struct sockaddr_in *)(void *) + ro->ro_rt->rt_gateway; + } if (ro->ro_rt->rt_flags & RTF_HOST) { isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); } else { @@ -761,7 +785,7 @@ loopit: * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ - dst = (struct sockaddr_in *)&ro->ro_dst; + dst = (struct sockaddr_in *)(void *)&ro->ro_dst; /* * See if the caller provided any multicast options */ @@ -906,6 +930,7 @@ loopit: m_freem(m); if (inm != NULL) INM_REMREF(inm); + OSAddAtomic(1, &ipstat.ips_cantforward); goto done; } } @@ -928,7 +953,6 @@ loopit: goto sendit; } -#ifndef notdef /* * If source address not specified yet, use address * of outgoing interface. @@ -946,7 +970,6 @@ loopit: fwd_rewrite_src++; #endif /* IPFIREWALL_FORWARD */ } -#endif /* notdef */ /* * Look for broadcast address and @@ -975,14 +998,31 @@ loopit: sendit: #if PF /* Invoke outbound packet filter */ - if ( PF_IS_ENABLED) { + if (PF_IS_ENABLED) { int rc; - rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE); - if (rc != 0) { - if (packetlist == m0) { + + m0 = m; /* Save for later */ +#if DUMMYNET + args.fwa_m = m; + args.fwa_next_hop = dst; + args.fwa_oif = ifp; + args.fwa_ro = ro; + args.fwa_dst = dst; + args.fwa_oflags = flags; + if (flags & IP_OUTARGS) + args.fwa_ipoa = ipoa; + rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, &args); +#else /* DUMMYNET */ + rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, NULL); +#endif /* DUMMYNET */ + if (rc != 0 || m == NULL) { + /* Move to the next packet */ + m = *mppn; + + /* Skip ahead if first packet in list got dropped */ + if (packetlist == m0) packetlist = m; - mppn = NULL; - } + if (m != NULL) { m0 = m; /* Next packet in the chain */ @@ -1007,7 +1047,7 @@ sendit: ip_linklocal_stat.iplls_out_total++; if (ip->ip_ttl != MAXTTL) { ip_linklocal_stat.iplls_out_badttl++; - ip->ip_ttl = MAXTTL; + ip->ip_ttl = MAXTTL; } } @@ -1026,7 +1066,7 @@ sendit: } ipf_ref(); - + /* 4135317 - always pass network byte order to filter */ #if BYTE_ORDER != BIG_ENDIAN @@ -1051,7 +1091,7 @@ sendit: } } } - + /* set back to host byte order */ ip = mtod(m, struct ip *); @@ -1079,7 +1119,7 @@ sendit: sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error); if (sp == NULL) { - IPSEC_STAT_INCREMENT(ipsecstat.out_inval); + IPSEC_STAT_INCREMENT(ipsecstat.out_inval); KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); goto bad; } @@ -1102,7 +1142,7 @@ sendit: /* no need to do IPsec. 
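 */

/*
 * The mppn bookkeeping above is the classic pointer-to-pointer splice:
 * mppn always addresses the link that leads to the current packet, so a
 * packet dropped by the filter is unlinked without rescanning the chain.
 * Generic form of the technique (filter_drops() is a stand-in for the
 * pf_af_hook() verdict):
 */
struct mbuf *pkt, **mppn = &packetlist;

while ((pkt = *mppn) != NULL) {
	if (filter_drops(pkt)) {
		*mppn = pkt->m_nextpkt;		/* splice out */
		pkt->m_nextpkt = NULL;
		m_freem(pkt);
	} else {
		mppn = &pkt->m_nextpkt;		/* advance the link */
	}
}

/*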
*/ KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 2,0,0,0,0); goto skip_ipsec; - + case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* acquire a policy */ @@ -1146,9 +1186,9 @@ sendit: struct ip *, ip, struct ip6_hdr *, NULL); error = ipsec4_output(&ipsec_state, sp, flags); - + m0 = m = ipsec_state.m; - + if (flags & IP_ROUTETOIF) { /* * if we have tunnel mode SA, we may need to ignore @@ -1163,7 +1203,7 @@ sendit: ipsec_saved_route = ro; ro = &ipsec_state.ro; } - dst = (struct sockaddr_in *)ipsec_state.dst; + dst = (struct sockaddr_in *)(void *)ipsec_state.dst; if (error) { /* mbuf is already reclaimed in ipsec4_output. */ m0 = NULL; @@ -1189,7 +1229,7 @@ sendit: /* be sure to update variables that are affected by ipsec4_output() */ ip = mtod(m, struct ip *); - + #ifdef _IP_VHL hlen = IP_VHL_HL(ip->ip_vhl) << 2; #else @@ -1239,13 +1279,13 @@ sendit: NTOHS(ip->ip_len); NTOHS(ip->ip_off); #endif - + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 7,0xff,0xff,0xff,0xff); - + /* Pass to filters again */ if (!TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; - + ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; /* Check that a TSO frame isn't passed to a filter. @@ -1258,7 +1298,7 @@ sendit: } ipf_ref(); - + /* 4135317 - always pass network byte order to filter */ #if BYTE_ORDER != BIG_ENDIAN @@ -1280,7 +1320,7 @@ sendit: } } } - + /* set back to host byte order */ ip = mtod(m, struct ip *); @@ -1295,35 +1335,19 @@ skip_ipsec: #endif /*IPSEC*/ #if IPFIREWALL - /* - * IpHack's section. - * - Xlate: translate packet's addr/port (NAT). - * - Firewall: deny/allow/etc. - * - Wrap: fake packet's addr/port - * - Encapsulate: put it in another IP and send out. - */ - if (fr_checkp) { - struct mbuf *m1 = m; - - if ((error = (*fr_checkp)(ip, hlen, ifp, 1, &m1)) || !m1) { - goto done; - } - ip = mtod(m0 = m = m1, struct ip *); - } - /* * Check with the firewall... * but not if we are already being fwd'd from a firewall. */ - if (fw_enable && IPFW_LOADED && !args.next_hop) { + if (fw_enable && IPFW_LOADED && !args.fwa_next_hop) { struct sockaddr_in *old = dst; - args.m = m; - args.next_hop = dst; - args.oif = ifp; + args.fwa_m = m; + args.fwa_next_hop = dst; + args.fwa_oif = ifp; off = ip_fw_chk_ptr(&args); - m = args.m; - dst = args.next_hop; + m = args.fwa_m; + dst = args.fwa_next_hop; /* * On return we must do the following: @@ -1347,12 +1371,12 @@ skip_ipsec: goto done ; } ip = mtod(m, struct ip *); - + if (off == 0 && dst == old) {/* common case */ goto pass ; } #if DUMMYNET - if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) { + if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) { /* * pass the pkt to dummynet. Need to include * pipe number, m, ifp, ro, dst because these are @@ -1362,14 +1386,14 @@ skip_ipsec: * XXX note: if the ifp or ro entry are deleted * while a pkt is in dummynet, we are in trouble! 
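 */

/*
 * State stashed while a packet sits inside dummynet, reconstructed from
 * the fwa_-to-dn_ field pairs visible in this patch (the authoritative
 * layout is struct dn_pkt_tag in ip_dummynet.h; member types here are
 * approximations):
 */
struct dn_pkt_tag_sketch {
	struct ip_fw		*dn_ipfw_rule;	/* matching ipfw rule */
	void			*dn_pf_rule;	/* matching PF rule */
	struct route		dn_ro;		/* saved route */
	struct sockaddr_in	dn_dst;		/* final destination */
	struct ifnet		*dn_ifp;	/* output interface */
	int			dn_flags;	/* ip_output() flags */
	struct ip_out_args	dn_ipoa;	/* valid if IP_OUTARGS set */
};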
*/ - args.ro = ro; - args.dst = dst; - args.flags = flags; + args.fwa_ro = ro; + args.fwa_dst = dst; + args.fwa_oflags = flags; if (flags & IP_OUTARGS) - args.ipoa = ipoa; + args.fwa_ipoa = ipoa; error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT, - &args); + &args, DN_CLIENT_IPFW); goto done; } #endif /* DUMMYNET */ @@ -1398,7 +1422,7 @@ skip_ipsec: #endif /* Deliver packet to divert input routine */ - divert_packet(m, 0, off & 0xffff, args.divert_rule); + divert_packet(m, 0, off & 0xffff, args.fwa_divert_rule); /* If 'tee', continue with original packet */ if (clone != NULL) { @@ -1474,7 +1498,7 @@ skip_ipsec: } ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); - ipfwd_tag->next_hop = args.next_hop; + ipfwd_tag->next_hop = args.fwa_next_hop; m_tag_prepend(m, fwd_tag); @@ -1500,13 +1524,14 @@ skip_ipsec: #if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); -#endif - +#endif + /* we need to call dlil_output to run filters * and resync to avoid recursion loops. */ if (lo_ifp) { - dlil_output(lo_ifp, PF_INET, m, 0, (struct sockaddr *)dst, 0); + dlil_output(lo_ifp, PF_INET, m, 0, + (struct sockaddr *)dst, 0, adv); } else { printf("ip_output: no loopback ifp for forwarding!!!\n"); @@ -1540,7 +1565,7 @@ skip_ipsec: ifp = ro_fwd->ro_rt->rt_ifp; ro_fwd->ro_rt->rt_use++; if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) - dst = (struct sockaddr_in *)ro_fwd->ro_rt->rt_gateway; + dst = (struct sockaddr_in *)(void *)ro_fwd->ro_rt->rt_gateway; if (ro_fwd->ro_rt->rt_flags & RTF_HOST) { isbroadcast = (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST); @@ -1552,7 +1577,7 @@ skip_ipsec: RT_UNLOCK(ro_fwd->ro_rt); rtfree(ro->ro_rt); ro->ro_rt = ro_fwd->ro_rt; - dst = (struct sockaddr_in *)&ro_fwd->ro_dst; + dst = (struct sockaddr_in *)(void *)&ro_fwd->ro_dst; /* * If we added a default src ip earlier, @@ -1601,7 +1626,7 @@ pass: #endif m->m_pkthdr.csum_flags |= CSUM_IP; tso = (ifp->if_hwassist & IFNET_TSO_IPV4) && (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4); - + sw_csum = m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); @@ -1616,12 +1641,11 @@ pass: /* Apple GMAC HW, expects STUFF_OFFSET << 16 | START_OFFSET */ u_short offset = (IP_VHL_HL(ip->ip_vhl) << 2) +14 ; /* IP+Enet header length */ u_short csumprev= m->m_pkthdr.csum_data & 0xFFFF; - m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_TCP_SUM16; /* for GMAC */ + m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_TCP_SUM16; /* for GMAC */ m->m_pkthdr.csum_data = (csumprev + offset) << 16 ; m->m_pkthdr.csum_data += offset; - sw_csum = CSUM_DELAY_IP; /* do IP hdr chksum in software */ - } - else { + sw_csum = CSUM_DELAY_IP; /* do IP hdr chksum in software */ + } else { /* let the software handle any UDP or TCP checksums */ sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags); } @@ -1629,7 +1653,7 @@ pass: sw_csum |= (CSUM_DELAY_DATA | CSUM_DELAY_IP) & m->m_pkthdr.csum_flags; } - + if (sw_csum & CSUM_DELAY_DATA) { in_delayed_cksum(m); sw_csum &= ~CSUM_DELAY_DATA; @@ -1649,20 +1673,20 @@ pass: */ if ((u_short)ip->ip_len <= ifp->if_mtu || tso || ifp->if_hwassist & CSUM_FRAGMENT) { - if (tso) + if (tso) m->m_pkthdr.csum_flags |= CSUM_TSO_IPV4; - + #if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); #endif - + ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { ip->ip_sum = in_cksum(m, hlen); } - + #ifndef __APPLE__ /* Record statistics for this interface address. 
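 */

/*
 * The checksum split used above, in isolation (sketch): whatever CSUM_*
 * work the interface cannot offload stays behind in sw_csum and is done
 * in software before the packet is handed to the driver.
 */
sw_csum = m->m_pkthdr.csum_flags &
    ~IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist);

if (sw_csum & CSUM_DELAY_DATA) {	/* TCP/UDP checksum in software */
	in_delayed_cksum(m);
	sw_csum &= ~CSUM_DELAY_DATA;
}
ip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)		/* IP header checksum in software */
	ip->ip_sum = in_cksum(m, hlen);

/*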
*/ if (!(flags & IP_FORWARDING) && ia != NULL) { @@ -1679,8 +1703,8 @@ pass: if (packetchain == 0) { if (ro->ro_rt && nstat_collect) nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0); - error = ifnet_output(ifp, PF_INET, m, ro->ro_rt, - (struct sockaddr *)dst); + error = dlil_output(ifp, PF_INET, m, ro->ro_rt, + (struct sockaddr *)dst, 0, adv); goto done; } else { /* packet chaining allows us to reuse the route for all packets */ @@ -1696,12 +1720,12 @@ sendchain: if (ro->ro_rt && nstat_collect) nstat_route_tx(ro->ro_rt, pktcnt, bytecnt, 0); //send - error = ifnet_output(ifp, PF_INET, packetlist, - ro->ro_rt, (struct sockaddr *)dst); + error = dlil_output(ifp, PF_INET, packetlist, + ro->ro_rt, (struct sockaddr *)dst, 0, adv); pktcnt = 0; bytecnt = 0; goto done; - + } m0 = m; pktcnt++; @@ -1768,8 +1792,8 @@ sendchain: panic("ip_output: mix of packet in packetlist is wrong=%p", packetlist); if (ro->ro_rt && nstat_collect) nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0); - error = ifnet_output(ifp, PF_INET, m, ro->ro_rt, - (struct sockaddr *)dst); + error = dlil_output(ifp, PF_INET, m, ro->ro_rt, + (struct sockaddr *)dst, 0, adv); } else m_freem(m); } @@ -1873,6 +1897,10 @@ ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum) m->m_pkthdr.rcvif = 0; m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; m->m_pkthdr.socket_id = m0->m_pkthdr.socket_id; + + M_COPY_PFTAG(m, m0); + m_set_service_class(m, m0->m_pkthdr.svc); + #if CONFIG_MACF_NET mac_netinet_fragment(m0, m); #endif @@ -1909,7 +1937,7 @@ ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum) #if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_off); #endif - + ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { ip->ip_sum = in_cksum(m, hlen); @@ -1952,30 +1980,35 @@ in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) ip_offset -= m->m_len; m = m->m_next; if (m == NULL) { - printf("in_delayed_cksum_withoffset failed - ip_offset wasn't in the packet\n"); + printf("in_delayed_cksum_withoffset failed - " + "ip_offset wasn't in the packet\n"); return; } } - - /* Sometimes the IP header is not contiguous, yes this can happen! */ - if (ip_offset + sizeof(struct ip) > m->m_len) { -#if DEBUG + + /* + * In case the IP header is not contiguous, or not 32-bit + * aligned, copy it to a local buffer. 
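 */

/*
 * The store-back rule used throughout these checksum routines, distilled
 * (sketch): a direct 16-bit store is only safe when the target address is
 * suitably aligned; otherwise bcopy() does the byte-wise equivalent that
 * strict-alignment CPUs require.
 */
if (IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset))
	*(u_short *)(void *)(m->m_data + offset + ip_offset) = csum;
else
	bcopy(&csum, m->m_data + offset + ip_offset, sizeof (csum));

/*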
+ */ + if ((ip_offset + sizeof(struct ip) > m->m_len) || + !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) { +#if DEBUG printf("delayed m_pullup, m->len: %d off: %d\n", m->m_len, ip_offset); #endif m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf); - - ip = (struct ip *)buf; + + ip = (struct ip *)(void *)buf; } else { - ip = (struct ip*)(m->m_data + ip_offset); + ip = (struct ip*)(void *)(m->m_data + ip_offset); } - + /* Gross */ if (ip_offset) { m->m_len -= ip_offset; m->m_data += ip_offset; } - + offset = IP_VHL_HL(ip->ip_vhl) << 2 ; /* @@ -2021,15 +2054,18 @@ in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) /* Insert the checksum in the existing chain */ if (offset + ip_offset + sizeof(u_short) > m->m_len) { char tmp[2]; - + #if DEBUG printf("delayed m_copyback, m->len: %d off: %d p: %d\n", m->m_len, offset + ip_offset, ip->ip_p); #endif - *(u_short *)tmp = csum; + *(u_short *)(void *)tmp = csum; m_copyback(m, offset + ip_offset, 2, tmp); - } else - *(u_short *)(m->m_data + offset + ip_offset) = csum; + } else if (IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) { + *(u_short *)(void *)(m->m_data + offset + ip_offset) = csum; + } else { + bcopy(&csum, (m->m_data + offset + ip_offset), sizeof (csum)); + } } void @@ -2049,33 +2085,38 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) /* Save copy of first mbuf pointer and the ip_offset before modifying */ struct mbuf* m0 = m; size_t ip_offset_copy = ip_offset; - + while (ip_offset >= m->m_len) { ip_offset -= m->m_len; m = m->m_next; if (m == NULL) { - printf("in_cksum_offset failed - ip_offset wasn't in the packet\n"); + printf("in_cksum_offset failed - ip_offset wasn't " + "in the packet\n"); return; } } - - /* Sometimes the IP header is not contiguous, yes this can happen! */ - if (ip_offset + sizeof(struct ip) > m->m_len) { + /* + * In case the IP header is not contiguous, or not 32-bit + * aligned, copy it to a local buffer. + */ + if ((ip_offset + sizeof(struct ip) > m->m_len) || + !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) { #if DEBUG - printf("in_cksum_offset - delayed m_pullup, m->len: %d off: %lu\n", - m->m_len, ip_offset); -#endif + printf("in_cksum_offset - delayed m_pullup, m->len: %d " + "off: %lu\n", m->m_len, ip_offset); +#endif m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf); - ip = (struct ip *)buf; + ip = (struct ip *)(void *)buf; ip->ip_sum = 0; - m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, (caddr_t)&ip->ip_sum); + m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, + (caddr_t)&ip->ip_sum); } else { - ip = (struct ip*)(m->m_data + ip_offset); + ip = (struct ip*)(void *)(m->m_data + ip_offset); ip->ip_sum = 0; } - + /* Gross */ if (ip_offset) { m->m_len -= ip_offset; @@ -2122,16 +2163,25 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) m->m_data -= ip_offset; } - /* Insert the checksum in the existing chain if IP header not contiguous */ + /* + * Insert the checksum in the existing chain if IP header not + * contiguous, or if it's not 32-bit aligned, i.e. all the cases + * where it was copied to a local buffer. 
+ */ if (ip_offset + sizeof(struct ip) > m->m_len) { char tmp[2]; #if DEBUG - printf("in_cksum_offset m_copyback, m->len: %u off: %lu p: %d\n", - m->m_len, ip_offset + offsetof(struct ip, ip_sum), ip->ip_p); + printf("in_cksum_offset m_copyback, m->len: %u off: %lu " + "p: %d\n", m->m_len, + ip_offset + offsetof(struct ip, ip_sum), ip->ip_p); #endif - *(u_short *)tmp = ip->ip_sum; + *(u_short *)(void *)tmp = ip->ip_sum; m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, tmp); + } else if (!IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) { + bcopy(&ip->ip_sum, + (m->m_data + ip_offset + offsetof(struct ip, ip_sum)), + sizeof (u_short)); } } @@ -2286,9 +2336,6 @@ ip_ctloutput(so, sopt) case IP_RECVDSTADDR: case IP_RECVIF: case IP_RECVTTL: -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: -#endif case IP_RECVPKTINFO: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); @@ -2329,11 +2376,6 @@ ip_ctloutput(so, sopt) OPTSET(INP_RECVTTL); break; -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: - OPTSET(INP_FAITH); - break; -#endif case IP_RECVPKTINFO: OPTSET(INP_PKTINFO); break; @@ -2400,7 +2442,7 @@ ip_ctloutput(so, sopt) */ ifnet_release(ifp); } - inp_bindif(inp, ifscope); + error = inp_bindif(inp, ifscope); } break; #endif @@ -2533,7 +2575,7 @@ ip_ctloutput(so, sopt) if (error) break; - inp_bindif(inp, optval); + error = inp_bindif(inp, optval); break; case IP_NO_IFT_CELLULAR: @@ -2584,9 +2626,6 @@ ip_ctloutput(so, sopt) case IP_RECVIF: case IP_RECVTTL: case IP_PORTRANGE: -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: -#endif case IP_RECVPKTINFO: switch (sopt->sopt_name) { @@ -2629,11 +2668,6 @@ ip_ctloutput(so, sopt) optval = 0; break; -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: - optval = OPTBIT(INP_FAITH); - break; -#endif case IP_RECVPKTINFO: optval = OPTBIT(INP_PKTINFO); break; @@ -2681,7 +2715,7 @@ ip_ctloutput(so, sopt) case IP_BOUND_IF: if (inp->inp_flags & INP_BOUND_IF) - optval = inp->inp_boundif; + optval = inp->inp_boundifp->if_index; error = sooptcopyout(sopt, &optval, sizeof (optval)); break; @@ -2691,7 +2725,8 @@ ip_ctloutput(so, sopt) break; case IP_OUT_IF: - optval = inp->inp_last_outif; + optval = (inp->inp_last_outifp != NULL) ? + inp->inp_last_outifp->if_index : 0; error = sooptcopyout(sopt, &optval, sizeof (optval)); break; @@ -3045,7 +3080,7 @@ ip_mloopback(ifp, m, dst, hlen) if (lo_ifp) { copym->m_pkthdr.rcvif = ifp; dlil_output(lo_ifp, PF_INET, copym, 0, - (struct sockaddr *) dst, 0); + (struct sockaddr *) dst, 0, NULL); } else { printf("Warning: ip_output call to dlil_find_dltag failed!\n"); m_freem(copym); diff --git a/bsd/netinet/ip_var.h b/bsd/netinet/ip_var.h index 971a88126..763c1e919 100644 --- a/bsd/netinet/ip_var.h +++ b/bsd/netinet/ip_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -236,17 +236,44 @@ struct ip_linklocal_stat { #define IP_ALLOWBROADCAST SO_BROADCAST /* can send broadcast packets (0x0020) */ #define IP_OUTARGS 0x100 /* has ancillary output info */ +#ifdef XNU_KERNEL_PRIVATE +#define IP_HDR_ALIGNED_P(_ip) ((((uintptr_t)(_ip)) & ((uintptr_t)3)) == 0) + +/* + * On platforms which require strict alignment (currently for anything but + * i386 or x86_64), this macro checks whether the pointer to the IP header + * is 32-bit aligned, and panics otherwise. 
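 */

/*
 * IP_HDR_ALIGNED_P() above is a pure low-two-bits test: address 0x1000
 * passes while 0x1002 fails (illustrative values).  The classic offender
 * is a 14-byte Ethernet header; if the frame starts 4-byte aligned, the IP
 * header behind it lands on a 2-byte boundary.  Sketch (some_rx_buffer is
 * a stand-in for a receive buffer assumed to be 4-byte aligned):
 */
char *frame = some_rx_buffer;
struct ip *iph = (struct ip *)(void *)(frame + ETHER_HDR_LEN);

if (!IP_HDR_ALIGNED_P(iph)) {
	/* realign before touching 32-bit fields, cf. m_defrag_offset() */
}

/*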
+ */ +#if defined(__i386__) || defined(__x86_64__) +#define IP_HDR_STRICT_ALIGNMENT_CHECK(_ip) do { } while (0) +#else /* !__i386__ && !__x86_64__ */ +#define IP_HDR_STRICT_ALIGNMENT_CHECK(_ip) do { \ + if (!IP_HDR_ALIGNED_P(_ip)) { \ + panic_plain("\n%s: Unaligned IP header %p\n", \ + __func__, _ip); \ + } \ +} while (0) +#endif /* !__i386__ && !__x86_64__ */ +#endif /* XNU_KERNEL_PRIVATE */ + struct ip; struct inpcb; struct route; struct sockopt; +#include + /* * Extra information passed to ip_output when IP_OUTARGS is set. */ struct ip_out_args { - unsigned int ipoa_boundif; /* bound outgoing interface */ - unsigned int ipoa_nocell; /* don't use IFT_CELLULAR */ + unsigned int ipoa_boundif; /* boundif interface index */ + struct flowadv ipoa_flowadv; /* flow advisory code */ + u_int32_t ipoa_flags; /* IPOAF flags (see below) */ +#define IPOAF_SELECT_SRCIF 0x00000001 /* src interface selection */ +#define IPOAF_BOUND_IF 0x00000002 /* boundif value is valid */ +#define IPOAF_BOUND_SRCADDR 0x00000004 /* bound to src address */ +#define IPOAF_NO_CELLULAR 0x00000010 /* skip IFT_CELLULAR */ }; extern struct ipstat ipstat; @@ -263,6 +290,7 @@ extern u_int32_t (*ip_mcast_src)(int); extern int rsvp_on; extern struct pr_usrreqs rip_usrreqs; extern int ip_doscopedroute; +extern int ip_restrictrecvif; extern void ip_moptions_init(void); extern struct ip_moptions *ip_allocmoptions(int); @@ -304,7 +332,7 @@ int ip_rsvp_done(void); int ip_rsvp_vif_init(struct socket *, struct sockopt *); int ip_rsvp_vif_done(struct socket *, struct sockopt *); void ip_rsvp_force_done(struct socket *); - +void ip_proto_dispatch_in_wrapper(struct mbuf *, int, u_int8_t); void in_delayed_cksum(struct mbuf *m); extern void tcp_in_cksum_stats(u_int32_t); diff --git a/bsd/netinet/kpi_ipfilter.c b/bsd/netinet/kpi_ipfilter.c index b03f56cd1..ecab6700a 100644 --- a/bsd/netinet/kpi_ipfilter.c +++ b/bsd/netinet/kpi_ipfilter.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2011 Apple Inc. All rights reserved. + * Copyright (c) 2004-2012 Apple Inc. All rights reserved. 
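 */

/*
 * Recurring conversion in this patch (sadb_stat_mutex and in_ifaddr_rwlock
 * earlier, kipf_lock in the hunk just below): locks move from heap
 * allocation, which could fail, to statically embedded storage.  The
 * pattern, with placeholder names:
 */
decl_lck_mtx_data(static, example_lock_data);	/* embedded lck_mtx_t storage */
static lck_mtx_t *example_lock = &example_lock_data;

/* old: example_lock = lck_mtx_alloc_init(grp, attr);  NULL check needed */
lck_mtx_init(example_lock, grp, attr);		/* new: cannot fail */

/*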
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -59,7 +59,8 @@ * the IP filter is marked and kipf_delayed_remove is set so that when * kipf_ref eventually goes down to zero, the IP filter is removed */ -static lck_mtx_t *kipf_lock = 0; +decl_lck_mtx_data(static, kipf_lock_data); +static lck_mtx_t *kipf_lock = &kipf_lock_data; static u_int32_t kipf_ref = 0; static u_int32_t kipf_delayed_remove = 0; u_int32_t kipf_count = 0; @@ -270,7 +271,7 @@ ipf_injectv4_out(mbuf_t data, ipfilter_t filter_ref, ipf_pktopts_t options) errno_t error = 0; struct m_tag *mtag = NULL; struct ip_moptions *imo = NULL; - struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; + struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, 0 }; /* Make the IP header contiguous in the mbuf */ if ((size_t)m->m_len < sizeof (struct ip)) { @@ -298,14 +299,18 @@ ipf_injectv4_out(mbuf_t data, ipfilter_t filter_ref, ipf_pktopts_t options) imo->imo_multicast_loop = options->ippo_mcast_loop; } - if (options != NULL && - (options->ippo_flags & (IPPOF_BOUND_IF | IPPOF_NO_IFT_CELLULAR))) { + if (options != NULL) { + if (options->ippo_flags & IPPOF_SELECT_SRCIF) + ipoa.ipoa_flags |= IPOAF_SELECT_SRCIF; if (options->ippo_flags & IPPOF_BOUND_IF) { + ipoa.ipoa_flags |= IPOAF_BOUND_IF; ipoa.ipoa_boundif = options->ippo_flags >> IPPOF_SHIFT_IFSCOPE; } if (options->ippo_flags & IPPOF_NO_IFT_CELLULAR) - ipoa.ipoa_nocell = 1; + ipoa.ipoa_flags |= IPOAF_NO_CELLULAR; + if (options->ippo_flags & IPPOF_BOUND_SRCADDR) + ipoa.ipoa_flags |= IPOAF_BOUND_SRCADDR; } bzero(&ro, sizeof(struct route)); @@ -341,7 +346,7 @@ ipf_injectv6_out(mbuf_t data, ipfilter_t filter_ref, ipf_pktopts_t options) errno_t error = 0; struct m_tag *mtag = NULL; struct ip6_moptions *im6o = NULL; - struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, { 0 }, 0 }; /* Make the IP header contiguous in the mbuf */ if ((size_t)m->m_len < sizeof(struct ip6_hdr)) { @@ -369,14 +374,18 @@ ipf_injectv6_out(mbuf_t data, ipfilter_t filter_ref, ipf_pktopts_t options) im6o->im6o_multicast_loop = options->ippo_mcast_loop; } - if (options != NULL && - (options->ippo_flags & (IPPOF_BOUND_IF | IPPOF_NO_IFT_CELLULAR))) { + if (options != NULL) { + if (options->ippo_flags & IPPOF_SELECT_SRCIF) + ip6oa.ip6oa_flags |= IP6OAF_SELECT_SRCIF; if (options->ippo_flags & IPPOF_BOUND_IF) { + ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; ip6oa.ip6oa_boundif = options->ippo_flags >> IPPOF_SHIFT_IFSCOPE; } if (options->ippo_flags & IPPOF_NO_IFT_CELLULAR) - ip6oa.ip6oa_nocell = 1; + ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR; + if (options->ippo_flags & IPPOF_BOUND_SRCADDR) + ip6oa.ip6oa_flags |= IP6OAF_BOUND_SRCADDR; } bzero(&ro, sizeof(struct route_in6)); @@ -481,19 +490,9 @@ ipf_init(void) goto done; } - kipf_lock = lck_mtx_alloc_init(lck_grp, lck_attributes); - if (kipf_lock == 0) { - printf("ipf_init: lck_mtx_alloc_init failed\n"); - error = ENOMEM; - goto done; - } + lck_mtx_init(kipf_lock, lck_grp, lck_attributes); + done: - if (error != 0) { - if (kipf_lock) { - lck_mtx_free(kipf_lock, lck_grp); - kipf_lock = 0; - } - } if (lck_grp) { lck_grp_free(lck_grp); lck_grp = 0; diff --git a/bsd/netinet/kpi_ipfilter.h b/bsd/netinet/kpi_ipfilter.h index 1f7fae6f0..6fe3727fe 100644 --- a/bsd/netinet/kpi_ipfilter.h +++ b/bsd/netinet/kpi_ipfilter.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2010 Apple Inc. All rights reserved. + * Copyright (c) 2008-2012 Apple Inc. All rights reserved.
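The kipf_lock change in the hunk above is a pattern worth noting: the mutex used to be heap-allocated with lck_mtx_alloc_init(), which can fail and forced ipf_init() to carry an ENOMEM unwind path. decl_lck_mtx_data() instead declares the lock storage statically, so lck_mtx_init() only initializes memory that already exists and the failure path disappears. Reduced to its shape (hypothetical names):

/* Before: pointer plus a runtime allocation that may fail. */
static lck_mtx_t *my_lock = NULL;
/* my_lock = lck_mtx_alloc_init(grp, attr);  -- may return NULL */

/* After: the storage is part of the image; initialization cannot fail. */
decl_lck_mtx_data(static, my_lock_data);
static lck_mtx_t *my_lock = &my_lock_data;
/* lck_mtx_init(my_lock, grp, attr); */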
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -54,6 +54,8 @@ struct ipf_pktopts { #ifdef PRIVATE #define IPPOF_BOUND_IF 0x2 #define IPPOF_NO_IFT_CELLULAR 0x4 +#define IPPOF_SELECT_SRCIF 0x8 +#define IPPOF_BOUND_SRCADDR 0x10 #define IPPOF_SHIFT_IFSCOPE 16 #endif /* PRIVATE */ diff --git a/osfmk/ddb/db_cond.h b/bsd/netinet/lro_ext.h similarity index 51% rename from osfmk/ddb/db_cond.h rename to bsd/netinet/lro_ext.h index 4e8c98a21..db2a4322e 100644 --- a/osfmk/ddb/db_cond.h +++ b/bsd/netinet/lro_ext.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -21,52 +21,44 @@ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and - * limitations under the License. + * limitations under the License * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:47 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.6.1 1994/09/23 01:18:37 ezf - * change marker to not FREE - * [1994/09/22 21:09:41 ezf] - * - * Revision 1.1.2.3 1993/09/17 21:34:31 robert - * change marker to OSF_FREE_COPYRIGHT - * [1993/09/17 21:27:07 robert] - * - * Revision 1.1.2.2 1993/07/27 18:27:04 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:11:18 elliston] - * - * $EndLog$ - */ -#ifndef _DDB_DB_COND_H_ -#define _DDB_DB_COND_H_ +#ifndef LRO_EXT_H_ +#define LRO_EXT_H_ -#include -#include +#ifdef BSD_KERNEL_PRIVATE -/* Prototypes for functions exported by this module. - */ +/* All definitions exported from LRO go into this file */ + +extern int sw_lro; +extern int lrodebug; + +/* flow return values */ +#define TCP_LRO_NAN 0x00 /* No flow exists */ +#define TCP_LRO_CONSUMED 0x01 /* LRO consumed the packet */ +#define TCP_LRO_EJECT_FLOW 0x02 /* LRO ejected the flow */ +#define TCP_LRO_COALESCE 0x03 /* LRO to coalesce the packet */ +#define TCP_LRO_COLLISION 0x04 /* Two flows map to the same slot */ + +void tcp_lro_init(void); + +/* When doing LRO in IP call this function */ +struct mbuf* tcp_lro(struct mbuf *m, unsigned int hlen); -void db_cond_free(db_thread_breakpoint_t bkpt); +/* TCP calls this to start coalescing a flow */ +int tcp_start_coalescing(struct ip *, struct tcphdr *, int tlen); -boolean_t db_cond_check(db_thread_breakpoint_t bkpt); +/* TCP calls this to stop coalescing a flow */ +int tcp_lro_remove_state(struct in_addr, struct in_addr, unsigned short, + unsigned short); -void db_cond_print(db_thread_breakpoint_t bkpt); +/* TCP calls this to keep the seq number updated */ +void tcp_update_lro_seq(__uint32_t, struct in_addr, struct in_addr, + unsigned short, unsigned short); -void db_cond_cmd(void); +#endif -#endif /* !_DDB_DB_COND_H_ */ +#endif /* LRO_EXT_H_ */ diff --git a/bsd/netinet/raw_ip.c b/bsd/netinet/raw_ip.c index 0b63a3c0d..f06517a26 100644 --- a/bsd/netinet/raw_ip.c +++ b/bsd/netinet/raw_ip.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
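lro_ext.h above is the entire software-LRO surface: IP input offers each inbound TCP segment to tcp_lro(), and TCP itself starts or tears down coalescing per flow. The header does not spell out the calling convention, but the mbuf-returning signature suggests a dispatch loop along these lines (a sketch under the assumption that tcp_lro() returns NULL when it has consumed or is holding the segment, and otherwise returns the chain to deliver):

/* Hypothetical inbound-path caller. */
if (sw_lro) {
    m = tcp_lro(m, hlen);       /* hlen: IP header length */
    if (m == NULL)
        return;                 /* assumed: held for coalescing */
}
/* fall through to normal TCP input with m */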
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -71,6 +71,7 @@ #include #include #include +#include #include #include #include @@ -130,10 +131,10 @@ struct inpcbinfo ripcbinfo; /* control hooks for ipfw and dummynet */ #if IPFIREWALL ip_fw_ctl_t *ip_fw_ctl_ptr; +#endif /* IPFIREWALL */ #if DUMMYNET ip_dn_ctl_t *ip_dn_ctl_ptr; #endif /* DUMMYNET */ -#endif /* IPFIREWALL */ /* * Nominal space allocated to a raw ip socket. @@ -202,6 +203,9 @@ rip_input(m, iphlen) struct mbuf *opts = 0; int skipit = 0, ret = 0; + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + ripsrc.sin_addr = ip->ip_src; lck_rw_lock_shared(ripcbinfo.mtx); LIST_FOREACH(inp, &ripcb, inp_list) { @@ -349,21 +353,28 @@ rip_output( register struct ip *ip; register struct inpcb *inp = sotoinpcb(so); int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST; - struct ip_out_args ipoa; + struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF }; struct ip_moptions *imo; int error = 0; - mbuf_traffic_class_t mtc = MBUF_TC_UNSPEC; + mbuf_svc_class_t msc = MBUF_SC_UNSPEC; if (control != NULL) { - mtc = mbuf_traffic_class_from_control(control); + msc = mbuf_service_class_from_control(control); m_freem(control); } - /* If socket was bound to an ifindex, tell ip_output about it */ - ipoa.ipoa_boundif = (inp->inp_flags & INP_BOUND_IF) ? - inp->inp_boundif : IFSCOPE_NONE; - ipoa.ipoa_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + flags |= IP_OUTARGS; + /* If socket was bound to an ifindex, tell ip_output about it */ + if (inp->inp_flags & INP_BOUND_IF) { + ipoa.ipoa_boundif = inp->inp_boundifp->if_index; + ipoa.ipoa_flags |= IPOAF_BOUND_IF; + } + if (inp->inp_flags & INP_NO_IFT_CELLULAR) + ipoa.ipoa_flags |= IPOAF_NO_CELLULAR; + + if (inp->inp_flowhash == 0) + inp->inp_flowhash = inp_calc_flowhash(inp); /* * If the user handed us a complete IP packet, use it. @@ -411,6 +422,9 @@ rip_output( OSAddAtomic(1, &ipstat.ips_rawout); } + if (inp->inp_laddr.s_addr != INADDR_ANY) + ipoa.ipoa_flags |= IPOAF_BOUND_SRCADDR; + #if IPSEC if (ipsec_bypass == 0 && ipsec_setsocket(m, so) != 0) { m_freem(m); @@ -424,7 +438,9 @@ rip_output( inp->inp_route.ro_rt = NULL; } - set_packet_tclass(m, so, mtc, 0); + set_packet_service_class(m, so, msc, 0); + m->m_pkthdr.m_flowhash = inp->inp_flowhash; + m->m_pkthdr.m_fhflags |= PF_TAG_FLOWHASH; #if CONFIG_MACF_NET mac_mbuf_label_associate_inpcb(inp, m); @@ -446,7 +462,7 @@ rip_output( if (inp->inp_route.ro_rt != NULL) { struct rtentry *rt = inp->inp_route.ro_rt; - unsigned int outif; + struct ifnet *outif; if ((rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST)) || inp->inp_socket == NULL || @@ -463,12 +479,11 @@ rip_output( } /* * If this is a connected socket and the destination - * route is unicast, update outif with that of the route - * interface index used by IP. + * route is unicast, update outif with that of the + * route interface used by IP. 
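rip_output now computes a per-PCB flow hash lazily, the first time the socket sends, and stamps every outgoing packet with it so that downstream layers (flow control, flow advisories) can identify the flow without reparsing headers. The stamping pattern from the hunk, condensed with comments:

if (inp->inp_flowhash == 0)
    inp->inp_flowhash = inp_calc_flowhash(inp);  /* once per PCB */

set_packet_service_class(m, so, msc, 0);     /* MBUF_SC_* from control mbufs */
m->m_pkthdr.m_flowhash = inp->inp_flowhash;  /* tag the packet's flow */
m->m_pkthdr.m_fhflags |= PF_TAG_FLOWHASH;    /* mark the hash as valid */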
*/ - if (rt != NULL && - (outif = rt->rt_ifp->if_index) != inp->inp_last_outif) - inp->inp_last_outif = outif; + if (rt != NULL && (outif = rt->rt_ifp) != inp->inp_last_outifp) + inp->inp_last_outifp = outif; } return (error); @@ -503,7 +518,9 @@ rip_ctloutput(so, sopt) struct inpcb *inp = sotoinpcb(so); int error, optval; - if (sopt->sopt_level != IPPROTO_IP) + /* Allow at this level */ + if (sopt->sopt_level != IPPROTO_IP && + !(sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_FLUSH)) return (EINVAL); error = 0; @@ -516,10 +533,10 @@ rip_ctloutput(so, sopt) error = sooptcopyout(sopt, &optval, sizeof optval); break; - case IP_STRIPHDR: - optval = inp->inp_flags & INP_STRIPHDR; - error = sooptcopyout(sopt, &optval, sizeof optval); - break; + case IP_STRIPHDR: + optval = inp->inp_flags & INP_STRIPHDR; + error = sooptcopyout(sopt, &optval, sizeof optval); + break; #if IPFIREWALL case IP_FW_ADD: @@ -537,6 +554,8 @@ rip_ctloutput(so, sopt) #if DUMMYNET case IP_DUMMYNET_GET: + if (!DUMMYNET_LOADED) + ip_dn_init(); if (DUMMYNET_LOADED) error = ip_dn_ctl_ptr(sopt); else @@ -576,17 +595,16 @@ rip_ctloutput(so, sopt) inp->inp_flags &= ~INP_HDRINCL; break; - case IP_STRIPHDR: - error = sooptcopyin(sopt, &optval, sizeof optval, - sizeof optval); - if (error) - break; - if (optval) - inp->inp_flags |= INP_STRIPHDR; - else - inp->inp_flags &= ~INP_STRIPHDR; - break; - + case IP_STRIPHDR: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + if (optval) + inp->inp_flags |= INP_STRIPHDR; + else + inp->inp_flags &= ~INP_STRIPHDR; + break; #if IPFIREWALL case IP_FW_ADD: @@ -612,6 +630,8 @@ rip_ctloutput(so, sopt) case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: case IP_DUMMYNET_FLUSH: + if (!DUMMYNET_LOADED) + ip_dn_init(); if (DUMMYNET_LOADED) error = ip_dn_ctl_ptr(sopt); else @@ -632,11 +652,11 @@ rip_ctloutput(so, sopt) case IP_RSVP_VIF_ON: error = ip_rsvp_vif_init(so, sopt); break; - + case IP_RSVP_VIF_OFF: error = ip_rsvp_vif_done(so, sopt); break; - + case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: @@ -649,6 +669,14 @@ rip_ctloutput(so, sopt) break; #endif /* MROUTING */ + case SO_FLUSH: + if ((error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval))) != 0) + break; + + error = inp_flush(inp, optval); + break; + default: error = ip_ctloutput(so, sopt); break; @@ -822,9 +850,9 @@ __private_extern__ int rip_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) { struct inpcb *inp = sotoinpcb(so); - struct sockaddr_in *addr = (struct sockaddr_in *)nam; + struct sockaddr_in *addr = (struct sockaddr_in *)(void *)nam; struct ifaddr *ifa = NULL; - unsigned int outif = 0; + struct ifnet *outif = NULL; if (nam->sa_len != sizeof(*addr)) return EINVAL; @@ -837,12 +865,12 @@ rip_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) } else if (ifa) { IFA_LOCK(ifa); - outif = ifa->ifa_ifp->if_index; + outif = ifa->ifa_ifp; IFA_UNLOCK(ifa); IFA_REMREF(ifa); } inp->inp_laddr = addr->sin_addr; - inp->inp_last_outif = outif; + inp->inp_last_outifp = outif; return 0; } @@ -850,7 +878,7 @@ __private_extern__ int rip_connect(struct socket *so, struct sockaddr *nam, __unused struct proc *p) { struct inpcb *inp = sotoinpcb(so); - struct sockaddr_in *addr = (struct sockaddr_in *)nam; + struct sockaddr_in *addr = (struct sockaddr_in *)(void *)nam; if (nam->sa_len != sizeof(*addr)) return EINVAL; @@ -861,6 +889,7 @@ rip_connect(struct socket *so, struct sockaddr *nam, __unused struct proc *p) return EAFNOSUPPORT; inp->inp_faddr = 
addr->sin_addr; soisconnected(so); + return 0; } @@ -889,7 +918,7 @@ rip_send(struct socket *so, __unused int flags, struct mbuf *m, struct sockaddr m_freem(m); return ENOTCONN; } - dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; + dst = ((struct sockaddr_in *)(void *)nam)->sin_addr.s_addr; } return rip_output(m, so, dst, control); } diff --git a/bsd/netinet/tcp.h b/bsd/netinet/tcp.h index a3a183bfe..99264dcbe 100644 --- a/bsd/netinet/tcp.h +++ b/bsd/netinet/tcp.h @@ -213,19 +213,13 @@ struct tcphdr { * buffer queues. */ #ifdef PRIVATE -#define TCP_INFO 0x200 /* retrieve tcp_info structure */ - +#define TCP_INFO 0x200 /* retrieve tcp_info structure */ +#define TCP_NOTSENT_LOWAT 0x201 /* Low water mark for TCP unsent data */ +#define TCP_MEASURE_SND_BW 0x202 /* Measure sender's bandwidth for this connection */ +#define TCP_MEASURE_BW_BURST 0x203 /* Burst size to use for bandwidth measurement */ +#define TCP_PEER_PID 0x204 /* Lookup pid of the process we're connected to */ /* - * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits - * the caller to query certain information about the state of a TCP - * connection. We provide an overlapping set of fields with the Linux - * implementation, but since this is a fixed size structure, room has been - * left for growth. In order to maximize potential future compatibility with - * the Linux API, the same variable names and order have been adopted, and - * padding left to make room for omitted fields in case they are added later. - * - * XXX: This is currently an unstable ABI/API, in that it is expected to - * change. + * The TCP_INFO socket option is a private API and is subject to change */ #pragma pack(4) @@ -234,26 +228,52 @@ struct tcphdr { #define TCPI_OPT_WSCALE 0x04 #define TCPI_OPT_ECN 0x08 +#define TCPI_FLAG_LOSSRECOVERY 0x01 /* Currently in loss recovery */ + struct tcp_info { u_int8_t tcpi_state; /* TCP FSM state. */ u_int8_t tcpi_options; /* Options enabled on conn. */ u_int8_t tcpi_snd_wscale; /* RFC1323 send shift value. */ u_int8_t tcpi_rcv_wscale; /* RFC1323 recv shift value. */ + u_int32_t tcpi_flags; /* extra flags (TCPI_FLAG_xxx) */ + + u_int32_t tcpi_rto; /* Retransmission timeout in milliseconds */ u_int32_t tcpi_snd_mss; /* Max segment size for send. */ u_int32_t tcpi_rcv_mss; /* Max segment size for receive. */ + u_int32_t tcpi_rttcur; /* Most recent value of RTT */ + u_int32_t tcpi_srtt; /* Smoothed RTT */ + u_int32_t tcpi_rttvar; /* RTT variance */ + u_int32_t tcpi_snd_ssthresh; /* Slow start threshold. */ u_int32_t tcpi_snd_cwnd; /* Send congestion window. */ u_int32_t tcpi_rcv_space; /* Advertised recv window. */ u_int32_t tcpi_snd_wnd; /* Advertised send window. */ - u_int32_t tcpi_snd_bwnd; /* Bandwidth send window. 
*/ u_int32_t tcpi_snd_nxt; /* Next egress seqno */ u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ int32_t tcpi_last_outif; /* if_index of interface used to send last */ + u_int32_t tcpi_snd_sbbytes; /* bytes in snd buffer including data inflight */ + + u_int64_t tcpi_txbytes __attribute__((aligned(8))); + /* total bytes sent */ + u_int64_t tcpi_txretransmitbytes __attribute__((aligned(8))); + /* total bytes retransmitted */ + u_int64_t tcpi_txunacked __attribute__((aligned(8))); + /* current number of bytes not acknowledged */ + u_int64_t tcpi_rxbytes __attribute__((aligned(8))); + /* total bytes received */ + u_int64_t tcpi_rxduplicatebytes __attribute__((aligned(8))); + /* total duplicate bytes received */ + u_int64_t tcpi_snd_bw __attribute__((aligned(8))); /* measured send bandwidth in bits/sec */ +}; + +struct tcp_measure_bw_burst { + u_int32_t min_burst_size; /* Minimum number of packets to use */ + u_int32_t max_burst_size; /* Maximum number of packets to use */ }; /* diff --git a/bsd/netinet/tcp_cc.h b/bsd/netinet/tcp_cc.h index c78ba3531..cf1f0fb80 100644 --- a/bsd/netinet/tcp_cc.h +++ b/bsd/netinet/tcp_cc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 Apple Inc. All rights reserved. + * Copyright (c) 2010-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -97,7 +97,7 @@ struct tcp_cc_algo { void (*ack_rcvd) (struct tcpcb *tp, struct tcphdr *th); /* called before entering FR */ - void (*pre_fr) (struct tcpcb *tp, struct tcphdr *th); + void (*pre_fr) (struct tcpcb *tp); /* after exiting FR */ void (*post_fr) (struct tcpcb *tp, struct tcphdr *th); @@ -120,5 +120,8 @@ extern struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT]; #define CC_ALGO(tp) (tcp_cc_algo_list[tp->tcp_cc_index]) +extern void tcp_cc_resize_sndbuf(struct tcpcb *tp); +extern void tcp_bad_rexmt_fix_sndbuf(struct tcpcb *tp); + #endif /* KERNEL */ #endif /* _NETINET_CC_H_ */ diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index 6f06b2b14..7d29a31c2 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
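The tcp.h hunk above drops the Linux-compatibility framing: TCP_INFO is now explicitly a private, unstable interface, and tcp_info gains RTT fields, loss-recovery state, and 8-byte-aligned byte counters. Assuming the option continues to follow ordinary getsockopt conventions on a PRIVATE build, a caller would look like this (a sketch, not a stable API):

#include <sys/socket.h>
#include <netinet/tcp.h>
#include <stdio.h>

struct tcp_info ti;
socklen_t len = sizeof (ti);

if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0) {
    printf("rto %u ms, srtt %u, unacked %llu bytes\n",
        ti.tcpi_rto, ti.tcpi_srtt,
        (unsigned long long)ti.tcpi_txunacked);
}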
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -78,6 +78,7 @@ #include #include #include +#include #include /* before tcp_seq.h, for tcp_random18() */ @@ -134,6 +135,7 @@ struct tcphdr tcp_savetcp; #endif /* CONFIG_MACF_NET || CONFIG_MACF_SOCKET */ #include +#include #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 0) #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 2) @@ -147,6 +149,8 @@ tcp_cc tcp_ccgen; extern int ipsec_bypass; #endif +extern int32_t total_sbmb_cnt; + struct tcpstat tcpstat; static int log_in_vain = 0; @@ -225,11 +229,49 @@ int tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH; SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_high_thresh, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_acc_iaj_high_thresh, 0, "Used in calculating maximum accumulated IAJ"); +u_int32_t tcp_do_autorcvbuf = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, doautorcvbuf, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_do_autorcvbuf, 0, "Enable automatic socket buffer tuning"); + +u_int32_t tcp_autorcvbuf_inc_shift = 3; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufincshift, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_autorcvbuf_inc_shift, 0, "Shift for increment in receive socket buffer size"); + +u_int32_t tcp_autorcvbuf_max = 512 * 1024; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufmax, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_autorcvbuf_max, 0, "Maximum receive socket buffer size"); + +int sw_lro = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_LOCKED, + &sw_lro, 0, "Used to coalesce TCP packets"); + +int lrodebug = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, lrodbg, CTLFLAG_RW | CTLFLAG_LOCKED, + &lrodebug, 0, "Used to debug SW LRO"); + +int lro_start = 3; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_startcnt, CTLFLAG_RW | CTLFLAG_LOCKED, + &lro_start, 0, "Segments for starting LRO computed as power of 2"); + +extern int tcp_do_autosendbuf; + #if CONFIG_IFEF_NOWINDOWSCALE int tcp_obey_ifef_nowindowscale = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_obey_ifef_nowindowscale, 0, ""); #endif +/* This limit will determine when the receive socket buffer tuning will + * kick in. Currently it will start when the bw*delay measured in + * last RTT is more than half of the current hiwat on the buffer. + */ +uint32_t tcp_rbuf_hiwat_shift = 1; + +/* This limit will determine when the socket buffer will be increased + * to accommodate an application reading slowly. When the amount of + * space left in the buffer is less than one fourth of the bw*delay + * measured in last RTT. + */ +uint32_t tcp_rbuf_win_shift = 2; extern int tcp_TCPTV_MIN; extern int tcp_acc_iaj_high; @@ -258,13 +300,23 @@ static inline int tcp_stretch_ack_enable(struct tcpcb *tp); #if TRAFFIC_MGT static inline void update_iaj_state(struct tcpcb *tp, uint32_t tlen, int reset_size); void compute_iaj(struct tcpcb *tp); -static inline void clear_iaj_state(struct tcpcb *tp); #endif /* TRAFFIC_MGT */ #if INET6 static inline unsigned int tcp_maxmtu6(struct rtentry *); #endif +static void tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sb, + struct tcpopt *to, u_int32_t tlen); + +void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb); +static void tcp_sbsnd_trim(struct sockbuf *sbsnd); +static inline void tcp_sbrcv_tstmp_check(struct tcpcb *tp); +static inline void tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sb, + u_int32_t newsize, u_int32_t idealsize); + +#define TCPTV_RCVNOTS_QUANTUM 100 +#define TCP_RCVNOTS_BYTELEVEL 204800 /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
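All of the knobs added above are plain CTLFLAG_RW integers, so they are reachable from user space with sysctlbyname() under the names implied by the declarations ("net.inet.tcp.doautorcvbuf", "net.inet.tcp.autorcvbufmax", "net.inet.tcp.lro", and so on). A small sketch; the 1 MiB value is an example, not a recommendation:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int cur = 0, newmax = 1024 * 1024;
size_t len = sizeof (cur);

if (sysctlbyname("net.inet.tcp.autorcvbufmax", &cur, &len, NULL, 0) == 0)
    printf("autorcvbufmax = %d\n", cur);    /* default above: 512 KB */
/* raising the limit requires privilege */
sysctlbyname("net.inet.tcp.autorcvbufmax", NULL, NULL, &newmax, sizeof (newmax));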
*/ #if INET6 #define ND6_HINT(tp) \ @@ -284,8 +336,6 @@ extern void postevent(struct socket *, struct sockbuf *, int); extern void ipfwsyslog( int level, const char *format,...); extern int ChkAddressOK( __uint32_t dstaddr, __uint32_t srcaddr ); extern int fw_verbose; -__private_extern__ int tcp_sockthreshold; -__private_extern__ int tcp_win_scale; #if IPFIREWALL #define log_in_vain_log( a ) { \ @@ -301,6 +351,8 @@ __private_extern__ int tcp_win_scale; int tcp_rcvunackwin = TCPTV_UNACKWIN; int tcp_maxrcvidle = TCPTV_MAXRCVIDLE; int tcp_rcvsspktcnt = TCP_RCV_SS_PKTCOUNT; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rcvsspktcnt, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_rcvsspktcnt, 0, "packets to be seen before receiver stretches acks"); #define DELAY_ACK(tp, th) (CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th)) @@ -312,6 +364,7 @@ uint32_t get_base_rtt(struct tcpcb *tp); void tcp_set_background_cc(struct socket *so); void tcp_set_foreground_cc(struct socket *so); static void tcp_set_new_cc(struct socket *so, uint16_t cc_index); +static void tcp_bwmeas_check(struct tcpcb *tp); #if TRAFFIC_MGT void @@ -319,7 +372,7 @@ reset_acc_iaj(struct tcpcb *tp) { tp->acc_iaj = 0; tp->iaj_rwintop = 0; - clear_iaj_state(tp); + CLEAR_IAJ_STATE(tp); } static inline void @@ -334,12 +387,6 @@ update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size) } } -static inline void -clear_iaj_state(struct tcpcb *tp) -{ - tp->iaj_rcv_ts = 0; -} - /* For every 32 bit unsigned integer(v), this function will find the * largest integer n such that (n*n <= v). This takes at most 16 iterations * irrespective of the value of v and does not involve multiplications. @@ -433,6 +480,34 @@ compute_iaj(struct tcpcb *tp) } #endif /* TRAFFIC_MGT */ +/* Check if enough amount of data has been acknowledged since + * bw measurement was started + */ +static void +tcp_bwmeas_check(struct tcpcb *tp) +{ + int32_t bw_meas_bytes; + uint32_t bw, bytes, elapsed_time; + bw_meas_bytes = tp->snd_una - tp->t_bwmeas->bw_start; + if ((tp->t_flagsext & TF_BWMEAS_INPROGRESS) != 0 && + bw_meas_bytes >= (int32_t)(tp->t_bwmeas->bw_size)) { + bytes = bw_meas_bytes; + elapsed_time = tcp_now - tp->t_bwmeas->bw_ts; + if (elapsed_time > 0) { + bw = bytes / elapsed_time; + if ( bw > 0) { + if (tp->t_bwmeas->bw_sndbw > 0) { + tp->t_bwmeas->bw_sndbw = + (((tp->t_bwmeas->bw_sndbw << 3) - tp->t_bwmeas->bw_sndbw) + bw) >> 3; + } else { + tp->t_bwmeas->bw_sndbw = bw; + } + } + } + tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS); + } +} + static int tcp_reass(tp, th, tlenp, m) register struct tcpcb *tp; @@ -596,8 +671,16 @@ present: if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); q = LIST_FIRST(&tp->t_segq); - if (!q || q->tqe_th->th_seq != tp->rcv_nxt) + if (!q || q->tqe_th->th_seq != tp->rcv_nxt) { + /* Stop using LRO once out of order packets arrive */ + if (tp->t_flagsext & TF_LRO_OFFLOADED) { + tcp_lro_remove_state(tp->t_inpcb->inp_laddr, + tp->t_inpcb->inp_faddr, + th->th_dport, th->th_sport); + tp->t_flagsext &= ~TF_LRO_OFFLOADED; + } return (0); + } do { tp->rcv_nxt += q->tqe_len; flags = q->tqe_th->th_flags & TH_FIN; @@ -609,6 +692,11 @@ present: so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */ if (sbappendstream(&so->so_rcv, q->tqe_m)) dowakeup = 1; + if (tp->t_flagsext & TF_LRO_OFFLOADED) { + tcp_update_lro_seq(tp->rcv_nxt, + tp->t_inpcb->inp_laddr, + tp->t_inpcb->inp_faddr, th->th_dport, th->th_sport); + } } zfree(tcp_reass_zone, q); tcp_reass_qsize--; @@ -645,7 +733,7 @@ present: */ static void tcp_reduce_congestion_window( - struct tcpcb *tp, struct 
tcphdr *th) + struct tcpcb *tp) { /* * If the current tcp cc module has @@ -653,7 +741,7 @@ tcp_reduce_congestion_window( * before entering FR, call it */ if (CC_ALGO(tp)->pre_fr != NULL) - CC_ALGO(tp)->pre_fr(tp, th); + CC_ALGO(tp)->pre_fr(tp); ENTER_FASTRECOVERY(tp); tp->snd_recover = tp->snd_max; tp->t_timer[TCPT_REXMT] = 0; @@ -675,9 +763,13 @@ tcp6_input(struct mbuf **mp, int *offp, int proto) #pragma unused(proto) register struct mbuf *m = *mp; struct in6_ifaddr *ia6; + struct ifnet *ifp = ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) ? m->m_pkthdr.rcvif: NULL; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE); + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? @@ -694,6 +786,10 @@ tcp6_input(struct mbuf **mp, int *offp, int proto) icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->icmp6unreach, 1); + return (IPPROTO_DONE); } IFA_UNLOCK(&ia6->ia_ifa); @@ -705,6 +801,238 @@ tcp6_input(struct mbuf **mp, int *offp, int proto) } #endif +/* Depending on the usage of mbuf space in the system, this function + * will return true or false. This is used to determine if a socket + * buffer can take more memory from the system for auto-tuning or not. + */ +u_int8_t +tcp_cansbgrow(struct sockbuf *sb) +{ + /* Calculate the host level space limit in terms of MSIZE buffers. + * We can use a maximum of half of the available mbuf space for + * socket buffers. + */ + u_int32_t mblim = ((nmbclusters >> 1) << (MCLSHIFT - MSIZESHIFT)); + + /* Calculate per sb limit in terms of bytes. We optimize this limit + * for up to 16 socket buffers. + */ + + u_int32_t sbspacelim = ((nmbclusters >> 4) << MCLSHIFT); + + if ((total_sbmb_cnt < mblim) && + (sb->sb_hiwat < sbspacelim)) { + return(1); + } + return(0); +} + +void +tcp_sbrcv_reserve(struct tcpcb *tp, + struct sockbuf *sbrcv, + u_int32_t newsize, + u_int32_t idealsize) { + + /* newsize should not exceed max */ + newsize = min(newsize, tcp_autorcvbuf_max); + + /* The receive window scale negotiated at the + * beginning of the connection will also set a + * limit on the socket buffer size + */ + newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale); + + /* Set new socket buffer size */ + if (newsize > sbrcv->sb_hiwat && + (sbreserve(sbrcv, newsize) == 1)) { + sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize, + (idealsize != 0) ? idealsize : newsize), + tcp_autorcvbuf_max); + + /* Again check the limit set by the advertised + * window scale + */ + sbrcv->sb_idealsize = min(sbrcv->sb_idealsize, + TCP_MAXWIN << tp->rcv_scale); + } +} + +/* + * This function is used to grow a receive socket buffer. It + * will take into account system-level memory usage and the + * bandwidth available on the link to make a decision.
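The two limits in tcp_cansbgrow() above are easy to misread because they are pure shift arithmetic. The first converts half of all cluster memory into MSIZE-buffer units (one 2 KB cluster equals eight 256-byte MSIZE buffers); the second caps a single socket buffer at 1/16 of cluster memory in bytes, i.e. sized so roughly 16 buffers can reach the cap. Worked numbers, assuming the usual MSIZE/MCLBYTES of 256/2048 and an example nmbclusters of 65536 (all three vary by configuration):

#include <stdio.h>

#define MSIZESHIFT 8    /* assumes MSIZE = 256 */
#define MCLSHIFT   11   /* assumes MCLBYTES = 2048 */

int main(void)
{
    unsigned int nmbclusters = 65536;   /* example value; boot-time tunable */

    /* global cap, counted in MSIZE buffers: half of all cluster memory */
    unsigned int mblim = (nmbclusters >> 1) << (MCLSHIFT - MSIZESHIFT);

    /* per-buffer cap in bytes: 1/16 of cluster memory */
    unsigned int sbspacelim = (nmbclusters >> 4) << MCLSHIFT;

    printf("mblim = %u MSIZE buffers, sbspacelim = %u bytes\n",
        mblim, sbspacelim);             /* 262144 and 8388608 (8 MiB) */
    return 0;
}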
+ */ +static void +tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv, + struct tcpopt *to, u_int32_t pktlen) { + + if (tcp_do_autorcvbuf == 0 || + (sbrcv->sb_flags & SB_AUTOSIZE) == 0 || + tcp_cansbgrow(sbrcv) == 0 || + sbrcv->sb_hiwat >= tcp_autorcvbuf_max) { + /* Cannot resize the socket buffer, just return */ + goto out; + } + + if (TSTMP_GT(tcp_now, + tp->rfbuf_ts + TCPTV_RCVBUFIDLE)) { + /* If there has been an idle period in the + * connection, just restart the measurement + */ + goto out; + } + + if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) != + (TF_REQ_TSTMP | TF_RCVD_TSTMP)) { + /* + * Timestamp option is not supported on this connection. + * If the connection reached a state to indicate that + * the receive socket buffer needs to grow, increase + * the high water mark. + */ + if (TSTMP_GEQ(tcp_now, + tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) { + if (tp->rfbuf_cnt >= TCP_RCVNOTS_BYTELEVEL) { + tcp_sbrcv_reserve(tp, sbrcv, + tcp_autorcvbuf_max, 0); + } + goto out; + } else { + tp->rfbuf_cnt += pktlen; + return; + } + } else if (to->to_tsecr != 0) { + /* If the timestamp shows that one RTT has + * completed, we can stop counting the + * bytes. Here we consider increasing + * the socket buffer if it fits the following + * criteria: + * 1. the bandwidth measured in last rtt, is more + * than half of sb_hiwat, this will help to scale the + * buffer according to the bandwidth on the link. + * 2. the space left in sbrcv is less than + * one fourth of the bandwidth measured in last rtt, this + * will help to accommodate an application reading slowly. + */ + if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) { + if ((tp->rfbuf_cnt > (sbrcv->sb_hiwat - + (sbrcv->sb_hiwat >> tcp_rbuf_hiwat_shift)) || + (sbrcv->sb_hiwat - sbrcv->sb_cc) < + (tp->rfbuf_cnt >> tcp_rbuf_win_shift))) { + u_int32_t rcvbuf_inc; + /* + * Increment the receive window by a multiple of + * maximum sized segments. This will prevent a + * connection from sending smaller segments on + * wire if it is limited by the receive window. + * + * Set the ideal size based on current bandwidth + * measurements. We set the ideal size on receive + * socket buffer to be twice the bandwidth delay + * product. + */ + rcvbuf_inc = tp->t_maxseg << tcp_autorcvbuf_inc_shift; + tcp_sbrcv_reserve(tp, sbrcv, + sbrcv->sb_hiwat + rcvbuf_inc, + (tp->rfbuf_cnt * 2)); + } + goto out; + } else { + tp->rfbuf_cnt += pktlen; + return; + } + } +out: + /* Restart the measurement */ + tp->rfbuf_ts = 0; + tp->rfbuf_cnt = 0; + return; +} + +/* This function will trim the excess space added to the socket buffer + * to help a slow-reading app. The ideal-size of a socket buffer depends + * on the link bandwidth or it is set by an application and we aim to + * reach that size. + */ +void +tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) { + if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 && + sbrcv->sb_hiwat > sbrcv->sb_idealsize) { + int32_t trim; + /* compute the difference between ideal and current sizes */ + u_int32_t diff = sbrcv->sb_hiwat - sbrcv->sb_idealsize; + + /* Compute the maximum advertised window for + * this connection. + */ + u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt; + + /* How much can we trim the receive socket buffer? + * 1. it cannot be trimmed beyond the max rcv win advertised + * 2.
if possible, leave 1/16 of bandwidth*delay to + * avoid closing the win completely + */ + u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4)); + + /* Sometimes leave can be zero, in that case leave at least + * a few segments worth of space. + */ + if (leave == 0) + leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift; + + trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave); + trim = imin(trim, (int32_t)diff); + + if (trim > 0) + sbreserve(sbrcv, (sbrcv->sb_hiwat - trim)); + } +} + +/* We may need to trim the send socket buffer size for two reasons: + * 1. if the rtt seen on the connection is climbing up, we do not + * want to fill the buffers any more. + * 2. if the congestion win on the socket backed off, there is no need + * to hold more mbufs for that connection than what the cwnd will allow. + */ +void +tcp_sbsnd_trim(struct sockbuf *sbsnd) { + if (tcp_do_autosendbuf == 1 && + ((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) == + (SB_AUTOSIZE | SB_TRIM)) && + (sbsnd->sb_idealsize > 0) && + (sbsnd->sb_hiwat > sbsnd->sb_idealsize)) { + u_int32_t trim = 0; + if (sbsnd->sb_cc <= sbsnd->sb_idealsize) { + trim = sbsnd->sb_hiwat - sbsnd->sb_idealsize; + } else { + trim = sbsnd->sb_hiwat - sbsnd->sb_cc; + } + sbreserve(sbsnd, (sbsnd->sb_hiwat - trim)); + } + if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize) + sbsnd->sb_flags &= ~(SB_TRIM); +} + +/* + * If timestamp option was not negotiated on this connection + * and this connection is on the receiving side of a stream + * then we can not measure the delay on the link accurately. + * Instead of enabling automatic receive socket buffer + * resizing, just give more space to the receive socket buffer. + */ +static inline void +tcp_sbrcv_tstmp_check(struct tcpcb *tp) { + struct socket *so = tp->t_inpcb->inp_socket; + u_int32_t newsize = 2 * tcp_recvspace; + struct sockbuf *sbrcv = &so->so_rcv; + + if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) != + (TF_REQ_TSTMP | TF_RCVD_TSTMP) && + (sbrcv->sb_flags & SB_AUTOSIZE) != 0) { + tcp_sbrcv_reserve(tp, sbrcv, newsize, 0); + } +} + /* A receiver will evaluate the flow of packets on a connection * to see if it can reduce ack traffic. The receiver will start * stretching acks if all of the following conditions are met: @@ -732,6 +1060,7 @@ tcp6_input(struct mbuf **mp, int *offp, int proto) (tp->rcv_waitforss >= tcp_rcvsspktcnt))) { return(1); } + return(0); } @@ -770,8 +1099,7 @@ tcp_input(m, off0) struct in6_addr laddr6; #endif int dropsocket = 0; - int iss = 0; - int nosock = 0; + int iss = 0, nosock = 0; u_int32_t tiwin; struct tcpopt to; /* options in this segment */ struct sockaddr_in *next_hop = NULL; @@ -782,24 +1110,20 @@ tcp_input(m, off0) u_char ip_ecn = IPTOS_ECN_NOTECT; unsigned int ifscope, nocell = 0; uint8_t isconnected, isdisconnected; + struct ifnet *ifp = ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) ? m->m_pkthdr.rcvif: NULL; + int nlropkts = m->m_pkthdr.lro_npkts; + int mauxf_sw_lro_pkt = (m->m_pkthdr.aux_flags & MAUXF_SW_LRO_PKT) ? 1 : 0; + int turnoff_lro = 0; +#define TCP_INC_VAR(stat, npkts) do { \ + if (mauxf_sw_lro_pkt) { \ + stat += npkts; \ + } else { \ + stat++; \ + } \ +} while (0) - /* - * Record the interface where this segment arrived on; this does not - * affect normal data output (for non-detached TCP) as it provides a - * hint about which route and interface to use for sending in the - * absence of a PCB, when scoped routing (and thus source interface - * selection) are enabled. 
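Because one LRO-coalesced mbuf now represents several wire segments, statistics that used to be simple increments would undercount; the TCP_INC_VAR macro above adds m_pkthdr.lro_npkts instead when the mbuf carries MAUXF_SW_LRO_PKT. The same idea as a plain helper (a sketch; the kernel keeps it a macro so it works on any integer lvalue):

/* LRO-aware packet counting (illustrative only). */
static inline void
tcp_stat_inc(u_int32_t *stat, int is_lro_pkt, int lro_npkts)
{
    if (is_lro_pkt)
        *stat += lro_npkts;     /* one mbuf, several coalesced segments */
    else
        (*stat)++;
}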
- */ - if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) - ifscope = m->m_pkthdr.rcvif->if_index; - else - ifscope = IFSCOPE_NONE; - - /* Since this is an entry point for input processing of tcp packets, we - * can update the tcp clock here. - */ - calculate_tcp_clock(); - + TCP_INC_VAR(tcpstat.tcps_rcvtotal, nlropkts); + /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if (!SLIST_EMPTY(&m->m_pkthdr.tags)) { fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, @@ -828,34 +1152,51 @@ tcp_input(m, off0) #endif bzero((char *)&to, sizeof(to)); - tcpstat.tcps_rcvtotal++; - - - #if INET6 if (isipv6) { + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; - th = (struct tcphdr *)((caddr_t)ip6 + off0); + th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0); if ((apple_hwcksum_rx != 0) && (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; - else - th->th_sum = in6_cksum_phdr(&ip6->ip6_src, - &ip6->ip6_dst, htonl(sizeof(struct tcphdr)), - htonl(IPPROTO_TCP)); + else { + /* + * There is no established protocol for the case + * where IPv6 pseudoheader checksum is not computed + * with our current drivers. Current drivers set + * CSUM_PSEUDO_HDR. So if we do get here, we should + * recalculate checksum. + */ + if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { + th->th_sum = 0; + } else { + th->th_sum = 0xffff; + } + } th->th_sum ^= 0xffff; if (th->th_sum) { tcpstat.tcps_rcvbadsum++; + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->badformat, 1); + + goto dropnosock; } } else { if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { tcpstat.tcps_rcvbadsum++; + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->badformat, 1); + + goto dropnosock; } } @@ -873,6 +1214,10 @@ tcp_input(m, off0) */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->unspecv6, 1); + goto dropnosock; } DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL, @@ -900,39 +1245,32 @@ tcp_input(m, off0) return; } } + + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; - th = (struct tcphdr *)((caddr_t)ip + off0); + th = (struct tcphdr *)(void *)((caddr_t)ip + off0); tlen = ip->ip_len; - DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL, - struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th); - - KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport), - (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), - th->th_seq, th->th_ack, th->th_win); - + if (m->m_pkthdr.aux_flags & MAUXF_SW_LRO_DID_CSUM) { + goto skip_checksum; + } if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16) { u_short pseudo; char b[9]; - *(uint32_t*)&b[0] = *(uint32_t*)&ipov->ih_x1[0]; - *(uint32_t*)&b[4] = *(uint32_t*)&ipov->ih_x1[4]; - *(uint8_t*)&b[8] = *(uint8_t*)&ipov->ih_x1[8]; - - bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); - ipov->ih_len = (u_short)tlen; + bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1)); + bzero(ipov->ih_x1, sizeof (ipov->ih_x1)); + ipov->ih_len = (u_short)tlen; #if BYTE_ORDER != BIG_ENDIAN HTONS(ipov->ih_len);
#endif - pseudo = in_cksum(m, sizeof (struct ip)); - - *(uint32_t*)&ipov->ih_x1[0] = *(uint32_t*)&b[0]; - *(uint32_t*)&ipov->ih_x1[4] = *(uint32_t*)&b[4]; - *(uint8_t*)&ipov->ih_x1[8] = *(uint8_t*)&b[8]; - + bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1)); + th->th_sum = in_addword(pseudo, (m->m_pkthdr.csum_data & 0xFFFF)); } else { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) @@ -948,35 +1286,40 @@ tcp_input(m, off0) /* * Checksum extended TCP header and data. */ - *(uint32_t*)&b[0] = *(uint32_t*)&ipov->ih_x1[0]; - *(uint32_t*)&b[4] = *(uint32_t*)&ipov->ih_x1[4]; - *(uint8_t*)&b[8] = *(uint8_t*)&ipov->ih_x1[8]; - - len = sizeof (struct ip) + tlen; - bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); + bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1)); + bzero(ipov->ih_x1, sizeof (ipov->ih_x1)); ipov->ih_len = (u_short)tlen; - #if BYTE_ORDER != BIG_ENDIAN HTONS(ipov->ih_len); #endif - + len = sizeof (struct ip) + tlen; th->th_sum = in_cksum(m, len); - - *(uint32_t*)&ipov->ih_x1[0] = *(uint32_t*)&b[0]; - *(uint32_t*)&ipov->ih_x1[4] = *(uint32_t*)&b[4]; - *(uint8_t*)&ipov->ih_x1[8] = *(uint8_t*)&b[8]; + bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1)); tcp_in_cksum_stats(len); } if (th->th_sum) { tcpstat.tcps_rcvbadsum++; + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->badformat, 1); + if (lrodebug) printf("tcp_input: bad xsum len = %d, tlen = %d, flags = %x, csum_flags = %x.\n",len, tlen, m->m_flags, m->m_pkthdr.csum_flags); goto dropnosock; } +skip_checksum: #if INET6 /* Re-initialization for later version check */ ip->ip_v = IPVERSION; #endif ip_ecn = (ip->ip_tos & IPTOS_ECN_MASK); + + DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th); + + KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport), + (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), + th->th_seq, th->th_ack, th->th_win); + } /* @@ -986,6 +1329,10 @@ tcp_input(m, off0) off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { tcpstat.tcps_rcvbadoff++; + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->badformat, 1); + goto dropnosock; } tlen -= off; /* tlen is used instead of ti->ti_len */ @@ -994,7 +1341,7 @@ tcp_input(m, off0) if (isipv6) { IP6_EXTHDR_CHECK(m, off0, off, return); ip6 = mtod(m, struct ip6_hdr *); - th = (struct tcphdr *)((caddr_t)ip6 + off0); + th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0); } else #endif /* INET6 */ { @@ -1005,7 +1352,7 @@ tcp_input(m, off0) } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; - th = (struct tcphdr *)((caddr_t)ip + off0); + th = (struct tcphdr *)(void *)((caddr_t)ip + off0); } } optlen = off - sizeof (struct tcphdr); @@ -1020,11 +1367,11 @@ tcp_input(m, off0) if ((optlen == TCPOLEN_TSTAMP_APPA || (optlen > TCPOLEN_TSTAMP_APPA && optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && - *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && + *(u_int32_t *)(void *)optp == htonl(TCPOPT_TSTAMP_HDR) && (th->th_flags & TH_SYN) == 0) { to.to_flags |= TOF_TS; - to.to_tsval = ntohl(*(u_int32_t *)(optp + 4)); - to.to_tsecr = ntohl(*(u_int32_t *)(optp + 8)); + to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4)); + to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8)); optp = NULL; /* we've parsed the options */ } } @@ -1038,19 +1385,13 @@ tcp_input(m, off0) * * This is a violation of the TCP specification. 
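A pattern that recurs throughout this file: casts such as (struct tcphdr *)((caddr_t)ip + off0) now route through an intermediate (void *). The pointer has already been alignment-checked (see the MBUF_STRICT_DATA_ALIGNMENT_CHECK_32 calls), so the conversion is safe; the (void *) hop exists to silence the compiler's cast-align diagnostic, which cannot see that guarantee. In miniature:

char *payload;  /* known to be suitably aligned for a TCP header */

struct tcphdr *a = (struct tcphdr *)payload;           /* -Wcast-align warns */
struct tcphdr *b = (struct tcphdr *)(void *)payload;   /* warning suppressed */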
*/ - if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) + if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) { + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->synfin, 1); + goto dropnosock; -#endif - - /* - * Convert TCP protocol specific fields to host format. - */ - -#if BYTE_ORDER != BIG_ENDIAN - NTOHL(th->th_seq); - NTOHL(th->th_ack); - NTOHS(th->th_win); - NTOHS(th->th_urp); + } #endif /* @@ -1062,6 +1403,34 @@ tcp_input(m, off0) * parameters to be unchanged. */ drop_hdrlen = off0 + off; + + /* Since this is an entry point for input processing of tcp packets, we + * can update the tcp clock here. + */ + calculate_tcp_clock(); + + /* + * Record the interface where this segment arrived on; this does not + * affect normal data output (for non-detached TCP) as it provides a + * hint about which route and interface to use for sending in the + * absence of a PCB, when scoped routing (and thus source interface + * selection) are enabled. + */ + if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) + ifscope = m->m_pkthdr.rcvif->if_index; + else + ifscope = IFSCOPE_NONE; + + /* + * Convert TCP protocol specific fields to host format. + */ + +#if BYTE_ORDER != BIG_ENDIAN + NTOHL(th->th_seq); + NTOHL(th->th_ack); + NTOHS(th->th_win); + NTOHS(th->th_urp); +#endif /* * Locate pcb for segment. @@ -1120,7 +1489,8 @@ findpcb: * the segment arrived on. */ if (inp != NULL && (inp->inp_flags & INP_BOUND_IF)) - ifscope = inp->inp_boundif; + ifscope = inp->inp_boundifp->if_index; + /* * If the PCB is present and the socket isn't allowed to use * the cellular interface, indicate it as such for tcp_respond. @@ -1136,6 +1506,10 @@ findpcb: IPSEC_STAT_INCREMENT(ipsec6stat.in_polvio); if (in_pcb_checkstate(inp, WNT_RELEASE, 0) == WNT_STOPUSING) inp = NULL; // pretend we didn't find it + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->badformatipsec, 1); + goto dropnosock; } } else @@ -1144,6 +1518,10 @@ findpcb: IPSEC_STAT_INCREMENT(ipsecstat.in_polvio); if (in_pcb_checkstate(inp, WNT_RELEASE, 0) == WNT_STOPUSING) inp = NULL; // pretend we didn't find it + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->badformatipsec, 1); + goto dropnosock; } } @@ -1189,7 +1567,7 @@ findpcb: ntohs(th->th_sport), thflags); break; case 3: - if ((thflags & TH_SYN) && + if ((thflags & TH_SYN) && !(thflags & TH_ACK) && !(m->m_flags & (M_BCAST | M_MCAST)) && #if INET6 ((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) || @@ -1210,6 +1588,7 @@ findpcb: } if (blackhole) { if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP) + switch (blackhole) { case 1: if (thflags & TH_SYN) @@ -1222,6 +1601,10 @@ findpcb: } } rstreason = BANDLIM_RST_CLOSEDPORT; + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->noconnnolist, 1); + goto dropwithresetnosock; } so = inp->inp_socket; @@ -1248,6 +1631,10 @@ findpcb: tp = intotcpcb(inp); if (tp == 0) { rstreason = BANDLIM_RST_CLOSEDPORT; + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->noconnlist, 1); + goto dropwithreset; } if (tp->t_state == TCPS_CLOSED) @@ -1290,14 +1677,16 @@ findpcb: #if INET6 struct inpcb *oinp = sotoinpcb(so); #endif /* INET6 */ - unsigned int head_ifscope; - unsigned int head_nocell; + struct ifnet *head_ifscope; + unsigned int head_nocell, head_recvanyif; /* Get listener's bound-to-interface, if any */ head_ifscope = (inp->inp_flags & INP_BOUND_IF) ? 
- inp->inp_boundif : IFSCOPE_NONE; + inp->inp_boundifp : NULL; /* Get listener's no-cellular information, if any */ head_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + /* Get listener's recv-any-interface, if any */ + head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF); /* * If the state is LISTEN then ignore segment if it contains an RST. @@ -1306,6 +1695,10 @@ findpcb: * If it is from this socket, drop it, it must be forged. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->listbadsyn, 1); + if (thflags & TH_RST) { goto drop; } @@ -1398,6 +1791,10 @@ findpcb: IFA_REMREF(&ia6->ia_ifa); tp = NULL; rstreason = BANDLIM_RST_OPENPORT; + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->deprecate6, 1); + goto dropwithreset; } IFA_UNLOCK(&ia6->ia_ifa); @@ -1466,13 +1863,15 @@ findpcb: dropsocket++; /* * Inherit INP_BOUND_IF from listener; testing if - * head_ifscope is non-zero is sufficient, since it + * head_ifscope is non-NULL is sufficient, since it * can only be set to a non-zero value earlier if * the listener has such a flag set. */ - if (head_ifscope != IFSCOPE_NONE) { + if (head_ifscope != NULL) { inp->inp_flags |= INP_BOUND_IF; - inp->inp_boundif = head_ifscope; + inp->inp_boundifp = head_ifscope; + } else { + inp->inp_flags &= ~INP_BOUND_IF; } /* * Inherit INP_NO_IFT_CELLULAR from listener. @@ -1480,6 +1879,13 @@ findpcb: if (head_nocell) { inp->inp_flags |= INP_NO_IFT_CELLULAR; } + /* + * Inherit {IN,IN6}_RECV_ANYIF from listener. + */ + if (head_recvanyif) + inp->inp_flags |= INP_RECV_ANYIF; + else + inp->inp_flags &= ~INP_RECV_ANYIF; #if INET6 if (isipv6) inp->in6p_laddr = ip6->ip6_dst; @@ -1502,7 +1908,7 @@ findpcb: inp->in6p_laddr = in6addr_any; else #endif /* INET6 */ - inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; tcp_lock(oso, 0, 0); /* release ref on parent */ tcp_unlock(oso, 1, 0); @@ -1531,7 +1937,7 @@ findpcb: M_NOWAIT); } else #endif /* INET6 */ - inp->inp_options = ip_srcroute(); + inp->inp_options = ip_srcroute(); tcp_lock(oso, 0, 0); #if IPSEC /* copy old policy into new socket's */ @@ -1553,21 +1959,13 @@ findpcb: tp->t_flagsext |= (tp0->t_flagsext & TF_RXTFINDROP); tp->t_keepinit = tp0->t_keepinit; tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl; + if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) + tp->t_notsent_lowat = tp0->t_notsent_lowat; /* now drop the reference on the listener */ tcp_unlock(oso, 1, 0); - /* Compute proper scaling value from buffer space */ - if (inp->inp_pcbinfo->ipi_count < tcp_sockthreshold) { - tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale); - so->so_rcv.sb_hiwat = imin(TCP_MAXWIN << tp->request_r_scale, (sb_max / (MSIZE+MCLBYTES)) * MCLBYTES); - } - else { - while (tp->request_r_scale < TCP_MAX_WINSHIFT && - TCP_MAXWIN << tp->request_r_scale < - so->so_rcv.sb_hiwat) - tp->request_r_scale++; - } + tcp_set_max_rwinscale(tp, so); KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0); } @@ -1643,7 +2041,7 @@ findpcb: * certain criteria defined in tcp_stretch_ack_enable function. 
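The deleted scaling code used to live inline in the accept path; the patch moves it into tcp_set_max_rwinscale() (whose body is not part of this hunk) and drops the tcp_sockthreshold special case. The removed loop picks the smallest RFC 1323 shift whose scaled 64 KB window covers the receive buffer, equivalent to:

/* Equivalent of the removed inline scaling loop (sketch). */
static u_int8_t
win_scale_for(u_int32_t sb_hiwat)
{
    u_int8_t scale = 0;

    /* TCP_MAXWIN is 65535; TCP_MAX_WINSHIFT caps the shift at 14 */
    while (scale < TCP_MAX_WINSHIFT &&
        ((u_int32_t)TCP_MAXWIN << scale) < sb_hiwat)
        scale++;
    return (scale);
}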
*/ if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) { - tp->rcv_waitforss++; + TCP_INC_VAR(tp->rcv_waitforss, nlropkts); } if (tcp_stretch_ack_enable(tp)) { tp->t_flags |= TF_STRETCHACK; @@ -1659,7 +2057,13 @@ findpcb: tp->rcv_by_unackwin = tlen + off; } } - + + /* + * Keep track of how many bytes were received in the LRO packet + */ + if ((mauxf_sw_lro_pkt) && (nlropkts > 2)) { + tp->t_lropktlen += tlen; + } /* Explicit Congestion Notification - Flag that we need to send ECT if + The IP Congestion experienced flag was set. @@ -1671,8 +2075,7 @@ findpcb: TE_SENDECE will be cleared when we receive a packet with TH_CWR set. */ if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED && - (tp->ecn_flags & (TE_SETUPSENT | TE_SETUPRECEIVED)) == - (TE_SETUPSENT | TE_SETUPRECEIVED) && tlen > 0 && + ((tp->ecn_flags & (TE_ECN_ON)) == (TE_ECN_ON)) && tlen > 0 && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp->ecn_flags |= TE_SENDECE; @@ -1693,7 +2096,19 @@ findpcb: if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & TF_STRETCHACK) != 0 && ((ip_ecn == IPTOS_ECN_CE) || ((thflags & TH_CWR) == TH_CWR))) tcp_reset_stretch_ack(tp); - + + /* + * Try to determine if we are receiving a packet after a long time. + * Use our own approximation of idletime to roughly measure remote + * end's idle time. Since slowstart is used after an idle period + * we want to avoid doing LRO if the remote end is not up to date + * on initial window support and starts with 1 or 2 packets as its IW. + */ + if (sw_lro && (tp->t_flagsext & TF_LRO_OFFLOADED) && + ((tcp_now - tp->t_rcvtime) >= (TCP_IDLETIMEOUT(tp)))) { + turnoff_lro = 1; + } + /* * Segment received on connection. * Reset idle time and keep-alive timer. @@ -1748,25 +2163,29 @@ findpcb: TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq)) { + int seg_size = tlen; if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) { - tp->iaj_pktcnt++; + TCP_INC_VAR(tp->iaj_pktcnt, nlropkts); } - if ( tp->iaj_size == 0 || tlen > tp->iaj_size || - (tlen == tp->iaj_size && tp->iaj_rcv_ts == 0)) { + if (m->m_pkthdr.aux_flags & MAUXF_SW_LRO_PKT) { + seg_size = m->m_pkthdr.lro_pktlen; + } + if ( tp->iaj_size == 0 || seg_size > tp->iaj_size || + (seg_size == tp->iaj_size && tp->iaj_rcv_ts == 0)) { /* State related to inter-arrival jitter is uninitialized * or we are trying to find a good first packet to start * computing the metric */ - update_iaj_state(tp, tlen, 0); + update_iaj_state(tp, seg_size, 0); } else { - if (tlen == tp->iaj_size) { + if (seg_size == tp->iaj_size) { /* Compute inter-arrival jitter taking this packet * as the second packet */ compute_iaj(tp); } - if (tlen < tp->iaj_size) { + if (seg_size < tp->iaj_size) { /* There is a smaller packet in the stream. * Some times the maximum size supported on a path can * change if there is a new link with smaller MTU. 
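The ECN conditions in this hunk stop open-coding the flag pair and test TE_ECN_ON instead: congestion-experienced marks are only echoed (TE_SENDECE) and CWR only sent once both ends have completed the ECN handshake. Assuming TE_ECN_ON is the composite of the two setup flags (its definition lives in tcp_var.h, not in this hunk, and the values below are assumptions), the relationship is:

/* Flag values assumed from tcp_var.h; not shown in this patch. */
#define TE_SETUPSENT     0x01   /* we sent an ECN-setup SYN */
#define TE_SETUPRECEIVED 0x02   /* peer sent an ECN-setup SYN */
#define TE_ECN_ON        (TE_SETUPSENT | TE_SETUPRECEIVED)

/* negotiated if and only if both setup flags are present */
int ecn_on = ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON);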
@@ -1776,16 +2195,16 @@ findpcb: */ tp->iaj_small_pkt++; if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) { - update_iaj_state(tp, tlen, 1); + update_iaj_state(tp, seg_size, 1); } else { - clear_iaj_state(tp); + CLEAR_IAJ_STATE(tp); } } else { - update_iaj_state(tp, tlen, 0); + update_iaj_state(tp, seg_size, 0); } } } else { - clear_iaj_state(tp); + CLEAR_IAJ_STATE(tp); } #endif /* TRAFFIC_MGT */ @@ -1860,6 +2279,7 @@ findpcb: tp->t_badrxtwin = 0; tp->t_rxtshift = 0; tp->rxt_start = 0; + tcp_bad_rexmt_fix_sndbuf(tp); DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, struct tcpcb *, tp, struct tcphdr *, th, int32_t, TCP_CC_BAD_REXMT_RECOVERY); @@ -1875,9 +2295,9 @@ findpcb: if (((to.to_flags & TOF_TS) != 0) && (to.to_tsecr != 0) && TSTMP_GEQ(tcp_now, to.to_tsecr)) { tcp_xmit_timer(tp, - tcp_now - to.to_tsecr); + tcp_now - to.to_tsecr); } else if (tp->t_rtttime && - SEQ_GT(th->th_ack, tp->t_rtseq)) { + SEQ_GT(th->th_ack, tp->t_rtseq)) { tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); } acked = th->th_ack - tp->snd_una; @@ -1896,10 +2316,13 @@ findpcb: int32_t, TCP_CC_INSEQ_ACK_RCVD); sbdrop(&so->so_snd, acked); + tcp_sbsnd_trim(&so->so_snd); + if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; tp->snd_una = th->th_ack; + /* * pull snd_wl2 up to prevent seq wrap relative * to th_ack. @@ -1923,6 +2346,9 @@ findpcb: else if (tp->t_timer[TCPT_PERSIST] == 0) tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); + if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 && + tp->t_bwmeas != NULL) + tcp_bwmeas_check(tp); sowwakeup(so); /* has to be done with socket lock held */ if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) { (void) tcp_output(tp); @@ -1941,6 +2367,31 @@ findpcb: * with nothing on the reassembly queue and * we have enough buffer space to take it. */ + + /* + * If this is a connection in steady state, start + * coalescing packets belonging to this flow. + */ + if (turnoff_lro) { + tcp_lro_remove_state(tp->t_inpcb->inp_laddr, + tp->t_inpcb->inp_faddr, + tp->t_inpcb->inp_lport, + tp->t_inpcb->inp_fport); + tp->t_flagsext &= ~TF_LRO_OFFLOADED; + tp->t_idleat = tp->rcv_nxt; + } else if (sw_lro && !mauxf_sw_lro_pkt && !isipv6 && + (so->so_flags & SOF_USELRO) && + (m->m_pkthdr.rcvif->if_type != IFT_CELLULAR) && + (m->m_pkthdr.rcvif->if_type != IFT_LOOP) && + ((th->th_seq - tp->irs) > + (tp->t_maxseg << lro_start)) && + ((tp->t_idleat == 0) || ((th->th_seq - + tp->t_idleat) > (tp->t_maxseg << lro_start)))) { + tp->t_flagsext |= TF_LRO_OFFLOADED; + tcp_start_coalescing(ip, th, tlen); + tp->t_idleat = 0; + } + /* Clean receiver SACK report if present */ if (tp->sack_enable && tp->rcv_numsacks) tcp_clean_sackreport(tp); @@ -1956,13 +2407,21 @@ findpcb: * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; - tcpstat.tcps_rcvpack++; + TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts); tcpstat.tcps_rcvbyte += tlen; if (nstat_collect) { - locked_add_64(&inp->inp_stat->rxpackets, 1); + if (m->m_pkthdr.aux_flags & MAUXF_SW_LRO_PKT) { + locked_add_64(&inp->inp_stat->rxpackets, m->m_pkthdr.lro_npkts); + } + else { + locked_add_64(&inp->inp_stat->rxpackets, 1); + } locked_add_64(&inp->inp_stat->rxbytes, tlen); } ND6_HINT(tp); /* some progress has been done */ + + tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen); + /* * Add data to socket buffer. 
*/ @@ -1983,12 +2442,12 @@ findpcb: (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), th->th_seq, th->th_ack, th->th_win); } + TCP_INC_VAR(tp->t_unacksegs, nlropkts); if (DELAY_ACK(tp, th)) { if ((tp->t_flags & TF_DELACK) == 0) { tp->t_flags |= TF_DELACK; tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); } - tp->t_unacksegs++; } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); @@ -2117,6 +2576,9 @@ findpcb: tp->t_keepinit ? tp->t_keepinit : tcp_keepinit); dropsocket = 0; /* committed to socket */ + if (inp->inp_flowhash == 0) + inp->inp_flowhash = inp_calc_flowhash(inp); + /* reset the incomp processing flag */ so->so_flags &= ~(SOF_INCOMP_INPROGRESS); tcpstat.tcps_accepts++; @@ -2124,6 +2586,7 @@ findpcb: /* ECN-setup SYN */ tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT); } + #if CONFIG_IFEF_NOWINDOWSCALE if (tcp_obey_ifef_nowindowscale && m->m_pkthdr.rcvif != NULL && (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE)) { @@ -2143,6 +2606,10 @@ findpcb: (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->ooopacket, 1); + goto dropwithreset; } break; @@ -2164,10 +2631,17 @@ findpcb: (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->ooopacket, 1); + goto dropwithreset; } if (thflags & TH_RST) { if ((thflags & TH_ACK) != 0) { + soevent(so, + (SO_FILT_HINT_LOCKED | + SO_FILT_HINT_CONNRESET)); tp = tcp_drop(tp, ECONNREFUSED); postevent(so, 0, EV_RESET); } @@ -2202,18 +2676,18 @@ findpcb: tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } - tp->rcv_adv += tp->rcv_wnd; + tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ - if (DELAY_ACK(tp, th) && tlen != 0) { + TCP_INC_VAR(tp->t_unacksegs, nlropkts); + if (DELAY_ACK(tp, th) && tlen != 0 ) { if ((tp->t_flags & TF_DELACK) == 0) { tp->t_flags |= TF_DELACK; tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); } - tp->t_unacksegs++; } else { tp->t_flags |= TF_ACKNOW; @@ -2225,6 +2699,7 @@ findpcb: * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = tcp_now; + tcp_sbrcv_tstmp_check(tp); if (tp->t_flags & TF_NEEDFIN) { DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1); @@ -2382,6 +2857,8 @@ trimthenstep6: switch (tp->t_state) { case TCPS_SYN_RECEIVED: + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->rstinsynrcv, 1); so->so_error = ECONNREFUSED; goto close; @@ -2401,6 +2878,11 @@ trimthenstep6: postevent(so, 0, EV_RESET); DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, struct tcpcb *, tp, int32_t, TCPS_CLOSED); + + soevent(so, + (SO_FILT_HINT_LOCKED | + SO_FILT_HINT_CONNRESET)); + tp->t_state = TCPS_CLOSED; tcpstat.tcps_drops++; tp = tcp_close(tp); @@ -2464,6 +2946,10 @@ trimthenstep6: */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->dospacket, 1); + goto dropwithreset; } @@ -2495,9 +2981,14 @@ trimthenstep6: * But keep on processing for RST or ACK. 
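The keepalive hint added above relies on the trim arithmetic: a peer's keepalive probe is sent with a sequence number one below what the receiver expects and one byte of garbage, so it arrives as a fully duplicate segment whose computed todrop is exactly 1. A worked example with concrete numbers:

uint32_t rcv_nxt = 1000;    /* next byte the receiver expects */
uint32_t th_seq  = 999;     /* keepalive probes typically use snd_una - 1 */
int      tlen    = 1;       /* one byte of garbage payload */

int todrop = (int)(rcv_nxt - th_seq);   /* 1: segment is entirely old data */
/* todrop >= tlen, so the whole segment is dropped and an ACK is forced;
 * todrop == 1 is the signature the hunk above reports as a keepalive. */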
*/ tp->t_flags |= TF_ACKNOW; + if (todrop == 1) { + /* This could be a keepalive */ + soevent(so, SO_FILT_HINT_LOCKED | + SO_FILT_HINT_KEEPALIVE); + } todrop = tlen; tcpstat.tcps_rcvduppack++; - tcpstat.tcps_rcvdupbyte += todrop; + tcpstat.tcps_rcvdupbyte += todrop; } else { tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; @@ -2528,6 +3019,10 @@ trimthenstep6: tp = tcp_close(tp); tcpstat.tcps_rcvafterclose++; rstreason = BANDLIM_UNLIMITED; + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->cleanup, 1); + goto dropwithreset; } @@ -2607,6 +3102,10 @@ trimthenstep6: tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; postevent(so, 0, EV_RESET); + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->synwindow, 1); + goto dropwithreset; } @@ -2653,6 +3152,7 @@ trimthenstep6: * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = tcp_now; + tcp_sbrcv_tstmp_check(tp); if (tp->t_flags & TF_NEEDFIN) { DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1); @@ -2787,12 +3287,14 @@ trimthenstep6: * before entering FR, call it */ if (CC_ALGO(tp)->pre_fr != NULL) - CC_ALGO(tp)->pre_fr(tp, th); + CC_ALGO(tp)->pre_fr(tp); ENTER_FASTRECOVERY(tp); tp->snd_recover = tp->snd_max; tp->t_timer[TCPT_REXMT] = 0; tp->t_rtttime = 0; - tp->ecn_flags |= TE_SENDCWR; + if ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON) { + tp->ecn_flags |= TE_SENDCWR; + } if (tp->sack_enable) { tcpstat.tcps_sack_recovery_episode++; tp->sack_newdata = tp->snd_nxt; @@ -2900,6 +3402,7 @@ process_ACK: tp->t_badrxtwin = 0; /* XXX probably not required */ tp->t_rxtshift = 0; tp->rxt_start = 0; + tcp_bad_rexmt_fix_sndbuf(tp); DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, struct tcpcb *, tp, struct tcphdr *, th, @@ -2948,13 +3451,13 @@ process_ACK: goto step6; if ((thflags & TH_ECE) != 0 && - (tp->ecn_flags & TE_SETUPSENT) != 0) { + ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON)) { /* * Reduce the congestion window if we haven't done so. */ if (!tp->sack_enable && !IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { - tcp_reduce_congestion_window(tp, th); + tcp_reduce_congestion_window(tp); DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, struct tcpcb *, tp, struct tcphdr *, th, int32_t, TCP_CC_ECN_RCVD); @@ -2983,6 +3486,7 @@ process_ACK: ourfinisacked = 1; } else { sbdrop(&so->so_snd, acked); + tcp_sbsnd_trim(&so->so_snd); tp->snd_wnd -= acked; ourfinisacked = 0; } @@ -3003,7 +3507,10 @@ process_ACK: } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; - + if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 && + tp->t_bwmeas != NULL) + tcp_bwmeas_check(tp); + /* * sowwakeup must happen after snd_una, et al. 
are updated so that * the sequence numbers are in sync with so_snd @@ -3219,25 +3726,32 @@ dodata: if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { - if (DELAY_ACK(tp, th) && ((tp->t_flags & TF_ACKNOW) == 0)) { + TCP_INC_VAR(tp->t_unacksegs, nlropkts); + if (DELAY_ACK(tp, th) && + ((tp->t_flags & TF_ACKNOW) == 0) ) { if ((tp->t_flags & TF_DELACK) == 0) { tp->t_flags |= TF_DELACK; tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); } - tp->t_unacksegs++; } else { tp->t_flags |= TF_ACKNOW; } tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; - tcpstat.tcps_rcvpack++; + TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts); tcpstat.tcps_rcvbyte += tlen; if (nstat_collect) { - locked_add_64(&inp->inp_stat->rxpackets, 1); + if (m->m_pkthdr.aux_flags & MAUXF_SW_LRO_PKT) { + locked_add_64(&inp->inp_stat->rxpackets, m->m_pkthdr.lro_npkts); + } else { + locked_add_64(&inp->inp_stat->rxpackets, 1); + } locked_add_64(&inp->inp_stat->rxbytes, tlen); } ND6_HINT(tp); + + tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen); so_recv_data_stat(so, m, drop_hdrlen); if (sbappendstream(&so->so_rcv, m)) sorwakeup(so); @@ -3266,14 +3780,6 @@ dodata: } } - /* - * Note the amount of data that peer has sent into - * our window, in order to estimate the sender's - * buffer size. - */ - len = (u_int)(so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt)); - if (len > so->so_rcv.sb_maxused) - so->so_rcv.sb_maxused = len; } else { m_freem(m); thflags &= ~TH_FIN; @@ -3288,20 +3794,18 @@ dodata: socantrcvmore(so); postevent(so, 0, EV_FIN); /* - * If connection is half-synchronized - * (ie NEEDSYN flag on) then delay ACK, * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ + TCP_INC_VAR(tp->t_unacksegs, nlropkts); if (DELAY_ACK(tp, th) && (tp->t_flags & TF_NEEDSYN)) { if ((tp->t_flags & TF_DELACK) == 0) { tp->t_flags |= TF_DELACK; tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); } - tp->t_unacksegs++; } else { tp->t_flags |= TF_ACKNOW; @@ -3403,6 +3907,10 @@ dropafterack: (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; + + if (ifp != NULL && ifp->if_tcp_stat != NULL) + atomic_add_64(&ifp->if_tcp_stat->dospacket, 1); + goto dropwithreset; } #if TCPDEBUG @@ -3723,8 +4231,9 @@ tcp_xmit_timer(tp, rtt) delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; - if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) - tp->t_rttbest = tp->t_srtt + tp->t_rttvar; + if (tp->t_rttbest == 0 || + tp->t_rttbest > (tp->t_srtt + tp->t_rttvar)) + tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. 
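The dodata hunks above replace plain `counter++` bumps with `TCP_INC_VAR(counter, nlropkts)` because, once software LRO hands tcp_input a coalesced chain, a single mbuf can represent several wire segments; per-segment state such as tcps_rcvpack and t_unacksegs (which drives the stretch-ACK decision in the delay_ack callbacks) must therefore advance by the segment count carried in m_pkthdr.lro_npkts. A minimal user-space sketch of that bookkeeping, assuming TCP_INC_VAR simply adds its second argument (the macro body is inferred from its call sites; struct pkt and the sample chain are hypothetical):

/* Sketch: per-segment accounting for LRO-coalesced mbuf chains. */
#include <stdio.h>

#define TCP_INC_VAR(stat, cnt) ((stat) += (cnt))  /* assumed macro shape */

struct pkt {
	int lro_coalesced;   /* models the MAUXF_SW_LRO_PKT aux flag */
	int lro_npkts;       /* models m_pkthdr.lro_npkts */
};

int main(void)
{
	unsigned rcvpack = 0, unacksegs = 0;
	struct pkt chain[] = { {0, 1}, {1, 4}, {1, 2} }; /* 1 plain pkt, 2 LRO chains */
	int i, n = sizeof(chain) / sizeof(chain[0]);

	for (i = 0; i < n; i++) {
		/* Non-LRO mbufs count as one segment, as in tcp_input. */
		int nlropkts = chain[i].lro_coalesced ? chain[i].lro_npkts : 1;
		TCP_INC_VAR(rcvpack, nlropkts);    /* wire segments, not mbuf chains */
		TCP_INC_VAR(unacksegs, nlropkts);  /* feeds the stretch-ACK check */
	}
	printf("rcvpack=%u unacksegs=%u\n", rcvpack, unacksegs); /* 7 and 7 */
	return 0;
}

Counting chains instead of segments here would make a receiver that coalesces aggressively appear to see fewer packets, stretching ACKs far beyond what the maxseg_unacked policy intends.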
@@ -3733,7 +4242,6 @@ tcp_xmit_timer(tp, rtt) */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); - tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt, tp->t_rttvar); tp->t_rtttime = 0; @@ -3784,13 +4292,20 @@ static inline unsigned int tcp_maxmtu6(struct rtentry *rt) { unsigned int maxmtu; + struct nd_ifinfo *ndi; RT_LOCK_ASSERT_HELD(rt); lck_rw_lock_shared(nd_if_rwlock); + if ((ndi = ND_IFINFO(rt->rt_ifp)) != NULL && !ndi->initialized) + ndi = NULL; + if (ndi != NULL) + lck_mtx_lock(&ndi->lock); if (rt->rt_rmx.rmx_mtu == 0) maxmtu = IN6_LINKMTU(rt->rt_ifp); else maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp)); + if (ndi != NULL) + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); return (maxmtu); @@ -3935,33 +4450,11 @@ tcp_mss(tp, offer, input_ifscope) * or rttvar. Convert from the route-table units * to scaled multiples of the slow timeout timer. */ - if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { - /* - * XXX the lock bit for RTT indicates that the value - * is also a minimum value; this is subject to time. - */ - if (rt->rt_rmx.rmx_locks & RTV_RTT) - tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ); - else - tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN; - tp->t_srtt = rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE)); - tcpstat.tcps_usedrtt++; - if (rt->rt_rmx.rmx_rttvar) { - tp->t_rttvar = rt->rt_rmx.rmx_rttvar / - (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE)); - tcpstat.tcps_usedrttvar++; - } else { - /* default variation is +- 1 rtt */ - tp->t_rttvar = - tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; - } - TCPT_RANGESET(tp->t_rxtcur, - ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, - tp->t_rttmin, TCPTV_REXMTMAX, - TCP_ADD_REXMTSLOP(tp)); - } - else + if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt) != 0) { + tcp_getrt_rtt(tp, rt); + } else { tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN; + } #if INET6 mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt)); @@ -4289,7 +4782,6 @@ tcp_dropdropablreq(struct socket *head) lck_rw_lock_exclusive(tcbinfo.mtx); tcp_lock(so, 0, 0); - /* Release the reference held for so_incomp queue */ so->so_usecount--; @@ -4362,6 +4854,54 @@ tcp_set_new_cc(struct socket *so, uint16_t cc_index) } } +void +tcp_set_recv_bg(struct socket *so) +{ + if (!IS_TCP_RECV_BG(so)) + so->so_traffic_mgt_flags |= TRAFFIC_MGT_TCP_RECVBG; +} + +void +tcp_clear_recv_bg(struct socket *so) +{ + if (IS_TCP_RECV_BG(so)) + so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG); +} + +void +inp_fc_unthrottle_tcp(struct inpcb *inp) +{ + struct tcpcb *tp = inp->inp_ppcb; + /* + * Back off the slow-start threshold and enter + * congestion avoidance phase + */ + if (CC_ALGO(tp)->pre_fr != NULL) + CC_ALGO(tp)->pre_fr(tp); + + tp->snd_cwnd = tp->snd_ssthresh; + + /* + * Restart counting for ABC as we changed the + * congestion window just now. + */ + tp->t_bytes_acked = 0; + + /* Reset retransmit shift as we know that the reason + * for delay in sending a packet is due to flow + * control on the outgoing interface. There is no need + * to backoff retransmit timer. + */ + tp->t_rxtshift = 0; + + /* + * Start the output stream again. Since we are + * not retransmitting data, do not reset the + * retransmit timer or rtt calculation. 
+ */ + tcp_output(tp); +} + static int tcp_getstat SYSCTL_HANDLER_ARGS { diff --git a/bsd/netinet/tcp_ledbat.c b/bsd/netinet/tcp_ledbat.c index 5baf28bea..1d1d5e5e7 100644 --- a/bsd/netinet/tcp_ledbat.c +++ b/bsd/netinet/tcp_ledbat.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 Apple Inc. All rights reserved. + * Copyright (c) 2010-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -60,7 +60,7 @@ int tcp_ledbat_cleanup(struct tcpcb *tp); void tcp_ledbat_cwnd_init(struct tcpcb *tp); void tcp_ledbat_inseq_ack_rcvd(struct tcpcb *tp, struct tcphdr *th); void tcp_ledbat_ack_rcvd(struct tcpcb *tp, struct tcphdr *th); -void tcp_ledbat_pre_fr(struct tcpcb *tp, struct tcphdr *th); +void tcp_ledbat_pre_fr(struct tcpcb *tp); void tcp_ledbat_post_fr(struct tcpcb *tp, struct tcphdr *th); void tcp_ledbat_after_idle(struct tcpcb *tp); void tcp_ledbat_after_timeout(struct tcpcb *tp); @@ -290,9 +290,7 @@ tcp_ledbat_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) { } void -tcp_ledbat_pre_fr(struct tcpcb *tp, struct tcphdr *th) { -#pragma unused(th) - +tcp_ledbat_pre_fr(struct tcpcb *tp) { uint32_t win; win = min(tp->snd_wnd, tp->snd_cwnd) / @@ -302,6 +300,8 @@ tcp_ledbat_pre_fr(struct tcpcb *tp, struct tcphdr *th) { tp->snd_ssthresh = win * tp->t_maxseg; if (tp->bg_ssthresh > tp->snd_ssthresh) tp->bg_ssthresh = tp->snd_ssthresh; + + tcp_cc_resize_sndbuf(tp); } void @@ -380,6 +380,8 @@ tcp_ledbat_after_timeout(struct tcpcb *tp) { if (tp->bg_ssthresh > tp->snd_ssthresh) tp->bg_ssthresh = tp->snd_ssthresh; + + tcp_cc_resize_sndbuf(tp); } } @@ -401,7 +403,7 @@ int tcp_ledbat_delay_ack(struct tcpcb *tp, struct tcphdr *th) { if ((tp->t_flags & TF_RXWIN0SENT) == 0 && (th->th_flags & TH_PUSH) == 0 && - (tp->t_flags & TF_DELACK) == 0) + (tp->t_unacksegs == 1)) return(1); return(0); } diff --git a/bsd/netinet/tcp_lro.c b/bsd/netinet/tcp_lro.c new file mode 100644 index 000000000..55ebb0e38 --- /dev/null +++ b/bsd/netinet/tcp_lro.c @@ -0,0 +1,997 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int lrocount = 0; /* A counter used for debugging only */ +unsigned int lro_seq_outoforder = 0; /* Counter for debugging */ +unsigned int lro_seq_mismatch = 0; /* Counter for debugging */ +unsigned int lro_eject_req = 0; /* Counter for tracking flow ejections */ +unsigned int lro_flushes = 0; /* Counter for tracking number of flushes */ +unsigned int lro_single_flushes = 0; +unsigned int lro_double_flushes = 0; +unsigned int lro_good_flushes = 0; + +unsigned int coalesc_sz = LRO_MX_COALESCE_PKTS; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_sz, CTLFLAG_RW | CTLFLAG_LOCKED, + &coalesc_sz, 0, "Max coalescing size"); + +unsigned int coalesc_time = LRO_MX_TIME_TO_BUFFER; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_time, CTLFLAG_RW | CTLFLAG_LOCKED, + &coalesc_time, 0, "Max coalescing time"); + +struct lro_flow lro_flow_list[TCP_LRO_NUM_FLOWS]; + +char lro_flow_map[TCP_LRO_FLOW_MAP]; + +static lck_attr_t *tcp_lro_mtx_attr = NULL; /* mutex attributes */ +static lck_grp_t *tcp_lro_mtx_grp = NULL; /* mutex group */ +static lck_grp_attr_t *tcp_lro_mtx_grp_attr = NULL; /* mutex group attrs */ +decl_lck_mtx_data( ,tcp_lro_lock); /* Used to synchronize updates */ + +unsigned int lro_byte_count = 0; + +uint64_t lro_deadline = 0; /* LRO's sense of time - protected by tcp_lro_lock */ +uint32_t lro_timer_set = 0; + +/* Some LRO stats */ +u_int32_t lro_pkt_count = 0; /* Number of packets encountered in an LRO period */ +thread_call_t tcp_lro_timer; + +extern u_int32_t kipf_count; + +static void tcp_lro_timer_proc(void*, void*); +static void lro_update_stats(struct mbuf*); +static void lro_update_flush_stats(struct mbuf *); +static void tcp_lro_flush_flows(void); +static void tcp_lro_sched_timer(uint64_t); +static void lro_proto_input(struct mbuf *); + +static struct mbuf *lro_tcp_xsum_validate(struct mbuf*, struct ipovly *, + struct tcphdr*); +static struct mbuf *tcp_lro_process_pkt(struct mbuf*, struct ip*, struct tcphdr*, + int); + +void +tcp_lro_init(void) +{ + int i; + + bzero(lro_flow_list, sizeof (struct lro_flow) * TCP_LRO_NUM_FLOWS); + for (i = 0; i < TCP_LRO_FLOW_MAP; i++) { + lro_flow_map[i] = TCP_LRO_FLOW_UNINIT; + } + + /* + * allocate lock group attribute, group and attribute for tcp_lro_lock + */ + tcp_lro_mtx_grp_attr = lck_grp_attr_alloc_init(); + tcp_lro_mtx_grp = lck_grp_alloc_init("tcplro", tcp_lro_mtx_grp_attr); + tcp_lro_mtx_attr = lck_attr_alloc_init(); + lck_mtx_init(&tcp_lro_lock, tcp_lro_mtx_grp, tcp_lro_mtx_attr); + + tcp_lro_timer = thread_call_allocate(tcp_lro_timer_proc, NULL); + if (tcp_lro_timer == NULL) { + panic_plain("%s: unable to allocate lro timer", __func__); + } + + return; +} + +static int +tcp_lro_matching_tuple(struct ip* ip_hdr, struct tcphdr *tcp_hdr, int *hash, + int *flow_id ) +{ + struct lro_flow *flow; + tcp_seq seqnum; + unsigned int off = 0; + int payload_len = 0; + + *hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr, + tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1)); + + *flow_id = lro_flow_map[*hash]; + if (*flow_id == TCP_LRO_FLOW_NOTFOUND) { + return TCP_LRO_NAN; + } + + seqnum = tcp_hdr->th_seq; + off = tcp_hdr->th_off << 2; + payload_len = ip_hdr->ip_len - off; + + flow = &lro_flow_list[*flow_id]; + + if ((flow->lr_faddr.s_addr == 
ip_hdr->ip_src.s_addr) && + (flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) && + (flow->lr_fport == tcp_hdr->th_sport) && + (flow->lr_lport == tcp_hdr->th_dport)) { + if (flow->lr_tcphdr == NULL) { + if (ntohl(seqnum) == flow->lr_seq) { + return TCP_LRO_COALESCE; + } + if (lrodebug >= 4) { + printf("%s: seqnum = %x, lr_seq = %x\n", + __func__, ntohl(seqnum), flow->lr_seq); + } + lro_seq_mismatch++; + if (SEQ_GT(ntohl(seqnum), flow->lr_seq)) { + lro_seq_outoforder++; + /* + * Whenever we receive out of order packets it + * signals loss and recovery and LRO doesn't + * let flows recover quickly. So eject. + */ + flow->lr_flags |= LRO_EJECT_REQ; + + } + return TCP_LRO_NAN; + } + + if (flow->lr_flags & LRO_EJECT_REQ) { + if (lrodebug) + printf("%s: eject. \n", __func__); + return TCP_LRO_EJECT_FLOW; + } + if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) { + if (lrodebug) { + printf("%s: th_ack = %x flow_ack = %x \n", + __func__, tcp_hdr->th_ack, + flow->lr_tcphdr->th_ack); + } + return TCP_LRO_EJECT_FLOW; + } + + if (ntohl(seqnum) == (ntohl(lro_flow_list[*flow_id].lr_tcphdr->th_seq) + lro_flow_list[*flow_id].lr_len)) { + return TCP_LRO_COALESCE; + } else { + /* LRO does not handle loss recovery well, eject */ + flow->lr_flags |= LRO_EJECT_REQ; + return TCP_LRO_EJECT_FLOW; + } + } + if (lrodebug) printf("tcp_lro_matching_tuple: collision \n"); + return TCP_LRO_COLLISION; +} + +static void +tcp_lro_init_flow(int flow_id, struct ip* ip_hdr, struct tcphdr *tcp_hdr, + int hash, u_int32_t timestamp, int payload_len) +{ + struct lro_flow *flow = NULL; + + flow = &lro_flow_list[flow_id]; + + flow->lr_hash_map = hash; + flow->lr_faddr.s_addr = ip_hdr->ip_src.s_addr; + flow->lr_laddr.s_addr = ip_hdr->ip_dst.s_addr; + flow->lr_fport = tcp_hdr->th_sport; + flow->lr_lport = tcp_hdr->th_dport; + lro_flow_map[hash] = flow_id; + flow->lr_timestamp = timestamp; + flow->lr_seq = ntohl(tcp_hdr->th_seq) + payload_len; + flow->lr_flags = 0; + return; +} + +static void +tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr, + int payload_len, int drop_hdrlen, struct tcpopt *topt, + u_int32_t* tsval, u_int32_t* tsecr, int thflags) +{ + struct lro_flow *flow = NULL; + struct mbuf *last; + struct ip *ip = NULL; + + flow = &lro_flow_list[flow_id]; + if (flow->lr_mhead) { + if (lrodebug) + printf("%s: lr_mhead %x %d \n", __func__, flow->lr_seq, + payload_len); + m_adj(lro_mb, drop_hdrlen); + + last = flow->lr_mtail; + while (last->m_next != NULL) { + last = last->m_next; + } + last->m_next = lro_mb; + + flow->lr_mtail = lro_mb; + + ip = mtod(flow->lr_mhead, struct ip *); + ip->ip_len += lro_mb->m_pkthdr.len; + flow->lr_mhead->m_pkthdr.len += lro_mb->m_pkthdr.len; + + if (flow->lr_len == 0) { + panic_plain("%s: Inconsistent LRO flow state", __func__); + } + flow->lr_len += payload_len; + flow->lr_seq += payload_len; + /* + * This bit is re-OR'd each time a packet is added to the + * large coalesced packet. + */ + flow->lr_mhead->m_pkthdr.aux_flags |= MAUXF_SW_LRO_PKT; + flow->lr_mhead->m_pkthdr.lro_npkts++; /* for tcpstat.tcps_rcvpack */ + if (flow->lr_mhead->m_pkthdr.lro_pktlen < + lro_mb->m_pkthdr.lro_pktlen) { + /* + * For TCP Inter Arrival Jitter calculation, return max + * size encountered while coalescing a stream of pkts. 
+ */ + flow->lr_mhead->m_pkthdr.lro_pktlen = + lro_mb->m_pkthdr.lro_pktlen; + } + /* Update the timestamp value */ + if (topt->to_flags & TOF_TS) { + if ((flow->lr_tsval) && + (TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) { + *(flow->lr_tsval) = htonl(topt->to_tsval); + } + if ((flow->lr_tsecr) && + (topt->to_tsecr != 0) && + (TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) { + if (lrodebug >= 2) { + printf("%s: instantaneous RTT = %d \n", __func__, + topt->to_tsecr - ntohl(*(flow->lr_tsecr))); + } + *(flow->lr_tsecr) = htonl(topt->to_tsecr); + } + } + /* Coalesce the flags */ + if (thflags) { + flow->lr_tcphdr->th_flags |= thflags; + } + /* Update receive window */ + flow->lr_tcphdr->th_win = tcphdr->th_win; + } else { + if (lro_mb) { + flow->lr_mhead = flow->lr_mtail = lro_mb; + flow->lr_mhead->m_pkthdr.aux_flags |= MAUXF_SW_LRO_PKT; + flow->lr_tcphdr = tcphdr; + if ((topt) && (topt->to_flags & TOF_TS)) { + ASSERT(tsval != NULL); + ASSERT(tsecr != NULL); + flow->lr_tsval = tsval; + flow->lr_tsecr = tsecr; + } + flow->lr_len = payload_len; + flow->lr_timestamp = tcp_now; + tcp_lro_sched_timer(0); + } + flow->lr_seq = ntohl(tcphdr->th_seq) + payload_len; + } + if (lro_mb) { + tcpstat.tcps_coalesced_pack++; + } + return; +} + +static struct mbuf * +tcp_lro_eject_flow(int flow_id) +{ + struct mbuf *mb = NULL; + + mb = lro_flow_list[flow_id].lr_mhead; + ASSERT(lro_flow_map[lro_flow_list[flow_id].lr_hash_map] == flow_id); + lro_flow_map[lro_flow_list[flow_id].lr_hash_map] = TCP_LRO_FLOW_UNINIT; + bzero(&lro_flow_list[flow_id], sizeof(struct lro_flow)); + + return mb; +} + +static struct mbuf* +tcp_lro_eject_coalesced_pkt(int flow_id) +{ + struct mbuf *mb = NULL; + mb = lro_flow_list[flow_id].lr_mhead; + lro_flow_list[flow_id].lr_mhead = + lro_flow_list[flow_id].lr_mtail = NULL; + lro_flow_list[flow_id].lr_tcphdr = NULL; + return mb; +} + +static struct mbuf* +tcp_lro_insert_flow(struct mbuf *lro_mb, struct ip *ip_hdr, + struct tcphdr *tcp_hdr, int payload_len, + int drop_hdrlen, int hash, struct tcpopt *topt, + u_int32_t *tsval, u_int32_t *tsecr) +{ + int i; + int slot_available = 0; + int candidate_flow = 0; + u_int32_t oldest_timestamp; + struct mbuf *mb = NULL; + int collision = 0; + + oldest_timestamp = tcp_now; + + /* handle collision */ + if (lro_flow_map[hash] != TCP_LRO_FLOW_UNINIT) { + if (lrodebug) { + collision = 1; + } + candidate_flow = lro_flow_map[hash]; + tcpstat.tcps_flowtbl_collision++; + goto kick_flow; + } + + for (i = 0; i < TCP_LRO_NUM_FLOWS; i++) { + if (lro_flow_list[i].lr_mhead == NULL) { + candidate_flow = i; + slot_available = 1; + break; + } + if (oldest_timestamp >= lro_flow_list[i].lr_timestamp) { + candidate_flow = i; + oldest_timestamp = lro_flow_list[i].lr_timestamp; + } + } + + if (!slot_available) { + tcpstat.tcps_flowtbl_full++; +kick_flow: + /* kick the oldest flow */ + mb = tcp_lro_eject_flow(candidate_flow); + + if (lrodebug) { + if (!slot_available) { + printf("%s: slot unavailable.\n",__func__); + } + if (collision) { + printf("%s: collision.\n",__func__); + } + } + } else { + candidate_flow = i; /* this is now the flow to be used */ + + } + + tcp_lro_init_flow(candidate_flow, ip_hdr, tcp_hdr, hash, + tcp_now, payload_len); + tcp_lro_coalesce(candidate_flow, lro_mb, tcp_hdr, payload_len, + drop_hdrlen, topt, tsval, tsecr, 0); + return mb; +} + +struct mbuf* +tcp_lro_process_pkt(struct mbuf *lro_mb, struct ip *ip_hdr, + struct tcphdr *tcp_hdr, int drop_hdrlen) +{ + int flow_id = TCP_LRO_FLOW_UNINIT; + int hash; + unsigned int off = 0; + int 
eject_flow = 0; + int optlen; + int retval = 0; + struct mbuf *mb = NULL; + int payload_len = 0; + u_char *optp = NULL; + int thflags = 0; + struct tcpopt to; + int ret_response = TCP_LRO_CONSUMED; + int coalesced = 0, tcpflags = 0, unknown_tcpopts = 0; + u_int8_t ecn; + + if (lro_mb->m_len < (int32_t)sizeof (struct tcpiphdr)) { + if ((lro_mb = m_pullup(lro_mb, sizeof(struct tcpiphdr))) == 0) { + tcpstat.tcps_rcvshort++; + m_freem(lro_mb); + if (lrodebug) { + printf("tcp_lro_process_pkt:mbuf too short.\n"); + } + return NULL; + } + } + + if ((lro_mb = lro_tcp_xsum_validate(lro_mb, + (struct ipovly*)ip_hdr, tcp_hdr)) == NULL) { + if (lrodebug) { + printf("tcp_lro_process_pkt: TCP xsum failed.\n"); + } + return NULL; + } + + /* Update stats */ + lro_pkt_count++; + + /* Avoids checksumming in tcp_input */ + lro_mb->m_pkthdr.aux_flags |= MAUXF_SW_LRO_DID_CSUM; + + off = tcp_hdr->th_off << 2; + optlen = off - sizeof (struct tcphdr); + payload_len = ip_hdr->ip_len - off; + optp = (u_char *)(tcp_hdr + 1); + /* + * Do quick retrieval of timestamp options ("options + * prediction?"). If timestamp is the only option and it's + * formatted as recommended in RFC 1323 appendix A, we + * quickly get the values now and not bother calling + * tcp_dooptions(), etc. + */ + if ((optlen == TCPOLEN_TSTAMP_APPA || + (optlen > TCPOLEN_TSTAMP_APPA && + optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && + *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && + (tcp_hdr->th_flags & TH_SYN) == 0) { + to.to_flags |= TOF_TS; + to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4)); + to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8)); + } else { + /* + * If TCP timestamps are not in use, or not the first option, + * skip LRO path since timestamps are used to avoid LRO + * from introducing additional latencies for retransmissions + * and other slow-paced transmissions. + */ + to.to_flags = to.to_tsecr = 0; + eject_flow = 1; + } + + /* list all the conditions that can trigger a flow ejection here */ + + thflags = tcp_hdr->th_flags; + if (thflags & (TH_SYN | TH_URG | TH_ECE | TH_CWR | TH_PUSH | TH_RST | TH_FIN)) { + eject_flow = tcpflags = 1; + } + + if (optlen && !((optlen == TCPOLEN_TSTAMP_APPA) && + (to.to_flags & TOF_TS))) { + eject_flow = unknown_tcpopts = 1; + } + + if (payload_len <= LRO_MIN_COALESC_SZ) { /* zero payload ACK */ + eject_flow = 1; + } + + /* Can't coalesce ECN marked packets. */ + ecn = ip_hdr->ip_tos & IPTOS_ECN_MASK; + if (ecn == IPTOS_ECN_CE) { + /* + * ECN needs quick notification + */ + if (lrodebug) { + printf("%s: ECE bits set.\n", __func__); + } + eject_flow = 1; + } + + lck_mtx_lock_spin(&tcp_lro_lock); + + retval = tcp_lro_matching_tuple(ip_hdr, tcp_hdr, &hash, &flow_id); + + switch (retval) { + case TCP_LRO_NAN: + lck_mtx_unlock(&tcp_lro_lock); + ret_response = TCP_LRO_FLOW_NOTFOUND; + break; + + case TCP_LRO_COALESCE: + if ((payload_len != 0) && (unknown_tcpopts == 0) && + (tcpflags == 0) && (ecn == 0) && (to.to_flags & TOF_TS)) { + tcp_lro_coalesce(flow_id, lro_mb, tcp_hdr, payload_len, + drop_hdrlen, &to, + (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 4) : NULL, + (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 8) : NULL, + thflags); + if (lrodebug >= 2) { + printf("tcp_lro_process_pkt: coalesce len = %d. 
flow_id = %d payload_len = %d drop_hdrlen = %d optlen = %d lport = %d seqnum = %x.\n", + lro_flow_list[flow_id].lr_len, flow_id, + payload_len, drop_hdrlen, optlen, + ntohs(lro_flow_list[flow_id].lr_lport), + ntohl(tcp_hdr->th_seq)); + } + if (lro_flow_list[flow_id].lr_mhead->m_pkthdr.lro_npkts >= coalesc_sz) { + eject_flow = 1; + } + coalesced = 1; + } + if (eject_flow) { + mb = tcp_lro_eject_coalesced_pkt(flow_id); + lro_flow_list[flow_id].lr_seq = ntohl(tcp_hdr->th_seq) + + payload_len; + lck_mtx_unlock(&tcp_lro_lock); + if (mb) { + lro_proto_input(mb); + } + if (!coalesced) { + if (lrodebug >= 2) { + printf("%s: pkt payload_len = %d \n", __func__, payload_len); + } + lro_proto_input(lro_mb); + } + } else { + lck_mtx_unlock(&tcp_lro_lock); + } + break; + + case TCP_LRO_EJECT_FLOW: + mb = tcp_lro_eject_coalesced_pkt(flow_id); + lck_mtx_unlock(&tcp_lro_lock); + if (mb) { + if (lrodebug) + printf("tcp_lro_process_pkt eject_flow, len = %d\n", mb->m_pkthdr.len); + lro_proto_input(mb); + } + + lro_proto_input(lro_mb); + break; + + case TCP_LRO_COLLISION: + lck_mtx_unlock(&tcp_lro_lock); + ret_response = TCP_LRO_FLOW_NOTFOUND; + break; + + default: + lck_mtx_unlock(&tcp_lro_lock); + panic_plain("%s: unrecognized type %d", __func__, retval); + break; + } + + if (ret_response == TCP_LRO_FLOW_NOTFOUND) { + lro_proto_input(lro_mb); + } + return NULL; +} + +static void +tcp_lro_timer_proc(void *arg1, void *arg2) +{ +#pragma unused(arg1, arg2) + + lck_mtx_lock_spin(&tcp_lro_lock); + lro_timer_set = 0; + lck_mtx_unlock(&tcp_lro_lock); + tcp_lro_flush_flows(); +} + +static void +tcp_lro_flush_flows(void) +{ + int i = 0; + struct mbuf *mb; + struct lro_flow *flow; + int active_flows = 0; + int outstanding_flows = 0; + int tcpclock_updated = 0; + + lck_mtx_lock(&tcp_lro_lock); + + while (i < TCP_LRO_NUM_FLOWS) { + flow = &lro_flow_list[i]; + if (flow->lr_mhead != NULL) { + active_flows++; + if (!tcpclock_updated) { + calculate_tcp_clock(); + tcpclock_updated = 1; + } + if (((tcp_now - flow->lr_timestamp) >= coalesc_time) || + (flow->lr_mhead->m_pkthdr.lro_npkts >= + coalesc_sz)) { + + if (lrodebug >= 2) + printf("tcp_lro_flush_flows: len =%d n_pkts = %d %d %d \n", + flow->lr_len, + flow->lr_mhead->m_pkthdr.lro_npkts, + flow->lr_timestamp, tcp_now); + + mb = tcp_lro_eject_flow(i); + + if (mb) { + lck_mtx_unlock(&tcp_lro_lock); + lro_update_flush_stats(mb); + lro_proto_input(mb); + lck_mtx_lock(&tcp_lro_lock); + } + + } else { + tcp_lro_sched_timer(0); + outstanding_flows++; + if (lrodebug >= 2) { + printf("tcp_lro_flush_flows: did not flush flow of len =%d deadline = %x timestamp = %x \n", + flow->lr_len, tcp_now, flow->lr_timestamp); + } + } + } + if (flow->lr_flags & LRO_EJECT_REQ) { + mb = tcp_lro_eject_flow(i); + if (mb) { + lck_mtx_unlock(&tcp_lro_lock); + lro_proto_input(mb); + lro_eject_req++; + lck_mtx_lock(&tcp_lro_lock); + } + } + i++; + } + lck_mtx_unlock(&tcp_lro_lock); +#if 0 + if (lrocount == 900) { + printf("%s: %d %d %d %d oo: %d mismatch: %d ej_req: %d coll: %d \n", + __func__, + tcpstat.tcps_coalesced_pack, + tcpstat.tcps_lro_twopack, + tcpstat.tcps_lro_multpack, + tcpstat.tcps_lro_largepack, + lro_seq_outoforder, + lro_seq_mismatch, + lro_eject_req, + tcpstat.tcps_flowtbl_collision); + printf("%s: all: %d single: %d double: %d good: %d \n", + __func__, lro_flushes, lro_single_flushes, + lro_double_flushes, lro_good_flushes); + lrocount = 0; + } else { + lrocount++; + } + if ((lrodebug >= 2) && (active_flows > 1)) { + printf("lro_flush_flows: active_flows = %d \n", active_flows); + } 
+#endif +} + +/* + * Must be called with tcp_lro_lock held. + * The hint is non-zero for longer waits. The wait time dictated by coalesc_time + * takes precedence, so lro_timer_set is not set for the hint case + */ +static void +tcp_lro_sched_timer(uint64_t hint) +{ + if (lro_timer_set) { + return; + } + + lro_timer_set = 1; + if (!hint) { + /* the intent is to wake up every coalesc_time msecs */ + clock_interval_to_deadline(coalesc_time, + (NSEC_PER_SEC / TCP_RETRANSHZ), &lro_deadline); + } else { + clock_interval_to_deadline(hint, NSEC_PER_SEC / TCP_RETRANSHZ, + &lro_deadline); + } + thread_call_enter_delayed(tcp_lro_timer, lro_deadline); +} + +struct mbuf* +tcp_lro(struct mbuf *m, unsigned int hlen) +{ + struct ip *ip_hdr; + unsigned int tlen; + struct tcphdr * tcp_hdr = NULL; + unsigned int off = 0; + + if (kipf_count != 0) + return m; + + /* + * Experiments on cellular show that the RTT is much higher + * than the coalescing time of 5 msecs, causing lro to flush + * 80% of the time on a single packet. Increasing + * coalescing time for cellular does not show marked + * improvement to throughput either. Loopback perf is hurt + * by the 5 msec latency and it already sends large packets. + */ + if ((m->m_pkthdr.rcvif->if_type == IFT_CELLULAR) || + (m->m_pkthdr.rcvif->if_type == IFT_LOOP)) { + return m; + } + + ip_hdr = mtod(m, struct ip*); + + /* only TCP is coalesced */ + if (ip_hdr->ip_p != IPPROTO_TCP) { + return m; + } + + if (m->m_len < (int32_t) sizeof (struct tcpiphdr)) { + if (lrodebug) printf("tcp_lro m_pullup \n"); + if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { + tcpstat.tcps_rcvshort++; + if (lrodebug) { + printf("ip_lro: rcvshort.\n"); + } + return NULL; + } + } + + tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + hlen); + tlen = ip_hdr->ip_len ; //ignore IP header bytes len + m->m_pkthdr.lro_pktlen = tlen; /* Used to return max pkt encountered to tcp */ + m->m_pkthdr.lro_npkts = 1; /* Initialize a counter to hold num pkts coalesced */ + off = tcp_hdr->th_off << 2; + if (off < sizeof (struct tcphdr) || off > tlen) { + tcpstat.tcps_rcvbadoff++; + if (lrodebug) { + printf("ip_lro: TCP off greater than TCP header.\n"); + } + return m; + } + + return (tcp_lro_process_pkt(m, ip_hdr, tcp_hdr, hlen + off)); +} + +static void +lro_proto_input(struct mbuf *m) +{ + struct ip* ip_hdr = mtod(m, struct ip*); + + if (lrodebug >= 3) { + printf("lro_proto_input: ip_len = %d \n", + ip_hdr->ip_len); + } + lro_update_stats(m); + ip_proto_dispatch_in_wrapper(m, ip_hdr->ip_hl << 2, ip_hdr->ip_p); +} + +static struct mbuf * +lro_tcp_xsum_validate(struct mbuf *m, struct ipovly *ipov, struct tcphdr * th) +{ + + struct ip* ip = (struct ip*)ipov; + int tlen = ip->ip_len; + int len; + struct ifnet *ifp = ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) ? 
+ m->m_pkthdr.rcvif: NULL; + + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16) { + u_short pseudo; + char b[9]; + + bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1)); + bzero(ipov->ih_x1, sizeof (ipov->ih_x1)); + ipov->ih_len = (u_short)tlen; +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ipov->ih_len); +#endif + pseudo = in_cksum(m, sizeof (struct ip)); + bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1)); + + th->th_sum = in_addword(pseudo, (m->m_pkthdr.csum_data & 0xFFFF)); + } else { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + th->th_sum = m->m_pkthdr.csum_data; + else + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + + ip->ip_len + IPPROTO_TCP)); + } + th->th_sum ^= 0xffff; + } else { + char b[9]; + /* + * Checksum extended TCP header and data. + */ + bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1)); + bzero(ipov->ih_x1, sizeof (ipov->ih_x1)); + ipov->ih_len = (u_short)tlen; +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ipov->ih_len); +#endif + len = sizeof (struct ip) + tlen; + th->th_sum = in_cksum(m, len); + bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1)); + + tcp_in_cksum_stats(len); + } + if (th->th_sum) { + tcpstat.tcps_rcvbadsum++; + if (ifp != NULL && ifp->if_tcp_stat != NULL) { + atomic_add_64(&ifp->if_tcp_stat->badformat, 1); + } + if (lrodebug) + printf("lro_tcp_xsum_validate: bad xsum and drop m = %p.\n",m); + m_freem(m); + return NULL; + } + /* revert back the order as IP will look into this again. */ +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ipov->ih_len); +#endif + return m; +} + +/* + * When TCP detects a stable, steady flow without out of ordering, + * with a sufficiently high cwnd, it invokes LRO. + */ +int +tcp_start_coalescing(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int tlen) +{ + int hash; + int flow_id; + struct mbuf *eject_mb; + struct lro_flow *lf; + + hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr, + tcp_hdr->th_sport, tcp_hdr->th_dport, + (TCP_LRO_FLOW_MAP - 1)); + + + lck_mtx_lock_spin(&tcp_lro_lock); + flow_id = lro_flow_map[hash]; + if (flow_id != TCP_LRO_FLOW_NOTFOUND) { + lf = &lro_flow_list[flow_id]; + if ((lf->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) && + (lf->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) && + (lf->lr_fport == tcp_hdr->th_sport) && + (lf->lr_lport == tcp_hdr->th_dport)) { + if ((lf->lr_tcphdr == NULL) && + (lf->lr_seq != (tcp_hdr->th_seq + tlen))) { + lf->lr_seq = tcp_hdr->th_seq + tlen; + } + lf->lr_flags &= ~LRO_EJECT_REQ; + } + lck_mtx_unlock(&tcp_lro_lock); + return 0; + } + + HTONL(tcp_hdr->th_seq); + HTONL(tcp_hdr->th_ack); + eject_mb = + tcp_lro_insert_flow(NULL, ip_hdr, tcp_hdr, tlen, 0, hash, + NULL, NULL, NULL); + + lck_mtx_unlock(&tcp_lro_lock); + + NTOHL(tcp_hdr->th_seq); + NTOHL(tcp_hdr->th_ack); + if (lrodebug >= 3) { + printf("%s: src = %x dst = %x sport = %d dport = %d seq %x \n", + __func__, ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr, + tcp_hdr->th_sport, tcp_hdr->th_dport, tcp_hdr->th_seq); + } + ASSERT(eject_mb == NULL); + return 0; +} + +/* + * When TCP detects loss or idle condition, it stops offloading + * to LRO. 
+ */ +int +tcp_lro_remove_state(struct in_addr saddr, struct in_addr daddr, + unsigned short sport, unsigned short dport) +{ + int hash, flow_id; + struct lro_flow *lf; + + hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport, + (TCP_LRO_FLOW_MAP - 1)); + lck_mtx_lock_spin(&tcp_lro_lock); + flow_id = lro_flow_map[hash]; + if (flow_id == TCP_LRO_FLOW_UNINIT) { + lck_mtx_unlock(&tcp_lro_lock); + return 0; + } + lf = &lro_flow_list[flow_id]; + if ((lf->lr_faddr.s_addr == daddr.s_addr) && + (lf->lr_laddr.s_addr == saddr.s_addr) && + (lf->lr_fport == dport) && + (lf->lr_lport == sport)) { + if (lrodebug) { + printf("%s: %x %x\n", __func__, + lf->lr_flags, lf->lr_seq); + } + lf->lr_flags |= LRO_EJECT_REQ; + } + lck_mtx_unlock(&tcp_lro_lock); + return 0; +} + +void +tcp_update_lro_seq(__uint32_t rcv_nxt, struct in_addr saddr, struct in_addr daddr, + unsigned short sport, unsigned short dport) +{ + int hash, flow_id; + struct lro_flow *lf; + + hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport, + (TCP_LRO_FLOW_MAP - 1)); + lck_mtx_lock_spin(&tcp_lro_lock); + flow_id = lro_flow_map[hash]; + if (flow_id == TCP_LRO_FLOW_UNINIT) { + lck_mtx_unlock(&tcp_lro_lock); + return; + } + lf = &lro_flow_list[flow_id]; + if ((lf->lr_faddr.s_addr == daddr.s_addr) && + (lf->lr_laddr.s_addr == saddr.s_addr) && + (lf->lr_fport == dport) && + (lf->lr_lport == sport) && + (lf->lr_tcphdr == NULL)) { + lf->lr_seq = (tcp_seq)rcv_nxt; + } + lck_mtx_unlock(&tcp_lro_lock); + return; +} + +static void +lro_update_stats(struct mbuf *m) +{ + switch(m->m_pkthdr.lro_npkts) { + case 0: /* fall through */ + case 1: + break; + + case 2: + tcpstat.tcps_lro_twopack++; + break; + + case 3: /* fall through */ + case 4: + tcpstat.tcps_lro_multpack++; + break; + + default: + tcpstat.tcps_lro_largepack++; + break; + } + return; +} + +static void +lro_update_flush_stats(struct mbuf *m) +{ + lro_flushes++; + switch(m->m_pkthdr.lro_npkts) { + case 0: ASSERT(0); + case 1: lro_single_flushes++; + break; + case 2: lro_double_flushes++; + break; + default: lro_good_flushes++; + break; + } + return; +} diff --git a/bsd/netinet/tcp_lro.h b/bsd/netinet/tcp_lro.h new file mode 100644 index 000000000..9f1fe01c9 --- /dev/null +++ b/bsd/netinet/tcp_lro.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef TCP_LRO_H_ +#define TCP_LRO_H_ + +#ifdef BSD_KERNEL_PRIVATE + +#define TCP_LRO_NUM_FLOWS (16) /* must be <= 255 for char lro_flow_map */ +#define TCP_LRO_FLOW_MAP (1024) + +struct lro_flow { + struct mbuf *lr_mhead; /* coalesced mbuf chain head */ + struct mbuf *lr_mtail; /* coalesced mbuf chain tail */ + struct tcphdr *lr_tcphdr; /* ptr to TCP hdr in frame */ + u_int32_t *lr_tsval; /* address of tsval in frame */ + u_int32_t *lr_tsecr; /* tsecr field in TCP header */ + tcp_seq lr_seq; /* next expected seq num */ + unsigned int lr_len; /* length of LRO frame */ + struct in_addr lr_faddr; /* foreign address */ + struct in_addr lr_laddr; /* local address */ + unsigned short int lr_fport; /* foreign port */ + unsigned short int lr_lport; /* local port */ + u_int32_t lr_timestamp; /* for ejecting the flow */ + unsigned short int lr_hash_map; /* back pointer to hash map */ + unsigned short int lr_flags; /* pad */ +} __attribute__((aligned(8))); + +/* lr_flags - only 16 bits available */ +#define LRO_EJECT_REQ 0x1 + + +#define TCP_LRO_FLOW_UNINIT TCP_LRO_NUM_FLOWS+1 +#define TCP_LRO_FLOW_NOTFOUND TCP_LRO_FLOW_UNINIT + +/* Max packets to be coalesced before pushing to app */ +#define LRO_MX_COALESCE_PKTS (8) + +/* + * Min num of bytes in a packet to trigger coalescing + */ +#define LRO_MIN_COALESC_SZ (1300) + +/* + * Max amount of time to wait before flushing flows in msecs. + * Units are in msecs. + * This number has been carefully chosen and should be altered with care. + */ +#define LRO_MX_TIME_TO_BUFFER 10 + +/* similar to INP_PCBHASH */ +#define LRO_HASH(faddr, laddr, fport, lport, mask) \ + (((faddr) ^ ((laddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) +#endif + +#endif /* TCP_LRO_H_ */ diff --git a/bsd/netinet/tcp_newreno.c b/bsd/netinet/tcp_newreno.c index 5c9db2de9..8d256db71 100644 --- a/bsd/netinet/tcp_newreno.c +++ b/bsd/netinet/tcp_newreno.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 Apple Inc. All rights reserved. + * Copyright (c) 2010-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,10 +25,46 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 + * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $ + */ #include #include #include #include +#include #include #include @@ -52,7 +88,7 @@ int tcp_newreno_cleanup(struct tcpcb *tp); void tcp_newreno_cwnd_init_or_reset(struct tcpcb *tp); void tcp_newreno_inseq_ack_rcvd(struct tcpcb *tp, struct tcphdr *th); void tcp_newreno_ack_rcvd(struct tcpcb *tp, struct tcphdr *th); -void tcp_newreno_pre_fr(struct tcpcb *tp, struct tcphdr *th); +void tcp_newreno_pre_fr(struct tcpcb *tp); void tcp_newreno_post_fr(struct tcpcb *tp, struct tcphdr *th); void tcp_newreno_after_idle(struct tcpcb *tp); void tcp_newreno_after_timeout(struct tcpcb *tp); @@ -77,6 +113,43 @@ struct tcp_cc_algo tcp_cc_newreno = { extern int tcp_do_rfc3465; extern int tcp_do_rfc3465_lim2; extern int maxseg_unacked; +extern u_int32_t tcp_autosndbuf_max; + +#define SET_SNDSB_IDEAL_SIZE(sndsb, size) \ + sndsb->sb_idealsize = min(max(tcp_sendspace, tp->snd_ssthresh), \ + tcp_autosndbuf_max); + +void tcp_cc_resize_sndbuf(struct tcpcb *tp) { + struct sockbuf *sb; + /* If the send socket buffer size is bigger than ssthresh, + * it is time to trim it because we do not want to hold + * too many mbufs in the socket buffer + */ + sb = &(tp->t_inpcb->inp_socket->so_snd); + if (sb->sb_hiwat > tp->snd_ssthresh && + (sb->sb_flags & SB_AUTOSIZE) != 0) { + if (sb->sb_idealsize > tp->snd_ssthresh) { + SET_SNDSB_IDEAL_SIZE(sb, tp->snd_ssthresh); + } + sb->sb_flags |= SB_TRIM; + } +} + +void tcp_bad_rexmt_fix_sndbuf(struct tcpcb *tp) { + struct sockbuf *sb; + sb = &(tp->t_inpcb->inp_socket->so_snd); + if ((sb->sb_flags & (SB_TRIM|SB_AUTOSIZE)) == (SB_TRIM|SB_AUTOSIZE)) { + /* If there was a retransmission that was not necessary + * then the size of socket buffer can be restored to + * what it was before + */ + SET_SNDSB_IDEAL_SIZE(sb, tp->snd_ssthresh); + if (sb->sb_hiwat <= sb->sb_idealsize) { + sbreserve(sb, sb->sb_idealsize); + sb->sb_flags &= ~SB_TRIM; + } + } +} int tcp_newreno_init(struct tcpcb *tp) { #pragma unused(tp) @@ -202,8 +275,7 @@ tcp_newreno_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) { } void -tcp_newreno_pre_fr(struct tcpcb *tp, struct tcphdr *th) { -#pragma unused(th) +tcp_newreno_pre_fr(struct tcpcb *tp) { uint32_t win; @@ -212,6 +284,8 @@ tcp_newreno_pre_fr(struct tcpcb *tp, struct tcphdr *th) { if ( win < 2 ) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; + tcp_cc_resize_sndbuf(tp); + } void @@ -273,6 +347,8 @@ tcp_newreno_after_timeout(struct tcpcb *tp) { tp->snd_ssthresh = win * tp->t_maxseg; tp->t_bytes_acked = 0; tp->t_dupacks = 0; + + tcp_cc_resize_sndbuf(tp); } } @@ -302,15 +378,15 @@ tcp_newreno_delay_ack(struct tcpcb *tp, struct tcphdr *th) { case 2: if ((tp->t_flags & TF_RXWIN0SENT) == 0 && (th->th_flags & TH_PUSH) == 0 && - (tp->t_flags & TF_DELACK) == 0) + (tp->t_unacksegs == 1)) return(1); break; case 3: if ((tp->t_flags & TF_RXWIN0SENT) == 0 && (th->th_flags & TH_PUSH) == 0 && - ((tp->t_unacksegs == 0) || + 
((tp->t_unacksegs == 1) || ((tp->t_flags & TF_STRETCHACK) != 0 && - tp->t_unacksegs < (maxseg_unacked - 1)))) + tp->t_unacksegs < (maxseg_unacked)))) return(1); break; } diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c index 5c310770d..8a9eeb9cf 100644 --- a/bsd/netinet/tcp_output.c +++ b/bsd/netinet/tcp_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -83,6 +83,8 @@ #include #include #include +#include +#include #include #include @@ -118,6 +120,8 @@ #include #endif /* MAC_SOCKET */ +#include + #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 1) #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 3) #define DBG_FNC_TCP_OUTPUT NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1) @@ -171,6 +175,22 @@ int tcp_acc_iaj_react_limit = ACC_IAJ_REACT_LIMIT; SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_react_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_acc_iaj_react_limit, 1, "Accumulated IAJ when receiver starts to react"); +uint32_t tcp_do_autosendbuf = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, doautosndbuf, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_do_autosendbuf, 1, "Enable send socket buffer auto-tuning"); + +uint32_t tcp_autosndbuf_inc = 8 * 1024; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, autosndbufinc, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_autosndbuf_inc, 1, "Increment in send socket bufffer size"); + +uint32_t tcp_autosndbuf_max = 512 * 1024; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, autosndbufmax, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_autosndbuf_max, 1, "Maximum send socket buffer size"); + +uint32_t tcp_prioritize_acks = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, ack_prioritize, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_prioritize_acks, 1, "Prioritize pure acks"); + static int32_t packchain_newlist = 0; static int32_t packchain_looped = 0; static int32_t packchain_sent = 0; @@ -193,11 +213,13 @@ extern int ip_use_randomid; extern u_int32_t dlil_filter_count; extern u_int32_t kipf_count; extern int tcp_recv_bg; +extern int maxseg_unacked; static int tcp_ip_output(struct socket *, struct tcpcb *, struct mbuf *, int, - struct mbuf *, int, int, int32_t); + struct mbuf *, int, int, int32_t, boolean_t); -static inline int is_tcp_recv_bg(struct socket *so); +extern uint32_t get_base_rtt(struct tcpcb *tp); +static struct mbuf* tcp_send_lroacks(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th); static __inline__ u_int16_t get_socket_id(struct socket * s) @@ -214,12 +236,6 @@ get_socket_id(struct socket * s) return (val); } -static inline int -is_tcp_recv_bg(struct socket *so) -{ - return (so->so_traffic_mgt_flags & TRAFFIC_MGT_TCP_RECVBG); -} - /* * Tcp output routine: figure out what should be sent and send it. * @@ -237,7 +253,14 @@ is_tcp_recv_bg(struct socket *so) * ip_output_list:EMSGSIZE * ip_output_list:ENOBUFS * ip_output_list:??? [ignorable: mostly IPSEC/firewall/DLIL] - * ip6_output:??? [IPV6 only] + * ip6_output_list:EINVAL + * ip6_output_list:EOPNOTSUPP + * ip6_output_list:EHOSTUNREACH + * ip6_output_list:EADDRNOTAVAIL + * ip6_output_list:ENETUNREACH + * ip6_output_list:EMSGSIZE + * ip6_output_list:ENOBUFS + * ip6_output_list:??? 
[ignorable: mostly IPSEC/firewall/DLIL] */ int tcp_output(struct tcpcb *tp) @@ -271,12 +294,15 @@ tcp_output(struct tcpcb *tp) struct mbuf *tp_inp_options = tp->t_inpcb->inp_depend4.inp4_options; #if INET6 int isipv6 = tp->t_inpcb->inp_vflag & INP_IPV6 ; - struct ip6_pktopts *inp6_pktopts = tp->t_inpcb->inp_depend6.inp6_outputopts; #endif short packchain_listadd = 0; u_int16_t socket_id = get_socket_id(so); int so_options = so->so_options; struct rtentry *rt; + u_int32_t basertt, svc_flags = 0; + u_int32_t lro_ackmore = (tp->t_lropktlen != 0) ? 1 : 0; + struct mbuf *mnext = NULL; + int sackoptlen = 0; /* * Determine length of data that should be transmitted, @@ -290,7 +316,7 @@ tcp_output(struct tcpcb *tp) * will take care of wrap around of tcp_now */ idle_time = tcp_now - tp->t_rcvtime; - if (idle && idle_time >= tp->t_rxtcur) { + if (idle && idle_time >= TCP_IDLETIMEOUT(tp)) { if (CC_ALGO(tp)->after_idle != NULL) CC_ALGO(tp)->after_idle(tp); DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, @@ -309,14 +335,12 @@ again: #if INET6 if (isipv6) { - KERNEL_DEBUG(DBG_LAYER_BEG, ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)), sendalot,0,0); - } - else + } else #endif { @@ -325,6 +349,7 @@ again: (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) | (tp->t_inpcb->inp_faddr.s_addr & 0xffff)), sendalot,0,0); + } /* * If the route generation id changed, we need to check that our * local (source) IP address is still valid. If it isn't either @@ -335,7 +360,9 @@ again: if (rt != NULL && (!(rt->rt_flags & RTF_UP) || rt->generation_id != route_generation)) { struct ifnet *ifp; - struct in_ifaddr *ia; + struct in_ifaddr *ia = NULL; + struct in6_ifaddr *ia6 = NULL; + int found_srcaddr = 0; /* disable multipages at the socket */ somultipages(so, FALSE); @@ -343,8 +370,21 @@ again: /* Disable TSO for the socket until we know more */ tp->t_flags &= ~TF_TSO; + if (isipv6) { + ia6 = ifa_foraddr6(&tp->t_inpcb->in6p_laddr); + if (ia6 != NULL) + found_srcaddr = 1; + } else { + ia = ifa_foraddr(tp->t_inpcb->inp_laddr.s_addr); + if (ia != NULL) + found_srcaddr = 1; + } + /* check that the source address is still valid */ - if ((ia = ifa_foraddr(tp->t_inpcb->inp_laddr.s_addr)) == NULL) { + if (found_srcaddr == 0) { + + soevent(so, + (SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR)); if (tp->t_state >= TCPS_CLOSE_WAIT) { tcp_drop(tp, EADDRNOTAVAIL); @@ -380,7 +420,11 @@ again: return(0); /* silently ignore, keep data in socket: address may be back */ } } - IFA_REMREF(&ia->ia_ifa); + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + + if (ia6 != NULL) + IFA_REMREF(&ia6->ia_ifa); /* * Address is still valid; check for multipages capability @@ -408,7 +452,6 @@ again: tp->t_flags |= TF_PMTUD; RT_UNLOCK(rt); - } } /* @@ -467,8 +510,9 @@ again: /* Can rexmit part of the current hole */ len = ((int32_t)min(cwin, tp->snd_recover - p->rxmit)); - } else + } else { len = ((int32_t)min(cwin, p->end - p->rxmit)); + } if (len > 0) { off = p->rxmit - tp->snd_una; /* update off only if we really transmit SACK data */ sack_rxmit = 1; @@ -477,14 +521,16 @@ again: tcpstat.tcps_sack_rexmit_bytes += min(len, tp->t_maxseg); if (nstat_collect) { - nstat_route_tx(tp->t_inpcb->inp_route.ro_rt, 1, min(len, tp->t_maxseg), NSTAT_TX_FLAG_RETRANSMIT); + nstat_route_tx(tp->t_inpcb->inp_route.ro_rt, 1, + min(len, tp->t_maxseg), NSTAT_TX_FLAG_RETRANSMIT); locked_add_64(&tp->t_inpcb->inp_stat->txpackets, 1); - 
locked_add_64(&tp->t_inpcb->inp_stat->txbytes, min(len, tp->t_maxseg)); + locked_add_64(&tp->t_inpcb->inp_stat->txbytes, + min(len, tp->t_maxseg)); tp->t_stat.txretransmitbytes += min(len, tp->t_maxseg); } - } - else + } else { len = 0; + } } after_sack_rexmit: /* @@ -589,23 +635,31 @@ after_sack_rexmit: flags &= ~TH_SYN; off--, len++; if (len > 0 && tp->t_state == TCPS_SYN_SENT) { - while (!(tp->t_flags & TF_SENDINPROG) && - tp->t_pktlist_head != NULL) { + while (tp->t_inpcb->inp_sndinprog_cnt == 0 && + tp->t_pktlist_head != NULL) { packetlist = tp->t_pktlist_head; packchain_listadd = tp->t_lastchain; packchain_sent++; TCP_PKTLIST_CLEAR(tp); - tp->t_flags |= TF_SENDINPROG; error = tcp_ip_output(so, tp, packetlist, packchain_listadd, tp_inp_options, - (so_options & SO_DONTROUTE), (sack_rxmit | (sack_bytes_rxmt != 0)), 0); + (so_options & SO_DONTROUTE), + (sack_rxmit | (sack_bytes_rxmt != 0)), 0, +#ifdef INET6 + isipv6); +#else + 0); +#endif + - tp->t_flags &= ~TF_SENDINPROG; } - /* tcp was closed while we were in ip; resume close */ - if ((tp->t_flags & - (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) { + /* + * tcp was closed while we were in ip, + * resume close + */ + if (tp->t_inpcb->inp_sndinprog_cnt == 0 && + (tp->t_flags & TF_CLOSING)) { tp->t_flags &= ~TF_CLOSING; (void) tcp_close(tp); } else { @@ -613,7 +667,7 @@ after_sack_rexmit: } KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); - return 0; + return(0); } } @@ -654,6 +708,46 @@ after_sack_rexmit: } } + /* Automatic sizing of send socket buffer. Increase the send socket buffer + * size if all of the following criteria are met + * 1. the receiver has enough buffer space for this data + * 2. send buffer is filled to 7/8th with data (so we actually + * have data to make use of it); + * 3. our send window (slow start and congestion controlled) is + * larger than sent but unacknowledged data in send buffer. + */ + basertt = get_base_rtt(tp); + if (tcp_do_autosendbuf == 1 && + !INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb) && !IN_FASTRECOVERY(tp) && + (so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE && + tcp_cansbgrow(&so->so_snd)) { + if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && + so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && + sendwin >= (so->so_snd.sb_cc - + (tp->snd_nxt - tp->snd_una))) { + /* Also increase the send buffer only if the + * round-trip time is not increasing because we do + * not want to contribute to latency by filling buffers. + * We also do not want to hold onto application's + * old data for too long. Interactive applications would + * rather discard old data. + */ + if (tp->t_rttcur <= + (basertt + 25)) { + if (sbreserve(&so->so_snd, + min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc, + tcp_autosndbuf_max)) == 1) { + so->so_snd.sb_idealsize = so->so_snd.sb_hiwat; + } + } else { + so->so_snd.sb_idealsize = + max(tcp_sendspace, so->so_snd.sb_hiwat - + (2 * tcp_autosndbuf_inc)); + so->so_snd.sb_flags |= SB_TRIM; + } + } + } + /* * Truncate to the maximum segment length or enable TCP Segmentation * Offloading (if supported by hardware) and ensure that FIN is removed @@ -717,47 +811,57 @@ after_sack_rexmit: * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * + * - we've timed out (e.g. persist timer) + * - we need to retransmit * - We have a full segment (or more with TSO) * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY - * - we've timed out (e.g. 
persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limited the window size) - * - we need to retransmit */ if (len) { - if (len >= tp->t_maxseg) { - tp->t_flags |= TF_MAXSEGSNT; - goto send; - } - if (!(tp->t_flags & TF_MORETOCOME) && - (idle || tp->t_flags & TF_NODELAY || tp->t_flags & TF_MAXSEGSNT) && - (tp->t_flags & TF_NOPUSH) == 0 && - len + off >= so->so_snd.sb_cc) { - tp->t_flags &= ~TF_MAXSEGSNT; - goto send; - } if (tp->t_force) { tp->t_flags &= ~TF_MAXSEGSNT; goto send; } - if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { - tp->t_flags &= ~TF_MAXSEGSNT; - goto send; - } if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { tp->t_flags &= ~TF_MAXSEGSNT; goto send; } if (sack_rxmit) goto send; + + /* + * Send new data on the connection only if it is + * not flow controlled + */ + if (!INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb) || + tp->t_state != TCPS_ESTABLISHED) { + if (len >= tp->t_maxseg) { + tp->t_flags |= TF_MAXSEGSNT; + goto send; + } + if (!(tp->t_flags & TF_MORETOCOME) && + (idle || tp->t_flags & TF_NODELAY || tp->t_flags & TF_MAXSEGSNT) && + (tp->t_flags & TF_NOPUSH) == 0 && + len + off >= so->so_snd.sb_cc) { + tp->t_flags &= ~TF_MAXSEGSNT; + goto send; + } + if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { + tp->t_flags &= ~TF_MAXSEGSNT; + goto send; + } + } else { + tcpstat.tcps_fcholdpacket++; + } } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). If the difference is at least two - * max size segments, or at least 50% of the maximum possible + * max size segments, or at least 25% of the maximum possible * window, then want to send a window update to peer. * Skip this if the connection is in T/TCP half-open state. */ @@ -771,17 +875,31 @@ after_sack_rexmit: (tp->rcv_adv - tp->rcv_nxt); if (adv >= (int32_t) (2 * tp->t_maxseg)) { - - /* - * Update only if the resulting scaled value of the window changed, or + /* Update only if the resulting scaled value of the window changed, or * if there is a change in the sequence since the last ack. * This avoids what appears as dupe ACKS (see rdar://5640997) + * + * If streaming is detected avoid sending too many window updates. + * We will depend on the delack timer to send a window update + * when needed. */ - - if ((tp->last_ack_sent != tp->rcv_nxt) || (((recwin + adv) >> tp->rcv_scale) > recwin)) + if ((tp->t_flags & TF_STRETCHACK) == 0 && + (tp->last_ack_sent != tp->rcv_nxt || + ((recwin + adv) >> tp->rcv_scale) > recwin)) { goto send; + } + + /* Make sure that the delayed ack timer is set if we + * delayed sending a window update because of streaming + * detection. + */ + if ((tp->t_flags & TF_STRETCHACK) != 0 && + (tp->t_flags & TF_DELACK) == 0) { + tp->t_flags |= TF_DELACK; + tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); + } } - if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) + if (4 * adv >= (int32_t) so->so_rcv.sb_hiwat) goto send; } @@ -808,8 +926,9 @@ after_sack_rexmit: * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. 
*/ - if (tp->sack_enable && (tp->t_state >= TCPS_ESTABLISHED) && SEQ_GT(tp->snd_max, tp->snd_una) && - tp->t_timer[TCPT_REXMT] == 0 && + if (tp->sack_enable && (tp->t_state >= TCPS_ESTABLISHED) && + SEQ_GT(tp->snd_max, tp->snd_una) && + tp->t_timer[TCPT_REXMT] == 0 && tp->t_timer[TCPT_PERSIST] == 0) { tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); goto just_return; @@ -847,20 +966,25 @@ just_return: * If there is no reason to send a segment, just return. * but if there is some packets left in the packet list, send them now. */ - while (!(tp->t_flags & TF_SENDINPROG) && tp->t_pktlist_head != NULL) { + while (tp->t_inpcb->inp_sndinprog_cnt == 0 && + tp->t_pktlist_head != NULL) { packetlist = tp->t_pktlist_head; packchain_listadd = tp->t_lastchain; packchain_sent++; TCP_PKTLIST_CLEAR(tp); - tp->t_flags |= TF_SENDINPROG; error = tcp_ip_output(so, tp, packetlist, packchain_listadd, - tp_inp_options, (so_options & SO_DONTROUTE), (sack_rxmit | (sack_bytes_rxmt != 0)), recwin); - - tp->t_flags &= ~TF_SENDINPROG; + tp_inp_options, (so_options & SO_DONTROUTE), + (sack_rxmit | (sack_bytes_rxmt != 0)), recwin, +#ifdef INET6 + isipv6); +#else + 0); +#endif } /* tcp was closed while we were in ip; resume close */ - if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) { + if (tp->t_inpcb->inp_sndinprog_cnt == 0 && + (tp->t_flags & TF_CLOSING)) { tp->t_flags &= ~TF_CLOSING; (void) tcp_close(tp); } else { @@ -884,7 +1008,7 @@ send: hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif - hdrlen = sizeof (struct tcpiphdr); + hdrlen = sizeof (struct tcpiphdr); if (flags & TH_SYN) { tp->snd_nxt = tp->iss; if ((tp->t_flags & TF_NOOPT) == 0) { @@ -899,7 +1023,7 @@ send: if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_SCALE))) { - *((u_int32_t *)(opt + optlen)) = htonl( + *((u_int32_t *)(void *)(opt + optlen)) = htonl( TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | @@ -980,7 +1104,7 @@ send: * the CWR flag on data packets. Pure acks don't have this set. */ if ((tp->ecn_flags & TE_SENDCWR) != 0 && len != 0 && - !SEQ_LT(tp->snd_nxt, tp->snd_max)) { + !SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) { flags |= TH_CWR; tp->ecn_flags &= ~TE_SENDCWR; } @@ -1001,7 +1125,7 @@ send: (flags & TH_RST) == 0 && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_TSTMP))) { - u_int32_t *lp = (u_int32_t *)(opt + optlen); + u_int32_t *lp = (u_int32_t *)(void *)(opt + optlen); /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); @@ -1010,6 +1134,10 @@ send: optlen += TCPOLEN_TSTAMP_APPA; } + /* Note the timestamp for receive buffer autosizing */ + if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) + tp->rfbuf_ts = tcp_now; + if (tp->sack_enable && ((tp->t_flags & TF_NOOPT) == 0)) { /* * Tack on the SACK permitted option *last*. 
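The option-building hunks above pack each whole TCP option word with htonl() and store it through a (u_int32_t *)(void *) cast; the intermediate (void *) only quiets cast-alignment warnings, it does not make a misaligned store safe. A minimal userland sketch of the same packing, using memcpy() as the strictly portable alternative (constant values per RFC 1323; the TCPOPT_* names are assumptions matching the usual BSD <netinet/tcp.h>):

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_NOP	1
#define TCPOPT_WINDOW	3
#define TCPOLEN_WINDOW	3

int
main(void)
{
	unsigned char opt[4];
	/* NOP pad, kind = 3, len = 3, shift count = 6 */
	uint32_t wscale = htonl(TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 |
	    TCPOLEN_WINDOW << 8 | 6);

	memcpy(opt, &wscale, sizeof (wscale));	/* alignment-safe store */
	printf("%02x %02x %02x %02x\n", opt[0], opt[1], opt[2], opt[3]);
	return (0);	/* prints: 01 03 03 06 */
}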
@@ -1051,7 +1179,7 @@ send: if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0 && MAX_TCPOPTLEN - optlen - 2 >= TCPOLEN_SACK) { - int nsack, sackoptlen, padlen; + int nsack, padlen; u_char *bp = (u_char *)opt + optlen; u_int32_t *lp; @@ -1072,7 +1200,7 @@ send: tcpstat.tcps_sack_send_blocks++; *bp++ = TCPOPT_SACK; *bp++ = sackoptlen; - lp = (u_int32_t *)bp; + lp = (u_int32_t *)(void *)bp; for (i = 0; i < nsack; i++) { struct sackblk sack = tp->sackblks[i]; *lp++ = htonl(sack.start); @@ -1157,6 +1285,23 @@ send: #endif /*#endif*/ + /* Check if there is enough data in the send socket + * buffer to start measuring bw + */ + if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 && + (tp->t_bwmeas != NULL) && + (tp->t_flagsext & TF_BWMEAS_INPROGRESS) == 0 && + (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) >= + tp->t_bwmeas->bw_minsize) { + tp->t_bwmeas->bw_size = min((so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)), + tp->t_bwmeas->bw_maxsize); + tp->t_flagsext |= TF_BWMEAS_INPROGRESS; + tp->t_bwmeas->bw_start = tp->snd_max; + tp->t_bwmeas->bw_ts = tcp_now; + } + + VERIFY(tp->t_inpcb->inp_flowhash != 0); + /* * Grab a header mbuf, attaching a copy of data to * be transmitted, and initialize the header from @@ -1315,7 +1460,7 @@ send: MH_ALIGN(m, hdrlen); } else #endif - m->m_data += max_linkhdr; + m->m_data += max_linkhdr; m->m_len = hdrlen; } m->m_pkthdr.rcvif = 0; @@ -1325,22 +1470,23 @@ send: #if INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); - th = (struct tcphdr *)(ip6 + 1); + th = (struct tcphdr *)(void *)(ip6 + 1); tcp_fillheaders(tp, ip6, th); if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len && - !SEQ_LT(tp->snd_nxt, tp->snd_max)) { + !SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) { ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); } + svc_flags |= PKT_SCF_IPV6; } else #endif /* INET6 */ { ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; - th = (struct tcphdr *)(ip + 1); + th = (struct tcphdr *)(void *)(ip + 1); /* this picks up the pseudo header (w/o the length) */ tcp_fillheaders(tp, ip, th); if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len && - !SEQ_LT(tp->snd_nxt, tp->snd_max)) { + !SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) { ip->ip_tos = IPTOS_ECN_ECT0; } } @@ -1350,7 +1496,7 @@ send: * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ - if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && + if (flags & TH_FIN && (tp->t_flags & TF_SENTFIN) && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* @@ -1398,7 +1544,7 @@ send: } #if TRAFFIC_MGT - if (tcp_recv_bg == 1 || is_tcp_recv_bg(so)) { + if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so)) { if (tp->acc_iaj > tcp_acc_iaj_react_limit) { uint32_t min_iaj_win = tcp_min_iaj_win * tp->t_maxseg; if (tp->iaj_rwintop == 0 || @@ -1567,21 +1713,12 @@ timer: * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ +#ifdef INET6 /* * m->m_pkthdr.len should have been set before cksum calcuration, * because in6_cksum() need it. */ -#if INET6 if (isipv6) { - struct rtentry *rt6; - struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; - unsigned int outif; - - KERNEL_DEBUG(DBG_LAYER_BEG, - ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), - (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | - (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)), - 0,0,0); /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. 
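The TF_MEASURESNDBW hunk above arms a bandwidth measurement only once the send buffer holds at least bw_minsize bytes beyond snd_max, and clamps the measured burst to bw_maxsize. A self-contained sketch of that gating and of the completion side (the bw_* names mirror struct bwmeas added later in this patch; deriving bw_sndbw as bytes per millisecond is an assumption based on tcp_fill_info later in the patch, which multiplies bw_sndbw by 8000 to get bits per second):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct bwmeas_sketch {
	uint32_t bw_start;	/* first sequence number of the burst */
	uint32_t bw_size;	/* bytes covered by this measurement */
	uint32_t bw_minsize;	/* below this, do not measure at all */
	uint32_t bw_maxsize;	/* clamp the burst to this many bytes */
	uint32_t bw_ts;		/* ms clock when the burst started */
	uint32_t bw_sndbw;	/* result: bytes per millisecond */
};

static bool
bwmeas_start(struct bwmeas_sketch *bw, uint32_t sb_cc,
    uint32_t snd_max, uint32_t snd_una, uint32_t now_ms)
{
	uint32_t unsent = sb_cc - (snd_max - snd_una);

	if (unsent < bw->bw_minsize)
		return (false);		/* not enough queued data */
	bw->bw_size = (unsent < bw->bw_maxsize) ? unsent : bw->bw_maxsize;
	bw->bw_start = snd_max;		/* burst begins at snd_max */
	bw->bw_ts = now_ms;
	return (true);
}

static void
bwmeas_finish(struct bwmeas_sketch *bw, uint32_t now_ms)
{
	if (now_ms > bw->bw_ts)		/* bw_sndbw * 8000 = bits/sec */
		bw->bw_sndbw = bw->bw_size / (now_ms - bw->bw_ts);
}

int
main(void)
{
	/* min/max bursts of 6 and 25 segments, the defaults set by this
	 * patch; 1448 is just a typical Ethernet MSS */
	struct bwmeas_sketch bw = { 0, 0, 6 * 1448, 25 * 1448, 0, 0 };

	if (bwmeas_start(&bw, 64 * 1024, 5000, 3000, 100))
		bwmeas_finish(&bw, 120);
	printf("%u bytes in 20 ms -> %u bytes/ms\n", bw.bw_size, bw.bw_sndbw);
	return (0);
}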
@@ -1594,49 +1731,23 @@ timer: : NULL); /* TODO: IPv6 IP6TOS_ECT bit on */ -#if IPSEC - if (ipsec_bypass == 0 && ipsec_setsocket(m, so) != 0) { - m_freem(m); - error = ENOBUFS; - goto out; - } -#endif /*IPSEC*/ - m->m_pkthdr.socket_id = socket_id; - - rt6 = tp->t_inpcb->in6p_route.ro_rt; - if (rt6 != NULL && rt6->rt_ifp != NULL - && rt6->rt_ifp != lo_ifp) - set_packet_tclass(m, so, MBUF_TC_UNSPEC, 1); - - DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, tp->t_inpcb, struct ip6_hdr *, ip6, - struct tcpcb *, tp, struct tcphdr *, th); - - if (tp->t_inpcb->inp_flags & INP_BOUND_IF) - ip6oa.ip6oa_boundif = tp->t_inpcb->inp_boundif; - - ip6oa.ip6oa_nocell = (tp->t_inpcb->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; - - error = ip6_output(m, inp6_pktopts, &tp->t_inpcb->in6p_route, - (so_options & SO_DONTROUTE) | IPV6_OUTARGS, NULL, NULL, - &ip6oa); - - /* Refresh rt6 as we may have lost the route while in ip6_output() */ - if ((rt6 = tp->t_inpcb->in6p_route.ro_rt) != NULL && - (outif = rt6->rt_ifp->if_index) != tp->t_inpcb->in6p_last_outif) - tp->t_inpcb->in6p_last_outif = outif; + KERNEL_DEBUG(DBG_LAYER_BEG, + ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), + (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | + (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)), + sendalot,0,0); } else #endif /* INET6 */ - { - ip->ip_len = m->m_pkthdr.len; - ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */ - ip->ip_tos |= (tp->t_inpcb->inp_ip_tos & ~IPTOS_ECN_MASK); /* XXX */ - - - KERNEL_DEBUG(DBG_LAYER_BEG, - ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), - (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) | - (tp->t_inpcb->inp_faddr.s_addr & 0xffff)), - 0,0,0); + { + ip->ip_len = m->m_pkthdr.len; + ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */ + ip->ip_tos |= (tp->t_inpcb->inp_ip_tos & ~IPTOS_ECN_MASK);/* XXX */ + KERNEL_DEBUG(DBG_LAYER_BEG, + ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), + (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) | + (tp->t_inpcb->inp_faddr.s_addr & 0xffff)), + 0,0,0); + } /* * See if we should do MTU discovery. @@ -1647,9 +1758,11 @@ timer: * 4) the MTU is not locked (if it is, then discovery has been * disabled for that route) */ - - if (path_mtu_discovery && (tp->t_flags & TF_PMTUD)) - ip->ip_off |= IP_DF; +#ifdef INET6 + if (!isipv6) +#endif + if (path_mtu_discovery && (tp->t_flags & TF_PMTUD)) + ip->ip_off |= IP_DF; #if IPSEC if (ipsec_bypass == 0) @@ -1661,18 +1774,50 @@ timer: */ lost = 0; m->m_pkthdr.socket_id = socket_id; + + /* + * Embed the flow hash in pkt hdr and mark the packet as + * capable of being flow controlled + */ + m->m_pkthdr.m_flowhash = tp->t_inpcb->inp_flowhash; + m->m_pkthdr.m_fhflags |= + (PF_TAG_TCP | PF_TAG_FLOWHASH | PF_TAG_FLOWADV); + m->m_nextpkt = NULL; - if (tp->t_inpcb->inp_route.ro_rt != NULL && - tp->t_inpcb->inp_route.ro_rt->rt_ifp != NULL && - tp->t_inpcb->inp_route.ro_rt->rt_ifp != lo_ifp) - set_packet_tclass(m, so, MBUF_TC_UNSPEC, 0) + if (tp->t_inpcb->inp_last_outifp != NULL && + tp->t_inpcb->inp_last_outifp != lo_ifp) { + /* Hint to prioritize this packet if + * 1. the packet has no data + * 2. the interface supports transmit-start model and did + * not disable ACK prioritization. + * 3. Only ACK flag is set. + * 4. there is no outstanding data on this connection. 
+ */ + if (tcp_prioritize_acks != 0 && len == 0 && + (tp->t_inpcb->inp_last_outifp->if_eflags & + (IFEF_TXSTART | IFEF_NOACKPRI)) == IFEF_TXSTART && + th->th_flags == TH_ACK && tp->snd_una == tp->snd_max && + tp->t_timer[TCPT_REXMT] == 0) { + svc_flags |= PKT_SCF_TCP_ACK; + } + set_packet_service_class(m, so, MBUF_SC_UNSPEC, svc_flags); + } tp->t_pktlist_sentlen += len; tp->t_lastchain++; - DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, tp->t_inpcb, - struct ip *, ip, struct tcpcb *, tp, struct tcphdr *, th); +#ifdef INET6 + if (isipv6) { + DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, tp->t_inpcb, + struct ip6 *, ip6, struct tcpcb *, tp, struct tcphdr *, + th); + } else +#endif + { + DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, tp->t_inpcb, + struct ip *, ip, struct tcpcb *, tp, struct tcphdr *, th); + } if (tp->t_pktlist_head != NULL) { tp->t_pktlist_tail->m_nextpkt = m; @@ -1682,25 +1827,53 @@ timer: tp->t_pktlist_head = tp->t_pktlist_tail = m; } + if ((lro_ackmore) && (!sackoptlen) && (!tp->t_timer[TCPT_PERSIST]) && + ((th->th_flags & TH_ACK) == TH_ACK) && (!len) && + (tp->t_state == TCPS_ESTABLISHED)) { + /* For a pure ACK, see if you need to send more of them */ + mnext = tcp_send_lroacks(tp, m, th); + if (mnext) { + tp->t_pktlist_tail->m_nextpkt = mnext; + if (mnext->m_nextpkt == NULL) { + tp->t_pktlist_tail = mnext; + tp->t_lastchain++; + } else { + struct mbuf *tail, *next; + next = mnext->m_nextpkt; + tail = next->m_nextpkt; + while (tail) { + next = tail; + tail = tail->m_nextpkt; + tp->t_lastchain++; + } + tp->t_pktlist_tail = next; + } + } + } + if (sendalot == 0 || (tp->t_state != TCPS_ESTABLISHED) || (tp->snd_cwnd <= (tp->snd_wnd / 8)) || (tp->t_flags & (TH_PUSH | TF_ACKNOW)) || tp->t_force != 0 || tp->t_lastchain >= tcp_packet_chaining) { error = 0; - while (!(tp->t_flags & TF_SENDINPROG) && - tp->t_pktlist_head != NULL) { + while (tp->t_inpcb->inp_sndinprog_cnt == 0 && + tp->t_pktlist_head != NULL) { packetlist = tp->t_pktlist_head; packchain_listadd = tp->t_lastchain; packchain_sent++; lost = tp->t_pktlist_sentlen; TCP_PKTLIST_CLEAR(tp); - tp->t_flags |= TF_SENDINPROG; error = tcp_ip_output(so, tp, packetlist, packchain_listadd, tp_inp_options, - (so_options & SO_DONTROUTE), (sack_rxmit | (sack_bytes_rxmt != 0)), recwin); + (so_options & SO_DONTROUTE), + (sack_rxmit | (sack_bytes_rxmt != 0)), recwin, +#ifdef INET6 + isipv6); +#else + 0); +#endif - tp->t_flags &= ~TF_SENDINPROG; if (error) { /* * Take into account the rest of unsent @@ -1715,20 +1888,19 @@ timer: } } /* tcp was closed while we were in ip; resume close */ - if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) { + if (tp->t_inpcb->inp_sndinprog_cnt == 0 && + (tp->t_flags & TF_CLOSING)) { tp->t_flags &= ~TF_CLOSING; (void) tcp_close(tp); return (0); } - } - else { + } else { error = 0; packchain_looped++; tcpstat.tcps_sndtotal++; goto again; } - } if (error) { /* * Assume that the packets were lost, so back out the @@ -1757,8 +1929,9 @@ out: if (error == ENOBUFS) { if (!tp->t_timer[TCPT_REXMT] && - !tp->t_timer[TCPT_PERSIST]) - tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); + !tp->t_timer[TCPT_PERSIST]) + tp->t_timer[TCPT_REXMT] = + OFFSET_FROM_START(tp, tp->t_rxtcur); tp->snd_cwnd = tp->t_maxseg; tp->t_bytes_acked = 0; @@ -1807,25 +1980,6 @@ out: tcpstat.tcps_sndtotal++; -#if INET6 - /* - * Data sent (as far as we can tell). - * If this advertises a larger window than any other segment, - * then remember the size of the advertised window. 
- * Make sure ACK/DELACK conditions are cleared before - * we unlock the socket. - * NOTE: for now, this is done in tcp_ip_output for IPv4 - */ - if (isipv6) { - if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) - tp->rcv_adv = tp->rcv_nxt + recwin; - tp->last_ack_sent = tp->rcv_nxt; - tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); - tp->t_timer[TCPT_DELACK] = 0; - tp->t_unacksegs = 0; - } -#endif - KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0); if (sendalot) goto again; @@ -1836,24 +1990,63 @@ out: static int tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, - int cnt, struct mbuf *opt, int flags, int sack_in_progress, int recwin) + int cnt, struct mbuf *opt, int flags, int sack_in_progress, int recwin, + boolean_t isipv6) { int error = 0; boolean_t chain; boolean_t unlocked = FALSE; struct inpcb *inp = tp->t_inpcb; - struct ip_out_args ipoa; + struct ip_out_args ipoa = + { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR }; struct route ro; - unsigned int outif; + struct ifnet *outif = NULL; +#ifdef INET6 + struct ip6_out_args ip6oa = + { IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR }; + struct route_in6 ro6; + struct flowadv *adv = + (isipv6 ? &ip6oa.ip6oa_flowadv : &ipoa.ipoa_flowadv); +#else + struct flowadv *adv = &ipoa.ipoa_flowadv; +#endif /* !INET6 */ /* If socket was bound to an ifindex, tell ip_output about it */ - ipoa.ipoa_boundif = (inp->inp_flags & INP_BOUND_IF) ? - inp->inp_boundif : IFSCOPE_NONE; - ipoa.ipoa_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; - flags |= IP_OUTARGS; + if (inp->inp_flags & INP_BOUND_IF) { +#ifdef INET6 + if (isipv6) { + ip6oa.ip6oa_boundif = inp->inp_boundifp->if_index; + ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; + } else +#endif + { + ipoa.ipoa_boundif = inp->inp_boundifp->if_index; + ipoa.ipoa_flags |= IPOAF_BOUND_IF; + } + } + + if (inp->inp_flags & INP_NO_IFT_CELLULAR) { +#ifdef INET6 + if (isipv6) + ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR; + else +#endif + ipoa.ipoa_flags |= IPOAF_NO_CELLULAR; + } +#ifdef INET6 + if (isipv6) + flags |= IPV6_OUTARGS; + else +#endif + flags |= IP_OUTARGS; /* Copy the cached route and take an extra reference */ - inp_route_copyout(inp, &ro); +#ifdef INET6 + if (isipv6) + in6p_route_copyout(inp, &ro6); + else +#endif + inp_route_copyout(inp, &ro); /* * Data sent (as far as we can tell). @@ -1869,6 +2062,9 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, tp->t_timer[TCPT_DELACK] = 0; tp->t_unacksegs = 0; + /* Increment the count of outstanding send operations */ + inp->inp_sndinprog_cnt++; + /* * If allowed, unlock TCP socket while in IP * but only if the connection is established and @@ -1878,9 +2074,10 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, * - we're not in Fast Recovery mode * - if we're not sending from an upcall. 
*/ - if (tcp_output_unlocked && ((so->so_flags & SOF_UPCALLINUSE) == 0) && + if (tcp_output_unlocked && !so->so_upcallusecount && (tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == 0) && ((tp->t_flags & TF_FASTRECOVERY) == 0)) { + unlocked = TRUE; socket_unlock(so, 0); } @@ -1920,7 +2117,16 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, */ cnt = 0; } - error = ip_output_list(pkt, cnt, opt, &ro, flags, 0, &ipoa); +#ifdef INET6 + if (isipv6) + error = ip6_output_list(pkt, cnt, + inp->in6p_outputopts, &ro6, flags, NULL, NULL, + &ip6oa); + else +#endif + error = ip_output_list(pkt, cnt, opt, &ro, flags, NULL, + &ipoa); + if (chain || error) { /* * If we sent down a chain then we are done since @@ -1937,13 +2143,71 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, if (unlocked) socket_lock(so, 0); - if (ro.ro_rt != NULL && - (outif = ro.ro_rt->rt_ifp->if_index) != inp->inp_last_outif) - inp->inp_last_outif = outif; + /* + * Enter flow controlled state if the connection is established + * and is not in recovery. + * + * A connection will enter suspended state even if it is in + * recovery. + */ + if (((adv->code == FADV_FLOW_CONTROLLED && !IN_FASTRECOVERY(tp)) || + adv->code == FADV_SUSPENDED) && + !(tp->t_flags & TF_CLOSING) && + tp->t_state == TCPS_ESTABLISHED) { + int rc; + rc = inp_set_fc_state(inp, adv->code); + + if (rc == 1) + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, NULL, + int32_t, ((adv->code == FADV_FLOW_CONTROLLED) ? + TCP_CC_FLOW_CONTROL : TCP_CC_SUSPEND)); + } - /* Synchronize cached PCB route */ - inp_route_copyin(inp, &ro); + /* + * When an interface queue gets suspended, some of the + * packets are dropped. Return ENOBUFS, to update the + * pcb state. + */ + if (adv->code == FADV_SUSPENDED) + error = ENOBUFS; + + VERIFY(inp->inp_sndinprog_cnt > 0); + if ( --inp->inp_sndinprog_cnt == 0) + inp->inp_flags &= ~(INP_FC_FEEDBACK); +#ifdef INET6 + if (isipv6) { + if (ro6.ro_rt != NULL && (outif = ro6.ro_rt->rt_ifp) != + inp->in6p_last_outifp) + inp->in6p_last_outifp = outif; + } else +#endif + if (ro.ro_rt != NULL && (outif = ro.ro_rt->rt_ifp) != + inp->inp_last_outifp) + inp->inp_last_outifp = outif; + + if ((inp->inp_flags & INP_NO_IFT_CELLULAR) && outif != NULL && + outif->if_type == IFT_CELLULAR) + soevent(inp->inp_socket, + (SO_FILT_HINT_LOCKED|SO_FILT_HINT_IFDENIED)); + + /* Synchronize cached PCB route & options */ +#ifdef INET6 + if (isipv6) + in6p_route_copyin(inp, &ro6); + else +#endif + inp_route_copyin(inp, &ro); + + if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift == 0 && + tp->t_inpcb->inp_route.ro_rt != NULL) { + /* If we found the route and there is an rtt on it + * reset the retransmit timer + */ + tcp_getrt_rtt(tp, tp->t_inpcb->in6p_route.ro_rt); + tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); + } return (error); } @@ -1970,10 +2234,87 @@ tcp_setpersist(tp) */ TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], t * tcp_backoff[tp->t_rxtshift], - TCPTV_PERSMIN, TCPTV_PERSMAX, - TCP_ADD_REXMTSLOP(tp)); + TCPTV_PERSMIN, TCPTV_PERSMAX, 0); tp->t_timer[TCPT_PERSIST] = OFFSET_FROM_START(tp, tp->t_timer[TCPT_PERSIST]); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } + +/* + * Send as many acks as data coalesced. Every other packet when stretch + * ACK is not enabled. Every 8 packets, if stretch ACK is enabled. 
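+ * For example (illustrative numbers, not from this source): with the 8-packet stretch-ACK spacing noted above and a typical 1448-byte maxseg, a 64 KB coalesced frame covers about 45 segments, so the generated chain carries one duplicated ACK per 8 * 1448 bytes, roughly 6 ACK mbufs instead of 45.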
+ */ +static struct mbuf* +tcp_send_lroacks(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th) +{ + struct mbuf *mnext = NULL, *ack_chain = NULL, *tail = NULL; + int count = 0; + tcp_seq org_ack = ntohl(th->th_ack); + tcp_seq prev_ack = 0; + int tack_offset = 28; /* XXX IPv6 not supported */ + int ack_size = (tp->t_flags & TF_STRETCHACK) ? + (maxseg_unacked * tp->t_maxseg) : (tp->t_maxseg << 1); + int segs_acked = (tp->t_flags & TF_STRETCHACK) ? maxseg_unacked : 2; + struct mbuf *prev_ack_pkt = NULL; + struct socket *so = tp->t_inpcb->inp_socket; + + count = tp->t_lropktlen/tp->t_maxseg; + + prev_ack = (org_ack - tp->t_lropktlen) + ack_size; + if (prev_ack < org_ack) { + ack_chain = m_dup(m, M_DONTWAIT); + if (ack_chain) { + th->th_ack = htonl(prev_ack); + tail = ack_chain; + count -= segs_acked; /* accounts for prev_ack packet */ + count = (count <= segs_acked) ? 0 : count - segs_acked; + tcpstat.tcps_sndacks++; + so_tc_update_stats(m, so, m_get_service_class(m)); + } else { + return NULL; + } + } + else { + tp->t_lropktlen = 0; + return NULL; + } + + prev_ack_pkt = ack_chain; + + while (count > 0) { + if ((prev_ack + ack_size) < org_ack) { + prev_ack += ack_size; + } else { + /* + * The last ACK sent must have the ACK number that TCP + * thinks is the last sent ACK number. + */ + prev_ack = org_ack; + } + mnext = m_dup(prev_ack_pkt, M_DONTWAIT); + if (mnext) { + HTONL(prev_ack); + bcopy(&prev_ack, mtod(prev_ack_pkt, caddr_t) + tack_offset, 4); + NTOHL(prev_ack); + tail->m_nextpkt = mnext; + tail = mnext; + count -= segs_acked; + tcpstat.tcps_sndacks++; + so_tc_update_stats(m, so, m_get_service_class(m)); + if (lrodebug == 5) { + printf("%s: lropktlen = %d count = %d, th_ack = %x \n", + __func__, tp->t_lropktlen, count, + th->th_ack); + } + } else { + if (lrodebug == 5) { + printf("%s: failed to alloc mbuf.\n", __func__); + } + break; + } + prev_ack_pkt = mnext; + } + tp->t_lropktlen = 0; + return ack_chain; +} diff --git a/bsd/netinet/tcp_seq.h b/bsd/netinet/tcp_seq.h index df7bfa4e9..5eb5c3b93 100644 --- a/bsd/netinet/tcp_seq.h +++ b/bsd/netinet/tcp_seq.h @@ -107,7 +107,7 @@ #define tcp_sendseqinit(tp) \ (tp)->snd_una = (tp)->snd_nxt = (tp)->snd_max = (tp)->snd_up = \ - (tp)->snd_recover = (tp)->snd_high = (tp)->iss + (tp)->snd_recover = (tp)->iss #define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * TCP_RETRANSHZ) /* timestamp wrap-around time */ diff --git a/bsd/netinet/tcp_subr.c b/bsd/netinet/tcp_subr.c index 8cf658482..355d4f0d5 100644 --- a/bsd/netinet/tcp_subr.c +++ b/bsd/netinet/tcp_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -142,6 +142,8 @@ #include #include +#include + #define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2)) extern int tcp_lq_overflow; @@ -162,6 +164,8 @@ SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, "Default TCP Maximum Segment Size for IPv6"); #endif +extern int tcp_do_autorcvbuf; + /* * Minimum MSS we accept and use. 
This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds @@ -242,23 +246,32 @@ extern struct tcp_cc_algo tcp_cc_ledbat; SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_sockets, CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_cc_ledbat.num_sockets, 0, "Number of sockets using background transport"); +__private_extern__ int tcp_win_scale = 3; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_win_scale, 0, "Window scaling factor"); + static void tcp_cleartaocache(void); static void tcp_notify(struct inpcb *, int); static void tcp_cc_init(void); struct zone *sack_hole_zone; struct zone *tcp_reass_zone; +struct zone *tcp_bwmeas_zone; /* The array containing pointers to currently implemented TCP CC algorithms */ struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT]; -extern unsigned int total_mb_cnt; -extern unsigned int total_cl_cnt; -extern int sbspace_factor; -extern int tcp_sockthreshold; extern int slowlink_wsize; /* window correction for slow links */ extern int path_mtu_discovery; +extern u_int32_t tcp_autorcvbuf_max; +extern u_int32_t tcp_autorcvbuf_inc_shift; +static void tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb); + +#define TCP_BWMEAS_BURST_MINSIZE 6 +#define TCP_BWMEAS_BURST_MAXSIZE 25 + +static uint32_t bwmeas_elm_size; /* * Target size of TCP PCB hash tables. Must be a power of two. @@ -300,6 +313,7 @@ static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *); static lck_attr_t *tcp_uptime_mtx_attr = NULL; /* mutex attributes */ static lck_grp_t *tcp_uptime_mtx_grp = NULL; /* mutex group definition */ static lck_grp_attr_t *tcp_uptime_mtx_grp_attr = NULL; /* mutex group attributes */ +int tcp_notsent_lowat_check(struct socket *so); int get_inpcb_str_size(void) @@ -382,14 +396,25 @@ tcp_init() zone_change(tcp_reass_zone, Z_CALLERACCT, FALSE); zone_change(tcp_reass_zone, Z_EXPAND, TRUE); + bwmeas_elm_size = P2ROUNDUP(sizeof(struct bwmeas), sizeof(u_int64_t)); + tcp_bwmeas_zone = zinit(bwmeas_elm_size, (100 * bwmeas_elm_size), 0, "tcp_bwmeas_zone"); + if (tcp_bwmeas_zone == NULL) { + panic("%s: failed allocating tcp_bwmeas_zone", __func__); + /* NOTREACHED */ + } + zone_change(tcp_bwmeas_zone, Z_CALLERACCT, FALSE); + zone_change(tcp_bwmeas_zone, Z_EXPAND, TRUE); + #if INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ - if (max_protohdr < TCP_MINPROTOHDR) - max_protohdr = TCP_MINPROTOHDR; - if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) + if (max_protohdr < TCP_MINPROTOHDR) { + _max_protohdr = TCP_MINPROTOHDR; + _max_protohdr = max_protohdr; /* round it up */ + } + if (max_linkhdr + max_protohdr > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR @@ -440,6 +465,9 @@ tcp_init() /* Initialize TCP congestion control algorithms list */ tcp_cc_init(); + + /* Initialize TCP LRO data structures */ + tcp_lro_init(); } /* @@ -563,7 +591,7 @@ tcp_respond( struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ - unsigned int outif; + struct ifnet *outif; #if INET6 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; @@ -606,13 +634,13 @@ tcp_respond( bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); - nth = (struct tcphdr *)(ip6 + 1); + nth = (struct tcphdr *)(void *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); - nth = (struct tcphdr *)(ip + 1); + nth = (struct tcphdr *)(void *)(ip 
+ 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; @@ -625,13 +653,17 @@ tcp_respond( #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #if INET6 if (isipv6) { + /* Expect 32-bit aligned IP on strict-align platforms */ + IP6_HDR_STRICT_ALIGNMENT_CHECK(ip6); xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); - nth = (struct tcphdr *)(ip6 + 1); + nth = (struct tcphdr *)(void *)(ip6 + 1); } else #endif /* INET6 */ { + /* Expect 32-bit aligned IP on strict-align platforms */ + IP_HDR_STRICT_ALIGNMENT_CHECK(ip); xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); - nth = (struct tcphdr *)(ip + 1); + nth = (struct tcphdr *)(void *)(ip + 1); } if (th != nth) { /* @@ -717,12 +749,29 @@ tcp_respond( } #endif - if (tp != NULL) - set_packet_tclass(m, tp->t_inpcb->inp_socket, MBUF_TC_UNSPEC, isipv6); + if (tp != NULL) { + u_int32_t svc_flags = 0; + if (isipv6) { + svc_flags |= PKT_SCF_IPV6; + } + set_packet_service_class(m, tp->t_inpcb->inp_socket, + MBUF_SC_UNSPEC, svc_flags); + + /* Embed flowhash and flow control flags */ + m->m_pkthdr.m_flowhash = tp->t_inpcb->inp_flowhash; + m->m_pkthdr.m_fhflags |= + (PF_TAG_TCP | PF_TAG_FLOWHASH | PF_TAG_FLOWADV); + } #if INET6 if (isipv6) { - struct ip6_out_args ip6oa = { ifscope, nocell }; + struct ip6_out_args ip6oa = { ifscope, { 0 }, + IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR }; + + if (ifscope != IFSCOPE_NONE) + ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; + if (nocell) + ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR; (void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL, NULL, &ip6oa); @@ -730,15 +779,21 @@ tcp_respond( if (ro6 == &sro6) { rtfree(ro6->ro_rt); ro6->ro_rt = NULL; - } else if ((outif = ro6->ro_rt->rt_ifp->if_index) != - tp->t_inpcb->in6p_last_outif) { - tp->t_inpcb->in6p_last_outif = outif; + } else if ((outif = ro6->ro_rt->rt_ifp) != + tp->t_inpcb->in6p_last_outifp) { + tp->t_inpcb->in6p_last_outifp = outif; } } } else #endif /* INET6 */ { - struct ip_out_args ipoa = { ifscope, nocell }; + struct ip_out_args ipoa = { ifscope, { 0 }, + IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR }; + + if (ifscope != IFSCOPE_NONE) + ipoa.ipoa_flags |= IPOAF_BOUND_IF; + if (nocell) + ipoa.ipoa_flags |= IPOAF_NO_CELLULAR; if (ro != &sro) { /* Copy the cached route and take an extra reference */ @@ -751,9 +806,9 @@ tcp_respond( if (ro != &sro) { if (sro.ro_rt != NULL && - (outif = sro.ro_rt->rt_ifp->if_index) != - tp->t_inpcb->inp_last_outif) - tp->t_inpcb->inp_last_outif = outif; + (outif = sro.ro_rt->rt_ifp) != + tp->t_inpcb->inp_last_outifp) + tp->t_inpcb->inp_last_outifp = outif; /* Synchronize cached PCB route */ inp_route_copyin(tp->t_inpcb, &sro); } else if (sro.ro_rt != NULL) { @@ -782,11 +837,11 @@ tcp_newtcpcb(inp) calculate_tcp_clock(); if (so->cached_in_sock_layer == 0) { - it = (struct inp_tp *)inp; + it = (struct inp_tp *)(void *)inp; tp = &it->tcb; } else - tp = (struct tcpcb *) inp->inp_saved_ppcb; + tp = (struct tcpcb *)(void *)inp->inp_saved_ppcb; bzero((char *) tp, sizeof(struct tcpcb)); LIST_INIT(&tp->t_segq); @@ -820,11 +875,9 @@ tcp_newtcpcb(inp) } tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; - tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = tcp_now; - tp->t_bw_rtttime = 0; tp->tentry.timer_start = tcp_now; tp->t_persist_timeout = tcp_max_persist_timeout; tp->t_persist_stop = 0; @@ -852,7 +905,7 @@ tcp_drop(tp, errno) struct socket *so = tp->t_inpcb->inp_socket; #if CONFIG_DTRACE struct inpcb *inp = 
tp->t_inpcb; -#endif /* CONFIG_DTRACE */ +#endif if (TCPS_HAVERCVDSYN(tp->t_state)) { DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, @@ -868,6 +921,39 @@ tcp_drop(tp, errno) return (tcp_close(tp)); } +void +tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt) +{ + u_int32_t rtt = rt->rt_rmx.rmx_rtt; + int isnetlocal = (tp->t_flags & TF_LOCAL); + + if (rtt != 0) { + /* + * XXX the lock bit for RTT indicates that the value + * is also a minimum value; this is subject to time. + */ + if (rt->rt_rmx.rmx_locks & RTV_RTT) + tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ); + else + tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN; + tp->t_srtt = rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE)); + tcpstat.tcps_usedrtt++; + if (rt->rt_rmx.rmx_rttvar) { + tp->t_rttvar = rt->rt_rmx.rmx_rttvar / + (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE)); + tcpstat.tcps_usedrttvar++; + } else { + /* default variation is +- 1 rtt */ + tp->t_rttvar = + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + } + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX, + TCP_ADD_REXMTSLOP(tp)); + } +} + /* * Close a TCP control block: * discard all space held by the tcp @@ -904,7 +990,8 @@ tcp_close(tp) * point both flags should be cleared and we can proceed further * with the cleanup. */ - if (tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) { + if ((tp->t_flags & TF_CLOSING) || + inp->inp_sndinprog_cnt > 0) { tp->t_flags |= TF_CLOSING; return (NULL); } @@ -941,14 +1028,14 @@ tcp_close(tp) if (rt == NULL) goto no_valid_rt; - sin6 = (struct sockaddr_in6 *)rt_key(rt); + sin6 = (struct sockaddr_in6 *)(void *)rt_key(rt); if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) goto no_valid_rt; } else #endif /* INET6 */ if (rt == NULL || !(rt->rt_flags & RTF_UP) || - ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr == + ((struct sockaddr_in *)(void *)rt_key(rt))->sin_addr.s_addr == INADDR_ANY || rt->generation_id != route_generation) { if (tp->t_state >= TCPS_CLOSE_WAIT) { DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, @@ -1051,6 +1138,9 @@ no_valid_rt: (void) tcp_freeq(tp); tcp_free_sackholes(tp); + if (tp->t_bwmeas != NULL) { + tcp_bwmeas_free(tp); + } /* Free the packet list */ if (tp->t_pktlist_head != NULL) @@ -1066,6 +1156,16 @@ no_valid_rt: */ sodisconnectwakeup(so); + /* + * Clean up any LRO state + */ + if (tp->t_flagsext & TF_LRO_OFFLOADED) { + tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr, + inp->inp_lport, + inp->inp_fport); + tp->t_flagsext &= ~TF_LRO_OFFLOADED; + } + #if INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) in6_pcbdetach(inp); @@ -1178,6 +1278,30 @@ tcp_notify(inp, error) #endif } +struct bwmeas* +tcp_bwmeas_alloc(struct tcpcb *tp) +{ + struct bwmeas *elm; + elm = zalloc(tcp_bwmeas_zone); + if (elm == NULL) + return(elm); + + bzero(elm, bwmeas_elm_size); + elm->bw_minsizepkts = TCP_BWMEAS_BURST_MINSIZE; + elm->bw_maxsizepkts = TCP_BWMEAS_BURST_MAXSIZE; + elm->bw_minsize = elm->bw_minsizepkts * tp->t_maxseg; + elm->bw_maxsize = elm->bw_maxsizepkts * tp->t_maxseg; + return(elm); +} + +void +tcp_bwmeas_free(struct tcpcb* tp) +{ + zfree(tcp_bwmeas_zone, tp->t_bwmeas); + tp->t_bwmeas = NULL; + tp->t_flagsext &= ~(TF_MEASURESNDBW); +} + /* * tcpcb_to_otcpcb copies specific bits of a tcpcb to a otcpcb format. * The otcpcb data structure is passed to user space and must not change. 
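tcp_bwmeas_alloc() above hands out fixed-size elements from tcp_bwmeas_zone, which tcp_init() earlier in this patch sized with P2ROUNDUP so that every element falls on an 8-byte boundary. A small sketch of that rounding (the macro body here is a portable equivalent, assumed to behave like the one xnu's headers define):

#include <stdio.h>
#include <stdint.h>

/* round x up to the next multiple of align; align must be a power of two */
#define P2ROUNDUP(x, align) \
	((((uintptr_t)(x)) + ((uintptr_t)(align) - 1)) & ~((uintptr_t)(align) - 1))

struct bwmeas_like {	/* stand-in with the same eight 32-bit fields */
	uint32_t f[8];
};

int
main(void)
{
	/* 32 stays 32 here; a 36-byte struct would round up to 40 */
	printf("%lu\n",
	    (unsigned long)P2ROUNDUP(sizeof (struct bwmeas_like),
	    sizeof (uint64_t)));
	return (0);
}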
@@ -1334,7 +1458,7 @@ tcp_pcblist SYSCTL_HANDLER_ARGS inpcb_to_compat(inp, &xt.xt_inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb != NULL) { - tcpcb_to_otcpcb((struct tcpcb *)inp_ppcb, + tcpcb_to_otcpcb((struct tcpcb *)(void *)inp_ppcb, &xt.xt_tp); } else { bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); @@ -1566,6 +1690,18 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n, CTLFLAG_RD | CTLFLAG_LOCKED, 0, tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections"); +__private_extern__ void +tcp_get_ports_used(unsigned int ifindex, uint8_t *bitfield) +{ + inpcb_get_ports_used(ifindex, bitfield, &tcbinfo); +} + +__private_extern__ uint32_t +tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags) +{ + return inpcb_count_opportunistic(ifindex, &tcbinfo, flags); +} + void tcp_ctlinput(cmd, sa, vip) int cmd; @@ -1574,16 +1710,13 @@ tcp_ctlinput(cmd, sa, vip) { tcp_seq icmp_tcp_seq; struct ip *ip = vip; - struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; - - void (*notify)(struct inpcb *, int) = tcp_notify; - struct icmp *icp; + void (*notify)(struct inpcb *, int) = tcp_notify; - faddr = ((struct sockaddr_in *)sa)->sin_addr; + faddr = ((struct sockaddr_in *)(void *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; @@ -1603,19 +1736,22 @@ tcp_ctlinput(cmd, sa, vip) else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip) { - icp = (struct icmp *)((caddr_t)ip - - offsetof(struct icmp, icmp_ip)); - th = (struct tcphdr *)((caddr_t)ip - + (IP_VHL_HL(ip->ip_vhl) << 2)); - inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, - ip->ip_src, th->th_sport, 0, NULL); + struct tcphdr th; + struct icmp *icp; + + icp = (struct icmp *)(void *) + ((caddr_t)ip - offsetof(struct icmp, icmp_ip)); + bcopy(((caddr_t)ip + (IP_VHL_HL(ip->ip_vhl) << 2)), + &th, sizeof (th)); + inp = in_pcblookup_hash(&tcbinfo, faddr, th.th_dport, + ip->ip_src, th.th_sport, 0, NULL); if (inp != NULL && inp->inp_socket != NULL) { tcp_lock(inp->inp_socket, 1, 0); if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { tcp_unlock(inp->inp_socket, 1, 0); return; } - icmp_tcp_seq = htonl(th->th_seq); + icmp_tcp_seq = htonl(th.th_seq); tp = intotcpcb(inp); if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && SEQ_LT(icmp_tcp_seq, tp->snd_max)) { @@ -2001,7 +2137,7 @@ tcp_rtlookup(inp, input_ifscope) ro->ro_dst.sa_family = AF_INET; ro->ro_dst.sa_len = sizeof(struct sockaddr_in); - ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = + ((struct sockaddr_in *)(void *)&ro->ro_dst)->sin_addr = inp->inp_faddr; /* @@ -2012,7 +2148,7 @@ * input_ifscope is IFSCOPE_NONE). */ ifscope = (inp->inp_flags & INP_BOUND_IF) ? - inp->inp_boundif : input_ifscope; + inp->inp_boundifp->if_index : input_ifscope; if (rt != NULL) RT_UNLOCK(rt); @@ -2095,7 +2231,7 @@ tcp_rtlookup6(inp, input_ifscope) * input_ifscope is IFSCOPE_NONE). */ ifscope = (inp->inp_flags & INP_BOUND_IF) ? - inp->inp_boundif : input_ifscope; + inp->inp_boundifp->if_index : input_ifscope; if (rt != NULL) RT_UNLOCK(rt); @@ -2175,7 +2311,7 @@ ipsec_hdrsiz_tcp(tp) #if INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); - th = (struct tcphdr *)(ip6 + 1); + th = (struct tcphdr *)(void *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); tcp_fillheaders(tp, ip6, th); @@ -2328,11 +2464,31 @@ tcp_getlock( } } +/* Determine if we can grow the receive socket buffer to avoid sending + * a zero window update to the peer. 
We allow even socket buffers that + * have fixed size (set by the application) to grow if the resource + * constraints are met. They will also be trimmed after the application + * reads data. + */ +static void +tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb) { + u_int32_t rcvbufinc = tp->t_maxseg << tcp_autorcvbuf_inc_shift; + if (tcp_do_autorcvbuf == 1 && + tcp_cansbgrow(sb) && + (tp->t_flags & TF_SLOWLINK) == 0 && + (sb->sb_hiwat - sb->sb_cc) < rcvbufinc && + (sb->sb_hiwat < tcp_autorcvbuf_max)) { + sbreserve(sb, (sb->sb_hiwat + rcvbufinc)); + } +} + int32_t tcp_sbspace(struct tcpcb *tp) { struct sockbuf *sb = &tp->t_inpcb->inp_socket->so_rcv; - int32_t space, newspace; + int32_t space; + + tcp_sbrcv_grow_rwin(tp, sb); space = ((int32_t) imin((sb->sb_hiwat - sb->sb_cc), (sb->sb_mbmax - sb->sb_mbcnt))); @@ -2352,21 +2508,6 @@ tcp_sbspace(struct tcpcb *tp) if (((tp->t_flags & TF_SLOWLINK) != 0) && slowlink_wsize > 0 ) return imin(space, slowlink_wsize); - /* - * Check for ressources constraints before over-ajusting the amount of space we can - * advertise in the TCP window size updates. - */ - - if (sbspace_factor && (tp->t_inpcb->inp_pcbinfo->ipi_count < tcp_sockthreshold) && - (total_mb_cnt / 8) < (mbstat.m_clusters / sbspace_factor)) { - if (space < (int32_t)(sb->sb_maxused - sb->sb_cc)) {/* make sure we don't constrain the window if we have enough ressources */ - space = (int32_t) imax((sb->sb_maxused - sb->sb_cc), tp->rcv_maxbyps); - } - newspace = (int32_t) imax(((int32_t)sb->sb_maxused - sb->sb_cc), (int32_t)tp->rcv_maxbyps); - - if (newspace > space) - space = newspace; - } return space; } /* @@ -2451,4 +2592,57 @@ calculate_tcp_clock() return; } +/* Compute receive window scaling that we are going to request + * for this connection based on sb_hiwat. Try to leave some + * room to potentially increase the window size up to a maximum + * defined by the constant tcp_autorcvbuf_max. + */ +void +tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so) { + u_int32_t maxsockbufsize; + + tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale); + maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ? + so->so_rcv.sb_hiwat : tcp_autorcvbuf_max; + + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + (TCP_MAXWIN << tp->request_r_scale) < maxsockbufsize) + tp->request_r_scale++; + tp->request_r_scale = min(tp->request_r_scale, TCP_MAX_WINSHIFT); + +} + int tcp_notsent_lowat_check(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = NULL; int notsent = 0; if (inp != NULL) { tp = intotcpcb(inp); } + notsent = so->so_snd.sb_cc - + (tp->snd_nxt - tp->snd_una); + + /* When we send a FIN or SYN, not_sent can be negative. + * In that case we also need to send a write event to the + * process if it is waiting. In the FIN case, it will + * get an error from send because cantsendmore will be set. + */ + if (notsent <= tp->t_notsent_lowat) { + return(1); + } + + /* When Nagle's algorithm is not disabled, it is better + * to wake up the client until there is at least one + * maxseg of data to write. + */ + if ((tp->t_flags & TF_NODELAY) == 0 && + notsent > 0 && notsent < tp->t_maxseg) { + return(1); + } + return(0); +} + + /* DSEP Review Done pl-20051213-v02 @3253,@3391,@3400 */ diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c index a8369ca71..a8df38947 100644 --- a/bsd/netinet/tcp_timer.c +++ b/bsd/netinet/tcp_timer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
+ * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -78,6 +78,7 @@ #include /* before tcp_seq.h, for tcp_random18() */ #include +#include #include #include @@ -536,7 +537,7 @@ tcp_timers(tp, timer) int timer; { register int rexmt; - struct socket *so_tmp; + struct socket *so; struct tcptemp *t_template; int optlen = 0; int idle_time = 0; @@ -549,7 +550,7 @@ tcp_timers(tp, timer) int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0; #endif /* INET6 */ - so_tmp = tp->t_inpcb->inp_socket; + so = tp->t_inpcb->inp_socket; idle_time = tcp_now - tp->t_rcvtime; switch (timer) { @@ -581,7 +582,6 @@ tcp_timers(tp, timer) * to a longer retransmit interval and retransmit one segment. */ case TCPT_REXMT: - tcp_free_sackholes(tp); /* Drop a connection in the retransmit timer * 1. If we have retransmitted more than TCP_MAXRXTSHIFT times * 2. If the time spent in this retransmission episode is more than @@ -602,9 +602,12 @@ tcp_timers(tp, timer) tcpstat.tcps_timeoutdrop++; } tp->t_rxtshift = TCP_MAXRXTSHIFT; + postevent(so, 0, EV_TIMEOUT); + soevent(so, + (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT)); tp = tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); - postevent(so_tmp, 0, EV_TIMEOUT); + break; } @@ -633,6 +636,7 @@ tcp_timers(tp, timer) tp->rxt_start = tcp_now; } tcpstat.tcps_rexmttimeo++; + if (tp->t_state == TCPS_SYN_SENT) rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; else @@ -642,12 +646,17 @@ tcp_timers(tp, timer) TCP_ADD_REXMTSLOP(tp)); tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); + if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb)) + goto fc_output; + + tcp_free_sackholes(tp); /* * Check for potential Path MTU Discovery Black Hole */ if (tcp_pmtud_black_hole_detect && (tp->t_state == TCPS_ESTABLISHED)) { - if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT)) == (TF_PMTUD|TF_MAXSEGSNT)) && (tp->t_rxtshift == 2)) { + if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT)) == (TF_PMTUD|TF_MAXSEGSNT)) && + (tp->t_rxtshift == 2)) { /* * Enter Path MTU Black-hole Detection mechanism: * - Disable Path MTU Discovery (IP "DF" bit). @@ -708,6 +717,7 @@ tcp_timers(tp, timer) if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres)) tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC); + /* * If losing, let the lower level know and try for * a better route. Also, if we backed off this far, @@ -747,6 +757,13 @@ tcp_timers(tp, timer) tp->t_dupacks = 0; EXIT_FASTRECOVERY(tp); + /* CWR notifications are to be sent on new data right after + * RTOs, Fast Retransmits and ECE notification receipts. 
+ */ + if ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON) { + tp->ecn_flags |= TE_SENDCWR; + } +fc_output: DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, struct tcpcb *, tp, struct tcphdr *, NULL, int32_t, TCP_CC_REXMT_TIMEOUT); @@ -774,11 +791,13 @@ tcp_timers(tp, timer) if ((tp->t_rxtshift == TCP_MAXRXTSHIFT && (idle_time >= tcp_maxpersistidle || idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) || - ((tp->t_persist_stop != 0) && (tp->t_persist_stop <= tcp_now))) { + ((tp->t_persist_stop != 0) && + TSTMP_LEQ(tp->t_persist_stop, tcp_now))) { tcpstat.tcps_persistdrop++; - so_tmp = tp->t_inpcb->inp_socket; + postevent(so, 0, EV_TIMEOUT); + soevent(so, + (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT)); tp = tcp_drop(tp, ETIMEDOUT); - postevent(so_tmp, 0, EV_TIMEOUT); break; } tcp_setpersist(tp); @@ -818,7 +837,7 @@ tcp_timers(tp, timer) unsigned int ifscope, nocell = 0; if (tp->t_inpcb->inp_flags & INP_BOUND_IF) - ifscope = tp->t_inpcb->inp_boundif; + ifscope = tp->t_inpcb->inp_boundifp->if_index; else ifscope = IFSCOPE_NONE; @@ -851,6 +870,13 @@ tcp_timers(tp, timer) if ((tp->t_flags & TF_STRETCHACK) != 0) tcp_reset_stretch_ack(tp); + /* If we are measuring inter-packet arrival jitter for + * throttling a connection, this delayed ack might be + * the reason for accumulating some jitter. So let's + * restart the measurement. + */ + CLEAR_IAJ_STATE(tp); + tcpstat.tcps_delack++; (void) tcp_output(tp); } @@ -863,8 +889,10 @@ tcp_timers(tp, timer) #endif dropit: tcpstat.tcps_keepdrops++; + postevent(so, 0, EV_TIMEOUT); + soevent(so, + (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT)); tp = tcp_drop(tp, ETIMEDOUT); - postevent(so_tmp, 0, EV_TIMEOUT); break; } return (tp); diff --git a/bsd/netinet/tcp_timer.h b/bsd/netinet/tcp_timer.h index df1162053..213e87a39 100644 --- a/bsd/netinet/tcp_timer.h +++ b/bsd/netinet/tcp_timer.h @@ -145,11 +145,12 @@ #define TCPTV_UNACKWIN ( TCP_RETRANSHZ/10 ) /* Window for counting rcv bytes to see if ack-stretching can start (default 100 ms) */ #define TCPTV_MAXRCVIDLE (TCP_RETRANSHZ/5 ) /* Receiver idle time, avoid ack-stretching after that*/ +#define TCPTV_RCVBUFIDLE (TCP_RETRANSHZ/2) /* Receiver idle time, for rcv socket buffer resizing */ /* No ack stretching during slow-start, until we see some packets. * By the time the receiver gets 512 packets, the senders cwnd - * should open by a few hundred packets considering the progression - * during slow-start. + * should open by a few hundred packets considering the + * slow-start progression. */ #define TCP_RCV_SS_PKTCOUNT 512 @@ -232,7 +233,7 @@ struct tcptimerlist { #define TCPTV_REXMTSLOP ( TCP_RETRANSHZ/5 ) /* rexmt slop allowed (200 ms) */ /* macro to decide when retransmit slop (described above) should be added */ -#define TCP_ADD_REXMTSLOP(tp) ((tp->t_flags & TF_LOCAL) != 0 || tp->t_state >= TCPS_ESTABLISHED) +#define TCP_ADD_REXMTSLOP(tp) (tp->t_state >= TCPS_ESTABLISHED) #define TCPT_RANGESET(tv, value, tvmin, tvmax, addslop) do { \ (tv) = ((addslop) ? tcp_rexmt_slop : 0) + (value); \ @@ -246,6 +247,13 @@ struct tcptimerlist { (tp->t_keepidle && (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) ? \ tp->t_keepidle : tcp_keepidle) +/* Since we did not add rexmt slop for local connections, we should add + * it to the idle timeout. Otherwise local connections will reach the idle + * state too quickly. + */ +#define TCP_IDLETIMEOUT(tp) \ + (((TCP_ADD_REXMTSLOP(tp)) ? 
0 : tcp_rexmt_slop) + tp->t_rxtcur) + extern int tcp_keepinit; /* time to establish connection */ extern int tcp_keepidle; /* time before keepalive probes begin */ extern int tcp_keepintvl; /* time between keepalive probes */ diff --git a/bsd/netinet/tcp_usrreq.c b/bsd/netinet/tcp_usrreq.c index d4fddb517..19405c584 100644 --- a/bsd/netinet/tcp_usrreq.c +++ b/bsd/netinet/tcp_usrreq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -126,10 +126,6 @@ static struct tcpcb * static struct tcpcb * tcp_usrclosed(struct tcpcb *); -__private_extern__ int tcp_win_scale = 3; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_win_scale, 0, "Window scaling factor"); - static u_int32_t tcps_in_sw_cksum; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, in_sw_cksum, CTLFLAG_RD | CTLFLAG_LOCKED, &tcps_in_sw_cksum, 0, @@ -150,6 +146,10 @@ SYSCTL_QUAD(_net_inet_tcp, OID_AUTO, out_sw_cksum_bytes, CTLFLAG_RD | CTLFLAG_LO &tcps_out_sw_cksum_bytes, "Amount of transmitted data checksummed in software"); +extern uint32_t tcp_autorcvbuf_max; + +extern void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb); + #if TCPDEBUG #define TCPDEBUG0 int ostate = 0 #define TCPDEBUG1() ostate = tp ? tp->t_state : 0 @@ -161,15 +161,6 @@ SYSCTL_QUAD(_net_inet_tcp, OID_AUTO, out_sw_cksum_bytes, CTLFLAG_RD | CTLFLAG_LO #define TCPDEBUG2(req) #endif -#if CONFIG_USESOCKTHRESHOLD -__private_extern__ unsigned int tcp_sockthreshold = 64; -#else -__private_extern__ unsigned int tcp_sockthreshold = 0; -#endif -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sockthreshold, CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_sockthreshold , 0, "TCP Socket size increased if less than threshold"); - - SYSCTL_PROC(_net_inet_tcp, OID_AUTO, info, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0 , 0, tcp_sysctl_info, "S", "TCP info per tuple"); @@ -288,7 +279,7 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p) * Must check for multicast addresses and disallow binding * to them. */ - sinp = (struct sockaddr_in *)nam; + sinp = (struct sockaddr_in *)(void *)nam; if (sinp->sin_family == AF_INET && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { error = EAFNOSUPPORT; @@ -321,7 +312,7 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p) * Must check for multicast addresses and disallow binding * to them. */ - sin6p = (struct sockaddr_in6 *)nam; + sin6p = (struct sockaddr_in6 *)(void *)nam; if (sin6p->sin6_family == AF_INET6 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { error = EAFNOSUPPORT; @@ -436,7 +427,7 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) /* * Must disallow TCP ``connections'' to multicast addresses. */ - sinp = (struct sockaddr_in *)nam; + sinp = (struct sockaddr_in *)(void *)nam; if (sinp->sin_family == AF_INET && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { error = EAFNOSUPPORT; @@ -469,7 +460,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) /* * Must disallow TCP ``connections'' to multicast addresses. 
*/ - sin6p = (struct sockaddr_in6 *)nam; + sin6p = (struct sockaddr_in6 *)(void *)nam; if (sin6p->sin6_family == AF_INET6 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { error = EAFNOSUPPORT; @@ -633,6 +624,8 @@ tcp_usr_rcvd(struct socket *so, __unused int flags) /* In case we got disconnected from the peer */ if (tp == 0) goto out; + tcp_sbrcv_trim(tp, &so->so_rcv); + tcp_output(tp); COMMON_END(PRU_RCVD); } @@ -728,7 +721,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, error = tcp6_connect(tp, nam, p); else #endif /* INET6 */ - error = tcp_connect(tp, nam, p); + error = tcp_connect(tp, nam, p); if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; @@ -899,12 +892,12 @@ tcp_connect(tp, nam, p) struct inpcb *inp = tp->t_inpcb, *oinp; struct socket *so = inp->inp_socket; struct tcpcb *otp; - struct sockaddr_in *sin = (struct sockaddr_in *)nam; + struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam; struct sockaddr_in ifaddr; struct rmxp_tao *taop; struct rmxp_tao tao_noncached; int error; - unsigned int outif = 0; + struct ifnet *outif = NULL; if (inp->inp_lport == 0) { error = in_pcbbind(inp, (struct sockaddr *)0, p); @@ -965,30 +958,17 @@ skip_oinp: } if (inp->inp_laddr.s_addr == INADDR_ANY) { inp->inp_laddr = ifaddr.sin_addr; - inp->inp_last_outif = outif; + inp->inp_last_outifp = outif; } inp->inp_faddr = sin->sin_addr; inp->inp_fport = sin->sin_port; in_pcbrehash(inp); lck_rw_done(inp->inp_pcbinfo->mtx); - /* Compute window scaling to requesti according to sb_hiwat - * or leave us some room to increase potentially increase the window size depending - * on the default win scale - */ - while (tp->request_r_scale < TCP_MAX_WINSHIFT && - (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) - tp->request_r_scale++; + if (inp->inp_flowhash == 0) + inp->inp_flowhash = inp_calc_flowhash(inp); - /* - * Inflate window size only if no setsockopt was performed on the recv sockbuf and - * if we're not over our number of active pcbs. - */ - - if (((so->so_rcv.sb_flags & SB_USRSIZE) == 0) && (inp->inp_pcbinfo->ipi_count < tcp_sockthreshold)) { - tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale); - so->so_rcv.sb_hiwat = min(TCP_MAXWIN << tp->request_r_scale, (sb_max / (MSIZE+MCLBYTES)) * MCLBYTES); - } + tcp_set_max_rwinscale(tp, so); soisconnecting(so); tcpstat.tcps_connattempt++; @@ -1031,27 +1011,31 @@ tcp6_connect(tp, nam, p) struct inpcb *inp = tp->t_inpcb, *oinp; struct socket *so = inp->inp_socket; struct tcpcb *otp; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)(void *)nam; struct in6_addr addr6; struct rmxp_tao *taop; struct rmxp_tao tao_noncached; - int error; - unsigned int outif = 0; + int error = 0; + struct ifnet *outif = NULL; if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, p); if (error) - return error; + goto done; } /* * Cannot simply call in_pcbconnect, because there might be an * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. + * + * in6_pcbladdr() might return an ifp with its reference held + * even in the error case, so make sure that it's released + * whenever it's non-NULL. 
*/ error = in6_pcbladdr(inp, nam, &addr6, &outif); if (error) - return error; + goto done; tcp_unlock(inp->inp_socket, 0, 0); oinp = in6_pcblookup_hash(inp->inp_pcbinfo, &sin6->sin6_addr, sin6->sin6_port, @@ -1064,10 +1048,12 @@ if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && otp->t_state == TCPS_TIME_WAIT && ((int)(tcp_now - otp->t_starttime)) < tcp_msl && - (otp->t_flags & TF_RCVD_CC)) + (otp->t_flags & TF_RCVD_CC)) { otp = tcp_close(otp); - else - return EADDRINUSE; + } else { + error = EADDRINUSE; + goto done; + } } if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) { /*lock inversion issue, mostly with udp multicast packets */ @@ -1077,7 +1063,7 @@ } if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { inp->in6p_laddr = addr6; - inp->in6p_last_outif = outif; + inp->in6p_last_outifp = outif; /* no reference needed */ } inp->in6p_faddr = sin6->sin6_addr; inp->inp_fport = sin6->sin6_port; @@ -1086,10 +1072,10 @@ in_pcbrehash(inp); lck_rw_done(inp->inp_pcbinfo->mtx); - /* Compute window scaling to request. */ - while (tp->request_r_scale < TCP_MAX_WINSHIFT && - (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) - tp->request_r_scale++; + if (inp->inp_flowhash == 0) + inp->inp_flowhash = inp_calc_flowhash(inp); + + tcp_set_max_rwinscale(tp, so); soisconnecting(so); tcpstat.tcps_connattempt++; @@ -1119,7 +1105,11 @@ tp->t_flags |= TF_SENDCCNEW; } - return 0; +done: + if (outif != NULL) + ifnet_release(outif); + + return (error); } #endif /* INET6 */ @@ -1129,11 +1119,13 @@ tcp6_connect(tp, nam, p) __private_extern__ void tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) { + struct inpcb *inp = tp->t_inpcb; + bzero(ti, sizeof(*ti)); ti->tcpi_state = tp->t_state; - if (tp->t_state > TCPS_LISTEN) { + if (tp->t_state > TCPS_LISTEN) { if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; if (tp->t_flags & TF_SACK_PERMIT) @@ -1143,21 +1135,48 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_snd_wscale = tp->snd_scale; ti->tcpi_rcv_wscale = tp->rcv_scale; } - + + /* Are we in a retransmission episode? */ + if (tp->snd_max != tp->snd_nxt) + ti->tcpi_flags |= TCPI_FLAG_LOSSRECOVERY; + else + ti->tcpi_flags &= ~TCPI_FLAG_LOSSRECOVERY; + + ti->tcpi_rto = tp->t_timer[TCPT_REXMT] ? tp->t_rxtcur : 0; ti->tcpi_snd_mss = tp->t_maxseg; ti->tcpi_rcv_mss = tp->t_maxseg; + ti->tcpi_rttcur = tp->t_rttcur; + ti->tcpi_srtt = tp->t_srtt >> TCP_RTT_SHIFT; + ti->tcpi_rttvar = tp->t_rttvar >> TCP_RTTVAR_SHIFT; + ti->tcpi_snd_ssthresh = tp->snd_ssthresh; ti->tcpi_snd_cwnd = tp->snd_cwnd; + ti->tcpi_snd_sbbytes = tp->t_inpcb->inp_socket->so_snd.sb_cc; ti->tcpi_rcv_space = tp->rcv_wnd; ti->tcpi_snd_wnd = tp->snd_wnd; - ti->tcpi_snd_bwnd = tp->snd_bwnd; ti->tcpi_snd_nxt = tp->snd_nxt; ti->tcpi_rcv_nxt = tp->rcv_nxt; + + /* convert bytes/msec to bits/sec */ + if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 && + tp->t_bwmeas != NULL) { + ti->tcpi_snd_bw = (tp->t_bwmeas->bw_sndbw * 8000); + } - ti->tcpi_last_outif = tp->t_inpcb->inp_last_outif; + ti->tcpi_last_outif = (tp->t_inpcb->inp_last_outifp == NULL) ? 
0 : + tp->t_inpcb->inp_last_outifp->if_index; + + //atomic_get_64(ti->tcpi_txbytes, &inp->inp_stat->txbytes); + ti->tcpi_txbytes = inp->inp_stat->txbytes; + ti->tcpi_txretransmitbytes = tp->t_stat.txretransmitbytes; + ti->tcpi_txunacked = tp->snd_max - tp->snd_una; + + //atomic_get_64(ti->tcpi_rxbytes, &inp->inp_stat->rxbytes); + ti->tcpi_rxbytes = inp->inp_stat->rxbytes; + ti->tcpi_rxduplicatebytes = tp->t_stat.rxduplicatebytes; } } @@ -1249,6 +1268,41 @@ tcp_sysctl_info(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused return 0; } +static int +tcp_lookup_peer_pid_locked(struct socket *so, pid_t *out_pid) +{ + int error = EHOSTUNREACH; + *out_pid = -1; + if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; + + struct inpcb *inp = (struct inpcb*)so->so_pcb; + uint16_t lport = inp->inp_lport; + uint16_t fport = inp->inp_fport; + struct inpcb *finp = NULL; + + if (inp->inp_vflag & INP_IPV6) { + struct in6_addr laddr6 = inp->in6p_laddr; + struct in6_addr faddr6 = inp->in6p_faddr; + socket_unlock(so, 0); + finp = in6_pcblookup_hash(&tcbinfo, &laddr6, lport, &faddr6, fport, 0, NULL); + socket_lock(so, 0); + } else if (inp->inp_vflag & INP_IPV4) { + struct in_addr laddr4 = inp->inp_laddr; + struct in_addr faddr4 = inp->inp_faddr; + socket_unlock(so, 0); + finp = in_pcblookup_hash(&tcbinfo, laddr4, lport, faddr4, fport, 0, NULL); + socket_lock(so, 0); + } + + if (finp) { + *out_pid = finp->inp_socket->last_pid; + error = 0; + in_pcb_checkstate(finp, WNT_RELEASE, 0); + } + + return error; +} + /* * The new sockopt interface makes it possible for us to block in the * copyin/out step (if we take a page fault). Taking a page fault at @@ -1270,7 +1324,9 @@ tcp_ctloutput(so, sopt) if (inp == NULL) { return (ECONNRESET); } - if (sopt->sopt_level != IPPROTO_TCP) { + /* Allow at this level */ + if (sopt->sopt_level != IPPROTO_TCP && + !(sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_FLUSH)) { #if INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) error = ip6_ctloutput(so, sopt); @@ -1328,6 +1384,58 @@ tcp_ctloutput(so, sopt) else tp->t_flagsext &= ~opt; break; + case TCP_MEASURE_SND_BW: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + opt = TF_MEASURESNDBW; + if (optval) { + if (tp->t_bwmeas == NULL) { + tp->t_bwmeas = tcp_bwmeas_alloc(tp); + if (tp->t_bwmeas == NULL) { + error = ENOMEM; + break; + } + } + tp->t_flagsext |= opt; + } else { + tp->t_flagsext &= ~opt; + /* Reset snd bw measurement state */ + tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS); + if (tp->t_bwmeas != NULL) { + tcp_bwmeas_free(tp); + } + } + break; + case TCP_MEASURE_BW_BURST: { + struct tcp_measure_bw_burst in; + uint32_t minpkts, maxpkts; + bzero(&in, sizeof(in)); + + error = sooptcopyin(sopt, &in, sizeof(in), + sizeof(in)); + if (error) + break; + if ((tp->t_flagsext & TF_MEASURESNDBW) == 0 || + tp->t_bwmeas == NULL) { + error = EINVAL; + break; + } + minpkts = (in.min_burst_size != 0) ? in.min_burst_size : + tp->t_bwmeas->bw_minsizepkts; + maxpkts = (in.max_burst_size != 0) ? 
in.max_burst_size : + tp->t_bwmeas->bw_maxsizepkts; + if (minpkts > maxpkts) { + error = EINVAL; + break; + } + tp->t_bwmeas->bw_minsizepkts = minpkts; + tp->t_bwmeas->bw_maxsizepkts = maxpkts; + tp->t_bwmeas->bw_minsize = (minpkts * tp->t_maxseg); + tp->t_bwmeas->bw_maxsize = (maxpkts * tp->t_maxseg); + break; + } case TCP_MAXSEG: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); @@ -1366,7 +1474,7 @@ tcp_ctloutput(so, sopt) else tp->t_keepinit = optval * TCP_RETRANSHZ; break; - + case PERSIST_TIMEOUT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); @@ -1387,6 +1495,33 @@ tcp_ctloutput(so, sopt) else tp->rxt_conndroptime = optval * TCP_RETRANSHZ; break; + case TCP_NOTSENT_LOWAT: + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + if (error) + break; + if (optval < 0) { + error = EINVAL; + break; + } else { + if (optval == 0) { + so->so_flags &= ~(SOF_NOTSENT_LOWAT); + tp->t_notsent_lowat = 0; + } else { + so->so_flags |= SOF_NOTSENT_LOWAT; + tp->t_notsent_lowat = optval; + } + } + break; + + case SO_FLUSH: + if ((error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval))) != 0) + break; + + error = inp_flush(inp, optval); + break; + default: error = ENOPROTOOPT; break; @@ -1422,12 +1557,42 @@ tcp_ctloutput(so, sopt) case TCP_RXT_FINDROP: optval = tp->t_flagsext & TF_RXTFINDROP; break; + case TCP_MEASURE_SND_BW: + optval = tp->t_flagsext & TF_MEASURESNDBW; + break; case TCP_INFO: { struct tcp_info ti; tcp_fill_info(tp, &ti); error = sooptcopyout(sopt, &ti, sizeof(struct tcp_info)); goto done; + /* NOT REACHED */ + } + case TCP_MEASURE_BW_BURST: { + struct tcp_measure_bw_burst out; + if ((tp->t_flagsext & TF_MEASURESNDBW) == 0 || + tp->t_bwmeas == NULL) { + error = EINVAL; + break; + } + out.min_burst_size = tp->t_bwmeas->bw_minsizepkts; + out.max_burst_size = tp->t_bwmeas->bw_maxsizepkts; + error = sooptcopyout(sopt, &out, sizeof(out)); + goto done; + } + case TCP_NOTSENT_LOWAT: + if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) { + optval = tp->t_notsent_lowat; + } else { + optval = 0; + } + break; + case TCP_PEER_PID: { + pid_t pid; + error = tcp_lookup_peer_pid_locked(so, &pid); + if (error == 0) + error = sooptcopyout(sopt, &pid, sizeof(pid)); + goto done; } default: error = ENOPROTOOPT; @@ -1508,7 +1673,6 @@ tcp_attach(so, p) register struct tcpcb *tp; struct inpcb *inp; int error; - u_long sb_effective_max; #if INET6 int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0; #endif @@ -1520,27 +1684,14 @@ tcp_attach(so, p) inp = sotoinpcb(so); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { - /* - * The goal is to let clients machines use large send/rcv default windows to compensate for link - * latency and make sure the receiver is not constraining the sender window. - * But we doon't want to have a few connections use all our mbuf space for servers. - * This is done by watching a threshold of tcpcbs in use and bumping the default send and rcvspace - * only if that threshold isn't reached. - * We're also advertising a much bigger window size (tuneable by sysctl) in correlation with * the max socket buffer size if - * we consider that we have enough ressources for it. 
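/*
 * Sketch of the intended use of the TCP_NOTSENT_LOWAT option handled
 * above: with a nonzero mark the socket should report writable only while
 * the queued-but-unsent backlog stays under the mark, keeping interactive
 * senders from overfilling the send buffer.  That writability rule is an
 * inference from t_notsent_lowat/SOF_NOTSENT_LOWAT in this diff, not a
 * documented contract.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int
limit_unsent(int fd, int bytes)
{
	/* bytes == 0 clears the mark (SOF_NOTSENT_LOWAT is dropped). */
	return (setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
	    &bytes, sizeof (bytes)));
}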
This window will be adjusted depending on the - * global socket layer buffer use with the use of tcp_sbpace - */ - - if (inp->inp_pcbinfo->ipi_count < tcp_sockthreshold) { - sb_effective_max = (sb_max / (MSIZE+MCLBYTES)) * MCLBYTES; - error = soreserve(so, max(min((TCP_MAXWIN << tcp_win_scale)/4, sb_effective_max), tcp_sendspace), - max(min((TCP_MAXWIN << tcp_win_scale)/2, sb_effective_max), tcp_recvspace)); - } - else - error = soreserve(so, tcp_sendspace, tcp_recvspace); + error = soreserve(so, tcp_sendspace, tcp_recvspace); if (error) return (error); } + if ((so->so_rcv.sb_flags & SB_USRSIZE) == 0) + so->so_rcv.sb_flags |= SB_AUTOSIZE; + if ((so->so_snd.sb_flags & SB_USRSIZE) == 0) + so->so_snd.sb_flags |= SB_AUTOSIZE; #if INET6 if (isipv6) { diff --git a/bsd/netinet/tcp_var.h b/bsd/netinet/tcp_var.h index 4066829cb..efffc7bad 100644 --- a/bsd/netinet/tcp_var.h +++ b/bsd/netinet/tcp_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -170,6 +170,17 @@ struct tcptemp { struct tcphdr tt_t; }; +struct bwmeas { + tcp_seq bw_start; /* start of bw measurement */ + uint32_t bw_ts; /* timestamp when bw measurement started */ + uint32_t bw_size; /* burst size in bytes for this bw measurement */ + uint32_t bw_minsizepkts; /* Min burst size as segments */ + uint32_t bw_maxsizepkts; /* Max burst size as segments */ + uint32_t bw_minsize; /* Min size in bytes */ + uint32_t bw_maxsize; /* Max size in bytes */ + uint32_t bw_sndbw; /* Measured send bw */ +}; + #define tcp6cb tcpcb /* for KAME src sync over BSD*'s */ /* @@ -210,7 +221,6 @@ struct tcpcb { #define TF_WASFRECOVERY 0x400000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x800000 /* require MD5 digests (RFC2385) */ #define TF_MAXSEGSNT 0x1000000 /* last segment sent was a full segment */ -#define TF_SENDINPROG 0x2000000 /* send is in progress */ #define TF_PMTUD 0x4000000 /* Perform Path MTU Discovery for this connection */ #define TF_CLOSING 0x8000000 /* pending tcp close */ #define TF_TSO 0x10000000 /* TCP Segment Offloading is enable on this connection */ @@ -239,12 +249,10 @@ struct tcpcb { u_int32_t snd_wnd; /* send window */ u_int32_t snd_cwnd; /* congestion-controlled window */ - u_int32_t snd_bwnd; /* bandwidth-controlled window */ u_int32_t snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ - u_int32_t snd_bandwidth; /* calculated bandwidth or 0 */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ u_int t_maxopd; /* mss plus options */ @@ -254,8 +262,8 @@ struct tcpcb { int t_rtttime; /* tcp clock when rtt calculation was started */ tcp_seq t_rtseq; /* sequence number being timed */ - int t_bw_rtttime; /* used for bandwidth calculation */ - tcp_seq t_bw_rtseq; /* used for bandwidth calculation */ + u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ + u_int32_t rfbuf_cnt; /* recv buffer autoscaling byte count */ int t_rxtcur; /* current retransmit value (ticks) */ u_int t_maxseg; /* maximum segment size */ @@ -302,7 +310,7 @@ struct tcpcb { int t_unacksegs; /* received but unacked segments: used for delaying acks */ u_int32_t t_persist_timeout; /* ZWP persistence limit as set by PERSIST_TIMEOUT */ u_int32_t t_persist_stop; /* persistence limit deadline if triggered by ZWP */ - + u_int32_t t_notsent_lowat; /* Low water for not sent data */ /* 3529618 MSS overload prevention */ u_int32_t rcv_reset; @@ -320,10 +328,9 @@ 
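/*
 * Worked example for struct bwmeas above and the tcpi_snd_bw conversion in
 * tcp_fill_info() earlier in this patch: bw_sndbw is kept in bytes per
 * millisecond, so the "* 8000" there (8 bits/byte * 1000 ms/sec) yields
 * bits per second.  How bw_start/bw_ts/bw_size feed the division happens
 * in the output path, which this hunk does not show, so the update below
 * is a plausible sketch only.
 */
#include <stdint.h>

static uint32_t
burst_bandwidth(uint32_t burst_bytes, uint32_t start_ms, uint32_t end_ms)
{
	uint32_t elapsed = end_ms - start_ms;

	if (elapsed == 0)
		elapsed = 1;            /* guard the divide */
	return (burst_bytes / elapsed); /* bytes/msec, like bw_sndbw */
}
/* Example: 131072 bytes over 50 ms -> 2621 B/ms -> ~21 Mbit/s reported. */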
struct tcpcb { #define TE_SENDIPECT 0x04 /* Indicate we haven't sent or received non-ECN-setup SYN or SYN-ACK */ #define TE_SENDCWR 0x08 /* Indicate that the next non-retransmit should have the TCP CWR flag set */ #define TE_SENDECE 0x10 /* Indicate that the next packet should have the TCP ECE flag set */ - tcp_seq snd_high; /* for use in NewReno Fast Recovery */ - tcp_seq snd_high_prev; /* snd_high prior to retransmit */ +#define TE_ECN_ON (TE_SETUPSENT | TE_SETUPRECEIVED) /* Indicate ECN was successfully negotiated on a connection) */ + tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ - u_char snd_limited; /* segments limited transmitted */ /* anti DoS counters */ u_int32_t rcv_second; /* start of interval second */ @@ -362,7 +369,9 @@ struct tcpcb { uint32_t t_flagsext; /* Another field to accommodate more flags */ #define TF_RXTFINDROP 0x1 /* Drop conn after retransmitting FIN 3 times */ #define TF_RCVUNACK_WAITSS 0x2 /* set when the receiver should not stretch acks */ - +#define TF_BWMEAS_INPROGRESS 0x4 /* Indicate BW meas is happening */ +#define TF_MEASURESNDBW 0x8 /* Measure send bw on this connection */ +#define TF_LRO_OFFLOADED 0x10 /* Connection LRO offloaded */ #if TRAFFIC_MGT /* Inter-arrival jitter related state */ uint32_t iaj_rcv_ts; /* tcp clock when the first packet was received */ @@ -374,10 +383,26 @@ struct tcpcb { uint32_t avg_iaj; /* Mean */ uint32_t std_dev_iaj; /* Standard deviation */ #endif /* TRAFFIC_MGT */ + struct bwmeas *t_bwmeas; /* State for bandwidth measurement */ + uint32_t t_lropktlen; /* Bytes in a LRO frame */ + tcp_seq t_idleat; /* rcv_nxt at idle time */ }; #define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY) -#define ENTER_FASTRECOVERY(tp) tp->t_flags |= TF_FASTRECOVERY + +/* + * If the connection is in a throttled state due to advisory feedback from + * the interface output queue, reset that state. We do this in favor + * of entering recovery because the data transfer during recovery + * should be just a trickle and it will help to improve performance. + * We also do not want to back off twice in the same RTT. 
+ */ +#define ENTER_FASTRECOVERY(_tp_) do { \ + (_tp_)->t_flags |= TF_FASTRECOVERY; \ + if (INP_IS_FLOW_CONTROLLED((_tp_)->t_inpcb)) \ + inp_reset_fc_state((_tp_)->t_inpcb); \ +} while(0) + #define EXIT_FASTRECOVERY(tp) tp->t_flags &= ~TF_FASTRECOVERY #if CONFIG_DTRACE @@ -394,7 +419,9 @@ enum tcp_cc_event { TCP_CC_ECN_RCVD, TCP_CC_BAD_REXMT_RECOVERY, TCP_CC_OUTPUT_ERROR, - TCP_CC_CHANGE_ALGO + TCP_CC_CHANGE_ALGO, + TCP_CC_FLOW_CONTROL, + TCP_CC_SUSPEND }; #endif /* CONFIG_DTRACE */ @@ -685,14 +712,40 @@ struct tcpstat { /* SACK related stats */ u_int32_t tcps_sack_recovery_episode; /* SACK recovery episodes */ - u_int32_t tcps_sack_rexmits; /* SACK rexmit segments */ - u_int32_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ - u_int32_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ - u_int32_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ - u_int32_t tcps_sack_sboverflow; /* SACK sendblock overflow */ + u_int32_t tcps_sack_rexmits; /* SACK rexmit segments */ + u_int32_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ + u_int32_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ + u_int32_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ + u_int32_t tcps_sack_sboverflow; /* SACK sendblock overflow */ u_int32_t tcps_bg_rcvtotal; /* total background packets received */ u_int32_t tcps_rxtfindrop; /* drop conn after retransmitting FIN */ + u_int32_t tcps_fcholdpacket; /* packets withheld because of flow control */ + + /* LRO related stats */ + u_int32_t tcps_coalesced_pack; /* number of coalesced packets */ + u_int32_t tcps_flowtbl_full; /* times flow table was full */ + u_int32_t tcps_flowtbl_collision; /* collisions in flow tbl */ + u_int32_t tcps_lro_twopack; /* 2 packets coalesced */ + u_int32_t tcps_lro_multpack; /* 3 or 4 pkts coalesced */ + u_int32_t tcps_lro_largepack; /* 5 or more pkts coalesced */ +}; + +struct tcpstat_local { + u_int64_t badformat; + u_int64_t unspecv6; + u_int64_t synfin; + u_int64_t badformatipsec; + u_int64_t noconnnolist; + u_int64_t noconnlist; + u_int64_t listbadsyn; + u_int64_t icmp6unreach; + u_int64_t deprecate6; + u_int64_t ooopacket; + u_int64_t rstinsynrcv; + u_int64_t dospacket; + u_int64_t cleanup; + u_int64_t synwindow; }; #pragma pack(4) @@ -961,6 +1014,7 @@ int tcp_ctloutput(struct socket *, struct sockopt *); struct tcpcb * tcp_drop(struct tcpcb *, int); void tcp_drain(void); +void tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt); struct rmxp_tao * tcp_gettaocache(struct inpcb *); void tcp_init(void) __attribute__((section("__TEXT, initcode"))); @@ -998,8 +1052,22 @@ void tcp_free_sackholes(struct tcpcb *tp); int32_t tcp_sbspace(struct tcpcb *tp); void tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp); void tcp_reset_stretch_ack(struct tcpcb *tp); +void tcp_get_ports_used(unsigned int , uint8_t *); +uint32_t tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags); +void tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so); +u_int8_t tcp_cansbgrow(struct sockbuf *sb); +struct bwmeas* tcp_bwmeas_alloc(struct tcpcb *tp); +void tcp_bwmeas_free(struct tcpcb *tp); + +extern void tcp_set_background_cc(struct socket *); +extern void tcp_set_foreground_cc(struct socket *); +extern void tcp_set_recv_bg(struct socket *); +extern void tcp_clear_recv_bg(struct socket *); +#define IS_TCP_RECV_BG(_so) \ + ((_so)->so_traffic_mgt_flags & TRAFFIC_MGT_TCP_RECVBG) #if TRAFFIC_MGT +#define CLEAR_IAJ_STATE(_tp_) (_tp_)->iaj_rcv_ts = 0 void reset_acc_iaj(struct tcpcb *tp); #endif /* TRAFFIC_MGT */ diff --git 
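/*
 * Aside on the ENTER_FASTRECOVERY() rewrite above: wrapping the two
 * statements in do { ... } while (0) makes the macro expand to exactly
 * one statement, so a bare if/else around a macro call still parses as
 * intended.  A minimal illustration with a stand-in macro:
 */
#define TWO_STEPS(x) do { (x)++; (x) *= 2; } while (0)

static int
two_steps_example(int v, int cond)
{
	if (cond)
		TWO_STEPS(v);   /* with a plain { ...; ...; } macro, the ';' */
	else                    /* after the call would orphan this else    */
		v = 0;
	return (v);
}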
a/bsd/netinet/udp_usrreq.c b/bsd/netinet/udp_usrreq.c index 37cc4153c..e5462393d 100644 --- a/bsd/netinet/udp_usrreq.c +++ b/bsd/netinet/udp_usrreq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -273,6 +273,7 @@ udp_input(m, iphlen) }; struct udp_ip6 udp_ip6; #endif /* INET6 */ + struct ifnet *ifp = (m->m_pkthdr.rcvif != NULL) ? m->m_pkthdr.rcvif: NULL; udpstat.udps_ipackets++; @@ -280,6 +281,9 @@ udp_input(m, iphlen) if (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16) m->m_pkthdr.csum_flags = 0; /* invalidate hwcksum for UDP */ + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + /* * Strip IP options, if any; should skip this, * make available to user, and use on returned packets, @@ -303,11 +307,16 @@ udp_input(m, iphlen) } ip = mtod(m, struct ip *); } - uh = (struct udphdr *)((caddr_t)ip + iphlen); + uh = (struct udphdr *)(void *)((caddr_t)ip + iphlen); /* destination port of 0 is illegal, based on RFC768. */ - if (uh->uh_dport == 0) + if (uh->uh_dport == 0) { + + if (ifp->if_udp_stat != NULL) + atomic_add_64(&ifp->if_udp_stat->port0, 1); + goto bad; + } KERNEL_DEBUG(DBG_LAYER_IN_BEG, uh->uh_dport, uh->uh_sport, ip->ip_src.s_addr, ip->ip_dst.s_addr, uh->uh_ulen); @@ -320,6 +329,10 @@ udp_input(m, iphlen) if (ip->ip_len != len) { if (len > ip->ip_len || len < sizeof(struct udphdr)) { udpstat.udps_badlen++; + + if (ifp->if_udp_stat != NULL) + atomic_add_64(&ifp->if_udp_stat->badlength, 1); + goto bad; } m_adj(m, len - ip->ip_len); @@ -344,21 +357,23 @@ udp_input(m, iphlen) } else { char b[9]; doudpcksum: - *(uint32_t*)&b[0] = *(uint32_t*)&((struct ipovly *)ip)->ih_x1[0]; - *(uint32_t*)&b[4] = *(uint32_t*)&((struct ipovly *)ip)->ih_x1[4]; - *(uint8_t*)&b[8] = *(uint8_t*)&((struct ipovly *)ip)->ih_x1[8]; - - bzero(((struct ipovly *)ip)->ih_x1, 9); + bcopy(((struct ipovly *)ip)->ih_x1, b, + sizeof (((struct ipovly *)ip)->ih_x1)); + bzero(((struct ipovly *)ip)->ih_x1, + sizeof (((struct ipovly *)ip)->ih_x1)); ((struct ipovly *)ip)->ih_len = uh->uh_ulen; uh->uh_sum = in_cksum(m, len + sizeof (struct ip)); - - *(uint32_t*)&((struct ipovly *)ip)->ih_x1[0] = *(uint32_t*)&b[0]; - *(uint32_t*)&((struct ipovly *)ip)->ih_x1[4] = *(uint32_t*)&b[4]; - *(uint8_t*)&((struct ipovly *)ip)->ih_x1[8] = *(uint8_t*)&b[8]; + bcopy(b, ((struct ipovly *)ip)->ih_x1, + sizeof (((struct ipovly *)ip)->ih_x1)); + udp_in_cksum_stats(len); } if (uh->uh_sum) { udpstat.udps_badsum++; + + if (ifp->if_udp_stat != NULL) + atomic_add_64(&ifp->if_udp_stat->badchksum, 1); + m_freem(m); KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0,0,0,0,0); return; @@ -369,7 +384,7 @@ doudpcksum: udpstat.udps_nosum++; #endif - isbroadcast = in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif); + isbroadcast = in_broadcast(ip->ip_dst, ifp); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || isbroadcast) { @@ -414,6 +429,11 @@ doudpcksum: if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif + if (ip_restrictrecvif && ifp != NULL && + (ifp->if_eflags & IFEF_RESTRICTED_RECV) && + !(inp->inp_flags & INP_RECV_ANYIF)) + continue; + if ((inp->inp_moptions == NULL) && (ntohl(ip->ip_dst.s_addr) != INADDR_ALLHOSTS_GROUP) && (isbroadcast == 0) ) @@ -466,7 +486,7 @@ doudpcksum: group.sin_family = AF_INET; group.sin_addr = ip->ip_dst; - blocked = imo_multi_filter(imo, m->m_pkthdr.rcvif, + blocked = imo_multi_filter(imo, ifp, (struct sockaddr *)&group, (struct sockaddr 
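/*
 * Why the checksum hunk above switches from 32-bit pointer punning over
 * ih_x1 to bcopy(): ih_x1 is a byte array at an arbitrary offset, and on
 * strict-alignment CPUs a u_int32_t access requires a 4-byte-aligned
 * address.  Byte-wise copies sidestep that, as in this generic helper:
 */
#include <string.h>
#include <stdint.h>

static uint32_t
load32_any_alignment(const unsigned char *p)
{
	uint32_t v;

	memcpy(&v, p, sizeof (v));      /* safe at any address */
	return (v);
	/* *(const uint32_t *)p may fault when ((uintptr_t)p & 3) != 0 */
}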
*)&udp_in); if (blocked == MCAST_PASS) @@ -524,11 +544,17 @@ doudpcksum: */ if (reuse_sock == 0 || m == NULL) break; + + /* + * Expect 32-bit aligned data pointer on strict-align + * platforms. + */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); /* * Recompute IP and UDP header pointers for new mbuf */ ip = mtod(m, struct ip *); - uh = (struct udphdr *)((caddr_t)ip + iphlen); + uh = (struct udphdr *)(void *)((caddr_t)ip + iphlen); } lck_rw_done(pcbinfo->mtx); @@ -539,6 +565,10 @@ doudpcksum: * for a broadcast or multicast datgram.) */ udpstat.udps_noportbcast++; + + if (ifp->if_udp_stat != NULL) + atomic_add_64(&ifp->if_udp_stat->port_unreach, 1); + goto bad; } @@ -565,8 +595,14 @@ doudpcksum: KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0,0,0,0,0); return; } + /* + * Expect 32-bit aligned data pointer on strict-align + * platforms. + */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + ip = mtod(m, struct ip *); - uh = (struct udphdr *)((caddr_t)ip + iphlen); + uh = (struct udphdr *)(void *)((caddr_t)ip + iphlen); } /* Check for NAT keepalive packet */ if (payload_len == 1 && *(u_int8_t*)((caddr_t)uh + sizeof(struct udphdr)) == 0xFF) { @@ -574,7 +610,7 @@ doudpcksum: KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0,0,0,0,0); return; } - else if (payload_len == 4 && *(u_int32_t*)((caddr_t)uh + sizeof(struct udphdr)) != 0) { + else if (payload_len == 4 && *(u_int32_t*)(void *)((caddr_t)uh + sizeof(struct udphdr)) != 0) { /* UDP encapsulated IPSec packet to pass through NAT */ KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0,0,0,0,0); /* preserve the udp header */ @@ -588,8 +624,12 @@ doudpcksum: * Locate pcb for datagram. */ inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport, - ip->ip_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif); + ip->ip_dst, uh->uh_dport, 1, ifp); if (inp == NULL) { + + if (ifp->if_udp_stat != NULL) + atomic_add_64(&ifp->if_udp_stat->port_unreach, 1); + if (log_in_vain) { char buf[MAX_IPv4_STR_LEN]; char buf2[MAX_IPv4_STR_LEN]; @@ -621,7 +661,7 @@ doudpcksum: goto bad; #endif if (blackhole) - if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP) + if (ifp && ifp->if_type != IFT_LOOP) goto bad; *ip = save_ip; ip->ip_len += iphlen; @@ -633,6 +673,10 @@ doudpcksum: if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { udp_unlock(inp->inp_socket, 1, 0); + + if (ifp->if_udp_stat != NULL) + atomic_add_64(&ifp->if_udp_stat->cleanup, 1); + goto bad; } #if IPSEC @@ -640,8 +684,12 @@ doudpcksum: if (ipsec4_in_reject_so(m, inp->inp_socket)) { IPSEC_STAT_INCREMENT(ipsecstat.in_polvio); udp_unlock(inp->inp_socket, 1, 0); + + if (ifp->if_udp_stat != NULL) + atomic_add_64(&ifp->if_udp_stat->badipsec, 1); + goto bad; - } + } } #endif /*IPSEC*/ @@ -718,10 +766,14 @@ ip_2_ip6_hdr(ip6, ip) ip6->ip6_plen = ip->ip_len; ip6->ip6_nxt = ip->ip_p; ip6->ip6_hlim = ip->ip_ttl; - ip6->ip6_src.s6_addr32[2] = ip6->ip6_dst.s6_addr32[2] = - IPV6_ADDR_INT32_SMP; - ip6->ip6_src.s6_addr32[3] = ip->ip_src.s_addr; - ip6->ip6_dst.s6_addr32[3] = ip->ip_dst.s_addr; + if (ip->ip_src.s_addr) { + ip6->ip6_src.s6_addr32[2] = IPV6_ADDR_INT32_SMP; + ip6->ip6_src.s6_addr32[3] = ip->ip_src.s_addr; + } + if (ip->ip_dst.s_addr) { + ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_SMP; + ip6->ip6_dst.s6_addr32[3] = ip->ip_dst.s_addr; + } } #endif @@ -825,14 +877,13 @@ udp_ctlinput(cmd, sa, vip) void *vip; { struct ip *ip = vip; - struct udphdr *uh; void (*notify)(struct inpcb *, int) = udp_notify; struct in_addr faddr; struct inpcb *inp; - faddr = ((struct sockaddr_in *)sa)->sin_addr; + faddr = ((struct 
sockaddr_in *)(void *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) - return; + return; if (PRC_IS_REDIRECT(cmd)) { ip = 0; @@ -842,12 +893,15 @@ udp_ctlinput(cmd, sa, vip) else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip) { - uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); - inp = in_pcblookup_hash(&udbinfo, faddr, uh->uh_dport, - ip->ip_src, uh->uh_sport, 0, NULL); + struct udphdr uh; + + bcopy(((caddr_t)ip + (ip->ip_hl << 2)), &uh, sizeof (uh)); + inp = in_pcblookup_hash(&udbinfo, faddr, uh.uh_dport, + ip->ip_src, uh.uh_sport, 0, NULL); if (inp != NULL && inp->inp_socket != NULL) { udp_lock(inp->inp_socket, 1, 0); - if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == + WNT_STOPUSING) { udp_unlock(inp->inp_socket, 1, 0); return; } @@ -864,7 +918,9 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt) int error, optval; struct inpcb *inp; - if (sopt->sopt_level != IPPROTO_UDP) + /* Allow at this level */ + if (sopt->sopt_level != IPPROTO_UDP && + !(sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_FLUSH)) return (ip_ctloutput(so, sopt)); error = 0; @@ -890,6 +946,14 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt) inp->inp_flags &= ~INP_UDP_NOCKSUM; break; + case SO_FLUSH: + if ((error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval))) != 0) + break; + + error = inp_flush(inp, optval); + break; + default: error = ENOPROTOOPT; break; @@ -1131,11 +1195,22 @@ udp_pcblist_n SYSCTL_HANDLER_ARGS return error; } - SYSCTL_PROC(_net_inet_udp, OID_AUTO, pcblist_n, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, udp_pcblist_n, "S,xinpcb_n", "List of active UDP sockets"); +__private_extern__ void +udp_get_ports_used(unsigned int ifindex, uint8_t *bitfield) +{ + inpcb_get_ports_used(ifindex, bitfield, &udbinfo); +} + +__private_extern__ uint32_t +udp_count_opportunistic(unsigned int ifindex, u_int32_t flags) +{ + return inpcb_count_opportunistic(ifindex, &udbinfo, flags); +} + static __inline__ u_int16_t get_socket_id(struct socket * s) { @@ -1152,10 +1227,10 @@ get_socket_id(struct socket * s) } static int -udp_check_pktinfo(struct mbuf *control, unsigned int *ifindex, struct in_addr *laddr) +udp_check_pktinfo(struct mbuf *control, struct ifnet **outif, struct in_addr *laddr) { struct cmsghdr *cm = 0; - struct in_pktinfo *pktinfo; + struct in_pktinfo *pktinfo; struct ifnet *ifp; /* @@ -1171,14 +1246,14 @@ udp_check_pktinfo(struct mbuf *control, unsigned int *ifindex, struct in_addr *l for (cm = M_FIRST_CMSGHDR(control); cm; cm = M_NXT_CMSGHDR(control, cm)) { if (cm->cmsg_len < sizeof(struct cmsghdr) || cm->cmsg_len > control->m_len) return (EINVAL); - + if (cm->cmsg_level != IPPROTO_IP || cm->cmsg_type != IP_PKTINFO) continue; - if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) + if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) return (EINVAL); - pktinfo = (struct in_pktinfo *)CMSG_DATA(cm); + pktinfo = (struct in_pktinfo *)(void *)CMSG_DATA(cm); /* Check for a valid ifindex in pktinfo */ ifnet_head_lock_shared(); @@ -1199,7 +1274,8 @@ udp_check_pktinfo(struct mbuf *control, unsigned int *ifindex, struct in_addr *l ifnet_head_done(); - *ifindex = pktinfo->ipi_ifindex; + if (outif != NULL) + *outif = ifp; laddr->s_addr = INADDR_ANY; break; } @@ -1207,7 +1283,8 @@ udp_check_pktinfo(struct mbuf *control, unsigned int *ifindex, struct in_addr *l ifnet_head_done(); /* Use the provided ipi_spec_dst address for temp source address */ - *ifindex = 
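/*
 * Userspace sketch of the IP_PKTINFO ancillary data parsed here by
 * udp_check_pktinfo(): ipi_ifindex scopes one packet to an interface
 * (taking precedence over IP_BOUND_IF) and ipi_spec_dst can supply a
 * one-shot source address.  Struct and macro names come from
 * <netinet/in.h>; treat the recipe as illustrative, not a supported API
 * walkthrough.
 */
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <string.h>

static ssize_t
send_scoped(int fd, const void *buf, size_t len,
    const struct sockaddr_in *dst, unsigned int ifindex)
{
	char cbuf[CMSG_SPACE(sizeof (struct in_pktinfo))];
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg;
	struct cmsghdr *cm;
	struct in_pktinfo pi;

	memset(&msg, 0, sizeof (msg));
	memset(cbuf, 0, sizeof (cbuf));
	memset(&pi, 0, sizeof (pi));
	pi.ipi_ifindex = ifindex;               /* 0 means no binding */

	msg.msg_name = (void *)dst;
	msg.msg_namelen = sizeof (*dst);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof (cbuf);

	cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_level = IPPROTO_IP;
	cm->cmsg_type = IP_PKTINFO;
	cm->cmsg_len = CMSG_LEN(sizeof (pi));
	memcpy(CMSG_DATA(cm), &pi, sizeof (pi));

	return (sendmsg(fd, &msg, 0));
}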
0; + if (outif != NULL) + *outif = NULL; *laddr = pktinfo->ipi_spec_dst; break; } @@ -1234,23 +1311,32 @@ udp_output(inp, m, addr, control, p) struct mbuf *inpopts; struct ip_moptions *mopts; struct route ro; - struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; - mbuf_traffic_class_t mtc = MBUF_TC_UNSPEC; - unsigned int origoutif; + struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF }; + struct ifnet *outif = NULL; + struct flowadv *adv = &ipoa.ipoa_flowadv; + mbuf_svc_class_t msc = MBUF_SC_UNSPEC; + struct ifnet *origoutifp; + int flowadv = 0; + + /* Enable flow advisory only when connected */ + flowadv = (so->so_state & SS_ISCONNECTED) ? 1 : 0; pi_laddr.s_addr = INADDR_ANY; KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); + lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); if (control != NULL) { - mtc = mbuf_traffic_class_from_control(control); + msc = mbuf_service_class_from_control(control); - error = udp_check_pktinfo(control, &ipoa.ipoa_boundif, &pi_laddr); + error = udp_check_pktinfo(control, &outif, &pi_laddr); m_freem(control); if (error) goto release; pktinfo++; + if (outif != NULL) + ipoa.ipoa_boundif = outif->if_index; } KERNEL_DEBUG(DBG_LAYER_OUT_BEG, inp->inp_fport, inp->inp_lport, @@ -1262,16 +1348,26 @@ udp_output(inp, m, addr, control, p) goto release; } - lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); - + if (flowadv && INP_WAIT_FOR_IF_FEEDBACK(inp)) { + /* + * The socket is flow-controlled, drop the packets + * until the inp is not flow controlled + */ + error = ENOBUFS; + goto release; + } /* * If socket was bound to an ifindex, tell ip_output about it. * If the ancillary IP_PKTINFO option contains an interface index, * it takes precedence over the one specified by IP_BOUND_IF. */ - if (ipoa.ipoa_boundif == IFSCOPE_NONE && (inp->inp_flags & INP_BOUND_IF)) - ipoa.ipoa_boundif = inp->inp_boundif; - ipoa.ipoa_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + if (ipoa.ipoa_boundif == IFSCOPE_NONE && + (inp->inp_flags & INP_BOUND_IF)) { + outif = inp->inp_boundifp; + ipoa.ipoa_boundif = outif->if_index; + } + if (inp->inp_flags & INP_NO_IFT_CELLULAR) + ipoa.ipoa_flags |= IPOAF_NO_CELLULAR; soopts |= IP_OUTARGS; /* If there was a routing change, discard cached route and check @@ -1284,18 +1380,22 @@ udp_output(inp, m, addr, control, p) /* src address is gone? */ if ((ia = ifa_foraddr(inp->inp_laddr.s_addr)) == NULL) { - if (((inp->inp_flags & INP_INADDR_ANY) == 0) || (so->so_state & SS_ISCONNECTED)) { + if (((inp->inp_flags & INP_INADDR_ANY) == 0) || + (so->so_state & SS_ISCONNECTED)) { /* Rdar://5448998 * If the source address is gone, return an error if: * - the source was specified * - the socket was already connected */ + soevent(so, + (SO_FILT_HINT_LOCKED | + SO_FILT_HINT_NOSRCADDR)); error = EADDRNOTAVAIL; goto release; } else { /* new src will be set later */ inp->inp_laddr.s_addr = INADDR_ANY; - inp->inp_last_outif = 0; + inp->inp_last_outifp = NULL; } } if (ia != NULL) @@ -1305,7 +1405,7 @@ udp_output(inp, m, addr, control, p) inp->inp_route.ro_rt = NULL; } - origoutif = inp->inp_last_outif; + origoutifp = inp->inp_last_outifp; /* IP_PKTINFO option check. 
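/*
 * Condensed sketch of the flow-advisory handshake udp_output() gains in
 * these hunks: a connected socket that the interface queue has
 * flow-controlled fails fast with ENOBUFS instead of queueing more, and
 * an advisory code returned via ip_output_list() later moves the PCB into
 * that state (inp_set_fc_state).  The enum values below stand in for the
 * real FADV_* constants.
 */
#include <errno.h>

enum fadv_sketch { FADV_OK, FADV_FLOW_CONTROLLED, FADV_SUSPENDED };

struct pcb_sketch {
	int flow_controlled;    /* stands in for INP_WAIT_FOR_IF_FEEDBACK() */
};

static int
output_sketch(struct pcb_sketch *p, enum fadv_sketch (*transmit)(void))
{
	enum fadv_sketch code;

	if (p->flow_controlled)
		return (ENOBUFS);       /* drop early; sender should back off */

	code = transmit();              /* ip_output_list() in the diff */
	if (code == FADV_FLOW_CONTROLLED || code == FADV_SUSPENDED) {
		p->flow_controlled = 1; /* inp_set_fc_state() in the diff */
		return (ENOBUFS);       /* hint that the packet was dropped */
	}
	return (0);
}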
* If a temporary scope or src address is provided, use it for this packet only @@ -1322,13 +1422,13 @@ udp_output(inp, m, addr, control, p) origladdr = laddr = inp->inp_laddr; } - origoutif = inp->inp_last_outif; + origoutifp = inp->inp_last_outifp; faddr = inp->inp_faddr; lport = inp->inp_lport; fport = inp->inp_fport; if (addr) { - sin = (struct sockaddr_in *)addr; + sin = (struct sockaddr_in *)(void *)addr; if (faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; @@ -1342,7 +1442,7 @@ udp_output(inp, m, addr, control, p) if (pi_laddr.s_addr != INADDR_ANY) /* if we have a source address specified, use that */ inp->inp_laddr = pi_laddr; - error = in_pcbconnect(inp, addr, p, &ipoa.ipoa_boundif); /* if a scope is specified, use it */ + error = in_pcbconnect(inp, addr, p, &outif); /* if a scope is specified, use it */ if (error) { goto release; } @@ -1351,6 +1451,8 @@ udp_output(inp, m, addr, control, p) faddr = inp->inp_faddr; fport = inp->inp_fport; udp_dodisconnect = 1; + ipoa.ipoa_boundif = (outif != NULL) ? + outif->if_index : IFSCOPE_NONE; } else { /* Fast path case @@ -1361,10 +1463,12 @@ udp_output(inp, m, addr, control, p) * priority is always given to the scope provided by INP_BOUND_IF. */ if (laddr.s_addr == INADDR_ANY) { - if ((error = in_pcbladdr(inp, addr, &ifaddr, &ipoa.ipoa_boundif)) != 0) - goto release; - laddr = ifaddr.sin_addr; - inp->inp_flags |= INP_INADDR_ANY; /* from pcbconnect: remember we don't care about src addr.*/ + if ((error = in_pcbladdr(inp, addr, &ifaddr, &outif)) != 0) + goto release; + laddr = ifaddr.sin_addr; + inp->inp_flags |= INP_INADDR_ANY; /* from pcbconnect: remember we don't care about src addr.*/ + ipoa.ipoa_boundif = (outif != NULL) ? + outif->if_index : IFSCOPE_NONE; } faddr = sin->sin_addr; @@ -1380,6 +1484,8 @@ udp_output(inp, m, addr, control, p) #if CONFIG_MACF_NET mac_mbuf_label_associate_inpcb(inp, m); #endif + if (inp->inp_flowhash == 0) + inp->inp_flowhash = inp_calc_flowhash(inp); /* * Calculate data length and get a mbuf @@ -1429,18 +1535,37 @@ udp_output(inp, m, addr, control, p) goto abort; } #endif /*IPSEC*/ - m->m_pkthdr.socket_id = get_socket_id(inp->inp_socket); inpopts = inp->inp_options; soopts |= (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST)); mopts = inp->inp_moptions; - if (mopts != NULL) - IMO_ADDREF(mopts); + if (mopts != NULL) { + IMO_LOCK(mopts); + IMO_ADDREF_LOCKED(mopts); + if (IN_MULTICAST(ntohl(ui->ui_dst.s_addr)) && + mopts->imo_multicast_ifp != NULL) { + inp->inp_last_outifp = mopts->imo_multicast_ifp; + } + IMO_UNLOCK(mopts); + } /* Copy the cached route and take an extra reference */ inp_route_copyout(inp, &ro); - set_packet_tclass(m, so, mtc, 0); + set_packet_service_class(m, so, msc, 0); + m->m_pkthdr.socket_id = get_socket_id(inp->inp_socket); + m->m_pkthdr.m_flowhash = inp->inp_flowhash; + m->m_pkthdr.m_fhflags |= PF_TAG_FLOWHASH; + if (flowadv) + m->m_pkthdr.m_fhflags |= PF_TAG_FLOWADV; + + if (ipoa.ipoa_boundif != IFSCOPE_NONE) + ipoa.ipoa_flags |= IPOAF_BOUND_IF; + + if (laddr.s_addr != INADDR_ANY) + ipoa.ipoa_flags |= IPOAF_BOUND_SRCADDR; + + inp->inp_sndinprog_cnt++; socket_unlock(so, 0); error = ip_output_list(m, 0, inpopts, &ro, soopts, mopts, &ipoa); @@ -1453,6 +1578,20 @@ udp_output(inp, m, addr, control, p) locked_add_64(&inp->inp_stat->txpackets, 1); locked_add_64(&inp->inp_stat->txbytes, len); } + + if (flowadv && (adv->code == FADV_FLOW_CONTROLLED || + adv->code == FADV_SUSPENDED)) { + /* return a hint to the application that + * the packet has been dropped + */ + error = 
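/*
 * The m_flowhash/PF_TAG_FLOWHASH tagging below lets queueing layers
 * classify every packet of a flow consistently without reparsing headers,
 * and the "if (inp->inp_flowhash == 0)" guard computes the hash once per
 * PCB.  inp_calc_flowhash() itself is not in this hunk; the FNV-1a mix
 * here is a purely hypothetical stand-in for hashing the 5-tuple.
 */
#include <stdint.h>
#include <stddef.h>

static uint32_t
flowhash_sketch(uint32_t laddr, uint32_t faddr,
    uint16_t lport, uint16_t fport, uint8_t proto)
{
	const uint32_t words[3] =
	    { laddr, faddr, ((uint32_t)lport << 16) | fport };
	const unsigned char *p = (const unsigned char *)words;
	uint32_t h = 2166136261u;       /* FNV-1a offset basis */
	size_t i;

	for (i = 0; i < sizeof (words); i++) {
		h ^= p[i];
		h *= 16777619u;         /* FNV-1a prime */
	}
	h = (h ^ proto) * 16777619u;
	return (h != 0 ? h : 1);        /* 0 means "not yet computed" */
}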
ENOBUFS; + inp_set_fc_state(inp, adv->code); + } + + VERIFY(inp->inp_sndinprog_cnt > 0); + if ( --inp->inp_sndinprog_cnt == 0) + inp->inp_flags &= ~(INP_FC_FEEDBACK); + /* Synchronize PCB cached route */ inp_route_copyin(inp, &ro); @@ -1465,10 +1604,10 @@ abort: } in_pcbdisconnect(inp); inp->inp_laddr = origladdr; /* XXX rehash? */ - inp->inp_last_outif = origoutif; + inp->inp_last_outifp = origoutifp; } else if (inp->inp_route.ro_rt != NULL) { struct rtentry *rt = inp->inp_route.ro_rt; - unsigned int outif; + struct ifnet *outifp; if (rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST)) rt = NULL; /* unusable */ @@ -1480,12 +1619,11 @@ abort: inp->inp_route.ro_rt = NULL; } /* - * If the destination route is unicast, update outif with - * that of the route interface index used by IP. + * If the destination route is unicast, update outifp with + * that of the route interface used by IP. */ - if (rt != NULL && - (outif = rt->rt_ifp->if_index) != inp->inp_last_outif) - inp->inp_last_outif = outif; + if (rt != NULL && (outifp = rt->rt_ifp) != inp->inp_last_outifp) + inp->inp_last_outifp = outifp; } release: @@ -1496,8 +1634,8 @@ release: } u_int32_t udp_sendspace = 9216; /* really max datagram size */ -/* 40 1K datagrams */ -u_int32_t udp_recvspace = 40 * (1024 + +/* 187 1K datagrams (approx 192 KB) */ +u_int32_t udp_recvspace = 187 * (1024 + #if INET6 sizeof(struct sockaddr_in6) #else @@ -1573,7 +1711,8 @@ udp_attach(struct socket *so, __unused int proto, struct proc *p) inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = ip_defttl; - nstat_udp_new_pcb(inp); + if (nstat_collect) + nstat_udp_new_pcb(inp); return 0; } @@ -1606,8 +1745,11 @@ udp_connect(struct socket *so, struct sockaddr *nam, struct proc *p) if (inp->inp_faddr.s_addr != INADDR_ANY) return EISCONN; error = in_pcbconnect(inp, nam, p, NULL); - if (error == 0) + if (error == 0) { soisconnected(so); + if (inp->inp_flowhash == 0) + inp->inp_flowhash = inp_calc_flowhash(inp); + } return error; } @@ -1636,9 +1778,13 @@ udp_disconnect(struct socket *so) return ENOTCONN; in_pcbdisconnect(inp); + + /* reset flow controlled state, just in case */ + inp_reset_fc_state(inp); + inp->inp_laddr.s_addr = INADDR_ANY; so->so_state &= ~SS_ISCONNECTED; /* XXX */ - inp->inp_last_outif = 0; + inp->inp_last_outifp = NULL; return 0; } diff --git a/bsd/netinet/udp_var.h b/bsd/netinet/udp_var.h index 3a75d1faf..776109a59 100644 --- a/bsd/netinet/udp_var.h +++ b/bsd/netinet/udp_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2008-2012 Apple Inc. All rights reserved. 
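/*
 * Worked check of the udp_recvspace bump just below: each queued 1 KB
 * datagram is accounted together with room for its source address, so
 * with the 28-byte sockaddr_in6 the default evaluates to
 * 187 * (1024 + 28) = 196,724 bytes, i.e. the "approx 192 KB" in the new
 * comment.  The old default of 40 datagrams allowed only ~41 KB.
 */
#include <stdio.h>
#include <netinet/in.h>

int
main(void)
{
	unsigned long v = 187UL * (1024 + sizeof (struct sockaddr_in6));

	printf("udp_recvspace = %lu bytes (%.1f KiB)\n", v, v / 1024.0);
	return (0);
}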
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -128,6 +128,17 @@ struct udpstat { SYSCTL_DECL(_net_inet_udp); +struct udpstat_local { + u_int64_t port_unreach; + u_int64_t faithprefix; /* deprecated */ + u_int64_t port0; + u_int64_t badlength; + u_int64_t badchksum; + u_int64_t badmcast; + u_int64_t cleanup; + u_int64_t badipsec; +}; + extern struct pr_usrreqs udp_usrreqs; extern struct inpcbhead udb; extern struct inpcbinfo udbinfo; @@ -152,6 +163,8 @@ lck_mtx_t * udp_getlock (struct socket *, int); #else void * udp_getlock (struct socket *, int); #endif +void udp_get_ports_used(unsigned int, uint8_t *); +uint32_t udp_count_opportunistic(unsigned int, u_int32_t); #endif /* KERNEL_PRIVATE */ #endif /* _NETINET_UDP_VAR_H_ */ diff --git a/bsd/netinet6/Makefile b/bsd/netinet6/Makefile index f765bace4..141f50860 100644 --- a/bsd/netinet6/Makefile +++ b/bsd/netinet6/Makefile @@ -18,14 +18,14 @@ EXPINC_SUBDIRS_I386 = \ DATAFILES = \ ah.h ipsec.h pim6.h \ esp.h in6.h ipcomp.h raw_ip6.h \ - in6_var.h ip6_mroute.h nd6.h ip6_fw.h + in6_var.h ip6_mroute.h nd6.h PRIVATE_DATAFILES = \ - in6_pcb.h ip6_var.h pim6_var.h mld6_var.h + in6_pcb.h ip6_var.h pim6_var.h mld6_var.h ip6_fw.h PRIVATE_KERNELFILES = \ ah6.h esp6.h esp_rijndael.h in6_gif.h in6_ifattach.h \ - in6_prefix.h ip6_ecn.h ip6_fw.h \ + in6_prefix.h ip6_ecn.h \ ip6protosw.h ipcomp6.h ipsec6.h \ raw_ip6.h scope6_var.h tcp6_var.h udp6_var.h diff --git a/bsd/netinet6/ah_core.c b/bsd/netinet6/ah_core.c index 27098a76f..a471825e9 100644 --- a/bsd/netinet6/ah_core.c +++ b/bsd/netinet6/ah_core.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -108,7 +108,7 @@ #include #include #include -#include +#include #include @@ -300,7 +300,7 @@ ah_keyed_md5_init(state, sav) { size_t padlen; size_t keybitlen; - u_int8_t buf[32]; + u_int8_t buf[32] __attribute__((aligned(4))); if (!state) panic("ah_keyed_md5_init: what?"); @@ -369,7 +369,7 @@ ah_keyed_md5_result(state, addr, l) caddr_t addr; size_t l; { - u_char digest[16]; + u_char digest[16] __attribute__((aligned(4))); if (!state) panic("ah_keyed_md5_result: what?"); @@ -420,7 +420,7 @@ ah_keyed_sha1_init(state, sav) SHA1_CTX *ctxt; size_t padlen; size_t keybitlen; - u_int8_t buf[32]; + u_int8_t buf[32] __attribute__((aligned(4))); if (!state) panic("ah_keyed_sha1_init: what?"); @@ -491,7 +491,7 @@ ah_keyed_sha1_result(state, addr, l) caddr_t addr; size_t l; { - u_char digest[SHA1_RESULTLEN]; /* SHA-1 generates 160 bits */ + u_char digest[SHA1_RESULTLEN] __attribute__((aligned(4))); /* SHA-1 generates 160 bits */ SHA1_CTX *ctxt; if (!state || !state->foo) @@ -543,7 +543,7 @@ ah_hmac_md5_init(state, sav) { u_char *ipad; u_char *opad; - u_char tk[16]; + u_char tk[16] __attribute__((aligned(4))); u_char *key; size_t keylen; size_t i; @@ -559,7 +559,7 @@ ah_hmac_md5_init(state, sav) ipad = (u_char *)state->foo; opad = (u_char *)(ipad + 64); - ctxt = (MD5_CTX *)(opad + 64); + ctxt = (MD5_CTX *)(void *)(opad + 64); /* compress the key if necessery */ if (64 < _KEYLEN(state->sav->key_auth)) { @@ -599,7 +599,7 @@ ah_hmac_md5_loop(state, addr, len) if (!state || !state->foo) panic("ah_hmac_md5_loop: what?"); - ctxt = (MD5_CTX *)(((caddr_t)state->foo) + 128); + ctxt = (MD5_CTX *)(void *)(((caddr_t)state->foo) + 128); MD5Update(ctxt, addr, len); } @@ -609,7 +609,7 @@ ah_hmac_md5_result(state, addr, l) caddr_t addr; size_t l; { - u_char digest[16]; + u_char digest[16] 
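/*
 * Rationale sketch for the __attribute__((aligned(4))) annotations that
 * start here: the AH digest routines store results through 32-bit
 * pointers, and a bare u_char array on the stack only guarantees byte
 * alignment, which can fault on strict-alignment CPUs.  Forcing 4-byte
 * alignment makes word-sized stores into the buffer safe:
 */
#include <stdint.h>

static void
store_words(void *dst, uint32_t a, uint32_t b)
{
	uint32_t *w = (uint32_t *)dst;  /* caller must pass aligned memory */

	w[0] = a;
	w[1] = b;
}

static void
digest_example(void)
{
	unsigned char digest[16] __attribute__((aligned(4)));

	store_words(digest, 0xdeadbeefu, 0xfeedfaceu);  /* safe here */
}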
__attribute__((aligned(4))); u_char *ipad; u_char *opad; MD5_CTX *ctxt; @@ -619,7 +619,7 @@ ah_hmac_md5_result(state, addr, l) ipad = (u_char *)state->foo; opad = (u_char *)(ipad + 64); - ctxt = (MD5_CTX *)(opad + 64); + ctxt = (MD5_CTX *)(void *)(opad + 64); MD5Final(&digest[0], ctxt); @@ -669,7 +669,7 @@ ah_hmac_sha1_init(state, sav) u_char *ipad; u_char *opad; SHA1_CTX *ctxt; - u_char tk[SHA1_RESULTLEN]; /* SHA-1 generates 160 bits */ + u_char tk[SHA1_RESULTLEN] __attribute__((aligned(4))); /* SHA-1 generates 160 bits */ u_char *key; size_t keylen; size_t i; @@ -685,7 +685,7 @@ ah_hmac_sha1_init(state, sav) ipad = (u_char *)state->foo; opad = (u_char *)(ipad + 64); - ctxt = (SHA1_CTX *)(opad + 64); + ctxt = (SHA1_CTX *)(void *)(opad + 64); /* compress the key if necessery */ if (64 < _KEYLEN(state->sav->key_auth)) { @@ -726,7 +726,7 @@ ah_hmac_sha1_loop(state, addr, len) if (!state || !state->foo) panic("ah_hmac_sha1_loop: what?"); - ctxt = (SHA1_CTX *)(((u_char *)state->foo) + 128); + ctxt = (SHA1_CTX *)(void *)(((u_char *)state->foo) + 128); SHA1Update(ctxt, (caddr_t)addr, (size_t)len); } @@ -736,7 +736,7 @@ ah_hmac_sha1_result(state, addr, l) caddr_t addr; size_t l; { - u_char digest[SHA1_RESULTLEN]; /* SHA-1 generates 160 bits */ + u_char digest[SHA1_RESULTLEN] __attribute__((aligned(4))); /* SHA-1 generates 160 bits */ u_char *ipad; u_char *opad; SHA1_CTX *ctxt; @@ -746,7 +746,7 @@ ah_hmac_sha1_result(state, addr, l) ipad = (u_char *)state->foo; opad = (u_char *)(ipad + 64); - ctxt = (SHA1_CTX *)(opad + 64); + ctxt = (SHA1_CTX *)(void *)(opad + 64); SHA1Final((caddr_t)&digest[0], ctxt); @@ -809,7 +809,7 @@ ah_hmac_sha2_256_init(state, sav) u_char *ipad; u_char *opad; SHA256_CTX *ctxt; - u_char tk[SHA256_DIGEST_LENGTH]; + u_char tk[SHA256_DIGEST_LENGTH] __attribute__((aligned(4))); u_char *key; size_t keylen; size_t i; @@ -825,7 +825,7 @@ ah_hmac_sha2_256_init(state, sav) ipad = (u_char *)state->foo; opad = (u_char *)(ipad + 64); - ctxt = (SHA256_CTX *)(opad + 64); + ctxt = (SHA256_CTX *)(void *)(opad + 64); /* compress the key if necessery */ if (64 < _KEYLEN(state->sav->key_auth)) { @@ -869,7 +869,7 @@ ah_hmac_sha2_256_loop(state, addr, len) if (!state || !state->foo) panic("ah_hmac_sha2_256_loop: what?"); - ctxt = (SHA256_CTX *)(((u_char *)state->foo) + 128); + ctxt = (SHA256_CTX *)(void *)(((u_char *)state->foo) + 128); SHA256_Update(ctxt, (const u_int8_t *)addr, (size_t)len); } @@ -879,7 +879,7 @@ ah_hmac_sha2_256_result(state, addr, l) caddr_t addr; size_t l; { - u_char digest[SHA256_DIGEST_LENGTH]; + u_char digest[SHA256_DIGEST_LENGTH] __attribute__((aligned(4))); u_char *ipad; u_char *opad; SHA256_CTX *ctxt; @@ -889,7 +889,7 @@ ah_hmac_sha2_256_result(state, addr, l) ipad = (u_char *)state->foo; opad = (u_char *)(ipad + 64); - ctxt = (SHA256_CTX *)(opad + 64); + ctxt = (SHA256_CTX *)(void *)(opad + 64); SHA256_Final((u_int8_t *)digest, ctxt); @@ -951,7 +951,7 @@ ah_hmac_sha2_384_init(state, sav) u_char *ipad; u_char *opad; SHA384_CTX *ctxt; - u_char tk[SHA384_DIGEST_LENGTH]; + u_char tk[SHA384_DIGEST_LENGTH] __attribute__((aligned(4))); u_char *key; size_t keylen; size_t i; @@ -968,7 +968,7 @@ ah_hmac_sha2_384_init(state, sav) ipad = (u_char *)state->foo; opad = (u_char *)(ipad + 128); - ctxt = (SHA384_CTX *)(opad + 128); + ctxt = (SHA384_CTX *)(void *)(opad + 128); /* compress the key if necessery */ if (128 < _KEYLEN(state->sav->key_auth)) { @@ -1012,7 +1012,7 @@ ah_hmac_sha2_384_loop(state, addr, len) if (!state || !state->foo) panic("ah_hmac_sha2_384_loop: what?"); - 
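/*
 * Layout reference for the state->foo buffer used throughout these HMAC
 * routines.  Per RFC 2104, HMAC is H((K ^ opad) || H((K ^ ipad) || msg))
 * with ipad bytes 0x36 and opad bytes 0x5c over one hash block, and this
 * file packs both padded keys plus the running context into a single
 * allocation -- hence the fixed offsets (64-byte blocks for MD5/SHA-1/
 * SHA-256, 128-byte for SHA-384/512):
 *
 *   foo: [ K ^ ipad (1 block) | K ^ opad (1 block) | hash context ]
 */
#include <string.h>

static void
hmac_pads(unsigned char *ipad, unsigned char *opad,
    const unsigned char *key, size_t keylen, size_t block)
{
	size_t i;

	/* keylen must be <= block here; longer keys get hashed down first */
	memset(ipad, 0, block);
	memcpy(ipad, key, keylen);
	memcpy(opad, ipad, block);
	for (i = 0; i < block; i++) {
		ipad[i] ^= 0x36;
		opad[i] ^= 0x5c;
	}
}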
ctxt = (SHA384_CTX *)(((u_char *)state->foo) + 256); + ctxt = (SHA384_CTX *)(void *)(((u_char *)state->foo) + 256); SHA384_Update(ctxt, (const u_int8_t *)addr, (size_t)len); } @@ -1032,7 +1032,7 @@ ah_hmac_sha2_384_result(state, addr, l) ipad = (u_char *)state->foo; opad = (u_char *)(ipad + 128); - ctxt = (SHA384_CTX *)(opad + 128); + ctxt = (SHA384_CTX *)(void *)(opad + 128); SHA384_Final((u_int8_t *)digest, ctxt); @@ -1094,7 +1094,7 @@ ah_hmac_sha2_512_init(state, sav) u_char *ipad; u_char *opad; SHA512_CTX *ctxt; - u_char tk[SHA512_DIGEST_LENGTH]; + u_char tk[SHA512_DIGEST_LENGTH] __attribute__((aligned(4))); u_char *key; size_t keylen; size_t i; @@ -1111,7 +1111,7 @@ ah_hmac_sha2_512_init(state, sav) ipad = (u_char *)state->foo; opad = (u_char *)(ipad + 128); - ctxt = (SHA512_CTX *)(opad + 128); + ctxt = (SHA512_CTX *)(void *)(opad + 128); /* compress the key if necessery */ if (128 < _KEYLEN(state->sav->key_auth)) { @@ -1155,7 +1155,7 @@ ah_hmac_sha2_512_loop(state, addr, len) if (!state || !state->foo) panic("ah_hmac_sha2_512_loop: what?"); - ctxt = (SHA512_CTX *)(((u_char *)state->foo) + 256); + ctxt = (SHA512_CTX *)(void *)(((u_char *)state->foo) + 256); SHA512_Update(ctxt, (const u_int8_t *) addr, (size_t)len); } @@ -1165,7 +1165,7 @@ ah_hmac_sha2_512_result(state, addr, l) caddr_t addr; size_t l; { - u_char digest[SHA512_DIGEST_LENGTH]; + u_char digest[SHA512_DIGEST_LENGTH] __attribute__((aligned(4))); u_char *ipad; u_char *opad; SHA512_CTX *ctxt; @@ -1175,7 +1175,7 @@ ah_hmac_sha2_512_result(state, addr, l) ipad = (u_char *)state->foo; opad = (u_char *)(ipad + 128); - ctxt = (SHA512_CTX *)(opad + 128); + ctxt = (SHA512_CTX *)(void *)(opad + 128); SHA512_Final((u_int8_t *)digest, ctxt); @@ -1257,7 +1257,7 @@ ah4_calccksum(m, ahdat, len, algo, sav) int hdrtype; size_t advancewidth; struct ah_algorithm_state algos; - u_char sumbuf[AH_MAXSUMSIZE]; + u_char sumbuf[AH_MAXSUMSIZE] __attribute__((aligned(4))); int error = 0; int ahseen; struct mbuf *n = NULL; @@ -1503,7 +1503,7 @@ ah6_calccksum(m, ahdat, len, algo, sav) int error; int ahseen; struct ah_algorithm_state algos; - u_char sumbuf[AH_MAXSUMSIZE]; + u_char sumbuf[AH_MAXSUMSIZE] __attribute__((aligned(4))); if ((m->m_flags & M_PKTHDR) == 0) return EINVAL; diff --git a/bsd/netinet6/ah_input.c b/bsd/netinet6/ah_input.c index a448295b7..05d575b5a 100644 --- a/bsd/netinet6/ah_input.c +++ b/bsd/netinet6/ah_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -152,9 +153,15 @@ ah4_input(struct mbuf *m, int off) } } + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + ip = mtod(m, struct ip *); - ah = (struct ah *)(((caddr_t)ip) + off); + ah = (struct ah *)(void *)(((caddr_t)ip) + off); #else + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + ip = mtod(m, struct ip *); IP6_EXTHDR_GET(ah, struct ah *, m, off, sizeof(struct newah)); if (ah == NULL) { @@ -260,9 +267,11 @@ ah4_input(struct mbuf *m, int off) IPSEC_STAT_INCREMENT(ipsecstat.in_inval); goto fail; } + /* Expect 32-bit aligned data ptr on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); ip = mtod(m, struct ip *); - ah = (struct ah *)(((caddr_t)ip) + off); + ah = (struct ah *)(void *)(((caddr_t)ip) + off); } #else IP6_EXTHDR_GET(ah, struct ah *, m, off, @@ -628,7 +637,7 @@ ah6_input(struct mbuf **mp, int *offp, int proto) #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct ah), {return IPPROTO_DONE;}); - ah = (struct ah *)(mtod(m, caddr_t) + off); + ah = (struct ah *)(void *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(ah, struct ah *, m, off, sizeof(struct newah)); if (ah == NULL) { @@ -637,6 +646,9 @@ ah6_input(struct mbuf **mp, int *offp, int proto) return IPPROTO_DONE; } #endif + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + ip6 = mtod(m, struct ip6_hdr *); nxt = ah->ah_nxt; @@ -1059,7 +1071,7 @@ ah6_ctlinput(cmd, sa, d) m_copydata(m, off, sizeof(ah), (caddr_t)&ah); ahp = &ah; } else - ahp = (struct newah *)(mtod(m, caddr_t) + off); + ahp = (struct newah *)(void *)(mtod(m, caddr_t) + off); if (cmd == PRC_MSGSIZE) { int valid = 0; @@ -1069,7 +1081,7 @@ ah6_ctlinput(cmd, sa, d) * the address in the ICMP message payload. */ sa6_src = ip6cp->ip6c_src; - sa6_dst = (struct sockaddr_in6 *)sa; + sa6_dst = (struct sockaddr_in6 *)(void *)sa; sav = key_allocsa(AF_INET6, (caddr_t)&sa6_src->sin6_addr, (caddr_t)&sa6_dst->sin6_addr, diff --git a/bsd/netinet6/ah_output.c b/bsd/netinet6/ah_output.c index 918196d8e..18391b187 100644 --- a/bsd/netinet6/ah_output.c +++ b/bsd/netinet6/ah_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. 
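/*
 * Note on the recurring "(struct ah *)(void *)" pattern in these input
 * paths: casting through void * keeps the compiler from warning that the
 * target type has stricter alignment (-Wcast-align), while the new
 * MBUF_STRICT_DATA_ALIGNMENT_CHECK_32() asserts at runtime the invariant
 * the cast depends on.  Minimal illustration:
 */
#include <assert.h>
#include <stdint.h>

struct hdr_sketch { uint32_t a, b; };

static struct hdr_sketch *
hdr_at(char *base, unsigned int off)
{
	char *p = base + off;

	assert(((uintptr_t)p & 3) == 0);         /* the alignment check */
	return ((struct hdr_sketch *)(void *)p); /* no cast-align warning */
}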
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -277,7 +277,7 @@ ah4_output(m, sav) if (sav->flags & SADB_X_EXT_OLD) { struct ah *ahdr; - ahdr = (struct ah *)ahdrpos; + ahdr = (struct ah *)(void *)ahdrpos; ahsumpos = (u_char *)(ahdr + 1); ahdr->ah_len = plen >> 2; ahdr->ah_nxt = ip->ip_p; @@ -287,7 +287,7 @@ ah4_output(m, sav) } else { struct newah *ahdr; - ahdr = (struct newah *)ahdrpos; + ahdr = (struct newah *)(void *)ahdrpos; ahsumpos = (u_char *)(ahdr + 1); ahdr->ah_len = (plen >> 2) + 1; /* plus one for seq# */ ahdr->ah_nxt = ip->ip_p; @@ -617,7 +617,7 @@ ah4_finaldst(m) return NULL; } i += q[i + IPOPT_OLEN] - sizeof(struct in_addr); - return (struct in_addr *)(q + i); + return (struct in_addr *)(void *)(q + i); default: if (q[i + IPOPT_OLEN] < 2 || optlen - i < q[i + IPOPT_OLEN]) { diff --git a/bsd/netinet6/esp_core.c b/bsd/netinet6/esp_core.c index 905de9ba2..8a1d06c0a 100644 --- a/bsd/netinet6/esp_core.c +++ b/bsd/netinet6/esp_core.c @@ -101,9 +101,7 @@ #include #include #include -#include -#include -#include +#include #include @@ -111,6 +109,7 @@ #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIPSEC, 1) #define DBG_LAYER_END NETDBG_CODE(DBG_NETIPSEC, 3) #define DBG_FNC_ESPAUTH NETDBG_CODE(DBG_NETIPSEC, (8 << 8)) +#define MAX_SBUF_LEN 2000 extern lck_mtx_t *sadb_mutex; @@ -130,22 +129,6 @@ static int esp_des_blockdecrypt(const struct esp_algorithm *, static int esp_des_blockencrypt(const struct esp_algorithm *, struct secasvar *, u_int8_t *, u_int8_t *); static int esp_cbc_mature(struct secasvar *); -#if ALLCRYPTO -static int esp_blowfish_schedule(const struct esp_algorithm *, - struct secasvar *); -static int esp_blowfish_schedlen(const struct esp_algorithm *); -static int esp_blowfish_blockdecrypt(const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *); -static int esp_blowfish_blockencrypt(const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *); -static int esp_cast128_schedule(const struct esp_algorithm *, - struct secasvar *); -static int esp_cast128_schedlen(const struct esp_algorithm *); -static int esp_cast128_blockdecrypt(const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *); -static int esp_cast128_blockencrypt(const struct esp_algorithm *, - struct secasvar *, u_int8_t *, u_int8_t *); -#endif /* ALLCRYPTO */ static int esp_3des_schedule(const struct esp_algorithm *, struct secasvar *); static int esp_3des_schedlen(const struct esp_algorithm *); @@ -178,19 +161,6 @@ static const struct esp_algorithm null_esp = { 1, 0, esp_null_mature, 0, 2048, 0, "null", esp_common_ivlen, esp_null_decrypt, esp_null_encrypt, NULL, NULL, NULL }; -#if ALLCRYPTO -static const struct esp_algorithm blowfish_cbc = - { 8, 8, esp_cbc_mature, 40, 448, esp_blowfish_schedlen, "blowfish-cbc", - esp_common_ivlen, esp_cbc_decrypt, - esp_cbc_encrypt, esp_blowfish_schedule, - esp_blowfish_blockdecrypt, esp_blowfish_blockencrypt, }; -static const struct esp_algorithm cast128_cbc = - { 8, 8, esp_cbc_mature, 40, 128, esp_cast128_schedlen, - "cast128-cbc", - esp_common_ivlen, esp_cbc_decrypt, - esp_cbc_encrypt, esp_cast128_schedule, - esp_cast128_blockdecrypt, esp_cast128_blockencrypt, }; -#endif /* ALLCRYPTO */ static const struct esp_algorithm aes_cbc = { 16, 16, esp_cbc_mature, 128, 256, esp_aes_schedlen, "aes-cbc", @@ -202,10 +172,6 @@ static const struct esp_algorithm *esp_algorithms[] = { &des_cbc, &des3_cbc, &null_esp, -#if ALLCRYPTO - &blowfish_cbc, - &cast128_cbc, -#endif /* ALLCRYPTO */ &aes_cbc }; @@ -213,7 +179,6 @@ const struct esp_algorithm 
* esp_algorithm_lookup(idx) int idx; { - switch (idx) { case SADB_EALG_DESCBC: return &des_cbc; @@ -221,12 +186,6 @@ esp_algorithm_lookup(idx) return &des3_cbc; case SADB_EALG_NULL: return &null_esp; -#if ALLCRYPTO - case SADB_X_EALG_BLOWFISHCBC: - return &blowfish_cbc; - case SADB_X_EALG_CAST128CBC: - return &cast128_cbc; -#endif /* ALLCRYPTO */ case SADB_X_EALG_RIJNDAELCBC: return &aes_cbc; default: @@ -401,8 +360,7 @@ static int esp_des_schedlen( __unused const struct esp_algorithm *algo) { - - return sizeof(des_key_schedule); + return sizeof(des_ecb_key_schedule); } static int @@ -412,8 +370,8 @@ esp_des_schedule( { lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); - if (des_key_sched((des_cblock *)_KEYBUF(sav->key_enc), - *(des_key_schedule *)sav->sched)) + if (des_ecb_key_sched((des_cblock *)_KEYBUF(sav->key_enc), + (des_ecb_key_schedule *)sav->sched)) return EINVAL; else return 0; @@ -426,11 +384,10 @@ esp_des_blockdecrypt( u_int8_t *s, u_int8_t *d) { - /* assumption: d has a good alignment */ bcopy(s, d, sizeof(DES_LONG) * 2); des_ecb_encrypt((des_cblock *)d, (des_cblock *)d, - *(des_key_schedule *)sav->sched, DES_DECRYPT); + (des_ecb_key_schedule *)sav->sched, DES_DECRYPT); return 0; } @@ -441,11 +398,10 @@ esp_des_blockencrypt( u_int8_t *s, u_int8_t *d) { - /* assumption: d has a good alignment */ bcopy(s, d, sizeof(DES_LONG) * 2); des_ecb_encrypt((des_cblock *)d, (des_cblock *)d, - *(des_key_schedule *)sav->sched, DES_ENCRYPT); + (des_ecb_key_schedule *)sav->sched, DES_ENCRYPT); return 0; } @@ -498,9 +454,6 @@ esp_cbc_mature(sav) return 1; } break; - case SADB_X_EALG_BLOWFISHCBC: - case SADB_X_EALG_CAST128CBC: - break; case SADB_X_EALG_RIJNDAELCBC: /* allows specific key sizes only */ if (!(keylen == 128 || keylen == 192 || keylen == 256)) { @@ -515,123 +468,12 @@ esp_cbc_mature(sav) return 0; } -#if ALLCRYPTO -static int -esp_blowfish_schedlen( - __unused const struct esp_algorithm *algo) -{ - - return sizeof(BF_KEY); -} - -static int -esp_blowfish_schedule( - __unused const struct esp_algorithm *algo, - struct secasvar *sav) -{ - - lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); - BF_set_key((BF_KEY *)sav->sched, _KEYLEN(sav->key_enc), - (u_int8_t *) _KEYBUF(sav->key_enc)); - return 0; -} - -static int -esp_blowfish_blockdecrypt( - __unused const struct esp_algorithm *algo, - struct secasvar *sav, - u_int8_t *s, - u_int8_t *d) -{ - /* HOLY COW! BF_decrypt() takes values in host byteorder */ - BF_LONG t[2]; - - bcopy(s, t, sizeof(t)); - t[0] = ntohl(t[0]); - t[1] = ntohl(t[1]); - BF_decrypt(t, (BF_KEY *)sav->sched); - t[0] = htonl(t[0]); - t[1] = htonl(t[1]); - bcopy(t, d, sizeof(t)); - return 0; -} - -static int -esp_blowfish_blockencrypt( - __unused const struct esp_algorithm *algo, - struct secasvar *sav, - u_int8_t *s, - u_int8_t *d) -{ - /* HOLY COW! 
BF_encrypt() takes values in host byteorder */ - BF_LONG t[2]; - - bcopy(s, t, sizeof(t)); - t[0] = ntohl(t[0]); - t[1] = ntohl(t[1]); - BF_encrypt(t, (BF_KEY *)sav->sched); - t[0] = htonl(t[0]); - t[1] = htonl(t[1]); - bcopy(t, d, sizeof(t)); - return 0; -} - -static int -esp_cast128_schedlen( - __unused const struct esp_algorithm *algo) -{ - - return sizeof(u_int32_t) * 32; -} - -static int -esp_cast128_schedule( - __unused const struct esp_algorithm *algo, - struct secasvar *sav) -{ - lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); - set_cast128_subkey((u_int32_t *)sav->sched, (u_int8_t *) _KEYBUF(sav->key_enc), - _KEYLEN(sav->key_enc)); - return 0; -} - -static int -esp_cast128_blockdecrypt( - __unused const struct esp_algorithm *algo, - struct secasvar *sav, - u_int8_t *s, - u_int8_t *d) -{ - - if (_KEYLEN(sav->key_enc) <= 80 / 8) - cast128_decrypt_round12(d, s, (u_int32_t *)sav->sched); - else - cast128_decrypt_round16(d, s, (u_int32_t *)sav->sched); - return 0; -} - -static int -esp_cast128_blockencrypt( - __unused const struct esp_algorithm *algo, - struct secasvar *sav, - u_int8_t *s, - u_int8_t *d) -{ - - if (_KEYLEN(sav->key_enc) <= 80 / 8) - cast128_encrypt_round12(d, s, (u_int32_t *)sav->sched); - else - cast128_encrypt_round16(d, s, (u_int32_t *)sav->sched); - return 0; -} -#endif /* ALLCRYPTO */ - static int esp_3des_schedlen( __unused const struct esp_algorithm *algo) { - return sizeof(des_key_schedule) * 3; + return sizeof(des3_ecb_key_schedule); } static int @@ -639,20 +481,13 @@ esp_3des_schedule( __unused const struct esp_algorithm *algo, struct secasvar *sav) { - int error; - des_key_schedule *p; - int i; - char *k; - lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); - p = (des_key_schedule *)sav->sched; - k = _KEYBUF(sav->key_enc); - for (i = 0; i < 3; i++) { - error = des_key_sched((des_cblock *)(k + 8 * i), p[i]); - if (error) - return EINVAL; - } - return 0; + + if (des3_ecb_key_sched((des_cblock *)_KEYBUF(sav->key_enc), + (des3_ecb_key_schedule *)sav->sched)) + return EINVAL; + else + return 0; } static int @@ -662,13 +497,10 @@ esp_3des_blockdecrypt( u_int8_t *s, u_int8_t *d) { - des_key_schedule *p; - /* assumption: d has a good alignment */ - p = (des_key_schedule *)sav->sched; bcopy(s, d, sizeof(DES_LONG) * 2); - des_ecb3_encrypt((des_cblock *)d, (des_cblock *)d, - p[0], p[1], p[2], DES_DECRYPT); + des3_ecb_encrypt((des_cblock *)d, (des_cblock *)d, + (des3_ecb_key_schedule *)sav->sched, DES_DECRYPT); return 0; } @@ -679,13 +511,10 @@ esp_3des_blockencrypt( u_int8_t *s, u_int8_t *d) { - des_key_schedule *p; - /* assumption: d has a good alignment */ - p = (des_key_schedule *)sav->sched; bcopy(s, d, sizeof(DES_LONG) * 2); - des_ecb3_encrypt((des_cblock *)d, (des_cblock *)d, - p[0], p[1], p[2], DES_ENCRYPT); + des3_ecb_encrypt((des_cblock *)d, (des_cblock *)d, + (des3_ecb_key_schedule *)sav->sched, DES_ENCRYPT); return 0; } @@ -713,12 +542,12 @@ esp_cbc_decrypt(m, off, sav, algo, ivlen) int soff, doff; /* offset from the head of chain, to head of this mbuf */ int sn, dn; /* offset from the head of the mbuf, to meat */ size_t ivoff, bodyoff; - u_int8_t iv[MAXIVLEN], *ivp; - u_int8_t sbuf[MAXIVLEN], *sp; + u_int8_t iv[MAXIVLEN] __attribute__((aligned(4))), *ivp; + u_int8_t *sbuf = NULL, *sp, *sp_unaligned; u_int8_t *p, *q; struct mbuf *scut; int scutoff; - int i; + int i, result = 0; int blocklen; int derived; @@ -820,6 +649,10 @@ esp_cbc_decrypt(m, off, sav, algo, ivlen) while (s && s->m_len == 0) s = s->m_next; + // Allocate blocksized buffer for unaligned or 
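/*
 * CBC refresher for the decrypt loop that follows and the encrypt loop
 * further down: decryption computes P[i] = D(C[i]) XOR C[i-1] (with C[-1]
 * the IV), which is why the code block-decrypts in place and then XORs
 * with ivp; encryption mirrors it as C[i] = E(P[i] XOR C[i-1]).
 */
#include <stddef.h>
#include <stdint.h>

static void
cbc_decrypt_block(void (*blockdecrypt)(uint8_t *),
    uint8_t *block, const uint8_t *prev_ct, size_t blen)
{
	size_t i;

	blockdecrypt(block);             /* raw block decrypt, in place */
	for (i = 0; i < blen; i++)
		block[i] ^= prev_ct[i];  /* chain with previous ciphertext */
}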
non-contiguous access + sbuf = (u_int8_t *)_MALLOC(blocklen, M_SECA, M_DONTWAIT); + if (sbuf == NULL) + return ENOBUFS; while (soff < m->m_pkthdr.len) { /* source */ if (sn + blocklen <= s->m_len) { @@ -848,12 +681,19 @@ esp_cbc_decrypt(m, off, sav, algo, ivlen) m_freem(m); if (d0) m_freem(d0); - return ENOBUFS; + result = ENOBUFS; + goto end; } if (!d0) d0 = d; if (dp) dp->m_next = d; + + // try to make mbuf data aligned + if (!IPSEC_IS_P2ALIGNED(d->m_data)) { + m_adj(d, IPSEC_GET_P2UNALIGNED_OFS(d->m_data)); + } + d->m_len = 0; d->m_len = (M_TRAILINGSPACE(d) / blocklen) * blocklen; if (d->m_len > i) @@ -862,8 +702,22 @@ esp_cbc_decrypt(m, off, sav, algo, ivlen) } /* decrypt */ + // check input pointer alignment and use a separate aligned buffer (if sp is unaligned on 4-byte boundary). + if (IPSEC_IS_P2ALIGNED(sp)) { + sp_unaligned = NULL; + } else { + sp_unaligned = sp; + sp = sbuf; + memcpy(sp, sp_unaligned, blocklen); + } + // no need to check output pointer alignment (*algo->blockdecrypt)(algo, sav, sp, mtod(d, u_int8_t *) + dn); + // update unaligned pointers + if (!IPSEC_IS_P2ALIGNED(sp_unaligned)) { + sp = sp_unaligned; + } + /* xor */ p = ivp ? ivp : iv; q = mtod(d, u_int8_t *) + dn; @@ -895,8 +749,10 @@ esp_cbc_decrypt(m, off, sav, algo, ivlen) /* just in case */ bzero(iv, sizeof(iv)); bzero(sbuf, sizeof(sbuf)); - - return 0; +end: + if (sbuf != NULL) + FREE(sbuf, M_SECA); + return result; } static int @@ -913,12 +769,12 @@ esp_cbc_encrypt( int soff, doff; /* offset from the head of chain, to head of this mbuf */ int sn, dn; /* offset from the head of the mbuf, to meat */ size_t ivoff, bodyoff; - u_int8_t iv[MAXIVLEN], *ivp; - u_int8_t sbuf[MAXIVLEN], *sp; + u_int8_t iv[MAXIVLEN] __attribute__((aligned(4))), *ivp; + u_int8_t *sbuf = NULL, *sp, *sp_unaligned; u_int8_t *p, *q; struct mbuf *scut; int scutoff; - int i; + int i, result = 0; int blocklen; int derived; @@ -1026,6 +882,10 @@ esp_cbc_encrypt( while (s && s->m_len == 0) s = s->m_next; + // Allocate blocksized buffer for unaligned or non-contiguous access + sbuf = (u_int8_t *)_MALLOC(blocklen, M_SECA, M_DONTWAIT); + if (sbuf == NULL) + return ENOBUFS; while (soff < m->m_pkthdr.len) { /* source */ if (sn + blocklen <= s->m_len) { @@ -1054,12 +914,19 @@ esp_cbc_encrypt( m_freem(m); if (d0) m_freem(d0); - return ENOBUFS; + result = ENOBUFS; + goto end; } if (!d0) d0 = d; if (dp) dp->m_next = d; + + // try to make mbuf data aligned + if (!IPSEC_IS_P2ALIGNED(d->m_data)) { + m_adj(d, IPSEC_GET_P2UNALIGNED_OFS(d->m_data)); + } + d->m_len = 0; d->m_len = (M_TRAILINGSPACE(d) / blocklen) * blocklen; if (d->m_len > i) @@ -1074,8 +941,22 @@ esp_cbc_encrypt( q[i] ^= p[i]; /* encrypt */ + // check input pointer alignment and use a separate aligned buffer (if sp is not aligned on 4-byte boundary). 
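/*
 * Condensed form of the bounce-buffer pattern both CBC paths now share:
 * the block ciphers expect 4-byte-aligned input, mbuf data may not
 * provide it, so an unaligned source block is copied into the
 * heap-allocated sbuf, processed from there, and the walk over the mbuf
 * chain then resumes on the original pointer.
 */
#include <string.h>
#include <stdint.h>

#define IS_P2ALIGNED_4(p)       ((((uintptr_t)(p)) & 3) == 0)

static void
process_block(void (*op)(const uint8_t *src, uint8_t *dst),
    const uint8_t *sp, uint8_t *dst, uint8_t *sbuf, size_t blocklen)
{
	if (!IS_P2ALIGNED_4(sp)) {
		memcpy(sbuf, sp, blocklen);     /* bounce to aligned scratch */
		sp = sbuf;
	}
	op(sp, dst);
	/* the real loops also restore sp so it keeps walking the mbuf */
}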
+		if (IPSEC_IS_P2ALIGNED(sp)) {
+			sp_unaligned = NULL;
+		} else {
+			sp_unaligned = sp;
+			sp = sbuf;
+			memcpy(sp, sp_unaligned, blocklen);
+		}
+		// no need to check output pointer alignment
 		(*algo->blockencrypt)(algo, sav, sp, mtod(d, u_int8_t *) + dn);
 
+		// update unaligned pointers
+		if (!IPSEC_IS_P2ALIGNED(sp_unaligned)) {
+			sp = sp_unaligned;
+		}
+
 		/* next iv */
 		ivp = mtod(d, u_int8_t *) + dn;
@@ -1099,8 +980,10 @@ esp_cbc_encrypt(
-	bzero(sbuf, sizeof(sbuf));
+	bzero(sbuf, blocklen);	/* sbuf is heap-allocated now; see esp_cbc_decrypt() */
 	key_sa_stir_iv(sav);
-
-	return 0;
+end:
+	if (sbuf != NULL)
+		FREE(sbuf, M_SECA);
+	return result;
 }
 
 /*------------------------------------------------------------*/
@@ -1117,7 +1000,7 @@ esp_auth(m0, skip, length, sav, sum)
 	struct mbuf *m;
 	size_t off;
 	struct ah_algorithm_state s;
-	u_char sumbuf[AH_MAXSUMSIZE];
+	u_char sumbuf[AH_MAXSUMSIZE] __attribute__((aligned(4)));
 	const struct ah_algorithm *algo;
 	size_t siz;
 	int error;
diff --git a/bsd/netinet6/esp_input.c b/bsd/netinet6/esp_input.c
index 2052493ab..7470d023e 100644
--- a/bsd/netinet6/esp_input.c
+++ b/bsd/netinet6/esp_input.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2011 Apple Inc. All rights reserved.
 *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -66,6 +66,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -196,6 +197,9 @@ esp4_input(m, off)
 		}
 	}
 
+	/* Expect 32-bit aligned data pointer on strict-align platforms */
+	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
+
 	ip = mtod(m, struct ip *);
 	// expect udp-encap and esp packets only
 	if (ip->ip_p != IPPROTO_ESP &&
@@ -205,7 +209,7 @@ esp4_input(m, off)
 		IPSEC_STAT_INCREMENT(ipsecstat.in_inval);
 		goto bad;
 	}
-	esp = (struct esp *)(((u_int8_t *)ip) + off);
+	esp = (struct esp *)(void *)(((u_int8_t *)ip) + off);
 #ifdef _IP_VHL
 	hlen = IP_VHL_HL(ip->ip_vhl) << 2;
 #else
@@ -276,8 +280,8 @@ esp4_input(m, off)
 
 	/* check ICV */
     {
-	u_char sum0[AH_MAXSUMSIZE];
-	u_char sum[AH_MAXSUMSIZE];
+	u_char sum0[AH_MAXSUMSIZE] __attribute__((aligned(4)));
+	u_char sum[AH_MAXSUMSIZE] __attribute__((aligned(4)));
 	const struct ah_algorithm *sumalgo;
 	size_t siz;
@@ -438,14 +442,25 @@ noreplaycheck:
 	    (sav->flags & SADB_X_EXT_OLD) == 0 &&
 	    seq && sav->replay &&
 	    seq >= sav->replay->lastseq) {
-		struct udphdr *encap_uh = (__typeof__(encap_uh))((caddr_t)ip + off);
+		struct udphdr *encap_uh = (__typeof__(encap_uh))(void *)((caddr_t)ip + off);
 		if (encap_uh->uh_sport &&
 		    ntohs(encap_uh->uh_sport) != sav->remote_ike_port) {
 			sav->remote_ike_port = ntohs(encap_uh->uh_sport);
 		}
 	}
 	ip = esp4_input_strip_UDP_encap(m, off);
-	esp = (struct esp *)(((u_int8_t *)ip) + off);
+	esp = (struct esp *)(void *)(((u_int8_t *)ip) + off);
+	}
+
+	if (sav->utun_is_keepalive_fn) {
+		if (sav->utun_is_keepalive_fn(sav->utun_pcb, &m, nxt, sav->flags, (off + esplen + ivlen))) {
+			if (m) {
+				// not really bad, we just wanna exit
+				IPSEC_STAT_INCREMENT(ipsecstat.in_success);
+				m = NULL;
+			}
+			goto bad;
+		}
 	}
 
 	/* was it transmitted over the IPsec tunnel SA? */
@@ -513,6 +528,12 @@ noreplaycheck:
 		}
 	}
 
+	/*
+	 * Expect 32-bit aligned data pointer on strict-align
+	 * platforms.
+	 */
+	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
+
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/* ECN consideration. */
@@ -560,6 +581,15 @@ noreplaycheck:
 
 		/* Clear the csum flags, they can't be valid for the inner headers */
 		m->m_pkthdr.csum_flags = 0;
+
+		if (sav->utun_in_fn) {
+			if (!(sav->utun_in_fn(sav->utun_pcb, &m, ifamily == AF_INET ?
PF_INET : PF_INET6))) { + m = NULL; + // we just wanna exit since packet has been completely processed + goto bad; + } + } + if (proto_input(ifamily == AF_INET ? PF_INET : PF_INET6, m) != 0) goto bad; @@ -633,7 +663,7 @@ noreplaycheck: } ip = mtod(m, struct ip *); } - udp = (struct udphdr *)(((u_int8_t *)ip) + off); + udp = (struct udphdr *)(void *)(((u_int8_t *)ip) + off); lck_mtx_lock(sadb_mutex); if (sav->natt_encapsulated_src_port == 0) { @@ -652,6 +682,14 @@ noreplaycheck: struct ip *, ip, struct ifnet *, m->m_pkthdr.rcvif, struct ip *, ip, struct ip6_hdr *, NULL); + if (sav->utun_in_fn) { + if (!(sav->utun_in_fn(sav->utun_pcb, &m, PF_INET))) { + m = NULL; + // we just wanna exit since packet has been completely processed + goto bad; + } + } + ip_proto_dispatch_in(m, off, nxt, 0); } else m_freem(m); @@ -708,7 +746,7 @@ esp6_input(struct mbuf **mp, int *offp, int proto) #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, ESPMAXLEN, {return IPPROTO_DONE;}); - esp = (struct esp *)(mtod(m, caddr_t) + off); + esp = (struct esp *)(void *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(esp, struct esp *, m, off, ESPMAXLEN); if (esp == NULL) { @@ -716,6 +754,9 @@ esp6_input(struct mbuf **mp, int *offp, int proto) return IPPROTO_DONE; } #endif + /* Expect 32-bit data aligned pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + ip6 = mtod(m, struct ip6_hdr *); if (ntohs(ip6->ip6_plen) == 0) { @@ -790,8 +831,8 @@ esp6_input(struct mbuf **mp, int *offp, int proto) /* check ICV */ { - u_char sum0[AH_MAXSUMSIZE]; - u_char sum[AH_MAXSUMSIZE]; + u_char sum0[AH_MAXSUMSIZE] __attribute__((aligned(4))); + u_char sum[AH_MAXSUMSIZE] __attribute__((aligned(4))); const struct ah_algorithm *sumalgo; size_t siz; @@ -926,6 +967,17 @@ noreplaycheck: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - taillen); + if (sav->utun_is_keepalive_fn) { + if (sav->utun_is_keepalive_fn(sav->utun_pcb, &m, nxt, sav->flags, (off + esplen + ivlen))) { + if (m) { + // not really bad, we just wanna exit + IPSEC_STAT_INCREMENT(ipsec6stat.in_success); + m = NULL; + } + goto bad; + } + } + /* was it transmitted over the IPsec tunnel SA? */ if (ipsec6_tunnel_validate(m, off + esplen + ivlen, nxt, sav)) { ifaddr_t ifa; @@ -993,6 +1045,14 @@ noreplaycheck: } } + if (sav->utun_in_fn) { + if (!(sav->utun_in_fn(sav->utun_pcb, &m, PF_INET6))) { + m = NULL; + // we just wanna exit since packet has been completely processed + goto bad; + } + } + if (proto_input(PF_INET6, m) != 0) goto bad; nxt = IPPROTO_DONE; @@ -1091,6 +1151,14 @@ noreplaycheck: IPSEC_STAT_INCREMENT(ipsec6stat.in_nomem); goto bad; } + + if (sav->utun_in_fn) { + if (!(sav->utun_in_fn(sav->utun_pcb, &m, PF_INET6))) { + m = NULL; + // we just wanna exit since packet has been completely processed + goto bad; + } + } } *offp = off; @@ -1183,7 +1251,7 @@ esp6_ctlinput(cmd, sa, d) m_copydata(m, off, sizeof(esp), (caddr_t)&esp); espp = &esp; } else - espp = (struct newesp*)(mtod(m, caddr_t) + off); + espp = (struct newesp*)(void *)(mtod(m, caddr_t) + off); if (cmd == PRC_MSGSIZE) { int valid = 0; @@ -1193,7 +1261,7 @@ esp6_ctlinput(cmd, sa, d) * the address in the ICMP message payload. 
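
Stepping back to the hooks threaded through both input paths above: utun_is_keepalive_fn and utun_in_fn are new per-SA callbacks that let a utun interface claim NAT-T keepalives and freshly decrypted packets before normal protocol dispatch. Note the polarity: the code treats a zero return from utun_in_fn as "packet fully processed" and then forgets its mbuf pointer. A sketch of that ownership contract (types and names here are illustrative, not the kernel's API):

#include <stdlib.h>

struct pkt { int len; };

/*
 * Returns nonzero if the packet should continue down the normal path;
 * zero means the hook consumed it (mirroring the !(sav->utun_in_fn(...))
 * test in the hunks above).
 */
typedef int (*input_hook_t)(void *ctx, struct pkt **pp);

static void
deliver(struct pkt *p, input_hook_t hook, void *ctx)
{
	if (hook != NULL && !hook(ctx, &p)) {
		/* Hook took ownership; p must not be touched again. */
		return;
	}
	/* ... normal protocol input would run here ... */
	free(p);
}
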
*/ sa6_src = ip6cp->ip6c_src; - sa6_dst = (struct sockaddr_in6 *)sa; + sa6_dst = (struct sockaddr_in6 *)(void *)sa; sav = key_allocsa(AF_INET6, (caddr_t)&sa6_src->sin6_addr, (caddr_t)&sa6_dst->sin6_addr, diff --git a/bsd/netinet6/esp_output.c b/bsd/netinet6/esp_output.c index 8d16d2c62..9f6c0e0f0 100644 --- a/bsd/netinet6/esp_output.c +++ b/bsd/netinet6/esp_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -457,7 +457,7 @@ esp_output(m, nexthdrp, md, af, sav) m->m_pkthdr.len += esphlen; if (udp_encapsulate) { udp = mtod(n, struct udphdr *); - esp = (struct esp *)((caddr_t)udp + sizeof(struct udphdr)); + esp = (struct esp *)(void *)((caddr_t)udp + sizeof(struct udphdr)); } else { esp = mtod(n, struct esp *); } @@ -468,7 +468,7 @@ esp_output(m, nexthdrp, md, af, sav) esp = mtod(md, struct esp *); if (udp_encapsulate) { udp = mtod(md, struct udphdr *); - esp = (struct esp *)((caddr_t)udp + sizeof(struct udphdr)); + esp = (struct esp *)(void *)((caddr_t)udp + sizeof(struct udphdr)); } else { esp = mtod(md, struct esp *); } @@ -726,7 +726,7 @@ esp_output(m, nexthdrp, md, af, sav) { const struct ah_algorithm *aalgo; - u_char authbuf[AH_MAXSUMSIZE]; + u_char authbuf[AH_MAXSUMSIZE] __attribute__((aligned(4))); u_char *p; size_t siz; #if INET diff --git a/bsd/netinet6/esp_rijndael.c b/bsd/netinet6/esp_rijndael.c index 0f3ce7c27..af5ddff1f 100644 --- a/bsd/netinet6/esp_rijndael.c +++ b/bsd/netinet6/esp_rijndael.c @@ -64,6 +64,7 @@ #include #include #include +#include #include @@ -74,13 +75,14 @@ #include #include -#include +#include #include #include #define AES_BLOCKLEN 16 +#define MAX_SBUF_LEN 2000 extern lck_mtx_t *sadb_mutex; @@ -149,8 +151,8 @@ esp_cbc_decrypt_aes(m, off, sav, algo, ivlen) int soff; /* offset from the head of chain, to head of this mbuf */ int sn, dn; /* offset from the head of the mbuf, to meat */ size_t ivoff, bodyoff; - u_int8_t iv[AES_BLOCKLEN], *dptr; - u_int8_t sbuf[AES_BLOCKLEN], *sp; + u_int8_t iv[AES_BLOCKLEN] __attribute__((aligned(4))), *dptr; + u_int8_t sbuf[MAX_SBUF_LEN] __attribute__((aligned(4))), *sp, *sp_unaligned; struct mbuf *scut; int scutoff; int i, len; @@ -251,6 +253,12 @@ esp_cbc_decrypt_aes(m, off, sav, algo, ivlen) d0 = d; if (dp) dp->m_next = d; + + // try to make mbuf data aligned + if (!IPSEC_IS_P2ALIGNED(d->m_data)) { + m_adj(d, IPSEC_GET_P2UNALIGNED_OFS(d->m_data)); + } + d->m_len = M_TRAILINGSPACE(d); d->m_len -= d->m_len % AES_BLOCKLEN; if (d->m_len > i) @@ -264,9 +272,23 @@ esp_cbc_decrypt_aes(m, off, sav, algo, ivlen) len = d->m_len - dn; /* decrypt */ + // check input pointer alignment and use a separate aligned buffer (if sp is unaligned on 4-byte boundary). 
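
Before falling back to copying, both AES paths first try to fix alignment at the destination: m_adj() trims the head of a freshly allocated mbuf so its data pointer lands on a 4-byte boundary. IPSEC_GET_P2UNALIGNED_OFS presumably evaluates to the distance from the pointer to the next boundary; a runnable sketch of that arithmetic under that assumption (macro names below are invented):

#include <assert.h>
#include <stdint.h>

#define P2ALIGN	4	/* target alignment, matching the 4-byte checks here */

/* Bytes to skip so that p + offset is P2ALIGN-aligned (0 if already aligned). */
#define P2UNALIGNED_OFS(p) \
	((P2ALIGN - ((uintptr_t)(p) & (P2ALIGN - 1))) & (P2ALIGN - 1))

int
main(void)
{
	char buf[16];

	for (int i = 0; i < 8; i++) {
		char *p = buf + i;
		assert((((uintptr_t)p + P2UNALIGNED_OFS(p)) & (P2ALIGN - 1)) == 0);
	}
	return 0;
}

Trimming is cheap here because the destination mbuf was just allocated and holds no data yet; once real bytes exist, only the copy fallback shown below remains.
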
+		if (IPSEC_IS_P2ALIGNED(sp)) {
+			sp_unaligned = NULL;
+		} else {
+			sp_unaligned = sp;
+			sp = sbuf;
+			memcpy(sp, sp_unaligned, len);
+		}
+		// no need to check output pointer alignment
 		aes_decrypt_cbc(sp, iv, len >> 4, dptr + dn,
 		    (aes_decrypt_ctx*)(&(((aes_ctx*)sav->sched)->decrypt)));
 
+		// update unaligned pointers
+		if (!IPSEC_IS_P2ALIGNED(sp_unaligned)) {
+			sp = sp_unaligned;
+		}
+
 		/* update offsets */
 		sn += len;
 		dn += len;
@@ -309,8 +331,9 @@ esp_cbc_encrypt_aes(
 	int soff;	/* offset from the head of chain, to head of this mbuf */
 	int sn, dn;	/* offset from the head of the mbuf, to meat */
 	size_t ivoff, bodyoff;
-	u_int8_t *ivp, *dptr;
-	u_int8_t sbuf[AES_BLOCKLEN], *sp;
+	u_int8_t *ivp, *dptr, *ivp_unaligned;
+	u_int8_t sbuf[MAX_SBUF_LEN] __attribute__((aligned(4))), *sp, *sp_unaligned;
+	u_int8_t ivp_aligned_buf[AES_BLOCKLEN] __attribute__((aligned(4)));
 	struct mbuf *scut;
 	int scutoff;
 	int i, len;
@@ -412,6 +435,11 @@ esp_cbc_encrypt_aes(
 		if (dp)
 			dp->m_next = d;
 
+		// try to make mbuf data aligned
+		if (!IPSEC_IS_P2ALIGNED(d->m_data)) {
+			m_adj(d, IPSEC_GET_P2UNALIGNED_OFS(d->m_data));
+		}
+
 		d->m_len = M_TRAILINGSPACE(d);
 		d->m_len -= d->m_len % AES_BLOCKLEN;
 		if (d->m_len > i)
@@ -425,9 +453,34 @@ esp_cbc_encrypt_aes(
 		len = d->m_len - dn;
 
 		/* encrypt */
+		// check input pointer alignment and use a separate aligned buffer (if sp is not aligned on 4-byte boundary).
+		if (IPSEC_IS_P2ALIGNED(sp)) {
+			sp_unaligned = NULL;
+		} else {
+			sp_unaligned = sp;
+			sp = sbuf;
+			memcpy(sp, sp_unaligned, len);
+		}
+		// check ivp pointer alignment and use a separate aligned buffer (if ivp is not aligned on 4-byte boundary).
+		if (IPSEC_IS_P2ALIGNED(ivp)) {
+			ivp_unaligned = NULL;
+		} else {
+			ivp_unaligned = ivp;
+			ivp = ivp_aligned_buf;
+			memcpy(ivp, ivp_unaligned, AES_BLOCKLEN);	/* only one block of IV is needed; copying len bytes would overrun ivp_aligned_buf */
+		}
+		// no need to check output pointer alignment
 		aes_encrypt_cbc(sp, ivp, len >> 4, dptr + dn,
 		    (aes_encrypt_ctx*)(&(((aes_ctx*)sav->sched)->encrypt)));
 
+		// update unaligned pointers
+		if (!IPSEC_IS_P2ALIGNED(sp_unaligned)) {
+			sp = sp_unaligned;
+		}
+		if (!IPSEC_IS_P2ALIGNED(ivp_unaligned)) {
+			ivp = ivp_unaligned;
+		}
+
 		/* update offsets */
 		sn += len;
 		dn += len;
diff --git a/bsd/netinet6/frag6.c b/bsd/netinet6/frag6.c
index b6b68b920..00174a628 100644
--- a/bsd/netinet6/frag6.c
+++ b/bsd/netinet6/frag6.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
 *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -61,6 +61,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -180,6 +181,9 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
 	struct sockaddr_in6 *dst;
 #endif
 
+	/* Expect 32-bit aligned data pointer on strict-align platforms */
+	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
+
 	ip6 = mtod(m, struct ip6_hdr *);
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, offset, sizeof(struct ip6_frag), return IPPROTO_DONE);
diff --git a/bsd/netinet6/icmp6.c b/bsd/netinet6/icmp6.c
index 43a61a6d2..cdf92a564 100644
--- a/bsd/netinet6/icmp6.c
+++ b/bsd/netinet6/icmp6.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -98,6 +98,7 @@ #include #include #include +#include #include #include #include @@ -106,6 +107,7 @@ #include #include #include +#include #include #include @@ -434,7 +436,7 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) struct icmp6_hdr *icmp6, *nicmp6; int off = *offp; int icmp6len = m->m_pkthdr.len - *offp; - int code, sum, noff; + int code, sum, noff, proxy = 0; ifp = m->m_pkthdr.rcvif; @@ -443,11 +445,13 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) /* m might change if M_LOOP. So, call mtod after this */ #endif + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ - ip6 = mtod(m, struct ip6_hdr *); if (icmp6len < sizeof(struct icmp6_hdr)) { icmp6stat.icp6s_tooshort++; @@ -466,9 +470,16 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) in6_multihead_lock_done(); if (inm == NULL) { - ip6stat.ip6s_notmember++; - in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); - goto freeit; + /* + * Don't discard if this is a Neighbor Solicitation + * that needs to be proxied (see check down below.) + */ + if (!(m->m_pkthdr.aux_flags & MAUXF_PROXY_DST)) { + ip6stat.ip6s_notmember++; + in6_ifstat_inc(m->m_pkthdr.rcvif, + ifs6_in_discard); + goto freeit; + } } else { IN6M_REMREF(inm); } @@ -496,23 +507,22 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) goto freeit; } -#if defined(NFAITH) && 0 < NFAITH - if (faithprefix(&ip6->ip6_dst)) { + if (m->m_pkthdr.aux_flags & MAUXF_PROXY_DST) { /* - * Deliver very specific ICMP6 type only. - * This is important to deliver TOOBIG. Otherwise PMTUD - * will not work. + * This is the special case of proxying NS (dst is either + * solicited-node multicast or unicast); process it locally + * but don't deliver it to sockets. It practically lets us + * steer the packet to nd6_prproxy_ns_input, where more + * specific tests and actions will be taken. 
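
Where the old NFAITH block stood, icmp6_input() now keys off MAUXF_PROXY_DST: packets whose destination we merely proxy for are no longer dropped as non-members, but only a single ICMPv6 type is allowed through, Neighbor Solicitation, and a proxy flag later suppresses socket delivery. The filter reduces to roughly this (the flag value is illustrative; the type value is the standard RFC 4861 assignment):

#include <stdbool.h>

#define MAUXF_PROXY_DST		0x0010	/* illustrative value */
#define ND_NEIGHBOR_SOLICIT	135	/* ICMPv6 type, RFC 4861 */

/* Returns true if the message should be processed; *proxy set if only as proxy. */
static bool
proxy_filter(unsigned aux_flags, int icmp6_type, bool *proxy)
{
	*proxy = false;
	if (aux_flags & MAUXF_PROXY_DST) {
		if (icmp6_type != ND_NEIGHBOR_SOLICIT)
			return false;	/* anything else: freeit */
		*proxy = true;		/* handle in nd6_prproxy_ns_input, no sockets */
	}
	return true;
}
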
*/ switch (icmp6->icmp6_type) { - case ICMP6_DST_UNREACH: - case ICMP6_PACKET_TOO_BIG: - case ICMP6_TIME_EXCEEDED: + case ND_NEIGHBOR_SOLICIT: + proxy = 1; break; default: goto freeit; } } -#endif icmp6stat.icp6s_inhist[icmp6->icmp6_type]++; icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_msg); @@ -658,8 +668,12 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) sizeof(*nicmp6)); noff = off; } - nicmp6->icmp6_type = ICMP6_ECHO_REPLY; - nicmp6->icmp6_code = 0; + if(nicmp6 == NULL) + panic("nicmp6 is NULL in %s, which isn't good!\n", __FUNCTION__); + else { + nicmp6->icmp6_type = ICMP6_ECHO_REPLY; + nicmp6->icmp6_code = 0; + } if (n) { icmp6stat.icp6s_reflect++; icmp6stat.icp6s_outhist[ICMP6_ECHO_REPLY]++; @@ -919,12 +933,13 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) goto freeit; } rate_limit_checked: - /* deliver the packet to appropriate sockets */ - icmp6_rip6_input(&m, *offp); - - return IPPROTO_DONE; + /* deliver the packet to appropriate sockets (unless proxying) */ + if (!proxy) { + icmp6_rip6_input(&m, *offp); + return IPPROTO_DONE; + } - freeit: +freeit: m_freem(m); return IPPROTO_DONE; } @@ -1051,7 +1066,7 @@ icmp6_notify_error(m, off, icmp6len, code) /* just ignore a bogus header */ if ((rth0->ip6r0_len % 2) == 0 && (hops = rth0->ip6r0_len/2)) - finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1); + finaldst = (struct in6_addr *)(void *)(rth0 + 1) + (hops - 1); } eoff += rthlen; nxt = rth->ip6r_nxt; @@ -1764,7 +1779,7 @@ ni6_addrs(ni6, ifpp, subj) case ICMP6_NI_SUBJ_IPV6: if (subj == NULL) /* must be impossible... */ return(0); - subj_ip6 = (struct sockaddr_in6 *)subj; + subj_ip6 = (struct sockaddr_in6 *)(void *)subj; break; default: /* @@ -2007,7 +2022,7 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) sizeof(struct in6_addr)); /* XXX: KAME link-local hack; remove ifindex */ if (IN6_IS_ADDR_LINKLOCAL(&ifa6->ia_addr.sin6_addr)) - ((struct in6_addr *)cp)->s6_addr16[1] = 0; + ((struct in6_addr *)(void *)cp)->s6_addr16[1] = 0; cp += sizeof(struct in6_addr); resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); @@ -2067,9 +2082,9 @@ icmp6_rip6_input(mp, off) rip6src.sin6_family = AF_INET6; rip6src.sin6_len = sizeof(struct sockaddr_in6); rip6src.sin6_addr = ip6->ip6_src; - if (sa6_recoverscope(&rip6src)) + if (sa6_recoverscope(&rip6src, TRUE)) return (IPPROTO_DONE); - + lck_rw_lock_shared(ripcbinfo.mtx); LIST_FOREACH(in6p, &ripcb, inp_list) { @@ -2141,8 +2156,7 @@ error: m_freem(m); m_freem(opts); ip6stat.ip6s_delivered--; - return IPPROTO_DONE; - + return IPPROTO_DONE; } /* @@ -2162,11 +2176,15 @@ icmp6_reflect(m, off) int type, code; struct ifnet *outif = NULL; struct sockaddr_in6 sa6_src, sa6_dst; + struct nd_ifinfo *ndi; u_int32_t oflow; - struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + struct ip6_out_args ip6oa = + { IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR }; - if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) + if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) { ip6oa.ip6oa_boundif = m->m_pkthdr.rcvif->if_index; + ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; + } /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { @@ -2284,6 +2302,11 @@ icmp6_reflect(m, off) sin6.sin6_addr = ip6->ip6_dst; /* zone ID should be embedded */ bzero(&ro, sizeof(ro)); + /* + * in6_selectsrc() might return outif with its reference held + * even in the error case, so we always need to release it + * if non-NULL. 
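
The comment added here records an easy-to-miss contract: in6_selectsrc() may hand back outif with an interface reference held even when source selection fails, so every exit path must release it. The rule in isolation, as a self-contained sketch (all names and stubs below are illustrative):

#include <stddef.h>

struct ifnet { int refcnt; };

static void
ifnet_release(struct ifnet *ifp)
{
	ifp->refcnt--;
}

/* Stand-in for in6_selectsrc(): can fail and still return a referenced ifp. */
static struct ifnet *
select_src(struct ifnet *candidate, int *errp)
{
	candidate->refcnt++;
	*errp = 1;	/* pretend selection failed anyway */
	return candidate;
}

static int
use_selected_source(struct ifnet *candidate)
{
	int error = 0;
	struct ifnet *outif = select_src(candidate, &error);

	if (error != 0) {
		if (outif != NULL)
			ifnet_release(outif);	/* drop the ref on failure too */
		return error;
	}
	/* ... use outif ... */
	ifnet_release(outif);
	return 0;
}
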
+ */ src = in6_selectsrc(&sin6, NULL, NULL, &ro, &outif, &src_storage, ip6oa.ip6oa_boundif, &e); if (ro.ro_rt) @@ -2306,11 +2329,19 @@ icmp6_reflect(m, off) } ip6->ip6_nxt = IPPROTO_ICMPV6; lck_rw_lock_shared(nd_if_rwlock); - if (outif) - ip6->ip6_hlim = ND_IFINFO(outif)->chlim; - if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_index < nd_ifinfo_indexlim) { + if (outif != NULL && (ndi = ND_IFINFO(outif)) != NULL && + ndi->initialized) { + lck_mtx_lock(&ndi->lock); + ip6->ip6_hlim = ndi->chlim; + lck_mtx_unlock(&ndi->lock); + } + if (m->m_pkthdr.rcvif != NULL && + (ndi = ND_IFINFO(m->m_pkthdr.rcvif)) != NULL && + ndi->initialized) { /* XXX: This may not be the outgoing interface */ - ip6->ip6_hlim = nd_ifinfo[m->m_pkthdr.rcvif->if_index].chlim; + lck_mtx_lock(&ndi->lock); + ip6->ip6_hlim = ndi->chlim; + lck_mtx_unlock(&ndi->lock); } else { ip6->ip6_hlim = ip6_defhlim; } @@ -2335,6 +2366,9 @@ icmp6_reflect(m, off) ifnet_release(outif); outif = NULL; } + m->m_pkthdr.rcvif = NULL; + m->m_pkthdr.csum_data = 0; + m->m_pkthdr.csum_flags = 0; ip6_output(m, NULL, NULL, IPV6_OUTARGS, NULL, &outif, &ip6oa); if (outif != NULL) { icmp6_ifoutstat_inc(outif, type, code); @@ -2385,8 +2419,11 @@ icmp6_redirect_input(m, off) if (!m || !ifp) return; - /* XXX if we are router, we don't update route by icmp6 redirect */ - if (ip6_forwarding) + /* + * If we are an advertising router on this interface, + * don't update route by icmp6 redirect. + */ + if (ifp->if_eflags & IFEF_IPV6_ROUTER) goto freeit; if (!icmp6_rediraccept) goto freeit; @@ -2446,7 +2483,8 @@ icmp6_redirect_input(m, off) goto bad; } - gw6 = &(((struct sockaddr_in6 *)rt->rt_gateway)->sin6_addr); + gw6 = &(((struct sockaddr_in6 *)(void *) + rt->rt_gateway)->sin6_addr); if (bcmp(&src6, gw6, sizeof(struct in6_addr)) != 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " @@ -2551,7 +2589,7 @@ icmp6_redirect_input(m, off) sdst.sin6_family = AF_INET6; sdst.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); - + /* * Radar 6843900 * Release the IPv6 domain lock because we are going to take domain_proto_mtx @@ -2591,7 +2629,8 @@ icmp6_redirect_output(m0, rt) u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; - struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + struct ip6_out_args ip6oa = + { IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR }; icmp6_errcount(&icmp6stat.icp6s_outerrhist, ND_REDIRECT, 0); @@ -2602,8 +2641,11 @@ icmp6_redirect_output(m0, rt) if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp)) goto fail; - /* if we are not router, we don't send icmp6 redirect */ - if (!ip6_forwarding || ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD)) + /* + * If we are not a router to begin with, or not an advertising + * router on this interface, don't send icmp6 redirect. + */ + if (!ip6_forwarding || !(ifp->if_eflags & IFEF_IPV6_ROUTER)) goto fail; /* @@ -2672,7 +2714,7 @@ icmp6_redirect_output(m0, rt) /* get ip6 linklocal address for the router. 
*/ if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) { struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)rt->rt_gateway; + sin6 = (struct sockaddr_in6 *)(void *)rt->rt_gateway; router_ll6 = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(router_ll6)) router_ll6 = (struct in6_addr *)NULL; @@ -2747,8 +2789,8 @@ icmp6_redirect_output(m0, rt) if (!(rt_router->rt_flags & RTF_GATEWAY) && (rt_router->rt_flags & RTF_LLINFO) && (rt_router->rt_gateway->sa_family == AF_LINK) && - (sdl = (struct sockaddr_dl *)rt_router->rt_gateway) && - sdl->sdl_alen) { + (sdl = (struct sockaddr_dl *)(void *) + rt_router->rt_gateway) && sdl->sdl_alen) { nd_opt = (struct nd_opt_hdr *)p; nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; @@ -2758,7 +2800,7 @@ icmp6_redirect_output(m0, rt) } RT_REMREF_LOCKED(rt_router); RT_UNLOCK(rt_router); - } + } nolladdropt:; @@ -2863,6 +2905,7 @@ noredhdropt:; #endif /*IPSEC*/ ip6oa.ip6oa_boundif = ifp->if_index; + ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; ip6_output(m, NULL, NULL, IPV6_OUTARGS, NULL, &outif, &ip6oa); if (outif) { @@ -2972,7 +3015,7 @@ icmp6_ctloutput(so, sopt) int icmp6_dgram_ctloutput(struct socket *so, struct sockopt *sopt) { - if (so->so_uid == 0) + if (kauth_cred_issuser(so->so_cred)) return icmp6_ctloutput(so, sopt); if (sopt->sopt_level == IPPROTO_ICMPV6) { @@ -2983,14 +3026,13 @@ icmp6_dgram_ctloutput(struct socket *so, struct sockopt *sopt) return EPERM; } } - + if (sopt->sopt_level != IPPROTO_IPV6) return EINVAL; - + switch (sopt->sopt_name) { case IPV6_UNICAST_HOPS: case IPV6_CHECKSUM: - case IPV6_FAITH: case IPV6_V6ONLY: case IPV6_USE_MIN_MTU: case IPV6_RECVRTHDR: @@ -3020,11 +3062,9 @@ icmp6_dgram_ctloutput(struct socket *so, struct sockopt *sopt) case IPV6_NO_IFT_CELLULAR: return ip6_ctloutput(so, sopt); - + default: return EPERM; - - } } @@ -3036,11 +3076,12 @@ icmp6_dgram_send(struct socket *so, int flags, struct mbuf *m, int error = 0; struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 tmp; - struct sockaddr_in6 *dst = (struct sockaddr_in6 *)nam; + struct sockaddr_in6 *dst = (struct sockaddr_in6 *)(void *)nam; struct icmp6_hdr *icmp6; - if (so->so_uid == 0) - return rip6_output(m, so, (struct sockaddr_in6 *) nam, control, 0); + if (kauth_cred_issuser(so->so_cred)) + return rip6_output(m, so, (struct sockaddr_in6 *)(void *)nam, + control, 0); /* always copy sockaddr to avoid overwrites */ if (so->so_state & SS_ISCONNECTED) { @@ -3060,7 +3101,7 @@ icmp6_dgram_send(struct socket *so, int flags, struct mbuf *m, m_freem(m); return ENOTCONN; } - tmp = *(struct sockaddr_in6 *)nam; + tmp = *(struct sockaddr_in6 *)(void *)nam; dst = &tmp; } @@ -3074,7 +3115,7 @@ icmp6_dgram_send(struct socket *so, int flags, struct mbuf *m, goto bad; } icmp6 = mtod(m, struct icmp6_hdr *); - + /* * Allow only to send echo request and node information request * See RFC 2463 for Echo Request Message format @@ -3097,7 +3138,8 @@ icmp6_dgram_send(struct socket *so, int flags, struct mbuf *m, } #endif - return rip6_output(m, so, (struct sockaddr_in6 *) nam, control, 0); + return rip6_output(m, so, (struct sockaddr_in6 *)(void *)nam, + control, 0); bad: m_freem(m); return error; @@ -3113,10 +3155,10 @@ icmp6_dgram_attach(struct socket *so, int proto, struct proc *p) inp = sotoinpcb(so); if (inp) panic("icmp6_dgram_attach"); - + if (proto != IPPROTO_ICMPV6) return EINVAL; - + error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return error; diff --git a/bsd/netinet6/in6.c b/bsd/netinet6/in6.c index f11a99041..8a39cc647 100644 --- 
a/bsd/netinet6/in6.c +++ b/bsd/netinet6/in6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2011 Apple Inc. All rights reserved. + * Copyright (c) 2003-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -170,8 +170,10 @@ const struct in6_addr in6mask128 = IN6MASK128; const struct sockaddr_in6 sa6_any = {sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0}; -static int in6_lifaddr_ioctl(struct socket *, u_long, caddr_t, +static int in6_lifaddr_ioctl(struct socket *, u_long, struct if_laddrreq *, struct ifnet *, struct proc *); +static int in6_autoconf(struct ifnet *, int); +static int in6_setrouter(struct ifnet *, int); static int in6_ifinit(struct ifnet *, struct in6_ifaddr *, struct sockaddr_in6 *, int); static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *); @@ -183,6 +185,8 @@ static void in6_ifaddr_trace(struct ifaddr *, int); static struct in6_aliasreq *in6_aliasreq_to_native(void *, int, struct in6_aliasreq *); +static void in6_ifaddr_set_dadprogress(struct in6_ifaddr *); + extern lck_mtx_t *nd6_mutex; extern int in6_init2done; @@ -483,7 +487,7 @@ in6_aliasreq_to_native(void *data, int data_is_64, struct in6_aliasreq *dst) { #if defined(__LP64__) if (data_is_64) - dst = data; + bcopy(data, dst, sizeof (*dst)); else in6_aliasreq_32_to_64((struct in6_aliasreq_32 *)data, (struct in6_aliasreq_64 *)dst); @@ -492,147 +496,157 @@ in6_aliasreq_to_native(void *data, int data_is_64, struct in6_aliasreq *dst) in6_aliasreq_64_to_32((struct in6_aliasreq_64 *)data, (struct in6_aliasreq_32 *)dst); else - dst = data; + bcopy(data, dst, sizeof (*dst)); #endif /* __LP64__ */ return (dst); } -#define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa)) -#define ia62ifa(ia6) (&((ia6)->ia_ifa)) +#define ifa2ia6(ifa) ((struct in6_ifaddr *)(void *)(ifa)) int in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct proc *p) { - struct in6_ifreq *ifr = (struct in6_ifreq *)data; + struct in6_aliasreq sifra, *ifra = NULL; struct in6_ifaddr *ia = NULL; - struct in6_aliasreq sifra; - struct in6_aliasreq *ifra = NULL; - struct sockaddr_in6 *sa6; + struct sockaddr_in6 sin6, *sa6 = NULL; int index, privileged, error = 0; + u_int32_t ifru_scope_id[16]; struct timeval timenow; int p64 = proc_is64bit(p); getmicrotime(&timenow); privileged = (proc_suser(p) == 0); -#if MROUTING switch (cmd) { - case SIOCGETSGCNT_IN6: - case SIOCGETMIFCNT_IN6_32: - case SIOCGETMIFCNT_IN6_64: +#if MROUTING + case SIOCGETSGCNT_IN6: /* struct sioc_sg_req6 */ + case SIOCGETMIFCNT_IN6_32: /* struct sioc_mif_req6_32 */ + case SIOCGETMIFCNT_IN6_64: /* struct sioc_mif_req6_64 */ return (mrt6_ioctl(cmd, data)); - } + /* NOTREACHED */ #endif - switch(cmd) { - case SIOCAADDRCTL_POLICY: - case SIOCDADDRCTL_POLICY: - if (!privileged) + case SIOCAADDRCTL_POLICY: /* struct in6_addrpolicy */ + case SIOCDADDRCTL_POLICY: /* struct in6_addrpolicy */ + if (!privileged) return (EPERM); return (in6_src_ioctl(cmd, data)); - } + /* NOTREACHED */ - switch (cmd) { - case SIOCDRADD_IN6_32: - case SIOCDRADD_IN6_64: - case SIOCDRDEL_IN6_32: - case SIOCDRDEL_IN6_64: + case SIOCDRADD_IN6_32: /* struct in6_defrouter_32 */ + case SIOCDRADD_IN6_64: /* struct in6_defrouter_64 */ + case SIOCDRDEL_IN6_32: /* struct in6_defrouter_32 */ + case SIOCDRDEL_IN6_64: /* struct in6_defrouter_64 */ if (!privileged) return (EPERM); return (defrtrlist_ioctl(cmd, data)); + /* NOTREACHED */ } if (ifp == NULL) return (EOPNOTSUPP); switch (cmd) { - case SIOCAUTOCONF_START: - case SIOCAUTOCONF_STOP: - case SIOCLL_START_32: - case 
SIOCLL_START_64: - case SIOCLL_STOP: - case SIOCPROTOATTACH_IN6_32: - case SIOCPROTOATTACH_IN6_64: - case SIOCPROTODETACH_IN6: + case SIOCAUTOCONF_START: /* struct in6_ifreq */ + case SIOCAUTOCONF_STOP: /* struct in6_ifreq */ + case SIOCLL_START_32: /* struct in6_aliasreq_32 */ + case SIOCLL_START_64: /* struct in6_aliasreq_64 */ + case SIOCLL_STOP: /* struct in6_ifreq */ + case SIOCSETROUTERMODE_IN6: /* struct in6_ifreq */ + case SIOCPROTOATTACH_IN6_32: /* struct in6_aliasreq_32 */ + case SIOCPROTOATTACH_IN6_64: /* struct in6_aliasreq_64 */ + case SIOCPROTODETACH_IN6: /* struct in6_ifreq */ if (!privileged) return (EPERM); break; - case SIOCSNDFLUSH_IN6: - case SIOCSPFXFLUSH_IN6: - case SIOCSRTRFLUSH_IN6: - case SIOCSDEFIFACE_IN6_32: - case SIOCSDEFIFACE_IN6_64: - case SIOCSIFINFO_FLAGS: + + case SIOCSNDFLUSH_IN6: /* struct in6_ifreq */ + case SIOCSPFXFLUSH_IN6: /* struct in6_ifreq */ + case SIOCSRTRFLUSH_IN6: /* struct in6_ifreq */ + case SIOCSDEFIFACE_IN6_32: /* struct in6_ndifreq_32 */ + case SIOCSDEFIFACE_IN6_64: /* struct in6_ndifreq_64 */ + case SIOCSIFINFO_FLAGS: /* struct in6_ndireq */ if (!privileged) return (EPERM); - /* fall through */ - case OSIOCGIFINFO_IN6: - case SIOCGIFINFO_IN6: - case SIOCGDRLST_IN6_32: - case SIOCGDRLST_IN6_64: - case SIOCGPRLST_IN6_32: - case SIOCGPRLST_IN6_64: - case SIOCGNBRINFO_IN6_32: - case SIOCGNBRINFO_IN6_64: - case SIOCGDEFIFACE_IN6_32: - case SIOCGDEFIFACE_IN6_64: + /* FALLTHRU */ + case OSIOCGIFINFO_IN6: /* struct in6_ondireq */ + case SIOCGIFINFO_IN6: /* struct in6_ondireq */ + case SIOCGDRLST_IN6_32: /* struct in6_drlist_32 */ + case SIOCGDRLST_IN6_64: /* struct in6_drlist_64 */ + case SIOCGPRLST_IN6_32: /* struct in6_prlist_32 */ + case SIOCGPRLST_IN6_64: /* struct in6_prlist_64 */ + case SIOCGNBRINFO_IN6_32: /* struct in6_nbrinfo_32 */ + case SIOCGNBRINFO_IN6_64: /* struct in6_nbrinfo_64 */ + case SIOCGDEFIFACE_IN6_32: /* struct in6_ndifreq_32 */ + case SIOCGDEFIFACE_IN6_64: /* struct in6_ndifreq_64 */ return (nd6_ioctl(cmd, data, ifp)); - } + /* NOTREACHED */ - switch (cmd) { - case SIOCSIFPREFIX_IN6: - case SIOCDIFPREFIX_IN6: - case SIOCAIFPREFIX_IN6: - case SIOCCIFPREFIX_IN6: - case SIOCSGIFPREFIX_IN6: - case SIOCGIFPREFIX_IN6: + case SIOCSIFPREFIX_IN6: /* struct in6_prefixreq */ + case SIOCDIFPREFIX_IN6: /* struct in6_prefixreq */ + case SIOCAIFPREFIX_IN6: /* struct in6_rrenumreq */ + case SIOCCIFPREFIX_IN6: /* struct in6_rrenumreq */ + case SIOCSGIFPREFIX_IN6: /* struct in6_rrenumreq */ + case SIOCGIFPREFIX_IN6: /* struct in6_prefixreq */ log(LOG_NOTICE, "prefix ioctls are now invalidated. 
" "please use ifconfig.\n"); return (EOPNOTSUPP); - } + /* NOTREACHED */ + + case SIOCSSCOPE6: { /* struct in6_ifreq */ + struct in6_ifreq *ifr = (struct in6_ifreq *)(void *)data; - switch (cmd) { - case SIOCSSCOPE6: if (!privileged) return (EPERM); - return (scope6_set(ifp, ifr->ifr_ifru.ifru_scope_id)); + + bcopy(ifr->ifr_ifru.ifru_scope_id, ifru_scope_id, + sizeof (ifru_scope_id)); + + return (scope6_set(ifp, ifru_scope_id)); /* NOTREACHED */ + } + + case SIOCGSCOPE6: { /* struct in6_ifreq */ + struct in6_ifreq *ifr = (struct in6_ifreq *)(void *)data; - case SIOCGSCOPE6: - return (scope6_get(ifp, ifr->ifr_ifru.ifru_scope_id)); + bcopy(ifr->ifr_ifru.ifru_scope_id, ifru_scope_id, + sizeof (ifru_scope_id)); + + return (scope6_get(ifp, ifru_scope_id)); /* NOTREACHED */ + } - case SIOCGSCOPE6DEF: - return (scope6_get_default(ifr->ifr_ifru.ifru_scope_id)); + case SIOCGSCOPE6DEF: { /* struct in6_ifreq */ + struct in6_ifreq *ifr = (struct in6_ifreq *)(void *)data; + + bcopy(ifr->ifr_ifru.ifru_scope_id, ifru_scope_id, + sizeof (ifru_scope_id)); + + return (scope6_get_default(ifru_scope_id)); + /* NOTREACHED */ } - switch (cmd) { - case SIOCALIFADDR: - case SIOCDLIFADDR: + case SIOCALIFADDR: /* struct if_laddrreq */ + case SIOCDLIFADDR: /* struct if_laddrreq */ if (!privileged) return(EPERM); - /* fall through */ - case SIOCGLIFADDR: - return (in6_lifaddr_ioctl(so, cmd, data, ifp, p)); + /* FALLTHRU */ + case SIOCGLIFADDR: { /* struct if_laddrreq */ + struct if_laddrreq iflr; + + bcopy(data, &iflr, sizeof (iflr)); + error = in6_lifaddr_ioctl(so, cmd, &iflr, ifp, p); + bcopy(&iflr, data, sizeof (iflr)); + return (error); + /* NOTREACHED */ + } } - /* - * Find address for this interface, if it exists. - * - * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation - * only, and used the first interface address as the target of other - * operations (without checking ifra_addr). This was because netinet - * code/API assumed at most 1 interface address per interface. - * Since IPv6 allows a node to assign multiple addresses - * on a single interface, we almost always look and check the - * presence of ifra_addr, and reject invalid ones here. - * It also decreases duplicated code among SIOC*_IN6 operations. - */ switch (cmd) { - case SIOCLL_START_32: - case SIOCAIFADDR_IN6_32: + case SIOCLL_START_32: /* struct in6_aliasreq_32 */ + case SIOCAIFADDR_IN6_32: { /* struct in6_aliasreq_32 */ /* * Convert user ifra to the kernel form, when appropriate. * This allows the conversion between different data models @@ -640,88 +654,65 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * routines that are expecting the kernel form. */ ifra = in6_aliasreq_to_native(data, 0, &sifra); - sa6 = (struct sockaddr_in6 *)&ifra->ifra_addr; + bcopy(&ifra->ifra_addr, &sin6, sizeof (sin6)); + sa6 = &sin6; break; + } - case SIOCLL_START_64: - case SIOCAIFADDR_IN6_64: + case SIOCLL_START_64: /* struct in6_aliasreq_64 */ + case SIOCAIFADDR_IN6_64: { /* struct in6_aliasreq_64 */ + /* + * Convert user ifra to the kernel form, when appropriate. + * This allows the conversion between different data models + * to be centralized, so that it can be passed around to other + * routines that are expecting the kernel form. 
+ */ ifra = in6_aliasreq_to_native(data, 1, &sifra); - sa6 = (struct sockaddr_in6 *)&ifra->ifra_addr; + bcopy(&ifra->ifra_addr, &sin6, sizeof (sin6)); + sa6 = &sin6; break; + } - case SIOCSIFADDR_IN6: /* deprecated */ - case SIOCGIFADDR_IN6: - case SIOCSIFDSTADDR_IN6: /* deprecated */ - case SIOCSIFNETMASK_IN6: /* deprecated */ - case SIOCGIFDSTADDR_IN6: - case SIOCGIFNETMASK_IN6: - case SIOCDIFADDR_IN6: - case SIOCGIFPSRCADDR_IN6: - case SIOCGIFPDSTADDR_IN6: - case SIOCGIFAFLAG_IN6: - case SIOCSNDFLUSH_IN6: - case SIOCSPFXFLUSH_IN6: - case SIOCSRTRFLUSH_IN6: - case SIOCGIFALIFETIME_IN6: - case SIOCSIFALIFETIME_IN6: - case SIOCGIFSTAT_IN6: - case SIOCGIFSTAT_ICMP6: - sa6 = &ifr->ifr_addr; + case SIOCSIFADDR_IN6: /* struct in6_ifreq (deprecated) */ + case SIOCGIFADDR_IN6: /* struct in6_ifreq */ + case SIOCSIFDSTADDR_IN6: /* struct in6_ifreq (deprecated) */ + case SIOCSIFNETMASK_IN6: /* struct in6_ifreq (deprecated) */ + case SIOCGIFDSTADDR_IN6: /* struct in6_ifreq */ + case SIOCGIFNETMASK_IN6: /* struct in6_ifreq */ + case SIOCDIFADDR_IN6: /* struct in6_ifreq */ + case SIOCGIFPSRCADDR_IN6: /* struct in6_ifreq */ + case SIOCGIFPDSTADDR_IN6: /* struct in6_ifreq */ + case SIOCGIFAFLAG_IN6: /* struct in6_ifreq */ + case SIOCSNDFLUSH_IN6: /* struct in6_ifreq */ + case SIOCSPFXFLUSH_IN6: /* struct in6_ifreq */ + case SIOCSRTRFLUSH_IN6: /* struct in6_ifreq */ + case SIOCGIFALIFETIME_IN6: /* struct in6_ifreq */ + case SIOCSIFALIFETIME_IN6: /* struct in6_ifreq */ + case SIOCGIFSTAT_IN6: /* struct in6_ifreq */ + case SIOCGIFSTAT_ICMP6: { /* struct in6_ifreq */ + struct in6_ifreq *ifr = (struct in6_ifreq *)(void *)data; + + bcopy(&ifr->ifr_addr, &sin6, sizeof (sin6)); + sa6 = &sin6; break; + } default: - sa6 = NULL; break; } switch (cmd) { - case SIOCAUTOCONF_START: - ifnet_lock_exclusive(ifp); - ifp->if_eflags |= IFEF_ACCEPT_RTADVD; - ifnet_lock_done(ifp); - return (0); + return (in6_autoconf(ifp, TRUE)); /* NOTREACHED */ - case SIOCAUTOCONF_STOP: { - ifnet_lock_exclusive(ifp); - ifp->if_eflags &= ~IFEF_ACCEPT_RTADVD; - ifnet_lock_done(ifp); - - /* Remove autoconfigured address from interface */ - lck_rw_lock_exclusive(&in6_ifaddr_rwlock); - ia = in6_ifaddrs; - while (ia != NULL) { - if (ia->ia_ifa.ifa_ifp != ifp) { - ia = ia->ia_next; - continue; - } - IFA_LOCK(&ia->ia_ifa); - if (ia->ia6_flags & IN6_IFF_AUTOCONF) { - IFA_ADDREF_LOCKED(&ia->ia_ifa); /* for us */ - IFA_UNLOCK(&ia->ia_ifa); - lck_rw_done(&in6_ifaddr_rwlock); - in6_purgeaddr(&ia->ia_ifa); - IFA_REMREF(&ia->ia_ifa); /* for us */ - lck_rw_lock_exclusive(&in6_ifaddr_rwlock); - /* - * Purging the address caused in6_ifaddr_rwlock - * to be dropped and reacquired; - * therefore search again from the beginning - * of in6_ifaddrs list. - */ - ia = in6_ifaddrs; - continue; - } - IFA_UNLOCK(&ia->ia_ifa); - ia = ia->ia_next; - } - lck_rw_done(&in6_ifaddr_rwlock); - return (0); - } + case SIOCAUTOCONF_STOP: + return (in6_autoconf(ifp, FALSE)); + /* NOTREACHED */ case SIOCLL_START_32: case SIOCLL_START_64: + VERIFY(ifra != NULL); /* * NOTE: All the interface specific DLIL attachements should * be done here. 
They are currently done in in6_ifattach() @@ -733,12 +724,12 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, /* some interfaces may provide LinkLocal addresses */ error = in6_if_up(ifp, ifra); } else { - error = in6_if_up(ifp, 0); + error = in6_if_up(ifp, NULL); } return (error); /* NOTREACHED */ - case SIOCLL_STOP: { + case SIOCLL_STOP: /* Remove link local addresses from interface */ lck_rw_lock_exclusive(&in6_ifaddr_rwlock); ia = in6_ifaddrs; @@ -769,14 +760,25 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } lck_rw_done(&in6_ifaddr_rwlock); return (0); + /* NOTREACHED */ + + case SIOCSETROUTERMODE_IN6: { /* struct in6_ifreq */ + int intval; + + VERIFY(ifp != NULL); + bcopy(&((struct in6_ifreq *)(void *)data)->ifr_intval, + &intval, sizeof (intval)); + + return (in6_setrouter(ifp, intval)); + /* NOTREACHED */ } - case SIOCPROTOATTACH_IN6_32: - case SIOCPROTOATTACH_IN6_64: + case SIOCPROTOATTACH_IN6_32: /* struct in6_aliasreq_32 */ + case SIOCPROTOATTACH_IN6_64: /* struct in6_aliasreq_64 */ return (in6_domifattach(ifp)); /* NOTREACHED */ - case SIOCPROTODETACH_IN6: + case SIOCPROTODETACH_IN6: /* struct in6_ifreq */ /* Cleanup interface routes and addresses */ in6_purgeif(ifp); @@ -784,11 +786,20 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, printf("SIOCPROTODETACH_IN6: %s error=%d\n", if_name(ifp), error); return (error); + /* NOTREACHED */ } /* - * Find address for this interface, if it exists; depending - * on the ioctl command, sa6 points to the address in ifra/ifr. + * Find address for this interface, if it exists. + * + * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation + * only, and used the first interface address as the target of other + * operations (without checking ifra_addr). This was because netinet + * code/API assumed at most 1 interface address per interface. + * Since IPv6 allows a node to assign multiple addresses + * on a single interface, we almost always look and check the + * presence of ifra_addr, and reject invalid ones here. + * It also decreases duplicated code among SIOC*_IN6 operations. */ if (sa6 != NULL && sa6->sin6_family == AF_INET6) { if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) { @@ -813,9 +824,9 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } switch (cmd) { - case SIOCSIFADDR_IN6: - case SIOCSIFDSTADDR_IN6: - case SIOCSIFNETMASK_IN6: + case SIOCSIFADDR_IN6: /* struct in6_ifreq */ + case SIOCSIFDSTADDR_IN6: /* struct in6_ifreq */ + case SIOCSIFNETMASK_IN6: /* struct in6_ifreq */ /* * Since IPv6 allows a node to assign multiple addresses * on a single interface, SIOCSIFxxx ioctls are deprecated. @@ -824,7 +835,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, error = EINVAL; goto ioctl_cleanup; - case SIOCDIFADDR_IN6: + case SIOCDIFADDR_IN6: /* struct in6_ifreq */ /* * for IPv4, we look for existing in_ifaddr here to allow * "ifconfig if0 delete" to remove the first IPv4 address on @@ -837,8 +848,9 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, goto ioctl_cleanup; } /* FALLTHROUGH */ - case SIOCAIFADDR_IN6_32: - case SIOCAIFADDR_IN6_64: + case SIOCAIFADDR_IN6_32: /* struct in6_aliasreq_32 */ + case SIOCAIFADDR_IN6_64: /* struct in6_aliasreq_64 */ + VERIFY(sa6 != NULL); /* * We always require users to specify a valid IPv6 address for * the corresponding operation. 
Use "sa6" instead of "ifra" @@ -853,16 +865,15 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, error = EPERM; goto ioctl_cleanup; } - break; - case SIOCGIFADDR_IN6: + case SIOCGIFADDR_IN6: /* struct in6_ifreq */ /* This interface is basically deprecated. use SIOCGIFCONF. */ - /* fall through */ - case SIOCGIFAFLAG_IN6: - case SIOCGIFNETMASK_IN6: - case SIOCGIFDSTADDR_IN6: - case SIOCGIFALIFETIME_IN6: + /* FALLTHRU */ + case SIOCGIFAFLAG_IN6: /* struct in6_ifreq */ + case SIOCGIFNETMASK_IN6: /* struct in6_ifreq */ + case SIOCGIFDSTADDR_IN6: /* struct in6_ifreq */ + case SIOCGIFALIFETIME_IN6: /* struct in6_ifreq */ /* must think again about its semantics */ if (ia == NULL) { error = EADDRNOTAVAIL; @@ -870,7 +881,9 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } break; - case SIOCSIFALIFETIME_IN6: + case SIOCSIFALIFETIME_IN6: { /* struct in6_ifreq */ + struct in6_ifreq *ifr = (struct in6_ifreq *)(void *)data; + if (!privileged) { error = EPERM; goto ioctl_cleanup; @@ -881,39 +894,37 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } /* sanity for overflow - beware unsigned */ if (p64) { - struct in6_addrlifetime_64 *lt; + struct in6_addrlifetime_64 lt; - lt = (struct in6_addrlifetime_64 *) - &ifr->ifr_ifru.ifru_lifetime; + bcopy(&ifr->ifr_ifru.ifru_lifetime, <, sizeof (lt)); if (((ia->ia6_flags & IN6_IFF_TEMPORARY) != 0 - || lt->ia6t_vltime != ND6_INFINITE_LIFETIME) - && lt->ia6t_vltime + timenow.tv_sec < + || lt.ia6t_vltime != ND6_INFINITE_LIFETIME) + && lt.ia6t_vltime + timenow.tv_sec < timenow.tv_sec) { error = EINVAL; goto ioctl_cleanup; } if (((ia->ia6_flags & IN6_IFF_TEMPORARY) != 0 - || lt->ia6t_pltime != ND6_INFINITE_LIFETIME) - && lt->ia6t_pltime + timenow.tv_sec < + || lt.ia6t_pltime != ND6_INFINITE_LIFETIME) + && lt.ia6t_pltime + timenow.tv_sec < timenow.tv_sec) { error = EINVAL; goto ioctl_cleanup; } } else { - struct in6_addrlifetime_32 *lt; + struct in6_addrlifetime_32 lt; - lt = (struct in6_addrlifetime_32 *) - &ifr->ifr_ifru.ifru_lifetime; + bcopy(&ifr->ifr_ifru.ifru_lifetime, <, sizeof (lt)); if (((ia->ia6_flags & IN6_IFF_TEMPORARY) != 0 - || lt->ia6t_vltime != ND6_INFINITE_LIFETIME) - && lt->ia6t_vltime + timenow.tv_sec < + || lt.ia6t_vltime != ND6_INFINITE_LIFETIME) + && lt.ia6t_vltime + timenow.tv_sec < timenow.tv_sec) { error = EINVAL; goto ioctl_cleanup; } if (((ia->ia6_flags & IN6_IFF_TEMPORARY) != 0 - || lt->ia6t_pltime != ND6_INFINITE_LIFETIME) - && lt->ia6t_pltime + timenow.tv_sec < + || lt.ia6t_pltime != ND6_INFINITE_LIFETIME) + && lt.ia6t_pltime + timenow.tv_sec < timenow.tv_sec) { error = EINVAL; goto ioctl_cleanup; @@ -921,20 +932,28 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } break; } + } switch (cmd) { + case SIOCGIFADDR_IN6: { /* struct in6_ifreq */ + struct in6_ifreq *ifr = (struct in6_ifreq *)(void *)data; + struct sockaddr_in6 addr; - case SIOCGIFADDR_IN6: IFA_LOCK(&ia->ia_ifa); - ifr->ifr_addr = ia->ia_addr; + bcopy(&ia->ia_addr, &addr, sizeof (addr)); IFA_UNLOCK(&ia->ia_ifa); - if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0) { + if ((error = sa6_recoverscope(&addr, TRUE)) != 0) { IFA_REMREF(&ia->ia_ifa); return (error); } + bcopy(&addr, &ifr->ifr_addr, sizeof (addr)); break; + } + + case SIOCGIFDSTADDR_IN6: { /* struct in6_ifreq */ + struct in6_ifreq *ifr = (struct in6_ifreq *)(void *)data; + struct sockaddr_in6 dstaddr; - case SIOCGIFDSTADDR_IN6: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; goto 
ioctl_cleanup; @@ -944,45 +963,61 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * an error? */ IFA_LOCK(&ia->ia_ifa); - ifr->ifr_dstaddr = ia->ia_dstaddr; + bcopy(&ia->ia_dstaddr, &dstaddr, sizeof (dstaddr)); IFA_UNLOCK(&ia->ia_ifa); - if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0) { + if ((error = sa6_recoverscope(&dstaddr, TRUE)) != 0) { IFA_REMREF(&ia->ia_ifa); return (error); } + bcopy(&dstaddr, &ifr->ifr_dstaddr, sizeof (dstaddr)); break; + } + + case SIOCGIFNETMASK_IN6: { /* struct in6_ifreq */ + struct in6_ifreq *ifr = (struct in6_ifreq *)(void *)data; - case SIOCGIFNETMASK_IN6: IFA_LOCK(&ia->ia_ifa); - ifr->ifr_addr = ia->ia_prefixmask; + bcopy(&ia->ia_prefixmask, &ifr->ifr_addr, + sizeof (struct sockaddr_in6)); IFA_UNLOCK(&ia->ia_ifa); break; + } + + case SIOCGIFAFLAG_IN6: { /* struct in6_ifreq */ + struct in6_ifreq *ifr = (struct in6_ifreq *)(void *)data; - case SIOCGIFAFLAG_IN6: IFA_LOCK(&ia->ia_ifa); - ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags; + bcopy(&ia->ia6_flags, &ifr->ifr_ifru.ifru_flags6, + sizeof (ifr->ifr_ifru.ifru_flags6)); IFA_UNLOCK(&ia->ia_ifa); break; + } + + case SIOCGIFSTAT_IN6: { /* struct in6_ifreq */ + struct in6_ifreq *ifr = (struct in6_ifreq *)(void *)data; - case SIOCGIFSTAT_IN6: if (ifp == NULL) { error = EINVAL; goto ioctl_cleanup; } index = ifp->if_index; lck_rw_lock_shared(&in6_ifs_rwlock); - if (in6_ifstat == NULL || index >= in6_ifstatmax - || in6_ifstat[index] == NULL) { + if (in6_ifstat == NULL || index >= in6_ifstatmax || + in6_ifstat[index] == NULL) { /* return EAFNOSUPPORT? */ bzero(&ifr->ifr_ifru.ifru_stat, sizeof (ifr->ifr_ifru.ifru_stat)); } else { - ifr->ifr_ifru.ifru_stat = *in6_ifstat[index]; + bcopy(in6_ifstat[index], &ifr->ifr_ifru.ifru_stat, + sizeof (ifr->ifr_ifru.ifru_stat)); } lck_rw_done(&in6_ifs_rwlock); break; + } + + case SIOCGIFSTAT_ICMP6: { /* struct in6_ifreq */ + struct in6_ifreq *ifr = (struct in6_ifreq *)(void *)data; - case SIOCGIFSTAT_ICMP6: if (ifp == NULL) { error = EINVAL; goto ioctl_cleanup; @@ -995,61 +1030,64 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, bzero(&ifr->ifr_ifru.ifru_stat, sizeof (ifr->ifr_ifru.ifru_icmp6stat)); } else { - ifr->ifr_ifru.ifru_icmp6stat = *icmp6_ifstat[index]; + bcopy(icmp6_ifstat[index], + &ifr->ifr_ifru.ifru_icmp6stat, + sizeof (ifr->ifr_ifru.ifru_icmp6stat)); } lck_rw_done(&icmp6_ifs_rwlock); break; + } + + case SIOCGIFALIFETIME_IN6: { /* struct in6_ifreq */ + struct in6_ifreq *ifr = (struct in6_ifreq *)(void *)data; - case SIOCGIFALIFETIME_IN6: IFA_LOCK(&ia->ia_ifa); if (p64) { - struct in6_addrlifetime_64 *lt; - - lt = (struct in6_addrlifetime_64 *) - &ifr->ifr_ifru.ifru_lifetime; - lt->ia6t_expire = ia->ia6_lifetime.ia6t_expire; - lt->ia6t_preferred = ia->ia6_lifetime.ia6t_preferred; - lt->ia6t_vltime = ia->ia6_lifetime.ia6t_vltime; - lt->ia6t_pltime = ia->ia6_lifetime.ia6t_pltime; + struct in6_addrlifetime_64 lt; + + bzero(<, sizeof (lt)); + lt.ia6t_expire = ia->ia6_lifetime.ia6t_expire; + lt.ia6t_preferred = ia->ia6_lifetime.ia6t_preferred; + lt.ia6t_vltime = ia->ia6_lifetime.ia6t_vltime; + lt.ia6t_pltime = ia->ia6_lifetime.ia6t_pltime; + bcopy(<, &ifr->ifr_ifru.ifru_lifetime, sizeof (lt)); } else { - struct in6_addrlifetime_32 *lt; + struct in6_addrlifetime_32 lt; - lt = (struct in6_addrlifetime_32 *) - &ifr->ifr_ifru.ifru_lifetime; - lt->ia6t_expire = - (uint32_t)ia->ia6_lifetime.ia6t_expire; - lt->ia6t_preferred = + bzero(<, sizeof (lt)); + lt.ia6t_expire = (uint32_t)ia->ia6_lifetime.ia6t_expire; + 
lt.ia6t_preferred = (uint32_t)ia->ia6_lifetime.ia6t_preferred; - lt->ia6t_vltime = - (uint32_t)ia->ia6_lifetime.ia6t_vltime; - lt->ia6t_pltime = - (uint32_t)ia->ia6_lifetime.ia6t_pltime; + lt.ia6t_vltime = (uint32_t)ia->ia6_lifetime.ia6t_vltime; + lt.ia6t_pltime = (uint32_t)ia->ia6_lifetime.ia6t_pltime; + bcopy(<, &ifr->ifr_ifru.ifru_lifetime, sizeof (lt)); } IFA_UNLOCK(&ia->ia_ifa); break; + } + + case SIOCSIFALIFETIME_IN6: { /* struct in6_ifreq */ + struct in6_ifreq *ifr = (struct in6_ifreq *)(void *)data; - case SIOCSIFALIFETIME_IN6: IFA_LOCK(&ia->ia_ifa); if (p64) { - struct in6_addrlifetime_64 *lt; - - lt = (struct in6_addrlifetime_64 *) - &ifr->ifr_ifru.ifru_lifetime; - ia->ia6_lifetime.ia6t_expire = lt->ia6t_expire; - ia->ia6_lifetime.ia6t_preferred = lt->ia6t_preferred; - ia->ia6_lifetime.ia6t_vltime = lt->ia6t_vltime; - ia->ia6_lifetime.ia6t_pltime = lt->ia6t_pltime; + struct in6_addrlifetime_64 lt; + + bcopy(&ifr->ifr_ifru.ifru_lifetime, <, sizeof (lt)); + ia->ia6_lifetime.ia6t_expire = lt.ia6t_expire; + ia->ia6_lifetime.ia6t_preferred = lt.ia6t_preferred; + ia->ia6_lifetime.ia6t_vltime = lt.ia6t_vltime; + ia->ia6_lifetime.ia6t_pltime = lt.ia6t_pltime; } else { - struct in6_addrlifetime_32 *lt; + struct in6_addrlifetime_32 lt; - lt = (struct in6_addrlifetime_32 *) - &ifr->ifr_ifru.ifru_lifetime; + bcopy(&ifr->ifr_ifru.ifru_lifetime, <, sizeof (lt)); ia->ia6_lifetime.ia6t_expire = - (uint32_t)lt->ia6t_expire; + (uint32_t)lt.ia6t_expire; ia->ia6_lifetime.ia6t_preferred = - (uint32_t)lt->ia6t_preferred; - ia->ia6_lifetime.ia6t_vltime = lt->ia6t_vltime; - ia->ia6_lifetime.ia6t_pltime = lt->ia6t_pltime; + (uint32_t)lt.ia6t_preferred; + ia->ia6_lifetime.ia6t_vltime = lt.ia6t_vltime; + ia->ia6_lifetime.ia6t_pltime = lt.ia6t_pltime; } /* for sanity */ if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME || @@ -1066,12 +1104,15 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, ia->ia6_lifetime.ia6t_preferred = 0; IFA_UNLOCK(&ia->ia_ifa); break; + } - case SIOCAIFADDR_IN6_32: - case SIOCAIFADDR_IN6_64: { + case SIOCAIFADDR_IN6_32: /* struct in6_aliasreq_32 */ + case SIOCAIFADDR_IN6_64: { /* struct in6_aliasreq_64 */ int i; struct nd_prefix pr0, *pr; + VERIFY(ifra != NULL); + /* Attempt to attach the protocol, in case it isn't attached */ error = in6_domifattach(ifp); if (error) { @@ -1104,6 +1145,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * been validated in in6_update_ifa(). */ bzero(&pr0, sizeof(pr0)); + lck_mtx_init(&pr0.ndpr_lock, ifa_mtx_grp, ifa_mtx_attr); pr0.ndpr_ifp = ifp; pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, NULL); @@ -1136,7 +1178,8 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * nd6_prelist_add will install the corresponding * interface route. */ - if ((error = nd6_prelist_add(&pr0, NULL, &pr, FALSE)) != 0) + if ((error = nd6_prelist_add(&pr0, NULL, &pr, + FALSE)) != 0) goto ioctl_cleanup; if (pr == NULL) { log(LOG_ERR, "nd6_prelist_add succedded but " @@ -1147,8 +1190,8 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } if (ia != NULL) IFA_REMREF(&ia->ia_ifa); - if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) - == NULL) { + if ((ia = in6ifa_ifpwithaddr(ifp, + &ifra->ifra_addr.sin6_addr)) == NULL) { /* XXX: this should not happen! 
*/ log(LOG_ERR, "in6_control: addition succeeded, but" " no ifaddr\n"); @@ -1207,7 +1250,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, break; } - case SIOCDIFADDR_IN6: { + case SIOCDIFADDR_IN6: { /* struct in6_ifreq */ int i = 0; struct nd_prefix pr0, *pr; @@ -1252,7 +1295,8 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, pr->ndpr_addrcnt == 1) || ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0 && pr->ndpr_addrcnt == 0)) { - pr->ndpr_expire = 1; /* XXX: just for expiration */ + /* XXX: just for expiration */ + pr->ndpr_expire = 1; } NDPR_UNLOCK(pr); IFA_UNLOCK(&ia->ia_ifa); @@ -1273,12 +1317,126 @@ purgeaddr: error = ifnet_ioctl(ifp, PF_INET6, cmd, data); goto ioctl_cleanup; } + ioctl_cleanup: if (ia != NULL) IFA_REMREF(&ia->ia_ifa); return (error); } +static int +in6_autoconf(struct ifnet *ifp, int enable) +{ + int error = 0; + + if (ifp->if_flags & IFF_LOOPBACK) + return (EINVAL); + + if (enable) { + /* + * An interface in IPv6 router mode implies that it + * is either configured with a static IP address or + * autoconfigured via a locally-generated RA. Prevent + * SIOCAUTOCONF_START from being set in that mode. + */ + ifnet_lock_exclusive(ifp); + if (ifp->if_eflags & IFEF_IPV6_ROUTER) { + ifp->if_eflags &= ~IFEF_ACCEPT_RTADV; + error = EBUSY; + } else { + ifp->if_eflags |= IFEF_ACCEPT_RTADV; + } + ifnet_lock_done(ifp); + } else { + struct in6_ifaddr *ia = NULL; + + ifnet_lock_exclusive(ifp); + ifp->if_eflags &= ~IFEF_ACCEPT_RTADV; + ifnet_lock_done(ifp); + + /* Remove autoconfigured address from interface */ + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + ia = in6_ifaddrs; + while (ia != NULL) { + if (ia->ia_ifa.ifa_ifp != ifp) { + ia = ia->ia_next; + continue; + } + IFA_LOCK(&ia->ia_ifa); + if (ia->ia6_flags & IN6_IFF_AUTOCONF) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); /* for us */ + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); + in6_purgeaddr(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); /* for us */ + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + /* + * Purging the address caused in6_ifaddr_rwlock + * to be dropped and reacquired; + * therefore search again from the beginning + * of in6_ifaddrs list. + */ + ia = in6_ifaddrs; + continue; + } + IFA_UNLOCK(&ia->ia_ifa); + ia = ia->ia_next; + } + lck_rw_done(&in6_ifaddr_rwlock); + } + return (error); +} + +/* + * Handle SIOCSETROUTERMODE_IN6 to set or clear the IPv6 router mode flag on + * the interface. Entering or exiting this mode will result in the removal of + * autoconfigured IPv6 addresses on the interface. 
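
Taken together, in6_autoconf() above and in6_setrouter() below keep IFEF_ACCEPT_RTADV and IFEF_IPV6_ROUTER mutually exclusive: an advertising router must not autoconfigure itself from received RAs, so SIOCAUTOCONF_START fails with EBUSY in router mode, and entering router mode finishes by calling in6_autoconf(ifp, FALSE) to scrub any autoconfigured addresses. The flag logic in isolation (flag values are illustrative):

#include <errno.h>

#define IFEF_ACCEPT_RTADV	0x0001	/* illustrative values */
#define IFEF_IPV6_ROUTER	0x0002

static int
set_autoconf(unsigned *eflags, int enable)
{
	if (enable) {
		if (*eflags & IFEF_IPV6_ROUTER) {
			*eflags &= ~IFEF_ACCEPT_RTADV;
			return EBUSY;	/* routers do not take RAs */
		}
		*eflags |= IFEF_ACCEPT_RTADV;
	} else {
		*eflags &= ~IFEF_ACCEPT_RTADV;
		/* the kernel code also purges IN6_IFF_AUTOCONF addresses here */
	}
	return 0;
}
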
+ */ +static int +in6_setrouter(struct ifnet *ifp, int enable) +{ + if (ifp->if_flags & IFF_LOOPBACK) + return (ENODEV); + + if (enable) { + struct nd_ifinfo *ndi; + + lck_rw_lock_shared(nd_if_rwlock); + ndi = ND_IFINFO(ifp); + if (ndi != NULL && ndi->initialized) { + lck_mtx_lock(&ndi->lock); + if (ndi->flags & ND6_IFF_PROXY_PREFIXES) { + /* No proxy if we are an advertising router */ + ndi->flags &= ~ND6_IFF_PROXY_PREFIXES; + lck_mtx_unlock(&ndi->lock); + lck_rw_done(nd_if_rwlock); + (void) nd6_if_prproxy(ifp, FALSE); + } else { + lck_mtx_unlock(&ndi->lock); + lck_rw_done(nd_if_rwlock); + } + } else { + lck_rw_done(nd_if_rwlock); + } + } + + ifnet_lock_exclusive(ifp); + if (enable) { + ifp->if_eflags |= IFEF_IPV6_ROUTER; + } else { + ifp->if_eflags &= ~IFEF_IPV6_ROUTER; + } + ifnet_lock_done(ifp); + + lck_mtx_lock(nd6_mutex); + defrouter_select(ifp); + lck_mtx_unlock(nd6_mutex); + + if_allmulti(ifp, enable); + + return (in6_autoconf(ifp, FALSE)); +} + /* * Update parameters of an IPv6 interface address. * If necessary, a new entry is created and linked into address chains. @@ -1570,13 +1728,20 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, ia->ia6_lifetime.ia6t_preferred = timenow.tv_sec; } /* - * Make the address tentative before joining multicast addresses, + * Mark the address as tentative before joining multicast addresses, * so that corresponding MLD responses would not have a tentative * source address. */ ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */ if (hostIsNew && in6if_do_dad(ifp)) - ia->ia6_flags |= IN6_IFF_TENTATIVE; + in6_ifaddr_set_dadprogress(ia); + + /* + * Do not delay sending neighbor solicitations when using optimistic + * duplicate address detection, c.f. RFC 4429. + */ + if (ia->ia6_flags & IN6_IFF_OPTIMISTIC) + flags &= ~IN6_IFAUPDATE_DADDELAY; /* * We are done if we have simply modified an existing address. 
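
Swapping the unconditional IN6_IFF_TENTATIVE for in6_ifaddr_set_dadprogress() (its body closes out the in6.c hunks below) is what introduces optimistic DAD per RFC 4429: an optimistic address may already be used while DAD runs, which is also why the hunk above strips IN6_IFAUPDATE_DADDELAY, since there is no reason to randomize the first neighbor solicitation. The decision, compressed (flag values illustrative):

#include <stdint.h>

#define IN6_IFF_TENTATIVE	0x0002	/* illustrative values */
#define IN6_IFF_OPTIMISTIC	0x0040

/* Pick the DAD-in-progress flag and whether to randomize the first NS. */
static uint32_t
dad_start_flags(int optimistic_allowed, int *want_delay)
{
	if (optimistic_allowed) {
		*want_delay = 0;	/* RFC 4429: usable now, probe immediately */
		return IN6_IFF_OPTIMISTIC;
	}
	*want_delay = 1;		/* classic DAD keeps the random delay */
	return IN6_IFF_TENTATIVE;
}

Whether optimistic DAD applies is gated by the nd6_optimistic_dad policy bits and by the address type (link-local, autoconf, dynamic, temporary), as in6_ifaddr_set_dadprogress() spells out.
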
@@ -1669,9 +1834,8 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, rt = rtalloc1_scoped((struct sockaddr *)&mltaddr, 0, 0UL, ia->ia_ifp->if_index); if (rt) { - if (memcmp(&mltaddr.sin6_addr, - &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr, - MLTMASK_LEN)) { + if (memcmp(&mltaddr.sin6_addr, &((struct sockaddr_in6 *) + (void *)rt_key(rt))->sin6_addr, MLTMASK_LEN)) { rtfree(rt); rt = NULL; } @@ -1746,9 +1910,8 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, rt = rtalloc1_scoped((struct sockaddr *)&mltaddr, 0, 0UL, ia->ia_ifp->if_index); if (rt) { - if (memcmp(&mltaddr.sin6_addr, - &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr, - MLTMASK_LEN)) { + if (memcmp(&mltaddr.sin6_addr, &((struct sockaddr_in6 *) + (void *)rt_key(rt))->sin6_addr, MLTMASK_LEN)) { rtfree(rt); rt = NULL; } @@ -1795,12 +1958,13 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, */ if (hostIsNew && in6if_do_dad(ifp) && ((ifra->ifra_flags & IN6_IFF_NODAD) == 0) && - (ia->ia6_flags & IN6_IFF_TENTATIVE)) + (ia->ia6_flags & IN6_IFF_DADPROGRESS)) { int mindelay, maxdelay; + int *delayptr, delayval; IFA_UNLOCK(ifa); - delay = 0; + delayptr = NULL; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need to impose a delay before sending an NS @@ -1819,14 +1983,15 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, } maxdelay = MAX_RTR_SOLICITATION_DELAY * hz; if (maxdelay - mindelay == 0) - delay = 0; + delayval = 0; else { - delay = + delayval = (random() % (maxdelay - mindelay)) + mindelay; } + delayptr = &delayval; } - nd6_dad_start((struct ifaddr *)ia, &delay); + nd6_dad_start((struct ifaddr *)ia, delayptr); } else { IFA_UNLOCK(ifa); } @@ -2013,8 +2178,8 @@ in6_purgeif(struct ifnet *ifp) IFA_ADDREF(&ia->ia_ifa); /* for us */ lck_rw_done(&in6_ifaddr_rwlock); in6_purgeaddr(&ia->ia_ifa); - lck_rw_lock_exclusive(&in6_ifaddr_rwlock); IFA_REMREF(&ia->ia_ifa); /* for us */ + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); /* * Purging the address would have caused * in6_ifaddr_rwlock to be dropped and reacquired; @@ -2052,19 +2217,14 @@ in6_purgeif(struct ifnet *ifp) * address encoding scheme. (see figure on page 8) */ static int -in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, +in6_lifaddr_ioctl(struct socket *so, u_long cmd, struct if_laddrreq *iflr, struct ifnet *ifp, struct proc *p) { - struct if_laddrreq *iflr = (struct if_laddrreq *)data; struct ifaddr *ifa = NULL; struct sockaddr *sa; int p64 = proc_is64bit(p); - /* sanity checks */ - if (!data || !ifp) { - panic("invalid argument to in6_lifaddr_ioctl"); - /*NOTREACHED*/ - } + VERIFY(ifp != NULL); switch (cmd) { case SIOCGLIFADDR: @@ -2414,10 +2574,15 @@ in6_ifinit(ifp, ia, sin6, newhost) ia->ia_addr = *sin6; IFA_UNLOCK(ifa); - if (ifacount <= 1 && + /* + * NOTE: SIOCSIFADDR is defined with struct ifreq as parameter, + * but here we are sending it down to the interface with a pointer + * to struct ifaddr, for legacy reasons. 
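
When a first-probe delay is imposed, the hunk above draws it uniformly from [mindelay, maxdelay) in scheduler ticks, where maxdelay is MAX_RTR_SOLICITATION_DELAY seconds, and passes it through delayptr only when one was actually computed. The arithmetic alone, runnable in user space (the hz value is an assumption):

#include <stdio.h>
#include <stdlib.h>

#define MAX_RTR_SOLICITATION_DELAY	1	/* seconds, RFC 4861 */

int
main(void)
{
	int hz = 100;		/* assumed tick rate */
	int mindelay = 0;	/* the kernel may derive this from MLD timers */
	int maxdelay = MAX_RTR_SOLICITATION_DELAY * hz;
	int delay;

	if (maxdelay - mindelay == 0)
		delay = 0;
	else
		delay = (random() % (maxdelay - mindelay)) + mindelay;

	printf("DAD delay: %d ticks\n", delay);
	return 0;
}
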
+ */ + if (ifacount <= 1 && (error = ifnet_ioctl(ifp, PF_INET6, SIOCSIFADDR, ia))) { - if (error == EOPNOTSUPP) - error = 0; + if (error == EOPNOTSUPP) + error = 0; else if (error) return(error); } @@ -2534,6 +2699,31 @@ in6ifa_ifpwithaddr(ifp, addr) return((struct in6_ifaddr *)ifa); } +struct in6_ifaddr * +in6ifa_prproxyaddr(struct in6_addr *addr) +{ + struct in6_ifaddr *ia; + + lck_rw_lock_shared(&in6_ifaddr_rwlock); + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + IFA_LOCK(&ia->ia_ifa); + if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(&ia->ia_ifa))) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); /* for caller */ + IFA_UNLOCK(&ia->ia_ifa); + break; + } + IFA_UNLOCK(&ia->ia_ifa); + } + lck_rw_done(&in6_ifaddr_rwlock); + + if (ia != NULL && !nd6_prproxy_ifaddr(ia)) { + IFA_REMREF(&ia->ia_ifa); + ia = NULL; + } + + return (ia); +} + /* * Convert IP6 address to printable (loggable) representation. */ @@ -2752,7 +2942,6 @@ in6_ifawithscope( struct in6_ifaddr *ifa_best = NULL; if (oifp == NULL) { - /* output interface is not specified */ return(NULL); } @@ -3199,9 +3388,6 @@ in6_if_up( int dad_delay; /* delay ticks before DAD output */ int error; - if (!in6_init2done) - return ENXIO; - /* * special cases, like 6to4, are handled in in6_ifattach */ @@ -3220,7 +3406,7 @@ in6_if_up( continue; } ia = (struct in6_ifaddr *)ifa; - if (ia->ia6_flags & IN6_IFF_TENTATIVE) { + if (ia->ia6_flags & IN6_IFF_DADPROGRESS) { IFA_UNLOCK(ifa); nd6_dad_start(ifa, &dad_delay); } else { @@ -3240,12 +3426,14 @@ in6if_do_dad( return(0); /* - * Skip DAD on service triggered interfaces, for now, - * until we have support for Opportunistic Duplicate - * Address Detection [RFC 4429] and we can then back - * this out. + * If we are using the alternative neighbor discovery + * interface on this interface, then skip DAD. + * + * Also, skip it for interfaces marked "local private" + * for now, even when not marked as using the alternative + * interface. This is for historical reasons. */ - if (ifp->if_eflags & IFEF_SERVICE_TRIGGERED) + if (ifp->if_eflags & (IFEF_IPV6_ND6ALT|IFEF_LOCALNET_PRIVATE)) return (0); switch (ifp->if_type) { @@ -3283,17 +3471,25 @@ in6if_do_dad( * to in6_maxmtu. */ void -in6_setmaxmtu() +in6_setmaxmtu(void) { u_int32_t maxmtu = 0; struct ifnet *ifp; ifnet_head_lock_shared(); TAILQ_FOREACH(ifp, &ifnet_head, if_list) { + struct nd_ifinfo *ndi; + lck_rw_lock_shared(nd_if_rwlock); + if ((ndi = ND_IFINFO(ifp)) != NULL && !ndi->initialized) + ndi = NULL; + if (ndi != NULL) + lck_mtx_lock(&ndi->lock); if ((ifp->if_flags & IFF_LOOPBACK) == 0 && IN6_LINKMTU(ifp) > maxmtu) maxmtu = IN6_LINKMTU(ifp); + if (ndi != NULL) + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); } ifnet_head_done(); @@ -3347,6 +3543,8 @@ in6_if2idlen(struct ifnet *ifp) return (64); /* for utun interfaces */ case IFT_CELLULAR: return (64); /* Packet Data over Cellular */ + case IFT_BRIDGE: + return (64); /* Transparent bridge interface */ default: /* * Unknown link type: @@ -3387,8 +3585,14 @@ in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) sin6->sin6_port = sin->sin_port; sin6->sin6_addr.s6_addr32[0] = 0; sin6->sin6_addr.s6_addr32[1] = 0; - sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; - sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr; + if (sin->sin_addr.s_addr) { + sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; + sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr; + } + else { + sin6->sin6_addr.s6_addr32[2] = 0; + sin6->sin6_addr.s6_addr32[3] = 0; + } } /* Convert sockaddr_in6 into sockaddr_in. 
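The in6_sin_2_v4mapsin6() change just above stops mapping INADDR_ANY to ::ffff:0.0.0.0 and yields the IPv6 unspecified address (::) instead. A userland illustration of the mapping, using only the standard socket headers:

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>

int
main(void)
{
	struct in_addr v4;
	struct in6_addr v6 = IN6ADDR_ANY_INIT;	/* starts out as :: */
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET, "192.0.2.1", &v4);
	if (v4.s_addr != INADDR_ANY) {
		/* words 0-1 stay zero; word 2 becomes 0000:ffff */
		v6.s6_addr[10] = 0xff;
		v6.s6_addr[11] = 0xff;
		memcpy(&v6.s6_addr[12], &v4.s_addr, sizeof (v4.s_addr));
	}
	inet_ntop(AF_INET6, &v6, buf, sizeof (buf));
	printf("%s\n", buf);		/* ::ffff:192.0.2.1 */
	return (0);
}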
*/ @@ -3402,8 +3606,8 @@ in6_sin6_2_sin_in_sock(struct sockaddr *nam) * Save original sockaddr_in6 addr and convert it * to sockaddr_in. */ - sin6 = *(struct sockaddr_in6 *)nam; - sin_p = (struct sockaddr_in *)nam; + sin6 = *(struct sockaddr_in6 *)(void *)nam; + sin_p = (struct sockaddr_in *)(void *)nam; in6_sin6_2_sin(sin_p, &sin6); } @@ -3418,7 +3622,7 @@ in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam) M_WAITOK); if (sin6_p == NULL) return ENOBUFS; - sin_p = (struct sockaddr_in *)*nam; + sin_p = (struct sockaddr_in *)(void *)*nam; in6_sin_2_v4mapsin6(sin_p, sin6_p); FREE(*nam, M_SONAME); *nam = (struct sockaddr *)sin6_p; @@ -3466,7 +3670,7 @@ in6_post_msg(struct ifnet *ifp, u_int32_t event_code, struct in6_ifaddr *ifa) IFA_UNLOCK(&ifa->ia_ifa); if (ifp != NULL) { - strncpy(&in6_event_data.link_data.if_name[0], + (void) strncpy(&in6_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); in6_event_data.link_data.if_family = ifp->if_family; in6_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; @@ -3630,3 +3834,37 @@ in6_ifaddr_trace(struct ifaddr *ifa, int refhold) idx = atomic_add_16_ov(cnt, 1) % IN6IFA_TRACE_HIST_SIZE; ctrace_record(&tr[idx]); } + +static void +in6_ifaddr_set_dadprogress(struct in6_ifaddr *ia) +{ + uint32_t flags = IN6_IFF_TENTATIVE; + uint32_t optdad = nd6_optimistic_dad; + + if (optdad && (ia->ia_ifp->if_eflags & IFEF_IPV6_ROUTER) == 0) { + if ((optdad & ND6_OPTIMISTIC_DAD_LINKLOCAL) && + IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) + flags = IN6_IFF_OPTIMISTIC; + else if ((optdad & ND6_OPTIMISTIC_DAD_AUTOCONF) && + (ia->ia6_flags & IN6_IFF_AUTOCONF)) { + if (ia->ia6_flags & IN6_IFF_TEMPORARY) { + if (optdad & ND6_OPTIMISTIC_DAD_TEMPORARY) + flags = IN6_IFF_OPTIMISTIC; + } else { + flags = IN6_IFF_OPTIMISTIC; + } + } else if ((optdad & ND6_OPTIMISTIC_DAD_DYNAMIC) && + (ia->ia6_flags & IN6_IFF_DYNAMIC)) { + if (ia->ia6_flags & IN6_IFF_TEMPORARY) { + if (optdad & ND6_OPTIMISTIC_DAD_TEMPORARY) + flags = IN6_IFF_OPTIMISTIC; + } else { + flags = IN6_IFF_OPTIMISTIC; + } + } + } + + ia->ia6_flags &= ~(IN6_IFF_DUPLICATED | IN6_IFF_DADPROGRESS); + ia->ia6_flags |= flags; +} + diff --git a/bsd/netinet6/in6.h b/bsd/netinet6/in6.h index c0838ec43..875af39a6 100644 --- a/bsd/netinet6/in6.h +++ b/bsd/netinet6/in6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2011 Apple Inc. All rights reserved. + * Copyright (c) 2008-2012 Apple Inc. All rights reserved. 
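The new SIN6()/satosin6()/sin6tosa() macros in the in6.h hunk above cast through (void *), the same pattern this patch applies to nearly every sockaddr cast. The intermediate cast is there to quiet alignment-increase warnings (-Wcast-align and friends): it documents that the caller, not the compiler, vouches for the pointer's alignment. A minimal sketch of the idiom:

#include <sys/socket.h>
#include <netinet/in.h>

/*
 * A direct (struct sockaddr_in6 *)sa cast can warn that the required
 * alignment increases; hopping through void * asserts that alignment
 * was already arranged by whoever built the sockaddr.
 */
static struct sockaddr_in6 *
as_sin6(struct sockaddr *sa)
{
	return ((struct sockaddr_in6 *)(void *)sa);
}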
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -216,6 +216,10 @@ extern const struct in6_addr in6mask32; extern const struct in6_addr in6mask64; extern const struct in6_addr in6mask96; extern const struct in6_addr in6mask128; + +#define SIN6(s) ((struct sockaddr_in6 *)(void *)s) +#define satosin6(sa) SIN6(sa) +#define sin6tosa(sin6) ((struct sockaddr *)(void *)(sin6)) #endif /* KERNEL_PRIVATE */ #ifdef KERNEL /*XXX nonstandard*/ @@ -556,7 +560,7 @@ struct route_in6 { #if 1 /*IPSEC*/ #define IPV6_IPSEC_POLICY 28 /* struct; get/set security policy */ #endif /* 1 */ -#define IPV6_FAITH 29 /* bool; accept FAITH'ed connections */ +#define IPV6_FAITH 29 /* deprecated */ #if 1 /*IPV6FIREWALL*/ #define IPV6_FW_ADD 30 /* add a firewall rule to chain */ @@ -725,7 +729,7 @@ struct ip6_mtuinfo { #define IPV6CTL_SOURCECHECK 10 /* verify source route and intf */ #define IPV6CTL_SOURCECHECK_LOGINT 11 /* minimume logging interval */ #define IPV6CTL_ACCEPT_RTADV 12 -#define IPV6CTL_KEEPFAITH 13 +#define IPV6CTL_KEEPFAITH 13 /* deprecated */ #define IPV6CTL_LOG_INTERVAL 14 #define IPV6CTL_HDRNESTLIMIT 15 #define IPV6CTL_DAD_COUNT 16 @@ -842,11 +846,6 @@ extern void in6_sin_2_v4mapsin6(struct sockaddr_in *sin, extern void in6_sin6_2_sin_in_sock(struct sockaddr *nam); extern int in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam); extern void in6_delayed_cksum(struct mbuf *, u_int16_t); - -#define satosin6(sa) ((struct sockaddr_in6 *)(sa)) -#define sin6tosa(sin6) ((struct sockaddr *)(sin6)) -#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) - extern int in6addr_local(struct in6_addr *); #define DEBUG_HWCKSUM 1 /* IPv6 Hardware checksum on/off */ diff --git a/bsd/netinet6/in6_cksum.c b/bsd/netinet6/in6_cksum.c index 77dd7e1af..bf8fe680d 100644 --- a/bsd/netinet6/in6_cksum.c +++ b/bsd/netinet6/in6_cksum.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 2009-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -219,7 +219,7 @@ inet6_cksum(struct mbuf *m, unsigned int nxt, unsigned int off, break; m = m->m_next; } - w = (u_int16_t *)(mtod(m, u_char *) + off); + w = (u_int16_t *)(void *)(mtod(m, u_char *) + off); mlen = m->m_len - off; if (len < mlen) mlen = len; @@ -231,7 +231,7 @@ inet6_cksum(struct mbuf *m, unsigned int nxt, unsigned int off, REDUCE; sum <<= 8; s_util.c[0] = *(u_char *)w; - w = (u_int16_t *)((char *)w + 1); + w = (u_int16_t *)(void *)((char *)w + 1); mlen--; byte_swapped = 1; } @@ -292,7 +292,7 @@ inet6_cksum(struct mbuf *m, unsigned int nxt, unsigned int off, */ s_util.c[1] = *(char *)w; sum += s_util.s; - w = (u_int16_t *)((char *)w + 1); + w = (u_int16_t *)(void *)((char *)w + 1); mlen = m->m_len - 1; len--; } else @@ -307,7 +307,7 @@ inet6_cksum(struct mbuf *m, unsigned int nxt, unsigned int off, REDUCE; sum <<= 8; s_util.c[0] = *(u_char *)w; - w = (u_int16_t *)((char *)w + 1); + w = (u_int16_t *)(void *)((char *)w + 1); mlen--; byte_swapped = 1; } diff --git a/bsd/netinet6/in6_gif.c b/bsd/netinet6/in6_gif.c index d620db95e..c977e9a7a 100644 --- a/bsd/netinet6/in6_gif.c +++ b/bsd/netinet6/in6_gif.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2011 Apple Inc. All rights reserved. + * Copyright (c) 2009-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -115,8 +115,8 @@ in6_gif_output( { struct gif_softc *sc = ifnet_softc(ifp); struct sockaddr_in6 *dst = (struct sockaddr_in6 *)&sc->gif_ro6.ro_dst; - struct sockaddr_in6 *sin6_src = (struct sockaddr_in6 *)sc->gif_psrc; - struct sockaddr_in6 *sin6_dst = (struct sockaddr_in6 *)sc->gif_pdst; + struct sockaddr_in6 *sin6_src = (struct sockaddr_in6 *)(void *)sc->gif_psrc; + struct sockaddr_in6 *sin6_dst = (struct sockaddr_in6 *)(void *)sc->gif_pdst; struct ip6_hdr *ip6; int proto; u_int8_t itos, otos; @@ -331,8 +331,8 @@ gif_validate6( { struct sockaddr_in6 *src, *dst; - src = (struct sockaddr_in6 *)sc->gif_psrc; - dst = (struct sockaddr_in6 *)sc->gif_pdst; + src = (struct sockaddr_in6 *)(void *)sc->gif_psrc; + dst = (struct sockaddr_in6 *)(void *)sc->gif_pdst; /* * Check for address match. Note that the check is for an incoming diff --git a/bsd/netinet6/in6_ifattach.c b/bsd/netinet6/in6_ifattach.c index 10bf295f4..31f707c81 100644 --- a/bsd/netinet6/in6_ifattach.c +++ b/bsd/netinet6/in6_ifattach.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2011 Apple Inc. All rights reserved. + * Copyright (c) 2003-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -65,7 +65,7 @@ #include #include #include -#include +#include #include #include @@ -92,6 +92,11 @@ #include +#define IN6_IFSTAT_ALLOC_SIZE \ + sizeof(void *) + sizeof(struct in6_ifstat) + sizeof(uint64_t) +#define ICMP6_IFSTAT_ALLOC_SIZE \ + sizeof(void *) + sizeof(struct icmp6_ifstat) + sizeof(uint64_t) + struct in6_ifstat **in6_ifstat = NULL; struct icmp6_ifstat **icmp6_ifstat = NULL; size_t in6_ifstatmax = 0; @@ -117,23 +122,12 @@ static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *); static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *, struct in6_aliasreq *); static int in6_ifattach_loopback(struct ifnet *); -#define EUI64_GBIT 0x01 -#define EUI64_UBIT 0x02 -#define EUI64_TO_IFID(in6) do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (0) -#define EUI64_GROUP(in6) ((in6)->s6_addr[8] & EUI64_GBIT) -#define EUI64_INDIVIDUAL(in6) (!EUI64_GROUP(in6)) -#define EUI64_LOCAL(in6) ((in6)->s6_addr[8] & EUI64_UBIT) -#define EUI64_UNIVERSAL(in6) (!EUI64_LOCAL(in6)) - -#define IFID_LOCAL(in6) (!EUI64_LOCAL(in6)) -#define IFID_UNIVERSAL(in6) (!EUI64_UNIVERSAL(in6)) - /* * Generate a last-resort interface identifier, when the machine has no * IEEE802/EUI64 address sources. * The goal here is to get an interface identifier that is * (1) random enough and (2) does not change across reboot. - * We currently use MD5(hostname) for it. + * We currently use SHA1(hostname) for it. * * in6 - upper 64bits are preserved */ @@ -142,8 +136,8 @@ get_rand_ifid( __unused struct ifnet *ifp, struct in6_addr *in6) /* upper 64bits are preserved */ { - MD5_CTX ctxt; - u_int8_t digest[16]; + SHA1_CTX ctxt; + u_int8_t digest[SHA1_RESULTLEN]; int hostnlen = strlen(hostname); #if 0 @@ -154,19 +148,19 @@ get_rand_ifid( /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); - MD5Init(&ctxt); - MD5Update(&ctxt, hostname, hostnlen); - MD5Final(digest, &ctxt); + SHA1Init(&ctxt); + SHA1Update(&ctxt, hostname, hostnlen); + SHA1Final(digest, &ctxt); /* assumes sizeof(digest) > sizeof(ifid) */ bcopy(digest, &in6->s6_addr[8], 8); /* make sure to set "u" bit to local, and "g" bit to individual. 
*/ - in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ - in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ + in6->s6_addr[8] &= ~ND6_EUI64_GBIT; /* g bit to "individual" */ + in6->s6_addr[8] |= ND6_EUI64_UBIT; /* u bit to "local" */ /* convert EUI64 into IPv6 interface identifier */ - EUI64_TO_IFID(in6); + ND6_EUI64_TO_IFID(in6); return 0; } @@ -177,8 +171,8 @@ generate_tmp_ifid( const u_int8_t *seed1, u_int8_t *ret) { - MD5_CTX ctxt; - u_int8_t seed[16], digest[16], nullbuf[8]; + SHA1_CTX ctxt; + u_int8_t seed[16], nullbuf[8], digest[SHA1_RESULTLEN]; u_int32_t val32; struct timeval tv; @@ -211,17 +205,17 @@ generate_tmp_ifid( /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); - MD5Init(&ctxt); - MD5Update(&ctxt, seed, sizeof(seed)); - MD5Final(digest, &ctxt); + SHA1Init(&ctxt); + SHA1Update(&ctxt, seed, sizeof(seed)); + SHA1Final(digest, &ctxt); /* * RFC 4941 3.2.1. (3) - * Take the left-most 64-bits of the MD5 digest and set bit 6 (the + * Take the left-most 64-bits of the SHA1 digest and set bit 6 (the * left-most bit is numbered 0) to zero. */ bcopy(digest, ret, 8); - ret[0] &= ~EUI64_UBIT; + ret[0] &= ~ND6_EUI64_UBIT; /* * XXX: we'd like to ensure that the generated value is not zero @@ -230,7 +224,7 @@ generate_tmp_ifid( */ if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) { nd6log((LOG_INFO, - "generate_tmp_ifid: computed MD5 value is zero.\n")); + "generate_tmp_ifid: computed SHA1 value is zero.\n")); microtime(&tv); val32 = random() ^ tv.tv_usec; @@ -239,7 +233,7 @@ generate_tmp_ifid( /* * RFC 4941 3.2.1. (4) - * Take the rightmost 64-bits of the MD5 digest and save them in + * Take the next 64-bits of the SHA1 digest and save them in * stable storage as the history value to be used in the next * iteration of the algorithm. */ @@ -280,7 +274,7 @@ in6_get_hw_ifid( /* Why doesn't this code use ifnet_addrs? */ ifnet_lock_shared(ifp); ifa = ifp->if_lladdr; - sdl = (struct sockaddr_dl *)ifa->ifa_addr; + sdl = (struct sockaddr_dl *)(void *)ifa->ifa_addr; if (sdl->sdl_alen == 0) { ifnet_lock_done(ifp); return (-1); @@ -351,8 +345,8 @@ in6_get_hw_ifid( /* * due to insufficient bitwidth, we mark it local. */ - in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ - in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ + in6->s6_addr[8] &= ~ND6_EUI64_GBIT; /* g bit to "individual" */ + in6->s6_addr[8] |= ND6_EUI64_UBIT; /* u bit to "local" */ break; case IFT_GIF: @@ -375,17 +369,17 @@ in6_get_hw_ifid( } /* sanity check: g bit must not indicate "group" */ - if (EUI64_GROUP(in6)) + if (ND6_EUI64_GROUP(in6)) goto done; /* convert EUI64 into IPv6 interface identifier */ - EUI64_TO_IFID(in6); + ND6_EUI64_TO_IFID(in6); /* * sanity check: ifid must not be all zero, avoid conflict with * subnet router anycast */ - if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 && + if ((in6->s6_addr[8] & ~(ND6_EUI64_GBIT | ND6_EUI64_UBIT)) == 0x00 && bcmp(&in6->s6_addr[9], allzero, 7) == 0) { goto done; } @@ -443,7 +437,7 @@ get_ifid( * to borrow ifid from other interface, ifid needs to be * globally unique */ - if (IFID_UNIVERSAL(in6)) { + if (ND6_IFID_UNIVERSAL(in6)) { nd6log((LOG_DEBUG, "%s: borrow interface identifier from %s\n", if_name(ifp0), if_name(ifp))); @@ -685,8 +679,8 @@ in6_nigroup( { const char *p; u_char *q; - MD5_CTX ctxt; - u_int8_t digest[16]; + SHA1_CTX ctxt; + u_int8_t digest[SHA1_RESULTLEN]; char l; char n[64]; /* a single label must not exceed 63 chars */ @@ -708,10 +702,10 @@ in6_nigroup( /* generate 8 bytes of pseudo-random value. 
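get_rand_ifid() above swaps MD5 for SHA1 but keeps the EUI-64 bit fix-ups. A userland sketch of the digest-to-IFID step, with OpenSSL's SHA1() standing in for the kernel's SHA1Init/SHA1Update/SHA1Final and local copies of the two EUI-64 bit masks:

#include <stdint.h>
#include <string.h>
#include <openssl/sha.h>

#define EUI64_GBIT	0x01	/* group/individual bit */
#define EUI64_UBIT	0x02	/* universal/local bit */

/* Derive an 8-byte IPv6 interface identifier from a stable seed. */
static void
seed_to_ifid(const uint8_t *seed, size_t seedlen, uint8_t ifid[8])
{
	uint8_t digest[SHA_DIGEST_LENGTH];

	SHA1(seed, seedlen, digest);
	memcpy(ifid, digest, 8);	/* keep the leftmost 64 bits */
	ifid[0] &= ~EUI64_GBIT;		/* g bit -> individual */
	ifid[0] |= EUI64_UBIT;		/* u bit -> local ... */
	ifid[0] ^= EUI64_UBIT;		/* ... then invert for the IFID */
}

The final XOR mirrors ND6_EUI64_TO_IFID(): the modified EUI-64 format used in IPv6 interface identifiers stores the u bit inverted, so a locally generated identifier ends up with that bit clear.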
*/ bzero(&ctxt, sizeof(ctxt)); - MD5Init(&ctxt); - MD5Update(&ctxt, &l, sizeof(l)); - MD5Update(&ctxt, n, l); - MD5Final(digest, &ctxt); + SHA1Init(&ctxt); + SHA1Update(&ctxt, &l, sizeof(l)); + SHA1Update(&ctxt, n, l); + SHA1Final(digest, &ctxt); bzero(in6, sizeof(*in6)); in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL; @@ -756,6 +750,7 @@ in6_ifattach( struct in6_ifaddr *ia; struct in6_addr in6; int error; + void *buf; lck_rw_lock_exclusive(&in6_ifs_rwlock); /* @@ -776,29 +771,30 @@ in6_ifattach( caddr_t q; n = if_indexlim * sizeof(struct in6_ifstat *); - q = (caddr_t)_MALLOC(n, M_IFADDR, M_WAITOK); + q = (caddr_t)_MALLOC(n, M_IFADDR, M_WAITOK|M_ZERO); if (q == NULL) { lck_rw_done(&in6_ifs_rwlock); return ENOBUFS; } - bzero(q, n); if (in6_ifstat) { bcopy((caddr_t)in6_ifstat, q, in6_ifstatmax * sizeof(struct in6_ifstat *)); FREE((caddr_t)in6_ifstat, M_IFADDR); } - in6_ifstat = (struct in6_ifstat **)q; + in6_ifstat = (struct in6_ifstat **)(void *)q; in6_ifstatmax = if_indexlim; } if (in6_ifstat[ifp->if_index] == NULL) { - in6_ifstat[ifp->if_index] = (struct in6_ifstat *) - _MALLOC(sizeof(struct in6_ifstat), M_IFADDR, M_WAITOK); - if (in6_ifstat[ifp->if_index] == NULL) { + buf = _MALLOC(IN6_IFSTAT_ALLOC_SIZE, M_IFADDR, M_WAITOK); + if (buf == NULL) { lck_rw_done(&in6_ifs_rwlock); return ENOBUFS; } - bzero(in6_ifstat[ifp->if_index], sizeof(struct in6_ifstat)); + bzero(buf, IN6_IFSTAT_ALLOC_SIZE); + in6_ifstat[ifp->if_index] = (struct in6_ifstat *) + P2ROUNDUP((intptr_t)buf + sizeof(void *), sizeof(uint64_t)); + VERIFY(IS_P2ALIGNED(in6_ifstat[ifp->if_index], sizeof(uint64_t))); } lck_rw_done(&in6_ifs_rwlock); @@ -808,29 +804,30 @@ in6_ifattach( caddr_t q; n = if_indexlim * sizeof(struct icmp6_ifstat *); - q = (caddr_t)_MALLOC(n, M_IFADDR, M_WAITOK); + q = (caddr_t)_MALLOC(n, M_IFADDR, M_WAITOK|M_ZERO); if (q == NULL) { lck_rw_done(&icmp6_ifs_rwlock); return ENOBUFS; } - bzero(q, n); if (icmp6_ifstat) { bcopy((caddr_t)icmp6_ifstat, q, icmp6_ifstatmax * sizeof(struct icmp6_ifstat *)); FREE((caddr_t)icmp6_ifstat, M_IFADDR); } - icmp6_ifstat = (struct icmp6_ifstat **)q; + icmp6_ifstat = (struct icmp6_ifstat **)(void *)q; icmp6_ifstatmax = if_indexlim; } if (icmp6_ifstat[ifp->if_index] == NULL) { - icmp6_ifstat[ifp->if_index] = (struct icmp6_ifstat *) - _MALLOC(sizeof(struct icmp6_ifstat), M_IFADDR, M_WAITOK); - if (icmp6_ifstat[ifp->if_index] == NULL) { + buf = _MALLOC(ICMP6_IFSTAT_ALLOC_SIZE, M_IFADDR, M_WAITOK); + if (buf == NULL) { lck_rw_done(&icmp6_ifs_rwlock); return ENOBUFS; } - bzero(icmp6_ifstat[ifp->if_index], sizeof(struct icmp6_ifstat)); + bzero(buf, ICMP6_IFSTAT_ALLOC_SIZE); + icmp6_ifstat[ifp->if_index] = (struct icmp6_ifstat *) + P2ROUNDUP((intptr_t)buf + sizeof(void *), sizeof(uint64_t)); + VERIFY(IS_P2ALIGNED(icmp6_ifstat[ifp->if_index], sizeof(uint64_t))); } lck_rw_done(&icmp6_ifs_rwlock); @@ -1113,7 +1110,9 @@ in6_get_tmpifid( struct nd_ifinfo *ndi; lck_rw_lock_shared(nd_if_rwlock); - ndi = &nd_ifinfo[ifp->if_index]; + ndi = ND_IFINFO(ifp); + VERIFY(ndi != NULL && ndi->initialized); + lck_mtx_lock(&ndi->lock); bzero(nullbuf, sizeof(nullbuf)); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) == 0) { /* we've never created a random ID. Create a new one. 
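The in6_ifattach() hunks above over-allocate each per-interface stat block by sizeof (void *) + sizeof (uint64_t) and round the usable pointer up, so the 64-bit counters inside start on a natural 8-byte boundary no matter what the allocator returned; the VERIFY(IS_P2ALIGNED(...)) then asserts exactly that. A hypothetical userland rendering of the same trick, with P2ROUNDUP written out as the usual power-of-two macro:

#include <stdint.h>
#include <stdlib.h>

/* Round x up to the next multiple of align (align a power of two). */
#define P2ROUNDUP(x, align) \
	(-(-((uintptr_t)(x)) & -((uintptr_t)(align))))

struct stats64 { uint64_t packets, bytes; };

/* Returns an 8-byte-aligned block; *raw is what free() must be given. */
static struct stats64 *
stats_alloc(void **raw)
{
	*raw = calloc(1, sizeof (void *) + sizeof (struct stats64) +
	    sizeof (uint64_t));
	if (*raw == NULL)
		return (NULL);
	return ((struct stats64 *)P2ROUNDUP((uintptr_t)*raw +
	    sizeof (void *), sizeof (uint64_t)));
}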
*/ @@ -1128,13 +1127,14 @@ in6_get_tmpifid( ndi->randomid); } bcopy(ndi->randomid, retbuf, 8); + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); } void -in6_tmpaddrtimer( - __unused void *ignored_arg) +in6_tmpaddrtimer(void *arg) { +#pragma unused(arg) int i; struct nd_ifinfo *ndi; u_int8_t nullbuf[8]; @@ -1143,23 +1143,25 @@ in6_tmpaddrtimer( (ip6_temp_preferred_lifetime - ip6_desync_factor - ip6_temp_regen_advance) * hz); - if (ip6_use_tempaddr) { - lck_rw_lock_shared(nd_if_rwlock); - bzero(nullbuf, sizeof(nullbuf)); - for (i = 1; i < nd_ifinfo_indexlim + 1; i++) { - ndi = &nd_ifinfo[i]; - if ((ndi->flags | ND6_IFF_PERFORMNUD) != ND6_IFF_PERFORMNUD) - continue; - if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) { - /* - * We've been generating a random ID on this interface. - * Create a new one. - */ - (void)generate_tmp_ifid(ndi->randomseed0, - ndi->randomseed1, - ndi->randomid); - } + lck_rw_lock_shared(nd_if_rwlock); + bzero(nullbuf, sizeof(nullbuf)); + for (i = 1; i < if_index + 1; i++) { + if (!nd_ifinfo || i >= nd_ifinfo_indexlim) + break; + ndi = &nd_ifinfo[i]; + if (!ndi->initialized) + continue; + lck_mtx_lock(&ndi->lock); + if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) { + /* + * We've been generating a random ID on this interface. + * Create a new one. + */ + (void)generate_tmp_ifid(ndi->randomseed0, + ndi->randomseed1, + ndi->randomid); } - lck_rw_done(nd_if_rwlock); + lck_mtx_unlock(&ndi->lock); } + lck_rw_done(nd_if_rwlock); } diff --git a/bsd/netinet6/in6_mcast.c b/bsd/netinet6/in6_mcast.c index 05670d211..715403cf8 100644 --- a/bsd/netinet6/in6_mcast.c +++ b/bsd/netinet6/in6_mcast.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2011 Apple Inc. All rights reserved. + * Copyright (c) 2010-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -351,7 +351,7 @@ im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp, IM6O_LOCK_ASSERT_HELD(IM6O_CAST_TO_NONCONST(imo)); - gsin6 = (const struct sockaddr_in6 *)group; + gsin6 = (struct sockaddr_in6 *)(uintptr_t)(size_t)group; /* The im6o_membership array may be lazy allocated. 
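im6o_match_group() and im6o_match_source() above accept const pointers but need non-const locals; the (uintptr_t)(size_t) double cast is how the patch sheds the qualifier without the diagnostic a direct pointer cast would draw under -Wcast-qual. As a sketch:

#include <stdint.h>
#include <sys/socket.h>
#include <netinet/in.h>

/*
 * Round-tripping through an integer type discards const without a
 * pointer-to-pointer cast warning; alignment is the caller's promise,
 * as with the (void *) casts elsewhere in this patch.
 */
static struct sockaddr_in6 *
dequalify_sin6(const struct sockaddr *group)
{
	return ((struct sockaddr_in6 *)(uintptr_t)(size_t)group);
}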
*/ if (imo->im6o_membership == NULL || imo->im6o_num_memberships == 0) @@ -407,7 +407,7 @@ im6o_match_source(const struct ip6_moptions *imo, const size_t gidx, return (NULL); imf = &imo->im6o_mfilters[gidx]; - psa = (const sockunion_t *)src; + psa = (sockunion_t *)(uintptr_t)(size_t)src; find.im6s_addr = psa->sin6.sin6_addr; in6_clearscope(&find.im6s_addr); /* XXX */ ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); @@ -1662,7 +1662,11 @@ in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt) if (ifp == NULL) return (EADDRNOTAVAIL); - + + if ((size_t) msfr.msfr_nsrcs > + SIZE_MAX / sizeof(struct sockaddr_storage)) + msfr.msfr_nsrcs = SIZE_MAX / sizeof(struct sockaddr_storage); + if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) msfr.msfr_nsrcs = in6_mcast_maxsocksrc; @@ -1703,12 +1707,13 @@ in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt) tmp_ptr = CAST_USER_ADDR_T(msfr32.msfr_srcs); if (tmp_ptr != USER_ADDR_NULL && msfr.msfr_nsrcs > 0) { - tss = _MALLOC(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, + tss = _MALLOC((size_t) msfr.msfr_nsrcs * sizeof(*tss), M_TEMP, M_WAITOK | M_ZERO); if (tss == NULL) { IM6O_UNLOCK(imo); return (ENOBUFS); } + bzero(tss, (size_t) msfr.msfr_nsrcs * sizeof(*tss)); } /* @@ -1738,8 +1743,7 @@ in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt) IM6O_UNLOCK(imo); if (tss != NULL) { - error = copyout(tss, tmp_ptr, - sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); + error = copyout(tss, tmp_ptr, ncsrcs * sizeof(*tss)); FREE(tss, M_TEMP); if (error) return (error); @@ -1870,7 +1874,7 @@ in6p_lookup_mcast_ifp(const struct inpcb *in6p, return NULL; if (in6p != NULL && (in6p->inp_flags & INP_BOUND_IF)) - ifscope = in6p->inp_boundif; + ifscope = in6p->inp_boundifp->if_index; ifp = NULL; memset(&ro6, 0, sizeof(struct route_in6)); @@ -1911,7 +1915,7 @@ in6p_lookup_v4addr(struct ipv6_mreq *mreq, struct ip_mreq *v4mreq) ifa = ifa_ifpgetprimary(ifp, AF_INET); if (ifa == NULL) return (EADDRNOTAVAIL); - sin = (struct sockaddr_in *)ifa->ifa_addr; + sin = (struct sockaddr_in *)(uintptr_t)(size_t)ifa->ifa_addr; v4mreq->imr_interface.s_addr = sin->sin_addr.s_addr; IFA_REMREF(ifa); @@ -2083,7 +2087,8 @@ in6p_join_group(struct inpcb *inp, struct sockopt *sopt) * a VERIFY() in in6_mc_join(). 
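The msfr_nsrcs clamp added above keeps the later msfr.msfr_nsrcs * sizeof (struct sockaddr_storage) products (the _MALLOC size and the copyin/copyout lengths) from wrapping a 32-bit size_t. The same guard as a reusable sketch:

#include <stdint.h>
#include <stdlib.h>

/* Allocate n elements of elem_size bytes each, refusing on overflow. */
static void *
alloc_array_checked(size_t n, size_t elem_size)
{
	if (elem_size != 0 && n > SIZE_MAX / elem_size)
		return (NULL);		/* n * elem_size would wrap */
	return (calloc(n, elem_size));	/* calloc re-checks, and zeroes */
}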
*/ if ((IN6_IS_ADDR_MC_LINKLOCAL(&gsa->sin6.sin6_addr) || - IN6_IS_ADDR_MC_INTFACELOCAL(&gsa->sin6.sin6_addr)) && scopeid == 0) + IN6_IS_ADDR_MC_INTFACELOCAL(&gsa->sin6.sin6_addr)) && + (scopeid == 0 || gsa->sin6.sin6_addr.s6_addr16[1] == 0)) return (EINVAL); imo = in6p_findmoptions(inp); @@ -2626,6 +2631,10 @@ in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt) memcpy(&msfr, &msfr32, sizeof(msfr)); } + if ((size_t) msfr.msfr_nsrcs > + SIZE_MAX / sizeof(struct sockaddr_storage)) + msfr.msfr_nsrcs = SIZE_MAX / sizeof(struct sockaddr_storage); + if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) return (ENOBUFS); @@ -2697,7 +2706,7 @@ in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt) MLD_PRINTF(("%s: loading %lu source list entries\n", __func__, (unsigned long)msfr.msfr_nsrcs)); - kss = _MALLOC(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, + kss = _MALLOC((size_t) msfr.msfr_nsrcs * sizeof(*kss), M_TEMP, M_WAITOK); if (kss == NULL) { error = ENOMEM; @@ -2705,7 +2714,7 @@ in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt) } error = copyin(tmp_ptr, kss, - sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); + (size_t) msfr.msfr_nsrcs * sizeof(*kss)); if (error) { FREE(kss, M_TEMP); goto out_imo_locked; diff --git a/bsd/netinet6/in6_pcb.c b/bsd/netinet6/in6_pcb.c index 2ea4d7a5d..4c5925528 100644 --- a/bsd/netinet6/in6_pcb.c +++ b/bsd/netinet6/in6_pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2011 Apple Inc. All rights reserved. + * Copyright (c) 2003-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -119,6 +119,7 @@ #include #include #include +#include #include #include @@ -186,8 +187,10 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; u_short lport = 0; int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); +#if !CONFIG_EMBEDDED int error; kauth_cred_t cred; +#endif if (!in6_ifaddrs) /* XXX broken! 
*/ return (EADDRNOTAVAIL); @@ -198,9 +201,9 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) socket_unlock(so, 0); /* keep reference */ lck_rw_lock_exclusive(pcbinfo->mtx); if (nam) { - unsigned int outif = 0; + struct ifnet *outif = NULL; - sin6 = (struct sockaddr_in6 *)nam; + sin6 = (struct sockaddr_in6 *)(void *)nam; if (nam->sa_len != sizeof(*sin6)) { lck_rw_done(pcbinfo->mtx); socket_lock(so, 0); @@ -262,7 +265,7 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) socket_lock(so, 0); return(EADDRNOTAVAIL); } - outif = ifa->ifa_ifp->if_index; + outif = ifa->ifa_ifp; IFA_UNLOCK(ifa); IFA_REMREF(ifa); } @@ -271,6 +274,7 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) struct inpcb *t; /* GROSS */ +#if !CONFIG_EMBEDDED if (ntohs(lport) < IPV6PORT_RESERVED) { cred = kauth_cred_proc_ref(p); error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); @@ -281,8 +285,9 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) return(EACCES); } } +#endif - if (so->so_uid && + if (kauth_cred_getuid(so->so_cred) && !IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { t = in6_pcblookup_local_and_cleanup(pcbinfo, &sin6->sin6_addr, lport, @@ -292,7 +297,8 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) || (t->inp_socket->so_options & SO_REUSEPORT) == 0) && - (so->so_uid != t->inp_socket->so_uid) && + (kauth_cred_getuid(so->so_cred) != + kauth_cred_getuid(t->inp_socket->so_cred)) && ((t->inp_socket->so_flags & SOF_REUSESHAREUID) == 0)) { lck_rw_done(pcbinfo->mtx); socket_lock(so, 0); @@ -307,8 +313,8 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) sin.sin_addr, lport, INPLOOKUP_WILDCARD); if (t && (t->inp_socket->so_options & SO_REUSEPORT) == 0 && - (so->so_uid != - t->inp_socket->so_uid) && + (kauth_cred_getuid(so->so_cred) != + kauth_cred_getuid(t->inp_socket->so_cred)) && (ntohl(t->inp_laddr.s_addr) != INADDR_ANY || INP_SOCKAF(so) == @@ -348,7 +354,7 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) } } inp->in6p_laddr = sin6->sin6_addr; - inp->in6p_last_outif = outif; + inp->in6p_last_outifp = outif; } socket_lock(so, 0); if (lport == 0) { @@ -363,7 +369,7 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) if (in_pcbinshash(inp, 1) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; - inp->in6p_last_outif = 0; + inp->in6p_last_outifp = NULL; lck_rw_done(pcbinfo->mtx); return (EAGAIN); } @@ -374,27 +380,32 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) } /* - * Transform old in6_pcbconnect() into an inner subroutine for new - * in6_pcbconnect(): Do some validity-checking on the remote - * address (in mbuf 'nam') and then determine local host address - * (i.e., which interface) to use to access that remote host. + * Transform old in6_pcbconnect() into an inner subroutine for new + * in6_pcbconnect(): Do some validity-checking on the remote + * address (in mbuf 'nam') and then determine local host address + * (i.e., which interface) to use to access that remote host. * - * This preserves definition of in6_pcbconnect(), while supporting a - * slightly different version for T/TCP. (This is more than - * a bit of a kludge, but cleaning up the internal interfaces would - * have forced minor changes in every protocol). + * This preserves definition of in6_pcbconnect(), while supporting a + * slightly different version for T/TCP. 
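in6_pcbbind() above drops the cached so_uid field in favor of live kauth_cred_getuid() lookups on so_cred. The sharing rule it enforces, restated as a hedged standalone predicate (the wildcard-address conditions from the hunk are deliberately omitted):

#include <sys/types.h>

/*
 * Simplified: an existing pcb on the same port blocks a new bind when
 * it does not allow reuse, belongs to a different user, and has not
 * opted into cross-uid sharing (SOF_REUSESHAREUID in the real code).
 */
static int
bind_conflicts(int existing_allows_reuse, uid_t existing_uid,
    uid_t binder_uid, int existing_shares_across_uids)
{
	return (!existing_allows_reuse &&
	    existing_uid != binder_uid &&
	    !existing_shares_across_uids);
}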
(This is more than + * a bit of a kludge, but cleaning up the internal interfaces would + * have forced minor changes in every protocol). + * + * This routine might return an ifp with a reference held if the caller + * provides a non-NULL outif, even in the error case. The caller is + * responsible for releasing its reference. */ - int in6_pcbladdr(struct inpcb *inp, struct sockaddr *nam, - struct in6_addr *plocal_addr6, unsigned int *poutif) + struct in6_addr *plocal_addr6, struct ifnet **outif) { - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)(void *)nam; struct in6_addr *addr6 = NULL; struct in6_addr src_storage; int error = 0; unsigned int ifscope; + if (outif != NULL) + *outif = NULL; if (nam->sa_len != sizeof (*sin6)) return (EINVAL); if (sin6->sin6_family != AF_INET6) @@ -404,7 +415,7 @@ in6_pcbladdr(struct inpcb *inp, struct sockaddr *nam, /* KAME hack: embed scopeid */ if (in6_embedscope(&sin6->sin6_addr, sin6, inp, NULL, NULL) != 0) - return EINVAL; + return (EINVAL); if (in6_ifaddrs) { /* @@ -416,36 +427,54 @@ in6_pcbladdr(struct inpcb *inp, struct sockaddr *nam, } ifscope = (inp->inp_flags & INP_BOUND_IF) ? - inp->inp_boundif : IFSCOPE_NONE; + inp->inp_boundifp->if_index : IFSCOPE_NONE; /* * XXX: in6_selectsrc might replace the bound local address * with the address specified by setsockopt(IPV6_PKTINFO). * Is it the intended behavior? + * + * in6_selectsrc() might return outif with its reference held + * even in the error case; caller always needs to release it + * if non-NULL. */ addr6 = in6_selectsrc(sin6, inp->in6p_outputopts, inp, - &inp->in6p_route, NULL, &src_storage, ifscope, &error); - if (addr6 == 0) { - if (error == 0) - error = EADDRNOTAVAIL; - return(error); + &inp->in6p_route, outif, &src_storage, ifscope, &error); + + if (outif != NULL) { + struct rtentry *rt = inp->in6p_route.ro_rt; + /* + * If in6_selectsrc() returns a route, it should be one + * which points to the same ifp as outif. Just in case + * it isn't, use the one from the route for consistency. + * Otherwise if there is no route, leave outif alone as + * it could still be useful to the caller. + */ + if (rt != NULL && rt->rt_ifp != *outif) { + ifnet_reference(rt->rt_ifp); /* for caller */ + if (*outif != NULL) + ifnet_release(*outif); + *outif = rt->rt_ifp; + } } - if (poutif != NULL) { - struct rtentry *rt; - if ((rt = inp->in6p_route.ro_rt) != NULL) - *poutif = rt->rt_ifp->if_index; - else - *poutif = 0; + if (addr6 == NULL) { + if (outif != NULL && (*outif) != NULL && + (inp->inp_flags & INP_NO_IFT_CELLULAR) && + (*outif)->if_type == IFT_CELLULAR) + soevent(inp->inp_socket, + (SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED)); + if (error == 0) + error = EADDRNOTAVAIL; + return (error); } *plocal_addr6 = *addr6; /* * Don't do pcblookup call here; return interface in - * plocal_addr6 - * and exit to caller, that will do the lookup. + * plocal_addr6 and exit to caller, that will do the lookup. */ - return(0); + return (0); } /* @@ -462,17 +491,27 @@ in6_pcbconnect( struct proc *p) { struct in6_addr addr6; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)(void *)nam; struct inpcb *pcb; - int error; - unsigned int outif = 0; + int error = 0; + struct ifnet *outif = NULL; /* * Call inner routine, to assign local interface address. * in6_pcbladdr() may automatically fill in sin6_scope_id. 
+ * + * in6_pcbladdr() might return an ifp with its reference held + * even in the error case, so make sure that it's released + * whenever it's non-NULL. */ - if ((error = in6_pcbladdr(inp, nam, &addr6, &outif)) != 0) - return(error); + if ((error = in6_pcbladdr(inp, nam, &addr6, &outif)) != 0) { + if ((inp->inp_flags & INP_NO_IFT_CELLULAR) && + outif != NULL && + outif->if_type == IFT_CELLULAR) + soevent(inp->inp_socket, + (SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED)); + goto done; + } socket_unlock(inp->inp_socket, 0); pcb = in6_pcblookup_hash(inp->inp_pcbinfo, &sin6->sin6_addr, sin6->sin6_port, @@ -482,16 +521,17 @@ in6_pcbconnect( socket_lock(inp->inp_socket, 0); if (pcb != NULL) { in_pcb_checkstate(pcb, WNT_RELEASE, pcb == inp ? 1 : 0); - return (EADDRINUSE); + error = EADDRINUSE; + goto done; } if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, p); if (error) - return (error); + goto done; } inp->in6p_laddr = addr6; - inp->in6p_last_outif = outif; + inp->in6p_last_outifp = outif; /* no reference needed */ } if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) { /*lock inversion issue, mostly with udp multicast packets */ @@ -509,7 +549,12 @@ in6_pcbconnect( in_pcbrehash(inp); lck_rw_done(inp->inp_pcbinfo->mtx); - return (0); + +done: + if (outif != NULL) + ifnet_release(outif); + + return (error); } void @@ -559,7 +604,7 @@ in6_pcbdetach( inp->inp_gencnt = ++ipi->ipi_gencnt; if (inp->in6p_options) m_freem(inp->in6p_options); - ip6_freepcbopts(inp->in6p_outputopts); + ip6_freepcbopts(inp->in6p_outputopts); if (inp->in6p_route.ro_rt) { rtfree(inp->in6p_route.ro_rt); inp->in6p_route.ro_rt = NULL; @@ -749,14 +794,15 @@ in6_pcbnotify(pcbinfo, dst, fport_arg, src, lport_arg, cmd, cmdarg, notify) if ((unsigned)cmd > PRC_NCMDS || dst->sa_family != AF_INET6) return; - sa6_dst = (struct sockaddr_in6 *)dst; + sa6_dst = (struct sockaddr_in6 *)(void *)dst; if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr)) return; /* * note that src can be NULL when we get notify by local fragmentation. */ - sa6_src = (src == NULL) ? sa6_any : *(const struct sockaddr_in6 *)src; + sa6_src = (src == NULL) ? 
+ sa6_any : *(struct sockaddr_in6 *)(uintptr_t)(size_t)src; flowinfo = sa6_src.sin6_flowinfo; /* @@ -795,8 +841,8 @@ in6_pcbnotify(pcbinfo, dst, fport_arg, src, lport_arg, cmd, cmdarg, notify) if (cmd == PRC_MSGSIZE && (inp->inp_flags & IN6P_MTU) != 0 && (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &sa6_dst->sin6_addr))) { - ip6_notify_pmtu(inp, (struct sockaddr_in6 *)dst, - (u_int32_t *)cmdarg); + ip6_notify_pmtu(inp, (struct sockaddr_in6 *)(void *)dst, + (u_int32_t *)cmdarg); } /* @@ -1000,20 +1046,13 @@ in6_pcblookup_hash_exists( int wildcard, uid_t *uid, gid_t *gid, - __unused struct ifnet *ifp) + struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp; u_short fport = fport_arg, lport = lport_arg; - int faith; int found; -#if defined(NFAITH) && NFAITH > 0 - faith = faithprefix(laddr); -#else - faith = 0; -#endif - *uid = UID_MAX; *gid = GID_MAX; @@ -1028,6 +1067,12 @@ in6_pcblookup_hash_exists( LIST_FOREACH(inp, head, inp_hash) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; + + if (ip6_restrictrecvif && ifp != NULL && + (ifp->if_eflags & IFEF_RESTRICTED_RECV) && + !(inp->in6p_flags & IN6P_RECV_ANYIF)) + continue; + if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_fport == fport && @@ -1036,8 +1081,10 @@ in6_pcblookup_hash_exists( /* * Found. Check if pcb is still valid */ - *uid = inp->inp_socket->so_uid; - *gid = inp->inp_socket->so_gid; + *uid = kauth_cred_getuid( + inp->inp_socket->so_cred); + *gid = kauth_cred_getgid( + inp->inp_socket->so_cred); } lck_rw_done(pcbinfo->mtx); return (found); @@ -1051,15 +1098,21 @@ in6_pcblookup_hash_exists( LIST_FOREACH(inp, head, inp_hash) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; + + if (ip6_restrictrecvif && ifp != NULL && + (ifp->if_eflags & IFEF_RESTRICTED_RECV) && + !(inp->in6p_flags & IN6P_RECV_ANYIF)) + continue; + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && inp->inp_lport == lport) { - if (faith && (inp->inp_flags & INP_FAITH) == 0) - continue; if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { if ((found = (inp->inp_socket != NULL))) { - *uid = inp->inp_socket->so_uid; - *gid = inp->inp_socket->so_gid; + *uid = kauth_cred_getuid( + inp->inp_socket->so_cred); + *gid = kauth_cred_getgid( + inp->inp_socket->so_cred); } lck_rw_done(pcbinfo->mtx); return (found); @@ -1070,8 +1123,10 @@ in6_pcblookup_hash_exists( } if (local_wild) { if ((found = (local_wild->inp_socket != NULL))) { - *uid = local_wild->inp_socket->so_uid; - *gid = local_wild->inp_socket->so_gid; + *uid = kauth_cred_getuid( + local_wild->inp_socket->so_cred); + *gid = kauth_cred_getgid( + local_wild->inp_socket->so_cred); } lck_rw_done(pcbinfo->mtx); return (found); @@ -1101,13 +1156,6 @@ in6_pcblookup_hash( struct inpcbhead *head; struct inpcb *inp; u_short fport = fport_arg, lport = lport_arg; - int faith; - -#if defined(NFAITH) && NFAITH > 0 - faith = faithprefix(laddr); -#else - faith = 0; -#endif lck_rw_lock_shared(pcbinfo->mtx); @@ -1120,20 +1168,26 @@ in6_pcblookup_hash( LIST_FOREACH(inp, head, inp_hash) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; + + if (ip6_restrictrecvif && ifp != NULL && + (ifp->if_eflags & IFEF_RESTRICTED_RECV) && + !(inp->in6p_flags & IN6P_RECV_ANYIF)) + continue; + if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_fport == fport && inp->inp_lport == lport) { /* - * Found. Check if pcb is still valid - */ + * Found. 
Check if pcb is still valid + */ if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) { lck_rw_done(pcbinfo->mtx); return (inp); } else { /* it's there but dead, say it isn't found */ - lck_rw_done(pcbinfo->mtx); - return(NULL); + lck_rw_done(pcbinfo->mtx); + return (NULL); } } } @@ -1145,10 +1199,14 @@ in6_pcblookup_hash( LIST_FOREACH(inp, head, inp_hash) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; + + if (ip6_restrictrecvif && ifp != NULL && + (ifp->if_eflags & IFEF_RESTRICTED_RECV) && + !(inp->in6p_flags & IN6P_RECV_ANYIF)) + continue; + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && inp->inp_lport == lport) { - if (faith && (inp->inp_flags & INP_FAITH) == 0) - continue; if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) { @@ -1156,8 +1214,8 @@ in6_pcblookup_hash( return (inp); } else { /* it's there but dead, say it isn't found */ - lck_rw_done(pcbinfo->mtx); - return(NULL); + lck_rw_done(pcbinfo->mtx); + return (NULL); } } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) @@ -1199,3 +1257,32 @@ init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m) return; } + +void +in6p_route_copyout(struct inpcb *inp, struct route_in6 *dst) +{ + struct route_in6 *src = &inp->in6p_route; + + lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + + /* Minor sanity check */ + if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET6) + panic("%s: wrong or corrupted route: %p", __func__, src); + + route_copyout((struct route *)dst, (struct route *)src, sizeof(*dst)); +} + +void +in6p_route_copyin(struct inpcb *inp, struct route_in6 *src) +{ + struct route_in6 *dst = &inp->in6p_route; + + lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + + /* Minor sanity check */ + if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET6) + panic("%s: wrong or corrupted route: %p", __func__, src); + + route_copyin((struct route *)src, (struct route *)dst, sizeof(*src)); +} + diff --git a/bsd/netinet6/in6_pcb.h b/bsd/netinet6/in6_pcb.h index d83836bbc..1ad240007 100644 --- a/bsd/netinet6/in6_pcb.h +++ b/bsd/netinet6/in6_pcb.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2011 Apple Inc. All rights reserved. + * Copyright (c) 2008-2012 Apple Inc. All rights reserved. 
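The in6p_route_copyout()/in6p_route_copyin() pair introduced above snapshots the pcb's cached route into a caller-local copy and later writes it back, each half asserting that inpcb_mtx is held. A hypothetical caller, sketching the bracket around an output path that must drop the pcb lock (do_output and the surrounding kernel headers are stand-ins, not part of this patch):

/*
 * Kernel-context sketch. route_copyout/route_copyin transfer the
 * route reference between the pcb and the local copy, so no explicit
 * rtfree is needed on this path.
 */
static int
output_with_cached_route(struct inpcb *inp, struct mbuf *m)
{
	struct route_in6 ro;
	int error;

	in6p_route_copyout(inp, &ro);	/* under inpcb_mtx */
	error = do_output(inp, m, &ro);	/* may unlock/relock inp */
	in6p_route_copyin(inp, &ro);	/* write back, under inpcb_mtx */
	return (error);
}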
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,17 +95,13 @@ #include #ifdef KERNEL_PRIVATE -#define satosin6(sa) ((struct sockaddr_in6 *)(sa)) -#define sin6tosa(sin6) ((struct sockaddr *)(sin6)) -#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) - extern void in6_losing(struct inpcb *); extern int in6_pcbbind(struct inpcb *, struct sockaddr *, struct proc *); extern int in6_pcbconnect(struct inpcb *, struct sockaddr *, struct proc *); extern void in6_pcbdetach(struct inpcb *); extern void in6_pcbdisconnect(struct inpcb *); extern int in6_pcbladdr(struct inpcb *, struct sockaddr *, - struct in6_addr *, unsigned int *); + struct in6_addr *, struct ifnet **); extern struct inpcb *in6_pcblookup_local(struct inpcbinfo *, struct in6_addr *, u_int, int); extern struct inpcb *in6_pcblookup_hash(struct inpcbinfo *, struct in6_addr *, @@ -126,6 +122,8 @@ extern int in6_selecthlim(struct in6pcb *, struct ifnet *); extern int in6_pcbsetport(struct in6_addr *, struct inpcb *, struct proc *, int); extern void init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m); +extern void in6p_route_copyout(struct inpcb *, struct route_in6 *); +extern void in6p_route_copyin(struct inpcb *, struct route_in6 *); #endif /* KERNEL_PRIVATE */ #endif /* !_NETINET6_IN6_PCB_H_ */ diff --git a/bsd/netinet6/in6_proto.c b/bsd/netinet6/in6_proto.c index c0228feeb..cb44f9b41 100644 --- a/bsd/netinet6/in6_proto.c +++ b/bsd/netinet6/in6_proto.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2010 Apple Inc. All rights reserved. + * Copyright (c) 2008-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -395,7 +395,7 @@ int ip6_forwarding = IPV6FORWARDING; /* act as router? */ int ip6_sendredirects = IPV6_SENDREDIRECTS; int ip6_defhlim = IPV6_DEFHLIM; int ip6_defmcasthlim = IPV6_DEFAULT_MULTICAST_HOPS; -int ip6_accept_rtadv = 0; /* "IPV6FORWARDING ? 
0 : 1" is dangerous */ +int ip6_accept_rtadv = 1; /* deprecated */ int ip6_maxfragpackets; /* initialized in frag6.c:frag6_init() */ int ip6_maxfrags; int ip6_log_interval = 5; @@ -417,7 +417,7 @@ int ip6_maxdynroutes = 1024; /* Max # of routes created via redirect */ int ip6_only_allow_rfc4193_prefix = 0; /* Only allow RFC4193 style Unique Local IPv6 Unicast prefixes */ u_int32_t ip6_id = 0UL; -int ip6_keepfaith = 0; +static int ip6_keepfaith = 0; time_t ip6_log_time = (time_t)0L; int nd6_onlink_ns_rfc4861 = 0; /* allow 'on-link' nd6 NS (as in RFC 4861) */ @@ -521,10 +521,10 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_maxfrags, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, - accept_rtadv, CTLFLAG_RW | CTLFLAG_LOCKED, + accept_rtadv, CTLFLAG_RD | CTLFLAG_LOCKED, &ip6_accept_rtadv, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_KEEPFAITH, - keepfaith, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_keepfaith, 0, ""); + keepfaith, CTLFLAG_RD | CTLFLAG_LOCKED, &ip6_keepfaith, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL, log_interval, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_log_interval, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, @@ -609,3 +609,5 @@ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861, nd6_onlink_ns_rfc4861, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_onlink_ns_rfc4861, 0, "Accept 'on-link' nd6 NS in compliance with RFC 4861."); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_OPTIMISTIC_DAD, + nd6_optimistic_dad, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_optimistic_dad, 0, ""); diff --git a/bsd/netinet6/in6_rmx.c b/bsd/netinet6/in6_rmx.c index 63a66121d..509bba093 100644 --- a/bsd/netinet6/in6_rmx.c +++ b/bsd/netinet6/in6_rmx.c @@ -154,7 +154,7 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, struct radix_node *treenodes) { struct rtentry *rt = (struct rtentry *)treenodes; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)rt_key(rt); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)(void *)rt_key(rt); struct radix_node *ret; lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); diff --git a/bsd/netinet6/in6_src.c b/bsd/netinet6/in6_src.c index 1eb5cd60f..40bad0948 100644 --- a/bsd/netinet6/in6_src.c +++ b/bsd/netinet6/in6_src.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -151,11 +151,11 @@ extern lck_mtx_t *addrsel_mutex; static int selectroute(struct sockaddr_in6 *, struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, - struct ifnet **, struct rtentry **, int, int, unsigned int, - unsigned int); + struct ifnet **, struct rtentry **, int, int, + const struct ip6_out_args *ip6oa); static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *, - struct ip6_moptions *, struct route_in6 *ro, unsigned int, - unsigned int, struct ifnet **); + struct ip6_moptions *, struct route_in6 *ro, + const struct ip6_out_args *, struct ifnet **); static void init_policy_queue(void); static int add_addrsel_policyent(const struct in6_addrpolicy *); #ifdef ENABLE_ADDRSEL @@ -192,6 +192,11 @@ void addrsel_policy_init(void); goto out; /* XXX: we can't use 'break' here */ \ } while(0) +/* + * Regardless of error, it will return an ifp with a reference held if the + * caller provides a non-NULL ifpp. 
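In the in6_proto.c hunk above, accept_rtadv and keepfaith become read-only sysctls (CTLFLAG_RW drops to CTLFLAG_RD) as part of their deprecation. From userland the knobs can still be read, while writes now fail; for example:

#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int
main(void)
{
	int val = 0;
	size_t len = sizeof (val);

	/* Reads still work; a write would now fail (EPERM). */
	if (sysctlbyname("net.inet6.ip6.accept_rtadv",
	    &val, &len, NULL, 0) == 0)
		printf("accept_rtadv = %d\n", val);
	return (0);
}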
The caller is responsible for checking + * if the returned ifp is valid and release its reference at all times. + */ struct in6_addr * in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct inpcb *inp, struct route_in6 *ro, @@ -208,7 +213,7 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, int prefer_tempaddr; struct ip6_moptions *mopts; struct timeval timenow; - unsigned int nocell; + struct ip6_out_args ip6oa = { ifscope, { 0 }, IP6OAF_SELECT_SRCIF }; boolean_t islocal = FALSE; getmicrotime(&timenow); @@ -220,12 +225,15 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, if (inp != NULL) { mopts = inp->in6p_moptions; - nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + if (inp->inp_flags & INP_NO_IFT_CELLULAR) + ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR; } else { mopts = NULL; - nocell = 0; } + if (ip6oa.ip6oa_boundif != IFSCOPE_NONE) + ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; + /* * If the source address is explicitly specified by the caller, * check if the requested source address is indeed a unicast address @@ -238,9 +246,10 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct in6_ifaddr *ia6; /* get the outgoing interface */ - if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, ifscope, - nocell, &ifp)) != 0) { - return (NULL); + if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, &ip6oa, + &ifp)) != 0) { + src_storage = NULL; + goto done; } /* @@ -254,48 +263,44 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, srcsock.sin6_family = AF_INET6; srcsock.sin6_len = sizeof(srcsock); srcsock.sin6_addr = pi->ipi6_addr; - if (ifp) { + if (ifp != NULL) { *errorp = in6_setscope(&srcsock.sin6_addr, ifp, NULL); if (*errorp != 0) { - ifnet_release(ifp); - return (NULL); + src_storage = NULL; + goto done; } } - ia6 = (struct in6_ifaddr *)ifa_ifwithaddr((struct sockaddr *)(&srcsock)); + ia6 = (struct in6_ifaddr *)ifa_ifwithaddr((struct sockaddr *) + (&srcsock)); if (ia6 == NULL) { *errorp = EADDRNOTAVAIL; - if (ifp != NULL) - ifnet_release(ifp); - return (NULL); + src_storage = NULL; + goto done; } IFA_LOCK_SPIN(&ia6->ia_ifa); if ((ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) || - (nocell && (ia6->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR))) { + ((ip6oa.ip6oa_flags & IP6OAF_NO_CELLULAR) && + (ia6->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR))) { IFA_UNLOCK(&ia6->ia_ifa); IFA_REMREF(&ia6->ia_ifa); *errorp = EADDRNOTAVAIL; - if (ifp != NULL) - ifnet_release(ifp); - return (NULL); + src_storage = NULL; + goto done; } *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; IFA_UNLOCK(&ia6->ia_ifa); IFA_REMREF(&ia6->ia_ifa); - if (ifpp != NULL) { - /* if ifp is non-NULL, refcnt held in in6_selectif() */ - *ifpp = ifp; - } else if (ifp != NULL) { - ifnet_release(ifp); - } - return (src_storage); + goto done; } /* * Otherwise, if the socket has already bound the source, just use it. 
*/ - if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) - return (&inp->in6p_laddr); + if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { + src_storage = &inp->in6p_laddr; + goto done; + } /* * If the address is not specified, choose the best one based on @@ -303,19 +308,16 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, */ /* get the outgoing interface */ - if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, ifscope, nocell, - &ifp)) != 0) - return (NULL); + if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, &ip6oa, + &ifp)) != 0) { + src_storage = NULL; + goto done; + } -#ifdef DIAGNOSTIC - if (ifp == NULL) /* this should not happen */ - panic("in6_selectsrc: NULL ifp"); -#endif *errorp = in6_setscope(&dst, ifp, &odstzone); if (*errorp != 0) { - if (ifp != NULL) - ifnet_release(ifp); - return (NULL); + src_storage = NULL; + goto done; } lck_rw_lock_shared(&in6_ifaddr_rwlock); @@ -351,6 +353,10 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, if (!ip6_use_deprecated && IFA6_IS_DEPRECATED(ia)) goto next; + if (!nd6_optimistic_dad && + (ia->ia6_flags & IN6_IFF_OPTIMISTIC) != 0) + goto next; + /* Rule 1: Prefer same address */ if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) BREAK(1); /* there should be no better candidate */ @@ -381,6 +387,17 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia)) REPLACE(3); + /* + * RFC 4429 says that optimistic addresses are equivalent to + * deprecated addresses, so avoid them here. + */ + if ((ia_best->ia6_flags & IN6_IFF_OPTIMISTIC) == 0 && + (ia->ia6_flags & IN6_IFF_OPTIMISTIC) != 0) + NEXTSRC(3); + if ((ia_best->ia6_flags & IN6_IFF_OPTIMISTIC) != 0 && + (ia->ia6_flags & IN6_IFF_OPTIMISTIC) == 0) + REPLACE(3); + /* Rule 4: Prefer home addresses */ /* * XXX: This is a TODO. We should probably merge the MIP6 @@ -507,23 +524,24 @@ out: lck_rw_done(&in6_ifaddr_rwlock); - if (nocell && ia_best != NULL && - (ia_best->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR)) { + if (ia_best != NULL && + (ip6oa.ip6oa_flags & IP6OAF_NO_CELLULAR) && + ia_best->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR) { IFA_REMREF(&ia_best->ia_ifa); ia_best = NULL; } - if ( (ia = ia_best) == NULL) { + if ((ia = ia_best) == NULL) { *errorp = EADDRNOTAVAIL; - if (ifp != NULL) - ifnet_release(ifp); - return (NULL); + src_storage = NULL; + goto done; } IFA_LOCK_SPIN(&ia->ia_ifa); *src_storage = satosin6(&ia->ia_addr)->sin6_addr; IFA_UNLOCK(&ia->ia_ifa); IFA_REMREF(&ia->ia_ifa); +done: if (ifpp != NULL) { /* if ifp is non-NULL, refcnt held in in6_selectif() */ *ifpp = ifp; @@ -542,7 +560,10 @@ out: * i.e. for any given pcb there can only be one thread performing output at * the IPv6 layer. * - * This routine is analogous to in_selectsrcif() for IPv4. + * This routine is analogous to in_selectsrcif() for IPv4. Regardless of + * error, it will return an ifp with a reference held if the caller provides + * a non-NULL retifp. The caller is responsible for checking if the + * returned ifp is valid and release its reference at all times. 
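The source-selection loop above inserts an RFC 4429 clause into rule 3: an optimistic address ranks like a deprecated one, usable but chosen only when nothing better exists. The pairwise decision, extracted into a standalone comparator for clarity (1 = replace the incumbent, 0 = keep it, -1 = tie, fall through to the later rules):

static int
rule3_compare(int best_deprecated, int best_optimistic,
    int cand_deprecated, int cand_optimistic)
{
	if (!best_deprecated && cand_deprecated)
		return (0);	/* keep: candidate is deprecated */
	if (best_deprecated && !cand_deprecated)
		return (1);	/* replace: candidate is preferred */
	if (!best_optimistic && cand_optimistic)
		return (0);	/* keep: candidate is optimistic */
	if (best_optimistic && !cand_optimistic)
		return (1);	/* replace: candidate finished DAD */
	return (-1);		/* tie on rule 3 */
}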
* * clone - meaningful only for bsdi and freebsd */ @@ -550,17 +571,18 @@ static int selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct rtentry **retrt, int clone, - int norouteok, unsigned int ifscope, unsigned int nocell) + int norouteok, const struct ip6_out_args *ip6oa) { int error = 0; - struct ifnet *ifp = NULL; + struct ifnet *ifp = NULL, *ifp0 = NULL; struct route_in6 *route = NULL; struct sockaddr_in6 *sin6_next; struct in6_pktinfo *pi = NULL; struct in6_addr *dst = &dstsock->sin6_addr; struct ifaddr *ifa = NULL; char s_src[MAX_IPv6_STR_LEN], s_dst[MAX_IPv6_STR_LEN]; - boolean_t select_srcif; + boolean_t select_srcif, proxied_ifa = FALSE; + unsigned int ifscope = ip6oa->ip6oa_boundif; #if 0 char ip6buf[INET6_ADDRSTRLEN]; @@ -622,7 +644,8 @@ selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock, ifscope = pi->ipi6_ifindex; ifnet_head_lock_shared(); /* ifp may be NULL if detached or out of range */ - ifp = (ifscope <= if_index) ? ifindex2ifnet[ifscope] : NULL; + ifp = ifp0 = + ((ifscope <= if_index) ? ifindex2ifnet[ifscope] : NULL); ifnet_head_done(); if (norouteok || retrt == NULL || IN6_IS_ADDR_MULTICAST(dst)) { /* @@ -645,7 +668,7 @@ selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock, */ if (IN6_IS_ADDR_MULTICAST(dst) && mopts != NULL) { IM6O_LOCK(mopts); - if ((ifp = mopts->im6o_multicast_ifp) != NULL) { + if ((ifp = ifp0 = mopts->im6o_multicast_ifp) != NULL) { IM6O_UNLOCK(mopts); goto done; /* we do not need a route for multicast. */ } @@ -711,6 +734,21 @@ getsrcif: ifa = (struct ifaddr *) ifa_foraddr6_scoped(&srcsock->sin6_addr, scope); + /* + * If we are forwarding and proxying prefix(es), see if the + * source address is one of ours and is a proxied address; + * if so, use it. 
+ */ + if (ifa == NULL && ip6_forwarding && nd6_prproxy) { + ifa = (struct ifaddr *) + ifa_foraddr6(&srcsock->sin6_addr); + if (ifa != NULL && !(proxied_ifa = + nd6_prproxy_ifaddr((struct in6_ifaddr *)ifa))) { + IFA_REMREF(ifa); + ifa = NULL; + } + } + if (ip6_select_srcif_debug && ifa != NULL) { if (ro->ro_rt != NULL) { printf("%s->%s ifscope %d->%d ifa_if %s " @@ -746,7 +784,7 @@ getsrcif: } getroute: - if (ifa != NULL) + if (ifa != NULL && !proxied_ifa) ifscope = ifa->ifa_ifp->if_index; /* @@ -776,7 +814,7 @@ getroute: (RTF_UP | RTF_LLINFO) || ron->ro_rt->generation_id != route_generation || (select_srcif && (ifa == NULL || - ifa->ifa_ifp != ron->ro_rt->rt_ifp)))) || + (ifa->ifa_ifp != ron->ro_rt->rt_ifp && !proxied_ifa))))) || !IN6_ARE_ADDR_EQUAL(&satosin6(&ron->ro_dst)->sin6_addr, &sin6_next->sin6_addr)) { if (ron->ro_rt != NULL) { @@ -804,7 +842,7 @@ getroute: } } route = ron; - ifp = ron->ro_rt->rt_ifp; + ifp = ifp0 = ron->ro_rt->rt_ifp; /* * When cloning is required, try to allocate a route to the @@ -836,7 +874,7 @@ getroute: ro->ro_rt->generation_id != route_generation || !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, dst) || (select_srcif && (ifa == NULL || - ifa->ifa_ifp != ro->ro_rt->rt_ifp)))) { + (ifa->ifa_ifp != ro->ro_rt->rt_ifp && !proxied_ifa))))) { RT_UNLOCK(ro->ro_rt); rtfree(ro->ro_rt); ro->ro_rt = NULL; @@ -874,7 +912,7 @@ getroute: if (ro->ro_rt != NULL) { RT_LOCK_ASSERT_HELD(ro->ro_rt); - ifp = ro->ro_rt->rt_ifp; + ifp = ifp0 = ro->ro_rt->rt_ifp; } else { error = EHOSTUNREACH; } @@ -883,6 +921,7 @@ getroute: validateroute: if (select_srcif) { boolean_t has_route = (route != NULL && route->ro_rt != NULL); + boolean_t srcif_selected = FALSE; if (has_route) RT_LOCK_ASSERT_HELD(route->ro_rt); @@ -895,28 +934,48 @@ validateroute: if (has_route && (ifa == NULL || (ifa->ifa_ifp != ifp && ifp != lo_ifp) || !(route->ro_rt->rt_flags & RTF_UP))) { - if (ip6_select_srcif_debug) { - if (ifa != NULL) { - printf("%s->%s ifscope %d ro_if %s " - "!= ifa_if %s (cached route " - "cleared)\n", s_src, s_dst, - ifscope, if_name(ifp), - if_name(ifa->ifa_ifp)); - } else { - printf("%s->%s ifscope %d ro_if %s " - "(no ifa_if found)\n", s_src, - s_dst, ifscope, if_name(ifp)); + /* + * If the destination address belongs to a proxied + * prefix, relax the requirement and allow the packet + * to come out of the proxy interface with the source + * address of the real interface. 
+ */ + if (ifa != NULL && proxied_ifa && + (route->ro_rt->rt_flags & (RTF_UP|RTF_PROXY)) == + (RTF_UP|RTF_PROXY)) { + srcif_selected = TRUE; + } else { + if (ip6_select_srcif_debug) { + if (ifa != NULL) { + printf("%s->%s ifscope %d " + "ro_if %s != ifa_if %s " + "(cached route cleared)\n", + s_src, s_dst, + ifscope, if_name(ifp), + if_name(ifa->ifa_ifp)); + } else { + printf("%s->%s ifscope %d " + "ro_if %s (no ifa_if " + "found)\n", s_src, s_dst, + ifscope, if_name(ifp)); + } } + RT_UNLOCK(route->ro_rt); + rtfree(route->ro_rt); + route->ro_rt = NULL; + route->ro_flags &= ~ROF_SRCIF_SELECTED; + error = EHOSTUNREACH; + /* Undo the settings done above */ + route = NULL; + ifp = NULL; /* ditch ifp; keep ifp0 */ + has_route = FALSE; } - RT_UNLOCK(route->ro_rt); - rtfree(route->ro_rt); - route->ro_rt = NULL; - route->ro_flags &= ~ROF_SRCIF_SELECTED; - error = EHOSTUNREACH; - /* Undo the settings done above */ - route = NULL; - ifp = NULL; } else if (has_route) { + srcif_selected = TRUE; + } + + if (srcif_selected) { + VERIFY(has_route); route->ro_flags |= ROF_SRCIF_SELECTED; route->ro_rt->generation_id = route_generation; RT_UNLOCK(route->ro_rt); @@ -943,17 +1002,18 @@ validateroute: } done: - if (nocell && error == 0) { - if ((ifp != NULL && ifp->if_type == IFT_CELLULAR) || + if (error == 0) { + if ((ip6oa->ip6oa_flags & IP6OAF_NO_CELLULAR) && + ((ifp != NULL && ifp->if_type == IFT_CELLULAR) || (route != NULL && route->ro_rt != NULL && - route->ro_rt->rt_ifp->if_type == IFT_CELLULAR)) { + route->ro_rt->rt_ifp->if_type == IFT_CELLULAR))) { if (route != NULL && route->ro_rt != NULL) { rtfree(route->ro_rt); route->ro_rt = NULL; route->ro_flags &= ~ROF_SRCIF_SELECTED; route = NULL; } - ifp = NULL; + ifp = NULL; /* ditch ifp; keep ifp0 */ error = EHOSTUNREACH; } } @@ -968,12 +1028,19 @@ done: if (error == EHOSTUNREACH) ip6stat.ip6s_noroute++; + /* + * We'll return ifp regardless of error, so pick it up from ifp0 + * in case it was nullified above. Caller is responsible for + * releasing the ifp if it is non-NULL. + */ + ifp = ifp0; + if (retifp != NULL) { + if (ifp != NULL) + ifnet_reference(ifp); /* for caller */ + *retifp = ifp; + } + if (error == 0) { - if (retifp != NULL) { - if (ifp != NULL) - ifnet_reference(ifp); /* for caller */ - *retifp = ifp; - } if (retrt != NULL && route != NULL) *retrt = route->ro_rt; /* ro_rt may be NULL */ } else if (select_srcif && ip6_select_srcif_debug) { @@ -989,12 +1056,17 @@ done: return (error); } +/* + * Regardless of error, it will return an ifp with a reference held if the + * caller provides a non-NULL retifp. The caller is responsible for checking + * if the returned ifp is valid and release its reference at all times. + */ static int in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, - struct ip6_moptions *mopts, struct route_in6 *ro, unsigned int ifscope, - unsigned int nocell, struct ifnet **retifp) + struct ip6_moptions *mopts, struct route_in6 *ro, + const struct ip6_out_args *ip6oa, struct ifnet **retifp) { - int error; + int err = 0; struct route_in6 sro; struct rtentry *rt = NULL; @@ -1003,12 +1075,9 @@ in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, ro = &sro; } - if ((error = selectroute(NULL, dstsock, opts, mopts, ro, retifp, - &rt, 0, 1, ifscope, nocell)) != 0) { - if (ro == &sro && rt && rt == sro.ro_rt) - rtfree(rt); - return (error); - } + if ((err = selectroute(NULL, dstsock, opts, mopts, ro, retifp, + &rt, 0, 1, ip6oa)) != 0) + goto done; /* * do not use a rejected or black hole route. 
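selectroute() now returns ifp0 through retifp even on failure, reference held, so callers can learn which interface was considered (for instance to raise the cellular-denied soevent) without a second lookup. A hedged caller-side sketch of that contract; the wrapper and the NULL option arguments are illustrative only:

/*
 * Kernel-context sketch; headers omitted. The one rule the comments
 * above spell out: a non-NULL *retifp must be released exactly once,
 * whether or not the call succeeded.
 */
static int
route_once(struct sockaddr_in6 *src, struct sockaddr_in6 *dst,
    struct route_in6 *ro, const struct ip6_out_args *ip6oa)
{
	struct ifnet *ifp = NULL;
	struct rtentry *rt = NULL;
	int error;

	error = in6_selectroute(src, dst, NULL, NULL, ro, &ifp, &rt,
	    0, ip6oa);
	/* ifp may be non-NULL even when error != 0 */
	if (ifp != NULL)
		ifnet_release(ifp);	/* drop the reference held for us */
	return (error);
}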
@@ -1028,11 +1097,8 @@ in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, * We thus reject the case here. */ if (rt && (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) { - int flags = (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); - - if (ro == &sro && rt && rt == sro.ro_rt) - rtfree(rt); - return (flags); + err = ((rt->rt_flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH); + goto done; } /* @@ -1042,30 +1108,41 @@ in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, * destination address (which should probably be one of our own * addresses.) */ - if (rt && rt->rt_ifa && rt->rt_ifa->ifa_ifp) { + if (rt != NULL && rt->rt_ifa != NULL && rt->rt_ifa->ifa_ifp != NULL && + retifp != NULL) { + ifnet_reference(rt->rt_ifa->ifa_ifp); if (*retifp != NULL) ifnet_release(*retifp); *retifp = rt->rt_ifa->ifa_ifp; - ifnet_reference(*retifp); } +done: if (ro == &sro && rt && rt == sro.ro_rt) rtfree(rt); - return (0); + + /* + * retifp might point to a valid ifp with a reference held; + * caller is responsible for releasing it if non-NULL. + */ + return (err); } /* + * Regardless of error, it will return an ifp with a reference held if the + * caller provides a non-NULL retifp. The caller is responsible for checking + * if the returned ifp is valid and release its reference at all times. + * * clone - meaningful only for bsdi and freebsd */ int in6_selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct rtentry **retrt, int clone, - unsigned int ifscope, unsigned int nocell) + const struct ip6_out_args *ip6oa) { return (selectroute(srcsock, dstsock, opts, mopts, ro, retifp, - retrt, clone, 0, ifscope, nocell)); + retrt, clone, 0, ip6oa)); } /* @@ -1085,7 +1162,16 @@ in6_selecthlim( } else { lck_rw_lock_shared(nd_if_rwlock); if (ifp && ifp->if_index < nd_ifinfo_indexlim) { - u_int8_t chlim = nd_ifinfo[ifp->if_index].chlim; + u_int8_t chlim; + struct nd_ifinfo *ndi = &nd_ifinfo[ifp->if_index]; + + if (ndi->initialized) { + lck_mtx_lock(&ndi->lock); + chlim = ndi->chlim; + lck_mtx_unlock(&ndi->lock); + } else { + chlim = ip6_defhlim; + } lck_rw_done(nd_if_rwlock); return (chlim); } else { @@ -1166,7 +1252,7 @@ in6_pcbsetport( * occurred above. */ inp->in6p_laddr = in6addr_any; - inp->in6p_last_outif = 0; + inp->in6p_last_outifp = NULL; if (!locked) lck_rw_done(pcbinfo->mtx); return (EAGAIN); @@ -1190,7 +1276,7 @@ in6_pcbsetport( * occurred above. 
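The in6_selecthlim() hunk above stops reading chlim straight out of the nd_ifinfo array: the per-interface entry is consulted only when it is marked initialized, under its own mutex, and the system default hop limit is used otherwise. The guarded-read-with-fallback idiom in isolation, with pthreads standing in for the kernel lock primitives and invented names:

#include <stdio.h>
#include <stdint.h>
#include <pthread.h>

#define DEFAULT_HLIM 64     /* stand-in for ip6_defhlim */

struct nd_info {
    int initialized;
    pthread_mutex_t lock;
    uint8_t chlim;          /* current hop limit for this interface */
};

static uint8_t
select_hlim(struct nd_info *ndi)
{
    uint8_t chlim;

    if (ndi != NULL && ndi->initialized) {
        pthread_mutex_lock(&ndi->lock);
        chlim = ndi->chlim;
        pthread_mutex_unlock(&ndi->lock);
    } else {
        chlim = DEFAULT_HLIM;   /* entry not set up yet */
    }
    return (chlim);
}

int
main(void)
{
    struct nd_info ndi = { 1, PTHREAD_MUTEX_INITIALIZER, 255 };

    printf("initialized: %u\n", (unsigned)select_hlim(&ndi));   /* 255 */
    ndi.initialized = 0;
    printf("fallback:    %u\n", (unsigned)select_hlim(&ndi));   /* 64 */
    return (0);
}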
*/ inp->in6p_laddr = in6addr_any; - inp->in6p_last_outif = 0; + inp->in6p_last_outifp = NULL; if (!locked) lck_rw_done(pcbinfo->mtx); return (EAGAIN); @@ -1207,7 +1293,7 @@ in6_pcbsetport( if (in_pcbinshash(inp, 1) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; - inp->in6p_last_outif = 0; + inp->in6p_last_outifp = NULL; if (!locked) lck_rw_done(pcbinfo->mtx); return (EAGAIN); @@ -1530,7 +1616,7 @@ in6_src_ioctl(u_long cmd, caddr_t data) if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY) return (EOPNOTSUPP); /* check for safety */ - ent0 = *(struct in6_addrpolicy *)data; + bcopy(data, &ent0, sizeof (ent0)); if (ent0.label == ADDR_LABEL_NOTAPP) return (EINVAL); diff --git a/bsd/netinet6/in6_var.h b/bsd/netinet6/in6_var.h index 970250494..ca7528f2d 100644 --- a/bsd/netinet6/in6_var.h +++ b/bsd/netinet6/in6_var.h @@ -168,6 +168,8 @@ struct in6_ifaddr { /* multicast addresses joined from the kernel */ LIST_HEAD(, in6_multi_mship) ia6_memberships; }; + +#define ifatoia6(ifa) ((struct in6_ifaddr *)(void *)(ifa)) #endif /* XNU_KERNEL_PRIVATE */ /* control structure to manage address selection policy */ @@ -306,6 +308,7 @@ struct in6_ifreq { int ifru_flags; int ifru_flags6; int ifru_metric; + int ifru_intval; caddr_t ifru_data; struct in6_addrlifetime ifru_lifetime; struct in6_ifstat ifru_stat; @@ -435,10 +438,12 @@ struct in6_rrenumreq { #define IA6_MASKIN6(ia) (&((ia)->ia_prefixmask.sin6_addr)) #define IA6_SIN6(ia) (&((ia)->ia_addr)) #define IA6_DSTSIN6(ia) (&((ia)->ia_dstaddr)) -#define IFA_IN6(x) (&((struct sockaddr_in6 *)((x)->ifa_addr))->sin6_addr) -#define IFA_DSTIN6(x) (&((struct sockaddr_in6 *)((x)->ifa_dstaddr))->sin6_addr) - -#define IFPR_IN6(x) (&((struct sockaddr_in6 *)((x)->ifpr_prefix))->sin6_addr) +#define IFA_IN6(x) \ + (&((struct sockaddr_in6 *)(void *)((x)->ifa_addr))->sin6_addr) +#define IFA_DSTIN6(x) \ + (&((struct sockaddr_in6 *)(void *)((x)->ifa_dstaddr))->sin6_addr) +#define IFPR_IN6(x) \ + (&((struct sockaddr_in6 *)(void *)((x)->ifpr_prefix))->sin6_addr) #endif /* XNU_KERNEL_PRIVATE */ /* @@ -607,6 +612,7 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *); #define SIOCDRDEL_IN6_32 _IOWR('u', 135, struct in6_defrouter_32) #define SIOCDRDEL_IN6_64 _IOWR('u', 135, struct in6_defrouter_64) #endif /* XNU_KERNEL_PRIVATE */ +#define SIOCSETROUTERMODE_IN6 _IOWR('i', 136, struct in6_ifreq) /* enable/disable IPv6 router mode on interface */ #endif /* PRIVATE */ #define IN6_IFF_ANYCAST 0x01 /* anycast address */ @@ -619,10 +625,15 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *); */ #define IN6_IFF_AUTOCONF 0x40 /* autoconfigurable address. */ #define IN6_IFF_TEMPORARY 0x80 /* temporary (anonymous) address. */ +#define IN6_IFF_DYNAMIC 0x100 /* assigned by DHCPv6 service */ +#define IN6_IFF_OPTIMISTIC 0x200 /* optimistic DAD, i.e. RFC 4429 */ #define IN6_IFF_NOPFX 0x8000 /* skip kernel prefix management. * XXX: this should be temporary. */ +/* Duplicate Address Detection [DAD] in progress. 
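A recurring change in this region — see in6_src_ioctl() above, where "ent0 = *(struct in6_addrpolicy *)data" becomes a bcopy(), and the (void *) casts added to the IFA_IN6 family of macros — is defensive treatment of pointers whose alignment is not guaranteed. Casting and dereferencing such a buffer can fault on strict-alignment CPUs; a byte-wise copy into an aligned local is always safe. A self-contained demonstration, with memcpy playing the role of bcopy and an invented struct:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct policy {     /* stand-in for struct in6_addrpolicy */
    uint64_t label;
    uint64_t use;
};

int
main(void)
{
    unsigned char raw[sizeof(struct policy) + 1];
    unsigned char *data = raw + 1;  /* deliberately misaligned */
    struct policy ent0;

    memset(raw, 0, sizeof(raw));
    data[0] = 42;   /* low byte of 'label' on a little-endian box */

    /*
     * Safe: copies byte by byte into an aligned local.
     * "ent0 = *(struct policy *)data;" could trap instead.
     */
    memcpy(&ent0, data, sizeof(ent0));
    printf("label byte: %u\n", (unsigned)(ent0.label & 0xff));
    return (0);
}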
*/ +#define IN6_IFF_DADPROGRESS (IN6_IFF_TENTATIVE|IN6_IFF_OPTIMISTIC) + /* do not input/output */ #define IN6_IFF_NOTREADY (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED) @@ -653,6 +664,7 @@ do { \ } while (0) __private_extern__ lck_rw_t in6_ifaddr_rwlock; +__private_extern__ lck_mtx_t proxy6_lock; extern struct ifqueue ip6intrq; /* IP6 packet input queue */ extern struct in6_addr zeroin6_addr; @@ -931,6 +943,7 @@ extern void in6_restoremkludge(struct in6_ifaddr *, struct ifnet *); extern void in6_purgemkludge(struct ifnet *); extern struct in6_ifaddr *in6ifa_ifpforlinklocal(struct ifnet *, int); extern struct in6_ifaddr *in6ifa_ifpwithaddr(struct ifnet *, struct in6_addr *); +extern struct in6_ifaddr *in6ifa_prproxyaddr(struct in6_addr *); extern char *ip6_sprintf(const struct in6_addr *); extern int in6_addr2scopeid(struct ifnet *, struct in6_addr *); extern int in6_matchlen(struct in6_addr *, struct in6_addr *); diff --git a/bsd/netinet6/ip6_forward.c b/bsd/netinet6/ip6_forward.c index f0d56fd7b..42156858e 100644 --- a/bsd/netinet6/ip6_forward.c +++ b/bsd/netinet6/ip6_forward.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2010 Apple Inc. All rights reserved. + * Copyright (c) 2009-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -118,7 +118,7 @@ extern int ipsec_bypass; * */ -void +struct mbuf * ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, int srcrt) { @@ -126,6 +126,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, struct sockaddr_in6 *dst; struct rtentry *rt; int error, type = 0, code = 0; + boolean_t proxy = FALSE; struct mbuf *mcopy = NULL; struct ifnet *ifp, *origifp; /* maybe unnecessary */ u_int32_t inzone, outzone; @@ -142,8 +143,24 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, getmicrotime(&timenow); #if PF pf_mtag = pf_find_mtag(m); - if (pf_mtag != NULL && pf_mtag->rtableid != IFSCOPE_NONE) - ifscope = pf_mtag->rtableid; + if (pf_mtag != NULL && pf_mtag->pftag_rtableid != IFSCOPE_NONE) + ifscope = pf_mtag->pftag_rtableid; + + /* + * If the caller provides a route which is on a different interface + * than the one specified for scoped forwarding, discard the route + * and do a lookup below. + */ + if (ifscope != IFSCOPE_NONE && (rt = ip6forward_rt->ro_rt) != NULL) { + RT_LOCK(rt); + if (rt->rt_ifp->if_index != ifscope) { + RT_UNLOCK(rt); + rtfree(rt); + rt = ip6forward_rt->ro_rt = NULL; + } else { + RT_UNLOCK(rt); + } + } #endif /* PF */ #if IPSEC @@ -158,7 +175,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, if (ipsec6_in_reject(m, NULL)) { IPSEC_STAT_INCREMENT(ipsec6stat.in_polvio); m_freem(m); - return; + return (NULL); } } #endif /*IPSEC*/ @@ -185,15 +202,34 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, if_name(m->m_pkthdr.rcvif)); } m_freem(m); - return; + return (NULL); } if (ip6->ip6_hlim <= IPV6_HLIMDEC) { /* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */ icmp6_error(m, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0); - return; + return (NULL); + } + + /* + * See if the destination is a proxied address, and if so pretend + * that it's for us. This is mostly to handle NUD probes against + * the proxied addresses. We filter for ICMPv6 here and will let + * icmp6_input handle the rest. + */ + if (!srcrt && nd6_prproxy) { + VERIFY(!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)); + proxy = nd6_prproxy_isours(m, ip6, ip6forward_rt, ifscope); + /* + * Don't update hop limit while proxying; RFC 4389 4.1. 
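Earlier in this ip6_forward() hunk, a caller-supplied cached route is no longer trusted blindly when PF has tagged the packet with a routing table id: if the cached route resolves to a different interface than the requested scope, it is freed and looked up afresh. The validate-or-drop step reduced to a toy route cache (simplified stand-in types; free() plays the role of rtfree()):

#include <stdio.h>
#include <stdlib.h>

#define IFSCOPE_NONE 0

struct route {          /* simplified stand-in for rtentry/route_in6 */
    unsigned ifindex;   /* interface the route resolves to */
};

/* Drop the cached route if it disagrees with the requested scope. */
static struct route *
validate_cached_route(struct route *rt, unsigned ifscope)
{
    if (ifscope != IFSCOPE_NONE && rt != NULL &&
        rt->ifindex != ifscope) {
        free(rt);       /* stands in for rtfree() */
        rt = NULL;      /* force a fresh lookup */
    }
    return (rt);
}

int
main(void)
{
    struct route *rt = malloc(sizeof(*rt));

    rt->ifindex = 4;
    rt = validate_cached_route(rt, 7);  /* mismatch: dropped */
    printf("route %s\n", rt ? "kept" : "cleared");
    free(rt);   /* free(NULL) is a no-op */
    return (0);
}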
+ * Also skip IPsec forwarding path processing as this + * packet is not to be forwarded. + */ + if (proxy) + goto skip_ipsec; } + ip6->ip6_hlim -= IPV6_HLIMDEC; /* @@ -224,7 +260,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, #endif } m_freem(m); - return; + return (NULL); } error = 0; @@ -247,7 +283,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, #endif } m_freem(m); - return; + return (NULL); case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: @@ -269,7 +305,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, #endif } m_freem(m); - return; + return (NULL); } /* do IPsec */ break; @@ -300,7 +336,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, error = ipsec6_output_tunnel(&state, sp, 0); key_freesp(sp, KEY_SADB_UNLOCKED); if (state.tunneled == 4) - return; /* packet is gone - sent over IPv4 */ + return (NULL); /* packet is gone - sent over IPv4 */ m = state.m; if (state.ro.ro_rt) { @@ -332,7 +368,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, #endif } m_freem(m); - return; + return (NULL); } } skip_ipsec: @@ -375,7 +411,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE, 0); m_freem(m); - return; + return (NULL); } RT_LOCK_ASSERT_HELD(rt); } else if (rt == NULL || !(rt->rt_flags & RTF_UP) || @@ -402,7 +438,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE, 0); m_freem(m); - return; + return (NULL); } RT_LOCK(rt); /* Take an extra ref for ourselves */ @@ -414,7 +450,8 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, * destination for the reason that the destination is beyond the scope * of the source address, discard the packet and return an icmp6 * destination unreachable error with Code 2 (beyond scope of source - * address). We use a local copy of ip6_src, since in6_setscope() + * address) unless we are proxying (source address is link local + * for NUDs.) We use a local copy of ip6_src, since in6_setscope() * will possibly modify its first argument. 
* [draft-ietf-ipngwg-icmp-v3-04.txt, Section 3.1] */ @@ -424,15 +461,16 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, ip6stat.ip6s_cantforward++; ip6stat.ip6s_badscope++; m_freem(m); - return; + return (NULL); } if (in6_setscope(&src_in6, m->m_pkthdr.rcvif, &inzone)) { ip6stat.ip6s_cantforward++; ip6stat.ip6s_badscope++; m_freem(m); - return; + return (NULL); } - if (inzone != outzone) { + + if (inzone != outzone && !proxy) { ip6stat.ip6s_cantforward++; ip6stat.ip6s_badscope++; in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard); @@ -455,7 +493,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, ICMP6_DST_UNREACH_BEYONDSCOPE, 0); } m_freem(m); - return; + return (NULL); } /* @@ -472,7 +510,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, ip6stat.ip6s_cantforward++; ip6stat.ip6s_badscope++; m_freem(m); - return; + return (NULL); } if (m->m_pkthdr.len > rt->rt_ifp->if_mtu) { @@ -520,11 +558,11 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, RT_UNLOCK(rt); } m_freem(m); - return; + return (NULL); } if (rt->rt_flags & RTF_GATEWAY) - dst = (struct sockaddr_in6 *)rt->rt_gateway; + dst = (struct sockaddr_in6 *)(void *)rt->rt_gateway; /* * If we are to forward the packet using the same interface @@ -535,7 +573,8 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, * Also, don't send redirect if forwarding using a route * modified by a redirect. */ - if (ip6_sendredirects && rt->rt_ifp == m->m_pkthdr.rcvif && !srcrt && + if (!proxy && + ip6_sendredirects && rt->rt_ifp == m->m_pkthdr.rcvif && !srcrt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0) { if ((rt->rt_ifp->if_flags & IFF_POINTOPOINT) != 0) { /* @@ -553,7 +592,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0); m_freem(m); - return; + return (NULL); } type = ND_REDIRECT; } @@ -628,11 +667,23 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, /* Drop the lock but retain the extra ref */ RT_UNLOCK(rt); + /* + * If this is to be processed locally, let ip6_input have it. 
+ */ + if (proxy) { + VERIFY(m->m_pkthdr.aux_flags & MAUXF_PROXY_DST); + /* Release extra ref */ + RT_REMREF(rt); + if (mcopy != NULL) + m_freem(mcopy); + return (m); + } + #if PF /* Invoke outbound packet filter */ - error = pf_af_hook(ifp, NULL, &m, AF_INET6, FALSE); + error = pf_af_hook(ifp, NULL, &m, AF_INET6, FALSE, NULL); - if (error) { + if (error != 0 || m == NULL) { if (m != NULL) { panic("%s: unexpected packet %p\n", __func__, m); /* NOTREACHED */ @@ -643,7 +694,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, ip6 = mtod(m, struct ip6_hdr *); #endif /* PF */ - error = nd6_output(ifp, origifp, m, dst, rt); + error = nd6_output(ifp, origifp, m, dst, rt, NULL); if (error) { in6_ifstat_inc(ifp, ifs6_out_discard); ip6stat.ip6s_cantforward++; @@ -664,7 +715,7 @@ senderr: if (mcopy == NULL) { /* Release extra ref */ RT_REMREF(rt); - return; + return (NULL); } switch (error) { case 0: @@ -673,7 +724,7 @@ senderr: icmp6_redirect_output(mcopy, rt); /* Release extra ref */ RT_REMREF(rt); - return; + return (NULL); } #endif goto freecopy; @@ -698,11 +749,11 @@ senderr: icmp6_error(mcopy, type, code, 0); /* Release extra ref */ RT_REMREF(rt); - return; + return (NULL); freecopy: m_freem(mcopy); /* Release extra ref */ RT_REMREF(rt); - return; + return (NULL); } diff --git a/bsd/netinet6/ip6_fw.c b/bsd/netinet6/ip6_fw.c index ae221caad..3f0e4b23f 100644 --- a/bsd/netinet6/ip6_fw.c +++ b/bsd/netinet6/ip6_fw.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2011 Apple Inc. All rights reserved. + * Copyright (c) 2003-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * diff --git a/bsd/netinet6/ip6_input.c b/bsd/netinet6/ip6_input.c index ae8fecd68..9fcc992cd 100644 --- a/bsd/netinet6/ip6_input.c +++ b/bsd/netinet6/ip6_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2011 Apple Inc. All rights reserved. + * Copyright (c) 2003-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -120,6 +120,7 @@ #include #include #include +#include #include #include @@ -147,13 +148,17 @@ extern int ipsec_bypass; #include +#if DUMMYNET +#include +#include +#endif /* DUMMYNET */ + #include #include /* we need it for NLOOP. 
*/ #include "loop.h" -#include "faith.h" #include @@ -181,13 +186,10 @@ int ip6_sourcecheck_interval; /* XXX */ const int int6intrq_present = 1; int ip6_ours_check_algorithm; -int in6_init2done = 0; -int in6_init_done = 0; -#define _CASSERT(x) \ - switch (0) { case 0: case (x): ; } #define IN6_IFSTAT_REQUIRE_ALIGNED_64(f) \ _CASSERT(!(offsetof(struct in6_ifstat, f) % sizeof (uint64_t))) + #define ICMP6_IFSTAT_REQUIRE_ALIGNED_64(f) \ _CASSERT(!(offsetof(struct icmp6_ifstat, f) % sizeof (uint64_t))) @@ -203,12 +205,18 @@ struct ip6stat ip6stat; #ifdef __APPLE__ struct ifqueue ip6intrq; decl_lck_mtx_data(, ip6_init_mutex); -lck_mtx_t *dad6_mutex; -lck_mtx_t *nd6_mutex; -lck_mtx_t *prefix6_mutex; -lck_mtx_t *scope6_mutex; +decl_lck_mtx_data(, proxy6_lock); +decl_lck_mtx_data(, dad6_mutex_data); +decl_lck_mtx_data(, nd6_mutex_data); +decl_lck_mtx_data(, prefix6_mutex_data); +decl_lck_mtx_data(, scope6_mutex_data); +lck_mtx_t *dad6_mutex = &dad6_mutex_data; +lck_mtx_t *nd6_mutex = &nd6_mutex_data; +lck_mtx_t *prefix6_mutex = &prefix6_mutex_data; +lck_mtx_t *scope6_mutex = &scope6_mutex_data; #ifdef ENABLE_ADDRSEL -lck_mtx_t *addrsel_mutex; +decl_lck_mtx_data(, addrsel_mutex_data); +lck_mtx_t *addrsel_mutex = &addrsel_mutex_data; #endif decl_lck_rw_data(, in6_ifs_rwlock); decl_lck_rw_data(, icmp6_ifs_rwlock); @@ -220,7 +228,7 @@ extern lck_mtx_t *inet6_domain_mutex; extern int loopattach_done; extern void addrsel_policy_init(void); -static void ip6_init2(void *); +static void ip6_init_delayed(void); static struct ip6aux *ip6_setdstifaddr(struct mbuf *, struct in6_ifaddr *); static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); @@ -230,18 +238,56 @@ static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); #ifdef __APPLE__ void gifattach(void); -void faithattach(void); void stfattach(void); #endif -extern lck_mtx_t *domain_proto_mtx; - SYSCTL_DECL(_net_inet6_ip6); int ip6_doscopedroute = 1; SYSCTL_INT(_net_inet6_ip6, OID_AUTO, scopedroute, CTLFLAG_RD | CTLFLAG_LOCKED, &ip6_doscopedroute, 0, "Enable IPv6 scoped routing"); +int ip6_restrictrecvif = 1; +SYSCTL_INT(_net_inet6_ip6, OID_AUTO, restrictrecvif, + CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_restrictrecvif, 0, + "Enable inbound interface restrictions"); + +/* + * On platforms which require strict alignment (currently for anything but + * i386 or x86_64), check if the IP header pointer is 32-bit aligned; if not, + * copy the contents of the mbuf chain into a new chain, and free the original + * one. Create some head room in the first mbuf of the new chain, in case + * it's needed later on. + * + * RFC 2460 says that IPv6 headers are 64-bit aligned, but network interfaces + * mostly align to 32-bit boundaries. Care should be taken never to use 64-bit + * load/store operations on the fields in IPv6 headers. 
+ */ +#if defined(__i386__) || defined(__x86_64__) +#define IP6_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do { } while (0) +#else /* !__i386__ && !__x86_64__ */ +#define IP6_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do { \ + if (!IP6_HDR_ALIGNED_P(mtod(_m, caddr_t))) { \ + struct mbuf *_n; \ + struct ifnet *__ifp = (_ifp); \ + atomic_add_64(&(__ifp)->if_alignerrs, 1); \ + if (((_m)->m_flags & M_PKTHDR) && \ + (_m)->m_pkthdr.header != NULL) \ + (_m)->m_pkthdr.header = NULL; \ + _n = m_defrag_offset(_m, max_linkhdr, M_NOWAIT); \ + if (_n == NULL) { \ + ip6stat.ip6s_toosmall++; \ + m_freem(_m); \ + (_m) = NULL; \ + _action \ + } else { \ + VERIFY(_n != (_m)); \ + (_m) = _n; \ + } \ + } \ +} while (0) +#endif /* !__i386__ && !__x86_64__ */ + static void ip6_proto_input( __unused protocol_family_t protocol, @@ -286,30 +332,20 @@ ip6_init() ip6_mutex_grp = lck_grp_alloc_init("ip6", ip6_mutex_grp_attr); ip6_mutex_attr = lck_attr_alloc_init(); - if ((dad6_mutex = lck_mtx_alloc_init(ip6_mutex_grp, ip6_mutex_attr)) == NULL) { - panic("ip6_init: can't alloc dad6_mutex\n"); - } - if ((nd6_mutex = lck_mtx_alloc_init(ip6_mutex_grp, ip6_mutex_attr)) == NULL) { - panic("ip6_init: can't alloc nd6_mutex\n"); - } - - if ((prefix6_mutex = lck_mtx_alloc_init(ip6_mutex_grp, ip6_mutex_attr)) == NULL) { - panic("ip6_init: can't alloc prefix6_mutex\n"); - } - - if ((scope6_mutex = lck_mtx_alloc_init(ip6_mutex_grp, ip6_mutex_attr)) == NULL) { - panic("ip6_init: can't alloc scope6_mutex\n"); - } + lck_mtx_init(dad6_mutex, ip6_mutex_grp, ip6_mutex_attr); + lck_mtx_init(nd6_mutex, ip6_mutex_grp, ip6_mutex_attr); + lck_mtx_init(prefix6_mutex, ip6_mutex_grp, ip6_mutex_attr); + lck_mtx_init(scope6_mutex, ip6_mutex_grp, ip6_mutex_attr); #ifdef ENABLE_ADDRSEL - if ((addrsel_mutex = lck_mtx_alloc_init(ip6_mutex_grp, ip6_mutex_attr)) == NULL) { - panic("ip6_init: can't alloc addrsel_mutex\n"); - } + lck_mtx_init(addrsel_mutex, ip6_mutex_grp, ip6_mutex_attr); #endif + lck_mtx_init(&proxy6_lock, ip6_mutex_grp, ip6_mutex_attr); + lck_mtx_init(&ip6_init_mutex, ip6_mutex_grp, ip6_mutex_attr); + lck_rw_init(&in6_ifs_rwlock, ip6_mutex_grp, ip6_mutex_attr); lck_rw_init(&icmp6_ifs_rwlock, ip6_mutex_grp, ip6_mutex_attr); - lck_mtx_init(&ip6_init_mutex, ip6_mutex_grp, ip6_mutex_attr); inet6domain.dom_flags = DOM_REENTRANT; @@ -393,25 +429,22 @@ ip6_init() ip6_flow_seq = random() ^ tv.tv_usec; microtime(&tv); ip6_desync_factor = (random() ^ tv.tv_usec) % MAX_TEMP_DESYNC_FACTOR; - timeout(ip6_init2, (caddr_t)0, 1 * hz); - lck_mtx_unlock(domain_proto_mtx); + /* + * P2P interfaces often route the local address to the loopback + * interface. At this point, lo0 hasn't been initialized yet, which + * means that we need to delay the IPv6 configuration of lo0. + */ + net_init_add(ip6_init_delayed); + + domain_proto_mtx_unlock(TRUE); proto_register_input(PF_INET6, ip6_proto_input, NULL, 0); - lck_mtx_lock(domain_proto_mtx); + domain_proto_mtx_lock(); } static void -ip6_init2( - __unused void *dummy) +ip6_init_delayed(void) { - /* - * to route local address of p2p link to loopback, - * assign loopback address first. 
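The initialization rework above replaces a self-rearming timeout — the old ip6_init2() re-armed itself every tick until loopattach_done was set — with a one-shot callback registered through net_init_add(), so ip6_init_delayed() runs exactly once after lo0 exists. A minimal sketch of such a run-once callback registry; the names are invented and this is not the xnu API:

#include <stdio.h>

#define MAX_INIT_FUNCS 8

static void (*init_funcs[MAX_INIT_FUNCS])(void);
static int init_count;
static int init_done;

/* Register a function to run after core bring-up; no polling needed. */
static int
net_init_add_demo(void (*fn)(void))
{
    if (init_done || init_count == MAX_INIT_FUNCS)
        return (-1);    /* too late, or table full */
    init_funcs[init_count++] = fn;
    return (0);
}

/* Called once, after the pieces the callbacks depend on are ready. */
static void
net_init_run_demo(void)
{
    int i;

    for (i = 0; i < init_count; i++)
        init_funcs[i]();
    init_done = 1;
}

static void
ip6_init_delayed_demo(void)
{
    printf("configuring ::1 on lo0 (stand-in)\n");
}

int
main(void)
{
    (void)net_init_add_demo(ip6_init_delayed_demo);
    /* ... rest of bring-up; lo0 attaches ... */
    net_init_run_demo();
    return (0);
}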
- */ - if (loopattach_done == 0) { - timeout(ip6_init2, (caddr_t)0, 1 * hz); - return; - } (void) in6_ifattach(lo_ifp, NULL, NULL); #ifdef __APPLE__ @@ -426,29 +459,10 @@ ip6_init2( #if NGIF gifattach(); #endif -#if NFAITH - faithattach(); -#endif #if NSTF stfattach(); #endif -#endif - in6_init2done = 1; - - lck_mtx_lock(&ip6_init_mutex); - in6_init_done = 1; - wakeup(&in6_init_done); - lck_mtx_unlock(&ip6_init_mutex); -} - -void -ip6_fin() -{ - lck_mtx_lock(&ip6_init_mutex); - while (in6_init_done == 0) { - (void) msleep(&in6_init_done, &ip6_init_mutex, 0, "ip6_fin()", NULL); - } - lck_mtx_unlock(&ip6_init_mutex); +#endif /* __APPLE__ */ } void @@ -465,6 +479,12 @@ ip6_input(struct mbuf *m) struct in6_ifaddr *ia6 = NULL; struct route_in6 ip6_forward_rt; struct sockaddr_in6 *dst6; +#if DUMMYNET + struct m_tag *tag; + struct ip_fw_args args; + + bzero(&args, sizeof(struct ip_fw_args)); +#endif /* DUMMYNET */ bzero(&ip6_forward_rt, sizeof(ip6_forward_rt)); @@ -473,6 +493,28 @@ ip6_input(struct mbuf *m) */ MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif); + /* Perform IP header alignment fixup, if needed */ + IP6_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, return;); + +#if DUMMYNET + if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) { + struct dn_pkt_tag *dn_tag; + + dn_tag = (struct dn_pkt_tag *)(tag+1); + + args.fwa_pf_rule = dn_tag->dn_pf_rule; + + m_tag_delete(m, tag); + } + + if (args.fwa_pf_rule) { + ip6 = mtod(m, struct ip6_hdr *); /* In case PF got disabled */ + + goto check_with_pf; + } +#endif /* DUMMYNET */ + /* * No need to proccess packet twice if we've * already seen it @@ -485,7 +527,7 @@ ip6_input(struct mbuf *m) goto injectit; } else seen = 1; - + #if IPSEC /* * should the inner packet be considered authentic? @@ -524,7 +566,11 @@ ip6_input(struct mbuf *m) #undef M2MMAX } - /* drop the packet if IPv6 operation is disabled on the IF */ + /* + * Drop the packet if IPv6 operation is disabled on the IF; + * accessing the flag is done without acquiring nd_ifinfo lock + * for performance reasons. + */ lck_rw_lock_shared(nd_if_rwlock); if (m->m_pkthdr.rcvif->if_index < nd_ifinfo_indexlim && (nd_ifinfo[m->m_pkthdr.rcvif->if_index].flags & ND6_IFF_IFDISABLED)) { @@ -684,12 +730,19 @@ ip6_input(struct mbuf *m) } } +#if DUMMYNET +check_with_pf: +#endif #if PF /* Invoke inbound packet filter */ if (PF_IS_ENABLED) { int error; - error = pf_af_hook(m->m_pkthdr.rcvif, NULL, &m, AF_INET6, TRUE); - if (error != 0) { +#if DUMMYNET + error = pf_af_hook(m->m_pkthdr.rcvif, NULL, &m, AF_INET6, TRUE, &args); +#else + error = pf_af_hook(m->m_pkthdr.rcvif, NULL, &m, AF_INET6, TRUE, NULL); +#endif + if (error != 0 || m == NULL) { if (m != NULL) { panic("%s: unexpected packet %p\n", __func__, m); /* NOTREACHED */ @@ -740,12 +793,11 @@ ip6_input(struct mbuf *m) if (in6m != NULL) { IN6M_REMREF(in6m); ours = 1; - } - else + } else if (!nd6_prproxy #if MROUTING - if (!ip6_mrouter) + && !ip6_mrouter #endif - { + ) { ip6stat.ip6s_notmember++; ip6stat.ip6s_cantforward++; in6_ifstat_inc(ifp, ifs6_in_discard); @@ -841,21 +893,6 @@ ip6_input(struct mbuf *m) goto bad; } - /* - * FAITH (Firewall Aided Internet Translator) - */ -#if defined(NFAITH) && 0 < NFAITH - if (ip6_keepfaith) { - if (ip6_forward_rt.ro_rt && ip6_forward_rt.ro_rt->rt_ifp - && ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_FAITH) { - /* XXX do we need more sanity checks? 
*/ - ours = 1; - deliverifp = ip6_forward_rt.ro_rt->rt_ifp; /* faith */ - RT_UNLOCK(ip6_forward_rt.ro_rt); - goto hbhcheck; - } - } -#endif if (ip6_forward_rt.ro_rt != NULL) RT_UNLOCK(ip6_forward_rt.ro_rt); @@ -873,8 +910,7 @@ ip6_input(struct mbuf *m) /* * record address information into m_aux, if we don't have one yet. * note that we are unable to record it, if the address is not listed - * as our interface address (e.g. multicast addresses, addresses - * within FAITH prefixes and such). + * as our interface address (e.g. multicast addresses, etc.) */ if (deliverifp && (ia6 = ip6_getdstifaddr(m)) == NULL) { ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst); @@ -1007,12 +1043,40 @@ ip6_input(struct mbuf *m) goto bad; } #endif + if (!ours && nd6_prproxy) { + /* + * If this isn't for us, this might be a Neighbor + * Solicitation (dst is solicited-node multicast) + * against an address in one of the proxied prefixes; + * if so, claim the packet and let icmp6_input() + * handle the rest. + */ + ours = nd6_prproxy_isours(m, ip6, NULL, IFSCOPE_NONE); + VERIFY(!ours || + (m->m_pkthdr.aux_flags & MAUXF_PROXY_DST)); + } if (!ours) goto bad; } else if (!ours) { - ip6_forward(m, &ip6_forward_rt, 0); - goto done; - } + /* + * The unicast forwarding function might return the packet + * if we are proxying prefix(es), and if the packet is an + * ICMPv6 packet that has failed the zone checks, but is + * targetted towards a proxied address (this is optimized by + * way of RTF_PROXY test.) If so, claim the packet as ours + * and let icmp6_input() handle the rest. The packet's hop + * limit value is kept intact (it's not decremented). This + * is for supporting Neighbor Unreachability Detection between + * proxied nodes on different links (src is link-local, dst + * is target address.) + */ + if ((m = ip6_forward(m, &ip6_forward_rt, 0)) == NULL) + goto done; + VERIFY(ip6_forward_rt.ro_rt != NULL); + VERIFY(m->m_pkthdr.aux_flags & MAUXF_PROXY_DST); + deliverifp = ip6_forward_rt.ro_rt->rt_ifp; + ours = 1; + } ip6 = mtod(m, struct ip6_hdr *); @@ -1041,6 +1105,13 @@ ip6_input(struct mbuf *m) injectit: nest = 0; + /* + * Perform IP header alignment fixup again, if needed. Note that + * we do it once for the outermost protocol, and we assume each + * protocol handler wouldn't mess with the alignment afterwards. + */ + IP6_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, return;); + while (nxt != IPPROTO_DONE) { struct ipfilter *filter; int (*pr_input)(struct mbuf **, int *, int); @@ -1067,7 +1138,8 @@ injectit: * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. 
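With ip6_forward() now returning a struct mbuf *, the input path above reads: a NULL return means the packet was forwarded, dropped, or converted into an ICMPv6 error, while a non-NULL return hands the packet back because it targets a proxied address and must be delivered locally with its hop limit intact. A skeleton of that calling convention using toy types (the real code additionally derives deliverifp from the returned route):

#include <stdio.h>
#include <stdlib.h>

struct pkt { int for_proxy; };  /* toy stand-in for an mbuf */

/* Returns NULL when consumed; returns the packet to claim it locally. */
static struct pkt *
forward_pkt(struct pkt *p)
{
    if (p->for_proxy)
        return (p);     /* e.g. NUD against a proxied address */
    free(p);            /* "transmitted" */
    return (NULL);
}

int
main(void)
{
    struct pkt *p = malloc(sizeof(*p));

    p->for_proxy = 1;
    if ((p = forward_pkt(p)) == NULL) {
        printf("consumed by forwarding\n");
        return (0);
    }
    printf("claimed for local input\n");    /* ours = 1 */
    free(p);
    return (0);
}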
*/ - if ((ipsec_bypass == 0) && (ip6_protox[nxt]->pr_flags & PR_LASTHDR) != 0) { + if ((ipsec_bypass == 0) && + (ip6_protox[nxt]->pr_flags & PR_LASTHDR) != 0) { if (ipsec6_in_reject(m, NULL)) { IPSEC_STAT_INCREMENT(ipsec6stat.in_polvio); goto bad; @@ -1082,13 +1154,15 @@ injectit: ipf_ref(); TAILQ_FOREACH(filter, &ipv6_filters, ipf_link) { if (seen == 0) { - if ((struct ipfilter *)inject_ipfref == filter) + if ((struct ipfilter *)inject_ipfref == + filter) seen = 1; } else if (filter->ipf_filter.ipf_input) { errno_t result; - + result = filter->ipf_filter.ipf_input( - filter->ipf_filter.cookie, (mbuf_t*)&m, off, nxt); + filter->ipf_filter.cookie, + (mbuf_t *)&m, off, nxt); if (result == EJUSTRETURN) { ipf_unref(); goto done; @@ -1431,12 +1505,12 @@ ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, return NULL; } if ((inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) != 0) { - int tc = m->m_pkthdr.prio; - + int tc = m_get_traffic_class(m); + mp = sbcreatecontrol_mbuf((caddr_t) &tc, sizeof(tc), SO_TRAFFIC_CLASS, SOL_SOCKET, mp); - if (*mp == NULL) - return NULL; + if (*mp == NULL) + return (NULL); } if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { @@ -1716,7 +1790,7 @@ ip6_notify_pmtu(struct inpcb *in6p, struct sockaddr_in6 *dst, u_int32_t *mtu) bzero(&mtuctl, sizeof(mtuctl)); /* zero-clear for safety */ mtuctl.ip6m_mtu = *mtu; mtuctl.ip6m_addr = *dst; - if (sa6_recoverscope(&mtuctl.ip6m_addr)) + if (sa6_recoverscope(&mtuctl.ip6m_addr, TRUE)) return; if ((m_mtu = sbcreatecontrol((caddr_t)&mtuctl, sizeof(mtuctl), @@ -1950,13 +2024,13 @@ ip6_lasthdr(m, off, proto, nxtp) } struct ip6aux * -ip6_addaux( - struct mbuf *m) +ip6_addaux(struct mbuf *m) { struct m_tag *tag; - + /* Check if one is already allocated */ - tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_INET6, NULL); + tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_INET6, NULL); if (tag == NULL) { /* Allocate a tag */ tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_INET6, @@ -1967,28 +2041,28 @@ ip6_addaux( m_tag_prepend(m, tag); } } - - return tag ? (struct ip6aux*)(tag + 1) : NULL; + + return (tag ? (struct ip6aux *)(tag + 1) : NULL); } struct ip6aux * -ip6_findaux( - struct mbuf *m) +ip6_findaux(struct mbuf *m) { struct m_tag *tag; - - tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_INET6, NULL); - - return tag ? (struct ip6aux*)(tag + 1) : NULL; + + tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_INET6, NULL); + + return (tag ? (struct ip6aux *)(tag + 1) : NULL); } void -ip6_delaux( - struct mbuf *m) +ip6_delaux(struct mbuf *m) { struct m_tag *tag; - tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_INET6, NULL); + tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_INET6, NULL); if (tag) { m_tag_delete(m, tag); } diff --git a/bsd/netinet6/ip6_mroute.c b/bsd/netinet6/ip6_mroute.c index 39f146284..f6504807b 100644 --- a/bsd/netinet6/ip6_mroute.c +++ b/bsd/netinet6/ip6_mroute.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2011 Apple Inc. All rights reserved. + * Copyright (c) 2003-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -358,12 +358,17 @@ mrt6_ioctl(u_long cmd, caddr_t data) int error = 0; switch (cmd) { - case SIOCGETSGCNT_IN6: - return (get_sg_cnt((struct sioc_sg_req6 *)data)); - /* NOTREACHED */ + case SIOCGETSGCNT_IN6: { /* struct sioc_sg_req6 */ + struct sioc_sg_req6 req; + + bcopy(data, &req, sizeof (req)); + error = get_sg_cnt(&req); + bcopy(&req, data, sizeof (req)); + break; + } - case SIOCGETMIFCNT_IN6_32: - case SIOCGETMIFCNT_IN6_64: + case SIOCGETMIFCNT_IN6_32: /* struct sioc_mif_req6_32 */ + case SIOCGETMIFCNT_IN6_64: /* struct sioc_mif_req6_64 */ return (get_mif6_cnt(data, cmd == SIOCGETMIFCNT_IN6_64)); /* NOTREACHED */ @@ -405,28 +410,36 @@ get_mif6_cnt(void *data, int p64) { if (p64) { struct sioc_mif_req6_64 *req = data; + mifi_t mifi; - mifi_t mifi = req->mifi; - + bcopy(&req->mifi, &mifi, sizeof (mifi)); if (mifi >= nummifs) return (EINVAL); - req->icount = mif6table[mifi].m6_pkt_in; - req->ocount = mif6table[mifi].m6_pkt_out; - req->ibytes = mif6table[mifi].m6_bytes_in; - req->obytes = mif6table[mifi].m6_bytes_out; + bcopy(&mif6table[mifi].m6_pkt_in, &req->icount, + sizeof (req->icount)); + bcopy(&mif6table[mifi].m6_pkt_out, &req->ocount, + sizeof (req->ocount)); + bcopy(&mif6table[mifi].m6_bytes_in, &req->ibytes, + sizeof (req->ibytes)); + bcopy(&mif6table[mifi].m6_bytes_out, &req->obytes, + sizeof (req->obytes)); } else { struct sioc_mif_req6_32 *req = data; + mifi_t mifi; - mifi_t mifi = req->mifi; - + bcopy(&req->mifi, &mifi, sizeof (mifi)); if (mifi >= nummifs) return (EINVAL); - req->icount = mif6table[mifi].m6_pkt_in; - req->ocount = mif6table[mifi].m6_pkt_out; - req->ibytes = mif6table[mifi].m6_bytes_in; - req->obytes = mif6table[mifi].m6_bytes_out; + bcopy(&mif6table[mifi].m6_pkt_in, &req->icount, + sizeof (req->icount)); + bcopy(&mif6table[mifi].m6_pkt_out, &req->ocount, + sizeof (req->ocount)); + bcopy(&mif6table[mifi].m6_bytes_in, &req->ibytes, + sizeof (req->ibytes)); + bcopy(&mif6table[mifi].m6_bytes_out, &req->obytes, + sizeof (req->obytes)); } return (0); } @@ -1547,7 +1560,7 @@ phyint_send(ip6, mifp, m) mb_copy->m_pkthdr.csum_flags = 0; error = dlil_output(ifp, PF_INET6, mb_copy, - NULL, (struct sockaddr *)&ro.ro_dst, 0); + NULL, (struct sockaddr *)&ro.ro_dst, 0, NULL); #else error = (*ifp->if_output)(ifp, mb_copy, (struct sockaddr *)&ro.ro_dst, @@ -1676,6 +1689,9 @@ pim6_input(struct mbuf **mp, int *offp, int proto) ++pim6stat.pim6s_rcv_total; + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + ip6 = mtod(m, struct ip6_hdr *); pimlen = m->m_pkthdr.len - *offp; @@ -1876,7 +1892,7 @@ pim6_input(struct mbuf **mp, int *offp, int proto) #ifdef __APPLE__ if (lo_ifp) { - dlil_output(lo_ifp, PF_INET6, m, 0, (struct sockaddr *)&dst, 0); + dlil_output(lo_ifp, PF_INET6, m, 0, (struct sockaddr *)&dst, 0, NULL); } else { printf("Warning: pim6_input call to dlil_find_dltag failed!\n"); diff --git a/bsd/netinet6/ip6_output.c b/bsd/netinet6/ip6_output.c index 9b58e7ad7..d96db58f7 100644 --- a/bsd/netinet6/ip6_output.c +++ b/bsd/netinet6/ip6_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
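get_mif6_cnt() above now serves both the 32-bit and 64-bit userland layouts by bcopying each counter into the caller's structure rather than assigning through fields whose alignment differs between the two ABIs. The per-field copy-out pattern in miniature, with demo layouts that merely mimic the 32/64-bit split:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct stats { uint64_t pkts_in, pkts_out; };   /* kernel-side counters */

/* 32- and 64-bit userland views of the same request (demo layouts). */
#pragma pack(4)
struct req32 { uint32_t idx; uint64_t icount, ocount; };    /* 4-byte aligned */
#pragma pack()
struct req64 { uint32_t idx; uint64_t icount, ocount; };

static void
copy_out_counters(const struct stats *st, void *data, int p64)
{
    if (p64) {
        struct req64 *req = data;
        memcpy(&req->icount, &st->pkts_in, sizeof(req->icount));
        memcpy(&req->ocount, &st->pkts_out, sizeof(req->ocount));
    } else {
        struct req32 *req = data;   /* fields only 4-byte aligned */
        memcpy(&req->icount, &st->pkts_in, sizeof(req->icount));
        memcpy(&req->ocount, &st->pkts_out, sizeof(req->ocount));
    }
}

int
main(void)
{
    struct stats st = { 7, 9 };
    struct req32 r32 = { 0, 0, 0 };

    copy_out_counters(&st, &r32, 0);
    printf("in=%llu out=%llu\n",
        (unsigned long long)r32.icount, (unsigned long long)r32.ocount);
    return (0);
}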
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -148,6 +148,11 @@ extern int ipsec_bypass; #include +#if DUMMYNET +#include +#include +#endif /* DUMMYNET */ + #include #include @@ -160,14 +165,6 @@ extern int ipsec_bypass; static MALLOC_DEFINE(M_IPMOPTS, "ip6_moptions", "internet multicast options"); #endif -struct ip6_exthdrs { - struct mbuf *ip6e_ip6; - struct mbuf *ip6e_hbh; - struct mbuf *ip6e_dest1; - struct mbuf *ip6e_rthdr; - struct mbuf *ip6e_dest2; -}; - int ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt); static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *sopt); @@ -216,6 +213,49 @@ static struct zone *im6o_zone; /* zone for ip6_moptions */ #define IM6O_ZONE_MAX 64 /* maximum elements in zone */ #define IM6O_ZONE_NAME "ip6_moptions" /* zone name */ +SYSCTL_DECL(_net_inet6_ip6); + +static int ip6_maxchainsent = 0; +SYSCTL_INT(_net_inet6_ip6, OID_AUTO, maxchainsent, CTLFLAG_RW | CTLFLAG_LOCKED, + &ip6_maxchainsent, 0, "use dlil_output_list"); + +/* + * XXX we don't handle mbuf chains yet in nd6_output() so ip6_output_list() only + * walks through the packet chain and sends each mbuf separately. + */ +int +ip6_output_list( + struct mbuf *m0, + int packetlist, + struct ip6_pktopts *opt, + struct route_in6 *ro, + int flags, + struct ip6_moptions *im6o, + struct ifnet **ifpp, /* XXX: just for statistics */ + struct ip6_out_args *ip6oap) +{ +#pragma unused(packetlist) + struct mbuf *m = m0, *nextpkt; + int error = 0; + + while (m) { + /* + * Break the chain before calling ip6_output() and free the + * mbufs if there was an error. + */ + nextpkt = m->m_nextpkt; + m->m_nextpkt = NULL; + error = ip6_output(m, opt, ro, flags, im6o, ifpp, ip6oap); + if (error) { + if (nextpkt) + m_freem_list(nextpkt); + return (error); + } + m = nextpkt; + } + + return (error); +} /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 @@ -236,7 +276,7 @@ ip6_output( int flags, struct ip6_moptions *im6o, struct ifnet **ifpp, /* XXX: just for statistics */ - struct ip6_out_args *ip6oa) + struct ip6_out_args *ip6oap) { struct ip6_hdr *ip6, *mhip6; struct ifnet *ifp = NULL, *origifp = NULL; @@ -258,11 +298,26 @@ ip6_output( int needipsec = 0; ipfilter_t inject_filter_ref; int tso; - unsigned int ifscope; - unsigned int nocell; boolean_t select_srcif; struct ipf_pktopts *ippo = NULL, ipf_pktopts; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, { 0 }, 0 }; + struct flowadv *adv = NULL; u_int32_t ifmtu; +#if DUMMYNET + struct m_tag *tag; + struct route_in6 saved_route; + struct route_in6 saved_ro_pmtu; + struct ip_fw_args args; + struct sockaddr_in6 dst_buf; + + bzero(&args, sizeof(struct ip_fw_args)); +#endif /* DUMMYNET */ + + if ((flags & IPV6_OUTARGS) && ip6oap != NULL) { + ip6oa = *ip6oap; + adv = &ip6oap->ip6oa_flowadv; + adv->code = FADV_SUCCESS; + } #if IPSEC int needipsectun = 0; @@ -272,7 +327,7 @@ ip6_output( struct ipsec_output_state ipsec_state; bzero(&ipsec_state, sizeof(ipsec_state)); - + /* for AH processing. stupid to have "socket" variable in IP layer... 
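ip6_output_list(), introduced just above, does not yet hand whole chains to nd6_output(); it detaches each packet from the m_nextpkt chain, transmits it singly, and frees the remainder of the chain if a send fails. The walk-detach-send loop reduced to a toy linked list:

#include <stdio.h>
#include <stdlib.h>

struct pkt {
    int id;
    int bad;                /* force a send failure for the demo */
    struct pkt *nextpkt;
};

static int
send_one(struct pkt *p)
{
    int err = p->bad ? -1 : 0;

    printf("send %d -> %s\n", p->id, err ? "error" : "ok");
    free(p);                /* consumed either way, as in ip6_output() */
    return (err);
}

static int
send_list(struct pkt *m)
{
    struct pkt *nextpkt;
    int error = 0;

    while (m != NULL) {
        /* Break the chain before sending; free the rest on error. */
        nextpkt = m->nextpkt;
        m->nextpkt = NULL;
        error = send_one(m);
        if (error) {
            while (nextpkt != NULL) {   /* stands in for m_freem_list() */
                struct pkt *n = nextpkt->nextpkt;
                free(nextpkt);
                nextpkt = n;
            }
            return (error);
        }
        m = nextpkt;
    }
    return (error);
}

int
main(void)
{
    struct pkt *c = calloc(1, sizeof(*c));
    struct pkt *b = calloc(1, sizeof(*b));
    struct pkt *a = calloc(1, sizeof(*a));

    c->id = 3; b->id = 2; b->bad = 1; b->nextpkt = c;
    a->id = 1; a->nextpkt = b;
    return (send_list(a) ? 1 : 0);  /* 1 sent, 2 fails, 3 freed */
}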
*/ if (ipsec_bypass == 0) { @@ -286,26 +341,75 @@ ip6_output( ip6 = mtod(m, struct ip6_hdr *); inject_filter_ref = ipf_get_inject_filter(m); - + + /* Grab info from mtags prepended to the chain */ +#if DUMMYNET + if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) { + struct dn_pkt_tag *dn_tag; + + dn_tag = (struct dn_pkt_tag *)(tag+1); + args.fwa_pf_rule = dn_tag->dn_pf_rule; + + bcopy(&dn_tag->dn_dst6, &dst_buf, sizeof(dst_buf)); + dst = &dst_buf; + ifp = dn_tag->dn_ifp; + if (ifp) + ifnet_reference(ifp); + flags = dn_tag->dn_flags; + if (dn_tag->dn_flags & IPV6_OUTARGS) + ip6oa = dn_tag->dn_ip6oa; + + saved_route = dn_tag->dn_ro6; + ro = &saved_route; + saved_ro_pmtu = dn_tag->dn_ro6_pmtu; + ro_pmtu = &saved_ro_pmtu; + origifp = dn_tag->dn_origifp; + if (origifp) + ifnet_reference(origifp); + mtu = dn_tag->dn_mtu; + alwaysfrag = dn_tag->dn_alwaysfrag; + unfragpartlen = dn_tag->dn_unfragpartlen; + + bcopy(&dn_tag->dn_exthdrs, &exthdrs, sizeof(exthdrs)); + + m_tag_delete(m0, tag); + } +#endif /* DUMMYNET */ + finaldst = ip6->ip6_dst; if (ip6_doscopedroute && (flags & IPV6_OUTARGS)) { - select_srcif = !(flags & (IPV6_FORWARDING | IPV6_UNSPECSRC | IPV6_FLAG_NOSRCIFSEL)); - ifscope = ip6oa->ip6oa_boundif; - ipf_pktopts.ippo_flags = IPPOF_BOUND_IF; - ipf_pktopts.ippo_flags |= (ifscope << IPPOF_SHIFT_IFSCOPE); + if ((select_srcif = (!(flags & (IPV6_FORWARDING | + IPV6_UNSPECSRC | IPV6_FLAG_NOSRCIFSEL)) && + (ip6oa.ip6oa_flags & IP6OAF_SELECT_SRCIF)))) + ipf_pktopts.ippo_flags |= IPPOF_SELECT_SRCIF; + + if ((ip6oa.ip6oa_flags & IP6OAF_BOUND_IF) && + ip6oa.ip6oa_boundif != IFSCOPE_NONE) { + ipf_pktopts.ippo_flags |= (IPPOF_BOUND_IF | + (ip6oa.ip6oa_boundif << IPPOF_SHIFT_IFSCOPE)); + } + + if (ip6oa.ip6oa_flags & IP6OAF_BOUND_SRCADDR) + ipf_pktopts.ippo_flags |= IPPOF_BOUND_SRCADDR; } else { select_srcif = FALSE; - ifscope = IFSCOPE_NONE; + ip6oa.ip6oa_boundif = IFSCOPE_NONE; + ip6oa.ip6oa_flags &= ~(IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_IF | + IP6OAF_BOUND_SRCADDR); } - if (flags & IPV6_OUTARGS) { - nocell = ip6oa->ip6oa_nocell; - if (nocell) - ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR; - } else { - nocell = 0; + if ((flags & IPV6_OUTARGS) && (ip6oa.ip6oa_flags & IP6OAF_NO_CELLULAR)) + ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR; + +#if DUMMYNET + if (args.fwa_pf_rule) { + ip6 = mtod(m, struct ip6_hdr *); + + goto check_with_pf; } +#endif /* DUMMYNET */ #define MAKE_EXTHDR(hp, mp) \ do { \ @@ -317,9 +421,9 @@ ip6_output( goto freehdrs; \ } \ } while (0) - + bzero(&exthdrs, sizeof(exthdrs)); - + if (opt) { /* Hop-by-Hop options header */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh); @@ -346,7 +450,7 @@ ip6_output( #if IPSEC if (ipsec_bypass != 0) goto skip_ipsec; - + /* get a security policy for this packet */ if (so == NULL) sp = ipsec6_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, 0, &error); @@ -375,7 +479,7 @@ ip6_output( /* no need to do IPsec. 
*/ needipsec = 0; break; - + case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* acquire a policy */ @@ -542,7 +646,7 @@ ip6_output( seen = 1; } else if (filter->ipf_filter.ipf_output) { errno_t result; - + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); @@ -631,7 +735,7 @@ skip_ipsec2: switch (rh->ip6r_type) { case IPV6_RTHDR_TYPE_0: rh0 = (struct ip6_rthdr0 *)rh; - addr = (struct in6_addr *)(rh0 + 1); + addr = (struct in6_addr *)(void *)(rh0 + 1); /* * construct a sockaddr_in6 form of @@ -743,12 +847,11 @@ skip_ipsec2: dst->sin6_len = sizeof(struct sockaddr_in6); dst->sin6_addr = ip6->ip6_dst; } - #if IPSEC if (needipsec && needipsectun) { #if CONFIG_DTRACE struct ifnet *trace_ifp = (ifpp != NULL) ? (*ifpp) : NULL; -#endif /* CONFIG_DTRACE */ +#endif /* * All the extension headers will become inaccessible * (since they can be encrypted). @@ -775,7 +878,7 @@ skip_ipsec2: m = ipsec_state.m; ipsec_saved_route = ro; ro = (struct route_in6 *)&ipsec_state.ro; - dst = (struct sockaddr_in6 *)ipsec_state.dst; + dst = (struct sockaddr_in6 *)(void *)ipsec_state.dst; if (error) { /* mbuf is already reclaimed in ipsec6_output_tunnel. */ m0 = m = NULL; @@ -797,11 +900,12 @@ skip_ipsec2: } goto bad; } - /* + /* * The packet has been encapsulated so the ifscope is no longer valid * since it does not apply to the outer address: ignore the ifscope. */ - ifscope = IFSCOPE_NONE; + ip6oa.ip6oa_boundif = IFSCOPE_NONE; + ip6oa.ip6oa_flags &= ~IP6OAF_BOUND_IF; if (opt != NULL && opt->ip6po_pktinfo != NULL) { if (opt->ip6po_pktinfo->ipi6_ifindex != IFSCOPE_NONE) opt->ip6po_pktinfo->ipi6_ifindex = IFSCOPE_NONE; @@ -830,8 +934,12 @@ skip_ipsec2: dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; + /* + * in6_selectroute() might return an ifp with its reference held + * even in the error case, so make sure to release its reference. + */ if ((error = in6_selectroute(select_srcif ? &src_sa : NULL, - &dst_sa, opt, im6o, ro, &ifp, &rt, 0, ifscope, nocell)) != 0) { + &dst_sa, opt, im6o, ro, &ifp, &rt, 0, &ip6oa)) != 0) { switch (error) { case EHOSTUNREACH: ip6stat.ip6s_noroute++; @@ -842,6 +950,7 @@ skip_ipsec2: } if (ifp != NULL) in6_ifstat_inc(ifp, ifs6_out_discard); + /* ifp (if non-NULL) will be released at the end */ goto bad; } if (rt == NULL) { @@ -874,13 +983,13 @@ skip_ipsec2: * case of sending packets to an address of our own. */ if (ia != NULL && ia->ia_ifp) { - ifnet_reference(ia->ia_ifp); + ifnet_reference(ia->ia_ifp); /* for origifp */ if (origifp != NULL) ifnet_release(origifp); origifp = ia->ia_ifp; } else { if (ifp != NULL) - ifnet_reference(ifp); + ifnet_reference(ifp); /* for origifp */ if (origifp != NULL) ifnet_release(origifp); origifp = ifp; @@ -892,7 +1001,7 @@ skip_ipsec2: src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = ip6->ip6_src; - if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id) + if (sa6_recoverscope(&src_sa, TRUE) || zone != src_sa.sin6_scope_id) goto badscope; dst0 = ip6->ip6_dst; @@ -903,7 +1012,7 @@ skip_ipsec2: dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; - if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) { + if (sa6_recoverscope(&dst_sa, TRUE) || zone != dst_sa.sin6_scope_id) { goto badscope; } @@ -925,10 +1034,10 @@ skip_ipsec2: * application. We assume the next hop is an IPv6 * address. 
*/ - dst = (struct sockaddr_in6 *)opt->ip6po_nexthop; + dst = (struct sockaddr_in6 *)(void *)opt->ip6po_nexthop; } else if ((rt->rt_flags & RTF_GATEWAY)) - dst = (struct sockaddr_in6 *)rt->rt_gateway; + dst = (struct sockaddr_in6 *)(void *)rt->rt_gateway; } if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { @@ -1113,12 +1222,40 @@ skip_ipsec2: m->m_pkthdr.rcvif = NULL; } +#if DUMMYNET +check_with_pf: +#endif #if PF if (PF_IS_ENABLED) { +#if DUMMYNET + /* + * TBD: Need to save opt->ip6po_flags for reinjection rdar://10434993 + */ + args.fwa_m = m; + args.fwa_oif = ifp; + args.fwa_oflags = flags; + if ((flags & IPV6_OUTARGS)) + args.fwa_ip6oa = &ip6oa; + args.fwa_ro6 = ro; + args.fwa_dst6 = dst; + args.fwa_ro6_pmtu = ro_pmtu; + args.fwa_origifp = origifp; + args.fwa_mtu = mtu; + args.fwa_alwaysfrag = alwaysfrag; + args.fwa_unfragpartlen = unfragpartlen; + args.fwa_exthdrs = &exthdrs; /* Invoke outbound packet filter */ - error = pf_af_hook(ifp, NULL, &m, AF_INET6, FALSE); + error = pf_af_hook(ifp, NULL, &m, AF_INET6, FALSE, &args); +#else + error = pf_af_hook(ifp, NULL, &m, AF_INET6, FALSE, NULL); +#endif /* DUMMYNET */ - if (error) { + if (error != 0 || m == NULL) { + /* + * Note that if we ever handle packet chain, we will + * have to restore the linkage from the previous + * packet to the next like in ip_outout_list() + */ if (m != NULL) { panic("%s: unexpected packet %p\n", __func__, m); /* NOTREACHED */ @@ -1162,6 +1299,7 @@ skip_ipsec2: } lck_rw_lock_shared(nd_if_rwlock); + /* Access without acquiring nd_ifinfo lock for performance */ ifmtu = IN6_LINKMTU(ifp); lck_rw_done(nd_if_rwlock); @@ -1214,7 +1352,7 @@ skip_ipsec2: } if (ro->ro_rt) RT_LOCK_ASSERT_NOTHELD(ro->ro_rt); - error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); + error = nd6_output(ifp, origifp, m, dst, ro->ro_rt, adv); goto done; } @@ -1321,6 +1459,10 @@ skip_ipsec2: m->m_pkthdr.len = len + hlen + sizeof(*ip6f); m->m_pkthdr.rcvif = 0; m->m_pkthdr.socket_id = m0->m_pkthdr.socket_id; + + M_COPY_PFTAG(m, m0); + m_set_service_class(m, m0->m_pkthdr.svc); + #ifdef __darwin8_notyet #if CONFIG_MACF_NET mac_create_fragment(m0, m); @@ -1358,7 +1500,8 @@ sendorfree: /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif - error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); + error = nd6_output(ifp, origifp, m, dst, ro->ro_rt, + adv); } else m_freem(m); @@ -1452,8 +1595,10 @@ in6_delayed_cksum(struct mbuf *m, uint16_t offset) offset += (m->m_pkthdr.csum_data & 0xffff); if ((offset + sizeof(csum)) > m->m_len) { m_copyback(m, offset, sizeof(csum), &csum); + } else if (IP6_HDR_ALIGNED_P(mtod(m, char *))) { + *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum; } else { - *(uint16_t *)(mtod(m, char *) + offset) = csum; + bcopy(&csum, (mtod(m, char *) + offset), sizeof (csum)); } } /* @@ -1636,6 +1781,7 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro, u_int32_t ifmtu; lck_rw_lock_shared(nd_if_rwlock); + /* Access without acquiring nd_ifinfo lock for performance */ ifmtu = IN6_LINKMTU(ifp); lck_rw_done(nd_if_rwlock); @@ -1673,6 +1819,7 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro, } else { if (ifp) { lck_rw_lock_shared(nd_if_rwlock); + /* Don't hold nd_ifinfo lock for performance */ mtu = IN6_LINKMTU(ifp); lck_rw_done(nd_if_rwlock); } else @@ -1757,7 +1904,6 @@ ip6_ctloutput(so, sopt) /* FALLTHROUGH */ case IPV6_UNICAST_HOPS: case IPV6_HOPLIMIT: - case IPV6_FAITH: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: @@ -1874,10 +2020,6 @@ do { \ OPTSET(IN6P_RTHDR); break; - case 
IPV6_FAITH: - OPTSET(INP_FAITH); - break; - case IPV6_RECVPATHMTU: /* * We ignore this option for TCP @@ -1985,6 +2127,7 @@ do { \ case IPV6_RTHDRDSTOPTS: case IPV6_3542NEXTHOP: { + struct ip6_pktopts **optp; /* new advanced API (RFC3542) */ struct mbuf *m; @@ -2001,8 +2144,9 @@ do { \ m_freem(m); break; } + optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, mtod(m, u_char *), - m->m_len, &in6p->in6p_outputopts, uproto); + m->m_len, optp, uproto); m_freem(m); break; } @@ -2106,7 +2250,7 @@ do { \ if (error) break; - inp_bindif(in6p, optval); + error = inp_bindif(in6p, optval); break; case IPV6_NO_IFT_CELLULAR: @@ -2161,7 +2305,6 @@ do { \ case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: - case IPV6_FAITH: case IPV6_V6ONLY: case IPV6_PORTRANGE: case IPV6_RECVTCLASS: @@ -2200,10 +2343,6 @@ do { \ optval = OPTBIT(IN6P_MTU); break; - case IPV6_FAITH: - optval = OPTBIT(INP_FAITH); - break; - case IPV6_V6ONLY: optval = OPTBIT(IN6P_IPV6_V6ONLY); break; @@ -2355,7 +2494,7 @@ do { \ case IPV6_BOUND_IF: if (in6p->inp_flags & INP_BOUND_IF) - optval = in6p->inp_boundif; + optval = in6p->inp_boundifp->if_index; error = sooptcopyout(sopt, &optval, sizeof (optval)); break; @@ -2368,7 +2507,8 @@ do { \ break; case IPV6_OUT_IF: - optval = in6p->in6p_last_outif; + optval = (in6p->in6p_last_outifp != NULL) ? + in6p->in6p_last_outifp->if_index : 0; error = sooptcopyout(sopt, &optval, sizeof (optval)); break; @@ -2522,7 +2662,8 @@ ip6_initpktopts(struct ip6_pktopts *opt) } static int -ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, int uproto) +ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, + int uproto) { struct ip6_pktopts *opt; @@ -2550,6 +2691,7 @@ ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt) int defminmtu = IP6PO_MINMTU_MCASTONLY; int defpreftemp = IP6PO_TEMPADDR_SYSTEM; + switch (optname) { case IPV6_PKTINFO: if (pktopt && pktopt->ip6po_pktinfo) @@ -2637,9 +2779,7 @@ ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt) } void -ip6_clearpktopts(pktopt, optname) - struct ip6_pktopts *pktopt; - int optname; +ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname) { if (pktopt == NULL) return; @@ -2704,7 +2844,7 @@ static int copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait) { if (dst == NULL || src == NULL) { - printf("ip6_clearpktopts: invalid argument\n"); + printf("copypktopts: invalid argument\n"); return (EINVAL); } @@ -2758,8 +2898,7 @@ ip6_copypktopts(struct ip6_pktopts *src, int canwait) } void -ip6_freepcbopts(pktopt) - struct ip6_pktopts *pktopt; +ip6_freepcbopts(struct ip6_pktopts *pktopt) { if (pktopt == NULL) return; @@ -2982,6 +3121,12 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, return (EINVAL); } + /* + * Caller must have ensured that the buffer is at least + * aligned on 32-bit boundary. + */ + VERIFY(IS_P2ALIGNED(buf, sizeof (u_int32_t))); + /* * IPV6_2292xxx is for backward compatibility to RFC2292, and should * not be specified in the context of RFC3542. 
Conversely, @@ -3026,7 +3171,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, if (len != sizeof(struct in6_pktinfo)) return (EINVAL); - pktinfo = (struct in6_pktinfo *)buf; + pktinfo = (struct in6_pktinfo *)(void *)buf; /* * An application can clear any sticky IPV6_PKTINFO option by @@ -3098,7 +3243,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, if (len != sizeof(int)) return (EINVAL); - hlimp = (int *)buf; + hlimp = (int *)(void *)buf; if (*hlimp < -1 || *hlimp > 255) return (EINVAL); @@ -3112,7 +3257,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, if (len != sizeof(int)) return (EINVAL); - tclass = *(int *)buf; + tclass = *(int *)(void *)buf; if (tclass < -1 || tclass > 255) return (EINVAL); @@ -3138,7 +3283,8 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, switch (((struct sockaddr *)buf)->sa_family) { case AF_INET6: { - struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf; + struct sockaddr_in6 *sa6 = + (struct sockaddr_in6 *)(void *)buf; if (sa6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); @@ -3189,7 +3335,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, /* message length validation */ if (len < sizeof(struct ip6_hbh)) return (EINVAL); - hbh = (struct ip6_hbh *)buf; + hbh = (struct ip6_hbh *)(void *)buf; hbhlen = (hbh->ip6h_len + 1) << 3; if (len != hbhlen) return (EINVAL); @@ -3223,7 +3369,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, /* message length validation */ if (len < sizeof(struct ip6_dest)) return (EINVAL); - dest = (struct ip6_dest *)buf; + dest = (struct ip6_dest *)(void *)buf; destlen = (dest->ip6d_len + 1) << 3; if (len != destlen) return (EINVAL); @@ -3283,7 +3429,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, /* message length validation */ if (len < sizeof(struct ip6_rthdr)) return (EINVAL); - rth = (struct ip6_rthdr *)buf; + rth = (struct ip6_rthdr *)(void *)buf; rthlen = (rth->ip6r_len + 1) << 3; if (len != rthlen) return (EINVAL); @@ -3314,7 +3460,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, case IPV6_USE_MIN_MTU: if (len != sizeof(int)) return (EINVAL); - minmtupolicy = *(int *)buf; + minmtupolicy = *(int *)(void *)buf; if (minmtupolicy != IP6PO_MINMTU_MCASTONLY && minmtupolicy != IP6PO_MINMTU_DISABLE && minmtupolicy != IP6PO_MINMTU_ALL) { @@ -3327,7 +3473,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, if (len != sizeof(int)) return (EINVAL); - if (uproto == IPPROTO_TCP || *(int *)buf == 0) { + if (uproto == IPPROTO_TCP || *(int *)(void *)buf == 0) { /* * we ignore this option for TCP sockets. * (RFC3542 leaves this case unspecified.) 
@@ -3340,7 +3486,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, case IPV6_PREFER_TEMPADDR: if (len != sizeof(int)) return (EINVAL); - preftemp = *(int *)buf; + preftemp = *(int *)(void *)buf; if (preftemp != IP6PO_TEMPADDR_SYSTEM && preftemp != IP6PO_TEMPADDR_NOTPREFER && preftemp != IP6PO_TEMPADDR_PREFER) { @@ -3416,7 +3562,8 @@ ip6_mloopback( if (lo_ifp) { copym->m_pkthdr.rcvif = ifp; - dlil_output(lo_ifp, PF_INET6, copym, 0, (struct sockaddr *)dst, 0); + dlil_output(lo_ifp, PF_INET6, copym, 0, + (struct sockaddr *)dst, 0, NULL); } else m_free(copym); #else diff --git a/bsd/netinet6/ip6_var.h b/bsd/netinet6/ip6_var.h index acb9c3857..9507c904b 100644 --- a/bsd/netinet6/ip6_var.h +++ b/bsd/netinet6/ip6_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -175,6 +175,14 @@ struct ip6_moptions { #define IM6O_REMREF(_im6o) \ im6o_remref(_im6o) +struct ip6_exthdrs { + struct mbuf *ip6e_ip6; + struct mbuf *ip6e_hbh; + struct mbuf *ip6e_dest1; + struct mbuf *ip6e_rthdr; + struct mbuf *ip6e_dest2; +}; + /* * Control options for outgoing packets */ @@ -348,18 +356,39 @@ struct ip6aux { #define IPV6_FLAG_NOSRCIFSEL 0x80 /* bypas source address selection */ #define IPV6_OUTARGS 0x100 /* has ancillary output info */ -#ifdef __NO_STRICT_ALIGNMENT -#define IP6_HDR_ALIGNED_P(ip) 1 -#else -#define IP6_HDR_ALIGNED_P(ip) ((((intptr_t) (ip)) & 3) == 0) -#endif +#ifdef XNU_KERNEL_PRIVATE +#define IP6_HDR_ALIGNED_P(_ip6) ((((uintptr_t)(_ip6)) & ((uintptr_t)3)) == 0) + +/* + * On platforms which require strict alignment (currently for anything but + * i386 or x86_64), this macro checks whether the pointer to the IP header + * is 32-bit aligned, and assert otherwise. + */ +#if defined(__i386__) || defined(__x86_64__) +#define IP6_HDR_STRICT_ALIGNMENT_CHECK(_ip6) do { } while (0) +#else /* !__i386__ && !__x86_64__ */ +#define IP6_HDR_STRICT_ALIGNMENT_CHECK(_ip6) do { \ + if (!IP_HDR_ALIGNED_P(_ip6)) { \ + panic_plain("\n%s: Unaligned IPv6 header %p\n", \ + __func__, _ip6); \ + } \ +} while (0) +#endif /* !__i386__ && !__x86_64__ */ +#endif /* XNU_KERNEL_PRIVATE */ + +#include /* - * Extra information passed to ip6_output when IP6_OUTARGS is set. + * Extra information passed to ip6_output when IPV6_OUTARGS is set. 
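The ip6_out_args structure defined just below replaces the old ip6oa_nocell field with a flags word plus an embedded flow-advisory result, which is why callers can initialize it as { IFSCOPE_NONE, { 0 }, 0 } and why the cellular opt-out is now the IP6OAF_NO_CELLULAR bit. A compact consumer of such an options block; the flag values mirror the header below, but the demo struct and check function are invented:

#include <stdio.h>

#define IFSCOPE_NONE        0
#define IP6OAF_SELECT_SRCIF 0x00000001
#define IP6OAF_BOUND_IF     0x00000002
#define IP6OAF_NO_CELLULAR  0x00000010

struct flowadv { int code; };   /* simplified */

struct ip6_out_args_demo {
    unsigned int ip6oa_boundif;
    struct flowadv ip6oa_flowadv;
    unsigned int ip6oa_flags;
};

static int
iface_allowed(const struct ip6_out_args_demo *oa, int if_is_cellular)
{
    /* EHOSTUNREACH-style rejection when cellular is opted out. */
    if ((oa->ip6oa_flags & IP6OAF_NO_CELLULAR) && if_is_cellular)
        return (0);
    return (1);
}

int
main(void)
{
    struct ip6_out_args_demo oa = { IFSCOPE_NONE, { 0 }, 0 };

    oa.ip6oa_flags |= IP6OAF_NO_CELLULAR;
    printf("cellular ok: %d\n", iface_allowed(&oa, 1));     /* 0 */
    printf("non-cell ok: %d\n", iface_allowed(&oa, 0));     /* 1 */
    return (0);
}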
*/ struct ip6_out_args { unsigned int ip6oa_boundif; /* bound outgoing interface */ - unsigned int ip6oa_nocell; /* don't use IFT_CELLULAR */ + struct flowadv ip6oa_flowadv; /* flow advisory code */ + u_int32_t ip6oa_flags; /* IP6OAF flags (see below) */ +#define IP6OAF_SELECT_SRCIF 0x00000001 /* src interface selection */ +#define IP6OAF_BOUND_IF 0x00000002 /* boundif value is valid */ +#define IP6OAF_BOUND_SRCADDR 0x00000004 /* bound to src address */ +#define IP6OAF_NO_CELLULAR 0x00000010 /* skip IFT_CELLULAR */ }; extern struct ip6stat ip6stat; /* statistics */ @@ -388,8 +417,7 @@ extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */ extern int ip6_maxfrags; /* Maximum fragments in reassembly queue */ extern int ip6_sourcecheck; /* Verify source interface */ extern int ip6_sourcecheck_interval; /* Interval between log messages */ -extern int ip6_accept_rtadv; /* Acts as a host not a router */ -extern int ip6_keepfaith; /* Firewall Aided Internet Translator */ +extern int ip6_accept_rtadv; /* deprecated */ extern int ip6_log_interval; extern time_t ip6_log_time; extern int ip6_hdrnestlimit; /* upper limit of # of extension headers */ @@ -415,6 +443,7 @@ extern struct pr_usrreqs rip6_usrreqs; extern struct pr_usrreqs icmp6_dgram_usrreqs; extern int ip6_doscopedroute; +extern int ip6_restrictrecvif; struct sockopt; @@ -428,7 +457,6 @@ int icmp6_dgram_attach(struct socket *, int , struct proc *); struct in6_ifaddr; void ip6_init(void); -void ip6_fin(void); void ip6_input(struct mbuf *); struct in6_ifaddr *ip6_getdstifaddr(struct mbuf *); void ip6_freepcbopts(struct ip6_pktopts *); @@ -454,12 +482,14 @@ int ip6_process_hopopts(struct mbuf *, u_int8_t *, int, u_int32_t *, struct mbuf **ip6_savecontrol_v4(struct inpcb *, struct mbuf *, struct mbuf **, int *); int ip6_savecontrol(struct inpcb *, struct mbuf *, struct mbuf **); -void ip6_forward(struct mbuf *, struct route_in6 *, int); +struct mbuf *ip6_forward(struct mbuf *, struct route_in6 *, int); void ip6_notify_pmtu __P((struct inpcb *, struct sockaddr_in6 *, u_int32_t *)); void ip6_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in6 *); int ip6_output(struct mbuf *, struct ip6_pktopts *, struct route_in6 *, - int, struct ip6_moptions *, struct ifnet **, + int, struct ip6_moptions *, struct ifnet **, struct ip6_out_args *); +int ip6_output_list(struct mbuf *, int, struct ip6_pktopts *, + struct route_in6 *, int, struct ip6_moptions *, struct ifnet **, struct ip6_out_args *); int ip6_ctloutput(struct socket *, struct sockopt *sopt); void ip6_initpktopts(struct ip6_pktopts *); @@ -488,7 +518,7 @@ extern struct in6_addrpolicy * in6_addrsel_lookup_policy(struct sockaddr_in6 *); int in6_selectroute(struct sockaddr_in6 *, struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, - struct ifnet **, struct rtentry **, int, unsigned int, unsigned int); + struct ifnet **, struct rtentry **, int, const struct ip6_out_args *); int ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, struct ip6_pktopts *stickyopt, int uproto); u_int32_t ip6_randomid(void); u_int32_t ip6_randomflowlabel(void); diff --git a/bsd/netinet6/ipcomp_input.c b/bsd/netinet6/ipcomp_input.c index 3c6a9a43d..c3b330305 100644 --- a/bsd/netinet6/ipcomp_input.c +++ b/bsd/netinet6/ipcomp_input.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -96,7 +97,6 @@ ipcomp4_input(struct mbuf *m, int off) size_t newlen, olen; struct secasvar *sav = NULL; - if (m->m_pkthdr.len < off + 
sizeof(struct ipcomp)) { ipseclog((LOG_DEBUG, "IPv4 IPComp input: assumption failed " "(packet too short)\n")); @@ -113,6 +113,10 @@ ipcomp4_input(struct mbuf *m, int off) goto fail; } ipcomp = mtod(md, struct ipcomp *); + + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + ip = mtod(m, struct ip *); nxt = ipcomp->comp_nxt; #ifdef _IP_VHL @@ -266,6 +270,10 @@ ipcomp6_input(struct mbuf **mp, int *offp, int proto) goto fail; } ipcomp = mtod(md, struct ipcomp *); + + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + ip6 = mtod(m, struct ip6_hdr *); nxt = ipcomp->comp_nxt; diff --git a/bsd/netinet6/ipsec.c b/bsd/netinet6/ipsec.c index 6de4b97dc..271dbe1ab 100644 --- a/bsd/netinet6/ipsec.c +++ b/bsd/netinet6/ipsec.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2011 Apple Inc. All rights reserved. + * Copyright (c) 2008-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -257,17 +258,7 @@ static int ipsec_set_policy(struct secpolicy **pcb_sp, static int ipsec_get_policy(struct secpolicy *pcb_sp, struct mbuf **mp); static void vshiftl(unsigned char *, int, int); static int ipsec_in_reject(struct secpolicy *, struct mbuf *); -#if INET -static struct mbuf *ipsec4_splithdr(struct mbuf *); -#endif -#if INET6 -static struct mbuf *ipsec6_splithdr(struct mbuf *); -#endif -#if INET -static int ipsec4_encapsulate(struct mbuf *, struct secasvar *); -#endif #if INET6 -static int ipsec6_encapsulate(struct mbuf *, struct secasvar *); static int ipsec64_encapsulate(struct mbuf *, struct secasvar *); #endif static struct ipsec_tag *ipsec_addaux(struct mbuf *); @@ -1264,7 +1255,7 @@ ipsec_init_policy(so, pcb_sp) bzero(new, sizeof(*new)); #ifdef __APPLE__ - if (so->so_uid == 0) + if (kauth_cred_issuser(so->so_cred)) #else if (so->so_cred != 0 && !suser(so->so_cred->pc_ucred, NULL)) #endif @@ -1399,7 +1390,7 @@ ipsec_set_policy( return EINVAL; if (len < sizeof(*xpl)) return EINVAL; - xpl = (struct sadb_x_policy *)request; + xpl = (struct sadb_x_policy *)(void *)request; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_set_policy: passed policy\n"); @@ -1467,13 +1458,24 @@ ipsec4_set_policy(inp, optname, request, len, priv) struct sadb_x_policy *xpl; struct secpolicy **pcb_sp; int error = 0; + struct sadb_x_policy xpl_aligned_buf; + u_int8_t *xpl_unaligned; /* sanity check. 
*/ if (inp == NULL || request == NULL) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; - xpl = (struct sadb_x_policy *)request; + xpl = (struct sadb_x_policy *)(void *)request; + + /* This is a new mbuf allocated by soopt_getm() */ + if (IPSEC_IS_P2ALIGNED(xpl)) { + xpl_unaligned = NULL; + } else { + xpl_unaligned = (__typeof__(xpl_unaligned))xpl; + memcpy(&xpl_aligned_buf, xpl, sizeof(xpl_aligned_buf)); + xpl = (__typeof__(xpl))&xpl_aligned_buf; + } if (inp->inp_sp == NULL) { error = ipsec_init_policy(inp->inp_socket, &inp->inp_sp); @@ -1512,6 +1514,8 @@ ipsec4_get_policy(inp, request, len, mp) struct sadb_x_policy *xpl; struct secpolicy *pcb_sp; int error = 0; + struct sadb_x_policy xpl_aligned_buf; + u_int8_t *xpl_unaligned; lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); @@ -1520,8 +1524,17 @@ ipsec4_get_policy(inp, request, len, mp) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; - xpl = (struct sadb_x_policy *)request; - + xpl = (struct sadb_x_policy *)(void *)request; + + /* This is a new mbuf allocated by soopt_getm() */ + if (IPSEC_IS_P2ALIGNED(xpl)) { + xpl_unaligned = NULL; + } else { + xpl_unaligned = (__typeof__(xpl_unaligned))xpl; + memcpy(&xpl_aligned_buf, xpl, sizeof(xpl_aligned_buf)); + xpl = (__typeof__(xpl))&xpl_aligned_buf; + } + if (inp->inp_sp == NULL) { error = ipsec_init_policy(inp->inp_socket, &inp->inp_sp); if (error) @@ -1586,14 +1599,25 @@ ipsec6_set_policy(in6p, optname, request, len, priv) struct sadb_x_policy *xpl; struct secpolicy **pcb_sp; int error = 0; + struct sadb_x_policy xpl_aligned_buf; + u_int8_t *xpl_unaligned; /* sanity check. */ if (in6p == NULL || request == NULL) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; - xpl = (struct sadb_x_policy *)request; - + xpl = (struct sadb_x_policy *)(void *)request; + + /* This is a new mbuf allocated by soopt_getm() */ + if (IPSEC_IS_P2ALIGNED(xpl)) { + xpl_unaligned = NULL; + } else { + xpl_unaligned = (__typeof__(xpl_unaligned))xpl; + memcpy(&xpl_aligned_buf, xpl, sizeof(xpl_aligned_buf)); + xpl = (__typeof__(xpl))&xpl_aligned_buf; + } + if (in6p->in6p_sp == NULL) { error = ipsec_init_policy(in6p->inp_socket, &in6p->in6p_sp); if (error) @@ -1631,14 +1655,25 @@ ipsec6_get_policy(in6p, request, len, mp) struct sadb_x_policy *xpl; struct secpolicy *pcb_sp; int error = 0; + struct sadb_x_policy xpl_aligned_buf; + u_int8_t *xpl_unaligned; /* sanity check. */ if (in6p == NULL || request == NULL || mp == NULL) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; - xpl = (struct sadb_x_policy *)request; - + xpl = (struct sadb_x_policy *)(void *)request; + + /* This is a new mbuf allocated by soopt_getm() */ + if (IPSEC_IS_P2ALIGNED(xpl)) { + xpl_unaligned = NULL; + } else { + xpl_unaligned = (__typeof__(xpl_unaligned))xpl; + memcpy(&xpl_aligned_buf, xpl, sizeof(xpl_aligned_buf)); + xpl = (__typeof__(xpl))&xpl_aligned_buf; + } + if (in6p->in6p_sp == NULL) { error = ipsec_init_policy(in6p->inp_socket, &in6p->in6p_sp); if (error) @@ -2171,7 +2206,7 @@ ipsec6_hdrsiz(m, dir, in6p) * encapsulate for ipsec tunnel. * ip->ip_src must be fixed later on. */ -static int +int ipsec4_encapsulate(m, sav) struct mbuf *m; struct secasvar *sav; @@ -2288,10 +2323,103 @@ ipsec4_encapsulate(m, sav) return 0; } + +/* + * encapsulate for ipsec tunnel. + * ip->ip_src must be fixed later on. 
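The new IPSEC_IS_P2ALIGNED() guard in all four ipsec{4,6}_{set,get}_policy() routines is a bounce copy: if `request` is not suitably aligned, the sadb_x_policy header is memcpy'd into an aligned stack buffer and `xpl` is repointed at the copy, with `xpl_unaligned` recording the original pointer. On i386/x86_64 the macro is hardwired to 1 (see the ipsec.h hunk further down), so the copy only ever happens on strict-alignment ports. A compilable sketch of the idiom; struct policy_hdr and policy_hdr_get() are invented names:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for struct sadb_x_policy; the fields are illustrative. */
struct policy_hdr {
	uint16_t len;
	uint16_t type;
	uint32_t id;
};

#define IS_P2ALIGNED(p, a)	((((uintptr_t)(p)) & ((a) - 1)) == 0)

/* Return a usable header pointer: 'buf' itself if aligned, else 'scratch'. */
static const struct policy_hdr *
policy_hdr_get(const void *buf, struct policy_hdr *scratch)
{
	if (IS_P2ALIGNED(buf, __alignof__(struct policy_hdr)))
		return (buf);
	/* memcpy is alignment-agnostic, so this is always safe. */
	memcpy(scratch, buf, sizeof (*scratch));
	return (scratch);
}

int
main(void)
{
	struct policy_hdr src = { 7, 2, 0x1234 };
	unsigned char raw[sizeof (src) + 1];
	struct policy_hdr scratch;
	const struct policy_hdr *ph;

	memcpy(raw + 1, &src, sizeof (src));	/* deliberately misaligned */
	ph = policy_hdr_get(raw + 1, &scratch);
	printf("len=%u (copied=%s)\n", (unsigned)ph->len,
	    ph == &scratch ? "yes" : "no");
	return (0);
}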
+ */ +int +ipsec4_encapsulate_utun_esp_keepalive(m_ptr, sav) + struct mbuf **m_ptr; + struct secasvar *sav; +{ + struct ip *ip; + size_t plen; + struct mbuf *m = *m_ptr; + + /* can't tunnel between different AFs */ + if (((struct sockaddr *)&sav->sah->saidx.src)->sa_family + != ((struct sockaddr *)&sav->sah->saidx.dst)->sa_family + || ((struct sockaddr *)&sav->sah->saidx.src)->sa_family != AF_INET) { + m_freem(m); + *m_ptr = NULL; + return EINVAL; + } + + plen = m->m_pkthdr.len; + + /* + * grow the mbuf to accomodate the new IPv4 header. + * NOTE: IPv4 options will never be copied. + */ + { + struct mbuf *n; + MGETHDR(n, M_DONTWAIT, MT_HEADER); /* MAC-OK */ + if (!n) { + m_freem(m); + *m_ptr = NULL; + return ENOBUFS; + } + if (m->m_flags & M_PKTHDR) { + M_COPY_PKTHDR(n, m); + m->m_flags &= ~M_PKTHDR; + } + MH_ALIGN(n, sizeof(*ip)); + n->m_len = sizeof(*ip); + n->m_next = m; + n->m_pkthdr.len = (plen + n->m_len); + m_fixhdr(m); + m = n; + *m_ptr = m; + plen = m->m_pkthdr.len; + } + ip = mtod(m, __typeof__(ip)); + + /* construct new IPv4 header. see RFC 2401 5.1.2.1 */ + // ip_ecn_ingress(ip4_ipsec_ecn, &ip->ip_tos, &oip->ip_tos); +#ifdef _IP_VHL + ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(*ip) >> 2); +#else + ip->ip_hl = sizeof(*ip) >> 2; +#endif + ip->ip_off &= htons(~IP_OFFMASK); + ip->ip_off &= htons(~IP_MF); + switch (ip4_ipsec_dfbit) { + case 0: /* clear DF bit */ + ip->ip_off &= htons(~IP_DF); + break; + case 1: /* set DF bit */ + ip->ip_off |= htons(IP_DF); + break; + default: /* copy DF bit */ + break; + } + ip->ip_p = IPPROTO_IPIP; + if (plen < IP_MAXPACKET) + ip->ip_len = htons(plen); + else { + ipseclog((LOG_ERR, "IPv4 ipsec: size exceeds limit: " + "leave ip_len as is (invalid packet)\n")); + } +#ifdef RANDOM_IP_ID + ip->ip_id = ip_randomid(); +#else + ip->ip_id = htons(ip_id++); +#endif + bcopy(&((struct sockaddr_in *)&sav->sah->saidx.src)->sin_addr, + &ip->ip_src, sizeof(ip->ip_src)); + bcopy(&((struct sockaddr_in *)&sav->sah->saidx.dst)->sin_addr, + &ip->ip_dst, sizeof(ip->ip_dst)); + ip->ip_ttl = IPDEFTTL; + + /* XXX Should ip_src be updated later ? */ + + return 0; +} #endif /*INET*/ #if INET6 -static int +int ipsec6_encapsulate(m, sav) struct mbuf *m; struct secasvar *sav; @@ -2454,6 +2582,70 @@ ipsec64_encapsulate(m, sav) return 0; } + +int +ipsec6_encapsulate_utun_esp_keepalive(m_ptr, sav) + struct mbuf **m_ptr; + struct secasvar *sav; +{ + struct ip6_hdr *ip6; + size_t plen; + struct mbuf *m = *m_ptr; + + /* can't tunnel between different AFs */ + if (((struct sockaddr *)&sav->sah->saidx.src)->sa_family + != ((struct sockaddr *)&sav->sah->saidx.dst)->sa_family + || ((struct sockaddr *)&sav->sah->saidx.src)->sa_family != AF_INET6) { + m_freem(m); + *m_ptr = NULL; + return EINVAL; + } + + plen = m->m_pkthdr.len; + + /* + * grow the mbuf to accomodate the new IPv6 header. + */ + { + struct mbuf *n; + MGETHDR(n, M_DONTWAIT, MT_HEADER); /* MAC-OK */ + if (!n) { + m_freem(m); + *m_ptr = NULL; + return ENOBUFS; + } + if (m->m_flags & M_PKTHDR) { + M_COPY_PKTHDR(n, m); + m->m_flags &= ~M_PKTHDR; + } + MH_ALIGN(n, sizeof(*ip6)); + n->m_len = sizeof(*ip6); + n->m_next = m; + n->m_pkthdr.len = (plen + n->m_len); + m_fixhdr(m); + m = n; + *m_ptr = m; + plen = m->m_pkthdr.len; + } + ip6 = mtod(m, __typeof__(ip6)); + + /* construct new IPv6 header. 
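Both new ipsec{4,6}_encapsulate_utun_esp_keepalive() routines use the standard mbuf prepend sequence: MGETHDR a fresh header mbuf, migrate the packet header with M_COPY_PKTHDR, place the data region at an aligned offset with MH_ALIGN, chain the original packet behind it, and fix up m_pkthdr.len. A toy user-space model of just that bookkeeping (struct blk is a stand-in; real mbufs carry far more state):

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for an mbuf chain: one data block per link. */
struct blk {
	struct blk *next;
	size_t	    len;	/* bytes valid in this block */
	size_t	    pkt_len;	/* whole-chain length, head block only */
	unsigned char data[128];
};

/* Prepend room for 'hdrlen' header bytes, as MGETHDR + MH_ALIGN do. */
static struct blk *
blk_prepend(struct blk *chain, size_t hdrlen)
{
	struct blk *n = calloc(1, sizeof (*n));

	if (n == NULL)
		return (NULL);
	n->len = hdrlen;
	n->next = chain;
	/* The packet header moves to the new head (M_COPY_PKTHDR). */
	n->pkt_len = chain->pkt_len + hdrlen;
	chain->pkt_len = 0;
	return (n);
}

int
main(void)
{
	struct blk *payload = calloc(1, sizeof (*payload));
	struct blk *pkt;

	if (payload == NULL)
		return (1);
	payload->len = payload->pkt_len = 40;	/* inner packet */
	pkt = blk_prepend(payload, 20);		/* outer IPv4 header */
	if (pkt != NULL)
		printf("head=%zu bytes, total=%zu bytes\n",
		    pkt->len, pkt->pkt_len);
	free(payload);
	free(pkt);
	return (0);
}

Taking `struct mbuf **m_ptr` rather than returning a pointer lets the routines free and NULL the caller's chain on error, which the EINVAL and ENOBUFS paths above rely on.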
see RFC 2401 5.1.2.2 */ + if (plen < IPV6_MAXPACKET) + ip6->ip6_plen = htons(plen); + else { + /* ip6->ip6_plen will be updated in ip6_output() */ + } + ip6->ip6_nxt = IPPROTO_IPV6; + bcopy(&((struct sockaddr_in6 *)&sav->sah->saidx.src)->sin6_addr, + &ip6->ip6_src, sizeof(ip6->ip6_src)); + bcopy(&((struct sockaddr_in6 *)&sav->sah->saidx.dst)->sin6_addr, + &ip6->ip6_dst, sizeof(ip6->ip6_dst)); + ip6->ip6_hlim = IPV6_DEFHLIM; + + /* XXX Should ip6_src be updated later ? */ + + return 0; +} #endif /*INET6*/ /* @@ -2672,7 +2864,7 @@ ipsec4_logpacketstr(ip, spi) struct ip *ip; u_int32_t spi; { - static char buf[256]; + static char buf[256] __attribute__((aligned(4))); char *p; u_int8_t *s, *d; @@ -2702,7 +2894,7 @@ ipsec6_logpacketstr(ip6, spi) struct ip6_hdr *ip6; u_int32_t spi; { - static char buf[256]; + static char buf[256] __attribute__((aligned(4))); char *p; p = buf; @@ -2727,7 +2919,7 @@ const char * ipsec_logsastr(sav) struct secasvar *sav; { - static char buf[256]; + static char buf[256] __attribute__((aligned(4))); char *p; struct secasindex *saidx = &sav->sah->saidx; @@ -2883,7 +3075,7 @@ ipsec4_output( } ip = mtod(state->m, struct ip *); } - udp = (struct udphdr *)(((u_int8_t *)ip) + hlen); + udp = (struct udphdr *)(void *)(((u_int8_t *)ip) + hlen); sin->sin_port = udp->uh_dport; } } @@ -2962,7 +3154,7 @@ ipsec4_output( // grab sadb_mutex, before updating sah's route cache lck_mtx_lock(sadb_mutex); ro4= &sav->sah->sa_route; - dst4 = (struct sockaddr_in *)&ro4->ro_dst; + dst4 = (struct sockaddr_in *)(void *)&ro4->ro_dst; if (ro4->ro_rt != NULL) { RT_LOCK(ro4->ro_rt); } @@ -2999,7 +3191,7 @@ ipsec4_output( * addressed by SA_SIZE roundup in that routine. */ if (ro4->ro_rt->rt_flags & RTF_GATEWAY) - dst4 = (struct sockaddr_in *)ro4->ro_rt->rt_gateway; + dst4 = (struct sockaddr_in *)(void *)ro4->ro_rt->rt_gateway; RT_UNLOCK(ro4->ro_rt); if (state->ro.ro_rt != NULL) { rtfree(state->ro.ro_rt); @@ -3397,7 +3589,8 @@ ipsec6_output_tunnel( struct sockaddr_in* dst4; struct route *ro4 = NULL; struct route ro4_copy; - struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; + struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, + IPOAF_SELECT_SRCIF }; /* * must be last isr because encapsulated IPv6 packet @@ -3422,7 +3615,7 @@ ipsec6_output_tunnel( // grab sadb_mutex, to update sah's route cache and get a local copy of it lck_mtx_lock(sadb_mutex); ro4 = &sav->sah->sa_route; - dst4 = (struct sockaddr_in *)&ro4->ro_dst; + dst4 = (struct sockaddr_in *)(void *)&ro4->ro_dst; if (ro4->ro_rt) { RT_LOCK(ro4->ro_rt); } @@ -3534,7 +3727,7 @@ ipsec6_output_tunnel( // grab sadb_mutex, before updating sah's route cache lck_mtx_lock(sadb_mutex); ro6 = &sav->sah->sa_route; - dst6 = (struct sockaddr_in6 *)&ro6->ro_dst; + dst6 = (struct sockaddr_in6 *)(void *)&ro6->ro_dst; if (ro6->ro_rt) { RT_LOCK(ro6->ro_rt); } @@ -3575,7 +3768,7 @@ ipsec6_output_tunnel( * addressed by SA_SIZE roundup in that routine. */ if (ro6->ro_rt->rt_flags & RTF_GATEWAY) - dst6 = (struct sockaddr_in6 *)ro6->ro_rt->rt_gateway; + dst6 = (struct sockaddr_in6 *)(void *)ro6->ro_rt->rt_gateway; RT_UNLOCK(ro6->ro_rt); if (state->ro.ro_rt != NULL) { rtfree(state->ro.ro_rt); @@ -3652,7 +3845,7 @@ bad: /* * Chop IP header and option off from the payload. 
*/ -static struct mbuf * +struct mbuf * ipsec4_splithdr(m) struct mbuf *m; { @@ -3661,7 +3854,7 @@ ipsec4_splithdr(m) int hlen; if (m->m_len < sizeof(struct ip)) - panic("ipsec4_splithdr: first mbuf too short"); + panic("ipsec4_splithdr: first mbuf too short, m_len %d, pkt_len %d, m_flag %x", m->m_len, m->m_pkthdr.len, m->m_flags); ip = mtod(m, struct ip *); #ifdef _IP_VHL hlen = _IP_VHL_HL(ip->ip_vhl) << 2; @@ -3694,7 +3887,7 @@ ipsec4_splithdr(m) #endif #if INET6 -static struct mbuf * +struct mbuf * ipsec6_splithdr(m) struct mbuf *m; { @@ -3777,6 +3970,18 @@ ipsec4_tunnel_validate(m, off, nxt0, sav, ifamily) if (bcmp(&oip->ip_dst, &sin->sin_addr, sizeof(oip->ip_dst)) != 0) return 0; + if (sav->utun_in_fn) { + // the utun SAs don't have a policy (yet). + if (nxt == IPPROTO_IPV4) { + *ifamily = AF_INET; + } else if (nxt == IPPROTO_IPV6) { + *ifamily = AF_INET6; + } else { + return 0; + } + return 1; + } + /* XXX slow */ bzero(&osrc, sizeof(osrc)); bzero(&odst, sizeof(odst)); @@ -3874,6 +4079,11 @@ ipsec6_tunnel_validate(m, off, nxt0, sav) if (!IN6_ARE_ADDR_EQUAL(&oip6->ip6_dst, &sin6->sin6_addr)) return 0; + if (sav->utun_in_fn) { + // the utun SAs don't have a policy (yet). + return 1; + } + /* XXX slow */ bzero(&osrc, sizeof(osrc)); bzero(&odst, sizeof(odst)); @@ -4191,51 +4401,58 @@ __private_extern__ int ipsec_send_natt_keepalive( struct secasvar *sav) { - struct mbuf *m; - struct udphdr *uh; - struct ip *ip; - int error; - struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; - struct route ro; + struct mbuf *m; + struct ip *ip; + int error; + struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, IPOAF_SELECT_SRCIF }; + struct route ro; lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); - + if ((esp_udp_encap_port & 0xFFFF) == 0 || sav->remote_ike_port == 0) return FALSE; // natt timestamp may have changed... reverify if ((natt_now - sav->natt_last_activity) < natt_keepalive_interval) return FALSE; + if (sav->flags & SADB_X_EXT_ESP_KEEPALIVE) return FALSE; // don't send these from the kernel + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return FALSE; - - /* - * Create a UDP packet complete with IP header. - * We must do this because UDP output requires - * an inpcb which we don't have. UDP packet - * contains one byte payload. The byte is set - * to 0xFF. - */ - ip = (struct ip*)m_mtod(m); - uh = (struct udphdr*)((char*)m_mtod(m) + sizeof(struct ip)); - m->m_len = sizeof(struct udpiphdr) + 1; - bzero(m_mtod(m), m->m_len); - m->m_pkthdr.len = m->m_len; - - ip->ip_len = m->m_len; - ip->ip_ttl = ip_defttl; - ip->ip_p = IPPROTO_UDP; - if (sav->sah->dir != IPSEC_DIR_INBOUND) { - ip->ip_src = ((struct sockaddr_in*)&sav->sah->saidx.src)->sin_addr; - ip->ip_dst = ((struct sockaddr_in*)&sav->sah->saidx.dst)->sin_addr; - } else { - ip->ip_src = ((struct sockaddr_in*)&sav->sah->saidx.dst)->sin_addr; - ip->ip_dst = ((struct sockaddr_in*)&sav->sah->saidx.src)->sin_addr; + + ip = (__typeof__(ip))m_mtod(m); + + // this sends one type of NATT keepalives (Type 1, ESP keepalives, aren't sent by kernel) + if ((sav->flags & SADB_X_EXT_ESP_KEEPALIVE) == 0) { + struct udphdr *uh; + + /* + * Type 2: a UDP packet complete with IP header. + * We must do this because UDP output requires + * an inpcb which we don't have. UDP packet + * contains one byte payload. The byte is set + * to 0xFF. 
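The rewritten ipsec_send_natt_keepalive() now builds only the Type 2 keepalive in the kernel: a complete IP+UDP datagram from the local ESP-UDP encapsulation port to the peer's IKE port, whose single payload byte is 0xFF. SAs flagged SADB_X_EXT_ESP_KEEPALIVE are skipped because Type 1 (ESP) keepalives are left to user space. A user-space sketch of the resulting 29-byte layout; the header structs, addresses, and the 4500 ports are illustrative, and checksums are left to the stack as in the original:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>	/* htons/htonl */

struct ip4_hdr {	/* minimal IPv4 header, no options */
	uint8_t  vhl, tos;
	uint16_t len, id, off;
	uint8_t  ttl, proto;
	uint16_t sum;
	uint32_t src, dst;
} __attribute__((packed));

struct udp_hdr {
	uint16_t sport, dport, len, sum;
} __attribute__((packed));

int
main(void)
{
	unsigned char pkt[sizeof (struct ip4_hdr) +
	    sizeof (struct udp_hdr) + 1];
	struct ip4_hdr ip;
	struct udp_hdr udp;

	memset(&ip, 0, sizeof (ip));
	ip.vhl = (4 << 4) | 5;			/* v4, 20-byte header */
	ip.len = htons(sizeof (pkt));
	ip.ttl = 64;
	ip.proto = 17;				/* IPPROTO_UDP */
	ip.sum = 0;				/* left to the stack here */
	ip.src = htonl(0xc0a80001);		/* 192.168.0.1, example */
	ip.dst = htonl(0xc0a80002);		/* 192.168.0.2, example */

	memset(&udp, 0, sizeof (udp));
	udp.sport = htons(4500);		/* ESP-UDP encap port */
	udp.dport = htons(4500);		/* peer's IKE port */
	udp.len = htons(sizeof (udp) + 1);	/* header + 1-byte payload */
	udp.sum = 0;				/* optional for IPv4 UDP */

	memcpy(pkt, &ip, sizeof (ip));
	memcpy(pkt + sizeof (ip), &udp, sizeof (udp));
	pkt[sizeof (ip) + sizeof (udp)] = 0xFF;	/* the keepalive byte */

	printf("keepalive is %zu bytes\n", sizeof (pkt));
	return (0);
}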
+ */ + uh = (__typeof__(uh))(void *)((char *)m_mtod(m) + sizeof(*ip)); + m->m_len = sizeof(struct udpiphdr) + 1; + bzero(m_mtod(m), m->m_len); + m->m_pkthdr.len = m->m_len; + + ip->ip_len = m->m_len; + ip->ip_ttl = ip_defttl; + ip->ip_p = IPPROTO_UDP; + if (sav->sah->dir != IPSEC_DIR_INBOUND) { + ip->ip_src = ((struct sockaddr_in*)&sav->sah->saidx.src)->sin_addr; + ip->ip_dst = ((struct sockaddr_in*)&sav->sah->saidx.dst)->sin_addr; + } else { + ip->ip_src = ((struct sockaddr_in*)&sav->sah->saidx.dst)->sin_addr; + ip->ip_dst = ((struct sockaddr_in*)&sav->sah->saidx.src)->sin_addr; + } + uh->uh_sport = htons((u_short)esp_udp_encap_port); + uh->uh_dport = htons(sav->remote_ike_port); + uh->uh_ulen = htons(1 + sizeof(*uh)); + uh->uh_sum = 0; + *(u_int8_t*)((char*)m_mtod(m) + sizeof(*ip) + sizeof(*uh)) = 0xFF; } - uh->uh_sport = htons((u_short)esp_udp_encap_port); - uh->uh_dport = htons(sav->remote_ike_port); - uh->uh_ulen = htons(1 + sizeof(struct udphdr)); - uh->uh_sum = 0; - *(u_int8_t*)((char*)m_mtod(m) + sizeof(struct ip) + sizeof(struct udphdr)) = 0xFF; // grab sadb_mutex, to get a local copy of sah's route cache lck_mtx_lock(sadb_mutex); diff --git a/bsd/netinet6/ipsec.h b/bsd/netinet6/ipsec.h index c31155576..ead5dc68c 100644 --- a/bsd/netinet6/ipsec.h +++ b/bsd/netinet6/ipsec.h @@ -278,6 +278,10 @@ struct ipsecstat { } #ifdef KERNEL + +#define IPSEC_IS_P2ALIGNED(p) 1 +#define IPSEC_GET_P2UNALIGNED_OFS(p) 0 + struct ipsec_output_state { int tunneled; struct mbuf *m; @@ -340,6 +344,16 @@ extern const char *ipsec_logsastr(struct secasvar *); extern void ipsec_dumpmbuf(struct mbuf *); extern int ipsec4_output(struct ipsec_output_state *, struct secpolicy *, int); +#if INET +extern struct mbuf * ipsec4_splithdr(struct mbuf *); +extern int ipsec4_encapsulate(struct mbuf *, struct secasvar *); +extern int ipsec4_encapsulate_utun_esp_keepalive(struct mbuf **, struct secasvar *); +#endif +#if INET6 +extern struct mbuf * ipsec6_splithdr(struct mbuf *); +extern int ipsec6_encapsulate(struct mbuf *, struct secasvar *); +extern int ipsec6_encapsulate_utun_esp_keepalive(struct mbuf **, struct secasvar *); +#endif extern int ipsec4_tunnel_validate(struct mbuf *, int, u_int, struct secasvar *, sa_family_t *); extern struct mbuf *ipsec_copypkt(struct mbuf *); extern void ipsec_delaux(struct mbuf *); diff --git a/bsd/netinet6/mld6.c b/bsd/netinet6/mld6.c index 11f02db1e..7cedd2e22 100644 --- a/bsd/netinet6/mld6.c +++ b/bsd/netinet6/mld6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -125,7 +125,7 @@ #include #include -/* Lock group and attribute for mld6_mtx */ +/* Lock group and attribute for mld_mtx */ static lck_attr_t *mld_mtx_attr; static lck_grp_t *mld_mtx_grp; static lck_grp_attr_t *mld_mtx_grp_attr; @@ -236,16 +236,14 @@ static int interface_timers_running6; static int state_change_timers_running6; static int current_state_timers_running6; -static decl_lck_mtx_data(, mld6_mtx); - #define MLD_LOCK() \ - lck_mtx_lock(&mld6_mtx) + lck_mtx_lock(&mld_mtx) #define MLD_LOCK_ASSERT_HELD() \ - lck_mtx_assert(&mld6_mtx, LCK_MTX_ASSERT_OWNED) + lck_mtx_assert(&mld_mtx, LCK_MTX_ASSERT_OWNED) #define MLD_LOCK_ASSERT_NOTHELD() \ - lck_mtx_assert(&mld6_mtx, LCK_MTX_ASSERT_NOTOWNED) + lck_mtx_assert(&mld_mtx, LCK_MTX_ASSERT_NOTOWNED) #define MLD_UNLOCK() \ - lck_mtx_unlock(&mld6_mtx) + lck_mtx_unlock(&mld_mtx) #define MLD_ADD_DETACHED_IN6M(_head, _in6m) { \ SLIST_INSERT_HEAD(_head, _in6m, in6m_dtle); \ @@ -498,6 +496,9 @@ mld_domifattach(struct ifnet *ifp, int how) MLI_ADDREF_LOCKED(mli); /* hold a reference for mli_head */ MLI_ADDREF_LOCKED(mli); /* hold a reference for caller */ MLI_UNLOCK(mli); + ifnet_lock_shared(ifp); + mld6_initsilent(ifp, mli); + ifnet_lock_done(ifp); LIST_INSERT_HEAD(&mli_head, mli, mli_link); @@ -528,6 +529,9 @@ mld_domifreattach(struct mld_ifinfo *mli) mli->mli_debug |= IFD_ATTACHED; MLI_ADDREF_LOCKED(mli); /* hold a reference for mli_head */ MLI_UNLOCK(mli); + ifnet_lock_shared(ifp); + mld6_initsilent(ifp, mli); + ifnet_lock_done(ifp); LIST_INSERT_HEAD(&mli_head, mli, mli_link); @@ -593,6 +597,21 @@ mli_delete(const struct ifnet *ifp, struct mld_in6m_relhead *in6m_dthead) panic("%s: mld_ifinfo not found for ifp %p\n", __func__, ifp); } +__private_extern__ void +mld6_initsilent(struct ifnet *ifp, struct mld_ifinfo *mli) +{ + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED); + + MLI_LOCK_ASSERT_NOTHELD(mli); + MLI_LOCK(mli); + if (!(ifp->if_flags & IFF_MULTICAST) && + (ifp->if_eflags & (IFEF_IPV6_ND6ALT|IFEF_LOCALNET_PRIVATE))) + mli->mli_flags |= MLIF_SILENT; + else + mli->mli_flags &= ~MLIF_SILENT; + MLI_UNLOCK(mli); +} + static void mli_initvar(struct mld_ifinfo *mli, struct ifnet *ifp, int reattach) { @@ -606,9 +625,6 @@ mli_initvar(struct mld_ifinfo *mli, struct ifnet *ifp, int reattach) mli->mli_qri = MLD_QRI_INIT; mli->mli_uri = MLD_URI_INIT; - /* ifnet is not yet attached; no need to hold ifnet lock */ - if (!(ifp->if_flags & IFF_MULTICAST)) - mli->mli_flags |= MLIF_SILENT; if (mld_use_allow) mli->mli_flags |= MLIF_USEALLOW; if (!reattach) @@ -1164,7 +1180,7 @@ mld_v2_process_group_query(struct in6_multi *inm, int timer, struct mbuf *m0, for (i = 0; i < nsrc; i++) { sp = mtod(m, uint8_t *) + soff; retval = in6m_record_source(inm, - (const struct in6_addr *)sp); + (const struct in6_addr *)(void *)sp); if (retval < 0) break; nrecorded += retval; @@ -1989,7 +2005,6 @@ mld_v1_transmit_report(struct in6_multi *in6m, const int type) mh->m_flags |= M_MLDV1; - /* * Due to the fact that at this point we are possibly holding * in6_multihead_lock in shared or exclusive mode, we can't call @@ -3286,6 +3301,13 @@ mld_dispatch_packet(struct mbuf *m) mld = (struct mld_hdr *)(mtod(md, uint8_t *) + off); type = mld->mld_type; + if (ifp->if_eflags & IFEF_TXSTART) { + /* Use control service class if the outgoing + * interface supports transmit-start model. 
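mld6_initsilent() centralizes a decision that mli_initvar() used to make without the ifnet lock: an interface runs MLD silently (MLIF_SILENT) only when it is both non-multicast-capable and flagged as ND6-alternative or local-network-private. A condensed sketch of the predicate; the flag values here are invented for the example:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag values; the kernel's IFF_/IFEF_ values differ. */
#define IFF_MULTICAST		0x1
#define IFEF_IPV6_ND6ALT	0x2
#define IFEF_LOCALNET_PRIVATE	0x4

static bool
mld_should_be_silent(unsigned if_flags, unsigned if_eflags)
{
	/*
	 * Same shape as mld6_initsilent(): silence MLD only if the
	 * interface cannot do multicast AND it is an ND6-alternative
	 * or local-network-private interface.
	 */
	return (!(if_flags & IFF_MULTICAST) &&
	    (if_eflags & (IFEF_IPV6_ND6ALT | IFEF_LOCALNET_PRIVATE)));
}

int
main(void)
{
	printf("p2p, nd6alt  -> %d\n",
	    mld_should_be_silent(0, IFEF_IPV6_ND6ALT));
	printf("mcast, plain -> %d\n",
	    mld_should_be_silent(IFF_MULTICAST, 0));
	return (0);
}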
+ */ + (void) m_set_service_class(m0, MBUF_SC_CTL); + } + error = ip6_output(m0, &mld_po, NULL, IPV6_UNSPECSRC, im6o, &oifp, NULL); @@ -3424,7 +3446,7 @@ mld_init(void) MLD_PRINTF(("%s: initializing\n", __func__)); - /* Setup lock group and attribute for mld6_mtx */ + /* Setup lock group and attribute for mld_mtx */ mld_mtx_grp_attr = lck_grp_attr_alloc_init(); mld_mtx_grp = lck_grp_alloc_init("mld_mtx\n", mld_mtx_grp_attr); mld_mtx_attr = lck_attr_alloc_init(); diff --git a/bsd/netinet6/mld6_var.h b/bsd/netinet6/mld6_var.h index 7652cdca9..7ccbcbf0b 100644 --- a/bsd/netinet6/mld6_var.h +++ b/bsd/netinet6/mld6_var.h @@ -237,6 +237,7 @@ extern void mld_slowtimo(void); extern void mld_init(void); extern void mli_addref(struct mld_ifinfo *, int); extern void mli_remref(struct mld_ifinfo *); +__private_extern__ void mld6_initsilent(struct ifnet *, struct mld_ifinfo *); #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet6_mld); diff --git a/bsd/netinet6/nd6.c b/bsd/netinet6/nd6.c index 77ab7630a..7cae1d7e7 100644 --- a/bsd/netinet6/nd6.c +++ b/bsd/netinet6/nd6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -110,9 +110,6 @@ #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */ #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */ -#define SA(p) ((struct sockaddr *)(p)) -#define SIN6(s) ((struct sockaddr_in6 *)s) -#define SDL(s) ((struct sockaddr_dl *)s) #define equal(a1, a2) (bcmp((caddr_t)(a1), (caddr_t)(a2), (a1)->sa_len) == 0) /* timer values */ @@ -135,6 +132,10 @@ int nd6_debug = 1; int nd6_debug = 0; #endif +int nd6_optimistic_dad = + (ND6_OPTIMISTIC_DAD_LINKLOCAL|ND6_OPTIMISTIC_DAD_AUTOCONF| + ND6_OPTIMISTIC_DAD_TEMPORARY|ND6_OPTIMISTIC_DAD_DYNAMIC); + static int nd6_is_new_addr_neighbor (struct sockaddr_in6 *, struct ifnet *); /* for debugging? */ @@ -174,10 +175,11 @@ struct llinfo_nd6 llinfo_nd6 = { size_t nd_ifinfo_indexlim = 32; /* increased for 5589193 */ struct nd_ifinfo *nd_ifinfo = NULL; -static lck_grp_attr_t *nd_if_rwlock_grp_attr; -static lck_grp_t *nd_if_rwlock_grp; -static lck_attr_t *nd_if_rwlock_attr; -lck_rw_t *nd_if_rwlock; +static lck_grp_attr_t *nd_if_lock_grp_attr; +static lck_grp_t *nd_if_lock_grp; +static lck_attr_t *nd_if_lock_attr; +decl_lck_rw_data(, nd_if_rwlock_data); +lck_rw_t *nd_if_rwlock = &nd_if_rwlock_data; /* Protected by nd6_mutex */ struct nd_drhead nd_defrouter; @@ -199,9 +201,10 @@ static struct llinfo_nd6 *nd6_llinfo_alloc(void); static void nd6_llinfo_free(void *); static void nd6_llinfo_purge(struct rtentry *); static void nd6_llinfo_get_ri(struct rtentry *, struct rt_reach_info *); +static void nd6_llinfo_get_iflri(struct rtentry *, struct ifnet_llreach_info *); -static void nd6_siocgdrlst(void *, int); -static void nd6_siocgprlst(void *, int); +static int nd6_siocgdrlst(void *, int); +static int nd6_siocgprlst(void *, int); /* * Insertion and removal from llinfo_nd6 must be done with rnh_lock held. 
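The m_set_service_class() hunk above tags outgoing MLD reports with the control service class when the interface uses the transmit-start model, so multicast signalling is classified ahead of bulk traffic in driver-managed queues. A toy model of the pattern; the SC_* names and the flag value are illustrative, with MBUF_SC_CTL and IFEF_TXSTART being the real identifiers:

#include <stdio.h>

enum svc_class { SC_BEST_EFFORT, SC_CTL };	/* cf. MBUF_SC_CTL */
#define IFEF_TXSTART	0x1			/* illustrative value */

struct pkt { enum svc_class sc; };

static void
dispatch(struct pkt *p, unsigned if_eflags)
{
	/*
	 * Per the patch's comment, the class is only set when the
	 * outgoing interface supports the transmit-start model.
	 */
	if (if_eflags & IFEF_TXSTART)
		p->sc = SC_CTL;
	printf("sent with class %d\n", p->sc);
}

int
main(void)
{
	struct pkt p = { SC_BEST_EFFORT };

	dispatch(&p, IFEF_TXSTART);
	return (0);
}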
@@ -248,11 +251,10 @@ nd6_init() /* initialization of the default router list */ TAILQ_INIT(&nd_defrouter); - nd_if_rwlock_grp_attr = lck_grp_attr_alloc_init(); - nd_if_rwlock_grp = lck_grp_alloc_init("nd_if_rwlock", - nd_if_rwlock_grp_attr); - nd_if_rwlock_attr = lck_attr_alloc_init(); - nd_if_rwlock = lck_rw_alloc_init(nd_if_rwlock_grp, nd_if_rwlock_attr); + nd_if_lock_grp_attr = lck_grp_attr_alloc_init(); + nd_if_lock_grp = lck_grp_alloc_init("nd_if_lock", nd_if_lock_grp_attr); + nd_if_lock_attr = lck_attr_alloc_init(); + lck_rw_init(nd_if_rwlock, nd_if_lock_grp, nd_if_lock_attr); llinfo_nd6_zone = zinit(sizeof (struct llinfo_nd6), LLINFO_ND6_ZONE_MAX * sizeof (struct llinfo_nd6), 0, @@ -265,6 +267,7 @@ nd6_init() nd6_nbr_init(); nd6_rtr_init(); + nd6_prproxy_init(); nd6_init_done = 1; @@ -326,12 +329,38 @@ nd6_llinfo_get_ri(struct rtentry *rt, struct rt_reach_info *ri) if (lr == NULL) { bzero(ri, sizeof (*ri)); + ri->ri_rssi = IFNET_RSSI_UNKNOWN; + ri->ri_lqm = IFNET_LQM_THRESH_OFF; + ri->ri_npm = IFNET_NPM_THRESH_UNKNOWN; } else { IFLR_LOCK(lr); /* Export to rt_reach_info structure */ ifnet_lr2ri(lr, ri); - /* Export ND6 send expiration time */ - ri->ri_snd_expire = ifnet_llreach_up2cal(lr, ln->ln_lastused); + /* Export ND6 send expiration (calendar) time */ + ri->ri_snd_expire = + ifnet_llreach_up2calexp(lr, ln->ln_lastused); + IFLR_UNLOCK(lr); + } +} + +static void +nd6_llinfo_get_iflri(struct rtentry *rt, struct ifnet_llreach_info *iflri) +{ + struct llinfo_nd6 *ln = rt->rt_llinfo; + struct if_llreach *lr = ln->ln_llreach; + + if (lr == NULL) { + bzero(iflri, sizeof (*iflri)); + iflri->iflri_rssi = IFNET_RSSI_UNKNOWN; + iflri->iflri_lqm = IFNET_LQM_THRESH_OFF; + iflri->iflri_npm = IFNET_NPM_THRESH_UNKNOWN; + } else { + IFLR_LOCK(lr); + /* Export to ifnet_llreach_info structure */ + ifnet_lr2iflri(lr, iflri); + /* Export ND6 send expiration (uptime) time */ + iflri->iflri_snd_expire = + ifnet_llreach_up2upexp(lr, ln->ln_lastused); IFLR_UNLOCK(lr); } } @@ -339,7 +368,6 @@ nd6_llinfo_get_ri(struct rtentry *rt, struct rt_reach_info *ri) int nd6_ifattach(struct ifnet *ifp) { - /* * We have some arrays that should be indexed by if_index. * since if_index will grow dynamically, they should grow too. @@ -358,7 +386,7 @@ nd6_ifattach(struct ifnet *ifp) q = (caddr_t)_MALLOC(n, M_IP6NDP, M_WAITOK); if (q == NULL) { lck_rw_done(nd_if_rwlock); - return ENOBUFS; + return (ENOBUFS); } bzero(q, n); nd_ifinfo_indexlim = newlim; @@ -370,24 +398,19 @@ nd6_ifattach(struct ifnet *ifp) */ FREE((caddr_t)nd_ifinfo, M_IP6NDP); } - nd_ifinfo = (struct nd_ifinfo *)q; + nd_ifinfo = (struct nd_ifinfo *)(void *)q; } - lck_rw_done(nd_if_rwlock); #define ND nd_ifinfo[ifp->if_index] - /* * Don't initialize if called twice. - * XXX: to detect this, we should choose a member that is never set - * before initialization of the ND structure itself. We formaly used - * the linkmtu member, which was not suitable because it could be - * initialized via "ifconfig mtu". 
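nd6.c replaces the heap-allocated nd_if_rwlock with one embedded via decl_lck_rw_data(), and each nd_ifinfo entry gains its own mutex plus an explicit `initialized` flag (the old code inferred initialization from basereachable being nonzero). The timer paths that follow take the rwlock shared just long enough to find the entry, snapshot retrans and flags under the per-entry mutex, and drop both locks before doing any work. A pthreads sketch of that snapshot discipline, with all names illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct nd_info {
	pthread_mutex_t lock;
	bool		initialized;
	uint32_t	retrans;	/* ms */
	uint32_t	flags;
};

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct nd_info table[4];

/* Copy out the fields a caller needs; never act while holding locks. */
static bool
nd_info_snapshot(unsigned idx, uint32_t *retrans, uint32_t *flags)
{
	bool ok = false;

	pthread_rwlock_rdlock(&table_lock);
	if (idx < 4 && table[idx].initialized) {
		pthread_mutex_lock(&table[idx].lock);
		*retrans = table[idx].retrans;
		*flags = table[idx].flags;
		pthread_mutex_unlock(&table[idx].lock);
		ok = true;
	}
	pthread_rwlock_unlock(&table_lock);
	return (ok);
}

int
main(void)
{
	uint32_t r, f;

	pthread_mutex_init(&table[1].lock, NULL);
	table[1].initialized = true;
	table[1].retrans = 1000;

	if (nd_info_snapshot(1, &r, &f))
		printf("retrans=%u flags=%#x\n", r, f);
	return (0);
}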
*/ - lck_rw_lock_shared(nd_if_rwlock); - if (ND.basereachable) { + if (ND.initialized) { lck_rw_done(nd_if_rwlock); - return 0; + return (0); } + lck_mtx_init(&ND.lock, nd_if_lock_grp, nd_if_lock_attr); + ND.initialized = TRUE; ND.linkmtu = ifp->if_mtu; ND.chlim = IPV6_DEFHLIM; ND.basereachable = REACHABLE_TIME; @@ -395,10 +418,11 @@ nd6_ifattach(struct ifnet *ifp) ND.retrans = RETRANS_TIMER; ND.flags = ND6_IFF_PERFORMNUD; lck_rw_done(nd_if_rwlock); - nd6_setmtu(ifp); #undef ND - - return 0; + + nd6_setmtu(ifp); + + return (0); } /* @@ -416,12 +440,15 @@ nd6_setmtu(struct ifnet *ifp) * because this can be called directly from SIOCSIFMTU for IPv4 */ lck_rw_lock_shared(nd_if_rwlock); - if (ifp->if_index >= nd_ifinfo_indexlim) { + if (ifp->if_index >= nd_ifinfo_indexlim || + !nd_ifinfo[ifp->if_index].initialized) { lck_rw_done(nd_if_rwlock); - return; /* we're out of bound for nd_ifinfo */ + return; /* nd_ifinfo out of bound, or not yet initialized */ } ndi = &nd_ifinfo[ifp->if_index]; + VERIFY(ndi->initialized); + lck_mtx_lock(&ndi->lock); oldmaxmtu = ndi->maxmtu; /* @@ -449,6 +476,7 @@ nd6_setmtu(struct ifnet *ifp) ifp->if_name, ifp->if_unit, (uint32_t)ndi->maxmtu); } ndi->linkmtu = ifp->if_mtu; + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); /* also adjust in6_maxmtu if necessary. */ @@ -639,6 +667,8 @@ again: struct rtentry *rt; struct sockaddr_in6 *dst; struct llinfo_nd6 *next; + struct nd_ifinfo *ndi; + u_int32_t retrans, flags; /* ln_next/prev/rt is protected by rnh_lock */ next = ln->ln_next; @@ -667,7 +697,7 @@ again: } /* rt_key should never be NULL */ - dst = (struct sockaddr_in6 *)rt_key(rt); + dst = (struct sockaddr_in6 *)(void *)rt_key(rt); if (dst == NULL) { panic("%s: rt(%p) key is NULL ln(%p)", __func__, rt, ln); @@ -683,7 +713,6 @@ again: continue; } - /* Make a copy (we're using it read-only anyway) */ lck_rw_lock_shared(nd_if_rwlock); if (ifp->if_index >= nd_ifinfo_indexlim) { lck_rw_done(nd_if_rwlock); @@ -691,6 +720,12 @@ again: ln = next; continue; } + ndi = ND_IFINFO(ifp); + VERIFY(ndi->initialized); + lck_mtx_lock(&ndi->lock); + retrans = ndi->retrans; + flags = ndi->flags; + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); RT_LOCK_ASSERT_HELD(rt); @@ -699,15 +734,17 @@ again: case ND6_LLINFO_INCOMPLETE: if (ln->ln_asked < nd6_mmaxtries) { ln->ln_asked++; - lck_rw_lock_shared(nd_if_rwlock); - ln->ln_expire = timenow.tv_sec + - nd_ifinfo[ifp->if_index].retrans / 1000; - lck_rw_done(nd_if_rwlock); + ln->ln_expire = timenow.tv_sec + retrans / 1000; RT_ADDREF_LOCKED(rt); RT_UNLOCK(rt); lck_mtx_unlock(rnh_lock); - nd6_ns_output(ifp, NULL, &dst->sin6_addr, - ln, 0); + if (ip6_forwarding) { + nd6_prproxy_ns_output(ifp, NULL, + &dst->sin6_addr, ln); + } else { + nd6_ns_output(ifp, NULL, + &dst->sin6_addr, ln, 0); + } RT_REMREF(rt); } else { struct mbuf *m = ln->ln_hold; @@ -759,15 +796,11 @@ again: break; case ND6_LLINFO_DELAY: - lck_rw_lock_shared(nd_if_rwlock); - if ((nd_ifinfo[ifp->if_index].flags & - ND6_IFF_PERFORMNUD) != 0) { + if ((flags & ND6_IFF_PERFORMNUD) != 0) { /* We need NUD */ ln->ln_asked = 1; ln->ln_state = ND6_LLINFO_PROBE; - ln->ln_expire = timenow.tv_sec + - nd_ifinfo[ifp->if_index].retrans / 1000; - lck_rw_done(nd_if_rwlock); + ln->ln_expire = timenow.tv_sec + retrans / 1000; RT_ADDREF_LOCKED(rt); RT_UNLOCK(rt); lck_mtx_unlock(rnh_lock); @@ -778,7 +811,6 @@ again: RT_REMREF(rt); goto again; } - lck_rw_done(nd_if_rwlock); ln->ln_state = ND6_LLINFO_STALE; /* XXX */ ln->ln_expire = rt_expiry(rt, timenow.tv_sec, nd6_gctimer); @@ -788,10 +820,7 
@@ again: case ND6_LLINFO_PROBE: if (ln->ln_asked < nd6_umaxtries) { ln->ln_asked++; - lck_rw_lock_shared(nd_if_rwlock); - ln->ln_expire = timenow.tv_sec + - nd_ifinfo[ifp->if_index].retrans / 1000; - lck_rw_done(nd_if_rwlock); + ln->ln_expire = timenow.tv_sec + retrans / 1000; RT_ADDREF_LOCKED(rt); RT_UNLOCK(rt); lck_mtx_unlock(rnh_lock); @@ -872,15 +901,17 @@ addrloop: * addresses. Although we may have to restart the * loop (see below), it must be after purging the * address. Otherwise, we'd see an infinite loop of - * regeneration. + * regeneration. */ if (ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { - /* NOTE: We have to drop the lock here because - * regen_tmpaddr() eventually calls in6_update_ifa(), - * which must take the lock and would otherwise cause a - * hang. This is safe because the goto addrloop - * leads to a reevaluation of the in6_ifaddrs list + /* + * NOTE: We have to drop the lock here + * because regen_tmpaddr() eventually calls + * in6_update_ifa(), which must take the lock + * and would otherwise cause a hang. This is + * safe because the goto addrloop leads to a + * re-evaluation of the in6_ifaddrs list */ IFA_UNLOCK(&ia6->ia_ifa); lck_rw_done(&in6_ifaddr_rwlock); @@ -929,7 +960,7 @@ addrloop: * would not cause disaster (because * it's not a deletion, but an * addition,) we'd rather restart the - * loop just for safety. Or does this + * loop just for safety. Or does this * significantly reduce performance?? */ /* Release extra reference */ @@ -1011,6 +1042,64 @@ addrloop: lck_mtx_unlock(nd6_mutex); } +/* + * ND6 router advertisement kernel notification + */ +void +nd6_post_msg(u_int32_t code, struct nd_prefix_list *prefix_list, + u_int32_t list_length, u_int32_t mtu, char *dl_addr, u_int32_t dl_addr_len) +{ + struct kev_msg ev_msg; + struct kev_nd6_ra_data nd6_ra_msg_data; + struct nd_prefix_list *itr = prefix_list; + + bzero(&ev_msg, sizeof(struct kev_msg)); + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_ND6_SUBCLASS; + ev_msg.event_code = code; + + bzero(&nd6_ra_msg_data, sizeof(nd6_ra_msg_data)); + nd6_ra_msg_data.lladdrlen = (dl_addr_len <= ND6_ROUTER_LL_SIZE) ? 
+ dl_addr_len : ND6_ROUTER_LL_SIZE; + bcopy(dl_addr, &nd6_ra_msg_data.lladdr, nd6_ra_msg_data.lladdrlen); + + if (mtu > 0 && mtu >= IPV6_MMTU) { + nd6_ra_msg_data.mtu = mtu; + nd6_ra_msg_data.flags |= KEV_ND6_DATA_VALID_MTU; + } + + if (list_length > 0 && prefix_list != NULL) { + nd6_ra_msg_data.list_length = list_length; + nd6_ra_msg_data.flags |= KEV_ND6_DATA_VALID_PREFIX; + } + + while (itr != NULL && nd6_ra_msg_data.list_index < list_length) { + bcopy(&itr->pr.ndpr_prefix, &nd6_ra_msg_data.prefix.prefix, + sizeof (nd6_ra_msg_data.prefix.prefix)); + nd6_ra_msg_data.prefix.raflags = itr->pr.ndpr_raf; + nd6_ra_msg_data.prefix.prefixlen = itr->pr.ndpr_plen; + nd6_ra_msg_data.prefix.origin = PR_ORIG_RA; + nd6_ra_msg_data.prefix.vltime = itr->pr.ndpr_vltime; + nd6_ra_msg_data.prefix.pltime = itr->pr.ndpr_pltime; + nd6_ra_msg_data.prefix.expire = itr->pr.ndpr_expire; + nd6_ra_msg_data.prefix.flags = itr->pr.ndpr_stateflags; + nd6_ra_msg_data.prefix.refcnt = itr->pr.ndpr_addrcnt; + nd6_ra_msg_data.prefix.if_index = itr->pr.ndpr_ifp->if_index; + + /* send the message up */ + ev_msg.dv[0].data_ptr = &nd6_ra_msg_data; + ev_msg.dv[0].data_length = sizeof(nd6_ra_msg_data); + ev_msg.dv[1].data_length = 0; + kev_post_msg(&ev_msg); + + /* clean up for the next prefix */ + bzero(&nd6_ra_msg_data.prefix, sizeof(nd6_ra_msg_data.prefix)); + itr = itr->next; + nd6_ra_msg_data.list_index++; + } +} + /* * ND6 timer routine to expire default route list and prefix list */ @@ -1182,10 +1271,13 @@ nd6_purge( nd6_setdefaultiface(0); } - if (!ip6_forwarding && (ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { + /* + * Perform default router selection even when we are a router, + * if Scoped Routing is enabled. + */ + if (ip6_doscopedroute || !ip6_forwarding) { lck_mtx_lock(nd6_mutex); /* refresh default router list */ - defrouter_reset(); defrouter_select(ifp); lck_mtx_unlock(nd6_mutex); } @@ -1352,11 +1444,16 @@ nd6_lookup( * interfaces to a same link, install a link prefix to an interface, * and try to install a neighbor cache on an interface that does not * have a route to the prefix. + * + * If the address is from a proxied prefix, the ifa_ifp and ifp might + * not match, because nd6_na_input() could have modified the ifp + * of the route to point to the interface where the NA arrived on, + * hence the test for RTF_PROXY. */ - if (ifp == NULL || - (rt->rt_flags & RTF_GATEWAY) || (rt->rt_flags & RTF_LLINFO) == 0 || + if ((rt->rt_flags & RTF_GATEWAY) || (rt->rt_flags & RTF_LLINFO) == 0 || rt->rt_gateway->sa_family != AF_LINK || rt->rt_llinfo == NULL || - (ifp && rt->rt_ifa->ifa_ifp != ifp)) { + (ifp && rt->rt_ifa->ifa_ifp != ifp && + !(rt->rt_flags & RTF_PROXY))) { RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); if (create) { @@ -1401,7 +1498,7 @@ nd6_is_new_addr_neighbor( * content (XXX). */ sin6_copy = *addr; - if (sa6_recoverscope(&sin6_copy)) + if (sa6_recoverscope(&sin6_copy, FALSE)) return (0); /* XXX: should be impossible */ if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) return (0); @@ -1454,8 +1551,12 @@ nd6_is_new_addr_neighbor( * as on-link, and thus, as a neighbor. * XXX: we restrict the condition to hosts, because routers usually do * not have the "default router list". + * XXX: this block should eventually be removed (it is disabled when + * Scoped Routing is in effect); treating all destinations as on-link + * in the absence of a router is rather harmful. 
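nd6_post_msg() is new plumbing that fans a received router advertisement out to user space as kernel events: one kev_post_msg() call per prefix, reusing a single kev_nd6_ra_data record and re-zeroing its prefix part between iterations. A toy version of the walk; struct ra_event is a trimmed stand-in for the real record:

#include <stdio.h>
#include <string.h>

struct prefix { struct prefix *next; unsigned plen, vltime; };

struct ra_event {	/* trimmed stand-in for kev_nd6_ra_data */
	unsigned mtu;
	unsigned list_index, list_length;
	unsigned plen, vltime;
};

static void
post(const struct ra_event *ev)	/* cf. kev_post_msg() */
{
	printf("event %u/%u: plen=%u vltime=%u\n",
	    ev->list_index, ev->list_length, ev->plen, ev->vltime);
}

static void
post_ra(struct prefix *list, unsigned n, unsigned mtu)
{
	struct ra_event ev;

	memset(&ev, 0, sizeof (ev));
	ev.mtu = mtu;
	ev.list_length = n;
	for (struct prefix *p = list; p != NULL && ev.list_index < n;
	    p = p->next, ev.list_index++) {
		ev.plen = p->plen;	/* per-prefix payload */
		ev.vltime = p->vltime;
		post(&ev);		/* one event per prefix */
	}
}

int
main(void)
{
	struct prefix b = { NULL, 64, 3600 }, a = { &b, 64, 7200 };

	post_ra(&a, 2, 1500);
	return (0);
}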
*/ - if (!ip6_forwarding && TAILQ_FIRST(&nd_defrouter) == NULL && + if (!ip6_doscopedroute && !ip6_forwarding && + TAILQ_FIRST(&nd_defrouter) == NULL && nd6_defifindex == ifp->if_index) { return (1); } @@ -1516,7 +1617,7 @@ nd6_free( RT_LOCK(rt); RT_ADDREF_LOCKED(rt); /* Extra ref */ ln = rt->rt_llinfo; - in6 = ((struct sockaddr_in6 *)rt_key(rt))->sin6_addr; + in6 = ((struct sockaddr_in6 *)(void *)rt_key(rt))->sin6_addr; /* * Prevent another thread from modifying rt_key, rt_gateway @@ -1526,14 +1627,13 @@ nd6_free( rt->rt_flags |= RTF_CONDEMNED; /* - * we used to have pfctlinput(PRC_HOSTDEAD) here. - * even though it is not harmful, it was not really necessary. + * We used to have pfctlinput(PRC_HOSTDEAD) here. Even though it is + * not harmful, it was not really necessary. Perform default router + * selection even when we are a router, if Scoped Routing is enabled. */ - - if (!ip6_forwarding && (ip6_accept_rtadv || - (rt->rt_ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { - dr = defrouter_lookup(&((struct sockaddr_in6 *)rt_key(rt))-> - sin6_addr, rt->rt_ifp); + if (ip6_doscopedroute || !ip6_forwarding) { + dr = defrouter_lookup(&((struct sockaddr_in6 *)(void *) + rt_key(rt))->sin6_addr, rt->rt_ifp); if ((ln && ln->ln_router) || dr) { /* @@ -1655,9 +1755,14 @@ nd6_nud_hint( ln->ln_state = ND6_LLINFO_REACHABLE; if (ln->ln_expire) { + struct nd_ifinfo *ndi; + lck_rw_lock_shared(nd_if_rwlock); - ln->ln_expire = timenow.tv_sec + - nd_ifinfo[rt->rt_ifp->if_index].reachable; + ndi = ND_IFINFO(rt->rt_ifp); + VERIFY(ndi != NULL && ndi->initialized); + lck_mtx_lock(&ndi->lock); + ln->ln_expire = timenow.tv_sec + ndi->reachable; + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); } done: @@ -1820,6 +1925,7 @@ nd6_rtrequest( break; } rt->rt_llinfo_get_ri = nd6_llinfo_get_ri; + rt->rt_llinfo_get_iflri = nd6_llinfo_get_iflri; rt->rt_llinfo_purge = nd6_llinfo_purge; rt->rt_llinfo_free = nd6_llinfo_free; @@ -2005,20 +2111,28 @@ nd6_rtrequest( } } -static void +static int nd6_siocgdrlst(void *data, int data_is_64) { - struct in6_drlist_64 *drl_64 = (struct in6_drlist_64 *)data; - struct in6_drlist_32 *drl_32 = (struct in6_drlist_32 *)data; + struct in6_drlist_32 *drl_32; struct nd_defrouter *dr; int i = 0; lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); - bzero(data, data_is_64 ? 
sizeof (*drl_64) : sizeof (*drl_32)); dr = TAILQ_FIRST(&nd_defrouter); + + /* For 64-bit process */ if (data_is_64) { - /* For 64-bit process */ + struct in6_drlist_64 *drl_64; + + drl_64 = _MALLOC(sizeof (*drl_64), M_TEMP, M_WAITOK|M_ZERO); + if (drl_64 == NULL) + return (ENOMEM); + + /* preserve the interface name */ + bcopy(data, drl_64, sizeof (drl_64->ifname)); + while (dr && i < DRLSTSIZ) { drl_64->defrouter[i].rtaddr = dr->rtaddr; if (IN6_IS_ADDR_LINKLOCAL(&drl_64->defrouter[i].rtaddr)) { @@ -2037,9 +2151,19 @@ nd6_siocgdrlst(void *data, int data_is_64) i++; dr = TAILQ_NEXT(dr, dr_entry); } - return; + bcopy(drl_64, data, sizeof (*drl_64)); + _FREE(drl_64, M_TEMP); + return (0); } + /* For 32-bit process */ + drl_32 = _MALLOC(sizeof (*drl_32), M_TEMP, M_WAITOK|M_ZERO); + if (drl_32 == NULL) + return (ENOMEM); + + /* preserve the interface name */ + bcopy(data, drl_32, sizeof (drl_32->ifname)); + while (dr && i < DRLSTSIZ) { drl_32->defrouter[i].rtaddr = dr->rtaddr; if (IN6_IS_ADDR_LINKLOCAL(&drl_32->defrouter[i].rtaddr)) { @@ -2058,26 +2182,38 @@ nd6_siocgdrlst(void *data, int data_is_64) i++; dr = TAILQ_NEXT(dr, dr_entry); } + bcopy(drl_32, data, sizeof (*drl_32)); + _FREE(drl_32, M_TEMP); + return (0); } -static void +/* + * XXX meaning of fields, especialy "raflags", is very + * differnet between RA prefix list and RR/static prefix list. + * how about separating ioctls into two? + */ +static int nd6_siocgprlst(void *data, int data_is_64) { - struct in6_prlist_64 *prl_64 = (struct in6_prlist_64 *)data; - struct in6_prlist_32 *prl_32 = (struct in6_prlist_32 *)data; + struct in6_prlist_32 *prl_32; struct nd_prefix *pr; int i = 0; lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); - /* - * XXX meaning of fields, especialy "raflags", is very - * differnet between RA prefix list and RR/static prefix list. - * how about separating ioctls into two? - */ - bzero(data, data_is_64 ? 
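nd6_siocgdrlst() and nd6_siocgprlst() used to cast the ioctl `data` pointer to the large in6_drlist/in6_prlist structures and fill them in place; they now _MALLOC a zeroed kernel copy, preserve only the caller-supplied ifname, fill the copy, and bcopy it back in one aligned write, returning ENOMEM on allocation failure (hence the new int return type). That avoids unaligned stores through `data` and keeps partially-filled results from reaching the caller. A condensed user-space sketch; struct dr_list is a small stand-in:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct dr_list {	/* stand-in for struct in6_drlist_32/64 */
	char	 ifname[16];
	unsigned router[8];
	unsigned count;
};

static int
get_dr_list(void *data)		/* 'data' is the raw ioctl buffer */
{
	struct dr_list *l = calloc(1, sizeof (*l));

	if (l == NULL)
		return (ENOMEM);
	/* Preserve the one field the caller passed in. */
	memcpy(l->ifname, data, sizeof (l->ifname));
	l->router[0] = 0x2001;		/* fill from kernel state... */
	l->count = 1;
	memcpy(data, l, sizeof (*l));	/* single aligned copy-out */
	free(l);
	return (0);
}

int
main(void)
{
	struct dr_list out;

	memset(&out, 0, sizeof (out));
	snprintf(out.ifname, sizeof (out.ifname), "en0");
	if (get_dr_list(&out) == 0)
		printf("%s: %u router(s), first=%#x\n",
		    out.ifname, out.count, out.router[0]);
	return (0);
}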
sizeof (*prl_64) : sizeof (*prl_32)); + pr = nd_prefix.lh_first; + + /* For 64-bit process */ if (data_is_64) { - /* For 64-bit process */ + struct in6_prlist_64 *prl_64; + + prl_64 = _MALLOC(sizeof (*prl_64), M_TEMP, M_WAITOK|M_ZERO); + if (prl_64 == NULL) + return (ENOMEM); + + /* preserve the interface name */ + bcopy(data, prl_64, sizeof (prl_64->ifname)); + while (pr && i < PRLSTSIZ) { struct nd_pfxrouter *pfr; int j; @@ -2120,10 +2256,19 @@ nd6_siocgprlst(void *data, int data_is_64) i++; pr = pr->ndpr_next; } - - return; + bcopy(prl_64, data, sizeof (*prl_64)); + _FREE(prl_64, M_TEMP); + return (0); } + /* For 32-bit process */ + prl_32 = _MALLOC(sizeof (*prl_32), M_TEMP, M_WAITOK|M_ZERO); + if (prl_32 == NULL) + return (ENOMEM); + + /* preserve the interface name */ + bcopy(data, prl_32, sizeof (prl_32->ifname)); + while (pr && i < PRLSTSIZ) { struct nd_pfxrouter *pfr; int j; @@ -2166,75 +2311,108 @@ nd6_siocgprlst(void *data, int data_is_64) i++; pr = pr->ndpr_next; } + bcopy(prl_32, data, sizeof (*prl_32)); + _FREE(prl_32, M_TEMP); + return (0); } int nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { - struct in6_ndireq *ndi = (struct in6_ndireq *)data; - struct in6_ondireq *ondi = (struct in6_ondireq *)data; struct nd_defrouter *dr; struct nd_prefix *pr; struct rtentry *rt; int i = ifp->if_index, error = 0; switch (cmd) { - case SIOCGDRLST_IN6_32: - case SIOCGDRLST_IN6_64: + case SIOCGDRLST_IN6_32: /* struct in6_drlist_32 */ + case SIOCGDRLST_IN6_64: /* struct in6_drlist_64 */ /* * obsolete API, use sysctl under net.inet6.icmp6 */ lck_mtx_lock(nd6_mutex); - nd6_siocgdrlst(data, cmd == SIOCGDRLST_IN6_64); + error = nd6_siocgdrlst(data, cmd == SIOCGDRLST_IN6_64); lck_mtx_unlock(nd6_mutex); break; - case SIOCGPRLST_IN6_32: - case SIOCGPRLST_IN6_64: + case SIOCGPRLST_IN6_32: /* struct in6_prlist_32 */ + case SIOCGPRLST_IN6_64: /* struct in6_prlist_64 */ /* * obsolete API, use sysctl under net.inet6.icmp6 */ lck_mtx_lock(nd6_mutex); - nd6_siocgprlst(data, cmd == SIOCGPRLST_IN6_64); + error = nd6_siocgprlst(data, cmd == SIOCGPRLST_IN6_64); lck_mtx_unlock(nd6_mutex); break; - case OSIOCGIFINFO_IN6: - case SIOCGIFINFO_IN6: + case OSIOCGIFINFO_IN6: /* struct in6_ondireq */ + case SIOCGIFINFO_IN6: { /* struct in6_ondireq */ + u_int32_t linkmtu; + struct in6_ondireq *ondi = (struct in6_ondireq *)(void *)data; + struct nd_ifinfo *ndi; /* * SIOCGIFINFO_IN6 ioctl is encoded with in6_ondireq * instead of in6_ndireq, so we treat it as such. 
*/ lck_rw_lock_shared(nd_if_rwlock); - if (!nd_ifinfo || i >= nd_ifinfo_indexlim) { + ndi = ND_IFINFO(ifp); + if (!nd_ifinfo || i >= nd_ifinfo_indexlim || + !ndi->initialized) { lck_rw_done(nd_if_rwlock); error = EINVAL; break; } - ondi->ndi.linkmtu = IN6_LINKMTU(ifp); - ondi->ndi.maxmtu = nd_ifinfo[i].maxmtu; - ondi->ndi.basereachable = nd_ifinfo[i].basereachable; - ondi->ndi.reachable = nd_ifinfo[i].reachable; - ondi->ndi.retrans = nd_ifinfo[i].retrans; - ondi->ndi.flags = nd_ifinfo[i].flags; - ondi->ndi.recalctm = nd_ifinfo[i].recalctm; + lck_mtx_lock(&ndi->lock); + linkmtu = IN6_LINKMTU(ifp); + bcopy(&linkmtu, &ondi->ndi.linkmtu, sizeof (linkmtu)); + bcopy(&nd_ifinfo[i].maxmtu, &ondi->ndi.maxmtu, + sizeof (u_int32_t)); + bcopy(&nd_ifinfo[i].basereachable, &ondi->ndi.basereachable, + sizeof (u_int32_t)); + bcopy(&nd_ifinfo[i].reachable, &ondi->ndi.reachable, + sizeof (u_int32_t)); + bcopy(&nd_ifinfo[i].retrans, &ondi->ndi.retrans, + sizeof (u_int32_t)); + bcopy(&nd_ifinfo[i].flags, &ondi->ndi.flags, + sizeof (u_int32_t)); + bcopy(&nd_ifinfo[i].recalctm, &ondi->ndi.recalctm, + sizeof (int)); ondi->ndi.chlim = nd_ifinfo[i].chlim; + ondi->ndi.receivedra = 0; + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); break; + } - case SIOCSIFINFO_FLAGS: - /* XXX: almost all other fields of ndi->ndi is unused */ + case SIOCSIFINFO_FLAGS: { /* struct in6_ndireq */ + struct in6_ndireq *cndi = (struct in6_ndireq *)(void *)data; + u_int32_t oflags, flags; + struct nd_ifinfo *ndi; + + /* XXX: almost all other fields of cndi->ndi is unused */ lck_rw_lock_shared(nd_if_rwlock); - if (!nd_ifinfo || i >= nd_ifinfo_indexlim) { + ndi = ND_IFINFO(ifp); + if (!nd_ifinfo || i >= nd_ifinfo_indexlim || + !ndi->initialized) { lck_rw_done(nd_if_rwlock); error = EINVAL; break; } - nd_ifinfo[i].flags = ndi->ndi.flags; + lck_mtx_lock(&ndi->lock); + oflags = nd_ifinfo[i].flags; + bcopy(&cndi->ndi.flags, &nd_ifinfo[i].flags, sizeof (flags)); + flags = nd_ifinfo[i].flags; + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); + + if (oflags == flags) + break; + + error = nd6_setifinfo(ifp, oflags, flags); break; + } - case SIOCSNDFLUSH_IN6: /* XXX: the ioctl name is confusing... 
*/ + case SIOCSNDFLUSH_IN6: /* struct in6_ifreq */ /* flush default router list */ /* * xxx sumikawa: should not delete route if default @@ -2247,7 +2425,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) /* xxx sumikawa: flush prefix list */ break; - case SIOCSPFXFLUSH_IN6: { + case SIOCSPFXFLUSH_IN6: { /* struct in6_ifreq */ /* flush all the prefix advertised by routers */ struct nd_prefix *next; @@ -2285,9 +2463,9 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) lck_rw_done(&in6_ifaddr_rwlock); lck_mtx_unlock(nd6_mutex); in6_purgeaddr(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); lck_mtx_lock(nd6_mutex); lck_rw_lock_exclusive(&in6_ifaddr_rwlock); - IFA_REMREF(&ia->ia_ifa); /* * Purging the address caused * in6_ifaddr_rwlock to be @@ -2324,7 +2502,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) break; } - case SIOCSRTRFLUSH_IN6: { + case SIOCSRTRFLUSH_IN6: { /* struct in6_ifreq */ /* flush all the default routers */ struct nd_defrouter *next; @@ -2347,19 +2525,21 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) break; } - case SIOCGNBRINFO_IN6_32: { + case SIOCGNBRINFO_IN6_32: { /* struct in6_nbrinfo_32 */ struct llinfo_nd6 *ln; - struct in6_nbrinfo_32 *nbi_32 = (struct in6_nbrinfo_32 *)data; - /* make local for safety */ - struct in6_addr nb_addr = nbi_32->addr; + struct in6_nbrinfo_32 nbi_32; + struct in6_addr nb_addr; /* make local for safety */ + bcopy(data, &nbi_32, sizeof (nbi_32)); + nb_addr = nbi_32.addr; /* * XXX: KAME specific hack for scoped addresses * XXXX: for other scopes than link-local? */ - if (IN6_IS_ADDR_LINKLOCAL(&nbi_32->addr) || - IN6_IS_ADDR_MC_LINKLOCAL(&nbi_32->addr)) { - u_int16_t *idp = (u_int16_t *)&nb_addr.s6_addr[2]; + if (IN6_IS_ADDR_LINKLOCAL(&nbi_32.addr) || + IN6_IS_ADDR_MC_LINKLOCAL(&nbi_32.addr)) { + u_int16_t *idp = + (u_int16_t *)(void *)&nb_addr.s6_addr[2]; if (*idp == 0) *idp = htons(ifp->if_index); @@ -2372,28 +2552,31 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) } RT_LOCK_ASSERT_HELD(rt); ln = rt->rt_llinfo; - nbi_32->state = ln->ln_state; - nbi_32->asked = ln->ln_asked; - nbi_32->isrouter = ln->ln_router; - nbi_32->expire = ln->ln_expire; + nbi_32.state = ln->ln_state; + nbi_32.asked = ln->ln_asked; + nbi_32.isrouter = ln->ln_router; + nbi_32.expire = ln->ln_expire; RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); + bcopy(&nbi_32, data, sizeof (nbi_32)); break; } - case SIOCGNBRINFO_IN6_64: { + case SIOCGNBRINFO_IN6_64: { /* struct in6_nbrinfo_64 */ struct llinfo_nd6 *ln; - struct in6_nbrinfo_64 *nbi_64 = (struct in6_nbrinfo_64 *)data; - /* make local for safety */ - struct in6_addr nb_addr = nbi_64->addr; + struct in6_nbrinfo_64 nbi_64; + struct in6_addr nb_addr; /* make local for safety */ + bcopy(data, &nbi_64, sizeof (nbi_64)); + nb_addr = nbi_64.addr; /* * XXX: KAME specific hack for scoped addresses * XXXX: for other scopes than link-local? 
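The SIOCGNBRINFO handlers above switch from writing through the raw `data` pointer to a bcopy'd local struct, and they keep the KAME convention of embedding the interface index into bytes 2-3 of a link-local address before the neighbor lookup. A sketch of the embedding on a plain 16-byte address; memcpy is used here instead of the kernel's (void *) cast so the sketch makes no alignment assumptions:

#include <arpa/inet.h>	/* htons */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Embed the scope (interface index) into a link-local address. */
static void
embed_scope(uint8_t addr[16], uint16_t ifindex)
{
	uint16_t id;

	memcpy(&id, &addr[2], sizeof (id));
	if (id == 0) {			/* only if not already scoped */
		id = htons(ifindex);
		memcpy(&addr[2], &id, sizeof (id));
	}
}

int
main(void)
{
	uint8_t ll[16] = { 0xfe, 0x80 };	/* fe80::1 */

	ll[15] = 1;
	embed_scope(ll, 4);			/* as if for ifindex 4 */
	printf("bytes 2-3: %02x%02x\n", ll[2], ll[3]);
	return (0);
}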
*/ - if (IN6_IS_ADDR_LINKLOCAL(&nbi_64->addr) || - IN6_IS_ADDR_MC_LINKLOCAL(&nbi_64->addr)) { - u_int16_t *idp = (u_int16_t *)&nb_addr.s6_addr[2]; + if (IN6_IS_ADDR_LINKLOCAL(&nbi_64.addr) || + IN6_IS_ADDR_MC_LINKLOCAL(&nbi_64.addr)) { + u_int16_t *idp = + (u_int16_t *)(void *)&nb_addr.s6_addr[2]; if (*idp == 0) *idp = htons(ifp->if_index); @@ -2406,34 +2589,50 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) } RT_LOCK_ASSERT_HELD(rt); ln = rt->rt_llinfo; - nbi_64->state = ln->ln_state; - nbi_64->asked = ln->ln_asked; - nbi_64->isrouter = ln->ln_router; - nbi_64->expire = ln->ln_expire; + nbi_64.state = ln->ln_state; + nbi_64.asked = ln->ln_asked; + nbi_64.isrouter = ln->ln_router; + nbi_64.expire = ln->ln_expire; RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); + bcopy(&nbi_64, data, sizeof (nbi_64)); break; } - case SIOCGDEFIFACE_IN6_32: /* XXX: should be implemented as a sysctl? */ - case SIOCGDEFIFACE_IN6_64: { - struct in6_ndifreq_64 *ndif_64 = (struct in6_ndifreq_64 *)data; - struct in6_ndifreq_32 *ndif_32 = (struct in6_ndifreq_32 *)data; + case SIOCGDEFIFACE_IN6_32: /* struct in6_ndifreq_32 */ + case SIOCGDEFIFACE_IN6_64: { /* struct in6_ndifreq_64 */ + struct in6_ndifreq_64 *ndif_64 = + (struct in6_ndifreq_64 *)(void *)data; + struct in6_ndifreq_32 *ndif_32 = + (struct in6_ndifreq_32 *)(void *)data; - if (cmd == SIOCGDEFIFACE_IN6_64) - ndif_64->ifindex = nd6_defifindex; - else - ndif_32->ifindex = nd6_defifindex; + if (cmd == SIOCGDEFIFACE_IN6_64) { + u_int64_t j = nd6_defifindex; + bcopy(&j, &ndif_64->ifindex, sizeof (j)); + } else { + bcopy(&nd6_defifindex, &ndif_32->ifindex, + sizeof (u_int32_t)); + } break; } - case SIOCSDEFIFACE_IN6_32: /* XXX: should be implemented as a sysctl? */ - case SIOCSDEFIFACE_IN6_64: { - struct in6_ndifreq_64 *ndif_64 = (struct in6_ndifreq_64 *)data; - struct in6_ndifreq_32 *ndif_32 = (struct in6_ndifreq_32 *)data; + case SIOCSDEFIFACE_IN6_32: /* struct in6_ndifreq_32 */ + case SIOCSDEFIFACE_IN6_64: { /* struct in6_ndifreq_64 */ + struct in6_ndifreq_64 *ndif_64 = + (struct in6_ndifreq_64 *)(void *)data; + struct in6_ndifreq_32 *ndif_32 = + (struct in6_ndifreq_32 *)(void *)data; + u_int32_t idx; - error = nd6_setdefaultiface(cmd == SIOCSDEFIFACE_IN6_64 ? - ndif_64->ifindex : ndif_32->ifindex); + if (cmd == SIOCSDEFIFACE_IN6_64) { + u_int64_t j; + bcopy(&ndif_64->ifindex, &j, sizeof (j)); + idx = (u_int32_t)j; + } else { + bcopy(&ndif_32->ifindex, &idx, sizeof (idx)); + } + + error = nd6_setdefaultiface(idx); return (error); /* NOTREACHED */ } @@ -2592,7 +2791,7 @@ fail: * set the 2nd argument as the 1st one. */ RT_UNLOCK(rt); - nd6_output(ifp, ifp, m, &sin6, rt); + nd6_output(ifp, ifp, m, &sin6, rt, NULL); RT_LOCK(rt); } } else if (ln->ln_state == ND6_LLINFO_INCOMPLETE) { @@ -2672,17 +2871,12 @@ fail: * created, it might affect the selection policy. * Question: can we restrict the first condition to the "is_newentry" * case? - * XXX: when we hear an RA from a new router with the link-layer - * address option, defrouter_select() is called twice, since - * defrtrlist_update called the function as well. However, I believe - * we can compromise the overhead, since it only happens the first - * time. - * XXX: although defrouter_select() should not have a bad effect - * for those are not autoconfigured hosts, we explicitly avoid such - * cases for safety. + * + * Note: Perform default router selection even when we are a router, + * if Scoped Routing is enabled. 
*/ - if (do_update && ln->ln_router && !ip6_forwarding && - (ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { + if (do_update && ln->ln_router && + (ip6_doscopedroute || !ip6_forwarding)) { RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); lck_mtx_lock(nd6_mutex); @@ -2706,6 +2900,9 @@ nd6_slowtimo( if (!nd_ifinfo || i >= nd_ifinfo_indexlim) break; nd6if = &nd_ifinfo[i]; + if (!nd6if->initialized) + break; + lck_mtx_lock(&nd6if->lock); if (nd6if->basereachable && /* already initialized */ (nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) { /* @@ -2717,6 +2914,7 @@ nd6_slowtimo( nd6if->recalctm = nd6_recalc_reachtm_interval; nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable); } + lck_mtx_unlock(&nd6if->lock); } lck_rw_done(nd_if_rwlock); timeout(nd6_slowtimo, (caddr_t)0, ND6_SLOWTIMER_INTERVAL * hz); @@ -2725,7 +2923,7 @@ nd6_slowtimo( #define senderr(e) { error = (e); goto bad;} int nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, - struct sockaddr_in6 *dst, struct rtentry *hint0) + struct sockaddr_in6 *dst, struct rtentry *hint0, struct flowadv *adv) { struct mbuf *m = m0; struct rtentry *rt = hint0, *hint = hint0; @@ -2733,6 +2931,7 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, int error = 0; struct timeval timenow; struct rtentry *rtrele = NULL; + struct nd_ifinfo *ndi; if (rt != NULL) { RT_LOCK_SPIN(rt); @@ -2758,7 +2957,7 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, * route with a reference held for that placeholder. * * This logic is similar to, though not exactly the same as the one - * used by arp_route_to_gateway_route(). + * used by route_to_gwroute(). */ if (rt != NULL) { /* @@ -2777,7 +2976,7 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, /* XXX: loop care? */ RT_UNLOCK(rt); error = nd6_output(ifp, origifp, m0, - dst, rt); + dst, rt, adv); rtfree(rt); return (error); } @@ -2821,7 +3020,7 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, } RT_LOCK_SPIN(rt); - gw6 = *((struct sockaddr_in6 *)rt->rt_gateway); + gw6 = *((struct sockaddr_in6 *)(void *)rt->rt_gateway); /* If hint is now down, give up */ if (!(rt->rt_flags & RTF_UP)) { @@ -2844,15 +3043,15 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, */ RT_LOCK_SPIN(gwrt); if (!(gwrt->rt_flags & RTF_UP)) { - struct rtentry *ogwrt; - rt->rt_gwroute = NULL; RT_UNLOCK(gwrt); RT_UNLOCK(rt); rtfree(gwrt); lookup: - gwrt = rtalloc1_scoped((struct sockaddr *)&gw6, - 1, 0, ifp->if_index); + lck_mtx_lock(rnh_lock); + gwrt = rtalloc1_scoped_locked( + (struct sockaddr *)&gw6, 1, 0, + ifp->if_index); RT_LOCK(rt); /* @@ -2869,57 +3068,68 @@ lookup: } RT_UNLOCK(rt); if (gwrt != NULL) - rtfree(gwrt); + rtfree_locked(gwrt); + lck_mtx_unlock(rnh_lock); senderr(EHOSTUNREACH); } - - /* Remove any existing gwrt */ - ogwrt = rt->rt_gwroute; - if ((rt->rt_gwroute = gwrt) != NULL) - RT_ADDREF(gwrt); - + VERIFY(gwrt != NULL); + /* + * Set gateway route; callee adds ref to gwrt; + * gwrt has an extra ref from rtalloc1() for + * this routine. 
+ */ + rt_set_gwroute(rt, rt_key(rt), gwrt); RT_UNLOCK(rt); - /* Now free the replaced gwrt */ - if (ogwrt != NULL) - rtfree(ogwrt); - /* If still no route to gateway, bail out */ - if (gwrt == NULL) - senderr(EHOSTUNREACH); + lck_mtx_unlock(rnh_lock); /* Remember to release/free "rt" at the end */ rtrele = rt; rt = gwrt; - RT_LOCK_SPIN(rt); - /* If gwrt is now down, give up */ - if (!(rt->rt_flags & RTF_UP)) { - RT_UNLOCK(rt); - rtfree(rt); - rt = NULL; - /* "rtrele" == original "rt" */ - senderr(EHOSTUNREACH); - } } else { RT_ADDREF_LOCKED(gwrt); RT_UNLOCK(gwrt); RT_UNLOCK(rt); - RT_LOCK_SPIN(gwrt); - /* If gwrt is now down, give up */ - if (!(gwrt->rt_flags & RTF_UP)) { - RT_UNLOCK(gwrt); - rtfree(gwrt); - senderr(EHOSTUNREACH); - } /* Remember to release/free "rt" at the end */ rtrele = rt; rt = gwrt; } + VERIFY(rt == gwrt); + + /* + * This is an opportunity to revalidate the parent + * route's gwroute, in case it now points to a dead + * route entry. Parent route won't go away since the + * clone (hint) holds a reference to it. rt == gwrt. + */ + RT_LOCK_SPIN(hint); + if ((hint->rt_flags & (RTF_WASCLONED | RTF_UP)) == + (RTF_WASCLONED | RTF_UP)) { + struct rtentry *prt = hint->rt_parent; + VERIFY(prt != NULL); + + RT_CONVERT_LOCK(hint); + RT_ADDREF(prt); + RT_UNLOCK(hint); + rt_revalidate_gwroute(prt, rt); + RT_REMREF(prt); + } else { + RT_UNLOCK(hint); + } + + RT_LOCK_SPIN(rt); + /* rt == gwrt; if it is now down, give up */ + if (!(rt->rt_flags & RTF_UP)) { + RT_UNLOCK(rt); + rtfree(rt); + rt = NULL; + /* "rtrele" == original "rt" */ + senderr(EHOSTUNREACH); + } } + /* Become a regular mutex */ RT_CONVERT_LOCK(rt); } - if (rt != NULL) - RT_LOCK_ASSERT_HELD(rt); - /* * Address resolution or Neighbor Unreachability Detection * for the next hop. @@ -2970,8 +3180,12 @@ lookup: if (rt != NULL) RT_UNLOCK(rt); lck_rw_lock_shared(nd_if_rwlock); + ndi = ND_IFINFO(ifp); + VERIFY(ndi != NULL && ndi->initialized); + lck_mtx_lock(&ndi->lock); if ((ifp->if_flags & IFF_POINTOPOINT) == 0 && - !(nd_ifinfo[ifp->if_index].flags & ND6_IFF_PERFORMNUD)) { + !(ndi->flags & ND6_IFF_PERFORMNUD)) { + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); log(LOG_DEBUG, "nd6_output: can't allocate llinfo for %s " @@ -2979,6 +3193,7 @@ lookup: ip6_sprintf(&dst->sin6_addr), ln, rt); senderr(EIO); /* XXX: good error? */ } + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); goto sendpkt; /* send anyway */ @@ -3047,12 +3262,18 @@ lookup: ln->ln_expire < timenow.tv_sec) { ln->ln_asked++; lck_rw_lock_shared(nd_if_rwlock); - ln->ln_expire = timenow.tv_sec + - nd_ifinfo[ifp->if_index].retrans / 1000; + ndi = ND_IFINFO(ifp); + VERIFY(ndi != NULL && ndi->initialized); + lck_mtx_lock(&ndi->lock); + ln->ln_expire = timenow.tv_sec + ndi->retrans / 1000; + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); RT_UNLOCK(rt); /* We still have a reference on rt (for ln) */ - nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, 0); + if (ip6_forwarding) + nd6_prproxy_ns_output(ifp, NULL, &dst->sin6_addr, ln); + else + nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, 0); } else { RT_UNLOCK(rt); } @@ -3087,7 +3308,10 @@ sendpkt: /* discard the packet if IPv6 operation is disabled on the interface */ lck_rw_lock_shared(nd_if_rwlock); - if ((nd_ifinfo[ifp->if_index].flags & ND6_IFF_IFDISABLED)) { + ndi = ND_IFINFO(ifp); + VERIFY(ndi != NULL && ndi->initialized); + /* test is done here without holding ndi lock, for performance */ + if (ndi->flags & ND6_IFF_IFDISABLED) { lck_rw_done(nd_if_rwlock); error = ENETDOWN; /* better error? 
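nd6_output() now threads a `struct flowadv` out-parameter through to dlil_output(), matching the ip6oa_flowadv field added to ip6_out_args earlier in this patch: a driver-managed transmit queue can report back-pressure to the caller instead of silently queueing or dropping. A toy model of the contract; the code names are modeled on the kernel's FADV_* values, but the numbers and the queue-length threshold are illustrative:

#include <stdio.h>

enum flowadv_code { FADV_SUCCESS, FADV_FLOW_CONTROLLED };

struct flowadv { enum flowadv_code code; };

/* A transmit hook that signals back-pressure via 'adv'. */
static int
if_output(int qlen, struct flowadv *adv)
{
	if (adv != NULL)
		adv->code = (qlen > 100) ?
		    FADV_FLOW_CONTROLLED : FADV_SUCCESS;
	return (0);	/* the packet itself was accepted */
}

int
main(void)
{
	struct flowadv adv = { FADV_SUCCESS };

	if_output(150, &adv);
	if (adv.code == FADV_FLOW_CONTROLLED)
		printf("flow controlled: caller should pause this flow\n");
	return (0);
}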
*/ goto bad; @@ -3098,7 +3322,7 @@ sendpkt: /* forwarding rules require the original scope_id */ m->m_pkthdr.rcvif = origifp; error = dlil_output(origifp, PF_INET6, m, (caddr_t)rt, - (struct sockaddr *)dst, 0); + (struct sockaddr *)dst, 0, adv); goto release; } else { /* Do not allow loopback address to wind up on a wire */ @@ -3133,7 +3357,7 @@ sendpkt: m->m_pkthdr.rcvif = NULL; error = dlil_output(ifp, PF_INET6, m, (caddr_t)rt, - (struct sockaddr *)dst, 0); + (struct sockaddr *)dst, 0, adv); goto release; bad: @@ -3287,8 +3511,8 @@ nd6_lookup_ipv6(ifnet_t ifp, const struct sockaddr_in6 *ip6_dest, * Callee holds a reference on the route and returns * with the route entry locked, upon success. */ - result = arp_route_to_gateway_route( - (const struct sockaddr*)ip6_dest, hint, &route); + result = route_to_gwroute((const struct sockaddr *)ip6_dest, + hint, &route); if (result != 0) return (result); if (route != NULL) @@ -3309,7 +3533,7 @@ nd6_lookup_ipv6(ifnet_t ifp, const struct sockaddr_in6 *ip6_dest, if (route == NULL) { /* * This could happen, if we could not allocate memory or - * if arp_route_to_gateway_route() didn't return a route. + * if route_to_gwroute() didn't return a route. */ result = ENOBUFS; goto release; @@ -3345,6 +3569,21 @@ release: return (result); } +int +nd6_setifinfo(struct ifnet *ifp, u_int32_t before, u_int32_t after) +{ + /* + * We only care about ND6_IFF_PROXY_PREFIXES for now. + */ + before &= ND6_IFF_PROXY_PREFIXES; + after &= ND6_IFF_PROXY_PREFIXES; + + if (before == after) + return (0); + + return (nd6_if_prproxy(ifp, ((int32_t)(after - before) > 0))); +} + SYSCTL_DECL(_net_inet6_icmp6); static int @@ -3366,8 +3605,9 @@ nd6_sysctl_drlist SYSCTL_HANDLER_ARGS for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { - d = (struct in6_defrouter_64 *)buf; - de = (struct in6_defrouter_64 *)(buf + sizeof (buf)); + d = (struct in6_defrouter_64 *)(void *)buf; + de = (struct in6_defrouter_64 *) + (void *)(buf + sizeof (buf)); if (d + 1 <= de) { bzero(d, sizeof (*d)); @@ -3398,8 +3638,9 @@ nd6_sysctl_drlist SYSCTL_HANDLER_ARGS for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { - d_32 = (struct in6_defrouter_32 *)buf; - de_32 = (struct in6_defrouter_32 *)(buf + sizeof (buf)); + d_32 = (struct in6_defrouter_32 *)(void *)buf; + de_32 = (struct in6_defrouter_32 *) + (void *)(buf + sizeof (buf)); if (d_32 + 1 <= de_32) { bzero(d_32, sizeof (*d_32)); @@ -3451,8 +3692,9 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS struct sockaddr_in6 *sin6, *s6; struct nd_pfxrouter *pfr; - p = (struct in6_prefix_64 *)buf; - pe = (struct in6_prefix_64 *)(buf + sizeof (buf)); + p = (struct in6_prefix_64 *)(void *)buf; + pe = (struct in6_prefix_64 *) + (void *)(buf + sizeof (buf)); if (p + 1 <= pe) { bzero(p, sizeof (*p)); @@ -3515,8 +3757,9 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS struct sockaddr_in6 *sin6, *s6; struct nd_pfxrouter *pfr; - p_32 = (struct in6_prefix_32 *)buf; - pe_32 = (struct in6_prefix_32 *)(buf + sizeof (buf)); + p_32 = (struct in6_prefix_32 *)(void *)buf; + pe_32 = (struct in6_prefix_32 *) + (void *)(buf + sizeof (buf)); if (p_32 + 1 <= pe_32) { bzero(p_32, sizeof (*p_32)); diff --git a/bsd/netinet6/nd6.h b/bsd/netinet6/nd6.h index 601e075aa..a831fb5ee 100644 --- a/bsd/netinet6/nd6.h +++ b/bsd/netinet6/nd6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * @@ -70,7 +70,9 @@ #include #ifdef XNU_KERNEL_PRIVATE +#include #include +#include struct llinfo_nd6 { /* @@ -117,16 +119,17 @@ struct llinfo_nd6 { #ifdef XNU_KERNEL_PRIVATE #define ND6_IS_LLINFO_PROBREACH(n) ((n)->ln_state > ND6_LLINFO_INCOMPLETE) #define ND6_LLINFO_PERMANENT(n) (((n)->ln_expire == 0) && ((n)->ln_state > ND6_LLINFO_INCOMPLETE)) -#define ND6_IFF_PERFORMNUD 0x1 -#define ND6_IFF_ACCEPT_RTADV 0x2 /* APPLE: not used. Interface-specific router advertisements are - * handled with a specific ifnet flag: IFEF_ACCEPT_RTADVD - */ -#define ND6_IFF_PREFER_SOURCE 0x4 /* APPLE: NOT USED; not related to ND. */ -#define ND6_IFF_IFDISABLED 0x8 /* IPv6 operation is disabled due to - * DAD failure. (XXX: not ND-specific) - */ -#define ND6_IFF_DONT_SET_IFROUTE 0x10 /* NOT USED */ +#define ND6_EUI64_GBIT 0x01 +#define ND6_EUI64_UBIT 0x02 + +#define ND6_EUI64_TO_IFID(in6) do {(in6)->s6_addr[8] ^= ND6_EUI64_UBIT; } while (0) +#define ND6_EUI64_GROUP(in6) ((in6)->s6_addr[8] & ND6_EUI64_GBIT) +#define ND6_EUI64_INDIVIDUAL(in6) (!ND6_EUI64_GROUP(in6)) +#define ND6_EUI64_LOCAL(in6) ((in6)->s6_addr[8] & ND6_EUI64_UBIT) +#define ND6_EUI64_UNIVERSAL(in6) (!ND6_EUI64_LOCAL(in6)) +#define ND6_IFID_LOCAL(in6) (!ND6_EUI64_LOCAL(in6)) +#define ND6_IFID_UNIVERSAL(in6) (!ND6_EUI64_UNIVERSAL(in6)) #endif /* XNU_KERNEL_PRIVATE */ #if !defined(XNU_KERNEL_PRIVATE) @@ -145,13 +148,15 @@ struct nd_ifinfo_compat { u_int8_t chlim; /* CurHopLimit */ u_int8_t receivedra; /* the following 3 members are for privacy extension for addrconf */ - u_int8_t randomseed0[8]; /* upper 64 bits of MD5 digest */ + u_int8_t randomseed0[8]; /* upper 64 bits of SHA1 digest */ u_int8_t randomseed1[8]; /* lower 64 bits (usually the EUI64 IFID) */ u_int8_t randomid[8]; /* current random ID */ }; #if defined(XNU_KERNEL_PRIVATE) struct nd_ifinfo { + decl_lck_mtx_data(, lock); + boolean_t initialized; /* Flag to see if the entry is initialized */ u_int32_t linkmtu; /* LinkMTU */ u_int32_t maxmtu; /* Upper bound of LinkMTU */ u_int32_t basereachable; /* BaseReachableTime */ @@ -160,9 +165,9 @@ struct nd_ifinfo { u_int32_t flags; /* Flags */ int recalctm; /* BaseReachable re-calculation timer */ u_int8_t chlim; /* CurHopLimit */ - u_int8_t initialized; /* Flag to see if the entry is initialized */ + u_int8_t _pad[3]; /* the following 3 members are for privacy extension for addrconf */ - u_int8_t randomseed0[8]; /* upper 64 bits of MD5 digest */ + u_int8_t randomseed0[8]; /* upper 64 bits of SHA1 digest */ u_int8_t randomseed1[8]; /* lower 64 bits (usually the EUI64 IFID) */ u_int8_t randomid[8]; /* current random ID */ /* keep track of routers and prefixes on this link */ @@ -171,7 +176,20 @@ }; #endif /* XNU_KERNEL_PRIVATE */ -#define ND6_IFF_PERFORMNUD 0x1 +#define ND6_IFF_PERFORMNUD 0x1 +#if defined(PRIVATE) +#define ND6_IFF_ACCEPT_RTADV 0x2 /* APPLE: not used. Interface-specific router + * advertisements are handled with a specific ifnet + * flag: IFEF_ACCEPT_RTADVD + */ +#define ND6_IFF_PREFER_SOURCE 0x4 /* APPLE: NOT USED; not related to ND. */ +#define ND6_IFF_IFDISABLED 0x8 /* IPv6 operation is disabled due to + * DAD failure. (XXX: not ND-specific) + */ +#define ND6_IFF_DONT_SET_IFROUTE 0x10 /* NOT USED */ +#endif /* PRIVATE */ +#define ND6_IFF_PROXY_PREFIXES 0x20 +#define ND6_IFF_IGNORE_NA 0x40 struct in6_nbrinfo { char ifname[IFNAMSIZ]; /* if name, e.g.
"en0" */ @@ -425,8 +443,9 @@ struct in6_ndifreq_64 { /* Prefix status */ #define NDPRF_ONLINK 0x1 #define NDPRF_DETACHED 0x2 -#define NDPRF_STATIC 0x100 -#define NDPRF_IFSCOPE 0x1000 +#define NDPRF_STATIC 0x100 +#define NDPRF_IFSCOPE 0x1000 +#define NDPRF_PRPROXY 0x2000 #ifdef XNU_KERNEL_PRIVATE #define NDPRF_PROCESSED 0x08000 #endif @@ -451,7 +470,9 @@ __private_extern__ lck_rw_t *nd_if_rwlock; /* * In a more readable form, we derive linkmtu based on: * - * if (ND_IFINFO(ifp)->linkmtu && ND_IFINFO(ifp)->linkmtu < ifp->if_mtu) + * if (ND_IFINFO(ifp) == NULL || !ND_IFINFO(ifp)->initialized) + * linkmtu = ifp->if_mtu; + * else if (ND_IFINFO(ifp)->linkmtu && ND_IFINFO(ifp)->linkmtu < ifp->if_mtu) * linkmtu = ND_IFINFO(ifp)->linkmtu; * else if ((ND_IFINFO(ifp)->maxmtu && ND_IFINFO(ifp)->maxmtu < ifp->if_mtu)) * linkmtu = ND_IFINFO(ifp)->maxmtu; @@ -459,8 +480,8 @@ __private_extern__ lck_rw_t *nd_if_rwlock; * linkmtu = ifp->if_mtu; */ #define IN6_LINKMTU(ifp) \ - (ND_IFINFO(ifp) == NULL ? (ifp)->if_mtu : \ - ((ND_IFINFO(ifp)->linkmtu && \ + ((ND_IFINFO(ifp) == NULL || !ND_IFINFO(ifp)->initialized) ? \ + (ifp)->if_mtu : ((ND_IFINFO(ifp)->linkmtu && \ ND_IFINFO(ifp)->linkmtu < (ifp)->if_mtu) ? ND_IFINFO(ifp)->linkmtu : \ ((ND_IFINFO(ifp)->maxmtu && ND_IFINFO(ifp)->maxmtu < (ifp)->if_mtu) ? \ ND_IFINFO(ifp)->maxmtu : (ifp)->if_mtu))) @@ -530,11 +551,15 @@ struct nd_defrouter { #define NDDR_REMREF_LOCKED(_nddr) \ nddr_remref(_nddr, 1) +/* define struct prproxy_sols_tree */ +RB_HEAD(prproxy_sols_tree, nd6_prproxy_soltgt); + struct nd_prefix { decl_lck_mtx_data(, ndpr_lock); u_int32_t ndpr_refcount; /* reference count */ u_int32_t ndpr_debug; /* see ifa_debug flags */ struct ifnet *ndpr_ifp; + struct rtentry *ndpr_rt; LIST_ENTRY(nd_prefix) ndpr_entry; struct sockaddr_in6 ndpr_prefix; /* prefix */ struct in6_addr ndpr_mask; /* netmask derived from the prefix */ @@ -550,6 +575,9 @@ struct nd_prefix { LIST_HEAD(pr_rtrhead, nd_pfxrouter) ndpr_advrtrs; u_char ndpr_plen; int ndpr_addrcnt; /* reference counter from addresses */ + u_int32_t ndpr_allmulti_cnt; /* total all-multi reqs */ + u_int32_t ndpr_prproxy_sols_cnt; /* total # of proxied NS */ + struct prproxy_sols_tree ndpr_prproxy_sols; /* tree of proxied NS */ void (*ndpr_trace) /* callback fn for tracing refs */ (struct nd_prefix *, int); }; @@ -636,6 +664,51 @@ struct nd_pfxrouter { LIST_HEAD(nd_prhead, nd_prefix); +struct nd_prefix_list { + struct nd_prefix_list *next; + struct nd_prefix pr; +}; +#endif /* XNU_KERNEL_PRIVATE */ + +#if defined(PRIVATE) +/* ND6 kernel event subclass value */ +#define KEV_ND6_SUBCLASS 7 +/* ND6 kernel event action type */ +#define KEV_ND6_RA 1 +/* ND6 RA L2 source address length */ +#define ND6_ROUTER_LL_SIZE 64 + +struct nd6_ra_prefix { + struct sockaddr_in6 prefix; + struct prf_ra raflags; + u_int32_t prefixlen; + u_int32_t origin; + u_int64_t vltime; + u_int64_t pltime; + u_int64_t expire; + u_int32_t flags; + u_int32_t refcnt; + u_int32_t if_index; + u_int32_t pad; +}; + +/* ND6 router advertisement valid bits */ +#define KEV_ND6_DATA_VALID_MTU (0x1 << 0) +#define KEV_ND6_DATA_VALID_PREFIX (0x1 << 1) + +struct kev_nd6_ra_data { + u_int8_t lladdr[ND6_ROUTER_LL_SIZE]; + u_int32_t lladdrlen; + u_int32_t mtu; + u_int32_t list_index; + u_int32_t list_length; + u_int32_t flags; + struct nd6_ra_prefix prefix; + u_int32_t pad; +}; +#endif /* PRIVATE */ + +#if defined(XNU_KERNEL_PRIVATE) /* nd6.c */ extern int nd6_prune; extern int nd6_delay; @@ -652,10 +725,16 @@ extern struct nd_prhead nd_prefix; extern int nd6_debug; 
extern size_t nd_ifinfo_indexlim; extern int nd6_onlink_ns_rfc4861; +extern int nd6_optimistic_dad; #define nd6log(x) do { if (nd6_debug >= 1) log x; } while (0) #define nd6log2(x) do { if (nd6_debug >= 2) log x; } while (0) +#define ND6_OPTIMISTIC_DAD_LINKLOCAL (1 << 0) +#define ND6_OPTIMISTIC_DAD_AUTOCONF (1 << 1) +#define ND6_OPTIMISTIC_DAD_TEMPORARY (1 << 2) +#define ND6_OPTIMISTIC_DAD_DYNAMIC (1 << 3) + /* nd6_rtr.c */ extern int nd6_defifindex; extern int ip6_desync_factor; /* seconds */ @@ -710,11 +789,13 @@ extern int nd6_ioctl(u_long, caddr_t, struct ifnet *); extern void nd6_cache_lladdr(struct ifnet *, struct in6_addr *, char *, int, int, int); extern int nd6_output(struct ifnet *, struct ifnet *, struct mbuf *, - struct sockaddr_in6 *, struct rtentry *); + struct sockaddr_in6 *, struct rtentry *, struct flowadv *); extern int nd6_storelladdr(struct ifnet *, struct rtentry *, struct mbuf *, struct sockaddr *, u_char *); extern int nd6_need_cache(struct ifnet *); extern void nd6_drain(void *); +extern void nd6_post_msg(u_int32_t, struct nd_prefix_list *, u_int32_t, u_int32_t, char *, u_int32_t); +extern int nd6_setifinfo(struct ifnet *, u_int32_t, u_int32_t); /* nd6_nbr.c */ extern void nd6_nbr_init(void); @@ -732,14 +813,17 @@ extern void nd6_llreach_alloc(struct rtentry *, struct ifnet *, void *, unsigned int, boolean_t); extern void nd6_llreach_set_reachable(struct ifnet *, void *, unsigned int); extern void nd6_llreach_use(struct llinfo_nd6 *); +extern void nd6_alt_node_addr_decompose(struct ifnet *, struct sockaddr *, + struct sockaddr_dl *, struct sockaddr_in6 *); +extern void nd6_alt_node_present(struct ifnet *, struct sockaddr_in6 *, + struct sockaddr_dl *, int32_t, int, int); +extern void nd6_alt_node_absent(struct ifnet *, struct sockaddr_in6 *); /* nd6_rtr.c */ extern void nd6_rtr_init(void); extern void nd6_rs_input(struct mbuf *, int, int); extern void nd6_ra_input(struct mbuf *, int, int); extern void prelist_del(struct nd_prefix *); -extern void defrouter_addreq(struct nd_defrouter *, boolean_t); -extern void defrouter_delreq(struct nd_defrouter *); extern void defrouter_select(struct ifnet *); extern void defrouter_reset(void); extern int defrtrlist_ioctl(u_long, caddr_t); @@ -765,6 +849,24 @@ extern void nddr_addref(struct nd_defrouter *, int); extern struct nd_defrouter *nddr_remref(struct nd_defrouter *, int); extern void ndpr_addref(struct nd_prefix *, int); extern struct nd_prefix *ndpr_remref(struct nd_prefix *, int); + +/* nd6_prproxy.c */ +struct ip6_hdr; +extern u_int32_t nd6_prproxy; +extern void nd6_prproxy_init(void); +extern int nd6_if_prproxy(struct ifnet *, boolean_t); +extern void nd6_prproxy_prelist_update(struct nd_prefix *, struct nd_prefix *); +extern boolean_t nd6_prproxy_ifaddr(struct in6_ifaddr *); +extern boolean_t nd6_prproxy_isours(struct mbuf *, struct ip6_hdr *, + struct route_in6 *, unsigned int); +extern void nd6_prproxy_ns_output(struct ifnet *, struct in6_addr *, + struct in6_addr *, struct llinfo_nd6 *); +extern void nd6_prproxy_ns_input(struct ifnet *, struct in6_addr *, + char *, int, struct in6_addr *, struct in6_addr *); +extern void nd6_prproxy_na_input(struct ifnet *, struct in6_addr *, + struct in6_addr *, struct in6_addr *, int); +extern void nd6_prproxy_sols_reap(struct nd_prefix *); +extern void nd6_prproxy_sols_prune(struct nd_prefix *, u_int32_t); #endif /* XNU_KERNEL_PRIVATE */ #ifdef KERNEL diff --git a/bsd/netinet6/nd6_nbr.c b/bsd/netinet6/nd6_nbr.c index b2abd8169..be58f8dac 100644 --- a/bsd/netinet6/nd6_nbr.c +++ 
b/bsd/netinet6/nd6_nbr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -101,8 +101,6 @@ extern int ipsec_bypass; #include -#define SDL(s) ((struct sockaddr_dl *)s) - struct dadq; static struct dadq *nd6_dad_find(struct ifaddr *); void nd6_dad_stoptimer(struct ifaddr *); @@ -159,14 +157,14 @@ extern int in6_get_hw_ifid(struct ifnet *, struct in6_addr *); static int nd6_llreach_base = (LL_BASE_REACHABLE / 1000); /* seconds */ +static struct sockaddr_in6 hostrtmask; + SYSCTL_DECL(_net_inet6_icmp6); SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_llreach_base, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_llreach_base, LL_BASE_REACHABLE, "default ND6 link-layer reachability max lifetime (in seconds)"); -#define SIN6(s) ((struct sockaddr_in6 *)s) - /* * Obtain a link-layer source cache entry for the sender. * @@ -263,10 +261,14 @@ nd6_ns_input( char *lladdr = NULL; struct ifaddr *ifa = NULL; int lladdrlen = 0; - int anycast = 0, proxy = 0, tentative = 0; + int anycast = 0, proxy = 0, dadprogress = 0; int tlladdr; union nd_opts ndopts; struct sockaddr_dl proxydl; + boolean_t advrouter; + + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len, return); @@ -283,7 +285,7 @@ nd6_ns_input( if (in6_setscope(&taddr6, ifp, NULL) != 0) goto bad; - if (ip6->ip6_hlim != 255) { + if (ip6->ip6_hlim != IPV6_MAXHLIM) { nd6log((LOG_ERR, "nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src), @@ -368,7 +370,7 @@ nd6_ns_input( * Target address (taddr6) must be either: * (1) Valid unicast/anycast address for my receiving interface, * (2) Unicast address for which I'm offering proxy service, or - * (3) "tentative" address on which DAD is being performed. + * (3) "tentative" or "optimistic" address [DAD is in progress]. */ /* (1) and (3) check. */ ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); @@ -404,18 +406,31 @@ nd6_ns_input( rtfree(rt); } } + if (ifa == NULL && ip6_forwarding && nd6_prproxy) { + /* + * Is the target address part of the prefix that is being + * proxied and installed on another interface? + */ + ifa = (struct ifaddr *)in6ifa_prproxyaddr(&taddr6); + } if (ifa == NULL) { /* - * We've got an NS packet, and we don't have that adddress - * assigned for us. We MUST silently ignore it. - * See RFC2461 7.2.3. + * We've got an NS packet, and we don't have that address + * assigned for us. We MUST silently ignore it on this + * interface, c.f. RFC 4861 7.2.3. + * + * Forwarding associated with NDPRF_PRPROXY may apply. */ + if (ip6_forwarding && nd6_prproxy) + nd6_prproxy_ns_input(ifp, &saddr6, lladdr, + lladdrlen, &daddr6, &taddr6); goto freeit; } IFA_LOCK(ifa); myaddr6 = *IFA_IN6(ifa); anycast = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST; - tentative = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE; + dadprogress = + ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DADPROGRESS; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED) { IFA_UNLOCK(ifa); goto freeit; @@ -439,7 +454,7 @@ nd6_ns_input( /* * We have neighbor solicitation packet, with target address equals to - * one of my tentative address. + * one of my DAD in-progress addresses. * * src addr how to process? 
* --- --- * unicast somebody is doing address resolution -> ignore * unspec dup address detection * - * The processing is defined in RFC 2462. + * The processing is defined in RFC 2462 (and updated by RFC 4429). */ - if (tentative) { + if (dadprogress) { /* * If source address is unspecified address, it is for * duplicate address detection. @@ -463,11 +478,15 @@ goto freeit; } + /* Are we an advertising router on this interface? */ + advrouter = (ifp->if_eflags & IFEF_IPV6_ROUTER); + /* * If the source address is unspecified address, entries must not * be created or updated. - * It looks like the sender is performing DAD. Output NA toward - * all-node multicast address, to tell the sender that I'm using + * It looks like the sender is performing DAD. If I'm using the address, + * and it's a "preferred" address, i.e. not optimistic, then output NA + * toward all-node multicast address, to tell the sender that I'm using * the address. * S bit ("solicited") must be zero. */ @@ -475,21 +494,22 @@ saddr6 = in6addr_linklocal_allnodes; if (in6_setscope(&saddr6, ifp, NULL) != 0) goto bad; - nd6_na_output(ifp, &saddr6, &taddr6, - ((anycast || proxy || !tlladdr) - ? 0 : ND_NA_FLAG_OVERRIDE) - | (ip6_forwarding ? ND_NA_FLAG_ROUTER : 0), - tlladdr, proxy ? (struct sockaddr *)&proxydl : NULL); + if ((dadprogress & IN6_IFF_OPTIMISTIC) == 0) + nd6_na_output(ifp, &saddr6, &taddr6, + ((anycast || proxy || !tlladdr) ? 0 : + ND_NA_FLAG_OVERRIDE) | (advrouter ? + ND_NA_FLAG_ROUTER : 0), tlladdr, proxy ? + (struct sockaddr *)&proxydl : NULL); goto freeit; } - nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_NEIGHBOR_SOLICIT, 0); + nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, + ND_NEIGHBOR_SOLICIT, 0); nd6_na_output(ifp, &saddr6, &taddr6, - ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) - | (ip6_forwarding ? ND_NA_FLAG_ROUTER : 0) - | ND_NA_FLAG_SOLICITED, - tlladdr, proxy ? (struct sockaddr *)&proxydl : NULL); + ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | + (advrouter ? ND_NA_FLAG_ROUTER : 0) | ND_NA_FLAG_SOLICITED, + tlladdr, proxy ? (struct sockaddr *)&proxydl : NULL); freeit: m_freem(m); if (ifa != NULL) @@ -514,6 +534,7 @@ * * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) + * Updated by RFC 4429 (optimistic duplicate address detection) * * Caller must bump up ln->ln_rt refcnt to make sure 'ln' doesn't go * away if there is a llinfo_nd6 passed in.
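 *
 * A minimal caller sketch of that rule (hypothetical, for illustration
 * only; not part of this change): take a reference on ln->ln_rt before
 * the call and drop it once the solicit has been sent, e.g.:
 *
 *	RT_ADDREF(ln->ln_rt);
 *	nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, 0);
 *	RT_REMREF(ln->ln_rt);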
@@ -538,14 +559,17 @@ nd6_ns_output( int flags; caddr_t mac; struct route_in6 ro; - struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + struct ip6_out_args ip6oa = + { IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR }; + u_int32_t rtflags = 0; - bzero(&ro, sizeof(ro)); - - if (IN6_IS_ADDR_MULTICAST(taddr6)) + if ((ifp->if_eflags & IFEF_IPV6_ND6ALT) || IN6_IS_ADDR_MULTICAST(taddr6)) return; + bzero(&ro, sizeof(ro)); + ip6oa.ip6oa_boundif = ifp->if_index; + ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_ns); @@ -580,7 +604,7 @@ nd6_ns_output( } im6o->im6o_multicast_ifp = ifp; - im6o->im6o_multicast_hlim = 255; + im6o->im6o_multicast_hlim = IPV6_MAXHLIM; im6o->im6o_multicast_loop = 0; } @@ -595,7 +619,7 @@ nd6_ns_output( ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; - ip6->ip6_hlim = 255; + ip6->ip6_hlim = IPV6_MAXHLIM; if (daddr6) ip6->ip6_dst = *daddr6; else { @@ -648,17 +672,16 @@ nd6_ns_output( ln->ln_llreach->lr_probes++; IFLR_UNLOCK(ln->ln_llreach); } + rtflags = ln->ln_rt->rt_flags; RT_UNLOCK(ln->ln_rt); - } if (ia != NULL) { IFA_REMREF(&ia->ia_ifa); ia = NULL; } - if (hsrc != NULL && (ia = in6ifa_ifpwithaddr(ifp, hsrc))) { + if (hsrc != NULL && (ia = in6ifa_ifpwithaddr(ifp, hsrc)) && + (ia->ia6_flags & IN6_IFF_OPTIMISTIC) == 0) { src = hsrc; - IFA_REMREF(&ia->ia_ifa); - ia = NULL; } else { int error; struct sockaddr_in6 dst_sa; @@ -679,6 +702,15 @@ nd6_ns_output( error)); goto bad; } + + ia = in6ifa_ifpwithaddr(ifp, src); + if (!ia || (ia->ia6_flags & IN6_IFF_OPTIMISTIC)) { + nd6log((LOG_DEBUG, + "nd6_ns_output: no preferred source " + "available: dst=%s\n", + ip6_sprintf(&dst_sa.sin6_addr))); + goto bad; + } } } else { /* @@ -690,6 +722,7 @@ nd6_ns_output( */ bzero(&src_in, sizeof(src_in)); src = &src_in; + ip6oa.ip6oa_flags &= ~IP6OAF_BOUND_SRCADDR; } ip6->ip6_src = *src; nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1); @@ -716,7 +749,7 @@ nd6_ns_output( struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1); /* 8 byte alignments... */ optlen = (optlen + 7) & ~7; - + m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; @@ -739,6 +772,23 @@ nd6_ns_output( flags = dad ? IPV6_UNSPECSRC : 0; flags |= IPV6_OUTARGS; + /* + * If this is a NS for resolving the (default) router, mark + * the packet accordingly so that the driver can find out, + * in case it needs to perform driver-specific action(s). 
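+ *
+ * A driver could, hypothetically, key off the flag as follows, where
+ * expedite_router_resolution() stands in for whatever driver-specific
+ * action applies:
+ *
+ *	if (m->m_pkthdr.aux_flags & MAUXF_INET6_RESOLVE_RTR)
+ *		expedite_router_resolution(m);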
+ */ + if (rtflags & RTF_ROUTER) { + m->m_pkthdr.aux_flags |= MAUXF_INET6_RESOLVE_RTR; + VERIFY(!(m->m_pkthdr.aux_flags & MAUXF_INET_RESOLVE_RTR)); + } + + if (ifp->if_eflags & IFEF_TXSTART) { + /* Use control service class if the interface + * supports transmit-start model + */ + (void) m_set_service_class(m, MBUF_SC_CTL); + } + ip6_output(m, NULL, NULL, flags, im6o, &outif, &ip6oa); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); @@ -787,6 +837,7 @@ nd6_na_input( struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_advert *nd_na; + struct in6_addr saddr6 = ip6->ip6_src; struct in6_addr daddr6 = ip6->ip6_dst; struct in6_addr taddr6; int flags; @@ -802,7 +853,10 @@ nd6_na_input( union nd_opts ndopts; struct timeval timenow; - if (ip6->ip6_hlim != 255) { + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + + if (ip6->ip6_hlim != IPV6_MAXHLIM) { nd6log((LOG_ERR, "nd6_na_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src), @@ -862,17 +916,34 @@ nd6_na_input( /* * Target address matches one of my interface address. * - * If my address is tentative, this means that there's somebody - * already using the same address as mine. This indicates DAD failure. - * This is defined in RFC 2462. + * If my address is tentative or optimistic, this means that there's + * somebody already using the same address as mine. This indicates DAD + * failure. This is defined in RFC 2462 and updated by RFC 4429. * * Otherwise, process as defined in RFC 2461. */ if (ifa != NULL) { IFA_LOCK(ifa); - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE) { + if (((struct in6_ifaddr *)ifa)->ia6_flags & + IN6_IFF_DADPROGRESS) { + struct nd_ifinfo *ndi; + boolean_t ignorena = FALSE; + IFA_UNLOCK(ifa); - nd6_dad_na_input(ifa, lladdr, lladdrlen); + lck_rw_lock_shared(nd_if_rwlock); + ndi = ND_IFINFO(ifp); + if (ndi != NULL && ndi->initialized) { + lck_mtx_lock(&ndi->lock); + ignorena = ndi->flags & ND6_IFF_IGNORE_NA; + lck_mtx_unlock(&ndi->lock); + } + lck_rw_done(nd_if_rwlock); + if (ignorena) + log(LOG_ERR, "%s: ignoring duplicate DAD due " + "to sleep proxy (%s)\n", __func__, + if_name(ifp)); + else + nd6_dad_na_input(ifa, lladdr, lladdrlen); goto freeit; } IFA_UNLOCK(ifa); @@ -894,12 +965,41 @@ nd6_na_input( goto bad; } + /* Forwarding associated with NDPRF_PRPROXY may apply. */ + if (ip6_forwarding && nd6_prproxy) + nd6_prproxy_na_input(ifp, &saddr6, &daddr6, &taddr6, flags); + /* * If no neighbor cache entry is found, NA SHOULD silently be - * discarded. + * discarded. If we are forwarding (and Scoped Routing is in + * effect), try to see if there is a neighbor cache entry on + * another interface (in case we are doing prefix proxying.) */ - if ((rt = nd6_lookup(&taddr6, 0, ifp, 0)) == NULL) - goto freeit; + if ((rt = nd6_lookup(&taddr6, 0, ifp, 0)) == NULL) { + if (!ip6_forwarding || !ip6_doscopedroute || !nd6_prproxy) + goto freeit; + + if ((rt = nd6_lookup(&taddr6, 0, NULL, 0)) == NULL) + goto freeit; + + RT_LOCK_ASSERT_HELD(rt); + if (rt->rt_ifp != ifp) { + /* + * Purge any link-layer info caching. 
+ */ + if (rt->rt_llinfo_purge != NULL) + rt->rt_llinfo_purge(rt); + + /* Adjust route ref count for the interfaces */ + if (rt->rt_if_ref_fn != NULL) { + rt->rt_if_ref_fn(ifp, 1); + rt->rt_if_ref_fn(rt->rt_ifp, -1); + } + + /* Change the interface when the existing route is on */ + rt->rt_ifp = ifp; + } + } RT_LOCK_ASSERT_HELD(rt); if ((ln = rt->rt_llinfo) == NULL || @@ -930,9 +1030,15 @@ nd6_na_input( ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; if (ln->ln_expire) { + struct nd_ifinfo *ndi; + lck_rw_lock_shared(nd_if_rwlock); + ndi = ND_IFINFO(rt->rt_ifp); + VERIFY(ndi != NULL && ndi->initialized); + lck_mtx_lock(&ndi->lock); ln->ln_expire = rt_expiry(rt, timenow.tv_sec, - nd_ifinfo[rt->rt_ifp->if_index].reachable); + ndi->reachable); + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); } } else { @@ -1022,10 +1128,16 @@ nd6_na_input( ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; if (ln->ln_expire) { + struct nd_ifinfo *ndi; + lck_rw_lock_shared(nd_if_rwlock); + ndi = ND_IFINFO(ifp); + VERIFY(ndi != NULL && ndi->initialized); + lck_mtx_lock(&ndi->lock); ln->ln_expire = rt_expiry(rt, timenow.tv_sec, - nd_ifinfo[ifp->if_index].reachable); + ndi->reachable); + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); } } else { @@ -1047,14 +1159,9 @@ nd6_na_input( struct in6_addr *in6; struct ifnet *rt_ifp = rt->rt_ifp; - in6 = &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr; + in6 = &((struct sockaddr_in6 *) + (void *)rt_key(rt))->sin6_addr; - /* - * Lock to protect the default router list. - * XXX: this might be unnecessary, since this function - * is only called under the network software interrupt - * context. However, we keep it just for safety. - */ RT_UNLOCK(rt); lck_mtx_lock(nd6_mutex); dr = defrouter_lookup(in6, rt_ifp); @@ -1062,16 +1169,15 @@ nd6_na_input( defrtrlist_del(dr); NDDR_REMREF(dr); lck_mtx_unlock(nd6_mutex); - } - else { + } else { lck_mtx_unlock(nd6_mutex); - if (!ip6_forwarding && (ip6_accept_rtadv || (rt_ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { + if (ip6_doscopedroute || !ip6_forwarding) { /* - * Even if the neighbor is not in the default - * router list, the neighbor may be used - * as a next hop for some destinations - * (e.g. redirect case). So we must - * call rt6_flush explicitly. + * Even if the neighbor is not in the + * default router list, the neighbor + * may be used as a next hop for some + * destinations (e.g. redirect case). + * So we must call rt6_flush explicitly. */ rt6_flush(&ip6->ip6_src, rt_ifp); } @@ -1107,7 +1213,7 @@ nd6_na_input( * the 2nd argument as the 1st one. 
*/ RT_UNLOCK(rt); - nd6_output(ifp, ifp, m_hold, &sin6, rt); + nd6_output(ifp, ifp, m_hold, &sin6, rt, NULL); RT_LOCK_SPIN(rt); } ln->ln_hold = NULL; @@ -1157,16 +1263,19 @@ nd6_na_output( caddr_t mac = NULL; struct route_in6 ro; struct in6_addr *src, src_storage, daddr6; + struct in6_ifaddr *ia; struct sockaddr_in6 dst_sa; int icmp6len, maxlen, error; struct ifnet *outif = NULL; - struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + struct ip6_out_args ip6oa = + { IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR }; bzero(&ro, sizeof(ro)); daddr6 = *daddr6_0; /* make a local copy for modification */ ip6oa.ip6oa_boundif = ifp->if_index; + ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_na); @@ -1201,7 +1310,7 @@ nd6_na_output( } im6o->im6o_multicast_ifp = ifp; - im6o->im6o_multicast_hlim = 255; + im6o->im6o_multicast_hlim = IPV6_MAXHLIM; im6o->im6o_multicast_loop = 0; } @@ -1215,7 +1324,7 @@ nd6_na_output( ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; - ip6->ip6_hlim = 255; + ip6->ip6_hlim = IPV6_MAXHLIM; if (IN6_IS_ADDR_UNSPECIFIED(&daddr6)) { /* reply to DAD */ daddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL; @@ -1249,6 +1358,17 @@ nd6_na_output( } ip6->ip6_src = *src; + /* + * RFC 4429 requires not setting "override" flag on NA packets sent + * from optimistic addresses. + */ + ia = in6ifa_ifpwithaddr(ifp, src); + if (ia != NULL) { + if (ia->ia6_flags & IN6_IFF_OPTIMISTIC) + flags &= ~ND_NA_FLAG_OVERRIDE; + IFA_REMREF(&ia->ia_ifa); + } + nd_na = (struct nd_neighbor_advert *)(ip6 + 1); nd_na->nd_na_type = ND_NEIGHBOR_ADVERT; nd_na->nd_na_code = 0; @@ -1271,7 +1391,7 @@ nd6_na_output( mac = nd6_ifptomac(ifp); else if (sdl0->sa_family == AF_LINK) { struct sockaddr_dl *sdl; - sdl = (struct sockaddr_dl *)sdl0; + sdl = (struct sockaddr_dl *)(void *)sdl0; if (sdl->sdl_alen == ifp->if_addrlen) mac = LLADDR(sdl); } @@ -1304,6 +1424,14 @@ nd6_na_output( if (ipsec_bypass == 0) (void)ipsec_setsocket(m, NULL); #endif + + if (ifp->if_eflags & IFEF_TXSTART) { + /* Use control service class if the interface supports + * transmit-start model. + */ + (void) m_set_service_class(m, MBUF_SC_CTL); + } + ip6_output(m, NULL, NULL, IPV6_OUTARGS, im6o, &outif, &ip6oa); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); @@ -1376,6 +1504,8 @@ static struct dadq_head dadq; void nd6_nbr_init(void) { + int i; + TAILQ_INIT(&dadq); dad_size = sizeof (struct dadq); @@ -1386,6 +1516,12 @@ nd6_nbr_init(void) } zone_change(dad_zone, Z_EXPAND, TRUE); zone_change(dad_zone, Z_CALLERACCT, FALSE); + + bzero(&hostrtmask, sizeof hostrtmask); + hostrtmask.sin6_family = AF_INET6; + hostrtmask.sin6_len = sizeof hostrtmask; + for (i = 0; i < sizeof hostrtmask.sin6_addr; ++i) + hostrtmask.sin6_addr.s6_addr[i] = 0xff; } static struct dadq * @@ -1434,29 +1570,25 @@ nd6_dad_start( * - the interface address is anycast */ IFA_LOCK(&ia->ia_ifa); - if (!(ia->ia6_flags & IN6_IFF_TENTATIVE)) { + if (!(ia->ia6_flags & IN6_IFF_DADPROGRESS)) { log(LOG_DEBUG, - "nd6_dad_start: called with non-tentative address " + "nd6_dad_start: not a tentative or optimistic address " "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? 
if_name(ifa->ifa_ifp) : "???"); IFA_UNLOCK(&ia->ia_ifa); return; } - if (ia->ia6_flags & IN6_IFF_ANYCAST) { - ia->ia6_flags &= ~IN6_IFF_TENTATIVE; - IFA_UNLOCK(&ia->ia_ifa); - return; - } - if (!ip6_dad_count) { - ia->ia6_flags &= ~IN6_IFF_TENTATIVE; + if (!ip6_dad_count || (ia->ia6_flags & IN6_IFF_ANYCAST) != 0) { + ia->ia6_flags &= ~IN6_IFF_DADPROGRESS; IFA_UNLOCK(&ia->ia_ifa); return; } IFA_UNLOCK(&ia->ia_ifa); if (ifa->ifa_ifp == NULL) panic("nd6_dad_start: ifa->ifa_ifp == NULL"); - if (!(ifa->ifa_ifp->if_flags & IFF_UP)) { + if (!(ifa->ifa_ifp->if_flags & IFF_UP) || + (ifa->ifa_ifp->if_eflags & IFEF_IPV6_ND6ALT)) { return; } if ((dp = nd6_dad_find(ifa)) != NULL) { @@ -1479,7 +1611,9 @@ nd6_dad_start( /* Callee adds one reference for us */ dp = nd6_dad_attach(dp, ifa); - nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), + nd6log((LOG_DEBUG, "%s: starting %sDAD for %s\n", + if_name(ifa->ifa_ifp), + (ia->ia6_flags & IN6_IFF_OPTIMISTIC) ? "optimistic " : "", + ip6_sprintf(&ia->ia_addr.sin6_addr))); /* @@ -1490,9 +1624,15 @@ nd6_dad_start( */ if (tick_delay == NULL) { u_int32_t retrans; + struct nd_ifinfo *ndi; + nd6_dad_ns_output(dp, ifa); lck_rw_lock_shared(nd_if_rwlock); - retrans = nd_ifinfo[ifa->ifa_ifp->if_index].retrans * hz / 1000; + ndi = ND_IFINFO(ifa->ifa_ifp); + VERIFY(ndi != NULL && ndi->initialized); + lck_mtx_lock(&ndi->lock); + retrans = ndi->retrans * hz / 1000; + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); timeout((void (*)(void *))nd6_dad_timer, (void *)ifa, retrans); } else { @@ -1582,7 +1722,8 @@ nd6_unsol_na_output(struct ifaddr *ifa) struct in6_addr saddr6, taddr6; if ((ifp->if_flags & IFF_UP) == 0 || - (ifp->if_flags & IFF_RUNNING) == 0) + (ifp->if_flags & IFF_RUNNING) == 0 || + (ifp->if_eflags & IFEF_IPV6_ND6ALT) != 0) return; IFA_LOCK_SPIN(&ia->ia_ifa); @@ -1625,9 +1766,9 @@ nd6_dad_timer(struct ifaddr *ifa) IFA_UNLOCK(&ia->ia_ifa); goto done; } - if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) { - log(LOG_ERR, "nd6_dad_timer: called with non-tentative address " - "%s(%s)\n", + if ((ia->ia6_flags & IN6_IFF_DADPROGRESS) == 0) { + log(LOG_ERR, "nd6_dad_timer: not a tentative or optimistic " + "address %s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); IFA_UNLOCK(&ia->ia_ifa); @@ -1649,13 +1790,19 @@ nd6_dad_timer(struct ifaddr *ifa) /* Need more checks? */ if (dp->dad_ns_ocount < dp->dad_count) { u_int32_t retrans; + struct nd_ifinfo *ndi; + DAD_UNLOCK(dp); /* * We have more NS to go. Send NS packet for DAD. */ nd6_dad_ns_output(dp, ifa); lck_rw_lock_shared(nd_if_rwlock); - retrans = nd_ifinfo[ifa->ifa_ifp->if_index].retrans * hz / 1000; + ndi = ND_IFINFO(ifa->ifa_ifp); + VERIFY(ndi != NULL && ndi->initialized); + lck_mtx_lock(&ndi->lock); + retrans = ndi->retrans * hz / 1000; + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); timeout((void (*)(void *))nd6_dad_timer, (void *)ifa, retrans); } else { @@ -1690,7 +1837,7 @@ nd6_dad_timer(struct ifaddr *ifa) * No duplicate address found.
*/ IFA_LOCK_SPIN(&ia->ia_ifa); - ia->ia6_flags &= ~IN6_IFF_TENTATIVE; + ia->ia6_flags &= ~IN6_IFF_DADPROGRESS; IFA_UNLOCK(&ia->ia_ifa); nd6log((LOG_DEBUG, @@ -1726,7 +1873,6 @@ nd6_dad_duplicated(struct ifaddr *ifa, boolean_t dontignhwdup) log(LOG_ERR, "nd6_dad_duplicated: DAD structure not found\n"); return; } - hwdupposs = 0; IFA_LOCK(&ia->ia_ifa); DAD_LOCK(dp); @@ -1737,7 +1883,7 @@ nd6_dad_duplicated(struct ifaddr *ifa, boolean_t dontignhwdup) dp->dad_na_ixcount); hwdupposs = dp->dad_na_ixcount; DAD_UNLOCK(dp); - ia->ia6_flags &= ~IN6_IFF_TENTATIVE; + ia->ia6_flags &= ~IN6_IFF_DADPROGRESS; ia->ia6_flags |= IN6_IFF_DUPLICATED; IFA_UNLOCK(&ia->ia_ifa); @@ -1761,7 +1907,7 @@ nd6_dad_duplicated(struct ifaddr *ifa, boolean_t dontignhwdup) ND6_IFF_IFDISABLED; lck_rw_done(nd_if_rwlock); } - + /* Send an event to the configuration agent so that the * duplicate address will be notified to the user and will * be removed. @@ -1906,7 +2052,8 @@ nd6_dad_na_input(struct ifaddr *ifa, caddr_t lladdr, int lladdrlen) llifa = ifp->if_lladdr; IFA_LOCK(llifa); - sdl = (struct sockaddr_dl *)llifa->ifa_addr; + sdl = (struct sockaddr_dl *)(void *) + llifa->ifa_addr; if (lladdrlen == sdl->sdl_alen || bcmp(lladdr, LLADDR(sdl), lladdrlen) == 0) hwdupposs = 1; @@ -2008,3 +2155,166 @@ nd6_llreach_set_reachable(struct ifnet *ifp, void *addr, unsigned int alen) ifnet_llreach_set_reachable(ifp, ETHERTYPE_IPV6, addr, alen); } + +void +nd6_alt_node_addr_decompose(struct ifnet *ifp, struct sockaddr *sa, + struct sockaddr_dl* sdl, struct sockaddr_in6 *sin6) +{ + static const size_t EUI64_LENGTH = 8; + + VERIFY(nd6_need_cache(ifp)); + VERIFY(sa); + VERIFY(sdl && (void *)sa != (void *)sdl); + VERIFY(sin6 && (void *)sa != (void *)sin6); + + bzero(sin6, sizeof *sin6); + sin6->sin6_len = sizeof *sin6; + sin6->sin6_family = AF_INET6; + + bzero(sdl, sizeof *sdl); + sdl->sdl_len = sizeof *sdl; + sdl->sdl_family = AF_LINK; + sdl->sdl_type = ifp->if_type; + sdl->sdl_index = ifp->if_index; + + switch (sa->sa_family) { + case AF_INET6: { + struct sockaddr_in6 *sin6a = (struct sockaddr_in6 *)(void *)sa; + struct in6_addr *in6 = &sin6a->sin6_addr; + + VERIFY(sa->sa_len == sizeof *sin6); + + sdl->sdl_nlen = strlen(ifp->if_name); + bcopy(ifp->if_name, sdl->sdl_data, sdl->sdl_nlen); + if (in6->s6_addr[11] == 0xff && in6->s6_addr[12] == 0xfe) { + sdl->sdl_alen = ETHER_ADDR_LEN; + LLADDR(sdl)[0] = (in6->s6_addr[8] ^ ND6_EUI64_UBIT); + LLADDR(sdl)[1] = in6->s6_addr[9]; + LLADDR(sdl)[2] = in6->s6_addr[10]; + LLADDR(sdl)[3] = in6->s6_addr[13]; + LLADDR(sdl)[4] = in6->s6_addr[14]; + LLADDR(sdl)[5] = in6->s6_addr[15]; + } + else { + sdl->sdl_alen = EUI64_LENGTH; + bcopy(&in6->s6_addr[8], LLADDR(sdl), EUI64_LENGTH); + } + + sdl->sdl_slen = 0; + break; + } + case AF_LINK: { + struct sockaddr_dl *sdla = (struct sockaddr_dl *)(void *)sa; + struct in6_addr *in6 = &sin6->sin6_addr; + caddr_t lla = LLADDR(sdla); + + VERIFY(sa->sa_len <= sizeof *sdl); + bcopy(sa, sdl, sa->sa_len); + + sin6->sin6_scope_id = sdla->sdl_index; + if (sin6->sin6_scope_id == 0) + sin6->sin6_scope_id = ifp->if_index; + in6->s6_addr[0] = 0xfe; + in6->s6_addr[1] = 0x80; + if (sdla->sdl_alen == EUI64_LENGTH) + bcopy(lla, &in6->s6_addr[8], EUI64_LENGTH); + else { + VERIFY(sdla->sdl_alen == ETHER_ADDR_LEN); + + in6->s6_addr[8] = ((uint8_t) lla[0] ^ ND6_EUI64_UBIT); + in6->s6_addr[9] = (uint8_t) lla[1]; + in6->s6_addr[10] = (uint8_t) lla[2]; + in6->s6_addr[11] = 0xff; + in6->s6_addr[12] = 0xfe; + in6->s6_addr[13] = (uint8_t) lla[3]; + in6->s6_addr[14] = (uint8_t) lla[4]; + 
in6->s6_addr[15] = (uint8_t) lla[5]; + } + + break; + } + default: + VERIFY(false); + break; + } +} + +void +nd6_alt_node_present(struct ifnet *ifp, struct sockaddr_in6 *sin6, + struct sockaddr_dl *sdl, int32_t rssi, int lqm, int npm) +{ + struct rtentry *rt; + struct llinfo_nd6 *ln; + struct if_llreach *lr; + + nd6_cache_lladdr(ifp, &sin6->sin6_addr, LLADDR(sdl), + sdl->sdl_alen, ND_NEIGHBOR_ADVERT, 0); + + lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(rnh_lock); + + rt = rtalloc1_scoped_locked((struct sockaddr *)sin6, 1, 0, + ifp->if_index); + if (rt != NULL) { + RT_LOCK(rt); + VERIFY(rt->rt_flags & RTF_LLINFO); + VERIFY(rt->rt_llinfo); + + ln = rt->rt_llinfo; + ln->ln_state = ND6_LLINFO_REACHABLE; + ln->ln_expire = 0; + + lr = ln->ln_llreach; + if (lr) { + IFLR_LOCK(lr); + lr->lr_rssi = rssi; + lr->lr_lqm = (int32_t) lqm; + lr->lr_npm = (int32_t) npm; + IFLR_UNLOCK(lr); + } + + RT_UNLOCK(rt); + RT_REMREF(rt); + } + + lck_mtx_unlock(rnh_lock); + + if (rt == NULL) { + log(LOG_ERR, "%s: failed to add/update host route to %s.\n", + __func__, ip6_sprintf(&sin6->sin6_addr)); + } +} + +void +nd6_alt_node_absent(struct ifnet *ifp, struct sockaddr_in6 *sin6) +{ + struct rtentry *rt; + + lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(rnh_lock); + + rt = rtalloc1_scoped_locked((struct sockaddr *)sin6, 0, 0, + ifp->if_index); + if (rt != NULL) { + RT_LOCK(rt); + + if (!(rt->rt_flags & (RTF_PINNED|RTF_CLONING|RTF_PRCLONING)) && + (rt->rt_flags & (RTF_HOST|RTF_LLINFO|RTF_WASCLONED)) == + (RTF_HOST|RTF_LLINFO|RTF_WASCLONED)) { + rt->rt_flags |= RTF_CONDEMNED; + RT_UNLOCK(rt); + + (void) rtrequest_locked(RTM_DELETE, rt_key(rt), + (struct sockaddr *)NULL, rt_mask(rt), 0, + (struct rtentry **)NULL); + + rtfree_locked(rt); + } + else { + RT_REMREF_LOCKED(rt); + RT_UNLOCK(rt); + } + } + + lck_mtx_unlock(rnh_lock); +} diff --git a/bsd/netinet6/nd6_prproxy.c b/bsd/netinet6/nd6_prproxy.c new file mode 100644 index 000000000..3bac47eb1 --- /dev/null +++ b/bsd/netinet6/nd6_prproxy.c @@ -0,0 +1,1357 @@ +/* + * Copyright (c) 2011-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Prefix-based Neighbor Discovery Proxy + * + * When an interface is marked with the ND6_IFF_PROXY_PREFIXES flag, all + * of the current and future non-scoped on-link prefixes configured on the + * interface will be shared with the scoped variant of such prefixes on + * other interfaces. This allows for one or more prefixes to be shared + * across multiple links, with full support for Duplicate Address Detection, + * Address Resolution and Neighbor Unreachability Detection. + * + * A non-scoped prefix may be configured statically, or dynamically via + * Router Advertisement. An interface is said to be an "upstream" interface + * when it is marked with ND6_IFF_PROXY_PREFIXES and has at least one prefix + * that is non-scoped (global, not scoped.) Such prefixes are marked with + * the NDPRF_PRPROXY flag. + * + * A scoped prefix typically gets configured by way of adding an address + * to a "downstream" interface, when the added address is part of an existing + * prefix that is allowed to be shared (i.e. NDPRF_PRPROXY prefixes.) Unlike + * non-scoped prefixes, however, scoped prefixes will never be marked with + * the NDPRF_PRPROXY flag. + * + * The setting of NDPRF_PRPROXY depends on whether the prefix is on-link; + * an off-link prefix on an interface marked with ND6_IFF_PROXY_PREFIXES + * will not cause NDPRF_PRPROXY to be set (it will only happen when that + * prefix goes on-link.) Likewise, a previously on-link prefix that has + * transitioned to off-link will cause its NDPRF_PRPROXY flag to be cleared. + * + * Prefix proxying relies on IPv6 Scoped Routing to be in effect, as it would + * otherwise be impossible to install scoped prefix route entries in the + * routing table. By default, such cloning prefix routes will generate cloned + * routes that are scoped according to their interfaces. Because prefix + * proxying is essentially creating a larger network comprised of multiple + * links sharing a prefix, we need to treat the cloned routes as if they + * weren't scoped route entries. This requires marking such cloning prefix + * routes with the RTF_PROXY flag, which serves as an indication that the + * route entry (and its clones) are part of a proxied prefix, and that the + * entries are non-scoped. + * + * In order to handle solicited-node destined ND packets (Address Resolution, + * Neighbor Unreachability Detection), prefix proxying also requires that the + * "upstream" and "downstream" interfaces be configured for all-multicast mode. + * + * The setting and clearing of the RTF_PROXY flag, as well as the entering and + * exiting of all-multicast mode on those interfaces, happen when a prefix + * transitions between on-link and off-link (and vice versa.) + * + * Note that this is not a strict implementation of RFC 4389, but rather a + * derivative based on a similar concept. In particular, we only proxy NS and + * NA packets; RA packets are never proxied. Care should be taken to enable + * prefix proxying only on a non-looping network topology.
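+ *
+ * As an illustration (prefix and interface names are examples only):
+ * if 2001:db8::/64 is learned via RA on an upstream interface en0
+ * (marked ND6_IFF_PROXY_PREFIXES; prefix marked NDPRF_PRPROXY), then
+ * adding an address out of 2001:db8::/64 on en1 installs a scoped copy
+ * of the prefix there, and NS/NA for nodes covered by the prefix are
+ * relayed between en0 and en1 as described above.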
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nd6_prproxy_prelist { + SLIST_ENTRY(nd6_prproxy_prelist) ndprl_le; + struct nd_prefix *ndprl_pr; /* prefix */ + struct nd_prefix *ndprl_up; /* non-NULL for upstream */ + struct ifnet *ndprl_fwd_ifp; /* outgoing interface */ + boolean_t ndprl_sol; /* unicast solicitor? */ + struct in6_addr ndprl_sol_saddr; /* solicitor's address */ +}; + +/* + * Soliciting node (source) record. + */ +struct nd6_prproxy_solsrc { + TAILQ_ENTRY(nd6_prproxy_solsrc) solsrc_tqe; + struct in6_addr solsrc_saddr; /* soliciting (src) address */ + struct ifnet *solsrc_ifp; /* iface where NS arrived on */ +}; + +/* + * Solicited node (target) record. + */ +struct nd6_prproxy_soltgt { + RB_ENTRY(nd6_prproxy_soltgt) soltgt_link; /* RB tree links */ + struct soltgt_key_s { + struct in6_addr taddr; /* solicited (tgt) address */ + } soltgt_key; + u_int64_t soltgt_expire; /* expiration time */ + u_int32_t soltgt_cnt; /* total # of solicitors */ + TAILQ_HEAD(, nd6_prproxy_solsrc) soltgt_q; +}; + +SLIST_HEAD(nd6_prproxy_prelist_head, nd6_prproxy_prelist); + +static void nd6_prproxy_prelist_setroute(boolean_t enable, + struct nd6_prproxy_prelist_head *, struct nd6_prproxy_prelist_head *); +static struct nd6_prproxy_prelist *nd6_ndprl_alloc(int); +static void nd6_ndprl_free(struct nd6_prproxy_prelist *); +static struct nd6_prproxy_solsrc *nd6_solsrc_alloc(int); +static void nd6_solsrc_free(struct nd6_prproxy_solsrc *); +static boolean_t nd6_solsrc_enq(struct nd_prefix *, struct ifnet *, + struct in6_addr *, struct in6_addr *); +static boolean_t nd6_solsrc_deq(struct nd_prefix *, struct in6_addr *, + struct in6_addr *, struct ifnet **); +static struct nd6_prproxy_soltgt *nd6_soltgt_alloc(int); +static void nd6_soltgt_free(struct nd6_prproxy_soltgt *); +static void nd6_soltgt_prune(struct nd6_prproxy_soltgt *, u_int32_t); +static __inline int soltgt_cmp(const struct nd6_prproxy_soltgt *, + const struct nd6_prproxy_soltgt *); +static void nd6_prproxy_sols_purge(struct nd_prefix *, u_int64_t); + +RB_PROTOTYPE_SC_PREV(__private_extern__, prproxy_sols_tree, nd6_prproxy_soltgt, + soltgt_link, soltgt_cmp); + +/* + * Time (in seconds) before a target record expires (is idle). + */ +#define ND6_TGT_SOLS_EXPIRE 5 + +/* + * Maximum number of queued soliciting (source) records per target. + */ +#define ND6_MAX_SRC_SOLS_DEFAULT 4 + +/* + * Maximum number of queued solicited (target) records per prefix. 
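+ *
+ * Both limits are exported below as read/write sysctl knobs; e.g. a
+ * hypothetical tuning from user space:
+ *
+ *	sysctl -w net.inet6.icmp6.nd6_maxsolstgt=16
+ *	sysctl -w net.inet6.icmp6.nd6_maxproxiedsol=8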
+ */ +#define ND6_MAX_TGT_SOLS_DEFAULT 8 + +static u_int32_t nd6_max_tgt_sols = ND6_MAX_TGT_SOLS_DEFAULT; +static u_int32_t nd6_max_src_sols = ND6_MAX_SRC_SOLS_DEFAULT; + +static unsigned int ndprl_size; /* size of zone element */ +static struct zone *ndprl_zone; /* nd6_prproxy_prelist zone */ + +#define NDPRL_ZONE_MAX 256 /* maximum elements in zone */ +#define NDPRL_ZONE_NAME "nd6_prproxy_prelist" /* name for zone */ + +static unsigned int solsrc_size; /* size of zone element */ +static struct zone *solsrc_zone; /* nd6_prproxy_solsrc zone */ + +#define SOLSRC_ZONE_MAX 256 /* maximum elements in zone */ +#define SOLSRC_ZONE_NAME "nd6_prproxy_solsrc" /* name for zone */ + +static unsigned int soltgt_size; /* size of zone element */ +static struct zone *soltgt_zone; /* nd6_prproxy_soltgt zone */ + +#define SOLTGT_ZONE_MAX 256 /* maximum elements in zone */ +#define SOLTGT_ZONE_NAME "nd6_prproxy_soltgt" /* name for zone */ + +/* The following is protected by ndpr_lock */ +RB_GENERATE_PREV(prproxy_sols_tree, nd6_prproxy_soltgt, + soltgt_link, soltgt_cmp); + +/* The following is protected by proxy6_lock (for updates) */ +u_int32_t nd6_prproxy; + +extern lck_mtx_t *nd6_mutex; + +SYSCTL_DECL(_net_inet6_icmp6); + +SYSCTL_UINT(_net_inet6_icmp6, OID_AUTO, nd6_maxsolstgt, + CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_max_tgt_sols, ND6_MAX_TGT_SOLS_DEFAULT, + "maximum number of outstanding solicited targets per prefix"); + +SYSCTL_UINT(_net_inet6_icmp6, OID_AUTO, nd6_maxproxiedsol, + CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_max_src_sols, ND6_MAX_SRC_SOLS_DEFAULT, + "maximum number of outstanding solicitations per target"); + +SYSCTL_UINT(_net_inet6_icmp6, OID_AUTO, prproxy_cnt, + CTLFLAG_RD | CTLFLAG_LOCKED, &nd6_prproxy, 0, + "total number of proxied prefixes"); + +/* + * Called by nd6_init() at initialization time. + */ +void +nd6_prproxy_init(void) +{ + ndprl_size = sizeof (struct nd6_prproxy_prelist); + ndprl_zone = zinit(ndprl_size, NDPRL_ZONE_MAX * ndprl_size, 0, + NDPRL_ZONE_NAME); + if (ndprl_zone == NULL) + panic("%s: failed allocating ndprl_zone", __func__); + + zone_change(ndprl_zone, Z_EXPAND, TRUE); + zone_change(ndprl_zone, Z_CALLERACCT, FALSE); + + solsrc_size = sizeof (struct nd6_prproxy_solsrc); + solsrc_zone = zinit(solsrc_size, SOLSRC_ZONE_MAX * solsrc_size, 0, + SOLSRC_ZONE_NAME); + if (solsrc_zone == NULL) + panic("%s: failed allocating solsrc_zone", __func__); + + zone_change(solsrc_zone, Z_EXPAND, TRUE); + zone_change(solsrc_zone, Z_CALLERACCT, FALSE); + + soltgt_size = sizeof (struct nd6_prproxy_soltgt); + soltgt_zone = zinit(soltgt_size, SOLTGT_ZONE_MAX * soltgt_size, 0, + SOLTGT_ZONE_NAME); + if (soltgt_zone == NULL) + panic("%s: failed allocating soltgt_zone", __func__); + + zone_change(soltgt_zone, Z_EXPAND, TRUE); + zone_change(soltgt_zone, Z_CALLERACCT, FALSE); +} + +static struct nd6_prproxy_prelist * +nd6_ndprl_alloc(int how) +{ + struct nd6_prproxy_prelist *ndprl; + + ndprl = (how == M_WAITOK) ? zalloc(ndprl_zone) : + zalloc_noblock(ndprl_zone); + if (ndprl != NULL) + bzero(ndprl, ndprl_size); + + return (ndprl); +} + +static void +nd6_ndprl_free(struct nd6_prproxy_prelist *ndprl) +{ + zfree(ndprl_zone, ndprl); +} + +/* + * Apply the routing function to the affected upstream and downstream prefixes, + * i.e. either set or clear RTF_PROXY on the cloning prefix route; all route + * entries that were cloned off these prefixes will be blown away. Caller + * must have acquired proxy6_lock and must not be holding nd6_mutex.
+ */ +static void +nd6_prproxy_prelist_setroute(boolean_t enable, + struct nd6_prproxy_prelist_head *up_head, + struct nd6_prproxy_prelist_head *down_head) +{ + struct nd6_prproxy_prelist *up, *down, *ndprl_tmp; + struct nd_prefix *pr; + + lck_mtx_assert(&proxy6_lock, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); + + SLIST_FOREACH_SAFE(up, up_head, ndprl_le, ndprl_tmp) { + struct rtentry *rt; + boolean_t prproxy; + + SLIST_REMOVE(up_head, up, nd6_prproxy_prelist, ndprl_le); + pr = up->ndprl_pr; + VERIFY(up->ndprl_up == NULL); + + NDPR_LOCK(pr); + prproxy = (pr->ndpr_stateflags & NDPRF_PRPROXY); + VERIFY(!prproxy || ((pr->ndpr_stateflags & NDPRF_ONLINK) && + !(pr->ndpr_stateflags & NDPRF_IFSCOPE))); + + nd6_prproxy_sols_reap(pr); + VERIFY(pr->ndpr_prproxy_sols_cnt == 0); + VERIFY(RB_EMPTY(&pr->ndpr_prproxy_sols)); + + if (enable && pr->ndpr_allmulti_cnt == 0) { + nd6_prproxy++; + pr->ndpr_allmulti_cnt++; + if_allmulti(pr->ndpr_ifp, TRUE); + } else if (!enable && pr->ndpr_allmulti_cnt > 0) { + nd6_prproxy--; + pr->ndpr_allmulti_cnt--; + if_allmulti(pr->ndpr_ifp, FALSE); + } + + if ((rt = pr->ndpr_rt) != NULL) { + if ((enable && prproxy) || (!enable && !prproxy)) + RT_ADDREF(rt); + else + rt = NULL; + NDPR_UNLOCK(pr); + } else { + NDPR_UNLOCK(pr); + } + NDPR_REMREF(pr); + if (rt != NULL) { + rt_set_proxy(rt, enable); + rtfree(rt); + } + nd6_ndprl_free(up); + } + + SLIST_FOREACH_SAFE(down, down_head, ndprl_le, ndprl_tmp) { + struct nd_prefix *pr_up; + struct rtentry *rt; + boolean_t prproxy; + + SLIST_REMOVE(down_head, down, nd6_prproxy_prelist, ndprl_le); + pr = down->ndprl_pr; + pr_up = down->ndprl_up; + VERIFY(pr_up != NULL); + + NDPR_LOCK(pr_up); + prproxy = (pr_up->ndpr_stateflags & NDPRF_PRPROXY); + VERIFY(!prproxy || ((pr_up->ndpr_stateflags & NDPRF_ONLINK) && + !(pr_up->ndpr_stateflags & NDPRF_IFSCOPE))); + NDPR_UNLOCK(pr_up); + + NDPR_LOCK(pr); + if (enable && pr->ndpr_allmulti_cnt == 0) { + pr->ndpr_allmulti_cnt++; + if_allmulti(pr->ndpr_ifp, TRUE); + } else if (!enable && pr->ndpr_allmulti_cnt > 0) { + pr->ndpr_allmulti_cnt--; + if_allmulti(pr->ndpr_ifp, FALSE); + } + + if ((rt = pr->ndpr_rt) != NULL) { + if ((enable && prproxy) || (!enable && !prproxy)) + RT_ADDREF(rt); + else + rt = NULL; + NDPR_UNLOCK(pr); + } else { + NDPR_UNLOCK(pr); + } + NDPR_REMREF(pr); + NDPR_REMREF(pr_up); + if (rt != NULL) { + rt_set_proxy(rt, enable); + rtfree(rt); + } + nd6_ndprl_free(down); + } +} + +/* + * Enable/disable prefix proxying on an interface; typically called + * as part of handling SIOCSIFINFO_FLAGS[IFEF_IPV6_ROUTER]. + */ +int +nd6_if_prproxy(struct ifnet *ifp, boolean_t enable) +{ + SLIST_HEAD(, nd6_prproxy_prelist) up_head; + SLIST_HEAD(, nd6_prproxy_prelist) down_head; + struct nd6_prproxy_prelist *up, *down; + struct nd_prefix *pr; + + /* Can't be enabled if we are an advertising router on the interface */ + ifnet_lock_shared(ifp); + if (enable && (ifp->if_eflags & IFEF_IPV6_ROUTER)) { + ifnet_lock_done(ifp); + return (EBUSY); + } + ifnet_lock_done(ifp); + + SLIST_INIT(&up_head); + SLIST_INIT(&down_head); + + /* + * Serialize the clearing/setting of NDPRF_PRPROXY. + */ + lck_mtx_lock(&proxy6_lock); + + /* + * First build a list of upstream prefixes on this interface for + * which we need to enable/disable prefix proxy functionality. 
+ */ + lck_mtx_lock(nd6_mutex); + for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + NDPR_LOCK(pr); + if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) || + (!enable && !(pr->ndpr_stateflags & NDPRF_PRPROXY)) || + (enable && (pr->ndpr_stateflags & NDPRF_PRPROXY)) || + (pr->ndpr_stateflags & NDPRF_IFSCOPE) || + pr->ndpr_ifp != ifp) { + NDPR_UNLOCK(pr); + continue; + } + + /* + * At present, in order for the prefix to be eligible + * as a proxying/proxied prefix, we require that the + * prefix route entry be marked as a cloning route with + * RTF_PROXY; i.e. nd6_need_cache() needs to return + * true for the interface type. + */ + if (enable && (pr->ndpr_stateflags & NDPRF_ONLINK) && + nd6_need_cache(ifp)) { + pr->ndpr_stateflags |= NDPRF_PRPROXY; + NDPR_ADDREF_LOCKED(pr); + NDPR_UNLOCK(pr); + } else if (!enable) { + pr->ndpr_stateflags &= ~NDPRF_PRPROXY; + NDPR_ADDREF_LOCKED(pr); + NDPR_UNLOCK(pr); + } else { + NDPR_UNLOCK(pr); + pr = NULL; /* don't go further */ + } + + if (pr == NULL) + continue; + + up = nd6_ndprl_alloc(M_WAITOK); + if (up == NULL) { + NDPR_REMREF(pr); + continue; + } + + up->ndprl_pr = pr; /* keep reference from above */ + SLIST_INSERT_HEAD(&up_head, up, ndprl_le); + } + + /* + * Now build a list of matching (scoped) downstream prefixes on other + * interfaces which need to be enabled/disabled accordingly. Note that + * the NDPRF_PRPROXY is never set/cleared on the downstream prefixes. + */ + SLIST_FOREACH(up, &up_head, ndprl_le) { + struct nd_prefix *fwd; + struct in6_addr pr_addr; + u_char pr_len; + + pr = up->ndprl_pr; + + NDPR_LOCK(pr); + bcopy(&pr->ndpr_prefix.sin6_addr, &pr_addr, sizeof (pr_addr)); + pr_len = pr->ndpr_plen; + NDPR_UNLOCK(pr); + + for (fwd = nd_prefix.lh_first; fwd; fwd = fwd->ndpr_next) { + NDPR_LOCK(fwd); + if (!(fwd->ndpr_stateflags & NDPRF_ONLINK) || + !(fwd->ndpr_stateflags & NDPRF_IFSCOPE) || + fwd->ndpr_plen != pr_len || + !in6_are_prefix_equal(&fwd->ndpr_prefix.sin6_addr, + &pr_addr, pr_len)) { + NDPR_UNLOCK(fwd); + continue; + } + NDPR_UNLOCK(fwd); + + down = nd6_ndprl_alloc(M_WAITOK); + if (down == NULL) + continue; + + NDPR_ADDREF(fwd); + down->ndprl_pr = fwd; + NDPR_ADDREF(pr); + down->ndprl_up = pr; + SLIST_INSERT_HEAD(&down_head, down, ndprl_le); + } + } + lck_mtx_unlock(nd6_mutex); + + /* + * Apply routing function on prefixes; callee will free resources. + */ + nd6_prproxy_prelist_setroute(enable, + (struct nd6_prproxy_prelist_head *)&up_head, + (struct nd6_prproxy_prelist_head *)&down_head); + + VERIFY(SLIST_EMPTY(&up_head)); + VERIFY(SLIST_EMPTY(&down_head)); + + lck_mtx_unlock(&proxy6_lock); + + return (0); +} + +/* + * Called from the input path to determine whether the packet is destined + * to a proxied node; if so, mark the mbuf with MAUXF_PROXY_DST so that + * icmp6_input() knows that this is not to be delivered to socket(s). 
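+ *
+ * Hypothetical call-site sketch from the input path (variable names
+ * are illustrative only):
+ *
+ *	if (nd6_prproxy_isours(m, ip6, ro6, inifp->if_index))
+ *		ours = 1;
+ *
+ * after which icmp6_input() finds MAUXF_PROXY_DST set on the mbuf and
+ * skips socket delivery.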
+ */ +boolean_t +nd6_prproxy_isours(struct mbuf *m, struct ip6_hdr *ip6, struct route_in6 *ro6, + unsigned int ifscope) +{ + struct rtentry *rt; + boolean_t ours = FALSE; + + if (ip6->ip6_hlim != IPV6_MAXHLIM || ip6->ip6_nxt != IPPROTO_ICMPV6) + goto done; + + if (IN6_IS_ADDR_MC_NODELOCAL(&ip6->ip6_dst) || + IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) { + VERIFY(ro6 == NULL); + ours = TRUE; + goto done; + } else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + goto done; + } + + if (ro6 == NULL) + goto done; + + if ((rt = ro6->ro_rt) != NULL) + RT_LOCK(rt); + + if (rt == NULL || !(rt->rt_flags & RTF_UP) || + rt->generation_id != route_generation) { + if (rt != NULL) { + RT_UNLOCK(rt); + rtfree(rt); + rt = ro6->ro_rt = NULL; + } + + /* Caller must have ensured this condition (not srcrt) */ + VERIFY(IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, + &ro6->ro_dst.sin6_addr)); + + rtalloc_scoped_ign((struct route *)ro6, RTF_PRCLONING, ifscope); + if ((rt = ro6->ro_rt) == NULL) + goto done; + + RT_LOCK(rt); + } + + ours = (rt->rt_flags & RTF_PROXY) ? TRUE : FALSE; + RT_UNLOCK(rt); + +done: + if (ours) + m->m_pkthdr.aux_flags |= MAUXF_PROXY_DST; + + return (ours); +} + +/* + * Called when a prefix transitions between on-link and off-link. Perform + * routing (RTF_PROXY) and interface (all-multicast) related operations on + * the affected prefixes. + */ +void +nd6_prproxy_prelist_update(struct nd_prefix *pr_cur, struct nd_prefix *pr_up) +{ + SLIST_HEAD(, nd6_prproxy_prelist) up_head; + SLIST_HEAD(, nd6_prproxy_prelist) down_head; + struct nd6_prproxy_prelist *up, *down; + struct nd_prefix *pr; + struct in6_addr pr_addr; + boolean_t enable; + u_char pr_len; + + SLIST_INIT(&up_head); + SLIST_INIT(&down_head); + VERIFY(pr_cur != NULL); + + lck_mtx_assert(&proxy6_lock, LCK_MTX_ASSERT_OWNED); + + /* + * Upstream prefix. If caller did not specify one, search for one + * based on the information in current prefix. Caller is expected + * to have held an extra reference for the passed-in prefixes. + */ + lck_mtx_lock(nd6_mutex); + if (pr_up == NULL) { + NDPR_LOCK(pr_cur); + bcopy(&pr_cur->ndpr_prefix.sin6_addr, &pr_addr, + sizeof (pr_addr)); + pr_len = pr_cur->ndpr_plen; + NDPR_UNLOCK(pr_cur); + + for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + NDPR_LOCK(pr); + if (!(pr->ndpr_stateflags & NDPRF_ONLINK) || + !(pr->ndpr_stateflags & NDPRF_PRPROXY) || + pr->ndpr_plen != pr_len || + !in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, + &pr_addr, pr_len)) { + NDPR_UNLOCK(pr); + continue; + } + NDPR_UNLOCK(pr); + break; + } + + if ((pr_up = pr) == NULL) { + lck_mtx_unlock(nd6_mutex); + goto done; + } + NDPR_LOCK(pr_up); + } else { + NDPR_LOCK(pr_up); + bcopy(&pr_up->ndpr_prefix.sin6_addr, &pr_addr, + sizeof (pr_addr)); + pr_len = pr_up->ndpr_plen; + } + NDPR_LOCK_ASSERT_HELD(pr_up); + /* + * Upstream prefix could be offlink by now; therefore we cannot + * assert that NDPRF_PRPROXY is set; however, we can insist that + * it must not be a scoped prefix. + */ + VERIFY(!(pr_up->ndpr_stateflags & NDPRF_IFSCOPE)); + enable = (pr_up->ndpr_stateflags & NDPRF_PRPROXY); + NDPR_UNLOCK(pr_up); + + up = nd6_ndprl_alloc(M_WAITOK); + if (up == NULL) { + lck_mtx_unlock(nd6_mutex); + goto done; + } + + NDPR_ADDREF(pr_up); + up->ndprl_pr = pr_up; + SLIST_INSERT_HEAD(&up_head, up, ndprl_le); + + /* + * Now build a list of matching (scoped) downstream prefixes on other + * interfaces which need to be enabled/disabled accordingly. Note that + * the NDPRF_PRPROXY is never set/cleared on the downstream prefixes. 
+ */ + for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + NDPR_LOCK(pr); + if (!(pr->ndpr_stateflags & NDPRF_ONLINK) || + !(pr->ndpr_stateflags & NDPRF_IFSCOPE) || + pr->ndpr_plen != pr_len || + !in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, + &pr_addr, pr_len)) { + NDPR_UNLOCK(pr); + continue; + } + NDPR_UNLOCK(pr); + + down = nd6_ndprl_alloc(M_WAITOK); + if (down == NULL) + continue; + + NDPR_ADDREF(pr); + down->ndprl_pr = pr; + NDPR_ADDREF(pr_up); + down->ndprl_up = pr_up; + SLIST_INSERT_HEAD(&down_head, down, ndprl_le); + } + lck_mtx_unlock(nd6_mutex); + + /* + * Apply routing function on prefixes; callee will free resources. + */ + nd6_prproxy_prelist_setroute(enable, + (struct nd6_prproxy_prelist_head *)&up_head, + (struct nd6_prproxy_prelist_head *)&down_head); + +done: + VERIFY(SLIST_EMPTY(&up_head)); + VERIFY(SLIST_EMPTY(&down_head)); +} + +/* + * Given an interface address, determine whether or not the address + * is part of a proxied prefix. + */ +boolean_t +nd6_prproxy_ifaddr(struct in6_ifaddr *ia) +{ + struct nd_prefix *pr; + struct in6_addr addr, pr_mask; + u_int32_t pr_len; + boolean_t proxied = FALSE; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); + + IFA_LOCK(&ia->ia_ifa); + bcopy(&ia->ia_addr.sin6_addr, &addr, sizeof (addr)); + bcopy(&ia->ia_prefixmask.sin6_addr, &pr_mask, sizeof (pr_mask)); + pr_len = ia->ia_plen; + IFA_UNLOCK(&ia->ia_ifa); + + lck_mtx_lock(nd6_mutex); + for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + NDPR_LOCK(pr); + if ((pr->ndpr_stateflags & NDPRF_ONLINK) && + (pr->ndpr_stateflags & NDPRF_PRPROXY) && + in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, + &addr, pr_len)) { + NDPR_UNLOCK(pr); + proxied = TRUE; + break; + } + NDPR_UNLOCK(pr); + } + lck_mtx_unlock(nd6_mutex); + + return (proxied); +} + +/* + * Perform automatic proxy function with NS output. + * + * If the target address matches a global prefix obtained from a router + * advertisement received on an interface with the ND6_IFF_PROXY_PREFIXES + * flag set, then we send solicitations for the target address to all other + * interfaces where a matching prefix is currently on-link, in addition to + * the original interface.
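+ * + * Interfaces marked with IFEF_IPV6_ND6ALT are excluded from the + * cloned solicitations, and the original NS is still sent on the + * originating interface once the clones have been dispatched.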
+ */ +void +nd6_prproxy_ns_output(struct ifnet *ifp, struct in6_addr *daddr, + struct in6_addr *taddr, struct llinfo_nd6 *ln) +{ + SLIST_HEAD(, nd6_prproxy_prelist) ndprl_head; + struct nd6_prproxy_prelist *ndprl, *ndprl_tmp; + struct nd_prefix *pr, *fwd; + struct ifnet *fwd_ifp; + struct in6_addr pr_addr; + u_char pr_len; + + SLIST_INIT(&ndprl_head); + + lck_mtx_lock(nd6_mutex); + + for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + NDPR_LOCK(pr); + if (!(pr->ndpr_stateflags & NDPRF_ONLINK) || + !(pr->ndpr_stateflags & NDPRF_PRPROXY) || + !IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, + taddr, &pr->ndpr_mask)) { + NDPR_UNLOCK(pr); + continue; + } + + VERIFY(!(pr->ndpr_stateflags & NDPRF_IFSCOPE)); + bcopy(&pr->ndpr_prefix.sin6_addr, &pr_addr, sizeof (pr_addr)); + pr_len = pr->ndpr_plen; + NDPR_UNLOCK(pr); + + for (fwd = nd_prefix.lh_first; fwd; fwd = fwd->ndpr_next) { + NDPR_LOCK(fwd); + if (!(fwd->ndpr_stateflags & NDPRF_ONLINK) || + fwd->ndpr_ifp == ifp || + fwd->ndpr_plen != pr_len || + !in6_are_prefix_equal(&fwd->ndpr_prefix.sin6_addr, + &pr_addr, pr_len)) { + NDPR_UNLOCK(fwd); + continue; + } + + fwd_ifp = fwd->ndpr_ifp; + NDPR_UNLOCK(fwd); + + ndprl = nd6_ndprl_alloc(M_WAITOK); + if (ndprl == NULL) + continue; + + NDPR_ADDREF(fwd); + ndprl->ndprl_pr = fwd; + ndprl->ndprl_fwd_ifp = fwd_ifp; + + SLIST_INSERT_HEAD(&ndprl_head, ndprl, ndprl_le); + } + break; + } + + lck_mtx_unlock(nd6_mutex); + + SLIST_FOREACH_SAFE(ndprl, &ndprl_head, ndprl_le, ndprl_tmp) { + SLIST_REMOVE(&ndprl_head, ndprl, nd6_prproxy_prelist, ndprl_le); + + pr = ndprl->ndprl_pr; + fwd_ifp = ndprl->ndprl_fwd_ifp; + + if ((fwd_ifp->if_eflags & IFEF_IPV6_ND6ALT) != 0) { + NDPR_REMREF(pr); + nd6_ndprl_free(ndprl); + continue; + } + + NDPR_LOCK(pr); + if (pr->ndpr_stateflags & NDPRF_ONLINK) { + NDPR_UNLOCK(pr); + nd6log2((LOG_DEBUG, + "%s%d: Sending cloned NS who has %s on %s%d\n", + fwd_ifp->if_name, fwd_ifp->if_unit, + ip6_sprintf(taddr), ifp->if_name, + ifp->if_unit)); + + nd6_ns_output(fwd_ifp, daddr, taddr, NULL, 0); + } else { + NDPR_UNLOCK(pr); + } + NDPR_REMREF(pr); + + nd6_ndprl_free(ndprl); + } + VERIFY(SLIST_EMPTY(&ndprl_head)); + + nd6_ns_output(ifp, daddr, taddr, ln, 0); +} + +/* + * Perform automatic proxy function with NS input. + * + * If the target address matches a global prefix obtained from a router + * advertisement received on an interface with the ND6_IFF_PROXY_PREFIXES + * flag set, then we send solicitations for the target address to all other + * interfaces where a matching prefix is currently on-link. + */ +void +nd6_prproxy_ns_input(struct ifnet *ifp, struct in6_addr *saddr, + char *lladdr, int lladdrlen, struct in6_addr *daddr, struct in6_addr *taddr) +{ + SLIST_HEAD(, nd6_prproxy_prelist) ndprl_head; + struct nd6_prproxy_prelist *ndprl, *ndprl_tmp; + struct nd_prefix *pr, *fwd; + struct ifnet *fwd_ifp; + struct in6_addr pr_addr; + u_char pr_len; + boolean_t solrec = FALSE; + + SLIST_INIT(&ndprl_head); + + lck_mtx_lock(nd6_mutex); + + for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + NDPR_LOCK(pr); + if (!(pr->ndpr_stateflags & NDPRF_ONLINK) || + !(pr->ndpr_stateflags & NDPRF_PRPROXY) || + !IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, + taddr, &pr->ndpr_mask)) { + NDPR_UNLOCK(pr); + continue; + } + + VERIFY(!(pr->ndpr_stateflags & NDPRF_IFSCOPE)); + bcopy(&pr->ndpr_prefix.sin6_addr, &pr_addr, sizeof (pr_addr)); + pr_len = pr->ndpr_plen; + + /* + * If this is a NS for NUD/AR, record it so that we know + * how to forward the NA reply later on (if/when it arrives.) 
+ * Give up if we fail to save the NS info. + */ + if ((solrec = !IN6_IS_ADDR_UNSPECIFIED(saddr)) && + !nd6_solsrc_enq(pr, ifp, saddr, taddr)) { + NDPR_UNLOCK(pr); + solrec = FALSE; + break; /* bail out */ + } else { + NDPR_UNLOCK(pr); + } + + for (fwd = nd_prefix.lh_first; fwd; fwd = fwd->ndpr_next) { + NDPR_LOCK(fwd); + if (!(fwd->ndpr_stateflags & NDPRF_ONLINK) || + fwd->ndpr_ifp == ifp || + fwd->ndpr_plen != pr_len || + !in6_are_prefix_equal(&fwd->ndpr_prefix.sin6_addr, + &pr_addr, pr_len)) { + NDPR_UNLOCK(fwd); + continue; + } + + fwd_ifp = fwd->ndpr_ifp; + NDPR_UNLOCK(fwd); + + ndprl = nd6_ndprl_alloc(M_WAITOK); + if (ndprl == NULL) + continue; + + NDPR_ADDREF(fwd); + ndprl->ndprl_pr = fwd; + ndprl->ndprl_fwd_ifp = fwd_ifp; + ndprl->ndprl_sol = solrec; + + SLIST_INSERT_HEAD(&ndprl_head, ndprl, ndprl_le); + } + break; + } + + lck_mtx_unlock(nd6_mutex); + + /* + * If this is a recorded solicitation (NS for NUD/AR), create + * or update the neighbor cache entry for the soliciting node. + * Later on, when the NA reply arrives, we will need this cache + * entry in order to send the NA back to the original solicitor. + * Without a neighbor cache entry, we'd end up with an endless + * cycle of NS ping-pong between us (the proxy) and the node + * which is soliciting for the address. + */ + if (solrec) { + VERIFY(!IN6_IS_ADDR_UNSPECIFIED(saddr)); + nd6_cache_lladdr(ifp, saddr, lladdr, lladdrlen, + ND_NEIGHBOR_SOLICIT, 0); + } + + SLIST_FOREACH_SAFE(ndprl, &ndprl_head, ndprl_le, ndprl_tmp) { + SLIST_REMOVE(&ndprl_head, ndprl, nd6_prproxy_prelist, ndprl_le); + + pr = ndprl->ndprl_pr; + fwd_ifp = ndprl->ndprl_fwd_ifp; + + if ((fwd_ifp->if_eflags & IFEF_IPV6_ND6ALT) != 0) { + NDPR_REMREF(pr); + nd6_ndprl_free(ndprl); + continue; + } + + NDPR_LOCK(pr); + if (pr->ndpr_stateflags & NDPRF_ONLINK) { + NDPR_UNLOCK(pr); + nd6log2((LOG_DEBUG, + "%s%d: Forwarding NS (%s) from %s to %s who has %s " + "on %s%d\n", fwd_ifp->if_name, fwd_ifp->if_unit, + ndprl->ndprl_sol ? "NUD/AR" : "DAD", + ip6_sprintf(saddr), ip6_sprintf(daddr), + ip6_sprintf(taddr), ifp->if_name, ifp->if_unit)); + + nd6_ns_output(fwd_ifp, ndprl->ndprl_sol ? taddr : NULL, + taddr, NULL, !ndprl->ndprl_sol); + } else { + NDPR_UNLOCK(pr); + } + NDPR_REMREF(pr); + + nd6_ndprl_free(ndprl); + } + VERIFY(SLIST_EMPTY(&ndprl_head)); +} + +/* + * Perform automatic proxy function with NA input. + * + * If the target address matches a global prefix obtained from a router + * advertisement received on an interface with the ND6_IFF_PROXY_PREFIXES flag + * set, then we send neighbor advertisements for the target address on all + * other interfaces where a matching prefix is currently on link.
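+ * + * For a unicast NA (NUD/AR), the solicitation record saved by + * nd6_prproxy_ns_input() supplies the original solicitor's address + * and interface; for a multicast NA (DAD), the advertisement is + * instead fanned out to every other interface with a matching + * on-link prefix.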
+ */ +void +nd6_prproxy_na_input(struct ifnet *ifp, struct in6_addr *saddr, + struct in6_addr *daddr0, struct in6_addr *taddr, int flags) +{ + SLIST_HEAD(, nd6_prproxy_prelist) ndprl_head; + struct nd6_prproxy_prelist *ndprl, *ndprl_tmp; + struct nd_prefix *pr; + struct ifnet *fwd_ifp; + struct in6_addr daddr; + + SLIST_INIT(&ndprl_head); + + + lck_mtx_lock(nd6_mutex); + + for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + NDPR_LOCK(pr); + if (!(pr->ndpr_stateflags & NDPRF_ONLINK) || + !(pr->ndpr_stateflags & NDPRF_PRPROXY) || + !IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, + taddr, &pr->ndpr_mask)) { + NDPR_UNLOCK(pr); + continue; + } + + VERIFY(!(pr->ndpr_stateflags & NDPRF_IFSCOPE)); + /* + * If this is a NA for NUD, see if there is a record created + * for the corresponding NS; upon success, we get back the + * interface where the NS originally arrived on, as well as + * the soliciting node's address. Give up if we can't find it. + */ + if (!IN6_IS_ADDR_MULTICAST(daddr0)) { + fwd_ifp = NULL; + bzero(&daddr, sizeof (daddr)); + if (!nd6_solsrc_deq(pr, taddr, &daddr, &fwd_ifp)) { + NDPR_UNLOCK(pr); + break; /* bail out */ + } + VERIFY(!IN6_IS_ADDR_UNSPECIFIED(&daddr) && fwd_ifp); + NDPR_UNLOCK(pr); + + ndprl = nd6_ndprl_alloc(M_WAITOK); + if (ndprl == NULL) + break; /* bail out */ + + ndprl->ndprl_fwd_ifp = fwd_ifp; + ndprl->ndprl_sol = TRUE; + ndprl->ndprl_sol_saddr = *(&daddr); + + SLIST_INSERT_HEAD(&ndprl_head, ndprl, ndprl_le); + } else { + struct nd_prefix *fwd; + struct in6_addr pr_addr; + u_char pr_len; + + bcopy(&pr->ndpr_prefix.sin6_addr, &pr_addr, + sizeof (pr_addr)); + pr_len = pr->ndpr_plen; + NDPR_UNLOCK(pr); + + for (fwd = nd_prefix.lh_first; fwd; + fwd = fwd->ndpr_next) { + NDPR_LOCK(fwd); + if (!(fwd->ndpr_stateflags & NDPRF_ONLINK) || + fwd->ndpr_ifp == ifp || + fwd->ndpr_plen != pr_len || + !in6_are_prefix_equal( + &fwd->ndpr_prefix.sin6_addr, + &pr_addr, pr_len)) { + NDPR_UNLOCK(fwd); + continue; + } + + fwd_ifp = fwd->ndpr_ifp; + NDPR_UNLOCK(fwd); + + ndprl = nd6_ndprl_alloc(M_WAITOK); + if (ndprl == NULL) + continue; + + NDPR_ADDREF(fwd); + ndprl->ndprl_pr = fwd; + ndprl->ndprl_fwd_ifp = fwd_ifp; + + SLIST_INSERT_HEAD(&ndprl_head, ndprl, ndprl_le); + } + } + break; + } + + lck_mtx_unlock(nd6_mutex); + + SLIST_FOREACH_SAFE(ndprl, &ndprl_head, ndprl_le, ndprl_tmp) { + boolean_t send_na; + + SLIST_REMOVE(&ndprl_head, ndprl, nd6_prproxy_prelist, ndprl_le); + + pr = ndprl->ndprl_pr; + fwd_ifp = ndprl->ndprl_fwd_ifp; + + if (ndprl->ndprl_sol) { + VERIFY(pr == NULL); + daddr = *(&ndprl->ndprl_sol_saddr); + VERIFY(!IN6_IS_ADDR_UNSPECIFIED(&daddr)); + send_na = (in6_setscope(&daddr, fwd_ifp, NULL) == 0); + } else { + VERIFY(pr != NULL); + daddr = *daddr0; + NDPR_LOCK(pr); + send_na = ((pr->ndpr_stateflags & NDPRF_ONLINK) && + in6_setscope(&daddr, fwd_ifp, NULL) == 0); + NDPR_UNLOCK(pr); + } + + if (send_na) { + if (!ndprl->ndprl_sol) { + nd6log2((LOG_DEBUG, + "%s%d: Forwarding NA (DAD) from %s to %s " + "tgt is %s on %s%d\n", + fwd_ifp->if_name, fwd_ifp->if_unit, + ip6_sprintf(saddr), ip6_sprintf(&daddr), + ip6_sprintf(taddr), ifp->if_name, + ifp->if_unit)); + } else { + nd6log2((LOG_DEBUG, + "%s%d: Forwarding NA (NUD/AR) from %s to " + "%s (was %s) tgt is %s on %s%d\n", + fwd_ifp->if_name, fwd_ifp->if_unit, + ip6_sprintf(saddr), ip6_sprintf(&daddr), + ip6_sprintf(daddr0), ip6_sprintf(taddr), + ifp->if_name, ifp->if_unit)); + } + + nd6_na_output(fwd_ifp, &daddr, taddr, flags, 1, NULL); + } + + if (pr != NULL) + NDPR_REMREF(pr); + + nd6_ndprl_free(ndprl); + } + 
VERIFY(SLIST_EMPTY(&ndprl_head)); +} + +static struct nd6_prproxy_solsrc * +nd6_solsrc_alloc(int how) +{ + struct nd6_prproxy_solsrc *ssrc; + + ssrc = (how == M_WAITOK) ? zalloc(solsrc_zone) : + zalloc_noblock(solsrc_zone); + if (ssrc != NULL) + bzero(ssrc, solsrc_size); + + return (ssrc); +} + +static void +nd6_solsrc_free(struct nd6_prproxy_solsrc *ssrc) +{ + zfree(solsrc_zone, ssrc); +} + +static void +nd6_prproxy_sols_purge(struct nd_prefix *pr, u_int64_t max_stgt) +{ + struct nd6_prproxy_soltgt *soltgt, *tmp; + u_int64_t expire = (max_stgt > 0) ? net_uptime() : 0; + + NDPR_LOCK_ASSERT_HELD(pr); + + /* Either trim all or those that have expired or are idle */ + RB_FOREACH_SAFE(soltgt, prproxy_sols_tree, + &pr->ndpr_prproxy_sols, tmp) { + VERIFY(pr->ndpr_prproxy_sols_cnt > 0); + if (expire == 0 || soltgt->soltgt_expire <= expire || + soltgt->soltgt_cnt == 0) { + pr->ndpr_prproxy_sols_cnt--; + RB_REMOVE(prproxy_sols_tree, + &pr->ndpr_prproxy_sols, soltgt); + nd6_soltgt_free(soltgt); + } + } + + if (max_stgt == 0 || pr->ndpr_prproxy_sols_cnt < max_stgt) { + VERIFY(max_stgt != 0 || (pr->ndpr_prproxy_sols_cnt == 0 && + RB_EMPTY(&pr->ndpr_prproxy_sols))); + return; + } + + /* Brute force; mercilessly evict entries until we are under limit */ + RB_FOREACH_SAFE(soltgt, prproxy_sols_tree, + &pr->ndpr_prproxy_sols, tmp) { + VERIFY(pr->ndpr_prproxy_sols_cnt > 0); + pr->ndpr_prproxy_sols_cnt--; + RB_REMOVE(prproxy_sols_tree, &pr->ndpr_prproxy_sols, soltgt); + nd6_soltgt_free(soltgt); + if (pr->ndpr_prproxy_sols_cnt < max_stgt) + break; + } +} + +/* + * Purges all solicitation records on a given prefix. + * Caller is responsible for holding prefix lock. + */ +void +nd6_prproxy_sols_reap(struct nd_prefix *pr) +{ + nd6_prproxy_sols_purge(pr, 0); +} + +/* + * Purges expired or idle solicitation records on a given prefix. + * Caller is responsible for holding prefix lock. + */ +void +nd6_prproxy_sols_prune(struct nd_prefix *pr, u_int32_t max_stgt) +{ + nd6_prproxy_sols_purge(pr, max_stgt); +} + +/* + * Enqueue a solicitation record in the target record of a prefix.
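+ * + * The per-prefix solicitation state is a two-level structure: an + * RB tree (ndpr_prproxy_sols) keyed by target address and capped + * at nd6_max_tgt_sols entries, where each node carries a tail + * queue of soliciting sources capped at nd6_max_src_sols entries.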
+ */ +static boolean_t +nd6_solsrc_enq(struct nd_prefix *pr, struct ifnet *ifp, + struct in6_addr *saddr, struct in6_addr *taddr) +{ + struct nd6_prproxy_soltgt find, *soltgt; + struct nd6_prproxy_solsrc *ssrc; + u_int32_t max_stgt = nd6_max_tgt_sols; + u_int32_t max_ssrc = nd6_max_src_sols; + + NDPR_LOCK_ASSERT_HELD(pr); + VERIFY(!(pr->ndpr_stateflags & NDPRF_IFSCOPE)); + VERIFY((pr->ndpr_stateflags & (NDPRF_ONLINK|NDPRF_PRPROXY)) == + (NDPRF_ONLINK|NDPRF_PRPROXY)); + VERIFY(!IN6_IS_ADDR_UNSPECIFIED(saddr)); + + ssrc = nd6_solsrc_alloc(M_WAITOK); + if (ssrc == NULL) + return (FALSE); + + ssrc->solsrc_saddr = *saddr; + ssrc->solsrc_ifp = ifp; + + find.soltgt_key.taddr = *taddr; /* search key */ + + soltgt = RB_FIND(prproxy_sols_tree, &pr->ndpr_prproxy_sols, &find); + if (soltgt == NULL) { + if (max_stgt != 0 && pr->ndpr_prproxy_sols_cnt >= max_stgt) { + VERIFY(!RB_EMPTY(&pr->ndpr_prproxy_sols)); + nd6_prproxy_sols_prune(pr, max_stgt); + VERIFY(pr->ndpr_prproxy_sols_cnt < max_stgt); + } + + soltgt = nd6_soltgt_alloc(M_WAITOK); + if (soltgt == NULL) { + nd6_solsrc_free(ssrc); + return (FALSE); + } + + soltgt->soltgt_key.taddr = *taddr; + VERIFY(soltgt->soltgt_cnt == 0); + VERIFY(TAILQ_EMPTY(&soltgt->soltgt_q)); + + pr->ndpr_prproxy_sols_cnt++; + VERIFY(pr->ndpr_prproxy_sols_cnt != 0); + RB_INSERT(prproxy_sols_tree, &pr->ndpr_prproxy_sols, soltgt); + } + + if (max_ssrc != 0 && soltgt->soltgt_cnt >= max_ssrc) { + VERIFY(!TAILQ_EMPTY(&soltgt->soltgt_q)); + nd6_soltgt_prune(soltgt, max_ssrc); + VERIFY(soltgt->soltgt_cnt < max_ssrc); + } + + soltgt->soltgt_cnt++; + VERIFY(soltgt->soltgt_cnt != 0); + TAILQ_INSERT_TAIL(&soltgt->soltgt_q, ssrc, solsrc_tqe); + if (soltgt->soltgt_cnt == 1) + soltgt->soltgt_expire = net_uptime() + ND6_TGT_SOLS_EXPIRE; + + return (TRUE); +} + +/* + * Dequeue a solicitation record from a target record of a prefix. + */ +static boolean_t +nd6_solsrc_deq(struct nd_prefix *pr, struct in6_addr *taddr, + struct in6_addr *daddr, struct ifnet **ifp) +{ + struct nd6_prproxy_soltgt find, *soltgt; + struct nd6_prproxy_solsrc *ssrc; + + NDPR_LOCK_ASSERT_HELD(pr); + VERIFY(!(pr->ndpr_stateflags & NDPRF_IFSCOPE)); + VERIFY((pr->ndpr_stateflags & (NDPRF_ONLINK|NDPRF_PRPROXY)) == + (NDPRF_ONLINK|NDPRF_PRPROXY)); + + bzero(daddr, sizeof (*daddr)); + *ifp = NULL; + + find.soltgt_key.taddr = *taddr; /* search key */ + + soltgt = RB_FIND(prproxy_sols_tree, &pr->ndpr_prproxy_sols, &find); + if (soltgt == NULL || soltgt->soltgt_cnt == 0) { + VERIFY(soltgt == NULL || TAILQ_EMPTY(&soltgt->soltgt_q)); + return (FALSE); + } + + VERIFY(soltgt->soltgt_cnt != 0); + --soltgt->soltgt_cnt; + ssrc = TAILQ_FIRST(&soltgt->soltgt_q); + VERIFY(ssrc != NULL); + TAILQ_REMOVE(&soltgt->soltgt_q, ssrc, solsrc_tqe); + *daddr = *(&ssrc->solsrc_saddr); + *ifp = ssrc->solsrc_ifp; + nd6_solsrc_free(ssrc); + + return (TRUE); +} + +static struct nd6_prproxy_soltgt * +nd6_soltgt_alloc(int how) +{ + struct nd6_prproxy_soltgt *soltgt; + + soltgt = (how == M_WAITOK) ? 
zalloc(soltgt_zone) : + zalloc_noblock(soltgt_zone); + if (soltgt != NULL) { + bzero(soltgt, soltgt_size); + TAILQ_INIT(&soltgt->soltgt_q); + } + return (soltgt); +} + +static void +nd6_soltgt_free(struct nd6_prproxy_soltgt *soltgt) +{ + struct nd6_prproxy_solsrc *ssrc, *tssrc; + + TAILQ_FOREACH_SAFE(ssrc, &soltgt->soltgt_q, solsrc_tqe, tssrc) { + VERIFY(soltgt->soltgt_cnt > 0); + soltgt->soltgt_cnt--; + TAILQ_REMOVE(&soltgt->soltgt_q, ssrc, solsrc_tqe); + nd6_solsrc_free(ssrc); + } + + VERIFY(soltgt->soltgt_cnt == 0); + VERIFY(TAILQ_EMPTY(&soltgt->soltgt_q)); + + zfree(soltgt_zone, soltgt); +} + +static void +nd6_soltgt_prune(struct nd6_prproxy_soltgt *soltgt, u_int32_t max_ssrc) +{ + while (soltgt->soltgt_cnt >= max_ssrc) { + struct nd6_prproxy_solsrc *ssrc; + + VERIFY(soltgt->soltgt_cnt != 0); + --soltgt->soltgt_cnt; + ssrc = TAILQ_FIRST(&soltgt->soltgt_q); + VERIFY(ssrc != NULL); + TAILQ_REMOVE(&soltgt->soltgt_q, ssrc, solsrc_tqe); + nd6_solsrc_free(ssrc); + } +} + +/* + * Solicited target tree comparison function. + * + * An ordered predicate is necessary; bcmp() is not documented to return + * an indication of order, memcmp() is, and is an ISO C99 requirement. + */ +static __inline int +soltgt_cmp(const struct nd6_prproxy_soltgt *a, + const struct nd6_prproxy_soltgt *b) +{ + return (memcmp(&a->soltgt_key, &b->soltgt_key, sizeof (a->soltgt_key))); +} diff --git a/bsd/netinet6/nd6_rtr.c b/bsd/netinet6/nd6_rtr.c index 2e5c5eae5..34bfb18a6 100644 --- a/bsd/netinet6/nd6_rtr.c +++ b/bsd/netinet6/nd6_rtr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2011 Apple Inc. All rights reserved. + * Copyright (c) 2003-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -93,8 +93,8 @@ #include -#define SDL(s) ((struct sockaddr_dl *)s) - +static void defrouter_addreq(struct nd_defrouter *, boolean_t); +static void defrouter_delreq(struct nd_defrouter *); static struct nd_defrouter *defrtrlist_update_common(struct nd_defrouter *, boolean_t); static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *); @@ -102,8 +102,6 @@ static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *); static struct in6_ifaddr *in6_ifadd(struct nd_prefix *, int); static void defrtrlist_sync(struct ifnet *); -static void defrouter_select_common(struct ifnet *, int); - static struct nd_pfxrouter *pfxrtr_lookup(struct nd_prefix *, struct nd_defrouter *); static void pfxrtr_add(struct nd_prefix *, struct nd_defrouter *); @@ -280,8 +278,11 @@ nd6_rs_input( int lladdrlen = 0; union nd_opts ndopts; + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + /* If I'm not a router, ignore it. */ - if (ip6_accept_rtadv != 0 || (ifp->if_eflags & IFEF_ACCEPT_RTADVD) || ip6_forwarding != 1) + if (!ip6_forwarding || !(ifp->if_eflags & IFEF_IPV6_ROUTER)) goto freeit; /* Sanity checks */ @@ -297,10 +298,10 @@ nd6_rs_input( * Don't update the neighbor cache, if src = :: or a non-neighbor. * The former case indicates that the src has no IP address assigned * yet. See nd6_ns_input() for the latter case. 
- */ - if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) + */ + if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { goto freeit; - else { + } else { struct sockaddr_in6 src_sa6; bzero(&src_sa6, sizeof(src_sa6)); @@ -368,7 +369,7 @@ nd6_rs_input( void nd6_ra_input( struct mbuf *m, - int off, + int off, int icmp6len) { struct ifnet *ifp = m->m_pkthdr.rcvif; @@ -380,12 +381,38 @@ nd6_ra_input( union nd_opts ndopts; struct nd_defrouter *dr = NULL; struct timeval timenow; + u_int32_t mtu = 0; + char *lladdr = NULL; + u_int32_t lladdrlen = 0; + struct nd_prefix_list *nd_prefix_list_head = NULL; + u_int32_t nd_prefix_list_length = 0; + struct in6_ifaddr *ia6 = NULL; + + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); getmicrotime(&timenow); - if (ip6_accept_rtadv == 0 && ((ifp->if_eflags & IFEF_ACCEPT_RTADVD) == 0)) + /* + * Discard RA unless IFEF_ACCEPT_RTADV is set (as host), or when + * IFEF_IPV6_ROUTER is set (as router) but the RA is not locally + * generated. For convenience, we allow locally generated (rtadvd) + * RAs to be processed on the advertising interface, as a router. + * + * Note that we don't test against ip6_forwarding as we could be + * both a host and a router on different interfaces, hence the + * check against the per-interface flags. + */ + if (!(ifp->if_eflags & (IFEF_ACCEPT_RTADV | IFEF_IPV6_ROUTER)) || + ((ifp->if_eflags & IFEF_IPV6_ROUTER) && + (ia6 = ifa_foraddr6(&saddr6)) == NULL)) goto freeit; + if (ia6 != NULL) { + IFA_REMREF(&ia6->ia_ifa); + ia6 = NULL; + } + if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_ra_input: invalid hlim (%d) from %s to %s on %s\n", @@ -435,6 +462,8 @@ nd6_ra_input( goto freeit; } ndi = &nd_ifinfo[ifp->if_index]; + VERIFY(ndi->initialized); + lck_mtx_lock(&ndi->lock); bzero(&dr0, sizeof (dr0)); dr0.rtaddr = saddr6; dr0.flags = nd_ra->nd_ra_flags_reserved; @@ -455,6 +484,7 @@ nd6_ra_input( ndi->retrans = ntohl(nd_ra->nd_ra_retransmit); if (nd_ra->nd_ra_curhoplimit) ndi->chlim = nd_ra->nd_ra_curhoplimit; + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); ndi = NULL; lck_mtx_lock(nd6_mutex); @@ -523,19 +553,22 @@ nd6_ra_input( /* * Exceptions to stateless autoconfiguration processing: * + nd6_accept_6to4 == 0 && address has 6to4 prefix - * + ip6_only_allow_rfc4193_prefix != 0 && address not RFC 4193 + * + ip6_only_allow_rfc4193_prefix != 0 && + * address not RFC 4193 */ if (ip6_only_allow_rfc4193_prefix && !IN6_IS_ADDR_UNIQUE_LOCAL(&pi->nd_opt_pi_prefix)) { nd6log((LOG_INFO, - "nd6_ra_input: no SLAAC on prefix %s [not RFC 4193]\n", + "nd6_ra_input: no SLAAC on prefix %s " + "[not RFC 4193]\n", ip6_sprintf(&pi->nd_opt_pi_prefix))); pr.ndpr_raf_auto = 0; } else if (!nd6_accept_6to4 && IN6_IS_ADDR_6TO4(&pi->nd_opt_pi_prefix)) { nd6log((LOG_INFO, - "nd6_ra_input: no SLAAC on prefix %s [6to4]\n", + "nd6_ra_input: no SLAAC on prefix %s " + "[6to4]\n", ip6_sprintf(&pi->nd_opt_pi_prefix))); pr.ndpr_raf_auto = 0; } @@ -549,6 +582,37 @@ nd6_ra_input( } (void)prelist_update(&pr, dr, m, mcast); lck_mtx_destroy(&pr.ndpr_lock, ifa_mtx_grp); + + /* + * We have to copy the values out after the + * prelist_update call since some of these values won't + * be properly set until after the router advertisement + * updating can vet the values. 
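+ * + * The copies are queued on nd_prefix_list_head and later handed + * to nd6_post_msg(), so that the KEV_ND6_RA event reflects the + * vetted values rather than the raw option contents.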
+ */ + struct nd_prefix_list *prfl = NULL; + MALLOC(prfl, struct nd_prefix_list *, sizeof (*prfl), + M_TEMP, M_WAITOK | M_ZERO); + + if (prfl == NULL) { + log(LOG_DEBUG, "%s: unable to MALLOC RA prefix " + "structure\n", __func__); + continue; + } + + bcopy(&pr.ndpr_prefix, &prfl->pr.ndpr_prefix, + sizeof (prfl->pr.ndpr_prefix)); + prfl->pr.ndpr_raf = pr.ndpr_raf; + prfl->pr.ndpr_plen = pr.ndpr_plen; + prfl->pr.ndpr_vltime = pr.ndpr_vltime; + prfl->pr.ndpr_pltime = pr.ndpr_pltime; + prfl->pr.ndpr_expire = pr.ndpr_expire; + prfl->pr.ndpr_stateflags = pr.ndpr_stateflags; + prfl->pr.ndpr_addrcnt = pr.ndpr_addrcnt; + prfl->pr.ndpr_ifp = pr.ndpr_ifp; + + prfl->next = nd_prefix_list_head; + nd_prefix_list_head = prfl; + nd_prefix_list_length++; } } @@ -556,7 +620,7 @@ nd6_ra_input( * MTU */ if (ndopts.nd_opts_mtu && ndopts.nd_opts_mtu->nd_opt_mtu_len == 1) { - u_int32_t mtu = ntohl(ndopts.nd_opts_mtu->nd_opt_mtu_mtu); + mtu = ntohl(ndopts.nd_opts_mtu->nd_opt_mtu_mtu); /* lower bound */ if (mtu < IPV6_MMTU) { @@ -572,12 +636,15 @@ nd6_ra_input( goto freeit; } ndi = &nd_ifinfo[ifp->if_index]; + VERIFY(ndi->initialized); + lck_mtx_lock(&ndi->lock); /* upper bound */ if (ndi->maxmtu) { if (mtu <= ndi->maxmtu) { int change = (ndi->linkmtu != mtu); ndi->linkmtu = mtu; + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); if (change) /* in6_maxmtu may change */ in6_setmaxmtu(); @@ -587,9 +654,11 @@ nd6_ra_input( "exceeds maxmtu %d, ignoring\n", mtu, ip6_sprintf(&ip6->ip6_src), ndi->maxmtu)); + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); } } else { + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); nd6log((LOG_INFO, "nd6_ra_input: mtu option " "mtu=%d sent from %s; maxmtu unknown, " @@ -600,14 +669,10 @@ nd6_ra_input( } skip: - + /* * Source link layer address */ - { - char *lladdr = NULL; - int lladdrlen = 0; - if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; @@ -621,7 +686,12 @@ nd6_ra_input( goto bad; } - nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_ROUTER_ADVERT, 0); + nd6_cache_lladdr(ifp, &saddr6, lladdr, (int)lladdrlen, + ND_ROUTER_ADVERT, 0); + + /* Post message */ + nd6_post_msg(KEV_ND6_RA, nd_prefix_list_head, nd_prefix_list_length, + mtu, lladdr, lladdrlen); /* * Installing a link-layer address might change the state of the @@ -631,12 +701,20 @@ nd6_ra_input( lck_mtx_lock(nd6_mutex); pfxlist_onlink_check(); lck_mtx_unlock(nd6_mutex); - } freeit: m_freem(m); if (dr) NDDR_REMREF(dr); + + { + struct nd_prefix_list *prfl = NULL; + while ((prfl = nd_prefix_list_head) != NULL) { + nd_prefix_list_head = prfl->next; + FREE(prfl, M_TEMP); + } + } + return; bad: @@ -676,7 +754,7 @@ nd6_rtmsg(cmd, rt) ifnet_lock_done(ifp); } -void +static void defrouter_addreq(struct nd_defrouter *new, boolean_t scoped) { struct sockaddr_in6 def, mask, gate; @@ -689,6 +767,14 @@ defrouter_addreq(struct nd_defrouter *new, boolean_t scoped) if (new->stateflags & NDDRF_INSTALLED) return; + if (new->ifp->if_eflags & IFEF_IPV6_ROUTER) { + nd6log2((LOG_INFO, "%s: ignoring router %s, scoped=%d, " + "static=%d on advertising interface\n", if_name(new->ifp), + ip6_sprintf(&new->rtaddr), scoped, + (new->stateflags & NDDRF_STATIC) ? 1 : 0)); + return; + } + nd6log2((LOG_INFO, "%s: adding default router %s, scoped=%d, " "static=%d\n", if_name(new->ifp), ip6_sprintf(&new->rtaddr), scoped, (new->stateflags & NDDRF_STATIC) ? 
1 : 0)); @@ -754,7 +840,7 @@ defrouter_lookup( * This is just a subroutine function for defrouter_select(), and should * not be called from anywhere else. */ -void +static void defrouter_delreq(struct nd_defrouter *dr) { struct sockaddr_in6 def, mask, gate; @@ -861,29 +947,39 @@ defrouter_reset(void) int defrtrlist_ioctl(u_long cmd, caddr_t data) { - struct in6_defrouter_32 *r_32 = (struct in6_defrouter_32 *)data; - struct in6_defrouter_64 *r_64 = (struct in6_defrouter_64 *)data; struct nd_defrouter dr0; unsigned int ifindex; struct ifnet *dr_ifp; int error = 0, add = 0; switch (cmd) { - case SIOCDRADD_IN6_32: - case SIOCDRADD_IN6_64: + case SIOCDRADD_IN6_32: /* struct in6_defrouter_32 */ + case SIOCDRADD_IN6_64: /* struct in6_defrouter_64 */ ++add; /* FALLTHRU */ - case SIOCDRDEL_IN6_32: - case SIOCDRDEL_IN6_64: + case SIOCDRDEL_IN6_32: /* struct in6_defrouter_32 */ + case SIOCDRDEL_IN6_64: /* struct in6_defrouter_64 */ bzero(&dr0, sizeof (dr0)); if (cmd == SIOCDRADD_IN6_64 || cmd == SIOCDRDEL_IN6_64) { - dr0.rtaddr = r_64->rtaddr.sin6_addr; + struct in6_defrouter_64 *r_64 = + (struct in6_defrouter_64 *)(void *)data; + u_int16_t i; + + bcopy(&r_64->rtaddr.sin6_addr, &dr0.rtaddr, + sizeof (dr0.rtaddr)); dr0.flags = r_64->flags; - ifindex = r_64->if_index; + bcopy(&r_64->if_index, &i, sizeof (i)); + ifindex = i; } else { - dr0.rtaddr = r_32->rtaddr.sin6_addr; + struct in6_defrouter_32 *r_32 = + (struct in6_defrouter_32 *)(void *)data; + u_int16_t i; + + bcopy(&r_32->rtaddr.sin6_addr, &dr0.rtaddr, + sizeof (dr0.rtaddr)); dr0.flags = r_32->flags; - ifindex = r_32->if_index; + bcopy(&r_32->if_index, &i, sizeof (i)); + ifindex = i; } ifnet_head_lock_shared(); /* Don't need to check is ifindex is < 0 since it's unsigned */ @@ -937,8 +1033,7 @@ defrtrlist_del(struct nd_defrouter *dr) * Flush all the routing table entries that use the router * as a next hop. */ - if (!ip6_forwarding && - (ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { + if (ip6_doscopedroute || !ip6_forwarding) { /* above is a good condition? */ NDDR_ADDREF(dr); lck_mtx_unlock(nd6_mutex); @@ -947,7 +1042,7 @@ defrtrlist_del(struct nd_defrouter *dr) NDDR_REMREF(dr); } - if (dr == TAILQ_FIRST(&nd_defrouter)) + if (!ip6_doscopedroute && dr == TAILQ_FIRST(&nd_defrouter)) deldr = dr; /* The router is primary. */ TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); @@ -986,18 +1081,17 @@ defrtrlist_del(struct nd_defrouter *dr) * Routing is enabled, always try to pick another eligible router * on this interface. */ - if ((deldr || ip6_doscopedroute) && !ip6_forwarding && - (ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD))) + if (deldr || ip6_doscopedroute) defrouter_select(ifp); lck_rw_lock_shared(nd_if_rwlock); if (ifp->if_index < nd_ifinfo_indexlim) { struct nd_ifinfo *ndi = &nd_ifinfo[ifp->if_index]; - atomic_add_32(&ndi->ndefrouters, -1); - if (ndi->ndefrouters < 0) { - log(LOG_WARNING, "defrtrlist_del: negative " - "count on %s\n", if_name(ifp)); - } + VERIFY(ndi->initialized); + lck_mtx_lock(&ndi->lock); + VERIFY(ndi->ndefrouters > 0); + ndi->ndefrouters--; + lck_mtx_unlock(&ndi->lock); } lck_rw_done(nd_if_rwlock); @@ -1084,8 +1178,7 @@ rtpref(struct nd_defrouter *dr) } /* - * Default Router Selection according to Section 6.3.6 of RFC 2461 and - * draft-ietf-ipngwg-router-selection: + * Default Router Selection according to Section 6.3.6 of RFC 2461 and RFC 4191: * * 1) Routers that are reachable or probably reachable should be preferred. 
* If we have more than one (probably) reachable router, prefer ones @@ -1114,9 +1207,10 @@ rtpref(struct nd_defrouter *dr) * Since the code below covers both with and without router preference cases, * we do not need to classify the cases by ifdef. */ -static void -defrouter_select_common(struct ifnet *ifp, int ignore) +void +defrouter_select(struct ifnet *ifp) { +#pragma unused(ifp) struct nd_defrouter *dr, *selected_dr = NULL, *installed_dr = NULL; struct nd_defrouter *installed_dr0 = NULL; struct rtentry *rt = NULL; @@ -1127,16 +1221,17 @@ defrouter_select_common(struct ifnet *ifp, int ignore) lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); /* - * This function should be called only when acting as an autoconfigured - * host. Although the remaining part of this function is not effective - * if the node is not an autoconfigured host, we explicitly exclude - * such cases here for safety. + * We no longer install (default) interface route; only prefix routes + * are installed as interface routes. Therefore, there is no harm in + * going through this routine even if a default interface is specified, + * which happens when Scoped Routing is enabled. But for efficiency, + * we fall back to the original KAME logic when Scoped Routing is + * not in effect. */ - if (ip6_forwarding || (!ignore && !ip6_accept_rtadv && - !(ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { + if (ip6_forwarding && !ip6_doscopedroute) { nd6log((LOG_WARNING, - "defrouter_select: called unexpectedly (forwarding=%d, " - "accept_rtadv=%d)\n", ip6_forwarding, ip6_accept_rtadv)); + "defrouter_select: called unexpectedly (forwarding=%d)\n", + ip6_forwarding)); return; } @@ -1170,20 +1265,31 @@ defrouter_select_common(struct ifnet *ifp, int ignore) */ for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { - boolean_t reachable; + boolean_t reachable, advrouter; + struct in6_addr rtaddr; + struct ifnet *drifp; + struct nd_defrouter *drrele; - /* Callee returns a locked route upon success */ + drrele = NULL; reachable = FALSE; - NDDR_ADDREF(dr); /* for this for loop */ + NDDR_LOCK(dr); + rtaddr = *(&dr->rtaddr); + drifp = dr->ifp; + advrouter = (drifp != NULL && + (drifp->if_eflags & IFEF_IPV6_ROUTER)); + NDDR_ADDREF_LOCKED(dr); /* for this for loop */ + NDDR_UNLOCK(dr); + lck_mtx_unlock(nd6_mutex); - if ((rt = nd6_lookup(&dr->rtaddr, 0, dr->ifp, 0)) != NULL) { + /* Callee returns a locked route upon success */ + if ((rt = nd6_lookup(&rtaddr, 0, drifp, 0)) != NULL) { RT_LOCK_ASSERT_HELD(rt); if ((ln = rt->rt_llinfo) != NULL && ND6_IS_LLINFO_PROBREACH(ln)) { reachable = TRUE; if (selected_dr == NULL && (!ip6_doscopedroute || - dr->ifp == nd6_defifp)) { + (drifp == nd6_defifp && !advrouter))) { selected_dr = dr; NDDR_ADDREF(selected_dr); } @@ -1195,15 +1301,19 @@ defrouter_select_common(struct ifnet *ifp, int ignore) lck_mtx_lock(nd6_mutex); /* Handle case (b) */ - if (ip6_doscopedroute && dr->ifp == nd6_defifp && + NDDR_LOCK(dr); + if (ip6_doscopedroute && drifp == nd6_defifp && !advrouter && (selected_dr == NULL || rtpref(dr) > rtpref(selected_dr) || (rtpref(dr) == rtpref(selected_dr) && (dr->stateflags & NDDRF_STATIC) && !(selected_dr->stateflags & NDDRF_STATIC)))) { - if (selected_dr) - NDDR_REMREF(selected_dr); + if (selected_dr) { + /* Release it later on */ + VERIFY(drrele == NULL); + drrele = selected_dr; + } selected_dr = dr; - NDDR_ADDREF(selected_dr); + NDDR_ADDREF_LOCKED(selected_dr); } if (!(dr->stateflags & NDDRF_INSTALLED)) { @@ -1212,16 +1322,20 @@ defrouter_select_common(struct ifnet *ifp, int ignore) 
* reachable, try to install it later on below. * If it's static, try to install it anyway. */ - if (reachable || (dr->stateflags & NDDRF_STATIC)) { + if (!advrouter && (reachable || + (dr->stateflags & NDDRF_STATIC))) { dr->genid = -1; ++update; nd6log2((LOG_INFO, "%s: possible router %s, " - "scoped=%d, static=%d\n", if_name(dr->ifp), - ip6_sprintf(&dr->rtaddr), + "scoped=%d, static=%d\n", if_name(drifp), + ip6_sprintf(&rtaddr), (dr->stateflags & NDDRF_IFSCOPE) ? 1 : 0, (dr->stateflags & NDDRF_STATIC) ? 1 : 0)); } + NDDR_UNLOCK(dr); NDDR_REMREF(dr); /* for this for loop */ + if (drrele != NULL) + NDDR_REMREF(drrele); continue; } @@ -1229,7 +1343,7 @@ defrouter_select_common(struct ifnet *ifp, int ignore) if (!ip6_doscopedroute || !(dr->stateflags & NDDRF_IFSCOPE)) { if (installed_dr == NULL) { installed_dr = dr; - NDDR_ADDREF(installed_dr); + NDDR_ADDREF_LOCKED(installed_dr); } else { /* this should not happen; warn for diagnosis */ log(LOG_ERR, "defrouter_select: more than one " @@ -1237,7 +1351,10 @@ defrouter_select_common(struct ifnet *ifp, int ignore) ip6_doscopedroute ? "non-scoped" : ""); } } + NDDR_UNLOCK(dr); NDDR_REMREF(dr); /* for this for loop */ + if (drrele != NULL) + NDDR_REMREF(drrele); } /* If none was selected, use the currently installed one */ @@ -1286,7 +1403,8 @@ defrouter_select_common(struct ifnet *ifp, int ignore) } /* If none so far, schedule it to be installed below */ - if (_dr == NULL) { + if (_dr == NULL && dr->ifp != NULL && + !(dr->ifp->if_eflags & IFEF_IPV6_ROUTER)) { dr->genid = -1; ++update; nd6log2((LOG_INFO, "%s: possible router %s, " @@ -1407,7 +1525,7 @@ defrouter_select_common(struct ifnet *ifp, int ignore) if_name(dr->ifp), ip6_sprintf(&dr->rtaddr))); } if (!ip6_doscopedroute && installed_dr != dr) { - /* + /* * No need to ADDREF dr because at this point * dr points to selected_dr, which already holds * a reference. 
@@ -1588,18 +1706,12 @@ out: } } -void -defrouter_select(struct ifnet *ifp) -{ - return (defrouter_select_common(ifp, 0)); -} - static struct nd_defrouter * defrtrlist_update_common(struct nd_defrouter *new, boolean_t scoped) { struct nd_defrouter *dr, *n; struct ifnet *ifp = new->ifp; - struct nd_ifinfo *ndi; + struct nd_ifinfo *ndi = NULL; lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); @@ -1667,10 +1779,6 @@ defrtrlist_update_common(struct nd_defrouter *new, boolean_t scoped) new->stateflags = dr->stateflags; new->stateflags &= ~NDDRF_PROCESSED; - lck_rw_lock_shared(nd_if_rwlock); - VERIFY(ifp->if_index < nd_ifinfo_indexlim); - ndi = &nd_ifinfo[ifp->if_index]; - lck_rw_done(nd_if_rwlock); n = dr; goto insert; } @@ -1690,11 +1798,14 @@ defrtrlist_update_common(struct nd_defrouter *new, boolean_t scoped) } lck_rw_lock_shared(nd_if_rwlock); - ndi = &nd_ifinfo[ifp->if_index]; if (ifp->if_index >= nd_ifinfo_indexlim) goto freeit; + ndi = &nd_ifinfo[ifp->if_index]; + VERIFY(ndi->initialized); + lck_mtx_lock(&ndi->lock); if (ip6_maxifdefrouters >= 0 && ndi->ndefrouters >= ip6_maxifdefrouters) { + lck_mtx_unlock(&ndi->lock); freeit: lck_rw_done(nd_if_rwlock); nddr_free(n); @@ -1705,7 +1816,9 @@ freeit: NDDR_ADDREF(n); /* for the caller */ ++nd6_defrouter_genid; - atomic_add_32(&ndi->ndefrouters, 1); + ndi->ndefrouters++; + VERIFY(ndi->ndefrouters != 0); + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); nd6log2((LOG_INFO, "%s: allocating defrouter %s\n", if_name(ifp), @@ -1746,8 +1859,7 @@ insert: else TAILQ_INSERT_TAIL(&nd_defrouter, n, dr_entry); - /* Ignore auto-configuration checks for static route entries */ - defrouter_select_common(ifp, (n->stateflags & NDDRF_STATIC)); + defrouter_select(ifp); return (n); } @@ -1785,11 +1897,7 @@ defrtrlist_sync(struct ifnet *ifp) } if (dr == NULL) { - /* - * Set ignore flag; the chosen default interface might - * not be configured to accept RAs. - */ - defrouter_select_common(ifp, 1); + defrouter_select(ifp); } else { memcpy(&new.rtaddr, &dr->rtaddr, sizeof(new.rtaddr)); new.flags = dr->flags; @@ -1921,8 +2029,8 @@ repeat: ifnet_lock_done(ifp); lck_mtx_unlock(nd6_mutex); in6_purgeaddr(ifa); - lck_mtx_lock(nd6_mutex); IFA_REMREF(ifa); /* drop ours */ + lck_mtx_lock(nd6_mutex); pr = nd_prefix.lh_first; goto repeat; } @@ -1963,7 +2071,10 @@ nd6_prelist_add(struct nd_prefix *pr, struct nd_defrouter *dr, return (EINVAL); } ndi = &nd_ifinfo[ifp->if_index]; + VERIFY(ndi->initialized); + lck_mtx_lock(&ndi->lock); if (ndi->nprefixes >= ip6_maxifprefixes / 2) { + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); purge_detached(ifp); lck_rw_lock_shared(nd_if_rwlock); @@ -1973,11 +2084,14 @@ nd6_prelist_add(struct nd_prefix *pr, struct nd_defrouter *dr, * isn't necessary since the array never shrinks. */ ndi = &nd_ifinfo[ifp->if_index]; + lck_mtx_lock(&ndi->lock); } if (ndi->nprefixes >= ip6_maxifprefixes) { + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); return(ENOMEM); } + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); } @@ -2048,7 +2162,11 @@ nd6_prelist_add(struct nd_prefix *pr, struct nd_defrouter *dr, * isn't necessary since the array never shrinks. 
*/ ndi = &nd_ifinfo[ifp->if_index]; - atomic_add_32(&ndi->nprefixes, 1); + VERIFY(ndi->initialized); + lck_mtx_lock(&ndi->lock); + ndi->nprefixes++; + VERIFY(ndi->nprefixes != 0); + lck_mtx_unlock(&ndi->lock); lck_rw_done(nd_if_rwlock); lck_mtx_unlock(nd6_mutex); @@ -2112,11 +2230,11 @@ prelist_remove(struct nd_prefix *pr) lck_rw_lock_shared(nd_if_rwlock); if (ifp->if_index < nd_ifinfo_indexlim) { struct nd_ifinfo *ndi = &nd_ifinfo[ifp->if_index]; - atomic_add_32(&ndi->nprefixes, -1); - if (ndi->nprefixes < 0) { - log(LOG_WARNING, "prelist_remove: negative " - "count on %s\n", if_name(ifp)); - } + VERIFY(ndi->initialized); + lck_mtx_lock(&ndi->lock); + VERIFY(ndi->nprefixes > 0); + ndi->nprefixes--; + lck_mtx_unlock(&ndi->lock); } lck_rw_done(nd_if_rwlock); @@ -2505,7 +2623,7 @@ prelist_update( lck_mtx_unlock(nd6_mutex); } else { /* just set an error. do not bark here. */ - error = EADDRNOTAVAIL; /* XXX: might be unused. */ + error = EADDRNOTAVAIL; } } @@ -2642,6 +2760,7 @@ ndpr_alloc(int how) if (pr != NULL) { bzero(pr, ndpr_size); lck_mtx_init(&pr->ndpr_lock, ifa_mtx_grp, ifa_mtx_attr); + RB_INIT(&pr->ndpr_prproxy_sols); pr->ndpr_debug |= IFD_ALLOC; if (ndpr_debug != 0) { pr->ndpr_debug |= IFD_DEBUG; @@ -2661,6 +2780,17 @@ ndpr_free(struct nd_prefix *pr) } else if (!(pr->ndpr_debug & IFD_ALLOC)) { panic("%s: ndpr %p cannot be freed", __func__, pr); /* NOTREACHED */ + } else if (pr->ndpr_rt != NULL) { + panic("%s: ndpr %p route %p not freed", __func__, pr, + pr->ndpr_rt); + /* NOTREACHED */ + } else if (pr->ndpr_prproxy_sols_cnt != 0) { + panic("%s: ndpr %p non-zero solicitors count (%d)", + __func__, pr, pr->ndpr_prproxy_sols_cnt); + /* NOTREACHED */ + } else if (!RB_EMPTY(&pr->ndpr_prproxy_sols)) { + panic("%s: ndpr %p non-empty solicitors tree", __func__, pr); + /* NOTREACHED */ } pr->ndpr_debug &= ~IFD_ALLOC; NDPR_UNLOCK(pr); @@ -2808,6 +2938,9 @@ pfxlist_onlink_check(void) struct in6_ifaddr *ifa; struct nd_defrouter *dr; struct nd_pfxrouter *pfxrtr = NULL; + int err, i, found = 0; + struct ifaddr **ifap = NULL; + struct nd_prefix *ndpr; lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); @@ -3005,13 +3138,23 @@ pfxlist_onlink_check(void) * detached. Note, however, that a manually configured address should * always be attached. * The precise detection logic is same as the one for prefixes. + * + * ifnet_get_address_list_family_internal() may fail due to memory + * pressure, but we will eventually be called again when we receive + * another NA, RA, or when the link status changes. 
*/ - lck_rw_lock_shared(&in6_ifaddr_rwlock); - for (ifa = in6_ifaddrs; ifa; ifa = ifa->ia_next) { - struct nd_prefix *ndpr; - + err = ifnet_get_address_list_family_internal(NULL, &ifap, AF_INET6, 0, + M_NOWAIT); + if (err != 0 || ifap == NULL) { + nd6log((LOG_ERR, "%s: ifnet_get_address_list_family_internal " + "failed", __func__)); + return; + } + for (i = 0; ifap[i]; i++) { + ifa = ifatoia6(ifap[i]); IFA_LOCK(&ifa->ia_ifa); - if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) { + if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0 || + (ifap[i]->ifa_debug & IFD_ATTACHED) == 0) { IFA_UNLOCK(&ifa->ia_ifa); continue; } @@ -3031,17 +3174,18 @@ pfxlist_onlink_check(void) if (find_pfxlist_reachable_router(ndpr)) { NDPR_UNLOCK(ndpr); NDPR_REMREF(ndpr); + found = 1; break; } NDPR_UNLOCK(ndpr); NDPR_REMREF(ndpr); } - if (ifa) { - for (ifa = in6_ifaddrs; ifa; ifa = ifa->ia_next) { - struct nd_prefix *ndpr; - + if (found) { + for (i = 0; ifap[i]; i++) { + ifa = ifatoia6(ifap[i]); IFA_LOCK(&ifa->ia_ifa); - if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) { + if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0 || + (ifap[i]->ifa_debug & IFD_ATTACHED) == 0) { IFA_UNLOCK(&ifa->ia_ifa); continue; } @@ -3072,9 +3216,9 @@ pfxlist_onlink_check(void) } NDPR_REMREF(ndpr); } - } - else { - for (ifa = in6_ifaddrs; ifa; ifa = ifa->ia_next) { + } else { + for (i = 0; ifap[i]; i++) { + ifa = ifatoia6(ifap[i]); IFA_LOCK(&ifa->ia_ifa); if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) { IFA_UNLOCK(&ifa->ia_ifa); @@ -3091,7 +3235,7 @@ pfxlist_onlink_check(void) } } } - lck_rw_done(&in6_ifaddr_rwlock); + ifnet_free_address_list(ifap); } static struct nd_prefix * @@ -3235,7 +3379,7 @@ nd6_prefix_onlink_common(struct nd_prefix *pr, boolean_t force_scoped, struct sockaddr_in6 mask6, prefix; struct nd_prefix *opr; u_int32_t rtflags; - int error = 0; + int error = 0, prproxy = 0; struct rtentry *rt = NULL; lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); @@ -3322,6 +3466,9 @@ nd6_prefix_onlink_common(struct nd_prefix *pr, boolean_t force_scoped, mask6.sin6_len = sizeof(mask6); mask6.sin6_addr = pr->ndpr_mask; prefix = pr->ndpr_prefix; + if ((rt = pr->ndpr_rt) != NULL) + pr->ndpr_rt = NULL; + NDPR_ADDREF_LOCKED(pr); /* keep reference for this routine */ NDPR_UNLOCK(pr); IFA_LOCK_SPIN(ifa); @@ -3339,15 +3486,25 @@ nd6_prefix_onlink_common(struct nd_prefix *pr, boolean_t force_scoped, lck_mtx_unlock(nd6_mutex); + if (rt != NULL) { + rtfree(rt); + rt = NULL; + } + error = rtrequest_scoped(RTM_ADD, (struct sockaddr *)&prefix, ifa->ifa_addr, (struct sockaddr *)&mask6, rtflags, &rt, ifscope); + /* + * Serialize the setting of NDPRF_PRPROXY. 
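+ * proxy6_lock is held from here through the call to + * nd6_prproxy_prelist_update() below, so that the flag and the + * corresponding upstream/downstream route state change together.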
+ */ + lck_mtx_lock(&proxy6_lock); + + if (rt != NULL) { RT_LOCK(rt); nd6_rtmsg(RTM_ADD, rt); RT_UNLOCK(rt); - RT_REMREF(rt); + NDPR_LOCK(pr); } else { NDPR_LOCK(pr); nd6log((LOG_ERR, "nd6_prefix_onlink: failed to add route for a" @@ -3355,25 +3512,71 @@ " scoped=%d, errno = %d\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp), - ip6_sprintf(&((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr), + ip6_sprintf(&((struct sockaddr_in6 *) + (void *)ifa->ifa_addr)->sin6_addr), ip6_sprintf(&mask6.sin6_addr), rtflags, (ifscope != IFSCOPE_NONE), error)); - NDPR_UNLOCK(pr); } + NDPR_LOCK_ASSERT_HELD(pr); - lck_mtx_lock(nd6_mutex); + pr->ndpr_stateflags &= ~(NDPRF_IFSCOPE | NDPRF_PRPROXY); - NDPR_LOCK(pr); - pr->ndpr_stateflags &= ~NDPRF_IFSCOPE; + /* + * TODO: If the prefix route exists, we should really find it and + * refer the prefix to it; otherwise ndpr_rt is NULL. + */ if (rt != NULL || error == EEXIST) { + struct nd_ifinfo *ndi; + + VERIFY(pr->ndpr_prproxy_sols_cnt == 0); + VERIFY(RB_EMPTY(&pr->ndpr_prproxy_sols)); + + lck_rw_lock_shared(nd_if_rwlock); + ndi = ND_IFINFO(ifp); + VERIFY(ndi != NULL && ndi->initialized); + lck_mtx_lock(&ndi->lock); + + pr->ndpr_rt = rt; /* keep reference from rtrequest */ pr->ndpr_stateflags |= NDPRF_ONLINK; - if (ifscope != IFSCOPE_NONE) + if (ifscope != IFSCOPE_NONE) { pr->ndpr_stateflags |= NDPRF_IFSCOPE; + } else if ((rtflags & RTF_CLONING) && + (ndi->flags & ND6_IFF_PROXY_PREFIXES) && + !IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) { + /* + * At present, in order for the prefix to be eligible + * as a proxying/proxied prefix, we require that the + * prefix route entry be marked as a cloning route with + * RTF_PROXY; i.e. nd6_need_cache() needs to return + * true for the interface type, hence the test for + * RTF_CLONING above. + */ + pr->ndpr_stateflags |= NDPRF_PRPROXY; + } + + lck_mtx_unlock(&ndi->lock); + lck_rw_done(nd_if_rwlock); } + + prproxy = (pr->ndpr_stateflags & NDPRF_PRPROXY); + VERIFY(!prproxy || !(pr->ndpr_stateflags & NDPRF_IFSCOPE)); NDPR_UNLOCK(pr); IFA_REMREF(ifa); + /* + * If this is an upstream prefix, find the downstream ones (if any) + * and re-configure their prefix routes accordingly. Otherwise, + * this could potentially be a downstream prefix, and so find the + * upstream prefix, if any. + */ + nd6_prproxy_prelist_update(pr, prproxy ? pr : NULL); + + NDPR_REMREF(pr); /* release reference for this routine */ + lck_mtx_unlock(&proxy6_lock); + + lck_mtx_lock(nd6_mutex); + return (error); } @@ -3392,11 +3595,11 @@ nd6_prefix_onlink_scoped(struct nd_prefix *pr, unsigned int ifscope) int nd6_prefix_offlink(struct nd_prefix *pr) { - int plen, error = 0; + int plen, error = 0, prproxy; struct ifnet *ifp = pr->ndpr_ifp; struct nd_prefix *opr; struct sockaddr_in6 sa6, mask6, prefix; - struct rtentry *rt = NULL; + struct rtentry *rt = NULL, *ndpr_rt = NULL; unsigned int ifscope; lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); @@ -3424,6 +3627,9 @@ nd6_prefix_offlink(struct nd_prefix *pr) bcopy(&pr->ndpr_mask, &mask6.sin6_addr, sizeof(struct in6_addr)); prefix = pr->ndpr_prefix; plen = pr->ndpr_plen; + if ((ndpr_rt = pr->ndpr_rt) != NULL) + pr->ndpr_rt = NULL; + NDPR_ADDREF_LOCKED(pr); /* keep reference for this routine */ NDPR_UNLOCK(pr); ifscope = (pr->ndpr_stateflags & NDPRF_IFSCOPE) ?
@@ -3506,10 +3712,34 @@ nd6_prefix_offlink(struct nd_prefix *pr) (ifscope != IFSCOPE_NONE), error)); } + if (ndpr_rt != NULL) + rtfree(ndpr_rt); + + lck_mtx_lock(&proxy6_lock); + NDPR_LOCK(pr); - pr->ndpr_stateflags &= ~(NDPRF_ONLINK | NDPRF_IFSCOPE); + prproxy = (pr->ndpr_stateflags & NDPRF_PRPROXY); + VERIFY(!prproxy || !(pr->ndpr_stateflags & NDPRF_IFSCOPE)); + pr->ndpr_stateflags &= ~(NDPRF_ONLINK | NDPRF_IFSCOPE | NDPRF_PRPROXY); + if (pr->ndpr_prproxy_sols_cnt > 0) { + VERIFY(prproxy); + nd6_prproxy_sols_reap(pr); + VERIFY(pr->ndpr_prproxy_sols_cnt == 0); + VERIFY(RB_EMPTY(&pr->ndpr_prproxy_sols)); + } NDPR_UNLOCK(pr); + /* + * If this was an upstream prefix, find the downstream ones and do + * some cleanups. If this was a downstream prefix, the prefix route + * has been removed from the routing table above, but there may be + * other tasks to perform. + */ + nd6_prproxy_prelist_update(pr, prproxy ? pr : NULL); + + NDPR_REMREF(pr); /* release reference for this routine */ + lck_mtx_unlock(&proxy6_lock); + return (error); } @@ -3644,10 +3874,22 @@ in6_ifadd( } ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); + if (ia == NULL) { + /* + * XXX: both in6_ifadd and in6_iftmpadd follow this admittedly + * suboptimal pattern of calling in6_update_ifa to add the + * interface address, then calling in6ifa_ifpwithaddr to + * retrieve it from the interface address list after some + * concurrent kernel thread has first had the opportunity to + * call in6_purgeaddr and delete everything. + */ + nd6log((LOG_ERR, + "in6_ifadd: ifa update succeeded, but we got no ifaddr\n")); + return(NULL); + } - in6_post_msg(ifp, KEV_INET6_NEW_RTADV_ADDR, ia); - - return(ia); /* this must NOT be NULL. */ + in6_post_msg(ifp, KEV_INET6_NEW_RTADV_ADDR, ia); + return(ia); } #define IA6_NONCONST(i) ((struct in6_ifaddr *)(uintptr_t)(i)) @@ -3756,11 +3998,19 @@ again: return (error); newia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); - if (newia == NULL) { /* XXX: can it happen? */ + if (newia == NULL) { + /* + * XXX: both in6_ifadd and in6_iftmpadd follow this admittedly + * suboptimal pattern of calling in6_update_ifa to add the + * interface address, then calling in6ifa_ifpwithaddr to + * retrieve it from the interface address list after some + * concurrent kernel thread has first had the opportunity to + * call in6_purgeaddr and delete everything. + */ nd6log((LOG_ERR, "in6_tmpifadd: ifa update succeeded, but we got " "no ifaddr\n")); - return(EINVAL); /* XXX */ + return(EINVAL); } IFA_LOCK(&IA6_NONCONST(ia0)->ia_ifa); ndpr = ia0->ia6_ndpr; @@ -3894,7 +4144,6 @@ rt6_deleteroute( struct radix_node *rn, void *arg) { -#define SIN6(s) ((struct sockaddr_in6 *)s) struct rtentry *rt = (struct rtentry *)rn; struct in6_addr *gate = (struct in6_addr *)arg; @@ -3936,7 +4185,6 @@ rt6_deleteroute( RT_UNLOCK(rt); return (rtrequest_locked(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0)); -#undef SIN6 } int @@ -3945,7 +4193,7 @@ nd6_setdefaultiface( { int error = 0; ifnet_t def_ifp = NULL; - + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); ifnet_head_lock_shared(); diff --git a/bsd/netinet6/raw_ip6.c b/bsd/netinet6/raw_ip6.c index f5c48648e..a08cf7139 100644 --- a/bsd/netinet6/raw_ip6.c +++ b/bsd/netinet6/raw_ip6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -93,6 +93,7 @@ #include #include #include +#include #include #include #include @@ -127,9 +128,6 @@ extern int ipsec_bypass; #endif /*IPSEC*/ -#define satosin6(sa) ((struct sockaddr_in6 *)(sa)) -#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) - /* * Raw interface to IP6 protocol. */ @@ -161,15 +159,10 @@ rip6_input( struct sockaddr_in6 rip6src; int ret; - rip6stat.rip6s_ipackets++; + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); -#if defined(NFAITH) && 0 < NFAITH - if (faithprefix(&ip6->ip6_dst)) { - /* XXX send icmp6 host/port unreach? */ - m_freem(m); - return IPPROTO_DONE; - } -#endif + rip6stat.rip6s_ipackets++; init_sin6(&rip6src, m); /* general init */ @@ -186,7 +179,7 @@ rip6_input( if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; - if (in6p->in6p_cksum != -1) { + if (proto == IPPROTO_ICMPV6 || in6p->in6p_cksum != -1) { rip6stat.rip6s_isum++; if (in6_cksum(m, ip6->ip6_nxt, *offp, m->m_pkthdr.len - *offp)) { @@ -353,8 +346,9 @@ rip6_output( struct ip6_moptions *im6o = NULL; struct ifnet *oifp = NULL; int type = 0, code = 0; /* for ICMPv6 output statistics only */ - mbuf_traffic_class_t mtc = MBUF_TC_UNSPEC; - struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + mbuf_svc_class_t msc = MBUF_SC_UNSPEC; + struct ip6_out_args ip6oa = + { IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF }; int flags = IPV6_OUTARGS; if (dstsock && IN6_IS_ADDR_V4MAPPED(&dstsock->sin6_addr)) { @@ -364,13 +358,16 @@ rip6_output( in6p = sotoin6pcb(so); - ip6oa.ip6oa_boundif = (in6p->inp_flags & INP_BOUND_IF) ? - in6p->inp_boundif : IFSCOPE_NONE; - ip6oa.ip6oa_nocell = (in6p->inp_flags & INP_NO_IFT_CELLULAR) ? 
1 : 0; + if (in6p->inp_flags & INP_BOUND_IF) { + ip6oa.ip6oa_boundif = in6p->inp_boundifp->if_index; + ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; + } + if (in6p->inp_flags & INP_NO_IFT_CELLULAR) + ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR; dst = &dstsock->sin6_addr; if (control) { - mtc = mbuf_traffic_class_from_control(control); + msc = mbuf_service_class_from_control(control); if ((error = ip6_setpktopts(control, &opt, NULL, so->so_proto->pr_protocol)) != 0) goto bad; @@ -394,6 +391,9 @@ rip6_output( code = icmp6->icmp6_code; } + if (in6p->inp_flowhash == 0) + in6p->inp_flowhash = inp_calc_flowhash(in6p); + M_PREPEND(m, sizeof(*ip6), M_WAIT); if (m == NULL) { error = ENOBUFS; @@ -463,7 +463,7 @@ rip6_output( struct in6_addr *in6a; struct in6_addr storage; u_short index = 0; - + if (israw != 0 && optp && optp->ip6po_pktinfo && !IN6_IS_ADDR_UNSPECIFIED(&optp->ip6po_pktinfo->ipi6_addr)) { in6a = &optp->ip6po_pktinfo->ipi6_addr; flags |= IPV6_FLAG_NOSRCIFSEL; @@ -473,6 +473,8 @@ rip6_output( if (error == 0) error = EADDRNOTAVAIL; goto bad; + } else { + ip6oa.ip6oa_flags |= IP6OAF_BOUND_SRCADDR; } ip6->ip6_src = *in6a; if (in6p->in6p_route.ro_rt != NULL) { @@ -524,7 +526,7 @@ rip6_output( } if (!n) goto bad; - p = (u_int16_t *)(mtod(n, caddr_t) + off); + p = (u_int16_t *)(void *)(mtod(n, caddr_t) + off); *p = 0; *p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen); } @@ -547,8 +549,10 @@ rip6_output( oifp = NULL; } - set_packet_tclass(m, so, mtc, 1); - + set_packet_service_class(m, so, msc, PKT_SCF_IPV6); + m->m_pkthdr.m_flowhash = in6p->inp_flowhash; + m->m_pkthdr.m_fhflags |= PF_TAG_FLOWHASH; + if (im6o != NULL) IM6O_ADDREF(im6o); @@ -560,7 +564,7 @@ rip6_output( if (in6p->in6p_route.ro_rt != NULL) { struct rtentry *rt = in6p->in6p_route.ro_rt; - unsigned int outif; + struct ifnet *outif; if ((rt->rt_flags & RTF_MULTICAST) || in6p->in6p_socket == NULL || @@ -581,8 +585,8 @@ rip6_output( * the route interface index used by IP. */ if (rt != NULL && - (outif = rt->rt_ifp->if_index) != in6p->in6p_last_outif) - in6p->in6p_last_outif = outif; + (outif = rt->rt_ifp) != in6p->in6p_last_outifp) + in6p->in6p_last_outifp = outif; } if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { @@ -629,15 +633,17 @@ rip6_ctloutput( struct socket *so, struct sockopt *sopt) { - int error; + int error, optval; + /* Allow at this level */ if (sopt->sopt_level == IPPROTO_ICMPV6) /* * XXX: is it better to call icmp6_ctloutput() directly * from protosw? 
*/ return(icmp6_ctloutput(so, sopt)); - else if (sopt->sopt_level != IPPROTO_IPV6) + else if (sopt->sopt_level != IPPROTO_IPV6 && + !(sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_FLUSH)) return (EINVAL); error = 0; @@ -711,6 +717,15 @@ rip6_ctloutput( case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; + + case SO_FLUSH: + if ((error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval))) != 0) + break; + + error = inp_flush(sotoinpcb(so), optval); + break; + default: error = ip6_ctloutput(so, sopt); break; @@ -795,9 +810,9 @@ static int rip6_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) { struct inpcb *inp = sotoinpcb(so); - struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; + struct sockaddr_in6 *addr = (struct sockaddr_in6 *)(void *)nam; struct ifaddr *ifa = NULL; - unsigned int outif = 0; + struct ifnet *outif = NULL; if (nam->sa_len != sizeof(*addr)) return EINVAL; @@ -821,12 +836,12 @@ rip6_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) IFA_REMREF(ifa); return(EADDRNOTAVAIL); } - outif = ifa->ifa_ifp->if_index; + outif = ifa->ifa_ifp; IFA_UNLOCK(ifa); IFA_REMREF(ifa); } inp->in6p_laddr = addr->sin6_addr; - inp->in6p_last_outif = outif; + inp->in6p_last_outifp = outif; return 0; } @@ -834,14 +849,15 @@ static int rip6_connect(struct socket *so, struct sockaddr *nam, __unused struct proc *p) { struct inpcb *inp = sotoinpcb(so); - struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; + struct sockaddr_in6 *addr = (struct sockaddr_in6 *)(void *)nam; struct in6_addr *in6a = NULL; struct in6_addr storage; int error = 0; #if ENABLE_DEFAULT_SCOPE struct sockaddr_in6 tmp; #endif - unsigned int outif = 0, ifscope; + unsigned int ifscope; + struct ifnet *outif = NULL; if (nam->sa_len != sizeof(*addr)) return EINVAL; @@ -859,7 +875,7 @@ rip6_connect(struct socket *so, struct sockaddr *nam, __unused struct proc *p) #endif ifscope = (inp->inp_flags & INP_BOUND_IF) ? - inp->inp_boundif : IFSCOPE_NONE; + inp->inp_boundifp->if_index : IFSCOPE_NONE; /* Source address selection. XXX: need pcblookup? */ in6a = in6_selectsrc(addr, inp->in6p_outputopts, inp, &inp->in6p_route, @@ -869,8 +885,8 @@ rip6_connect(struct socket *so, struct sockaddr *nam, __unused struct proc *p) inp->in6p_laddr = *in6a; inp->in6p_faddr = addr->sin6_addr; if (inp->in6p_route.ro_rt != NULL) - outif = inp->in6p_route.ro_rt->rt_ifp->if_index; - inp->in6p_last_outif = outif; + outif = inp->in6p_route.ro_rt->rt_ifp; + inp->in6p_last_outifp = outif; soisconnected(so); return 0; } @@ -889,7 +905,7 @@ rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, #pragma unused(flags, p) struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 tmp; - struct sockaddr_in6 *dst = (struct sockaddr_in6 *)nam; + struct sockaddr_in6 *dst = (struct sockaddr_in6 *)(void *)nam; /* always copy sockaddr to avoid overwrites */ if (so->so_state & SS_ISCONNECTED) { @@ -909,7 +925,7 @@ rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, m_freem(m); return ENOTCONN; } - tmp = *(struct sockaddr_in6 *)nam; + tmp = *(struct sockaddr_in6 *)(void *)nam; dst = &tmp; } #if ENABLE_DEFAULT_SCOPE diff --git a/bsd/netinet6/route6.c b/bsd/netinet6/route6.c index a0dc6c6a6..9325aadec 100644 --- a/bsd/netinet6/route6.c +++ b/bsd/netinet6/route6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -59,9 +59,11 @@
 */
 #include
+#include
 #include
 #include
 #include
+#include
 #include
 #include
@@ -101,9 +103,16 @@ route6_input(struct mbuf **mp, int *offp, int proto)
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, sizeof(*rh), return IPPROTO_DONE);
+
+	/* Expect 32-bit aligned data pointer on strict-align platforms */
+	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
+
 	ip6 = mtod(m, struct ip6_hdr *);
 	rh = (struct ip6_rthdr *)((caddr_t)ip6 + off);
 #else
+	/* Expect 32-bit aligned data pointer on strict-align platforms */
+	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
+
 	ip6 = mtod(m, struct ip6_hdr *);
 	IP6_EXTHDR_GET(rh, struct ip6_rthdr *, m, off, sizeof(*rh));
 	if (rh == NULL) {
diff --git a/bsd/netinet6/scope6.c b/bsd/netinet6/scope6.c
index 2d4eedf76..27ad76492 100644
--- a/bsd/netinet6/scope6.c
+++ b/bsd/netinet6/scope6.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2009-2011 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -117,7 +117,7 @@ scope6_ifattach(
 		bcopy((caddr_t)scope6_ids, q, n/2);
 		FREE((caddr_t)scope6_ids, M_IFADDR);
 	}
-	scope6_ids = (struct scope6_id *)q;
+	scope6_ids = (struct scope6_id *)(void *)q;
 }
 #define SID scope6_ids[ifp->if_index]
@@ -365,7 +365,7 @@ rtkey_to_sa6(struct rtentry *rt, struct sockaddr_in6 *sin6)
 {
 	VERIFY(rt_key(rt)->sa_family == AF_INET6);
-	*sin6 = *((struct sockaddr_in6 *)rt_key(rt));
+	*sin6 = *((struct sockaddr_in6 *)(void *)rt_key(rt));
 	sin6->sin6_scope_id = 0;
 }
@@ -374,7 +374,7 @@ rtgw_to_sa6(struct rtentry *rt, struct sockaddr_in6 *sin6)
 {
 	VERIFY(rt->rt_flags & RTF_GATEWAY);
-	*sin6 = *((struct sockaddr_in6 *)rt->rt_gateway);
+	*sin6 = *((struct sockaddr_in6 *)(void *)rt->rt_gateway);
 	sin6->sin6_scope_id = 0;
 }
@@ -382,7 +382,7 @@ rtgw_to_sa6(struct rtentry *rt, struct sockaddr_in6 *sin6)
 * generate standard sockaddr_in6 from embedded form.
 */
 int
-sa6_recoverscope(struct sockaddr_in6 *sin6)
+sa6_recoverscope(struct sockaddr_in6 *sin6, boolean_t attachcheck)
 {
 	u_int32_t zoneid;
@@ -402,12 +402,25 @@ sa6_recoverscope(struct sockaddr_in6 *sin6)
 		/* sanity check */
 		if (if_index < zoneid)
 			return (ENXIO);
-		ifnet_head_lock_shared();
-		if (ifindex2ifnet[zoneid] == NULL) {
+		/*
+		 * The attachcheck parameter lets a caller skip the
+		 * interface attachment check below.  Some callers hold
+		 * the ifnet_head lock in exclusive mode, in which case:
+		 * 1) the interface cannot go away, so the check is
+		 *    unnecessary, and
+		 * 2) the check cannot be performed anyway, because
+		 *    taking the lock in shared mode here would
+		 *    deadlock.
+ */ + if (attachcheck) { + ifnet_head_lock_shared(); + if (ifindex2ifnet[zoneid] == NULL) { + ifnet_head_done(); + return (ENXIO); + } ifnet_head_done(); - return (ENXIO); } - ifnet_head_done(); sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = zoneid; } @@ -503,8 +516,14 @@ in6_setscope(struct in6_addr *in6, struct ifnet *ifp, u_int32_t *ret_id) scope = in6_addrscope(in6); -#define SID scope6_ids[index] lck_mtx_lock(scope6_mutex); + if (index >= if_scope_indexlim) { + lck_mtx_unlock(scope6_mutex); + if (ret_id != NULL) + *ret_id = 0; + return (EINVAL); + } +#define SID scope6_ids[index] switch (scope) { case IPV6_ADDR_SCOPE_INTFACELOCAL: /* should be interface index */ zoneid = SID.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL]; diff --git a/bsd/netinet6/scope6_var.h b/bsd/netinet6/scope6_var.h index d028aefb8..300e8228c 100644 --- a/bsd/netinet6/scope6_var.h +++ b/bsd/netinet6/scope6_var.h @@ -82,7 +82,7 @@ int scope6_get_default(u_int32_t *); u_int32_t scope6_in6_addrscope(struct in6_addr *); u_int32_t scope6_addr2default(struct in6_addr *); int sa6_embedscope (struct sockaddr_in6 *, int); -int sa6_recoverscope (struct sockaddr_in6 *); +int sa6_recoverscope (struct sockaddr_in6 *, boolean_t); int in6_setscope (struct in6_addr *, struct ifnet *, u_int32_t *); int in6_clearscope (struct in6_addr *); extern void rtkey_to_sa6(struct rtentry *, struct sockaddr_in6 *); diff --git a/bsd/netinet6/udp6_output.c b/bsd/netinet6/udp6_output.c index 0fb9a6993..fa609ea19 100644 --- a/bsd/netinet6/udp6_output.c +++ b/bsd/netinet6/udp6_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -137,8 +137,6 @@ extern int ipsec_bypass; #endif /*IPSEC*/ -#include "faith.h" - #include /* @@ -186,16 +184,29 @@ udp6_output(in6p, m, addr6, control, p) int flags; struct sockaddr_in6 tmp; struct in6_addr storage; - mbuf_traffic_class_t mtc = MBUF_TC_UNSPEC; - struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + mbuf_svc_class_t msc = MBUF_SC_UNSPEC; + struct ip6_out_args ip6oa = + { IFSCOPE_NONE, { 0 }, IP6OAF_SELECT_SRCIF }; + struct flowadv *adv = &ip6oa.ip6oa_flowadv; + int flowadv = 0; + + /* Enable flow advisory only when connected */ + flowadv = (in6p->inp_socket->so_state & SS_ISCONNECTED) ? 1 : 0; - if (in6p->inp_flags & INP_BOUND_IF) - ip6oa.ip6oa_boundif = in6p->inp_boundif; + if (flowadv && INP_WAIT_FOR_IF_FEEDBACK(in6p)) { + error = ENOBUFS; + goto release; + } - ip6oa.ip6oa_nocell = (in6p->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + if (in6p->inp_flags & INP_BOUND_IF) { + ip6oa.ip6oa_boundif = in6p->inp_boundifp->if_index; + ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; + } + if (in6p->inp_flags & INP_NO_IFT_CELLULAR) + ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR; if (control) { - mtc = mbuf_traffic_class_from_control(control); + msc = mbuf_service_class_from_control(control); if ((error = ip6_setpktopts(control, &opt, NULL, IPPROTO_UDP)) != 0) goto release; @@ -212,7 +223,9 @@ udp6_output(in6p, m, addr6, control, p) * and in6_pcbsetport in order to fill in the local address * and the local port. 
*/ - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr6; + struct sockaddr_in6 *sin6 = + (struct sockaddr_in6 *)(void *)addr6; + if (sin6->sin6_port == 0) { error = EADDRNOTAVAIL; goto release; @@ -296,6 +309,9 @@ udp6_output(in6p, m, addr6, control, p) fport = in6p->in6p_fport; } + if (in6p->inp_flowhash == 0) + in6p->inp_flowhash = inp_calc_flowhash(in6p); + if (af == AF_INET) hlen = sizeof(struct ip); @@ -312,7 +328,7 @@ udp6_output(in6p, m, addr6, control, p) /* * Stuff checksum and output datagram. */ - udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen); + udp6 = (struct udphdr *)(void *)(mtod(m, caddr_t) + hlen); udp6->uh_sport = in6p->in6p_lport; /* lport is always set in the PCB */ udp6->uh_dport = fport; if (plen <= 0xffff) @@ -342,6 +358,9 @@ udp6_output(in6p, m, addr6, control, p) m->m_pkthdr.csum_flags = CSUM_UDPIPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + if (!IN6_IS_ADDR_UNSPECIFIED(laddr)) + ip6oa.ip6oa_flags |= IP6OAF_BOUND_SRCADDR; + flags = IPV6_OUTARGS; udp6stat.udp6s_opackets++; @@ -353,26 +372,53 @@ udp6_output(in6p, m, addr6, control, p) #endif /*IPSEC*/ m->m_pkthdr.socket_id = get_socket_id(in6p->in6p_socket); - set_packet_tclass(m, in6p->in6p_socket, mtc, 1); + set_packet_service_class(m, in6p->in6p_socket, msc, PKT_SCF_IPV6); + + m->m_pkthdr.m_flowhash = in6p->inp_flowhash; + m->m_pkthdr.m_fhflags |= PF_TAG_FLOWHASH; + if (flowadv) + m->m_pkthdr.m_fhflags |= PF_TAG_FLOWADV; im6o = in6p->in6p_moptions; - if (im6o != NULL) - IM6O_ADDREF(im6o); + if (im6o != NULL) { + IM6O_LOCK(im6o); + IM6O_ADDREF_LOCKED(im6o); + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && + im6o->im6o_multicast_ifp != NULL) { + in6p->in6p_last_outifp = im6o->im6o_multicast_ifp; + } + IM6O_UNLOCK(im6o); + } + + in6p->inp_sndinprog_cnt++; error = ip6_output(m, optp, &in6p->in6p_route, flags, im6o, NULL, &ip6oa); if (im6o != NULL) IM6O_REMREF(im6o); - + if (error == 0 && nstat_collect) { locked_add_64(&in6p->inp_stat->txpackets, 1); locked_add_64(&in6p->inp_stat->txbytes, ulen); } + if (flowadv && (adv->code == FADV_FLOW_CONTROLLED || + adv->code == FADV_SUSPENDED)) { + /* return an error to indicate + * that the packet has been dropped + */ + error = ENOBUFS; + inp_set_fc_state(in6p, adv->code); + } + + VERIFY(in6p->inp_sndinprog_cnt > 0); + if ( --in6p->inp_sndinprog_cnt == 0) + in6p->inp_flags &= ~(INP_FC_FEEDBACK); + if (in6p->in6p_route.ro_rt != NULL) { struct rtentry *rt = in6p->in6p_route.ro_rt; - unsigned int outif; + struct ifnet *outif; if ((rt->rt_flags & RTF_MULTICAST) || in6p->in6p_socket == NULL || @@ -390,11 +436,11 @@ udp6_output(in6p, m, addr6, control, p) /* * If this is a connected socket and the destination * route is not multicast, update outif with that of - * the route interface index used by IP. + * the route interface used by IP. */ - if (rt != NULL && (outif = rt->rt_ifp->if_index) != - in6p->in6p_last_outif) - in6p->in6p_last_outif = outif; + if (rt != NULL && + (outif = rt->rt_ifp) != in6p->in6p_last_outifp) + in6p->in6p_last_outifp = outif; } break; case AF_INET: diff --git a/bsd/netinet6/udp6_usrreq.c b/bsd/netinet6/udp6_usrreq.c index c88c0d169..5e9ac2da0 100644 --- a/bsd/netinet6/udp6_usrreq.c +++ b/bsd/netinet6/udp6_usrreq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
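/*
 * udp6_output() above adds flow-advisory handling for connected
 * sockets: a send fails fast with ENOBUFS while the interface has the
 * flow throttled, the PCB counts in-flight sends (inp_sndinprog_cnt),
 * and the advisory code returned through ip6_out_args
 * (FADV_FLOW_CONTROLLED / FADV_SUSPENDED) marks the PCB
 * flow-controlled afterwards.  A compressed userland sketch of that
 * handshake follows; flowsock and if_enqueue() are hypothetical
 * stand-ins for the inpcb and the driver queue.
 */
#include <errno.h>

enum fadv { FADV_SUCCESS, FADV_FLOW_CONTROLLED, FADV_SUSPENDED };

struct flowsock {
	enum fadv fc_state;	/* last advisory from the interface */
	int sndinprog;		/* sends currently inside the output path */
};

/* Stub enqueue: a real driver queue returns FADV_FLOW_CONTROLLED or
 * FADV_SUSPENDED when it wants this flow to back off. */
static enum fadv
if_enqueue(void)
{
	return (FADV_SUCCESS);
}

static int
flowadv_send(struct flowsock *so)
{
	enum fadv adv;
	int error = 0;

	if (so->fc_state != FADV_SUCCESS)
		return (ENOBUFS);	/* still throttled: drop early */

	so->sndinprog++;
	adv = if_enqueue();
	if (adv != FADV_SUCCESS) {
		so->fc_state = adv;	/* stay throttled until resumed */
		error = ENOBUFS;	/* surface the drop to the caller */
	}
	so->sndinprog--;
	return (error);
}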
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -217,31 +217,35 @@ udp6_input( IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), return IPPROTO_DONE); + /* Expect 32-bit aligned data pointer on strict-align platforms */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + ifp = m->m_pkthdr.rcvif; ip6 = mtod(m, struct ip6_hdr *); -#if defined(NFAITH) && 0 < NFAITH - if (faithprefix(&ip6->ip6_dst)) { - /* XXX send icmp6 host/port unreach? */ - m_freem(m); - return IPPROTO_DONE; - } -#endif - udpstat.udps_ipackets++; plen = ntohs(ip6->ip6_plen) - off + sizeof(*ip6); - uh = (struct udphdr *)((caddr_t)ip6 + off); + uh = (struct udphdr *)(void *)((caddr_t)ip6 + off); ulen = ntohs((u_short)uh->uh_ulen); if (plen != ulen) { udpstat.udps_badlen++; + + if (ifp->if_udp_stat != NULL) + atomic_add_64(&ifp->if_udp_stat->badlength, 1); + goto bad; } /* destination port of 0 is illegal, based on RFC768. */ - if (uh->uh_dport == 0) + if (uh->uh_dport == 0) { + + if (ifp->if_udp_stat != NULL) + atomic_add_64(&ifp->if_udp_stat->port0, 1); + goto bad; + } /* * Checksum extended UDP header and data. @@ -254,6 +258,10 @@ udp6_input( else { if (in6_cksum(m, IPPROTO_UDP, off, ulen) != 0) { udpstat.udps_badsum++; + + if (ifp->if_udp_stat != NULL) + atomic_add_64(&ifp->if_udp_stat->badchksum, 1); + goto bad; } } @@ -322,10 +330,15 @@ udp6_input( if ((in6p->inp_vflag & INP_IPV6) == 0) continue; + if (ip6_restrictrecvif && ifp != NULL && + (ifp->if_eflags & IFEF_RESTRICTED_RECV) && + !(in6p->in6p_flags & IN6P_RECV_ANYIF)) + continue; + if (in_pcb_checkstate(in6p, WNT_ACQUIRE, 0) == WNT_STOPUSING) continue; - udp_lock(in6p->in6p_socket, 1, 0); + udp_lock(in6p->in6p_socket, 1, 0); if (in_pcb_checkstate(in6p, WNT_RELEASE, 1) == WNT_STOPUSING) { udp_unlock(in6p->in6p_socket, 1, 0); @@ -345,7 +358,7 @@ udp6_input( struct sockaddr_in6 mcaddr; int blocked; - IM6O_LOCK(imo); + IM6O_LOCK(imo); bzero(&mcaddr, sizeof(struct sockaddr_in6)); mcaddr.sin6_len = sizeof(struct sockaddr_in6); mcaddr.sin6_family = AF_INET6; @@ -354,7 +367,7 @@ udp6_input( blocked = im6o_mc_filter(imo, ifp, (struct sockaddr *)&mcaddr, (struct sockaddr *)&fromsa); - IM6O_UNLOCK(imo); + IM6O_UNLOCK(imo); if (blocked != MCAST_PASS) { udp_unlock(in6p->in6p_socket, 1, 0); continue; @@ -411,11 +424,18 @@ udp6_input( */ if (reuse_sock == 0 || ((m = n) == NULL)) break; + + /* + * Expect 32-bit aligned data pointer on strict-align + * platforms. 
+ */ + MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + /* * Recompute IP and UDP header pointers for new mbuf */ ip6 = mtod(m, struct ip6_hdr *); - uh = (struct udphdr *)((caddr_t)ip6 + off); + uh = (struct udphdr *)(void *)((caddr_t)ip6 + off); } lck_rw_done(pcbinfo->mtx); @@ -429,6 +449,9 @@ udp6_input( #ifndef __APPLE__ udpstat.udps_noportmcast++; #endif + if (ifp->if_udp_stat != NULL) + atomic_add_64(&ifp->if_udp_stat->port_unreach, 1); + goto bad; } @@ -442,7 +465,11 @@ udp6_input( in6p = in6_pcblookup_hash(&udbinfo, &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif); - if (in6p == 0) { + if (in6p == NULL) { + + if (ifp->if_udp_stat != NULL) + atomic_add_64(&ifp->if_udp_stat->port_unreach, 1); + if (log_in_vain) { char buf[INET6_ADDRSTRLEN]; @@ -465,6 +492,9 @@ udp6_input( #ifndef __APPLE__ udpstat.udps_noportmcast++; #endif + if (ifp->if_udp_stat != NULL) + atomic_add_64(&ifp->if_udp_stat->badmcast, 1); + goto bad; } icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0); @@ -478,6 +508,10 @@ udp6_input( if (ipsec6_in_reject_so(m, in6p->in6p_socket)) { IPSEC_STAT_INCREMENT(ipsec6stat.in_polvio); in_pcb_checkstate(in6p, WNT_RELEASE, 0); + + if (ifp->if_udp_stat != NULL) + atomic_add_64(&ifp->if_udp_stat->badipsec, 1); + goto bad; } } @@ -491,12 +525,16 @@ udp6_input( if (in_pcb_checkstate(in6p, WNT_RELEASE, 1) == WNT_STOPUSING) { udp_unlock(in6p->in6p_socket, 1, 0); + + if (ifp->if_udp_stat != NULL) + atomic_add_64(&ifp->if_udp_stat->cleanup, 1); + goto bad; } - + init_sin6(&udp_in6, m); /* general init */ udp_in6.sin6_port = uh->uh_sport; - if ((in6p->in6p_flags & IN6P_CONTROLOPTS) != 0 || + if ((in6p->in6p_flags & IN6P_CONTROLOPTS) != 0 || (in6p->in6p_socket->so_options & SO_TIMESTAMP) != 0 || (in6p->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { ret = ip6_savecontrol(in6p, m, &opts); @@ -688,7 +726,8 @@ udp6_attach(struct socket *so, __unused int proto, struct proc *p) * which may match an IPv4-mapped IPv6 address. 
*/ inp->inp_ip_ttl = ip_defttl; - nstat_udp_new_pcb(inp); + if (nstat_collect) + nstat_udp_new_pcb(inp); return 0; } @@ -707,7 +746,7 @@ udp6_bind(struct socket *so, struct sockaddr *nam, struct proc *p) if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { struct sockaddr_in6 *sin6_p; - sin6_p = (struct sockaddr_in6 *)nam; + sin6_p = (struct sockaddr_in6 *)(void *)nam; if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr)) inp->inp_vflag |= INP_IPV4; @@ -739,7 +778,7 @@ udp6_connect(struct socket *so, struct sockaddr *nam, struct proc *p) if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { struct sockaddr_in6 *sin6_p; - sin6_p = (struct sockaddr_in6 *)nam; + sin6_p = (struct sockaddr_in6 *)(void *)nam; if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) { struct sockaddr_in sin; @@ -765,6 +804,8 @@ udp6_connect(struct socket *so, struct sockaddr *nam, struct proc *p) inp->inp_vflag |= INP_IPV6; } soisconnected(so); + if (inp->inp_flowhash == 0) + inp->inp_flowhash = inp_calc_flowhash(inp); } return error; } @@ -801,8 +842,12 @@ udp6_disconnect(struct socket *so) return ENOTCONN; in6_pcbdisconnect(inp); + + /* reset flow-controlled state, just in case */ + inp_reset_fc_state(inp); + inp->in6p_laddr = in6addr_any; - inp->in6p_last_outif = 0; + inp->in6p_last_outifp = NULL; so->so_state &= ~SS_ISCONNECTED; /* XXX */ return 0; } @@ -838,7 +883,7 @@ udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, if (addr == 0) hasv4addr = (inp->inp_vflag & INP_IPV4); else { - sin6 = (struct sockaddr_in6 *)addr; + sin6 = (struct sockaddr_in6 *)(void *)addr; hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? 1 : 0; } diff --git a/bsd/netkey/key.c b/bsd/netkey/key.c index 457f772ec..d1d59cd6f 100644 --- a/bsd/netkey/key.c +++ b/bsd/netkey/key.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2010 Apple Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. 
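/*
 * A pattern that recurs throughout this patch: casts such as
 * (struct sockaddr_in6 *)(void *)nam route the pointer conversion
 * through void *, which silences compiler diagnostics about casts
 * that increase alignment requirements (-Wcast-align).  Presumably
 * each such call site already guarantees suitable alignment, and the
 * new MBUF_STRICT_DATA_ALIGNMENT_CHECK_32() asserts exactly that on
 * strict-alignment platforms.  A tiny illustration with made-up
 * sockaddr-like types:
 */
#include <stdint.h>

struct sa  { uint8_t len, family; };			/* alignment 1 */
struct sa6 { uint8_t len, family; uint32_t addr; };	/* alignment 4 */

uint32_t
sa6_addr(struct sa *nam)
{
	/* A direct (struct sa6 *)nam may draw a cast-align warning;
	 * hopping through void * asserts "this pointer really does
	 * refer to an aligned struct sa6". */
	struct sa6 *sin6 = (struct sa6 *)(void *)nam;

	return (sin6->addr);
}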
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -137,21 +137,19 @@ #include -#ifndef satosin -#define satosin(s) ((struct sockaddr_in *)s) -#endif - #define FULLMASK 0xff lck_grp_t *sadb_mutex_grp; lck_grp_attr_t *sadb_mutex_grp_attr; lck_attr_t *sadb_mutex_attr; -lck_mtx_t *sadb_mutex; +decl_lck_mtx_data(, sadb_mutex_data); +lck_mtx_t *sadb_mutex = &sadb_mutex_data; lck_grp_t *pfkey_stat_mutex_grp; lck_grp_attr_t *pfkey_stat_mutex_grp_attr; lck_attr_t *pfkey_stat_mutex_attr; -lck_mtx_t *pfkey_stat_mutex; +decl_lck_mtx_data(, pfkey_stat_mutex_data); +lck_mtx_t *pfkey_stat_mutex = &pfkey_stat_mutex_data; /* * Note on SA reference counting: @@ -175,7 +173,7 @@ static int key_blockacq_count = 10; /* counter for blocking SADB_ACQUIRE.*/ static int key_blockacq_lifetime = 20; /* lifetime for blocking SADB_ACQUIRE.*/ static int key_preferred_oldsa = 0; /* preferred old sa rather than new sa.*/ __private_extern__ int natt_keepalive_interval = 20; /* interval between natt keepalives.*/ -static int ipsec_policy_count = 0; +__private_extern__ int ipsec_policy_count = 0; static int ipsec_sav_count = 0; static u_int32_t acq_seq = 0; @@ -456,10 +454,8 @@ static struct mbuf *key_setdumpsp(struct secpolicy *, static u_int key_getspreqmsglen(struct secpolicy *); static int key_spdexpire(struct secpolicy *); static struct secashead *key_newsah(struct secasindex *, u_int8_t); -static void key_delsah(struct secashead *); static struct secasvar *key_newsav(struct mbuf *, const struct sadb_msghdr *, struct secashead *, int *); -static void key_delsav(struct secasvar *); static struct secashead *key_getsah(struct secasindex *); static struct secasvar *key_checkspidup(struct secasindex *, u_int32_t); static void key_setspi __P((struct secasvar *, u_int32_t)); @@ -550,9 +546,24 @@ static int key_promisc(struct socket *, struct mbuf *, static int key_senderror(struct socket *, struct mbuf *, int); static int key_validate_ext(const struct sadb_ext *, int); static int key_align(struct mbuf *, struct sadb_msghdr *); -static void key_sa_chgstate(struct secasvar *, u_int8_t); static struct mbuf *key_alloc_mbuf(int); static int key_getsastat (struct socket *, struct mbuf *, const struct sadb_msghdr *); +static int key_setsaval2(struct secasvar *sav, + u_int8_t satype, + u_int8_t alg_auth, + u_int8_t alg_enc, + u_int32_t flags, + u_int8_t replay, + struct sadb_key *key_auth, + u_int16_t key_auth_len, + struct sadb_key *key_enc, + u_int16_t key_enc_len, + u_int16_t natt_port, + u_int32_t seq, + u_int32_t spi, + u_int32_t pid, + struct sadb_lifetime *lifetime_hard, + struct sadb_lifetime *lifetime_soft); extern int ipsec_bypass; extern int esp_udp_encap_port; @@ -577,19 +588,13 @@ key_init(void) sadb_mutex_grp = lck_grp_alloc_init("sadb", sadb_mutex_grp_attr); sadb_mutex_attr = lck_attr_alloc_init(); - if ((sadb_mutex = lck_mtx_alloc_init(sadb_mutex_grp, sadb_mutex_attr)) == NULL) { - printf("key_init: can't alloc sadb_mutex\n"); - return; - } + lck_mtx_init(sadb_mutex, sadb_mutex_grp, sadb_mutex_attr); pfkey_stat_mutex_grp_attr = lck_grp_attr_alloc_init(); pfkey_stat_mutex_grp = lck_grp_alloc_init("pfkey_stat", pfkey_stat_mutex_grp_attr); pfkey_stat_mutex_attr = lck_attr_alloc_init(); - if ((pfkey_stat_mutex = lck_mtx_alloc_init(pfkey_stat_mutex_grp, pfkey_stat_mutex_attr)) == NULL) { - printf("key_init: can't alloc pfkey_stat_mutex\n"); - return; - } + lck_mtx_init(pfkey_stat_mutex, pfkey_stat_mutex_grp, pfkey_stat_mutex_attr); for (i = 0; i < SPIHASHSIZE; i++) LIST_INIT(&spihash[i]); @@ -1860,8 +1865,8 @@ key_msg2sp( 
return NULL; } - xisr = (struct sadb_x_ipsecrequest *)((caddr_t)xisr - + xisr->sadb_x_ipsecrequest_len); + xisr = (struct sadb_x_ipsecrequest *)(void *) + ((caddr_t)xisr + xisr->sadb_x_ipsecrequest_len); } } break; @@ -1881,13 +1886,40 @@ key_newreqid(void) { lck_mtx_lock(sadb_mutex); static u_int32_t auto_reqid = IPSEC_MANUAL_REQID_MAX + 1; + int done = 0; - auto_reqid = (auto_reqid == ~0 - ? IPSEC_MANUAL_REQID_MAX + 1 : auto_reqid + 1); - lck_mtx_unlock(sadb_mutex); + /* The reqid must be limited to 16 bits because the PF_KEY message format only uses + 16 bits for this field. Once it becomes larger than 16 bits - ipsec fails to + work anymore. Changing the PF_KEY message format would introduce compatibility + issues. This code now tests to see if the tentative reqid is in use */ + + while (!done) { + struct secpolicy *sp; + struct ipsecrequest *isr; + int dir; + + auto_reqid = (auto_reqid == 0xFFFF + ? IPSEC_MANUAL_REQID_MAX + 1 : auto_reqid + 1); - /* XXX should be unique check */ + /* check for uniqueness */ + done = 1; + for (dir = 0; dir < IPSEC_DIR_MAX; dir++) { + LIST_FOREACH(sp, &sptree[dir], chain) { + for (isr = sp->req; isr != NULL; isr = isr->next) { + if (isr->saidx.reqid == auto_reqid) { + done = 0; + break; + } + } + if (done == 0) + break; + } + if (done == 0) + break; + } + } + lck_mtx_unlock(sadb_mutex); return auto_reqid; } @@ -1935,7 +1967,7 @@ key_sp2msg( for (isr = sp->req; isr != NULL; isr = isr->next) { - xisr = (struct sadb_x_ipsecrequest *)p; + xisr = (struct sadb_x_ipsecrequest *)(void *)p; xisr->sadb_x_ipsecrequest_proto = isr->saidx.proto; xisr->sadb_x_ipsecrequest_mode = isr->saidx.mode; @@ -1988,7 +2020,7 @@ key_gather_mbuf(struct mbuf *m, const struct sadb_msghdr *mhp, if (len > MHLEN) panic("assumption failed"); #endif - MGETHDR(n, M_DONTWAIT, MT_DATA); + MGETHDR(n, M_WAITOK, MT_DATA); if (!n) goto fail; n->m_len = len; @@ -2007,7 +2039,7 @@ key_gather_mbuf(struct mbuf *m, const struct sadb_msghdr *mhp, mtod(n, caddr_t)); } else { n = m_copym(m, mhp->extoff[idx], mhp->extlen[idx], - M_DONTWAIT); + M_WAITOK); } if (n == NULL) goto fail; @@ -2085,12 +2117,13 @@ key_spdadd( ipseclog((LOG_DEBUG, "key_spdadd: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } - lft = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_HARD]; + lft = (struct sadb_lifetime *) + (void *)mhp->ext[SADB_EXT_LIFETIME_HARD]; } src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; - xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY]; + xpl0 = (struct sadb_x_policy *)(void *)mhp->ext[SADB_X_EXT_POLICY]; /* make secindex */ /* XXX boundary check against sa_len */ @@ -2289,7 +2322,7 @@ key_spdadd( /* n is already freed */ return key_senderror(so, m, ENOBUFS); } - xpl = (struct sadb_x_policy *)(mtod(mpolicy, caddr_t) + off); + xpl = (struct sadb_x_policy *)(void *)(mtod(mpolicy, caddr_t) + off); if (xpl->sadb_x_policy_exttype != SADB_X_EXT_POLICY) { m_freem(n); return key_senderror(so, m, EINVAL); @@ -2377,7 +2410,7 @@ key_spddelete( src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; - xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY]; + xpl0 = (struct sadb_x_policy *)(void *)mhp->ext[SADB_X_EXT_POLICY]; /* make secindex */ /* XXX boundary check against sa_len */ @@ -2469,7 +2502,8 @@ key_spddelete2( return 0; } - id = ((struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id; + id = ((struct 
sadb_x_policy *) + (void *)mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id; /* Is there SP in SPD ? */ lck_mtx_lock(sadb_mutex); @@ -2493,9 +2527,9 @@ key_spddelete2( if (len > MCLBYTES) return key_senderror(so, m, ENOBUFS); - MGETHDR(n, M_DONTWAIT, MT_DATA); + MGETHDR(n, M_WAITOK, MT_DATA); if (n && len > MHLEN) { - MCLGET(n, M_DONTWAIT); + MCLGET(n, M_WAITOK); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; @@ -2517,7 +2551,7 @@ key_spddelete2( #endif n->m_next = m_copym(m, mhp->extoff[SADB_X_EXT_POLICY], - mhp->extlen[SADB_X_EXT_POLICY], M_DONTWAIT); + mhp->extlen[SADB_X_EXT_POLICY], M_WAITOK); if (!n->m_next) { m_freem(n); return key_senderror(so, m, ENOBUFS); @@ -2570,7 +2604,8 @@ key_spdget( return key_senderror(so, m, EINVAL); } - id = ((struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id; + id = ((struct sadb_x_policy *) + (void *)mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id; /* Is there SP in SPD ? */ lck_mtx_lock(sadb_mutex); @@ -2940,7 +2975,7 @@ key_spdexpire( lt->sadb_lifetime_bytes = 0; lt->sadb_lifetime_addtime = sp->created; lt->sadb_lifetime_usetime = sp->lastused; - lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2); + lt = (struct sadb_lifetime *)(void *)(mtod(m, caddr_t) + len / 2); lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime)); lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; lt->sadb_lifetime_allocations = 0; @@ -3061,7 +3096,7 @@ key_newsah( /* * delete SA index and all SA registerd. */ -static void +void key_delsah( struct secashead *sah) { @@ -3185,7 +3220,7 @@ key_newsav( *errp = EINVAL; return NULL; } - xsa = (const struct sadb_sa *)mhp->ext[SADB_EXT_SA]; + xsa = (struct sadb_sa *)(void *)mhp->ext[SADB_EXT_SA]; key_setspi(newsav, xsa->sadb_sa_spi); newsav->seq = mhp->msg->sadb_msg_seq; break; @@ -3226,10 +3261,115 @@ key_newsav( return newsav; } +/* + * allocating a new SA with LARVAL state. key_add() and key_getspi() call, + * and copy the values passed into new buffer. + * When SAD message type is GETSPI: + * to set sequence number from acq_seq++, + * to set zero to SPI. + * not to call key_setsava(). + * OUT: NULL : fail + * others : pointer to new secasvar. + */ +struct secasvar * +key_newsav2(struct secashead *sah, + u_int8_t satype, + u_int8_t alg_auth, + u_int8_t alg_enc, + u_int32_t flags, + u_int8_t replay, + struct sadb_key *key_auth, + u_int16_t key_auth_len, + struct sadb_key *key_enc, + u_int16_t key_enc_len, + u_int16_t natt_port, + u_int32_t seq, + u_int32_t spi, + u_int32_t pid, + struct sadb_lifetime *lifetime_hard, + struct sadb_lifetime *lifetime_soft) +{ + struct secasvar *newsav; + + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + + /* sanity check */ + if (sah == NULL) + panic("key_newsa: NULL pointer is passed.\n"); + + KMALLOC_NOWAIT(newsav, struct secasvar *, sizeof(struct secasvar)); + if (newsav == NULL) { + lck_mtx_unlock(sadb_mutex); + KMALLOC_WAIT(newsav, struct secasvar *, sizeof(struct secasvar)); + lck_mtx_lock(sadb_mutex); + if (newsav == NULL) { + ipseclog((LOG_DEBUG, "key_newsa: No more memory.\n")); + return NULL; + } + } + bzero((caddr_t)newsav, sizeof(struct secasvar)); + +#if IPSEC_DOSEQCHECK + /* sync sequence number */ + if (seq == 0) + newsav->seq = (acq_seq = (acq_seq == ~0 ? 
1 : ++acq_seq)); + else +#endif + newsav->seq = seq; + key_setspi(newsav, spi); + + if (key_setsaval2(newsav, + satype, + alg_auth, + alg_enc, + flags, + replay, + key_auth, + key_auth_len, + key_enc, + key_enc_len, + natt_port, + seq, + spi, + pid, + lifetime_hard, + lifetime_soft)) { + if (newsav->spihash.le_prev || newsav->spihash.le_next) + LIST_REMOVE(newsav, spihash); + KFREE(newsav); + return NULL; + } + + /* reset created */ + { + struct timeval tv; + microtime(&tv); + newsav->created = tv.tv_sec; + } + + newsav->pid = pid; + + /* add to satree */ + newsav->sah = sah; + newsav->refcnt = 1; + if (spi && key_auth && key_auth_len && key_enc && key_enc_len) { + newsav->state = SADB_SASTATE_MATURE; + LIST_INSERT_TAIL(&sah->savtree[SADB_SASTATE_MATURE], newsav, + secasvar, chain); + } else { + newsav->state = SADB_SASTATE_LARVAL; + LIST_INSERT_TAIL(&sah->savtree[SADB_SASTATE_LARVAL], newsav, + secasvar, chain); + } + ipsec_sav_count++; + + return newsav; +} + /* * free() SA variable entry. */ -static void +void key_delsav( struct secasvar *sav) { @@ -3316,6 +3456,21 @@ key_getsah( return NULL; } +struct secashead * +key_newsah2 (struct secasindex *saidx, + u_int8_t dir) +{ + struct secashead *sah; + + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + + sah = key_getsah(saidx); + if (!sah) { + return(key_newsah(saidx, dir)); + } + return sah; +} + /* * check not to be duplicated SPI. * NOTE: this function is too slow due to searching all SAD. @@ -3448,7 +3603,7 @@ key_setsaval( if (mhp->ext[SADB_EXT_SA] != NULL) { const struct sadb_sa *sa0; - sa0 = (const struct sadb_sa *)mhp->ext[SADB_EXT_SA]; + sa0 = (struct sadb_sa *)(void *)mhp->ext[SADB_EXT_SA]; if (mhp->extlen[SADB_EXT_SA] < sizeof(*sa0)) { ipseclog((LOG_DEBUG, "key_setsaval: invalid message size.\n")); error = EINVAL; @@ -3647,7 +3802,8 @@ key_setsaval( { const struct sadb_lifetime *lft0; - lft0 = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_HARD]; + lft0 = (struct sadb_lifetime *) + (void *)mhp->ext[SADB_EXT_LIFETIME_HARD]; if (lft0 != NULL) { if (mhp->extlen[SADB_EXT_LIFETIME_HARD] < sizeof(*lft0)) { ipseclog((LOG_DEBUG, "key_setsaval: invalid hard lifetime ext len.\n")); @@ -3664,7 +3820,8 @@ key_setsaval( /* to be initialize ? */ } - lft0 = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_SOFT]; + lft0 = (struct sadb_lifetime *) + (void *)mhp->ext[SADB_EXT_LIFETIME_SOFT]; if (lft0 != NULL) { if (mhp->extlen[SADB_EXT_LIFETIME_SOFT] < sizeof(*lft0)) { ipseclog((LOG_DEBUG, "key_setsaval: invalid soft lifetime ext len.\n")); @@ -3725,6 +3882,224 @@ key_setsaval( return error; } +/* + * copy SA values from PF_KEY message except *SPI, SEQ, PID, STATE and TYPE*. + * You must update these if need. + * OUT: 0: success. + * !0: failure. + * + * does not modify mbuf. does not free mbuf on error. 
+ */ +int +key_setsaval2(struct secasvar *sav, + u_int8_t satype, + u_int8_t alg_auth, + u_int8_t alg_enc, + u_int32_t flags, + u_int8_t replay, + struct sadb_key *key_auth, + u_int16_t key_auth_len, + struct sadb_key *key_enc, + u_int16_t key_enc_len, + u_int16_t natt_port, + u_int32_t seq, + u_int32_t spi, + u_int32_t pid, + struct sadb_lifetime *lifetime_hard, + struct sadb_lifetime *lifetime_soft) +{ +#if IPSEC_ESP + const struct esp_algorithm *algo; +#endif + int error = 0; + struct timeval tv; + + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); + + /* initialization */ + sav->replay = NULL; + sav->key_auth = NULL; + sav->key_enc = NULL; + sav->sched = NULL; + sav->schedlen = 0; + sav->iv = NULL; + sav->lft_c = NULL; + sav->lft_h = NULL; + sav->lft_s = NULL; + sav->remote_ike_port = 0; + sav->natt_last_activity = natt_now; + sav->natt_encapsulated_src_port = 0; + + sav->alg_auth = alg_auth; + sav->alg_enc = alg_enc; + sav->flags = flags; + sav->pid = pid; + sav->seq = seq; + key_setspi(sav, htonl(spi)); + + /* + * Verify that a nat-traversal port was specified if + * the nat-traversal flag is set. + */ + if ((sav->flags & SADB_X_EXT_NATT) != 0) { + if (natt_port == 0) { + ipseclog((LOG_DEBUG, "key_setsaval2: natt port not set.\n")); + error = EINVAL; + goto fail; + } + sav->remote_ike_port = natt_port; + } + + /* + * Verify if SADB_X_EXT_NATT_MULTIPLEUSERS flag is set that + * SADB_X_EXT_NATT is set and SADB_X_EXT_NATT_KEEPALIVE is not + * set (we're not behind nat) - otherwise clear it. + */ + if ((sav->flags & SADB_X_EXT_NATT_MULTIPLEUSERS) != 0) + if ((sav->flags & SADB_X_EXT_NATT) == 0 || + (sav->flags & SADB_X_EXT_NATT_KEEPALIVE) != 0) + sav->flags &= ~SADB_X_EXT_NATT_MULTIPLEUSERS; + + /* replay window */ + if ((flags & SADB_X_EXT_OLD) == 0) { + sav->replay = keydb_newsecreplay(replay); + if (sav->replay == NULL) { + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); + error = ENOBUFS; + goto fail; + } + } + + /* Authentication keys */ + sav->key_auth = (__typeof__(sav->key_auth))key_newbuf(key_auth, key_auth_len); + if (sav->key_auth == NULL) { + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); + error = ENOBUFS; + goto fail; + } + + /* Encryption key */ + sav->key_enc = (__typeof__(sav->key_enc))key_newbuf(key_enc, key_enc_len); + if (sav->key_enc == NULL) { + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); + error = ENOBUFS; + goto fail; + } + + /* set iv */ + sav->ivlen = 0; + + if (satype == SADB_SATYPE_ESP) { +#if IPSEC_ESP + algo = esp_algorithm_lookup(sav->alg_enc); + if (algo && algo->ivlen) + sav->ivlen = (*algo->ivlen)(algo, sav); + if (sav->ivlen != 0) { + KMALLOC_NOWAIT(sav->iv, caddr_t, sav->ivlen); + if (sav->iv == 0) { + lck_mtx_unlock(sadb_mutex); + KMALLOC_WAIT(sav->iv, caddr_t, sav->ivlen); + lck_mtx_lock(sadb_mutex); + if (sav->iv == 0) { + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); + error = ENOBUFS; + goto fail; + } + } + /* initialize */ + key_randomfill(sav->iv, sav->ivlen); + } +#endif + } + + /* reset created */ + microtime(&tv); + sav->created = tv.tv_sec; + + /* make lifetime for CURRENT */ + KMALLOC_NOWAIT(sav->lft_c, struct sadb_lifetime *, + sizeof(struct sadb_lifetime)); + if (sav->lft_c == NULL) { + lck_mtx_unlock(sadb_mutex); + KMALLOC_WAIT(sav->lft_c, struct sadb_lifetime *, + sizeof(struct sadb_lifetime)); + lck_mtx_lock(sadb_mutex); + if (sav->lft_c == NULL) { + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); + error = ENOBUFS; + goto fail; + } + } + + microtime(&tv); + + 
sav->lft_c->sadb_lifetime_len = + PFKEY_UNIT64(sizeof(struct sadb_lifetime)); + sav->lft_c->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; + sav->lft_c->sadb_lifetime_allocations = 0; + sav->lft_c->sadb_lifetime_bytes = 0; + sav->lft_c->sadb_lifetime_addtime = tv.tv_sec; + sav->lft_c->sadb_lifetime_usetime = 0; + + /* lifetimes for HARD and SOFT */ + sav->lft_h = (__typeof__(sav->lft_h))key_newbuf(lifetime_hard, + sizeof(*lifetime_hard)); + if (sav->lft_h == NULL) { + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); + error = ENOBUFS; + goto fail; + } + sav->lft_s = (__typeof__(sav->lft_s))key_newbuf(lifetime_soft, + sizeof(*lifetime_soft)); + if (sav->lft_s == NULL) { + ipseclog((LOG_DEBUG, "key_setsaval: No more memory.\n")); + error = ENOBUFS; + goto fail; + } + + return 0; + +fail: + /* initialization */ + if (sav->replay != NULL) { + keydb_delsecreplay(sav->replay); + sav->replay = NULL; + } + if (sav->key_auth != NULL) { + bzero(_KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth)); + KFREE(sav->key_auth); + sav->key_auth = NULL; + } + if (sav->key_enc != NULL) { + bzero(_KEYBUF(sav->key_enc), _KEYLEN(sav->key_enc)); + KFREE(sav->key_enc); + sav->key_enc = NULL; + } + if (sav->sched) { + bzero(sav->sched, sav->schedlen); + KFREE(sav->sched); + sav->sched = NULL; + } + if (sav->iv != NULL) { + KFREE(sav->iv); + sav->iv = NULL; + } + if (sav->lft_c != NULL) { + KFREE(sav->lft_c); + sav->lft_c = NULL; + } + if (sav->lft_h != NULL) { + KFREE(sav->lft_h); + sav->lft_h = NULL; + } + if (sav->lft_s != NULL) { + KFREE(sav->lft_s); + sav->lft_s = NULL; + } + + return error; +} + /* * validation with a secasvar entry, and set SADB_SATYPE_MATURE. * OUT: 0: valid @@ -4016,7 +4391,7 @@ key_setdumpsa( if ((!m && !p) || (m && p)) goto fail; if (p && tres) { - M_PREPEND(tres, l, M_DONTWAIT); + M_PREPEND(tres, l, M_WAITOK); if (!tres) goto fail; bcopy(p, mtod(tres, caddr_t), l); @@ -4416,7 +4791,7 @@ key_ismyaddr( #if INET case AF_INET: lck_rw_lock_shared(in_ifaddr_rwlock); - sin = (struct sockaddr_in *)sa; + sin = (struct sockaddr_in *)(void *)sa; for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) { IFA_LOCK_SPIN(&ia->ia_ifa); @@ -4435,7 +4810,7 @@ key_ismyaddr( #endif #if INET6 case AF_INET6: - return key_ismyaddr6((struct sockaddr_in6 *)sa); + return key_ismyaddr6((struct sockaddr_in6 *)(void *)sa); #endif } @@ -4922,7 +5297,8 @@ key_timehandler(void) */ if (savkabuf && savkacount < savbufcount) { sav = LIST_FIRST(&sah->savtree[SADB_SASTATE_MATURE]); //%%% should we check dying list if this is empty??? 
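/*
 * The key_timehandler() hunk immediately below widens the keepalive
 * test so that an SA flagged with either SADB_X_EXT_NATT_KEEPALIVE or
 * the new SADB_X_EXT_ESP_KEEPALIVE is collected for keepalive
 * processing, taking a reference while it sits on the keepalive list.
 * A minimal stand-alone mirror of that selection logic; flag values
 * and types here are illustrative, not the kernel's.
 */
#include <stddef.h>
#include <stdint.h>

#define F_NATT_KEEPALIVE	0x01
#define F_ESP_KEEPALIVE		0x02	/* now honored alongside NAT-T */

struct sa_entry {
	uint32_t flags;
	uint32_t refcnt;
};

static int
want_keepalive(struct sa_entry *sav, int keepalive_interval)
{
	if (keepalive_interval == 0 || sav == NULL)
		return (0);
	if ((sav->flags & (F_NATT_KEEPALIVE | F_ESP_KEEPALIVE)) == 0)
		return (0);
	sav->refcnt++;		/* hold the SA while it is queued */
	return (1);
}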
- if (natt_keepalive_interval && sav && (sav->flags & SADB_X_EXT_NATT_KEEPALIVE) != 0) { + if (natt_keepalive_interval && sav && + (sav->flags & (SADB_X_EXT_NATT_KEEPALIVE | SADB_X_EXT_ESP_KEEPALIVE)) != 0) { sav->refcnt++; *savkaptr++ = sav; savkacount++; @@ -5363,8 +5739,10 @@ key_getspi( return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_X_EXT_SA2] != NULL) { - mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode; - reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid; + mode = ((struct sadb_x_sa2 *) + (void *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode; + reqid = ((struct sadb_x_sa2 *) + (void *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid; } else { mode = IPSEC_MODE_ANY; reqid = 0; @@ -5385,13 +5763,13 @@ key_getspi( if (((struct sockaddr *)(src0 + 1))->sa_len != sizeof(struct sockaddr_in)) return key_senderror(so, m, EINVAL); - ((struct sockaddr_in *)(src0 + 1))->sin_port = 0; + ((struct sockaddr_in *)(void *)(src0 + 1))->sin_port = 0; break; case AF_INET6: if (((struct sockaddr *)(src0 + 1))->sa_len != sizeof(struct sockaddr_in6)) return key_senderror(so, m, EINVAL); - ((struct sockaddr_in6 *)(src0 + 1))->sin6_port = 0; + ((struct sockaddr_in6 *)(void *)(src0 + 1))->sin6_port = 0; break; default: ; /*???*/ @@ -5401,13 +5779,13 @@ key_getspi( if (((struct sockaddr *)(dst0 + 1))->sa_len != sizeof(struct sockaddr_in)) return key_senderror(so, m, EINVAL); - ((struct sockaddr_in *)(dst0 + 1))->sin_port = 0; + ((struct sockaddr_in *)(void *)(dst0 + 1))->sin_port = 0; break; case AF_INET6: if (((struct sockaddr *)(dst0 + 1))->sa_len != sizeof(struct sockaddr_in6)) return key_senderror(so, m, EINVAL); - ((struct sockaddr_in6 *)(dst0 + 1))->sin6_port = 0; + ((struct sockaddr_in6 *)(void *)(dst0 + 1))->sin6_port = 0; break; default: ; /*???*/ @@ -5417,10 +5795,10 @@ key_getspi( KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx); lck_mtx_lock(sadb_mutex); - + /* SPI allocation */ - spi = key_do_getnewspi((struct sadb_spirange *)mhp->ext[SADB_EXT_SPIRANGE], - &saidx); + spi = key_do_getnewspi((struct sadb_spirange *) + (void *)mhp->ext[SADB_EXT_SPIRANGE], &saidx); if (spi == 0) { lck_mtx_unlock(sadb_mutex); return key_senderror(so, m, EINVAL); @@ -5476,9 +5854,9 @@ key_getspi( if (len > MCLBYTES) return key_senderror(so, m, ENOBUFS); - MGETHDR(n, M_DONTWAIT, MT_DATA); - if (len > MHLEN) { - MCLGET(n, M_DONTWAIT); + MGETHDR(n, M_WAITOK, MT_DATA); + if (n && len > MHLEN) { + MCLGET(n, M_WAITOK); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; @@ -5494,7 +5872,7 @@ key_getspi( m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off); off += PFKEY_ALIGN8(sizeof(struct sadb_msg)); - m_sa = (struct sadb_sa *)(mtod(n, caddr_t) + off); + m_sa = (struct sadb_sa *)(void *)(mtod(n, caddr_t) + off); m_sa->sadb_sa_len = PFKEY_UNIT64(sizeof(struct sadb_sa)); m_sa->sadb_sa_exttype = SADB_EXT_SA; m_sa->sadb_sa_spi = htonl(spi); @@ -5533,9 +5911,65 @@ key_getspi( } } +u_int32_t +key_getspi2(struct sockaddr *src, + struct sockaddr *dst, + u_int8_t proto, + u_int8_t mode, + u_int32_t reqid, + struct sadb_spirange *spirange) +{ + u_int32_t spi; + struct secasindex saidx; + + lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); + + /* XXX boundary check against sa_len */ + KEY_SETSECASIDX(proto, mode, reqid, src, dst, &saidx); + + /* make sure if port number is zero. 
*/ + switch (((struct sockaddr *)&saidx.src)->sa_family) { + case AF_INET: + if (((struct sockaddr *)&saidx.src)->sa_len != sizeof(struct sockaddr_in)) + return 0; + ((struct sockaddr_in *)&saidx.src)->sin_port = 0; + break; + case AF_INET6: + if (((struct sockaddr *)&saidx.src)->sa_len != sizeof(struct sockaddr_in6)) + return 0; + ((struct sockaddr_in6 *)&saidx.src)->sin6_port = 0; + break; + default: + ; /*???*/ + } + switch (((struct sockaddr *)&saidx.dst)->sa_family) { + case AF_INET: + if (((struct sockaddr *)&saidx.dst)->sa_len != sizeof(struct sockaddr_in)) + return 0; + ((struct sockaddr_in *)&saidx.dst)->sin_port = 0; + break; + case AF_INET6: + if (((struct sockaddr *)&saidx.dst)->sa_len != sizeof(struct sockaddr_in6)) + return 0; + ((struct sockaddr_in6 *)&saidx.dst)->sin6_port = 0; + break; + default: + ; /*???*/ + } + + lck_mtx_lock(sadb_mutex); + + /* SPI allocation */ + spi = key_do_getnewspi(spirange, &saidx); + + lck_mtx_unlock(sadb_mutex); + + return spi; +} + /* * allocating new SPI - * called by key_getspi(). + * called by key_getspi() and key_getspi2(). * OUT: * 0: failure. * others: success. @@ -5673,15 +6107,17 @@ key_update( return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_X_EXT_SA2] != NULL) { - mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode; - reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid; + mode = ((struct sadb_x_sa2 *) + (void *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode; + reqid = ((struct sadb_x_sa2 *) + (void *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid; } else { mode = IPSEC_MODE_ANY; reqid = 0; } /* XXX boundary checking for other extensions */ - sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA]; + sa0 = (struct sadb_sa *)(void *)mhp->ext[SADB_EXT_SA]; src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]); @@ -5689,7 +6125,7 @@ key_update( KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx); lck_mtx_lock(sadb_mutex); - + /* get a SA header */ if ((sah = key_getsah(&saidx)) == NULL) { lck_mtx_unlock(sadb_mutex); @@ -5896,14 +6332,16 @@ key_add( return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_X_EXT_SA2] != NULL) { - mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode; - reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid; + mode = ((struct sadb_x_sa2 *) + (void *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode; + reqid = ((struct sadb_x_sa2 *) + (void *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid; } else { mode = IPSEC_MODE_ANY; reqid = 0; } - sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA]; + sa0 = (struct sadb_sa *)(void *)mhp->ext[SADB_EXT_SA]; src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; @@ -5911,7 +6349,7 @@ key_add( KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx); lck_mtx_lock(sadb_mutex); - + /* get a SA header */ if ((newsah = key_getsah(&saidx)) == NULL) { /* create a new SA header: key_addspi is always used for outbound spi */ @@ -6011,8 +6449,10 @@ key_setident( return EINVAL; } - idsrc = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_SRC]; - iddst = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_DST]; + idsrc = (const struct sadb_ident *) + (void *)mhp->ext[SADB_EXT_IDENTITY_SRC]; + iddst = (const struct sadb_ident *) + (void *)mhp->ext[SADB_EXT_IDENTITY_DST]; idsrclen = mhp->extlen[SADB_EXT_IDENTITY_SRC]; iddstlen = mhp->extlen[SADB_EXT_IDENTITY_DST]; @@ 
-6167,7 +6607,7 @@ key_delete( return key_senderror(so, m, EINVAL); } - sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA]; + sa0 = (struct sadb_sa *)(void *)mhp->ext[SADB_EXT_SA]; src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]); @@ -6354,7 +6794,7 @@ key_get( return key_senderror(so, m, EINVAL); } - sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA]; + sa0 = (struct sadb_sa *)(void *)mhp->ext[SADB_EXT_SA]; src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; @@ -6535,7 +6975,7 @@ key_getcomb_esp(void) if (l > MLEN) panic("assumption failed in key_getcomb_esp"); #endif - MGET(m, M_DONTWAIT, MT_DATA); + MGET(m, M_WAITOK, MT_DATA); if (m) { M_ALIGN(m, l); m->m_len = l; @@ -6560,7 +7000,8 @@ key_getcomb_esp(void) /* m is already freed */ goto fail; } - comb = (struct sadb_comb *)(mtod(n, caddr_t) + o); + comb = (struct sadb_comb *) + (void *)(mtod(n, caddr_t) + o); bzero(comb, sizeof(*comb)); key_getcomb_setlifetime(comb); comb->sadb_comb_encrypt = i; @@ -6619,14 +7060,14 @@ key_getcomb_ah(void) if (l > MLEN) panic("assumption failed in key_getcomb_ah"); #endif - MGET(m, M_DONTWAIT, MT_DATA); + MGET(m, M_WAITOK, MT_DATA); if (m) { M_ALIGN(m, l); m->m_len = l; m->m_next = NULL; } } else - M_PREPEND(m, l, M_DONTWAIT); + M_PREPEND(m, l, M_WAITOK); if (!m) return NULL; @@ -6665,14 +7106,14 @@ key_getcomb_ipcomp(void) if (l > MLEN) panic("assumption failed in key_getcomb_ipcomp"); #endif - MGET(m, M_DONTWAIT, MT_DATA); + MGET(m, M_WAITOK, MT_DATA); if (m) { M_ALIGN(m, l); m->m_len = l; m->m_next = NULL; } } else - M_PREPEND(m, l, M_DONTWAIT); + M_PREPEND(m, l, M_WAITOK); if (!m) return NULL; @@ -6718,7 +7159,7 @@ key_getprop( if (!m) return NULL; - M_PREPEND(m, l, M_DONTWAIT); + M_PREPEND(m, l, M_WAITOK); if (!m) return NULL; @@ -6846,7 +7287,7 @@ key_acquire( } m_cat(result, m); } - + /* XXX identity (optional) */ #if 0 if (idexttype && fqdn) { @@ -7271,9 +7712,9 @@ key_register( if (len > MCLBYTES) return key_senderror(so, m, ENOBUFS); - MGETHDR(n, M_DONTWAIT, MT_DATA); - if (len > MHLEN) { - MCLGET(n, M_DONTWAIT); + MGETHDR(n, M_WAITOK, MT_DATA); + if (n && len > MHLEN) { + MCLGET(n, M_WAITOK); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; @@ -7294,7 +7735,7 @@ key_register( /* for authentication algorithm */ if (alen) { - sup = (struct sadb_supported *)(mtod(n, caddr_t) + off); + sup = (struct sadb_supported *)(void *)(mtod(n, caddr_t) + off); sup->sadb_supported_len = PFKEY_UNIT64(alen); sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH; off += PFKEY_ALIGN8(sizeof(*sup)); @@ -7305,7 +7746,8 @@ key_register( aalgo = ah_algorithm_lookup(i); if (!aalgo) continue; - alg = (struct sadb_alg *)(mtod(n, caddr_t) + off); + alg = (struct sadb_alg *) + (void *)(mtod(n, caddr_t) + off); alg->sadb_alg_id = i; alg->sadb_alg_ivlen = 0; alg->sadb_alg_minbits = aalgo->keymin; @@ -7317,7 +7759,7 @@ key_register( #if IPSEC_ESP /* for encryption algorithm */ if (elen) { - sup = (struct sadb_supported *)(mtod(n, caddr_t) + off); + sup = (struct sadb_supported *)(void *)(mtod(n, caddr_t) + off); sup->sadb_supported_len = PFKEY_UNIT64(elen); sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT; off += PFKEY_ALIGN8(sizeof(*sup)); @@ -7328,7 +7770,8 @@ key_register( ealgo = esp_algorithm_lookup(i); if (!ealgo) continue; - alg = (struct sadb_alg *)(mtod(n, caddr_t) + off); + alg = (struct sadb_alg *) + (void *)(mtod(n, caddr_t) + off); alg->sadb_alg_id = i; if 
(ealgo && ealgo->ivlen) { /* @@ -7464,7 +7907,7 @@ key_expire( lt->sadb_lifetime_bytes = sav->lft_c->sadb_lifetime_bytes; lt->sadb_lifetime_addtime = sav->lft_c->sadb_lifetime_addtime; lt->sadb_lifetime_usetime = sav->lft_c->sadb_lifetime_usetime; - lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2); + lt = (struct sadb_lifetime *)(void *)(mtod(m, caddr_t) + len / 2); bcopy(sav->lft_s, lt, sizeof(*lt)); m_cat(result, m); @@ -7893,9 +8336,9 @@ key_parse( if (m->m_next) { struct mbuf *n; - MGETHDR(n, M_DONTWAIT, MT_DATA); + MGETHDR(n, M_WAITOK, MT_DATA); if (n && m->m_pkthdr.len > MHLEN) { - MCLGET(n, M_DONTWAIT); + MCLGET(n, M_WAITOK); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; @@ -8137,7 +8580,7 @@ key_align( /* m is already freed */ return ENOBUFS; } - ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff); + ext = (struct sadb_ext *)(void *)(mtod(n, caddr_t) + toff); /* set pointer */ switch (ext->sadb_ext_type) { @@ -8197,7 +8640,7 @@ key_align( /* m is already freed */ return ENOBUFS; } - ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff); + ext = (struct sadb_ext *)(void *)(mtod(n, caddr_t) + toff); mhp->ext[ext->sadb_ext_type] = ext; mhp->extoff[ext->sadb_ext_type] = off; @@ -8245,8 +8688,8 @@ key_validate_ext( break; case SADB_EXT_IDENTITY_SRC: case SADB_EXT_IDENTITY_DST: - if (((const struct sadb_ident *)ext)->sadb_ident_type == - SADB_X_IDENTTYPE_ADDR) { + if (((struct sadb_ident *)(uintptr_t)(size_t)ext)-> + sadb_ident_type == SADB_X_IDENTTYPE_ADDR) { baselen = PFKEY_ALIGN8(sizeof(struct sadb_ident)); checktype = ADDR; } else @@ -8379,7 +8822,7 @@ key_sa_routechange( return; } -static void +void key_sa_chgstate( struct secasvar *sav, u_int8_t state) @@ -8567,7 +9010,8 @@ key_getsastat (struct socket *so, } bzero(sa_stats_sav, bufsize); - sa_stats_arg = (__typeof__(sa_stats_arg))mhp->ext[SADB_EXT_SASTAT]; + sa_stats_arg = (__typeof__(sa_stats_arg)) + (void *)mhp->ext[SADB_EXT_SASTAT]; arg_count = sa_stats_arg->sadb_sastat_list_len; // exit early if there are no requested SAs if (arg_count == 0) { @@ -8591,7 +9035,8 @@ key_getsastat (struct socket *so, goto end; } - session_id = (__typeof__(session_id))mhp->ext[SADB_EXT_SESSION_ID]; + session_id = (__typeof__(session_id)) + (void *)mhp->ext[SADB_EXT_SESSION_ID]; /* send this to the userland. 
*/ n = key_setdumpsastats(sa_stats_arg->sadb_sastat_dir, diff --git a/bsd/netkey/key.h b/bsd/netkey/key.h index 3dda20469..f2a2729a1 100644 --- a/bsd/netkey/key.h +++ b/bsd/netkey/key.h @@ -48,6 +48,10 @@ struct sockaddr; struct socket; struct sadb_msg; struct sadb_x_policy; +struct secasindex; +struct secashead; +struct sadb_key; +struct sadb_lifetime; extern struct secpolicy *key_allocsp(struct secpolicyindex *, u_int); extern struct secasvar *key_allocsa_policy(struct secasindex *); @@ -75,7 +79,34 @@ extern void key_domain_init(void); extern int key_checktunnelsanity(struct secasvar *, u_int, caddr_t, caddr_t); extern void key_sa_recordxfer(struct secasvar *, struct mbuf *); extern void key_sa_routechange(struct sockaddr *); +extern void key_sa_chgstate(struct secasvar *, u_int8_t); extern void key_sa_stir_iv(struct secasvar *); +extern void key_delsah(struct secashead *sah); +extern struct secashead *key_newsah2 (struct secasindex *saidx, u_int8_t dir); +extern u_int32_t key_getspi2(struct sockaddr *src, + struct sockaddr *dst, + u_int8_t proto, + u_int8_t mode, + u_int32_t reqid, + struct sadb_spirange *spirange); +extern struct secasvar * key_newsav2(struct secashead *sah, + u_int8_t satype, + u_int8_t alg_auth, + u_int8_t alg_enc, + u_int32_t flags, + u_int8_t replay, + struct sadb_key *key_auth, + u_int16_t key_auth_len, + struct sadb_key *key_enc, + u_int16_t key_enc_len, + u_int16_t natt_port, + u_int32_t seq, + u_int32_t spi, + u_int32_t pid, + struct sadb_lifetime *lifetime_hard, + struct sadb_lifetime *lifetime_soft); +extern void key_delsav(struct secasvar *sav); + #endif /* KERNEL_PRIVATE */ #endif /* _NETKEY_KEY_H_ */ diff --git a/bsd/netkey/keydb.h b/bsd/netkey/keydb.h index e304c336f..079f6e288 100644 --- a/bsd/netkey/keydb.h +++ b/bsd/netkey/keydb.h @@ -37,6 +37,7 @@ #ifdef KERNEL_PRIVATE #include +#include /* Security Assocciation Index */ /* NOTE: Ensure to be same address family */ @@ -68,6 +69,9 @@ struct secashead { struct route sa_route; /* route cache */ }; +typedef int (*utun_is_keepalive_func) __P((void *, void *, u_int16_t, u_int32_t, size_t)); +typedef int (*utun_input_func) __P((void *, void *, protocol_family_t family)); + /* Security Association */ struct secasvar { LIST_ENTRY(secasvar) chain; @@ -103,6 +107,10 @@ struct secasvar { u_int32_t natt_last_activity; u_int16_t remote_ike_port; u_int16_t natt_encapsulated_src_port; /* network byte order */ + + void *utun_pcb; + utun_is_keepalive_func utun_is_keepalive_fn; + utun_input_func utun_in_fn; }; /* replay prevention */ diff --git a/bsd/nfs/nfs.h b/bsd/nfs/nfs.h index 41b025389..bbb466d93 100644 --- a/bsd/nfs/nfs.h +++ b/bsd/nfs/nfs.h @@ -86,7 +86,7 @@ __private_extern__ int nfs_ticks; #define NFS_MAXREXMIT 100 /* Stop counting after this many */ #define NFS_RETRANS 10 /* Num of retrans for soft mounts */ #define NFS_TRYLATERDEL 4 /* Initial try later delay (sec) */ -#define NFS_MAXGRPS 16 /* Max. size of groups list */ +#define NFS_MAXGRPS 16U /* Max. 
size of groups list */ #define NFS_MINATTRTIMO 5 /* Attribute cache timeout in sec */ #define NFS_MAXATTRTIMO 60 #define NFS_MINDIRATTRTIMO 5 /* directory attribute cache timeout in sec */ @@ -476,6 +476,7 @@ struct user_nfs_export_args { #define NX_MAPALL 0x0008 /* map all access to anon credential */ #define NX_32BITCLIENTS 0x0020 /* restrict directory cookies to 32 bits */ #define NX_OFFLINE 0x0040 /* export is offline */ +#define NX_MANGLEDNAMES 0x0080 /* export will return mangled names for names > 255 bytes */ /* * fs.nfs sysctl(3) export stats record structures @@ -675,6 +676,13 @@ __private_extern__ int nfsrv_async, nfsrv_export_hash_size, nfsrv_reqcache_size, nfsrv_sock_max_rec_queue_length; __private_extern__ uint32_t nfsrv_gss_context_ttl; __private_extern__ struct nfsstats nfsstats; +#define NFS_UC_Q_DEBUG +#ifdef NFS_UC_Q_DEBUG +__private_extern__ int nfsrv_uc_use_proxy; +__private_extern__ uint32_t nfsrv_uc_queue_limit; +__private_extern__ uint32_t nfsrv_uc_queue_max_seen; +__private_extern__ volatile uint32_t nfsrv_uc_queue_count; +#endif #endif // KERNEL @@ -686,38 +694,38 @@ __private_extern__ struct nfsstats nfsstats; * Stats structure */ struct nfsstats { - int attrcache_hits; - int attrcache_misses; - int lookupcache_hits; - int lookupcache_misses; - int direofcache_hits; - int direofcache_misses; - int biocache_reads; - int read_bios; - int read_physios; - int biocache_writes; - int write_bios; - int write_physios; - int biocache_readlinks; - int readlink_bios; - int biocache_readdirs; - int readdir_bios; - int rpccnt[NFS_NPROCS]; - int rpcretries; - int srvrpccnt[NFS_NPROCS]; - int srvrpc_errs; - int srv_errs; - int rpcrequests; - int rpctimeouts; - int rpcunexpected; - int rpcinvalid; - int srvcache_inproghits; - int srvcache_idemdonehits; - int srvcache_nonidemdonehits; - int srvcache_misses; - int srvvop_writes; - int pageins; - int pageouts; + uint64_t attrcache_hits; + uint64_t attrcache_misses; + uint64_t lookupcache_hits; + uint64_t lookupcache_misses; + uint64_t direofcache_hits; + uint64_t direofcache_misses; + uint64_t biocache_reads; + uint64_t read_bios; + uint64_t read_physios; + uint64_t biocache_writes; + uint64_t write_bios; + uint64_t write_physios; + uint64_t biocache_readlinks; + uint64_t readlink_bios; + uint64_t biocache_readdirs; + uint64_t readdir_bios; + uint64_t rpccnt[NFS_NPROCS]; + uint64_t rpcretries; + uint64_t srvrpccnt[NFS_NPROCS]; + uint64_t srvrpc_errs; + uint64_t srv_errs; + uint64_t rpcrequests; + uint64_t rpctimeouts; + uint64_t rpcunexpected; + uint64_t rpcinvalid; + uint64_t srvcache_inproghits; + uint64_t srvcache_idemdonehits; + uint64_t srvcache_nonidemdonehits; + uint64_t srvcache_misses; + uint64_t srvvop_writes; + uint64_t pageins; + uint64_t pageouts; }; #endif @@ -790,6 +798,7 @@ struct nfs_fs_locations; struct nfs_location_index; struct nfs_socket; struct nfs_socket_search; +struct nfsrv_uc_arg; /* * The set of signals the interrupt an I/O in progress for NFSMNT_INT mounts. 
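/*
 * The struct nfsstats conversion above widens every counter from int
 * to uint64_t -- 32-bit counters wrap quickly on a busy server -- and
 * the update sites in this patch switch from OSAddAtomic() to
 * OSAddAtomic64() to match (see the readdir_bios changes in
 * nfs4_vnops.c further down).  The same idea in portable C11, which
 * is only assumed here to be semantically equivalent to the kernel's
 * OSAddAtomic64():
 */
#include <stdatomic.h>
#include <stdint.h>

struct nfs_stats64 {
	_Atomic uint64_t read_bios;
	_Atomic uint64_t readdir_bios;
};

static inline void
count_readdir_bio(struct nfs_stats64 *st)
{
	/* relaxed ordering suffices for pure statistics */
	atomic_fetch_add_explicit(&st->readdir_bios, 1,
	    memory_order_relaxed);
}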
@@ -942,6 +951,8 @@ __private_extern__ int nfs_lockd_mounts, nfs_lockd_request_sent, nfs_single_des; __private_extern__ int nfs_tprintf_initial_delay, nfs_tprintf_delay; __private_extern__ int nfsiod_thread_count, nfsiod_thread_max, nfs_max_async_writes; __private_extern__ int nfs_idmap_ctrl, nfs_callback_port; +__private_extern__ int nfs_is_mobile; +__private_extern__ uint32_t nfs_squishy_flags; /* bits for nfs_idmap_ctrl: */ #define NFS_IDMAP_CTRL_USE_IDMAP_SERVICE 0x00000001 /* use the ID mapping service */ @@ -971,6 +982,7 @@ struct nfsrv_sock { TAILQ_ENTRY(nfsrv_sock) ns_chain; /* List of all nfsrv_sock's */ TAILQ_ENTRY(nfsrv_sock) ns_svcq; /* List of sockets needing servicing */ TAILQ_ENTRY(nfsrv_sock) ns_wgq; /* List of sockets with a pending write gather */ + struct nfsrv_uc_arg *ns_ua; /* Opaque pointer to upcall */ lck_rw_t ns_rwlock; /* lock for most fields */ socket_t ns_so; mbuf_t ns_nam; @@ -1429,6 +1441,12 @@ void nfs_ephemeral_mount_harvester_start(void); void nfs_ephemeral_mount_harvester(__unused void *arg, __unused wait_result_t wr); #endif +/* socket upcall interfaces */ +void nfsrv_uc_init(void); +void nfsrv_uc_cleanup(void); +void nfsrv_uc_addsock(struct nfsrv_sock *, int); +void nfsrv_uc_dequeue(struct nfsrv_sock *); + __END_DECLS #endif /* KERNEL */ diff --git a/bsd/nfs/nfs4_subs.c b/bsd/nfs/nfs4_subs.c index 84d0aa9f7..69f12d1f7 100644 --- a/bsd/nfs/nfs4_subs.c +++ b/bsd/nfs/nfs4_subs.c @@ -439,7 +439,7 @@ out: * get the list of supported security flavors * * How we get them depends on what args we are given: - * + * * FH? Name? Action * ----- ----- ------ * YES YES Use the fh and name provided @@ -1666,6 +1666,8 @@ nfs4_parsefattr( nfsm_chain_get_32(error, nmc, ace_flags); nfsm_chain_get_32(error, nmc, ace_mask); nfsm_chain_get_32(error, nmc, len); + if (!error && len >= NFS_MAX_WHO) + error = EBADRPC; acl->acl_ace[i].ace_flags = nfs4_ace_nfstype_to_vfstype(ace_type, &error); acl->acl_ace[i].ace_flags |= nfs4_ace_nfsflags_to_vfsflags(ace_flags); acl->acl_ace[i].ace_rights = nfs4_ace_nfsmask_to_vfsrights(ace_mask); @@ -1675,16 +1677,12 @@ nfs4_parsefattr( s = sbuf; slen = sizeof(sbuf); } - if (len >= NFS_MAX_WHO) { - error = EBADRPC; - } else { - /* Let's add a bit more if we can to the allocation as to try and avoid future allocations */ - MALLOC(s, char*, (len + 16 < NFS_MAX_WHO) ? len+16 : NFS_MAX_WHO, M_TEMP, M_WAITOK); - if (s) - slen = (len + 16 < NFS_MAX_WHO) ? len+16 : NFS_MAX_WHO; - else - error = ENOMEM; - } + /* Let's add a bit more if we can to the allocation as to try and avoid future allocations */ + MALLOC(s, char*, (len + 16 < NFS_MAX_WHO) ? len+16 : NFS_MAX_WHO, M_TEMP, M_WAITOK); + if (s) + slen = (len + 16 < NFS_MAX_WHO) ? len+16 : NFS_MAX_WHO; + else + error = ENOMEM; } if (error2) nfsm_chain_adv(error, nmc, nfsm_rndup(len)); @@ -1999,22 +1997,20 @@ nfs4_parsefattr( } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER)) { nfsm_chain_get_32(error, nmc, len); + if (!error && len >= NFS_MAX_WHO) + error = EBADRPC; if (!error && (len >= slen)) { if (s != sbuf) { FREE(s, M_TEMP); s = sbuf; slen = sizeof(sbuf); } - if (len >= NFS_MAX_WHO) { - error = EBADRPC; - } else { - /* Let's add a bit more if we can to the allocation as to try and avoid future allocations */ - MALLOC(s, char*, (len + 16 < NFS_MAX_WHO) ? len+16 : NFS_MAX_WHO, M_TEMP, M_WAITOK); - if (s) - slen = (len + 16 < NFS_MAX_WHO) ? 
len+16 : NFS_MAX_WHO; - else - error = ENOMEM; - } + /* Let's add a bit more if we can to the allocation as to try and avoid future allocations */ + MALLOC(s, char*, (len + 16 < NFS_MAX_WHO) ? len+16 : NFS_MAX_WHO, M_TEMP, M_WAITOK); + if (s) + slen = (len + 16 < NFS_MAX_WHO) ? len+16 : NFS_MAX_WHO; + else + error = ENOMEM; } nfsm_chain_get_opaque(error, nmc, len, s); if (!error) { @@ -2036,22 +2032,20 @@ nfs4_parsefattr( } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER_GROUP)) { nfsm_chain_get_32(error, nmc, len); + if (!error && len >= NFS_MAX_WHO) + error = EBADRPC; if (!error && (len >= slen)) { if (s != sbuf) { FREE(s, M_TEMP); s = sbuf; slen = sizeof(sbuf); } - if (len >= NFS_MAX_WHO) { - error = EBADRPC; - } else { - /* Let's add a bit more if we can to the allocation as to try and avoid future allocations */ - MALLOC(s, char*, (len + 16 < NFS_MAX_WHO) ? len+16 : NFS_MAX_WHO, M_TEMP, M_WAITOK); - if (s) - slen = (len + 16 < NFS_MAX_WHO) ? len+16 : NFS_MAX_WHO; - else - error = ENOMEM; - } + /* Let's add a bit more if we can to the allocation as to try and avoid future allocations */ + MALLOC(s, char*, (len + 16 < NFS_MAX_WHO) ? len+16 : NFS_MAX_WHO, M_TEMP, M_WAITOK); + if (s) + slen = (len + 16 < NFS_MAX_WHO) ? len+16 : NFS_MAX_WHO; + else + error = ENOMEM; } nfsm_chain_get_opaque(error, nmc, len, s); if (!error) { @@ -2696,4 +2690,3 @@ recheckdeleg: vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid, error); } } - diff --git a/bsd/nfs/nfs4_vnops.c b/bsd/nfs/nfs4_vnops.c index ca874aa7c..6259af593 100644 --- a/bsd/nfs/nfs4_vnops.c +++ b/bsd/nfs/nfs4_vnops.c @@ -778,7 +778,7 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) } else { cookie = bp->nb_lblkno; /* increment with every buffer read */ - OSAddAtomic(1, &nfsstats.readdir_bios); + OSAddAtomic64(1, &nfsstats.readdir_bios); } lastcookie = cookie; @@ -946,7 +946,7 @@ nextbuffer: space_free = nfs_dir_buf_freespace(bp, rdirplus); dp = NFS_DIR_BUF_FIRST_DIRENTRY(bp); /* increment with every buffer read */ - OSAddAtomic(1, &nfsstats.readdir_bios); + OSAddAtomic64(1, &nfsstats.readdir_bios); } nmrepsave = nmrep; dp->d_fileno = cookie; /* placeholder */ @@ -2830,6 +2830,26 @@ out: } if (noop) nfs_open_owner_rele(noop); + + if (!error) { + int ismapped = 0; + nfs_node_lock_force(np); + if ((np->n_flag & NISMAPPED) == 0) { + np->n_flag |= NISMAPPED; + ismapped = 1; + } + nfs_node_unlock(np); + if (ismapped) { + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_state &= ~NFSSTA_SQUISHY; + nmp->nm_curdeadtimeout = nmp->nm_deadtimeout; + if (nmp->nm_curdeadtimeout <= 0) + nmp->nm_deadto_start = 0; + nmp->nm_mappers++; + lck_mtx_unlock(&nmp->nm_lock); + } + } + return (error); } @@ -2849,11 +2869,27 @@ nfs_vnop_mnomap( struct nfs_open_file *nofp = NULL; off_t size; int error; - + int is_mapped_flag = 0; + nmp = VTONMP(vp); if (!nmp) return (ENXIO); + nfs_node_lock_force(np); + if (np->n_flag & NISMAPPED) { + is_mapped_flag = 1; + np->n_flag &= ~NISMAPPED; + } + nfs_node_unlock(np); + if (is_mapped_flag) { + lck_mtx_lock(&nmp->nm_lock); + if (nmp->nm_mappers) + nmp->nm_mappers--; + else + NP(np, "nfs_vnop_mnomap: removing mmap reference from mount, but mount has no files mmapped"); + lck_mtx_unlock(&nmp->nm_lock); + } + /* flush buffers/ubc before we drop the open (in case it's our last open) */ nfs_flush(np, MNT_WAIT, vfs_context_thread(ctx), V_IGNORE_WRITEERR); if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) @@ -3797,7 +3833,8 @@ error_out: wakeup(newnflp); } else { /* remove newnflp from lock list and destroy */ - 
TAILQ_REMOVE(&np->n_locks, newnflp, nfl_link); + if (inqueue) + TAILQ_REMOVE(&np->n_locks, newnflp, nfl_link); nfs_file_lock_destroy(newnflp); } lck_mtx_unlock(&np->n_openlock); @@ -5753,6 +5790,7 @@ nfs_release_open_state_for_node(nfsnode_t np, int force) lck_mtx_lock(&nofp->nof_lock); nofp->nof_flags &= ~NFS_OPEN_FILE_REOPEN; nofp->nof_flags |= NFS_OPEN_FILE_LOST; + lck_mtx_unlock(&nofp->nof_lock); if (!force && nmp && (nmp->nm_vers >= NFS_VER4)) nfs4_close_rpc(np, nofp, NULL, nofp->nof_owner->noo_cred, R_RECOVER); @@ -6984,7 +7022,7 @@ nfs4_named_attr_get( /* FALLTHROUGH */ case -1: /* cache hit, not really an error */ - OSAddAtomic(1, &nfsstats.lookupcache_hits); + OSAddAtomic64(1, &nfsstats.lookupcache_hits); if (!anp && avp) *anpp = anp = VTONFS(avp); @@ -7524,7 +7562,7 @@ nfsmout: /* don't save the data if dirty or potential I/O conflict */ if (!error && bp && !bp->nb_dirtyoff && !(bp->nb_dirty & pagemask) && timevalcmp(&anp->n_lastio, &now, <)) { - OSAddAtomic(1, &nfsstats.read_bios); + OSAddAtomic64(1, &nfsstats.read_bios); CLR(bp->nb_flags, (NB_DONE|NB_ASYNC)); SET(bp->nb_flags, NB_READ); NFS_BUF_MAP(bp); @@ -7951,7 +7989,7 @@ nfs4_vnop_listxattr( nextcookie = lbn = 0; while (!error && !done) { - OSAddAtomic(1, &nfsstats.biocache_readdirs); + OSAddAtomic64(1, &nfsstats.biocache_readdirs); cookie = nextcookie; getbuffer: error = nfs_buf_get(adnp, lbn, NFS_DIRBLKSIZ, vfs_context_thread(ctx), NBLK_READ, &bp); diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index 4bd1bff61..b9d22e786 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -1475,7 +1475,7 @@ nfs_buf_read(struct nfsbuf *bp) NFS_BUF_MAP(bp); - OSAddAtomic(1, &nfsstats.read_bios); + OSAddAtomic64(1, &nfsstats.read_bios); error = nfs_buf_read_rpc(bp, thd, cred); /* @@ -2028,7 +2028,7 @@ nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) } /* count any biocache reads that we just copied directly */ if (lbn != (uio_offset(uio)/biosize)) { - OSAddAtomic((uio_offset(uio)/biosize) - lbn, &nfsstats.biocache_reads); + OSAddAtomic64((uio_offset(uio)/biosize) - lbn, &nfsstats.biocache_reads); FSDBG(514, np, 0xcacefeed, uio_offset(uio), error); } } @@ -2059,7 +2059,7 @@ nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) readaheads = 1; } - OSAddAtomic(1, &nfsstats.biocache_reads); + OSAddAtomic64(1, &nfsstats.biocache_reads); /* * If the block is in the cache and has the required data @@ -2425,7 +2425,7 @@ nfs_buf_write(struct nfsbuf *bp) bp->nb_offio = doff; bp->nb_endio = dend; - OSAddAtomic(1, &nfsstats.write_bios); + OSAddAtomic64(1, &nfsstats.write_bios); SET(bp->nb_flags, NB_WRITEINPROG); error = nfs_buf_write_rpc(bp, iomode, thd, cred); @@ -2613,7 +2613,7 @@ nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) return (0); /* there are pages marked dirty that need to be written out */ - OSAddAtomic(1, &nfsstats.write_bios); + OSAddAtomic64(1, &nfsstats.write_bios); NFS_BUF_MAP(bp); SET(bp->nb_flags, NB_WRITEINPROG); npages = bp->nb_bufsize / PAGE_SIZE; diff --git a/bsd/nfs/nfs_gss.c b/bsd/nfs/nfs_gss.c index f6d019ba2..7633c00b9 100644 --- a/bsd/nfs/nfs_gss.c +++ b/bsd/nfs/nfs_gss.c @@ -89,7 +89,7 @@ #include #include -#include +#include #include #include #include @@ -199,8 +199,8 @@ static int nfs_gss_svc_gssd_upcall(struct nfs_gss_svc_ctx *); static int nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *, uint32_t); #endif /* NFSSERVER */ -static void task_release_special_port(mach_port_t); -static mach_port_t task_copy_special_port(mach_port_t); +static void 
host_release_special_port(mach_port_t); +static mach_port_t host_copy_special_port(mach_port_t); static void nfs_gss_mach_alloc_buffer(u_char *, uint32_t, vm_map_copy_t *); static int nfs_gss_mach_vmcopyout(vm_map_copy_t, uint32_t, u_char *); static int nfs_gss_token_get(gss_key_info *ki, u_char *, u_char *, int, uint32_t *, u_char *); @@ -1320,9 +1320,47 @@ nfs_gss_clnt_svcname(struct nfsmount *nmp) return (svcname); } +/* + * Get a mach port to talk to gssd. + * gssd lives in the root bootstrap, so we call gssd's lookup routine + * to get a send right to talk to a new gssd instance that launchd has launched + * based on the cred's uid and audit session id. + */ +#define kauth_cred_getasid(cred) ((cred)->cr_audit.as_aia_p->ai_asid) +#define kauth_cred_getauid(cred) ((cred)->cr_audit.as_aia_p->ai_auid) + +static mach_port_t +nfs_gss_clnt_get_upcall_port(kauth_cred_t credp) +{ + mach_port_t gssd_host_port, uc_port = IPC_PORT_NULL; + kern_return_t kr; + au_asid_t asid; + uid_t uid; + + kr = host_get_gssd_port(host_priv_self(), &gssd_host_port); + if (kr != KERN_SUCCESS) { + printf("nfs_gss_get_upcall_port: can't get gssd port, status %x (%d)\n", kr, kr); + return (IPC_PORT_NULL); + } + if (!IPC_PORT_VALID(gssd_host_port)) { + printf("nfs_gss_get_upcall_port: gssd port not valid\n"); + return (IPC_PORT_NULL); + } + + asid = kauth_cred_getasid(credp); + uid = kauth_cred_getauid(credp); + if (uid == AU_DEFAUDITID) + uid = kauth_cred_getuid(credp); + kr = mach_gss_lookup(gssd_host_port, uid, asid, &uc_port); + if (kr != KERN_SUCCESS) + printf("nfs_gss_clnt_get_upcall_port: mach_gssd_lookup failed: status %x (%d)\n", kr, kr); + + return (uc_port); +} + /* * Make an upcall to the gssd using Mach RPC - * The upcall is made using a task special port. + * The upcall is made using a host special port. * This allows launchd to fire up the gssd in the * user's session. This is important, since gssd * must have access to the user's credential cache. 
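The routine above replaces the old per-task gssd port: the send right now comes via the host special port, and mach_gss_lookup() hands back a port for a gssd instance that launchd keys off the caller's uid and audit session id. A hedged sketch of the caller-side lifecycle, using only routines that appear in this patch (the wrapper name demo_ctx_ensure_gssd_port is illustrative):

static int
demo_ctx_ensure_gssd_port(struct nfs_gss_clnt_ctx *cp, kauth_cred_t cred)
{
	/* Cache one send right per GSS context and look it up lazily. */
	if (!IPC_PORT_VALID(cp->gss_clnt_mport)) {
		cp->gss_clnt_mport = nfs_gss_clnt_get_upcall_port(cred);
		if (cp->gss_clnt_mport == IPC_PORT_NULL)
			return (EAUTH);
	}
	return (0);
}

On teardown the cached right must then be dropped exactly once, as the context-removal and renew paths below do:

	host_release_special_port(cp->gss_clnt_mport);
	cp->gss_clnt_mport = IPC_PORT_NULL;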
@@ -1351,16 +1389,9 @@ nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) */ uprinc[0] = '\0'; if (!IPC_PORT_VALID(cp->gss_clnt_mport)) { - kr = task_get_gssd_port(get_threadtask(req->r_thread), &cp->gss_clnt_mport); - if (kr != KERN_SUCCESS) { - printf("nfs_gss_clnt_gssd_upcall: can't get gssd port, status %x (%d)\n", kr, kr); + cp->gss_clnt_mport = nfs_gss_clnt_get_upcall_port(req->r_cred); + if (cp->gss_clnt_mport == IPC_PORT_NULL) goto out; - } - if (!IPC_PORT_VALID(cp->gss_clnt_mport)) { - printf("nfs_gss_clnt_gssd_upcall: gssd port not valid\n"); - cp->gss_clnt_mport = NULL; - goto out; - } } if (cp->gss_clnt_tokenlen > 0) @@ -1394,8 +1425,9 @@ retry: nfs_gss_mach_alloc_buffer(cp->gss_clnt_token, cp->gss_clnt_tokenlen, &itoken); goto retry; } - task_release_special_port(cp->gss_clnt_mport); - cp->gss_clnt_mport = NULL; + + host_release_special_port(cp->gss_clnt_mport); + cp->gss_clnt_mport = IPC_PORT_NULL; goto out; } @@ -1583,7 +1615,7 @@ nfs_gss_clnt_ctx_remove(struct nfsmount *nmp, struct nfs_gss_clnt_ctx *cp) if (nmp != NULL) TAILQ_REMOVE(&nmp->nm_gsscl, cp, gss_clnt_entries); - task_release_special_port(cp->gss_clnt_mport); + host_release_special_port(cp->gss_clnt_mport); if (cp->gss_clnt_mtx) lck_mtx_destroy(cp->gss_clnt_mtx, nfs_gss_clnt_grp); @@ -1623,7 +1655,7 @@ nfs_gss_clnt_ctx_renew(struct nfsreq *req) return (0); // already being renewed } saved_uid = cp->gss_clnt_uid; - saved_mport = task_copy_special_port(cp->gss_clnt_mport); + saved_mport = host_copy_special_port(cp->gss_clnt_mport); /* Remove the old context */ cp->gss_clnt_flags |= GSS_CTX_INVAL; @@ -1649,7 +1681,7 @@ nfs_gss_clnt_ctx_renew(struct nfsreq *req) } ncp->gss_clnt_uid = saved_uid; - ncp->gss_clnt_mport = task_copy_special_port(saved_mport); // re-use the gssd port + ncp->gss_clnt_mport = host_copy_special_port(saved_mport); // re-use the gssd port ncp->gss_clnt_mtx = lck_mtx_alloc_init(nfs_gss_clnt_grp, LCK_ATTR_NULL); ncp->gss_clnt_thread = current_thread(); lck_mtx_lock(&nmp->nm_lock); @@ -1662,7 +1694,7 @@ nfs_gss_clnt_ctx_renew(struct nfsreq *req) error = nfs_gss_clnt_ctx_init_retry(req, ncp); // Initialize new context out: - task_release_special_port(saved_mport); + host_release_special_port(saved_mport); if (error) nfs_gss_clnt_ctx_unref(req); @@ -2557,7 +2589,7 @@ nfs_gss_svc_gssd_upcall(struct nfs_gss_svc_ctx *cp) int error = 0; char svcname[] = "nfs"; - kr = task_get_gssd_port(get_threadtask(current_thread()), &mp); + kr = host_get_gssd_port(host_priv_self(), &mp); if (kr != KERN_SUCCESS) { printf("nfs_gss_svc_gssd_upcall: can't get gssd port, status %x (%d)\n", kr, kr); goto out; @@ -2595,11 +2627,11 @@ retry: nfs_gss_mach_alloc_buffer(cp->gss_svc_token, cp->gss_svc_tokenlen, &itoken); goto retry; } - task_release_special_port(mp); + host_release_special_port(mp); goto out; } - task_release_special_port(mp); + host_release_special_port(mp); if (skeylen > 0) { if (skeylen != SKEYLEN && skeylen != SKEYLEN3) { @@ -2765,8 +2797,8 @@ nfs_gss_svc_cleanup(void) */ /* - * Release a task special port that was obtained by task_get_special_port - * or one of its macros (task_get_gssd_port in this case). + * Release a host special port that was obtained by host_get_special_port + * or one of its macros (host_get_gssd_port in this case). * This really should be in a public kpi. 
*/ @@ -2775,16 +2807,16 @@ extern void ipc_port_release_send(ipc_port_t); extern ipc_port_t ipc_port_copy_send(ipc_port_t); static void -task_release_special_port(mach_port_t mp) +host_release_special_port(mach_port_t mp) { if (IPC_PORT_VALID(mp)) ipc_port_release_send(mp); } static mach_port_t -task_copy_special_port(mach_port_t mp) +host_copy_special_port(mach_port_t mp) { - return ipc_port_copy_send(mp); + return (ipc_port_copy_send(mp)); } /* @@ -3393,15 +3425,15 @@ gss_des_crypt(gss_key_info *ki, des_cblock *in, des_cblock *out, switch (ki->type) { case NFS_GSS_1DES: { - des_key_schedule *sched = ((usage == KG_USAGE_SEAL) ? + des_cbc_key_schedule *sched = ((usage == KG_USAGE_SEAL) ? &ki->ks_u.des.gss_sched_Ke : &ki->ks_u.des.gss_sched); - des_cbc_encrypt(in, out, len, *sched, iv, retiv, encrypt); + des_cbc_encrypt(in, out, len, sched, iv, retiv, encrypt); } break; case NFS_GSS_3DES: - des3_cbc_encrypt(in, out, len, ki->ks_u.des3.gss_sched, iv, retiv, encrypt); + des3_cbc_encrypt(in, out, len, &ki->ks_u.des3.gss_sched, iv, retiv, encrypt); break; } } @@ -3419,12 +3451,12 @@ gss_key_init(gss_key_info *ki, uint32_t skeylen) ki->type = NFS_GSS_1DES; ki->hash_len = MD5_DESCBC_DIGEST_LENGTH; ki->ks_u.des.key = (des_cblock *)ki->skey; - rc = des_key_sched(ki->ks_u.des.key, ki->ks_u.des.gss_sched); + rc = des_cbc_key_sched(ki->ks_u.des.key, &ki->ks_u.des.gss_sched); if (rc) return (rc); for (i = 0; i < ki->keybytes; i++) k[0][i] = 0xf0 ^ (*ki->ks_u.des.key)[i]; - rc = des_key_sched(&k[0], ki->ks_u.des.gss_sched_Ke); + rc = des_cbc_key_sched(&k[0], &ki->ks_u.des.gss_sched_Ke); break; case 3*sizeof(des_cblock): ki->type = NFS_GSS_3DES; @@ -3432,7 +3464,7 @@ gss_key_init(gss_key_info *ki, uint32_t skeylen) ki->ks_u.des3.key = (des_cblock (*)[3])ki->skey; des3_derive_key(*ki->ks_u.des3.key, ki->ks_u.des3.ckey, KEY_USAGE_DES3_SIGN, KEY_USAGE_LEN); - rc = des3_key_sched(*ki->ks_u.des3.key, ki->ks_u.des3.gss_sched); + rc = des3_cbc_key_sched(*ki->ks_u.des3.key, &ki->ks_u.des3.gss_sched); if (rc) return (rc); break; diff --git a/bsd/nfs/nfs_gss.h b/bsd/nfs/nfs_gss.h index ad056e7f2..e8cdb5a6a 100644 --- a/bsd/nfs/nfs_gss.h +++ b/bsd/nfs/nfs_gss.h @@ -31,7 +31,7 @@ #include #include -#include +#include #define RPCSEC_GSS 6 #define RPCSEC_GSS_VERS_1 1 @@ -78,13 +78,13 @@ typedef struct { union { struct { des_cblock *key; - des_key_schedule gss_sched; - des_key_schedule gss_sched_Ke; + des_cbc_key_schedule gss_sched; + des_cbc_key_schedule gss_sched_Ke; } des; struct { des_cblock (*key)[3]; des_cblock ckey[3]; - des_key_schedule gss_sched[3]; + des3_cbc_key_schedule gss_sched; } des3; } ks_u; } gss_key_info; diff --git a/bsd/nfs/nfs_gss_crypto.c b/bsd/nfs/nfs_gss_crypto.c index 1d275ba8f..370560b67 100644 --- a/bsd/nfs/nfs_gss_crypto.c +++ b/bsd/nfs/nfs_gss_crypto.c @@ -177,100 +177,6 @@ des3_make_key(const unsigned char randombits[21], des_cblock key[3]) } } -/* - * Make a triple des key schedule, from a triple des key. - */ - -int -des3_key_sched(des_cblock key[3], des_key_schedule sched[3]) -{ - int i; - int rc = 0; - - for (i = 0; i < 3; i++) - rc |= des_key_sched(&key[i], sched[i]); - - return (rc); -} - -/* - * Triple DES cipher block chaining mode encryption. 
- */ - -void -des3_cbc_encrypt(des_cblock *input, des_cblock *output, int32_t length, - des_key_schedule schedule[3], des_cblock *ivec, des_cblock *retvec, int encrypt) -{ - register DES_LONG tin0,tin1; - register DES_LONG tout0,tout1,xor0,xor1; - register unsigned char *in,*out,*retval; - register int32_t l=length; - DES_LONG tin[2]; - unsigned char *iv; - tin0 = tin1 = 0; - - in=(unsigned char *)input; - out=(unsigned char *)output; - retval=(unsigned char *)retvec; - iv=(unsigned char *)ivec; - - if (encrypt) { - c2l(iv,tout0); - c2l(iv,tout1); - for (l-=8; l>=0; l-=8) { - c2l(in,tin0); - c2l(in,tin1); - tin0^=tout0; tin[0]=tin0; - tin1^=tout1; tin[1]=tin1; - des_encrypt3((DES_LONG *)tin,schedule[0], schedule[1], schedule[2]); - tout0=tin[0]; l2c(tout0,out); - tout1=tin[1]; l2c(tout1,out); - } - if (l != -8) { - c2ln(in,tin0,tin1,l+8); - tin0^=tout0; tin[0]=tin0; - tin1^=tout1; tin[1]=tin1; - des_encrypt3((DES_LONG *)tin,schedule[0], schedule[1], schedule[2]); - tout0=tin[0]; l2c(tout0,out); - tout1=tin[1]; l2c(tout1,out); - } - if (retval) { - l2c(tout0,retval); - l2c(tout1,retval); - } - } else { - c2l(iv,xor0); - c2l(iv,xor1); - for (l-=8; l>=0; l-=8) { - c2l(in,tin0); tin[0]=tin0; - c2l(in,tin1); tin[1]=tin1; - des_decrypt3((DES_LONG *)tin,schedule[0],schedule[1],schedule[2]); - tout0=tin[0]^xor0; - tout1=tin[1]^xor1; - l2c(tout0,out); - l2c(tout1,out); - xor0=tin0; - xor1=tin1; - } - if (l != -8) { - c2l(in,tin0); tin[0]=tin0; - c2l(in,tin1); tin[1]=tin1; - des_decrypt3((DES_LONG *)tin,schedule[0],schedule[1],schedule[2]); - tout0=tin[0]^xor0; - tout1=tin[1]^xor1; - l2cn(tout0,tout1,out,l+8); - /* xor0=tin0; - xor1=tin1; */ - } - if (retval) { - l2c(tin0,retval); - l2c(tin1,retval); - } - } - tin0=tin1=tout0=tout1=xor0=xor1=0; - tin[0]=tin[1]=0; -} - /* * Key derivation for triple DES. * Given the session key in in key, produce a new key in out key using @@ -282,7 +188,7 @@ des3_derive_key(des_cblock inkey[3], des_cblock outkey[3], const unsigned char *constant, int clen) { des_cblock inblock, outblock, ivec; - des_key_schedule sched[3]; + des3_cbc_key_schedule sched; unsigned char rawkey[21]; size_t n, keybytes = sizeof(rawkey); @@ -297,9 +203,9 @@ des3_derive_key(des_cblock inkey[3], des_cblock outkey[3], /* loop encrypting the blocks until enough key bytes are generated */ bzero(ivec, sizeof(ivec)); - des3_key_sched(inkey, sched); + des3_cbc_key_sched(inkey, &sched); for (n = 0; n < sizeof(rawkey); n += sizeof(des_cblock)) { - des3_cbc_encrypt(&inblock, &outblock, sizeof(outblock), sched, &ivec, NULL, 1); + des3_cbc_encrypt(&inblock, &outblock, sizeof(outblock), &sched, &ivec, NULL, 1); if ((keybytes - n) <= sizeof (des_cblock)) { memcpy(rawkey+n, outblock, (keybytes - n)); break; @@ -316,7 +222,7 @@ des3_derive_key(des_cblock inkey[3], des_cblock outkey[3], bzero(inblock, sizeof (des_cblock)); bzero(outblock, sizeof (des_cblock)); bzero(rawkey, keybytes); - bzero(sched, sizeof (sched)); + bzero(&sched, sizeof (sched)); return(0); } @@ -375,130 +281,11 @@ HMAC_SHA1_DES3KD_Final(void *digest, HMAC_SHA1_DES3KD_CTX *ctx) SHA1Final(digest, &ctx->sha1_ctx); } -/* - * XXX This function borrowed from OpenBSD. - * It will likely be moved into kernel crypto. 
- */ -DES_LONG -des_cbc_cksum(des_cblock *input, des_cblock *output, - int32_t length, des_key_schedule schedule, des_cblock *ivec) -{ - register DES_LONG tout0,tout1,tin0,tin1; - register int32_t l=length; - DES_LONG tin[2]; - unsigned char *in,*out,*iv; - - in=(unsigned char *)input; - out=(unsigned char *)output; - iv=(unsigned char *)ivec; - - c2l(iv,tout0); - c2l(iv,tout1); - for (; l>0; l-=8) { - if (l >= 8) { - c2l(in,tin0); - c2l(in,tin1); - } else - c2ln(in,tin0,tin1,l); - - tin0^=tout0; tin[0]=tin0; - tin1^=tout1; tin[1]=tin1; - des_encrypt1((DES_LONG *)tin,schedule,DES_ENCRYPT); - /* fix 15/10/91 eay - thanks to keithr@sco.COM */ - tout0=tin[0]; - tout1=tin[1]; - } - if (out != NULL) { - l2c(tout0,out); - l2c(tout1,out); - } - tout0=tin0=tin1=tin[0]=tin[1]=0; - return(tout1); -} - -/* - * XXX This function borrowed from OpenBSD. - * It will likely be moved into kernel crypto. - */ -void -des_cbc_encrypt(des_cblock *input, des_cblock *output, int32_t length, - des_key_schedule schedule, des_cblock *ivec, des_cblock *retvec, int encrypt) -{ - register DES_LONG tin0,tin1; - register DES_LONG tout0,tout1,xor0,xor1; - register unsigned char *in,*out,*retval; - register int32_t l=length; - DES_LONG tin[2]; - unsigned char *iv; - tin0 = tin1 = 0; - - in=(unsigned char *)input; - out=(unsigned char *)output; - retval=(unsigned char *)retvec; - iv=(unsigned char *)ivec; - - if (encrypt) { - c2l(iv,tout0); - c2l(iv,tout1); - for (l-=8; l>=0; l-=8) { - c2l(in,tin0); - c2l(in,tin1); - tin0^=tout0; tin[0]=tin0; - tin1^=tout1; tin[1]=tin1; - des_encrypt1((DES_LONG *)tin,schedule,DES_ENCRYPT); - tout0=tin[0]; l2c(tout0,out); - tout1=tin[1]; l2c(tout1,out); - } - if (l != -8) { - c2ln(in,tin0,tin1,l+8); - tin0^=tout0; tin[0]=tin0; - tin1^=tout1; tin[1]=tin1; - des_encrypt1((DES_LONG *)tin,schedule,DES_ENCRYPT); - tout0=tin[0]; l2c(tout0,out); - tout1=tin[1]; l2c(tout1,out); - } - if (retval) { - l2c(tout0,retval); - l2c(tout1,retval); - } - } else { - c2l(iv,xor0); - c2l(iv,xor1); - for (l-=8; l>=0; l-=8) { - c2l(in,tin0); tin[0]=tin0; - c2l(in,tin1); tin[1]=tin1; - des_encrypt1((DES_LONG *)tin,schedule,DES_DECRYPT); - tout0=tin[0]^xor0; - tout1=tin[1]^xor1; - l2c(tout0,out); - l2c(tout1,out); - xor0=tin0; - xor1=tin1; - } - if (l != -8) { - c2l(in,tin0); tin[0]=tin0; - c2l(in,tin1); tin[1]=tin1; - des_encrypt1((DES_LONG *)tin,schedule,DES_DECRYPT); - tout0=tin[0]^xor0; - tout1=tin[1]^xor1; - l2cn(tout0,tout1,out,l+8); - /* xor0=tin0; - xor1=tin1; */ - } - if (retval) { - l2c(tin0,retval); - l2c(tin1,retval); - } - } - tin0=tin1=tout0=tout1=xor0=xor1=0; - tin[0]=tin[1]=0; -} - /* * Initialize an MD5 DES CBC context with a schedule. */ -void MD5_DESCBC_Init(MD5_DESCBC_CTX *ctx, des_key_schedule *sched) +void MD5_DESCBC_Init(MD5_DESCBC_CTX *ctx, des_cbc_key_schedule *sched) { MD5Init(&ctx->md5_ctx); ctx->sched = sched; @@ -519,7 +306,6 @@ void MD5_DESCBC_Update(MD5_DESCBC_CTX *ctx, void *data, size_t len) void MD5_DESCBC_Final(void *digest, MD5_DESCBC_CTX *ctx) { - des_cblock iv0; unsigned char md5_digest[MD5_DIGEST_LENGTH]; MD5Final(md5_digest, &ctx->md5_ctx); @@ -527,8 +313,7 @@ void MD5_DESCBC_Final(void *digest, MD5_DESCBC_CTX *ctx) /* * Now get the DES CBC checksum for the digest. 
*/ - bzero(iv0, sizeof (iv0)); - (void) des_cbc_cksum((des_cblock *) md5_digest, (des_cblock *)digest, - sizeof (md5_digest), *ctx->sched, &iv0); + des_cbc_cksum((des_cblock *) md5_digest, (des_cblock *)digest, + sizeof (md5_digest), ctx->sched); } diff --git a/bsd/nfs/nfs_gss_crypto.h b/bsd/nfs/nfs_gss_crypto.h index 677647f16..4819dcd9d 100644 --- a/bsd/nfs/nfs_gss_crypto.h +++ b/bsd/nfs/nfs_gss_crypto.h @@ -32,7 +32,7 @@ #include #include #include -#include +#include #define KG_USAGE_SEAL 22 #define KG_USAGE_SIGN 23 @@ -50,7 +50,7 @@ typedef struct { typedef struct { MD5_CTX md5_ctx; - des_key_schedule *sched; + des_cbc_key_schedule *sched; } MD5_DESCBC_CTX; #define MD5_DESCBC_DIGEST_LENGTH 8 @@ -59,18 +59,13 @@ __BEGIN_DECLS void krb5_nfold(unsigned int, const unsigned char *, unsigned int, unsigned char *); void des3_make_key(const unsigned char[21], des_cblock[3]); -int des3_key_sched(des_cblock[3], des_key_schedule[3]); -void des3_cbc_encrypt(des_cblock *, des_cblock *, int32_t, - des_key_schedule[3], des_cblock *, des_cblock *, int); int des3_derive_key(des_cblock[3], des_cblock[3], const unsigned char *, int); + void HMAC_SHA1_DES3KD_Init(HMAC_SHA1_DES3KD_CTX *, des_cblock[3], int); void HMAC_SHA1_DES3KD_Update(HMAC_SHA1_DES3KD_CTX *, void *, size_t); void HMAC_SHA1_DES3KD_Final(void *, HMAC_SHA1_DES3KD_CTX *); -DES_LONG des_cbc_cksum(des_cblock *, des_cblock *, int32_t, des_key_schedule, des_cblock *); -void des_cbc_encrypt(des_cblock *, des_cblock *, int32_t, des_key_schedule, - des_cblock *, des_cblock *, int); -void MD5_DESCBC_Init(MD5_DESCBC_CTX *, des_key_schedule *); +void MD5_DESCBC_Init(MD5_DESCBC_CTX *, des_cbc_key_schedule *); void MD5_DESCBC_Update(MD5_DESCBC_CTX *, void *, size_t); void MD5_DESCBC_Final(void *, MD5_DESCBC_CTX *); diff --git a/bsd/nfs/nfs_lock.c b/bsd/nfs/nfs_lock.c index f76a9b6d0..ad6a0cb53 100644 --- a/bsd/nfs/nfs_lock.c +++ b/bsd/nfs/nfs_lock.c @@ -425,10 +425,9 @@ nfs3_lockd_request( bcopy(saddr, &msg->lm_addr, min(sizeof msg->lm_addr, saddr->sa_len)); if (nmp->nm_vers == NFS_VER3) msg->lm_flags |= LOCKD_MSG_NFSV3; -#if 0 /* not yet */ + if (nmp->nm_sotype != SOCK_DGRAM) msg->lm_flags |= LOCKD_MSG_TCP; -#endif microuptime(&now); starttime = now.tv_sec; diff --git a/bsd/nfs/nfs_node.c b/bsd/nfs/nfs_node.c index b3f2a47b9..7fc9ddaef 100644 --- a/bsd/nfs/nfs_node.c +++ b/bsd/nfs/nfs_node.c @@ -76,6 +76,7 @@ #include #include #include +#include #include #include @@ -1177,3 +1178,36 @@ nfs_data_update_size(nfsnode_t np, int datalocked) FSDBG_BOT(272, np, np->n_flag, np->n_size, np->n_newsize); } +#define DODEBUG 1 +int +nfs_mount_is_dirty(mount_t mp) +{ + u_long i; + nfsnode_t np; +#ifdef DODEBUG + struct timeval now, then, diff; + u_long ncnt = 0; + microuptime(&now); +#endif + lck_mtx_lock(nfs_node_hash_mutex); + for (i = 0; i <= nfsnodehash; i++) { + LIST_FOREACH(np, &nfsnodehashtbl[i], n_hash) { +#ifdef DODEBUG + ncnt++; +#endif + if (np->n_mount == mp && !LIST_EMPTY(&np->n_dirtyblkhd)) + goto out; + } + } +out: + lck_mtx_unlock(nfs_node_hash_mutex); +#ifdef DODEBUG + microuptime(&then); + timersub(&then, &now, &diff); + + printf("nfs_mount_is_dirty took %lld mics for %ld slots and %ld nodes return %d\n", + (uint64_t)diff.tv_sec * 1000000LL + diff.tv_usec, i, ncnt, (i <= nfsnodehash)); +#endif + + return (i <= nfsnodehash); +} diff --git a/bsd/nfs/nfs_serv.c b/bsd/nfs/nfs_serv.c index 956cc9285..8cc717b8e 100644 --- a/bsd/nfs/nfs_serv.c +++ b/bsd/nfs/nfs_serv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
+ * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -237,6 +237,9 @@ nfsrv_init(void) nfsrv_udpsock = NULL; nfsrv_udp6sock = NULL; + /* Setup the up-call handling */ + nfsrv_uc_init(); + /* initialization complete */ nfsrv_initted = NFSRV_INITIALIZED; } @@ -1280,7 +1283,7 @@ nfsrv_write( ioflags = (IO_METASYNC | IO_SYNC | IO_NODELOCKED); error = VNOP_WRITE(vp, auio, ioflags, ctx); - OSAddAtomic(1, &nfsstats.srvvop_writes); + OSAddAtomic64(1, &nfsstats.srvvop_writes); /* update export stats */ NFSStatAdd64(&nx->nx_stats.bytes_written, len); @@ -1559,7 +1562,7 @@ loop1: if ((tlen = mbuf_len(m)) > 0) uio_addiov(auio, CAST_USER_ADDR_T((caddr_t)mbuf_data(m)), tlen); error = VNOP_WRITE(vp, auio, ioflags, ctx); - OSAddAtomic(1, &nfsstats.srvvop_writes); + OSAddAtomic64(1, &nfsstats.srvvop_writes); /* update export stats */ NFSStatAdd64(&nx->nx_stats.bytes_written, nd->nd_len); @@ -2142,6 +2145,7 @@ nfsrv_mknod( uint32_t len = 0, cnflags; u_int32_t major = 0, minor = 0; enum vtype vtyp; + nfstype nvtype; vnode_t vp, dvp, dirp; struct nfs_filehandle nfh; struct nfs_export *nx = NULL; @@ -2192,9 +2196,9 @@ nfsrv_mknod( dvp = ni.ni_dvp; vp = ni.ni_vp; - nfsm_chain_get_32(error, nmreq, vtyp); + nfsm_chain_get_32(error, nmreq, nvtype); nfsmerr_if(error); - vtyp = nfstov_type(vtyp, NFS_VER3); + vtyp = nfstov_type(nvtype, NFS_VER3); if (!error && (vtyp != VCHR) && (vtyp != VBLK) && (vtyp != VSOCK) && (vtyp != VFIFO)) { error = NFSERR_BADTYPE; goto out; @@ -3938,8 +3942,12 @@ nfsrv_readdir( error = nfsrv_credcheck(nd, ctx, nx, nxo); nfsmerr_if(error); + if (nxo->nxo_flags & NX_MANGLEDNAMES || nd->nd_vers == NFS_VER2) + vnopflag |= VNODE_READDIR_NAMEMAX; + if ((nd->nd_vers == NFS_VER2) || (nxo->nxo_flags & NX_32BITCLIENTS)) vnopflag |= VNODE_READDIR_SEEKOFF32; + if (nd->nd_vers == NFS_VER3) { nfsm_srv_vattr_init(&attr, NFS_VER3); error = attrerr = vnode_getattr(vp, &attr, ctx); @@ -4160,6 +4168,9 @@ nfsrv_readdirplus( if (nxo->nxo_flags & NX_32BITCLIENTS) vnopflag |= VNODE_READDIR_SEEKOFF32; + if (nxo->nxo_flags & NX_MANGLEDNAMES) + vnopflag |= VNODE_READDIR_NAMEMAX; + nfsm_srv_vattr_init(&attr, NFS_VER3); error = attrerr = vnode_getattr(vp, &attr, ctx); if (!error && toff && verf && (verf != attr.va_filerev)) diff --git a/bsd/nfs/nfs_socket.c b/bsd/nfs/nfs_socket.c index 71b6e5c44..27126d218 100644 --- a/bsd/nfs/nfs_socket.c +++ b/bsd/nfs/nfs_socket.c @@ -157,6 +157,9 @@ void nfs_reqbusy(struct nfsreq *); struct nfsreq *nfs_reqnext(struct nfsreq *); int nfs_wait_reply(struct nfsreq *); void nfs_softterm(struct nfsreq *); +int nfs_can_squish(struct nfsmount *); +int nfs_is_squishy(struct nfsmount *); +int nfs_is_dead(int, struct nfsmount *); #ifdef NFS_SOCKET_DEBUGGING #define NFS_SOCK_DBG(X) printf X @@ -584,7 +587,7 @@ nfs_socket_options(struct nfsmount *nmp, struct nfs_socket *nso) int on = 1, proto; timeo.tv_usec = 0; - timeo.tv_sec = NMFLAG(nmp, SOFT) ? 5 : 60; + timeo.tv_sec = (NMFLAG(nmp, SOFT) || nfs_can_squish(nmp)) ? 5 : 60; sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); if (nso->nso_sotype == SOCK_STREAM) { @@ -1115,7 +1118,7 @@ keepsearching: else if (ss.ss_family == AF_INET6) ((struct sockaddr_in6*)&ss)->sin6_port = htons(0); error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, - nso->nso_so, NFS_PROG, nfsvers, + nso->nso_so, NFS_PROG, nfsvers, (nso->nso_sotype == SOCK_DGRAM) ? 
IPPROTO_UDP : IPPROTO_TCP, timeo); if (!error) { if (ss.ss_family == AF_INET) @@ -1128,7 +1131,7 @@ keepsearching: if (error && !nmp->nm_vers) { nfsvers = NFS_VER2; error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, - nso->nso_so, NFS_PROG, nfsvers, + nso->nso_so, NFS_PROG, nfsvers, (nso->nso_sotype == SOCK_DGRAM) ? IPPROTO_UDP : IPPROTO_TCP, timeo); if (!error) { if (ss.ss_family == AF_INET) @@ -1246,7 +1249,7 @@ keepsearching: if (saddr) MALLOC(fh, fhandle_t *, sizeof(fhandle_t), M_TEMP, M_WAITOK|M_ZERO); if (saddr && fh) - MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); if (!saddr || !fh || !path) { if (!error) error = ENOMEM; @@ -1498,13 +1501,19 @@ nfs_reconnect(struct nfsmount *nmp) thread_t thd = current_thread(); int error, wentdown = 0, verbose = 1; time_t lastmsg; + int timeo; microuptime(&now); lastmsg = now.tv_sec - (nmp->nm_tprintf_delay - nmp->nm_tprintf_initial_delay); nfs_disconnect(nmp); - while ((error = nfs_connect(nmp, verbose, 30))) { + + lck_mtx_lock(&nmp->nm_lock); + timeo = nfs_is_squishy(nmp) ? 8 : 30; + lck_mtx_unlock(&nmp->nm_lock); + + while ((error = nfs_connect(nmp, verbose, timeo))) { verbose = 0; nfs_disconnect(nmp); if ((error == EINTR) || (error == ERESTART)) @@ -1849,16 +1858,18 @@ nfs_mount_check_dead_timeout(struct nfsmount *nmp) { struct timeval now; - if (nmp->nm_deadtimeout <= 0) - return; if (nmp->nm_deadto_start == 0) return; if (nmp->nm_state & NFSSTA_DEAD) return; + nfs_is_squishy(nmp); + if (nmp->nm_curdeadtimeout <= 0) + return; microuptime(&now); - if ((now.tv_sec - nmp->nm_deadto_start) < nmp->nm_deadtimeout) + if ((now.tv_sec - nmp->nm_deadto_start) < nmp->nm_curdeadtimeout) return; - printf("nfs server %s: dead\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname); + printf("nfs server %s: %sdead\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, + (nmp->nm_curdeadtimeout != nmp->nm_deadtimeout) ? "squished " : ""); nmp->nm_state |= NFSSTA_DEAD; vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_DEAD, 0); } @@ -2360,7 +2371,7 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) status = error; else if ((error == ENOBUFS) || (error == ENOMEM)) status = NFSERR_RESOURCE; - else + else status = NFSERR_SERVERFAULT; error = 0; nfsm_chain_null(&nmrep); @@ -2508,7 +2519,7 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) status = error; else if ((error == ENOBUFS) || (error == ENOMEM)) status = NFSERR_RESOURCE; - else + else status = NFSERR_SERVERFAULT; error = 0; } @@ -2529,7 +2540,7 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) nfsmout: if (status == EBADRPC) - OSAddAtomic(1, &nfsstats.rpcinvalid); + OSAddAtomic64(1, &nfsstats.rpcinvalid); /* build reply header */ error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &mhead); @@ -2838,7 +2849,7 @@ again: microuptime(&now); if ((now.tv_sec - nmp->nm_reconnect_start) >= 8) { /* soft mount in reconnect for a while... terminate ASAP */ - OSAddAtomic(1, &nfsstats.rpctimeouts); + OSAddAtomic64(1, &nfsstats.rpctimeouts); req->r_flags |= R_SOFTTERM; req->r_error = error = ETIMEDOUT; break; @@ -2918,7 +2929,7 @@ again: } else { /* * When retransmitting, turn timing off - * and divide congestion window by 2. + * and divide congestion window by 2. 
*/ req->r_flags &= ~R_TIMING; nmp->nm_cwnd >>= 1; @@ -2970,7 +2981,7 @@ again: /* SUCCESS */ req->r_flags &= ~R_RESENDERR; if (rexmit) - OSAddAtomic(1, &nfsstats.rpcretries); + OSAddAtomic64(1, &nfsstats.rpcretries); req->r_flags |= R_SENT; if (req->r_flags & R_WAITSENT) { req->r_flags &= ~R_WAITSENT; @@ -3052,6 +3063,9 @@ again: !req->r_nmp ? "" : vfs_statfs(req->r_nmp->nm_mountp)->f_mntfromname); + if (nfs_is_dead(error, nmp)) + error = EIO; + /* prefer request termination error over other errors */ error2 = nfs_sigintr(req->r_nmp, req, req->r_thread, 0); if (error2) @@ -3201,6 +3215,7 @@ nfs_sock_poke(struct nfsmount *nmp) msg.msg_iovlen = 1; error = sock_send(nmp->nm_nso->nso_so, &msg, MSG_DONTWAIT, &len); NFS_SOCK_DBG(("nfs_sock_poke: error %d\n", error)); + nfs_is_dead(error, nmp); } /* @@ -3219,7 +3234,7 @@ nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep) nfsm_chain_get_32(error, &nmrep, rxid); nfsm_chain_get_32(error, &nmrep, reply); if (error || (reply != RPC_REPLY)) { - OSAddAtomic(1, &nfsstats.rpcinvalid); + OSAddAtomic64(1, &nfsstats.rpcinvalid); mbuf_freem(mrep); return; } @@ -3307,7 +3322,7 @@ nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep) if (!req) { /* not matched to a request, so drop it. */ lck_mtx_unlock(nfs_request_mutex); - OSAddAtomic(1, &nfsstats.rpcunexpected); + OSAddAtomic64(1, &nfsstats.rpcunexpected); mbuf_freem(mrep); } } @@ -3443,7 +3458,7 @@ nfs_request_create( } if ((nmp->nm_vers != NFS_VER4) && (procnum >= 0) && (procnum < NFS_NPROCS)) - OSAddAtomic(1, &nfsstats.rpccnt[procnum]); + OSAddAtomic64(1, &nfsstats.rpccnt[procnum]); if ((nmp->nm_vers == NFS_VER4) && (procnum != NFSPROC4_COMPOUND) && (procnum != NFSPROC4_NULL)) panic("nfs_request: invalid NFSv4 RPC request %d\n", procnum); @@ -3667,7 +3682,7 @@ nfs_request_send(struct nfsreq *req, int wait) ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay)); } - OSAddAtomic(1, &nfsstats.rpcrequests); + OSAddAtomic64(1, &nfsstats.rpcrequests); /* * Chain request into list of outstanding requests. Be sure @@ -3884,7 +3899,7 @@ nfs_request_finish( if ((req->r_delay >= 30) && !(nmp->nm_state & NFSSTA_MOUNTED)) { /* we're not yet completely mounted and */ /* we can't complete an RPC, so we fail */ - OSAddAtomic(1, &nfsstats.rpctimeouts); + OSAddAtomic64(1, &nfsstats.rpctimeouts); nfs_softterm(req); error = req->r_error; goto nfsmout; @@ -3904,7 +3919,7 @@ nfs_request_finish( } if (NMFLAG(nmp, SOFT) && (req->r_delay == 30) && !(req->r_flags & R_NOINTR)) { /* for soft mounts, just give up after a short while */ - OSAddAtomic(1, &nfsstats.rpctimeouts); + OSAddAtomic64(1, &nfsstats.rpctimeouts); nfs_softterm(req); error = req->r_error; goto nfsmout; @@ -4174,7 +4189,7 @@ nfs_request2( * server. Associate the context that we are setting up with the request that we * are sending. */ - + int nfs_request_gss( mount_t mp, @@ -4192,7 +4207,7 @@ nfs_request_gss( if ((error = nfs_request_create(NULL, mp, nmrest, NFSPROC_NULL, thd, cred, &req))) return (error); req->r_flags |= (flags & R_OPTMASK); - + if (cp == NULL) { printf("nfs_request_gss request has no context\n"); nfs_request_rele(req); @@ -4218,7 +4233,7 @@ nfs_request_gss( nfs_request_rele(req); return (error); } - + /* * Create and start an asynchronous NFS request. 
*/ @@ -4533,7 +4548,7 @@ nfs_request_timer(__unused void *param0, __unused void *param1) lck_mtx_unlock(&nmp->nm_lock); /* we're not yet completely mounted and */ /* we can't complete an RPC, so we fail */ - OSAddAtomic(1, &nfsstats.rpctimeouts); + OSAddAtomic64(1, &nfsstats.rpctimeouts); nfs_softterm(req); finish_asyncio = ((req->r_callback.rcb_func != NULL) && !(req->r_flags & R_WAITSENT)); wakeup(req); @@ -4549,10 +4564,10 @@ nfs_request_timer(__unused void *param0, __unused void *param1) * Put a reasonable limit on the maximum timeout, * and reduce that limit when soft mounts get timeouts or are in reconnect. */ - if (!NMFLAG(nmp, SOFT)) + if (!NMFLAG(nmp, SOFT) && !nfs_can_squish(nmp)) maxtime = NFS_MAXTIMEO; else if ((req->r_flags & (R_SETUP|R_RECOVER)) || - ((nmp->nm_reconnect_start <= 0) || ((now.tv_sec - nmp->nm_reconnect_start) < 8))) + ((nmp->nm_reconnect_start <= 0) || ((now.tv_sec - nmp->nm_reconnect_start) < 8))) maxtime = (NFS_MAXTIMEO / (nmp->nm_timeouts+1))/2; else maxtime = NFS_MINTIMEO/4; @@ -4608,10 +4623,10 @@ nfs_request_timer(__unused void *param0, __unused void *param1) } /* For soft mounts (& SETUPs/RECOVERs), check for too many retransmits/timeout. */ - if ((NMFLAG(nmp, SOFT) || (req->r_flags & (R_SETUP|R_RECOVER))) && + if ((NMFLAG(nmp, SOFT) || (req->r_flags & (R_SETUP|R_RECOVER))) && ((req->r_rexmit >= req->r_retry) || /* too many */ ((now.tv_sec - req->r_start)*NFS_HZ > maxtime))) { /* too long */ - OSAddAtomic(1, &nfsstats.rpctimeouts); + OSAddAtomic64(1, &nfsstats.rpctimeouts); lck_mtx_lock(&nmp->nm_lock); if (!(nmp->nm_state & NFSSTA_TIMEO)) { lck_mtx_unlock(&nmp->nm_lock); @@ -5037,7 +5052,7 @@ nfs_portmap_lookup( pmvers = RPCBVERS4; pmproc = RPCBPROC_GETVERSADDR; } else { - return (EINVAL); + return (EINVAL); } nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -5140,6 +5155,144 @@ nfs_msg(thread_t thd, return (0); } +#define NFS_SQUISH_MOBILE_ONLY 0x0001 /* Squish mounts only on mobile machines */ +#define NFS_SQUISH_AUTOMOUNTED_ONLY 0x0002 /* Squish mounts only if they are automounted */ +#define NFS_SQUISH_SOFT 0x0004 /* Treat all soft mounts as though they were on a mobile machine */ +#define NFS_SQUISH_QUICK 0x0008 /* Try to squish mounts more quickly. */ +#define NFS_SQUISH_SHUTDOWN 0x1000 /* Squish all mounts on shutdown. Currently not implemented */ + +uint32_t nfs_squishy_flags = NFS_SQUISH_MOBILE_ONLY | NFS_SQUISH_AUTOMOUNTED_ONLY | NFS_SQUISH_QUICK; +int32_t nfs_is_mobile; + +#define NFS_SQUISHY_DEADTIMEOUT 8 /* Dead timeout for squishy mounts */ +#define NFS_SQUISHY_QUICKTIMEOUT 4 /* Quicker dead timeout when the nfs_squishy_flags NFS_SQUISH_QUICK bit is set */ + +/* + * Could this mount be squished? + */ +int +nfs_can_squish(struct nfsmount *nmp) +{ + uint64_t flags = vfs_flags(nmp->nm_mountp); + int softsquish = ((nfs_squishy_flags & NFS_SQUISH_SOFT) && NMFLAG(nmp, SOFT)); + + if (!softsquish && (nfs_squishy_flags & NFS_SQUISH_MOBILE_ONLY) && nfs_is_mobile == 0) + return (0); + + if ((nfs_squishy_flags & NFS_SQUISH_AUTOMOUNTED_ONLY) && (flags & MNT_AUTOMOUNTED) == 0) + return (0); + + return (1); +} + +/* + * NFS mounts default to "rw,hard" - but frequently on mobile clients + * the mount may become "not responding". It's desirable to be able + * to unmount these dead mounts, but only if there is no risk of + * losing data or crashing applications. A "squishy" NFS mount is one + * that can be force unmounted with little risk of harm. + * + * nfs_is_squishy checks if a mount is in a squishy state. 
A mount is + * in a squishy state iff it is allowed to be squishy and there are no + * dirty pages, no mmapped files, and no files open for write. Whether + * mounts are allowed to be squishy is controlled by the settings of + * nfs_squishy_flags and the machine's mobility state. These + * flags can be set by sysctls. + * + * If nfs_is_squishy determines that we are in a squishy state we will + * update the current dead timeout to at least NFS_SQUISHY_DEADTIMEOUT + * (or NFS_SQUISHY_QUICKTIMEOUT if NFS_SQUISH_QUICK is set; see above), + * raised to 1/8th of the mount's nm_deadtimeout value if that is larger; + * otherwise we just update the current dead timeout with the mount's + * nm_deadtimeout value set at mount time. + * + * Assumes that nm_lock is held. + * + * Note this routine is racy, but its effects on the dead timeout only + * matter when we're in trouble and are likely to stay that way. Since + * by default it applies only to automounted volumes on mobile machines, + * this is a reasonable trade-off between data integrity and user + * experience. It can be disabled or tuned via the nfs.conf file. + */ + +int +nfs_is_squishy(struct nfsmount *nmp) +{ + mount_t mp = nmp->nm_mountp; + int squishy = 0; + int timeo = (nfs_squishy_flags & NFS_SQUISH_QUICK) ? NFS_SQUISHY_QUICKTIMEOUT : NFS_SQUISHY_DEADTIMEOUT; + + NFS_SOCK_DBG(("nfs_is_squishy: %s: nm_curdeadtimeout = %d, nfs_is_mobile = %d\n", + vfs_statfs(mp)->f_mntfromname, nmp->nm_curdeadtimeout, nfs_is_mobile)); + + if (!nfs_can_squish(nmp)) + goto out; + + timeo = (nmp->nm_deadtimeout > timeo) ? max(nmp->nm_deadtimeout/8, timeo) : timeo; + NFS_SOCK_DBG(("nfs_is_squishy: nm_writers = %d nm_mappers = %d timeo = %d\n", nmp->nm_writers, nmp->nm_mappers, timeo)); + + if (nmp->nm_writers == 0 && nmp->nm_mappers == 0) { + uint64_t flags = mp ? vfs_flags(mp) : 0; + squishy = 1; + + /* + * Walk the nfs nodes and check for dirty buffers if we're not + * RDONLY and we've not already been declared as squishy, since + * this can be a bit expensive. + */ + if (!(flags & MNT_RDONLY) && !(nmp->nm_state & NFSSTA_SQUISHY)) + squishy = !nfs_mount_is_dirty(mp); + } + +out: + if (squishy) + nmp->nm_state |= NFSSTA_SQUISHY; + else + nmp->nm_state &= ~NFSSTA_SQUISHY; + + nmp->nm_curdeadtimeout = squishy ? timeo : nmp->nm_deadtimeout; + + NFS_SOCK_DBG(("nfs_is_squishy: nm_curdeadtimeout = %d\n", nmp->nm_curdeadtimeout)); + + return (squishy); +} + +/* + * On a send operation, if we can't reach the server and we've got only one server to talk to + * and the NFS_SQUISH_QUICK flag is set and we are in a squishy state, then mark the mount as dead + * and ask to be forcibly unmounted. Return 1 if we're dead and 0 otherwise. + */ +static int +nfs_is_dead_lock(int error, struct nfsmount *nmp) +{ + if (nmp->nm_state & NFSSTA_DEAD) + return (1); + + if ((error != ENETUNREACH && error != EHOSTUNREACH) || + !(nmp->nm_locations.nl_numlocs == 1 && nmp->nm_locations.nl_locations[0]->nl_servcount == 1)) + return (0); + if ((nfs_squishy_flags & NFS_SQUISH_QUICK) && nfs_is_squishy(nmp)) { + printf("nfs_is_dead: nfs server %s: unreachable. 
Squished dead\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname); + nmp->nm_state |= NFSSTA_DEAD; + vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_DEAD, 0); + return (1); + } + return (0); +} + +int +nfs_is_dead(int error, struct nfsmount *nmp) +{ + int is_dead; + + lck_mtx_lock(&nmp->nm_lock); + is_dead = nfs_is_dead_lock(error, nmp); + lck_mtx_unlock(&nmp->nm_lock); + + return (is_dead); +} + void nfs_down(struct nfsmount *nmp, thread_t thd, int error, int flags, const char *msg) { @@ -5169,14 +5322,17 @@ nfs_down(struct nfsmount *nmp, thread_t thd, int error, int flags, const char *m unresponsive = (nmp->nm_state & timeoutmask); - if (unresponsive && (nmp->nm_deadtimeout > 0)) { + nfs_is_squishy(nmp); + + if (unresponsive && (nmp->nm_curdeadtimeout > 0)) { microuptime(&now); if (!wasunresponsive) { nmp->nm_deadto_start = now.tv_sec; nfs_mount_sock_thread_wake(nmp); - } else if ((now.tv_sec - nmp->nm_deadto_start) > nmp->nm_deadtimeout) { + } else if ((now.tv_sec - nmp->nm_deadto_start) > nmp->nm_curdeadtimeout) { if (!(nmp->nm_state & NFSSTA_DEAD)) - printf("nfs server %s: dead\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname); + printf("nfs server %s: %sdead\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, + (nmp->nm_curdeadtimeout != nmp->nm_deadtimeout) ? "squished " : ""); nmp->nm_state |= NFSSTA_DEAD; } } @@ -5225,8 +5381,9 @@ nfs_up(struct nfsmount *nmp, thread_t thd, int flags, const char *msg) unresponsive = (nmp->nm_state & timeoutmask); - if (nmp->nm_deadto_start) - nmp->nm_deadto_start = 0; + nmp->nm_deadto_start = 0; + nmp->nm_curdeadtimeout = nmp->nm_deadtimeout; + nmp->nm_state &= ~NFSSTA_SQUISHY; lck_mtx_unlock(&nmp->nm_lock); if (softnobrowse) @@ -5350,7 +5507,7 @@ done: *nmrepp = nmrep; if ((err != 0) && (err != NFSERR_RETVOID)) - OSAddAtomic(1, &nfsstats.srvrpc_errs); + OSAddAtomic64(1, &nfsstats.srvrpc_errs); return (0); } @@ -5487,11 +5644,11 @@ nfsrv_rcv_locked(socket_t so, struct nfsrv_sock *slp, int waitflag) ns_flag = SLP_NEEDQ; goto dorecs; } - + bzero(&msg, sizeof(msg)); msg.msg_name = (caddr_t)&nam; msg.msg_namelen = sizeof(nam); - + do { bytes_read = 1000000000; error = sock_receivembuf(so, &msg, &mp, MSG_DONTWAIT | MSG_NEEDSA, &bytes_read); @@ -5670,7 +5827,7 @@ nfsrv_getstream(struct nfsrv_sock *slp, int waitflag) if (slp->ns_frag == NULL) { slp->ns_frag = recm; } else { - m = slp->ns_frag; + m = slp->ns_frag; while ((m2 = mbuf_next(m))) m = m2; if ((error = mbuf_setnext(m, recm))) @@ -5918,4 +6075,3 @@ nfsrv_wakenfsd(struct nfsrv_sock *slp) } #endif /* NFSSERVER */ - diff --git a/bsd/nfs/nfs_srvcache.c b/bsd/nfs/nfs_srvcache.c index 7fde3da6b..b0eb21d73 100644 --- a/bsd/nfs/nfs_srvcache.c +++ b/bsd/nfs/nfs_srvcache.c @@ -253,10 +253,10 @@ loop: if (rp->rc_state == RC_UNUSED) panic("nfsrv cache"); if (rp->rc_state == RC_INPROG) { - OSAddAtomic(1, &nfsstats.srvcache_inproghits); + OSAddAtomic64(1, &nfsstats.srvcache_inproghits); ret = RC_DROPIT; } else if (rp->rc_flag & RC_REPSTATUS) { - OSAddAtomic(1, &nfsstats.srvcache_nonidemdonehits); + OSAddAtomic64(1, &nfsstats.srvcache_nonidemdonehits); nd->nd_repstat = rp->rc_status; error = nfsrv_rephead(nd, slp, &nmrep, 0); if (error) { @@ -268,7 +268,7 @@ loop: *mrepp = nmrep.nmc_mhead; } } else if (rp->rc_flag & RC_REPMBUF) { - OSAddAtomic(1, &nfsstats.srvcache_nonidemdonehits); + OSAddAtomic64(1, &nfsstats.srvcache_nonidemdonehits); error = mbuf_copym(rp->rc_reply, 0, MBUF_COPYALL, MBUF_WAITOK, mrepp); if (error) { printf("nfsrv cache: reply copym failed for nonidem request hit\n"); @@ -277,7 +277,7 @@ loop: 
ret = RC_REPLY; } } else { - OSAddAtomic(1, &nfsstats.srvcache_idemdonehits); + OSAddAtomic64(1, &nfsstats.srvcache_idemdonehits); rp->rc_state = RC_INPROG; ret = RC_DOIT; } @@ -290,7 +290,7 @@ loop: return (ret); } } - OSAddAtomic(1, &nfsstats.srvcache_misses); + OSAddAtomic64(1, &nfsstats.srvcache_misses); if (nfsrv_reqcache_count < nfsrv_reqcache_size) { /* try to allocate a new entry */ MALLOC(rp, struct nfsrvcache *, sizeof *rp, M_NFSD, M_WAITOK); diff --git a/bsd/nfs/nfs_subs.c b/bsd/nfs/nfs_subs.c index dccead918..e0e9446bf 100644 --- a/bsd/nfs/nfs_subs.c +++ b/bsd/nfs/nfs_subs.c @@ -118,7 +118,7 @@ /* * NFS globals */ -struct nfsstats nfsstats; +struct nfsstats __attribute__((aligned(8))) nfsstats; size_t nfs_mbuf_mhlen = 0, nfs_mbuf_minclsize = 0; /* @@ -1055,7 +1055,7 @@ nfsm_rpchead2(struct nfsmount *nmp, int sotype, int prog, int vers, int proc, in if (groupcount < 1) return (EINVAL); - auth_len = ((((groupcount - 1) > nmp->nm_numgrps) ? + auth_len = (((((uint32_t)groupcount - 1) > nmp->nm_numgrps) ? nmp->nm_numgrps : (groupcount - 1)) << 2) + 5 * NFSX_UNSIGNED; break; @@ -1169,7 +1169,7 @@ add_cred: error = 0; req->r_auth = auth_type = RPCAUTH_SYS; (void)kauth_cred_getgroups(cred, grouplist, &groupcount); - auth_len = ((((groupcount - 1) > nmp->nm_numgrps) ? + auth_len = (((((uint32_t)groupcount - 1) > nmp->nm_numgrps) ? nmp->nm_numgrps : (groupcount - 1)) << 2) + 5 * NFSX_UNSIGNED; authsiz = nfsm_rndup(auth_len); @@ -1216,6 +1216,7 @@ nfs_parsefattr(struct nfsm_chain *nmc, int nfsvers, struct nfs_vattr *nvap) { int error = 0; enum vtype vtype; + nfstype nvtype; u_short vmode; uint32_t val, val2; dev_t rdev; @@ -1237,12 +1238,12 @@ nfs_parsefattr(struct nfsm_chain *nmc, int nfsvers, struct nfs_vattr *nvap) NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TIME_MODIFY); NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TIME_METADATA); - nfsm_chain_get_32(error, nmc, vtype); + nfsm_chain_get_32(error, nmc, nvtype); nfsm_chain_get_32(error, nmc, vmode); nfsmout_if(error); if (nfsvers == NFS_VER3) { - nvap->nva_type = nfstov_type(vtype, nfsvers); + nvap->nva_type = vtype = nfstov_type(nvtype, nfsvers); } else { /* * The duplicate information returned in fa_type and fa_mode @@ -1261,7 +1262,7 @@ nfs_parsefattr(struct nfsm_chain *nmc, int nfsvers, struct nfs_vattr *nvap) * contain any type information (while also introducing * sockets and FIFOs for fa_type). */ - vtype = nfstov_type(vtype, nfsvers); + vtype = nfstov_type(nvtype, nfsvers); if ((vtype == VNON) || ((vtype == VREG) && ((vmode & S_IFMT) != 0))) vtype = IFTOVT(vmode); nvap->nva_type = vtype; @@ -1635,7 +1636,7 @@ nfs_getattrcache(nfsnode_t np, struct nfs_vattr *nvaper, int flags) /* Check if the attributes are valid. 
*/ if (!NATTRVALID(np) || ((flags & NGA_ACL) && !NACLVALID(np))) { FSDBG(528, np, 0, 0xffffff01, ENOENT); - OSAddAtomic(1, &nfsstats.attrcache_misses); + OSAddAtomic64(1, &nfsstats.attrcache_misses); return (ENOENT); } @@ -1644,18 +1645,18 @@ nfs_getattrcache(nfsnode_t np, struct nfs_vattr *nvaper, int flags) microuptime(&nowup); if ((nowup.tv_sec - np->n_attrstamp) >= timeo) { FSDBG(528, np, 0, 0xffffff02, ENOENT); - OSAddAtomic(1, &nfsstats.attrcache_misses); + OSAddAtomic64(1, &nfsstats.attrcache_misses); return (ENOENT); } if ((flags & NGA_ACL) && ((nowup.tv_sec - np->n_aclstamp) >= timeo)) { FSDBG(528, np, 0, 0xffffff02, ENOENT); - OSAddAtomic(1, &nfsstats.attrcache_misses); + OSAddAtomic64(1, &nfsstats.attrcache_misses); return (ENOENT); } nvap = &np->n_vattr; FSDBG(528, np, nvap->nva_size, np->n_size, 0xcace); - OSAddAtomic(1, &nfsstats.attrcache_hits); + OSAddAtomic64(1, &nfsstats.attrcache_hits); if (nvap->nva_type != VREG) { np->n_size = nvap->nva_size; @@ -2099,12 +2100,12 @@ nfsrv_namei( /* Check for encountering a symbolic link */ if (cnp->cn_flags & ISSYMLINK) { -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if ((cnp->cn_flags & FSNODELOCKHELD)) { cnp->cn_flags &= ~FSNODELOCKHELD; unlock_fsnode(nip->ni_dvp, NULL); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (cnp->cn_flags & (LOCKPARENT | WANTPARENT)) vnode_put(nip->ni_dvp); if (nip->ni_vp) { diff --git a/bsd/nfs/nfs_syscalls.c b/bsd/nfs/nfs_syscalls.c index d6de219ba..c79ab007e 100644 --- a/bsd/nfs/nfs_syscalls.c +++ b/bsd/nfs/nfs_syscalls.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -166,6 +166,9 @@ SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_dotzfs, CTLFLAG_RW | CTLFLA SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_for_getattr, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_for_getattr, 0, ""); SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, idmap_ctrl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_idmap_ctrl, 0, ""); SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, callback_port, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_callback_port, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, is_mobile, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_is_mobile, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, squishy_flags, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_squishy_flags, 0, ""); + #endif /* NFSCLIENT */ #if NFSSERVER @@ -184,6 +187,12 @@ SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, fsevents, CTLFLAG_RW | CTLFLAG_LOC #endif SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_max, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsd_thread_max, 0, ""); SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsd_thread_count, 0, ""); +#ifdef NFS_UC_Q_DEBUG +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, use_upcall_svc, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_use_proxy, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_queue_limit, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_max_seen, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_queue_max_seen, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_count, CTLFLAG_RD | CTLFLAG_LOCKED, (int *)&nfsrv_uc_queue_count, 0, ""); +#endif #endif /* NFSSERVER */ @@ -724,6 +733,7 @@ nfssvc_addsock(socket_t so, mbuf_t mynam) { struct nfsrv_sock *slp; int error = 0, sodomain, sotype, soprotocol, on = 1; + int first; struct timeval timeo; /* 
make sure mbuf constants are set up */ @@ -808,6 +818,7 @@ nfssvc_addsock(socket_t so, mbuf_t mynam) } /* add the socket to the list */ + first = TAILQ_EMPTY(&nfsrv_socklist); TAILQ_INSERT_TAIL(&nfsrv_socklist, slp, ns_chain); sock_retain(so); /* grab a retain count on the socket */ @@ -815,10 +826,8 @@ nfssvc_addsock(socket_t so, mbuf_t mynam) slp->ns_sotype = sotype; slp->ns_nam = mynam; - /* set up the socket upcall */ - sock_setupcall(so, nfsrv_rcv, slp); - /* just playin' it safe */ - sock_setsockopt(so, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on)); + /* set up the socket up-call */ + nfsrv_uc_addsock(slp, first); /* mark that the socket is not in the nfsrv_sockwg list */ slp->ns_wgq.tqe_next = SLPNOLIST; @@ -878,6 +887,7 @@ nfssvc_nfsd(void) u_quad_t cur_usec; struct timeval now; struct vfs_context context; + struct timespec to; #ifndef nolint cacherep = RC_DOIT; @@ -891,11 +901,16 @@ nfssvc_nfsd(void) lck_mtx_lock(nfsd_mutex); if (nfsd_thread_count++ == 0) nfsrv_initcache(); /* Init the server request cache */ + TAILQ_INSERT_TAIL(&nfsd_head, nfsd, nfsd_chain); lck_mtx_unlock(nfsd_mutex); context.vc_thread = current_thread(); + /* Set a timeout so that nfsd threads can wake up and see if they are still needed. */ + to.tv_sec = 5; + to.tv_nsec = 0; + /* * Loop getting rpc requests until SIGKILL. */ @@ -923,12 +938,14 @@ nfssvc_nfsd(void) } nfsd->nfsd_flag |= NFSD_WAITING; TAILQ_INSERT_HEAD(&nfsd_queue, nfsd, nfsd_queue); - error = msleep(nfsd, nfsd_mutex, PSOCK | PCATCH, "nfsd", NULL); + error = msleep(nfsd, nfsd_mutex, PSOCK | PCATCH, "nfsd", &to); if (error) { if (nfsd->nfsd_flag & NFSD_WAITING) { TAILQ_REMOVE(&nfsd_queue, nfsd, nfsd_queue); nfsd->nfsd_flag &= ~NFSD_WAITING; } + if (error == EWOULDBLOCK) + continue; goto done; } } @@ -1083,7 +1100,7 @@ nfssvc_nfsd(void) } if (error) { - OSAddAtomic(1, &nfsstats.srv_errs); + OSAddAtomic64(1, &nfsstats.srv_errs); nfsrv_updatecache(nd, FALSE, mrep); if (nd->nd_nam2) { mbuf_freem(nd->nd_nam2); @@ -1091,7 +1108,7 @@ } break; } - OSAddAtomic(1, &nfsstats.srvrpccnt[nd->nd_procnum]); + OSAddAtomic64(1, &nfsstats.srvrpccnt[nd->nd_procnum]); nfsrv_updatecache(nd, TRUE, mrep); /* FALLTHRU */ @@ -1282,8 +1299,8 @@ nfsrv_zapsock(struct nfsrv_sock *slp) return; /* - * Attempt to deter future upcalls, but leave the - * upcall info in place to avoid a race with the + * Attempt to deter future up-calls, but leave the + * up-call info in place to avoid a race with the * networking code. */ socket_lock(so, 1); @@ -1291,6 +1308,11 @@ nfsrv_zapsock(struct nfsrv_sock *slp) socket_unlock(so, 1); sock_shutdown(so, SHUT_RDWR); + + /* + * Remove from the up-call queue + */ + nfsrv_uc_dequeue(slp); } /* @@ -1316,6 +1338,9 @@ nfsrv_slpfree(struct nfsrv_sock *slp) slp->ns_nam = slp->ns_raw = slp->ns_rec = slp->ns_frag = NULL; slp->ns_reccnt = 0; + if (slp->ns_ua) + FREE(slp->ns_ua, M_NFSSVC); + for (nwp = slp->ns_tq.lh_first; nwp; nwp = nnwp) { nnwp = nwp->nd_tq.le_next; LIST_REMOVE(nwp, nd_tq); @@ -1506,6 +1531,8 @@ nfsrv_cleanup(void) lck_mtx_unlock(nfsrv_fmod_mutex); #endif + nfsrv_uc_cleanup(); /* Stop nfs socket up-call threads */ + nfs_gss_svc_cleanup(); /* Remove any RPCSEC_GSS contexts */ nfsrv_cleancache(); /* And clear out server cache */ diff --git a/bsd/nfs/nfs_upcall.c b/bsd/nfs/nfs_upcall.c new file mode 100644 index 000000000..7d6f85f53 --- /dev/null +++ b/bsd/nfs/nfs_upcall.c @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#ifdef NFS_UC_DEBUG +#define DPRINT(fmt, ...) printf(fmt,## __VA_ARGS__) +#else +#define DPRINT(fmt, ...) +#endif + +struct nfsrv_uc_arg { + TAILQ_ENTRY(nfsrv_uc_arg) nua_svcq; + socket_t nua_so; + struct nfsrv_sock *nua_slp; + int nua_waitflag; /* Should always be MBUF_DONTWAIT */ + uint32_t nua_flags; + uint32_t nua_qi; +}; +#define NFS_UC_QUEUED 0x0001 + +#define NFS_UC_HASH_SZ 7 +#define NFS_UC_HASH(x) ((((uint32_t)(uintptr_t)(x)) >> 3) % nfsrv_uc_thread_count) + +TAILQ_HEAD(nfsrv_uc_q, nfsrv_uc_arg); + +static struct nfsrv_uc_queue { + lck_mtx_t *ucq_lock; + struct nfsrv_uc_q ucq_queue[1]; + thread_t ucq_thd; + uint32_t ucq_flags; +} nfsrv_uc_queue_tbl[NFS_UC_HASH_SZ]; +#define NFS_UC_QUEUE_SLEEPING 0x0001 + +static lck_grp_t *nfsrv_uc_group; +static lck_mtx_t *nfsrv_uc_shutdown_lock; +static volatile int nfsrv_uc_shutdown = 0; +static int32_t nfsrv_uc_thread_count; + +extern kern_return_t thread_terminate(thread_t); + +#ifdef NFS_UC_Q_DEBUG +int nfsrv_uc_use_proxy = 1; +uint32_t nfsrv_uc_queue_limit; +uint32_t nfsrv_uc_queue_max_seen; +volatile uint32_t nfsrv_uc_queue_count; +#endif + +/* + * Thread that dequeues up-calls and runs the nfsrv_rcv routine + */ +static void +nfsrv_uc_thread(void *arg, wait_result_t wr __unused) +{ + int qi = (int)(uintptr_t)arg; + int error; + struct nfsrv_uc_arg *ep = NULL; + struct nfsrv_uc_queue *myqueue = &nfsrv_uc_queue_tbl[qi]; + + DPRINT("nfsrv_uc_thread %d started\n", qi); + while (!nfsrv_uc_shutdown) { + lck_mtx_lock(myqueue->ucq_lock); + + while (!nfsrv_uc_shutdown && TAILQ_EMPTY(myqueue->ucq_queue)) { + myqueue->ucq_flags |= NFS_UC_QUEUE_SLEEPING; + error = msleep(myqueue, myqueue->ucq_lock, PSOCK, "nfsd_upcall_handler", NULL); + myqueue->ucq_flags &= ~NFS_UC_QUEUE_SLEEPING; + if (error) { + printf("nfsrv_uc_thread received error %d\n", error); + } + } + if (nfsrv_uc_shutdown) { + lck_mtx_unlock(myqueue->ucq_lock); + break; + } + + + ep = TAILQ_FIRST(myqueue->ucq_queue); + DPRINT("nfsrv_uc_thread:%d dequeue %p from %p\n", qi, ep, myqueue); + + TAILQ_REMOVE(myqueue->ucq_queue, ep, nua_svcq); + + ep->nua_flags &= ~NFS_UC_QUEUED; + + 
lck_mtx_unlock(myqueue->ucq_lock);
+
+#ifdef NFS_UC_Q_DEBUG
+		OSDecrementAtomic(&nfsrv_uc_queue_count);
+#endif
+
+		DPRINT("calling nfsrv_rcv for %p\n", (void *)ep->nua_slp);
+		nfsrv_rcv(ep->nua_so, (void *)ep->nua_slp, ep->nua_waitflag);
+	}
+
+	lck_mtx_lock(nfsrv_uc_shutdown_lock);
+	nfsrv_uc_thread_count--;
+	wakeup(&nfsrv_uc_thread_count);
+	lck_mtx_unlock(nfsrv_uc_shutdown_lock);
+
+	thread_terminate(current_thread());
+}
+
+/*
+ * Dequeue a closed nfsrv_sock if needed from the up-call queue.
+ * Called from nfsrv_zapsock.
+ */
+void
+nfsrv_uc_dequeue(struct nfsrv_sock *slp)
+{
+	struct nfsrv_uc_arg *ap = slp->ns_ua;
+	struct nfsrv_uc_queue *myqueue;
+
+	/*
+	 * We assume that the socket up-calls have been stopped and that the
+	 * socket is shutting down, so there is no need to acquire the lock
+	 * just to check that the flag is cleared.
+	 */
+	if (ap == NULL || (ap->nua_flags & NFS_UC_QUEUED) == 0)
+		return;
+	myqueue = &nfsrv_uc_queue_tbl[ap->nua_qi];
+	/* If we're queued we might race with nfsrv_uc_thread */
+	lck_mtx_lock(myqueue->ucq_lock);
+	if (ap->nua_flags & NFS_UC_QUEUED) {
+		printf("nfsrv_uc_dequeue remove %p\n", ap);
+		TAILQ_REMOVE(myqueue->ucq_queue, ap, nua_svcq);
+		ap->nua_flags &= ~NFS_UC_QUEUED;
+#ifdef NFS_UC_Q_DEBUG
+		OSDecrementAtomic(&nfsrv_uc_queue_count);
+#endif
+	}
+	lck_mtx_unlock(myqueue->ucq_lock);
+}
+
+/*
+ * Allocate and initialize globals for nfsrv_sock up-call support.
+ */
+void
+nfsrv_uc_init(void)
+{
+	int i;
+
+	nfsrv_uc_group = lck_grp_alloc_init("nfs_upcall_locks", LCK_GRP_ATTR_NULL);
+	for (i = 0; i < NFS_UC_HASH_SZ; i++) {
+		TAILQ_INIT(nfsrv_uc_queue_tbl[i].ucq_queue);
+		nfsrv_uc_queue_tbl[i].ucq_lock = lck_mtx_alloc_init(nfsrv_uc_group, LCK_ATTR_NULL);
+		nfsrv_uc_queue_tbl[i].ucq_thd = THREAD_NULL;
+		nfsrv_uc_queue_tbl[i].ucq_flags = 0;
+	}
+	nfsrv_uc_shutdown_lock = lck_mtx_alloc_init(nfsrv_uc_group, LCK_ATTR_NULL);
+}
+
+/*
+ * Start up-call threads to service nfsrv_sock(s).
+ * Called from the first call of nfsrv_uc_addsock.
+ */
+static void
+nfsrv_uc_start(void)
+{
+	int32_t i;
+	int error;
+
+#ifdef NFS_UC_Q_DEBUG
+	if (!nfsrv_uc_use_proxy)
+		return;
+#endif
+	DPRINT("nfsrv_uc_start\n");
+
+	/* Wait until previous shutdown finishes */
+	lck_mtx_lock(nfsrv_uc_shutdown_lock);
+	while (nfsrv_uc_shutdown || nfsrv_uc_thread_count > 0)
+		msleep(&nfsrv_uc_thread_count, nfsrv_uc_shutdown_lock, PSOCK, "nfsd_upcall_shutdown_wait", NULL);
+
+	/* Start up-call threads */
+	for (i = 0; i < NFS_UC_HASH_SZ; i++) {
+		error = kernel_thread_start(nfsrv_uc_thread, (void *)(uintptr_t)i, &nfsrv_uc_queue_tbl[nfsrv_uc_thread_count].ucq_thd);
+		if (!error) {
+			nfsrv_uc_thread_count++;
+		} else {
+			printf("nfsd: Could not start nfsrv_uc_thread: %d\n", error);
+		}
+	}
+	if (nfsrv_uc_thread_count == 0) {
+		printf("nfsd: Could not start nfsd proxy up-call service. Falling back\n");
+		goto out;
+	}
+
+out:
+#ifdef NFS_UC_Q_DEBUG
+	nfsrv_uc_queue_count = 0ULL;
+	nfsrv_uc_queue_max_seen = 0ULL;
+#endif
+	lck_mtx_unlock(nfsrv_uc_shutdown_lock);
+}
+
+/*
+ * Stop the up-call threads.
+ * Called from nfsrv_uc_cleanup.
+ */
+static void
+nfsrv_uc_stop(void)
+{
+	int32_t i;
+	int32_t thread_count = nfsrv_uc_thread_count;
+
+	DPRINT("Entering nfsrv_uc_stop\n");
+
+	/* Signal up-call threads to stop */
+	nfsrv_uc_shutdown = 1;
+	for (i = 0; i < thread_count; i++) {
+		lck_mtx_lock(nfsrv_uc_queue_tbl[i].ucq_lock);
+		wakeup(&nfsrv_uc_queue_tbl[i]);
+		lck_mtx_unlock(nfsrv_uc_queue_tbl[i].ucq_lock);
+	}
+
+	/* Wait until they are done shutting down */
+	lck_mtx_lock(nfsrv_uc_shutdown_lock);
+	while (nfsrv_uc_thread_count > 0)
+		msleep(&nfsrv_uc_thread_count, nfsrv_uc_shutdown_lock, PSOCK, "nfsd_upcall_shutdown_stop", NULL);
+
+	/* Deallocate old threads */
+	for (i = 0; i < thread_count; i++) {
+		if (nfsrv_uc_queue_tbl[i].ucq_thd != THREAD_NULL)
+			thread_deallocate(nfsrv_uc_queue_tbl[i].ucq_thd);
+		nfsrv_uc_queue_tbl[i].ucq_thd = THREAD_NULL;
+	}
+
+	/* Enable restarting */
+	nfsrv_uc_shutdown = 0;
+	lck_mtx_unlock(nfsrv_uc_shutdown_lock);
+}
+
+/*
+ * Shut down up-calls for nfsrv_socks:
+ * make sure nothing is queued on the up-call queues,
+ * then shut down the up-call threads.
+ * Called from nfsrv_cleanup.
+ */
+void
+nfsrv_uc_cleanup(void)
+{
+	int i;
+
+	DPRINT("Entering nfsrv_uc_cleanup\n");
+
+	/*
+	 * Everything should be dequeued at this point, or will be as sockets
+	 * are closed, but to be safe we'll make sure.
+	 */
+	for (i = 0; i < NFS_UC_HASH_SZ; i++) {
+		struct nfsrv_uc_queue *queue = &nfsrv_uc_queue_tbl[i];
+
+		lck_mtx_lock(queue->ucq_lock);
+		while (!TAILQ_EMPTY(queue->ucq_queue)) {
+			struct nfsrv_uc_arg *ep = TAILQ_FIRST(queue->ucq_queue);
+			TAILQ_REMOVE(queue->ucq_queue, ep, nua_svcq);
+			ep->nua_flags &= ~NFS_UC_QUEUED;
+		}
+		lck_mtx_unlock(queue->ucq_lock);
+	}
+
+	nfsrv_uc_stop();
+}
+
+/*
+ * This is the nfs up-call routine for server sockets.
+ * We used to set nfsrv_rcv as the up-call routine, but
+ * recently that seems like we are doing too much work for
+ * the interface thread, so we just queue the arguments
+ * that we would have gotten for nfsrv_rcv and let a
+ * worker thread dequeue them and pass them on to nfsrv_rcv.
+ */
+static void
+nfsrv_uc_proxy(socket_t so, void *arg, int waitflag)
+{
+	struct nfsrv_uc_arg *uap = (struct nfsrv_uc_arg *)arg;
+	int qi = uap->nua_qi;
+	struct nfsrv_uc_queue *myqueue = &nfsrv_uc_queue_tbl[qi];
+
+	lck_mtx_lock(myqueue->ucq_lock);
+	DPRINT("nfsrv_uc_proxy called for %p (%p)\n", uap, uap->nua_slp);
+	DPRINT("\tUp-call queued on %d for wakeup of %p\n", qi, myqueue);
+	if (uap->nua_flags & NFS_UC_QUEUED) {
+		lck_mtx_unlock(myqueue->ucq_lock);
+		return; /* Already queued */
+	}
+
+	uap->nua_so = so;
+	uap->nua_waitflag = waitflag;
+
+	TAILQ_INSERT_TAIL(myqueue->ucq_queue, uap, nua_svcq);
+
+	uap->nua_flags |= NFS_UC_QUEUED;
+	if (myqueue->ucq_flags & NFS_UC_QUEUE_SLEEPING)
+		wakeup(myqueue);
+
+#ifdef NFS_UC_Q_DEBUG
+	{
+		uint32_t count = OSIncrementAtomic(&nfsrv_uc_queue_count);
+
+		/* This is a bit racy but just for debug */
+		if (count > nfsrv_uc_queue_max_seen)
+			nfsrv_uc_queue_max_seen = count;
+
+		if (nfsrv_uc_queue_limit && count > nfsrv_uc_queue_limit) {
+			panic("nfsd up-call queue limit exceeded\n");
+		}
+	}
+#endif
+	lck_mtx_unlock(myqueue->ucq_lock);
+}
+
+
+/*
+ * Set the up-call routine on the socket associated with the passed in
+ * nfsrv_sock.
+ * Assumes nfsd_mutex is held.
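+ *
+ * (Editorial aside, not part of the original change: the file above is a
+ * hash-distributed producer/consumer queue -- nfsrv_uc_proxy is the
+ * producer, nfsrv_uc_thread the consumer, and NFS_UC_HASH spreads sockets
+ * over the NFS_UC_HASH_SZ queues.  A minimal user-space sketch of the same
+ * pattern, using only POSIX threads; every name in it is illustrative, and
+ * handle() stands in for nfsrv_rcv():
+ *
+ *	#include <pthread.h>
+ *	#include <sys/queue.h>
+ *
+ *	struct work { TAILQ_ENTRY(work) link; int payload; };
+ *	TAILQ_HEAD(workq, work);
+ *	struct ucq { pthread_mutex_t lock; pthread_cond_t cv; struct workq q; };
+ *
+ *	void handle(struct work *);
+ *
+ *	void enqueue(struct ucq *qp, struct work *wp)
+ *	{
+ *		pthread_mutex_lock(&qp->lock);
+ *		TAILQ_INSERT_TAIL(&qp->q, wp, link);
+ *		pthread_cond_signal(&qp->cv);
+ *		pthread_mutex_unlock(&qp->lock);
+ *	}
+ *
+ *	void *worker(void *arg)
+ *	{
+ *		struct ucq *qp = arg;
+ *
+ *		pthread_mutex_lock(&qp->lock);
+ *		for (;;) {
+ *			struct work *wp;
+ *
+ *			while (TAILQ_EMPTY(&qp->q))
+ *				pthread_cond_wait(&qp->cv, &qp->lock);
+ *			wp = TAILQ_FIRST(&qp->q);
+ *			TAILQ_REMOVE(&qp->q, wp, link);
+ *			pthread_mutex_unlock(&qp->lock);
+ *			handle(wp);
+ *			pthread_mutex_lock(&qp->lock);
+ *		}
+ *	}
+ *
+ * The kernel version adds the NFS_UC_QUEUED flag so a socket is queued at
+ * most once, plus a shutdown protocol; the locking discipline is the same.)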
+ */ +void +nfsrv_uc_addsock(struct nfsrv_sock *slp, int start) +{ + int on = 1; + struct nfsrv_uc_arg *arg; + + if (start && nfsrv_uc_thread_count == 0) + nfsrv_uc_start(); + + /* + * We don't take a lock since once we're up nfsrv_uc_thread_count does + * not change until shutdown and then we should not be adding sockets to + * generate up-calls. + */ + if (nfsrv_uc_thread_count) { + MALLOC(arg, struct nfsrv_uc_arg *, sizeof (struct nfsrv_uc_arg), M_NFSSVC, M_WAITOK | M_ZERO); + if (arg == NULL) + goto direct; + + slp->ns_ua = arg; + arg->nua_slp = slp; + arg->nua_qi = NFS_UC_HASH(slp); + + sock_setupcall(slp->ns_so, nfsrv_uc_proxy, arg); + } else { +direct: + slp->ns_ua = NULL; + DPRINT("setting nfsrv_rcv up-call\n"); + sock_setupcall(slp->ns_so, nfsrv_rcv, slp); + } + + /* just playin' it safe */ + sock_setsockopt(slp->ns_so, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on)); + + return; +} + diff --git a/bsd/nfs/nfs_vfsops.c b/bsd/nfs/nfs_vfsops.c index 7a0323fde..1c571a21f 100644 --- a/bsd/nfs/nfs_vfsops.c +++ b/bsd/nfs/nfs_vfsops.c @@ -1575,8 +1575,12 @@ nfs_convert_old_nfs_args(mount_t mp, user_addr_t data, vfs_context_t ctx, int ar /* copy socket address */ if (inkernel) bcopy(CAST_DOWN(void *, args.addr), &ss, args.addrlen); - else - error = copyin(args.addr, &ss, args.addrlen); + else { + if ((size_t)args.addrlen > sizeof (struct sockaddr_storage)) + error = EINVAL; + else + error = copyin(args.addr, &ss, args.addrlen); + } nfsmout_if(error); ss.ss_len = args.addrlen; @@ -2694,6 +2698,7 @@ mountnfs( nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO; nmp->nm_auth = RPCAUTH_SYS; nmp->nm_deadtimeout = 0; + nmp->nm_curdeadtimeout = 0; NFS_BITMAP_SET(nmp->nm_flags, NFS_MFLAG_NOACL); } diff --git a/bsd/nfs/nfs_vnops.c b/bsd/nfs/nfs_vnops.c index a5917a297..eb636101d 100644 --- a/bsd/nfs/nfs_vnops.c +++ b/bsd/nfs/nfs_vnops.c @@ -855,9 +855,34 @@ out: NP(np, "nfs_vnop_open: error %d, %d", error, kauth_cred_getuid(noop->noo_cred)); if (noop) nfs_open_owner_rele(noop); + if (!error && vtype == VREG && (ap->a_mode & FWRITE)) { + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_state &= ~NFSSTA_SQUISHY; + nmp->nm_curdeadtimeout = nmp->nm_deadtimeout; + if (nmp->nm_curdeadtimeout <= 0) + nmp->nm_deadto_start = 0; + nmp->nm_writers++; + lck_mtx_unlock(&nmp->nm_lock); + } + return (error); } +static uint32_t +nfs_no_of_open_file_writers(nfsnode_t np) +{ + uint32_t writers = 0; + struct nfs_open_file *nofp; + + TAILQ_FOREACH(nofp, &np->n_opens, nof_link) { + writers += nofp->nof_w + nofp->nof_rw + nofp->nof_w_dw + nofp->nof_rw_dw + + nofp->nof_w_drw + nofp->nof_rw_drw + nofp->nof_d_w_dw + + nofp->nof_d_rw_dw + nofp->nof_d_w_drw + nofp->nof_d_rw_drw + + nofp->nof_d_w + nofp->nof_d_rw; + } + + return (writers); +} /* * NFS close vnode op @@ -990,11 +1015,36 @@ nfs_vnop_close( * Guess this is the final close. * We should unlock all locks and close all opens. */ + uint32_t writers; mount_t mp = vnode_mount(vp); int force = (!mp || (mp->mnt_kern_flag & MNTK_FRCUNMOUNT)); + + writers = nfs_no_of_open_file_writers(np); nfs_release_open_state_for_node(np, force); + if (writers) { + lck_mtx_lock(&nmp->nm_lock); + if (writers > nmp->nm_writers) { + NP(np, "nfs_vnop_close: number of write opens for mount underrun. Node has %d" + " opens for write. 
Mount has total of %d opens for write\n", + writers, nmp->nm_writers); + nmp->nm_writers = 0; + } else { + nmp->nm_writers -= writers; + } + lck_mtx_unlock(&nmp->nm_lock); + } + return (error); + } else if (fflag & FWRITE) { + lck_mtx_lock(&nmp->nm_lock); + if (nmp->nm_writers == 0) { + NP(np, "nfs_vnop_close: removing open writer from mount, but mount has no files open for writing"); + } else { + nmp->nm_writers--; + } + lck_mtx_unlock(&nmp->nm_lock); } + noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 0); if (!noop) { @@ -1065,7 +1115,7 @@ nfs_close( struct nfs_lock_owner *nlop; int error = 0, changed = 0, delegated = 0, closed = 0, downgrade = 0; uint32_t newAccessMode, newDenyMode; - + /* warn if modes don't match current state */ if (((accessMode & nofp->nof_access) != accessMode) || ((denyMode & nofp->nof_deny) != denyMode)) NP(np, "nfs_close: mode mismatch %d %d, current %d %d, %d", @@ -1191,6 +1241,7 @@ v3close: NP(np, "nfs_close: LOST%s, %d", !nofp->nof_opencnt ? " (last)" : "", kauth_cred_getuid(nofp->nof_owner->noo_cred)); } + return (error); } @@ -2011,7 +2062,7 @@ nfs_vnop_lookup( /* FALLTHROUGH */ case -1: /* cache hit, not really an error */ - OSAddAtomic(1, &nfsstats.lookupcache_hits); + OSAddAtomic64(1, &nfsstats.lookupcache_hits); nfs_node_clear_busy(dnp); busyerror = ENOENT; @@ -2063,7 +2114,7 @@ nfs_vnop_lookup( error = 0; newvp = NULLVP; - OSAddAtomic(1, &nfsstats.lookupcache_misses); + OSAddAtomic64(1, &nfsstats.lookupcache_misses); error = nmp->nm_funcs->nf_lookup_rpc_async(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &req); nfsmout_if(error); @@ -2182,14 +2233,14 @@ nfs_vnop_readlink( return (error); } - OSAddAtomic(1, &nfsstats.biocache_readlinks); + OSAddAtomic64(1, &nfsstats.biocache_readlinks); error = nfs_buf_get(np, 0, NFS_MAXPATHLEN, vfs_context_thread(ctx), NBLK_READ, &bp); if (error) { FSDBG(531, np, 0xd1e0002, 0, error); return (error); } if (!ISSET(bp->nb_flags, NB_CACHE)) { - OSAddAtomic(1, &nfsstats.readlink_bios); + OSAddAtomic64(1, &nfsstats.readlink_bios); buflen = bp->nb_bufsize; error = nmp->nm_funcs->nf_readlink_rpc(np, bp->nb_data, &buflen, ctx); if (error) { @@ -2542,7 +2593,7 @@ nfs_vnop_write( } do { - OSAddAtomic(1, &nfsstats.biocache_writes); + OSAddAtomic64(1, &nfsstats.biocache_writes); lbn = uio_offset(uio) / biosize; on = uio_offset(uio) % biosize; n = biosize - on; @@ -4705,7 +4756,7 @@ nfs_vnop_readdir( } while (!error && !done) { - OSAddAtomic(1, &nfsstats.biocache_readdirs); + OSAddAtomic64(1, &nfsstats.biocache_readdirs); cookie = nextcookie; getbuffer: error = nfs_buf_get(dnp, lbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp); @@ -4955,7 +5006,7 @@ nfs_dir_cookie_to_lbn(nfsnode_t dnp, uint64_t cookie, int *ptc, uint64_t *lbnp) if (cookie == dnp->n_eofcookie) { /* EOF cookie */ nfs_node_unlock(dnp); - OSAddAtomic(1, &nfsstats.direofcache_hits); + OSAddAtomic64(1, &nfsstats.direofcache_hits); *ptc = 0; return (-1); } @@ -4969,7 +5020,7 @@ nfs_dir_cookie_to_lbn(nfsnode_t dnp, uint64_t cookie, int *ptc, uint64_t *lbnp) /* found a match for this cookie */ *lbnp = ndcc->cookies[i].lbn; nfs_node_unlock(dnp); - OSAddAtomic(1, &nfsstats.direofcache_hits); + OSAddAtomic64(1, &nfsstats.direofcache_hits); *ptc = 0; return (0); } @@ -4981,14 +5032,14 @@ nfs_dir_cookie_to_lbn(nfsnode_t dnp, uint64_t cookie, int *ptc, uint64_t *lbnp) if (eofptc) { /* but 32-bit match hit the EOF cookie */ nfs_node_unlock(dnp); - OSAddAtomic(1, &nfsstats.direofcache_hits); + OSAddAtomic64(1, &nfsstats.direofcache_hits); return (-1); } if (iptc >= 0) { /* but 32-bit 
match got a hit */ *lbnp = ndcc->cookies[iptc].lbn; nfs_node_unlock(dnp); - OSAddAtomic(1, &nfsstats.direofcache_hits); + OSAddAtomic64(1, &nfsstats.direofcache_hits); return (0); } nfs_node_unlock(dnp); @@ -5065,13 +5116,13 @@ nfs_dir_cookie_to_lbn(nfsnode_t dnp, uint64_t cookie, int *ptc, uint64_t *lbnp) } lck_mtx_unlock(nfs_buf_mutex); if (found) { - OSAddAtomic(1, &nfsstats.direofcache_hits); + OSAddAtomic64(1, &nfsstats.direofcache_hits); return (0); } /* still not found... oh well, just start a new block */ *lbnp = cookie; - OSAddAtomic(1, &nfsstats.direofcache_misses); + OSAddAtomic64(1, &nfsstats.direofcache_misses); return (0); } @@ -5333,7 +5384,7 @@ noplus: } else { cookie = bp->nb_lblkno; /* increment with every buffer read */ - OSAddAtomic(1, &nfsstats.readdir_bios); + OSAddAtomic64(1, &nfsstats.readdir_bios); } lastcookie = cookie; @@ -5446,7 +5497,7 @@ nextbuffer: space_free = nfs_dir_buf_freespace(bp, rdirplus); dp = NFS_DIR_BUF_FIRST_DIRENTRY(bp); /* increment with every buffer read */ - OSAddAtomic(1, &nfsstats.readdir_bios); + OSAddAtomic64(1, &nfsstats.readdir_bios); } nmrepsave = nmrep; dp->d_fileno = fileno; @@ -6631,7 +6682,7 @@ nfs_vnop_pagein( if (size <= 0) { printf("nfs_pagein: invalid size %ld", size); if (!nofreeupl) - (void) ubc_upl_abort(pl, 0); + (void) ubc_upl_abort_range(pl, pl_offset, size, 0); return (EINVAL); } if (f_offset < 0 || f_offset >= (off_t)np->n_size || (f_offset & PAGE_MASK_64)) { @@ -6698,7 +6749,7 @@ tryagain: #if UPL_DEBUG upl_ubc_alias_set(pl, (uintptr_t) current_thread(), (uintptr_t) 2); #endif /* UPL_DEBUG */ - OSAddAtomic(1, &nfsstats.pageins); + OSAddAtomic64(1, &nfsstats.pageins); error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req[nextwait], uio, &retsize, NULL); req[nextwait] = NULL; nextwait = (nextwait + 1) % MAXPAGINGREQS; @@ -6944,7 +6995,7 @@ nfs_vnop_pageout( if (size <= 0) { printf("nfs_pageout: invalid size %ld", size); if (!nofreeupl) - ubc_upl_abort(pl, 0); + ubc_upl_abort_range(pl, pl_offset, size, 0); return (EINVAL); } @@ -6977,7 +7028,7 @@ nfs_vnop_pageout( nfs_data_unlock_noupdate(np); /* no panic. just tell vm we are busy */ if (!nofreeupl) - ubc_upl_abort(pl, 0); + ubc_upl_abort_range(pl, pl_offset, size, 0); return (EBUSY); } if (bp->nb_dirtyend > 0) { @@ -7024,7 +7075,7 @@ nfs_vnop_pageout( lck_mtx_unlock(nfs_buf_mutex); nfs_data_unlock_noupdate(np); if (!nofreeupl) - ubc_upl_abort(pl, 0); + ubc_upl_abort_range(pl, pl_offset, size, 0); return (EBUSY); } if ((bp->nb_dirtyoff < start) || @@ -7135,7 +7186,7 @@ tryagain: uio_reset(auio, txoffset, UIO_SYSSPACE, UIO_WRITE); uio_addiov(auio, CAST_USER_ADDR_T(txaddr), iosize); FSDBG(323, uio_offset(auio), iosize, txaddr, txsize); - OSAddAtomic(1, &nfsstats.pageouts); + OSAddAtomic64(1, &nfsstats.pageouts); nfs_node_lock_force(np); np->n_numoutput++; nfs_node_unlock(np); diff --git a/bsd/nfs/nfsmount.h b/bsd/nfs/nfsmount.h index 97f955e2f..574b5a70e 100644 --- a/bsd/nfs/nfsmount.h +++ b/bsd/nfs/nfsmount.h @@ -261,7 +261,7 @@ struct nfsmount { mount_t nm_mountp; /* VFS structure for this filesystem */ nfsnode_t nm_dnp; /* root directory nfsnode pointer */ struct nfs_fs_locations nm_locations; /* file system locations */ - int nm_numgrps; /* Max. size of groupslist */ + uint32_t nm_numgrps; /* Max. 
size of groupslist */ TAILQ_HEAD(, nfs_gss_clnt_ctx) nm_gsscl; /* GSS user contexts */ int nm_timeo; /* Init timer for NFSMNT_DUMBTIMR */ int nm_retry; /* Max retries */ @@ -275,6 +275,8 @@ struct nfsmount { uint32_t nm_acdirmin; /* dir min attr cache timeout */ uint32_t nm_acdirmax; /* dir max attr cache timeout */ uint32_t nm_auth; /* security mechanism flavor being used */ + uint32_t nm_writers; /* Number of nodes open for writing */ + uint32_t nm_mappers; /* Number of nodes that have mmapped */ struct nfs_sec nm_sec; /* acceptable security mechanism flavors */ struct nfs_sec nm_servsec; /* server's acceptable security mechanism flavors */ fhandle_t *nm_fh; /* initial file handle */ @@ -330,7 +332,8 @@ struct nfsmount { time_t nm_reconnect_start; /* reconnect start time */ int nm_tprintf_initial_delay; /* delay first "server down" */ int nm_tprintf_delay; /* delay between "server down" */ - int nm_deadtimeout; /* delay between first "server down" and dead */ + int nm_deadtimeout; /* delay between first "server down" and dead set at mount time */ + int nm_curdeadtimeout; /* current dead timeout. Adjusted by mount state and mobility */ int nm_srtt[4]; /* Timers for RPCs */ int nm_sdrtt[4]; int nm_timeouts; /* Request timeouts */ @@ -372,6 +375,7 @@ struct nfsmount { #define NFSSTA_RECOVER 0x08000000 /* mount state needs to be recovered */ #define NFSSTA_RECOVER_EXPIRED 0x10000000 /* mount state expired */ #define NFSSTA_REVOKE 0x20000000 /* need to scan for revoked nodes */ +#define NFSSTA_SQUISHY 0x40000000 /* we can ask to be forcibly unmounted */ /* flags for nm_sockflags */ #define NMSOCK_READY 0x0001 /* socket is ready for use */ diff --git a/bsd/nfs/nfsnode.h b/bsd/nfs/nfsnode.h index cce1399ca..adf50cc52 100644 --- a/bsd/nfs/nfsnode.h +++ b/bsd/nfs/nfsnode.h @@ -595,8 +595,8 @@ struct nfsnode { struct nfs_sillyrename *nf_silly;/* Ptr to silly rename struct */ struct nfsdmap *nd_cookiecache; /* dir cookie cache */ } n_un3; + uint32_t n_flag; /* node flags */ u_short n_fhsize; /* size in bytes, of fh */ - u_short n_flag; /* node flags */ u_short n_hflag; /* node hash flags */ u_short n_bflag; /* node buffer flags */ u_short n_mflag; /* node mount flags */ @@ -672,22 +672,22 @@ struct nfsnode { /* * Flags for n_flag */ -#define NUPDATESIZE 0x0001 /* size of file needs updating */ -#define NREVOKE 0x0002 /* node revoked */ -#define NMODIFIED 0x0004 /* Might have a modified buffer in bio */ -#define NWRITEERR 0x0008 /* Flag write errors so close will know */ -#define NNEEDINVALIDATE 0x0010 /* need to call vinvalbuf() */ -#define NGETATTRINPROG 0x0020 /* GETATTR RPC in progress */ -#define NGETATTRWANT 0x0040 /* waiting for GETATTR RPC */ -#define NACC 0x0100 /* Special file accessed */ -#define NUPD 0x0200 /* Special file updated */ -#define NCHG 0x0400 /* Special file times changed */ -#define NNEGNCENTRIES 0x0800 /* directory has negative name cache entries */ -#define NBUSY 0x1000 /* node is busy */ -#define NBUSYWANT 0x2000 /* waiting on busy node */ -#define NISDOTZFS 0x4000 /* a ".zfs" directory */ -#define NISDOTZFSCHILD 0x8000 /* a child of a ".zfs" directory */ - +#define NUPDATESIZE 0x00001 /* size of file needs updating */ +#define NREVOKE 0x00002 /* node revoked */ +#define NMODIFIED 0x00004 /* Might have a modified buffer in bio */ +#define NWRITEERR 0x00008 /* Flag write errors so close will know */ +#define NNEEDINVALIDATE 0x00010 /* need to call vinvalbuf() */ +#define NGETATTRINPROG 0x00020 /* GETATTR RPC in progress */ +#define NGETATTRWANT 0x00040 /* waiting for 
GETATTR RPC */ +#define NACC 0x00100 /* Special file accessed */ +#define NUPD 0x00200 /* Special file updated */ +#define NCHG 0x00400 /* Special file times changed */ +#define NNEGNCENTRIES 0x00800 /* directory has negative name cache entries */ +#define NBUSY 0x01000 /* node is busy */ +#define NBUSYWANT 0x02000 /* waiting on busy node */ +#define NISDOTZFS 0x04000 /* a ".zfs" directory */ +#define NISDOTZFSCHILD 0x08000 /* a child of a ".zfs" directory */ +#define NISMAPPED 0x10000 /* node is mmapped */ /* * Flags for n_hflag @@ -824,6 +824,7 @@ void nfs_data_update_size(nfsnode_t, int); /* other stuff */ int nfs_removeit(struct nfs_sillyrename *); int nfs_nget(mount_t,nfsnode_t,struct componentname *,u_char *,int,struct nfs_vattr *,u_int64_t *,uint32_t,int,nfsnode_t*); +int nfs_mount_is_dirty(mount_t); void nfs_dir_cookie_cache(nfsnode_t, uint64_t, uint64_t); int nfs_dir_cookie_to_lbn(nfsnode_t, uint64_t, int *, uint64_t *); void nfs_invaldir(nfsnode_t); diff --git a/bsd/security/audit/audit_bsd.h b/bsd/security/audit/audit_bsd.h index 72db99f35..4293b781b 100644 --- a/bsd/security/audit/audit_bsd.h +++ b/bsd/security/audit/audit_bsd.h @@ -90,14 +90,14 @@ #define NUM_MALLOC_TYPES 11 -#ifdef M_NOWAIT -#undef M_NOWAIT -#endif -#define M_NOWAIT 0x0000 /* do not block */ #ifdef M_WAITOK #undef M_WAITOK -#define M_WAITOK 0x0001 /* ok to block */ +#define M_WAITOK 0x0000 /* ok to block */ +#endif +#ifdef M_NOWAIT +#undef M_NOWAIT #endif +#define M_NOWAIT 0x0001 /* do not block */ #ifdef M_ZERO #undef M_ZERO #endif diff --git a/bsd/security/audit/audit_bsm_errno.c b/bsd/security/audit/audit_bsm_errno.c index fe24ed2a6..70c87ae2b 100644 --- a/bsd/security/audit/audit_bsm_errno.c +++ b/bsd/security/audit/audit_bsm_errno.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2008-2009 Apple Inc. + * Copyright (c) 2008-2011 Apple Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -452,6 +452,7 @@ static const struct bsm_errno bsm_errnos[] = { { BSM_ERRNO_EINPROGRESS, EINPROGRESS, ES("Operation now in progress") }, { BSM_ERRNO_ESTALE, ESTALE, ES("Stale NFS file handle") }, + { BSM_ERRNO_EQFULL, EQFULL, ES("Interface output queue is full") }, { BSM_ERRNO_EPWROFF, #ifdef EPWROFF EPWROFF, diff --git a/bsd/security/audit/audit_pipe.c b/bsd/security/audit/audit_pipe.c index 9f64bba84..943cac431 100644 --- a/bsd/security/audit/audit_pipe.c +++ b/bsd/security/audit/audit_pipe.c @@ -571,7 +571,7 @@ audit_pipe_alloc(void) AUDIT_PIPE_LIST_WLOCK_ASSERT(); - ap = malloc(sizeof(*ap), M_AUDIT_PIPE, M_NOWAIT | M_ZERO); + ap = malloc(sizeof(*ap), M_AUDIT_PIPE, M_WAITOK | M_ZERO); if (ap == NULL) return (NULL); diff --git a/bsd/security/audit/audit_session.c b/bsd/security/audit/audit_session.c index 4b63e0082..9d26833f5 100644 --- a/bsd/security/audit/audit_session.c +++ b/bsd/security/audit/audit_session.c @@ -1603,7 +1603,7 @@ audit_sdev_alloc(void) AUDIT_SDEV_LIST_WLOCK_ASSERT(); - asdev = malloc(sizeof (*asdev), M_AUDIT_SDEV, M_NOWAIT | M_ZERO); + asdev = malloc(sizeof (*asdev), M_AUDIT_SDEV, M_WAITOK | M_ZERO); if (NULL == asdev) return (NULL); diff --git a/bsd/security/audit/audit_syscalls.c b/bsd/security/audit/audit_syscalls.c index 43d93bdda..359717f7b 100644 --- a/bsd/security/audit/audit_syscalls.c +++ b/bsd/security/audit/audit_syscalls.c @@ -800,129 +800,6 @@ setauid(proc_t p, struct setauid_args *uap, __unused int32_t *retval) return (error); } -static int -getaudit_internal(proc_t p, user_addr_t user_addr) -{ - struct auditinfo ai; - kauth_cred_t scred; - - scred = kauth_cred_proc_ref(p); - if (scred->cr_audit.as_aia_p->ai_termid.at_type == AU_IPv6) { - kauth_cred_unref(&scred); - return (ERANGE); - } - - bzero(&ai, sizeof(ai)); - ai.ai_auid = scred->cr_audit.as_aia_p->ai_auid; - ai.ai_asid = scred->cr_audit.as_aia_p->ai_asid; - - /* - * Only superuser gets to see the real mask. - */ - if (suser(scred, &p->p_acflag)) { - ai.ai_mask.am_success = ~0; - ai.ai_mask.am_failure = ~0; - } else { - ai.ai_mask.am_success = scred->cr_audit.as_mask.am_success; - ai.ai_mask.am_failure = scred->cr_audit.as_mask.am_failure; - } - ai.ai_termid.machine = scred->cr_audit.as_aia_p->ai_termid.at_addr[0]; - ai.ai_termid.port = scred->cr_audit.as_aia_p->ai_termid.at_port; - kauth_cred_unref(&scred); - - return (copyout(&ai, user_addr, sizeof (ai))); -} - -/* - * System calls to get and set process audit information. 
- */ -/* ARGSUSED */ -int -getaudit(proc_t p, struct getaudit_args *uap, __unused int32_t *retval) -{ - int error; - -#if CONFIG_MACF - error = mac_proc_check_getaudit(p); - if (error) - return (error); -#endif - return (getaudit_internal(p, uap->auditinfo)); -} - -/* ARGSUSED */ -int -setaudit(proc_t p, struct setaudit_args *uap, __unused int32_t *retval) -{ - struct auditinfo ai; - struct auditinfo_addr newaia; - kauth_cred_t scred; - int error; - - error = copyin(uap->auditinfo, &ai, sizeof(ai)); - if (error) - return (error); - AUDIT_ARG(auditinfo, &ai); - - if (ai.ai_asid != AU_ASSIGN_ASID && - (uint32_t)ai.ai_asid > ASSIGNED_ASID_MAX) - return (EINVAL); - -#if CONFIG_MACF - { - struct auditinfo_addr aia = { - .ai_auid = ai.ai_auid, - .ai_mask = ai.ai_mask, - .ai_termid = { - .at_port = ai.ai_termid.port, - .at_type = AU_IPv4, - .at_addr = { ai.ai_termid.machine, 0, 0, 0 } }, - .ai_asid = ai.ai_asid, - .ai_flags = 0 }; - error = mac_proc_check_setaudit(p, &aia); - } - if (error) - return (error); -#endif - - bzero(&newaia, sizeof(newaia)); - scred = kauth_cred_proc_ref(p); - error = suser(scred, &p->p_acflag); - if (error) { - kauth_cred_unref(&scred); - return (error); - } - newaia.ai_flags = scred->cr_audit.as_aia_p->ai_flags; - kauth_cred_unref(&scred); - - WARN_IF_BAD_ASID(ai.ai_asid, "setaudit(2)"); - - newaia.ai_auid = ai.ai_auid; - bcopy(&ai.ai_mask, &newaia.ai_mask, sizeof(au_mask_t)); - AUDIT_CHECK_IF_KEVENTS_MASK(ai.ai_mask); - newaia.ai_asid = ai.ai_asid; - if (ai.ai_asid == AU_DEFAUDITSID) - newaia.ai_asid = AU_ASSIGN_ASID; - else - newaia.ai_asid = ai.ai_asid; - newaia.ai_termid.at_addr[0] = ai.ai_termid.machine; - newaia.ai_termid.at_port = ai.ai_termid.port; - newaia.ai_termid.at_type = AU_IPv4; - - error = audit_session_setaia(p, &newaia); - if (error) - return (error); - - /* - * If asked to assign an ASID then let the user know what the ASID is - * by copying the auditinfo struct back out. - */ - if (newaia.ai_asid == AU_ASSIGN_ASID) - error = getaudit_internal(p, uap->auditinfo); - - return (error); -} - static int getaudit_addr_internal(proc_t p, user_addr_t user_addr, size_t length) { @@ -1126,22 +1003,6 @@ setauid(proc_t p, struct setauid_args *uap, int32_t *retval) return (ENOSYS); } -int -getaudit(proc_t p, struct getaudit_args *uap, int32_t *retval) -{ -#pragma unused(p, uap, retval) - - return (ENOSYS); -} - -int -setaudit(proc_t p, struct setaudit_args *uap, int32_t *retval) -{ -#pragma unused(p, uap, retval) - - return (ENOSYS); -} - int getaudit_addr(proc_t p, struct getaudit_addr_args *uap, int32_t *retval) { diff --git a/bsd/security/audit/audit_worker.c b/bsd/security/audit/audit_worker.c index d9ef366a2..5ebb842ef 100644 --- a/bsd/security/audit/audit_worker.c +++ b/bsd/security/audit/audit_worker.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 1999-2010 Apple Inc. + * Copyright (c) 1999-2011 Apple Inc. * Copyright (c) 2006-2008 Robert N. M. Watson * All rights reserved. * @@ -105,10 +105,10 @@ static struct vnode *audit_vp; #define AUDIT_WORKER_SX_DESTROY() slck_destroy(&audit_worker_sl) /* - * The audit_draining flag is set when audit is disabled and the audit + * The audit_q_draining flag is set when audit is disabled and the audit * worker queue is being drained. 
*/ -static int audit_draining; +static int audit_q_draining; /* * The special kernel audit record, audit_drain_kar, is used to mark the end of @@ -460,7 +460,7 @@ audit_worker(void) while ((ar = TAILQ_FIRST(&ar_worklist))) { TAILQ_REMOVE(&ar_worklist, ar, k_q); if (ar->k_ar_commit & AR_DRAIN_QUEUE) { - audit_draining = 0; + audit_q_draining = 0; cv_broadcast(&audit_drain_cv); } else { audit_worker_process_record(ar); @@ -485,51 +485,54 @@ audit_rotate_vnode(kauth_cred_t cred, struct vnode *vp) { kauth_cred_t old_audit_cred; struct vnode *old_audit_vp; - int audit_was_enabled; KASSERT((cred != NULL && vp != NULL) || (cred == NULL && vp == NULL), ("audit_rotate_vnode: cred %p vp %p", cred, vp)); - /* - * Rotate the vnode/cred, and clear the rotate flag so that we will - * send a rotate trigger if the new file fills. - */ - AUDIT_WORKER_SX_XLOCK(); - old_audit_cred = audit_ctx.vc_ucred; - old_audit_vp = audit_vp; - audit_ctx.vc_ucred = cred; - audit_file_rotate_wait = 0; - audit_was_enabled = audit_enabled; - if ((audit_enabled = (NULL != vp))) - audit_vp = vp; - audit_draining = (audit_was_enabled && !audit_enabled); - AUDIT_WORKER_SX_XUNLOCK(); - /* - * If audit (was enabled and) is now disabled then drain the audit - * record queue and wait until it is done. - */ mtx_lock(&audit_mtx); - if (audit_draining) { + if (audit_enabled && (NULL == vp)) { + /* Auditing is currently enabled but will be disabled. */ + /* - * Insert the special drain record in the queue. + * Disable auditing now so nothing more is added while the + * audit worker thread is draining the audit record queue. */ + audit_enabled = 0; + + /* + * Drain the auditing queue by inserting a drain record at the + * end of the queue and waiting for the audit worker thread + * to find this record and signal that it is done before + * we close the audit trail. + */ + audit_q_draining = 1; while (audit_q_len >= audit_qctrl.aq_hiwater) cv_wait(&audit_watermark_cv, &audit_mtx); TAILQ_INSERT_TAIL(&audit_q, &audit_drain_kar, k_q); audit_q_len++; cv_signal(&audit_worker_cv); - - /* - * Wait for the audit worker thread to signal it is done. - */ - while (audit_draining) - cv_wait(&audit_drain_cv, &audit_mtx); - - audit_vp = NULL; } + + /* If the audit queue is draining then wait here until it's done. */ + while (audit_q_draining) + cv_wait(&audit_drain_cv, &audit_mtx); mtx_unlock(&audit_mtx); + + /* + * Rotate the vnode/cred, and clear the rotate flag so that we will + * send a rotate trigger if the new file fills. + */ + AUDIT_WORKER_SX_XLOCK(); + old_audit_cred = audit_ctx.vc_ucred; + old_audit_vp = audit_vp; + audit_ctx.vc_ucred = cred; + audit_vp = vp; + audit_file_rotate_wait = 0; + audit_enabled = (audit_vp != NULL); + AUDIT_WORKER_SX_XUNLOCK(); + /* * If there was an old vnode/credential, close and free. 
*/ diff --git a/bsd/sys/Makefile b/bsd/sys/Makefile index 53f457741..211d741b9 100644 --- a/bsd/sys/Makefile +++ b/bsd/sys/Makefile @@ -40,6 +40,7 @@ DATAFILES = \ PRIVATE_DATAFILES = \ codesign.h \ content_protection.h \ + decmpfs.h \ disklabel.h \ fileport.h \ fsctl.h \ @@ -47,6 +48,7 @@ PRIVATE_DATAFILES = \ fslog.h \ imgsrc.h \ ipcs.h \ + kas_info.h \ shm_internal.h \ spawn_internal.h \ tree.h \ @@ -66,7 +68,7 @@ KERNELFILES = \ errno.h ev.h event.h fcntl.h file.h filio.h \ ioccom.h ioctl.h ipc.h \ ioctl_compat.h kernel.h kernel_types.h kern_event.h lctx.h lock.h lockf.h \ - kauth.h kdebug.h md5.h kern_callout.h kern_control.h kern_memorystatus.h imgact.h malloc.h namei.h \ + kauth.h kdebug.h md5.h kern_control.h kern_memorystatus.h imgact.h malloc.h namei.h \ mman.h mbuf.h mount.h netport.h param.h paths.h \ proc.h queue.h random.h resource.h \ sbuf.h posix_sem.h posix_shm.h sem.h shm.h \ @@ -120,7 +122,7 @@ INSTALL_MI_DIR = sys EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} linker_set.h bsdtask_info.h pthread_internal.h filedesc.h pipe.h resourcevar.h semaphore.h \ vnode_internal.h proc_internal.h file_internal.h mount_internal.h \ - uio_internal.h tree.h + uio_internal.h tree.h munge.h EXPORT_MI_GEN_LIST = syscall.h sysproto.h @@ -152,12 +154,12 @@ sysproto.h: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) MAKE_POSIX_AVAILABILITY = $(SRCROOT)/bsd/sys/make_posix_availability.sh _posix_availability.h: $(MAKE_POSIX_AVAILABILITY) @echo "Generating bsd/sys/$@" - $(_v)$(MAKE_POSIX_AVAILABILITY) $@ + $(_v)$(MAKE_POSIX_AVAILABILITY) "$@" MAKE_SYMBOL_ALIASING = $(SRCROOT)/bsd/sys/make_symbol_aliasing.sh _symbol_aliasing.h: $(MAKE_SYMBOL_ALIASING) @echo "Generating bsd/sys/$@" - $(_v)$(MAKE_SYMBOL_ALIASING) $@ + $(_v)$(MAKE_SYMBOL_ALIASING) "$(SDKROOT)" "$@" include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/sys/attr.h b/bsd/sys/attr.h index 42a8b7673..e33ad2b86 100644 --- a/bsd/sys/attr.h +++ b/bsd/sys/attr.h @@ -53,8 +53,6 @@ #define FSOPT_EXCHANGE_DATA_ONLY 0x0000010 #endif - - /* we currently aren't anywhere near this amount for a valid * fssearchblock.sizeofsearchparams1 or fssearchblock.sizeofsearchparams2 * but we put a sanity check in to avoid abuse of the value passed in from diff --git a/bsd/sys/bsdtask_info.h b/bsd/sys/bsdtask_info.h index 7b7b56610..416e4b475 100644 --- a/bsd/sys/bsdtask_info.h +++ b/bsd/sys/bsdtask_info.h @@ -107,7 +107,7 @@ extern uint32_t vnode_vid(void *vp); extern int fill_procregioninfo(task_t t, uint64_t arg, struct proc_regioninfo_internal *pinfo, uintptr_t *vp, uint32_t *vid); void fill_taskprocinfo(task_t task, struct proc_taskinfo_internal * ptinfo); -int fill_taskthreadinfo(task_t task, uint64_t thaddr, struct proc_threadinfo_internal * ptinfo, void *, int *); +int fill_taskthreadinfo(task_t task, uint64_t thaddr, int thuniqueid, struct proc_threadinfo_internal * ptinfo, void *, int *); int fill_taskthreadlist(task_t task, void * buffer, int thcount); int get_numthreads(task_t); void bsd_getthreadname(void *uth, char* buffer); diff --git a/bsd/sys/buf.h b/bsd/sys/buf.h index 08216cac0..43af7fb1a 100644 --- a/bsd/sys/buf.h +++ b/bsd/sys/buf.h @@ -91,6 +91,8 @@ #define B_IOSTREAMING 0x00001000 /* sequential access pattern detected */ #define B_THROTTLED_IO 0x00002000 /* low priority I/O */ #define B_ENCRYPTED_IO 0x00004000 /* Encrypted I/O */ +#define B_STATICCONTENT 0x00008000 /* Buffer is likely to remain unaltered */ + /* * make sure to check when adding flags that * that the new flags don't overlap the definitions @@ 
-1033,35 +1035,105 @@ void buf_set_redundancy_flags(buf_t, uint32_t);
  */
 bufattr_t buf_attr(buf_t);
 
+/*!
+ @function buf_markstatic
+ @abstract Mark a buffer as being likely to contain static data.
+ @param bp Buffer to mark.
+ @return void.
+ */
+void buf_markstatic(buf_t);
+
+/*!
+ @function buf_static
+ @abstract Check if a buffer contains static data.
+ @param bp Buffer to test.
+ @return Nonzero if buffer has static data, 0 otherwise.
+ */
+int buf_static(buf_t);
+
 #ifdef KERNEL_PRIVATE
 void	buf_setfilter(buf_t, void (*)(buf_t, void *), void *, void (**)(buf_t, void *), void **);
 
+bufattr_t bufattr_alloc(void);
+
+void bufattr_free(bufattr_t bap);
+
 /*!
- @function buf_getcpaddr
- @abstract Set the address of cp_entry on a buffer.
- @param bp Buffer whose cp entry value has to be set
+ @function bufattr_cpaddr
+ @abstract Get the address of cp_entry on a buffer.
+ @param bap Buffer Attribute whose cp_entry to get.
+ @return void *.
+ */
+void *bufattr_cpaddr(bufattr_t);
+
+/*!
+ @function bufattr_cpoff
+ @abstract Gets the file offset on the buffer.
+ @param bap Buffer Attribute whose file offset value is used
  @return void.
  */
-void buf_setcpaddr(buf_t, void *);
+uint64_t bufattr_cpoff(bufattr_t);
+
 /*!
- @function buf_getcpaddr
- @abstract Get the address of cp_entry on a buffer.
- @param bp Buffer whose error value to set.
- @return int.
+ @function bufattr_setcpaddr
+ @abstract Set the address of cp_entry on a buffer attribute.
+ @param bap Buffer Attribute whose cp entry value has to be set
+ @return void.
+ */
+void bufattr_setcpaddr(bufattr_t, void *);
+
+/*!
+ @function bufattr_setcpoff
+ @abstract Set the file offset for a content protected I/O on
+ a buffer attribute.
+ @param bap Buffer Attribute whose cp file offset has to be set
+ @return void.
  */
-void *buf_getcpaddr(buf_t);
+void bufattr_setcpoff(bufattr_t, uint64_t);
 
 /*!
- @function buf_throttled
+ @function bufattr_rawencrypted
+ @abstract Check if a buffer contains raw encrypted data.
+ @param bap Buffer attribute to test.
+ @return Nonzero if buffer has raw encrypted data, 0 otherwise.
+ */
+int bufattr_rawencrypted(bufattr_t bap);
+
+/*!
+ @function bufattr_throttled
  @abstract Check if a buffer is throttled.
  @param bap Buffer attribute to test.
  @return Nonzero if the buffer is throttled, 0 otherwise.
  */
 int bufattr_throttled(bufattr_t bap);
 
-#endif /* KERNEL_PRIVATE */
+/*!
+ @function bufattr_nocache
+ @abstract Check if a buffer has nocache attribute.
+ @param bap Buffer attribute to test.
+ @return Nonzero if the buffer is not cached, 0 otherwise.
+ */
+int bufattr_nocache(bufattr_t bap);
+
+/*!
+ @function bufattr_meta
+ @abstract Check if a buffer has meta attribute.
+ @param bap Buffer attribute to test.
+ @return Nonzero if the buffer has meta attribute, 0 otherwise.
+ */
+
+int bufattr_meta(bufattr_t bap);
+
+/*!
+ @function bufattr_delayidlesleep
+ @abstract Check if a buffer is marked to delay idle sleep on disk IO.
+ @param bap Buffer attribute to test.
+ @return Nonzero if the buffer is marked to delay idle sleep on disk IO, 0 otherwise.
+ */
+int bufattr_delayidlesleep(bufattr_t bap);
+
+#endif /* KERNEL_PRIVATE */
 
 __END_DECLS
 
diff --git a/bsd/sys/buf_internal.h b/bsd/sys/buf_internal.h
index 2d259ac2a..016632623 100644
--- a/bsd/sys/buf_internal.h
+++ b/bsd/sys/buf_internal.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,13 +79,16 @@ #include #include - #define NOLIST ((struct buf *)0x87654321) /* * Attributes of an I/O to be used by lower layers */ struct bufattr { +#if CONFIG_PROTECT + struct cprotect *ba_cpentry; /* address of cp_entry */ + uint64_t ba_cp_file_off; /* rounded file offset. See buf_setcpoff() for more comments */ +#endif uint64_t ba_flags; /* flags. Some are only in-use on embedded devices */ }; @@ -132,9 +135,6 @@ struct buf { #ifdef BUF_MAKE_PRIVATE buf_t b_data_store; #endif -#if CONFIG_PROTECT - struct cprotect *b_cpentry; /* address of cp_entry, to be passed further down */ -#endif /* CONFIG_PROTECT */ struct bufattr b_attr; #ifdef JOE_DEBUG void * b_owner; @@ -160,7 +160,7 @@ struct buf { #define b_cliodone b_wcred /* - * These flags are kept in b_lflags... + * These flags are kept in b_lflags... * buf_mtxp must be held before examining/updating */ #define BL_BUSY 0x00000001 /* I/O in progress. */ @@ -171,20 +171,20 @@ struct buf { #define BL_EXTERNAL 0x00000040 #define BL_WAITSHADOW 0x00000080 #define BL_IOBUF_ALLOC 0x00000100 +#define BL_WANTED_REF 0x00000200 /* * Parameters for buffer cache garbage collection */ #define BUF_STALE_THRESHHOLD 30 /* Collect if untouched in the last 30 seconds */ -#define BUF_MAX_GC_COUNT 1024 /* Generally 6-8 MB */ -#define BUF_MAX_GC_BATCH_SIZE 128 /* Under a single grab of the lock */ +#define BUF_MAX_GC_BATCH_SIZE 64 /* Under a single grab of the lock */ /* * mask used by buf_flags... these are the readable external flags */ #define BUF_X_RDFLAGS (B_PHYS | B_RAW | B_LOCKED | B_ASYNC | B_READ | B_WRITE | B_PAGEIO |\ B_META | B_CLUSTER | B_DELWRI | B_FUA | B_PASSIVE | B_IOSTREAMING | B_THROTTLED_IO |\ - B_ENCRYPTED_IO) + B_ENCRYPTED_IO | B_STATICCONTENT) /* * mask used by buf_clearflags/buf_setflags... these are the writable external flags */ @@ -230,7 +230,11 @@ struct buf { * ba_flags (Buffer Attribute flags) * Some of these may be in-use only on embedded devices. */ -#define BA_THROTTLED_IO 0x000000002 +#define BA_RAW_ENCRYPTED_IO 0x00000001 +#define BA_THROTTLED_IO 0x00000002 +#define BA_DELAYIDLESLEEP 0x00000004 /* Process is marked to delay idle sleep on disk IO */ +#define BA_NOCACHE 0x00000008 +#define BA_META 0x00000010 extern int niobuf_headers; /* The number of IO buffer headers for cluster IO */ @@ -281,10 +285,18 @@ errno_t buf_acquire(buf_t, int, int, int); int count_busy_buffers(void); int count_lock_queue(void); +int buf_flushdirtyblks_skipinfo (vnode_t, int, int, const char *); +void buf_wait_for_shadow_io (vnode_t, daddr64_t); + #ifdef BUF_MAKE_PRIVATE errno_t buf_make_private(buf_t bp); #endif +#ifdef CONFIG_PROTECT +void buf_setcpaddr(buf_t, struct cprotect *); +void buf_setcpoff (buf_t, uint64_t); +#endif + __END_DECLS diff --git a/bsd/sys/cdefs.h b/bsd/sys/cdefs.h index 7076ef572..2cbc7fef6 100644 --- a/bsd/sys/cdefs.h +++ b/bsd/sys/cdefs.h @@ -75,6 +75,12 @@ #define __END_DECLS #endif +/* This SDK is designed to work with clang and specific versions of + * gcc >= 4.0 with Apple's patch sets */ +#if !defined(__GNUC__) || __GNUC__ < 4 +#warning "Unsupported compiler detected" +#endif + /* * The __CONCAT macro is used to concatenate parts of symbol names, e.g. * with "#define OLD(foo) __CONCAT(old,foo)", OLD(foo) produces oldfoo. 
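
(Editorial note, not part of the patch: the __CONCAT context lines above describe ANSI-C token pasting. A small compilable illustration, reusing the OLD(foo) example from that comment:)

	#include <sys/cdefs.h>

	#define OLD(foo) __CONCAT(old, foo)

	static int oldmain(void) { return 0; }

	int main(void)
	{
		/* OLD(main) pastes the tokens together and expands to oldmain() */
		return OLD(main)();
	}
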
@@ -126,63 +132,45 @@
 #endif /* !NO_ANSI_KEYWORDS */
 #endif /* !(__STDC__ || __cplusplus) */
 
-/*
- * GCC1 and some versions of GCC2 declare dead (non-returning) and
- * pure (no side effects) functions using "volatile" and "const";
- * unfortunately, these then cause warnings under "-ansi -pedantic".
- * GCC2 uses a new, peculiar __attribute__((attrs)) style. All of
- * these work for GNU C++ (modulo a slight glitch in the C++ grammar
- * in the distribution version of 2.5.5).
+#define __dead2		__attribute__((noreturn))
+#define __pure2		__attribute__((const))
+
+/* __unused denotes variables and functions that may not be used, preventing
+ * the compiler from warning about them if they are not used.
  */
-#if defined(__MWERKS__) && (__MWERKS__ > 0x2400)
-	/* newer Metrowerks compilers support __attribute__() */
-#elif __GNUC__ > 2 || __GNUC__ == 2 && __GNUC_MINOR__ >= 5
-#define __dead2		__attribute__((__noreturn__))
-#define __pure2		__attribute__((__const__))
-#if __GNUC__ == 2 && __GNUC_MINOR__ >= 5 && __GNUC_MINOR__ < 7
-#define __unused	/* no attribute */
-#else
-#define __unused	__attribute__((__unused__))
-#endif
-#else
-#define __attribute__(x)	/* delete __attribute__ if non-gcc or gcc1 */
-#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
-/* __dead and __pure are depreciated. Use __dead2 and __pure2 instead */
-#define __dead		__volatile
-#define __pure		__const
-#endif
-#endif
+#define __unused	__attribute__((unused))
+
+/* __used forces variables and functions to be included even if it appears
+ * to the compiler that they are not used (and would thus be discarded).
+ */
+#define __used		__attribute__((used))
+
+/* __deprecated causes the compiler to produce a warning when encountering
+ * code using the deprecated functionality. This may require turning on
+ * such warnings with the -Wdeprecated flag.
+ */
+#define __deprecated	__attribute__((deprecated))
+
+/* __unavailable causes the compiler to error out when encountering
+ * code using the tagged function or variable.
+ */
+#define __unavailable	__attribute__((unavailable))
 
 /* Delete pseudo-keywords wherever they are not available or needed. */
 #ifndef __dead
 #define __dead
 #define __pure
 #endif
-#ifndef __dead2
-#define __dead2
-#define __pure2
-#define __unused
-#endif
-
-#if defined(__GNUC__) && __GNUC__ >= 4
-#define __used __attribute__((__used__))
-#else
-#define __used
-#endif
 
 /*
- * GCC 2.95 provides `__restrict' as an extension to C90 to support the
- * C99-specific `restrict' type qualifier. We happen to use `__restrict' as
- * a way to define the `restrict' type qualifier without disturbing older
- * software that is unaware of C99 keywords.
+ * We use `__restrict' as a way to define the `restrict' type qualifier
+ * without disturbing older software that is unaware of C99 keywords.
  */
-#if !(__GNUC__ == 2 && __GNUC_MINOR__ == 95)
 #if __STDC_VERSION__ < 199901
 #define __restrict
 #else
 #define __restrict	restrict
 #endif
-#endif
 
 /*
  * Compiler-dependent macros to declare that functions take printf-like
@@ -192,15 +180,10 @@
 * mismatch between the format string and subsequent function parameter
 * types.
*/ -#if __GNUC__ > 2 || __GNUC__ == 2 && __GNUC_MINOR__ >= 7 #define __printflike(fmtarg, firstvararg) \ __attribute__((__format__ (__printf__, fmtarg, firstvararg))) #define __scanflike(fmtarg, firstvararg) \ __attribute__((__format__ (__scanf__, fmtarg, firstvararg))) -#else -#define __printflike(fmtarg, firstvararg) -#define __scanflike(fmtarg, firstvararg) -#endif #define __IDSTRING(name,string) static const char name[] __used = string @@ -220,6 +203,12 @@ #define __PROJECT_VERSION(s) __IDSTRING(project_version,s) #endif +/* Source compatibility only, ID string not emitted in object file */ +#ifndef __FBSDID +#define __FBSDID(s) +#endif + + /* * COMPILATION ENVIRONMENTS -- see compat(5) for additional detail * @@ -451,7 +440,7 @@ #else #include -#if defined(__IPHONE_OS_VERSION_MIN_REQUIRED) +#if defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) #define __DARWIN_ALIAS_STARTING(_mac, _iphone, x) __DARWIN_ALIAS_STARTING_IPHONE_##_iphone(x) #elif defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) #define __DARWIN_ALIAS_STARTING(_mac, _iphone, x) __DARWIN_ALIAS_STARTING_MAC_##_mac(x) @@ -515,17 +504,6 @@ #define _POSIX_C_SOURCE 198808L #endif -/* - * Deprecation macro - */ -#if defined(__GNUC__) && ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))) -#define __deprecated __attribute__((deprecated)) -#define __unavailable __attribute__((unavailable)) -#else -#define __deprecated /* nothing */ -#define __unavailable /* nothing */ -#endif - /* POSIX C deprecation macros */ #ifdef KERNEL #define __POSIX_C_DEPRECATED(ver) @@ -562,20 +540,6 @@ && (__STDC_VERSION__-0 < 199901L) \ && !defined(__GNUG__)) -/* - * Long double compatibility macro allow selecting variant symbols based - * on the old (compatible) 64-bit long doubles, or the new 128-bit - * long doubles. This applies only to ppc; i386 already has long double - * support, while ppc64 doesn't have any backwards history. - */ -#if defined(__i386__) || defined(__x86_64__) -# define __DARWIN_LDBL_COMPAT(x) /* nothing */ -# define __DARWIN_LDBL_COMPAT2(x) /* nothing */ -# define __DARWIN_LONG_DOUBLE_IS_DOUBLE 0 -#else -# error Unknown architecture -#endif - /***************************************** * Public darwin-specific feature macros *****************************************/ @@ -588,14 +552,6 @@ #define _DARWIN_FEATURE_64_BIT_INODE 1 #endif -/* - * _DARWIN_FEATURE_LONG_DOUBLE_IS_DOUBLE indicates when the long double type - * is the same as the double type (ppc and arm only) - */ -#if __DARWIN_LONG_DOUBLE_IS_DOUBLE -#define _DARWIN_FEATURE_LONG_DOUBLE_IS_DOUBLE 1 -#endif - /* * _DARWIN_FEATURE_64_ONLY_BIT_INODE indicates that the ino_t type may only * be 64-bit; there is no support for 32-bit ino_t when this macro is defined diff --git a/bsd/sys/codesign.h b/bsd/sys/codesign.h index e72c25044..5894d982c 100644 --- a/bsd/sys/codesign.h +++ b/bsd/sys/codesign.h @@ -57,6 +57,8 @@ __BEGIN_DECLS /* code sign operations */ int csops(pid_t pid, unsigned int ops, void * useraddr, size_t usersize); +/* works only with CS_OPS_PIDPATH and CS_OPS_ENTITLEMENTS_BLOB */ +int csops_audittoken(pid_t pid, unsigned int ops, void * useraddr, size_t usersize, audit_token_t * token); __END_DECLS diff --git a/bsd/sys/conf.h b/bsd/sys/conf.h index 39e4fef37..bd6e518b0 100644 --- a/bsd/sys/conf.h +++ b/bsd/sys/conf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -93,6 +93,12 @@ struct vnode; #define D_TTY 3 #ifdef KERNEL +/* + * Flags for d_type (squeezed into the top half of d_type). + */ +#define D_TYPEMASK 0xffff +#define D_TRACKCLOSE 0x00010000 /* track all closes */ + /* * Device switch function types. */ @@ -166,6 +172,7 @@ d_write_t nowrite; #ifdef KERNEL_PRIVATE extern struct bdevsw bdevsw[]; +extern int (*bootcache_contains_block)(dev_t device, u_int64_t blkno); #endif /* KERNEL_PRIVATE */ /* diff --git a/bsd/sys/cprotect.h b/bsd/sys/cprotect.h index bebe3bb43..0dda075ac 100644 --- a/bsd/sys/cprotect.h +++ b/bsd/sys/cprotect.h @@ -38,9 +38,13 @@ extern "C" { #include #include #include +#include -#define CP_KEYSIZE 32 /* 8x4 = 32, 32x8 = 256 */ -#define CP_WRAPPEDKEYSIZE 40 /* 2x4 = 8, 8x8 = 64 */ +#define CP_IV_KEYSIZE 20 /* 16x8 = 128, but SHA1 pushes 20 bytes so keep space for that */ +#define CP_MAX_KEYSIZE 32 /* 8x4 = 32, 32x8 = 256 */ +#define CP_MAX_WRAPPEDKEYSIZE 128 /* The size of the largest allowed key */ +#define CP_INITIAL_WRAPPEDKEYSIZE 40 +#define CP_V2_WRAPPEDKEYSIZE 40 /* Size of the wrapped key in a v2 EA */ /* lock events from AppleKeyStore */ #define CP_LOCKED_STATE 0 /* Device is locked */ @@ -53,15 +57,18 @@ extern "C" { #define CP_NEEDS_KEYS 0x1 /* File needs persistent keys */ #define CP_KEY_FLUSHED 0x2 /* File's unwrapped key has been purged from memory */ #define CP_NO_XATTR 0x4 /* Key info has not been saved as EA to the FS */ +#define CP_OFF_IV_ENABLED 0x8 /* Only go down relative IV route if this flag is set */ + +#define CP_RELOCATION_INFLIGHT 0x10 /* File with offset IVs is in the process of being relocated. */ /* Content Protection VNOP Operation flags */ #define CP_READ_ACCESS 0x1 #define CP_WRITE_ACCESS 0x2 #define CONTENT_PROTECTION_XATTR_NAME "com.apple.system.cprotect" -#define CP_CURRENT_MAJOR_VERS 2 -#define CP_CURRENT_MINOR_VERS 0 - +#define CP_NEW_MAJOR_VERS 4 +#define CP_PREV_MAJOR_VERS 2 +#define CP_MINOR_VERS 0 typedef struct cprotect *cprotect_t; typedef struct cp_wrap_func *cp_wrap_func_t; @@ -73,18 +80,32 @@ typedef struct cnode * cnode_ptr_t; struct hfsmount; /* The wrappers are invoked by the AKS kext */ -typedef int wrapper_t(uint32_t properties, void *key_bytes, size_t key_length, void *wrapped_data, size_t *wrapped_length); +typedef int wrapper_t(uint32_t properties, uint64_t file_id, void *key_bytes, size_t key_length, void *wrapped_data, size_t *wrapped_length); typedef int unwrapper_t(uint32_t properties, void *wrapped_data, size_t wrapped_data_length, void *key_bytes, size_t *key_length); /* * Runtime-only structure containing the content protection status * for the given file. 
This is contained within the cnode + * This is passed down to IOStorageFamily via the bufattr struct + * + ****************************************************** + * Some Key calculation information for offset based IV + ****************************************************** + * Kf = original 256 bit per file key + * Kiv = SHA1(Kf), use full Kf, but truncate Kiv to 128 bits + * Kiv can be cached in the cprotect, so it only has to be calculated once for the file init + * + * IVb = Encrypt(Kiv, offset) + * */ struct cprotect { - uint8_t cp_cache_key[CP_KEYSIZE]; - uint8_t cp_persistent_key[CP_WRAPPEDKEYSIZE]; uint32_t cp_flags; uint32_t cp_pclass; + aes_encrypt_ctx cp_cache_iv_ctx; + uint32_t cp_cache_key_len; + uint8_t cp_cache_key[CP_MAX_KEYSIZE]; + uint32_t cp_persistent_key_len; + uint8_t cp_persistent_key[]; }; struct cp_wrap_func { @@ -94,7 +115,8 @@ struct cp_wrap_func { struct cp_global_state { uint8_t wrap_functions_set; - uint8_t lock_state; + uint8_t lock_state; + u_int16_t reserved; }; /* @@ -103,13 +125,27 @@ struct cp_global_state { * little-endian on-disk. This means they must be endian swapped to * L.E on getxattr() and converted to LE on setxattr(). */ -struct cp_xattr { - u_int16_t xattr_major_version; - u_int16_t xattr_minor_version; - u_int32_t flags; - u_int32_t persistent_class; - u_int32_t key_size; - uint8_t persistent_key[CP_WRAPPEDKEYSIZE]; +struct cp_xattr_v2 { + u_int16_t xattr_major_version; + u_int16_t xattr_minor_version; + u_int32_t flags; + u_int32_t persistent_class; + u_int32_t key_size; + uint8_t persistent_key[CP_V2_WRAPPEDKEYSIZE]; +}; + +struct cp_xattr_v4 { + u_int16_t xattr_major_version; + u_int16_t xattr_minor_version; + u_int32_t flags; + u_int32_t persistent_class; + u_int32_t key_size; + u_int32_t reserved1; + u_int32_t reserved2; + u_int32_t reserved3; + u_int32_t reserved4; + u_int32_t reserved5; + uint8_t persistent_key[CP_MAX_WRAPPEDKEYSIZE]; }; /* Same is true for the root EA, all fields must be written little endian. 
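 *
 * (Editorial aside, not part of the original header: a worked sketch of the
 * offset-based IV derivation described in the struct cprotect comment
 * above -- Kiv is SHA-1(Kf) truncated to 128 bits, and IVb encrypts the
 * file offset under Kiv. CommonCrypto in user space is assumed purely for
 * illustration; the kernel keeps its own AES context in cp_cache_iv_ctx:
 *
 *	#include <CommonCrypto/CommonCryptor.h>
 *	#include <CommonCrypto/CommonDigest.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void
 *	cp_make_ivb(const uint8_t kf[32], uint64_t offset, uint8_t ivb[16])
 *	{
 *		uint8_t sha[CC_SHA1_DIGEST_LENGTH];
 *		uint8_t kiv[16], blk[16] = { 0 };
 *		size_t moved = 0;
 *
 *		CC_SHA1(kf, 32, sha);
 *		memcpy(kiv, sha, sizeof(kiv));
 *		memcpy(blk, &offset, sizeof(offset));
 *		CCCrypt(kCCEncrypt, kCCAlgorithmAES128, kCCOptionECBMode,
 *		    kiv, kCCKeySizeAES128, NULL, blk, sizeof(blk),
 *		    ivb, sizeof(blk), &moved);
 *	}
 *
 * Here kf stands in for the 256-bit per-file key, and kiv keeps only the
 * first 16 of SHA-1's 20 output bytes, which is why CP_IV_KEYSIZE above
 * reserves 20 bytes.)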
*/ @@ -130,20 +166,31 @@ struct cp_root_xattr { */ int cp_vnode_getclass(vnode_t, int *); int cp_vnode_setclass(vnode_t, uint32_t); +int cp_vnode_transcode(vnode_t); int cp_key_store_action(int); int cp_register_wraps(cp_wrap_func_t); int cp_entry_init(cnode_ptr_t, struct mount *); -int cp_entry_create_keys(cnode_ptr_t); -void cp_entry_destroy(cnode_ptr_t); +int cp_entry_create_keys(struct cprotect **entry_ptr, struct cnode *dcp, struct hfsmount *hfsmp, + uint32_t input_class, uint32_t fileid, mode_t cmode); +int cp_entry_gentempkeys(struct cprotect **entry_ptr, struct hfsmount *hfsmp); +void cp_entry_destroy(struct cprotect **entry_ptr); cnode_ptr_t cp_get_protected_cnode(vnode_t); -int cp_handle_vnop(cnode_ptr_t, int); +int cp_handle_vnop(vnode_t, int, int); int cp_fs_protected (mount_t); int cp_getrootxattr (struct hfsmount *hfsmp, struct cp_root_xattr *outxattr); int cp_setrootxattr (struct hfsmount *hfsmp, struct cp_root_xattr *newxattr); -int cp_handle_relocate (cnode_ptr_t cp); +int cp_setxattr(struct cnode *cp, struct cprotect *entry, struct hfsmount *hfsmp, uint32_t fileid, int options); +int cp_update_mkb (struct cprotect *entry, uint32_t fileid); +int cp_handle_relocate (cnode_ptr_t cp, struct hfsmount *hfsmp); +int cp_handle_open(struct vnode *vp, int mode); +int cp_get_root_major_vers (struct vnode *vp, uint32_t *level); + +#if 0 +int cp_isdevice_locked (void); +#endif #endif /* KERNEL_PRIVATE */ diff --git a/bsd/sys/decmpfs.h b/bsd/sys/decmpfs.h index f8a61d288..8cef87b69 100644 --- a/bsd/sys/decmpfs.h +++ b/bsd/sys/decmpfs.h @@ -28,6 +28,8 @@ #ifndef _SYS_DECMPFS_H_ #define _SYS_DECMPFS_H_ 1 +#include + #define MAX_DECMPFS_XATTR_SIZE 3802 /* @@ -61,7 +63,7 @@ enum { /* additional types defined in AppleFSCompression project */ - CMP_MAX = 255 + CMP_MAX = 255 /* Highest compression_type supported */ }; typedef struct { @@ -71,25 +73,19 @@ typedef struct { #if KERNEL -#include +#if XNU_KERNEL_PRIVATE -#if defined(__i386__) || defined(__x86_64__) -#define DECMPFS_SUPPORTS_SWAP64 1 -/* otherwise, no OSCompareAndSwap64, so use a mutex */ -#endif +#include typedef struct decmpfs_cnode { - uint8_t cmp_state; - uint8_t cmp_minimal_xattr; /* if non-zero, this file's com.apple.decmpfs xattr contained only the minimal decmpfs_disk_header */ - uint32_t cmp_type; - uint32_t lockcount; - void *lockowner; /* cnode's lock owner (if a thread is currently holding an exclusive lock) */ + uint8_t cmp_state; + uint8_t cmp_minimal_xattr; /* if non-zero, this file's com.apple.decmpfs xattr contained only the minimal decmpfs_disk_header */ + uint32_t cmp_type; + uint32_t lockcount; + void *lockowner; /* cnode's lock owner (if a thread is currently holding an exclusive lock) */ uint64_t uncompressed_size __attribute__((aligned(8))); + uint64_t decompression_flags; lck_rw_t compressed_data_lock; -#if !DECMPFS_SUPPORTS_SWAP64 - /* we need a lock since we can't atomically fetch/set 64 bits */ - lck_mtx_t uncompressed_size_mtx; -#endif /* !DECMPFS_SUPPORTS_SWAP64 */ } decmpfs_cnode; /* return values from decmpfs_file_is_compressed */ @@ -128,19 +124,32 @@ int decmpfs_update_attributes(vnode_t vp, struct vnode_attr *vap); errno_t decmpfs_pagein_compressed(struct vnop_pagein_args *ap, int *is_compressed, decmpfs_cnode *cp); errno_t decmpfs_read_compressed(struct vnop_read_args *ap, int *is_compressed, decmpfs_cnode *cp); +#endif /* XNU_KERNEL_PRIVATE */ + /* types shared between the kernel and kexts */ typedef int (*decmpfs_validate_compressed_file_func)(vnode_t vp, vfs_context_t ctx, decmpfs_header *hdr); 
typedef void (*decmpfs_adjust_fetch_region_func)(vnode_t vp, vfs_context_t ctx, decmpfs_header *hdr, off_t *offset, user_ssize_t *size); typedef int (*decmpfs_fetch_uncompressed_data_func)(vnode_t vp, vfs_context_t ctx, decmpfs_header *hdr, off_t offset, user_ssize_t size, int nvec, decmpfs_vector *vec, uint64_t *bytes_read); typedef int (*decmpfs_free_compressed_data_func)(vnode_t vp, vfs_context_t ctx, decmpfs_header *hdr); +typedef uint64_t (*decmpfs_get_decompression_flags_func)(vnode_t vp, vfs_context_t ctx, decmpfs_header *hdr); // returns flags from the DECMPFS_FLAGS enumeration below + +enum { + DECMPFS_FLAGS_FORCE_FLUSH_ON_DECOMPRESS = 1 << 0, +}; + +/* Versions that are supported for binary compatibility */ +#define DECMPFS_REGISTRATION_VERSION_V1 1 +#define DECMPFS_REGISTRATION_VERSION_V3 3 + +#define DECMPFS_REGISTRATION_VERSION (DECMPFS_REGISTRATION_VERSION_V3) -#define DECMPFS_REGISTRATION_VERSION 1 typedef struct { int decmpfs_registration; decmpfs_validate_compressed_file_func validate; decmpfs_adjust_fetch_region_func adjust_fetch; decmpfs_fetch_uncompressed_data_func fetch; decmpfs_free_compressed_data_func free_data; + decmpfs_get_decompression_flags_func get_flags; } decmpfs_registration; /* hooks for kexts to call */ diff --git a/bsd/sys/disk.h b/bsd/sys/disk.h index 2bdd79a55..ba454ab69 100644 --- a/bsd/sys/disk.h +++ b/bsd/sys/disk.h @@ -149,16 +149,6 @@ typedef struct #define DKIOCGETPHYSICALBLOCKSIZE _IOR('d', 77, uint32_t) #define DKIOCGETCOMMANDPOOLSIZE _IOR('d', 78, uint32_t) -typedef struct -{ - uint64_t offset; - uint64_t length; - - uint8_t reserved0128[16]; /* reserved, clear to zero */ -} dk_discard_t __attribute__ ((deprecated)); - -#define DKIOCDISCARD _IOW('d', 31, dk_discard_t) - #ifdef KERNEL #define DK_FEATURE_FORCE_UNIT_ACCESS 0x00000001 @@ -182,10 +172,22 @@ typedef struct #define DKIOCLOCKPHYSICALEXTENTS _IO('d', 81) #define DKIOCGETPHYSICALEXTENT _IOWR('d', 82, dk_physical_extent_t) #define DKIOCUNLOCKPHYSICALEXTENTS _IO('d', 83) - #ifdef PRIVATE -#define _DKIOCGETMIGRATIONUNITBYTESIZE _IOR('d', 85, uint32_t) +typedef struct _dk_cs_pin { + dk_extent_t cp_extent; + int64_t cp_flags; +} _dk_cs_pin_t; +#define _DKIOCSPINDISCARDDATA (1 << 0) +#define _DKIOCCSPINEXTENT _IOW('d', 199, _dk_cs_pin_t) +#define _DKIOCCSUNPINEXTENT _IOW('d', 200, _dk_cs_pin_t) +#define _DKIOCGETMIGRATIONUNITBYTESIZE _IOR('d', 201, uint32_t) #endif /* PRIVATE */ #endif /* KERNEL */ +#ifdef PRIVATE +#ifdef TARGET_OS_EMBEDDED +#define _DKIOCSETSTATIC _IO('d', 84) +#endif /* TARGET_OS_EMBEDDED */ +#endif /* PRIVATE */ + #endif /* _SYS_DISK_H_ */ diff --git a/bsd/sys/domain.h b/bsd/sys/domain.h index 9edf0db5e..9f6bd965f 100644 --- a/bsd/sys/domain.h +++ b/bsd/sys/domain.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2006, 2012 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -120,6 +120,8 @@ extern struct domain localdomain; __BEGIN_DECLS extern void net_add_domain(struct domain *dp); extern int net_del_domain(struct domain *); +extern int domain_proto_mtx_lock(void); +extern void domain_proto_mtx_unlock(int locked); __END_DECLS #define DOMAIN_SET(domain_set) diff --git a/bsd/sys/dtrace.h b/bsd/sys/dtrace.h index d81a48a4f..cffaefdef 100644 --- a/bsd/sys/dtrace.h +++ b/bsd/sys/dtrace.h @@ -2423,7 +2423,7 @@ extern void (*dtrace_cpu_init)(processorid_t); extern void (*dtrace_modload)(struct modctl *); extern void (*dtrace_modunload)(struct modctl *); #else -extern int (*dtrace_modload)(struct kmod_info *); +extern int (*dtrace_modload)(struct kmod_info *, uint32_t); extern int (*dtrace_modunload)(struct kmod_info *); #endif /* __APPLE__ */ extern void (*dtrace_helpers_cleanup)(proc_t*); @@ -2442,8 +2442,6 @@ extern hrtime_t dtrace_gethrtime(void); extern void dtrace_sync(void); extern void dtrace_toxic_ranges(void (*)(uintptr_t, uintptr_t)); extern void dtrace_xcall(processorid_t, dtrace_xcall_t, void *); -extern void dtrace_vpanic(const char *, __va_list); -extern void dtrace_panic(const char *, ...); extern int dtrace_safe_defer_signal(void); extern void dtrace_safe_synchronous_signal(void); diff --git a/bsd/sys/dtrace_impl.h b/bsd/sys/dtrace_impl.h index 7f42cff5e..38614c300 100644 --- a/bsd/sys/dtrace_impl.h +++ b/bsd/sys/dtrace_impl.h @@ -1353,6 +1353,7 @@ extern void dtrace_probe_error(dtrace_state_t *, dtrace_epid_t, int, int, extern int dtrace_assfail(const char *, const char *, int); extern int dtrace_attached(void); extern hrtime_t dtrace_gethrestime(void); +extern void dtrace_isa_init(void); #ifdef __sparc extern void dtrace_flush_windows(void); diff --git a/bsd/sys/errno.h b/bsd/sys/errno.h index 231c68ead..8105a42b3 100644 --- a/bsd/sys/errno.h +++ b/bsd/sys/errno.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -256,7 +256,8 @@ __END_DECLS #endif #if __DARWIN_C_LEVEL >= __DARWIN_C_FULL -#define ELAST 105 /* Must be equal largest errno */ +#define EQFULL 106 /* Interface output queue is full */ +#define ELAST 106 /* Must be equal largest errno */ #endif #ifdef KERNEL @@ -272,8 +273,10 @@ __END_DECLS #define ECVCERORR 256 #define ECVPERORR 512 #else /* BSD_KERNEL_PRIVATE */ -/* -5 and -6 are reserved for kernel internal use */ +/* -5, -6 and -7 and -106 are reserved for kernel internal use */ #endif /* BSD_KERNEL_PRIVATE */ - -#endif +#ifdef PRIVATE +#define EQSUSPENDED (-EQFULL) /* Output queue is suspended */ +#endif /* PRIVATE */ +#endif /* KERNEL */ #endif /* _SYS_ERRNO_H_ */ diff --git a/bsd/sys/event.h b/bsd/sys/event.h index 05b31174a..d22d5efb2 100644 --- a/bsd/sys/event.h +++ b/bsd/sys/event.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2012 Apple Inc. All rights reserved. 
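On the errno.h hunk above: EQFULL becomes the new largest errno, and the negated EQSUSPENDED variant stays kernel-private. A sketch of how a sender might treat EQFULL; the transient back-off-and-retry semantics are an assumption based on the "output queue is full" wording:

    #include <errno.h>
    #include <sys/socket.h>

    static ssize_t
    send_with_backoff(int fd, const void *buf, size_t len)
    {
        ssize_t n = send(fd, buf, len, 0);
        if (n < 0 && errno == EQFULL) {
            /* interface output queue full: assumed transient, retry later */
        }
        return n;
    }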
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,7 +73,11 @@ /* (-11) unused */ #define EVFILT_VM (-12) /* Virtual memory events */ -#define EVFILT_SYSCOUNT 12 +#ifdef PRIVATE +#define EVFILT_SOCK (-13) /* Socket events */ +#endif /* PRIVATE */ + +#define EVFILT_SYSCOUNT 13 #define EVFILT_THREADMARKER EVFILT_SYSCOUNT /* Internal use only */ #pragma pack(4) @@ -242,6 +246,18 @@ struct kevent64_s { #define NOTE_SIGNAL 0x08000000 /* shared with EVFILT_SIGNAL */ #define NOTE_EXITSTATUS 0x04000000 /* exit status to be returned, valid for child process only */ #define NOTE_RESOURCEEND 0x02000000 /* resource limit reached, resource type returned */ + +#if CONFIG_EMBEDDED +/* 0x01000000 is reserved for future use */ + +/* App states notification */ +#define NOTE_APPACTIVE 0x00800000 /* app went to active state */ +#define NOTE_APPBACKGROUND 0x00400000 /* app went to background */ +#define NOTE_APPNONUI 0x00200000 /* app went to active with no UI */ +#define NOTE_APPINACTIVE 0x00100000 /* app went to inactive state */ +#define NOTE_APPALLSTATES 0x00f00000 +#endif /* CONFIG_EMBEDDED */ + #define NOTE_PDATAMASK 0x000fffff /* mask for pid/signal */ #define NOTE_PCTRLMASK (~NOTE_PDATAMASK) @@ -265,6 +281,23 @@ struct kevent64_s { #define NOTE_NSECONDS 0x00000004 /* data is nanoseconds */ #define NOTE_ABSOLUTE 0x00000008 /* absolute timeout */ /* ... implicit EV_ONESHOT */ +#ifdef PRIVATE +/* + * data/hint fflags for EVFILT_SOCK, shared with userspace. + * + */ +#define NOTE_CONNRESET 0x00000001 /* Received RST */ +#define NOTE_READCLOSED 0x00000002 /* Read side is shutdown */ +#define NOTE_WRITECLOSED 0x00000004 /* Write side is shutdown */ +#define NOTE_TIMEOUT 0x00000008 /* timeout: rexmt, keep-alive or persist */ +#define NOTE_NOSRCADDR 0x00000010 /* source address not available */ +#define NOTE_IFDENIED 0x00000020 /* interface denied connection */ +#define NOTE_SUSPEND 0x00000040 /* output queue suspended */ +#define NOTE_RESUME 0x00000080 /* output queue resumed */ +#define NOTE_KEEPALIVE 0x00000100 /* TCP Keepalive received */ + +#endif /* PRIVATE */ + /* * data/hint fflags for EVFILT_MACHPORT, shared with userspace. 
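A sketch of arming the new (PRIVATE) EVFILT_SOCK filter from userspace; kq and sock_fd are hypothetical descriptors, and the choice of fflags is illustrative only:

    #include <sys/event.h>

    static int
    watch_socket(int kq, int sock_fd)
    {
        struct kevent64_s kev;
        EV_SET64(&kev, (uint64_t)sock_fd, EVFILT_SOCK, EV_ADD | EV_CLEAR,
                 NOTE_CONNRESET | NOTE_TIMEOUT | NOTE_NOSRCADDR,
                 0, 0, 0, 0);
        /* delivered events carry the triggering NOTE_* bits in fflags */
        return kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
    }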
* @@ -382,8 +415,8 @@ extern void klist_init(struct klist *list); extern void knote(struct klist *list, long hint); extern int knote_attach(struct klist *list, struct knote *kn); extern int knote_detach(struct klist *list, struct knote *kn); -extern int knote_link_wait_queue(struct knote *kn, struct wait_queue *wq); -extern void knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq); +extern int knote_link_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link_t wql); +extern int knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link_t *wqlp); extern void knote_fdclose(struct proc *p, int fd); extern void knote_markstayqueued(struct knote *kn); diff --git a/bsd/sys/fcntl.h b/bsd/sys/fcntl.h index f6cbe9d5a..acd5f4c2f 100644 --- a/bsd/sys/fcntl.h +++ b/bsd/sys/fcntl.h @@ -179,6 +179,20 @@ typedef __darwin_pid_t pid_t; #define O_CLOEXEC 0x1000000 /* implicitly set FD_CLOEXEC */ #endif +#ifdef KERNEL +#define FENCRYPTED 0x2000000 +#endif + +#ifdef KERNEL +#define FSINGLE_WRITER 0x4000000 /* fcntl(F_SINGLE_WRITER, 1) */ +#endif + +/* Data Protection Flags */ +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define O_DP_GETRAWENCRYPTED 0x0001 +#endif + + #ifdef KERNEL /* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */ #define FFLAGS(oflags) ((oflags) + 1) @@ -270,16 +284,32 @@ typedef __darwin_pid_t pid_t; #define F_GETLKPID 66 /* get record locking information, per-process */ +/* See F_DUPFD_CLOEXEC below for 67 */ + #ifdef PRIVATE -#define F_MOVEDATAEXTENTS 69 /* Swap only the data associated with two files */ +#define F_SETSTATICCONTENT 68 /* + * indicate to the filesystem/storage driver that the content to be + * written is usually static. a nonzero value enables it, 0 disables it. + */ +#define F_MOVEDATAEXTENTS 69 /* Swap only the data associated with two files */ #endif #define F_SETBACKINGSTORE 70 /* Mark the file as being the backing store for another filesystem */ #define F_GETPATH_MTMINFO 71 /* return the full path of the FD, but error in specific mtmd circumstances */ +/* 72 is free. It used to be F_GETENCRYPTEDDATA, which is now removed. */ + #define F_SETNOSIGPIPE 73 /* No SIGPIPE generated on EPIPE */ #define F_GETNOSIGPIPE 74 /* Status of SIGPIPE for this fd */ +#define F_TRANSCODEKEY 75 /* For some cases, we need to rewrap the key for AKS/MKB */ + +#define F_SINGLE_WRITER 76 /* file being written to by a single writer...
if throttling enabled, writes */ + /* may be broken into smaller chunks with throttling in between */ + +#define F_GETPROTECTIONLEVEL 77 /* Get the protection version number for this filesystem */ + + // FS-specific fcntl()'s numbers begin at 0x00010000 and go up #define FCNTL_FS_SPECIFIC_BASE 0x00010000 @@ -300,8 +330,9 @@ typedef __darwin_pid_t pid_t; #define F_WAIT 0x010 /* Wait until lock is granted */ #define F_FLOCK 0x020 /* Use flock(2) semantics for lock */ #define F_POSIX 0x040 /* Use POSIX semantics for lock */ -#define F_PROV 0x080 /* Non-coelesced provisional lock */ +#define F_PROV 0x080 /* Non-coalesced provisional lock */ #define F_WAKE1_SAFE 0x100 /* its safe to only wake one waiter */ +#define F_ABORT 0x200 /* lock attempt aborted (force umount) */ #endif /* @@ -386,7 +417,6 @@ struct flock { short l_whence; /* type of l_start */ }; - #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) /* * advisory file read data type - @@ -603,6 +633,8 @@ int fileport_makeport(int, fileport_t*); int fileport_makefd(fileport_t); #endif /* PRIVATE */ int openx_np(const char *, int, filesec_t); +/* data-protected non-portable open(2) */ +int open_dprotected_np ( const char *, int, int, int, ...); int flock(int, int); filesec_t filesec_init(void); filesec_t filesec_dup(filesec_t); diff --git a/bsd/sys/file.h b/bsd/sys/file.h index bd3629144..dcf08f448 100644 --- a/bsd/sys/file.h +++ b/bsd/sys/file.h @@ -85,23 +85,6 @@ struct posix_cred; typedef struct posix_cred *posix_cred_t; #endif /* !_KAUTH_CRED_T */ -#pragma pack(4) - -/* for the compat sake; */ -struct extern_file { - LIST_ENTRY(extern_file) f_list; /* list of active files */ - short f_flag; /* see fcntl.h */ - short f_type; /* descriptor type */ - short f_count; /* reference count */ - short f_msgcount; /* references from message queue */ - kauth_cred_t f_cred; /* credentials associated with descriptor */ - void * f_ops; - off_t f_offset; - caddr_t f_data; /* vnode or socket or SHM or semaphore */ -}; - -#pragma pack() - __BEGIN_DECLS #ifdef KERNEL int file_socket(int, socket_t *); diff --git a/bsd/sys/file_internal.h b/bsd/sys/file_internal.h index 9fcb4d1f3..473415d1d 100644 --- a/bsd/sys/file_internal.h +++ b/bsd/sys/file_internal.h @@ -139,7 +139,6 @@ typedef enum { #define FG_NOSIGPIPE 0x40 /* don't deliver SIGPIPE with EPIPE return */ struct fileglob { - LIST_ENTRY(fileglob) f_list;/* list of active files */ LIST_ENTRY(fileglob) f_msglist;/* list of active files */ int32_t fg_flag; /* see fcntl.h */ file_type_t fg_type; /* descriptor type */ @@ -172,9 +171,7 @@ struct fileglob { }; #ifdef __APPLE_API_PRIVATE -LIST_HEAD(filelist, fileglob); LIST_HEAD(fmsglist, fileglob); -extern struct filelist filehead; /* head of list of open files */ extern struct fmsglist fmsghead; /* head of list of open files */ extern int maxfiles; /* kernel limit on number of open files */ extern int nfiles; /* actual number of open files */ diff --git a/bsd/sys/imgact.h b/bsd/sys/imgact.h index 0a194b779..dd25d3a58 100644 --- a/bsd/sys/imgact.h +++ b/bsd/sys/imgact.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2005, 2011 Apple Computer, Inc. All rights reserved. 
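On the open_dprotected_np() declaration above: a sketch of creating a file in a given protection class. The argument order (path, oflag, class, dpflags, then mode through the variadic tail) is inferred from the prototype and should be treated as an assumption:

    #include <fcntl.h>

    static int
    create_protected(const char *path)
    {
        /* class 2, no dpflags; 0600 travels through the "..." as the mode */
        return open_dprotected_np(path, O_CREAT | O_RDWR, 2, 0, 0600);
    }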
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -122,10 +122,7 @@ struct image_params { */ #define IMGPF_NONE 0x00000000 /* No flags */ #define IMGPF_INTERPRET 0x00000001 /* Interpreter invoked */ -#define IMGPF_POWERPC 0x00000002 /* ppc mode for x86 */ -#if CONFIG_EMBEDDED -#undef IMGPF_POWERPC -#endif +#define IMGPF_RESERVED 0x00000002 #define IMGPF_WAS_64BIT 0x00000004 /* exec from a 64Bit binary */ #define IMGPF_IS_64BIT 0x00000008 /* exec to a 64Bit binary */ #define IMGPF_SPAWN 0x00000010 /* spawn (without setexec) */ diff --git a/osfmk/ipc/ipc_print.h b/bsd/sys/kas_info.h similarity index 64% rename from osfmk/ipc/ipc_print.h rename to bsd/sys/kas_info.h index 8365a1ee4..c1be0761d 100644 --- a/osfmk/ipc/ipc_print.h +++ b/bsd/sys/kas_info.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,41 +25,31 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * @OSF_COPYRIGHT@ - */ - -#ifndef _IPC_PRINT_H_ -#define _IPC_PRINT_H_ -#if MACH_KDB +#ifndef _SYS_KAS_INFO_H_ +#define _SYS_KAS_INFO_H_ -#include +#include +#include -#include -#include -#include -#include +/* + * kas_info() ("Kernel Address Space Info") is a private interface that allows + * appropriately privileged system components to introspect the overall + * kernel address space layout. + */ -extern void ipc_pset_print( - ipc_pset_t pset); +__BEGIN_DECLS -extern void ipc_port_print( - ipc_port_t port, - boolean_t have_addr, - db_expr_t count, - char *modif); +/* The slide of the main kernel compared to its static link address */ +#define KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR (0) /* returns uint64_t */ +#define KAS_INFO_MAX_SELECTOR (1) -extern void ipc_kmsg_print( - ipc_kmsg_t kmsg); +#ifndef KERNEL -extern void ipc_msg_print( - mach_msg_header_t *msgh); +int kas_info(int selector, void *value, size_t *size) __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_NA); -extern ipc_port_t ipc_name_to_data( - task_t task, - mach_port_name_t name); +#endif /* KERNEL */ -#endif /* MACH_KDB */ +__END_DECLS -#endif /* IPC_PRINT_H */ +#endif /* !_SYS_KAS_INFO_H_ */ diff --git a/bsd/sys/kauth.h b/bsd/sys/kauth.h index 94f0b1e1e..a077ceefa 100644 --- a/bsd/sys/kauth.h +++ b/bsd/sys/kauth.h @@ -128,10 +128,18 @@ struct kauth_identity_extlookup { u_int32_t el_member_valid; /* TTL on group lookup result */ }; +struct kauth_cache_sizes { + u_int32_t kcs_group_size; + u_int32_t kcs_id_size; +}; + #define KAUTH_EXTLOOKUP_REGISTER (0) #define KAUTH_EXTLOOKUP_RESULT (1<<0) #define KAUTH_EXTLOOKUP_WORKER (1<<1) #define KAUTH_EXTLOOKUP_DEREGISTER (1<<2) +#define KAUTH_GET_CACHE_SIZES (1<<3) +#define KAUTH_SET_CACHE_SIZES (1<<4) +#define KAUTH_CLEAR_CACHES (1<<5) #ifdef KERNEL @@ -772,10 +780,12 @@ extern lck_grp_t *kauth_lck_grp; #ifdef XNU_KERNEL_PRIVATE __BEGIN_DECLS extern void kauth_init(void) __attribute__((section("__TEXT, initcode"))); +extern void kauth_cred_init(void) __attribute__((section("__TEXT, initcode"))); +#if CONFIG_EXT_RESOLVER extern void kauth_identity_init(void) __attribute__((section("__TEXT, initcode"))); extern void kauth_groups_init(void) __attribute__((section("__TEXT, initcode"))); -extern void kauth_cred_init(void) __attribute__((section("__TEXT, initcode"))); extern void kauth_resolver_init(void) __attribute__((section("__TEXT, initcode"))); +#endif __END_DECLS #endif /* XNU_KERNEL_PRIVATE */ diff --git a/bsd/sys/kdebug.h b/bsd/sys/kdebug.h index 393c413df..fbaddae7b 100644 --- 
a/bsd/sys/kdebug.h +++ b/bsd/sys/kdebug.h @@ -92,11 +92,13 @@ __BEGIN_DECLS #define DBG_DLIL 8 #define DBG_SECURITY 9 #define DBG_CORESTORAGE 10 +#define DBG_CG 11 #define DBG_MISC 20 #define DBG_DYLD 31 #define DBG_QT 32 #define DBG_APPS 33 #define DBG_LAUNCHD 34 +#define DBG_PERF 37 #define DBG_MIG 255 /* **** The Kernel Debug Sub Classes for Mach (DBG_MACH) **** */ @@ -137,12 +139,16 @@ __BEGIN_DECLS #define MACH_MOVED 0xb /* did not use original scheduling decision */ #define MACH_FAIRSHARE_ENTER 0xc /* move to fairshare band */ #define MACH_FAIRSHARE_EXIT 0xd /* exit fairshare band */ -#define MACH_FAILSAFE 0xe /* tripped fixed-pri/RT failsafe */ +#define MACH_FAILSAFE 0xe /* tripped fixed-pri/RT failsafe */ +#define MACH_BLOCK 0xf /* thread block */ +#define MACH_WAIT 0x10 /* thread wait assertion */ #define MACH_GET_URGENCY 0x14 /* Urgency queried by platform */ #define MACH_URGENCY 0x15 /* Urgency (RT/BG/NORMAL) communicated - * to platform */ + * to platform + */ #define MACH_REDISPATCH 0x16 /* "next thread" thread redispatched */ #define MACH_REMOTE_AST 0x17 /* AST signal issued to remote processor */ + #define MACH_SCHED_LPA_BROKEN 0x18 /* last_processor affinity broken in choose_processor */ /* Codes for pmap (DBG_MACH_PMAP) */ @@ -242,6 +248,8 @@ __BEGIN_DECLS #define DBG_DRVINFINIBAND 17 /* Infiniband */ #define DBG_DRVGRAPHICS 18 /* Graphics */ #define DBG_DRVSD 19 /* Secure Digital */ +#define DBG_DRVNAND 20 /* NAND drivers and layers */ +#define DBG_SSD 21 /* SSD */ /* Backwards compatibility */ #define DBG_DRVPOINTING DBG_DRVHID /* OBSOLETE: Use DBG_DRVHID instead */ @@ -263,6 +271,8 @@ __BEGIN_DECLS #define DBG_IOCTL 6 /* ioctl to the disk */ #define DBG_BOOTCACHE 7 /* bootcache operations */ #define DBG_HFS 8 /* HFS-specific events; see bsd/hfs/hfs_kdebug.h */ +#define DBG_EXFAT 0xE /* ExFAT-specific events; see the exfat project */ +#define DBG_MSDOS 0xF /* FAT-specific events; see the msdosfs project */ /* The Kernel Debug Sub Classes for BSD */ #define DBG_BSD_PROC 0x01 /* process/signals related */ @@ -284,6 +294,8 @@ __BEGIN_DECLS /* The Kernel Debug Sub Classes for DBG_CORESTORAGE */ #define DBG_CS_IO 0 +/* Sub-class codes for CoreGraphics (DBG_CG) are defined in its component. */ + /* The Kernel Debug Sub Classes for DBG_MISC */ #define DBG_EVENT 0x10 #define DBG_BUFFER 0x20 @@ -299,10 +311,12 @@ __BEGIN_DECLS #define DKIO_PAGING 0x10 #define DKIO_THROTTLE 0x20 #define DKIO_PASSIVE 0x40 +#define DKIO_NOCACHE 0x80 /* Codes for Application Sub Classes */ #define DBG_APP_SAMBA 128 + /**********************************************************************/ #define KDBG_CODE(Class, SubClass, code) (((Class & 0xff) << 24) | ((SubClass & 0xff) << 16) | ((code & 0x3fff) << 2)) @@ -335,6 +349,7 @@ __BEGIN_DECLS #define PMAP_CODE(code) MACHDBG_CODE(DBG_MACH_PMAP, code) + /* Usage: * kernel_debug((KDBG_CODE(DBG_NETWORK, DNET_PROTOCOL, 51) | DBG_FUNC_START), * offset, 0, 0, 0,0) @@ -366,41 +381,96 @@ extern unsigned int kdebug_enable; #define KDEBUG_ENABLE_TRACE 0x1 #define KDEBUG_ENABLE_ENTROPY 0x2 #define KDEBUG_ENABLE_CHUD 0x4 +#define KDEBUG_ENABLE_PPT 0x8 -#if (!defined(NO_KDEBUG)) -#ifdef XNU_KERNEL_PRIVATE +/* + * Infer the supported kernel debug event level from config option. + * Use (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) as a guard to protect + * unaudited debug code. 
+ */ +#define KDEBUG_LEVEL_NONE 0 +#define KDEBUG_LEVEL_IST 1 +#define KDEBUG_LEVEL_STANDARD 2 +#define KDEBUG_LEVEL_FULL 3 + +#if NO_KDEBUG +#define KDEBUG_LEVEL KDEBUG_LEVEL_NONE +#elif IST_KDEBUG +#define KDEBUG_LEVEL KDEBUG_LEVEL_IST +#elif KDEBUG +#define KDEBUG_LEVEL KDEBUG_LEVEL_FULL +#else +#define KDEBUG_LEVEL KDEBUG_LEVEL_STANDARD +#endif + +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) +#ifdef XNU_KERNEL_PRIVATE #define KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e) \ do { \ - if (__improbable(kdebug_enable)) \ + if (__improbable(kdebug_enable & ~KDEBUG_ENABLE_PPT)) \ kernel_debug(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ (uintptr_t)d,(uintptr_t)e); \ } while(0) #define KERNEL_DEBUG_CONSTANT1(x,a,b,c,d,e) \ do { \ - if (__improbable(kdebug_enable)) \ + if (__improbable(kdebug_enable & ~KDEBUG_ENABLE_PPT)) \ kernel_debug1(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ (uintptr_t)d,(uintptr_t)e); \ } while(0) #else /* XNU_KERNEL_PRIVATE */ #define KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e) \ do { \ - if (kdebug_enable) \ + if (kdebug_enable & ~KDEBUG_ENABLE_PPT) \ kernel_debug(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ (uintptr_t)d,(uintptr_t)e); \ } while(0) #define KERNEL_DEBUG_CONSTANT1(x,a,b,c,d,e) \ do { \ - if (kdebug_enable) \ + if (kdebug_enable & ~KDEBUG_ENABLE_PPT) \ kernel_debug1(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ (uintptr_t)d,(uintptr_t)e); \ } while(0) #endif /* XNU_KERNEL_PRIVATE */ -#else /*!NO_KDEBUG */ +#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ #define KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e) do { } while(0) #define KERNEL_DEBUG_CONSTANT1(x,a,b,c,d,e) do { } while(0) +#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ + +/* + * Specify KDEBUG_PPT to indicate that the event belongs to the + * limited PPT set. + */ +#define KDEBUG_COMMON (KDEBUG_ENABLE_TRACE|KDEBUG_ENABLE_ENTROPY|KDEBUG_ENABLE_CHUD|KDEBUG_ENABLE_PPT) +#define KDEBUG_TRACE (KDEBUG_ENABLE_TRACE|KDEBUG_ENABLE_ENTROPY|KDEBUG_ENABLE_CHUD) +#define KDEBUG_PPT (KDEBUG_ENABLE_PPT) +/* + * KERNEL_DEBUG_CONSTANT_IST events provide an audited subset of + * tracepoints for userland system tracing tools. 
+ */ +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) +#ifdef XNU_KERNEL_PRIVATE +#define KERNEL_DEBUG_CONSTANT_IST(type,x,a,b,c,d,e) \ +do { \ + if (__improbable(kdebug_enable & type)) \ + kernel_debug(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ + (uintptr_t)d,(uintptr_t)e); \ +} while(0) +#else /* XNU_KERNEL_PRIVATE */ +#define KERNEL_DEBUG_CONSTANT_IST(type,x,a,b,c,d,e) \ +do { \ + if (kdebug_enable & type) \ + kernel_debug(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ + (uintptr_t)d,(uintptr_t)e); \ +} while(0) +#endif /* XNU_KERNEL_PRIVATE */ +#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ +#define KERNEL_DEBUG_CONSTANT_IST(type,x,a,b,c,d,e) do { } while(0) +#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ + +#if NO_KDEBUG #define __kdebug_constant_only __unused #endif @@ -421,18 +491,18 @@ extern void kernel_debug1( uintptr_t arg5); -#if (KDEBUG && (!defined(NO_KDEBUG))) +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) #ifdef XNU_KERNEL_PRIVATE #define KERNEL_DEBUG(x,a,b,c,d,e) \ do { \ - if (__improbable(kdebug_enable)) \ + if (__improbable(kdebug_enable & ~KDEBUG_ENABLE_PPT)) \ kernel_debug((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ } while(0) #define KERNEL_DEBUG1(x,a,b,c,d,e) \ do { \ - if (__improbable(kdebug_enable)) \ + if (__improbable(kdebug_enable & ~KDEBUG_ENABLE_PPT)) \ kernel_debug1((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ } while(0) @@ -441,25 +511,24 @@ do { \ #else /* !XNU_KERNEL_PRIVATE */ #define KERNEL_DEBUG(x,a,b,c,d,e) \ do { \ - if (kdebug_enable) \ + if (kdebug_enable & ~KDEBUG_ENABLE_PPT) \ kernel_debug((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ } while(0) #define KERNEL_DEBUG1(x,a,b,c,d,e) \ do { \ - if (kdebug_enable) \ + if (kdebug_enable & ~KDEBUG_ENABLE_PPT) \ kernel_debug1((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ } while(0) #endif /* XNU_KERNEL_PRIVATE */ -#else - +#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) */ #define KERNEL_DEBUG(x,a,b,c,d,e) do {} while (0) #define KERNEL_DEBUG1(x,a,b,c,d,e) do {} while (0) #define __kdebug_only __unused -#endif +#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_FULL) */ #ifdef KERNEL_PRIVATE #include @@ -483,6 +552,8 @@ void trace_handler_unmap_bufinfo(void); void trace_handler_map_buffer(int index, uintptr_t addr, unsigned long size); void trace_handler_unmap_buffer(int index); void trace_set_timebases(uint64_t tsc, uint64_t ns); + + #endif /* KERNEL_PRIVATE */ @@ -571,6 +642,9 @@ kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu) } #endif +/* 2^16 bits (8 kilobytes), one for each possible class/subclass combination */ +#define KDBG_TYPEFILTER_BITMAP_SIZE ( (256 * 256) / 8 ) + /* Debug Flags */ #define KDBG_INIT 0x001 #define KDBG_NOWRAP 0x002 @@ -628,6 +702,8 @@ typedef struct { #define KDBG_RANGECHECK 0x100000 #define KDBG_VALCHECK 0x200000 /* Check up to 4 individual values */ +#define KDBG_TYPEFILTER_CHECK ((uint32_t) 0x400000) /* Check class and subclass against a bitmap */ + #define KDBG_BUFINIT 0x80000000 /* Control operations */ diff --git a/bsd/sys/kern_callout.h b/bsd/sys/kern_callout.h deleted file mode 100644 index 6ac7642cd..000000000 --- a/bsd/sys/kern_callout.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. 
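Returning briefly to the kdebug.h hunk above (the surrounding lines belong to the deleted kern_callout.h header): the new KERNEL_DEBUG_CONSTANT_IST macro gates on specific kdebug_enable bits rather than on any bit. A sketch of an audited tracepoint; vp and offset stand in for whatever locals a real call site would log, and the code point 0x10 is arbitrary:

    /* Fires only when a bit named by KDEBUG_TRACE is set in kdebug_enable. */
    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
        KDBG_CODE(DBG_FSYSTEM, DBG_HFS, 0x10) | DBG_FUNC_NONE,
        (uintptr_t)vp, (uintptr_t)offset, 0, 0, 0);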
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef KPI_KERN_CALLOUT_H -#define KPI_KERN_CALLOUT_H - -#ifdef KERNEL - -/* - * Default sample threshold for validity - */ -#define MA_SMA_SAMPLES 10 /* simple moving average */ - -/* - * Flags bits for the ma_flags field - */ -#define KCO_MA_F_SMA 0x00000001 /* Simple moving average */ -#define KCO_MA_F_WMA 0x00000002 /* Weighted moving average */ -#define KCO_MA_F_NEEDS_INIT 0x80000000 /* Need initialization */ - -struct kco_moving_average { - int ma_flags; /* flags */ - uint64_t ma_sma; /* simple over MA_SMA_SAMPLES*/ - uint64_t ma_old_sma; /* previous value */ - uint64_t ma_sma_samples[MA_SMA_SAMPLES]; /* sample history */ - int32_t ma_sma_threshold; /* trigger delta (%) */ - int ma_sma_trigger_count; /* number of time triggered */ - uint64_t ma_wma; /* weighted */ - uint64_t ma_old_wma; /* previous value */ - int ma_wma_weight; /* weighting (< 100) */ - int32_t ma_wma_threshold; /* trigger delta (%) */ - int ma_wma_trigger_count; /* number of time triggered */ -}; - -__BEGIN_DECLS -int kco_ma_addsample(struct kco_moving_average *map, uint64_t sample_time); -void kco_ma_init(struct kco_moving_average *map, int32_t threshold, int kind); -int kco_ma_info(struct kco_moving_average *map, int kind, uint64_t *averagep, uint64_t *old_averagep, int32_t *thresholdp, int *countp); -__END_DECLS - -#endif /* KERNEL */ - -#endif /* KPI_KERN_CONTROL_H */ diff --git a/bsd/sys/kern_memorystatus.h b/bsd/sys/kern_memorystatus.h index 4a05a490f..66fabee01 100644 --- a/bsd/sys/kern_memorystatus.h +++ b/bsd/sys/kern_memorystatus.h @@ -25,19 +25,33 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/*! 
- @header kern_memorystatus.h - This header defines a kernel event subclass for the OSMemoryNotification API - */ -#ifndef SYS_KERN_MEMORYSTATUS_H -#define SYS_KERN_MEMORYSTATUS_H - -#ifndef MACH_KERNEL_PRIVATE +#ifndef SYS_MEMORYSTATUS_H +#define SYS_MEMORYSTATUS_H #include #include #include +#include + +#define DEFAULT_JETSAM_PRIORITY -100 + +enum { + kMemorystatusFlagsFrontmost = (1 << 0), + kMemorystatusFlagsKilled = (1 << 1), + kMemorystatusFlagsKilledHiwat = (1 << 2), + kMemorystatusFlagsFrozen = (1 << 3), + kMemorystatusFlagsKilledVnodes = (1 << 4), + kMemorystatusFlagsKilledSwap = (1 << 5), + kMemorystatusFlagsThawed = (1 << 6), + kMemorystatusFlagsKilledVM = (1 << 7), + kMemorystatusFlagsSuspForDiagnosis = (1 << 8), + kMemorystatusFlagsActive = (1 << 9), + kMemorystatusFlagsSupportsIdleExit = (1 << 10), + kMemorystatusFlagsDirty = (1 << 11) +}; + +#if TARGET_OS_EMBEDDED || CONFIG_EMBEDDED /* * Define Memory Status event subclass. @@ -51,107 +65,205 @@ #define KEV_MEMORYSTATUS_SUBCLASS 3 enum { - kMemoryStatusLevelNote = 1, - kMemoryStatusSnapshotNote = 2, - kMemoryStatusHibernationNote = 3 + kMemorystatusLevelNote = 1, + kMemorystatusSnapshotNote = 2, + kMemorystatusFreezeNote = 3, + kMemorystatusPressureNote = 4 }; enum { - kMemoryStatusLevelAny = -1, - kMemoryStatusLevelNormal = 0, - kMemoryStatusLevelWarning = 1, - kMemoryStatusLevelUrgent = 2, - kMemoryStatusLevelCritical = 3 + kMemorystatusLevelAny = -1, + kMemorystatusLevelNormal = 0, + kMemorystatusLevelWarning = 1, + kMemorystatusLevelUrgent = 2, + kMemorystatusLevelCritical = 3 }; -typedef struct jetsam_priority_entry { +typedef struct memorystatus_priority_entry { pid_t pid; uint32_t flags; int32_t hiwat_pages; - int32_t hiwat_reserved1; - int32_t hiwat_reserved2; - int32_t hiwat_reserved3; -} jetsam_priority_entry_t; - -/* -** maximum killable processes to keep track of -*/ -#define kMaxPriorityEntries 64 - -typedef struct jetsam_snapshot_entry { - pid_t pid; - char name[MAXCOMLEN+1]; - uint32_t pages; - uint32_t flags; - uint8_t uuid[16]; -} jetsam_snapshot_entry_t; + int32_t priority; + int32_t reserved; + int32_t reserved2; +} memorystatus_priority_entry_t; /* ** how many processes to snapshot */ #define kMaxSnapshotEntries 128 -typedef struct jetsam_kernel_stats { +typedef struct memorystatus_kernel_stats { uint32_t free_pages; uint32_t active_pages; uint32_t inactive_pages; + uint32_t throttled_pages; uint32_t purgeable_pages; uint32_t wired_pages; -} jetsam_kernel_stats_t; +} memorystatus_kernel_stats_t; /* ** This is a variable-length struct. 
-** Allocate a buffer of the size returned by the sysctl, cast to a jetsam_snapshot_t * +** Allocate a buffer of the size returned by the sysctl, cast to a memorystatus_snapshot_t * */ +typedef struct jetsam_snapshot_entry { + pid_t pid; + char name[MAXCOMLEN+1]; + int32_t priority; + uint32_t pages; + uint32_t flags; + uint8_t uuid[16]; +} memorystatus_jetsam_snapshot_entry_t; + typedef struct jetsam_snapshot { - jetsam_kernel_stats_t stats; + uint64_t snapshot_time; + uint64_t notification_time; + memorystatus_kernel_stats_t stats; size_t entry_count; - jetsam_snapshot_entry_t entries[1]; -} jetsam_snapshot_t; + memorystatus_jetsam_snapshot_entry_t entries[1]; +} memorystatus_jetsam_snapshot_t; + +typedef memorystatus_priority_entry_t jetsam_priority_entry_t; +typedef memorystatus_jetsam_snapshot_t jetsam_snapshot_t; +typedef memorystatus_jetsam_snapshot_entry_t jetsam_snapshot_entry_t; + +#define kMemoryStatusLevelNote kMemorystatusLevelNote +#define kMemoryStatusSnapshotNote kMemorystatusSnapshotNote +#define kMemoryStatusFreezeNote kMemorystatusFreezeNote +#define kMemoryStatusPressureNote kMemorystatusPressureNote -typedef struct jetsam_hibernation_entry { - uint32_t pid; +typedef struct memorystatus_freeze_entry { + int32_t pid; uint32_t flags; uint32_t pages; -} jetsam_hibernation_entry_t; +} memorystatus_freeze_entry_t; + +#endif /* TARGET_OS_EMBEDDED */ + +#ifdef XNU_KERNEL_PRIVATE + +/* General tunables */ -#endif /* !MACH_KERNEL_PRIVATE */ +#define DELTA_PERCENT 5 +#define CRITICAL_PERCENT 5 +#define HIGHWATER_PERCENT 10 +#define PRESSURE_PERCENT 15 +#define FREEZE_PERCENT 50 + +#define POLICY_MORE_FREE_OFFSET_PERCENT 5 +#define POLICY_DIAGNOSTIC_OFFSET_PERCENT 5 + +#define IDLE_EXIT_TIME_SECS 10 enum { - kJetsamFlagsFrontmost = (1 << 0), - kJetsamFlagsKilled = (1 << 1), - kJetsamFlagsKilledHiwat = (1 << 2), - kJetsamFlagsHibernated = (1 << 3), - kJetsamFlagsKilledVnodes = (1 << 4), - kJetsamFlagsKilledSwap = (1 << 5), - kJetsamFlagsThawed = (1 << 6), - kJetsamFlagsKilledVM = (1 << 7), - kJetsamFlagsSuspForDiagnosis = (1 << 8) + kProcessSuspended = (1 << 0), + kProcessFrozen = (1 << 1), + kProcessNoReclaimWorth = (1 << 2), + kProcessIgnored = (1 << 3), + kProcessLocked = (1 << 4), + kProcessKilled = (1 << 5), + kProcessNotifiedForPressure = (1 << 6), + kProcessPriorityUpdated = (1 << 7), + kProcessActive = (1 << 8), + kProcessForeground = (1 << 9), + kProcessSuspendedForDiag = (1 << 10), + kProcessSupportsIdleExit = (1 << 11), + kProcessDirty = (1 << 12), + kProcessIgnoreIdleExit = (1 << 13) }; -#ifdef KERNEL -extern void kern_memorystatus_init(void) __attribute__((section("__TEXT, initcode"))); -extern int jetsam_kill_top_proc(boolean_t any, uint32_t reason); +typedef struct memorystatus_node { + TAILQ_ENTRY(memorystatus_node) link; + pid_t pid; + int32_t priority; + uint32_t state; +#if CONFIG_JETSAM + int32_t hiwat_pages; +#endif +#if CONFIG_FREEZE + uint32_t resident_pages; +#endif + uint64_t clean_time; +} memorystatus_node; + +extern int memorystatus_wakeup; +extern unsigned int memorystatus_running; + +extern unsigned int memorystatus_available_pages; +extern unsigned int memorystatus_available_pages_critical; +extern unsigned int memorystatus_level; +extern unsigned int memorystatus_delta; + +extern void memorystatus_init(void) __attribute__((section("__TEXT, initcode"))); + +extern kern_return_t memorystatus_list_add(int pid, int priority, int high_water_mark); +extern kern_return_t memorystatus_list_change(boolean_t effective, int pid, int priority, int state_flags, 
int high_water_mark); +extern kern_return_t memorystatus_list_remove(int pid); + +extern kern_return_t memorystatus_on_track_dirty(int pid, boolean_t track); +extern kern_return_t memorystatus_on_dirty(int pid, boolean_t dirty); + +extern void memorystatus_on_suspend(int pid); +extern void memorystatus_on_resume(int pid); +extern void memorystatus_on_inactivity(int pid); + +#if CONFIG_JETSAM + +typedef enum memorystatus_policy_t { + kPolicyDefault = 0x0, + kPolicyMoreFree = 0x1, + kPolicyDiagnoseAll = 0x2, + kPolicyDiagnoseFirst = 0x4, + kPolicyDiagnoseActive = (kPolicyDiagnoseAll | kPolicyDiagnoseFirst), +} memorystatus_policy_t; + +extern int memorystatus_jetsam_wakeup; +extern unsigned int memorystatus_jetsam_running; -extern int kern_memorystatus_wakeup; -extern int kern_memorystatus_level; -extern unsigned int kern_memorystatus_delta; +extern int memorystatus_kill_top_proc(boolean_t any, uint32_t reason); +extern int memorystatus_kill_top_proc_from_VM(void); + +extern void memorystatus_update(unsigned int pages_avail); + +#if VM_PRESSURE_EVENTS + +#define MEMORYSTATUS_SUSPENDED_THRESHOLD 4 + +extern int memorystatus_request_vm_pressure_candidate(void); +extern void memorystatus_send_pressure_note(int pid); + +#endif /* VM_PRESSURE_EVENTS */ + +#endif /* CONFIG_JETSAM */ #ifdef CONFIG_FREEZE -extern void kern_hibernation_init(void) __attribute__((section("__TEXT, initcode"))); -extern int kern_hibernation_wakeup; -void kern_hibernation_on_pid_suspend(int pid); -void kern_hibernation_on_pid_resume(int pid, task_t task); -void kern_hibernation_on_pid_hibernate(int pid); -#endif +#define FREEZE_PAGES_MIN ( 1 * 1024 * 1024 / PAGE_SIZE) +#define FREEZE_PAGES_MAX (16 * 1024 * 1024 / PAGE_SIZE) -#if CONFIG_EMBEDDED -#define VM_CHECK_MEMORYSTATUS do { vm_check_memorystatus(); } while(0) -#else /*CONFIG_EMBEDDED*/ -#define VM_CHECK_MEMORYSTATUS do {} while(0) -#endif +#define FREEZE_SUSPENDED_THRESHOLD_LOW 2 +#define FREEZE_SUSPENDED_THRESHOLD_DEFAULT 4 + +#define FREEZE_DAILY_MB_MAX 1024 +#define FREEZE_DAILY_PAGEOUTS_MAX (FREEZE_DAILY_MB_MAX * (1024 * 1024 / PAGE_SIZE)) + +typedef struct throttle_interval_t { + uint32_t mins; + uint32_t burst_multiple; + uint32_t pageouts; + uint32_t max_pageouts; + mach_timespec_t ts; + boolean_t throttle; +} throttle_interval_t; + +extern boolean_t memorystatus_freeze_enabled; +extern int memorystatus_freeze_wakeup; + +extern void memorystatus_freeze_init(void) __attribute__((section("__TEXT, initcode"))); + +#endif /* CONFIG_FREEZE */ + +#endif /* XNU_KERNEL_PRIVATE */ -#endif /* KERNEL */ -#endif /* SYS_KERN_MEMORYSTATUS_H */ +#endif /* SYS_MEMORYSTATUS_H */ diff --git a/bsd/sys/kpi_mbuf.h b/bsd/sys/kpi_mbuf.h index 24239b9f4..3cb4c787d 100644 --- a/bsd/sys/kpi_mbuf.h +++ b/bsd/sys/kpi_mbuf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2010 Apple Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1363,7 +1363,7 @@ typedef enum { #ifdef XNU_KERNEL_PRIVATE MBUF_TC_UNSPEC = -1, /* Internal: not specified */ #endif - MBUF_TC_BE = 0, + MBUF_TC_BE = 0, MBUF_TC_BK = 1, MBUF_TC_VI = 2, MBUF_TC_VO = 3 @@ -1385,11 +1385,133 @@ extern mbuf_traffic_class_t mbuf_get_traffic_class(mbuf_t mbuf); @function mbuf_set_traffic_class @discussion Set the traffic class of an mbuf packet. @param mbuf The mbuf to set the traffic class on. 
- @ac The traffic class + @tc The traffic class @result 0 on success, EINVAL if bad paramater is passed */ extern errno_t mbuf_set_traffic_class(mbuf_t mbuf, mbuf_traffic_class_t tc); +/*! + @function mbuf_is_traffic_class_privileged + @discussion Returns the privileged status of the traffic class + of the packet specified by the mbuf. + @param mbuf The mbuf to retrieve the status from. + @result Non-zero if privileged, 0 otherwise. + */ +extern int mbuf_is_traffic_class_privileged(mbuf_t mbuf); + +#ifdef KERNEL_PRIVATE +/*! + @enum mbuf_svc_class_t + @abstract Service class of a packet + @discussion Property that represents the category of service + of a packet. This information may be used by the driver + and at the link level. + @constant MBUF_SC_BK_SYS "Background System-Initiated", high delay + tolerant, high loss tolerant, elastic flow, variable size & + long-lived. + @constant MBUF_SC_BK "Background", user-initiated, high delay tolerant, + high loss tolerant, elastic flow, variable size. This level + corresponds to WMM access class "BG", or MBUF_TC_BK. + @constant MBUF_SC_BE "Best Effort", unclassified/standard. This is + the default service class; pretty much a mix of everything. + This level corresponds to WMM access class "BE" or MBUF_TC_BE. + @constant MBUF_SC_RD + "Responsive Data", a notch higher than "Best Effort", medium + delay tolerant, medium loss tolerant, elastic flow, bursty, + long-lived. + @constant MBUF_SC_OAM "Operations, Administration, and Management", + medium delay tolerant, low-medium loss tolerant, elastic & + inelastic flows, variable size. + @constant MBUF_SC_AV "Multimedia Audio/Video Streaming", medium delay + tolerant, low-medium loss tolerant, elastic flow, constant + packet interval, variable rate & size. + @constant MBUF_SC_RV "Responsive Multimedia Audio/Video", low delay + tolerant, low-medium loss tolerant, elastic flow, variable + packet interval, rate and size. + @constant MBUF_SC_VI "Interactive Video", low delay tolerant, low- + medium loss tolerant, elastic flow, constant packet interval, + variable rate & size. This level corresponds to WMM access + class "VI" or MBUF_TC_VI. + @constant MBUF_SC_VO "Interactive Voice", low delay tolerant, low loss + tolerant, inelastic flow, constant packet rate, somewhat fixed + size. This level corresponds to WMM access class "VO" or + MBUF_TC_VO. + @constant MBUF_SC_CTL "Network Control", low delay tolerant, low loss + tolerant, inelastic flow, rate is short & burst, variable size. +*/ +typedef enum { +#ifdef XNU_KERNEL_PRIVATE + MBUF_SC_UNSPEC = -1, /* Internal: not specified */ +#endif + MBUF_SC_BK_SYS = 0x00080090, /* lowest class */ + MBUF_SC_BK = 0x00100080, + + MBUF_SC_BE = 0x00000000, + MBUF_SC_RD = 0x00180010, + MBUF_SC_OAM = 0x00200020, + + MBUF_SC_AV = 0x00280120, + MBUF_SC_RV = 0x00300110, + MBUF_SC_VI = 0x00380100, + + MBUF_SC_VO = 0x00400180, + MBUF_SC_CTL = 0x00480190, /* highest class */ +} mbuf_svc_class_t; + +/*! + @function mbuf_get_service_class + @discussion Get the service class of an mbuf packet + @param mbuf The mbuf to get the service class of. + @result The service class +*/ +extern mbuf_svc_class_t mbuf_get_service_class(mbuf_t mbuf); + +/*! + @function mbuf_set_service_class + @discussion Set the service class of an mbuf packet. + @param mbuf The mbuf to set the service class on. + @sc The service class + @result 0 on success, EINVAL if bad parameter is passed +*/ +extern errno_t mbuf_set_service_class(mbuf_t mbuf, mbuf_svc_class_t sc); + +/*!
+ @function mbuf_is_service_class_privileged + @discussion Returns the privileged status of the service class + of the packet specified by the mbuf. + @param mbuf The mbuf to retrieve the status from. + @result Non-zero if privileged, 0 otherwise. + */ +extern int mbuf_is_service_class_privileged(mbuf_t mbuf); + +/* + @enum mbuf_pkthdr_aux_flags_t + @abstract Constants defining mbuf auxiliary flags. Only the flags + listed below can be retrieved. + @constant MBUF_PKTAUXF_INET_RESOLVE_RTR Indicates this is an ARP + request packet, whose target is the address of the default + IPv4 router. + @constant MBUF_PKTAUXF_INET6_RESOLVE_RTR Indicates this is an ICMPv6 + Neighbor Solicitation packet, whose target is the address of + the default IPv6 router. + */ +enum { + MBUF_PKTAUXF_INET_RESOLVE_RTR = 0x0004, + MBUF_PKTAUXF_INET6_RESOLVE_RTR = 0x0008, +}; +typedef u_int32_t mbuf_pkthdr_aux_flags_t; + +/* + @function mbuf_pkthdr_aux_flags + @discussion Returns the auxiliary flags of a packet. + @param mbuf The mbuf containing the packet header. + @param paux_flags Pointer to mbuf_pkthdr_aux_flags_t variable. + @result 0 upon success otherwise the errno error. +*/ +extern errno_t mbuf_pkthdr_aux_flags(mbuf_t mbuf, + mbuf_pkthdr_aux_flags_t *paux_flags); +#endif /* KERNEL_PRIVATE */ + /* IF_QUEUE interaction */ #define IF_ENQUEUE_MBUF(ifq, m) { \ diff --git a/bsd/sys/kpi_socketfilter.h b/bsd/sys/kpi_socketfilter.h index e5ace3c03..10ab4323c 100644 --- a/bsd/sys/kpi_socketfilter.h +++ b/bsd/sys/kpi_socketfilter.h @@ -68,11 +68,15 @@ struct sockaddr; option. @constant SFLT_EXTENDED Indicates that this socket filter utilizes the extended fields within the sflt_filter structure. + @constant SFLT_EXTENDED_REGISTRY Indicates that this socket filter + wants to attach to all the sockets already present on the + system. It will also receive notifications for these sockets. */ enum { SFLT_GLOBAL = 0x01, SFLT_PROG = 0x02, - SFLT_EXTENDED = 0x04 + SFLT_EXTENDED = 0x04, + SFLT_EXTENDED_REGISTRY = 0x08 }; typedef u_int32_t sflt_flags; diff --git a/bsd/sys/lockf.h b/bsd/sys/lockf.h index ffa779573..4cbefc091 100644 --- a/bsd/sys/lockf.h +++ b/bsd/sys/lockf.h @@ -108,6 +108,7 @@ __BEGIN_DECLS int lf_advlock(struct vnop_advlock_args *); int lf_assert(struct vnop_advlock_args *, void **); void lf_commit(void *, int); +void lf_abort_advlocks(vnode_t); #ifdef LOCKF_DEBUG void lf_print(char *, struct lockf *); diff --git a/bsd/sys/lockstat.h b/bsd/sys/lockstat.h index 74b5ee6a0..a9e536d7a 100644 --- a/bsd/sys/lockstat.h +++ b/bsd/sys/lockstat.h @@ -90,6 +90,7 @@ extern "C" { #define LS_LCK_RW_LOCK_EXCL_TO_SHARED_ILK_SPIN 38 #define LS_NPROBES 40 +#define LS_LCK_INVALID LS_NPROBES /* * Name the various locking functions... @@ -168,9 +169,13 @@ extern void (*lockstat_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); + #ifdef _KERNEL #if CONFIG_DTRACE + +extern void (lockstat_probe_wrapper)(int, uintptr_t, int); + /* * Macros to record lockstat probes. 
*/ diff --git a/bsd/sys/make_posix_availability.sh b/bsd/sys/make_posix_availability.sh index 5aa58b364..3fb652b5b 100755 --- a/bsd/sys/make_posix_availability.sh +++ b/bsd/sys/make_posix_availability.sh @@ -22,6 +22,17 @@ # @APPLE_OSREFERENCE_LICENSE_HEADER_END@ # +function usage() { + echo "Usage: $0 " 1>&2 + exit 1 +} + +if [ $# -ne 1 ]; then + usage +fi + +OUTPUT="$1" + POSIX_VALUES="198808L 199009L 199209L 199309L 199506L 200112L 200809L" { @@ -67,5 +78,5 @@ for value in ${POSIX_VALUES} ; do echo "#endif" echo done -} > $1 +} > "$OUTPUT" diff --git a/bsd/sys/make_symbol_aliasing.sh b/bsd/sys/make_symbol_aliasing.sh index fa5f0e33c..a4a9b881f 100755 --- a/bsd/sys/make_symbol_aliasing.sh +++ b/bsd/sys/make_symbol_aliasing.sh @@ -22,6 +22,23 @@ # @APPLE_OSREFERENCE_LICENSE_HEADER_END@ # +function usage() { + echo "Usage: $0 " 1>&2 + exit 1 +} + +if [ $# -ne 2 ]; then + usage +fi + +SDKROOT="$1" +OUTPUT="$2" + +if [ ! -x "${SDKROOT}/usr/local/libexec/availability.pl" ] ; then + echo "Unable to locate ${SDKROOT}/usr/local/libexec/availability.pl (or not executable)" >&2 + exit 1 +fi + { cat < $1 +} > "$OUTPUT" diff --git a/bsd/sys/malloc.h b/bsd/sys/malloc.h index 4e8688735..80883f08b 100644 --- a/bsd/sys/malloc.h +++ b/bsd/sys/malloc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -192,9 +192,9 @@ #define M_FILEGLOB 99 /* fileglobal */ #define M_KAUTH 100 /* kauth subsystem */ #define M_DUMMYNET 101 /* dummynet */ -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL #define M_UNSAFEFS 102 /* storage for vnode lock state for unsafe FS */ -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ #define M_MACPIPELABEL 103 /* MAC pipe labels */ #define M_MACTEMP 104 /* MAC framework */ #define M_SBUF 105 /* string buffers */ diff --git a/bsd/sys/mbuf.h b/bsd/sys/mbuf.h index f0d45c565..e3ad20c4a 100644 --- a/bsd/sys/mbuf.h +++ b/bsd/sys/mbuf.h @@ -87,10 +87,6 @@ #include #include -#if PF_PKTHDR -#include -#endif /* PF_PKTHDR */ - /* * Mbufs are of a single size, MSIZE (machine/param.h), which * includes overhead. An mbuf may add a single "mbuf cluster" of size @@ -151,21 +147,69 @@ struct m_tag { u_int32_t m_tag_id; /* Module ID */ }; -#ifdef __LP64__ -#define M_TAG_ALIGN(len) \ - P2ROUNDUP(len, sizeof (u_int64_t)) + sizeof (struct m_tag) -#else #define M_TAG_ALIGN(len) \ - P2ROUNDUP(len, sizeof (u_int32_t)) + sizeof (struct m_tag) -#endif /* !__LP64__ */ + (P2ROUNDUP(len, sizeof (u_int64_t)) + sizeof (struct m_tag)) #define M_TAG_VALID_PATTERN 0xfeedfacefeedfaceULL #define M_TAG_FREE_PATTERN 0xdeadbeefdeadbeefULL +/* + * Packet tag header structure (at the top of mbuf). Pointers are + * 32-bit in ILP32; m_tag needs 64-bit alignment, hence padded. 
+ */ struct m_taghdr { +#ifndef __LP64__ + u_int32_t pad; /* For structure alignment */ +#endif /* !__LP64__ */ u_int64_t refcnt; /* Number of tags in this mbuf */ }; +/* Values for pftag_flags */ +#define PF_TAG_GENERATED 0x000001 /* pkt generated by PF */ +#define PF_TAG_FRAGCACHE 0x000002 +#define PF_TAG_TRANSLATE_LOCALHOST 0x000004 +#define PF_TAG_FLOWHASH 0x000100 /* valid flowhash value */ +#define PF_TAG_HDR_INET 0x000200 /* hdr points to IPv4 */ +#define PF_TAG_HDR_INET6 0x000400 /* hdr points to IPv6 */ +#define PF_TAG_TCP 0x000800 /* payload is TCP */ +#define PF_TAG_FLOWADV 0x010000 /* local flow advisory */ +#define PF_TAG_QUEUE1 0x100000 /* queue-specific */ + +#define IF_PKTSEQ_SHIFT 4 + +/* PF mbuf tag */ +struct pf_mtag { + void *pftag_hdr; /* saved hdr pos in mbuf, for ECN */ + unsigned int pftag_rtableid; /* alternate routing table id */ + union { + struct { + u_int32_t qid; + union { + u_int8_t val8[4]; + u_int16_t val16[2]; + u_int32_t val32; + } __qpriv_u; /* for queue-specific use */ + } __pf_data; + u_int64_t pktseq; + } __pfifseq_u; /* Used for pf or interface bandwidth measurement */ +#define pftag_qid __pfifseq_u.__pf_data.qid +#define pftag_qpriv8 __pfifseq_u.__pf_data.__qpriv_u.val8 +#define pftag_qpriv16 __pfifseq_u.__pf_data.__qpriv_u.val16 +#define pftag_qpriv32 __pfifseq_u.__pf_data.__qpriv_u.val32 +#define pftag_pktseq __pfifseq_u.pktseq + u_int32_t pftag_flowhash; + u_int16_t pftag_tag; + u_int16_t pftag_routed; + u_int32_t pftag_flags; /* PF_TAG flags */ +}; + +/* TCP specific mbuf tag */ +struct tcp_mtag { + u_int tm_tso_segz; /* TSO segment size (actual MSS) */ + u_int16_t tm_pktlen; /* LRO - max segment size encountered */ + u_int16_t tm_npkts; /* LRO - number of coalesced TCP pkts */ +}; + /* record/packet header in first mbuf of chain; valid if M_PKTHDR set */ struct pkthdr { int len; /* total packet length */ @@ -177,24 +221,21 @@ struct pkthdr { /* Note: csum_flags is used for hardware checksum and VLAN */ int csum_flags; /* flags regarding checksum */ int csum_data; /* data field used by csum routines */ - u_int tso_segsz; /* TSO segment size (actual MSS) */ u_short vlan_tag; /* VLAN tag, host byte order */ u_short socket_id; /* socket id */ SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ -#if PF_PKTHDR - /* - * Be careful; {en,dis}abling PF_PKTHDR will require xnu recompile; - * private code outside of xnu must use mbuf_get_mhlen() instead - * of MHLEN. 
- */ - struct pf_mtag pf_mtag; -#endif /* PF_PKTHDR */ - u_int32_t prio; /* packet priority */ - u_short vt_nrecs; /* # of IGMPv3 records in this chain */ - u_short _pad; + struct pf_mtag pf_mtag; /* built-in PF tag */ +#define m_flowhash pf_mtag.pftag_flowhash +#define m_fhflags pf_mtag.pftag_flags + u_int32_t svc; /* MBUF_SVC value */ + u_int16_t vt_nrecs; /* # of IGMPv3/MLDv2 records */ + u_int16_t aux_flags; /* auxiliary packet flags */ + struct tcp_mtag tcp_mtag; /* tcp related data */ +#define tso_segsz tcp_mtag.tm_tso_segz +#define lro_pktlen tcp_mtag.tm_pktlen +#define lro_npkts tcp_mtag.tm_npkts }; - /* description of external storage mapped into mbuf, valid if M_EXT set */ struct m_ext { caddr_t ext_buf; /* start of buffer */ @@ -238,6 +279,8 @@ struct mbuf { #define m_ext M_dat.MH.MH_dat.MH_ext #define m_pktdat M_dat.MH.MH_dat.MH_databuf #define m_dat M_dat.M_databuf +#define m_pktlen(_m) ((_m)->m_pkthdr.len) +#define m_pftag(_m) (&(_m)->m_pkthdr.pf_mtag) /* mbuf flags (private) */ #define M_EXT 0x0001 /* has associated external storage */ @@ -271,7 +314,7 @@ struct mbuf { M_LOOP|M_PROTO5|M_BCAST|M_MCAST|M_FRAG | \ M_FIRSTFRAG|M_LASTFRAG|M_PROMISC|M_HASFCS) -/* flags indicating hw checksum support and sw checksum requirements [freebsd4.1] */ +/* flags indicating hw checksum support and sw checksum requirements */ #define CSUM_IP 0x0001 /* will csum IP */ #define CSUM_TCP 0x0002 /* will csum TCP */ #define CSUM_UDP 0x0004 /* will csum UDP */ @@ -303,6 +346,18 @@ struct mbuf { /* TCP Segment Offloading requested on this mbuf */ #define CSUM_TSO_IPV4 0x100000 /* This mbuf needs to be segmented by the NIC */ #define CSUM_TSO_IPV6 0x200000 /* This mbuf needs to be segmented by the NIC */ + +/* + * Auxiliary packet flags. Unlike m_flags, all auxiliary flags are copied + * along when copying m_pkthdr, i.e. no equivalent of M_COPYFLAGS here. + * Note that this flag is 16-bit wide. + */ +#define MAUXF_PRIO_PRIVILEGED 0x0001 /* packet priority is privileged */ +#define MAUXF_PROXY_DST 0x0002 /* processed but not locally destined */ +#define MAUXF_INET_RESOLVE_RTR 0x0004 /* pkt is for resolving IPv4 router */ +#define MAUXF_INET6_RESOLVE_RTR 0x0008 /* pkt is for resolving IPv6 router */ +#define MAUXF_SW_LRO_PKT 0x0010 /* pkt is a large coalesced pkt */ +#define MAUXF_SW_LRO_DID_CSUM 0x0020 /* IP and TCP checksums done by LRO*/ #endif /* XNU_KERNEL_PRIVATE */ /* mbuf types */ @@ -402,6 +457,8 @@ union m16kcluster { */ #define M_COPY_PKTHDR(to, from) m_copy_pkthdr(to, from) +#define M_COPY_PFTAG(to, from) m_copy_pftag(to, from) + /* * Set the m_data pointer of a newly-allocated mbuf (m_get/MGET) to place * an object of the specified size at the end of the mbuf, longword aligned. @@ -455,7 +512,7 @@ do { \ /* * M_STRUCT_GET ensures that intermediate protocol header (from "off" to - * "len") is located in single mbuf, on contiguous memory region. + * "off+len") is located in single mbuf, on contiguous memory region. * The pointer to the region will be returned to pointer variable "val", * with type "typ". 
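A sketch of the M_STRUCT_GET pattern this comment describes; m and off are hypothetical locals, and the argument order (val, typ, m, off, len) is inferred from the parameter names used in the macro body:

    /* Obtain a contiguous pointer to a header at offset off in chain m.
     * On failure m_pulldown has consumed the chain, and both the value
     * pointer and m are set to NULL. */
    struct udphdr *uh;
    M_STRUCT_GET(uh, struct udphdr *, m, off, sizeof (*uh));
    if (uh == NULL)
        return;   /* chain is gone; do not touch m */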
* @@ -486,14 +543,14 @@ do { \ do { \ struct mbuf *t; \ \ - if ((off) == 0) { \ - (val) = (typ)mtod(m, caddr_t); \ + if ((off) == 0 && ((m)->m_len >= (len))) { \ + (val) = (typ)(void *)mtod(m, caddr_t); \ } else { \ t = m_pulldown((m), (off), (len), NULL); \ if (t != NULL) { \ if (t->m_len < (len)) \ panic("m_pulldown malfunction"); \ - (val) = (typ)mtod(t, caddr_t); \ + (val) = (typ)(void *)mtod(t, caddr_t); \ } else { \ (val) = (typ)NULL; \ (m) = NULL; \ @@ -508,7 +565,7 @@ do { \ m->m_len > ((njcl > 0) ? njclbytes : MBIGCLBYTES) || \ m->m_type == MT_FREE || \ ((m->m_flags & M_EXT) != 0 && m->m_ext.ext_buf == NULL)) { \ - panic("Failed mbuf validity check: mbuf %p len %d " \ + panic_plain("Failed mbuf validity check: mbuf %p len %d " \ "type %d flags 0x%x data %p rcvif %s%d ifflags 0x%x", \ m, m->m_len, m->m_type, m->m_flags, \ ((m->m_flags & M_EXT) ? m->m_ext.ext_buf : m->m_data), \ @@ -517,6 +574,92 @@ do { \ } \ } while (0) +/* + * Simple mbuf queueing system + * + * This is basically a SIMPLEQ adapted to mbuf use (i.e. using + * m_nextpkt instead of field.sqe_next). + * + * m_next is ignored, so queueing chains of mbufs is possible + */ +#define MBUFQ_HEAD(name) \ +struct name { \ + struct mbuf *mq_first; /* first packet */ \ + struct mbuf **mq_last; /* addr of last next packet */ \ +} + +#define MBUFQ_INIT(q) do { \ + MBUFQ_FIRST(q) = NULL; \ + (q)->mq_last = &MBUFQ_FIRST(q); \ +} while (0) + +#define MBUFQ_PREPEND(q, m) do { \ + if ((MBUFQ_NEXT(m) = MBUFQ_FIRST(q)) == NULL) \ + (q)->mq_last = &MBUFQ_NEXT(m); \ + MBUFQ_FIRST(q) = (m); \ +} while (0) + +#define MBUFQ_ENQUEUE(q, m) do { \ + MBUFQ_NEXT(m) = NULL; \ + *(q)->mq_last = (m); \ + (q)->mq_last = &MBUFQ_NEXT(m); \ +} while (0) + +#define MBUFQ_ENQUEUE_MULTI(q, m, n) do { \ + MBUFQ_NEXT(n) = NULL; \ + *(q)->mq_last = (m); \ + (q)->mq_last = &MBUFQ_NEXT(n); \ +} while (0) + +#define MBUFQ_DEQUEUE(q, m) do { \ + if (((m) = MBUFQ_FIRST(q)) != NULL) { \ + if ((MBUFQ_FIRST(q) = MBUFQ_NEXT(m)) == NULL) \ + (q)->mq_last = &MBUFQ_FIRST(q); \ + else \ + MBUFQ_NEXT(m) = NULL; \ + } \ +} while (0) + +#define MBUFQ_REMOVE(q, m) do { \ + if (MBUFQ_FIRST(q) == (m)) { \ + MBUFQ_DEQUEUE(q, m); \ + } else { \ + struct mbuf *_m = MBUFQ_FIRST(q); \ + while (MBUFQ_NEXT(_m) != (m)) \ + _m = MBUFQ_NEXT(_m); \ + if ((MBUFQ_NEXT(_m) = \ + MBUFQ_NEXT(MBUFQ_NEXT(_m))) == NULL) \ + (q)->mq_last = &MBUFQ_NEXT(_m); \ + } \ +} while (0) + +#define MBUFQ_DRAIN(q) do { \ + struct mbuf *__m0; \ + while ((__m0 = MBUFQ_FIRST(q)) != NULL) { \ + MBUFQ_FIRST(q) = MBUFQ_NEXT(__m0); \ + MBUFQ_NEXT(__m0) = NULL; \ + m_freem(__m0); \ + } \ + (q)->mq_last = &MBUFQ_FIRST(q); \ +} while (0) + +#define MBUFQ_FOREACH(m, q) \ + for ((m) = MBUFQ_FIRST(q); \ + (m); \ + (m) = MBUFQ_NEXT(m)) + +#define MBUFQ_FOREACH_SAFE(m, q, tvar) \ + for ((m) = MBUFQ_FIRST(q); \ + (m) && ((tvar) = MBUFQ_NEXT(m), 1); \ + (m) = (tvar)) + +#define MBUFQ_EMPTY(q) ((q)->mq_first == NULL) +#define MBUFQ_FIRST(q) ((q)->mq_first) +#define MBUFQ_NEXT(m) ((m)->m_nextpkt) +#define MBUFQ_LAST(q) (*(q)->mq_last) + +#define max_linkhdr P2ROUNDUP(_max_linkhdr, sizeof (u_int32_t)) +#define max_protohdr P2ROUNDUP(_max_protohdr, sizeof (u_int32_t)) #endif /* XNU_KERNEL_PRIVATE */ /* @@ -741,16 +884,123 @@ extern struct mbuf *m_pullup(struct mbuf *, int); extern struct mbuf *m_split(struct mbuf *, int, int); extern void m_mclfree(caddr_t p); +/* + * On platforms which require strict alignment (currently for anything but + * i386 or x86_64), this macro checks whether the data pointer of an mbuf + * is 32-bit 
aligned (this is the expected minimum alignment for protocol + * headers), and assert otherwise. + */ +#if defined(__i386__) || defined(__x86_64__) +#define MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(_m) +#else /* !__i386__ && !__x86_64__ */ +#define MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(_m) do { \ + if (!IS_P2ALIGNED((_m)->m_data, sizeof (u_int32_t))) { \ + if (((_m)->m_flags & M_PKTHDR) && \ + (_m)->m_pkthdr.rcvif != NULL) { \ + panic_plain("\n%s: mbuf %p data ptr %p is not " \ + "32-bit aligned [%s%d: alignerrs=%lld]\n", \ + __func__, (_m), (_m)->m_data, \ + (_m)->m_pkthdr.rcvif->if_name, \ + (_m)->m_pkthdr.rcvif->if_unit, \ + (_m)->m_pkthdr.rcvif->if_alignerrs); \ + } else { \ + panic_plain("\n%s: mbuf %p data ptr %p is not " \ + "32-bit aligned\n", \ + __func__, (_m), (_m)->m_data); \ + } \ + } \ +} while (0) +#endif /* !__i386__ && !__x86_64__ */ + +/* Maximum number of MBUF_SC values (excluding MBUF_SC_UNSPEC) */ +#define MBUF_SC_MAX_CLASSES 10 + +/* + * These conversion macros rely on the corresponding MBUF_SC and + * MBUF_TC values in order to establish the following mapping: + * + * MBUF_SC_BK_SYS ] ==> MBUF_TC_BK + * MBUF_SC_BK ] + * + * MBUF_SC_BE ] ==> MBUF_TC_BE + * MBUF_SC_RD ] + * MBUF_SC_OAM ] + * + * MBUF_SC_AV ] ==> MBUF_TC_VI + * MBUF_SC_RV ] + * MBUF_SC_VI ] + * + * MBUF_SC_VO ] ==> MBUF_TC_VO + * MBUF_SC_CTL ] + * + * The values assigned to each service class allows for a fast mapping to + * the corresponding MBUF_TC traffic class values, as well as to retrieve the + * assigned index; therefore care must be taken when comparing against these + * values. Use the corresponding class and index macros to retrieve the + * corresponding portion, and never assume that a higher class corresponds + * to a higher index. + */ +#define MBUF_SCVAL(x) ((x) & 0xffff) +#define MBUF_SCIDX(x) ((((x) >> 16) & 0xff) >> 3) +#define MBUF_SC2TC(_sc) (MBUF_SCVAL(_sc) >> 7) +#define MBUF_TC2SCVAL(_tc) ((_tc) << 7) +#define IS_MBUF_SC_BACKGROUND(_sc) (((_sc) == MBUF_SC_BK_SYS) || \ + ((_sc) == MBUF_SC_BK)) + +#define SCIDX_BK_SYS MBUF_SCIDX(MBUF_SC_BK_SYS) +#define SCIDX_BK MBUF_SCIDX(MBUF_SC_BK) +#define SCIDX_BE MBUF_SCIDX(MBUF_SC_BE) +#define SCIDX_RD MBUF_SCIDX(MBUF_SC_RD) +#define SCIDX_OAM MBUF_SCIDX(MBUF_SC_OAM) +#define SCIDX_AV MBUF_SCIDX(MBUF_SC_AV) +#define SCIDX_RV MBUF_SCIDX(MBUF_SC_RV) +#define SCIDX_VI MBUF_SCIDX(MBUF_SC_VI) +#define SCIDX_VO MBUF_SCIDX(MBUF_SC_VO) +#define SCIDX_CTL MBUF_SCIDX(MBUF_SC_CTL) + +#define SCVAL_BK_SYS MBUF_SCVAL(MBUF_SC_BK_SYS) +#define SCVAL_BK MBUF_SCVAL(MBUF_SC_BK) +#define SCVAL_BE MBUF_SCVAL(MBUF_SC_BE) +#define SCVAL_RD MBUF_SCVAL(MBUF_SC_RD) +#define SCVAL_OAM MBUF_SCVAL(MBUF_SC_OAM) +#define SCVAL_AV MBUF_SCVAL(MBUF_SC_AV) +#define SCVAL_RV MBUF_SCVAL(MBUF_SC_RV) +#define SCVAL_VI MBUF_SCVAL(MBUF_SC_VI) +#define SCVAL_VO MBUF_SCVAL(MBUF_SC_VO) +#define SCVAL_CTL MBUF_SCVAL(MBUF_SC_CTL) + +#define MBUF_VALID_SC(c) \ + (c == MBUF_SC_BK_SYS || c == MBUF_SC_BK || c == MBUF_SC_BE || \ + c == MBUF_SC_RD || c == MBUF_SC_OAM || c == MBUF_SC_AV || \ + c == MBUF_SC_RV || c == MBUF_SC_VI || c == MBUF_SC_VO || \ + c == MBUF_SC_CTL) + +#define MBUF_VALID_SCIDX(c) \ + (c == SCIDX_BK_SYS || c == SCIDX_BK || c == SCIDX_BE || \ + c == SCIDX_RD || c == SCIDX_OAM || c == SCIDX_AV || \ + c == SCIDX_RV || c == SCIDX_VI || c == SCIDX_VO || \ + c == SCIDX_CTL) + +#define MBUF_VALID_SCVAL(c) \ + (c == SCVAL_BK_SYS || c == SCVAL_BK || c == SCVAL_BE || \ + c == SCVAL_RD || c == SCVAL_OAM || c == SCVAL_AV || \ + c == SCVAL_RV || c == SCVAL_VI || c == SCVAL_VO || \ + c == 
SCVAL_CTL) + __private_extern__ union mbigcluster *mbutl; /* start VA of mbuf pool */ __private_extern__ union mbigcluster *embutl; /* end VA of mbuf pool */ __private_extern__ unsigned int nmbclusters; /* number of mapped clusters */ __private_extern__ int njcl; /* # of jumbo clusters */ __private_extern__ int njclbytes; /* size of a jumbo cluster */ -__private_extern__ int max_linkhdr; /* largest link-level header */ -__private_extern__ int max_protohdr; /* largest protocol header */ __private_extern__ int max_hdr; /* largest link+protocol header */ __private_extern__ int max_datalen; /* MHLEN - max_hdr */ +/* Use max_linkhdr instead of _max_linkhdr */ +__private_extern__ int _max_linkhdr; /* largest link-level header */ + +/* Use max_protohdr instead of _max_protohdr */ +__private_extern__ int _max_protohdr; /* largest protocol header */ + __private_extern__ unsigned int mbuf_default_ncl(int, u_int64_t); __private_extern__ void mbinit(void); __private_extern__ struct mbuf *m_clattach(struct mbuf *, int, caddr_t, @@ -766,6 +1016,10 @@ __private_extern__ struct mbuf *m_free(struct mbuf *); __private_extern__ struct mbuf *m_getclr(int, int); __private_extern__ struct mbuf *m_getptr(struct mbuf *, int, int *); __private_extern__ unsigned int m_length(struct mbuf *); +__private_extern__ unsigned int m_length2(struct mbuf *, struct mbuf **); +__private_extern__ unsigned int m_fixhdr(struct mbuf *); +__private_extern__ struct mbuf *m_defrag(struct mbuf *, int); +__private_extern__ struct mbuf *m_defrag_offset(struct mbuf *, u_int32_t, int); __private_extern__ struct mbuf *m_prepend(struct mbuf *, int, int); __private_extern__ struct mbuf *m_copyup(struct mbuf *, int, int); __private_extern__ struct mbuf *m_retry(int, int); @@ -781,6 +1035,7 @@ __private_extern__ struct mbuf *m_getcl(int, int, int); __private_extern__ caddr_t m_mclalloc(int); __private_extern__ int m_mclhasreference(struct mbuf *); __private_extern__ void m_copy_pkthdr(struct mbuf *, struct mbuf *); +__private_extern__ void m_copy_pftag(struct mbuf *, struct mbuf *); __private_extern__ struct mbuf *m_dtom(void *); __private_extern__ int m_mtocl(void *); @@ -867,11 +1122,22 @@ __private_extern__ void m_tag_init(struct mbuf *); __private_extern__ struct m_tag *m_tag_first(struct mbuf *); __private_extern__ struct m_tag *m_tag_next(struct mbuf *, struct m_tag *); -__private_extern__ void m_prio_init(struct mbuf *); - __END_DECLS #endif /* XNU_KERNEL_PRIVATE */ #ifdef KERNEL #include +#ifdef XNU_KERNEL_PRIVATE +__BEGIN_DECLS + +__private_extern__ void m_service_class_init(struct mbuf *); +__private_extern__ int m_set_service_class(struct mbuf *, mbuf_svc_class_t); +__private_extern__ mbuf_svc_class_t m_get_service_class(struct mbuf *); +__private_extern__ mbuf_svc_class_t m_service_class_from_idx(u_int32_t); +__private_extern__ mbuf_svc_class_t m_service_class_from_val(u_int32_t); +__private_extern__ int m_set_traffic_class(struct mbuf *, mbuf_traffic_class_t); +__private_extern__ mbuf_traffic_class_t m_get_traffic_class(struct mbuf *); + +__END_DECLS +#endif /* XNU_KERNEL_PRIVATE */ #endif /* KERNEL */ #endif /* !_SYS_MBUF_H_ */ diff --git a/bsd/sys/mcache.h b/bsd/sys/mcache.h index 443e05b01..428a865ec 100644 --- a/bsd/sys/mcache.h +++ b/bsd/sys/mcache.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2010 Apple Inc. All rights reserved. + * Copyright (c) 2006-2012 Apple Inc. All rights reserved. 
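The MBUFQ_* macros introduced in mbuf.h above are a SIMPLEQ-style tail-pointer queue: mq_last always points at the m_nextpkt field of the last packet, or back at mq_first when the queue is empty, so both enqueue and dequeue are O(1) with no special case for an empty tail. A minimal stand-alone C sketch of the same pattern, with a toy pkt struct standing in for struct mbuf (names are illustrative, not part of the patch):

```c
#include <stdio.h>

/* Toy packet type; 'next' stands in for struct mbuf's m_nextpkt. */
struct pkt {
	int id;
	struct pkt *next;
};

/* Queue head; mirrors MBUFQ_HEAD: the first element plus the
 * address of the last element's 'next' field. */
struct pktq {
	struct pkt *first;
	struct pkt **last;
};

static void
pktq_init(struct pktq *q)			/* MBUFQ_INIT */
{
	q->first = NULL;
	q->last = &q->first;			/* empty: tail points at head */
}

static void
pktq_enqueue(struct pktq *q, struct pkt *p)	/* MBUFQ_ENQUEUE */
{
	p->next = NULL;
	*q->last = p;				/* link after the current tail */
	q->last = &p->next;			/* tail now tracks the new link */
}

static struct pkt *
pktq_dequeue(struct pktq *q)			/* MBUFQ_DEQUEUE */
{
	struct pkt *p = q->first;

	if (p != NULL) {
		if ((q->first = p->next) == NULL)
			q->last = &q->first;	/* drained: reset tail */
		else
			p->next = NULL;
	}
	return (p);
}

int
main(void)
{
	struct pkt a = { 1, NULL }, b = { 2, NULL };
	struct pktq q;
	struct pkt *p;

	pktq_init(&q);
	pktq_enqueue(&q, &a);
	pktq_enqueue(&q, &b);
	while ((p = pktq_dequeue(&q)) != NULL)
		printf("pkt %d\n", p->id);
	return (0);
}
```

MBUFQ_ENQUEUE_MULTI is the same idea, advancing the tail pointer to the m_nextpkt field of the last mbuf in an already-linked chain.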
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -58,6 +58,15 @@ extern "C" { #define ASSERT(EX) ((void)0) #endif +/* + * Compile time assert; this should be on its own someday. + */ +#define _CASSERT(x) \ + switch (0) { case 0: case (x): ; } + +/* + * Atomic macros; these should be on their own someday. + */ #define atomic_add_16_ov(a, n) \ ((u_int16_t) OSAddAtomic16(n, (volatile SInt16 *)a)) @@ -245,9 +254,11 @@ typedef struct mcache { #define MCF_TRACE 0x00000002 /* enable transaction auditing */ #define MCF_NOCPUCACHE 0x00000010 /* disable CPU layer caching */ #define MCF_NOLEAKLOG 0x00000100 /* disable leak logging */ +#define MCF_EXPLEAKLOG 0x00000200 /* expose leak info to user space */ #define MCF_DEBUG (MCF_VERIFY | MCF_TRACE) -#define MCF_FLAGS_MASK (MCF_DEBUG | MCF_NOCPUCACHE | MCF_NOLEAKLOG) +#define MCF_FLAGS_MASK \ + (MCF_DEBUG | MCF_NOCPUCACHE | MCF_NOLEAKLOG | MCF_EXPLEAKLOG) /* Valid values for notify callback */ #define MCN_RETRYALLOC 0x00000001 /* Allocation should be retried */ diff --git a/bsd/sys/mount.h b/bsd/sys/mount.h index f6594c0ef..9a2c732f0 100644 --- a/bsd/sys/mount.h +++ b/bsd/sys/mount.h @@ -343,9 +343,10 @@ struct vfs_attr { * External filesystem control flags. */ #define MNT_UPDATE 0x00010000 /* not a real mount, just an update */ +#define MNT_NOBLOCK 0x00020000 /* don't block unmount if not responding */ #define MNT_RELOAD 0x00040000 /* reload filesystem data */ #define MNT_FORCE 0x00080000 /* force unmount or readonly change */ -#define MNT_CMDFLAGS (MNT_UPDATE|MNT_RELOAD|MNT_FORCE) +#define MNT_CMDFLAGS (MNT_UPDATE|MNT_NOBLOCK|MNT_RELOAD|MNT_FORCE) @@ -442,6 +443,7 @@ union union_vfsidctl { /* the fields vc_vers and vc_fsid are compatible */ #define VFS_CTL_TIMEO 0x00010005 /* set timeout for vfs notification */ #define VFS_CTL_NOLOCKS 0x00010006 /* disable file locking */ #define VFS_CTL_SADDR 0x00010007 /* get server address */ +#define VFS_CTL_DISC 0x00010008 /* server disconnected */ struct vfsquery { u_int32_t vq_flags; @@ -482,7 +484,7 @@ struct vfsioattr { void * io_reserved[2]; /* extended attribute information */ }; -#define VFS_IOATTR_FLAGS_FUA 0x01 /* Write-through cache supported */ +#define VFS_IOATTR_FLAGS_FUA 0x01 /* Write-through cache supported */ #define VFS_IOATTR_FLAGS_UNMAP 0x02 /* Unmap (trim) supported */ /* diff --git a/bsd/sys/mount_internal.h b/bsd/sys/mount_internal.h index 141fb3eeb..ccf31dd9a 100644 --- a/bsd/sys/mount_internal.h +++ b/bsd/sys/mount_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
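The _CASSERT macro added to mcache.h above gets its compile-time behavior from C's switch statement: when the condition is non-zero the case labels are 0 and 1 and the statement is legal, but when the condition evaluates to 0 the two labels collide and the compiler rejects the duplicate case. A small stand-alone sketch of the idiom (C11's _Static_assert now serves the same purpose):

```c
#include <stdio.h>

/* Same trick as the mcache.h macro: a false condition produces two
 * identical 'case 0:' labels, which is a compile-time error. */
#define _CASSERT(x) \
	switch (0) { case 0: case (x): ; }

int
main(void)
{
	_CASSERT(sizeof (long) >= sizeof (int));  /* labels 0 and 1: ok */
	/* _CASSERT(sizeof (int) == 1);  duplicate 'case 0': won't build */
	printf("compile-time assertion held\n");
	return (0);
}
```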
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * @@ -265,7 +265,7 @@ extern struct mount * dead_mountp; #define MNTK_AUTH_OPAQUE_ACCESS 0x40000000 /* VNOP_ACCESS is reliable for remote auth */ #define MNTK_EXTENDED_SECURITY 0x80000000 /* extended security supported */ -#define MNT_LBUSY 0x00000001 /* mount is busy */ +#define MNT_LNOTRESP 0x00000001 /* mount not responding */ #define MNT_LUNMOUNT 0x00000002 /* mount in unmount */ #define MNT_LFORCE 0x00000004 /* mount in forced unmount */ #define MNT_LDRAIN 0x00000008 /* mount in drain */ @@ -319,9 +319,9 @@ struct vfstable { #define VFC_VFSPREFLIGHT 0x040 #define VFC_VFSREADDIR_EXTENDED 0x080 #define VFC_VFS64BITREADY 0x100 -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL #define VFC_VFSTHREADSAFE 0x200 -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ #define VFC_VFSNOMACLABEL 0x1000 #define VFC_VFSVNOP_PAGEINV2 0x2000 #define VFC_VFSVNOP_PAGEOUTV2 0x4000 @@ -466,8 +466,10 @@ boolean_t vfs_iskernelmount(mount_t); #endif /* throttled I/O api */ -int throttle_get_io_policy(struct uthread **ut); -int throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp); +int throttle_get_io_policy(struct uthread **ut); +int throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp); +void throttle_info_update_by_mount(mount_t mp); +void unthrottle_thread(uthread_t); /* throttled I/O helper function */ /* convert the lowest bit to a device index */ diff --git a/bsd/sys/munge.h b/bsd/sys/munge.h new file mode 100644 index 000000000..19a3dcd1d --- /dev/null +++ b/bsd/sys/munge.h @@ -0,0 +1,69 @@ +#ifndef __MUNGE_H__ +#define __MUNGE_H__ +/* + * Copyright (c) 2005-2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +void munge_w(const void *arg0 __unused, void *args); +void munge_ww(const void *arg0 __unused, void *args); +void munge_www(const void *arg0 __unused, void *args); +void munge_wwww(const void *arg0 __unused, void *args); +void munge_wwwww(const void *arg0 __unused, void *args); +void munge_wwwwww(const void *arg0 __unused, void *args); +void munge_wwwwwww(const void *arg0 __unused, void *args); +void munge_wwwwwwww(const void *arg0 __unused, void *args); +void munge_wl(const void *arg0 __unused, void *args); +void munge_wwl(const void *arg0 __unused, void *args); +void munge_wwlw(const void *arg0 __unused, void *args); +void munge_wwlll(const void *arg0 __unused, void *args); +void munge_wlw(const void *arg0 __unused, void *args); +void munge_wlwwwll(const void *arg0 __unused, void *args); +void munge_wlwwwllw(const void *arg0 __unused, void *args); +void munge_wlwwlwlw(const void *arg0 __unused, void *args); +void munge_wll(const void *arg0 __unused, void *args); +void munge_wllww(const void *arg0 __unused, void *args); +void munge_wlll(const void *arg0 __unused, void *args); +void munge_wllwwll(const void *arg0 __unused, void *args); +void munge_wwwlw(const void *arg0 __unused, void *args); +void munge_wwwlww(const void *arg0 __unused, void *args); +void munge_wwwl(const void *arg0 __unused, void *args); +void munge_wwwwlw(const void *arg0 __unused, void *args); +void munge_wwwwl(const void *arg0 __unused, void *args); +void munge_wwwwwl(const void *arg0 __unused, void *args); +void munge_wwwwwlww(const void *arg0 __unused, void *args); +void munge_wwwwwllw(const void *arg0 __unused, void *args); +void munge_wwwwwlll(const void *arg0 __unused, void *args); +void munge_wwwwwwl(const void *arg0 __unused, void *args); +void munge_wwwwwwlw(const void *arg0 __unused, void *args); +void munge_wwwwwwll(const void *arg0 __unused, void *args); +void munge_wsw(const void *arg0 __unused, void *args); +void munge_wws(const void *arg0 __unused, void *args); +void munge_wwwsw(const void *arg0 __unused, void *args); +void munge_llllll(const void *arg0 __unused, void *args __unused); +void munge_l(const void *arg0 __unused, void *args __unused); +void munge_lw(const void *arg0 __unused, void *args); +void munge_lwww(const void *arg0 __unused, void *args); +#endif /* __MUNGE_H__ */ diff --git a/bsd/sys/namei.h b/bsd/sys/namei.h index 56d3ecf13..803a9d8d7 100644 --- a/bsd/sys/namei.h +++ b/bsd/sys/namei.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
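The munge_* routines declared in the new munge.h each describe, letter by letter, how a 32-bit user process's packed syscall arguments are widened into the 64-bit kernel's argument slots: 'w' is a 32-bit word, 'l' a 64-bit long, 's' a sign-extended 32-bit word, so munge_wl handles a (word, long) pair, and so on. A hedged sketch of what a munge_ww-style expansion does; the demo_munge_ww name and the snapshot-based approach are illustrative only (the real routines work in place, expanding from the last argument backwards so no input word is overwritten before it is read):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Hypothetical stand-in for munge_ww: two 32-bit words packed at the
 * start of 'args' (the 32-bit process's layout) are widened into two
 * 64-bit slots in the same buffer (the 64-bit kernel's layout).
 */
static void
demo_munge_ww(void *args)
{
	uint32_t in[2];
	uint64_t *out = (uint64_t *)args;

	memcpy(in, args, sizeof (in));	/* capture the packed words */
	out[1] = in[1];			/* widen, last argument first */
	out[0] = in[0];
}

int
main(void)
{
	uint64_t slots[2];
	uint32_t packed[2] = { 7, 9 };	/* as copied in from user space */

	memcpy(slots, packed, sizeof (packed));
	demo_munge_ww(slots);
	printf("arg0=%llu arg1=%llu\n",
	    (unsigned long long)slots[0], (unsigned long long)slots[1]);
	return (0);
}
```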
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -175,9 +175,9 @@ struct nameidata { #define AUDITVNPATH2 0x00200000 /* audit the path/vnode info */ #define USEDVP 0x00400000 /* start the lookup at ndp.ni_dvp */ #define CN_VOLFSPATH 0x00800000 /* user path was a volfs style path */ -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL #define FSNODELOCKHELD 0x01000000 -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ #define UNIONCREATED 0x02000000 /* union fs creation of vnode */ #if NAMEDRSRCFORK #define CN_WANTSRSRCFORK 0x04000000 diff --git a/bsd/sys/pipe.h b/bsd/sys/pipe.h index f45128bfb..3437710b2 100644 --- a/bsd/sys/pipe.h +++ b/bsd/sys/pipe.h @@ -71,6 +71,8 @@ #define PIPE_SIZE 16384 #endif +#define PIPE_KVAMAX (1024 * 1024 * 16) + #ifndef BIG_PIPE_SIZE #define BIG_PIPE_SIZE (64*1024) #endif diff --git a/bsd/sys/priv.h b/bsd/sys/priv.h index 1abb898bf..78f553f8a 100644 --- a/bsd/sys/priv.h +++ b/bsd/sys/priv.h @@ -79,6 +79,17 @@ */ #define PRIV_ADJTIME 1000 /* Set time adjustment. */ +/* + * Virtual memory privileges. + */ +#define PRIV_VM_PRESSURE 6000 /* Check VM pressure. */ +#define PRIV_VM_JETSAM 6001 /* Adjust jetsam configuration. */ + +/* + * Network stack privileges. + */ +#define PRIV_NET_PRIVILEGED_TRAFFIC_CLASS 10000 /* Set SO_PRIVILEGED_TRAFFIC_CLASS. */ + /* * IPv4 and IPv6 privileges. */ diff --git a/bsd/sys/proc.h b/bsd/sys/proc.h index 8cec174a3..9b80718f3 100644 --- a/bsd/sys/proc.h +++ b/bsd/sys/proc.h @@ -182,7 +182,7 @@ struct extern_proc { #define P_SSTEP 0x20000 / * process needs single-step fixup ??? * / */ -#define P_RESV5 0x00040000 /* (P_WAITING) process has a wait() in progress */ +#define P_DELAYIDLESLEEP 0x00040000 /* Process is marked to delay idle sleep on disk IO */ #define P_CHECKOPENEVT 0x00080000 /* check if a vnode has the OPENEVT flag set on open */ #define P_DEPENDENCY_CAPABLE 0x00100000 /* process is ok to call vfs_markdependency() */ @@ -207,6 +207,16 @@ struct extern_proc { #define P_FSTRACE 0 /* Obsolete: retained for compilation */ #define P_SSTEP 0 /* Obsolete: retained for compilation */ +#define P_DIRTY_TRACK 0x00000001 /* track dirty state */ +#define P_DIRTY_ALLOW_IDLE_EXIT 0x00000002 /* process can be idle-exited when clean */ +#define P_DIRTY 0x00000004 /* process is dirty */ +#define P_DIRTY_SHUTDOWN 0x00000008 /* process is dirty during shutdown */ +#define P_DIRTY_TERMINATED 0x00000010 /* process has been marked for termination */ +#define P_DIRTY_BUSY 0x00000020 /* serialization flag */ + +#define P_DIRTY_CAN_IDLE_EXIT (P_DIRTY_TRACK | P_DIRTY_ALLOW_IDLE_EXIT) +#define P_DIRTY_IS_DIRTY (P_DIRTY | P_DIRTY_SHUTDOWN) + #endif /* XNU_KERNEL_PRIVATE || !KERNEL */ #ifdef KERNEL @@ -289,6 +299,7 @@ extern int IS_64BIT_PROCESS(proc_t); extern int tsleep(void *chan, int pri, const char *wmesg, int timo); extern int msleep1(void *chan, lck_mtx_t *mtx, int pri, const char *wmesg, u_int64_t timo); +task_t proc_task(proc_t); extern int proc_pidversion(proc_t); extern int proc_getcdhash(proc_t, unsigned char *); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/sys/proc_info.h b/bsd/sys/proc_info.h index 67842664d..381aef8f3 100644 --- a/bsd/sys/proc_info.h +++ b/bsd/sys/proc_info.h @@ -118,6 +118,8 @@ struct proc_bsdshortinfo { #ifdef PRIVATE #define PROC_FLAG_DARWINBG 0x8000 /* process in darwin background */ #define PROC_FLAG_EXT_DARWINBG 0x10000 /* process in darwin background - external enforcement */ +#define PROC_FLAG_IOS_APPLEDAEMON 0x20000 /* Process is apple daemon */ +#define PROC_FLAG_DELAYIDLESLEEP 0x40000 /* Process is marked to delay 
idle sleep on disk IO */ #endif @@ -642,6 +644,9 @@ struct proc_fileportinfo { #define PROC_PIDLISTFILEPORTS 14 #define PROC_PIDLISTFILEPORTS_SIZE (sizeof(struct proc_fileportinfo)) +#define PROC_PIDTHREADID64INFO 15 +#define PROC_PIDTHREADID64INFO_SIZE (sizeof(struct proc_threadinfo)) + /* Flavors for proc_pidfdinfo */ #define PROC_PIDFDVNODEINFO 1 @@ -691,6 +696,24 @@ struct proc_fileportinfo { #define PROC_SELFSET_VMRSRCOWNER 3 +#define PROC_SELFSET_DELAYIDLESLEEP 4 + +/* used for proc_dirtycontrol */ +#define PROC_DIRTYCONTROL_TRACK 1 +#define PROC_DIRTYCONTROL_SET 2 +#define PROC_DIRTYCONTROL_GET 3 + +/* proc_track_dirty() flags */ +#define PROC_DIRTY_TRACK 0x1 +#define PROC_DIRTY_ALLOW_IDLE_EXIT 0x2 + +#define PROC_DIRTY_TRACK_MASK (PROC_DIRTY_TRACK|PROC_DIRTY_ALLOW_IDLE_EXIT) + +/* proc_get_dirty() flags */ +#define PROC_DIRTY_TRACKED 0x1 +#define PROC_DIRTY_ALLOWS_IDLE_EXIT 0x2 +#define PROC_DIRTY_IS_DIRTY 0x4 + #ifdef XNU_KERNEL_PRIVATE #ifndef pshmnode struct pshmnode; diff --git a/bsd/sys/proc_internal.h b/bsd/sys/proc_internal.h index 26b91b3cd..e4c7497bf 100644 --- a/bsd/sys/proc_internal.h +++ b/bsd/sys/proc_internal.h @@ -330,13 +330,15 @@ struct proc { char p_name[(2*MAXCOMLEN)+1]; /* PL */ struct pgrp *p_pgrp; /* Pointer to process group. (LL) */ -#if CONFIG_EMBEDDED - int p_iopol_disk; /* disk I/O policy (PL) */ -#endif /* CONFIG_EMBEDDED */ uint32_t p_csflags; /* flags for codesign (PL) */ uint32_t p_pcaction; /* action for process control on starvation */ uint8_t p_uuid[16]; /* from LC_UUID load command */ +#if !CONFIG_EMBEDDED +#define PROC_LEGACY_BEHAVIOR_IOTHROTTLE (0x00000001) + uint32_t p_legacy_behavior; +#endif + /* End area that is copied on creation. */ /* XXXXXXXXXXXXX End of BCOPY'ed on fork (AIOLOCK)XXXXXXXXXXXXXXXX */ #define p_endcopy p_aio_total_count @@ -379,6 +381,10 @@ struct proc { #endif /* SIGNAL_DEBUG */ #endif /* DIAGNOSTIC */ uint64_t p_dispatchqueue_offset; +#if VM_PRESSURE_EVENTS + struct timeval vm_pressure_last_notify_tstamp; +#endif + int p_dirty; /* dirty state */ }; #define PGRPID_DEAD 0xdeaddead @@ -430,10 +436,8 @@ struct proc { #define P_UNUSED 0x00200000 /* Unused */ #define P_LRAGE_VNODES 0x00400000 #define P_LREGISTER 0x00800000 /* thread start fns registered */ -#if CONFIG_EMBEDDED -#define P_LBACKGROUND 0x01000000 -#endif /* CONFIG_EMBEDDED */ -#define P_LVMRSRCOWNER 0x02000000 /* can handle the resource ownership of */ +#define P_LVMRSRCOWNER 0x01000000 /* can handle the resource ownership of */ +#define P_LPTERMINATE 0x02000000 /* can handle the resource ownership of */ /* Process control state for resource starvation */ #define P_PCTHROTTLE 1 @@ -686,6 +690,7 @@ extern int tsleep1(void *chan, int pri, const char *wmesg, u_int64_t abstime, in extern int msleep0(void *chan, lck_mtx_t *mtx, int pri, const char *wmesg, int timo, int (*continuation)(int)); extern void vfork_return(struct proc *child, int32_t *retval, int rval); extern int exit1(struct proc *, int, int *); +extern int exit1_internal(struct proc *, int, int *, boolean_t, boolean_t); extern int fork1(proc_t, thread_t *, int); extern void vfork_exit_internal(struct proc *p, int rv, int forced); extern void proc_reparentlocked(struct proc *child, struct proc * newparent, int cansignal, int locked); @@ -694,6 +699,7 @@ extern int proc_iterate(int flags, int (*callout)(proc_t , void *), void *arg, i extern int proc_rebootscan(int (*callout)(proc_t , void *), void *arg, int (*filterfn)(proc_t , void *), void *filterarg); extern int proc_childrenwalk(proc_t p, int 
(*callout)(proc_t , void *), void *arg); extern proc_t proc_findinternal(int pid, int funneled); +extern proc_t proc_findthread(thread_t thread); extern void proc_refdrain(proc_t); extern void proc_childdrainlocked(proc_t); extern void proc_childdrainstart(proc_t); @@ -739,6 +745,7 @@ extern int proc_pendingsignals(proc_t, sigset_t); int proc_getpcontrol(int pid, int * pcontrolp); int proc_dopcontrol(proc_t p, void *unused_arg); int proc_resetpcontrol(int pid); +extern void proc_removethrottle(proc_t); #if PSYNCH void pth_proc_hashinit(proc_t); void pth_proc_hashdelete(proc_t); diff --git a/bsd/sys/process_policy.h b/bsd/sys/process_policy.h index 19f3c2617..d9ad48272 100644 --- a/bsd/sys/process_policy.h +++ b/bsd/sys/process_policy.h @@ -59,7 +59,11 @@ __BEGIN_DECLS #define PROC_POLICY_HARDWARE_ACCESS 2 /* access to various hardware */ #define PROC_POLICY_RESOURCE_STARVATION 3 /* behavior on resource starvation */ #define PROC_POLICY_RESOURCE_USAGE 4 /* behavior on resource consumption */ +#if CONFIG_EMBEDDED +#define PROC_POLICY_APP_LIFECYCLE 5 /* app life cycle management */ +#else /* CONFIG_EMBEDDED */ #define PROC_POLICY_RESERVED 5 /* behavior on resource consumption */ +#endif /* CONFIG_EMBEDDED */ #define PROC_POLICY_APPTYPE 6 /* behavior on resource consumption */ /* sub policies for background policy */ @@ -85,28 +89,29 @@ __BEGIN_DECLS /* attribute values for disk hardware access, a bit different as it should reflect IOPOL_XXX */ #define PROC_POLICY_DISKACC_NONE 0 #define PROC_POLICY_DISKACC_NORMAL 1 /* normal access to the disk */ +#define PROC_POLICY_DISKACC_FULLACCESS 1 /* normal access to the disk */ #define PROC_POLICY_DISKACC_PASSIVE 2 /* treat the I/Os as passive */ #define PROC_POLICY_DISKACC_THROTTLE 3 /* throttle the disk IOs */ -#define PROC_POLICY_DISKACC_DEFAULT 0 +#define PROC_POLICY_DISKACC_DEFAULT PROC_POLICY_DISKACC_FULLACCESS /* attribute values for GPU hardware access */ #define PROC_POLICY_GPUACC_NONE 0 #define PROC_POLICY_GPUACC_FULLACCESS 0 /* complete access to the GPU */ #define PROC_POLICY_GPUACC_DENYACCESS 1 /* deny any access to the GPU */ -#define PROC_POLICY_GPUACC_DEFAULT 0 /* default is complete access */ +#define PROC_POLICY_GPUACC_DEFAULT PROC_POLICY_GPUACC_FULLACCESS /* default is complete access */ /* attribute values for network hardware access */ #define PROC_POLICY_NETACC_NONE 0 -#define PROC_POLICY_NETACC_NORMAL 0 /* complete access to the network */ +#define PROC_POLICY_NETACC_FULLACCESS 0 /* complete access to the network */ #define PROC_POLICY_NETACC_THROTTLE 1 /* throttle access to network */ -#define PROC_POLICY_NETACC_DEFAULT 0 /* default is complete access */ +#define PROC_POLICY_NETACC_DEFAULT PROC_POLICY_NETACC_FULLACCESS /* default is complete access */ /* attribute values for cpu hardware access */ #define PROC_POLICY_CPUACC_NONE 0 -#define PROC_POLICY_CPUACC_ALL 0 /* access to all available cpus */ +#define PROC_POLICY_CPUACC_FULLACCESS 0 /* access to all available cpus */ #define PROC_POLICY_CPUACC_ONE 1 /* access to only one available cpu */ #define PROC_POLICY_CPUACC_LLCACHE 2 /* access to only one last level cache */ -#define PROC_POLICY_CPUACC_DEFAULT 0 /* default is access to all cpus */ +#define PROC_POLICY_CPUACC_DEFAULT PROC_POLICY_CPUACC_FULLACCESS /* default is access to all cpus */ /* System Resource management (ie usage and starvation related) definitions */ @@ -124,12 +129,13 @@ __BEGIN_DECLS #define PROC_POLICY_RUSAGE_NETWORK 5 /* amount of network usage */ #define PROC_POLICY_RUSAGE_POWER 6 /* amount of power/battery consumption */ -/* attribute values for the resource usage and low resource */ +/* attribute values for the resource usage and low resource - MUST match corresponding task definitions */ #define PROC_POLICY_RSRCACT_NONE 0 #define PROC_POLICY_RSRCACT_THROTTLE 1 /* throttle on resource condition */ #define PROC_POLICY_RSRCACT_SUSPEND 2 /* suspend on resource condition */ #define PROC_POLICY_RSRCACT_TERMINATE 3 /* kill on resource condition */ -#define PROC_POLICY_RSRCACT_NOTIFY 4 /* send kqueue notification */ +#define PROC_POLICY_RSRCACT_NOTIFY_KQ 4 /* send kqueue notification */ +#define PROC_POLICY_RSRCACT_NOTIFY_EXC 5 /* send exception */ /* type of resource for kqueue notification */ @@ -158,14 +164,33 @@ typedef struct proc_policy_cpuusage_attr { uint64_t ppattr_cpu_attr_deadline; /* 64bit deadline in nsecs */ } proc_policy_cpuusage_attr_t; +#if CONFIG_EMBEDDED +/* sub policies for app lifecycle management */ +#define PROC_POLICY_APPLIFE_NONE 0 /* does nothing.. */ +#define PROC_POLICY_APPLIFE_STATE 1 /* sets the app to various lifecycle states */ +#define PROC_POLICY_APPLIFE_DEVSTATUS 2 /* notes the device in inactive or short/long term */ +#define PROC_POLICY_APPLIFE_PIDBIND 3 /* a thread is to be bound to another processes app state */ +#endif /* CONFIG_EMBEDDED */ /* sub policies for PROC_POLICY_APPTYPE */ +#define PROC_POLICY_APPTYPE_NONE 0 /* does nothing.. */ +#define PROC_POLICY_APPTYPE_MODIFY 1 /* sets the app to various lifecycle states */ +#if CONFIG_EMBEDDED +#define PROC_POLICY_APPTYPE_THREADTHR 2 /* notes the device in inactive or short/long term */ +#endif /* CONFIG_EMBEDDED */ + + #define PROC_POLICY_OSX_APPTYPE_NONE 0 +#if CONFIG_EMBEDDED +#define PROC_POLICY_IOS_RESV1_APPTYPE 1 /* TAL based launched */ +#define PROC_POLICY_IOS_APPLE_DAEMON 2 /* for user of IOS apple daemons */ +#define PROC_POLICY_IOS_APPTYPE 3 /* ios specific handling */ +#define PROC_POLICY_IOS_NONUITYPE 4 /* ios non graphics type */ +#else #define PROC_POLICY_OSX_APPTYPE_TAL 1 /* TAL based launched */ #define PROC_POLICY_OSX_APPTYPE_WIDGET 2 /* for dashboard client */ #define PROC_POLICY_OSX_APPTYPE_DASHCLIENT 2 /* rename to move away from widget */ -#define PROC_POLICY_IOS_APPTYPE 3 /* ios specific handling */ -#define PROC_POLICY_IOS_NONUITYPE 4 /* ios non graphics type */ +#endif #ifndef KERNEL int process_policy(int scope, int action, int policy, int policy_subtype, proc_policy_attribute_t * attrp, pid_t target_pid, uint64_t target_threadid); diff --git a/bsd/sys/pthread_internal.h b/bsd/sys/pthread_internal.h index f33117241..b22de04d8 100644 --- a/bsd/sys/pthread_internal.h +++ b/bsd/sys/pthread_internal.h @@ -49,8 +49,6 @@ typedef struct ksyn_waitq_element * ksyn_waitq_element_t; #define KWE_THREAD_BROADCAST 4 -#define WORKITEM_SIZE 64 - #define WORKQUEUE_HIGH_PRIOQUEUE 0 /* high priority queue */ #define WORKQUEUE_DEFAULT_PRIOQUEUE 1 /* default priority queue */ #define WORKQUEUE_LOW_PRIOQUEUE 2 /* low priority queue */ @@ -82,26 +80,13 @@ struct threadlist { #define TH_LIST_CONSTRAINED 0x40 -struct workitem { - TAILQ_ENTRY(workitem) wi_entry; - user_addr_t wi_item; - uint32_t wi_affinity; -}; - -struct workitemlist { - TAILQ_HEAD(, workitem) wl_itemlist; - TAILQ_HEAD(, workitem) wl_freelist; -}; - struct workqueue { - struct workitem wq_array[WORKITEM_SIZE * WORKQUEUE_NUMPRIOS]; proc_t wq_proc; vm_map_t wq_map; task_t wq_task; thread_call_t wq_atimer_call; int wq_flags; int wq_lflags; - int wq_itemcount; uint64_t wq_thread_yielded_timestamp; uint32_t wq_thread_yielded_count; 
uint32_t wq_timer_interval; @@ -110,13 +95,14 @@ struct workqueue { uint32_t wq_constrained_threads_scheduled; uint32_t wq_nthreads; uint32_t wq_thidlecount; - uint32_t wq_reqconc[WORKQUEUE_NUMPRIOS]; /* requested concurrency for each priority level */ - struct workitemlist wq_list[WORKQUEUE_NUMPRIOS]; /* priority based item list */ - uint32_t wq_list_bitmap; + uint32_t wq_reqcount; TAILQ_HEAD(, threadlist) wq_thrunlist; TAILQ_HEAD(, threadlist) wq_thidlelist; - uint32_t *wq_thactive_count[WORKQUEUE_NUMPRIOS]; - uint32_t *wq_thscheduled_count[WORKQUEUE_NUMPRIOS]; + uint16_t wq_requests[WORKQUEUE_NUMPRIOS]; + uint16_t wq_ocrequests[WORKQUEUE_NUMPRIOS]; + uint16_t wq_reqconc[WORKQUEUE_NUMPRIOS]; /* requested concurrency for each priority level */ + uint16_t *wq_thscheduled_count[WORKQUEUE_NUMPRIOS]; + uint32_t *wq_thactive_count[WORKQUEUE_NUMPRIOS]; /* must be uint32_t since we OSAddAtomic on these */ uint64_t *wq_lastblocked_ts[WORKQUEUE_NUMPRIOS]; }; #define WQ_LIST_INITED 0x01 @@ -151,6 +137,8 @@ struct workqueue { #define WQOPS_QUEUE_REMOVE_OBSOLETE 2 #define WQOPS_THREAD_RETURN 4 #define WQOPS_THREAD_SETCONC 8 +#define WQOPS_QUEUE_NEWSPISUPP 0x10 /* this is to check for newer SPI support */ +#define WQOPS_QUEUE_REQTHREADS 0x20 /* request number of threads of a prio */ #define PTH_DEFAULT_STACKSIZE 512*1024 #define PTH_DEFAULT_GUARDSIZE 4*1024 diff --git a/bsd/sys/reboot.h b/bsd/sys/reboot.h index 6c64e53b8..18e3662e9 100644 --- a/bsd/sys/reboot.h +++ b/bsd/sys/reboot.h @@ -66,10 +66,6 @@ #include -#ifdef KERNEL_BUILD -#include -#endif /* KERNEL_BUILD */ - /* * Arguments to reboot system call. */ diff --git a/bsd/sys/resource.h b/bsd/sys/resource.h index fbe8e6266..311e1f601 100644 --- a/bsd/sys/resource.h +++ b/bsd/sys/resource.h @@ -284,6 +284,7 @@ struct rlimit { #define IOPOL_NORMAL 1 #define IOPOL_PASSIVE 2 #define IOPOL_THROTTLE 3 +#define IOPOL_UTILITY 4 #ifdef PRIVATE /* diff --git a/bsd/sys/sem.h b/bsd/sys/sem.h index 18eeb2061..35689b2fc 100644 --- a/bsd/sys/sem.h +++ b/bsd/sys/sem.h @@ -164,16 +164,6 @@ struct sembuf { #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -/* - * System imposed limit on the value of the third parameter to semop(). - * This is arbitrary, and the standards unfortunately do not provide a - * way for user applications to retrieve this value (e.g. via sysconf() - * or from a manifest value in <sys/sem.h>). The value shown here is - * informational, and subject to change in future revisions. - */ -#define MAX_SOPS 5 /* maximum # of sembuf's per semop call */ - - /* * Union used as the fourth argument to semctl() in all cases. 
Specific * member values are used for different values of the third parameter: diff --git a/bsd/sys/sem_internal.h b/bsd/sys/sem_internal.h index 42fd3bffb..bf697a1ea 100644 --- a/bsd/sys/sem_internal.h +++ b/bsd/sys/sem_internal.h @@ -146,7 +146,7 @@ typedef union user_semun user_semun_t; #define SEMMSL SEMMNS /* max # of semaphores per id */ #endif #ifndef SEMOPM -#define SEMOPM 100 /* max # of operations per semop call */ +#define SEMOPM 5 /* max # of operations per semop call */ #endif diff --git a/bsd/sys/signal.h b/bsd/sys/signal.h index d2e40fd76..0ec175436 100644 --- a/bsd/sys/signal.h +++ b/bsd/sys/signal.h @@ -294,6 +294,9 @@ typedef struct user32_siginfo { user32_ulong_t __pad[7]; /* Reserved for Future Use */ } user32_siginfo_t; +void siginfo_user_to_user32(user_siginfo_t *, user32_siginfo_t *); +void siginfo_user_to_user64(user_siginfo_t *, user64_siginfo_t *); + #endif /* BSD_KERNEL_PRIVATE */ /* diff --git a/bsd/sys/signalvar.h b/bsd/sys/signalvar.h index 69ff9e15c..390d1b764 100644 --- a/bsd/sys/signalvar.h +++ b/bsd/sys/signalvar.h @@ -194,11 +194,11 @@ int sigprop[NSIG + 1] = { int coredump(struct proc *p); void execsigs(struct proc *p, thread_t thread); void gsignal(int pgid, int sig); -int issignal(struct proc *p); +int issignal_locked(struct proc *p); int CURSIG(struct proc *p); int clear_procsiglist(struct proc *p, int bit, int in_signalstart); int set_procsigmask(struct proc *p, int bit); -void postsig(int sig); +void postsig_locked(int sig); void siginit(struct proc *p) __attribute__((section("__TEXT, initcode"))); void trapsignal(struct proc *p, int sig, unsigned code); void pt_setrunnable(struct proc *p); diff --git a/bsd/sys/socket.h b/bsd/sys/socket.h index 3fc35997c..204f5363b 100644 --- a/bsd/sys/socket.h +++ b/bsd/sys/socket.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
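Note the SEMOPM change in sem_internal.h above: the per-call semop() operation limit drops from 100 to 5, matching the informational MAX_SOPS value that was removed from sem.h, and a semop() call naming more operations than SEMOPM fails with E2BIG. A small user-space sketch that stays within the limit (the IPC_PRIVATE setup is illustrative):

```c
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/sem.h>

int
main(void)
{
	/* two semaphores, private to this process */
	int id = semget(IPC_PRIVATE, 2, IPC_CREAT | 0600);
	/* one atomic semop() with two operations: well under SEMOPM */
	struct sembuf ops[2] = {
		{ 0, 1, 0 },	/* increment semaphore 0 */
		{ 1, 1, 0 },	/* increment semaphore 1 */
	};

	if (id == -1 || semop(id, ops, 2) == -1) {
		perror("semop");
		return (1);
	}
	printf("two-op semop succeeded\n");
	(void) semctl(id, 0, IPC_RMID);	/* clean up the semaphore set */
	return (0);
}
```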
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * @@ -140,12 +140,14 @@ struct iovec { #endif #ifdef PRIVATE -#define SO_TCDBG_PID 0x01 /* Set/get traffic class for PID */ -#define SO_TCDBG_PNAME 0x02 /* Set/get traffic class for processes of that name */ -#define SO_TCDBG_PURGE 0x04 /* Purge entries for unused PIDs */ -#define SO_TCDBG_FLUSH 0x08 /* Flush all entries */ -#define SO_TCDBG_COUNT 0x10 /* Get count of entries */ -#define SO_TCDBG_LIST 0x20 /* List entries */ +#define SO_TCDBG_PID 0x01 /* Set/get traffic class for PID */ +#define SO_TCDBG_PNAME 0x02 /* Set/get traffic class for processes of that name */ +#define SO_TCDBG_PURGE 0x04 /* Purge entries for unused PIDs */ +#define SO_TCDBG_FLUSH 0x08 /* Flush all entries */ +#define SO_TCDBG_COUNT 0x10 /* Get count of entries */ +#define SO_TCDBG_LIST 0x20 /* List entries */ +#define SO_TCDBG_DELETE 0x40 /* Delete a process entry */ +#define SO_TCDBG_TCFLUSH_PID 0x80 /* Flush traffic class for PID */ struct so_tcdbg { u_int32_t so_tcdbg_cmd; @@ -153,6 +155,7 @@ struct so_tcdbg { u_int32_t so_tcdbg_count; pid_t so_tcdbg_pid; char so_tcdbg_pname[MAXCOMLEN + 1]; + int32_t so_tcdbg_opportunistic; /* -1: unspecified, 0: off, 1: on, other: errors */ }; #endif /* PRIVATE */ @@ -192,9 +195,10 @@ struct so_tcdbg { #else #define SO_DONTTRUNC 0x2000 /* APPLE: Retain unread data */ /* (ATOMIC proto) */ -#define SO_WANTMORE 0x4000 /* APPLE: Give hint when more data ready */ +#define SO_WANTMORE 0x4000 /* APPLE: Give hint when more data ready */ #define SO_WANTOOBFLAG 0x8000 /* APPLE: Want OOB in MSG_FLAG on receive */ -#endif + +#endif /* (!__APPLE__) */ #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ /* @@ -234,12 +238,85 @@ struct so_tcdbg { #ifdef PRIVATE #define SO_EXECPATH 0x1085 /* Application Firewall Socket option */ -#define SO_TRAFFIC_CLASS 0x1086 /* Traffic class (int)*/ -#define SO_TC_BE 0 /* Best effort, normal */ -#define SO_TC_BK 1 /* Background, low priority or bulk traffic */ -#define SO_TC_VI 2 /* Interactive video, constant bit rate, low latency */ -#define SO_TC_VO 3 /* Interactive voice, constant bit rate, lowest latency */ -#define SO_TC_MAX 4 /* Max traffic class value */ +/* + * Traffic service class definitions (lowest to highest): + * + * SO_TC_BK_SYS + * "Background System-Initiated", high delay tolerant, high loss + * tolerant, elastic flow, variable size & long-lived. E.g. system- + * initiated iCloud synching or Time Capsule backup, for which there + * is no progress feedback. + * + * SO_TC_BK + * "Background", user-initiated, high delay tolerant, high loss tolerant, + * elastic flow, variable size. E.g. user-initiated iCloud synching or + * Time Capsule backup; or traffic from background applications, for which + * there is some progress feedback. + * + * SO_TC_BE + * "Best Effort", unclassified/standard. This is the default service + * class; pretty much a mix of everything. + * + * SO_TC_RD + * "Responsive Data", a notch higher than "Best Effort", medium delay + * tolerant, elastic & inelastic flow, bursty, long-lived. E.g. email, + * instant messaging, for which there is a sense of interactivity and + * urgency (user waiting for output). + * + * SO_TC_OAM + * "Operations, Administration, and Management", medium delay tolerant, + * low-medium loss tolerant, elastic & inelastic flows, variable size. + * E.g. VPN tunnels. + * + * SO_TC_AV + * "Multimedia Audio/Video Streaming", medium delay tolerant, low-medium + * loss tolerant, elastic flow, constant packet interval, variable rate & + * size. E.g. 
AirPlay playback (both video and audio). + * + * SO_TC_RV + * "Responsive Multimedia Audio/Video", low delay tolerant, low-medium + * loss tolerant, elastic flow, variable packet interval, rate and size. + * E.g. AirPlay mirroring, screen sharing. + * + * SO_TC_VI + * "Interactive Video", low delay tolerant, low-medium loss tolerant, + * elastic flow, constant packet interval, variable rate & size. E.g. + * FaceTime video. + * + * SO_TC_VO + * "Interactive Voice", low delay tolerant, low loss tolerant, inelastic + * flow, constant packet rate, somewhat fixed size. E.g. VoIP including + * FaceTime audio. + * + * SO_TC_CTL + * "Network Control", low delay tolerant, low loss tolerant, inelastic + * flow, rate is bursty but short, variable size. E.g. DNS queries; + * certain types of locally-originated ICMP, ICMPv6; IGMP/MLD join/leave, + * ARP. + */ +#define SO_TRAFFIC_CLASS 0x1086 /* Traffic service class (int) */ +#define SO_TC_BK_SYS 100 /* lowest class */ +#define SO_TC_BK 200 +#define SO_TC_BE 0 +#define SO_TC_RD 300 +#define SO_TC_OAM 400 +#define SO_TC_AV 500 +#define SO_TC_RV 600 +#define SO_TC_VI 700 +#define SO_TC_VO 800 +#define SO_TC_CTL 900 /* highest class */ +#define SO_TC_MAX 10 /* Total # of traffic classes */ +#ifdef XNU_KERNEL_PRIVATE +#define _SO_TC_BK 1 /* deprecated */ +#define _SO_TC_VI 2 /* deprecated */ +#define _SO_TC_VO 3 /* deprecated */ +#define _SO_TC_MAX 4 /* deprecated */ + +#define SO_VALID_TC(c) \ + (c == SO_TC_BK_SYS || c == SO_TC_BK || c == SO_TC_BE || \ + c == SO_TC_RD || c == SO_TC_OAM || c == SO_TC_AV || \ + c == SO_TC_RV || c == SO_TC_VI || c == SO_TC_VO || c == SO_TC_CTL) +#endif /* XNU_KERNEL_PRIVATE */ /* Background socket configuration flags */ #define TRAFFIC_MGT_SO_BACKGROUND 0x0001 /* background socket */ @@ -248,8 +325,21 @@ struct so_tcdbg { #define SO_RECV_TRAFFIC_CLASS 0x1087 /* Receive traffic class (bool)*/ #define SO_TRAFFIC_CLASS_DBG 0x1088 /* Debug traffic class (struct so_tcdbg) */ #define SO_TRAFFIC_CLASS_STATS 0x1089 /* Traffic class statistics */ +#define SO_PRIVILEGED_TRAFFIC_CLASS 0x1090 /* Privileged traffic class (bool) */ #define SO_DEFUNCTOK 0x1100 /* can be defunct'd */ #define SO_ISDEFUNCT 0x1101 /* get defunct status */ + +#define SO_OPPORTUNISTIC 0x1102 /* deprecated; use SO_TRAFFIC_CLASS */ + +/* + * SO_FLUSH flushes any unsent data generated by a given socket. It takes + * an integer parameter, which can be any of the SO_TC traffic class values, + * or the special SO_TC_ALL value. 
+ */ +#define SO_FLUSH 0x1103 /* flush unsent data (int) */ +#define SO_TC_ALL (-1) + +#define SO_RECV_ANYIF 0x1104 /* unrestricted inbound processing */ #endif /* PRIVATE */ #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ @@ -284,7 +374,8 @@ struct so_np_extensions { #ifdef KERNEL_PRIVATE #define SONPX_MASK_VALID (SONPX_SETOPTSHUT) -#endif +#define IS_SO_TC_BACKGROUND(_tc_) ((_tc_) == SO_TC_BK || (_tc_) == SO_TC_BK_SYS) +#endif /* KERNEL_PRIVATE */ #endif #endif @@ -360,7 +451,10 @@ struct so_np_extensions { #define AF_NETGRAPH 32 /* Netgraph sockets */ #endif #define AF_IEEE80211 37 /* IEEE 802.11 protocol */ -#define AF_MAX 38 +#ifdef __APPLE__ +#define AF_UTUN 38 +#endif +#define AF_MAX 39 #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ /* @@ -460,6 +554,9 @@ struct sockaddr_storage { #define PF_NETGRAPH AF_NETGRAPH #endif +#ifdef __APPLE__ +#define PF_UTUN AF_UTUN +#endif #define PF_MAX AF_MAX /* @@ -773,6 +870,8 @@ struct omsghdr { void *msg_accrights; /* access rights sent/rcvd */ int msg_accrightslen; }; + +#define SA(s) ((struct sockaddr *)(void *)(s)) #endif /* KERNEL_PRIVATE */ #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ diff --git a/bsd/sys/socketvar.h b/bsd/sys/socketvar.h index 3c81716fe..06e6f5c41 100644 --- a/bsd/sys/socketvar.h +++ b/bsd/sys/socketvar.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -128,10 +128,10 @@ struct accept_filter; struct socket { int so_zone; /* zone we were allocated from */ short so_type; /* generic type, see socket.h */ - short so_options; /* from socket call, see socket.h */ + u_int32_t so_options; /* from socket call, see socket.h */ short so_linger; /* time to linger while closing */ short so_state; /* internal state flags SS_*, below */ - void *so_pcb; /* protocol control block */ + void *so_pcb; /* protocol control block */ struct protosw *so_proto; /* protocol handle */ /* * Variables for connection queueing. @@ -179,7 +179,7 @@ struct socket { struct selinfo sb_sel; /* process selecting read/write */ short sb_flags; /* flags, see below */ struct timeval sb_timeo; /* timeout for read/write */ - u_int sb_maxused; /* max char count ever used in sockbuf */ + u_int32_t sb_idealsize; /* Ideal size for the sb based on bandwidth and delay */ void *reserved1[4]; /* for future use */ } so_rcv, so_snd; #define SB_MAX (8192*1024) /* default for max chars in sockbuf */ @@ -199,15 +199,16 @@ struct socket { #define SB_NOTIFY (SB_WAIT|SB_SEL|SB_ASYNC) #define SB_DROP 0x400 /* does not accept any more data */ #define SB_UNIX 0x800 /* UNIX domain socket buffer */ +#define SB_AUTOSIZE 0x1000 /* automatically size socket buffer */ +#define SB_TRIM 0x2000 /* Trim the socket buffer */ #define SB_RECV 0x8000 /* this is rcv sb */ - caddr_t so_tpcb; /* Wisc. protocol control block - XXX unused? */ + caddr_t so_tpcb; /* Wisc. protocol control block, used by some kexts */ #endif void (*so_upcall)(struct socket *so, caddr_t arg, int waitf); caddr_t so_upcallarg; /* Arg for above */ - uid_t so_uid; /* who opened the socket */ - gid_t so_gid; /* gid of whoever opened the socket */ + kauth_cred_t so_cred; /* cred of who opened the socket */ /* NB: generation count must not be first; easiest to make it last. 
*/ so_gen_t so_gencnt; /* generation count */ #ifndef __APPLE__ @@ -234,7 +235,6 @@ struct socket { #define SOF_PCBCLEARING 0x4 /* pru_disconnect done; don't call pru_detach */ #define SOF_DEFUNCT 0x8 /* socket marked as inactive */ #define SOF_CLOSEWAIT 0x10 /* blocked in close awaiting some events */ -#define SOF_UPCALLINUSE 0x20 /* socket upcall is currently in progress */ #define SOF_REUSESHAREUID 0x40 /* Allows SO_REUSEADDR/SO_REUSEPORT for multiple so_uid */ #define SOF_MULTIPAGES 0x80 /* jumbo clusters may be used for sosend */ #define SOF_ABORTED 0x100 /* soabort was already called once on the socket */ @@ -247,7 +247,13 @@ struct socket { #define SOF_NPX_SETOPTSHUT 0x2000 /* Non POSIX extension to allow setsockopt(2) after shut down */ #define SOF_RECV_TRAFFIC_CLASS 0x4000 /* Receive traffic class as ancillary data */ #define SOF_NODEFUNCT 0x8000 /* socket cannot be defunct'd */ -#define SOF_INCOMP_INPROGRESS 0x10000 /* incomp socket still being processed */ +#define SOF_PRIVILEGED_TRAFFIC_CLASS 0x10000 /* traffic class is privileged */ +#define SOF_SUSPENDED 0x20000 /* interface output queue is suspended */ +#define SOF_INCOMP_INPROGRESS 0x40000 /* incomp socket still being processed */ +#define SOF_NOTSENT_LOWAT 0x80000 /* A different lowat on not sent data has been set */ +#define SOF_KNOTE 0x100000 /* socket is on the EV_SOCK klist */ +#define SOF_USELRO 0x200000 /* TCP must use LRO on these sockets */ + uint32_t so_upcallusecount; /* number of upcalls in progress */ int so_usecount; /* refcounting of socket use */; int so_retaincnt; u_int32_t so_filteruse; /* usecount for the socket filters */ @@ -268,12 +274,12 @@ struct socket { struct label *so_peerlabel; /* cached MAC label for socket peer */ thread_t so_background_thread; /* thread that marked this socket background */ int so_traffic_class; - + // last process to interact with this socket u_int64_t last_upid; pid_t last_pid; - struct data_stats so_tc_stats[SO_TC_STATS_MAX]; + struct klist so_klist; /* klist for EV_SOCK events */ }; /* Control message accessor in mbufs */ @@ -285,8 +291,8 @@ struct socket { #define M_FIRST_CMSGHDR(m) \ ((char *)(m) != (char *)0L && (size_t)(m)->m_len >= sizeof(struct cmsghdr) && \ - (socklen_t)(m)->m_len >= __DARWIN_ALIGN32(((struct cmsghdr *)(m)->m_data)->cmsg_len) ?\ - (struct cmsghdr *)(m)->m_data : \ + (socklen_t)(m)->m_len >= __DARWIN_ALIGN32(((struct cmsghdr *)(void *)(m)->m_data)->cmsg_len) ?\ + (struct cmsghdr *)(void *)(m)->m_data : \ (struct cmsghdr *)0L) #define M_NXT_CMSGHDR(m, cmsg) \ @@ -294,7 +300,7 @@ struct socket { _MIN_NXT_CMSGHDR_PTR(cmsg) > ((char *)(m)->m_data) + (m)->m_len || \ _MIN_NXT_CMSGHDR_PTR(cmsg) < (char *)(m)->m_data ? \ (struct cmsghdr *)0L /* NULL */ : \ - (struct cmsghdr *)((unsigned char *)(cmsg) + \ + (struct cmsghdr *)(void *)((unsigned char *)(cmsg) + \ __DARWIN_ALIGN32((__uint32_t)(cmsg)->cmsg_len))) #endif /* KERNEL_PRIVATE */ @@ -327,6 +333,12 @@ struct socket { #define _XSOCKET_PTR(x) x #endif +#ifdef PRIVATE +/* Flags returned in data field for EVFILT_SOCK events. 
*/ +#define SOCKEV_CONNECTED 0x00000001 /* connected */ +#define SOCKEV_DISCONNECTED 0x00000002 /* disconnected */ +#endif /* PRIVATE */ + #pragma pack(4) struct xsockbuf { @@ -404,7 +416,7 @@ struct xsocket_n { u_int32_t xso_kind; /* XSO_SOCKET */ u_int64_t xso_so; /* makes a convenient handle */ short so_type; - short so_options; + u_int32_t so_options; short so_linger; short so_state; u_int64_t so_pcb; /* another convenient handle */ @@ -488,8 +500,25 @@ struct kextcb { #define sotokextcb(so) (so ? so->so_ext : 0) #ifdef KERNEL - -#define SO_FILT_HINT_LOCKED 0x1 +#include + +/* Hints for socket event processing */ +#define SO_FILT_HINT_LOCKED 0x00000001 /* socket is already locked */ +#define SO_FILT_HINT_CONNRESET 0x00000002 /* Reset is received */ +#define SO_FILT_HINT_CANTRCVMORE 0x00000004 /* No more data to read */ +#define SO_FILT_HINT_CANTSENDMORE 0x00000008 /* Can't write more data */ +#define SO_FILT_HINT_TIMEOUT 0x00000010 /* timeout */ +#define SO_FILT_HINT_NOSRCADDR 0x00000020 /* No src address available */ +#define SO_FILT_HINT_IFDENIED 0x00000040 /* interface denied connection */ +#define SO_FILT_HINT_SUSPEND 0x00000080 /* output queue suspended */ +#define SO_FILT_HINT_RESUME 0x00000100 /* output queue resumed */ +#define SO_FILT_HINT_KEEPALIVE 0x00000200 /* TCP Keepalive received */ + +#define SO_FILT_HINT_EV (SO_FILT_HINT_CONNRESET | \ + SO_FILT_HINT_CANTRCVMORE | SO_FILT_HINT_CANTSENDMORE | \ + SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR | \ + SO_FILT_HINT_IFDENIED | SO_FILT_HINT_SUSPEND | \ + SO_FILT_HINT_RESUME | SO_FILT_HINT_KEEPALIVE) /* * Argument structure for sosetopt et seq. This is in the KERNEL @@ -527,6 +556,7 @@ extern int socket_debug; extern int sosendjcl; extern int sosendjcl_ignore_capab; extern int sodefunctlog; +extern int sothrottlelog; extern int somaxconn; struct file; @@ -552,8 +582,20 @@ struct so_tcdbg; } \ } +#define SB_MB_CHECK(sb) do { \ + if (((sb)->sb_mb != NULL && \ + (sb)->sb_cc == 0) || \ + ((sb)->sb_mb == NULL && \ + (sb)->sb_cc > 0)) \ + panic("corrupt so_rcv: sb_mb %p sb_cc %d\n", \ + (sb)->sb_mb, (sb)->sb_cc); \ +} while(0) + + #define SODEFUNCTLOG(x) do { if (sodefunctlog) printf x; } while (0) +#define SOTHROTTLELOG(x) do { if (sothrottlelog) printf x; } while (0) + /* * For debugging traffic class behaviors */ @@ -562,7 +604,8 @@ #define SOTCDB_NO_SENDTCPBG 0x04 /* Do not use background TCP CC algorithm for sender */ #define SOTCDB_NO_LCLTST 0x08 /* Do not test for local destination for setting DSCP */ #define SOTCDB_NO_DSCPTST 0x10 /* Overwrite any existing DSCP code */ -#define SOTCDB_NO_RECVTCPBG 0x20 /* Do not use throttling on receiver-side of TCP */ +#define SOTCDB_NO_RECVTCPBG 0x20 /* Do not use throttling on receiver-side of TCP */ +#define SOTCDB_NO_PRIVILEGED 0x40 /* Do not set privileged traffic flag */ extern u_int32_t sotcdb; @@ -630,7 +673,8 @@ extern void soisconnecting(struct socket *so); extern void soisdisconnected(struct socket *so); extern void sodisconnectwakeup(struct socket *so); extern void soisdisconnecting(struct socket *so); -extern int soisbackground(struct socket *so); +extern int soisthrottled(struct socket *so); +extern int soisprivilegedtraffic(struct socket *so); extern int solisten(struct socket *so, int backlog); extern struct socket *sodropablereq(struct socket *head); extern struct socket *sonewconn(struct socket *head, int connstatus, @@ -643,15 +687,34 @@ extern int socket_unlock(struct socket *so, int refcount); extern void sofreelastref(struct socket *, int); extern int 
sogetaddr_locked(struct socket *, struct sockaddr **, int); extern const char *solockhistory_nr(struct socket *); -extern void set_packet_tclass(struct mbuf *, struct socket *, int, int); -extern int mbuf_traffic_class_from_control(struct mbuf *); +extern void soevent(struct socket *so, long hint); +extern void get_sockev_state(struct socket *, u_int32_t *); + +#ifdef BSD_KERNEL_PRIVATE +/* Service class flags used for setting service class on a packet */ +#define PKT_SCF_IPV6 0x00000001 /* IPv6 packet */ +#define PKT_SCF_TCP_ACK 0x00000002 /* Pure TCP ACK */ + +extern void set_packet_service_class(struct mbuf *, struct socket *, + mbuf_svc_class_t, u_int32_t); +extern void so_tc_update_stats(struct mbuf *, struct socket *, mbuf_svc_class_t ); +extern mbuf_svc_class_t mbuf_service_class_from_control(struct mbuf *); +extern mbuf_svc_class_t so_tc2msc(int); +extern int so_svc2tc(mbuf_svc_class_t); + extern void set_tcp_stream_priority(struct socket *so); extern int so_set_traffic_class(struct socket *, int); extern void so_set_default_traffic_class(struct socket *); +extern int so_set_opportunistic(struct socket *, int); +extern int so_get_opportunistic(struct socket *); +extern int so_set_recv_anyif(struct socket *, int); +extern int so_get_recv_anyif(struct socket *); extern void socket_tclass_init(void); extern int so_set_tcdbg(struct socket *, struct so_tcdbg *); extern int sogetopt_tcdbg(struct socket *, struct sockopt *); extern void so_recv_data_stat(struct socket *, struct mbuf *, size_t); +extern int so_wait_for_if_feedback(struct socket *); +#endif /* BSD_KERNEL_PRIVATE */ /* * XXX; prepare mbuf for (__FreeBSD__ < 3) routines. diff --git a/bsd/sys/sockio.h b/bsd/sys/sockio.h index 3a6b1371b..98009542c 100644 --- a/bsd/sys/sockio.h +++ b/bsd/sys/sockio.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
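The reworked SO_TRAFFIC_CLASS option above takes one of the ten SO_TC_* service-class values as an int at SOL_SOCKET level. Since the option lives under #ifdef PRIVATE it is not in the public SDK headers, so a sketch has to hard-code the values from the patch (hypothetical example; on a kernel without this support the call simply fails):

```c
#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_TRAFFIC_CLASS		/* private option, per this patch */
#define SO_TRAFFIC_CLASS	0x1086
#endif
#define MY_SO_TC_BK		200	/* "Background" class from above */

int
main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int tc = MY_SO_TC_BK;

	if (fd == -1 || setsockopt(fd, SOL_SOCKET, SO_TRAFFIC_CLASS,
	    &tc, sizeof (tc)) == -1) {
		perror("SO_TRAFFIC_CLASS");
		return (1);
	}
	printf("socket marked SO_TC_BK (background)\n");
	return (0);
}
```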
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -223,7 +223,18 @@ #ifdef PRIVATE #define SIOCGIFGETRTREFCNT _IOWR('i', 137, struct ifreq) /* get interface route refcnt */ +#define SIOCGIFLINKQUALITYMETRIC _IOWR('i', 138, struct ifreq) /* get LQM */ +#define SIOCSIFOPPORTUNISTIC _IOWR('i', 139, struct ifreq) /* deprecated; use SIOCSIFTHROTTLE */ +#define SIOCGIFOPPORTUNISTIC _IOWR('i', 140, struct ifreq) /* deprecated; use SIOCGIFTHROTTLE */ +#define SIOCSETROUTERMODE _IOWR('i', 141, struct ifreq) /* enable/disable IPv4 router mode on interface */ +#define SIOCGIFEFLAGS _IOWR('i', 142, struct ifreq) /* get extended ifnet flags */ +#define SIOCSIFDESC _IOWR('i', 143, struct if_descreq) +#define SIOCGIFDESC _IOWR('i', 144, struct if_descreq) +#define SIOCSIFLINKPARAMS _IOWR('i', 145, struct if_linkparamsreq) +#define SIOCGIFLINKPARAMS _IOWR('i', 146, struct if_linkparamsreq) +#define SIOCGIFQUEUESTATS _IOWR('i', 147, struct if_qstatsreq) +#define SIOCSIFTHROTTLE _IOWR('i', 148, struct if_throttlereq) +#define SIOCGIFTHROTTLE _IOWR('i', 149, struct if_throttlereq) #endif /* PRIVATE */ - #endif /* !_SYS_SOCKIO_H_ */ diff --git a/bsd/sys/spawn.h b/bsd/sys/spawn.h index 4947902dd..85377d5f0 100644 --- a/bsd/sys/spawn.h +++ b/bsd/sys/spawn.h @@ -60,11 +60,13 @@ #define POSIX_SPAWN_START_SUSPENDED 0x0080 #ifdef PRIVATE #define _POSIX_SPAWN_DISABLE_ASLR 0x0100 -#define _POSIX_SPAWN_ALLOW_DATA_EXEC 0x2000 +/* unused 0x0200 */ +/* for compat sake */ #define POSIX_SPAWN_OSX_TALAPP_START 0x0400 #define POSIX_SPAWN_OSX_WIDGET_START 0x0800 #define POSIX_SPAWN_OSX_DBCLIENT_START 0x0800 /* not a bug, same as widget just rename */ -#define POSIX_SPAWN_IOS_APP_START 0x1000 +#define POSIX_SPAWN_OSX_RESVAPP_START 0x1000 /* reserved for app start usages */ +#define _POSIX_SPAWN_ALLOW_DATA_EXEC 0x2000 #endif /* PRIVATE */ #define POSIX_SPAWN_CLOEXEC_DEFAULT 0x4000 diff --git a/bsd/sys/spawn_internal.h b/bsd/sys/spawn_internal.h index d29526095..42f4687f8 100644 --- a/bsd/sys/spawn_internal.h +++ b/bsd/sys/spawn_internal.h @@ -38,7 +38,7 @@ */ #ifndef _SYS_SPAWN_INTERNAL_H_ -#define _SYS_SPAWN__INTERNALH_ +#define _SYS_SPAWN_INTERNAL_H_ #include /* __offsetof(), __darwin_size_t */ #include /* PATH_MAX */ @@ -64,7 +64,7 @@ typedef enum { typedef struct _ps_port_action { pspa_t port_type; exception_mask_t mask; - mach_port_t new_port; + mach_port_name_t new_port; exception_behavior_t behavior; thread_state_flavor_t flavor; int which; @@ -99,10 +99,55 @@ typedef struct _posix_spawnattr { sigset_t psa_sigmask; /* signal set to mask */ pid_t psa_pgroup; /* pgroup to spawn into */ cpu_type_t psa_binprefs[NBINPREFS]; /* cpu affinity prefs*/ - _posix_spawn_port_actions_t psa_ports; /* special/exception ports */ int psa_pcontrol; /* process control bits on resource starvation */ + int psa_apptype; /* app type and process spec behav */ + uint64_t psa_cpumonitor_percent; /* CPU usage monitor percentage */ + uint64_t psa_cpumonitor_interval; /* CPU usage monitor interval, in seconds */ + _posix_spawn_port_actions_t psa_ports; /* special/exception ports */ + /* XXX - k64/u32 unaligned below here */ +#if CONFIG_MEMORYSTATUS || CONFIG_EMBEDDED || TARGET_OS_EMBEDDED + /* Jetsam related */ + short psa_jetsam_flags; /* flags */ + int psa_priority; /* relative importance */ + int psa_high_water_mark; /* resident page count limit */ +#endif } *_posix_spawnattr_t; +/* + * Jetsam flags + */ +#if CONFIG_MEMORYSTATUS || CONFIG_EMBEDDED || TARGET_OS_EMBEDDED +#define POSIX_SPAWN_JETSAM_USE_EFFECTIVE_PRIORITY 0x1 +#endif + +/* + * 
DEPRECATED: maintained for transition purposes only + * posix_spawn apptype settings. + */ +#if TARGET_OS_EMBEDDED || CONFIG_EMBEDDED +/* for compat sake */ +#define POSIX_SPAWN_OSX_TALAPP_START 0x0400 +#define POSIX_SPAWN_IOS_RESV1_APP_START 0x0400 +#define POSIX_SPAWN_IOS_APPLE_DAEMON_START 0x0800 /* not a bug, same as widget just rename */ +#define POSIX_SPAWN_IOS_APP_START 0x1000 +#else /* TARGET_OS_EMBEDDED */ +#define POSIX_SPAWN_OSX_TALAPP_START 0x0400 +#define POSIX_SPAWN_OSX_WIDGET_START 0x0800 +#define POSIX_SPAWN_OSX_DBCLIENT_START 0x0800 /* not a bug, same as widget just rename */ +#define POSIX_SPAWN_OSX_RESVAPP_START 0x1000 /* reserved for app start usages */ +#endif /* TARGET_OS_EMBEDDED */ + + +/* + * posix_spawn apptype and process attribute settings. + */ +#if TARGET_OS_EMBEDDED || CONFIG_EMBEDDED +#define POSIX_SPAWN_APPTYPE_IOS_APPLEDAEMON 0x0001 /* it is an iOS apple daemon */ +#else /* TARGET_OS_EMBEDDED */ +#define POSIX_SPAWN_APPTYPE_OSX_TAL 0x0001 /* it is a TAL app */ +#define POSIX_SPAWN_APPTYPE_OSX_WIDGET 0x0002 /* it is a widget */ +#define POSIX_SPAWN_APPTYPE_DELAYIDLESLEEP 0x10000000 /* Process is marked to delay idle sleep on disk IO */ +#endif /* TARGET_OS_EMBEDDED */ /* * Allowable posix_spawn() file actions @@ -190,7 +235,7 @@ struct _posix_spawn_args_desc { __darwin_size_t file_actions_size; /* size of file actions block */ _posix_spawn_file_actions_t file_actions; /* pointer to block */ - __darwin_size_t port_actions_size; /* size of port actions block */ + __darwin_size_t port_actions_size; /* size of port actions block */ _posix_spawn_port_actions_t port_actions; /* pointer to port block */ }; diff --git a/bsd/sys/sysctl.h b/bsd/sys/sysctl.h index a1f06467a..29cec4776 100644 --- a/bsd/sys/sysctl.h +++ b/bsd/sys/sysctl.h @@ -544,7 +544,9 @@ SYSCTL_DECL(_user); #define KERN_KDGETENTROPY 16 #define KERN_KDWRITETR 17 #define KERN_KDWRITEMAP 18 - +#define KERN_KDENABLE_BG_TRACE 19 +#define KERN_KDDISABLE_BG_TRACE 20 +#define KERN_KDSET_TYPEFILTER 22 /* KERN_PANICINFO types */ #define KERN_PANICINFO_MAXSIZE 1 /* quad: panic UI image size limit */ diff --git a/bsd/sys/sysent.h b/bsd/sys/sysent.h index df71d010f..f2560fd79 100644 --- a/bsd/sys/sysent.h +++ b/bsd/sys/sysent.h @@ -56,7 +56,7 @@ extern struct sysent sysent[]; #endif /* __INIT_SYSENT_C__ */ extern int nsysent; -#define NUM_SYSENT 439 /* Current number of defined syscalls */ +#define NUM_SYSENT 440 /* Current number of defined syscalls */ /* sy_funnel flags bits */ #define FUNNEL_MASK 0x07f diff --git a/bsd/sys/syslimits.h b/bsd/sys/syslimits.h index a020b3919..56528fa2a 100644 --- a/bsd/sys/syslimits.h +++ b/bsd/sys/syslimits.h @@ -73,8 +73,8 @@ * compile time; you *cannot* set it higher than the hard limit!! */ #define ARG_MAX (256 * 1024) /* max bytes for an exec function */ -#define CHILD_MAX 266 /* max simultaneous processes */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define CHILD_MAX 266 /* max simultaneous processes */ #define GID_MAX 2147483647U /* max value for a gid_t (2^31-2) */ #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ #define LINK_MAX 32767 /* max file link count */ diff --git a/bsd/sys/syslog.h b/bsd/sys/syslog.h index 71004cf2a..71f546ae5 100644 --- a/bsd/sys/syslog.h +++ b/bsd/sys/syslog.h @@ -229,9 +229,9 @@ __BEGIN_DECLS void closelog(void); void openlog(const char *, int, int); int setlogmask(int); -void syslog(int, const char *, ...) __printflike(2, 3) __DARWIN_LDBL_COMPAT(syslog); +void syslog(int, const char *, ...) 
__printflike(2, 3); #if __DARWIN_C_LEVEL >= __DARWIN_C_FULL -void vsyslog(int, const char *, __darwin_va_list) __printflike(2, 0) __DARWIN_LDBL_COMPAT(vsyslog); +void vsyslog(int, const char *, __darwin_va_list) __printflike(2, 0); #endif __END_DECLS diff --git a/bsd/sys/systm.h b/bsd/sys/systm.h index f08bc477c..263e9223b 100644 --- a/bsd/sys/systm.h +++ b/bsd/sys/systm.h @@ -229,11 +229,22 @@ void throttle_info_release(void *throttle_info); void throttle_info_update(void *throttle_info, int flags); uint32_t throttle_lowpri_io(int sleep_amount); void throttle_set_thread_io_policy(int policy); + typedef struct __throttle_info_handle *throttle_info_handle_t; -int throttle_info_ref_by_mask( - uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle); +int throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle); void throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle); void throttle_info_update_by_mask(void *throttle_info_handle, int flags); + +void throttle_legacy_process_incr(void); +void throttle_legacy_process_decr(void); + +/* + * 'throttle_info_handle' acquired via 'throttle_info_ref_by_mask' + * 'policy' should be specified as either IOPOL_UTILITY or IOPOL_THROTTLE; + * all other values will be treated as IOPOL_NORMAL (i.e. no throttling) + */ +int throttle_info_io_will_be_throttled(void *throttle_info_handle, int policy); + __END_DECLS #endif /* !_SYS_SYSTM_H_ */ diff --git a/bsd/sys/tty.h b/bsd/sys/tty.h index ecfb234d5..28ee788a5 100644 --- a/bsd/sys/tty.h +++ b/bsd/sys/tty.h @@ -144,6 +144,7 @@ struct tty { int t_lowat; /* Low water mark. */ int t_gen; /* Generation number. */ void *t_iokit; /* IOKit management */ + int t_refcnt; /* reference count */ }; #define TTY_NULL (struct tty *)0 diff --git a/bsd/sys/ubc.h b/bsd/sys/ubc.h index 4ee9e86cf..37bcdfcff 100644 --- a/bsd/sys/ubc.h +++ b/bsd/sys/ubc.h @@ -109,6 +109,7 @@ int ubc_upl_commit(upl_t); int ubc_upl_commit_range(upl_t, upl_offset_t, upl_size_t, int); int ubc_upl_abort(upl_t, int); int ubc_upl_abort_range(upl_t, upl_offset_t, upl_size_t, int); +void ubc_upl_range_needed(upl_t, int, int); upl_page_info_t *ubc_upl_pageinfo(upl_t); upl_size_t ubc_upl_maxbufsize(void); diff --git a/bsd/sys/un.h b/bsd/sys/un.h index 400620396..9a75d32c3 100644 --- a/bsd/sys/un.h +++ b/bsd/sys/un.h @@ -89,6 +89,7 @@ struct sockaddr_un { /* Socket options. */ #define LOCAL_PEERCRED 0x001 /* retrieve peer credentials */ +#define LOCAL_PEERPID 0x002 /* retrieve peer pid */ #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ diff --git a/bsd/sys/unpcb.h b/bsd/sys/unpcb.h index a50aebe36..d4d566654 100644 --- a/bsd/sys/unpcb.h +++ b/bsd/sys/unpcb.h @@ -166,7 +166,7 @@ struct unpcb { _UNPCB_LIST_ENTRY(unpcb_compat) unp_link; /* glue on list of all PCBs */ _UNPCB_PTR(struct socket *) unp_socket; /* pointer back to socket */ _UNPCB_PTR(struct vnode *) unp_vnode; /* if associated with file */ - ino_t unp_ino; /* fake inode number */ + u_int32_t unp_ino; /* fake inode number */ _UNPCB_PTR(struct unpcb_compat *) unp_conn; /* control block of connected socket */ #if defined(KERNEL) u_int32_t unp_refs; diff --git a/bsd/sys/user.h b/bsd/sys/user.h index 4a59aa866..b5c1106e5 100644 --- a/bsd/sys/user.h +++ b/bsd/sys/user.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
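The new LOCAL_PEERPID option in un.h above complements LOCAL_PEERCRED: on a connected AF_UNIX socket it returns the peer's pid as a pid_t at level SOL_LOCAL. A minimal sketch using a socketpair, where the peer is the process itself (the fallback defines cover SDKs that predate this patch):

```c
#include <stdio.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/un.h>
#include <unistd.h>

#ifndef SOL_LOCAL
#define SOL_LOCAL	0	/* option level for AF_UNIX sockets */
#endif
#ifndef LOCAL_PEERPID
#define LOCAL_PEERPID	0x002	/* added by this patch */
#endif

int
main(void)
{
	int sv[2];
	pid_t pid;
	socklen_t len = sizeof (pid);

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1 ||
	    getsockopt(sv[0], SOL_LOCAL, LOCAL_PEERPID, &pid, &len) == -1) {
		perror("LOCAL_PEERPID");
		return (1);
	}
	/* both ends live in this process, so the peer is ourselves */
	printf("peer pid %d (my pid %d)\n", (int)pid, (int)getpid());
	return (0);
}
```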
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -80,7 +80,6 @@ #include /* XXX */ #include - #ifdef KERNEL #ifdef BSD_KERNEL_PRIVATE #include /* for uu_kwe entry */ @@ -103,6 +102,8 @@ struct vfs_context { /* XXX Deprecated: xnu source compatability */ #define uu_ucred uu_context.vc_ucred +struct label; /* MAC label dummy struct */ + #define MAXTHREADNAMESIZE 64 /* * Per-thread U area. @@ -159,10 +160,8 @@ struct uthread { caddr_t uu_wchan; /* sleeping thread wait channel */ const char *uu_wmesg; /* ... wait message */ int uu_flag; -#if CONFIG_EMBEDDED - int uu_iopol_disk; /* disk I/O policy */ -#endif /* CONFIG_EMBEDDED */ struct proc * uu_proc; + thread_t uu_thread; void * uu_userstate; wait_queue_set_t uu_wqset; /* cached across select calls */ size_t uu_allocsize; /* ...size of select cache */ @@ -177,11 +176,13 @@ struct uthread { struct kaudit_record *uu_ar; /* audit record */ struct task* uu_aio_task; /* target task for async io */ - + + u_int32_t uu_network_lock_held; /* network support for pf locking */ lck_mtx_t *uu_mtx; + TAILQ_ENTRY(uthread) uu_throttlelist; /* List of uthreads currently throttled */ + int uu_on_throttlelist; int uu_lowpri_window; - boolean_t uu_throttle_isssd; boolean_t uu_throttle_bc; void * uu_throttle_info; /* pointer to throttled I/Os info */ @@ -203,8 +204,8 @@ struct uthread { uint32_t t_dtrace_errno; /* Most recent errno */ uint8_t t_dtrace_stop; /* indicates a DTrace desired stop */ uint8_t t_dtrace_sig; /* signal sent via DTrace's raise() */ - uint64_t t_dtrace_resumepid; /* DTrace's pidresume() pid */ - + uint64_t t_dtrace_resumepid; /* DTrace's pidresume() pid */ + union __tdu { struct __tds { uint8_t _t_dtrace_on; /* hit a fasttrap tracepoint */ @@ -236,10 +237,12 @@ struct uthread { #if __sol64 || defined(__APPLE__) uint64_t t_dtrace_regv; /* DTrace saved reg from fasttrap */ #endif + void * t_dtrace_syscall_args; #endif /* CONFIG_DTRACE */ void * uu_threadlist; char * pth_name; struct ksyn_waitq_element uu_kwe; /* user for pthread synch */ + struct label * uu_label; /* MAC label */ }; typedef struct uthread * uthread_t; @@ -256,9 +259,7 @@ typedef struct uthread * uthread_t; #define UT_PASSIVE_IO 0x00000100 /* this thread issues passive I/O */ #define UT_PROCEXIT 0x00000200 /* this thread completed the proc exit */ #define UT_RAGE_VNODES 0x00000400 /* rapid age any vnodes created by this thread */ -#if CONFIG_EMBEDDED -#define UT_BACKGROUND 0x00000800 /* this thread is in background state */ -#endif /* !CONFIG_EMBEDDED */ +/* 0x00000800 unused, used to be UT_BACKGROUND */ #define UT_BACKGROUND_TRAFFIC_MGT 0x00001000 /* background traffic is regulated */ #define UT_VFORK 0x02000000 /* thread has vfork children */ diff --git a/bsd/sys/vnode.h b/bsd/sys/vnode.h index 965518cb0..1a9343729 100644 --- a/bsd/sys/vnode.h +++ b/bsd/sys/vnode.h @@ -145,8 +145,11 @@ enum vtagtype { #define IO_BACKGROUND IO_PASSIVE /* used for backward compatibility. to be removed after IO_BACKGROUND is no longer * used by DiskImages in-kernel mode */ #define IO_NOAUTH 0x8000 /* No authorization checks. 
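/*
 * Illustrative sketch, not part of the patch: the uu_throttlelist /
 * uu_on_throttlelist fields added above imply a FIFO queue of throttled
 * threads built on <sys/queue.h>. 'struct thr', 'throttled_q', and the
 * park/unpark helpers are invented stand-ins, not kernel symbols.
 */
#include <sys/queue.h>
#include <stddef.h>

struct thr {
	TAILQ_ENTRY(thr) link;		/* linkage, like uu_throttlelist */
	int on_list;			/* mirrors uu_on_throttlelist */
};

static TAILQ_HEAD(, thr) throttled_q = TAILQ_HEAD_INITIALIZER(throttled_q);

static void
throttle_park(struct thr *t)
{
	if (!t->on_list) {		/* never double-insert */
		TAILQ_INSERT_TAIL(&throttled_q, t, link);
		t->on_list = 1;
	}
}

static struct thr *
throttle_unpark_one(void)
{
	struct thr *t = TAILQ_FIRST(&throttled_q);

	if (t != NULL) {		/* oldest waiter leaves first */
		TAILQ_REMOVE(&throttled_q, t, link);
		t->on_list = 0;
	}
	return (t);
}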
*/ -#define IO_NODIRECT 0x10000 /* don't use direct synchronous writes if IO_NOCACHE is specified */ - +#define IO_NODIRECT 0x10000 /* don't use direct synchronous writes if IO_NOCACHE is specified */ +#define IO_ENCRYPTED 0x20000 /* Retrieve encrypted blocks from the filesystem */ +#define IO_RETURN_ON_THROTTLE 0x40000 +#define IO_SINGLE_WRITER 0x80000 +#define IO_SYSCALL_DISPATCH 0x100000 /* I/O origin is file table syscall */ /* * Component Name: this structure describes the pathname @@ -466,7 +469,7 @@ struct vnode_trigger_param { * Note that this structure may be extended, but existing fields must not move. */ -#define VATTR_INIT(v) do {(v)->va_supported = (v)->va_active = 0ll; (v)->va_vaflags = 0;} while(0) +#define VATTR_INIT(v) do {(v)->va_supported = (v)->va_active = 0ll; (v)->va_vaflags = 0; } while(0) #define VATTR_SET_ACTIVE(v, a) ((v)->va_active |= VNODE_ATTR_ ## a) #define VATTR_SET_SUPPORTED(v, a) ((v)->va_supported |= VNODE_ATTR_ ## a) #define VATTR_IS_SUPPORTED(v, a) ((v)->va_supported & VNODE_ATTR_ ## a) @@ -517,6 +520,8 @@ struct vnode_trigger_param { #define VNODE_ATTR_va_nchildren (1LL<<28) /* 10000000 */ #define VNODE_ATTR_va_dirlinkcount (1LL<<29) /* 20000000 */ #define VNODE_ATTR_va_addedtime (1LL<<30) /* 40000000 */ +#define VNODE_ATTR_va_dataprotect_class (1LL<<31) /* 80000000 */ +#define VNODE_ATTR_va_dataprotect_flags (1LL<<32) /* 100000000 */ #define VNODE_ATTR_BIT(n) (VNODE_ATTR_ ## n) /* @@ -537,8 +542,8 @@ struct vnode_trigger_param { VNODE_ATTR_BIT(va_name) | \ VNODE_ATTR_BIT(va_type) | \ VNODE_ATTR_BIT(va_nchildren) | \ - VNODE_ATTR_BIT(va_dirlinkcount)| \ - VNODE_ATTR_BIT(va_addedtime)) + VNODE_ATTR_BIT(va_dirlinkcount) | \ + VNODE_ATTR_BIT(va_addedtime)) /* * Attributes that can be applied to a new file object. */ @@ -554,7 +559,9 @@ struct vnode_trigger_param { VNODE_ATTR_BIT(va_encoding) | \ VNODE_ATTR_BIT(va_type) | \ VNODE_ATTR_BIT(va_uuuid) | \ - VNODE_ATTR_BIT(va_guuid)) + VNODE_ATTR_BIT(va_guuid) | \ + VNODE_ATTR_BIT(va_dataprotect_class) | \ + VNODE_ATTR_BIT(va_dataprotect_flags)) struct vnode_attr { @@ -617,11 +624,21 @@ struct vnode_attr { #else void * va_reserved1; #endif /* BSD_KERNEL_PRIVATE */ - struct timespec va_addedtime; /* timestamp when item was added to parent directory */ - + struct timespec va_addedtime; /* timestamp when item was added to parent directory */ + /* Data Protection fields */ + uint32_t va_dataprotect_class; /* class specified for this file if it didn't exist */ + uint32_t va_dataprotect_flags; /* flags from NP open(2) to the filesystem */ }; +#ifdef BSD_KERNEL_PRIVATE +/* + * Flags for va_dataprotect_flags + */ +#define VA_DP_RAWENCRYPTED 0x0001 + +#endif + /* * Flags for va_vaflags. 
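/*
 * Illustrative sketch, not part of the patch: reading the new
 * va_dataprotect_class attribute through the existing VATTR request /
 * response protocol. VATTR_WANTED and VATTR_IS_SUPPORTED are the
 * standard vnode_attr macros; get_dp_class() is a hypothetical
 * kernel-side caller, sketched under the assumption that the mounted
 * filesystem may or may not support the attribute.
 */
#include <sys/errno.h>
#include <sys/vnode.h>

static int
get_dp_class(vnode_t vp, vfs_context_t ctx, uint32_t *classp)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_dataprotect_class);	/* mark it active */

	error = vnode_getattr(vp, &va, ctx);
	if (error)
		return (error);
	/* only trust the field if the filesystem filled it in */
	if (!VATTR_IS_SUPPORTED(&va, va_dataprotect_class))
		return (ENOTSUP);
	*classp = va.va_dataprotect_class;
	return (0);
}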
*/ @@ -666,14 +683,15 @@ extern int vttoif_tab[]; #define REVOKEALL 0x0001 /* vnop_revoke: revoke all aliases */ -/* VNOP_REMOVE: do not delete busy files (Carbon remove file semantics) */ -#define VNODE_REMOVE_NODELETEBUSY 0x0001 +/* VNOP_REMOVE/unlink flags: */ +#define VNODE_REMOVE_NODELETEBUSY 0x0001 /* Do not delete busy files (Carbon) */ +#define VNODE_REMOVE_SKIP_NAMESPACE_EVENT 0x0002 /* Do not upcall to userland handlers */ /* VNOP_READDIR flags: */ #define VNODE_READDIR_EXTENDED 0x0001 /* use extended directory entries */ #define VNODE_READDIR_REQSEEKOFF 0x0002 /* requires seek offset (cookies) */ #define VNODE_READDIR_SEEKOFF32 0x0004 /* seek offset values should fit in 32 bits */ - +#define VNODE_READDIR_NAMEMAX 0x0008 /* For extended readdir, try to limit names to NAME_MAX bytes */ #define NULLVP ((struct vnode *)NULL) @@ -1835,6 +1853,14 @@ vnode_t vnode_getparent(vnode_t vp); @result 0 to indicate that a vnode can be opened, or an error that should be passed up to VFS. */ int vnode_lookup_continue_needed(vnode_t vp, struct componentname *cnp); + +/*! + @function vnode_istty + @abstract Determine if the given vnode represents a tty device. + @param vp Vnode to examine. + @result Non-zero to indicate that the vnode represents a tty device. Zero otherwise. + */ +int vnode_istty(vnode_t vp); #endif /* KERNEL_PRIVATE */ #ifdef BSD_KERNEL_PRIVATE @@ -1864,7 +1890,6 @@ int vnode_makeimode(int, int); enum vtype vnode_iftovt(int); int vnode_vttoif(enum vtype); int vnode_isshadow(vnode_t); -int vnode_istty(vnode_t vp); /* * Indicate that a file has multiple hard links. VFS will always call * VNOP_LOOKUP on this vnode. Volfs will always ask for it's parent diff --git a/bsd/sys/vnode_internal.h b/bsd/sys/vnode_internal.h index 7d17be99e..e846ac679 100644 --- a/bsd/sys/vnode_internal.h +++ b/bsd/sys/vnode_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
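/*
 * Illustrative sketch, not part of the patch: vnode_istty() is now
 * visible under KERNEL_PRIVATE (it moved out of BSD_KERNEL_PRIVATE
 * above), so a private-KPI client can branch on tty-backed vnodes.
 * The caching policy below is purely hypothetical.
 */
#include <sys/vnode.h>

static int
wants_readahead(vnode_t vp)
{
	/* hypothetical policy: no read-ahead for tty device nodes */
	return (vnode_istty(vp) ? 0 : 1);
}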
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -93,12 +93,13 @@ struct label; LIST_HEAD(buflists, buf); - +#if CONFIG_VFS_FUNNEL struct unsafe_fsnode { lck_mtx_t fsnodelock; int32_t fsnode_count; void * fsnodeowner; }; +#endif /* CONFIG_VFS_FUNNEL */ #if CONFIG_TRIGGERS /* @@ -177,12 +178,14 @@ struct vnode { const char *v_name; /* name component of the vnode */ vnode_t v_parent; /* pointer to parent vnode */ struct lockf *v_lockf; /* advisory lock list head */ -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL struct unsafe_fsnode *v_unsafefs; /* pointer to struct used to lock */ #else int32_t v_reserved1; +#ifdef __LP64__ int32_t v_reserved2; -#endif /* __LP64__ */ +#endif +#endif /* CONFIG_VFS_FUNNEL */ int (**v_op)(void *); /* vnode operations vector */ mount_t v_mount; /* ptr to vfs we are in */ void * v_data; /* private data for fs */ @@ -213,8 +216,9 @@ struct vnode { /* * v_listflag */ -#define VLIST_RAGE 0x01 /* vnode is currently in the rapid age list */ -#define VLIST_DEAD 0x02 /* vnode is currently in the dead list */ +#define VLIST_RAGE 0x01 /* vnode is currently in the rapid age list */ +#define VLIST_DEAD 0x02 /* vnode is currently in the dead list */ +#define VLIST_ASYNC_WORK 0x04 /* vnode is currently on the deferred async work queue */ /* * v_lflags @@ -372,6 +376,8 @@ struct ostat; #define BUILDPATH_NO_FS_ENTER 0x1 /* Use cache values, do not enter file system */ #define BUILDPATH_CHECKACCESS 0x2 /* Check if parents have search rights */ +#define BUILDPATH_CHECK_MOVED 0x4 /* Return EAGAIN if the parent hierarchy is modified */ + int build_path(vnode_t first_vp, char *buff, int buflen, int *outlen, int flags, vfs_context_t ctx); int bdevvp(dev_t dev, struct vnode **vpp); diff --git a/bsd/sys/xattr.h b/bsd/sys/xattr.h index bd91c3c31..b4fe1ac99 100644 --- a/bsd/sys/xattr.h +++ b/bsd/sys/xattr.h @@ -49,6 +49,7 @@ #define XATTR_MAXNAMELEN 127 +/* See the ATTR_CMN_FNDRINFO section of getattrlist(2) for details on FinderInfo */ #define XATTR_FINDERINFO_NAME "com.apple.FinderInfo" #define XATTR_RESOURCEFORK_NAME "com.apple.ResourceFork" diff --git a/bsd/uxkern/ux_exception.c b/bsd/uxkern/ux_exception.c index dfbe2e5d2..3175c3163 100644 --- a/bsd/uxkern/ux_exception.c +++ b/bsd/uxkern/ux_exception.c @@ -239,7 +239,6 @@ catch_mach_exception_raise( task_t self = current_task(); thread_t th_act; ipc_port_t thread_port; - struct task *sig_task; struct proc *p; kern_return_t result = MACH_MSG_SUCCESS; int ux_signal = 0; @@ -273,10 +272,9 @@ catch_mach_exception_raise( ux_exception(exception, code[0], code[1], &ux_signal, &ucode); ut = get_bsdthread_info(th_act); - sig_task = get_threadtask(th_act); - p = (struct proc *) get_bsdtask_info(sig_task); + p = proc_findthread(th_act); - /* Can't deliver a signal without a bsd process */ + /* Can't deliver a signal without a bsd process reference */ if (p == NULL) { ux_signal = 0; result = KERN_FAILURE; @@ -298,15 +296,9 @@ catch_mach_exception_raise( struct sigacts *ps; sp = code[1]; - if (ut && (ut->uu_flag & UT_VFORK)) - p = ut->uu_proc; -#if STACK_GROWTH_UP - stack_min = p->user_stack; - stack_max = p->user_stack + MAXSSIZ; -#else /* STACK_GROWTH_UP */ + stack_max = p->user_stack; stack_min = p->user_stack - MAXSSIZ; -#endif /* STACK_GROWTH_UP */ if (sp >= stack_min && sp < stack_max) { /* @@ -344,7 +336,8 @@ catch_mach_exception_raise( ut->uu_subcode = code[1]; threadsignal(th_act, ux_signal, code[0]); } - + if (p != NULL) + proc_rele(p); thread_deallocate(th_act); } else diff --git a/bsd/vfs/kpi_vfs.c b/bsd/vfs/kpi_vfs.c index 
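/*
 * Illustrative sketch, not part of the patch: the FinderInfo note added
 * to xattr.h above refers to the fixed 32-byte blob documented under
 * ATTR_CMN_FNDRINFO in getattrlist(2). read_finderinfo() is a
 * hypothetical userland helper using the real getxattr(2) syscall.
 */
#include <sys/types.h>
#include <sys/xattr.h>

static int
read_finderinfo(const char *path, unsigned char buf[32])
{
	/* position 0, no options; FinderInfo is always exactly 32 bytes */
	ssize_t n = getxattr(path, XATTR_FINDERINFO_NAME, buf, 32, 0, 0);

	return (n == 32 ? 0 : -1);
}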
a18760397..2e200b08c 100644 --- a/bsd/vfs/kpi_vfs.c +++ b/bsd/vfs/kpi_vfs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -127,10 +127,10 @@ #define COMPAT_ONLY -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL #define THREAD_SAFE_FS(VP) \ ((VP)->v_unsafefs ? 0 : 1) -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ #define NATIVE_XATTR(VP) \ ((VP)->v_mount ? (VP)->v_mount->mnt_kern_flag & MNTK_EXTENDED_ATTRS : 0) @@ -166,7 +166,7 @@ vnode_setneedinactive(vnode_t vp) } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int lock_fsnode(vnode_t vp, int *funnel_state) { @@ -206,7 +206,7 @@ unlock_fsnode(vnode_t vp, int *funnel_state) if (funnel_state) (void) thread_funnel_set(kernel_flock, *funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ @@ -221,21 +221,21 @@ int VFS_MOUNT(mount_t mp, vnode_t devvp, user_addr_t data, vfs_context_t ctx) { int error; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if ((mp == dead_mountp) || (mp->mnt_op->vfs_mount == 0)) return(ENOTSUP); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = (mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ - +#endif /* CONFIG_VFS_FUNNEL */ + if (vfs_context_is64bit(ctx)) { if (vfs_64bitready(mp)) { error = (*mp->mnt_op->vfs_mount)(mp, devvp, data, ctx); @@ -248,11 +248,11 @@ VFS_MOUNT(mount_t mp, vnode_t devvp, user_addr_t data, vfs_context_t ctx) error = (*mp->mnt_op->vfs_mount)(mp, devvp, data, ctx); } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (error); } @@ -261,29 +261,28 @@ int VFS_START(mount_t mp, int flags, vfs_context_t ctx) { int error; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if ((mp == dead_mountp) || (mp->mnt_op->vfs_start == 0)) return(ENOTSUP); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = (mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE); - if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ error = (*mp->mnt_op->vfs_start)(mp, flags, ctx); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (error); } @@ -292,29 +291,28 @@ int VFS_UNMOUNT(mount_t mp, int flags, vfs_context_t ctx) { int error; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if ((mp == dead_mountp) || (mp->mnt_op->vfs_unmount == 0)) return(ENOTSUP); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = (mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE); - if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ error = (*mp->mnt_op->vfs_unmount)(mp, flags, ctx); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (error); } @@ -340,10 +338,10 @@ int VFS_ROOT(mount_t mp, struct vnode ** vpp, 
vfs_context_t ctx) { int error; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if ((mp == dead_mountp) || (mp->mnt_op->vfs_root == 0)) return(ENOTSUP); @@ -352,20 +350,20 @@ VFS_ROOT(mount_t mp, struct vnode ** vpp, vfs_context_t ctx) ctx = vfs_context_current(); } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = (mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ error = (*mp->mnt_op->vfs_root)(mp, vpp, ctx); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (error); } @@ -374,28 +372,28 @@ int VFS_QUOTACTL(mount_t mp, int cmd, uid_t uid, caddr_t datap, vfs_context_t ctx) { int error; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if ((mp == dead_mountp) || (mp->mnt_op->vfs_quotactl == 0)) return(ENOTSUP); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = (mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ error = (*mp->mnt_op->vfs_quotactl)(mp, cmd, uid, datap, ctx); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (error); } @@ -404,10 +402,10 @@ int VFS_GETATTR(mount_t mp, struct vfs_attr *vfa, vfs_context_t ctx) { int error; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if ((mp == dead_mountp) || (mp->mnt_op->vfs_getattr == 0)) return(ENOTSUP); @@ -416,20 +414,20 @@ VFS_GETATTR(mount_t mp, struct vfs_attr *vfa, vfs_context_t ctx) ctx = vfs_context_current(); } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = (mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ error = (*mp->mnt_op->vfs_getattr)(mp, vfa, ctx); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return(error); } @@ -438,10 +436,10 @@ int VFS_SETATTR(mount_t mp, struct vfs_attr *vfa, vfs_context_t ctx) { int error; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if ((mp == dead_mountp) || (mp->mnt_op->vfs_setattr == 0)) return(ENOTSUP); @@ -450,20 +448,20 @@ VFS_SETATTR(mount_t mp, struct vfs_attr *vfa, vfs_context_t ctx) ctx = vfs_context_current(); } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = (mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ error = (*mp->mnt_op->vfs_setattr)(mp, vfa, ctx); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return(error); } @@ -472,10 +470,10 @@ int VFS_SYNC(mount_t mp, int flags, vfs_context_t ctx) { int 
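/*
 * Illustrative sketch, not part of the patch: every VFS_*() wrapper in
 * this hunk repeats the same guard, now compiled only under
 * CONFIG_VFS_FUNNEL. The essence: save the current funnel state, take
 * the kernel funnel around a non-thread-safe filesystem callout, and
 * restore exactly the saved state afterwards. 'callout' and
 * 'fs_is_threadsafe' are stand-ins for the mnt_op dispatch and the
 * VFC_VFSTHREADSAFE test.
 */
#if CONFIG_VFS_FUNNEL
static int
call_legacy_fs(int (*callout)(void *), void *arg, int fs_is_threadsafe)
{
	int funnel_state = 0;
	int error;

	if (!fs_is_threadsafe)
		funnel_state = thread_funnel_set(kernel_flock, TRUE);

	error = callout(arg);		/* runs under the funnel if needed */

	if (!fs_is_threadsafe)
		(void) thread_funnel_set(kernel_flock, funnel_state);

	return (error);
}
#endif /* CONFIG_VFS_FUNNEL */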
error; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if ((mp == dead_mountp) || (mp->mnt_op->vfs_sync == 0)) return(ENOTSUP); @@ -484,20 +482,20 @@ VFS_SYNC(mount_t mp, int flags, vfs_context_t ctx) ctx = vfs_context_current(); } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = (mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ error = (*mp->mnt_op->vfs_sync)(mp, flags, ctx); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return(error); } @@ -506,10 +504,10 @@ int VFS_VGET(mount_t mp, ino64_t ino, struct vnode **vpp, vfs_context_t ctx) { int error; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if ((mp == dead_mountp) || (mp->mnt_op->vfs_vget == 0)) return(ENOTSUP); @@ -518,20 +516,20 @@ VFS_VGET(mount_t mp, ino64_t ino, struct vnode **vpp, vfs_context_t ctx) ctx = vfs_context_current(); } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = (mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ error = (*mp->mnt_op->vfs_vget)(mp, ino, vpp, ctx); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return(error); } @@ -540,10 +538,10 @@ int VFS_FHTOVP(mount_t mp, int fhlen, unsigned char * fhp, vnode_t * vpp, vfs_context_t ctx) { int error; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if ((mp == dead_mountp) || (mp->mnt_op->vfs_fhtovp == 0)) return(ENOTSUP); @@ -552,20 +550,20 @@ VFS_FHTOVP(mount_t mp, int fhlen, unsigned char * fhp, vnode_t * vpp, vfs_contex ctx = vfs_context_current(); } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = (mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ error = (*mp->mnt_op->vfs_fhtovp)(mp, fhlen, fhp, vpp, ctx); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return(error); } @@ -574,10 +572,10 @@ int VFS_VPTOFH(struct vnode * vp, int *fhlenp, unsigned char * fhp, vfs_context_t ctx) { int error; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if ((vp->v_mount == dead_mountp) || (vp->v_mount->mnt_op->vfs_vptofh == 0)) return(ENOTSUP); @@ -586,20 +584,20 @@ VFS_VPTOFH(struct vnode * vp, int *fhlenp, unsigned char * fhp, vfs_context_t ct ctx = vfs_context_current(); } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ error = (*vp->v_mount->mnt_op->vfs_vptofh)(vp, fhlenp, fhp, ctx); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } 
-#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return(error); } @@ -1052,12 +1050,12 @@ vfs_fsadd(struct vfs_fsentry *vfe, vfstable_t * handle) || (vfe->vfe_opvdescs == (struct vnodeopv_desc **)NULL)) return(EINVAL); -#ifdef __LP64__ - /* Non-threadsafe filesystems are not supported for K64 */ +#if !CONFIG_VFS_FUNNEL + /* Non-threadsafe filesystems are not supported e.g. on K64 & iOS */ if ((vfe->vfe_flags & (VFS_TBLTHREADSAFE | VFS_TBLFSNODELOCK)) == 0) { return (EINVAL); } -#endif /* __LP64__ */ +#endif /* !CONFIG_VFS_FUNNEL */ MALLOC(newvfstbl, void *, sizeof(struct vfstable), M_TEMP, M_WAITOK); @@ -1080,12 +1078,12 @@ vfs_fsadd(struct vfs_fsentry *vfe, vfstable_t * handle) newvfstbl->vfc_vfsflags |= VFC_VFSVNOP_PAGEINV2; if (vfe->vfe_flags & VFS_TBLVNOP_PAGEOUTV2) newvfstbl->vfc_vfsflags |= VFC_VFSVNOP_PAGEOUTV2; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (vfe->vfe_flags & VFS_TBLTHREADSAFE) newvfstbl->vfc_vfsflags |= VFC_VFSTHREADSAFE; if (vfe->vfe_flags & VFS_TBLFSNODELOCK) newvfstbl->vfc_vfsflags |= VFC_VFSTHREADSAFE; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if ((vfe->vfe_flags & VFS_TBLLOCALVOL) == VFS_TBLLOCALVOL) newvfstbl->vfc_flags |= MNT_LOCAL; if ((vfe->vfe_flags & VFS_TBLLOCALVOL) && (vfe->vfe_flags & VFS_TBLGENERICMNTARGS) == 0) @@ -3016,10 +3014,10 @@ VNOP_LOOKUP(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, vfs_context_t int _err; struct vnop_lookup_args a; vnode_t vp; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_lookup_desc; a.a_dvp = dvp; @@ -3027,20 +3025,20 @@ VNOP_LOOKUP(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, vfs_context_t a.a_cnp = cnp; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(dvp); if (!thread_safe) { if ( (_err = lock_fsnode(dvp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*dvp->v_op[vnop_lookup_desc.vdesc_offset])(&a); vp = *vpp; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { if ( (cnp->cn_flags & ISLASTCN) ) { if ( (cnp->cn_flags & LOCKPARENT) ) { @@ -3061,7 +3059,7 @@ VNOP_LOOKUP(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, vfs_context_t } unlock_fsnode(dvp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -3162,10 +3160,10 @@ VNOP_CREATE(vnode_t dvp, vnode_t * vpp, struct componentname * cnp, struct vnode { int _err; struct vnop_create_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_create_desc; a.a_dvp = dvp; @@ -3174,14 +3172,14 @@ VNOP_CREATE(vnode_t dvp, vnode_t * vpp, struct componentname * cnp, struct vnode a.a_vap = vap; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(dvp); if (!thread_safe) { if ( (_err = lock_fsnode(dvp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*dvp->v_op[vnop_create_desc.vdesc_offset])(&a); if (_err == 0 && !NATIVE_XATTR(dvp)) { @@ -3191,11 +3189,11 @@ VNOP_CREATE(vnode_t dvp, vnode_t * vpp, struct componentname * cnp, struct vnode xattrfile_remove(dvp, cnp->cn_nameptr, ctx, 0); } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(dvp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ post_event_if_success(dvp, _err, NOTE_WRITE); @@ -3223,10 +3221,10 @@ 
VNOP_WHITEOUT(vnode_t dvp, struct componentname * cnp, int flags, vfs_context_t { int _err; struct vnop_whiteout_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_whiteout_desc; a.a_dvp = dvp; @@ -3234,22 +3232,22 @@ VNOP_WHITEOUT(vnode_t dvp, struct componentname * cnp, int flags, vfs_context_t a.a_flags = flags; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(dvp); if (!thread_safe) { if ( (_err = lock_fsnode(dvp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*dvp->v_op[vnop_whiteout_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(dvp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ post_event_if_success(dvp, _err, NOTE_WRITE); @@ -3278,10 +3276,10 @@ VNOP_MKNOD(vnode_t dvp, vnode_t * vpp, struct componentname * cnp, struct vnode_ int _err; struct vnop_mknod_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_mknod_desc; a.a_dvp = dvp; @@ -3290,22 +3288,22 @@ VNOP_MKNOD(vnode_t dvp, vnode_t * vpp, struct componentname * cnp, struct vnode_ a.a_vap = vap; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(dvp); if (!thread_safe) { if ( (_err = lock_fsnode(dvp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*dvp->v_op[vnop_mknod_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(dvp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ post_event_if_success(dvp, _err, NOTE_WRITE); @@ -3330,10 +3328,10 @@ VNOP_OPEN(vnode_t vp, int mode, vfs_context_t ctx) { int _err; struct vnop_open_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (ctx == NULL) { ctx = vfs_context_current(); @@ -3343,7 +3341,7 @@ VNOP_OPEN(vnode_t vp, int mode, vfs_context_t ctx) a.a_mode = mode; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); @@ -3354,18 +3352,18 @@ VNOP_OPEN(vnode_t vp, int mode, vfs_context_t ctx) } } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_open_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { unlock_fsnode(vp, NULL); } (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -3388,10 +3386,10 @@ VNOP_CLOSE(vnode_t vp, int fflag, vfs_context_t ctx) { int _err; struct vnop_close_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (ctx == NULL) { ctx = vfs_context_current(); @@ -3401,7 +3399,7 @@ VNOP_CLOSE(vnode_t vp, int fflag, vfs_context_t ctx) a.a_fflag = fflag; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); @@ -3412,18 +3410,18 @@ VNOP_CLOSE(vnode_t vp, int fflag, vfs_context_t ctx) } } } -#endif /* __LP64__ */ 
+#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_close_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { unlock_fsnode(vp, NULL); } (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -3446,10 +3444,10 @@ VNOP_ACCESS(vnode_t vp, int action, vfs_context_t ctx) { int _err; struct vnop_access_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (ctx == NULL) { ctx = vfs_context_current(); @@ -3459,22 +3457,22 @@ VNOP_ACCESS(vnode_t vp, int action, vfs_context_t ctx) a.a_action = action; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_access_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -3497,32 +3495,32 @@ VNOP_GETATTR(vnode_t vp, struct vnode_attr * vap, vfs_context_t ctx) { int _err; struct vnop_getattr_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_getattr_desc; a.a_vp = vp; a.a_vap = vap; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_getattr_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -3545,24 +3543,24 @@ VNOP_SETATTR(vnode_t vp, struct vnode_attr * vap, vfs_context_t ctx) { int _err; struct vnop_setattr_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_setattr_desc; a.a_vp = vp; a.a_vap = vap; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_setattr_desc.vdesc_offset])(&a); @@ -3601,11 +3599,11 @@ VNOP_SETATTR(vnode_t vp, struct vnode_attr * vap, vfs_context_t ctx) } } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ /* * If we have changed any of the things about the file that are likely @@ -3659,10 +3657,10 @@ VNOP_READ(vnode_t vp, struct uio * uio, int ioflag, vfs_context_t ctx) { int _err; struct vnop_read_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (ctx == NULL) { ctx = vfs_context_current(); @@ -3674,7 +3672,7 @@ VNOP_READ(vnode_t vp, struct uio * uio, int ioflag, vfs_context_t ctx) a.a_ioflag = ioflag; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = 
thread_funnel_set(kernel_flock, TRUE); @@ -3685,18 +3683,18 @@ VNOP_READ(vnode_t vp, struct uio * uio, int ioflag, vfs_context_t ctx) } } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_read_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { unlock_fsnode(vp, NULL); } (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -3721,10 +3719,10 @@ VNOP_WRITE(vnode_t vp, struct uio * uio, int ioflag, vfs_context_t ctx) { struct vnop_write_args a; int _err; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (ctx == NULL) { ctx = vfs_context_current(); @@ -3736,7 +3734,7 @@ VNOP_WRITE(vnode_t vp, struct uio * uio, int ioflag, vfs_context_t ctx) a.a_ioflag = ioflag; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); @@ -3747,18 +3745,18 @@ VNOP_WRITE(vnode_t vp, struct uio * uio, int ioflag, vfs_context_t ctx) } } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_write_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { unlock_fsnode(vp, NULL); } (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ post_event_if_success(vp, _err, NOTE_WRITE); @@ -3786,10 +3784,10 @@ VNOP_IOCTL(vnode_t vp, u_long command, caddr_t data, int fflag, vfs_context_t ct { int _err; struct vnop_ioctl_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (ctx == NULL) { ctx = vfs_context_current(); @@ -3822,7 +3820,7 @@ VNOP_IOCTL(vnode_t vp, u_long command, caddr_t data, int fflag, vfs_context_t ct a.a_fflag = fflag; a.a_context= ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); @@ -3833,18 +3831,18 @@ VNOP_IOCTL(vnode_t vp, u_long command, caddr_t data, int fflag, vfs_context_t ct } } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_ioctl_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { unlock_fsnode(vp, NULL); } (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -3870,10 +3868,10 @@ VNOP_SELECT(vnode_t vp, int which , int fflags, void * wql, vfs_context_t ctx) { int _err; struct vnop_select_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (ctx == NULL) { ctx = vfs_context_current(); @@ -3885,7 +3883,7 @@ VNOP_SELECT(vnode_t vp, int which , int fflags, void * wql, vfs_context_t ctx) a.a_context = ctx; a.a_wql = wql; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); @@ -3896,18 +3894,18 @@ VNOP_SELECT(vnode_t vp, int which , int fflags, void * wql, vfs_context_t ctx) } } } -#endif /* __LP64__ */ +#endif /* 
CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_select_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { unlock_fsnode(vp, NULL); } (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -3933,11 +3931,11 @@ VNOP_EXCHANGE(vnode_t fvp, vnode_t tvp, int options, vfs_context_t ctx) { int _err; struct vnop_exchange_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; vnode_t lock_first = NULL, lock_second = NULL; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_exchange_desc; a.a_fvp = fvp; @@ -3945,7 +3943,7 @@ VNOP_EXCHANGE(vnode_t fvp, vnode_t tvp, int options, vfs_context_t ctx) a.a_options = options; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(fvp); if (!thread_safe) { /* @@ -3966,16 +3964,16 @@ VNOP_EXCHANGE(vnode_t fvp, vnode_t tvp, int options, vfs_context_t ctx) return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*fvp->v_op[vnop_exchange_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(lock_second, NULL); unlock_fsnode(lock_first, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ /* Don't post NOTE_WRITE because file descriptors follow the data ... */ post_event_if_success(fvp, _err, NOTE_ATTRIB); @@ -4003,30 +4001,30 @@ VNOP_REVOKE(vnode_t vp, int flags, vfs_context_t ctx) { struct vnop_revoke_args a; int _err; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_revoke_desc; a.a_vp = vp; a.a_flags = flags; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_revoke_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -4050,32 +4048,32 @@ VNOP_MMAP(vnode_t vp, int fflags, vfs_context_t ctx) { int _err; struct vnop_mmap_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_mmap_desc; a.a_vp = vp; a.a_fflags = fflags; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_mmap_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -4098,31 +4096,31 @@ VNOP_MNOMAP(vnode_t vp, vfs_context_t ctx) { int _err; struct vnop_mnomap_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_mnomap_desc; a.a_vp = vp; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ 
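/*
 * Illustrative sketch, not part of the patch: when VNOP_EXCHANGE's
 * funnel path must lock two fsnodes, it picks lock_first/lock_second
 * in a single global order so that two threads exchanging the same
 * pair can never deadlock holding one lock each. Address order is used
 * here as the illustrative total order; lock_pair_ordered() is a
 * hypothetical helper.
 */
static void
lock_pair_ordered(vnode_t a, vnode_t b, vnode_t *firstp, vnode_t *secondp)
{
	if ((uintptr_t)a < (uintptr_t)b) {
		*firstp = a;
		*secondp = b;
	} else {
		*firstp = b;
		*secondp = a;
	}
	/* caller locks *firstp then *secondp, and unlocks in reverse,
	 * matching the unlock_fsnode(lock_second)/(lock_first) order above */
}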
+#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_mnomap_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -4146,32 +4144,32 @@ VNOP_FSYNC(vnode_t vp, int waitfor, vfs_context_t ctx) { struct vnop_fsync_args a; int _err; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_fsync_desc; a.a_vp = vp; a.a_waitfor = waitfor; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_fsync_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -4198,10 +4196,10 @@ VNOP_REMOVE(vnode_t dvp, vnode_t vp, struct componentname * cnp, int flags, vfs_ { int _err; struct vnop_remove_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_remove_desc; a.a_dvp = dvp; @@ -4210,14 +4208,14 @@ VNOP_REMOVE(vnode_t dvp, vnode_t vp, struct componentname * cnp, int flags, vfs_ a.a_flags = flags; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(dvp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*dvp->v_op[vnop_remove_desc.vdesc_offset])(&a); @@ -4232,11 +4230,11 @@ VNOP_REMOVE(vnode_t dvp, vnode_t vp, struct componentname * cnp, int flags, vfs_ } } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ post_event_if_success(vp, _err, NOTE_DELETE | NOTE_LINK); post_event_if_success(dvp, _err, NOTE_WRITE); @@ -4308,10 +4306,10 @@ VNOP_LINK(vnode_t vp, vnode_t tdvp, struct componentname * cnp, vfs_context_t ct { int _err; struct vnop_link_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ /* * For file systems with non-native extended attributes, @@ -4337,22 +4335,22 @@ VNOP_LINK(vnode_t vp, vnode_t tdvp, struct componentname * cnp, vfs_context_t ct a.a_cnp = cnp; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*tdvp->v_op[vnop_link_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ post_event_if_success(vp, _err, NOTE_LINK); post_event_if_success(tdvp, _err, NOTE_WRITE); @@ -4368,8 +4366,8 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s int _err; vnode_t src_attr_vp = NULLVP; vnode_t dst_attr_vp = NULLVP; - struct nameidata fromnd; - struct nameidata tond; + struct nameidata *fromnd = NULL; + struct nameidata *tond = NULL; char smallname1[48]; char smallname2[48]; char *xfromname = NULL; @@ -4378,9 +4376,9 @@ vn_rename(struct vnode *fdvp, struct vnode 
**fvpp, struct componentname *fcnp, s batched = vnode_compound_rename_available(fdvp); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL vnode_t fdvp_unsafe = (THREAD_SAFE_FS(fdvp) ? NULLVP : fdvp); -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (!batched) { if (*fvpp == NULLVP) @@ -4429,10 +4427,11 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s * is only for AppleDouble files. */ if (xfromname != NULL) { - NDINIT(&fromnd, RENAME, OP_RENAME, NOFOLLOW | USEDVP | CN_NBMOUNTLOOK, + MALLOC(fromnd, struct nameidata *, sizeof (struct nameidata), M_TEMP, M_WAITOK); + NDINIT(fromnd, RENAME, OP_RENAME, NOFOLLOW | USEDVP | CN_NBMOUNTLOOK, UIO_SYSSPACE, CAST_USER_ADDR_T(xfromname), ctx); - fromnd.ni_dvp = fdvp; - error = namei(&fromnd); + fromnd->ni_dvp = fdvp; + error = namei(fromnd); /* * If there was an error looking up source attribute file, @@ -4440,13 +4439,13 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s */ if (error == 0) { - if (fromnd.ni_vp) { + if (fromnd->ni_vp) { /* src_attr_vp indicates need to call vnode_put / nameidone later */ - src_attr_vp = fromnd.ni_vp; - - if (fromnd.ni_vp->v_type != VREG) { + src_attr_vp = fromnd->ni_vp; + + if (fromnd->ni_vp->v_type != VREG) { src_attr_vp = NULLVP; - vnode_put(fromnd.ni_vp); + vnode_put(fromnd->ni_vp); } } /* @@ -4455,7 +4454,7 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s * have a vnode here, so we drop our namei buffer for the source attribute file */ if (src_attr_vp == NULLVP) { - nameidone(&fromnd); + nameidone(fromnd); } } } @@ -4486,29 +4485,47 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s * Note that tdvp already has an iocount reference. Make sure to check that we * get a valid vnode from namei. */ - NDINIT(&tond, RENAME, OP_RENAME, + MALLOC(tond, struct nameidata *, sizeof(struct nameidata), M_TEMP, M_WAITOK); + NDINIT(tond, RENAME, OP_RENAME, NOCACHE | NOFOLLOW | USEDVP | CN_NBMOUNTLOOK, UIO_SYSSPACE, CAST_USER_ADDR_T(xtoname), ctx); - tond.ni_dvp = tdvp; - error = namei(&tond); + tond->ni_dvp = tdvp; + error = namei(tond); if (error) goto out; - if (tond.ni_vp) { - dst_attr_vp = tond.ni_vp; + if (tond->ni_vp) { + dst_attr_vp = tond->ni_vp; } if (src_attr_vp) { + const char *old_name = src_attr_vp->v_name; + vnode_t old_parent = src_attr_vp->v_parent; + if (batched) { - error = VNOP_COMPOUND_RENAME(fdvp, &src_attr_vp, &fromnd.ni_cnd, NULL, - tdvp, &dst_attr_vp, &tond.ni_cnd, NULL, + error = VNOP_COMPOUND_RENAME(fdvp, &src_attr_vp, &fromnd->ni_cnd, NULL, + tdvp, &dst_attr_vp, &tond->ni_cnd, NULL, 0, ctx); } else { - error = VNOP_RENAME(fdvp, src_attr_vp, &fromnd.ni_cnd, - tdvp, dst_attr_vp, &tond.ni_cnd, ctx); + error = VNOP_RENAME(fdvp, src_attr_vp, &fromnd->ni_cnd, + tdvp, dst_attr_vp, &tond->ni_cnd, ctx); } + if (error == 0 && old_name == src_attr_vp->v_name && + old_parent == src_attr_vp->v_parent) { + int update_flags = VNODE_UPDATE_NAME; + + if (fdvp != tdvp) + update_flags |= VNODE_UPDATE_PARENT; + + vnode_update_identity(src_attr_vp, tdvp, + tond->ni_cnd.cn_nameptr, + tond->ni_cnd.cn_namelen, + tond->ni_cnd.cn_hash, + update_flags); + } + /* kevent notifications for moving resource files * _err is zero if we're here, so no need to notify directories, code * below will do that. 
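/*
 * Illustrative sketch, not part of the patch: the vn_rename() change in
 * this hunk converts two stack-resident struct nameidata instances into
 * heap allocations. nameidata is large (it embeds componentname and
 * pathname state), so this bounds kernel stack depth. The shape is the
 * usual MALLOC-on-entry / FREE-on-every-exit pairing; lookup_pair() is
 * a hypothetical distillation.
 */
static int
lookup_pair(void)
{
	struct nameidata *fromnd = NULL, *tond = NULL;
	int error = 0;

	MALLOC(fromnd, struct nameidata *, sizeof(*fromnd), M_TEMP, M_WAITOK);
	MALLOC(tond, struct nameidata *, sizeof(*tond), M_TEMP, M_WAITOK);

	/* ... NDINIT()/namei() against fromnd and tond, as vn_rename does ... */

	if (fromnd != NULL)
		FREE(fromnd, M_TEMP);
	if (tond != NULL)
		FREE(tond, M_TEMP);
	return (error);
}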
only need to post the rename on the source and @@ -4531,20 +4548,20 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s args.a_desc = &vnop_remove_desc; args.a_dvp = tdvp; args.a_vp = dst_attr_vp; - args.a_cnp = &tond.ni_cnd; + args.a_cnp = &tond->ni_cnd; args.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (fdvp_unsafe != NULLVP) error = lock_fsnode(dst_attr_vp, NULL); -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (error == 0) { error = (*tdvp->v_op[vnop_remove_desc.vdesc_offset])(&args); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (fdvp_unsafe != NULLVP) unlock_fsnode(dst_attr_vp, NULL); -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (error == 0) vnode_setneedinactive(dst_attr_vp); @@ -4560,13 +4577,18 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s out: if (src_attr_vp) { vnode_put(src_attr_vp); - nameidone(&fromnd); + nameidone(fromnd); } if (dst_attr_vp) { vnode_put(dst_attr_vp); - nameidone(&tond); + nameidone(tond); + } + if (fromnd) { + FREE(fromnd, M_TEMP); + } + if (tond) { + FREE(tond, M_TEMP); } - if (xfromname && xfromname != &smallname1[0]) { FREE(xfromname, M_TEMP); } @@ -4606,12 +4628,12 @@ VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, int _err = 0; int events; struct vnop_rename_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int funnel_state = 0; vnode_t lock_first = NULL, lock_second = NULL; vnode_t fdvp_unsafe = NULLVP; vnode_t tdvp_unsafe = NULLVP; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_rename_desc; a.a_fdvp = fdvp; @@ -4622,7 +4644,7 @@ VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, a.a_tcnp = tcnp; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!THREAD_SAFE_FS(fdvp)) fdvp_unsafe = fdvp; if (!THREAD_SAFE_FS(tdvp)) @@ -4675,32 +4697,32 @@ VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, goto out1; } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ /* do the rename of the main file. */ _err = (*fdvp->v_op[vnop_rename_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (fdvp_unsafe != NULLVP) { if (lock_second != NULL) unlock_fsnode(lock_second, NULL); unlock_fsnode(lock_first, NULL); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (_err == 0) { if (tvp && tvp != fvp) vnode_setneedinactive(tvp); } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL out1: if (fdvp_unsafe != NULLVP) { if (tdvp_unsafe != NULLVP) unlock_fsnode(tdvp_unsafe, NULL); unlock_fsnode(fdvp_unsafe, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ /* Wrote at least one directory. 
If transplanted a dir, also changed link counts */ if (0 == _err) { @@ -4860,10 +4882,10 @@ VNOP_MKDIR(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, { int _err; struct vnop_mkdir_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_mkdir_desc; a.a_dvp = dvp; @@ -4872,14 +4894,14 @@ VNOP_MKDIR(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, a.a_vap = vap; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(dvp); if (!thread_safe) { if ( (_err = lock_fsnode(dvp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*dvp->v_op[vnop_mkdir_desc.vdesc_offset])(&a); if (_err == 0 && !NATIVE_XATTR(dvp)) { @@ -4889,11 +4911,11 @@ VNOP_MKDIR(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, xattrfile_remove(dvp, cnp->cn_nameptr, ctx, 0); } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(dvp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ post_event_if_success(dvp, _err, NOTE_LINK | NOTE_WRITE); @@ -4975,10 +4997,10 @@ VNOP_RMDIR(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, vfs_c { int _err; struct vnop_rmdir_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_rmdir_desc; a.a_dvp = dvp; @@ -4986,14 +5008,14 @@ VNOP_RMDIR(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, vfs_c a.a_cnp = cnp; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(dvp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_rmdir_desc.vdesc_offset])(&a); @@ -5008,11 +5030,11 @@ VNOP_RMDIR(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, vfs_c } } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ /* If you delete a dir, it loses its "." 
reference --> NOTE_LINK */ post_event_if_success(vp, _err, NOTE_DELETE | NOTE_LINK); @@ -5182,9 +5204,9 @@ xattrfile_setattr(vnode_t dvp, const char * basename, struct vnode_attr * vap, nameidone(&nd); if (xvp->v_type == VREG) { -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe = THREAD_SAFE_FS(dvp); -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ struct vnop_setattr_args a; a.a_desc = &vnop_setattr_desc; @@ -5192,26 +5214,26 @@ xattrfile_setattr(vnode_t dvp, const char * basename, struct vnode_attr * vap, a.a_vap = vap; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { if ( (lock_fsnode(xvp, NULL)) ) goto out1; } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ (void) (*xvp->v_op[vnop_setattr_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(xvp, NULL); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL out1: -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ vnode_put(xvp); out2: @@ -5244,10 +5266,10 @@ VNOP_SYMLINK(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, { int _err; struct vnop_symlink_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_symlink_desc; a.a_dvp = dvp; @@ -5257,14 +5279,14 @@ VNOP_SYMLINK(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, a.a_target = target; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(dvp); if (!thread_safe) { if ( (_err = lock_fsnode(dvp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*dvp->v_op[vnop_symlink_desc.vdesc_offset])(&a); if (_err == 0 && !NATIVE_XATTR(dvp)) { @@ -5274,11 +5296,11 @@ VNOP_SYMLINK(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, xattrfile_remove(dvp, cnp->cn_nameptr, ctx, 0); } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(dvp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ post_event_if_success(dvp, _err, NOTE_WRITE); @@ -5308,10 +5330,10 @@ VNOP_READDIR(struct vnode *vp, struct uio *uio, int flags, int *eofflag, { int _err; struct vnop_readdir_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_readdir_desc; a.a_vp = vp; @@ -5320,7 +5342,7 @@ VNOP_READDIR(struct vnode *vp, struct uio *uio, int flags, int *eofflag, a.a_eofflag = eofflag; a.a_numdirent = numdirent; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { @@ -5328,15 +5350,15 @@ VNOP_READDIR(struct vnode *vp, struct uio *uio, int flags, int *eofflag, return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_readdir_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -5366,10 +5388,10 @@ VNOP_READDIRATTR(struct vnode *vp, struct attrlist *alist, struct uio *uio, uint { int _err; struct vnop_readdirattr_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_readdirattr_desc; a.a_vp = vp; @@ -5382,22 +5404,22 @@ VNOP_READDIRATTR(struct vnode *vp, struct 
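/*
 * Illustrative sketch, not part of the patch: the consumer side of the
 * post_event_if_success() calls above is a kqueue EVFILT_VNODE watcher.
 * Deleting a directory posts NOTE_DELETE | NOTE_LINK (the dir loses its
 * "." reference), which a userland watcher sees like this. watch_dir()
 * and 'path' are hypothetical; O_EVTONLY and the kevent API are real.
 */
#include <sys/event.h>
#include <fcntl.h>
#include <unistd.h>

static int
watch_dir(const char *path)
{
	struct kevent ev;
	int kq = kqueue();
	int fd = open(path, O_EVTONLY);	/* watch without blocking unmount */

	if (kq == -1 || fd == -1)
		return (-1);

	EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
	    NOTE_DELETE | NOTE_LINK, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
		return (-1);

	/* blocks until the directory is deleted or its link count changes */
	return (kevent(kq, NULL, 0, &ev, 1, NULL));
}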
attrlist *alist, struct uio *uio, uint a.a_actualcount = actualcount; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_readdirattr_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -5440,32 +5462,32 @@ VNOP_READLINK(struct vnode *vp, struct uio *uio, vfs_context_t ctx) { int _err; struct vnop_readlink_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_readlink_desc; a.a_vp = vp; a.a_uio = uio; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_readlink_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -5487,31 +5509,31 @@ VNOP_INACTIVE(struct vnode *vp, vfs_context_t ctx) { int _err; struct vnop_inactive_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_inactive_desc; a.a_vp = vp; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_inactive_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ #if NAMEDSTREAMS /* For file systems that do not support namedstream natively, mark @@ -5548,29 +5570,29 @@ VNOP_RECLAIM(struct vnode *vp, vfs_context_t ctx) { int _err; struct vnop_reclaim_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_reclaim_desc; a.a_vp = vp; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_reclaim_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -5602,10 +5624,10 @@ VNOP_PATHCONF(struct vnode *vp, int name, int32_t *retval, vfs_context_t ctx) { int _err; struct vnop_pathconf_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_pathconf_desc; a.a_vp = vp; @@ -5613,22 +5635,22 @@ VNOP_PATHCONF(struct vnode *vp, int name, int32_t *retval, vfs_context_t ctx) a.a_retval = retval; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* 
__LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_pathconf_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -5665,10 +5687,10 @@ VNOP_ADVLOCK(struct vnode *vp, caddr_t id, int op, struct flock *fl, int flags, { int _err; struct vnop_advlock_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_advlock_desc; a.a_vp = vp; @@ -5678,12 +5700,12 @@ VNOP_ADVLOCK(struct vnode *vp, caddr_t id, int op, struct flock *fl, int flags, a.a_flags = flags; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ /* Disallow advisory locking on non-seekable vnodes */ if (vnode_isfifo(vp)) { @@ -5698,11 +5720,11 @@ VNOP_ADVLOCK(struct vnode *vp, caddr_t id, int op, struct flock *fl, int flags, } } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -5731,10 +5753,10 @@ VNOP_ALLOCATE(struct vnode *vp, off_t length, u_int32_t flags, off_t *bytesalloc { int _err; struct vnop_allocate_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_allocate_desc; a.a_vp = vp; @@ -5744,14 +5766,14 @@ VNOP_ALLOCATE(struct vnode *vp, off_t length, u_int32_t flags, off_t *bytesalloc a.a_offset = offset; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_allocate_desc.vdesc_offset])(&a); #if CONFIG_FSE @@ -5760,11 +5782,11 @@ VNOP_ALLOCATE(struct vnode *vp, off_t length, u_int32_t flags, off_t *bytesalloc } #endif -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -5791,10 +5813,10 @@ VNOP_PAGEIN(struct vnode *vp, upl_t pl, upl_offset_t pl_offset, off_t f_offset, { int _err; struct vnop_pagein_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_pagein_desc; a.a_vp = vp; @@ -5805,20 +5827,20 @@ VNOP_PAGEIN(struct vnode *vp, upl_t pl, upl_offset_t pl_offset, off_t f_offset, a.a_flags = flags; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_pagein_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -5846,10 +5868,10 @@ VNOP_PAGEOUT(struct vnode *vp, upl_t pl, upl_offset_t pl_offset, off_t f_offset, { int _err; struct vnop_pageout_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ 
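/*
 * [Editor's sketch -- not part of the patch.] The CONFIG_VFS_FUNNEL hunks
 * above and below all instantiate one pattern; here it is once, in full,
 * assuming the file's existing helpers THREAD_SAFE_FS(), lock_fsnode()
 * and unlock_fsnode(). "vnop_example_desc" and the args struct are
 * hypothetical names used only for illustration.
 */
static int
vnop_example_wrapper(vnode_t vp, struct vnop_example_args *ap)
{
	int _err;
#if CONFIG_VFS_FUNNEL
	int thread_safe = THREAD_SAFE_FS(vp);
	int funnel_state = 0;

	if (!thread_safe) {
		/* funnel-based (non-thread-safe) FS: serialize entry */
		if ((_err = lock_fsnode(vp, &funnel_state)))
			return (_err);
	}
#endif /* CONFIG_VFS_FUNNEL */

	_err = (*vp->v_op[vnop_example_desc.vdesc_offset])(ap);

#if CONFIG_VFS_FUNNEL
	if (!thread_safe)
		unlock_fsnode(vp, &funnel_state);
#endif /* CONFIG_VFS_FUNNEL */
	return (_err);
}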
a.a_desc = &vnop_pageout_desc; a.a_vp = vp; @@ -5860,20 +5882,20 @@ VNOP_PAGEOUT(struct vnode *vp, upl_t pl, upl_offset_t pl_offset, off_t f_offset, a.a_flags = flags; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_pageout_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ post_event_if_success(vp, _err, NOTE_WRITE); @@ -5890,6 +5912,7 @@ vn_remove(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, int32_t flags, struc } } +#if CONFIG_SEARCHFS #if 0 /* @@ -5920,10 +5943,10 @@ VNOP_SEARCHFS(struct vnode *vp, void *searchparams1, void *searchparams2, struct { int _err; struct vnop_searchfs_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_searchfs_desc; a.a_vp = vp; @@ -5940,25 +5963,26 @@ VNOP_SEARCHFS(struct vnode *vp, void *searchparams1, void *searchparams2, struct a.a_searchstate = searchstate; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_searchfs_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } +#endif /* CONFIG_SEARCHFS */ #if 0 /* @@ -6002,10 +6026,10 @@ VNOP_GETXATTR(vnode_t vp, const char *name, uio_t uio, size_t *size, int options { struct vnop_getxattr_args a; int error; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_getxattr_desc; a.a_vp = vp; @@ -6015,22 +6039,22 @@ VNOP_GETXATTR(vnode_t vp, const char *name, uio_t uio, size_t *size, int options a.a_options = options; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (error = lock_fsnode(vp, &funnel_state)) ) { return (error); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ error = (*vp->v_op[vnop_getxattr_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (error); } @@ -6040,10 +6064,10 @@ VNOP_SETXATTR(vnode_t vp, const char *name, uio_t uio, int options, vfs_context_ { struct vnop_setxattr_args a; int error; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_setxattr_desc; a.a_vp = vp; @@ -6052,22 +6076,22 @@ VNOP_SETXATTR(vnode_t vp, const char *name, uio_t uio, int options, vfs_context_ a.a_options = options; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (error = lock_fsnode(vp, &funnel_state)) ) { return (error); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ error = (*vp->v_op[vnop_setxattr_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* 
__LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (error == 0) vnode_uncache_authorized_action(vp, KAUTH_INVALIDATE_CACHED_RIGHTS); @@ -6082,10 +6106,10 @@ VNOP_REMOVEXATTR(vnode_t vp, const char *name, int options, vfs_context_t ctx) { struct vnop_removexattr_args a; int error; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_removexattr_desc; a.a_vp = vp; @@ -6093,22 +6117,22 @@ VNOP_REMOVEXATTR(vnode_t vp, const char *name, int options, vfs_context_t ctx) a.a_options = options; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (error = lock_fsnode(vp, &funnel_state)) ) { return (error); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ error = (*vp->v_op[vnop_removexattr_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ post_event_if_success(vp, error, NOTE_ATTRIB); @@ -6120,10 +6144,10 @@ VNOP_LISTXATTR(vnode_t vp, uio_t uio, size_t *size, int options, vfs_context_t c { struct vnop_listxattr_args a; int error; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_listxattr_desc; a.a_vp = vp; @@ -6132,22 +6156,22 @@ VNOP_LISTXATTR(vnode_t vp, uio_t uio, size_t *size, int options, vfs_context_t c a.a_options = options; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (error = lock_fsnode(vp, &funnel_state)) ) { return (error); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ error = (*vp->v_op[vnop_listxattr_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (error); } @@ -6171,30 +6195,30 @@ VNOP_BLKTOOFF(struct vnode *vp, daddr64_t lblkno, off_t *offset) { int _err; struct vnop_blktooff_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_blktooff_desc; a.a_vp = vp; a.a_lblkno = lblkno; a.a_offset = offset; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_blktooff_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -6217,30 +6241,30 @@ VNOP_OFFTOBLK(struct vnode *vp, off_t offset, daddr64_t *lblkno) { int _err; struct vnop_offtoblk_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_offtoblk_desc; a.a_vp = vp; a.a_offset = offset; a.a_lblkno = lblkno; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_offtoblk_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif 
/* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return (_err); } @@ -6268,10 +6292,11 @@ VNOP_BLOCKMAP(struct vnode *vp, off_t foffset, size_t size, daddr64_t *bpn, size { int _err; struct vnop_blockmap_args a; -#ifndef __LP64__ + size_t localrun = 0; +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (ctx == NULL) { ctx = vfs_context_current(); @@ -6281,25 +6306,41 @@ VNOP_BLOCKMAP(struct vnode *vp, off_t foffset, size_t size, daddr64_t *bpn, size a.a_foffset = foffset; a.a_size = size; a.a_bpn = bpn; - a.a_run = run; + a.a_run = &localrun; a.a_poff = poff; a.a_flags = flags; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_blockmap_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { (void) thread_funnel_set(kernel_flock, funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ + + /* + * We used a local variable to request information from the underlying + * filesystem about the length of the I/O run in question. If + * we get malformed output from the filesystem, we cap it to the length + * requested, at most. Update 'run' on the way out. + */ + if (_err == 0) { + if (localrun > size) { + localrun = size; + } + + if (run) { + *run = localrun; + } + } return (_err); } @@ -6352,32 +6393,32 @@ VNOP_KQFILT_ADD(struct vnode *vp, struct knote *kn, vfs_context_t ctx) { int _err; struct vnop_kqfilt_add_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = VDESC(vnop_kqfilt_add); a.a_vp = vp; a.a_kn = kn; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_kqfilt_add_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return(_err); } @@ -6395,32 +6436,32 @@ VNOP_KQFILT_REMOVE(struct vnode *vp, uintptr_t ident, vfs_context_t ctx) { int _err; struct vnop_kqfilt_remove_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = VDESC(vnop_kqfilt_remove); a.a_vp = vp; a.a_ident = ident; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_kqfilt_remove_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return(_err); } @@ -6430,10 +6471,10 @@ VNOP_MONITOR(vnode_t vp, uint32_t events, uint32_t flags, void *handle, vfs_cont { int _err; struct vnop_monitor_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = VDESC(vnop_monitor); a.a_vp = vp; @@ -6442,22 +6483,22 @@ VNOP_MONITOR(vnode_t vp, uint32_t events, uint32_t flags, void *handle, vfs_cont 
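/*
 * [Editor's sketch -- not part of the patch.] Caller-side view of the
 * VNOP_BLOCKMAP change above: the wrapper now clamps the filesystem's
 * reported run length to the requested size before copying it out, so a
 * caller can rely on contig_bytes <= size. VNOP_BLOCKMAP and VNODE_READ
 * are the real KPI; the function name is hypothetical and error handling
 * is trimmed.
 */
static int
map_one_extent(vnode_t vp, off_t f_offset, size_t size, vfs_context_t ctx)
{
	daddr64_t blkno;
	size_t contig_bytes;
	int error;

	error = VNOP_BLOCKMAP(vp, f_offset, size, &blkno, &contig_bytes,
	    NULL, VNODE_READ, ctx);
	if (error)
		return (error);

	/* with the clamp above, contig_bytes can never exceed 'size' */
	return (0);
}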
a.a_handle = handle; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_monitor_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return(_err); } @@ -6475,32 +6516,32 @@ VNOP_SETLABEL(struct vnode *vp, struct label *label, vfs_context_t ctx) { int _err; struct vnop_setlabel_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL int thread_safe; int funnel_state = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = VDESC(vnop_setlabel); a.a_vp = vp; a.a_vl = label; a.a_context = ctx; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { if ( (_err = lock_fsnode(vp, &funnel_state)) ) { return (_err); } } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ _err = (*vp->v_op[vnop_setlabel_desc.vdesc_offset])(&a); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!thread_safe) { unlock_fsnode(vp, &funnel_state); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ return(_err); } @@ -6515,10 +6556,10 @@ VNOP_GETNAMEDSTREAM(vnode_t vp, vnode_t *svpp, const char *name, enum nsoperatio { struct vnop_getnamedstream_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!THREAD_SAFE_FS(vp)) return (ENOTSUP); -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_getnamedstream_desc; a.a_vp = vp; @@ -6539,10 +6580,10 @@ VNOP_MAKENAMEDSTREAM(vnode_t vp, vnode_t *svpp, const char *name, int flags, vfs { struct vnop_makenamedstream_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!THREAD_SAFE_FS(vp)) return (ENOTSUP); -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_makenamedstream_desc; a.a_vp = vp; @@ -6563,10 +6604,10 @@ VNOP_REMOVENAMEDSTREAM(vnode_t vp, vnode_t svp, const char *name, int flags, vfs { struct vnop_removenamedstream_args a; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (!THREAD_SAFE_FS(vp)) return (ENOTSUP); -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ a.a_desc = &vnop_removenamedstream_desc; a.a_vp = vp; diff --git a/bsd/vfs/vfs_attrlist.c b/bsd/vfs/vfs_attrlist.c index 091ee16ab..7033b4cf6 100644 --- a/bsd/vfs/vfs_attrlist.c +++ b/bsd/vfs/vfs_attrlist.c @@ -395,7 +395,7 @@ static struct getattrlist_attrtab getattrlist_common_tab[] = { {ATTR_CMN_FILEID, VATTR_BIT(va_fileid), sizeof(uint64_t), KAUTH_VNODE_READ_ATTRIBUTES}, {ATTR_CMN_PARENTID, VATTR_BIT(va_parentid), sizeof(uint64_t), KAUTH_VNODE_READ_ATTRIBUTES}, {ATTR_CMN_FULLPATH, 0, sizeof(struct attrreference), KAUTH_VNODE_READ_ATTRIBUTES }, - {ATTR_CMN_ADDEDTIME, VATTR_BIT(va_addedtime), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES}, + {ATTR_CMN_ADDEDTIME, VATTR_BIT(va_addedtime), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES}, {ATTR_CMN_RETURNED_ATTRS, 0, sizeof(attribute_set_t), 0}, {0, 0, 0, 0} }; @@ -544,7 +544,6 @@ getattrlist_fixupattrs(attribute_set_t *asp, struct vnode_attr *vap) * on. This is done so that we can uncheck those bits and re-request * a vnode_getattr from the filesystem again. 
*/ - if ((tab->attr & asp->commonattr) && (tab->bits & vap->va_active) && (tab->bits & vap->va_supported) == 0) { @@ -1774,7 +1773,6 @@ getattrlist_internal(vnode_t vp, struct getattrlist_args *uap, proc_t p, vfs_con ab.actual.commonattr |= ATTR_CMN_ADDEDTIME; } - /* directory attributes *********************************************/ if (al.dirattr && (vtype == VDIR)) { if (al.dirattr & ATTR_DIR_LINKCOUNT) { /* full count of entries */ @@ -2137,6 +2135,21 @@ setattrlist_internal(vnode_t vp, struct setattrlist_args *uap, proc_t p, vfs_con } } + /* + * If the caller's bitmaps indicate that there are no attributes to set, + * then exit early. In particular, we want to avoid the MALLOC below + * since the caller's bufferSize could be zero, and MALLOC of zero bytes + * returns a NULL pointer, which would cause setattrlist to return ENOMEM. + */ + if (al.commonattr == 0 && + (al.volattr & ~ATTR_VOL_INFO) == 0 && + al.dirattr == 0 && + al.fileattr == 0 && + al.forkattr == 0) { + error = 0; + goto out; + } + /* * Make the naive assumption that the caller has supplied a reasonable buffer * size. We could be more careful by pulling in the fixed-size region, checking diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c index 53f4f5576..ce7c68e82 100644 --- a/bsd/vfs/vfs_bio.c +++ b/bsd/vfs/vfs_bio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -109,6 +109,7 @@ #include #include +#include #if BALANCE_QUEUES @@ -382,29 +383,82 @@ buf_markfua(buf_t bp) { SET(bp->b_flags, B_FUA); } -#ifdef CONFIG_PROTECT +#if CONFIG_PROTECT +void +buf_setcpaddr(buf_t bp, struct cprotect *entry) { + bp->b_attr.ba_cpentry = entry; +} + +void +buf_setcpoff (buf_t bp, uint64_t foffset) { + bp->b_attr.ba_cp_file_off = foffset; +} + void * -buf_getcpaddr(buf_t bp) { - return bp->b_cpentry; +bufattr_cpaddr(bufattr_t bap) { + return (bap->ba_cpentry); } -void -buf_setcpaddr(buf_t bp, void *cp_entry_addr) { - bp->b_cpentry = (struct cprotect *) cp_entry_addr; +uint64_t +bufattr_cpoff(bufattr_t bap) { + return (bap->ba_cp_file_off); +} + +void +bufattr_setcpaddr(bufattr_t bap, void *cp_entry_addr) { + bap->ba_cpentry = cp_entry_addr; +} + +void +bufattr_setcpoff(bufattr_t bap, uint64_t foffset) { + bap->ba_cp_file_off = foffset; } #else void * -buf_getcpaddr(buf_t bp __unused) { - return NULL; +bufattr_cpaddr(bufattr_t bap __unused) { + return NULL; } -void -buf_setcpaddr(buf_t bp __unused, void *cp_entry_addr __unused) { +uint64_t +bufattr_cpoff(bufattr_t bap __unused) { + return 0; +} + +void +bufattr_setcpaddr(bufattr_t bap __unused, void *cp_entry_addr __unused) { +} + +void +bufattr_setcpoff(__unused bufattr_t bap, __unused uint64_t foffset) { return; } #endif /* CONFIG_PROTECT */ +bufattr_t +bufattr_alloc() { + bufattr_t bap; + MALLOC(bap, bufattr_t, sizeof(struct bufattr), M_TEMP, M_WAITOK); + if (bap == NULL) + return NULL; + + bzero(bap, sizeof(struct bufattr)); + return bap; +} + +void +bufattr_free(bufattr_t bap) { + if (bap) + FREE(bap, M_TEMP); +} + +int +bufattr_rawencrypted(bufattr_t bap) { + if ( (bap->ba_flags & BA_RAW_ENCRYPTED_IO) ) + return 1; + return 0; +} + int bufattr_throttled(bufattr_t bap) { if ( (bap->ba_flags & BA_THROTTLED_IO) ) @@ -412,11 +466,51 @@ bufattr_throttled(bufattr_t bap) { return 0; } +int +bufattr_nocache(bufattr_t bap) { + if ( (bap->ba_flags & BA_NOCACHE) ) + return 1; + return 0; +} + +int +bufattr_meta(bufattr_t bap) { + if ( (bap->ba_flags & 
BA_META) ) + return 1; + return 0; +} + +int +#if !CONFIG_EMBEDDED +bufattr_delayidlesleep(bufattr_t bap) +#else /* !CONFIG_EMBEDDED */ +bufattr_delayidlesleep(__unused bufattr_t bap) +#endif /* !CONFIG_EMBEDDED */ +{ +#if !CONFIG_EMBEDDED + if ( (bap->ba_flags & BA_DELAYIDLESLEEP) ) + return 1; +#endif /* !CONFIG_EMBEDDED */ + return 0; +} + bufattr_t buf_attr(buf_t bp) { return &bp->b_attr; } +void +buf_markstatic(buf_t bp __unused) { + SET(bp->b_flags, B_STATICCONTENT); +} + +int +buf_static(buf_t bp) { + if ( (bp->b_flags & B_STATICCONTENT) ) + return 1; + return 0; +} + errno_t buf_error(buf_t bp) { @@ -1135,7 +1229,7 @@ buf_strategy(vnode_t devvp, void *ap) return (cluster_bp(bp)); } if (bp->b_blkno == bp->b_lblkno) { - off_t f_offset; + off_t f_offset; size_t contig_bytes; if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) { @@ -1143,21 +1237,22 @@ buf_strategy(vnode_t devvp, void *ap) buf_seterror(bp, error); buf_biodone(bp); - return (error); + return (error); } - if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) { + + if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) { DTRACE_IO1(start, buf_t, bp); buf_seterror(bp, error); buf_biodone(bp); return (error); } - + DTRACE_IO1(start, buf_t, bp); #if CONFIG_DTRACE dtrace_io_start_flag = 1; #endif /* CONFIG_DTRACE */ - + if ((bp->b_blkno == -1) || (contig_bytes == 0)) { /* Set block number to force biodone later */ bp->b_blkno = -1; @@ -1186,6 +1281,33 @@ buf_strategy(vnode_t devvp, void *ap) DTRACE_IO1(start, buf_t, bp); #endif /* CONFIG_DTRACE */ +#if CONFIG_PROTECT + /* Capture f_offset in the bufattr*/ + if (bp->b_attr.ba_cpentry != 0) { + /* No need to go here for older EAs */ + if(bp->b_attr.ba_cpentry->cp_flags & CP_OFF_IV_ENABLED) { + off_t f_offset; + if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset))) + return error; + + /* + * Attach the file offset to this buffer. The + * bufattr attributes will be passed down the stack + * until they reach IOFlashStorage. IOFlashStorage + * will retain the offset in a local variable when it + * issues its I/Os to the NAND controller. + * + * Note that LwVM may end up splitting this I/O + * into sub-I/Os if it crosses a chunk boundary. In this + * case, LwVM will update this field when it dispatches + * each I/O to IOFlashStorage. But from our perspective + * we have only issued a single I/O. + */ + bufattr_setcpoff (&(bp->b_attr), (u_int64_t)f_offset); + } + } +#endif + /* * we can issue the I/O because... 
* either B_CLUSTER is set which @@ -1489,12 +1611,20 @@ try_dirty_list: void buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg) { + + (void) buf_flushdirtyblks_skipinfo(vp, wait, flags, msg); + return; +} + +int +buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg) { buf_t bp; int writes_issued = 0; errno_t error; int busy = 0; struct buflists local_iterblkhd; int lock_flags = BAC_NOWAIT | BAC_REMOVE; + int any_locked = 0; if (flags & BUF_SKIP_LOCKED) lock_flags |= BAC_SKIP_LOCKED; @@ -1508,11 +1638,26 @@ loop: bp = LIST_FIRST(&local_iterblkhd); LIST_REMOVE(bp, b_vnbufs); LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs); - - if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY) - busy++; - if (error) - continue; + + if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY) { + busy++; + } + if (error) { + /* + * If we passed in BUF_SKIP_LOCKED or BUF_SKIP_NONLOCKED, + * we may want to do something differently if a locked or unlocked + * buffer was encountered (depending on the arg specified). + * In this case, we know that one of those two was set, and the + * buf acquisition failed above. + * + * If it failed with EDEADLK, then save state which can be emitted + * later on to the caller. Most callers should not care. + */ + if (error == EDEADLK) { + any_locked++; + } + continue; + } lck_mtx_unlock(buf_mtxp); bp->b_flags &= ~B_LOCKED; @@ -1558,6 +1703,8 @@ loop: goto loop; } } + + return any_locked; } @@ -2267,6 +2414,8 @@ buf_brelse_shadow(buf_t bp) buf_t bp_data; int data_ref = 0; #endif + int need_wakeup = 0; + lck_mtx_lock_spin(buf_mtxp); bp_head = (buf_t)bp->b_orig; @@ -2334,8 +2483,17 @@ buf_brelse_shadow(buf_t bp) bp_return = bp_head; } + if (ISSET(bp_head->b_lflags, BL_WANTED_REF)) { + CLR(bp_head->b_lflags, BL_WANTED_REF); + need_wakeup = 1; + } } lck_mtx_unlock(buf_mtxp); + + if (need_wakeup) { + wakeup(bp_head); + } + #ifdef BUF_MAKE_PRIVATE if (bp == bp_data && data_ref == 0) buf_free_meta_store(bp); @@ -2662,7 +2820,30 @@ incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp) return (NULL); } +void +buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno) +{ + buf_t bp; + struct bufhashhdr *dp; + + dp = BUFHASH(vp, blkno); + + lck_mtx_lock_spin(buf_mtxp); + + for (;;) { + if ((bp = incore_locked(vp, blkno, dp)) == NULL) + break; + + if (bp->b_shadow_ref == 0) + break; + + SET(bp->b_lflags, BL_WANTED_REF); + (void) msleep(bp, buf_mtxp, PSPIN | (PRIBIO+1), "buf_wait_for_shadow", NULL); + } + lck_mtx_unlock(buf_mtxp); +} + /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */ /* * Get a block of requested size that is associated with @@ -3409,9 +3590,6 @@ bcleanbuf(buf_t bp, boolean_t discard) bp->b_bcount = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_validoff = bp->b_validend = 0; -#ifdef CONFIG_PROTECT - bp->b_cpentry = 0; -#endif bzero(&bp->b_attr, sizeof(struct bufattr)); lck_mtx_lock_spin(buf_mtxp); @@ -3654,12 +3832,15 @@ buf_biodone(buf_t bp) else if (bp->b_flags & B_PASSIVE) code |= DKIO_PASSIVE; - KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, + if (bp->b_attr.ba_flags & BA_NOCACHE) + code |= DKIO_NOCACHE; + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, bp, (uintptr_t)bp->b_vp, bp->b_resid, bp->b_error, 0); } if ((bp->b_vp != NULLVP) && - ((bp->b_flags & (B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) && + ((bp->b_flags & (B_THROTTLED_IO | B_PASSIVE | B_IOSTREAMING | B_PAGEIO | B_READ | B_THROTTLED_IO | 
B_PASSIVE)) == (B_PAGEIO | B_READ)) && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) { microuptime(&priority_IO_timestamp_for_root); hard_throttle_on_root = 0; @@ -3672,7 +3853,12 @@ buf_biodone(buf_t bp) * indicators */ CLR(bp->b_flags, (B_WASDIRTY | B_THROTTLED_IO | B_PASSIVE)); - CLR(bp->b_attr.ba_flags, (BA_THROTTLED_IO)); + CLR(bp->b_attr.ba_flags, (BA_META | BA_NOCACHE)); +#if !CONFIG_EMBEDDED + CLR(bp->b_attr.ba_flags, (BA_THROTTLED_IO | BA_DELAYIDLESLEEP)); +#else + CLR(bp->b_attr.ba_flags, BA_THROTTLED_IO); +#endif /* !CONFIG_EMBEDDED */ DTRACE_IO1(done, buf_t, bp); if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW)) @@ -3769,6 +3955,7 @@ count_lock_queue(void) /* * Return a count of 'busy' buffers. Used at the time of shutdown. + * note: This is also called from the mach side in debug context in kdp.c */ int count_busy_buffers(void) @@ -3864,9 +4051,6 @@ alloc_io_buf(vnode_t vp, int priv) bp->b_bufsize = 0; bp->b_upl = NULL; bp->b_vp = vp; -#ifdef CONFIG_PROTECT - bp->b_cpentry = 0; -#endif bzero(&bp->b_attr, sizeof(struct bufattr)); if (vp && (vp->v_type == VBLK || vp->v_type == VCHR)) @@ -4085,7 +4269,7 @@ buffer_cache_gc(int all) boolean_t did_large_zfree = FALSE; boolean_t need_wakeup = FALSE; int now = buf_timestamp(); - uint32_t found = 0, total_found = 0; + uint32_t found = 0; struct bqueues privq; int thresh_hold = BUF_STALE_THRESHHOLD; @@ -4093,11 +4277,14 @@ buffer_cache_gc(int all) thresh_hold = 0; /* * We only care about metadata (incore storage comes from zalloc()). - * No more than 1024 buffers total, and only those not accessed within the - * last 30s. We will also only examine 128 buffers during a single grab - * of the lock in order to limit lock hold time. + * Unless "all" is set (used to evict meta data buffers in preparation + * for deep sleep), we only evict up to BUF_MAX_GC_BATCH_SIZE buffers + * that have not been accessed in the last 30s. This limit controls both + * the hold time of the global lock "buf_mtxp" and the length of time + * we spend compute-bound in the GC thread which calls this function */ lck_mtx_lock(buf_mtxp); + do { found = 0; TAILQ_INIT(&privq); @@ -4179,7 +4366,6 @@ buffer_cache_gc(int all) bp->b_whichq = BQ_EMPTY; BLISTNONE(bp); } - lck_mtx_lock(buf_mtxp); /* Back under lock, move them all to invalid hash and clear busy */ @@ -4199,9 +4385,8 @@ buffer_cache_gc(int all) /* And do a big bulk move to the empty queue */ TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist); - total_found += found; - } while ((all || (total_found < BUF_MAX_GC_COUNT)) && (found == BUF_MAX_GC_BATCH_SIZE)); + } while (all && (found == BUF_MAX_GC_BATCH_SIZE)); lck_mtx_unlock(buf_mtxp); diff --git a/bsd/vfs/vfs_cache.c b/bsd/vfs/vfs_cache.c index 3096d1294..73fbb3afb 100644 --- a/bsd/vfs/vfs_cache.c +++ b/bsd/vfs/vfs_cache.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -192,6 +192,11 @@ static unsigned int crc32tab[256]; * If BUILDPATH_NO_FS_ENTER is set in flags, it only uses values present * in the name cache and does not enter the file system. * + * If BUILDPATH_CHECK_MOVED is set in flags, we return EAGAIN when + * we encounter ENOENT during path reconstruction. ENOENT means that + * one of the parents moved while we were building the path. The + * caller can handle this case specially by calling build_path again. 
+ * * passed in vp must have a valid io_count reference */ int @@ -309,7 +314,8 @@ again: * Walk up the parent chain. */ if (((vp->v_parent != NULLVP) && !fixhardlink) || - (flags & BUILDPATH_NO_FS_ENTER)) { + (flags & BUILDPATH_NO_FS_ENTER)) { + /* * In this if () block we are not allowed to enter the filesystem * to conclusively get the most accurate parent identifier. @@ -323,17 +329,17 @@ again: /* The code below will exit early if 'tvp = vp' == NULL */ } - vp = vp->v_parent; - + /* * if the vnode we have in hand isn't a directory and it * has a v_parent, then we started with the resource fork * so skip up to avoid getting a duplicate copy of the * file name in the path. */ - if (vp && !vnode_isdir(vp) && vp->v_parent) + if (vp && !vnode_isdir(vp) && vp->v_parent) { vp = vp->v_parent; + } } else { /* * No parent, go get it if supported. @@ -492,6 +498,14 @@ out: */ *outlen = &buff[buflen] - end; + /* One of the parents was moved during path reconstruction. + * The caller is interested in knowing whether any of the + * parents moved via BUILDPATH_CHECK_MOVED, so return EAGAIN. + */ + if ((ret == ENOENT) && (flags & BUILDPATH_CHECK_MOVED)) { + ret = EAGAIN; + } + return (ret); } diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 2bccd5bb3..69dfdfda3 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include #include @@ -116,6 +117,8 @@ #define CL_IOSTREAMING 0x4000 #define CL_CLOSE 0x8000 #define CL_ENCRYPTED 0x10000 +#define CL_RAW_ENCRYPTED 0x20000 +#define CL_NOCACHE 0x40000 #define MAX_VECTOR_UPL_ELEMENTS 8 #define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE) * PAGE_SIZE @@ -201,6 +204,30 @@ static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *le static kern_return_t vfs_drt_control(void **cmapp, int op_type); +/* + * For throttled IO to check whether + * a block is cached by the boot cache + * and thus it can avoid delaying the IO. + * + * bootcache_contains_block is initially + * NULL. The BootCache will set it while + * the cache is active and clear it when + * the cache is jettisoned. + * + * Returns 0 if the block is not + * contained in the cache, 1 if it is + * contained. + * + * The function pointer remains valid + * after the cache has been evicted even + * if bootcache_contains_block has been + * cleared. 
+ * + * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs + */ +int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL; + + /* * limit the internal I/O size so that we * can represent it in a 32 bit int @@ -214,16 +241,26 @@ static kern_return_t vfs_drt_control(void **cmapp, int op_type); #define WRITE_THROTTLE_SSD 2 #define WRITE_BEHIND 1 #define WRITE_BEHIND_SSD 1 + +#if CONFIG_EMBEDDED +#define PREFETCH 1 +#define PREFETCH_SSD 1 +uint32_t speculative_prefetch_max = 512; /* maximum number of pages to use for a speculative read-ahead */ +uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use for a speculative read-ahead */ +#else #define PREFETCH 3 -#define PREFETCH_SSD 2 +#define PREFETCH_SSD 1 +uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3); +uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use for a speculative read-ahead on SSDs */ +#endif -#define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * base) + +#define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base)) #define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE)) -#define MAX_PREFETCH(vp, size, is_ssd) (size * IO_SCALE(vp, (is_ssd && !ignore_is_ssd) ? PREFETCH_SSD : PREFETCH)) +#define MAX_PREFETCH(vp, size, is_ssd) (size * IO_SCALE(vp, ((is_ssd && !ignore_is_ssd) ? PREFETCH_SSD : PREFETCH))) int ignore_is_ssd = 0; int speculative_reads_disabled = 0; -uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3); /* * throttle the number of async writes that @@ -231,11 +268,25 @@ uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3); * before we issue a synchronous write */ #define HARD_THROTTLE_MAXCNT 0 -#define HARD_THROTTLE_MAXSIZE (256 * 1024) +#define HARD_THROTTLE_MAX_IOSIZE (128 * 1024) +#define LEGACY_HARD_THROTTLE_MAX_IOSIZE (512 * 1024) +extern int32_t throttle_legacy_process_count; int hard_throttle_on_root = 0; +uint32_t hard_throttle_max_iosize = HARD_THROTTLE_MAX_IOSIZE; +uint32_t legacy_hard_throttle_max_iosize = LEGACY_HARD_THROTTLE_MAX_IOSIZE; struct timeval priority_IO_timestamp_for_root; +#if CONFIG_EMBEDDED +#define THROTTLE_MAX_IOSIZE (hard_throttle_max_iosize) +#else +#define THROTTLE_MAX_IOSIZE (throttle_legacy_process_count == 0 ? 
hard_throttle_max_iosize : legacy_hard_throttle_max_iosize) +#endif + + +SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &hard_throttle_max_iosize, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &legacy_hard_throttle_max_iosize, 0, ""); + void cluster_init(void) { @@ -426,31 +477,47 @@ cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *c } +static int +cluster_io_present_in_BC(vnode_t vp, off_t f_offset) +{ + daddr64_t blkno; + size_t io_size; + int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block; + + if (bootcache_check_fn) { + if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ, NULL)) + return(0); + + if (io_size == 0) + return (0); + + if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) + return(1); + } + return(0); +} + + static int cluster_hard_throttle_on(vnode_t vp, uint32_t hard_throttle) { - struct uthread *ut; + int throttle_type = 0; - if (hard_throttle) { - static struct timeval hard_throttle_maxelapsed = { 0, 200000 }; + if ( (throttle_type = throttle_io_will_be_throttled(-1, vp->v_mount)) ) + return(throttle_type); - if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) { - struct timeval elapsed; + if (hard_throttle && (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) { + static struct timeval hard_throttle_maxelapsed = { 0, 100000 }; + struct timeval elapsed; - if (hard_throttle_on_root) - return(1); + if (hard_throttle_on_root) + return(1); - microuptime(&elapsed); - timevalsub(&elapsed, &priority_IO_timestamp_for_root); + microuptime(&elapsed); + timevalsub(&elapsed, &priority_IO_timestamp_for_root); - if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <)) - return(1); - } - } - if (throttle_get_io_policy(&ut) == IOPOL_THROTTLE) { - if (throttle_io_will_be_throttled(-1, vp->v_mount)) { + if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <)) return(1); - } } return(0); } @@ -707,7 +774,7 @@ uint32_t cluster_hard_throttle_limit(vnode_t vp, uint32_t *limit, uint32_t hard_throttle) { if (cluster_hard_throttle_on(vp, hard_throttle)) { - *limit = HARD_THROTTLE_MAXSIZE; + *limit = THROTTLE_MAX_IOSIZE; return 1; } return 0; @@ -948,8 +1015,8 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if (flags & CL_THROTTLE) { if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp, 1)) { - if (max_iosize > HARD_THROTTLE_MAXSIZE) - max_iosize = HARD_THROTTLE_MAXSIZE; + if (max_iosize > THROTTLE_MAX_IOSIZE) + max_iosize = THROTTLE_MAX_IOSIZE; async_throttle = HARD_THROTTLE_MAXCNT; } else { if ( (flags & CL_DEV_MEMORY) ) @@ -1397,6 +1464,8 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no } cbp->b_cliodone = (void *)callback; cbp->b_flags |= io_flags; + if (flags & CL_NOCACHE) + cbp->b_attr.ba_flags |= BA_NOCACHE; cbp->b_lblkno = lblkno; cbp->b_blkno = blkno; @@ -1489,6 +1558,14 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if ( !(io_flags & B_READ)) vnode_startwrite(vp); + if (flags & CL_RAW_ENCRYPTED) { + /* + * User requested raw encrypted bytes. 
+ * Twiddle the bit in the ba_flags for the buffer + */ + cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO; + } + (void) VNOP_STRATEGY(cbp); if (need_EOT == TRUE) { @@ -1914,9 +1991,10 @@ cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t else bflag = 0; - if (vp->v_flag & VNOCACHE_DATA) + if (vp->v_flag & VNOCACHE_DATA){ flags |= IO_NOCACHE; - + bflag |= CL_NOCACHE; + } if (uio == NULL) { /* * no user data... @@ -2058,7 +2136,10 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in user_addr_t iov_base; u_int32_t mem_alignment_mask; u_int32_t devblocksize; + u_int32_t max_io_size; u_int32_t max_upl_size; + u_int32_t max_vector_size; + boolean_t io_throttled = FALSE; u_int32_t vector_upl_iosize = 0; int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); @@ -2080,7 +2161,10 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in if (flags & IO_PASSIVE) io_flag |= CL_PASSIVE; - + + if (flags & IO_NOCACHE) + io_flag |= CL_NOCACHE; + iostate.io_completed = 0; iostate.io_issued = 0; iostate.io_error = 0; @@ -2129,6 +2213,33 @@ next_dwrite: } while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) { + int throttle_type; + + if ( (throttle_type = cluster_hard_throttle_on(vp, 1)) ) { + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ + if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == 2) { + /* + * we're in the throttle window and at least 1 I/O + * has already been issued by a throttleable thread + * in this window, so return with EAGAIN to indicate + * to the FS issuing the cluster_write call that it + * should now throttle after dropping any locks + */ + throttle_info_update_by_mount(vp->v_mount); + + io_throttled = TRUE; + goto wait_for_dwrites; + } + max_vector_size = THROTTLE_MAX_IOSIZE; + max_io_size = THROTTLE_MAX_IOSIZE; + } else { + max_vector_size = MAX_VECTOR_UPL_SIZE; + max_io_size = max_upl_size; + } if (first_IO) { cluster_syncup(vp, newEOF, callback, callback_arg); @@ -2137,8 +2248,8 @@ next_dwrite: io_size = io_req_size & ~PAGE_MASK; iov_base = uio_curriovbase(uio); - if (io_size > max_upl_size) - io_size = max_upl_size; + if (io_size > max_io_size) + io_size = max_io_size; if(useVectorUPL && (iov_base & PAGE_MASK)) { /* @@ -2304,7 +2415,7 @@ next_dwrite: vector_upl_iosize += io_size; vector_upl_size += upl_size; - if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= MAX_VECTOR_UPL_SIZE) { + if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); reset_vector_run_state(); } @@ -2367,6 +2478,9 @@ wait_for_dwrites: lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (io_throttled == TRUE && retval == 0) + retval = EAGAIN; + if (io_req_size && retval == 0) { /* * we couldn't handle the tail of this request in DIRECT mode @@ -2671,7 +2785,9 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old bflag = CL_PASSIVE; else bflag = 0; - + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; + zero_cnt = 0; zero_cnt1 = 0; zero_off = 0; @@ -3286,17 +3402,34 @@ cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (* flags |= IO_NOCACHE; if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) flags |= IO_RAOFF; + + /* + 
* If we're doing an encrypted IO, then first check to see + * if the IO requested was page aligned. If not, then bail + * out immediately. + */ + if (flags & IO_ENCRYPTED) { + if (read_length & PAGE_MASK) { + retval = EINVAL; + return retval; + } + } - /* + /* * do a read through the cache if one of the following is true.... * NOCACHE is not true * the uio request doesn't target USERSPACE + * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well. + * Reading encrypted data from a CP filesystem should never result in the data touching + * the UBC. + * * otherwise, find out if we want the direct or contig variant for * the first vector in the uio request */ - if ( (flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) - retval = cluster_io_type(uio, &read_type, &read_length, 0); - + if (((flags & IO_NOCACHE) || (flags & IO_ENCRYPTED)) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) { + retval = cluster_io_type(uio, &read_type, &read_length, 0); + } + while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) { switch (read_type) { @@ -3380,27 +3513,19 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file struct cl_extent extent; int bflag; int take_reference = 1; -#if CONFIG_EMBEDDED - struct uthread *ut; -#endif /* CONFIG_EMBEDDED */ int policy = IOPOL_DEFAULT; boolean_t iolock_inited = FALSE; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START, (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0); + + if (flags & IO_ENCRYPTED) { + panic ("encrypted blocks will hit UBC!"); + } -#if !CONFIG_EMBEDDED policy = proc_get_task_selfdiskacc(); -#else /* !CONFIG_EMBEDDED */ - policy = current_proc()->p_iopol_disk; - ut = get_bsdthread_info(current_thread()); - - if (ut->uu_iopol_disk != IOPOL_DEFAULT) - policy = ut->uu_iopol_disk; -#endif /* !CONFIG_EMBEDDED */ - - if (policy == IOPOL_THROTTLE || (flags & IO_NOCACHE)) + if (policy == IOPOL_THROTTLE || policy == IOPOL_UTILITY || (flags & IO_NOCACHE)) take_reference = 0; if (flags & IO_PASSIVE) @@ -3408,6 +3533,9 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file else bflag = 0; + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; + max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); max_prefetch = MAX_PREFETCH(vp, max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD)); max_rd_size = max_prefetch; @@ -3422,13 +3550,15 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file rap = NULL; } else { if (cluster_hard_throttle_on(vp, 1)) { + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ rd_ahead_enabled = 0; prefetch_enabled = 0; - max_rd_size = HARD_THROTTLE_MAXSIZE; - } else if (policy == IOPOL_THROTTLE) { - rd_ahead_enabled = 0; - prefetch_enabled = 0; + max_rd_size = THROTTLE_MAX_IOSIZE; } if ((rap = cluster_get_rap(vp)) == NULL) rd_ahead_enabled = 0; @@ -3547,6 +3677,30 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file */ max_size = filesize - uio->uio_offset; } + + iostate.io_completed = 0; + iostate.io_issued = 0; + iostate.io_error = 0; + iostate.io_wanted = 0; + + if ( (flags & IO_RETURN_ON_THROTTLE) ) { + if (cluster_hard_throttle_on(vp, 0) == 2) { + if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) { + /* + * we're in the throttle window and at least 1 I/O + * has already been issued by a throttleable thread + * in this window, so return with EAGAIN to indicate + * to the FS 
issuing the cluster_read call that it + * should now throttle after dropping any locks + */ + throttle_info_update_by_mount(vp->v_mount); + + retval = EAGAIN; + break; + } + } + } + /* * compute the size of the upl needed to encompass * the requested read... limit each call to cluster_io @@ -3608,10 +3762,6 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file if (upl_valid_page(pl, last_pg)) break; } - iostate.io_completed = 0; - iostate.io_issued = 0; - iostate.io_error = 0; - iostate.io_wanted = 0; if (start_pg < last_pg) { /* @@ -3804,16 +3954,20 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file if (io_req_size) { if (cluster_hard_throttle_on(vp, 1)) { + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ rd_ahead_enabled = 0; prefetch_enabled = 0; - - max_rd_size = HARD_THROTTLE_MAXSIZE; + max_rd_size = THROTTLE_MAX_IOSIZE; } else { - if (max_rd_size == HARD_THROTTLE_MAXSIZE) { + if (max_rd_size == THROTTLE_MAX_IOSIZE) { /* * coming out of throttled state */ - if (policy != IOPOL_THROTTLE) { + if (policy != IOPOL_THROTTLE && policy != IOPOL_UTILITY) { if (rap != NULL) rd_ahead_enabled = 1; prefetch_enabled = 1; @@ -3884,7 +4038,9 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t max_upl_size; u_int32_t max_rd_size; u_int32_t max_rd_ahead; + u_int32_t max_vector_size; boolean_t strict_uncached_IO = FALSE; + boolean_t io_throttled = FALSE; u_int32_t vector_upl_iosize = 0; int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); @@ -3905,6 +4061,14 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, if (flags & IO_PASSIVE) io_flag |= CL_PASSIVE; + if (flags & IO_ENCRYPTED) { + io_flag |= CL_RAW_ENCRYPTED; + } + + if (flags & IO_NOCACHE) { + io_flag |= CL_NOCACHE; + } + iostate.io_completed = 0; iostate.io_issued = 0; iostate.io_error = 0; @@ -3960,7 +4124,17 @@ next_dread: * I/O that ends on a page boundary in cluster_io */ misaligned = 1; - } + } + + /* + * The user must request IO in aligned chunks. If the + * offset into the file is bad, or the userland pointer + * is non-aligned, then we cannot service the encrypted IO request. + */ + if ((flags & IO_ENCRYPTED) && (misaligned)) { + retval = EINVAL; + } + /* * When we get to this point, we know... * -- the offset into the file is on a devblocksize boundary @@ -3970,22 +4144,32 @@ next_dread: u_int32_t io_start; if (cluster_hard_throttle_on(vp, 1)) { - max_rd_size = HARD_THROTTLE_MAXSIZE; - max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1; + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ + max_rd_size = THROTTLE_MAX_IOSIZE; + max_rd_ahead = THROTTLE_MAX_IOSIZE - 1; + max_vector_size = THROTTLE_MAX_IOSIZE; } else { max_rd_size = max_upl_size; max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); + max_vector_size = MAX_VECTOR_UPL_SIZE; } io_start = io_size = io_req_size; /* * First look for pages already in the cache - * and move them to user space. + * and move them to user space. But only do this + * check if we are not retrieving encrypted data directly + * from the filesystem; those blocks should never + * be in the UBC. 
* * cluster_copy_ubc_data returns the resid * in io_size */ - if (strict_uncached_IO == FALSE) { + if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); } /* @@ -4018,9 +4202,14 @@ next_dread: } /* - * check to see if we are finished with this request... + * check to see if we are finished with this request. + * + * If we satisfied this IO already, then io_req_size will be 0. + * Otherwise, see if the IO was mis-aligned and needs to go through + * the UBC to deal with the 'tail'. + * */ - if (io_req_size == 0 || misaligned) { + if (io_req_size == 0 || (misaligned)) { /* * see if there's another uio vector to * process that's of type IO_DIRECT @@ -4046,13 +4235,31 @@ next_dread: * (which overlaps the end of the direct read) in order to * get at the overhang bytes */ - if (io_size & (devblocksize - 1)) { - /* - * request does NOT end on a device block boundary - * so clip it back to a PAGE_SIZE boundary - */ - io_size &= ~PAGE_MASK; - io_min = PAGE_SIZE; + if (io_size & (devblocksize - 1)) { + if (flags & IO_ENCRYPTED) { + /* + * Normally, we'd round down to the previous page boundary to + * let the UBC manage the zero-filling of the file past the EOF. + * But if we're doing encrypted IO, we can't let any of + * the data hit the UBC. This means we have to do the full + * IO to the upper block boundary of the device block that + * contains the EOF. The user will be responsible for not + * interpreting data PAST the EOF in its buffer. + * + * So just bump the IO back up to a multiple of devblocksize + */ + io_size = ((io_size + devblocksize) & ~(devblocksize - 1)); + io_min = io_size; + } + else { + /* + * Clip the request to the previous page size boundary + * since request does NOT end on a device block boundary + */ + io_size &= ~PAGE_MASK; + io_min = PAGE_SIZE; + } + } if (retval || io_size < io_min) { /* @@ -4065,10 +4272,14 @@ next_dread: goto wait_for_dreads; } - if (strict_uncached_IO == FALSE) { + /* + * Don't re-check the UBC data if we are looking for uncached IO + * or asking for encrypted blocks. 
+ */ + if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { if ((xsize = io_size) > max_rd_size) - xsize = max_rd_size; + xsize = max_rd_size; io_size = 0; @@ -4083,6 +4294,25 @@ next_dread: continue; } } + if ( (flags & IO_RETURN_ON_THROTTLE) ) { + if (cluster_hard_throttle_on(vp, 0) == 2) { + if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) { + /* + * we're in the throttle window and at least 1 I/O + * has already been issued by a throttleable thread + * in this window, so return with EAGAIN to indicate + * to the FS issuing the cluster_read call that it + * should now throttle after dropping any locks + */ + throttle_info_update_by_mount(vp->v_mount); + + io_throttled = TRUE; + goto wait_for_dreads; + } + } + } + if (io_size > max_rd_size) + io_size = max_rd_size; iov_base = uio_curriovbase(uio); @@ -4216,7 +4446,7 @@ next_dread: vector_upl_size += upl_size; vector_upl_iosize += io_size; - if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= MAX_VECTOR_UPL_SIZE) { + if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); reset_vector_run_state(); } @@ -4224,9 +4454,24 @@ next_dread: /* * update the uio structure */ - uio_update(uio, (user_size_t)io_size); - - io_req_size -= io_size; + if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) { + uio_update(uio, (user_size_t)max_io_size); + } + else { + uio_update(uio, (user_size_t)io_size); + } + /* + * Under normal circumstances, the io_size should not be + * bigger than the io_req_size, but we may have had to round up + * to the end of the page in the encrypted IO case. In that case only, + * ensure that we only decrement io_req_size to 0. 
+ */ + if ((flags & IO_ENCRYPTED) && (io_size > io_req_size)) { + io_req_size = 0; + } + else { + io_req_size -= io_size; + } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END, upl, (int)uio->uio_offset, io_req_size, retval, 0); @@ -4264,6 +4509,9 @@ wait_for_dreads: lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (io_throttled == TRUE && retval == 0) + retval = EAGAIN; + if (io_req_size && retval == 0) { /* * we couldn't handle the tail of this request in DIRECT mode @@ -4311,7 +4559,10 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, bflag = CL_PASSIVE; else bflag = 0; - + + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; + /* * When we enter this routine, we know * -- the read_length will not exceed the current iov_len @@ -4595,6 +4846,16 @@ advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*c max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); +#if CONFIG_EMBEDDED + if (max_io_size > speculative_prefetch_max_iosize) + max_io_size = speculative_prefetch_max_iosize; +#else + if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) { + if (max_io_size > speculative_prefetch_max_iosize) + max_io_size = speculative_prefetch_max_iosize; + } +#endif + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START, (int)f_offset, resid, (int)filesize, 0, 0); @@ -5222,6 +5483,9 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c if (flags & IO_CLOSE) io_flags |= CL_CLOSE; + if (flags & IO_NOCACHE) + io_flags |= CL_NOCACHE; + retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); @@ -5348,6 +5612,9 @@ cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t else bflag = 0; + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; + upl_flags = UPL_SET_LITE; if ( !(flags & CL_READ) ) { diff --git a/bsd/vfs/vfs_conf.c b/bsd/vfs/vfs_conf.c index a4a962b66..a64040dd9 100644 --- a/bsd/vfs/vfs_conf.c +++ b/bsd/vfs/vfs_conf.c @@ -72,11 +72,11 @@ #include #include -#ifndef __LP64__ -#define VFS_THREAD_SAFE_FLAG VFC_VFSTHREADSAFE /* This is only defined for 32-bit */ +#if CONFIG_VFS_FUNNEL +#define VFS_THREAD_SAFE_FLAG VFC_VFSTHREADSAFE /* Only defined under CONFIG_VFS_FUNNEL */ #else #define VFS_THREAD_SAFE_FLAG 0 -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ /* diff --git a/bsd/vfs/vfs_fsevents.c b/bsd/vfs/vfs_fsevents.c index b92b69a28..179a264d4 100644 --- a/bsd/vfs/vfs_fsevents.c +++ b/bsd/vfs/vfs_fsevents.c @@ -61,7 +61,7 @@ #include #include - +#include typedef struct kfs_event { LIST_ENTRY(kfs_event) kevent_list; @@ -124,8 +124,8 @@ typedef struct fs_event_watcher { #define MAX_WATCHERS 8 static fs_event_watcher *watcher_table[MAX_WATCHERS]; - -#define MAX_KFS_EVENTS 4096 +#define DEFAULT_MAX_KFS_EVENTS 4096 +static int max_kfs_events = DEFAULT_MAX_KFS_EVENTS; // we allocate kfs_event structures out of this zone static zone_t event_zone; @@ -190,9 +190,11 @@ fsevents_internal_init(void) lck_rw_init(&event_handling_lock, fsevent_rw_group, fsevent_lock_attr); + PE_get_default("kern.maxkfsevents", &max_kfs_events, sizeof(max_kfs_events)); + event_zone = zinit(sizeof(kfs_event), - MAX_KFS_EVENTS * sizeof(kfs_event), - MAX_KFS_EVENTS * sizeof(kfs_event), + max_kfs_events * sizeof(kfs_event), + max_kfs_events * sizeof(kfs_event), "fs-event-buf"); if (event_zone == NULL) { printf("fsevents: failed to initialize the event zone.\n"); @@ -204,7 +206,7 @@ fsevents_internal_init(void) 
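/*
 * [Editor's sketch -- not part of the patch.] How a filesystem is expected
 * to consume the IO_RETURN_ON_THROTTLE / EAGAIN protocol that the
 * cluster_read/cluster_write paths above introduce: pass the flag, and if
 * the cluster layer bails out with EAGAIN, drop the FS locks, sleep out
 * the throttle window, and retry. cluster_read_ext() is the real KPI;
 * throttle_lowpri_io() is assumed to be the throttle-sleep primitive, and
 * the mutex stands in for whatever locking the caller actually holds.
 */
static int
fs_read_with_throttle(vnode_t vp, struct uio *uio, off_t filesize,
    int ioflag, lck_mtx_t *fslock)
{
	int error;

	for (;;) {
		error = cluster_read_ext(vp, uio, filesize,
		    ioflag | IO_RETURN_ON_THROTTLE, NULL, NULL);
		if (error != EAGAIN)
			return (error);
		/*
		 * EAGAIN: another throttleable thread already issued I/O in
		 * this window. Drop locks so we don't stall other threads,
		 * wait, then retry the read.
		 */
		lck_mtx_unlock(fslock);
		throttle_lowpri_io(1);
		lck_mtx_lock(fslock);
	}
}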
zone_change(event_zone, Z_COLLECT, FALSE); zone_change(event_zone, Z_CALLERACCT, FALSE); - if (zfill(event_zone, MAX_KFS_EVENTS) < MAX_KFS_EVENTS) { + if (zfill(event_zone, max_kfs_events) < max_kfs_events) { printf("fsevents: failed to pre-fill the event zone.\n"); } @@ -999,13 +1001,7 @@ add_fsevent(int type, vfs_context_t ctx, ...) pathbuff[0] = '\0'; if ((ret = vn_getpath(vp, pathbuff, &pathbuff_len)) != 0 || pathbuff[0] == '\0') { - struct vnode *orig_vp = vp; - if (ret != ENOSPC) { - printf("add_fsevent: unable to get path for vp %p (%s; ret %d; type %d)\n", - vp, vp->v_name ? vp->v_name : "-UNKNOWN-FILE", ret, type); - } - cur->flags |= KFSE_CONTAINS_DROPPED_EVENTS; do { @@ -1027,7 +1023,6 @@ add_fsevent(int type, vfs_context_t ctx, ...) } while (ret == ENOSPC); if (ret != 0 || vp == NULL) { - printf("add_fsevent: unabled to get a path for vp %p. dropping the event.\n", orig_vp); error = ENOENT; if (need_event_unlock == 0) { // then we only grabbed it shared @@ -1277,13 +1272,13 @@ release_event_ref(kfs_event *kfse) static int -add_watcher(int8_t *event_list, int32_t num_events, int32_t eventq_size, fs_event_watcher **watcher_out) +add_watcher(int8_t *event_list, int32_t num_events, int32_t eventq_size, fs_event_watcher **watcher_out, void *fseh) { int i; fs_event_watcher *watcher; - if (eventq_size <= 0 || eventq_size > 100*MAX_KFS_EVENTS) { - eventq_size = MAX_KFS_EVENTS; + if (eventq_size <= 0 || eventq_size > 100*max_kfs_events) { + eventq_size = max_kfs_events; } // Note: the event_queue follows the fs_event_watcher struct @@ -1308,7 +1303,7 @@ add_watcher(int8_t *event_list, int32_t num_events, int32_t eventq_size, fs_even watcher->blockers = 0; watcher->num_readers = 0; watcher->max_event_id = 0; - watcher->fseh = NULL; + watcher->fseh = fseh; watcher->num_dropped = 0; // XXXdbg - debugging @@ -1922,13 +1917,14 @@ typedef struct ext_fsevent_dev_filter_args { } ext_fsevent_dev_filter_args; #pragma pack(pop) +#define NEW_FSEVENTS_DEVICE_FILTER _IOW('s', 100, ext_fsevent_dev_filter_args) + typedef struct old_fsevent_dev_filter_args { uint32_t num_devices; int32_t devices; } old_fsevent_dev_filter_args; #define OLD_FSEVENTS_DEVICE_FILTER _IOW('s', 100, old_fsevent_dev_filter_args) -#define NEW_FSEVENTS_DEVICE_FILTER _IOW('s', 100, ext_fsevent_dev_filter_args) #if __LP64__ /* need this in spite of the padding due to alignment of devices */ @@ -1948,7 +1944,8 @@ fseventsf_ioctl(struct fileproc *fp, u_long cmd, caddr_t data, vfs_context_t ctx if (proc_is64bit(vfs_context_proc(ctx))) { devfilt_args = (ext_fsevent_dev_filter_args *)data; - } else if (cmd == OLD_FSEVENTS_DEVICE_FILTER) { + } + else if (cmd == OLD_FSEVENTS_DEVICE_FILTER) { old_fsevent_dev_filter_args *udev_filt_args = (old_fsevent_dev_filter_args *)data; devfilt_args = &_devfilt_args; @@ -1956,7 +1953,8 @@ fseventsf_ioctl(struct fileproc *fp, u_long cmd, caddr_t data, vfs_context_t ctx devfilt_args->num_devices = udev_filt_args->num_devices; devfilt_args->devices = CAST_USER_ADDR_T(udev_filt_args->devices); - } else { + } + else { #if __LP64__ fsevent_dev_filter_args32 *udev_filt_args = (fsevent_dev_filter_args32 *)data; #else @@ -2530,14 +2528,14 @@ fseventsioctl(__unused dev_t dev, u_long cmd, caddr_t data, __unused int flag, s error = add_watcher(event_list, fse_clone_args->num_events, fse_clone_args->event_queue_depth, - &fseh->watcher); + &fseh->watcher, + fseh); if (error) { FREE(event_list, M_TEMP); FREE(fseh, M_TEMP); return error; } - // connect up the watcher with this fsevent_handle fseh->watcher->fseh = 
fseh; error = falloc(p, &f, &fd, vfs_context_current()); diff --git a/bsd/vfs/vfs_journal.c b/bsd/vfs/vfs_journal.c index 4999f814b..3864ff34c 100644 --- a/bsd/vfs/vfs_journal.c +++ b/bsd/vfs/vfs_journal.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2011 Apple Inc. All rights reserved. + * Copyright (c) 2002-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -115,10 +115,12 @@ SYSCTL_INT(_vfs_generic_jnl_kdebug, OID_AUTO, trim, CTLFLAG_RW|CTLFLAG_LOCKED, & #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT #endif + #ifndef CONFIG_HFS_TRIM #define CONFIG_HFS_TRIM 0 #endif + #if JOURNALING // @@ -136,8 +138,7 @@ enum { unsigned int jnl_trim_flush_limit = JOURNAL_FLUSH_TRIM_EXTENTS; SYSCTL_UINT (_kern, OID_AUTO, jnl_trim_flush, CTLFLAG_RW, &jnl_trim_flush_limit, 0, "number of trimmed extents to cause a journal flush"); - -/* XXX next prototytype should be from libsa/stdlib.h> but conflicts libkern */ +/* XXX next prototype should be from libsa/stdlib.h> but conflicts libkern */ __private_extern__ void qsort( void * array, size_t nmembers, @@ -1099,6 +1100,7 @@ replay_journal(journal *jnl) struct bucket *co_buf; int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0; uint32_t last_sequence_num = 0; + int replay_retry_count = 0; // wrap the start ptr if it points to the very end of the journal if (jnl->jhdr->start == jnl->jhdr->size) { @@ -1336,11 +1338,25 @@ restart_replay: bad_txn_handling: if (bad_blocks) { + /* Journal replay got error before it found any valid + * transations, abort replay */ if (txn_start_offset == 0) { printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name); goto bad_replay; } + /* Repeated error during journal replay, abort replay */ + if (replay_retry_count == 3) { + printf("jnl: %s: repeated errors replaying journal! aborting journal replay.\n", jnl->jdev_name); + goto bad_replay; + } + replay_retry_count++; + + /* There was an error replaying the journal (possibly + * EIO/ENXIO from the device). So retry replaying all + * the good transactions that we found before getting + * the error. + */ jnl->jhdr->start = orig_jnl_start; jnl->jhdr->end = txn_start_offset; check_past_jnl_end = 0; @@ -1763,7 +1779,8 @@ journal_create(struct vnode *jvp, lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr); lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr); - + + jnl->flushing = FALSE; jnl->asyncIO = FALSE; jnl->flush_aborted = FALSE; @@ -1911,26 +1928,24 @@ journal_open(struct vnode *jvp, jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; } - if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) { - /* - * The volume has probably been resized (such that we had to adjust the - * logical sector size), or copied to media with a different logical - * sector size. - * - * Temporarily change the device's logical block size to match the - * journal's header size. This will allow us to replay the journal - * safely. If the replay succeeds, we will update the journal's header - * size (later in this function). - */ - - orig_blksz = phys_blksz; - phys_blksz = jnl->jhdr->jhdr_size; - VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context); + if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) { + /* + * The volume has probably been resized (such that we had to adjust the + * logical sector size), or copied to media with a different logical + * sector size. 
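The replay_journal() hunk above bounds a previously unbounded failure mode: on an error (possibly EIO/ENXIO from the device) it rewinds to the window of transactions it already validated and replays just those, giving up after three retries or if no good transaction start was ever found. A userland sketch of that retry policy, with a simulated replay pass standing in for the real one:

    #include <stdio.h>

    #define MAX_REPLAY_RETRIES 3

    /* Simulated single replay pass: fails twice (as if the device returned
     * EIO), then succeeds.  It always reports the first transaction offset
     * it managed to validate. */
    static int
    replay_once(long start, long end, long *first_good_txn)
    {
        static int failures_left = 2;
        (void)start; (void)end;
        *first_good_txn = 4096;          /* pretend the txn at 4096 validated */
        return (failures_left-- > 0) ? -1 : 0;
    }

    /* Retry policy modeled on the replay_journal() change: rewind to the
     * known-good window after an error, a bounded number of times. */
    static int
    replay_with_retries(long start, long end)
    {
        long first_good = 0;
        int retries = 0;

        while (replay_once(start, end, &first_good) != 0) {
            if (first_good == 0)
                return -1;               /* no known good txn start: abort */
            if (retries++ == MAX_REPLAY_RETRIES)
                return -1;               /* repeated errors: abort */
            end = first_good;            /* replay just the validated prefix */
        }
        return 0;
    }

    int
    main(void)
    {
        printf("replay %s\n", replay_with_retries(512, 1 << 20) == 0 ? "ok" : "failed");
        return 0;
    }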
+ * + * Temporarily change the device's logical block size to match the + * journal's header size. This will allow us to replay the journal + * safely. If the replay succeeds, we will update the journal's header + * size (later in this function). + */ + orig_blksz = phys_blksz; + phys_blksz = jnl->jhdr->jhdr_size; + VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, &context); + printf("jnl: %s: open: temporarily switched block size from %u to %u\n", + jdev_name, orig_blksz, phys_blksz); + } - printf("jnl: %s: open: temporarily switched block size from %u to %u\n", - jdev_name, orig_blksz, phys_blksz); - } - if ( jnl->jhdr->start <= 0 || jnl->jhdr->start > jnl->jhdr->size || jnl->jhdr->start > 1024*1024*1024) { @@ -1980,68 +1995,71 @@ journal_open(struct vnode *jvp, printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name); goto bad_journal; } - - /* - * When we get here, we know that the journal is empty (jnl->jhdr->start == - * jnl->jhdr->end). If the device's logical block size was different from - * the journal's header size, then we can now restore the device's logical - * block size and update the journal's header size to match. - * - * Note that we also adjust the journal's start and end so that they will - * be aligned on the new block size. We pick a new sequence number to - * avoid any problems if a replay found previous transactions using the old - * journal header size. (See the comments in journal_create(), above.) - */ - if (orig_blksz != 0) { - VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context); - phys_blksz = orig_blksz; - orig_blksz = 0; - printf("jnl: %s: open: restored block size to %u\n", jdev_name, phys_blksz); - jnl->jhdr->jhdr_size = phys_blksz; - jnl->jhdr->start = phys_blksz; - jnl->jhdr->end = phys_blksz; - jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num + - (journal_size / phys_blksz) + - (random() % 16384)) & 0x00ffffff; + /* + * When we get here, we know that the journal is empty (jnl->jhdr->start == + * jnl->jhdr->end). If the device's logical block size was different from + * the journal's header size, then we can now restore the device's logical + * block size and update the journal's header size to match. + * + * Note that we also adjust the journal's start and end so that they will + * be aligned on the new block size. We pick a new sequence number to + * avoid any problems if a replay found previous transactions using the old + * journal header size. (See the comments in journal_create(), above.) + */ - if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) { - printf("jnl: %s: open: failed to update journal header size\n", jdev_name); + if (orig_blksz != 0) { + VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context); + phys_blksz = orig_blksz; + + orig_blksz = 0; + + jnl->jhdr->jhdr_size = phys_blksz; + jnl->jhdr->start = phys_blksz; + jnl->jhdr->end = phys_blksz; + jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num + + (journal_size / phys_blksz) + + (random() % 16384)) & 0x00ffffff; + + if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) { + printf("jnl: %s: open: failed to update journal header size\n", jdev_name); + goto bad_journal; + } + } + + // make sure this is in sync! + jnl->active_start = jnl->jhdr->start; + jnl->sequence_num = jnl->jhdr->sequence_num; + + // set this now, after we've replayed the journal + size_up_tbuffer(jnl, tbuffer_size, phys_blksz); + + // TODO: Does this need to change if the device's logical block size changed? 
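When journal_open() finishes a replay that ran under a temporarily switched logical block size, the code above restores the device's block size, re-aligns the journal's start and end to the new size, and rebases the sequence number so transactions written under the old header size can never look current. The rebase arithmetic lifted into a userland helper (random() becomes rand(); the function name is made up):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Advance a journal sequence number past anything the old-format
     * transactions could have used: skip by the number of blocks in the
     * journal plus a random slop, and stay within the 24-bit space the
     * 0x00ffffff mask implies. */
    static uint32_t
    rebase_sequence_num(uint32_t seq, uint64_t journal_size, uint32_t blksz)
    {
        return (uint32_t)((seq + (journal_size / blksz) + (rand() % 16384)) & 0x00ffffff);
    }

    int
    main(void)
    {
        /* 8 MiB journal, 4 KiB blocks: jump at least 2048 sequence numbers. */
        printf("new seq: %u\n", rebase_sequence_num(100, 8 << 20, 4096));
        return 0;
    }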
+ if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) { + printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size, + jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size); goto bad_journal; } - } - - // make sure this is in sync! - jnl->active_start = jnl->jhdr->start; - jnl->sequence_num = jnl->jhdr->sequence_num; - - // set this now, after we've replayed the journal - size_up_tbuffer(jnl, tbuffer_size, phys_blksz); - - // TODO: Does this need to change if the device's logical block size changed? - if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) { - printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size, - jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size); - goto bad_journal; - } - - lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); - - return jnl; - - bad_journal: - if (orig_blksz != 0) { - phys_blksz = orig_blksz; - VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context); - printf("jnl: %s: open: restored block size to %u after error\n", jdev_name, orig_blksz); - } - kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); - bad_kmem_alloc: - if (jdev_name) { - vfs_removename(jdev_name); - } - FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); - return NULL; + + lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); + lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr); + lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr); + + return jnl; + +bad_journal: + if (orig_blksz != 0) { + phys_blksz = orig_blksz; + VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context); + printf("jnl: %s: open: restored block size after error\n", jdev_name); + } + kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); +bad_kmem_alloc: + if (jdev_name) { + vfs_removename(jdev_name); + } + FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); + return NULL; } @@ -2351,7 +2369,7 @@ check_free_space(journal *jnl, int desired_size, boolean_t *delayed_header_write lcl_counter = 0; while (jnl->old_start[i] & 0x8000000000000000LL) { - if (lcl_counter++ > 1000) { + if (lcl_counter++ > 10000) { panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n", jnl->old_start[i], jnl); } @@ -2922,7 +2940,6 @@ journal_kill_block(journal *jnl, struct buf *bp) return 0; } - /* ;________________________________________________________________________________ ; @@ -3016,24 +3033,23 @@ trim_realloc(struct jnl_trim_list *trim) return 0; } - /* -;________________________________________________________________________________ -; -; Routine: trim_search_extent -; -; Function: Search the given extent list to see if any of its extents -; overlap the given extent. -; -; Input Arguments: -; trim - The trim list to be searched. -; offset - The first byte of the range to be searched for. -; length - The number of bytes of the extent being searched for. -; -; Output: -; (result) - TRUE if one or more extents overlap, FALSE otherwise. -;________________________________________________________________________________ -*/ + ;________________________________________________________________________________ + ; + ; Routine: trim_search_extent + ; + ; Function: Search the given extent list to see if any of its extents + ; overlap the given extent. + ; + ; Input Arguments: + ; trim - The trim list to be searched. 
+ ; offset - The first byte of the range to be searched for. + ; length - The number of bytes of the extent being searched for. + ; + ; Output: + ; (result) - TRUE if one or more extents overlap, FALSE otherwise. + ;________________________________________________________________________________ + */ static int trim_search_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length) { @@ -3092,7 +3108,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length) dk_extent_t *extent; uint32_t insert_index; uint32_t replace_count; - + CHECK_JOURNAL(jnl); /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... */ @@ -3112,9 +3128,9 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length) } free_old_stuff(jnl); - + end = offset + length; - + /* * Find the range of existing extents that can be combined with the * input extent. We start by counting the number of extents that end @@ -3132,7 +3148,7 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length) ++replace_count; ++extent; } - + /* * If none of the existing extents can be combined with the input extent, * then just insert it in the list (before item number insert_index). @@ -3331,24 +3347,23 @@ trim_remove_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length) return 0; } - /* -;________________________________________________________________________________ -; -; Routine: journal_trim_remove_extent -; -; Function: Make note of a range of bytes, some of which may have previously -; been passed to journal_trim_add_extent, is now in use on the -; volume. The given bytes will be not be trimmed as part of -; this transaction, or a pending trim of a transaction being -; asynchronously flushed. -; -; Input Arguments: -; jnl - The journal for the volume containing the byte range. -; offset - The first byte of the range to be trimmed. -; length - The number of bytes of the extent being trimmed. -;________________________________________________________________________________ -*/ + ;________________________________________________________________________________ + ; + ; Routine: journal_trim_remove_extent + ; + ; Function: Make note of a range of bytes, some of which may have previously + ; been passed to journal_trim_add_extent, is now in use on the + ; volume. The given bytes will be not be trimmed as part of + ; this transaction, or a pending trim of a transaction being + ; asynchronously flushed. + ; + ; Input Arguments: + ; jnl - The journal for the volume containing the byte range. + ; offset - The first byte of the range to be trimmed. + ; length - The number of bytes of the extent being trimmed. 
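trim_search_extent() and journal_trim_add_extent() above both lean on ordinary half-open interval arithmetic over the sorted extent array: an extent overlaps the probe range iff each starts before the other ends, and an added extent absorbs every existing extent it can be combined with. A sketch of an overlap predicate of the kind these routines rely on (illustrative struct; the kernel uses dk_extent_t):

    #include <stdint.h>
    #include <stdio.h>

    struct extent { uint64_t offset, length; };

    /* Two byte ranges overlap iff each one starts before the other ends;
     * ranges that merely touch do not overlap. */
    static int
    extents_overlap(const struct extent *a, uint64_t offset, uint64_t length)
    {
        uint64_t end   = offset + length;
        uint64_t a_end = a->offset + a->length;
        return (offset < a_end) && (a->offset < end);
    }

    int
    main(void)
    {
        struct extent e = { 4096, 8192 };                 /* [4096, 12288) */
        printf("%d\n", extents_overlap(&e, 0, 4096));     /* 0: adjacent only */
        printf("%d\n", extents_overlap(&e, 8000, 100));   /* 1 */
        return 0;
    }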
+ ;________________________________________________________________________________ + */ __private_extern__ int journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length) { @@ -3374,7 +3389,7 @@ journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length) } free_old_stuff(jnl); - + error = trim_remove_extent(&tr->trim, offset, length); if (error == 0) { int found = FALSE; @@ -3424,11 +3439,11 @@ journal_trim_flush(journal *jnl, transaction *tr) if (jnl_kdebug) KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0); + lck_rw_lock_shared(&jnl->trim_lock); if (tr->trim.extent_count > 0) { dk_unmap_t unmap; bzero(&unmap, sizeof(unmap)); - lck_rw_lock_shared(&jnl->trim_lock); if (CONFIG_HFS_TRIM && (jnl->flags & JOURNAL_USE_UNMAP)) { unmap.extents = tr->trim.extents; unmap.extentsCount = tr->trim.extent_count; @@ -3439,12 +3454,12 @@ journal_trim_flush(journal *jnl, transaction *tr) KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_END, errno, 0, 0, 0, 0); if (errno) { printf("jnl: error %d from DKIOCUNMAP (extents=%lx, count=%u); disabling trim for %s\n", - errno, (unsigned long) (unmap.extents), unmap.extentsCount, - jnl->jdev_name); + errno, (unsigned long) (unmap.extents), unmap.extentsCount, + jnl->jdev_name); jnl->flags &= ~JOURNAL_USE_UNMAP; } } - + /* * Call back into the file system to tell them that we have * trimmed some extents and that they can now be reused. @@ -3456,9 +3471,8 @@ journal_trim_flush(journal *jnl, transaction *tr) */ if (jnl->trim_callback) jnl->trim_callback(jnl->trim_callback_arg, tr->trim.extent_count, tr->trim.extents); - - lck_rw_unlock_shared(&jnl->trim_lock); } + lck_rw_unlock_shared(&jnl->trim_lock); /* * If the transaction we're flushing was the async transaction, then @@ -3475,6 +3489,11 @@ journal_trim_flush(journal *jnl, transaction *tr) jnl->async_trim = NULL; lck_rw_unlock_exclusive(&jnl->trim_lock); + /* + * By the time we get here, no other thread can discover the address + * of "tr", so it is safe for us to manipulate tr->trim without + * holding any locks. + */ if (tr->trim.extents) { kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t)); tr->trim.allocated_count = 0; @@ -3488,7 +3507,6 @@ journal_trim_flush(journal *jnl, transaction *tr) return errno; } - static int journal_binfo_cmp(const void *a, const void *b) { @@ -3607,7 +3625,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0); goto done; } - + /* * Store a pointer to this transaction's trim list so that * future transactions can find it. @@ -3634,7 +3652,7 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void * of the journal flush, 'saved_sequence_num' remains stable */ jnl->saved_sequence_num = jnl->sequence_num; - + /* * if we're here we're going to flush the transaction buffer to disk. 
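The journal_trim_flush() hunk above hoists the shared acquisition of trim_lock out of the extent_count check, so the lock is taken exactly once per flush and now also covers the empty-list case, with the unlock moved correspondingly late. Presumably this is so the flush always serializes against concurrent extent-list updates (the add/remove paths would take the lock exclusive). Modeled with a pthreads rwlock, all names being stand-ins:

    #include <pthread.h>

    static pthread_rwlock_t trim_lock = PTHREAD_RWLOCK_INITIALIZER;

    struct trim_list { int extent_count; };

    static void issue_unmap(struct trim_list *t)   { (void)t; /* DKIOCUNMAP stand-in */ }
    static void trim_callback(struct trim_list *t) { (void)t; /* FS callback stand-in */ }

    /* One shared hold spans everything that reads the extent list, even
     * when the list turns out to be empty. */
    static void
    flush_trim_list(struct trim_list *t)
    {
        pthread_rwlock_rdlock(&trim_lock);
        if (t->extent_count > 0) {
            issue_unmap(t);
            trim_callback(t);
        }
        pthread_rwlock_unlock(&trim_lock);
    }

    int
    main(void)
    {
        struct trim_list t = { 0 };
        flush_trim_list(&t);   /* lock/unlock happen even with no extents */
        return 0;
    }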
* 'check_free_space' will not return untl there is enough free @@ -3822,15 +3840,7 @@ done: static void finish_end_thread(transaction *tr) { -#if !CONFIG_EMBEDDED proc_apply_thread_selfdiskacc(IOPOL_PASSIVE); -#else /* !CONFIG_EMBEDDED */ - struct uthread *ut; - - ut = get_bsdthread_info(current_thread()); - ut->uu_iopol_disk = IOPOL_PASSIVE; -#endif /* !CONFIG_EMBEDDED */ - finish_end_transaction(tr, NULL, NULL); thread_deallocate(current_thread()); @@ -3840,14 +3850,7 @@ finish_end_thread(transaction *tr) static void write_header_thread(journal *jnl) { -#if !CONFIG_EMBEDDED proc_apply_thread_selfdiskacc(IOPOL_PASSIVE); -#else /* !CONFIG_EMBEDDED */ - struct uthread *ut; - - ut = get_bsdthread_info(current_thread()); - ut->uu_iopol_disk = IOPOL_PASSIVE; -#endif /* !CONFIG_EMBEDDED */ if (write_journal_header(jnl, 1, jnl->saved_sequence_num)) jnl->write_header_failed = TRUE; @@ -4249,7 +4252,7 @@ abort_transaction(journal *jnl, transaction *tr) */ vnode_rele_ext(bp_vp, 0, 1); } else { - printf("jnl: %s: abort_tr: could not find block %Ld vp %p!\n", + printf("jnl: %s: abort_tr: could not find block %lld vp %p!\n", jnl->jdev_name, blhdr->binfo[i].bnum, tbp); if (bp) { buf_brelse(bp); @@ -4276,6 +4279,7 @@ abort_transaction(journal *jnl, transaction *tr) jnl->async_trim = NULL; lck_rw_unlock_exclusive(&jnl->trim_lock); + if (tr->trim.extents) { kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t)); } @@ -4520,7 +4524,8 @@ int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbu { int ret; transaction *tr; - + size_t i = 0; + /* * Sanity check inputs, and adjust the size of the transaction buffer. */ @@ -4565,7 +4570,23 @@ int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbu return ret; } wait_condition(jnl, &jnl->flushing, "end_transaction"); - + + /* + * At this point, we have completely flushed the contents of the current + * journal to disk (and have asynchronously written all of the txns to + * their actual desired locations). As a result, we can (and must) clear + * out the old_start array. If we do not, then if the last written transaction + * started at the beginning of the journal (starting 1 block into the + * journal file) it could confuse the buffer_flushed callback. This is + * because we're about to reset the start/end pointers of the journal header + * below. + */ + lock_oldstart(jnl); + for (i = 0; i < sizeof (jnl->old_start) / sizeof(jnl->old_start[0]); i++) { + jnl->old_start[i] = 0; + } + unlock_oldstart(jnl); + /* Update the journal's offset and size in memory. */ jnl->jdev_offset = offset; jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size; diff --git a/bsd/vfs/vfs_journal.h b/bsd/vfs/vfs_journal.h index 11b24c3ee..7b7f4f319 100644 --- a/bsd/vfs/vfs_journal.h +++ b/bsd/vfs/vfs_journal.h @@ -1,5 +1,6 @@ + /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
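journal_relocate() above now zeroes the entire old_start[] array, under the old-start lock, once every prior transaction is known to be on disk; as the hunk's comment explains, a stale entry could otherwise confuse the buffer_flushed callback after the header's start and end are reset. The shape of that pass as a compilable fragment (pthread mutex standing in for lock_oldstart/unlock_oldstart, slot count illustrative):

    #include <pthread.h>
    #include <stddef.h>
    #include <stdint.h>

    #define OLD_START_SLOTS 16   /* illustrative; sized as in struct journal */

    static pthread_mutex_t oldstart_mtx = PTHREAD_MUTEX_INITIALIZER;
    static uint64_t old_start[OLD_START_SLOTS];

    /* Forget recorded transaction start offsets so a later callback
     * cannot mistake one for a live transaction. */
    static void
    clear_old_start(void)
    {
        pthread_mutex_lock(&oldstart_mtx);
        for (size_t i = 0; i < sizeof(old_start) / sizeof(old_start[0]); i++)
            old_start[i] = 0;
        pthread_mutex_unlock(&oldstart_mtx);
    }

    int main(void) { clear_old_start(); return 0; }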
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -43,6 +44,7 @@ #include #include + typedef struct _blk_info { int32_t bsize; union { @@ -83,20 +85,20 @@ struct jnl_trim_list { typedef void (*jnl_trim_callback_t)(void *arg, uint32_t extent_count, const dk_extent_t *extents); typedef struct transaction { - int tbuffer_size; // in bytes - char *tbuffer; // memory copy of the transaction - block_list_header *blhdr; // points to the first byte of tbuffer - int num_blhdrs; // how many buffers we've allocated - int total_bytes; // total # of bytes in transaction - int num_flushed; // how many bytes have been flushed - int num_killed; // how many bytes were "killed" - off_t journal_start; // where in the journal this transaction starts - off_t journal_end; // where in the journal this transaction ends - struct journal *jnl; // ptr back to the journal structure - struct transaction *next; // list of tr's (either completed or to be free'd) - uint32_t sequence_num; - struct jnl_trim_list trim; - boolean_t delayed_header_write; + int tbuffer_size; // in bytes + char *tbuffer; // memory copy of the transaction + block_list_header *blhdr; // points to the first byte of tbuffer + int num_blhdrs; // how many buffers we've allocated + int total_bytes; // total # of bytes in transaction + int num_flushed; // how many bytes have been flushed + int num_killed; // how many bytes were "killed" + off_t journal_start; // where in the journal this transaction starts + off_t journal_end; // where in the journal this transaction ends + struct journal *jnl; // ptr back to the journal structure + struct transaction *next; // list of tr's (either completed or to be free'd) + uint32_t sequence_num; + struct jnl_trim_list trim; + boolean_t delayed_header_write; } transaction; @@ -136,7 +138,8 @@ typedef struct journal_header { typedef struct journal { lck_mtx_t jlock; // protects the struct journal data lck_mtx_t flock; // serializes flushing of journal - lck_rw_t trim_lock; // protects the async_trim field, below + lck_rw_t trim_lock; // protects the async_trim field, below + struct vnode *jdev; // vnode of the device where the journal lives off_t jdev_offset; // byte offset to the start of the journal @@ -154,7 +157,7 @@ typedef struct journal { boolean_t asyncIO; boolean_t writing_header; boolean_t write_header_failed; - + struct jnl_trim_list *async_trim; // extents to be trimmed by transaction being asynchronously flushed jnl_trim_callback_t trim_callback; void *trim_callback_arg; @@ -163,8 +166,8 @@ typedef struct journal { int32_t header_buf_size; journal_header *jhdr; // points to the first byte of header_buf - uint32_t saved_sequence_num; - uint32_t sequence_num; + uint32_t saved_sequence_num; + uint32_t sequence_num; off_t max_read_size; off_t max_write_size; @@ -192,6 +195,7 @@ typedef struct journal { #define JOURNAL_DO_FUA_WRITES 0x00100000 // do force-unit-access writes #define JOURNAL_USE_UNMAP 0x00200000 // device supports UNMAP (TRIM) + /* journal_open/create options are always in the low-16 bits */ #define JOURNAL_OPTION_FLAGS_MASK 0x0000ffff diff --git a/bsd/vfs/vfs_lookup.c b/bsd/vfs/vfs_lookup.c index 10b885d51..2c225cf19 100644 --- a/bsd/vfs/vfs_lookup.c +++ b/bsd/vfs/vfs_lookup.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -524,12 +524,12 @@ lookup_handle_rsrc_fork(vnode_t dp, struct nameidata *ndp, struct componentname /* The "parent" of the stream is the file. */ if (wantparent) { if (ndp->ni_dvp) { -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (ndp->ni_cnd.cn_flags & FSNODELOCKHELD) { ndp->ni_cnd.cn_flags &= ~FSNODELOCKHELD; unlock_fsnode(ndp->ni_dvp, NULL); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ vnode_put(ndp->ni_dvp); } ndp->ni_dvp = dp; @@ -1020,12 +1020,12 @@ lookup_error: if ((error == ENOENT) && (dp->v_flag & VROOT) && (dp->v_mount != NULL) && (dp->v_mount->mnt_flag & MNT_UNION)) { -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if ((cnp->cn_flags & FSNODELOCKHELD)) { cnp->cn_flags &= ~FSNODELOCKHELD; unlock_fsnode(dp, NULL); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ tdp = dp; dp = tdp->v_mount->mnt_vnodecovered; @@ -1098,12 +1098,12 @@ returned_from_lookup_path: return (0); bad2: -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if ((cnp->cn_flags & FSNODELOCKHELD)) { cnp->cn_flags &= ~FSNODELOCKHELD; unlock_fsnode(ndp->ni_dvp, NULL); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (ndp->ni_dvp) vnode_put(ndp->ni_dvp); @@ -1115,12 +1115,12 @@ bad2: return (error); bad: -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if ((cnp->cn_flags & FSNODELOCKHELD)) { cnp->cn_flags &= ~FSNODELOCKHELD; unlock_fsnode(ndp->ni_dvp, NULL); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (dp) vnode_put(dp); ndp->ni_vp = NULLVP; @@ -1280,12 +1280,12 @@ lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx) vnode_t dp; char *tmppn; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if ((cnp->cn_flags & FSNODELOCKHELD)) { cnp->cn_flags &= ~FSNODELOCKHELD; unlock_fsnode(ndp->ni_dvp, NULL); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { return ELOOP; @@ -1494,14 +1494,14 @@ bad: void namei_unlock_fsnode(struct nameidata *ndp) { -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if ((ndp->ni_cnd.cn_flags & FSNODELOCKHELD)) { ndp->ni_cnd.cn_flags &= ~FSNODELOCKHELD; unlock_fsnode(ndp->ni_dvp, NULL); } #else (void)ndp; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ } /* @@ -1553,7 +1553,7 @@ nameidone(struct nameidata *ndp) * fails because /foo_bar_baz is not found will only log "/foo_bar_baz", with * no '>' padding. But /foo_bar/spam would log "/foo_bar>>>>". 
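The comment above describes how lookup tracing pads a logged path with '>' out to a record boundary, and the kdebug_lookup() hunk that follows switches those events to KERNEL_DEBUG_CONSTANT_IST while keeping the scheme: the name is streamed out a few arguments per trace record, with DBG_FUNC_START on the first record and DBG_FUNC_END on the last. A userland model of the packing (record size and names are invented; the real code packs into kdebug's pointer-sized arguments):

    #include <stdio.h>
    #include <string.h>

    #define REC_BYTES 16   /* payload bytes per trace record (model only) */

    /* Stream a pathname into fixed-size trace records: round up to a
     * record boundary, pad with '>', and flag the first record START and
     * the last END (a single record carries both flags). */
    static void
    emit_path_records(const char *path)
    {
        char buf[256];
        size_t len = strlen(path);

        if (len >= sizeof(buf))
            len = sizeof(buf) - 1;
        memset(buf, '>', sizeof(buf));
        memcpy(buf, path, len);

        size_t padded = ((len + REC_BYTES - 1) / REC_BYTES) * REC_BYTES;
        for (size_t off = 0; off < padded; off += REC_BYTES)
            printf("[%s%s] %.*s\n",
                   off == 0 ? "START" : "",
                   off + REC_BYTES >= padded ? "END" : "",
                   REC_BYTES, buf + off);
    }

    int
    main(void)
    {
        emit_path_records("/foo_bar/spam");   /* one record: "/foo_bar/spam>>>" */
        emit_path_records("/foo_bar/some/longer/path");
        return 0;
    }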
*/ -#if !defined(NO_KDEBUG) +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) static void kdebug_lookup(struct vnode *dp, struct componentname *cnp) { @@ -1590,7 +1590,7 @@ kdebug_lookup(struct vnode *dp, struct componentname *cnp) if (dbg_namelen <= 12) code |= DBG_FUNC_END; - KERNEL_DEBUG_CONSTANT(code, dp, dbg_parms[0], dbg_parms[1], dbg_parms[2], 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, code, dp, dbg_parms[0], dbg_parms[1], dbg_parms[2], 0); code &= ~DBG_FUNC_START; @@ -1598,15 +1598,15 @@ kdebug_lookup(struct vnode *dp, struct componentname *cnp) if (dbg_namelen <= 16) code |= DBG_FUNC_END; - KERNEL_DEBUG_CONSTANT(code, dbg_parms[i], dbg_parms[i+1], dbg_parms[i+2], dbg_parms[i+3], 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, code, dbg_parms[i], dbg_parms[i+1], dbg_parms[i+2], dbg_parms[i+3], 0); } } -#else /* NO_KDEBUG */ +#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ static void kdebug_lookup(struct vnode *dp __unused, struct componentname *cnp __unused) { } -#endif /* NO_KDEBUG */ +#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ int vfs_getbyid(fsid_t *fsid, ino64_t ino, vnode_t *vpp, vfs_context_t ctx) diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index 43352545d..d287837a3 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -105,7 +105,9 @@ #include #include #include +#include #include +#include #include #include @@ -113,6 +115,9 @@ #include +#include +#include +#include #include @@ -183,9 +188,11 @@ __private_extern__ int unlink1(vfs_context_t, struct nameidata *, int); extern int system_inshutdown; static void vnode_list_add(vnode_t); +static void vnode_async_list_add(vnode_t); static void vnode_list_remove(vnode_t); static void vnode_list_remove_locked(vnode_t); +static void vnode_abort_advlocks(vnode_t); static errno_t vnode_drain(vnode_t); static void vgone(vnode_t, int flags); static void vclean(vnode_t vp, int flag); @@ -223,6 +230,8 @@ static void vnode_resolver_detach(vnode_t); TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ TAILQ_HEAD(deadlst, vnode) vnode_dead_list; /* vnode dead list */ +TAILQ_HEAD(async_work_lst, vnode) vnode_async_work_list; + TAILQ_HEAD(ragelst, vnode) vnode_rage_list; /* vnode rapid age list */ struct timeval rage_tv; @@ -262,7 +271,6 @@ static int nummounts = 0; } while(0) - /* remove a vnode from dead vnode list */ #define VREMDEAD(fun, vp) \ do { \ @@ -274,6 +282,17 @@ static int nummounts = 0; } while(0) +/* remove a vnode from async work vnode list */ +#define VREMASYNC_WORK(fun, vp) \ + do { \ + VLISTCHECK((fun), (vp), "async_work"); \ + TAILQ_REMOVE(&vnode_async_work_list, (vp), v_freelist); \ + VLISTNONE((vp)); \ + vp->v_listflag &= ~VLIST_ASYNC_WORK; \ + async_work_vnodes--; \ + } while(0) + + /* remove a vnode from rage vnode list */ #define VREMRAGE(fun, vp) \ do { \ @@ -304,15 +323,21 @@ u_int32_t vnodetarget; /* target for vnreclaim() */ */ #define VNODE_FREE_MIN CONFIG_VNODE_FREE_MIN /* freelist should have at least this many */ + +static void async_work_continue(void); + /* * Initialize the vnode management data structures. 
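vfs_subr.c above grows a third vnode list, vnode_async_work_list, plus the VREMASYNC_WORK bookkeeping, and vntblinit() just below starts a worker thread for it: vnodes whose final release would touch unreliable media (virtual devices or non-local mounts, per vnode_on_reliable_media()) are queued and handed to that thread instead of being reclaimed inline. The producer/consumer skeleton modeled in userland, with a condition variable in place of assert_wait/thread_block/wakeup and process_one() standing in for process_vp():

    #include <pthread.h>
    #include <sys/queue.h>
    #include <unistd.h>

    struct work { TAILQ_ENTRY(work) link; };

    static TAILQ_HEAD(, work) async_work_list = TAILQ_HEAD_INITIALIZER(async_work_list);
    static pthread_mutex_t list_mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  list_cv  = PTHREAD_COND_INITIALIZER;

    static void process_one(struct work *w) { (void)w; }

    /* Producer: queue the item and kick the worker, the analogue of
     * wakeup(&vnode_async_work_list). */
    static void
    async_list_add(struct work *w)
    {
        pthread_mutex_lock(&list_mtx);
        TAILQ_INSERT_HEAD(&async_work_list, w, link);
        pthread_mutex_unlock(&list_mtx);
        pthread_cond_signal(&list_cv);
    }

    /* Consumer: sleep until there is work, then drain one entry at a
     * time, dropping the list lock around the actual processing. */
    static void *
    async_work_thread(void *arg)
    {
        (void)arg;
        for (;;) {
            pthread_mutex_lock(&list_mtx);
            while (TAILQ_EMPTY(&async_work_list))
                pthread_cond_wait(&list_cv, &list_mtx);
            struct work *w = TAILQ_FIRST(&async_work_list);
            TAILQ_REMOVE(&async_work_list, w, link);
            pthread_mutex_unlock(&list_mtx);
            process_one(w);
        }
        return NULL;
    }

    int
    main(void)
    {
        pthread_t t;
        struct work w;
        pthread_create(&t, NULL, async_work_thread, NULL);
        async_list_add(&w);
        usleep(10000);           /* give the worker a moment, then exit */
        return 0;
    }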
*/ __private_extern__ void vntblinit(void) { + thread_t thread = THREAD_NULL; + TAILQ_INIT(&vnode_free_list); TAILQ_INIT(&vnode_rage_list); TAILQ_INIT(&vnode_dead_list); + TAILQ_INIT(&vnode_async_work_list); TAILQ_INIT(&mountlist); if (!vnodetarget) @@ -329,6 +354,12 @@ vntblinit(void) * we want to cache */ (void) adjust_vm_object_cache(0, desiredvnodes - VNODE_FREE_MIN); + + /* + * create worker threads + */ + kernel_thread_start((thread_continue_t)async_work_continue, NULL, &thread); + thread_deallocate(thread); } /* Reset the VM Object Cache with the values passed in */ @@ -1201,9 +1232,14 @@ vfs_getnewfsid(struct mount *mp) * Routines having to do with the management of the vnode table. */ extern int (**dead_vnodeop_p)(void *); -long numvnodes, freevnodes, deadvnodes; +long numvnodes, freevnodes, deadvnodes, async_work_vnodes; +int async_work_timed_out = 0; +int async_work_handled = 0; +int dead_vnode_wanted = 0; +int dead_vnode_waited = 0; + /* * Move a vnode from one mount queue to another. */ @@ -1555,6 +1591,34 @@ out: } +static boolean_t +vnode_on_reliable_media(vnode_t vp) +{ + if ( !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) && (vp->v_mount->mnt_flag & MNT_LOCAL) ) + return (TRUE); + return (FALSE); +} + +static void +vnode_async_list_add(vnode_t vp) +{ + vnode_list_lock(); + + if (VONLIST(vp) || (vp->v_lflag & (VL_TERMINATE|VL_DEAD))) + panic("vnode_async_list_add: %p is in wrong state", vp); + + TAILQ_INSERT_HEAD(&vnode_async_work_list, vp, v_freelist); + vp->v_listflag |= VLIST_ASYNC_WORK; + + async_work_vnodes++; + + vnode_list_unlock(); + + wakeup(&vnode_async_work_list); + +} + + /* * put the vnode on appropriate free list. * called with vnode LOCKED @@ -1562,6 +1626,8 @@ out: static void vnode_list_add(vnode_t vp) { + boolean_t need_dead_wakeup = FALSE; + #if DIAGNOSTIC lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); #endif @@ -1603,7 +1669,13 @@ vnode_list_add(vnode_t vp) TAILQ_INSERT_HEAD(&vnode_dead_list, vp, v_freelist); vp->v_listflag |= VLIST_DEAD; deadvnodes++; - } else if ((vp->v_flag & VAGE)) { + + if (dead_vnode_wanted) { + dead_vnode_wanted--; + need_dead_wakeup = TRUE; + } + + } else if ( (vp->v_flag & VAGE) ) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); vp->v_flag &= ~VAGE; freevnodes++; @@ -1613,6 +1685,9 @@ vnode_list_add(vnode_t vp) } } vnode_list_unlock(); + + if (need_dead_wakeup == TRUE) + wakeup_one((caddr_t)&dead_vnode_wanted); } @@ -1633,6 +1708,8 @@ vnode_list_remove_locked(vnode_t vp) VREMRAGE("vnode_list_remove", vp); else if (vp->v_listflag & VLIST_DEAD) VREMDEAD("vnode_list_remove", vp); + else if (vp->v_listflag & VLIST_ASYNC_WORK) + VREMASYNC_WORK("vnode_list_remove", vp); else VREMFREE("vnode_list_remove", vp); } @@ -1744,9 +1821,15 @@ vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked) * if it's been marked for termination */ if (dont_reenter) { - if ( !(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM)) ) + if ( !(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM)) ) { vp->v_lflag |= VL_NEEDINACTIVE; - vp->v_flag |= VAGE; + + if (vnode_on_reliable_media(vp) == FALSE) { + vnode_async_list_add(vp); + goto done; + } + } + vp->v_flag |= VAGE; } vnode_list_add(vp); @@ -1947,6 +2030,7 @@ loop: #ifdef JOE_DEBUG record_vp(vp, 1); #endif + vnode_abort_advlocks(vp); vnode_reclaim_internal(vp, 1, 1, 0); vnode_dropiocount(vp); vnode_list_add(vp); @@ -2641,6 +2725,10 @@ vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, int error; struct vfsconf vfsc; + if (namelen > CTL_MAXNAME) { + return 
(EINVAL); + } + /* All non VFS_GENERIC and in VFS_GENERIC, * VFS_MAXTYPENUM, VFS_CONF, VFS_SET_PACKAGE_EXTS * needs to have root priv to have modifiers. @@ -2729,6 +2817,7 @@ vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, * We need to get back into the general MIB, so we need to re-prepend * CTL_VFS to our name and try userland_sysctl(). */ + usernamelen = namelen + 1; MALLOC(username, int *, usernamelen * sizeof(*username), M_TEMP, M_WAITOK); @@ -3039,8 +3128,10 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp) if (features & DK_FEATURE_FORCE_UNIT_ACCESS) mp->mnt_ioflags |= MNT_IOFLAGS_FUA_SUPPORTED; + if (features & DK_FEATURE_UNMAP) - mp->mnt_ioflags |= MNT_IOFLAGS_UNMAP_SUPPORTED; + mp->mnt_ioflags |= MNT_IOFLAGS_UNMAP_SUPPORTED; + return (error); } @@ -3058,8 +3149,20 @@ vfs_event_init(void) } void -vfs_event_signal(__unused fsid_t *fsid, u_int32_t event, __unused intptr_t data) -{ +vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data) +{ + if (event == VQ_DEAD || event == VQ_NOTRESP) { + struct mount *mp = vfs_getvfs(fsid); + if (mp) { + mount_lock_spin(mp); + if (data) + mp->mnt_kern_flag &= ~MNT_LNOTRESP; // Now responding + else + mp->mnt_kern_flag |= MNT_LNOTRESP; // Not responding + mount_unlock(mp); + } + } + lck_mtx_lock(fs_klist_lock); KNOTE(&fs_klist, event); lck_mtx_unlock(fs_klist_lock); @@ -3480,28 +3583,162 @@ SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW | CTLFLAG_ANYBODY, long num_reusedvnodes = 0; + +static vnode_t +process_vp(vnode_t vp, int want_vp, int *deferred) +{ + unsigned int vpid; + + *deferred = 0; + + vpid = vp->v_id; + + vnode_list_remove_locked(vp); + + vnode_list_unlock(); + + vnode_lock_spin(vp); + + /* + * We could wait for the vnode_lock after removing the vp from the freelist + * and the vid is bumped only at the very end of reclaim. So it is possible + * that we are looking at a vnode that is being terminated. If so skip it. + */ + if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || + VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) { + /* + * we lost the race between dropping the list lock + * and picking up the vnode_lock... someone else + * used this vnode and it is now in a new state + */ + vnode_unlock(vp); + + return (NULLVP); + } + if ( (vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE ) { + /* + * we did a vnode_rele_ext that asked for + * us not to reenter the filesystem during + * the release even though VL_NEEDINACTIVE was + * set... we'll do it here by doing a + * vnode_get/vnode_put + * + * pick up an iocount so that we can call + * vnode_put and drive the VNOP_INACTIVE... + * vnode_put will either leave us off + * the freelist if a new ref comes in, + * or put us back on the end of the freelist + * or recycle us if we were marked for termination... 
+ * so we'll just go grab a new candidate + */ + vp->v_iocount++; +#ifdef JOE_DEBUG + record_vp(vp, 1); +#endif + vnode_put_locked(vp); + vnode_unlock(vp); + + return (NULLVP); + } + /* + * Checks for anyone racing us for recycle + */ + if (vp->v_type != VBAD) { + if (want_vp && vnode_on_reliable_media(vp) == FALSE) { + vnode_async_list_add(vp); + vnode_unlock(vp); + + *deferred = 1; + + return (NULLVP); + } + if (vp->v_lflag & VL_DEAD) + panic("new_vnode(%p): the vnode is VL_DEAD but not VBAD", vp); + + vnode_lock_convert(vp); + (void)vnode_reclaim_internal(vp, 1, want_vp, 0); + + if (want_vp) { + if ((VONLIST(vp))) + panic("new_vnode(%p): vp on list", vp); + if (vp->v_usecount || vp->v_iocount || vp->v_kusecount || + (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH))) + panic("new_vnode(%p): free vnode still referenced", vp); + if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0)) + panic("new_vnode(%p): vnode seems to be on mount list", vp); + if ( !LIST_EMPTY(&vp->v_nclinks) || !LIST_EMPTY(&vp->v_ncchildren)) + panic("new_vnode(%p): vnode still hooked into the name cache", vp); + } else { + vnode_unlock(vp); + vp = NULLVP; + } + } + return (vp); +} + + + +static void +async_work_continue(void) +{ + struct async_work_lst *q; + int deferred; + vnode_t vp; + + q = &vnode_async_work_list; + + for (;;) { + + vnode_list_lock(); + + if ( TAILQ_EMPTY(q) ) { + assert_wait(q, (THREAD_UNINT)); + + vnode_list_unlock(); + + thread_block((thread_continue_t)async_work_continue); + + continue; + } + async_work_handled++; + + vp = TAILQ_FIRST(q); + + vp = process_vp(vp, 0, &deferred); + + if (vp != NULLVP) + panic("found VBAD vp (%p) on async queue", vp); + } +} + + static int new_vnode(vnode_t *vpp) { vnode_t vp; - int retries = 0; /* retry incase of tablefull */ + uint32_t retries = 0, max_retries = 100; /* retry incase of tablefull */ int force_alloc = 0, walk_count = 0; - unsigned int vpid; - struct timespec ts; + boolean_t need_reliable_vp = FALSE; + int deferred; + struct timeval initial_tv; struct timeval current_tv; -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL struct unsafe_fsnode *l_unsafefs = 0; -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ proc_t curproc = current_proc(); + initial_tv.tv_sec = 0; retry: - microuptime(¤t_tv); - vp = NULLVP; vnode_list_lock(); + if (need_reliable_vp == TRUE) + async_work_timed_out++; + if ((numvnodes - deadvnodes) < desiredvnodes || force_alloc) { + struct timespec ts; + if ( !TAILQ_EMPTY(&vnode_dead_list)) { /* * Can always reuse a dead one @@ -3534,6 +3771,7 @@ retry: vp->v_iocount = 1; goto done; } + microuptime(¤t_tv); #define MAX_WALK_COUNT 1000 @@ -3542,10 +3780,10 @@ retry: (current_tv.tv_sec - rage_tv.tv_sec) >= RAGE_TIME_LIMIT)) { TAILQ_FOREACH(vp, &vnode_rage_list, v_freelist) { - if ( !(vp->v_listflag & VLIST_RAGE)) - panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp); + if ( !(vp->v_listflag & VLIST_RAGE)) + panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp); - // if we're a dependency-capable process, skip vnodes that can + // if we're a dependency-capable process, skip vnodes that can // cause recycling deadlocks. (i.e. this process is diskimages // helper and the vnode is in a disk image). Querying the // mnt_kern_flag for the mount's virtual device status @@ -3553,19 +3791,27 @@ retry: // may not be updated if there are multiple devnode layers // in between the disk image and the final consumer. 
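process_vp() above captures v_id while the free-list lock is still held, drops that lock, takes the vnode lock, and only then re-checks the id together with the use and iocounts: the vnode can be reused in the window between the two locks, and any change means this thread lost the race and must pick another candidate. The pattern in miniature (userland; a generation id plays the role of v_id):

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    struct obj {
        pthread_mutex_t lock;
        uint32_t id;         /* bumped whenever the object is recycled */
        int usecount;
    };

    /* 'id_seen' was sampled under the list lock, before it was dropped. */
    static struct obj *
    claim(struct obj *o, uint32_t id_seen)
    {
        pthread_mutex_lock(&o->lock);
        if (o->id != id_seen || o->usecount != 0) {
            /* lost the race: someone grabbed or recycled it in between */
            pthread_mutex_unlock(&o->lock);
            return NULL;
        }
        return o;            /* returned locked, safe to reclaim */
    }

    int
    main(void)
    {
        struct obj o = { PTHREAD_MUTEX_INITIALIZER, 7, 0 };
        printf("%s\n", claim(&o, 7) ? "claimed" : "retry");  /* claimed */
        pthread_mutex_unlock(&o.lock);
        printf("%s\n", claim(&o, 6) ? "claimed" : "retry");  /* retry */
        return 0;
    }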
- if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || - (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) { - break; - } + if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || + (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) { + /* + * if need_reliable_vp == TRUE, then we've already sent one or more + * non-reliable vnodes to the async thread for processing and timed + * out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT + * mechanism to first scan for a reliable vnode before forcing + * a new vnode to be created + */ + if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) + break; + } - // don't iterate more than MAX_WALK_COUNT vnodes to - // avoid keeping the vnode list lock held for too long. - if (walk_count++ > MAX_WALK_COUNT) { + // don't iterate more than MAX_WALK_COUNT vnodes to + // avoid keeping the vnode list lock held for too long. + + if (walk_count++ > MAX_WALK_COUNT) { vp = NULL; - break; - } + break; + } } - } if (vp == NULL && !TAILQ_EMPTY(&vnode_free_list)) { @@ -3583,19 +3829,27 @@ retry: // may not be updated if there are multiple devnode layers // in between the disk image and the final consumer. - if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || - (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) { - break; - } + if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || + (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) { + /* + * if need_reliable_vp == TRUE, then we've already sent one or more + * non-reliable vnodes to the async thread for processing and timed + * out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT + * mechanism to first scan for a reliable vnode before forcing + * a new vnode to be created + */ + if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) + break; + } - // don't iterate more than MAX_WALK_COUNT vnodes to - // avoid keeping the vnode list lock held for too long. - if (walk_count++ > MAX_WALK_COUNT) { - vp = NULL; - break; - } - } + // don't iterate more than MAX_WALK_COUNT vnodes to + // avoid keeping the vnode list lock held for too long. + if (walk_count++ > MAX_WALK_COUNT) { + vp = NULL; + break; + } + } } // @@ -3608,9 +3862,9 @@ retry: // the allocation. // if (vp == NULL && walk_count >= MAX_WALK_COUNT) { - force_alloc = 1; - vnode_list_unlock(); - goto retry; + force_alloc = 1; + vnode_list_unlock(); + goto retry; } if (vp == NULL) { @@ -3618,9 +3872,9 @@ retry: * we've reached the system imposed maximum number of vnodes * but there isn't a single one available * wait a bit and then retry... if we can't get a vnode - * after 100 retries, than log a complaint + * after our target number of retries, than log a complaint */ - if (++retries <= 100) { + if (++retries <= max_retries) { vnode_list_unlock(); delay_for_interval(1, 1000 * 1000); goto retry; @@ -3631,12 +3885,12 @@ retry: log(LOG_EMERG, "%d desired, %d numvnodes, " "%d free, %d dead, %d rage\n", desiredvnodes, numvnodes, freevnodes, deadvnodes, ragevnodes); -#if CONFIG_EMBEDDED +#if CONFIG_JETSAM /* * Running out of vnodes tends to make a system unusable. Start killing * processes that jetsam knows are killable. */ - if (jetsam_kill_top_proc(TRUE, kJetsamFlagsKilledVnodes) < 0) { + if (memorystatus_kill_top_proc(TRUE, kMemorystatusFlagsKilledVnodes) < 0) { /* * If jetsam can't find any more processes to kill and there * still aren't any free vnodes, panic. 
Hopefully we'll get a @@ -3645,7 +3899,13 @@ retry: panic("vnode table is full\n"); } - delay_for_interval(1, 1000 * 1000); + /* + * Now that we've killed someone, wait a bit and continue looking + * (with fewer retries before trying another kill). + */ + delay_for_interval(3, 1000 * 1000); + retries = 0; + max_retries = 10; goto retry; #endif @@ -3653,80 +3913,66 @@ retry: return (ENFILE); } steal_this_vp: - vpid = vp->v_id; + if ((vp = process_vp(vp, 1, &deferred)) == NULLVP) { + if (deferred) { + int elapsed_msecs; + struct timeval elapsed_tv; - vnode_list_remove_locked(vp); + if (initial_tv.tv_sec == 0) + microuptime(&initial_tv); - vnode_list_unlock(); + vnode_list_lock(); - vnode_lock_spin(vp); + dead_vnode_waited++; + dead_vnode_wanted++; - /* - * We could wait for the vnode_lock after removing the vp from the freelist - * and the vid is bumped only at the very end of reclaim. So it is possible - * that we are looking at a vnode that is being terminated. If so skip it. - */ - if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || - VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) { - /* - * we lost the race between dropping the list lock - * and picking up the vnode_lock... someone else - * used this vnode and it is now in a new state - * so we need to go back and try again - */ - vnode_unlock(vp); - goto retry; - } - if ( (vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE ) { - /* - * we did a vnode_rele_ext that asked for - * us not to reenter the filesystem during - * the release even though VL_NEEDINACTIVE was - * set... we'll do it here by doing a - * vnode_get/vnode_put - * - * pick up an iocount so that we can call - * vnode_put and drive the VNOP_INACTIVE... - * vnode_put will either leave us off - * the freelist if a new ref comes in, - * or put us back on the end of the freelist - * or recycle us if we were marked for termination... - * so we'll just go grab a new candidate - */ - vp->v_iocount++; -#ifdef JOE_DEBUG - record_vp(vp, 1); -#endif - vnode_put_locked(vp); - vnode_unlock(vp); + /* + * note that we're only going to explicitly wait 10ms + * for a dead vnode to become available, since even if one + * isn't available, a reliable vnode might now be available + * at the head of the VRAGE or free lists... if so, we + * can satisfy the new_vnode request with less latency then waiting + * for the full 100ms duration we're ultimately willing to tolerate + */ + assert_wait_timeout((caddr_t)&dead_vnode_wanted, (THREAD_INTERRUPTIBLE), 10000, NSEC_PER_USEC); + + vnode_list_unlock(); + + thread_block(THREAD_CONTINUE_NULL); + + microuptime(&elapsed_tv); + + timevalsub(&elapsed_tv, &initial_tv); + elapsed_msecs = elapsed_tv.tv_sec * 1000 + elapsed_tv.tv_usec / 1000; + + if (elapsed_msecs >= 100) { + /* + * we've waited long enough... 100ms is + * somewhat arbitrary for this case, but the + * normal worst case latency used for UI + * interaction is 100ms, so I've chosen to + * go with that. 
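The new_vnode() rework above replaces the old inline steal with process_vp() and, when a candidate was deferred to the async thread, waits for a dead vnode in 10ms slices; once roughly 100ms of total latency has accrued it sets need_reliable_vp, so the next scan insists on a reliable vnode or forces an allocation rather than waiting longer. That policy condensed into one userland loop (the kernel instead re-enters its retry: label after each block):

    #include <stdbool.h>
    #include <stdio.h>

    #define WAIT_SLICE_MS   10
    #define WAIT_BUDGET_MS  100

    /* try_get() stands in for "a dead vnode showed up"; simulated here
     * to succeed only after more waiting than the budget allows. */
    static bool try_get(void) { static int calls; return ++calls > 12; }
    static void sleep_ms(int ms) { (void)ms; /* assert_wait_timeout analogue */ }

    static bool
    get_with_escalation(bool *need_reliable)
    {
        int waited_ms = 0;

        while (!try_get()) {
            sleep_ms(WAIT_SLICE_MS);
            waited_ms += WAIT_SLICE_MS;
            if (waited_ms >= WAIT_BUDGET_MS) {
                /* waited long enough: stop hoping for a dead vnode and
                 * tell the caller to take the synchronous path instead */
                *need_reliable = true;
                return false;
            }
        }
        return true;
    }

    int
    main(void)
    {
        bool need_reliable = false;
        bool got = get_with_escalation(&need_reliable);
        printf("got=%d escalate=%d\n", got, need_reliable);  /* got=0 escalate=1 */
        return 0;
    }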
+ * + * setting need_reliable_vp to TRUE + * forces us to find a reliable vnode + * that we can process synchronously, or + * to create a new one if the scan for + * a reliable one hits the scan limit + */ + need_reliable_vp = TRUE; + } + } goto retry; } OSAddAtomicLong(1, &num_reusedvnodes); - /* Checks for anyone racing us for recycle */ - if (vp->v_type != VBAD) { - if (vp->v_lflag & VL_DEAD) - panic("new_vnode(%p): the vnode is VL_DEAD but not VBAD", vp); - vnode_lock_convert(vp); - (void)vnode_reclaim_internal(vp, 1, 1, 0); - if ((VONLIST(vp))) - panic("new_vnode(%p): vp on list", vp); - if (vp->v_usecount || vp->v_iocount || vp->v_kusecount || - (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH))) - panic("new_vnode(%p): free vnode still referenced", vp); - if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0)) - panic("new_vnode(%p): vnode seems to be on mount list", vp); - if ( !LIST_EMPTY(&vp->v_nclinks) || !LIST_EMPTY(&vp->v_ncchildren)) - panic("new_vnode(%p): vnode still hooked into the name cache", vp); - } - -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (vp->v_unsafefs) { l_unsafefs = vp->v_unsafefs; vp->v_unsafefs = (struct unsafe_fsnode *)NULL; } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ #if CONFIG_MACF /* @@ -3757,12 +4003,12 @@ steal_this_vp: vnode_unlock(vp); -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if (l_unsafefs) { lck_mtx_destroy(&l_unsafefs->fsnodelock, vnode_lck_grp); FREE_ZONE((void *)l_unsafefs, sizeof(struct unsafe_fsnode), M_UNSAFEFS); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ done: *vpp = vp; @@ -3988,6 +4234,18 @@ vnode_suspend(vnode_t vp) return(0); } +/* + * Release any blocked locking requests on the vnode. + * Used for forced-unmounts. + * + * XXX What about network filesystems? + */ +static void +vnode_abort_advlocks(vnode_t vp) +{ + if (vp->v_flag & VLOCKLOCAL) + lf_abort_advlocks(vp); +} static errno_t @@ -4345,6 +4603,14 @@ vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp) insert = 0; vnode_unlock(vp); } + + if (VCHR == vp->v_type) { + u_int maj = major(vp->v_rdev); + + if (maj < (u_int)nchrdev && + (D_TYPEMASK & cdevsw[maj].d_type) == D_TTY) + vp->v_flag |= VISTTY; + } } if (vp->v_type == VFIFO) { @@ -4378,7 +4644,7 @@ vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp) */ insmntque(vp, param->vnfs_mp); } -#ifndef __LP64__ +#if CONFIG_VFS_FUNNEL if ((param->vnfs_mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE) == 0) { MALLOC_ZONE(vp->v_unsafefs, struct unsafe_fsnode *, sizeof(struct unsafe_fsnode), M_UNSAFEFS, M_WAITOK); @@ -4386,7 +4652,7 @@ vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp) vp->v_unsafefs->fsnodeowner = (void *)NULL; lck_mtx_init(&vp->v_unsafefs->fsnodelock, vnode_lck_grp, vnode_lck_attr); } -#endif /* __LP64__ */ +#endif /* CONFIG_VFS_FUNNEL */ } if (dvp && vnode_ref(dvp) == 0) { vp->v_parent = dvp; @@ -7747,7 +8013,7 @@ errno_t rmdir_remove_orphaned_appleDouble(vnode_t vp , vfs_context_t ctx, int * UIO_SYSSPACE, CAST_USER_ADDR_T(dp->d_name), ctx); nd_temp.ni_dvp = vp; - error = unlink1(ctx, &nd_temp, 0); + error = unlink1(ctx, &nd_temp, VNODE_REMOVE_SKIP_NAMESPACE_EVENT); if (error && error != ENOENT) { goto outsc; diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c index 3d9b4591b..652a8e34a 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995-2010 Apple Inc. All rights reserved. + * Copyright (c) 1995-2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -182,33 +182,6 @@ int open1(vfs_context_t, struct nameidata *, int, struct vnode_attr *, int32_t * __private_extern__ int unlink1(vfs_context_t, struct nameidata *, int); - -#ifdef __APPLE_API_OBSOLETE -struct fstatv_args { - int fd; /* file descriptor of the target file */ - struct vstat *vsb; /* vstat structure for returned info */ -}; -struct lstatv_args { - const char *path; /* pathname of the target file */ - struct vstat *vsb; /* vstat structure for returned info */ -}; -struct mkcomplex_args { - const char *path; /* pathname of the file to be created */ - mode_t mode; /* access mode for the newly created file */ - u_int32_t type; /* format of the complex file */ -}; -struct statv_args { - const char *path; /* pathname of the target file */ - struct vstat *vsb; /* vstat structure for returned info */ -}; - -int fstatv(proc_t p, struct fstatv_args *uap, int32_t *retval); -int lstatv(proc_t p, struct lstatv_args *uap, int32_t *retval); -int mkcomplex(proc_t p, struct mkcomplex_args *uap, int32_t *retval); -int statv(proc_t p, struct statv_args *uap, int32_t *retval); - -#endif /* __APPLE_API_OBSOLETE */ - /* * incremented each time a mount or unmount operation occurs * used to invalidate the cached value of the rootvp in the @@ -500,6 +473,16 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, goto out1; } + /* + * If content protection is enabled, update mounts are not + * allowed to turn it off. + */ + if ((mp->mnt_flag & MNT_CPROTECT) && + ((flags & MNT_CPROTECT) == 0)) { + error = EINVAL; + goto out1; + } + #ifdef CONFIG_IMGSRC_ACCESS /* Can't downgrade the backer of the root FS */ if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) && @@ -534,6 +517,8 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, } flag = mp->mnt_flag; + + mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); vfsp = mp->mnt_vtable; @@ -1728,6 +1713,16 @@ safedounmount(struct mount *mp, int flags, vfs_context_t ctx) int error; proc_t p = vfs_context_proc(ctx); + /* + * If the file system is not responding and MNT_NOBLOCK + * is set and not a forced unmount then return EBUSY. + */ + if ((mp->mnt_kern_flag & MNT_LNOTRESP) && + (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) { + error = EBUSY; + goto out; + } + /* * Skip authorization if the mount is tagged as permissive and * this is not a forced-unmount attempt. 
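safedounmount() above gains a fast-fail: if the file system has been marked not responding (the MNT_LNOTRESP state maintained by the vfs_event_signal() change earlier in this patch) and the caller asked for a non-blocking, non-forced unmount, it returns EBUSY instead of wedging behind a dead server. The gate in isolation, with illustrative flag values rather than the kernel's:

    #include <errno.h>
    #include <stdio.h>

    #define MNT_FORCE          0x1   /* illustrative values */
    #define MNT_NOBLOCK        0x2
    #define FS_NOT_RESPONDING  0x4

    static int
    unmount_allowed(int fs_state, int flags)
    {
        if ((fs_state & FS_NOT_RESPONDING) &&
            (flags & MNT_NOBLOCK) && !(flags & MNT_FORCE))
            return EBUSY;    /* fail fast rather than hang */
        return 0;
    }

    int
    main(void)
    {
        printf("%d\n", unmount_allowed(FS_NOT_RESPONDING, MNT_NOBLOCK));             /* EBUSY */
        printf("%d\n", unmount_allowed(FS_NOT_RESPONDING, MNT_NOBLOCK | MNT_FORCE)); /* 0 */
        return 0;
    }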
@@ -2370,7 +2365,7 @@ fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t mp = vp->v_mount; if (!mp) { - error = EBADF;; + error = EBADF; goto out; } sp = &mp->mnt_vfsstat; @@ -3052,6 +3047,14 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags, struct vnode_attr *v fp->f_fglob->fg_ops = &vnops; fp->f_fglob->fg_data = (caddr_t)vp; +#if CONFIG_PROTECT + if (VATTR_IS_ACTIVE (vap, va_dataprotect_flags)) { + if (vap->va_dataprotect_flags & VA_DP_RAWENCRYPTED) { + fp->f_fglob->fg_flag |= FENCRYPTED; + } + } +#endif + if (flags & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; @@ -3209,6 +3212,58 @@ open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval) return ciferror; } +/* + * Go through the data-protected atomically controlled open (2) + * + * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode) + */ +int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval) { + int flags = uap->flags; + int class = uap->class; + int dpflags = uap->dpflags; + + /* + * Follow the same path as normal open(2) + * Look up the item if it exists, and acquire the vnode. + */ + struct filedesc *fdp = p->p_fd; + struct vnode_attr va; + struct nameidata nd; + int cmode; + int error; + + VATTR_INIT(&va); + /* Mask off all but regular access permissions */ + cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT; + VATTR_SET(&va, va_mode, cmode & ACCESSPERMS); + + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, + uap->path, vfs_context_current()); + + /* + * Initialize the extra fields in vnode_attr to pass down our + * extra fields. + * 1. target cprotect class. + * 2. set a flag to mark it as requiring open-raw-encrypted semantics. 
+ */ + if (flags & O_CREAT) { + VATTR_SET(&va, va_dataprotect_class, class); + } + + if (dpflags & O_DP_GETRAWENCRYPTED) { + if ( flags & (O_RDWR | O_WRONLY)) { + /* Not allowed to write raw encrypted bytes */ + return EINVAL; + } + VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED); + } + + error = open1(vfs_context_current(), &nd, uap->flags, &va, retval); + + return error; +} + + int open(proc_t p, struct open_args *uap, int32_t *retval) { @@ -3889,7 +3944,7 @@ undelete(__unused proc_t p, struct undelete_args *uap, __unused int32_t *retval) */ /* ARGSUSED */ int -unlink1(vfs_context_t ctx, struct nameidata *ndp, int nodelbusy) +unlink1(vfs_context_t ctx, struct nameidata *ndp, int unlink_flags) { vnode_t vp, dvp; int error; @@ -3926,9 +3981,15 @@ lookup_continue: /* With Carbon delete semantics, busy files cannot be deleted */ - if (nodelbusy) { + if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) { flags |= VNODE_REMOVE_NODELETEBUSY; } + + /* If we're told to, then skip any potential future upcalls */ + if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) { + flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT; + } + if (vp) { batched = vnode_compound_remove_available(vp); @@ -4100,7 +4161,7 @@ delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval) NDINIT(&nd, DELETE, OP_UNLINK, AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); - return unlink1(ctx, &nd, 1); + return unlink1(ctx, &nd, VNODE_REMOVE_NODELETEBUSY); } /* @@ -5089,8 +5150,8 @@ chmod2(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap) #endif #if CONFIG_MACF - error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode); - if (error) + if (VATTR_IS_ACTIVE(vap, va_mode) && + (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) return (error); #endif @@ -5887,7 +5948,7 @@ rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval) { vnode_t tvp, tdvp; vnode_t fvp, fdvp; - struct nameidata fromnd, tond; + struct nameidata *fromnd, *tond; vfs_context_t ctx = vfs_context_current(); int error; int do_retry; @@ -5901,42 +5962,49 @@ rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval) vnode_t oparent = NULLVP; #if CONFIG_FSE fse_info from_finfo, to_finfo; - struct vnode_attr fva, tva; #endif int from_truncated=0, to_truncated; int batched = 0; struct vnode_attr *fvap, *tvap; int continuing = 0; - + /* carving out a chunk for structs that are too big to be on stack. 
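open_dprotected_np() above funnels into the normal open1() path, but first rejects any open that asks for O_DP_GETRAWENCRYPTED together with write access, since a raw-encrypted open hands back ciphertext that must not be writable. A minimal model of that check (the flag value below is invented, not the ABI's):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>

    #define O_DP_GETRAWENCRYPTED 0x0001   /* illustrative value */

    static int
    check_dprotected_flags(int flags, int dpflags)
    {
        if ((dpflags & O_DP_GETRAWENCRYPTED) && (flags & (O_RDWR | O_WRONLY)))
            return EINVAL;   /* not allowed to write raw encrypted bytes */
        return 0;
    }

    int
    main(void)
    {
        printf("%d\n", check_dprotected_flags(O_RDONLY, O_DP_GETRAWENCRYPTED)); /* 0 */
        printf("%d\n", check_dprotected_flags(O_RDWR,   O_DP_GETRAWENCRYPTED)); /* EINVAL */
        return 0;
    }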
*/ + struct { + struct nameidata from_node, to_node; + struct vnode_attr fv_attr, tv_attr; + } * __rename_data; + MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK); + fromnd = &__rename_data->from_node; + tond = &__rename_data->to_node; + holding_mntlock = 0; - do_retry = 0; + do_retry = 0; retry: fvp = tvp = NULL; fdvp = tdvp = NULL; fvap = tvap = NULL; mntrename = FALSE; - NDINIT(&fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1, + NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1, UIO_USERSPACE, uap->from, ctx); - fromnd.ni_flag = NAMEI_COMPOUNDRENAME; + fromnd->ni_flag = NAMEI_COMPOUNDRENAME; - NDINIT(&tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK, + NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK, UIO_USERSPACE, uap->to, ctx); - tond.ni_flag = NAMEI_COMPOUNDRENAME; + tond->ni_flag = NAMEI_COMPOUNDRENAME; continue_lookup: - if ((fromnd.ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) { - if ( (error = namei(&fromnd)) ) + if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) { + if ( (error = namei(fromnd)) ) goto out1; - fdvp = fromnd.ni_dvp; - fvp = fromnd.ni_vp; + fdvp = fromnd->ni_dvp; + fvp = fromnd->ni_vp; if (fvp && fvp->v_type == VDIR) - tond.ni_cnd.cn_flags |= WILLBEDIR; + tond->ni_cnd.cn_flags |= WILLBEDIR; } - if ((tond.ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) { - if ( (error = namei(&tond)) ) { + if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) { + if ( (error = namei(tond)) ) { /* * Translate error code for rename("dir1", "dir2/."). */ @@ -5944,8 +6012,8 @@ continue_lookup: error = EINVAL; goto out1; } - tdvp = tond.ni_dvp; - tvp = tond.ni_vp; + tdvp = tond->ni_dvp; + tvp = tond->ni_vp; } batched = vnode_compound_rename_available(fdvp); @@ -5968,7 +6036,7 @@ continue_lookup: } if (!batched) { - error = vn_authorize_rename(fdvp, fvp, &fromnd.ni_cnd, tdvp, tvp, &tond.ni_cnd, ctx, NULL); + error = vn_authorize_rename(fdvp, fvp, &fromnd->ni_cnd, tdvp, tvp, &tond->ni_cnd, ctx, NULL); if (error) { if (error == ENOENT) { /* @@ -6062,9 +6130,9 @@ continue_lookup: * XXX filesystem should take care of this itself, perhaps... 
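The rename() change above moves its two struct nameidata and two struct vnode_attr into a single MALLOC'd block (__rename_data), freed on the common exit path, because together they are too large to sit safely on a kernel stack frame. The same pattern in userland, with placeholder struct sizes:

    #include <stdlib.h>
    #include <string.h>

    /* Placeholder sizes; the point is only that the aggregate is too
     * big for a stack frame in the original context. */
    struct nameidata  { char pad[1024]; };
    struct vnode_attr { char pad[256];  };

    struct rename_data {
        struct nameidata  from_node, to_node;
        struct vnode_attr fv_attr, tv_attr;
    };

    static int
    do_rename(void)
    {
        struct rename_data *d = malloc(sizeof(*d));   /* MALLOC(..., M_WAITOK) */
        if (d == NULL)
            return -1;
        memset(d, 0, sizeof(*d));

        /* ... lookups and the rename proper would use d->from_node etc. ... */

        free(d);                                      /* FREE(__rename_data, M_TEMP) */
        return 0;
    }

    int main(void) { return do_rename(); }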
*/ if (fvp == tvp && fdvp == tdvp) { - if (fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && - !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, - fromnd.ni_cnd.cn_namelen)) { + if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen && + !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr, + fromnd->ni_cnd.cn_namelen)) { goto out1; } } @@ -6106,7 +6174,7 @@ continue_lookup: * nameidone has to happen before we vnode_put(tvp) * since it may need to release the fs_nodelock on the tvp */ - nameidone(&tond); + nameidone(tond); if (tvp) vnode_put(tvp); @@ -6116,7 +6184,7 @@ continue_lookup: * nameidone has to happen before we vnode_put(fdvp) * since it may need to release the fs_nodelock on the fvp */ - nameidone(&fromnd); + nameidone(fromnd); vnode_put(fvp); vnode_put(fdvp); @@ -6155,23 +6223,23 @@ skipped_lookup: if (fvp) { get_fse_info(fvp, &from_finfo, ctx); } else { - error = vfs_get_notify_attributes(&fva); + error = vfs_get_notify_attributes(&__rename_data->fv_attr); if (error) { goto out1; } - fvap = &fva; + fvap = &__rename_data->fv_attr; } if (tvp) { get_fse_info(tvp, &to_finfo, ctx); } else if (batched) { - error = vfs_get_notify_attributes(&tva); + error = vfs_get_notify_attributes(&__rename_data->tv_attr); if (error) { goto out1; } - tvap = &tva; + tvap = &__rename_data->tv_attr; } } #else @@ -6187,7 +6255,7 @@ skipped_lookup: } } - from_len = safe_getpath(fdvp, fromnd.ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated); + from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated); if (to_name == NULL) { GET_PATH(to_name); @@ -6197,11 +6265,11 @@ skipped_lookup: } } - to_len = safe_getpath(tdvp, tond.ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated); + to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated); } - error = vn_rename(fdvp, &fvp, &fromnd.ni_cnd, fvap, - tdvp, &tvp, &tond.ni_cnd, tvap, + error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap, + tdvp, &tvp, &tond->ni_cnd, tvap, 0, ctx); if (holding_mntlock) { @@ -6215,14 +6283,14 @@ skipped_lookup: } if (error) { if (error == EKEEPLOOKING) { - if ((fromnd.ni_flag & NAMEI_CONTLOOKUP) == 0) { - if ((tond.ni_flag & NAMEI_CONTLOOKUP) == 0) { + if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) { + if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) { panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?"); } } - fromnd.ni_vp = fvp; - tond.ni_vp = tvp; + fromnd->ni_vp = fvp; + tond->ni_vp = tvp; goto continue_lookup; } @@ -6335,7 +6403,7 @@ skipped_lookup: if (fdvp != tdvp) update_flags |= VNODE_UPDATE_PARENT; - vnode_update_identity(fvp, tdvp, tond.ni_cnd.cn_nameptr, tond.ni_cnd.cn_namelen, tond.ni_cnd.cn_hash, update_flags); + vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags); } out1: if (to_name != NULL) { @@ -6356,7 +6424,7 @@ out1: * nameidone has to happen before we vnode_put(tdvp) * since it may need to release the fs_nodelock on the tdvp */ - nameidone(&tond); + nameidone(tond); if (tvp) vnode_put(tvp); @@ -6367,22 +6435,24 @@ out1: * nameidone has to happen before we vnode_put(fdvp) * since it may need to release the fs_nodelock on the fdvp */ - nameidone(&fromnd); + nameidone(fromnd); if (fvp) vnode_put(fvp); vnode_put(fdvp); } + /* * If things changed after we did the namei, then we will re-drive * this rename call from the top. 
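
The EKEEPLOOKING handling above is a continuation protocol: a filesystem that implements compound operations may return EKEEPLOOKING, in which case it must also set NAMEI_CONTLOOKUP on at least one of the nameidata, and rename() jumps back to continue_lookup to finish the walk. A toy model of that control flow under invented names (KEEP_LOOKING, CONT_FLAG); the real contract lives in namei() and vn_rename():

#include <stdio.h>

#define KEEP_LOOKING (-2)  /* stand-in for EKEEPLOOKING */
#define CONT_FLAG    0x1   /* stand-in for NAMEI_CONTLOOKUP */

struct lookup_state { int flag; int depth; };

/* Pretend filesystem op: asks for one more lookup pass, then succeeds. */
static int
compound_op(struct lookup_state *st)
{
	if (st->depth < 1) {
		st->flag |= CONT_FLAG;  /* tell the caller which lookup to continue */
		return KEEP_LOOKING;
	}
	return 0;
}

int
main(void)
{
	struct lookup_state st = { 0, 0 };
	int error;

	while ((error = compound_op(&st)) == KEEP_LOOKING) {
		if ((st.flag & CONT_FLAG) == 0) {
			/* rename() panics on this inconsistency */
			fprintf(stderr, "KEEP_LOOKING without CONT_FLAG?\n");
			return 1;
		}
		st.flag &= ~CONT_FLAG;
		st.depth++;  /* redo the lookup from where we left off */
	}
	return error;
}
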
*/ - if(do_retry) { + if (do_retry) { do_retry = 0; goto retry; } - + + FREE(__rename_data, M_TEMP); return (error); } @@ -6790,7 +6860,7 @@ vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag, * use 32K in the MIN(), but we use magic number 87371 to * prevent uio_resid() * 3 / 8 from overflowing. */ - bufsize = 3 * MIN(uio_resid(uio), 87371) / 8; + bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8; MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK); if (bufptr == NULL) { return ENOMEM; @@ -7096,65 +7166,6 @@ out: * which are specific to the HFS & HFS Plus volume formats */ -#ifdef __APPLE_API_OBSOLETE - -/************************************************/ -/* *** Following calls will be deleted soon *** */ -/************************************************/ - -/* - * Make a complex file. A complex file is one with multiple forks (data streams) - */ -/* ARGSUSED */ -int -mkcomplex(__unused proc_t p, __unused struct mkcomplex_args *uap, __unused int32_t *retval) -{ - return (ENOTSUP); -} - -/* - * Extended stat call which returns volumeid and vnodeid as well as other info - */ -/* ARGSUSED */ -int -statv(__unused proc_t p, - __unused struct statv_args *uap, - __unused int32_t *retval) -{ - return (ENOTSUP); /* We'll just return an error for now */ - -} /* end of statv system call */ - -/* -* Extended lstat call which returns volumeid and vnodeid as well as other info -*/ -/* ARGSUSED */ -int -lstatv(__unused proc_t p, - __unused struct lstatv_args *uap, - __unused int32_t *retval) -{ - return (ENOTSUP); /* We'll just return an error for now */ -} /* end of lstatv system call */ - -/* -* Extended fstat call which returns volumeid and vnodeid as well as other info -*/ -/* ARGSUSED */ -int -fstatv(__unused proc_t p, - __unused struct fstatv_args *uap, - __unused int32_t *retval) -{ - return (ENOTSUP); /* We'll just return an error for now */ -} /* end of fstatv system call */ - - -/************************************************/ -/* *** Preceding calls will be deleted soon *** */ -/************************************************/ - -#endif /* __APPLE_API_OBSOLETE */ /* * Obtain attribute information on objects in a directory while enumerating @@ -7421,6 +7432,7 @@ out2: return (error); } +#if CONFIG_SEARCHFS /* ARGSUSED */ @@ -7638,6 +7650,15 @@ freeandexit: } /* end of searchfs system call */ +#else /* CONFIG_SEARCHFS */ + +int +searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval) +{ + return (ENOTSUP); +} + +#endif /* CONFIG_SEARCHFS */ lck_grp_attr_t * nspace_group_attr; @@ -9221,6 +9242,7 @@ fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval) #endif /* Obtain the absolute path to this vnode. */ bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0; + bpflags |= BUILDPATH_CHECK_MOVED; error = build_path(vp, realpath, uap->bufsize, &length, bpflags, ctx); vnode_put(vp); if (error) { diff --git a/bsd/vfs/vfs_vnops.c b/bsd/vfs/vfs_vnops.c index d7e2b5f14..47552a0d8 100644 --- a/bsd/vfs/vfs_vnops.c +++ b/bsd/vfs/vfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 
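
A note on the vnode_readdir64() sizing change further up: casting uio_resid() to user_size_t makes the MIN() comparison unsigned, and the 87371 cap bounds the 3/8 scaling, since 3 * 87371 = 262113 cannot overflow and 262113 / 8 = 32764, just under the 32K wired-buffer limit the surrounding comment aims for. A quick self-check of that arithmetic:

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) (((a) < (b)) ? (a) : (b))

int
main(void)
{
	/* Worst case: a huge residual count still yields a bounded buffer. */
	uint64_t resid = UINT32_MAX;
	uint64_t bufsize = 3 * MIN(resid, 87371u) / 8;

	assert(3u * 87371u == 262113);  /* the intermediate product stays small */
	assert(bufsize == 32764);       /* just under 32K = 32768 */
	printf("bufsize caps at %llu bytes\n", (unsigned long long)bufsize);
	return 0;
}
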
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -111,6 +111,10 @@ int ubc_setcred(struct vnode *, struct proc *); #include #endif +#if CONFIG_PROTECT +#include +#endif + static int vn_closefile(struct fileglob *fp, vfs_context_t ctx); static int vn_ioctl(struct fileproc *fp, u_long com, caddr_t data, @@ -445,7 +449,8 @@ continue_create_lookup: } need_vnop_open = !did_open; - } else { + } + else { if (fmode & O_EXCL) error = EEXIST; @@ -555,6 +560,25 @@ continue_create_lookup: } } +#if CONFIG_PROTECT + /* + * Perform any content protection access checks prior to calling + * into the filesystem, if the raw encrypted mode was not + * requested. + * + * If the va_dataprotect_flags are NOT active, or if they are, + * but they do not have the VA_DP_RAWENCRYPTED bit set, then we need + * to perform the checks. + */ + if (!(VATTR_IS_ACTIVE (vap, va_dataprotect_flags)) || + ((vap->va_dataprotect_flags & VA_DP_RAWENCRYPTED) == 0)) { + error = cp_handle_open (vp, fmode); + if (error) { + goto bad; + } + } +#endif + error = VNOP_OPEN(vp, fmode, ctx); if (error) { goto bad; @@ -877,13 +901,18 @@ vn_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) } #endif - ioflag = 0; + /* This signals to VNOP handlers that this read came from a file table read */ + ioflag = IO_SYSCALL_DISPATCH; + if (fp->f_fglob->fg_flag & FNONBLOCK) ioflag |= IO_NDELAY; if ((fp->f_fglob->fg_flag & FNOCACHE) || vnode_isnocache(vp)) - ioflag |= IO_NOCACHE; + ioflag |= IO_NOCACHE; + if (fp->f_fglob->fg_flag & FENCRYPTED) { + ioflag |= IO_ENCRYPTED; + } if (fp->f_fglob->fg_flag & FNORDAHEAD) - ioflag |= IO_RAOFF; + ioflag |= IO_RAOFF; if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_fglob->fg_offset; @@ -931,7 +960,12 @@ vn_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) } #endif - ioflag = IO_UNIT; + /* + * IO_SYSCALL_DISPATCH signals to VNOP handlers that this write originated + * from a file table write. + */ + ioflag = (IO_UNIT | IO_SYSCALL_DISPATCH); + if (vp->v_type == VREG && (fp->f_fglob->fg_flag & O_APPEND)) ioflag |= IO_APPEND; if (fp->f_fglob->fg_flag & FNONBLOCK) @@ -940,6 +974,8 @@ vn_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) ioflag |= IO_NOCACHE; if (fp->f_fglob->fg_flag & FNODIRECT) ioflag |= IO_NODIRECT; + if (fp->f_fglob->fg_flag & FSINGLE_WRITER) + ioflag |= IO_SINGLE_WRITER; /* * Treat synchronous mounts and O_FSYNC on the fd as equivalent. @@ -1289,14 +1325,14 @@ vn_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx) error = ENXIO; goto out; } - *(int *)data = bdevsw[major(vp->v_rdev)].d_type; + *(int *)data = D_TYPEMASK & bdevsw[major(vp->v_rdev)].d_type; } else if (vp->v_type == VCHR) { if (major(vp->v_rdev) >= nchrdev) { error = ENXIO; goto out; } - *(int *)data = cdevsw[major(vp->v_rdev)].d_type; + *(int *)data = D_TYPEMASK & cdevsw[major(vp->v_rdev)].d_type; } else { error = ENOTTY; goto out; diff --git a/bsd/vfs/vfs_xattr.c b/bsd/vfs/vfs_xattr.c index a37ba0f74..94715ae55 100644 --- a/bsd/vfs/vfs_xattr.c +++ b/bsd/vfs/vfs_xattr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2010 Apple Inc. All rights reserved. + * Copyright (c) 2004-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -59,13 +59,31 @@ #if NAMEDSTREAMS +static int shadow_sequence; + /* * We use %p to prevent loss of precision for pointers on varying architectures. 
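
The vn_read()/vn_write() hunks above derive per-I/O ioflags from the fileglob: every file-table read or write is now tagged IO_SYSCALL_DISPATCH so VNOP handlers can tell it came through the file table, FENCRYPTED adds IO_ENCRYPTED so the filesystem returns raw ciphertext, and FSINGLE_WRITER maps to IO_SINGLE_WRITER. A compact sketch merging the two translations; the FG_*/IOF_* bit values are invented for illustration:

#include <stdio.h>

/* Invented bit values; only the mapping pattern matches the patch. */
#define FG_NONBLOCK      0x01
#define FG_NOCACHE       0x02
#define FG_ENCRYPTED     0x04
#define FG_SINGLE_WRITER 0x08

#define IOF_SYSCALL_DISPATCH 0x010
#define IOF_NDELAY           0x020
#define IOF_NOCACHE          0x040
#define IOF_ENCRYPTED        0x080
#define IOF_SINGLE_WRITER    0x100

static int
table_ioflags(int fg_flag)
{
	int ioflag = IOF_SYSCALL_DISPATCH;  /* always set for file-table I/O */

	if (fg_flag & FG_NONBLOCK)
		ioflag |= IOF_NDELAY;
	if (fg_flag & FG_NOCACHE)
		ioflag |= IOF_NOCACHE;
	if (fg_flag & FG_ENCRYPTED)
		ioflag |= IOF_ENCRYPTED;       /* hand back raw encrypted bytes */
	if (fg_flag & FG_SINGLE_WRITER)
		ioflag |= IOF_SINGLE_WRITER;
	return ioflag;
}

int
main(void)
{
	printf("ioflag = 0x%x\n", table_ioflags(FG_ENCRYPTED | FG_NOCACHE));
	return 0;
}
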
*/ + +#define SHADOW_NAME_FMT ".vfs_rsrc_stream_%p%08x%p" +#define SHADOW_DIR_FMT ".vfs_rsrc_streams_%p%x" +#define SHADOW_DIR_CONTAINER "/var/run" + #define MAKE_SHADOW_NAME(VP, NAME) \ - snprintf((NAME), sizeof((NAME)), ".vfs_rsrc_stream_%p%08x%p", (void*)(VP), (VP)->v_id, (VP)->v_data); + snprintf((NAME), sizeof((NAME)), (SHADOW_NAME_FMT), \ + ((void*)(VM_KERNEL_ADDRPERM(VP))), \ + ((VP)->v_id), \ + ((void*)(VM_KERNEL_ADDRPERM((VP)->v_data)))) -static int shadow_sequence; +/* The full path to the shadow directory */ +#define MAKE_SHADOW_DIRNAME(VP, NAME) \ + snprintf((NAME), sizeof((NAME)), (SHADOW_DIR_CONTAINER "/" SHADOW_DIR_FMT), \ + ((void*)(VM_KERNEL_ADDRPERM(VP))), shadow_sequence) + +/* The shadow directory as a 'leaf' entry */ +#define MAKE_SHADOW_DIR_LEAF(VP, NAME) \ + snprintf((NAME), sizeof((NAME)), (SHADOW_DIR_FMT), \ + ((void*)(VM_KERNEL_ADDRPERM(VP))), shadow_sequence) static int default_getnamedstream(vnode_t vp, vnode_t *svpp, const char *name, enum nsoperation op, vfs_context_t context); @@ -960,8 +978,7 @@ get_shadow_dir(vnode_t *sdvpp, vfs_context_t context) bzero(tmpname, sizeof(tmpname)); - snprintf(tmpname, sizeof(tmpname), "/var/run/.vfs_rsrc_streams_%p%x", - (void*)rootvnode, shadow_sequence); + MAKE_SHADOW_DIRNAME(rootvnode, tmpname); /* * Look up the shadow directory to ensure that it still exists. * By looking it up, we get an iocounted dvp to use, and avoid some coherency issues @@ -980,15 +997,21 @@ get_shadow_dir(vnode_t *sdvpp, vfs_context_t context) sdvp = NULLVP; bzero (tmpname, sizeof(tmpname)); - /* Obtain the vnode for "/var/run" directory. */ - if (vnode_lookup("/var/run", 0, &dvp, context) != 0) { + /* + * Obtain the vnode for "/var/run" directory. + * This is defined in the SHADOW_DIR_CONTAINER macro + */ + if (vnode_lookup(SHADOW_DIR_CONTAINER, 0, &dvp, context) != 0) { error = ENOTSUP; goto out; } - /* Create the shadow stream directory. */ - snprintf(tmpname, sizeof(tmpname), ".vfs_rsrc_streams_%p%x", - (void*)rootvnode, shadow_sequence); + /* + * Create the shadow stream directory. 
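
The MAKE_SHADOW_NAME family above now passes both the vnode pointer and its v_data through VM_KERNEL_ADDRPERM before formatting, so shadow stream names under /var/run stay unique per vnode without leaking real kernel addresses to anything that can list the directory. A userspace sketch of the idea; xor_perm() is a deliberately crude stand-in, not the kernel's permutation:

#include <stdio.h>
#include <stdint.h>

/* Stand-in for VM_KERNEL_ADDRPERM: any fixed, secret, invertible mapping works. */
static uintptr_t
xor_perm(const void *p)
{
	static const uintptr_t secret = 0x5a5a5a5au;  /* the kernel picks its value at boot */
	return (uintptr_t)p ^ secret;
}

int
main(void)
{
	int object = 0;             /* pretend vnode */
	int object_data = 0;        /* pretend v_data */
	unsigned int vid = 0x1234;  /* pretend v_id */
	char name[128];

	snprintf(name, sizeof(name), ".vfs_rsrc_stream_%p%08x%p",
	    (void *)xor_perm(&object), vid, (void *)xor_perm(&object_data));
	printf("%s\n", name);  /* unique per object, addresses obfuscated */
	return 0;
}
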
+ * 'dvp' below suggests the parent directory so + * we only need to provide the leaf entry name + */ + MAKE_SHADOW_DIR_LEAF(rootvnode, tmpname); bzero(&cn, sizeof(cn)); cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN; diff --git a/bsd/vm/dp_backing_file.c b/bsd/vm/dp_backing_file.c index bb2808ecf..9df1b810e 100644 --- a/bsd/vm/dp_backing_file.c +++ b/bsd/vm/dp_backing_file.c @@ -279,12 +279,10 @@ macx_swapon( #if CONFIG_PROTECT { - void *cnode = NULL; /* initialize content protection keys manually */ - if ((cnode = cp_get_protected_cnode(vp)) != 0) { - if ((error = cp_handle_vnop(cnode, CP_WRITE_ACCESS)) != 0) - goto swapon_bailout; - } + if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) { + goto swapon_bailout; + } } #endif @@ -474,21 +472,12 @@ macx_swapoff( ut = get_bsdthread_info(current_thread()); -#if !CONFIG_EMBEDDED orig_iopol_disk = proc_get_thread_selfdiskacc(); proc_apply_thread_selfdiskacc(IOPOL_THROTTLE); -#else /* !CONFIG_EMBEDDED */ - orig_iopol_disk = ut->uu_iopol_disk; - ut->uu_iopol_disk = IOPOL_THROTTLE; -#endif /* !CONFIG_EMBEDDED */ kr = default_pager_backing_store_delete(backing_store); -#if !CONFIG_EMBEDDED proc_apply_thread_selfdiskacc(orig_iopol_disk); -#else /* !CONFIG_EMBEDDED */ - ut->uu_iopol_disk = orig_iopol_disk; -#endif /* !CONFIG_EMBEDDED */ switch (kr) { case KERN_SUCCESS: diff --git a/bsd/vm/vm_unix.c b/bsd/vm/vm_unix.c index 0190e70f7..0dd7823ce 100644 --- a/bsd/vm/vm_unix.c +++ b/bsd/vm/vm_unix.c @@ -77,6 +77,7 @@ #include #include #include +#include #include #include @@ -94,9 +95,7 @@ #include -#if CONFIG_FREEZE #include -#endif int _shared_region_map( struct proc*, int, unsigned int, struct shared_file_mapping_np*, memory_object_control_t*, struct shared_file_mapping_np*); @@ -474,7 +473,7 @@ task_for_pid_posix_check(proc_t target) int allowed; /* No task_for_pid on bad targets */ - if (target == PROC_NULL || target->p_stat == SZOMB) { + if (target->p_stat == SZOMB) { return FALSE; } @@ -573,9 +572,13 @@ task_for_pid( p = proc_find(pid); + if (p == PROC_NULL) { + error = KERN_FAILURE; + goto tfpout; + } + #if CONFIG_AUDIT - if (p != PROC_NULL) - AUDIT_ARG(process, p); + AUDIT_ARG(process, p); #endif if (!(task_for_pid_posix_check(p))) { @@ -745,6 +748,11 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) } targetproc = proc_find(pid); + if (targetproc == PROC_NULL) { + error = ESRCH; + goto out; + } + if (!task_for_pid_posix_check(targetproc)) { error = EPERM; goto out; @@ -781,7 +789,7 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) #endif task_reference(target); - error = task_suspend(target); + error = task_pidsuspend(target); if (error) { if (error == KERN_INVALID_ARGUMENT) { error = EINVAL; @@ -789,12 +797,14 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) error = EPERM; } } - task_deallocate(target); - -#if CONFIG_FREEZE - kern_hibernation_on_pid_suspend(pid); +#if CONFIG_MEMORYSTATUS + else { + memorystatus_on_suspend(pid); + } #endif + task_deallocate(target); + out: if (targetproc != PROC_NULL) proc_rele(targetproc); @@ -824,6 +834,11 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) } targetproc = proc_find(pid); + if (targetproc == PROC_NULL) { + error = ESRCH; + goto out; + } + if (!task_for_pid_posix_check(targetproc)) { error = EPERM; goto out; @@ -861,11 +876,11 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) task_reference(target); -#if CONFIG_FREEZE - 
kern_hibernation_on_pid_resume(pid, target);
+#if CONFIG_MEMORYSTATUS
+ memorystatus_on_resume(pid);
 #endif
- error = task_resume(target);
+ error = task_pidresume(target);
 if (error) {
 if (error == KERN_INVALID_ARGUMENT) {
 error = EINVAL;
@@ -873,15 +888,15 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret)
 error = EPERM;
 }
 }
+ task_deallocate(target);
 out:
 if (targetproc != PROC_NULL)
 proc_rele(targetproc);
+ *ret = error;
 return error;
-
- return 0;
 }
 #if CONFIG_EMBEDDED
@@ -905,14 +920,19 @@ pid_hibernate(struct proc *p __unused, struct pid_hibernate_args *args, int *ret
 #endif
 /*
- * The only accepted pid value here is currently -1, since we just kick off the hibernation thread
+ * The only accepted pid value here is currently -1, since we just kick off the freeze thread
 * here - individual ids aren't required. However, it's intended that this call will change
- * in the future to initiate hibernation of individual processes. In anticipation, we'll obtain the
+ * in the future to initiate freeze of individual processes. In anticipation, we'll obtain the
 * process handle for potentially valid values and call task_for_pid_posix_check(); this way, everything
 * is validated correctly and set for further refactoring. See for more details.
 */
 if (pid >= 0) {
 targetproc = proc_find(pid);
+ if (targetproc == PROC_NULL) {
+ error = ESRCH;
+ goto out;
+ }
+
 if (!task_for_pid_posix_check(targetproc)) {
 error = EPERM;
 goto out;
@@ -920,7 +940,7 @@ pid_hibernate(struct proc *p __unused, struct pid_hibernate_args *args, int *ret
 }
 if (pid == -1) {
- kern_hibernation_on_pid_hibernate(pid);
+ memorystatus_on_inactivity(pid);
 } else {
 error = EPERM;
 }
@@ -962,6 +982,11 @@ pid_shutdown_sockets(struct proc *p __unused, struct pid_shutdown_sockets_args *
 #endif
 targetproc = proc_find(pid);
+ if (targetproc == PROC_NULL) {
+ error = ESRCH;
+ goto out;
+ }
+
 if (!task_for_pid_posix_check(targetproc)) {
 error = EPERM;
 goto out;
@@ -1075,7 +1100,7 @@ shared_region_check_np( __unused int *retvalp)
 {
 vm_shared_region_t shared_region;
- mach_vm_offset_t start_address;
+ mach_vm_offset_t start_address = 0;
 int error;
 kern_return_t kr;
@@ -1248,12 +1273,10 @@ _shared_region_map(
 #if CONFIG_PROTECT
 /* check for content protection access */
 {
- void *cnode;
- if ((cnode = cp_get_protected_cnode(vp)) != NULL) {
- error = cp_handle_vnop(cnode, CP_READ_ACCESS | CP_WRITE_ACCESS);
- if (error)
+ error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0);
+ if (error) {
 goto done;
- }
+ }
 }
 #endif /* CONFIG_PROTECT */
@@ -1442,7 +1465,7 @@ _shared_region_slide(uint32_t slide,
 if (slide_info_entry == NULL){
 error = EFAULT;
 } else {
- error = copyin(slide_start,
+ error = copyin((user_addr_t)slide_start,
 slide_info_entry,
 (vm_size_t)slide_size);
 }
@@ -1482,20 +1505,22 @@ shared_region_map_and_slide_np(
 #define SFM_MAX_STACK 8
 struct shared_file_mapping_np stack_mappings[SFM_MAX_STACK];
+ /* Is the process chrooted? */
+ if (p->p_fd->fd_rdir != NULL) {
+ kr = EINVAL;
+ goto done;
+ }
+
 if ((kr = vm_shared_region_sliding_valid(slide)) != KERN_SUCCESS) {
 if (kr == KERN_INVALID_ARGUMENT) {
 /*
 * This will happen if we request sliding again
 * with the same slide value that was used earlier
- * for the very first sliding. We continue through
- * to the mapping layer. This is so that we can be
- * absolutely certain that the same mappings have
- * been requested.
+ * for the very first sliding.
 */
 kr = KERN_SUCCESS;
- } else {
- goto done;
 }
+ goto done;
 }
 if (mappings_count == 0) {
@@ -1603,6 +1628,66 @@ SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
 &vm_page_stats_reusable.can_reuse_failure, "");
+extern unsigned int vm_page_free_count, vm_page_speculative_count;
+SYSCTL_UINT(_vm, OID_AUTO, page_free_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_count, 0, "");
+SYSCTL_UINT(_vm, OID_AUTO, page_speculative_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_speculative_count, 0, "");
+
+extern unsigned int vm_page_cleaned_count;
+SYSCTL_UINT(_vm, OID_AUTO, page_cleaned_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_cleaned_count, 0, "Cleaned queue size");
+
+/* pageout counts */
+extern unsigned int vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external, vm_pageout_inactive_clean, vm_pageout_speculative_clean, vm_pageout_inactive_used;
+extern unsigned int vm_pageout_freed_from_inactive_clean, vm_pageout_freed_from_speculative;
+SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_inactive_dirty_internal, 0, "");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_inactive_dirty_external, 0, "");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_inactive_clean, 0, "");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_speculative_clean, 0, "");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_inactive_used, 0, "");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_freed_from_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_freed_from_inactive_clean, 0, "");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_freed_from_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_freed_from_speculative, 0, "");
+
+extern unsigned int vm_pageout_freed_from_cleaned;
+SYSCTL_UINT(_vm, OID_AUTO, pageout_freed_from_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_freed_from_cleaned, 0, "");
+
+/* counts of pages entering the cleaned queue */
+extern unsigned int vm_pageout_enqueued_cleaned, vm_pageout_enqueued_cleaned_from_inactive_clean, vm_pageout_enqueued_cleaned_from_inactive_dirty;
+SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_enqueued_cleaned, 0, ""); /* sum of next two */
+SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned_from_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_enqueued_cleaned_from_inactive_clean, 0, "");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned_from_inactive_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_enqueued_cleaned_from_inactive_dirty, 0, "");
+
+/* counts of pages leaving the cleaned queue */
+extern unsigned int vm_pageout_cleaned_reclaimed, vm_pageout_cleaned_reactivated, vm_pageout_cleaned_reference_reactivated, vm_pageout_cleaned_volatile_reactivated, vm_pageout_cleaned_fault_reactivated, vm_pageout_cleaned_commit_reactivated, vm_pageout_cleaned_busy, vm_pageout_cleaned_nolock;
+SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_reclaimed, 0, "Cleaned pages reclaimed");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated"); /* sum of all reactivated, plus busy and nolock (even though those actually get re-deactivated) */
+SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD |
CTLFLAG_LOCKED, &vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated"); +SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated"); +SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated"); +SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_commit_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_commit_reactivated, 0, "Cleaned pages commit reactivated"); +SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)"); +SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)"); + +#include +#include + +void vm_pageout_io_throttle(void); + +void vm_pageout_io_throttle(void) { + struct uthread *uthread = get_bsdthread_info(current_thread()); + + /* + * thread is marked as a low priority I/O type + * and the I/O we issued while in this cleaning operation + * collided with normal I/O operations... we'll + * delay in order to mitigate the impact of this + * task on the normal operation of the system + */ + + if (uthread->uu_lowpri_window) { + throttle_lowpri_io(TRUE); + } + +} + int vm_pressure_monitor( __unused struct proc *p, @@ -1639,3 +1724,77 @@ vm_pressure_monitor( *retval = (int) pages_wanted; return 0; } + +int +kas_info(struct proc *p, + struct kas_info_args *uap, + int *retval __unused) +{ +#ifdef SECURE_KERNEL + (void)p; + (void)uap; + return ENOTSUP; +#else /* !SECURE_KERNEL */ + int selector = uap->selector; + user_addr_t valuep = uap->value; + user_addr_t sizep = uap->size; + user_size_t size; + int error; + + if (!kauth_cred_issuser(kauth_cred_get())) { + return EPERM; + } + +#if CONFIG_MACF + error = mac_system_check_kas_info(kauth_cred_get(), selector); + if (error) { + return error; + } +#endif + + if (IS_64BIT_PROCESS(p)) { + user64_size_t size64; + error = copyin(sizep, &size64, sizeof(size64)); + size = (user_size_t)size64; + } else { + user32_size_t size32; + error = copyin(sizep, &size32, sizeof(size32)); + size = (user_size_t)size32; + } + if (error) { + return error; + } + + switch (selector) { + case KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR: + { + uint64_t slide = vm_kernel_slide; + + if (sizeof(slide) != size) { + return EINVAL; + } + + if (IS_64BIT_PROCESS(p)) { + user64_size_t size64 = (user64_size_t)size; + error = copyout(&size64, sizep, sizeof(size64)); + } else { + user32_size_t size32 = (user32_size_t)size; + error = copyout(&size32, sizep, sizeof(size32)); + } + if (error) { + return error; + } + + error = copyout(&slide, valuep, sizeof(slide)); + if (error) { + return error; + } + } + break; + default: + return EINVAL; + } + + return 0; +#endif /* !SECURE_KERNEL */ +} diff --git a/bsd/vm/vnode_pager.c b/bsd/vm/vnode_pager.c index d12a65652..f86ba0148 100644 --- a/bsd/vm/vnode_pager.c +++ b/bsd/vm/vnode_pager.c @@ -280,15 +280,17 @@ vnode_pageout(struct vnode *vp, * just go ahead and call vnop_pageout since * it has already sorted out the dirty ranges */ - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_START, - size, 1, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_START, + size, 1, 0, 0, 0); if ( (error_ret = VNOP_PAGEOUT(vp, upl, upl_offset, (off_t)f_offset, 
(size_t)size, flags, ctx)) ) result = PAGER_ERROR; - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_END, - size, 1, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_END, + size, 1, 0, 0, 0); goto out; } @@ -303,15 +305,17 @@ vnode_pageout(struct vnode *vp, * via 'f_offset' and 'size' into a UPL... this allows the filesystem to first * take any locks it needs, before effectively locking the pages into a UPL... */ - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_START, - size, (int)f_offset, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_START, + size, (int)f_offset, 0, 0, 0); if ( (error_ret = VNOP_PAGEOUT(vp, NULL, upl_offset, (off_t)f_offset, size, flags, ctx)) ) { result = PAGER_ERROR; } - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_END, - size, 0, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_END, + size, 0, 0, 0, 0); goto out; } @@ -461,8 +465,9 @@ vnode_pageout(struct vnode *vp, } xsize = num_of_pages * PAGE_SIZE; - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_START, - xsize, (int)f_offset, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_START, + xsize, (int)f_offset, 0, 0, 0); if ( (error = VNOP_PAGEOUT(vp, upl, offset, (off_t)f_offset, xsize, flags, ctx)) ) { @@ -470,8 +475,9 @@ vnode_pageout(struct vnode *vp, error_ret = error; result = PAGER_ERROR; } - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_END, - xsize, 0, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_END, + xsize, 0, 0, 0, 0); f_offset += xsize; offset += xsize; @@ -554,6 +560,8 @@ vnode_pagein( error = PAGER_ABSENT; goto out; } + ubc_upl_range_needed(upl, upl_offset / PAGE_SIZE, 1); + upl_offset = 0; first_pg = 0; diff --git a/config/BSDKernel.exports b/config/BSDKernel.exports index ebb5af5db..a424e367b 100644 --- a/config/BSDKernel.exports +++ b/config/BSDKernel.exports @@ -389,6 +389,7 @@ _mbuf_gethdr _mbuf_getpacket _mbuf_inbound_modified _mbuf_inet_cksum +_mbuf_is_traffic_class_privileged _mbuf_leadingspace _mbuf_maxlen _mbuf_mclget diff --git a/config/IOKit.exports b/config/IOKit.exports index 2ad8e78c9..e78f56919 100644 --- a/config/IOKit.exports +++ b/config/IOKit.exports @@ -3,15 +3,12 @@ _IOBSDNameMatching _IOBSDRegistryEntryForDeviceTree _IOBSDRegistryEntryGetData _IOBSDRegistryEntryRelease -_IOCDMatching _IOCreateThread _IODTFreeLoaderInfo _IODTGetLoaderInfo _IODelay -_IODiskMatching _IOExitThread _IOFindBSDRoot -_IOFindMatchingChild _IOFindNameForValue _IOFindValueForName _IOFlushProcessorCache @@ -54,9 +51,7 @@ _IOMapperInsertPPNPages _IOMapperInsertPage _IOMapperInsertUPLPages _IONDRVLibrariesInitialize -_IONetworkMatching _IONetworkNamePrefixMatching -_IOOFPathMatching _IOPageableMapForAddress _IOPause _IOPrintPlane @@ -87,9 +82,7 @@ _IOSimpleLockTryLock:_lck_spin_try_lock _IOSimpleLockUnlock:_lck_spin_unlock _IOSizeToAlignment _IOSleep -_IOSpinUnlock _IOSystemShutdownNotification -_IOTrySpinLock _IOZeroTvalspec _OSKernelStackRemaining _OSPrintMemory @@ -143,9 +136,7 @@ __ZN10IONotifier10superClassE __ZN10IONotifier9MetaClassC1Ev __ZN10IONotifier9MetaClassC2Ev __ZN10IONotifier9metaClassE -__ZN10IONotifierC1EPK11OSMetaClass __ZN10IONotifierC2EPK11OSMetaClass -__ZN10IONotifierD0Ev __ZN10IONotifierD2Ev __ZN10IOWorkLoop10gMetaClassE __ZN10IOWorkLoop10superClassE @@ -157,11 +148,6 
@@ __ZN10IOWorkLoop14addEventSourceEP13IOEventSource __ZN10IOWorkLoop15runEventSourcesEv __ZN10IOWorkLoop17removeEventSourceEP13IOEventSource __ZN10IOWorkLoop19signalWorkAvailableEv -__ZN10IOWorkLoop20_RESERVEDIOWorkLoop3Ev -__ZN10IOWorkLoop20_RESERVEDIOWorkLoop4Ev -__ZN10IOWorkLoop20_RESERVEDIOWorkLoop5Ev -__ZN10IOWorkLoop20_RESERVEDIOWorkLoop6Ev -__ZN10IOWorkLoop20_RESERVEDIOWorkLoop7Ev __ZN10IOWorkLoop4freeEv __ZN10IOWorkLoop4initEv __ZN10IOWorkLoop8openGateEv @@ -222,14 +208,6 @@ __ZN11IOMemoryMap14getAddressTaskEv __ZN11IOMemoryMap17getVirtualAddressEv __ZN11IOMemoryMap18getPhysicalAddressEv __ZN11IOMemoryMap19getMemoryDescriptorEv -__ZN11IOMemoryMap21_RESERVEDIOMemoryMap0Ev -__ZN11IOMemoryMap21_RESERVEDIOMemoryMap1Ev -__ZN11IOMemoryMap21_RESERVEDIOMemoryMap2Ev -__ZN11IOMemoryMap21_RESERVEDIOMemoryMap3Ev -__ZN11IOMemoryMap21_RESERVEDIOMemoryMap4Ev -__ZN11IOMemoryMap21_RESERVEDIOMemoryMap5Ev -__ZN11IOMemoryMap21_RESERVEDIOMemoryMap6Ev -__ZN11IOMemoryMap21_RESERVEDIOMemoryMap7Ev __ZN11IOMemoryMap5unmapEv __ZN11IOMemoryMap9MetaClassC1Ev __ZN11IOMemoryMap9MetaClassC2Ev @@ -263,19 +241,6 @@ __ZN12IODMACommand12getAlignmentEv __ZN12IODMACommand17getNumAddressBitsEv __ZN12IODMACommand19setMemoryDescriptorEPK18IOMemoryDescriptorb __ZN12IODMACommand21clearMemoryDescriptorEb -__ZN12IODMACommand22_RESERVEDIODMACommand3Ev -__ZN12IODMACommand22_RESERVEDIODMACommand4Ev -__ZN12IODMACommand22_RESERVEDIODMACommand5Ev -__ZN12IODMACommand22_RESERVEDIODMACommand6Ev -__ZN12IODMACommand22_RESERVEDIODMACommand7Ev -__ZN12IODMACommand22_RESERVEDIODMACommand8Ev -__ZN12IODMACommand22_RESERVEDIODMACommand9Ev -__ZN12IODMACommand23_RESERVEDIODMACommand10Ev -__ZN12IODMACommand23_RESERVEDIODMACommand11Ev -__ZN12IODMACommand23_RESERVEDIODMACommand12Ev -__ZN12IODMACommand23_RESERVEDIODMACommand13Ev -__ZN12IODMACommand23_RESERVEDIODMACommand14Ev -__ZN12IODMACommand23_RESERVEDIODMACommand15Ev __ZN12IODMACommand26getPreparedOffsetAndLengthEPyS0_ __ZN12IODMACommand4freeEv __ZN12IODMACommand7prepareEyybb @@ -333,22 +298,8 @@ __ZN12IOUserClient17setAsyncReferenceEPjP8ipc_portPvS3_ __ZN12IOUserClient18clientHasPrivilegeEPvPKc __ZN12IOUserClient20exportObjectToClientEP4taskP8OSObjectPS3_ __ZN12IOUserClient21destroyUserReferencesEP8OSObject -__ZN12IOUserClient22_RESERVEDIOUserClient2Ev -__ZN12IOUserClient22_RESERVEDIOUserClient3Ev -__ZN12IOUserClient22_RESERVEDIOUserClient4Ev -__ZN12IOUserClient22_RESERVEDIOUserClient5Ev -__ZN12IOUserClient22_RESERVEDIOUserClient6Ev -__ZN12IOUserClient22_RESERVEDIOUserClient7Ev -__ZN12IOUserClient22_RESERVEDIOUserClient8Ev -__ZN12IOUserClient22_RESERVEDIOUserClient9Ev -__ZN12IOUserClient23_RESERVEDIOUserClient10Ev -__ZN12IOUserClient23_RESERVEDIOUserClient11Ev -__ZN12IOUserClient23_RESERVEDIOUserClient12Ev -__ZN12IOUserClient23_RESERVEDIOUserClient13Ev -__ZN12IOUserClient23_RESERVEDIOUserClient14Ev -__ZN12IOUserClient23_RESERVEDIOUserClient15Ev -__ZN12IOUserClient23releaseNotificationPortEP8ipc_port __ZN12IOUserClient23releaseAsyncReference64EPy +__ZN12IOUserClient23releaseNotificationPortEP8ipc_port __ZN12IOUserClient26removeMappingForDescriptorEP18IOMemoryDescriptor __ZN12IOUserClient4freeEv __ZN12IOUserClient4initEP12OSDictionary @@ -368,13 +319,6 @@ __ZN13IOCommandGate11setWorkLoopEP10IOWorkLoop __ZN13IOCommandGate13attemptActionEPFiP8OSObjectPvS2_S2_S2_ES2_S2_S2_S2_ __ZN13IOCommandGate13commandWakeupEPvb __ZN13IOCommandGate14attemptCommandEPvS0_S0_S0_ -__ZN13IOCommandGate23_RESERVEDIOCommandGate1Ev -__ZN13IOCommandGate23_RESERVEDIOCommandGate2Ev 
-__ZN13IOCommandGate23_RESERVEDIOCommandGate3Ev -__ZN13IOCommandGate23_RESERVEDIOCommandGate4Ev -__ZN13IOCommandGate23_RESERVEDIOCommandGate5Ev -__ZN13IOCommandGate23_RESERVEDIOCommandGate6Ev -__ZN13IOCommandGate23_RESERVEDIOCommandGate7Ev __ZN13IOCommandGate4freeEv __ZN13IOCommandGate4initEP8OSObjectPFiS1_PvS2_S2_S2_E __ZN13IOCommandGate6enableEv @@ -397,14 +341,6 @@ __ZN13IOCommandPool13returnCommandEP9IOCommand __ZN13IOCommandPool15gatedGetCommandEPP9IOCommandb __ZN13IOCommandPool16initWithWorkLoopEP10IOWorkLoop __ZN13IOCommandPool18gatedReturnCommandEP9IOCommand -__ZN13IOCommandPool23_RESERVEDIOCommandPool0Ev -__ZN13IOCommandPool23_RESERVEDIOCommandPool1Ev -__ZN13IOCommandPool23_RESERVEDIOCommandPool2Ev -__ZN13IOCommandPool23_RESERVEDIOCommandPool3Ev -__ZN13IOCommandPool23_RESERVEDIOCommandPool4Ev -__ZN13IOCommandPool23_RESERVEDIOCommandPool5Ev -__ZN13IOCommandPool23_RESERVEDIOCommandPool6Ev -__ZN13IOCommandPool23_RESERVEDIOCommandPool7Ev __ZN13IOCommandPool4freeEv __ZN13IOCommandPool9MetaClassC1Ev __ZN13IOCommandPool9MetaClassC2Ev @@ -419,16 +355,9 @@ __ZN13IOEventSource10gMetaClassE __ZN13IOEventSource10superClassE __ZN13IOEventSource10wakeupGateEPvb __ZN13IOEventSource11setWorkLoopEP10IOWorkLoop +__ZN13IOEventSource12checkForWorkEv __ZN13IOEventSource12tryCloseGateEv __ZN13IOEventSource19signalWorkAvailableEv -__ZN13IOEventSource23_RESERVEDIOEventSource0Ev -__ZN13IOEventSource23_RESERVEDIOEventSource1Ev -__ZN13IOEventSource23_RESERVEDIOEventSource2Ev -__ZN13IOEventSource23_RESERVEDIOEventSource3Ev -__ZN13IOEventSource23_RESERVEDIOEventSource4Ev -__ZN13IOEventSource23_RESERVEDIOEventSource5Ev -__ZN13IOEventSource23_RESERVEDIOEventSource6Ev -__ZN13IOEventSource23_RESERVEDIOEventSource7Ev __ZN13IOEventSource4freeEv __ZN13IOEventSource4initEP8OSObjectPFvS1_zE __ZN13IOEventSource6enableEv @@ -440,7 +369,6 @@ __ZN13IOEventSource9MetaClassC2Ev __ZN13IOEventSource9closeGateEv __ZN13IOEventSource9metaClassE __ZN13IOEventSource9setActionEPFvP8OSObjectzE -__ZN13IOEventSource12checkForWorkEv __ZN13IOEventSourceC1EPK11OSMetaClass __ZN13IOEventSourceC2EPK11OSMetaClass __ZN13IOEventSourceD0Ev @@ -484,12 +412,12 @@ __ZN14IOPMrootDomain14tellChangeDownEm __ZN14IOPMrootDomain15powerChangeDoneEm __ZN14IOPMrootDomain16tellNoChangeDownEm __ZN14IOPMrootDomain17createPMAssertionEyjP9IOServicePKc -__ZN14IOPMrootDomain18releasePMAssertionEy -__ZN14IOPMrootDomain19getPMAssertionLevelEy -__ZN14IOPMrootDomain19setPMAssertionLevelEyj __ZN14IOPMrootDomain17getSleepSupportedEv __ZN14IOPMrootDomain17setAggressivenessEmm __ZN14IOPMrootDomain18changePowerStateToEm +__ZN14IOPMrootDomain18releasePMAssertionEy +__ZN14IOPMrootDomain19getPMAssertionLevelEy +__ZN14IOPMrootDomain19setPMAssertionLevelEyj __ZN14IOPMrootDomain22changePowerStateToPrivEm __ZN14IOPMrootDomain22removePublishedFeatureEj __ZN14IOPMrootDomain23requestPowerDomainStateEmP17IOPowerConnectionm @@ -535,9 +463,7 @@ __ZN15IODMAController5startEP9IOService __ZN15IODMAController9MetaClassC1Ev __ZN15IODMAController9MetaClassC2Ev __ZN15IODMAController9metaClassE -__ZN15IODMAControllerC1EPK11OSMetaClass __ZN15IODMAControllerC2EPK11OSMetaClass -__ZN15IODMAControllerD0Ev __ZN15IODMAControllerD2Ev __ZN15IOPMPowerSource10cycleCountEv __ZN15IOPMPowerSource10gMetaClassE @@ -625,32 +551,6 @@ __ZN15IORegistryEntry17runPropertyActionEPFiP8OSObjectPvS2_S2_S2_ES1_S2_S2_S2_S2 __ZN15IORegistryEntry18getGenerationCountEv __ZN15IORegistryEntry18getRegistryEntryIDEv __ZN15IORegistryEntry21getChildFromComponentEPPKcPK15IORegistryPlane 
-__ZN15IORegistryEntry25_RESERVEDIORegistryEntry6Ev -__ZN15IORegistryEntry25_RESERVEDIORegistryEntry7Ev -__ZN15IORegistryEntry25_RESERVEDIORegistryEntry8Ev -__ZN15IORegistryEntry25_RESERVEDIORegistryEntry9Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry10Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry11Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry12Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry13Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry14Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry15Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry16Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry17Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry18Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry19Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry20Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry21Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry22Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry23Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry24Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry25Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry26Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry27Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry28Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry29Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry30Ev -__ZN15IORegistryEntry26_RESERVEDIORegistryEntry31Ev __ZN15IORegistryEntry4freeEv __ZN15IORegistryEntry4initEP12OSDictionary __ZN15IORegistryEntry4initEPS_PK15IORegistryPlane @@ -753,22 +653,6 @@ __ZN17IOBigMemoryCursorD0Ev __ZN17IOBigMemoryCursorD2Ev __ZN17IOPolledInterface10gMetaClassE __ZN17IOPolledInterface15checkAllForWorkEv -__ZN17IOPolledInterface27_RESERVEDIOPolledInterface0Ev -__ZN17IOPolledInterface27_RESERVEDIOPolledInterface1Ev -__ZN17IOPolledInterface27_RESERVEDIOPolledInterface2Ev -__ZN17IOPolledInterface27_RESERVEDIOPolledInterface3Ev -__ZN17IOPolledInterface27_RESERVEDIOPolledInterface4Ev -__ZN17IOPolledInterface27_RESERVEDIOPolledInterface5Ev -__ZN17IOPolledInterface27_RESERVEDIOPolledInterface6Ev -__ZN17IOPolledInterface27_RESERVEDIOPolledInterface7Ev -__ZN17IOPolledInterface27_RESERVEDIOPolledInterface8Ev -__ZN17IOPolledInterface27_RESERVEDIOPolledInterface9Ev -__ZN17IOPolledInterface28_RESERVEDIOPolledInterface10Ev -__ZN17IOPolledInterface28_RESERVEDIOPolledInterface11Ev -__ZN17IOPolledInterface28_RESERVEDIOPolledInterface12Ev -__ZN17IOPolledInterface28_RESERVEDIOPolledInterface13Ev -__ZN17IOPolledInterface28_RESERVEDIOPolledInterface14Ev -__ZN17IOPolledInterface28_RESERVEDIOPolledInterface15Ev __ZN17IOPolledInterfaceC2EPK11OSMetaClass __ZN17IOPolledInterfaceD2Ev __ZN17IOPowerConnection10gMetaClassE @@ -799,14 +683,6 @@ __ZN17IOPowerConnectionD2Ev __ZN17IOSharedDataQueue10gMetaClassE __ZN17IOSharedDataQueue10superClassE __ZN17IOSharedDataQueue19getMemoryDescriptorEv -__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue0Ev -__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue1Ev -__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue2Ev -__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue3Ev -__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue4Ev -__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue5Ev -__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue6Ev -__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue7Ev __ZN17IOSharedDataQueue4freeEv __ZN17IOSharedDataQueue4peekEv __ZN17IOSharedDataQueue9MetaClassC1Ev @@ -832,9 +708,7 @@ __ZN18IOMemoryDescriptor8redirectEP4taskb __ZN18IOMemoryDescriptor9MetaClassC1Ev __ZN18IOMemoryDescriptor9MetaClassC2Ev 
__ZN18IOMemoryDescriptor9metaClassE -__ZN18IOMemoryDescriptorC1EPK11OSMetaClass __ZN18IOMemoryDescriptorC2EPK11OSMetaClass -__ZN18IOMemoryDescriptorD0Ev __ZN18IOMemoryDescriptorD2Ev __ZN18IORegistryIterator10enterEntryEPK15IORegistryPlane __ZN18IORegistryIterator10enterEntryEv @@ -864,14 +738,6 @@ __ZN18IOTimerEventSource11setWorkLoopEP10IOWorkLoop __ZN18IOTimerEventSource13cancelTimeoutEv __ZN18IOTimerEventSource14setTimeoutFuncEv __ZN18IOTimerEventSource16timerEventSourceEP8OSObjectPFvS1_PS_E -__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource0Ev -__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource1Ev -__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource2Ev -__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource3Ev -__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource4Ev -__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource5Ev -__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource6Ev -__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource7Ev __ZN18IOTimerEventSource4freeEv __ZN18IOTimerEventSource4initEP8OSObjectPFvS1_PS_E __ZN18IOTimerEventSource6enableEv @@ -896,9 +762,7 @@ __ZN18IOUserNotification7isValidEv __ZN18IOUserNotification9MetaClassC1Ev __ZN18IOUserNotification9MetaClassC2Ev __ZN18IOUserNotification9metaClassE -__ZN18IOUserNotificationC1EPK11OSMetaClass __ZN18IOUserNotificationC2EPK11OSMetaClass -__ZN18IOUserNotificationD0Ev __ZN18IOUserNotificationD2Ev __ZN18_IOServiceNotifier10gMetaClassE __ZN18_IOServiceNotifier10superClassE @@ -968,12 +832,6 @@ __ZN21IOInterruptController16getInterruptTypeEP9IOServiceiPi __ZN21IOInterruptController17registerInterruptEP9IOServiceiPvPFvS2_S2_S2_iES2_ __ZN21IOInterruptController19unregisterInterruptEP9IOServicei __ZN21IOInterruptController26getInterruptHandlerAddressEv -__ZN21IOInterruptController31_RESERVEDIOInterruptController0Ev -__ZN21IOInterruptController31_RESERVEDIOInterruptController1Ev -__ZN21IOInterruptController31_RESERVEDIOInterruptController2Ev -__ZN21IOInterruptController31_RESERVEDIOInterruptController3Ev -__ZN21IOInterruptController31_RESERVEDIOInterruptController4Ev -__ZN21IOInterruptController31_RESERVEDIOInterruptController5Ev __ZN21IOInterruptController9MetaClassC1Ev __ZN21IOInterruptController9MetaClassC2Ev __ZN21IOInterruptController9metaClassE @@ -1014,14 +872,6 @@ __ZN22IOInterruptEventSource17interruptOccurredEPvP9IOServicei __ZN22IOInterruptEventSource20interruptEventSourceEP8OSObjectPFvS1_PS_iEP9IOServicei __ZN22IOInterruptEventSource23normalInterruptOccurredEPvP9IOServicei __ZN22IOInterruptEventSource24disableInterruptOccurredEPvP9IOServicei -__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource0Ev -__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource1Ev -__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource2Ev -__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource3Ev -__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource4Ev -__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource5Ev -__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource6Ev -__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource7Ev __ZN22IOInterruptEventSource4freeEv __ZN22IOInterruptEventSource4initEP8OSObjectPFvS1_PS_iEP9IOServicei __ZN22IOInterruptEventSource6enableEv @@ -1115,10 +965,6 @@ __ZN27IOSharedInterruptController17registerInterruptEP9IOServiceiPvPFvS2_S2_S2_i __ZN27IOSharedInterruptController19unregisterInterruptEP9IOServicei __ZN27IOSharedInterruptController23initInterruptControllerEP21IOInterruptControllerP6OSData 
__ZN27IOSharedInterruptController26getInterruptHandlerAddressEv -__ZN27IOSharedInterruptController37_RESERVEDIOSharedInterruptController0Ev -__ZN27IOSharedInterruptController37_RESERVEDIOSharedInterruptController1Ev -__ZN27IOSharedInterruptController37_RESERVEDIOSharedInterruptController2Ev -__ZN27IOSharedInterruptController37_RESERVEDIOSharedInterruptController3Ev __ZN27IOSharedInterruptController9MetaClassC1Ev __ZN27IOSharedInterruptController9MetaClassC2Ev __ZN27IOSharedInterruptController9metaClassE @@ -1135,14 +981,6 @@ __ZN28IOFilterInterruptEventSource20interruptEventSourceEP8OSObjectPFvS1_P22IOIn __ZN28IOFilterInterruptEventSource23normalInterruptOccurredEPvP9IOServicei __ZN28IOFilterInterruptEventSource24disableInterruptOccurredEPvP9IOServicei __ZN28IOFilterInterruptEventSource26filterInterruptEventSourceEP8OSObjectPFvS1_P22IOInterruptEventSourceiEPFbS1_PS_EP9IOServicei -__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource0Ev -__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource1Ev -__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource2Ev -__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource3Ev -__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource4Ev -__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource5Ev -__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource6Ev -__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource7Ev __ZN28IOFilterInterruptEventSource4initEP8OSObjectPFvS1_P22IOInterruptEventSourceiEP9IOServicei __ZN28IOFilterInterruptEventSource4initEP8OSObjectPFvS1_P22IOInterruptEventSourceiEPFbS1_PS_EP9IOServicei __ZN28IOFilterInterruptEventSource9MetaClassC1Ev @@ -1169,19 +1007,6 @@ __ZN29IOInterleavedMemoryDescriptorD2Ev __ZN8IOMapper10gMetaClassE __ZN8IOMapper10superClassE __ZN8IOMapper17setMapperRequiredEb -__ZN8IOMapper18_RESERVEDIOMapper3Ev -__ZN8IOMapper18_RESERVEDIOMapper4Ev -__ZN8IOMapper18_RESERVEDIOMapper5Ev -__ZN8IOMapper18_RESERVEDIOMapper6Ev -__ZN8IOMapper18_RESERVEDIOMapper7Ev -__ZN8IOMapper18_RESERVEDIOMapper8Ev -__ZN8IOMapper18_RESERVEDIOMapper9Ev -__ZN8IOMapper19_RESERVEDIOMapper10Ev -__ZN8IOMapper19_RESERVEDIOMapper11Ev -__ZN8IOMapper19_RESERVEDIOMapper12Ev -__ZN8IOMapper19_RESERVEDIOMapper13Ev -__ZN8IOMapper19_RESERVEDIOMapper14Ev -__ZN8IOMapper19_RESERVEDIOMapper15Ev __ZN8IOMapper19copyMapperForDeviceEP9IOService __ZN8IOMapper19waitForSystemMapperEv __ZN8IOMapper4freeEv @@ -1190,28 +1015,8 @@ __ZN8IOMapper7gSystemE __ZN8IOMapper9MetaClassC1Ev __ZN8IOMapper9MetaClassC2Ev __ZN8IOMapper9metaClassE -__ZN8IOMapperC1EPK11OSMetaClass __ZN8IOMapperC2EPK11OSMetaClass -__ZN8IOMapperD0Ev __ZN8IOMapperD2Ev -__ZN8IOSyncer10gMetaClassE -__ZN8IOSyncer10superClassE -__ZN8IOSyncer13privateSignalEv -__ZN8IOSyncer4freeEv -__ZN8IOSyncer4initEb -__ZN8IOSyncer4waitEb -__ZN8IOSyncer6createEb -__ZN8IOSyncer6reinitEv -__ZN8IOSyncer6signalEib -__ZN8IOSyncer9MetaClassC1Ev -__ZN8IOSyncer9MetaClassC2Ev -__ZN8IOSyncer9metaClassE -__ZN8IOSyncerC1EPK11OSMetaClass -__ZN8IOSyncerC1Ev -__ZN8IOSyncerC2EPK11OSMetaClass -__ZN8IOSyncerC2Ev -__ZN8IOSyncerD0Ev -__ZN8IOSyncerD2Ev __ZN9IOCommand10gMetaClassE __ZN9IOCommand10superClassE __ZN9IOCommand4initEv @@ -1292,53 +1097,12 @@ __ZN9IOService18getResourceServiceEv __ZN9IOService18lockForArbitrationEb __ZN9IOService18matchPropertyTableEP12OSDictionary __ZN9IOService18setIdleTimerPeriodEm -__ZN9IOService19_RESERVEDIOService6Ev -__ZN9IOService19_RESERVEDIOService7Ev 
-__ZN9IOService19_RESERVEDIOService8Ev -__ZN9IOService19_RESERVEDIOService9Ev +__ZN9IOService19copyMatchingServiceEP12OSDictionary __ZN9IOService19getMatchingServicesEP12OSDictionary __ZN9IOService19powerOverrideOnPrivEv __ZN9IOService19registerPowerDriverEPS_P14IOPMPowerStatem __ZN9IOService19start_PM_idle_timerEv __ZN9IOService19unregisterInterruptEi -__ZN9IOService20_RESERVEDIOService10Ev -__ZN9IOService20_RESERVEDIOService11Ev -__ZN9IOService20_RESERVEDIOService12Ev -__ZN9IOService20_RESERVEDIOService13Ev -__ZN9IOService20_RESERVEDIOService14Ev -__ZN9IOService20_RESERVEDIOService15Ev -__ZN9IOService20_RESERVEDIOService16Ev -__ZN9IOService20_RESERVEDIOService17Ev -__ZN9IOService20_RESERVEDIOService18Ev -__ZN9IOService20_RESERVEDIOService19Ev -__ZN9IOService20_RESERVEDIOService20Ev -__ZN9IOService20_RESERVEDIOService21Ev -__ZN9IOService20_RESERVEDIOService22Ev -__ZN9IOService20_RESERVEDIOService23Ev -__ZN9IOService20_RESERVEDIOService24Ev -__ZN9IOService20_RESERVEDIOService25Ev -__ZN9IOService20_RESERVEDIOService26Ev -__ZN9IOService20_RESERVEDIOService27Ev -__ZN9IOService20_RESERVEDIOService28Ev -__ZN9IOService20_RESERVEDIOService29Ev -__ZN9IOService20_RESERVEDIOService30Ev -__ZN9IOService20_RESERVEDIOService31Ev -__ZN9IOService20_RESERVEDIOService32Ev -__ZN9IOService20_RESERVEDIOService33Ev -__ZN9IOService20_RESERVEDIOService34Ev -__ZN9IOService20_RESERVEDIOService35Ev -__ZN9IOService20_RESERVEDIOService36Ev -__ZN9IOService20_RESERVEDIOService37Ev -__ZN9IOService20_RESERVEDIOService38Ev -__ZN9IOService20_RESERVEDIOService39Ev -__ZN9IOService20_RESERVEDIOService40Ev -__ZN9IOService20_RESERVEDIOService41Ev -__ZN9IOService20_RESERVEDIOService42Ev -__ZN9IOService20_RESERVEDIOService43Ev -__ZN9IOService20_RESERVEDIOService44Ev -__ZN9IOService20_RESERVEDIOService45Ev -__ZN9IOService20_RESERVEDIOService46Ev -__ZN9IOService20_RESERVEDIOService47Ev __ZN9IOService20callPlatformFunctionEPK8OSSymbolbPvS3_S3_S3_ __ZN9IOService20callPlatformFunctionEPKcbPvS2_S2_S2_ __ZN9IOService20getDeviceMemoryCountEv @@ -1551,8 +1315,6 @@ __ZNK29IOInterleavedMemoryDescriptor9MetaClass5allocEv __ZNK8IOMapper12getMetaClassEv __ZNK8IOMapper13getBypassMaskEPy __ZNK8IOMapper9MetaClass5allocEv -__ZNK8IOSyncer12getMetaClassEv -__ZNK8IOSyncer9MetaClass5allocEv __ZNK9IOCommand12getMetaClassEv __ZNK9IOCommand9MetaClass5allocEv __ZNK9IOService10isInactiveEv @@ -1620,7 +1382,6 @@ __ZTV27IOSharedInterruptController __ZTV28IOFilterInterruptEventSource __ZTV29IOInterleavedMemoryDescriptor __ZTV8IOMapper -__ZTV8IOSyncer __ZTV9IOCommand __ZTV9IOService __ZTVN10IOMachPort9MetaClassE @@ -1673,7 +1434,6 @@ __ZTVN27IOSharedInterruptController9MetaClassE __ZTVN28IOFilterInterruptEventSource9MetaClassE __ZTVN29IOInterleavedMemoryDescriptor9MetaClassE __ZTVN8IOMapper9MetaClassE -__ZTVN8IOSyncer9MetaClassE __ZTVN9IOCommand9MetaClassE __ZTVN9IOService9MetaClassE __giDebugLogDataInternal @@ -1689,8 +1449,6 @@ _debug_malloc_size _device_close _device_data_action _di_root_image -_ev_try_lock -_ev_unlock _gIOAppPowerStateInterest _gIOBusyInterest _gIOCatalogue diff --git a/config/IOKit.i386.exports b/config/IOKit.i386.exports index d83bbdde6..8dff01adc 100644 --- a/config/IOKit.i386.exports +++ b/config/IOKit.i386.exports @@ -1,5 +1,8 @@ _IOLockUnlock_darwin10:_lck_mtx_unlock_darwin10 +_IOOFPathMatching _IOPanic +_IOSpinUnlock +_IOTrySpinLock _PE_parse_boot_arg __Z16IODTFindSlotNameP15IORegistryEntrym __Z16IODTSetResolvingP15IORegistryEntryPFlmPmS1_EPFvS0_PhS4_S4_E @@ -7,6 +10,11 @@ __Z17IODTGetCellCountsP15IORegistryEntryPmS1_ 
__Z22IODTResolveAddressCellP15IORegistryEntryPmS1_S1_ __Z23IODTFindMatchingEntriesP15IORegistryEntrymPKc __ZN10IOWorkLoop19workLoopWithOptionsEm +__ZN10IOWorkLoop20_RESERVEDIOWorkLoop3Ev +__ZN10IOWorkLoop20_RESERVEDIOWorkLoop4Ev +__ZN10IOWorkLoop20_RESERVEDIOWorkLoop5Ev +__ZN10IOWorkLoop20_RESERVEDIOWorkLoop6Ev +__ZN10IOWorkLoop20_RESERVEDIOWorkLoop7Ev __ZN10IOWorkLoop9sleepGateEPv12UnsignedWidem __ZN10IOWorkLoop9sleepGateEPvm __ZN11IOCatalogue11findDriversEP12OSDictionaryPl @@ -19,6 +27,14 @@ __ZN11IODataQueue7enqueueEPvm __ZN11IOMemoryMap10getAddressEv __ZN11IOMemoryMap18getPhysicalSegmentEmPm __ZN11IOMemoryMap19setMemoryDescriptorEP18IOMemoryDescriptory +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap0Ev +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap1Ev +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap2Ev +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap3Ev +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap4Ev +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap5Ev +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap6Ev +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap7Ev __ZN11IOMemoryMap7getSizeEv __ZN11IOMemoryMap8redirectEP18IOMemoryDescriptormm __ZN11IOMemoryMap8redirectEP18IOMemoryDescriptormy @@ -33,6 +49,19 @@ __ZN12IODMACommand15genIOVMSegmentsEPFbPS_NS_9Segment64EPvmEPyS2_Pm __ZN12IODMACommand15genIOVMSegmentsEPyPvPm __ZN12IODMACommand17withSpecificationEPFbPS_NS_9Segment64EPvmEhyNS_14MappingOptionsEymP8IOMapperS2_ __ZN12IODMACommand21initWithSpecificationEPFbPS_NS_9Segment64EPvmEhyNS_14MappingOptionsEymP8IOMapperS2_ +__ZN12IODMACommand22_RESERVEDIODMACommand3Ev +__ZN12IODMACommand22_RESERVEDIODMACommand4Ev +__ZN12IODMACommand22_RESERVEDIODMACommand5Ev +__ZN12IODMACommand22_RESERVEDIODMACommand6Ev +__ZN12IODMACommand22_RESERVEDIODMACommand7Ev +__ZN12IODMACommand22_RESERVEDIODMACommand8Ev +__ZN12IODMACommand22_RESERVEDIODMACommand9Ev +__ZN12IODMACommand23_RESERVEDIODMACommand10Ev +__ZN12IODMACommand23_RESERVEDIODMACommand11Ev +__ZN12IODMACommand23_RESERVEDIODMACommand12Ev +__ZN12IODMACommand23_RESERVEDIODMACommand13Ev +__ZN12IODMACommand23_RESERVEDIODMACommand14Ev +__ZN12IODMACommand23_RESERVEDIODMACommand15Ev __ZN12IODMACommand24prepareWithSpecificationEPFbPS_NS_9Segment64EPvmEhyNS_14MappingOptionsEymP8IOMapperyybb __ZN12IODMACommand8transferEmyPvy __ZN12IOUserClient12initWithTaskEP4taskPvm @@ -43,6 +72,20 @@ __ZN12IOUserClient17mapClientMemory64EmP4taskmy __ZN12IOUserClient17sendAsyncResult64EPyiS0_m __ZN12IOUserClient19clientMemoryForTypeEmPmPP18IOMemoryDescriptor __ZN12IOUserClient19setAsyncReference64EPyP8ipc_portyy +__ZN12IOUserClient22_RESERVEDIOUserClient2Ev +__ZN12IOUserClient22_RESERVEDIOUserClient3Ev +__ZN12IOUserClient22_RESERVEDIOUserClient4Ev +__ZN12IOUserClient22_RESERVEDIOUserClient5Ev +__ZN12IOUserClient22_RESERVEDIOUserClient6Ev +__ZN12IOUserClient22_RESERVEDIOUserClient7Ev +__ZN12IOUserClient22_RESERVEDIOUserClient8Ev +__ZN12IOUserClient22_RESERVEDIOUserClient9Ev +__ZN12IOUserClient23_RESERVEDIOUserClient10Ev +__ZN12IOUserClient23_RESERVEDIOUserClient11Ev +__ZN12IOUserClient23_RESERVEDIOUserClient12Ev +__ZN12IOUserClient23_RESERVEDIOUserClient13Ev +__ZN12IOUserClient23_RESERVEDIOUserClient14Ev +__ZN12IOUserClient23_RESERVEDIOUserClient15Ev __ZN12IOUserClient23getExternalTrapForIndexEm __ZN12IOUserClient24getNotificationSemaphoreEmPP9semaphore __ZN12IOUserClient24getTargetAndTrapForIndexEPP9IOServicem @@ -54,8 +97,31 @@ __ZN12IOUserClient30getExternalAsyncMethodForIndexEm __ZN12IOUserClient31getAsyncTargetAndMethodForIndexEPP9IOServicem __ZN13IOCommandGate12commandSleepEPv12UnsignedWidem __ZN13IOCommandGate12commandSleepEPvm 
+__ZN13IOCommandGate23_RESERVEDIOCommandGate1Ev +__ZN13IOCommandGate23_RESERVEDIOCommandGate2Ev +__ZN13IOCommandGate23_RESERVEDIOCommandGate3Ev +__ZN13IOCommandGate23_RESERVEDIOCommandGate4Ev +__ZN13IOCommandGate23_RESERVEDIOCommandGate5Ev +__ZN13IOCommandGate23_RESERVEDIOCommandGate6Ev +__ZN13IOCommandGate23_RESERVEDIOCommandGate7Ev __ZN13IOCommandPool11commandPoolEP9IOServiceP10IOWorkLoopm +__ZN13IOCommandPool23_RESERVEDIOCommandPool0Ev +__ZN13IOCommandPool23_RESERVEDIOCommandPool1Ev +__ZN13IOCommandPool23_RESERVEDIOCommandPool2Ev +__ZN13IOCommandPool23_RESERVEDIOCommandPool3Ev +__ZN13IOCommandPool23_RESERVEDIOCommandPool4Ev +__ZN13IOCommandPool23_RESERVEDIOCommandPool5Ev +__ZN13IOCommandPool23_RESERVEDIOCommandPool6Ev +__ZN13IOCommandPool23_RESERVEDIOCommandPool7Ev __ZN13IOCommandPool4initEP9IOServiceP10IOWorkLoopm +__ZN13IOEventSource23_RESERVEDIOEventSource0Ev +__ZN13IOEventSource23_RESERVEDIOEventSource1Ev +__ZN13IOEventSource23_RESERVEDIOEventSource2Ev +__ZN13IOEventSource23_RESERVEDIOEventSource3Ev +__ZN13IOEventSource23_RESERVEDIOEventSource4Ev +__ZN13IOEventSource23_RESERVEDIOEventSource5Ev +__ZN13IOEventSource23_RESERVEDIOEventSource6Ev +__ZN13IOEventSource23_RESERVEDIOEventSource7Ev __ZN13IOEventSource9sleepGateEPv12UnsignedWidem __ZN13IOEventSource9sleepGateEPvm __ZN13_IOServiceJob8startJobEP9IOServiceim @@ -90,6 +156,32 @@ __ZN15IODMAController13getControllerEP9IOServicem __ZN15IODMAController16notifyDMACommandEP16IODMAEventSourceP12IODMACommandim __ZN15IODMAController20createControllerNameEm __ZN15IODMAController21registerDMAControllerEm +__ZN15IORegistryEntry25_RESERVEDIORegistryEntry6Ev +__ZN15IORegistryEntry25_RESERVEDIORegistryEntry7Ev +__ZN15IORegistryEntry25_RESERVEDIORegistryEntry8Ev +__ZN15IORegistryEntry25_RESERVEDIORegistryEntry9Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry10Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry11Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry12Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry13Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry14Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry15Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry16Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry17Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry18Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry19Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry20Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry21Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry22Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry23Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry24Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry25Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry26Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry27Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry28Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry29Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry30Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry31Ev __ZN16IODMAEventSource14dmaEventSourceEP8OSObjectP9IOServicePFvS1_PS_P12IODMACommandimES8_m __ZN16IODMAEventSource15startDMACommandEP12IODMACommand11IODirectionmm __ZN16IODMAEventSource16notifyDMACommandEP12IODMACommandim @@ -106,9 +198,33 @@ __ZN16IORangeAllocator9withRangeEmmmm __ZN17IOBigMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvm __ZN17IOBigMemoryCursor17withSpecificationEmmm __ZN17IOBigMemoryCursor21initWithSpecificationEmmm +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface0Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface1Ev 
+__ZN17IOPolledInterface27_RESERVEDIOPolledInterface2Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface3Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface4Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface5Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface6Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface7Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface8Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface9Ev +__ZN17IOPolledInterface28_RESERVEDIOPolledInterface10Ev +__ZN17IOPolledInterface28_RESERVEDIOPolledInterface11Ev +__ZN17IOPolledInterface28_RESERVEDIOPolledInterface12Ev +__ZN17IOPolledInterface28_RESERVEDIOPolledInterface13Ev +__ZN17IOPolledInterface28_RESERVEDIOPolledInterface14Ev +__ZN17IOPolledInterface28_RESERVEDIOPolledInterface15Ev __ZN17IOSharedDataQueue11withEntriesEmm __ZN17IOSharedDataQueue12withCapacityEm __ZN17IOSharedDataQueue16initWithCapacityEm +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue0Ev +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue1Ev +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue2Ev +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue3Ev +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue4Ev +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue5Ev +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue6Ev +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue7Ev __ZN17IOSharedDataQueue7dequeueEPvPm __ZN18IOMemoryDescriptor10setMappingEP4taskjm __ZN18IOMemoryDescriptor10withRangesEP14IOVirtualRangem11IODirectionP4taskb @@ -164,6 +280,14 @@ __ZN18IOTimerEventSource12wakeAtTimeMSEm __ZN18IOTimerEventSource12wakeAtTimeUSEm __ZN18IOTimerEventSource15setTimeoutTicksEm __ZN18IOTimerEventSource15wakeAtTimeTicksEm +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource0Ev +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource1Ev +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource2Ev +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource3Ev +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource4Ev +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource5Ev +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource6Ev +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource7Ev __ZN20IOLittleMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvm __ZN20IOLittleMemoryCursor17withSpecificationEmmm __ZN20IOLittleMemoryCursor21initWithSpecificationEmmm @@ -175,6 +299,12 @@ __ZN21IOInterruptController12enableVectorElP17IOInterruptVector __ZN21IOInterruptController13getVectorTypeElP17IOInterruptVector __ZN21IOInterruptController17disableVectorHardElP17IOInterruptVector __ZN21IOInterruptController17vectorCanBeSharedElP17IOInterruptVector +__ZN21IOInterruptController31_RESERVEDIOInterruptController0Ev +__ZN21IOInterruptController31_RESERVEDIOInterruptController1Ev +__ZN21IOInterruptController31_RESERVEDIOInterruptController2Ev +__ZN21IOInterruptController31_RESERVEDIOInterruptController3Ev +__ZN21IOInterruptController31_RESERVEDIOInterruptController4Ev +__ZN21IOInterruptController31_RESERVEDIOInterruptController5Ev __ZN21IONaturalMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvm __ZN21IONaturalMemoryCursor17withSpecificationEmmm __ZN21IONaturalMemoryCursor21initWithSpecificationEmmm @@ -185,6 +315,14 @@ __ZN21IOSubMemoryDescriptor12withSubRangeEP18IOMemoryDescriptormmm __ZN21IOSubMemoryDescriptor18getPhysicalSegmentEmPmm __ZN21IOSubMemoryDescriptor7prepareE11IODirection __ZN21IOSubMemoryDescriptor8completeE11IODirection +__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource0Ev 
+__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource1Ev +__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource2Ev +__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource3Ev +__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource4Ev +__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource5Ev +__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource6Ev +__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource7Ev __ZN23IOMultiMemoryDescriptor15withDescriptorsEPP18IOMemoryDescriptorm11IODirectionb __ZN23IOMultiMemoryDescriptor18getPhysicalSegmentEmPmm __ZN23IOMultiMemoryDescriptor19initWithDescriptorsEPP18IOMemoryDescriptorm11IODirectionb @@ -237,6 +375,18 @@ __ZN25IOGeneralMemoryDescriptor5doMapEP7_vm_mapPjmmm __ZN25IOGeneralMemoryDescriptor7doUnmapEP7_vm_mapjm __ZN25IOGeneralMemoryDescriptor7prepareE11IODirection __ZN25IOGeneralMemoryDescriptor8completeE11IODirection +__ZN27IOSharedInterruptController37_RESERVEDIOSharedInterruptController0Ev +__ZN27IOSharedInterruptController37_RESERVEDIOSharedInterruptController1Ev +__ZN27IOSharedInterruptController37_RESERVEDIOSharedInterruptController2Ev +__ZN27IOSharedInterruptController37_RESERVEDIOSharedInterruptController3Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource0Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource1Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource2Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource3Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource4Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource5Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource6Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource7Ev __ZN29IOInterleavedMemoryDescriptor12withCapacityEm11IODirection __ZN29IOInterleavedMemoryDescriptor16initWithCapacityEm11IODirection __ZN29IOInterleavedMemoryDescriptor18getPhysicalSegmentEmPmm @@ -249,7 +399,20 @@ __ZN8IOMapper10iovmInsertEjmP13upl_page_infom __ZN8IOMapper10iovmInsertEjmPjm __ZN8IOMapper11NewARTTableEmPPvPj __ZN8IOMapper12FreeARTTableEP6OSDatam +__ZN8IOMapper18_RESERVEDIOMapper3Ev +__ZN8IOMapper18_RESERVEDIOMapper4Ev +__ZN8IOMapper18_RESERVEDIOMapper5Ev +__ZN8IOMapper18_RESERVEDIOMapper6Ev +__ZN8IOMapper18_RESERVEDIOMapper7Ev +__ZN8IOMapper18_RESERVEDIOMapper8Ev +__ZN8IOMapper18_RESERVEDIOMapper9Ev __ZN8IOMapper18iovmFreeDMACommandEP12IODMACommandjm +__ZN8IOMapper19_RESERVEDIOMapper10Ev +__ZN8IOMapper19_RESERVEDIOMapper11Ev +__ZN8IOMapper19_RESERVEDIOMapper12Ev +__ZN8IOMapper19_RESERVEDIOMapper13Ev +__ZN8IOMapper19_RESERVEDIOMapper14Ev +__ZN8IOMapper19_RESERVEDIOMapper15Ev __ZN8IOMapper19iovmAllocDMACommandEP12IODMACommandm __ZN8IOPMprot10gMetaClassE __ZN8IOPMprot10superClassE @@ -262,6 +425,24 @@ __ZN8IOPMprotC2EPK11OSMetaClass __ZN8IOPMprotC2Ev __ZN8IOPMprotD0Ev __ZN8IOPMprotD2Ev +__ZN8IOSyncer10gMetaClassE +__ZN8IOSyncer10superClassE +__ZN8IOSyncer13privateSignalEv +__ZN8IOSyncer4freeEv +__ZN8IOSyncer4initEb +__ZN8IOSyncer4waitEb +__ZN8IOSyncer6createEb +__ZN8IOSyncer6reinitEv +__ZN8IOSyncer6signalEib +__ZN8IOSyncer9MetaClassC1Ev +__ZN8IOSyncer9MetaClassC2Ev +__ZN8IOSyncer9metaClassE +__ZN8IOSyncerC1EPK11OSMetaClass +__ZN8IOSyncerC1Ev +__ZN8IOSyncerC2EPK11OSMetaClass +__ZN8IOSyncerC2Ev +__ZN8IOSyncerD0Ev +__ZN8IOSyncerD2Ev __ZN9IOService10adjustBusyEl __ZN9IOService10handleOpenEPS_mPv __ZN9IOService10systemWakeEv @@ -303,8 +484,50 @@ 
__ZN9IOService18matchPropertyTableEP12OSDictionaryPl __ZN9IOService18requireMaxBusStallEm __ZN9IOService18settleTimerExpiredEv __ZN9IOService18systemWillShutdownEm +__ZN9IOService19_RESERVEDIOService6Ev +__ZN9IOService19_RESERVEDIOService7Ev +__ZN9IOService19_RESERVEDIOService8Ev +__ZN9IOService19_RESERVEDIOService9Ev __ZN9IOService19deliverNotificationEPK8OSSymbolmm __ZN9IOService19installNotificationEPK8OSSymbolP12OSDictionaryPFbPvS5_PS_ES5_S5_lPP10OSIterator +__ZN9IOService20_RESERVEDIOService10Ev +__ZN9IOService20_RESERVEDIOService11Ev +__ZN9IOService20_RESERVEDIOService12Ev +__ZN9IOService20_RESERVEDIOService13Ev +__ZN9IOService20_RESERVEDIOService14Ev +__ZN9IOService20_RESERVEDIOService15Ev +__ZN9IOService20_RESERVEDIOService16Ev +__ZN9IOService20_RESERVEDIOService17Ev +__ZN9IOService20_RESERVEDIOService18Ev +__ZN9IOService20_RESERVEDIOService19Ev +__ZN9IOService20_RESERVEDIOService20Ev +__ZN9IOService20_RESERVEDIOService21Ev +__ZN9IOService20_RESERVEDIOService22Ev +__ZN9IOService20_RESERVEDIOService23Ev +__ZN9IOService20_RESERVEDIOService24Ev +__ZN9IOService20_RESERVEDIOService25Ev +__ZN9IOService20_RESERVEDIOService26Ev +__ZN9IOService20_RESERVEDIOService27Ev +__ZN9IOService20_RESERVEDIOService28Ev +__ZN9IOService20_RESERVEDIOService29Ev +__ZN9IOService20_RESERVEDIOService30Ev +__ZN9IOService20_RESERVEDIOService31Ev +__ZN9IOService20_RESERVEDIOService32Ev +__ZN9IOService20_RESERVEDIOService33Ev +__ZN9IOService20_RESERVEDIOService34Ev +__ZN9IOService20_RESERVEDIOService35Ev +__ZN9IOService20_RESERVEDIOService36Ev +__ZN9IOService20_RESERVEDIOService37Ev +__ZN9IOService20_RESERVEDIOService38Ev +__ZN9IOService20_RESERVEDIOService39Ev +__ZN9IOService20_RESERVEDIOService40Ev +__ZN9IOService20_RESERVEDIOService41Ev +__ZN9IOService20_RESERVEDIOService42Ev +__ZN9IOService20_RESERVEDIOService43Ev +__ZN9IOService20_RESERVEDIOService44Ev +__ZN9IOService20_RESERVEDIOService45Ev +__ZN9IOService20_RESERVEDIOService46Ev +__ZN9IOService20_RESERVEDIOService47Ev __ZN9IOService22PM_Clamp_Timer_ExpiredEv __ZN9IOService22powerDomainDidChangeToEmP17IOPowerConnection __ZN9IOService23acknowledgeNotificationEPvm @@ -337,7 +560,13 @@ __ZNK18IOMemoryDescriptor19dmaCommandOperationEmPvj __ZNK25IOGeneralMemoryDescriptor19dmaCommandOperationEmPvj __ZNK8IOPMprot12getMetaClassEv __ZNK8IOPMprot9MetaClass5allocEv +__ZNK8IOSyncer12getMetaClassEv +__ZNK8IOSyncer9MetaClass5allocEv __ZTV14IOCommandQueue __ZTV8IOPMprot +__ZTV8IOSyncer __ZTVN14IOCommandQueue9MetaClassE __ZTVN8IOPMprot9MetaClassE +__ZTVN8IOSyncer9MetaClassE +_ev_try_lock +_ev_unlock diff --git a/config/IOKit.x86_64.exports b/config/IOKit.x86_64.exports index 6f986aea6..37e7e8d41 100644 --- a/config/IOKit.x86_64.exports +++ b/config/IOKit.x86_64.exports @@ -1,3 +1,6 @@ +_IOOFPathMatching +_IOSpinUnlock +_IOTrySpinLock __Z16IODTFindSlotNameP15IORegistryEntryj __Z16IODTSetResolvingP15IORegistryEntryPFijPjS1_EPFvS0_PhS4_S4_E __Z17IODTGetCellCountsP15IORegistryEntryPjS1_ @@ -7,6 +10,11 @@ __ZN10IOWorkLoop19workLoopWithOptionsEj __ZN10IOWorkLoop20_RESERVEDIOWorkLoop0Ev __ZN10IOWorkLoop20_RESERVEDIOWorkLoop1Ev __ZN10IOWorkLoop20_RESERVEDIOWorkLoop2Ev +__ZN10IOWorkLoop20_RESERVEDIOWorkLoop3Ev +__ZN10IOWorkLoop20_RESERVEDIOWorkLoop4Ev +__ZN10IOWorkLoop20_RESERVEDIOWorkLoop5Ev +__ZN10IOWorkLoop20_RESERVEDIOWorkLoop6Ev +__ZN10IOWorkLoop20_RESERVEDIOWorkLoop7Ev __ZN10IOWorkLoop9sleepGateEPvj __ZN10IOWorkLoop9sleepGateEPvyj __ZN11IOCatalogue11findDriversEP12OSDictionaryPi @@ -18,6 +26,14 @@ __ZN11IODataQueue16initWithCapacityEj __ZN11IODataQueue7enqueueEPvj 
__ZN11IOMemoryMap18getPhysicalSegmentEyPyj __ZN11IOMemoryMap19setMemoryDescriptorEP18IOMemoryDescriptory +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap0Ev +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap1Ev +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap2Ev +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap3Ev +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap4Ev +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap5Ev +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap6Ev +__ZN11IOMemoryMap21_RESERVEDIOMemoryMap7Ev __ZN11IOMemoryMap8redirectEP18IOMemoryDescriptorjy __ZN12IODMACommand11OutputBig32EPS_NS_9Segment64EPvj __ZN12IODMACommand11OutputBig64EPS_NS_9Segment64EPvj @@ -30,6 +46,19 @@ __ZN12IODMACommand15genIOVMSegmentsEPFbPS_NS_9Segment64EPvjEPyS2_Pj __ZN12IODMACommand15genIOVMSegmentsEPyPvPj __ZN12IODMACommand17withSpecificationEPFbPS_NS_9Segment64EPvjEhyNS_14MappingOptionsEyjP8IOMapperS2_ __ZN12IODMACommand21initWithSpecificationEPFbPS_NS_9Segment64EPvjEhyNS_14MappingOptionsEyjP8IOMapperS2_ +__ZN12IODMACommand22_RESERVEDIODMACommand3Ev +__ZN12IODMACommand22_RESERVEDIODMACommand4Ev +__ZN12IODMACommand22_RESERVEDIODMACommand5Ev +__ZN12IODMACommand22_RESERVEDIODMACommand6Ev +__ZN12IODMACommand22_RESERVEDIODMACommand7Ev +__ZN12IODMACommand22_RESERVEDIODMACommand8Ev +__ZN12IODMACommand22_RESERVEDIODMACommand9Ev +__ZN12IODMACommand23_RESERVEDIODMACommand10Ev +__ZN12IODMACommand23_RESERVEDIODMACommand11Ev +__ZN12IODMACommand23_RESERVEDIODMACommand12Ev +__ZN12IODMACommand23_RESERVEDIODMACommand13Ev +__ZN12IODMACommand23_RESERVEDIODMACommand14Ev +__ZN12IODMACommand23_RESERVEDIODMACommand15Ev __ZN12IODMACommand24prepareWithSpecificationEPFbPS_NS_9Segment64EPvjEhyNS_14MappingOptionsEyjP8IOMapperyybb __ZN12IODMACommand8transferEjyPvy __ZN12IOUserClient12initWithTaskEP4taskPvj @@ -41,6 +70,20 @@ __ZN12IOUserClient19clientMemoryForTypeEjPjPP18IOMemoryDescriptor __ZN12IOUserClient19setAsyncReference64EPyP8ipc_portyy __ZN12IOUserClient22_RESERVEDIOUserClient0Ev __ZN12IOUserClient22_RESERVEDIOUserClient1Ev +__ZN12IOUserClient22_RESERVEDIOUserClient2Ev +__ZN12IOUserClient22_RESERVEDIOUserClient3Ev +__ZN12IOUserClient22_RESERVEDIOUserClient4Ev +__ZN12IOUserClient22_RESERVEDIOUserClient5Ev +__ZN12IOUserClient22_RESERVEDIOUserClient6Ev +__ZN12IOUserClient22_RESERVEDIOUserClient7Ev +__ZN12IOUserClient22_RESERVEDIOUserClient8Ev +__ZN12IOUserClient22_RESERVEDIOUserClient9Ev +__ZN12IOUserClient23_RESERVEDIOUserClient10Ev +__ZN12IOUserClient23_RESERVEDIOUserClient11Ev +__ZN12IOUserClient23_RESERVEDIOUserClient12Ev +__ZN12IOUserClient23_RESERVEDIOUserClient13Ev +__ZN12IOUserClient23_RESERVEDIOUserClient14Ev +__ZN12IOUserClient23_RESERVEDIOUserClient15Ev __ZN12IOUserClient23getExternalTrapForIndexEj __ZN12IOUserClient24getNotificationSemaphoreEjPP9semaphore __ZN12IOUserClient24getTargetAndTrapForIndexEPP9IOServicej @@ -53,8 +96,31 @@ __ZN12IOUserClient31getAsyncTargetAndMethodForIndexEPP9IOServicej __ZN13IOCommandGate12commandSleepEPvj __ZN13IOCommandGate12commandSleepEPvyj __ZN13IOCommandGate23_RESERVEDIOCommandGate0Ev +__ZN13IOCommandGate23_RESERVEDIOCommandGate1Ev +__ZN13IOCommandGate23_RESERVEDIOCommandGate2Ev +__ZN13IOCommandGate23_RESERVEDIOCommandGate3Ev +__ZN13IOCommandGate23_RESERVEDIOCommandGate4Ev +__ZN13IOCommandGate23_RESERVEDIOCommandGate5Ev +__ZN13IOCommandGate23_RESERVEDIOCommandGate6Ev +__ZN13IOCommandGate23_RESERVEDIOCommandGate7Ev __ZN13IOCommandPool11commandPoolEP9IOServiceP10IOWorkLoopj +__ZN13IOCommandPool23_RESERVEDIOCommandPool0Ev +__ZN13IOCommandPool23_RESERVEDIOCommandPool1Ev +__ZN13IOCommandPool23_RESERVEDIOCommandPool2Ev 
+__ZN13IOCommandPool23_RESERVEDIOCommandPool3Ev +__ZN13IOCommandPool23_RESERVEDIOCommandPool4Ev +__ZN13IOCommandPool23_RESERVEDIOCommandPool5Ev +__ZN13IOCommandPool23_RESERVEDIOCommandPool6Ev +__ZN13IOCommandPool23_RESERVEDIOCommandPool7Ev __ZN13IOCommandPool4initEP9IOServiceP10IOWorkLoopj +__ZN13IOEventSource23_RESERVEDIOEventSource0Ev +__ZN13IOEventSource23_RESERVEDIOEventSource1Ev +__ZN13IOEventSource23_RESERVEDIOEventSource2Ev +__ZN13IOEventSource23_RESERVEDIOEventSource3Ev +__ZN13IOEventSource23_RESERVEDIOEventSource4Ev +__ZN13IOEventSource23_RESERVEDIOEventSource5Ev +__ZN13IOEventSource23_RESERVEDIOEventSource6Ev +__ZN13IOEventSource23_RESERVEDIOEventSource7Ev __ZN13IOEventSource9sleepGateEPvj __ZN13IOEventSource9sleepGateEPvyj __ZN13_IOServiceJob8startJobEP9IOServiceij @@ -78,6 +144,32 @@ __ZN15IORegistryEntry25_RESERVEDIORegistryEntry2Ev __ZN15IORegistryEntry25_RESERVEDIORegistryEntry3Ev __ZN15IORegistryEntry25_RESERVEDIORegistryEntry4Ev __ZN15IORegistryEntry25_RESERVEDIORegistryEntry5Ev +__ZN15IORegistryEntry25_RESERVEDIORegistryEntry6Ev +__ZN15IORegistryEntry25_RESERVEDIORegistryEntry7Ev +__ZN15IORegistryEntry25_RESERVEDIORegistryEntry8Ev +__ZN15IORegistryEntry25_RESERVEDIORegistryEntry9Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry10Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry11Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry12Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry13Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry14Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry15Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry16Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry17Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry18Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry19Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry20Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry21Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry22Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry23Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry24Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry25Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry26Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry27Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry28Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry29Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry30Ev +__ZN15IORegistryEntry26_RESERVEDIORegistryEntry31Ev __ZN16IODMAEventSource14dmaEventSourceEP8OSObjectP9IOServicePFvS1_PS_P12IODMACommandiyES8_j __ZN16IODMAEventSource15startDMACommandEP12IODMACommandjyy __ZN16IODMAEventSource16notifyDMACommandEP12IODMACommandiy @@ -94,9 +186,33 @@ __ZN16IORangeAllocator9withRangeEyyjj __ZN17IOBigMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvj __ZN17IOBigMemoryCursor17withSpecificationEyyy __ZN17IOBigMemoryCursor21initWithSpecificationEyyy +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface0Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface1Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface2Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface3Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface4Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface5Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface6Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface7Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface8Ev +__ZN17IOPolledInterface27_RESERVEDIOPolledInterface9Ev +__ZN17IOPolledInterface28_RESERVEDIOPolledInterface10Ev +__ZN17IOPolledInterface28_RESERVEDIOPolledInterface11Ev 
+__ZN17IOPolledInterface28_RESERVEDIOPolledInterface12Ev +__ZN17IOPolledInterface28_RESERVEDIOPolledInterface13Ev +__ZN17IOPolledInterface28_RESERVEDIOPolledInterface14Ev +__ZN17IOPolledInterface28_RESERVEDIOPolledInterface15Ev __ZN17IOSharedDataQueue11withEntriesEjj __ZN17IOSharedDataQueue12withCapacityEj __ZN17IOSharedDataQueue16initWithCapacityEj +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue0Ev +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue1Ev +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue2Ev +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue3Ev +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue4Ev +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue5Ev +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue6Ev +__ZN17IOSharedDataQueue27_RESERVEDIOSharedDataQueue7Ev __ZN17IOSharedDataQueue7dequeueEPvPj __ZN18IOMemoryDescriptor10setMappingEP4taskyj __ZN18IOMemoryDescriptor10writeBytesEyPKvy @@ -142,6 +258,14 @@ __ZN18IOTimerEventSource12wakeAtTimeMSEj __ZN18IOTimerEventSource12wakeAtTimeUSEj __ZN18IOTimerEventSource15setTimeoutTicksEj __ZN18IOTimerEventSource15wakeAtTimeTicksEj +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource0Ev +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource1Ev +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource2Ev +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource3Ev +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource4Ev +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource5Ev +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource6Ev +__ZN18IOTimerEventSource28_RESERVEDIOTimerEventSource7Ev __ZN20IOLittleMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvj __ZN20IOLittleMemoryCursor17withSpecificationEyyy __ZN20IOLittleMemoryCursor21initWithSpecificationEyyy @@ -153,6 +277,12 @@ __ZN21IOInterruptController12enableVectorEiP17IOInterruptVector __ZN21IOInterruptController13getVectorTypeEiP17IOInterruptVector __ZN21IOInterruptController17disableVectorHardEiP17IOInterruptVector __ZN21IOInterruptController17vectorCanBeSharedEiP17IOInterruptVector +__ZN21IOInterruptController31_RESERVEDIOInterruptController0Ev +__ZN21IOInterruptController31_RESERVEDIOInterruptController1Ev +__ZN21IOInterruptController31_RESERVEDIOInterruptController2Ev +__ZN21IOInterruptController31_RESERVEDIOInterruptController3Ev +__ZN21IOInterruptController31_RESERVEDIOInterruptController4Ev +__ZN21IOInterruptController31_RESERVEDIOInterruptController5Ev __ZN21IONaturalMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvj __ZN21IONaturalMemoryCursor17withSpecificationEyyy __ZN21IONaturalMemoryCursor21initWithSpecificationEyyy @@ -163,6 +293,14 @@ __ZN21IOSubMemoryDescriptor12withSubRangeEP18IOMemoryDescriptoryyj __ZN21IOSubMemoryDescriptor18getPhysicalSegmentEyPyj __ZN21IOSubMemoryDescriptor7prepareEj __ZN21IOSubMemoryDescriptor8completeEj +__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource0Ev +__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource1Ev +__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource2Ev +__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource3Ev +__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource4Ev +__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource5Ev +__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource6Ev +__ZN22IOInterruptEventSource32_RESERVEDIOInterruptEventSource7Ev __ZN23IOMultiMemoryDescriptor15withDescriptorsEPP18IOMemoryDescriptorjjb __ZN23IOMultiMemoryDescriptor19initWithDescriptorsEPP18IOMemoryDescriptorjjb 
__ZN23IOMultiMemoryDescriptor7prepareEj @@ -201,6 +339,18 @@ __ZN25IOGeneralMemoryDescriptor5doMapEP7_vm_mapPyjyy __ZN25IOGeneralMemoryDescriptor7doUnmapEP7_vm_mapyy __ZN25IOGeneralMemoryDescriptor7prepareEj __ZN25IOGeneralMemoryDescriptor8completeEj +__ZN27IOSharedInterruptController37_RESERVEDIOSharedInterruptController0Ev +__ZN27IOSharedInterruptController37_RESERVEDIOSharedInterruptController1Ev +__ZN27IOSharedInterruptController37_RESERVEDIOSharedInterruptController2Ev +__ZN27IOSharedInterruptController37_RESERVEDIOSharedInterruptController3Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource0Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource1Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource2Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource3Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource4Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource5Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource6Ev +__ZN28IOFilterInterruptEventSource38_RESERVEDIOFilterInterruptEventSource7Ev __ZN29IOInterleavedMemoryDescriptor12withCapacityEyj __ZN29IOInterleavedMemoryDescriptor16initWithCapacityEyj __ZN29IOInterleavedMemoryDescriptor19setMemoryDescriptorEP18IOMemoryDescriptoryy @@ -212,8 +362,39 @@ __ZN8IOMapper10iovmInsertEjjP13upl_page_infoj __ZN8IOMapper10iovmInsertEjjPjj __ZN8IOMapper11NewARTTableEyPPvPj __ZN8IOMapper12FreeARTTableEP6OSDatay +__ZN8IOMapper18_RESERVEDIOMapper3Ev +__ZN8IOMapper18_RESERVEDIOMapper4Ev +__ZN8IOMapper18_RESERVEDIOMapper5Ev +__ZN8IOMapper18_RESERVEDIOMapper6Ev +__ZN8IOMapper18_RESERVEDIOMapper7Ev +__ZN8IOMapper18_RESERVEDIOMapper8Ev +__ZN8IOMapper18_RESERVEDIOMapper9Ev __ZN8IOMapper18iovmFreeDMACommandEP12IODMACommandjj +__ZN8IOMapper19_RESERVEDIOMapper10Ev +__ZN8IOMapper19_RESERVEDIOMapper11Ev +__ZN8IOMapper19_RESERVEDIOMapper12Ev +__ZN8IOMapper19_RESERVEDIOMapper13Ev +__ZN8IOMapper19_RESERVEDIOMapper14Ev +__ZN8IOMapper19_RESERVEDIOMapper15Ev __ZN8IOMapper19iovmAllocDMACommandEP12IODMACommandj +__ZN8IOSyncer10gMetaClassE +__ZN8IOSyncer10superClassE +__ZN8IOSyncer13privateSignalEv +__ZN8IOSyncer4freeEv +__ZN8IOSyncer4initEb +__ZN8IOSyncer4waitEb +__ZN8IOSyncer6createEb +__ZN8IOSyncer6reinitEv +__ZN8IOSyncer6signalEib +__ZN8IOSyncer9MetaClassC1Ev +__ZN8IOSyncer9MetaClassC2Ev +__ZN8IOSyncer9metaClassE +__ZN8IOSyncerC1EPK11OSMetaClass +__ZN8IOSyncerC1Ev +__ZN8IOSyncerC2EPK11OSMetaClass +__ZN8IOSyncerC2Ev +__ZN8IOSyncerD0Ev +__ZN8IOSyncerD2Ev __ZN9IOService10adjustBusyEi __ZN9IOService10handleOpenEPS_jPv __ZN9IOService11_adjustBusyEi @@ -247,7 +428,49 @@ __ZN9IOService19_RESERVEDIOService2Ev __ZN9IOService19_RESERVEDIOService3Ev __ZN9IOService19_RESERVEDIOService4Ev __ZN9IOService19_RESERVEDIOService5Ev +__ZN9IOService19_RESERVEDIOService6Ev +__ZN9IOService19_RESERVEDIOService7Ev +__ZN9IOService19_RESERVEDIOService8Ev +__ZN9IOService19_RESERVEDIOService9Ev __ZN9IOService19deliverNotificationEPK8OSSymboljj +__ZN9IOService20_RESERVEDIOService10Ev +__ZN9IOService20_RESERVEDIOService11Ev +__ZN9IOService20_RESERVEDIOService12Ev +__ZN9IOService20_RESERVEDIOService13Ev +__ZN9IOService20_RESERVEDIOService14Ev +__ZN9IOService20_RESERVEDIOService15Ev +__ZN9IOService20_RESERVEDIOService16Ev +__ZN9IOService20_RESERVEDIOService17Ev +__ZN9IOService20_RESERVEDIOService18Ev +__ZN9IOService20_RESERVEDIOService19Ev +__ZN9IOService20_RESERVEDIOService20Ev +__ZN9IOService20_RESERVEDIOService21Ev 
+__ZN9IOService20_RESERVEDIOService22Ev +__ZN9IOService20_RESERVEDIOService23Ev +__ZN9IOService20_RESERVEDIOService24Ev +__ZN9IOService20_RESERVEDIOService25Ev +__ZN9IOService20_RESERVEDIOService26Ev +__ZN9IOService20_RESERVEDIOService27Ev +__ZN9IOService20_RESERVEDIOService28Ev +__ZN9IOService20_RESERVEDIOService29Ev +__ZN9IOService20_RESERVEDIOService30Ev +__ZN9IOService20_RESERVEDIOService31Ev +__ZN9IOService20_RESERVEDIOService32Ev +__ZN9IOService20_RESERVEDIOService33Ev +__ZN9IOService20_RESERVEDIOService34Ev +__ZN9IOService20_RESERVEDIOService35Ev +__ZN9IOService20_RESERVEDIOService36Ev +__ZN9IOService20_RESERVEDIOService37Ev +__ZN9IOService20_RESERVEDIOService38Ev +__ZN9IOService20_RESERVEDIOService39Ev +__ZN9IOService20_RESERVEDIOService40Ev +__ZN9IOService20_RESERVEDIOService41Ev +__ZN9IOService20_RESERVEDIOService42Ev +__ZN9IOService20_RESERVEDIOService43Ev +__ZN9IOService20_RESERVEDIOService44Ev +__ZN9IOService20_RESERVEDIOService45Ev +__ZN9IOService20_RESERVEDIOService46Ev +__ZN9IOService20_RESERVEDIOService47Ev __ZN9IOService23acknowledgeNotificationEPvj __ZN9IOService23addMatchingNotificationEPK8OSSymbolP12OSDictionaryPFbPvS5_PS_P10IONotifierES5_S5_i __ZN9IOService23scheduleTerminatePhase2Ej @@ -266,3 +489,9 @@ __ZNK15IORegistryEntry12copyPropertyEPK8OSSymbolPK15IORegistryPlanej __ZNK15IORegistryEntry12copyPropertyEPKcPK15IORegistryPlanej __ZNK18IOMemoryDescriptor19dmaCommandOperationEjPvj __ZNK25IOGeneralMemoryDescriptor19dmaCommandOperationEjPvj +__ZNK8IOSyncer12getMetaClassEv +__ZNK8IOSyncer9MetaClass5allocEv +__ZTV8IOSyncer +__ZTVN8IOSyncer9MetaClassE +_ev_try_lock +_ev_unlock diff --git a/config/Libkern.exports b/config/Libkern.exports index 4bd05a193..b310d501a 100644 --- a/config/Libkern.exports +++ b/config/Libkern.exports @@ -1,4 +1,3 @@ -___bzero _Assert _MD5Final _MD5Init @@ -50,16 +49,10 @@ __Z13OSUnserializePKcPP8OSString __Z16OSUnserializeXMLPKcPP8OSString __ZN10OSIterator10gMetaClassE __ZN10OSIterator10superClassE -__ZN10OSIterator20_RESERVEDOSIterator0Ev -__ZN10OSIterator20_RESERVEDOSIterator1Ev -__ZN10OSIterator20_RESERVEDOSIterator2Ev -__ZN10OSIterator20_RESERVEDOSIterator3Ev __ZN10OSIterator9MetaClassC1Ev __ZN10OSIterator9MetaClassC2Ev __ZN10OSIterator9metaClassE -__ZN10OSIteratorC1EPK11OSMetaClass __ZN10OSIteratorC2EPK11OSMetaClass -__ZN10OSIteratorD0Ev __ZN10OSIteratorD2Ev __ZN11OSMetaClass10preModLoadEPKc __ZN11OSMetaClass11postModLoadEPv @@ -73,23 +66,13 @@ __ZN11OSMetaClass18getClassDictionaryEv __ZN11OSMetaClass18reportModInstancesEPKc __ZN11OSMetaClass19printInstanceCountsEv __ZN11OSMetaClass20getMetaClassWithNameEPK8OSSymbol -__ZN11OSMetaClass21_RESERVEDOSMetaClass0Ev -__ZN11OSMetaClass21_RESERVEDOSMetaClass1Ev -__ZN11OSMetaClass21_RESERVEDOSMetaClass2Ev -__ZN11OSMetaClass21_RESERVEDOSMetaClass3Ev -__ZN11OSMetaClass21_RESERVEDOSMetaClass4Ev -__ZN11OSMetaClass21_RESERVEDOSMetaClass5Ev -__ZN11OSMetaClass21_RESERVEDOSMetaClass6Ev -__ZN11OSMetaClass21_RESERVEDOSMetaClass7Ev __ZN11OSMetaClass21checkMetaCastWithNameEPK8OSStringPK15OSMetaClassBase __ZN11OSMetaClass21checkMetaCastWithNameEPK8OSSymbolPK15OSMetaClassBase __ZN11OSMetaClass21checkMetaCastWithNameEPKcPK15OSMetaClassBase __ZN11OSMetaClass24serializeClassDictionaryEP12OSDictionary __ZN11OSMetaClass8logErrorEi __ZN11OSMetaClass9metaClassE -__ZN11OSMetaClassC1EPKcPKS_j __ZN11OSMetaClassC2EPKcPKS_j -__ZN11OSMetaClassD0Ev __ZN11OSMetaClassD2Ev __ZN11OSMetaClassdlEPvm __ZN11OSMetaClassnwEm @@ -102,14 +85,6 @@ __ZN11OSSerialize14ensureCapacityEj __ZN11OSSerialize16initWithCapacityEj 
__ZN11OSSerialize20previouslySerializedEPK15OSMetaClassBase __ZN11OSSerialize20setCapacityIncrementEj -__ZN11OSSerialize21_RESERVEDOSSerialize0Ev -__ZN11OSSerialize21_RESERVEDOSSerialize1Ev -__ZN11OSSerialize21_RESERVEDOSSerialize2Ev -__ZN11OSSerialize21_RESERVEDOSSerialize3Ev -__ZN11OSSerialize21_RESERVEDOSSerialize4Ev -__ZN11OSSerialize21_RESERVEDOSSerialize5Ev -__ZN11OSSerialize21_RESERVEDOSSerialize6Ev -__ZN11OSSerialize21_RESERVEDOSSerialize7Ev __ZN11OSSerialize4freeEv __ZN11OSSerialize7addCharEc __ZN11OSSerialize9MetaClassC1Ev @@ -128,19 +103,11 @@ __ZN12OSCollection10setOptionsEjjPv __ZN12OSCollection10superClassE __ZN12OSCollection11haveUpdatedEv __ZN12OSCollection14copyCollectionEP12OSDictionary -__ZN12OSCollection22_RESERVEDOSCollection2Ev -__ZN12OSCollection22_RESERVEDOSCollection3Ev -__ZN12OSCollection22_RESERVEDOSCollection4Ev -__ZN12OSCollection22_RESERVEDOSCollection5Ev -__ZN12OSCollection22_RESERVEDOSCollection6Ev -__ZN12OSCollection22_RESERVEDOSCollection7Ev __ZN12OSCollection4initEv __ZN12OSCollection9MetaClassC1Ev __ZN12OSCollection9MetaClassC2Ev __ZN12OSCollection9metaClassE -__ZN12OSCollectionC1EPK11OSMetaClass __ZN12OSCollectionC2EPK11OSMetaClass -__ZN12OSCollectionD0Ev __ZN12OSCollectionD2Ev __ZN12OSDictionary10gMetaClassE __ZN12OSDictionary10setOptionsEjjPv @@ -160,14 +127,6 @@ __ZN12OSDictionary15initWithObjectsEPPK8OSObjectPPK8OSSymboljj __ZN12OSDictionary16initWithCapacityEj __ZN12OSDictionary18initWithDictionaryEPKS_j __ZN12OSDictionary20setCapacityIncrementEj -__ZN12OSDictionary22_RESERVEDOSDictionary0Ev -__ZN12OSDictionary22_RESERVEDOSDictionary1Ev -__ZN12OSDictionary22_RESERVEDOSDictionary2Ev -__ZN12OSDictionary22_RESERVEDOSDictionary3Ev -__ZN12OSDictionary22_RESERVEDOSDictionary4Ev -__ZN12OSDictionary22_RESERVEDOSDictionary5Ev -__ZN12OSDictionary22_RESERVEDOSDictionary6Ev -__ZN12OSDictionary22_RESERVEDOSDictionary7Ev __ZN12OSDictionary4freeEv __ZN12OSDictionary5mergeEPKS_ __ZN12OSDictionary9MetaClassC1Ev @@ -194,14 +153,6 @@ __ZN12OSOrderedSet14getOrderingRefEv __ZN12OSOrderedSet14setFirstObjectEPK15OSMetaClassBase __ZN12OSOrderedSet15flushCollectionEv __ZN12OSOrderedSet20setCapacityIncrementEj -__ZN12OSOrderedSet22_RESERVEDOSOrderedSet0Ev -__ZN12OSOrderedSet22_RESERVEDOSOrderedSet1Ev -__ZN12OSOrderedSet22_RESERVEDOSOrderedSet2Ev -__ZN12OSOrderedSet22_RESERVEDOSOrderedSet3Ev -__ZN12OSOrderedSet22_RESERVEDOSOrderedSet4Ev -__ZN12OSOrderedSet22_RESERVEDOSOrderedSet5Ev -__ZN12OSOrderedSet22_RESERVEDOSOrderedSet6Ev -__ZN12OSOrderedSet22_RESERVEDOSOrderedSet7Ev __ZN12OSOrderedSet4freeEv __ZN12OSOrderedSet9MetaClassC1Ev __ZN12OSOrderedSet9MetaClassC2Ev @@ -243,14 +194,7 @@ __ZN12OSSymbolPooldlEPvm __ZN12OSSymbolPoolnwEm __ZN15OSMetaClassBase12safeMetaCastEPKS_PK11OSMetaClass __ZN15OSMetaClassBase13checkTypeInstEPKS_S1_ -__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase3Ev -__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase4Ev -__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase5Ev -__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase6Ev -__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase7Ev -__ZN15OSMetaClassBaseC1Ev __ZN15OSMetaClassBaseC2Ev -__ZN15OSMetaClassBaseD0Ev __ZN15OSMetaClassBaseD2Ev __ZN15OSMetaClassMetaC1Ev __ZN15OSMetaClassMetaC2Ev @@ -281,14 +225,6 @@ __ZN5OSSet12withCapacityEj __ZN5OSSet13initWithArrayEPK7OSArrayj __ZN5OSSet14copyCollectionEP12OSDictionary __ZN5OSSet14ensureCapacityEj -__ZN5OSSet15_RESERVEDOSSet0Ev -__ZN5OSSet15_RESERVEDOSSet1Ev -__ZN5OSSet15_RESERVEDOSSet2Ev -__ZN5OSSet15_RESERVEDOSSet3Ev -__ZN5OSSet15_RESERVEDOSSet4Ev 
-__ZN5OSSet15_RESERVEDOSSet5Ev -__ZN5OSSet15_RESERVEDOSSet6Ev -__ZN5OSSet15_RESERVEDOSSet7Ev __ZN5OSSet15flushCollectionEv __ZN5OSSet15initWithObjectsEPPK8OSObjectjj __ZN5OSSet16initWithCapacityEj @@ -319,13 +255,6 @@ __ZN6OSData12withCapacityEj __ZN6OSData13initWithBytesEPKvj __ZN6OSData14ensureCapacityEj __ZN6OSData15withBytesNoCopyEPvj -__ZN6OSData16_RESERVEDOSData1Ev -__ZN6OSData16_RESERVEDOSData2Ev -__ZN6OSData16_RESERVEDOSData3Ev -__ZN6OSData16_RESERVEDOSData4Ev -__ZN6OSData16_RESERVEDOSData5Ev -__ZN6OSData16_RESERVEDOSData6Ev -__ZN6OSData16_RESERVEDOSData7Ev __ZN6OSData16initWithCapacityEj __ZN6OSData18setDeallocFunctionEPFvPvjE __ZN6OSData19initWithBytesNoCopyEPvj @@ -356,14 +285,6 @@ __ZN7OSArray14ensureCapacityEj __ZN7OSArray15flushCollectionEv __ZN7OSArray15initWithObjectsEPPK8OSObjectjj __ZN7OSArray16initWithCapacityEj -__ZN7OSArray17_RESERVEDOSArray0Ev -__ZN7OSArray17_RESERVEDOSArray1Ev -__ZN7OSArray17_RESERVEDOSArray2Ev -__ZN7OSArray17_RESERVEDOSArray3Ev -__ZN7OSArray17_RESERVEDOSArray4Ev -__ZN7OSArray17_RESERVEDOSArray5Ev -__ZN7OSArray17_RESERVEDOSArray6Ev -__ZN7OSArray17_RESERVEDOSArray7Ev __ZN7OSArray20setCapacityIncrementEj __ZN7OSArray4freeEv __ZN7OSArray5mergeEPKS_ @@ -383,14 +304,6 @@ __ZN8OSNumber10gMetaClassE __ZN8OSNumber10superClassE __ZN8OSNumber10withNumberEPKcj __ZN8OSNumber10withNumberEyj -__ZN8OSNumber18_RESERVEDOSNumber0Ev -__ZN8OSNumber18_RESERVEDOSNumber1Ev -__ZN8OSNumber18_RESERVEDOSNumber2Ev -__ZN8OSNumber18_RESERVEDOSNumber3Ev -__ZN8OSNumber18_RESERVEDOSNumber4Ev -__ZN8OSNumber18_RESERVEDOSNumber5Ev -__ZN8OSNumber18_RESERVEDOSNumber6Ev -__ZN8OSNumber18_RESERVEDOSNumber7Ev __ZN8OSNumber4freeEv __ZN8OSNumber4initEPKcj __ZN8OSNumber4initEyj @@ -407,22 +320,6 @@ __ZN8OSNumberD0Ev __ZN8OSNumberD2Ev __ZN8OSObject10gMetaClassE __ZN8OSObject10superClassE -__ZN8OSObject18_RESERVEDOSObject0Ev -__ZN8OSObject18_RESERVEDOSObject1Ev -__ZN8OSObject18_RESERVEDOSObject2Ev -__ZN8OSObject18_RESERVEDOSObject3Ev -__ZN8OSObject18_RESERVEDOSObject4Ev -__ZN8OSObject18_RESERVEDOSObject5Ev -__ZN8OSObject18_RESERVEDOSObject6Ev -__ZN8OSObject18_RESERVEDOSObject7Ev -__ZN8OSObject18_RESERVEDOSObject8Ev -__ZN8OSObject18_RESERVEDOSObject9Ev -__ZN8OSObject19_RESERVEDOSObject10Ev -__ZN8OSObject19_RESERVEDOSObject11Ev -__ZN8OSObject19_RESERVEDOSObject12Ev -__ZN8OSObject19_RESERVEDOSObject13Ev -__ZN8OSObject19_RESERVEDOSObject14Ev -__ZN8OSObject19_RESERVEDOSObject15Ev __ZN8OSObject4freeEv __ZN8OSObject4initEv __ZN8OSObject9MetaClassC1Ev @@ -443,22 +340,6 @@ __ZN8OSString11withCStringEPKc __ZN8OSString14initWithStringEPKS_ __ZN8OSString15initWithCStringEPKc __ZN8OSString17withCStringNoCopyEPKc -__ZN8OSString18_RESERVEDOSString0Ev -__ZN8OSString18_RESERVEDOSString1Ev -__ZN8OSString18_RESERVEDOSString2Ev -__ZN8OSString18_RESERVEDOSString3Ev -__ZN8OSString18_RESERVEDOSString4Ev -__ZN8OSString18_RESERVEDOSString5Ev -__ZN8OSString18_RESERVEDOSString6Ev -__ZN8OSString18_RESERVEDOSString7Ev -__ZN8OSString18_RESERVEDOSString8Ev -__ZN8OSString18_RESERVEDOSString9Ev -__ZN8OSString19_RESERVEDOSString10Ev -__ZN8OSString19_RESERVEDOSString11Ev -__ZN8OSString19_RESERVEDOSString12Ev -__ZN8OSString19_RESERVEDOSString13Ev -__ZN8OSString19_RESERVEDOSString14Ev -__ZN8OSString19_RESERVEDOSString15Ev __ZN8OSString21initWithCStringNoCopyEPKc __ZN8OSString4freeEv __ZN8OSString7setCharEcj @@ -479,14 +360,6 @@ __ZN8OSSymbol11withCStringEPKc __ZN8OSSymbol14initWithStringEPK8OSString __ZN8OSSymbol15initWithCStringEPKc __ZN8OSSymbol17withCStringNoCopyEPKc -__ZN8OSSymbol18_RESERVEDOSSymbol0Ev 
-__ZN8OSSymbol18_RESERVEDOSSymbol1Ev -__ZN8OSSymbol18_RESERVEDOSSymbol2Ev -__ZN8OSSymbol18_RESERVEDOSSymbol3Ev -__ZN8OSSymbol18_RESERVEDOSSymbol4Ev -__ZN8OSSymbol18_RESERVEDOSSymbol5Ev -__ZN8OSSymbol18_RESERVEDOSSymbol6Ev -__ZN8OSSymbol18_RESERVEDOSSymbol7Ev __ZN8OSSymbol18checkForPageUnloadEPvS0_ __ZN8OSSymbol21initWithCStringNoCopyEPKc __ZN8OSSymbol4freeEv @@ -503,14 +376,6 @@ __ZN9OSBoolean10gMetaClassE __ZN9OSBoolean10initializeEv __ZN9OSBoolean10superClassE __ZN9OSBoolean11withBooleanEb -__ZN9OSBoolean19_RESERVEDOSBoolean0Ev -__ZN9OSBoolean19_RESERVEDOSBoolean1Ev -__ZN9OSBoolean19_RESERVEDOSBoolean2Ev -__ZN9OSBoolean19_RESERVEDOSBoolean3Ev -__ZN9OSBoolean19_RESERVEDOSBoolean4Ev -__ZN9OSBoolean19_RESERVEDOSBoolean5Ev -__ZN9OSBoolean19_RESERVEDOSBoolean6Ev -__ZN9OSBoolean19_RESERVEDOSBoolean7Ev __ZN9OSBoolean4freeEv __ZN9OSBoolean9MetaClassC1Ev __ZN9OSBoolean9MetaClassC2Ev @@ -717,6 +582,7 @@ __ZdaPv __ZdlPv __Znam __Znwm +___bzero ___cxa_pure_virtual ___stack_chk_fail ___stack_chk_guard @@ -857,7 +723,6 @@ _version_stage _version_variant _vprintf _vsnprintf -_vsprintf _vsscanf _zError _zlibVersion diff --git a/config/Libkern.i386.exports b/config/Libkern.i386.exports index d1a97b9ee..f9ef1120e 100644 --- a/config/Libkern.i386.exports +++ b/config/Libkern.i386.exports @@ -1,15 +1,142 @@ -_lck_mtx_unlock_darwin10 -_lck_mtx_lock_spin -_lck_mtx_try_lock_spin -_lck_mtx_convert_spin _OSAddAtomic64 _OSCompareAndSwap64 _OSRuntimeFinalizeCPP _OSRuntimeInitializeCPP _OSRuntimeUnloadCPP _OSRuntimeUnloadCPPForSegment +__ZN10OSIterator20_RESERVEDOSIterator0Ev +__ZN10OSIterator20_RESERVEDOSIterator1Ev +__ZN10OSIterator20_RESERVEDOSIterator2Ev +__ZN10OSIterator20_RESERVEDOSIterator3Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass0Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass1Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass2Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass3Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass4Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass5Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass6Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass7Ev +__ZN11OSSerialize21_RESERVEDOSSerialize0Ev +__ZN11OSSerialize21_RESERVEDOSSerialize1Ev +__ZN11OSSerialize21_RESERVEDOSSerialize2Ev +__ZN11OSSerialize21_RESERVEDOSSerialize3Ev +__ZN11OSSerialize21_RESERVEDOSSerialize4Ev +__ZN11OSSerialize21_RESERVEDOSSerialize5Ev +__ZN11OSSerialize21_RESERVEDOSSerialize6Ev +__ZN11OSSerialize21_RESERVEDOSSerialize7Ev +__ZN12OSCollection22_RESERVEDOSCollection2Ev +__ZN12OSCollection22_RESERVEDOSCollection3Ev +__ZN12OSCollection22_RESERVEDOSCollection4Ev +__ZN12OSCollection22_RESERVEDOSCollection5Ev +__ZN12OSCollection22_RESERVEDOSCollection6Ev +__ZN12OSCollection22_RESERVEDOSCollection7Ev +__ZN12OSDictionary22_RESERVEDOSDictionary0Ev +__ZN12OSDictionary22_RESERVEDOSDictionary1Ev +__ZN12OSDictionary22_RESERVEDOSDictionary2Ev +__ZN12OSDictionary22_RESERVEDOSDictionary3Ev +__ZN12OSDictionary22_RESERVEDOSDictionary4Ev +__ZN12OSDictionary22_RESERVEDOSDictionary5Ev +__ZN12OSDictionary22_RESERVEDOSDictionary6Ev +__ZN12OSDictionary22_RESERVEDOSDictionary7Ev __ZN12OSOrderedSet12withCapacityEjPFlPK15OSMetaClassBaseS2_PvES3_ __ZN12OSOrderedSet16initWithCapacityEjPFlPK15OSMetaClassBaseS2_PvES3_ +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet0Ev +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet1Ev +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet2Ev +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet3Ev +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet4Ev +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet5Ev +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet6Ev +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet7Ev 
+__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase3Ev +__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase4Ev +__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase5Ev +__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase6Ev +__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase7Ev +__ZN5OSSet15_RESERVEDOSSet0Ev +__ZN5OSSet15_RESERVEDOSSet1Ev +__ZN5OSSet15_RESERVEDOSSet2Ev +__ZN5OSSet15_RESERVEDOSSet3Ev +__ZN5OSSet15_RESERVEDOSSet4Ev +__ZN5OSSet15_RESERVEDOSSet5Ev +__ZN5OSSet15_RESERVEDOSSet6Ev +__ZN5OSSet15_RESERVEDOSSet7Ev +__ZN6OSData16_RESERVEDOSData1Ev +__ZN6OSData16_RESERVEDOSData2Ev +__ZN6OSData16_RESERVEDOSData3Ev +__ZN6OSData16_RESERVEDOSData4Ev +__ZN6OSData16_RESERVEDOSData5Ev +__ZN6OSData16_RESERVEDOSData6Ev +__ZN6OSData16_RESERVEDOSData7Ev +__ZN7OSArray17_RESERVEDOSArray0Ev +__ZN7OSArray17_RESERVEDOSArray1Ev +__ZN7OSArray17_RESERVEDOSArray2Ev +__ZN7OSArray17_RESERVEDOSArray3Ev +__ZN7OSArray17_RESERVEDOSArray4Ev +__ZN7OSArray17_RESERVEDOSArray5Ev +__ZN7OSArray17_RESERVEDOSArray6Ev +__ZN7OSArray17_RESERVEDOSArray7Ev +__ZN8OSNumber18_RESERVEDOSNumber0Ev +__ZN8OSNumber18_RESERVEDOSNumber1Ev +__ZN8OSNumber18_RESERVEDOSNumber2Ev +__ZN8OSNumber18_RESERVEDOSNumber3Ev +__ZN8OSNumber18_RESERVEDOSNumber4Ev +__ZN8OSNumber18_RESERVEDOSNumber5Ev +__ZN8OSNumber18_RESERVEDOSNumber6Ev +__ZN8OSNumber18_RESERVEDOSNumber7Ev +__ZN8OSObject18_RESERVEDOSObject0Ev +__ZN8OSObject18_RESERVEDOSObject1Ev +__ZN8OSObject18_RESERVEDOSObject2Ev +__ZN8OSObject18_RESERVEDOSObject3Ev +__ZN8OSObject18_RESERVEDOSObject4Ev +__ZN8OSObject18_RESERVEDOSObject5Ev +__ZN8OSObject18_RESERVEDOSObject6Ev +__ZN8OSObject18_RESERVEDOSObject7Ev +__ZN8OSObject18_RESERVEDOSObject8Ev +__ZN8OSObject18_RESERVEDOSObject9Ev +__ZN8OSObject19_RESERVEDOSObject10Ev +__ZN8OSObject19_RESERVEDOSObject11Ev +__ZN8OSObject19_RESERVEDOSObject12Ev +__ZN8OSObject19_RESERVEDOSObject13Ev +__ZN8OSObject19_RESERVEDOSObject14Ev +__ZN8OSObject19_RESERVEDOSObject15Ev +__ZN8OSString18_RESERVEDOSString0Ev +__ZN8OSString18_RESERVEDOSString1Ev +__ZN8OSString18_RESERVEDOSString2Ev +__ZN8OSString18_RESERVEDOSString3Ev +__ZN8OSString18_RESERVEDOSString4Ev +__ZN8OSString18_RESERVEDOSString5Ev +__ZN8OSString18_RESERVEDOSString6Ev +__ZN8OSString18_RESERVEDOSString7Ev +__ZN8OSString18_RESERVEDOSString8Ev +__ZN8OSString18_RESERVEDOSString9Ev +__ZN8OSString19_RESERVEDOSString10Ev +__ZN8OSString19_RESERVEDOSString11Ev +__ZN8OSString19_RESERVEDOSString12Ev +__ZN8OSString19_RESERVEDOSString13Ev +__ZN8OSString19_RESERVEDOSString14Ev +__ZN8OSString19_RESERVEDOSString15Ev +__ZN8OSSymbol18_RESERVEDOSSymbol0Ev +__ZN8OSSymbol18_RESERVEDOSSymbol1Ev +__ZN8OSSymbol18_RESERVEDOSSymbol2Ev +__ZN8OSSymbol18_RESERVEDOSSymbol3Ev +__ZN8OSSymbol18_RESERVEDOSSymbol4Ev +__ZN8OSSymbol18_RESERVEDOSSymbol5Ev +__ZN8OSSymbol18_RESERVEDOSSymbol6Ev +__ZN8OSSymbol18_RESERVEDOSSymbol7Ev +__ZN9OSBoolean19_RESERVEDOSBoolean0Ev +__ZN9OSBoolean19_RESERVEDOSBoolean1Ev +__ZN9OSBoolean19_RESERVEDOSBoolean2Ev +__ZN9OSBoolean19_RESERVEDOSBoolean3Ev +__ZN9OSBoolean19_RESERVEDOSBoolean4Ev +__ZN9OSBoolean19_RESERVEDOSBoolean5Ev +__ZN9OSBoolean19_RESERVEDOSBoolean6Ev +__ZN9OSBoolean19_RESERVEDOSBoolean7Ev +_lck_mtx_convert_spin +_lck_mtx_lock_spin +_lck_mtx_try_lock_spin +_lck_mtx_unlock_darwin10 _sprintf _strcat _strcpy +_vsprintf diff --git a/config/Libkern.x86_64.exports b/config/Libkern.x86_64.exports index c42f577d8..f67db63a8 100644 --- a/config/Libkern.x86_64.exports +++ b/config/Libkern.x86_64.exports @@ -1,10 +1,137 @@ -_lck_mtx_lock_spin -_lck_mtx_try_lock_spin -_lck_mtx_convert_spin _OSAddAtomic64 _OSCompareAndSwap64 
+__ZN10OSIterator20_RESERVEDOSIterator0Ev +__ZN10OSIterator20_RESERVEDOSIterator1Ev +__ZN10OSIterator20_RESERVEDOSIterator2Ev +__ZN10OSIterator20_RESERVEDOSIterator3Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass0Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass1Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass2Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass3Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass4Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass5Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass6Ev +__ZN11OSMetaClass21_RESERVEDOSMetaClass7Ev +__ZN11OSSerialize21_RESERVEDOSSerialize0Ev +__ZN11OSSerialize21_RESERVEDOSSerialize1Ev +__ZN11OSSerialize21_RESERVEDOSSerialize2Ev +__ZN11OSSerialize21_RESERVEDOSSerialize3Ev +__ZN11OSSerialize21_RESERVEDOSSerialize4Ev +__ZN11OSSerialize21_RESERVEDOSSerialize5Ev +__ZN11OSSerialize21_RESERVEDOSSerialize6Ev +__ZN11OSSerialize21_RESERVEDOSSerialize7Ev +__ZN12OSCollection22_RESERVEDOSCollection2Ev +__ZN12OSCollection22_RESERVEDOSCollection3Ev +__ZN12OSCollection22_RESERVEDOSCollection4Ev +__ZN12OSCollection22_RESERVEDOSCollection5Ev +__ZN12OSCollection22_RESERVEDOSCollection6Ev +__ZN12OSCollection22_RESERVEDOSCollection7Ev +__ZN12OSDictionary22_RESERVEDOSDictionary0Ev +__ZN12OSDictionary22_RESERVEDOSDictionary1Ev +__ZN12OSDictionary22_RESERVEDOSDictionary2Ev +__ZN12OSDictionary22_RESERVEDOSDictionary3Ev +__ZN12OSDictionary22_RESERVEDOSDictionary4Ev +__ZN12OSDictionary22_RESERVEDOSDictionary5Ev +__ZN12OSDictionary22_RESERVEDOSDictionary6Ev +__ZN12OSDictionary22_RESERVEDOSDictionary7Ev __ZN12OSOrderedSet12withCapacityEjPFiPK15OSMetaClassBaseS2_PvES3_ __ZN12OSOrderedSet16initWithCapacityEjPFiPK15OSMetaClassBaseS2_PvES3_ +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet0Ev +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet1Ev +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet2Ev +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet3Ev +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet4Ev +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet5Ev +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet6Ev +__ZN12OSOrderedSet22_RESERVEDOSOrderedSet7Ev +__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase3Ev +__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase4Ev +__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase5Ev +__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase6Ev +__ZN15OSMetaClassBase25_RESERVEDOSMetaClassBase7Ev +__ZN5OSSet15_RESERVEDOSSet0Ev +__ZN5OSSet15_RESERVEDOSSet1Ev +__ZN5OSSet15_RESERVEDOSSet2Ev +__ZN5OSSet15_RESERVEDOSSet3Ev +__ZN5OSSet15_RESERVEDOSSet4Ev +__ZN5OSSet15_RESERVEDOSSet5Ev +__ZN5OSSet15_RESERVEDOSSet6Ev +__ZN5OSSet15_RESERVEDOSSet7Ev +__ZN6OSData16_RESERVEDOSData1Ev +__ZN6OSData16_RESERVEDOSData2Ev +__ZN6OSData16_RESERVEDOSData3Ev +__ZN6OSData16_RESERVEDOSData4Ev +__ZN6OSData16_RESERVEDOSData5Ev +__ZN6OSData16_RESERVEDOSData6Ev +__ZN6OSData16_RESERVEDOSData7Ev +__ZN7OSArray17_RESERVEDOSArray0Ev +__ZN7OSArray17_RESERVEDOSArray1Ev +__ZN7OSArray17_RESERVEDOSArray2Ev +__ZN7OSArray17_RESERVEDOSArray3Ev +__ZN7OSArray17_RESERVEDOSArray4Ev +__ZN7OSArray17_RESERVEDOSArray5Ev +__ZN7OSArray17_RESERVEDOSArray6Ev +__ZN7OSArray17_RESERVEDOSArray7Ev +__ZN8OSNumber18_RESERVEDOSNumber0Ev +__ZN8OSNumber18_RESERVEDOSNumber1Ev +__ZN8OSNumber18_RESERVEDOSNumber2Ev +__ZN8OSNumber18_RESERVEDOSNumber3Ev +__ZN8OSNumber18_RESERVEDOSNumber4Ev +__ZN8OSNumber18_RESERVEDOSNumber5Ev +__ZN8OSNumber18_RESERVEDOSNumber6Ev +__ZN8OSNumber18_RESERVEDOSNumber7Ev +__ZN8OSObject18_RESERVEDOSObject0Ev +__ZN8OSObject18_RESERVEDOSObject1Ev +__ZN8OSObject18_RESERVEDOSObject2Ev +__ZN8OSObject18_RESERVEDOSObject3Ev +__ZN8OSObject18_RESERVEDOSObject4Ev 
+__ZN8OSObject18_RESERVEDOSObject5Ev +__ZN8OSObject18_RESERVEDOSObject6Ev +__ZN8OSObject18_RESERVEDOSObject7Ev +__ZN8OSObject18_RESERVEDOSObject8Ev +__ZN8OSObject18_RESERVEDOSObject9Ev +__ZN8OSObject19_RESERVEDOSObject10Ev +__ZN8OSObject19_RESERVEDOSObject11Ev +__ZN8OSObject19_RESERVEDOSObject12Ev +__ZN8OSObject19_RESERVEDOSObject13Ev +__ZN8OSObject19_RESERVEDOSObject14Ev +__ZN8OSObject19_RESERVEDOSObject15Ev +__ZN8OSString18_RESERVEDOSString0Ev +__ZN8OSString18_RESERVEDOSString1Ev +__ZN8OSString18_RESERVEDOSString2Ev +__ZN8OSString18_RESERVEDOSString3Ev +__ZN8OSString18_RESERVEDOSString4Ev +__ZN8OSString18_RESERVEDOSString5Ev +__ZN8OSString18_RESERVEDOSString6Ev +__ZN8OSString18_RESERVEDOSString7Ev +__ZN8OSString18_RESERVEDOSString8Ev +__ZN8OSString18_RESERVEDOSString9Ev +__ZN8OSString19_RESERVEDOSString10Ev +__ZN8OSString19_RESERVEDOSString11Ev +__ZN8OSString19_RESERVEDOSString12Ev +__ZN8OSString19_RESERVEDOSString13Ev +__ZN8OSString19_RESERVEDOSString14Ev +__ZN8OSString19_RESERVEDOSString15Ev +__ZN8OSSymbol18_RESERVEDOSSymbol0Ev +__ZN8OSSymbol18_RESERVEDOSSymbol1Ev +__ZN8OSSymbol18_RESERVEDOSSymbol2Ev +__ZN8OSSymbol18_RESERVEDOSSymbol3Ev +__ZN8OSSymbol18_RESERVEDOSSymbol4Ev +__ZN8OSSymbol18_RESERVEDOSSymbol5Ev +__ZN8OSSymbol18_RESERVEDOSSymbol6Ev +__ZN8OSSymbol18_RESERVEDOSSymbol7Ev +__ZN9OSBoolean19_RESERVEDOSBoolean0Ev +__ZN9OSBoolean19_RESERVEDOSBoolean1Ev +__ZN9OSBoolean19_RESERVEDOSBoolean2Ev +__ZN9OSBoolean19_RESERVEDOSBoolean3Ev +__ZN9OSBoolean19_RESERVEDOSBoolean4Ev +__ZN9OSBoolean19_RESERVEDOSBoolean5Ev +__ZN9OSBoolean19_RESERVEDOSBoolean6Ev +__ZN9OSBoolean19_RESERVEDOSBoolean7Ev +_lck_mtx_convert_spin +_lck_mtx_lock_spin +_lck_mtx_try_lock_spin _sprintf _strcat _strcpy +_vsprintf diff --git a/config/MACFramework.exports b/config/MACFramework.exports index 839eadc4f..73dda1064 100644 --- a/config/MACFramework.exports +++ b/config/MACFramework.exports @@ -10,6 +10,9 @@ _mac_audit_text _mac_iokit_check_hid_control +_mac_thread_get_threadlabel +_mac_thread_get_uthreadlabel + _sbuf_cat _sbuf_data _sbuf_delete diff --git a/config/MACFramework.i386.exports b/config/MACFramework.i386.exports index 6006136b4..aa74fd56a 100644 --- a/config/MACFramework.i386.exports +++ b/config/MACFramework.i386.exports @@ -1,9 +1,11 @@ _kau_will_audit +_mac_do_machexc _mac_kalloc _mac_kalloc_noblock _mac_kfree _mac_mbuf_alloc _mac_mbuf_free +_mac_schedule_userret _mac_unwire _mac_wire _sysctl__security_mac_children diff --git a/config/MACFramework.x86_64.exports b/config/MACFramework.x86_64.exports index 6006136b4..aa74fd56a 100644 --- a/config/MACFramework.x86_64.exports +++ b/config/MACFramework.x86_64.exports @@ -1,9 +1,11 @@ _kau_will_audit +_mac_do_machexc _mac_kalloc _mac_kalloc_noblock _mac_kfree _mac_mbuf_alloc _mac_mbuf_free +_mac_schedule_userret _mac_unwire _mac_wire _sysctl__security_mac_children diff --git a/config/Makefile b/config/Makefile index ff2d46ddb..201cbee6c 100644 --- a/config/Makefile +++ b/config/Makefile @@ -93,7 +93,7 @@ $(OBJPATH)/allsymbols: $(OBJPATH)/mach_kernel $(SYMBOL_SET_BUILD): $(OBJPATH)/%.symbolset : %.exports %.$(ARCH_CONFIG_LC).exports $(OBJPATH)/allsymbols $(_v)if [ "$*" != System6.0 -o $(SUPPORT_SYSTEM60_KEXT) -eq 1 ]; then \ $(KEXT_CREATE_SYMBOL_SET) \ - $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_ALL_)) \ + $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) \ -import $(OBJPATH)/allsymbols \ -export $*.exports \ -export $*.$(ARCH_CONFIG_LC).exports \ @@ -107,15 +107,39 @@ $(SYMBOL_SET_BUILD): $(OBJPATH)/%.symbolset : %.exports %.$(ARCH_CONFIG_LC).exp # symbolsets for the 
primary machine configuration for each kernel/arch config $(SYMBOL_SET_FAT): $(OBJROOT)/%.symbolset : $(_v)per_arch_symbolsets=""; \ - kernel_config=$(INSTALL_TYPE); \ - machine_config=$(MACHINE_CONFIG); \ for arch_config in $(INSTALL_ARCHS); \ do \ + \ + my_counter=1; \ + found_arch=0; \ + for my_config in $(TARGET_CONFIGS_UC); \ + do \ + if [ $${my_counter} -eq 1 ] ; then \ + kernel_config=$${my_config}; \ + elif [ $${my_counter} -eq 2 ] ; then \ + if [ $${my_config} = $${arch_config} ]; then \ + found_arch=1; \ + fi; \ + else \ + if [ $${found_arch} -eq 1 ]; then \ + machine_config=$${my_config};\ + break; \ + fi; \ + my_counter=0; \ + fi; \ + my_counter=$$((my_counter + 1)); \ + done; \ + \ if [ $${arch_config} = ARM ] ; then \ if [ $${machine_config} = DEFAULT ] ; then \ machine_config=$(DEFAULT_ARM_MACHINE_CONFIG); \ fi; \ fi; \ + if [ $${arch_config} = L4_ARM ] ; then \ + if [ $${machine_config} = DEFAULT ] ; then \ + machine_config=$(DEFAULT_L4_ARM_MACHINE_CONFIG); \ + fi; \ + fi; \ if [ $${machine_config} = DEFAULT ] ; then \ objpath=${OBJROOT}/$${kernel_config}_$${arch_config}; \ else \ @@ -133,14 +157,14 @@ $(SYMBOL_SET_FAT): $(OBJROOT)/%.symbolset : build_symbol_sets: $(SYMBOL_SET_BUILD) $(OBJPATH)/allsymbols $(_v)$(KEXT_CREATE_SYMBOL_SET) \ - $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_ALL_)) \ + $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) \ $(foreach comp,$(filter-out System6.0 Private,$(SYMBOL_COMPONENT_LIST)), \ -export $(SRCROOT)/$(COMPONENT)/$(comp).exports \ -export $(SRCROOT)/$(COMPONENT)/$(comp).$(ARCH_CONFIG_LC).exports) \ -import $(OBJPATH)/allsymbols \ -output /dev/null $(_vstdout); $(_v)$(KEXT_CREATE_SYMBOL_SET) \ - $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_ALL_)) \ + $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) \ $(foreach comp,$(filter-out System6.0 Unsupported,$(SYMBOL_COMPONENT_LIST)), \ -export $(SRCROOT)/$(COMPONENT)/$(comp).exports \ -export $(SRCROOT)/$(COMPONENT)/$(comp).$(ARCH_CONFIG_LC).exports) \ diff --git a/config/MasterVersion b/config/MasterVersion index fb4c4656a..1bf9f8818 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -11.4.2 +12.0.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. 
diff --git a/config/Private.exports b/config/Private.exports
index 364d84069..95fe92e41 100644
--- a/config/Private.exports
+++ b/config/Private.exports
@@ -13,20 +13,25 @@ __ZTV24IOCPUInterruptController
 _b_to_q
 _bdevsw
 _boot
+_bootcache_contains_block
 _bsd_hostname
 _bsd_set_dependency_capable
 _buf_attr
 _buf_create_shadow
-_buf_getcpaddr
-_buf_setcpaddr
 _buf_setfilter
 _buf_shadow
+_bufattr_meta
+_bufattr_nocache
 _bufattr_throttled
 _cdevsw
 _cdevsw_setkqueueok
+_chudxnu_platform_ptr
+_chudxnu_thread_get_dirty
+_chudxnu_thread_set_dirty
 _clalloc
 _clfree
 _cons_cinput
+_convert_task_to_port
 _cp_key_store_action
 _cp_register_wraps
 _cs_entitlements_blob_get
@@ -35,13 +40,41 @@ _ctl_name_by_id
 _fd_rdwr
 _get_aiotask
 _hz
+_ifnet_allocate_extended
+_ifnet_bandwidths
 _ifnet_clone_attach
 _ifnet_clone_detach
+_ifnet_get_local_ports
+_ifnet_dequeue
+_ifnet_dequeue_service_class
+_ifnet_dequeue_multi
+_ifnet_dequeue_service_class_multi
+_ifnet_enqueue
+_ifnet_get_sndq_len
+_ifnet_get_rcvq_maxlen
+_ifnet_get_sndq_maxlen
 _ifnet_idle_flags
+_ifnet_inet_defrouter_llreachinfo
+_ifnet_inet6_defrouter_llreachinfo
+_ifnet_input_extended
+_ifnet_link_quality
+_ifnet_notice_node_presence
+_ifnet_notice_node_absence
+_ifnet_notice_master_elected
+_ifnet_purge
+_ifnet_set_bandwidths
 _ifnet_set_idle_flags
+_ifnet_set_link_quality
+_ifnet_set_output_sched_model
+_ifnet_set_rcvq_maxlen
+_ifnet_set_sndq_maxlen
+_ifnet_start
+_ifnet_transmit_burst_start
+_ifnet_transmit_burst_end
 _in6addr_local
 _inaddr_local
 _inet_domain_mutex
+_inp_clear_INP_INADDR_ANY
 _ip_mutex
 _ip_output
 _ip_protox
@@ -71,6 +104,9 @@ _m_split
 _m_trailingspace:_mbuf_trailingspace
 _mac_proc_set_enforce
 _mbuf_get_priority:_mbuf_get_traffic_class
+_mbuf_get_service_class
+_mbuf_is_service_class_privileged:_mbuf_is_traffic_class_privileged
+_mbuf_pkthdr_aux_flags
 _mcl_to_paddr
 _mountroot_post_hook
 _net_add_domain
@@ -78,11 +114,10 @@ _net_add_proto
 _net_del_domain
 _net_del_proto
 _netboot_root
-_perf_monitor_register_*
+_perf_monitor_register
 _perf_monitor_unregister
 _pffinddomain
 _pffindproto
-_pmc_accessible_from_core
 _pmc_config_set_interrupt_threshold
 _pmc_config_set_value
 _pmc_create_config
@@ -105,6 +140,7 @@ _pmc_unregister
 _post_sys_powersource
 _port_name_to_task
 _port_name_to_thread
+_proc_task
 _pru_abort_notsupp
 _pru_accept_notsupp
 _pru_bind_notsupp
@@ -121,6 +157,7 @@ _pru_shutdown_notsupp
 _pru_sockaddr_notsupp
 _pru_sopoll_notsupp
 _q_to_b
+_register_crypto_functions
 _register_decmpfs_decompressor
 _rootdev
 _rootvp
@@ -152,6 +189,8 @@ _soreserve
 _sorwakeup
 _sosend
 _termioschars
+_thread_call_allocate_with_priority
+_thread_call_cancel_wait
 _thread_clear_eager_preempt
 _thread_dispatchqaddr
 _thread_set_eager_preempt
@@ -164,6 +203,7 @@ _throttle_info_update
 _throttle_info_ref_by_mask
 _throttle_info_rel_by_mask
 _throttle_info_update_by_mask
+_throttle_info_io_will_be_throttled
 _throttle_lowpri_io
 _throttle_set_thread_io_policy
 _timeout
@@ -191,6 +231,7 @@ _unregister_decmpfs_decompressor
 _untimeout
 _vnode_isdyldsharedcache
 _vnode_ismonitored
+_vnode_istty
 _vnode_notify
 _vnop_compound_open_desc
 _vnop_compound_mkdir_desc
diff --git a/config/Private.i386.exports b/config/Private.i386.exports
index 3a7064dc6..80e66dfe9 100644
--- a/config/Private.i386.exports
+++ b/config/Private.i386.exports
@@ -8,6 +8,7 @@ _acpi_sleep_kernel
 _add_fsevent
 _apic_table
 _apply_func_phys
+_bufattr_delayidlesleep
 _cpu_to_lapic
 _cpuid_features
 _cpuid_info
diff --git a/config/Private.x86_64.exports b/config/Private.x86_64.exports
index a9c6a89a1..73963f2c2 100644
--- a/config/Private.x86_64.exports
+++ b/config/Private.x86_64.exports
@@ -8,6 +8,7 @@ _acpi_sleep_kernel
 _add_fsevent
 _apic_table
 _apply_func_phys
+_bufattr_delayidlesleep
 _cpu_to_lapic
 _cpuid_features
 _cpuid_info
diff --git a/config/System6.0.exports b/config/System6.0.exports
index c3d167834..9b4585b33 100644
--- a/config/System6.0.exports
+++ b/config/System6.0.exports
@@ -19,15 +19,12 @@ _IOBSDNameMatching
 _IOBSDRegistryEntryForDeviceTree
 _IOBSDRegistryEntryGetData
 _IOBSDRegistryEntryRelease
-_IOCDMatching
 _IOCreateThread
 _IODTFreeLoaderInfo
 _IODTGetLoaderInfo
 _IODelay
-_IODiskMatching
 _IOExitThread
 _IOFindBSDRoot
-_IOFindMatchingChild
 _IOFindNameForValue
 _IOFindValueForName
 _IOFlushProcessorCache
@@ -69,7 +66,6 @@ _IOMapperInsertPPNPages
 _IOMapperInsertPage
 _IOMapperInsertUPLPages
 _IONDRVLibrariesInitialize
-_IONetworkMatching
 _IONetworkNamePrefixMatching
 _IOOFPathMatching
 _IOPageableMapForAddress
@@ -100,9 +96,7 @@ _IOSimpleLockTryLock:_lck_spin_try_lock
 _IOSimpleLockUnlock:_lck_spin_unlock
 _IOSizeToAlignment
 _IOSleep
-_IOSpinUnlock
 _IOSystemShutdownNotification
-_IOTrySpinLock
 _IOZeroTvalspec
 _KUNCExecute
 _KUNCGetNotificationID
@@ -2877,8 +2871,6 @@ _device_data_action
 _devnode_free
 _disable_serial_output
 _ether_check_multi
-_ev_try_lock
-_ev_unlock
 _fatfile_getarch
 _fatfile_getarch_affinity
 _find_entry
@@ -3193,7 +3185,6 @@ _vm_protect
 _vm_region
 _vm_region_object_create
 _vsnprintf
-_vsprintf
 _vsscanf
 _zalloc
 _zfree
diff --git a/config/System6.0.i386.exports b/config/System6.0.i386.exports
index f3955791d..aecfe0c97 100644
--- a/config/System6.0.i386.exports
+++ b/config/System6.0.i386.exports
@@ -1,3 +1,5 @@
+_IOSpinUnlock
+_IOTrySpinLock
 _PE_install_interrupt_handler
 _PE_interrupt_handler
 _PE_parse_boot_arg
@@ -12,6 +14,8 @@ _cpu_number
 _cpu_to_lapic
 _cpuid_features
 _cpuid_info
+_ev_try_lock
+_ev_unlock
 _hfs_addconverter
 _hfs_remconverter
 _lapic_end_of_interrupt
@@ -27,3 +31,4 @@ _sprintf
 _strcat
 _strcpy
 _thread_funnel_set
+_vsprintf
diff --git a/config/System6.0.x86_64.exports b/config/System6.0.x86_64.exports
index 60c3e3ad1..accc98e65 100644
--- a/config/System6.0.x86_64.exports
+++ b/config/System6.0.x86_64.exports
@@ -1,3 +1,8 @@
+_IOSpinUnlock
+_IOTrySpinLock
+_ev_try_lock
+_ev_unlock
 _sprintf
 _strcat
 _strcpy
+_vsprintf
diff --git a/config/Unsupported.exports b/config/Unsupported.exports
index 374517b7e..dc1590d45 100644
--- a/config/Unsupported.exports
+++ b/config/Unsupported.exports
@@ -6,37 +6,26 @@ _KUNCUserNotificationDisplayAlert
 _KUNCUserNotificationDisplayFromBundle
 _KUNCUserNotificationDisplayNotice
 _NDR_record
-_OSSpinLockTry
-_OSSpinLockUnlock
 _PE_kputc
 __Z22OSFlushObjectTrackListv
 __ZN15IOWatchDogTimer10gMetaClassE
 __ZN15IOWatchDogTimer10superClassE
 __ZN15IOWatchDogTimer13setPropertiesEP8OSObject
-__ZN15IOWatchDogTimer25_RESERVEDIOWatchDogTimer0Ev
-__ZN15IOWatchDogTimer25_RESERVEDIOWatchDogTimer1Ev
-__ZN15IOWatchDogTimer25_RESERVEDIOWatchDogTimer2Ev
-__ZN15IOWatchDogTimer25_RESERVEDIOWatchDogTimer3Ev
 __ZN15IOWatchDogTimer4stopEP9IOService
 __ZN15IOWatchDogTimer5startEP9IOService
 __ZN15IOWatchDogTimer9MetaClassC1Ev
 __ZN15IOWatchDogTimer9MetaClassC2Ev
 __ZN15IOWatchDogTimer9metaClassE
-__ZN15IOWatchDogTimerC1EPK11OSMetaClass
 __ZN15IOWatchDogTimerC2EPK11OSMetaClass
-__ZN15IOWatchDogTimerD0Ev
 __ZN15IOWatchDogTimerD2Ev
 __ZN16IOPlatformDevice10gMetaClassE
 __ZN16IOPlatformDevice13matchLocationEP9IOService
-__ZN16IOPlatformDevice26_RESERVEDIOPlatformDevice0Ev
-__ZN16IOPlatformDevice26_RESERVEDIOPlatformDevice1Ev
-__ZN16IOPlatformDevice26_RESERVEDIOPlatformDevice2Ev
-__ZN16IOPlatformDevice26_RESERVEDIOPlatformDevice3Ev
 __ZN16IOPlatformDevice9metaClassE
 __ZN16IOPlatformDeviceC2EPK11OSMetaClass
 __ZN16IOPlatformDeviceD2Ev
 __ZN18IODTPlatformExpert9metaClassE
 __ZN9IODTNVRAM10gMetaClassE
+__ZN9IODTNVRAM10safeToSyncEv
 __ZN9IODTNVRAM15initOFVariablesEv
 __ZN9IODTNVRAM15syncOFVariablesEv
 __ZN9IODTNVRAM16escapeDataToDataEP6OSData
@@ -51,7 +40,6 @@ __ZN9IODTNVRAM26calculatePartitionChecksumEPh
 __ZN9IODTNVRAM9metaClassE
 __ZN9IODTNVRAMC2EPK11OSMetaClass
 __ZN9IODTNVRAMD2Ev
-__ZN9IODTNVRAM10safeToSyncEv
 __ZNK15IOWatchDogTimer12getMetaClassEv
 __ZNK15IOWatchDogTimer9MetaClass5allocEv
 __ZNK9IODTNVRAM17getOFVariablePermEPK8OSSymbol
@@ -81,7 +69,7 @@ _current_act
 _delay
 _delay_for_interval
 _des_ecb_encrypt
-_des_set_key
+_des_ecb_key_sched
 _gIODTSharedInterrupts
 _gOSObjectTrackList
 _gOSObjectTrackThread
@@ -117,6 +105,7 @@ _lck_rw_done
 _ldisc_deregister
 _ldisc_register
 _log
+_mach_gss_lookup
 _mach_gss_accept_sec_context
 _mach_gss_accept_sec_context_v2
 _mach_gss_hold_cred
diff --git a/config/Unsupported.i386.exports b/config/Unsupported.i386.exports
index 99112a161..602e7123a 100644
--- a/config/Unsupported.i386.exports
+++ b/config/Unsupported.i386.exports
@@ -1,13 +1,26 @@
+_OSSpinLockTry
+_OSSpinLockUnlock
+__ZN15IOWatchDogTimer25_RESERVEDIOWatchDogTimer0Ev
+__ZN15IOWatchDogTimer25_RESERVEDIOWatchDogTimer1Ev
+__ZN15IOWatchDogTimer25_RESERVEDIOWatchDogTimer2Ev
+__ZN15IOWatchDogTimer25_RESERVEDIOWatchDogTimer3Ev
+__ZN16IOPlatformDevice26_RESERVEDIOPlatformDevice0Ev
+__ZN16IOPlatformDevice26_RESERVEDIOPlatformDevice1Ev
+__ZN16IOPlatformDevice26_RESERVEDIOPlatformDevice2Ev
+__ZN16IOPlatformDevice26_RESERVEDIOPlatformDevice3Ev
 __ZN9IODTNVRAM17getOWVariableInfoEmPPK8OSSymbolPmS4_
 __ZN9IODTNVRAM19convertObjectToPropEPhPmPK8OSSymbolP8OSObject
 __ZN9IODTNVRAM19convertPropToObjectEPhmS0_mPPK8OSSymbolPP8OSObject
 __ZN9IODTNVRAM19searchNVRAMPropertyEP17IONVRAMDescriptorPm
 __ZN9IODTNVRAM19unescapeBytesToDataEPKhm
+_clock_get_system_value
 _cpu_number
 _domains
 _dsmos_page_transform_hook
 _gPEEFIRuntimeServices
 _gPEEFISystemTable
+_hibernate_vm_lock
+_hibernate_vm_unlock
 _ifunit
 _in6addr_local
 _in_broadcast
@@ -42,10 +55,10 @@ _m_trailingspace:_mbuf_trailingspace
 _mach_msg_rpc_from_kernel
 _mach_msg_send_from_kernel_with_options
 _mcl_to_paddr:_mbuf_data_to_physical
+_ml_cpu_int_event_time
 _ml_get_apicid
 _ml_get_maxbusdelay
 _ml_get_maxsnoop
-_ml_cpu_int_event_time
 _mp_rendezvous
 _mp_rendezvous_no_intrs
 _nd6_storelladdr
@@ -109,8 +122,6 @@ _sorwakeup
 _sosend
 _sosetopt
 _tcbinfo
-_tmrCvt
-_tsc_get_info
 _thread_call_func
 _thread_call_func_cancel
 _thread_call_func_delayed
@@ -119,8 +130,7 @@ _thread_cancel_timer
 _thread_funnel_set
 _thread_set_timer
 _thread_set_timer_deadline
+_tmrCvt
+_tsc_get_info
 _udbinfo
-_hibernate_vm_lock
-_hibernate_vm_unlock
-_clock_get_system_value
 _PE_state
diff --git a/config/Unsupported.x86_64.exports b/config/Unsupported.x86_64.exports
index 2e7f007d1..4eb17cafa 100644
--- a/config/Unsupported.x86_64.exports
+++ b/config/Unsupported.x86_64.exports
@@ -1,3 +1,13 @@
+_OSSpinLockTry
+_OSSpinLockUnlock
+__ZN15IOWatchDogTimer25_RESERVEDIOWatchDogTimer0Ev
+__ZN15IOWatchDogTimer25_RESERVEDIOWatchDogTimer1Ev
+__ZN15IOWatchDogTimer25_RESERVEDIOWatchDogTimer2Ev
+__ZN15IOWatchDogTimer25_RESERVEDIOWatchDogTimer3Ev
+__ZN16IOPlatformDevice26_RESERVEDIOPlatformDevice0Ev
+__ZN16IOPlatformDevice26_RESERVEDIOPlatformDevice1Ev
+__ZN16IOPlatformDevice26_RESERVEDIOPlatformDevice2Ev
+__ZN16IOPlatformDevice26_RESERVEDIOPlatformDevice3Ev
 __ZN9IODTNVRAM17getOWVariableInfoEjPPK8OSSymbolPjS4_
 __ZN9IODTNVRAM19convertObjectToPropEPhPjPK8OSSymbolP8OSObject
 __ZN9IODTNVRAM19convertPropToObjectEPhjS0_jPPK8OSSymbolPP8OSObject
@@ -7,14 +17,16 @@ _cpu_number
 _dsmos_page_transform_hook
 _gPEEFIRuntimeServices
 _gPEEFISystemTable
+_hibernate_vm_lock
+_hibernate_vm_unlock
 _kdp_register_callout
 _kdp_set_ip_and_mac_addresses
 _lapic_set_perfcnt_interrupt_mask
 _lapic_set_pmi_func
+_ml_cpu_int_event_time
 _ml_get_apicid
 _ml_get_maxbusdelay
 _ml_get_maxsnoop
-_ml_cpu_int_event_time
 _mp_rendezvous
 _mp_rendezvous_no_intrs
 _pmCPUControl
@@ -29,6 +41,4 @@ _sock_release
 _sock_retain
 _tmrCvt
 _tsc_get_info
-_hibernate_vm_lock
-_hibernate_vm_unlock
 _PE_state
diff --git a/config/newvers.pl b/config/newvers.pl
index 31deccace..bf5096ad0 100755
--- a/config/newvers.pl
+++ b/config/newvers.pl
@@ -18,6 +18,10 @@
 # ###KERNEL_BUILD_OBJROOT### xnu/xnu-690.obj~2/RELEASE_PPC
 # ###KERNEL_BUILD_DATE### Sun Oct 24 05:33:28 PDT 2004
 
+use File::Basename;
+
+use strict;
+
 sub ReadFile {
   my ($fileName) = @_;
   my $data;
@@ -39,17 +43,43 @@ sub WriteFile {
   close(OUT);
 }
 
+die("SRCROOT not defined") unless defined($ENV{'SRCROOT'});
+die("OBJROOT not defined") unless defined($ENV{'OBJROOT'});
+
 my $versfile = "MasterVersion";
-$versfile = "$ENV{'SRCROOT'}/config/$versfile" if ($ENV{'SRCROOT'});
-my $BUILD_OBJROOT=$ENV{'OBJROOT'} . "/" . $ENV{'KERNEL_CONFIG'} . '_' . $ENV{'ARCH_CONFIG'};
-if($ENV{'MACHINE_CONFIG'} ne "DEFAULT") {
-    $BUILD_OBJROOT .= '_' . $ENV{'MACHINE_CONFIG'};
-}
+$versfile = "$ENV{'SRCROOT'}/config/$versfile";
+my $BUILD_SRCROOT=$ENV{'SRCROOT'};
+$BUILD_SRCROOT =~ s,/+$,,;
+my $BUILD_OBJROOT=$ENV{'OBJROOT'};
+$BUILD_OBJROOT =~ s,/+$,,;
+my $BUILD_OBJPATH=$ENV{'OBJPATH'} || $ENV{'OBJROOT'};
+$BUILD_OBJPATH =~ s,/+$,,;
 my $BUILD_DATE = `date`;
 $BUILD_DATE =~ s/[\n\t]//g;
 my $BUILDER=`whoami`;
 $BUILDER =~ s/[\n\t]//g;
-$BUILD_OBJROOT =~ s|.*(xnu.*)|$1|;
+
+# Handle two scenarios:
+# SRCROOT=/tmp/xnu
+# OBJROOT=/tmp/xnu/BUILD/obj
+# OBJPATH=/tmp/xnu/BUILD/obj/RELEASE_X86_64
+#
+# SRCROOT=/SourceCache/xnu/xnu-1234
+# OBJROOT=/tmp/xnu/xnu-1234~1.obj
+# OBJPATH=/tmp/xnu/xnu-1234~1.obj/RELEASE_X86_64
+#
+# If SRCROOT is a strict prefix of OBJPATH, we
+# want to preserve the "interesting" part
+# starting with "xnu". If it's not a prefix,
+# the basename of OBJROOT itself is "interesting".
+
+if ($BUILD_OBJPATH =~ m,^$BUILD_SRCROOT/(.*)$,) {
+    $BUILD_OBJROOT = basename($BUILD_SRCROOT) . "/" . $1;
+} elsif ($BUILD_OBJPATH =~ m,^$BUILD_OBJROOT/(.*)$,) {
+    $BUILD_OBJROOT = basename($BUILD_OBJROOT) . "/" . $1;
+} else {
+    # Use original OBJROOT
+}
 
 my $rawvers = &ReadFile($versfile);
 #$rawvers =~ s/\s//g;
diff --git a/iokit/IOKit/IOCatalogue.h b/iokit/IOKit/IOCatalogue.h
index 49f8fb84c..ac8cec46c 100644
--- a/iokit/IOKit/IOCatalogue.h
+++ b/iokit/IOKit/IOCatalogue.h
@@ -57,17 +57,11 @@ class IOCatalogue : public OSObject
     OSDeclareDefaultStructors(IOCatalogue)
 
 private:
-    OSCollectionIterator * kernelTables;
-    OSArray              * array;
-    IOLock               * lock;
+    IORWLock             * lock;
     SInt32                 generation;
-
-/* This stuff is no longer used at all but was exported in prior
- * releases, so we keep it around for i386 only.
- */
-#if __i386__
-    IOLock               * kld_lock;
-#endif /* __i386__ */
+    OSDictionary         * personalities;
+    OSArray * arrayForPersonality(OSDictionary * dict);
+    void addPersonality(OSDictionary * dict);
 
 public:
     /*!
@@ -273,6 +267,9 @@ private:
     @param moduleName An OSString containing the name of the module to unload.
     */
     IOReturn unloadModule( OSString * moduleName ) const;
+
+    IOReturn _removeDrivers(OSDictionary * matching);
+    IOReturn _terminateDrivers(OSDictionary * matching);
 };
 
 extern const OSSymbol * gIOClassKey;
diff --git a/iokit/IOKit/IODeviceTreeSupport.h b/iokit/IOKit/IODeviceTreeSupport.h
index 15b5aa4b4..6e3ed1ed1 100644
--- a/iokit/IOKit/IODeviceTreeSupport.h
+++ b/iokit/IOKit/IODeviceTreeSupport.h
@@ -72,6 +72,10 @@ OSCollectionIterator * IODTFindMatchingEntries( IORegistryEntry * from,
 
 typedef SInt32 (*IODTCompareAddressCellFunc)
 	(UInt32 cellCount, UInt32 left[], UInt32 right[]);
+
+typedef SInt64 (*IODTCompareAddressCell64Func)
+	(UInt32 cellCount, UInt32 left[], UInt32 right[]);
+
 typedef void (*IODTNVLocationFunc)
 	(IORegistryEntry * entry, UInt8 * busNum, UInt8 * deviceNum, UInt8 * functionNum );
diff --git a/iokit/IOKit/IOHibernatePrivate.h b/iokit/IOKit/IOHibernatePrivate.h
index 0cc86a55c..6e758273d 100644
--- a/iokit/IOKit/IOHibernatePrivate.h
+++ b/iokit/IOKit/IOHibernatePrivate.h
@@ -33,10 +33,13 @@ extern "C" {
 #endif
 
 #ifdef KERNEL
-#include 
+#include 
 #include 
 #endif
 
+#ifndef __IOKIT_IOHIBERNATEPRIVATE_H
+#define __IOKIT_IOHIBERNATEPRIVATE_H
+
 struct IOPolledFileExtent
 {
     uint64_t	start;
@@ -96,8 +99,9 @@ struct IOHibernateImageHeader
 
     uint32_t	debugFlags;
     uint32_t	options;
+    uint32_t	sleepTime;
 
-    uint32_t	reserved[70];		// make sizeof == 512
+    uint32_t	reserved[69];		// make sizeof == 512
 
     uint64_t	encryptEnd __attribute__ ((packed));
     uint64_t	deviceBase __attribute__ ((packed));
@@ -235,6 +239,18 @@ static const uint8_t gIOHibernateProgressAlpha			\
 { 0x00,0x66,0xdb,0xf3,0xdb,0x66,0x00 }	\
 };
 
+struct hibernate_preview_t
+{
+    uint32_t  imageCount;	// Number of images
+    uint32_t  width;		// Width
+    uint32_t  height;		// Height
+    uint32_t  depth;		// Pixel Depth
+    uint32_t  lockTime;	// Lock time
+    uint32_t  reservedG[8];	// reserved
+    uint32_t  reservedK[8];	// reserved
+};
+typedef struct hibernate_preview_t hibernate_preview_t;
+
 #ifdef KERNEL
 
 #ifdef __cplusplus
@@ -242,9 +258,12 @@ static const uint8_t gIOHibernateProgressAlpha			\
 
 void     IOHibernateSystemInit(IOPMrootDomain * rootDomain);
 
 IOReturn IOHibernateSystemSleep(void);
+IOReturn IOHibernateIOKitSleep(void);
 IOReturn IOHibernateSystemHasSlept(void);
 IOReturn IOHibernateSystemWake(void);
 IOReturn IOHibernateSystemPostWake(void);
+bool     IOHibernateWasScreenLocked(void);
+void     IOHibernateSetScreenLocked(uint32_t lockState);
 
 #endif /* __cplusplus */
@@ -419,12 +438,14 @@ enum
 #define kIOHibernateFeatureKey	    "Hibernation"
 #define kIOHibernatePreviewBufferKey	"IOPreviewBuffer"
 
+#ifndef kIOHibernatePreviewActiveKey
 #define kIOHibernatePreviewActiveKey	"IOHibernatePreviewActive"
 // values for kIOHibernatePreviewActiveKey
 enum {
     kIOHibernatePreviewActive  = 0x00000001,
     kIOHibernatePreviewUpdates = 0x00000002
 };
+#endif
 
 #define kIOHibernateOptionsKey      "IOHibernateOptions"
 #define kIOHibernateGfxStatusKey    "IOHibernateGfxStatus"
@@ -447,6 +468,25 @@ enum {
 
 #define kIOHibernateUseKernelInterpreter    0x80000000
 
+enum
+{
+    kIOPreviewImageIndexDesktop    = 0,
+    kIOPreviewImageIndexLockScreen = 1,
+    kIOPreviewImageCount           = 2
+};
+
+enum
+{
+    kIOScreenLockNoLock          = 1,
+    kIOScreenLockUnlocked        = 2,
+    kIOScreenLockLocked          = 3,
+    kIOScreenLockFileVaultDialog = 4,
+};
+
+#define kIOScreenLockStateKey       "IOScreenLockState"
+
+#endif /* ! __IOKIT_IOHIBERNATEPRIVATE_H */
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/iokit/IOKit/IOKitServer.h b/iokit/IOKit/IOKitServer.h
index 26787a25c..48ff9580d 100644
--- a/iokit/IOKit/IOKitServer.h
+++ b/iokit/IOKit/IOKitServer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2011 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -52,19 +52,6 @@ extern "C" {
 }
 #endif
 
-// IOMakeMatching
-/*!
-    @enum IOMakeMatching
-    @constant kIOServiceMatching
-    @constant kIOBSDNameMatching
-    @constant kIOOFPathMatching
-*/
-enum {
-    kIOServiceMatching		= 100,
-    kIOBSDNameMatching		= 101,
-    kIOOFPathMatching		= 102
-};
-
 // IOCatalogueSendData
 /*!
     @enum IOCatalogueSendData user-client flags.
diff --git a/iokit/IOKit/IOLib.h b/iokit/IOKit/IOLib.h
index 5e91b4725..3b5103218 100644
--- a/iokit/IOKit/IOLib.h
+++ b/iokit/IOKit/IOLib.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2011 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -85,8 +85,10 @@ void * IOMalloc(vm_size_t size);
 /*! @function IOFree
     @abstract Frees memory allocated with IOMalloc.
     @discussion This function frees memory allocated with IOMalloc, it may block and so should not be called from interrupt level or while a simple lock is held.
-    @param address Pointer to the allocated memory.
-    @param size Size of the memory allocated. */
+    @param address Pointer to the allocated memory. Must be identical to result
+    @of a prior IOMalloc.
+    @param size Size of the memory allocated. Must be identical to size of
+    @the corresponding IOMalloc */
 
 void IOFree(void * address, vm_size_t size);
 
@@ -348,7 +350,7 @@ OSDictionary *
 #else
 struct OSDictionary *
 #endif
-IOOFPathMatching( const char * path, char * buf, int maxLen );
+IOOFPathMatching( const char * path, char * buf, int maxLen ) __attribute__((deprecated));
 
 /*
  * Convert between size and a power-of-two alignment.
diff --git a/iokit/IOKit/IOMemoryDescriptor.h b/iokit/IOKit/IOMemoryDescriptor.h
index 6e6961136..fd83d0547 100644
--- a/iokit/IOKit/IOMemoryDescriptor.h
+++ b/iokit/IOKit/IOMemoryDescriptor.h
@@ -134,19 +134,10 @@ class IOMemoryDescriptor : public OSObject
     OSDeclareDefaultStructors(IOMemoryDescriptor);
 
 protected:
-/*! @struct ExpansionData
-    @discussion This structure will be used to expand the capablilties of this class in the future.
-    */
-    struct ExpansionData {
-        void *			devicePager;
-        unsigned int		pagerContig:1;
-        unsigned int		unused:31;
-        IOMemoryDescriptor *	memory;
-    };
 
 /*! @var reserved
     Reserved for future use.  (Internal use only)  */
-    ExpansionData * reserved;
+    struct IOMemoryDescriptorReserved * reserved;
 
 protected:
     OSSet *		_mappings;
@@ -238,6 +229,11 @@ typedef IOOptionBits DMACommandOps;
 #endif /* !__LP64__ */
 
     virtual uint64_t getPreparationID( void );
+    void             setPreparationID( void );
+
+#ifdef XNU_KERNEL_PRIVATE
+    IOMemoryDescriptorReserved * getKernelReserved( void );
+#endif
 
 private:
     OSMetaClassDeclareReservedUsed(IOMemoryDescriptor, 0);
diff --git a/iokit/IOKit/IOService.h b/iokit/IOKit/IOService.h
index 99c30699b..db7f5d20b 100644
--- a/iokit/IOKit/IOService.h
+++ b/iokit/IOKit/IOService.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2009 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2011 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -179,11 +179,11 @@ class IOPlatformExpert;
     @abstract The base class for most I/O Kit families, devices, and drivers.
     @discussion The IOService base class defines APIs used to publish services, instantiate other services based on the existance of a providing service (ie. driver stacking), destroy a service and its dependent stack, notify interested parties of service state changes, and general utility functions useful across all families.
 
-Types of service are specified with a matching dictionary that describes properties of the service. For example, a matching dictionary might describe any IOUSBDevice (or subclass), an IOUSBDevice with a certain class code, or a IOPCIDevice with a set of OpenFirmware matching names or device & vendor IDs. Since the matching dictionary is interpreted by the family which created the service, as well as generically by IOService, the list of properties considered for matching depends on the familiy.
+Types of service are specified with a matching dictionary that describes properties of the service. For example, a matching dictionary might describe any IOUSBDevice (or subclass), an IOUSBDevice with a certain class code, or a IOPCIDevice with a set of matching names or device & vendor IDs. Since the matching dictionary is interpreted by the family which created the service, as well as generically by IOService, the list of properties considered for matching depends on the familiy.
 
 Matching dictionaries are associated with IOService classes by the catalogue, as driver property tables, and also supplied by clients of the notification APIs.
 
-IOService provides matching based on C++ class (via OSMetaClass dynamic casting), registry entry name, a registry path to the service (which includes OpenFirmware paths), a name assigned by BSD, or by its location (its point of attachment).
+IOService provides matching based on C++ class (via OSMetaClass dynamic casting), registry entry name, a registry path to the service (which includes device tree paths), a name assigned by BSD, or by its location (its point of attachment).

Driver Instantiation by IOService

@@ -231,7 +231,7 @@ A string defining the driver category for matching purposes. All drivers with no
kIONameMatchKey, extern const OSSymbol * gIONameMatchKey, "IONameMatch"
-A string or collection of strings that match the provider's name. The comparison is implemented with the @link //apple_ref/cpp/instm/IORegistryEntry/compareNames/virtualbool/(OSObject*,OSString**) IORegistryEntry::compareNames@/link method, which supports a single string, or any collection (OSArray, OSSet, OSDictionary etc.) of strings. IOService objects with OpenFirmware device tree properties (eg. IOPCIDevice) will also be matched based on that standard's "compatible", "name", "device_type" properties. The matching name will be left in the driver's property table in the kIONameMatchedKey property.
+A string or collection of strings that match the provider's name. The comparison is implemented with the @link //apple_ref/cpp/instm/IORegistryEntry/compareNames/virtualbool/(OSObject*,OSString**) IORegistryEntry::compareNames@/link method, which supports a single string, or any collection (OSArray, OSSet, OSDictionary etc.) of strings. IOService objects with device tree properties (eg. IOPCIDevice) will also be matched based on that standard's "compatible", "name", "device_type" properties. The matching name will be left in the driver's property table in the kIONameMatchedKey property.
Examples
@@ -728,6 +728,14 @@ public:
 
     static OSIterator * getMatchingServices( OSDictionary * matching );
 
+/*! @function copyMatchingService
+    @abstract Finds one of the current published IOService objects matching a matching dictionary.
+    @discussion Provides a method to find one member of the set of published IOService objects matching the supplied matching dictionary.   
+    @param matching The matching dictionary describing the desired IOService object.
+    @result The IOService object or NULL. To be released by the caller. */
+
+    static IOService * copyMatchingService( OSDictionary * matching );
+
 public:
     /* Helpers to make matching dictionaries for simple cases,
      * they add keys to an existing dictionary, or create one. */
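[A usage sketch of the new copyMatchingService; the class name "IOMedia" is only illustrative, and serviceMatching is the existing helper declared just below. It is assumed the matching dictionary is not consumed by the call.]

    // Look up a single published service; both objects must be released.
    OSDictionary * matching = IOService::serviceMatching("IOMedia");
    if (matching) {
        IOService * service = IOService::copyMatchingService(matching);
        matching->release();    // assumed: dictionary is not consumed
        if (service) {
            // ... use the service ...
            service->release(); // per the doc above: released by the caller
        }
    }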
@@ -1254,6 +1262,11 @@ private:
     void doServiceTerminate( IOOptionBits options );
 
 private:
+
+    bool matchPassive(OSDictionary * table, uint32_t options);
+    bool matchInternal(OSDictionary * table, uint32_t options, unsigned int * did);
+    static bool instanceMatch(const OSObject * entry, void * context);
+
     static OSObject * copyExistingServices( OSDictionary * matching,
 		 IOOptionBits inState, IOOptionBits options = 0 );
 
@@ -1778,7 +1791,7 @@ private:
     bool checkForDone ( void );
     bool responseValid ( uint32_t x, int pid );
     void computeDesiredState ( unsigned long tempDesire = 0 );
-    void rebuildChildClampBits ( void );
+    void trackSystemSleepPreventers( IOPMPowerStateIndex, IOPMPowerStateIndex, IOPMPowerChangeFlags );
     void tellSystemCapabilityChange( uint32_t nextMS );
 
 	static void ack_timer_expired( thread_call_param_t, thread_call_param_t );
diff --git a/iokit/IOKit/IOTypes.h b/iokit/IOKit/IOTypes.h
index d56aea7be..76bd0acfa 100644
--- a/iokit/IOKit/IOTypes.h
+++ b/iokit/IOKit/IOTypes.h
@@ -197,7 +197,8 @@ enum {
     kIOInhibitCache		= 1,
     kIOWriteThruCache		= 2,
     kIOCopybackCache		= 3,
-    kIOWriteCombineCache	= 4
+    kIOWriteCombineCache	= 4,
+    kIOCopybackInnerCache	= 5
 };
 
 // IOMemory mapping options
@@ -206,11 +207,12 @@ enum {
 
     kIOMapCacheMask		= 0x00000700,
     kIOMapCacheShift		= 8,
-    kIOMapDefaultCache		= kIODefaultCache      << kIOMapCacheShift,
-    kIOMapInhibitCache		= kIOInhibitCache      << kIOMapCacheShift,
-    kIOMapWriteThruCache	= kIOWriteThruCache    << kIOMapCacheShift,
-    kIOMapCopybackCache		= kIOCopybackCache     << kIOMapCacheShift,
-    kIOMapWriteCombineCache	= kIOWriteCombineCache << kIOMapCacheShift,
+    kIOMapDefaultCache		= kIODefaultCache       << kIOMapCacheShift,
+    kIOMapInhibitCache		= kIOInhibitCache       << kIOMapCacheShift,
+    kIOMapWriteThruCache	= kIOWriteThruCache     << kIOMapCacheShift,
+    kIOMapCopybackCache		= kIOCopybackCache      << kIOMapCacheShift,
+    kIOMapWriteCombineCache	= kIOWriteCombineCache  << kIOMapCacheShift,
+    kIOMapCopybackInnerCache	= kIOCopybackInnerCache << kIOMapCacheShift,
 
     kIOMapUserOptionsMask	= 0x00000fff,
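[To make the encoding concrete: each cache mode from the preceding enum is shifted into the 0x700 field, so the new inner-writeback mode lands at 0x500. A small sanity sketch using only the constants shown above:]

    // kIOCopybackInnerCache == 5, kIOMapCacheShift == 8
    IOOptionBits options = kIOMapCopybackInnerCache;        // 5 << 8 == 0x500
    IOOptionBits mode    = (options & kIOMapCacheMask) >> kIOMapCacheShift;
    assert(mode == kIOCopybackInnerCache);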
 
diff --git a/iokit/IOKit/Makefile b/iokit/IOKit/Makefile
index 7b3c8df3e..69e6c7aad 100644
--- a/iokit/IOKit/Makefile
+++ b/iokit/IOKit/Makefile
@@ -26,12 +26,11 @@ INSTINC_SUBDIRS_I386 =
 
 INSTINC_SUBDIRS_X86_64 = 
 
-INSTINC_SUBDIRS_ARM = 
 
 EXPINC_SUBDIRS = ${INSTINC_SUBDIRS}
 EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386}
 EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64}
-EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM}
+
 
 NOT_EXPORT_HEADERS = 
 
@@ -39,7 +38,7 @@ NOT_KF_MI_HEADERS  = $(NOT_EXPORT_HEADERS)			\
 		     IOKitKeysPrivate.h IOCPU.h			\
 		     IOHibernatePrivate.h IOPolledInterface.h	\
 		     IOCommandQueue.h IOLocksPrivate.h 		\
-		     AppleKeyStoreInterface.h			\
+		     IOSyncer.h AppleKeyStoreInterface.h	\
 		     IOStatistics.h IOStatisticsPrivate.h
 
 NOT_LOCAL_HEADERS = 
diff --git a/iokit/IOKit/i386/Makefile b/iokit/IOKit/i386/Makefile
index f8f0826c4..514496af6 100644
--- a/iokit/IOKit/i386/Makefile
+++ b/iokit/IOKit/i386/Makefile
@@ -15,11 +15,9 @@ EXCLUDE_HEADERS =
 
 INSTINC_SUBDIRS =
 INSTINC_SUBDIRS_I386 =
-INSTINC_SUBDIRS_X86_64 =
 
 EXPINC_SUBDIRS = ${INSTINC_SUBDIRS}
 EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386}
-EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64}
 
 ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h))
 HEADER_LIST = $(filter-out $(EXCLUDE_HEADERS), $(ALL_HEADERS))
diff --git a/iokit/IOKit/pwr_mgt/IOPM.h b/iokit/IOKit/pwr_mgt/IOPM.h
index 4bdddb751..cfcbf6d55 100644
--- a/iokit/IOKit/pwr_mgt/IOPM.h
+++ b/iokit/IOKit/pwr_mgt/IOPM.h
@@ -245,7 +245,7 @@ enum {
  *  false       == Retain FV key when going to standby mode
  *  not present == Retain FV key when going to standby mode
  */
-#define kIOPMDestroyFVKeyOnStandbyKey       "DestroyFVKeyOnStandby"
+#define kIOPMDestroyFVKeyOnStandbyKey            "DestroyFVKeyOnStandby"
 
 /*******************************************************************************
  *
@@ -288,15 +288,7 @@ enum {
      */
     kIOPMDriverAssertionPreventDisplaySleepBit      = 0x40,
 
-    /*! kIOPMDriverAssertionReservedBit7
-     * Reserved for storage family.
-     */
-    kIOPMDriverAssertionReservedBit7                = 0x80,
-
-    /*! kIOPMDriverAssertionReservedBit8
-     * Reserved for networking family.
-     */
-    kIOPMDriverAssertionReservedBit8                = 0x100
+    kIOPMDriverAssertionReservedBit7                = 0x80
 };
 
  /* kIOPMAssertionsDriverKey
@@ -304,7 +296,7 @@ enum {
   * a bitfield describing the aggregate PM assertion levels.
   * Example: A value of 0 indicates that no driver has asserted anything.
   * Or, a value of kIOPMDriverAssertionCPUBit
-  *   indicates that a driver (or drivers) have asserted a need fro CPU and video.
+  *   indicates that a driver (or drivers) have asserted a need for CPU and video.
   */
 #define kIOPMAssertionsDriverKey            "DriverPMAssertions"
 
@@ -313,7 +305,7 @@ enum {
   * a bitfield describing the aggregate PM assertion levels.
   * Example: A value of 0 indicates that no driver has asserted anything.
   * Or, a value of kIOPMDriverAssertionCPUBit
-  *   indicates that a driver (or drivers) have asserted a need fro CPU and video.
+  *   indicates that a driver (or drivers) have asserted a need for CPU and video.
   */
 #define kIOPMAssertionsDriverDetailedKey    "DriverPMAssertionsDetailed"
 
@@ -416,6 +408,13 @@ enum {
 #define kIOPMMessageDriverAssertionsChanged  \
                 iokit_family_msg(sub_iokit_powermanagement, 0x150)
 
+/*! kIOPMMessageDarkWakeThermalEmergency
+ * Sent when machine becomes unsustainably warm in DarkWake.
+ * Kernel PM might choose to put the machine back to sleep right after.
+ */
+#define kIOPMMessageDarkWakeThermalEmergency \
+                iokit_family_msg(sub_iokit_powermanagement, 0x160)
+
 /*******************************************************************************
  *
  * Power commands issued to root domain
@@ -437,7 +436,8 @@ enum {
   kIOPMEnableClamshell          = (1<<7),  // sleep on clamshell closure
   kIOPMProcessorSpeedChange     = (1<<8),  // change the processor speed
   kIOPMOverTemp                 = (1<<9),  // system dangerously hot
-  kIOPMClamshellOpened          = (1<<10)  // clamshell was opened
+  kIOPMClamshellOpened          = (1<<10), // clamshell was opened
+  kIOPMDWOverTemp               = (1<<11)  // DarkWake thermal limits exceeded.
 };
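[A hedged sketch of how a driver might observe the new DarkWake thermal message in its IOService::message override; MyDriver and its response policy are hypothetical.]

    IOReturn MyDriver::message(UInt32 type, IOService * provider, void * argument)
    {
        if (type == kIOPMMessageDarkWakeThermalEmergency) {
            // Root domain raised kIOPMDWOverTemp; the machine is expected
            // to re-sleep shortly, so quiesce any deferred work here.
        }
        return super::message(type, provider, argument);
    }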
 
 
diff --git a/iokit/IOKit/pwr_mgt/IOPMPrivate.h b/iokit/IOKit/pwr_mgt/IOPMPrivate.h
index 09fdb19e8..4828f38cd 100644
--- a/iokit/IOKit/pwr_mgt/IOPMPrivate.h
+++ b/iokit/IOKit/pwr_mgt/IOPMPrivate.h
@@ -43,28 +43,28 @@
 /* @constant kIOPMTimelineDictionaryKey
  * @abstract RootDomain key for dictionary describing Timeline's info
  */
-#define kIOPMTimelineDictionaryKey              "PMTimelineLogging"
+#define     kIOPMTimelineDictionaryKey                  "PMTimelineLogging"
 
 /* @constant kIOPMTimelineEnabledKey
  * @abstract Boolean value indicating whether the system is recording PM events.
  * @discussion Key may be found in the dictionary at IOPMrootDomain's property 
  * kIOPMTimelineDictionaryKey. uint32_t value; may be 0.
  */
-#define kIOPMTimelineEnabledKey                 "TimelineEnabled"
+#define     kIOPMTimelineEnabledKey                     "TimelineEnabled"
 
 /* @constant kIOMPTimelineSystemNumberTrackedKey
  * @abstract The maximum number of system power events the system may record.
  * @discussion Key may be found in the dictionary at IOPMrootDomain's property 
  * kIOPMTimelineDictionaryKey. uint32_t value; may be 0.
  */
-#define kIOPMTimelineSystemNumberTrackedKey     "TimelineSystemEventsTracked"
+#define     kIOPMTimelineSystemNumberTrackedKey         "TimelineSystemEventsTracked"
 
 /* @constant kIOPMTimelineSystemBufferSizeKey
  * @abstract Size in bytes  of buffer recording system PM events
  * @discussion Key may be found in the dictionary at IOPMrootDomain's property 
  * kIOPMTimelineDictionaryKey. uint32_t value; may be 0.
  */
-#define kIOPMTimelineSystemBufferSizeKey        "TimelineSystemBufferSize"
+#define     kIOPMTimelineSystemBufferSizeKey            "TimelineSystemBufferSize"
 
 
 
@@ -131,7 +131,8 @@ enum {
     kIOPMSleepReasonLowPower                    = 106,
     kIOPMSleepReasonThermalEmergency            = 107,
     kIOPMSleepReasonMaintenance                 = 108,
-    kIOPMSleepReasonSleepServiceExit            = 109
+    kIOPMSleepReasonSleepServiceExit            = 109,
+    kIOPMSleepReasonDarkWakeThermalEmergency    = 110
 };
 
 /*
@@ -145,6 +146,7 @@ enum {
 #define kIOPMLowPowerSleepKey                       "Low Power Sleep"
 #define kIOPMThermalEmergencySleepKey               "Thermal Emergency Sleep"
 #define kIOPMSleepServiceExitKey                    "Sleep Service Back to Sleep"
+#define kIOPMDarkWakeThermalEmergencyKey            "Dark Wake Thermal Emergency"
 
 
 enum {
@@ -618,162 +620,31 @@ enum {
 #define kIOPMSleepWakeFailureUUIDKey        "UUID"
 #define kIOPMSleepWakeFailureDateKey        "Date"
 
-/*****************************************************************************
- *
- * Root Domain private property keys
- *
- *****************************************************************************/
-
-/* kIOPMAutoPowerOffEnabledKey
- * Indicates if Auto Power Off is enabled.
- * It has a boolean value.
- *  true        == Auto Power Off is enabled
- *  false       == Auto Power Off is disabled
- *  not present == Auto Power Off is not supported on this hardware
+/******************************************************************************/
+/* System sleep policy
+ * Shared between PM root domain and platform driver.
  */
-#define kIOPMAutoPowerOffEnabledKey         "AutoPowerOff Enabled"
 
-/* kIOPMAutoPowerOffDelayKey
- * Key refers to a CFNumberRef that represents the delay in seconds before
- * entering the Auto Power Off state.  The property is not present if Auto
- * Power Off is unsupported.
- */
-#define kIOPMAutoPowerOffDelayKey           "AutoPowerOff Delay"
+// Platform specific property added by the platform driver.
+// An OSData that describes the system sleep policy.
+#define kIOPlatformSystemSleepPolicyKey     "IOPlatformSystemSleepPolicy"
 
-/*****************************************************************************
- *
- * System Sleep Policy
- *
- *****************************************************************************/
-
-#define kIOPMSystemSleepPolicySignature     0x54504c53
-#define kIOPMSystemSleepPolicyVersion       2
-
-/*!
- * @defined kIOPMSystemSleepTypeKey
- * @abstract Indicates the type of system sleep.
- * @discussion An OSNumber property of root domain that describes the type
- * of system sleep. This property is set after notifying priority sleep/wake
- * clients, but before informing interested drivers and shutting down power
- * plane drivers.
- */
-#define kIOPMSystemSleepTypeKey             "IOPMSystemSleepType"
+// Root domain property updated before platform sleep.
+// An OSData that describes the system sleep parameters.
+#define kIOPMSystemSleepParametersKey       "IOPMSystemSleepParameters"
 
-struct IOPMSystemSleepPolicyVariables
+struct IOPMSystemSleepParameters
 {
-    uint32_t    signature;                  // kIOPMSystemSleepPolicySignature
-    uint32_t    version;                    // kIOPMSystemSleepPolicyVersion
-
-    uint64_t    currentCapability;          // current system capability bits
-    uint64_t    highestCapability;          // highest system capability bits
-
-    uint64_t    sleepFactors;               // sleep factor bits
-    uint32_t    sleepReason;                // kIOPMSleepReason*
-    uint32_t    sleepPhase;                 // identify the sleep phase
-    uint32_t    hibernateMode;              // current hibernate mode
-
-    uint32_t    standbyDelay;               // standby delay in seconds
-    uint32_t    poweroffDelay;              // auto-poweroff delay in seconds
-
-    uint32_t    reserved[51];               // pad sizeof 256 bytes
-};
-
-enum {
-    kIOPMSleepPhase1 = 1,
-    kIOPMSleepPhase2
-};
-
-// Sleep Factor Mask / Bits
-enum {
-    kIOPMSleepFactorSleepTimerWake          = 0x00000001ULL,
-    kIOPMSleepFactorLidOpen                 = 0x00000002ULL,
-    kIOPMSleepFactorACPower                 = 0x00000004ULL,
-    kIOPMSleepFactorBatteryLow              = 0x00000008ULL,
-    kIOPMSleepFactorStandbyNoDelay          = 0x00000010ULL,
-    kIOPMSleepFactorStandbyForced           = 0x00000020ULL,
-    kIOPMSleepFactorStandbyDisabled         = 0x00000040ULL,
-    kIOPMSleepFactorUSBExternalDevice       = 0x00000080ULL,
-    kIOPMSleepFactorBluetoothHIDDevice      = 0x00000100ULL,
-    kIOPMSleepFactorExternalMediaMounted    = 0x00000200ULL,
-    kIOPMSleepFactorThunderboltDevice       = 0x00000400ULL,
-    kIOPMSleepFactorRTCAlarmScheduled       = 0x00000800ULL,
-    kIOPMSleepFactorMagicPacketWakeEnabled  = 0x00001000ULL,
-    kIOPMSleepFactorHibernateForced         = 0x00010000ULL,
-    kIOPMSleepFactorAutoPowerOffDisabled    = 0x00020000ULL,
-    kIOPMSleepFactorAutoPowerOffForced      = 0x00040000ULL
-};
-
-// System Sleep Types
-enum {
-    kIOPMSleepTypeInvalid                   = 0,
-    kIOPMSleepTypeAbortedSleep              = 1,
-    kIOPMSleepTypeNormalSleep               = 2,
-    kIOPMSleepTypeSafeSleep                 = 3,
-    kIOPMSleepTypeHibernate                 = 4,
-    kIOPMSleepTypeStandby                   = 5,
-    kIOPMSleepTypePowerOff                  = 6,
-    kIOPMSleepTypeLast                      = 7
-};
-
-// System Sleep Flags
-enum {
-    kIOPMSleepFlagDisableHibernateAbort     = 0x00000001,
-    kIOPMSleepFlagDisableUSBWakeEvents      = 0x00000002,
-    kIOPMSleepFlagDisableBatlowAssertion    = 0x00000004
+    uint32_t    version;
+    uint32_t    sleepFlags;
+    uint32_t    sleepTimer;
+    uint32_t    wakeEvents;
 };
 
-// System Wake Events
+// Sleep flags
 enum {
-    kIOPMWakeEventLidOpen                   = 0x00000001,
-    kIOPMWakeEventLidClose                  = 0x00000002,
-    kIOPMWakeEventACAttach                  = 0x00000004,
-    kIOPMWakeEventACDetach                  = 0x00000008,
-    kIOPMWakeEventPowerButton               = 0x00000100,
-    kIOPMWakeEventUserPME                   = 0x00000400,
-    kIOPMWakeEventSleepTimer                = 0x00000800,
-    kIOPMWakeEventBatteryLow                = 0x00001000,
-    kIOPMWakeEventDarkPME                   = 0x00002000
+    kIOPMSleepFlagHibernate         = 0x00000001,
+    kIOPMSleepFlagSleepTimerEnable  = 0x00000002
 };
 
-/*!
- * @defined kIOPMSystemSleepParametersKey
- * @abstract Sleep parameters describing the upcoming sleep
- * @discussion Root domain updates this OSData property before system sleep
- * to pass sleep parameters to the platform driver.  Some of the parameters
- * are based on the chosen entry in the system sleep policy table.
- */
-#define kIOPMSystemSleepParametersKey       "IOPMSystemSleepParameters"
-#define kIOPMSystemSleepParametersVersion   2
-
-struct IOPMSystemSleepParameters
-{
-    uint16_t    version;
-    uint16_t    reserved1;
-    uint32_t    sleepType;
-    uint32_t    sleepFlags;
-    uint32_t    ecWakeEvents;
-    uint32_t    ecWakeTimer;
-    uint32_t    ecPoweroffTimer;
-    uint32_t    reserved2[10];
-} __attribute__((packed));
-
-#ifdef KERNEL
-
-/*!
- * @defined kIOPMInstallSystemSleepPolicyHandlerKey
- * @abstract Name of the platform function to install a sleep policy handler.
- * @discussion Pass to IOPMrootDomain::callPlatformFunction(), with a pointer
- * to the C-function handler at param1, and an optional target at param2, to
- * register a sleep policy handler. Only a single sleep policy handler can
- * be installed.
- */
-#define kIOPMInstallSystemSleepPolicyHandlerKey	\
-        "IOPMInstallSystemSleepPolicyHandler"
-
-typedef IOReturn (*IOPMSystemSleepPolicyHandler)(
-        void * target, const IOPMSystemSleepPolicyVariables * vars,
-        IOPMSystemSleepParameters * params );
-
-#endif /* KERNEL */
-
 #endif /* ! _IOKIT_IOPMPRIVATE_H */
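[A minimal consumer sketch for the slimmed-down IOPMSystemSleepParameters above. Field semantics are inferred from the flag names, and the time unit of sleepTimer is an assumption.]

    static void apply_sleep_parameters(const struct IOPMSystemSleepParameters * p)
    {
        if (p->sleepFlags & kIOPMSleepFlagHibernate) {
            // A hibernate image will be written before power-off.
        }
        if (p->sleepFlags & kIOPMSleepFlagSleepTimerEnable) {
            // Arm a wake timer for p->sleepTimer (assumed seconds).
        }
    }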
diff --git a/iokit/IOKit/pwr_mgt/Makefile b/iokit/IOKit/pwr_mgt/Makefile
index b82357fe9..db62a3d24 100644
--- a/iokit/IOKit/pwr_mgt/Makefile
+++ b/iokit/IOKit/pwr_mgt/Makefile
@@ -15,7 +15,7 @@ NOT_EXPORT_HEADERS = \
 	IOPMinformee.h		\
 	IOPMinformeeList.h	\
 	IOPMlog.h		\
-	IOPMPagingPlexus.h
+	IOPMPrivate.h
 	
 INSTINC_SUBDIRS =
 INSTINC_SUBDIRS_I386 =
diff --git a/iokit/IOKit/pwr_mgt/RootDomain.h b/iokit/IOKit/pwr_mgt/RootDomain.h
index 351f7da64..9a514bdbc 100644
--- a/iokit/IOKit/pwr_mgt/RootDomain.h
+++ b/iokit/IOKit/pwr_mgt/RootDomain.h
@@ -30,13 +30,13 @@
 
 #include 
 #include 
-#include "IOKit/pwr_mgt/IOPMPrivate.h"
 #include  
 
 #ifdef XNU_KERNEL_PRIVATE
 struct AggressivesRecord;
 struct IOPMMessageFilterContext;
 struct IOPMActions;
+struct IOPMSystemSleepParameters;
 class PMSettingObject;
 class IOPMTimeline;
 class PMEventDetails;
@@ -311,7 +311,7 @@ public:
     @result On success, returns a new assertion of type IOPMDriverAssertionID *
 */
     IOReturn releasePMAssertion(IOPMDriverAssertionID releaseAssertion);
-
+        
 private:
     virtual IOReturn    changePowerStateTo( unsigned long ordinal );
     virtual IOReturn    changePowerStateToPriv( unsigned long ordinal );
@@ -381,7 +381,11 @@ public:
     void        handleQueueSleepWakeUUID(
                     OSObject *obj);
 
-    IOReturn    setMaintenanceWakeCalendar(const IOPMCalendarStruct * calendar );
+    void        handleSuspendPMNotificationClient(
+                    uint32_t pid, bool doSuspend);
+
+    IOReturn    setMaintenanceWakeCalendar(
+                    const IOPMCalendarStruct * calendar );
 
     // Handle callbacks from IOService::systemWillShutdown()
 	void        acknowledgeSystemWillShutdown( IOService * from );
@@ -406,6 +410,11 @@ public:
     bool        systemMessageFilter(
                     void * object, void * arg1, void * arg2, void * arg3 );
 
+    void        updatePreventIdleSleepList(
+                    IOService * service, bool addNotRemove );
+    void        updatePreventSystemSleepList(
+                    IOService * service, bool addNotRemove );
+
     void        publishPMSetting(
                     const OSSymbol * feature, uint32_t where, uint32_t * featureID );
 
@@ -430,6 +439,23 @@ public:
                                 uint32_t			delay_ms,
                                 int     			app_pid);
 
+
+/*! @function   suspendPMNotificationsForPID
+    @abstract   kernel process management calls this to disable sleep/wake notifications
+                when a process is suspended.
+    @param      pid the process ID
+    @param      doSuspend true suspends the notifications; false enables them
+*/
+    void        suspendPMNotificationsForPID( uint32_t pid, bool doSuspend);
+
+/*! @function   pmNotificationIsSuspended
+    @abstract   returns true if PM notifications have been suspended
+    @param      pid the process ID
+    @result     true if the process has been suspended
+*/
+    bool        pmNotificationIsSuspended( uint32_t pid );
+
+
 #if HIBERNATION
     bool        getHibernateSettings(
                     uint32_t *  hibernateMode,
@@ -463,7 +489,6 @@ private:
                                     IONotifier * notifier);
 
     IOService *             wrangler;
-    IOService *             wranglerConnection;
 
     IOLock                  *featuresDictLock;  // guards supportedFeatures
     IOPMPowerStateQueue     *pmPowerStateQueue;
@@ -492,7 +517,6 @@ private:
     OSArray                 *pmStatsAppResponses;
 
     bool                    uuidPublished;
-    PMStatsStruct           pmStats;
 
     // Pref: idle time before idle sleep
     unsigned long           sleepSlider;		
@@ -554,12 +578,12 @@ private:
 
     unsigned int            idleSleepTimerPending   :1;
     unsigned int            userDisabledAllSleep    :1;
-    unsigned int            childPreventSystemSleep :1;
     unsigned int            ignoreTellChangeDown    :1;
     unsigned int            wranglerAsleep          :1;
     unsigned int            wranglerTickled         :1;
     unsigned int            wranglerSleepIgnored    :1;
     unsigned int            graphicsSuppressed      :1;
+    unsigned int            darkWakeThermalAlarm    :1;
 
     unsigned int            capabilityLoss          :1;
     unsigned int            pciCantSleepFlag        :1;
@@ -573,6 +597,7 @@ private:
     unsigned int            darkWakePostTickle      :1;
     unsigned int            sleepTimerMaintenance   :1;
     unsigned int            lowBatteryCondition     :1;
+    unsigned int            darkWakeThermalEmergency:1;
     unsigned int            hibernateDisabled       :1;
     unsigned int            hibernateNoDefeat       :1;
     unsigned int            rejectWranglerTickle    :1;
@@ -606,10 +631,24 @@ private:
     IONotifier *            systemCapabilityNotifier;
 
     IOPMTimeline            *timeline;
+    
+    typedef struct {
+        uint32_t            pid;
+        uint32_t            refcount;
+    } PMNotifySuspendedStruct;
+    
+    uint32_t                pmSuspendedCapacity;    
+    uint32_t                pmSuspendedSize;
+    PMNotifySuspendedStruct *pmSuspendedPIDS;
+
+    OSSet *                 preventIdleSleepList;
+    OSSet *                 preventSystemSleepList;
+
+#if HIBERNATION
+    clock_sec_t             _standbyTimerResetSeconds;
+#endif
 
-    IOPMSystemSleepPolicyHandler    _sleepPolicyHandler;
-    void *                          _sleepPolicyTarget;
-    IOPMSystemSleepPolicyVariables *_sleepPolicyVars;
+    int         findSuspendedPID(uint32_t pid, uint32_t *outRefCount);
 
 	// IOPMrootDomain internal sleep call
     IOReturn    privateSleepSystem( uint32_t sleepReason );
@@ -665,7 +704,7 @@ private:
 
 #if HIBERNATION
     bool        getSleepOption( const char * key, uint32_t * option );
-    bool        evaluateSystemSleepPolicy( IOPMSystemSleepParameters * p, int phase );
+    bool        evaluateSystemSleepPolicy( IOPMSystemSleepParameters * p, int sleepPhase );
     void        evaluateSystemSleepPolicyEarly( void );
     void        evaluateSystemSleepPolicyFinal( void );
 #endif /* HIBERNATION */
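[A sketch of the calling pattern implied by the suspendPMNotificationsForPID declarations above; the suspend/resume call sites are hypothetical, while IOService::getPMRootDomain is an existing accessor.]

    IOPMrootDomain * root = IOService::getPMRootDomain();
    root->suspendPMNotificationsForPID(pid, true);         // process suspended
    // ... later, when the process is resumed ...
    if (root->pmNotificationIsSuspended(pid))
        root->suspendPMNotificationsForPID(pid, false);    // re-enable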
diff --git a/iokit/IOKit/x86_64/Makefile b/iokit/IOKit/x86_64/Makefile
new file mode 100644
index 000000000..3b4a79b4e
--- /dev/null
+++ b/iokit/IOKit/x86_64/Makefile
@@ -0,0 +1,33 @@
+export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
+export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
+export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
+export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
+
+IOKIT_FRAMEDIR = $(FRAMEDIR)/IOKit.framework/Versions/A
+export INCDIR = $(IOKIT_FRAMEDIR)/Headers
+export LCLDIR = $(IOKIT_FRAMEDIR)/PrivateHeaders
+
+include $(MakeInc_cmd)
+include $(MakeInc_def)
+
+MD_DIR = x86_64
+EXCLUDE_HEADERS = 
+
+INSTINC_SUBDIRS =
+INSTINC_SUBDIRS_X86_64 =
+
+EXPINC_SUBDIRS = ${INSTINC_SUBDIRS}
+EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64}
+
+ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h))
+HEADER_LIST = $(filter-out $(EXCLUDE_HEADERS), $(ALL_HEADERS))
+
+INSTALL_MD_LIST	= ${HEADER_LIST}
+INSTALL_MD_LCL_LIST = ""
+INSTALL_MD_DIR = $(MD_DIR)
+
+EXPORT_MD_LIST	= 
+EXPORT_MD_DIR = IOKit/$(MD_DIR)
+
+include $(MakeInc_rule)
+include $(MakeInc_dir)
diff --git a/iokit/Kernel/IOBufferMemoryDescriptor.cpp b/iokit/Kernel/IOBufferMemoryDescriptor.cpp
index 563059600..a56b469ee 100644
--- a/iokit/Kernel/IOBufferMemoryDescriptor.cpp
+++ b/iokit/Kernel/IOBufferMemoryDescriptor.cpp
@@ -99,6 +99,8 @@ bool IOBufferMemoryDescriptor::initWithPhysicalMask(
 	return (false);
     _ranges.v64->address = 0;
     _ranges.v64->length  = 0;
+	//  make sure super::free doesn't dealloc _ranges before super::init
+	_flags = kIOMemoryAsReference;
 
     // Grab IOMD bits from the Buffer MD options
     iomdOptions  |= (options & kIOBufferDescriptorMemoryFlags);
@@ -148,6 +150,10 @@ bool IOBufferMemoryDescriptor::initWithPhysicalMask(
 	    SET_MAP_MEM(MAP_MEM_COPYBACK, memEntryCacheMode);
 	    break;
 
+	case kIOMapCopybackInnerCache:
+	    SET_MAP_MEM(MAP_MEM_INNERWBACK, memEntryCacheMode);
+	    break;
+
 	case kIOMapDefaultCache:
 	default:
 	    SET_MAP_MEM(MAP_MEM_NOOP, memEntryCacheMode);
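[A hedged allocation sketch exercising the new cache mode; size and physical mask are placeholders, and the option is routed through the SET_MAP_MEM switch shown above.]

    IOBufferMemoryDescriptor * bmd =
        IOBufferMemoryDescriptor::inTaskWithPhysicalMask(
            kernel_task,
            kIODirectionInOut | kIOMapCopybackInnerCache, // -> MAP_MEM_INNERWBACK
            4096,                                         // placeholder size
            0);                                           // placeholder mask
    if (bmd) {
        // ... use the buffer ...
        bmd->release();
    }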
diff --git a/iokit/Kernel/IOCPU.cpp b/iokit/Kernel/IOCPU.cpp
index 5dd9ea416..eae15f97c 100644
--- a/iokit/Kernel/IOCPU.cpp
+++ b/iokit/Kernel/IOCPU.cpp
@@ -42,6 +42,7 @@ extern "C" {
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -50,13 +51,15 @@ extern "C" {
 #include 
 
 typedef kern_return_t (*iocpu_platform_action_t)(void * refcon0, void * refcon1, uint32_t priority,
-						 void * param1, void * param2, void * param3);
+						 void * param1, void * param2, void * param3,
+						 const char * name);
 
 struct iocpu_platform_action_entry
 {
     queue_chain_t                     link;
     iocpu_platform_action_t           action;
     int32_t	                      priority;
+    const char *		      name;
     void *	                      refcon0;
     void *			      refcon1;
     struct iocpu_platform_action_entry * alloc_list;
@@ -168,7 +171,7 @@ iocpu_run_platform_actions(queue_head_t * queue, uint32_t first_priority, uint32
 	if ((pri >= first_priority) && (pri <= last_priority))
 	{
 	    //kprintf("[%p]", next->action);
-	    ret = (*next->action)(next->refcon0, next->refcon1, pri, param1, param2, param3);
+	    ret = (*next->action)(next->refcon0, next->refcon1, pri, param1, param2, param3, next->name);
 	}
 	if (KERN_SUCCESS == result)
 	    result = ret;
@@ -194,13 +197,14 @@ IOCPURunPlatformActiveActions(void)
 
 static kern_return_t 
 IOServicePlatformAction(void * refcon0, void * refcon1, uint32_t priority,
-			  void * param1, void * param2, void * param3)
+			  void * param1, void * param2, void * param3,
+			  const char * service_name)
 {
     IOReturn	     ret;
     IOService *      service  = (IOService *)      refcon0;
     const OSSymbol * function = (const OSSymbol *) refcon1;
 
-    kprintf("%s -> %s\n", function->getCStringNoCopy(), service->getName());
+    kprintf("%s -> %s\n", function->getCStringNoCopy(), service_name);
 
     ret = service->callPlatformFunction(function, false, 
 					 (void *) priority, param1, param2, param3);
@@ -223,6 +227,7 @@ IOInstallServicePlatformAction(IOService * service,
 
     entry = IONew(iocpu_platform_action_entry_t, 1);
     entry->action = &IOServicePlatformAction;
+    entry->name = service->getName();
     priority = num->unsigned32BitValue();
     if (reverse)
 	entry->priority = -priority;
@@ -306,8 +311,9 @@ void IOCPUSleepKernel(void)
 
     kprintf("IOCPUSleepKernel\n");
 
-    OSIterator * iter;
-    IOService *  service;
+    IORegistryIterator * iter;
+    OSOrderedSet *       all;
+    IOService *          service;
 
     rootDomain->tracePoint( kIOPMTracePointSleepPlatformActions );
 
@@ -318,19 +324,28 @@ void IOCPUSleepKernel(void)
 					    kIORegistryIterateRecursively );
     if( iter)
     {
-	do
+	all = 0;
+	do 
 	{
-	    iter->reset();
-	    while((service = (IOService *) iter->getNextObject()))
+	    if (all)
+		all->release();
+	    all = iter->iterateAll();
+	}
+	while (!iter->isValid());
+	iter->release();
+
+	if (all)
+	{
+	    while((service = (IOService *) all->getFirstObject()))
 	    {
 		IOInstallServicePlatformAction(service, gIOPlatformSleepActionKey,   &gIOSleepActionQueue,		 false);
 		IOInstallServicePlatformAction(service, gIOPlatformWakeActionKey,    &gIOWakeActionQueue,		 true);
 		IOInstallServicePlatformAction(service, gIOPlatformQuiesceActionKey, iocpu_get_platform_quiesce_queue(), false);
 		IOInstallServicePlatformAction(service, gIOPlatformActiveActionKey,  iocpu_get_platform_active_queue(),  true);
+		all->removeObject(service);
 	    }
-	}
-	while( !service && !iter->isValid());
-	iter->release();
+	    all->release();
+	}	
     }
 
     iocpu_run_platform_actions(&gIOSleepActionQueue, 0, 0U-1,
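[For reference, a callback matching the widened iocpu_platform_action_t typedef above; the body is a hypothetical no-op. The added name argument lets an action log during sleep without dereferencing the service object.]

    static kern_return_t
    my_platform_action(void * refcon0, void * refcon1, uint32_t priority,
                       void * param1, void * param2, void * param3,
                       const char * name)
    {
        kprintf("platform action %s (pri %u)\n", name, priority);
        return KERN_SUCCESS;
    }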
diff --git a/iokit/Kernel/IOCatalogue.cpp b/iokit/Kernel/IOCatalogue.cpp
index c6de3b56c..ee193c027 100644
--- a/iokit/Kernel/IOCatalogue.cpp
+++ b/iokit/Kernel/IOCatalogue.cpp
@@ -49,6 +49,7 @@ extern "C" {
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -63,28 +64,15 @@ extern "C" {
 /*********************************************************************
 *********************************************************************/
 
-#define CATALOGTEST 0
-
 IOCatalogue    * gIOCatalogue;
 const OSSymbol * gIOClassKey;
 const OSSymbol * gIOProbeScoreKey;
 const OSSymbol * gIOModuleIdentifierKey;
-IOLock         * gIOCatalogLock;
+IORWLock         * gIOCatalogLock;
 
 #if PRAGMA_MARK
 #pragma mark Utility functions
 #endif
-/*********************************************************************
-* Add a new personality to the set if it has a unique IOResourceMatchKey value.
-* XXX -- svail: This should be optimized.
-* esb - There doesn't seem like any reason to do this - it causes problems
-* esb - when there are more than one loadable driver matching on the same provider class
-*********************************************************************/
-static void
-AddNewImports(OSOrderedSet * set, OSDictionary * dict)
-{
-    set->setObject(dict);
-}
 
 #if PRAGMA_MARK
 #pragma mark IOCatalogue class implementation
@@ -126,42 +114,68 @@ void IOCatalogue::initialize(void)
     array->release();
 }
 
+/*********************************************************************
+* Initialize the IOCatalog object.
+*********************************************************************/
+OSArray * IOCatalogue::arrayForPersonality(OSDictionary * dict)
+{
+    const OSSymbol * sym;
+
+    sym = OSDynamicCast(OSSymbol, dict->getObject(gIOProviderClassKey));
+    if (!sym) 	return (0);
+
+    return ((OSArray *) personalities->getObject(sym));
+}
+
+void IOCatalogue::addPersonality(OSDictionary * dict)
+{
+    const OSSymbol * sym;
+    OSArray * arr;
+
+    sym = OSDynamicCast(OSSymbol, dict->getObject(gIOProviderClassKey));
+    if (!sym) return;
+    arr = (OSArray *) personalities->getObject(sym);
+    if (arr) arr->setObject(dict);
+    else
+    {
+        arr = OSArray::withObjects((const OSObject **)&dict, 1, 2);
+        personalities->setObject(sym, arr);
+        arr->release();
+    }
+}
+
 /*********************************************************************
 * Initialize the IOCatalog object.
 *********************************************************************/
 bool IOCatalogue::init(OSArray * initArray)
 {
     OSDictionary         * dict;
-    
+    OSObject * obj;
+
     if ( !super::init() )
         return false;
 
     generation = 1;
     
-    array = initArray;
-    array->retain();
-    kernelTables = OSCollectionIterator::withCollection( array );
-
-    gIOCatalogLock = IOLockAlloc();
-
-    lock     = gIOCatalogLock;
-#if __i386__
-    kld_lock = NULL;
-#endif /* __i386__ */
-
-    kernelTables->reset();
-    while( (dict = (OSDictionary *) kernelTables->getNextObject())) {
-        OSKext::uniquePersonalityProperties(dict);
+    personalities = OSDictionary::withCapacity(32);
+    personalities->setOptions(OSCollection::kSort, OSCollection::kSort);
+    for (unsigned int idx = 0; (obj = initArray->getObject(idx)); idx++)
+    {
+	dict = OSDynamicCast(OSDictionary, obj);
+	if (!dict) continue;
+	OSKext::uniquePersonalityProperties(dict);
         if( 0 == dict->getObject( gIOClassKey ))
+        {
             IOLog("Missing or bad \"%s\" key\n",
                     gIOClassKey->getCStringNoCopy());
+	    continue;
+	}
+	dict->setObject("KernelConfigTable", kOSBooleanTrue);
+        addPersonality(dict);
     }
 
-#if CATALOGTEST
-    AbsoluteTime deadline;
-    clock_interval_to_deadline( 1000, kMillisecondScale );
-    thread_call_func_delayed( ping, this, deadline );
-#endif
+    gIOCatalogLock = IORWLockAlloc();
+    lock = gIOCatalogLock;
 
     return true;
 }
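[To restate the new layout: personalities maps a provider-class OSSymbol to an OSArray of personality dictionaries, so a lookup is a dictionary hop plus an array walk rather than a scan of one flat array. A sketch equivalent to arrayForPersonality, using the names from the code above:]

    const OSSymbol * sym = OSDynamicCast(OSSymbol,
                               dict->getObject(gIOProviderClassKey));
    OSArray * arr = sym ? (OSArray *) personalities->getObject(sym) : NULL;
    // arr, if non-NULL, holds every personality naming that provider class.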
@@ -172,63 +186,8 @@ bool IOCatalogue::init(OSArray * initArray)
 *********************************************************************/
 void IOCatalogue::free( void )
 {
-    if ( array )
-        array->release();
-
-    if ( kernelTables )
-        kernelTables->release();
-    
-    super::free();
-}
-
-/*********************************************************************
-*********************************************************************/
-#if CATALOGTEST
-
-static int hackLimit;
-enum { kDriversPerIter = 4 };
-
-void
-IOCatalogue::ping(thread_call_param_t arg, thread_call_param_t)
-{
-    IOCatalogue 	 * self = (IOCatalogue *) arg;
-    OSOrderedSet         * set;
-    OSDictionary         * table;
-    int	                   newLimit;
-
-    set = OSOrderedSet::withCapacity( 1 );
-
-    IOLockLock( &self->lock );
-
-    for( newLimit = 0; newLimit < kDriversPerIter; newLimit++) {
-	table = (OSDictionary *) self->array->getObject(
-					hackLimit + newLimit );
-	if( table) {
-	    set->setLastObject( table );
-
-	    OSSymbol * sym = (OSSymbol *) table->getObject(gIOClassKey);
-	    kprintf("enabling %s\n", sym->getCStringNoCopy());
-
-	} else {
-	    newLimit--;
-	    break;
-	}
-    }
-
-    IOService::catalogNewDrivers( set );
-
-    hackLimit += newLimit;
-    self->generation++;
-
-    IOLockUnlock( &self->lock );
-
-    if( kDriversPerIter == newLimit) {
-        AbsoluteTime deadline;
-        clock_interval_to_deadline(500, kMillisecondScale);
-        thread_call_func_delayed(ping, this, deadline);
-    }
+    panic("");
 }
-#endif
 
 /*********************************************************************
 *********************************************************************/
@@ -239,33 +198,32 @@ IOCatalogue::findDrivers(
 {
     OSDictionary         * nextTable;
     OSOrderedSet         * set;
-    OSString             * imports;
+    OSArray              * array;
+    const OSMetaClass    * meta;
+    unsigned int           idx;
 
     set = OSOrderedSet::withCapacity( 1, IOServiceOrdering,
                                       (void *)gIOProbeScoreKey );
     if( !set )
 	return( 0 );
 
-    IOLockLock(lock);
-    kernelTables->reset();
+    IORWLockRead(lock);
 
-#if CATALOGTEST
-    int hackIndex = 0;
-#endif
-    while( (nextTable = (OSDictionary *) kernelTables->getNextObject())) {
-#if CATALOGTEST
-	if( hackIndex++ > hackLimit)
-	    break;
-#endif
-        imports = OSDynamicCast( OSString,
-			nextTable->getObject( gIOProviderClassKey ));
-	if( imports && service->metaCast( imports ))
-            set->setObject( nextTable );
+    meta = service->getMetaClass();
+    while (meta)
+    {
+    	array = (OSArray *) personalities->getObject(meta->getClassNameSymbol());
+	if (array) for (idx = 0; (nextTable = (OSDictionary *) array->getObject(idx)); idx++)
+	{
+            set->setObject(nextTable);
+	}
+	if (meta == &IOService::gMetaClass) break;
+	meta = meta->getSuperClass();
     }
 
     *generationCount = getGenerationCount();
 
-    IOLockUnlock(lock);
+    IORWLockUnlock(lock);
 
     return( set );
 }
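[The catalogue lock is now a reader/writer lock: lookups such as findDrivers take it shared, while mutators take it exclusive, so concurrent matching no longer serializes. The general pattern with the existing IORWLock APIs:]

    IORWLockRead(lock);       // shared: many concurrent readers
    // ... walk personalities without mutating ...
    IORWLockUnlock(lock);

    IORWLockWrite(lock);      // exclusive: blocks readers and writers
    // ... add or remove personalities ...
    IORWLockUnlock(lock);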
@@ -278,27 +236,42 @@ IOCatalogue::findDrivers(
     OSDictionary * matching,
     SInt32 * generationCount)
 {
+    OSCollectionIterator * iter;
     OSDictionary         * dict;
     OSOrderedSet         * set;
+    OSArray              * array;
+    const OSSymbol       * key;
+    unsigned int           idx;
 
     OSKext::uniquePersonalityProperties(matching);
 
     set = OSOrderedSet::withCapacity( 1, IOServiceOrdering,
                                       (void *)gIOProbeScoreKey );
+    if (!set) return (0);
+    iter = OSCollectionIterator::withCollection(personalities);
+    if (!iter) 
+    {
+    	set->release();
+    	return (0);
+    }
 
-    IOLockLock(lock);
-    kernelTables->reset();
-    while ( (dict = (OSDictionary *) kernelTables->getNextObject()) ) {
-
-       /* This comparison must be done with only the keys in the
-        * "matching" dict to enable general searches.
-        */
-        if ( dict->isEqualTo(matching, matching) )
-            set->setObject(dict);
+    IORWLockRead(lock);
+    while ((key = (const OSSymbol *) iter->getNextObject()))
+    {
+        array = (OSArray *) personalities->getObject(key);
+        if (array) for (idx = 0; (dict = (OSDictionary *) array->getObject(idx)); idx++)
+        {
+	   /* This comparison must be done with only the keys in the
+	    * "matching" dict to enable general searches.
+	    */
+	    if ( dict->isEqualTo(matching, matching) )
+		set->setObject(dict);
+	}
     }
     *generationCount = getGenerationCount();
-    IOLockUnlock(lock);
+    IORWLockUnlock(lock);
 
+    iter->release();
     return set;
 }
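[A hedged sketch of driving this search from kernel code; the bundle identifier is illustrative. Per the comment above, only the keys present in the matching dictionary are compared.]

    SInt32 generation;
    OSDictionary * matching = OSDictionary::withCapacity(1);
    OSString * ident = OSString::withCString("com.example.driver"); // hypothetical
    matching->setObject("CFBundleIdentifier", ident);
    OSOrderedSet * found = gIOCatalogue->findDrivers(matching, &generation);
    // 'found' holds every personality whose listed keys matched.
    if (found) found->release();
    ident->release();
    matching->release();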
 
@@ -313,6 +286,7 @@ IOCatalogue::findDrivers(
 * xxx - userlib used to refuse to send personalities with IOKitDebug
 * xxx - during safe boot. That would be better implemented here.
 *********************************************************************/
+
 bool IOCatalogue::addDrivers(
     OSArray * drivers,
     bool doNubMatching)
@@ -322,7 +296,7 @@ bool IOCatalogue::addDrivers(
     OSOrderedSet         * set = NULL;        // must release
     OSObject             * object = NULL;       // do not release
     OSArray              * persons = NULL;    // do not release
-
+    
     persons = OSDynamicCast(OSArray, drivers);
     if (!persons) {
         goto finish;
@@ -343,7 +317,7 @@ bool IOCatalogue::addDrivers(
     */
     result = true;
 
-    IOLockLock(lock);
+    IORWLockWrite(lock);
     while ( (object = iter->getNextObject()) ) {
     
         // xxx Deleted OSBundleModuleDemand check; will handle in other ways for SL
@@ -359,43 +333,48 @@ bool IOCatalogue::addDrivers(
         }
 
         OSKext::uniquePersonalityProperties(personality);
-        
+
         // Add driver personality to catalogue.
-        count = array->getCount();
-        while (count--) {
-            OSDictionary * driver;
-            
-            // Be sure not to double up on personalities.
-            driver = (OSDictionary *)array->getObject(count);
-            
-           /* Unlike in other functions, this comparison must be exact!
-            * The catalogue must be able to contain personalities that
-            * are proper supersets of others.
-            * Do not compare just the properties present in one driver
-            * pesonality or the other.
-            */
-            if (personality->isEqualTo(driver)) {
-                break;
-            }
-        }
-        if (count >= 0) {
-            // its a dup
-            continue;
-        }
-        
-        result = array->setObject(personality);
-        if (!result) {
-            break;
+
+	OSArray * array = arrayForPersonality(personality);
+	if (!array) addPersonality(personality);
+	else
+	{       
+	    count = array->getCount();
+	    while (count--) {
+		OSDictionary * driver;
+		
+		// Be sure not to double up on personalities.
+		driver = (OSDictionary *)array->getObject(count);
+		
+	       /* Unlike in other functions, this comparison must be exact!
+		* The catalogue must be able to contain personalities that
+		* are proper supersets of others.
+		* Do not compare just the properties present in one driver
+		* personality or the other.
+		*/
+		if (personality->isEqualTo(driver)) {
+		    break;
+		}
+	    }
+	    if (count >= 0) {
+		// it's a dup
+		continue;
+	    }
+	    result = array->setObject(personality);
+	    if (!result) {
+		break;
+	    }
         }
-        
-        AddNewImports(set, personality);
+
+	set->setObject(personality);        
     }
     // Start device matching.
     if (result && doNubMatching && (set->getCount() > 0)) {
         IOService::catalogNewDrivers(set);
         generation++;
     }
-    IOLockUnlock(lock);
+    IORWLockUnlock(lock);
 
 finish:
     if (set)  set->release();
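
For reference, a hypothetical caller handing a single personality to
addDrivers(); the key globals (gIOProviderClassKey, gIOProbeScoreKey,
gIOCatalogue) are real IOKit symbols, the values are made up:

    OSDictionary * pers  = OSDictionary::withCapacity(2);
    OSString     * cls   = OSString::withCString("IOResources");
    OSNumber     * score = OSNumber::withNumber(1000ULL, 32);
    OSArray      * drivers;

    if (pers && cls && score)
    {
        pers->setObject(gIOProviderClassKey, cls);
        pers->setObject(gIOProbeScoreKey, score);
        drivers = OSArray::withObjects((const OSObject **) &pers, 1, 1);
        if (drivers)
        {
            gIOCatalogue->addDrivers(drivers, true /* doNubMatching */);
            drivers->release();
        }
    }
    if (cls)   cls->release();
    if (score) score->release();
    if (pers)  pers->release();
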
@@ -413,61 +392,53 @@ IOCatalogue::removeDrivers(
     OSDictionary * matching,
     bool doNubMatching)
 {
-    OSCollectionIterator * tables;
-    OSDictionary         * dict;
     OSOrderedSet         * set;
-    OSArray              * arrayCopy;
+    OSCollectionIterator * iter;
+    OSDictionary         * dict;
+    OSArray              * array;
+    const OSSymbol       * key;
+    unsigned int           idx;
 
     if ( !matching )
         return false;
-
+    
     set = OSOrderedSet::withCapacity(10,
                                      IOServiceOrdering,
                                      (void *)gIOProbeScoreKey);
     if ( !set )
         return false;
-
-    arrayCopy = OSArray::withCapacity(100);
-    if ( !arrayCopy ) {
-        set->release();
-        return false;
-    }
-    
-    tables = OSCollectionIterator::withCollection(arrayCopy);
-    arrayCopy->release();
-    if ( !tables ) {
-        set->release();
-        return false;
+    iter = OSCollectionIterator::withCollection(personalities);
+    if (!iter) 
+    {
+    	set->release();
+    	return (false);
     }
 
-    OSKext::uniquePersonalityProperties( matching );
-
-    IOLockLock(lock);
-    kernelTables->reset();
-    arrayCopy->merge(array);
-    array->flushCollection();
-    tables->reset();
-    while ( (dict = (OSDictionary *)tables->getNextObject()) ) {
-
-       /* This comparison must be done with only the keys in the
-        * "matching" dict to enable general searches.
-        */
-        if ( dict->isEqualTo(matching, matching) ) {
-            AddNewImports( set, dict );
-            continue;
+    IORWLockWrite(lock);
+    while ((key = (const OSSymbol *) iter->getNextObject()))
+    {
+        array = (OSArray *) personalities->getObject(key);
+        if (array) for (idx = 0; (dict = (OSDictionary *) array->getObject(idx)); idx++)
+        {
+           /* This comparison must be done with only the keys in the
+            * "matching" dict to enable general searches.
+            */
+            if ( dict->isEqualTo(matching, matching) ) {
+                set->setObject(dict);        
+                array->removeObject(idx);
+                idx--;
+            }
+        }
+        // Start device matching.
+        if ( doNubMatching && (set->getCount() > 0) ) {
+            IOService::catalogNewDrivers(set);
+            generation++;
         }
-
-        array->setObject(dict);
-    }
-    // Start device matching.
-    if ( doNubMatching && (set->getCount() > 0) ) {
-        IOService::catalogNewDrivers(set);
-        generation++;
     }
-    IOLockUnlock(lock);
-    
+    IORWLockUnlock(lock);
+   
     set->release();
-    tables->release();
+    iter->release();
     
     return true;
 }
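
Note that the isEqualTo(matching, matching) form compares only the keys present
in the matching dictionary, so a one-key dictionary acts as a wildcard filter.
A hypothetical caller removing everything published by one bundle
(kCFBundleIdentifierKey is the real libkern macro, the identifier is made up):

    OSDictionary * matching = OSDictionary::withCapacity(1);
    OSString     * ident    = OSString::withCString("com.example.driver");

    if (matching && ident)
    {
        matching->setObject(kCFBundleIdentifierKey, ident);
        gIOCatalogue->removeDrivers(matching, true /* doNubMatching */);
    }
    if (ident)    ident->release();
    if (matching) matching->release();
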
@@ -538,7 +509,8 @@ void IOCatalogue::moduleHasLoaded(OSString * moduleName)
     startMatching(dict);
     dict->release();
 
-    (void) OSKext::considerRebuildOfPrelinkedKernel(moduleName);
+    (void) OSKext::setDeferredLoadSucceeded();
+    (void) OSKext::considerRebuildOfPrelinkedKernel();
 }
 
 void IOCatalogue::moduleHasLoaded(const char * moduleName)
@@ -556,7 +528,7 @@ IOReturn IOCatalogue::unloadModule(OSString * moduleName) const
     return OSKext::removeKextWithIdentifier(moduleName->getCStringNoCopy());
 }
 
-static IOReturn _terminateDrivers(OSDictionary * matching)
+IOReturn IOCatalogue::_terminateDrivers(OSDictionary * matching)
 {
     OSDictionary         * dict;
     OSIterator           * iter;
@@ -601,41 +573,39 @@ static IOReturn _terminateDrivers(OSDictionary * matching)
     return ret;
 }
 
-static IOReturn _removeDrivers( OSArray * array, OSDictionary * matching )
+IOReturn IOCatalogue::_removeDrivers(OSDictionary * matching)
 {
-    OSCollectionIterator * tables;
-    OSDictionary         * dict;
-    OSArray              * arrayCopy;
     IOReturn               ret = kIOReturnSuccess;
+    OSCollectionIterator * iter;
+    OSDictionary         * dict;
+    OSArray              * array;
+    const OSSymbol       * key;
+    unsigned int           idx;
 
     // remove configs from catalog.
 
-    arrayCopy = OSArray::withCapacity(100);
-    if ( !arrayCopy )
-        return kIOReturnNoMemory;
-
-    tables = OSCollectionIterator::withCollection(arrayCopy);
-    arrayCopy->release();
-    if ( !tables )
-        return kIOReturnNoMemory;
-
-    arrayCopy->merge(array);
-    array->flushCollection();
-    tables->reset();
-    while ( (dict = (OSDictionary *)tables->getNextObject()) ) {
-
-       /* Remove from the catalogue's array any personalities
-        * that match the matching dictionary.
-        * This comparison must be done with only the keys in the
-        * "matching" dict to enable general matching.
-        */
-        if ( dict->isEqualTo(matching, matching) )
-            continue;
+    iter = OSCollectionIterator::withCollection(personalities);
+    if (!iter) return (kIOReturnNoMemory);
 
-        array->setObject(dict);
+    while ((key = (const OSSymbol *) iter->getNextObject()))
+    {
+        array = (OSArray *) personalities->getObject(key);
+        if (array) for (idx = 0; (dict = (OSDictionary *) array->getObject(idx)); idx++)
+        {
+
+	    /* Remove from the catalogue's array any personalities
+	     * that match the matching dictionary.
+	     * This comparison must be done with only the keys in the
+	     * "matching" dict to enable general matching.
+	     */
+            if (dict->isEqualTo(matching, matching))
+            {
+                array->removeObject(idx);
+                idx--;
+            }
+        }
     }
-
-    tables->release();
+    iter->release();
 
     return ret;
 }
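
Both removeDrivers() above and _removeDrivers() here use the same
erase-while-iterating idiom; a compact illustration of why the index steps back
after removeObject() (arr and shouldRemove() are stand-ins):

    // OSArray compacts on removal: the element after the removed slot
    // shifts down into the current index, so idx-- revisits that slot.
    OSObject * obj;
    for (unsigned int idx = 0; (obj = arr->getObject(idx)); idx++)
    {
        if (shouldRemove(obj))
        {
            arr->removeObject(idx);
            idx--;   // unsigned wrap at slot 0 is benign: idx++ returns it to 0
        }
    }
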
@@ -645,11 +615,10 @@ IOReturn IOCatalogue::terminateDrivers(OSDictionary * matching)
     IOReturn ret;
 
     ret = _terminateDrivers(matching);
-    IOLockLock(lock);
+    IORWLockWrite(lock);
     if (kIOReturnSuccess == ret)
-	ret = _removeDrivers(array, matching);
-    kernelTables->reset();
-    IOLockUnlock(lock);
+	ret = _removeDrivers(matching);
+    IORWLockUnlock(lock);
 
     return ret;
 }
@@ -694,18 +663,17 @@ IOReturn IOCatalogue::terminateDriversForModule(
     
    /* No goto between IOLock calls!
     */
-    IOLockLock(lock);
+    IORWLockWrite(lock);
     if (kIOReturnSuccess == ret) {
-        ret = _removeDrivers(array, dict);
+        ret = _removeDrivers(dict);
     }
-    kernelTables->reset();
 
     // Unload the module itself.
     if (unload && isLoaded && ret == kIOReturnSuccess) {
         ret = unloadModule(moduleName);
     }
 
-    IOLockUnlock(lock);
+    IORWLockUnlock(lock);
 
     dict->release();
 
@@ -732,8 +700,12 @@ IOReturn IOCatalogue::terminateDriversForModule(
 
 bool IOCatalogue::startMatching( OSDictionary * matching )
 {
+    OSCollectionIterator * iter;
     OSDictionary         * dict;
     OSOrderedSet         * set;
+    OSArray              * array;
+    const OSSymbol *       key;
+    unsigned int           idx;
     
     if ( !matching )
         return false;
@@ -743,26 +715,39 @@ bool IOCatalogue::startMatching( OSDictionary * matching )
     if ( !set )
         return false;
 
-    IOLockLock(lock);
-    kernelTables->reset();
+    iter = OSCollectionIterator::withCollection(personalities);
+    if (!iter) 
+    {
+    	set->release();
+        return false;
+    }
 
-    while ( (dict = (OSDictionary *)kernelTables->getNextObject()) ) {
+    IORWLockRead(lock);
 
-       /* This comparison must be done with only the keys in the
-        * "matching" dict to enable general matching.
-        */
-        if ( dict->isEqualTo(matching, matching) )
-            AddNewImports(set, dict);
+    while ((key = (const OSSymbol *) iter->getNextObject()))
+    {
+        array = (OSArray *) personalities->getObject(key);
+        if (array) for (idx = 0; (dict = (OSDictionary *) array->getObject(idx)); idx++)
+        {
+	   /* This comparison must be done with only the keys in the
+	    * "matching" dict to enable general matching.
+	    */
+            if (dict->isEqualTo(matching, matching)) {
+                set->setObject(dict);
+            }        
+        }
     }
+
     // Start device matching.
     if ( set->getCount() > 0 ) {
         IOService::catalogNewDrivers(set);
         generation++;
     }
 
-    IOLockUnlock(lock);
+    IORWLockUnlock(lock);
 
     set->release();
+    iter->release();
 
     return true;
 }
@@ -778,172 +763,100 @@ bool IOCatalogue::resetAndAddDrivers(OSArray * drivers, bool doNubMatching)
 {
     bool                   result              = false;
     OSArray              * newPersonalities    = NULL;  // do not release
-    OSCollectionIterator * newPIterator        = NULL;  // must release
+    OSCollectionIterator * iter                = NULL;  // must release
     OSOrderedSet         * matchSet            = NULL;  // must release
-    OSArray              * oldPersonalities    = NULL;  // must release
-    OSArray              * kernelPersonalities = NULL;  // must release
-    OSString             * errorString         = NULL;  // must release
-    OSObject             * object              = NULL;  // do not release
+    const OSSymbol       * key;
+    OSArray              * array;
     OSDictionary         * thisNewPersonality  = NULL;  // do not release
-    signed int             count, i;
-
-    extern const char    * gIOKernelConfigTables;
+    OSDictionary         * thisOldPersonality  = NULL;  // do not release
+    signed int             idx, newIdx;
 
     if (drivers) {
         newPersonalities = OSDynamicCast(OSArray, drivers);
         if (!newPersonalities) {
             goto finish;
         }
-
-        newPIterator = OSCollectionIterator::withCollection(newPersonalities);
-        if (!newPIterator) {
-            goto finish;
-        }
         
         matchSet = OSOrderedSet::withCapacity(10, IOServiceOrdering,
             (void *)gIOProbeScoreKey);
         if (!matchSet) {
             goto finish;
         }
-    }
-
-   /* Read personalities for the built-in kernel driver classes.
-    * We don't have many any more.
-    */
-    kernelPersonalities = OSDynamicCast(OSArray,
-        OSUnserialize(gIOKernelConfigTables, &errorString));
-    if (!kernelPersonalities && errorString) {
-        IOLog("KernelConfigTables syntax error: %s\n",
-            errorString->getCStringNoCopy());
-        goto finish;
-    }
-    
-   /* Now copy the current array of personalities so we can reuse them
-    * if the new list contains any duplicates. This saves on memory
-    * consumption.
-    */
-    oldPersonalities = OSDynamicCast(OSArray, array->copyCollection());
-    if (!oldPersonalities) {
-        goto finish;
+        iter = OSCollectionIterator::withCollection(personalities);
+        if (!iter) {
+            goto finish;
+        }
     }
 
     result = true;
 
     IOLog("Resetting IOCatalogue.\n");
-    
-   /* No goto finish from here to unlock.
-    */
-    IOLockLock(lock);
-    
-    array->flushCollection();
 
-   /* Add back the kernel personalities and remove them from the old
-    * array so we don't try to match on them again. Go forward through
-    * the arrays as this causes the least iteration since kernel personalities
-    * should always be first.
+   /* No goto finish from here to unlock.
     */
-    count = kernelPersonalities->getCount();
-    for (i = 0; i < count; i++) {
+    IORWLockWrite(lock);
     
-       /* Static cast here, as the data is coming from within the kernel image.
-        */
-        OSDictionary * thisNewPersonality = (OSDictionary *)
-            kernelPersonalities->getObject(i);
-        array->setObject(thisNewPersonality);
-
-        signed int oldPCount = oldPersonalities->getCount();
-        for (signed int oldPIndex = 0; oldPIndex < oldPCount; oldPIndex++) {
-            if (thisNewPersonality->isEqualTo(oldPersonalities->getObject(oldPIndex))) {
-                oldPersonalities->removeObject(oldPIndex);
-                break;
-            }
-        }
-    }
-
-   /* Now add the new set of personalities passed in, using existing
-    * copies if we had them in kernel memory already.
-    */
-    if (newPIterator) {
-        OSDictionary * thisOldPersonality = NULL;  // do not release
-        
-        while ( (object = newPIterator->getNextObject()) ) {
-
-            thisNewPersonality = OSDynamicCast(OSDictionary, object);
-            if (!thisNewPersonality) {
-                IOLog("IOCatalogue::resetAndAddDrivers() encountered non-dictionary; bailing.\n");
-            result = false;
-            break;
-            }
-
-           /* Convert common OSString property values to OSSymbols.
-            */
-            OSKext::uniquePersonalityProperties(thisNewPersonality);
-            
-           /* Add driver personality to catalogue, but if we had a copy already
-            * use that instead so we don't have multiple copies from OSKext instances.
+    while ((key = (const OSSymbol *) iter->getNextObject()))
+    {
+        array = (OSArray *) personalities->getObject(key);
+        if (!array) continue;
+        for (idx = 0; (thisOldPersonality = (OSDictionary *) array->getObject(idx)); idx++)
+        {
+            if (thisOldPersonality->getObject("KernelConfigTable")) continue;
+            if (newPersonalities) for (newIdx = 0; 
+                (thisNewPersonality = (OSDictionary *) newPersonalities->getObject(newIdx)); 
+                newIdx++)
+            {
+               /* Unlike in other functions, this comparison must be exact!
+                * The catalogue must be able to contain personalities that
+                * are proper supersets of others.
+                * Do not compare just the properties present in one driver
+                * personality or the other.
+                */
-            count = oldPersonalities->getCount();
-            thisOldPersonality = NULL;
-            while (count--) {
-                
-                thisOldPersonality = (OSDictionary *)oldPersonalities->getObject(count);
-                
-               /* Unlike in other functions, this comparison must be exact!
-                * The catalogue must be able to contain personalities that
-                * are proper supersets of others.
-                * Do not compare just the properties present in one driver
-                * pesonality or the other.
-                */
-                if (thisNewPersonality->isEqualTo(thisOldPersonality)) {
+                if (thisNewPersonality->isEqualTo(thisOldPersonality))  
                     break;
-                }
             }
-
-           /* If we found a dup, add the *original* back to the catalogue,
-            * remove it from our bookkeeping list, and continue.
-            * Don't worry about matching on personalities we already had.
-            */
-            if (count >= 0) {
-                array->setObject(thisOldPersonality);
-                oldPersonalities->removeObject(count);
-                continue;
+            if (thisNewPersonality)
+            {
+                // dup, ignore
+                newPersonalities->removeObject(newIdx);
+            }
+            else
+            {
+                // not in new set - remove
+                // only remove dictionary if this module is not loaded - 9953845
+                if ( isModuleLoaded(thisOldPersonality) == false ) 
+                {
+                    if (matchSet)  matchSet->setObject(thisOldPersonality);
+                    array->removeObject(idx);
+                    idx--;
+                }
             }
-
-           /* Otherwise add the new personality and mark it for matching.
-            */
-            array->setObject(thisNewPersonality);
-            AddNewImports(matchSet, thisNewPersonality);                
-        }
-
-       /*****
-        * Now, go through remaining old personalities, which have effectively
-        * been removed, and add them to the match set as necessary.
-        */
-        count = oldPersonalities->getCount();
-        while (count--) {
-        
-           /* Static cast here is ok as these dictionaries were already in the catalogue.
-            */
-            thisOldPersonality = (OSDictionary *)oldPersonalities->getObject(count);
-            AddNewImports(matchSet, thisOldPersonality);
         }
+    }
 
-       /* Finally, start device matching on all new & removed personalities.
-        */
-        if (result && doNubMatching && (matchSet->getCount() > 0)) {
-            IOService::catalogNewDrivers(matchSet);
-            generation++;
-        }
+    // add new
+    for (newIdx = 0;
+         (thisNewPersonality = (OSDictionary *) newPersonalities->getObject(newIdx));
+         newIdx++)
+    {
+        OSKext::uniquePersonalityProperties(thisNewPersonality);
+        addPersonality(thisNewPersonality);
+        matchSet->setObject(thisNewPersonality);
+    }
+
+   /* Finally, start device matching on all new & removed personalities.
+    */
+    if (result && doNubMatching && (matchSet->getCount() > 0)) {
+        IOService::catalogNewDrivers(matchSet);
+        generation++;
     }
 
-    IOLockUnlock(lock);
+    IORWLockUnlock(lock);
 
 finish:
-    if (newPIterator) newPIterator->release();
     if (matchSet) matchSet->release();
-    if (oldPersonalities) oldPersonalities->release();
-    if (kernelPersonalities) kernelPersonalities->release();
-    if (errorString) errorString->release();
+    if (iter)     iter->release();
 
     return result;
 }
@@ -963,8 +876,7 @@ bool IOCatalogue::serializeData(IOOptionBits kind, OSSerialize * s) const
     switch ( kind )
     {
         case kIOCatalogGetContents:
-            if (!array->serialize(s))
-                kr = kIOReturnNoMemory;
+            kr = KERN_NOT_SUPPORTED;
             break;
 
         case kIOCatalogGetModuleDemandList:
@@ -987,7 +899,6 @@ bool IOCatalogue::serializeData(IOOptionBits kind, OSSerialize * s) const
     return kr;
 }
 
-
 #if PRAGMA_MARK
 #pragma mark Obsolete Kext Loading Stuff
 #endif
diff --git a/iokit/Kernel/IOCommandGate.cpp b/iokit/Kernel/IOCommandGate.cpp
index 29ecd859e..9b19d70ee 100644
--- a/iokit/Kernel/IOCommandGate.cpp
+++ b/iokit/Kernel/IOCommandGate.cpp
@@ -182,7 +182,7 @@ IOReturn IOCommandGate::runAction(Action inAction,
 	
 	if (trace)
 		IOTimeStampStartConstant(IODBG_CMDQ(IOCMDQ_ACTION),
-								 (uintptr_t) inAction, (uintptr_t) owner);
+					 VM_KERNEL_UNSLIDE(inAction), (uintptr_t) owner);
 	
     IOStatisticsActionCall();
 	
@@ -191,7 +191,7 @@ IOReturn IOCommandGate::runAction(Action inAction,
 	
 	if (trace)
 		IOTimeStampEndConstant(IODBG_CMDQ(IOCMDQ_ACTION),
-							   (uintptr_t) inAction, (uintptr_t) owner);
+				       VM_KERNEL_UNSLIDE(inAction), (uintptr_t) owner);
     
     openGate();
 	
@@ -220,7 +220,7 @@ IOReturn IOCommandGate::attemptAction(Action inAction,
 		
         if (trace)
             IOTimeStampStartConstant(IODBG_CMDQ(IOCMDQ_ACTION),
-									 (uintptr_t) inAction, (uintptr_t) owner);
+				     VM_KERNEL_UNSLIDE(inAction), (uintptr_t) owner);
         
         IOStatisticsActionCall();
         
@@ -228,7 +228,7 @@ IOReturn IOCommandGate::attemptAction(Action inAction,
 		
         if (trace)
             IOTimeStampEndConstant(IODBG_CMDQ(IOCMDQ_ACTION),
-								   (uintptr_t) inAction, (uintptr_t) owner);
+				   VM_KERNEL_UNSLIDE(inAction), (uintptr_t) owner);
     }
 
     openGate();
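
The tracepoint changes in this file (and in the interrupt event sources further
down) stop logging raw, slid kernel pointers: VM_KERNEL_UNSLIDE normalizes a
kernel-text address back to its static link-time value so trace consumers do
not learn the kASLR slide. Roughly, as a simplified restatement rather than the
verbatim macro from mach/vm_param.h:

    // If the pointer lies in the slid kernel range, subtract the boot-time
    // slide; otherwise pass the value through unchanged.
    #define VM_KERNEL_UNSLIDE_SKETCH(_v)                      \
            (VM_KERNEL_IS_SLID(_v)                            \
                ? (vm_offset_t)(_v) - vm_kernel_slide         \
                : (vm_offset_t)(_v))
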
diff --git a/iokit/Kernel/IODMACommand.cpp b/iokit/Kernel/IODMACommand.cpp
index b95ee921d..dadc4dc35 100644
--- a/iokit/Kernel/IODMACommand.cpp
+++ b/iokit/Kernel/IODMACommand.cpp
@@ -147,7 +147,7 @@ IODMACommand::initWithSpecification(SegmentFunction outSegFunc,
 				    IOMapper       *mapper,
 				    void           *refCon)
 {
-    if (!super::init() || !outSegFunc || !numAddressBits)
+    if (!super::init() || !outSegFunc)
         return false;
 
     bool is32Bit = (OutputHost32   == outSegFunc || OutputBig32 == outSegFunc
@@ -502,7 +502,7 @@ IODMACommand::walkAll(UInt8 op)
 		}
 		else
 		{
-		    DEBG("IODMACommand !iovmAlloc");
+		    DEBG("IODMACommand !alloc IOBMD");
 		    return (kIOReturnNoResources);
 		}
 	    }
@@ -513,6 +513,11 @@ IODMACommand::walkAll(UInt8 op)
 	    state->fLocalMapperPageCount = atop_64(round_page(
 	    	    state->fPreparedLength + ((state->fPreparedOffset + fMDSummary.fPageAlign) & page_mask)));
 	    state->fLocalMapperPageAlloc = fMapper->iovmAllocDMACommand(this, state->fLocalMapperPageCount);
+            if (!state->fLocalMapperPageAlloc)
+            {
+                DEBG("IODMACommand !iovmAlloc");
+                return (kIOReturnNoResources);
+            }
 	    state->fMapContig = true;
 	}
     }
@@ -610,7 +615,7 @@ IODMACommand::prepareWithSpecification(SegmentFunction	outSegFunc,
     if (fActive)
         return kIOReturnNotPermitted;
 
-    if (!outSegFunc || !numAddressBits)
+    if (!outSegFunc)
         return kIOReturnBadArgument;
 
     bool is32Bit = (OutputHost32   == outSegFunc || OutputBig32 == outSegFunc
@@ -1143,7 +1148,7 @@ IODMACommand::clientOutputSegment(
     SegmentFunction segmentFunction = (SegmentFunction) reference;
     IOReturn ret = kIOReturnSuccess;
 
-    if ((target->fNumAddressBits < 64) 
+    if (target->fNumAddressBits && (target->fNumAddressBits < 64) 
 	&& ((segment.fIOVMAddr + segment.fLength - 1) >> target->fNumAddressBits)
 	&& (target->reserved->fLocalMapperPageAlloc || !target->reserved->fLocalMapper))
     {
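
With this hunk, fNumAddressBits == 0 now means "no client addressing limit"
rather than being rejected up front (see the initWithSpecification and
prepareWithSpecification hunks above). The range test itself is plain bit
arithmetic; a worked instance with a 32-bit limit and illustrative values:

    uint64_t addr = 0xFFFF0000ULL;   // segment.fIOVMAddr
    uint64_t len  = 0x20000ULL;      // segment.fLength
    uint32_t bits = 32;              // fNumAddressBits

    // The last byte is 0x10000FFFF; shifting right by 32 leaves a nonzero
    // value, so the segment does not fit in 32 bits and must be bounced.
    bool tooBig = ((addr + len - 1) >> bits) != 0;   // true
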
diff --git a/iokit/Kernel/IODataQueue.cpp b/iokit/Kernel/IODataQueue.cpp
index 1001ebeff..95988aaf4 100644
--- a/iokit/Kernel/IODataQueue.cpp
+++ b/iokit/Kernel/IODataQueue.cpp
@@ -73,11 +73,19 @@ IODataQueue *IODataQueue::withEntries(UInt32 numEntries, UInt32 entrySize)
 
 Boolean IODataQueue::initWithCapacity(UInt32 size)
 {
+    vm_size_t allocSize = 0;
+
     if (!super::init()) {
         return false;
     }
 
-    dataQueue = (IODataQueueMemory *)IOMallocAligned(round_page(size + DATA_QUEUE_MEMORY_HEADER_SIZE), PAGE_SIZE);
+    allocSize = round_page(size + DATA_QUEUE_MEMORY_HEADER_SIZE);
+
+    if (allocSize < size) {
+        return false;
+    }
+
+    dataQueue = (IODataQueueMemory *)IOMallocAligned(allocSize, PAGE_SIZE);
     if (dataQueue == 0) {
         return false;
     }
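
The added allocSize < size test guards round_page() against integer
wrap-around: for a size within a page of the type's maximum, adding
DATA_QUEUE_MEMORY_HEADER_SIZE (or the subsequent rounding) wraps to a small
number and the queue would be silently under-allocated. The same pattern in
isolation, as a user-space analogue with illustrative constants:

    #include <cstdint>

    static const uint64_t kPageMask = 0xFFFULL;  // 4 KiB pages, for the example
    static const uint64_t kHeader   = 16ULL;     // stand-in for the queue header

    static bool checkedAllocSize(uint64_t size, uint64_t * out)
    {
        uint64_t rounded = (size + kHeader + kPageMask) & ~kPageMask;
        if (rounded < size) return false;        // wrapped past the maximum
        *out = rounded;
        return true;
    }
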
diff --git a/iokit/Kernel/IODeviceTreeSupport.cpp b/iokit/Kernel/IODeviceTreeSupport.cpp
index 8de463efd..4ee53e566 100644
--- a/iokit/Kernel/IODeviceTreeSupport.cpp
+++ b/iokit/Kernel/IODeviceTreeSupport.cpp
@@ -921,6 +921,7 @@ void IODTSetResolving( IORegistryEntry * 	regEntry,
     if( !prop)
         return;
 
+    prop->setSerializable(false);
     regEntry->setProperty( gIODTPersistKey, prop);
     prop->release();
     return;
@@ -928,8 +929,8 @@ void IODTSetResolving( IORegistryEntry * 	regEntry,
 
 static SInt32 DefaultCompare( UInt32 cellCount, UInt32 left[], UInt32 right[] )
 {
-    cellCount--;
-    return( left[ cellCount ] - right[ cellCount ] );
+	cellCount--;
+	return( left[ cellCount ] - right[ cellCount ] ); 
 }
 
 void IODTGetCellCounts( IORegistryEntry * regEntry,
@@ -959,14 +960,15 @@ bool IODTResolveAddressCell( IORegistryEntry * regEntry,
     // cells in addresses below regEntry
     UInt32		childSizeCells, childAddressCells;
     UInt32		childCells;
-    UInt32		cell[ 5 ], offset = 0, length;
-    UInt32		endCell[ 5 ];
+    UInt32		cell[ 8 ], length;
+    UInt64		offset = 0;
+    UInt32		endCell[ 8 ];
     UInt32		*range;
     UInt32		*lookRange;
     UInt32		*startRange;
     UInt32		*endRanges;
     bool		ok = true;
-    SInt32		diff, diff2, endDiff;
+    SInt64		diff, diff2, endDiff;
 
     IODTPersistent	*persist;
     IODTCompareAddressCellFunc	compare;
@@ -974,10 +976,13 @@ bool IODTResolveAddressCell( IORegistryEntry * regEntry,
     IODTGetCellCounts( regEntry, &childSizeCells, &childAddressCells );
     childCells = childAddressCells + childSizeCells;
 
+    if (childCells > sizeof(cell)/sizeof(cell[0]))
+        panic("IODTResolveAddressCell: Invalid device tree (%u,%u)", (uint32_t)childAddressCells, (uint32_t)childSizeCells);
+
     bcopy( cellsIn, cell, sizeof(UInt32) * childCells );
     if( childSizeCells > 1)
-        *len = IOPhysical32( cellsIn[ childAddressCells ],
-                             cellsIn[ childAddressCells + 1 ] );
+        *len = IOPhysical32( cellsIn[ childAddressCells + 1],
+                             cellsIn[ childAddressCells] );
     else
         *len = IOPhysical32( 0, cellsIn[ childAddressCells ] );
 
@@ -985,8 +990,13 @@ bool IODTResolveAddressCell( IORegistryEntry * regEntry,
     {
 	prop = OSDynamicCast( OSData, regEntry->getProperty( gIODTRangeKey ));
 	if( 0 == prop) {
-	    /* end of the road */
-	    *phys = IOPhysical32( 0,  cell[ childAddressCells - 1 ] + offset);
+            /* end of the road */
+	    if (childAddressCells == 2)  {
+                *phys = IOPhysical32( cell[ childAddressCells - 1 ], cell [ childAddressCells - 2 ]);
+	    } else  {
+	        *phys = IOPhysical32( 0, cell[ childAddressCells - 1 ]);
+	    }
+            *phys += offset;
 	    break;
 	}
 
@@ -1003,8 +1013,11 @@ bool IODTResolveAddressCell( IORegistryEntry * regEntry,
 	    if( prop) {
 		persist = (IODTPersistent *) prop->getBytesNoCopy();
 		compare = persist->compareFunc;
-	    } else
+	    } else if (addressCells == childAddressCells) {
 		compare = DefaultCompare;
+	    } else {
+	 	panic("There is no mixed comparison function yet...");
+	    }
 
 	    for( ok = false;
 		 range < endRanges;
@@ -1013,8 +1026,21 @@ bool IODTResolveAddressCell( IORegistryEntry * regEntry,
 		// is cell start within range?
 		diff = (*compare)( childAddressCells, cell, range );
 
+        if (childAddressCells > sizeof(endCell)/sizeof(endCell[0]))
+            panic("IODTResolveAddressCell: Invalid device tree (%u)", (uint32_t)childAddressCells);
+
 		bcopy(range, endCell, childAddressCells * sizeof(UInt32));
-		endCell[childAddressCells - 1] += range[childCells + addressCells - 1];
+
+		if (childAddressCells == 2) {
+			uint64_t sum = endCell[childAddressCells - 2] + IOPhysical32(range[childCells + addressCells - 1], range[childCells + addressCells - 2]);
+			endCell[childAddressCells - 2] = (uint32_t)(sum & 0x00000000FFFFFFFFULL);
+			if (sum > UINT32_MAX) {
+				endCell[childAddressCells - 1] += (uint32_t)((sum & 0xFFFFFFFF00000000ULL) >> 32);
+			}
+		} else {
+			endCell[childAddressCells - 1] += range[childCells + addressCells - 1];
+		}
+
 		diff2 = (*compare)( childAddressCells, cell, endCell );
 
 		if ((diff < 0) || (diff2 >= 0))
@@ -1025,7 +1051,17 @@ bool IODTResolveAddressCell( IORegistryEntry * regEntry,
 		{
 		    // search for cell end
 		    bcopy(cell, endCell, childAddressCells * sizeof(UInt32));
-		    endCell[childAddressCells - 1] += cell[childCells - 1] - 1;
+
+		    if (childSizeCells == 2) {
+			uint64_t sum;
+                        sum = endCell[childAddressCells - 2] + IOPhysical32(cell[childCells - 1], cell[childCells - 2]) - 1;
+			endCell[childAddressCells - 2] = (uint32_t)(sum & 0x00000000FFFFFFFFULL);
+			if (sum > UINT32_MAX) {
+				endCell[childAddressCells - 1] += (uint32_t)((sum & 0xFFFFFFFF00000000ULL) >> 32);
+			}
+		    } else {
+                        endCell[childAddressCells - 1] += cell[childCells - 1] - 1;
+		    }
 		    lookRange = startRange;
 		    for( ;
 			 lookRange < endRanges;
@@ -1049,6 +1085,9 @@ bool IODTResolveAddressCell( IORegistryEntry * regEntry,
 		break;
 	    }
 
+        if (addressCells + sizeCells > sizeof(cell)/sizeof(cell[0]))
+            panic("IODTResolveAddressCell: Invalid device tree (%u, %u)", (uint32_t)addressCells, (uint32_t)sizeCells);
+
 	    // Get the physical start of the range from our parent
 	    bcopy( range + childAddressCells, cell, sizeof(UInt32) * addressCells );
 	    bzero( cell + addressCells, sizeof(UInt32) * sizeCells );
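
The widened offset (UInt64) and the two-cell branches above implement 64-bit
address math over pairs of 32-bit device-tree cells, with IOPhysical32(hi, lo)
packing cell[n-1] as the high word and cell[n-2] as the low word. The carry
step in isolation (illustrative helper, mirroring the hunk's arithmetic):

    // Add a 64-bit value onto a two-cell quantity, propagating the carry
    // by hand; cell[0] is the low word, cell[1] the high word.
    static void addToTwoCells(uint32_t cell[2], uint64_t value)
    {
        uint64_t sum = (uint64_t) cell[0] + value;
        cell[0]  = (uint32_t) sum;            // low 32 bits
        cell[1] += (uint32_t) (sum >> 32);    // carry into the high cell
    }
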
diff --git a/iokit/Kernel/IOFilterInterruptEventSource.cpp b/iokit/Kernel/IOFilterInterruptEventSource.cpp
index 944e84ced..6ecc33bfd 100644
--- a/iokit/Kernel/IOFilterInterruptEventSource.cpp
+++ b/iokit/Kernel/IOFilterInterruptEventSource.cpp
@@ -157,14 +157,14 @@ void IOFilterInterruptEventSource::normalInterruptOccurred
 	
 	if (trace)
 		IOTimeStampStartConstant(IODBG_INTES(IOINTES_FILTER),
-								 (uintptr_t) filterAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
+					 VM_KERNEL_UNSLIDE(filterAction), (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
     
     // Call the filter.
     filterRes = (*filterAction)(owner, this);
 	
 	if (trace)
 		IOTimeStampEndConstant(IODBG_INTES(IOINTES_FILTER),
-							   (uintptr_t) filterAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
+				       VM_KERNEL_UNSLIDE(filterAction), (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
 	
     if (filterRes)
         signalInterrupt();
@@ -178,14 +178,14 @@ void IOFilterInterruptEventSource::disableInterruptOccurred
 	
 	if (trace)
 		IOTimeStampStartConstant(IODBG_INTES(IOINTES_FILTER),
-								 (uintptr_t) filterAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
+					 VM_KERNEL_UNSLIDE(filterAction), (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
     
     // Call the filter.
     filterRes = (*filterAction)(owner, this);
 	
 	if (trace)
 		IOTimeStampEndConstant(IODBG_INTES(IOINTES_FILTER),
-							   (uintptr_t) filterAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
+				       VM_KERNEL_UNSLIDE(filterAction), (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
 	
     if (filterRes) {
         prov->disableInterrupt(source);	/* disable the interrupt */
diff --git a/iokit/Kernel/IOHibernateIO.cpp b/iokit/Kernel/IOHibernateIO.cpp
index 002055ff1..37c0bc7ea 100644
--- a/iokit/Kernel/IOHibernateIO.cpp
+++ b/iokit/Kernel/IOHibernateIO.cpp
@@ -58,12 +58,12 @@ Sleep:
   by hibernate_page_list_setall(), avoiding having to find arch dependent low level bits.
   The image header and block list are written. The header includes the second file extent so
   only the header block is needed to read the file, regardless of filesystem.
-  The kernel section "__HIB" is written uncompressed to the image. This section of code and data 
+  The kernel segment "__HIB" is written uncompressed to the image. This segment of code and data 
   (only) is used to decompress the image during wake/boot.
   Some additional pages are removed from the bitmaps - the buffers used for hibernation.
   The bitmaps are written to the image.
   More areas are removed from the bitmaps (after they have been written to the image) - the 
-  section "__HIB" pages and interrupt stack.
+  segment "__HIB" pages and interrupt stack.
   Each wired page is compressed and written and then each non-wired page. Compression and 
   disk writes are in parallel.
   The image header is written to the start of the file and the polling driver closed.
@@ -152,7 +152,7 @@ to restrict I/O ops.
 #include "IOPMPowerStateQueue.h"
 #include 
 #include 
-#include 
+#include 
 
 #include 
 #include 
@@ -196,6 +196,8 @@ static OSData *	                gIOHibernateBootNextData;
 static OSObject *		gIOHibernateBootNextSave;
 #endif
 
+static IOLock *                           gFSLock;
+static uint32_t                           gFSState;
 static IOPolledFileIOVars	          gFileVars;
 static IOHibernateVars			  gIOHibernateVars;
 static struct kern_direct_file_io_ref_t * gIOHibernateFileRef;
@@ -203,6 +205,16 @@ static hibernate_cryptvars_t 		  gIOHibernateCryptWakeContext;
 static hibernate_graphics_t  		  _hibernateGraphics;
 static hibernate_graphics_t * 		  gIOHibernateGraphicsInfo = &_hibernateGraphics;
 
+enum 
+{
+    kFSIdle     = 0,
+    kFSOpening  = 2,
+    kFSOpened   = 3,
+    kFSTimedOut = 4,
+};
+
+static IOReturn IOHibernateDone(IOHibernateVars * vars);
+
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 enum { kXPRamAudioVolume = 8 };
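
gFSLock and the kFS* values form a small state machine that serializes the
long-running hibernate-file open against system sleep: IOHibernateSystemSleep()
moves kFSIdle to kFSOpening, IOPolledFileOpen() later checks that it is still
kFSOpening, and IOHibernateIOKitSleep() (added further down) can force
kFSTimedOut if sleep progresses first. A condensed sketch of the handshake,
with error handling elided:

    IOLockLock(gFSLock);
    if (kFSIdle != gFSState) { IOLockUnlock(gFSLock); return kIOReturnBusy; }
    gFSState = kFSOpening;
    IOLockUnlock(gFSLock);

    // ... the slow kern_open_file_for_direct_io() runs with the lock dropped ...

    IOLockLock(gFSLock);
    if (kFSOpening == gFSState) gFSState = kFSOpened;  // open won the race
    else                        gFSState = kFSIdle;    // timed out; tear down
    IOLockUnlock(gFSLock);
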
@@ -594,7 +606,7 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer,
 			    IOPolledFileIOVars ** fileVars, OSData ** fileExtents,
 			    OSData ** imagePath, uint8_t * volumeCryptKey)
 {
-    IOReturn			err = kIOReturnError;
+    IOReturn			err = kIOReturnSuccess;
     IOPolledFileIOVars *	vars;
     _OpenFileContext		ctx;
     OSData *			extentsData;
@@ -605,8 +617,13 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer,
     dev_t 			block_dev;
     dev_t 			hibernate_image_dev;
     uint64_t			maxiobytes;
+    AbsoluteTime                startTime, endTime;
+    uint64_t                    nsec;
+
+    vars = IONew(IOPolledFileIOVars, 1);
+    if (!vars) return (kIOReturnNoMemory);
+    bzero(vars, sizeof(*vars));
 
-    vars = &gFileVars;
     do
     {
 	HIBLOG("sizeof(IOHibernateImageHeader) == %ld\n", sizeof(IOHibernateImageHeader));
@@ -620,9 +637,9 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer,
 	vars->bufferSize   = ioBuffer->getLength() >> 1;
     
 	extentsData = OSData::withCapacity(32);
-    
-	ctx.extents = extentsData;
+   	ctx.extents = extentsData;
 	ctx.size    = 0;
+	clock_get_uptime(&startTime);
 	vars->fileRef = kern_open_file_for_direct_io(filename, 
 						    &file_extent_callback, &ctx, 
 						    &block_dev,
@@ -632,12 +649,23 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer,
                                                     &vars->flags, 
                                                     0, (caddr_t) gIOHibernateCurrentHeader, 
                                                     sizeof(IOHibernateImageHeader));
-	if (!vars->fileRef)
-	{
-	    err = kIOReturnNoSpace;
-	    break;
-	}
-	gIOHibernateFileRef = vars->fileRef;
+#if 0
+	uint32_t msDelay = (131071 & random());
+	HIBLOG("sleep %d\n", msDelay);
+	IOSleep(msDelay);
+#endif
+        clock_get_uptime(&endTime);
+        SUB_ABSOLUTETIME(&endTime, &startTime);
+        absolutetime_to_nanoseconds(endTime, &nsec);
+
+	if (!vars->fileRef) err = kIOReturnNoSpace;
+
+	IOLockLock(gFSLock);
+	if (kFSOpening != gFSState) err = kIOReturnTimeout;
+	IOLockUnlock(gFSLock);
+
+        HIBLOG("kern_open_file_for_direct_io(%d) took %qd ms\n", err, nsec / 1000000ULL);
+	if (kIOReturnSuccess != err) break;
 
         if (kIOHibernateModeSSDInvert & gIOHibernateMode)
             vars->flags ^= kIOHibernateOptionSSD;
@@ -793,7 +821,7 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer,
 	if (vars->fileRef)
 	{
 	    kern_close_file_for_direct_io(vars->fileRef, 0, 0, 0, 0, 0);
-	    gIOHibernateFileRef = vars->fileRef = NULL;
+	    vars->fileRef = NULL;
 	}
     }
 
@@ -1090,15 +1118,13 @@ IOHibernateSystemSleep(void)
     OSObject * obj;
     OSString * str;
     bool       dsSSD;
-
-    IOHibernateVars * vars  = &gIOHibernateVars;
-
-    if (vars->fileVars && vars->fileVars->fileRef)
-	// already on the way down
-	return (kIOReturnSuccess);
+    IOHibernateVars * vars;
 
     gIOHibernateState = kIOHibernateStateInactive;
 
+    if (!gIOChosenEntry)
+	gIOChosenEntry = IORegistryEntry::fromPath("/chosen", gIODTPlane);
+
     gIOHibernateDebugFlags = 0;
     if (kIOLogHibernate & gIOKitDebug)
 	gIOHibernateDebugFlags |= kIOHibernateDebugRestoreLogs;
@@ -1125,6 +1151,20 @@ IOHibernateSystemSleep(void)
 
     HIBLOG("hibernate image path: %s\n", gIOHibernateFilename);
 
+    vars = IONew(IOHibernateVars, 1);
+    if (!vars) return (kIOReturnNoMemory);
+    bzero(vars, sizeof(*vars));
+
+    IOLockLock(gFSLock);
+    if (kFSIdle != gFSState)
+    {
+	HIBLOG("hibernate file busy\n");
+	IOLockUnlock(gFSLock);
+	IODelete(vars, IOHibernateVars, 1);
+        return (kIOReturnBusy);
+    }
+    gFSState = kFSOpening;
+    IOLockUnlock(gFSLock);
 
     do
     {
@@ -1169,7 +1209,7 @@ IOHibernateSystemSleep(void)
             {
                 uintptr_t smcVars[2];
                 smcVars[0] = sizeof(vars->volumeCryptKey);
-                smcVars[1] = (uintptr_t)(void *) &vars->volumeCryptKey[0];
+                smcVars[1] = (uintptr_t)(void *) &gIOHibernateVars.volumeCryptKey[0];
 
                 IOService::getPMRootDomain()->setProperty(kIOHibernateSMCVariablesKey, smcVars, sizeof(smcVars));
                 bzero(smcVars, sizeof(smcVars));
@@ -1224,8 +1264,6 @@ IOHibernateSystemSleep(void)
             if (regEntry && !gIOOptionsEntry)
                 regEntry->release();
         }
-        if (!gIOChosenEntry)
-            gIOChosenEntry = IORegistryEntry::fromPath("/chosen", gIODTPlane);
 
 	if (gIOOptionsEntry)
 	{
@@ -1405,10 +1443,31 @@ IOHibernateSystemSleep(void)
 	}
 	// --
 
+    }
+    while (false);
+
+    IOLockLock(gFSLock);
+    if ((kIOReturnSuccess == err) && (kFSOpening == gFSState))
+    {
+	gFSState = kFSOpened;
+	gIOHibernateVars = *vars;
+	gFileVars = *vars->fileVars;
+	gIOHibernateVars.fileVars = &gFileVars;
+	gIOHibernateFileRef = gFileVars.fileRef;
 	gIOHibernateCurrentHeader->signature = kIOHibernateHeaderSignature;
 	gIOHibernateState = kIOHibernateStateHibernating;
     }
-    while (false);
+    else
+    {
+	HIBLOG("hibernate file close due to timeout\n");
+	if (vars->fileVars && vars->fileVars->fileRef) kern_close_file_for_direct_io(vars->fileVars->fileRef, 0, 0, 0, 0, 0);
+	IOHibernateDone(vars);
+	gFSState = kFSIdle;
+    }
+    IOLockUnlock(gFSLock);
+
+    if (vars->fileVars) IODelete(vars->fileVars, IOPolledFileIOVars, 1);
+    IODelete(vars, IOHibernateVars, 1);
 
     return (err);
 }
@@ -1533,14 +1592,40 @@ ProgressUpdate(hibernate_graphics_t * display, uint8_t * screen, int32_t firstBl
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
+IOReturn
+IOHibernateIOKitSleep(void)
+{
+    IOReturn ret = kIOReturnSuccess;
+    IOLockLock(gFSLock);
+    if (kFSOpening == gFSState)
+    {
+	gFSState = kFSTimedOut;
+	HIBLOG("hibernate file open timed out\n");
+	ret = kIOReturnTimeout;
+    }
+    IOLockUnlock(gFSLock);
+    return (ret);
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
 IOReturn
 IOHibernateSystemHasSlept(void)
 {
+    IOReturn          ret = kIOReturnSuccess;
     IOHibernateVars * vars  = &gIOHibernateVars;
-    OSObject        * obj;
+    OSObject        * obj = 0;
     OSData          * data;
 
-    obj = IOService::getPMRootDomain()->copyProperty(kIOHibernatePreviewBufferKey);
+    IOLockLock(gFSLock);
+    if ((kFSOpened != gFSState) && gIOHibernateMode)
+    {
+	ret = kIOReturnTimeout;
+    }
+    IOLockUnlock(gFSLock);
+    if (kIOReturnSuccess != ret) return (ret);
+
+    if (gIOHibernateMode) obj = IOService::getPMRootDomain()->copyProperty(kIOHibernatePreviewBufferKey);
     vars->previewBuffer = OSDynamicCast(IOMemoryDescriptor, obj);
     if (obj && !vars->previewBuffer)
 	obj->release();
@@ -1587,7 +1672,7 @@ IOHibernateSystemHasSlept(void)
     if (gIOOptionsEntry)
         gIOOptionsEntry->sync();
 
-    return (kIOReturnSuccess);
+    return (ret);
 }
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
@@ -1627,8 +1712,21 @@ MergeDeviceTree(DeviceTreeNode * entry, IORegistryEntry * regEntry)
 IOReturn
 IOHibernateSystemWake(void)
 {
-    IOHibernateVars * vars  = &gIOHibernateVars;
+    if (kFSOpened == gFSState)
+    {
+    	IOHibernateDone(&gIOHibernateVars);
+    }
+    else
+    {
+        IOService::getPMRootDomain()->removeProperty(kIOHibernateOptionsKey);
+        IOService::getPMRootDomain()->removeProperty(kIOHibernateGfxStatusKey);
+    }
+    return (kIOReturnSuccess);
+}
 
+static IOReturn
+IOHibernateDone(IOHibernateVars * vars)
+{
     hibernate_teardown(vars->page_list, vars->page_list_wired);
 
     if (vars->videoMapping)
@@ -1766,20 +1864,49 @@ IOHibernateSystemWake(void)
 IOReturn
 IOHibernateSystemPostWake(void)
 {
-    if (gIOHibernateFileRef)
+    struct kern_direct_file_io_ref_t * fileRef;
+
+    if (kFSOpened == gFSState)
     {
 	// invalidate & close the image file
 	gIOHibernateCurrentHeader->signature = kIOHibernateHeaderInvalidSignature;
-	kern_close_file_for_direct_io(gIOHibernateFileRef,
+	if ((fileRef = gIOHibernateFileRef))
+	{
+	    gIOHibernateFileRef = 0;
+	    kern_close_file_for_direct_io(fileRef,
 				       0, (caddr_t) gIOHibernateCurrentHeader, 
 				       sizeof(IOHibernateImageHeader),
 				       sizeof(IOHibernateImageHeader),
 				       gIOHibernateCurrentHeader->imageSize);
-        gIOHibernateFileRef = 0;
+	}
+	gFSState = kFSIdle;
     }
     return (kIOReturnSuccess);
 }
 
+bool IOHibernateWasScreenLocked(void)
+{
+    bool ret = false;
+    if ((kIOHibernateStateWakingFromHibernate == gIOHibernateState) && gIOChosenEntry)
+    {
+	OSData *
+	data = OSDynamicCast(OSData, gIOChosenEntry->getProperty(kIOScreenLockStateKey));
+	if (data) switch (*((uint32_t *)data->getBytesNoCopy()))
+	{
+	    case kIOScreenLockLocked:
+	    case kIOScreenLockFileVaultDialog:
+		ret = true;
+		break;
+	    case kIOScreenLockNoLock:
+	    case kIOScreenLockUnlocked:
+	    default:
+		ret = false;
+		break;
+	}
+    }
+    return (ret);
+}
+
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 SYSCTL_STRING(_kern, OID_AUTO, hibernatefile, 
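
IOHibernateWasScreenLocked() gives wake-time consumers a yes/no view of the
kIOScreenLockStateKey value stashed in /chosen. A hypothetical caller
(keepLockUIOnScreen() is a made-up stand-in for driver-specific behavior):

    // e.g. decide whether to keep the progress/lock UI up until the user
    // authenticates after a hibernate wake.
    if (IOHibernateWasScreenLocked())
    {
        keepLockUIOnScreen();
    }
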
@@ -1810,6 +1937,8 @@ IOHibernateSystemInit(IOPMrootDomain * rootDomain)
     sysctl_register_oid(&sysctl__kern_hibernatefile);
     sysctl_register_oid(&sysctl__kern_bootsignature);
     sysctl_register_oid(&sysctl__kern_hibernatemode);
+
+    gFSLock = IOLockAlloc();
 }
 
 
@@ -2016,7 +2145,7 @@ hibernate_write_image(void)
 
         hibernateBase = HIB_BASE; /* Defined in PAL headers */
 
-        hibernateEnd = (sectHIBB + sectSizeHIB);
+        hibernateEnd = (segHIBB + segSizeHIB);
 
         // copy out restore1 code
 
@@ -2038,7 +2167,7 @@ hibernate_write_image(void)
         header->restore1CodeOffset = ((uintptr_t) &hibernate_machine_entrypoint)      - hibernateBase;
         header->restore1StackOffset = ((uintptr_t) &gIOHibernateRestoreStackEnd[0]) - 64 - hibernateBase;
 
-        // sum __HIB sect, with zeros for the stack
+        // sum __HIB seg, with zeros for the stack
         src = (uint8_t *) trunc_page(hibernateBase);
         for (page = 0; page < count; page++)
         {
@@ -2050,7 +2179,7 @@ hibernate_write_image(void)
         }
         sum1 = restore1Sum;
     
-        // write the __HIB sect, with zeros for the stack
+        // write the __HIB seg, with zeros for the stack
 
         src = (uint8_t *) trunc_page(hibernateBase);
         count = ((uintptr_t) &gIOHibernateRestoreStack[0]) - trunc_page(hibernateBase);
@@ -2075,6 +2204,10 @@ hibernate_write_image(void)
                 break;
         }
 
+	vars->fileVars->encryptStart = (vars->fileVars->position & ~(AES_BLOCK_SIZE - 1));
+	vars->fileVars->encryptEnd   = UINT64_MAX;
+	HIBLOG("encryptStart %qx\n", vars->fileVars->encryptStart);
+
         // write the preview buffer
 
         if (vars->previewBuffer)
@@ -2099,6 +2232,9 @@ hibernate_write_image(void)
                 break;
 
             src = (uint8_t *) vars->previewBuffer->getPhysicalSegment(0, NULL, _kIOMemorySourceSegment);
+
+			((hibernate_preview_t *)src)->lockTime = gIOConsoleLockTime;
+
             count = vars->previewBuffer->getLength();
 
             header->previewPageListSize = ppnum;
@@ -2198,20 +2334,16 @@ hibernate_write_image(void)
 
         for (pageType = kWiredEncrypt; pageType >= kUnwiredEncrypt; pageType--)
         {
-            if (needEncrypt && (kEncrypt & pageType))
-            {
+	    if (kUnwiredEncrypt == pageType)
+	    {
+		// start unwired image
                 vars->fileVars->encryptStart = (vars->fileVars->position & ~(((uint64_t)AES_BLOCK_SIZE) - 1));
                 vars->fileVars->encryptEnd   = UINT64_MAX;
                 HIBLOG("encryptStart %qx\n", vars->fileVars->encryptStart);
-
-                if (kUnwiredEncrypt == pageType)
-                {
-                    // start unwired image
-                    bcopy(&cryptvars->aes_iv[0], 
-                            &gIOHibernateCryptWakeContext.aes_iv[0], 
-                            sizeof(cryptvars->aes_iv));
-                    cryptvars = &gIOHibernateCryptWakeContext;
-                }
+		bcopy(&cryptvars->aes_iv[0], 
+			&gIOHibernateCryptWakeContext.aes_iv[0], 
+			sizeof(cryptvars->aes_iv));
+		cryptvars = &gIOHibernateCryptWakeContext;
             }
             for (iterDone = false, ppnum = 0; !iterDone; )
             {
@@ -2377,6 +2509,7 @@ hibernate_write_image(void)
         header->restore1Sum  = restore1Sum;
         header->image1Sum    = sum1;
         header->image2Sum    = sum2;
+        header->sleepTime    = gIOLastSleepTime.tv_sec;
     
         count = vars->fileExtents->getLength();
         if (count > sizeof(header->fileExtentMap))
@@ -2496,8 +2629,6 @@ hibernate_machine_init(void)
     uint64_t     nsec;
     uint32_t     lastProgressStamp = 0;
     uint32_t     progressStamp;
-    uint64_t	 progressZeroPosition = 0;
-    uint32_t	 blob, lastBlob = (uint32_t) -1L;
     hibernate_cryptvars_t * cryptvars = 0;
 
     IOHibernateVars * vars  = &gIOHibernateVars;
@@ -2522,15 +2653,9 @@ hibernate_machine_init(void)
 	    gIOHibernateCurrentHeader->diag[0], gIOHibernateCurrentHeader->diag[1], 
 	    gIOHibernateCurrentHeader->diag[2], gIOHibernateCurrentHeader->diag[3]);
 
-    HIBPRINT("video %x %d %d %d status %x\n",
-	    gIOHibernateGraphicsInfo->physicalAddress, gIOHibernateGraphicsInfo->depth, 
-	    gIOHibernateGraphicsInfo->width, gIOHibernateGraphicsInfo->height, gIOHibernateGraphicsInfo->gfxStatus); 
-
     if ((kIOHibernateModeDiscardCleanActive | kIOHibernateModeDiscardCleanInactive) & gIOHibernateMode)
         hibernate_page_list_discard(vars->page_list);
 
-    boot_args *args = (boot_args *) PE_state.bootArgs;
-
     cryptvars = (kIOHibernateModeEncrypt & gIOHibernateMode) ? &gIOHibernateCryptWakeContext : 0;
 
     if (gIOHibernateCurrentHeader->handoffPageCount > gIOHibernateHandoffPageCount)
@@ -2587,9 +2712,11 @@ hibernate_machine_init(void)
     if (cryptvars && !foundCryptData)
     	panic("hibernate handoff");
 
-    if (vars->videoMapping 
-	&& gIOHibernateGraphicsInfo->physicalAddress
-	&& (args->Video.v_baseAddr == gIOHibernateGraphicsInfo->physicalAddress))
+    HIBPRINT("video %x %d %d %d status %x\n",
+	    gIOHibernateGraphicsInfo->physicalAddress, gIOHibernateGraphicsInfo->depth, 
+	    gIOHibernateGraphicsInfo->width, gIOHibernateGraphicsInfo->height, gIOHibernateGraphicsInfo->gfxStatus); 
+
+    if (vars->videoMapping && gIOHibernateGraphicsInfo->physicalAddress)
     {
         vars->videoMapSize = round_page(gIOHibernateGraphicsInfo->height 
                                         * gIOHibernateGraphicsInfo->rowBytes);
@@ -2598,6 +2725,10 @@ hibernate_machine_init(void)
                     vars->videoMapSize, kIOMapInhibitCache );
     }
 
+    if (vars->videoMapSize)
+        ProgressUpdate(gIOHibernateGraphicsInfo, 
+                        (uint8_t *) vars->videoMapping, 0, kIOHibernateProgressCount);
+
     uint8_t * src = (uint8_t *) vars->srcBuffer->getBytesNoCopy();
     uint32_t decoOffset;
 
@@ -2609,21 +2740,8 @@ hibernate_machine_init(void)
     err = IOHibernatePollerOpen(vars->fileVars, kIOPolledAfterSleepState, 0);
     HIBLOG("IOHibernatePollerOpen(%x)\n", err);
 
-    if (gIOHibernateCurrentHeader->previewSize)
-        progressZeroPosition = gIOHibernateCurrentHeader->previewSize 
-                             + gIOHibernateCurrentHeader->fileExtentMapSize 
-                             - sizeof(gIOHibernateCurrentHeader->fileExtentMap) 
-                             + ptoa_64(gIOHibernateCurrentHeader->restore1PageCount);
-
     IOPolledFileSeek(vars->fileVars, gIOHibernateCurrentHeader->image1Size);
 
-    if (vars->videoMapSize)
-    {
-        lastBlob = ((vars->fileVars->position - progressZeroPosition) * kIOHibernateProgressCount)
-                        / (gIOHibernateCurrentHeader->imageSize - progressZeroPosition);
-        ProgressUpdate(gIOHibernateGraphicsInfo, (uint8_t *) vars->videoMapping, 0, lastBlob);
-    }
-
     // kick off the read ahead
     vars->fileVars->io	         = false;
     vars->fileVars->bufferHalf   = 0;
@@ -2714,17 +2832,6 @@ hibernate_machine_init(void)
 	    pagesDone++;
 	    pagesRead++;
 
-            if (vars->videoMapSize && (0 == (1023 & pagesDone)))
-            {
-                blob = ((vars->fileVars->position - progressZeroPosition) * kIOHibernateProgressCount)
-                        / (gIOHibernateCurrentHeader->imageSize - progressZeroPosition);
-                if (blob != lastBlob)
-                {
-                    ProgressUpdate(gIOHibernateGraphicsInfo, (uint8_t *) vars->videoMapping, lastBlob, blob);
-                    lastBlob = blob;
-                }
-            }
-
 	    if (0 == (8191 & pagesDone))
 	    {
 		clock_get_uptime(&endTime);
@@ -2753,10 +2860,6 @@ hibernate_machine_init(void)
 
     err = IOHibernatePollerClose(vars->fileVars, kIOPolledAfterSleepState);
 
-    if (vars->videoMapSize)
-        ProgressUpdate(gIOHibernateGraphicsInfo, 
-                        (uint8_t *) vars->videoMapping, 0, kIOHibernateProgressCount);
-
     clock_get_uptime(&endTime);
 
     IOService::getPMRootDomain()->pmStatsRecordEvent( 
diff --git a/iokit/Kernel/IOHibernateInternal.h b/iokit/Kernel/IOHibernateInternal.h
index 7e7e95fe6..2c1378e5f 100644
--- a/iokit/Kernel/IOHibernateInternal.h
+++ b/iokit/Kernel/IOHibernateInternal.h
@@ -101,13 +101,10 @@ extern "C"
 uint32_t
 hibernate_sum_page(uint8_t *buf, uint32_t ppnum);
 
-extern vm_offset_t sectHIBB;
-extern unsigned long sectSizeHIB;
-extern vm_offset_t sectDATAB;
-extern unsigned long sectSizeDATA;
-#if defined(__i386__) || defined(__x86_64__)
-extern vm_offset_t sectINITPTB;
-#endif
+extern vm_offset_t segHIBB;
+extern unsigned long segSizeHIB;
+extern vm_offset_t segDATAB;
+extern unsigned long segSizeDATA;
 
 extern ppnum_t gIOHibernateHandoffPages[];
 extern uint32_t gIOHibernateHandoffPageCount;
diff --git a/iokit/Kernel/IOHibernateRestoreKernel.c b/iokit/Kernel/IOHibernateRestoreKernel.c
index 79410326b..10bd705f5 100644
--- a/iokit/Kernel/IOHibernateRestoreKernel.c
+++ b/iokit/Kernel/IOHibernateRestoreKernel.c
@@ -32,7 +32,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 #include 
diff --git a/iokit/Kernel/IOInterruptEventSource.cpp b/iokit/Kernel/IOInterruptEventSource.cpp
index 8b49024a1..08ecc626d 100644
--- a/iokit/Kernel/IOInterruptEventSource.cpp
+++ b/iokit/Kernel/IOInterruptEventSource.cpp
@@ -210,14 +210,14 @@ bool IOInterruptEventSource::checkForWork()
 	{
 		if (trace)
 			IOTimeStampStartConstant(IODBG_INTES(IOINTES_ACTION),
-									 (uintptr_t) intAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
+						 VM_KERNEL_UNSLIDE(intAction), (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
 		
 		// Call the handler
 		(*intAction)(owner, this, numInts);
 		
 		if (trace)
 			IOTimeStampEndConstant(IODBG_INTES(IOINTES_ACTION),
-								   (uintptr_t) intAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
+					       VM_KERNEL_UNSLIDE(intAction), (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
 		
 		consumerCount = cacheProdCount;
 		if (autoDisable && !explicitDisable)
@@ -228,14 +228,14 @@ bool IOInterruptEventSource::checkForWork()
 	{
 		if (trace)
 			IOTimeStampStartConstant(IODBG_INTES(IOINTES_ACTION),
-									 (uintptr_t) intAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
+						 VM_KERNEL_UNSLIDE(intAction), (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
 		
 		// Call the handler
 		(*intAction)(owner, this, -numInts);
 		
 		if (trace)
 			IOTimeStampEndConstant(IODBG_INTES(IOINTES_ACTION),
-								   (uintptr_t) intAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
+					       VM_KERNEL_UNSLIDE(intAction), (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop);
 		
 		consumerCount = cacheProdCount;
 		if (autoDisable && !explicitDisable)
diff --git a/iokit/Kernel/IOKitDebug.cpp b/iokit/Kernel/IOKitDebug.cpp
index 21048d88c..c170d83a6 100644
--- a/iokit/Kernel/IOKitDebug.cpp
+++ b/iokit/Kernel/IOKitDebug.cpp
@@ -98,59 +98,12 @@ void IOPrintPlane( const IORegistryPlane * plane )
     iter->release();
 }
 
-void dbugprintf(const char *fmt, ...);
-void db_dumpiojunk( const IORegistryPlane * plane );
-
-void db_piokjunk(void) {
-
-	dbugprintf("\nDT plane:\n");
-	db_dumpiojunk( gIODTPlane );
-	dbugprintf("\n\nService plane:\n");
-	db_dumpiojunk( gIOServicePlane );
-    dbugprintf("\n\n"
-	    "ivar kalloc()       0x%08x\n"
-	    "malloc()            0x%08x\n"
-            "containers kalloc() 0x%08x\n"
-	    "IOMalloc()          0x%08x\n"
-            "----------------------------------------\n",
-	    debug_ivars_size,
-            debug_malloc_size,
-            debug_container_malloc_size,
-            debug_iomalloc_size
-            );
-
+void db_piokjunk(void)
+{
 }
 
-
-void db_dumpiojunk( const IORegistryPlane * plane )
+void db_dumpiojunk( const IORegistryPlane * plane __unused )
 {
-    IORegistryEntry *		next;
-    IORegistryIterator * 	iter;
-    OSOrderedSet *		all;
-    char			format[] = "%xxxs";
-    IOService *			service;
-
-    iter = IORegistryIterator::iterateOver( plane );
-
-    all = iter->iterateAll();
-    if( all) {
-        dbugprintf("Count %d\n", all->getCount() );
-        all->release();
-    } else dbugprintf("Empty\n");
-
-    iter->reset();
-    while( (next = iter->getNextObjectRecursive())) {
-		snprintf(format + 1, sizeof(format) - 1, "%ds", 2 * next->getDepth( plane ));
-		dbugprintf( format, "");
-		dbugprintf( "%s", next->getName( plane ));
-		if( (next->getLocation( plane )))
-				dbugprintf("@%s", next->getLocation( plane ));
-		dbugprintf(" <class %s", next->getMetaClass()->getClassName());
-			if( (service = OSDynamicCast(IOService, next)))
-				dbugprintf(", busy %ld", service->getBusyState());
-		dbugprintf( ">\n");
-    }
-    iter->release();
 }
 
 void IOPrintMemory( void )
diff --git a/iokit/Kernel/IOKitKernelInternal.h b/iokit/Kernel/IOKitKernelInternal.h
index 5a74159a4..27c55e7c4 100644
--- a/iokit/Kernel/IOKitKernelInternal.h
+++ b/iokit/Kernel/IOKitKernelInternal.h
@@ -40,7 +40,7 @@ __BEGIN_DECLS
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
-#if !defined(NO_KDEBUG)
+#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
 
 #define IOServiceTrace(csc, a, b, c, d) do {				\
     if(kIOTraceIOService & gIOKitDebug) {				\
@@ -48,7 +48,7 @@ __BEGIN_DECLS
     }									\
 } while(0)
 
-#else /* NO_KDEBUG */
+#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */
 
 #define IOServiceTrace(csc, a, b, c, d) do {	\
   (void)a;					\
@@ -57,7 +57,7 @@ __BEGIN_DECLS
   (void)d;					\
 } while (0)
 
-#endif /* NO_KDEBUG */
+#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
@@ -99,7 +99,7 @@ extern ppnum_t IOGetLastPageNumber(void);
 extern ppnum_t gIOLastPage;
 
 /* Physical to physical copy (ints must be disabled) */
-extern void bcopy_phys(addr64_t from, addr64_t to, int size);
+extern void bcopy_phys(addr64_t from, addr64_t to, vm_size_t size);
 
 __END_DECLS
 
@@ -164,9 +164,26 @@ struct IODMACommandInternal
     UInt64   fActualByteCount;
 };
 
+struct IOMemoryDescriptorDevicePager {
+    void *		         devicePager;
+    unsigned int	     pagerContig:1;
+    unsigned int	     unused:31;
+    IOMemoryDescriptor * memory;
+};
+
+struct IOMemoryDescriptorReserved {
+    IOMemoryDescriptorDevicePager dp;
+    uint64_t                      preparationID;
+    // for kernel IOMD subclasses... they have no expansion
+    uint64_t                      kernReserved[4];
+};
+
+
 extern "C" struct timeval gIOLastSleepTime;
 extern "C" struct timeval gIOLastWakeTime;
 
+extern clock_sec_t gIOConsoleLockTime;
+
 extern "C" void IOKitResetTime( void );
 extern "C" void IOKitInitializeTime( void );
 
@@ -176,4 +193,7 @@ extern "C" OSString * IOCopyLogNameForPID(int pid);
 extern "C" void IOSetKeyStoreData(IOMemoryDescriptor * data);
 #endif
 
+void IOScreenLockTimeUpdate(clock_sec_t secs);
+
+
 #endif /* ! _IOKIT_KERNELINTERNAL_H */
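The hunk above replaces the old NO_KDEBUG negative gate with a positive KDEBUG_LEVEL comparison, keeping a no-op variant that still consumes its arguments. A minimal standalone sketch of the same gating pattern (macro names here are illustrative, not the kernel's kdebug interface):

    #include <stdio.h>

    #define TRACE_LEVEL_STANDARD 1
    #define TRACE_LEVEL          1   /* assumed build setting */

    #if (TRACE_LEVEL >= TRACE_LEVEL_STANDARD)
    /* enabled: record the event */
    #define TRACE(a, b) do { printf("trace %d %d\n", (a), (b)); } while (0)
    #else
    /* disabled: evaluate nothing, but consume the arguments so the
     * build stays free of unused-variable warnings */
    #define TRACE(a, b) do { (void)(a); (void)(b); } while (0)
    #endif
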
diff --git a/iokit/Kernel/IOLib.cpp b/iokit/Kernel/IOLib.cpp
index b2714fc9b..886176acf 100644
--- a/iokit/Kernel/IOLib.cpp
+++ b/iokit/Kernel/IOLib.cpp
@@ -237,7 +237,10 @@ void * IOMallocAligned(vm_size_t size, vm_size_t alignment)
     alignMask = alignment - 1;
     adjustedSize = size + sizeof(vm_size_t) + sizeof(vm_address_t);
 
-    if (adjustedSize >= page_size) {
+    if (size > adjustedSize) {
+	    address = 0;    /* overflow detected */
+    }
+    else if (adjustedSize >= page_size) {
 
         kr = kernel_memory_allocate(kernel_map, &address,
 					size, alignMask, 0);
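The IOMallocAligned change above adds an integer-overflow guard: padding the request can wrap the size, and a wrapped sum is always smaller than the original request, so a single comparison catches it before any allocation happens. A hedged sketch of the same check, with hypothetical names:

    #include <stddef.h>
    #include <stdlib.h>

    static void *alloc_with_header(size_t size)
    {
        /* room for the stored size and address, as in IOMallocAligned */
        size_t adjusted = size + sizeof(size_t) + sizeof(void *);
        if (size > adjusted)
            return NULL;         /* arithmetic wrapped: refuse the request */
        return malloc(adjusted); /* normal path */
    }
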
diff --git a/iokit/Kernel/IOMemoryDescriptor.cpp b/iokit/Kernel/IOMemoryDescriptor.cpp
index 9b4590945..fe8f9f271 100644
--- a/iokit/Kernel/IOMemoryDescriptor.cpp
+++ b/iokit/Kernel/IOMemoryDescriptor.cpp
@@ -181,18 +181,12 @@ kern_return_t device_data_action(
                vm_object_offset_t      offset, 
                vm_size_t               size)
 {
-    struct ExpansionData {
-        void *				devicePager;
-        unsigned int			pagerContig:1;
-        unsigned int			unused:31;
-	IOMemoryDescriptor *		memory;
-    };
     kern_return_t	 kr;
-    ExpansionData *      ref = (ExpansionData *) device_handle;
+    IOMemoryDescriptorReserved * ref = (IOMemoryDescriptorReserved *) device_handle;
     IOMemoryDescriptor * memDesc;
 
     LOCK;
-    memDesc = ref->memory;
+    memDesc = ref->dp.memory;
     if( memDesc)
     {
 	memDesc->retain();
@@ -210,15 +204,9 @@ kern_return_t device_data_action(
 kern_return_t device_close(
                uintptr_t     device_handle)
 {
-    struct ExpansionData {
-        void *				devicePager;
-        unsigned int			pagerContig:1;
-        unsigned int			unused:31;
-	IOMemoryDescriptor *		memory;
-    };
-    ExpansionData *   ref = (ExpansionData *) device_handle;
+    IOMemoryDescriptorReserved * ref = (IOMemoryDescriptorReserved *) device_handle;
 
-    IODelete( ref, ExpansionData, 1 );
+    IODelete( ref, IOMemoryDescriptorReserved, 1 );
 
     return( kIOReturnSuccess );
 }
@@ -935,7 +923,7 @@ void IOGeneralMemoryDescriptor::free()
     if( reserved)
     {
 	LOCK;
-	reserved->memory = 0;
+	reserved->dp.memory = 0;
 	UNLOCK;
     }
 
@@ -961,11 +949,19 @@ void IOGeneralMemoryDescriptor::free()
 	_ranges.v = NULL;
     }
 
-    if (reserved && reserved->devicePager)
-	device_pager_deallocate( (memory_object_t) reserved->devicePager );
+    if (reserved)
+    {
+        if (reserved->dp.devicePager)
+        {
+            // memEntry holds a ref on the device pager which owns reserved
+            // (IOMemoryDescriptorReserved) so no reserved access after this point
+            device_pager_deallocate( (memory_object_t) reserved->dp.devicePager );
+        }
+        else
+            IODelete(reserved, IOMemoryDescriptorReserved, 1);
+        reserved = NULL;
+    }
 
-    // memEntry holds a ref on the device pager which owns reserved
-    // (ExpansionData) so no reserved access after this point
     if (_memEntry)
         ipc_port_release_send( (ipc_port_t) _memEntry );
 
@@ -1151,7 +1147,10 @@ IOGeneralMemoryDescriptor::getPreparationID( void )
 	return (kIOPreparationIDUnprepared);
 
     if (_flags & (kIOMemoryTypePhysical | kIOMemoryTypePhysical64))
-	return (kIOPreparationIDAlwaysPrepared);
+    {
+        IOMemoryDescriptor::setPreparationID();
+        return (IOMemoryDescriptor::getPreparationID());
+    }
 
     if (!_memoryEntries || !(dataP = getDataP(_memoryEntries)))
 	return (kIOPreparationIDUnprepared);
@@ -1163,10 +1162,35 @@ IOGeneralMemoryDescriptor::getPreparationID( void )
     return (dataP->fPreparationID);
 }
 
-uint64_t
-IOMemoryDescriptor::getPreparationID( void )
+IOMemoryDescriptorReserved * IOMemoryDescriptor::getKernelReserved( void )
 {
-    return (kIOPreparationIDUnsupported);    
+    if (!reserved)
+    {
+        reserved = IONew(IOMemoryDescriptorReserved, 1);
+        if (reserved)
+            bzero(reserved, sizeof(IOMemoryDescriptorReserved));
+    }
+    return (reserved);
+}
+
+void IOMemoryDescriptor::setPreparationID( void )
+{
+    if (getKernelReserved() && (kIOPreparationIDUnprepared == reserved->preparationID))
+    {
+#if defined(__ppc__ )
+        reserved->preparationID = gIOMDPreparationID++;
+#else
+        reserved->preparationID = OSIncrementAtomic64(&gIOMDPreparationID);
+#endif
+    }
+}
+
+uint64_t IOMemoryDescriptor::getPreparationID( void )
+{
+    if (reserved)
+        return (reserved->preparationID);    
+    else
+        return (kIOPreparationIDUnsupported);    
 }
 
 IOReturn IOGeneralMemoryDescriptor::dmaCommandOperation(DMACommandOps op, void *vData, UInt dataSize) const
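The hunk above lets physical-range descriptors return a real preparation ID: getKernelReserved() lazily allocates the zero-filled expansion record, and setPreparationID() draws from a global counter with an atomic 64-bit increment so concurrent callers never observe a duplicate. A minimal sketch of that scheme (illustrative names; 0 stands in for kIOPreparationIDUnprepared):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdlib.h>

    static _Atomic uint64_t g_prep_id = 1;

    struct reserved   { uint64_t preparation_id; };   /* 0 == unprepared */
    struct descriptor { struct reserved *reserved; };

    static struct reserved *get_kernel_reserved(struct descriptor *d)
    {
        if (!d->reserved)
            d->reserved = calloc(1, sizeof *d->reserved); /* lazy, zeroed */
        return d->reserved;
    }

    static uint64_t get_preparation_id(struct descriptor *d)
    {
        if (get_kernel_reserved(d) && d->reserved->preparation_id == 0)
            d->reserved->preparation_id = atomic_fetch_add(&g_prep_id, 1);
        return d->reserved ? d->reserved->preparation_id : 0;
    }
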
@@ -1830,6 +1854,7 @@ IOReturn IOMemoryDescriptor::performOperation( IOOptionBits options,
                                                 IOByteCount offset, IOByteCount length )
 {
     IOByteCount remaining;
+    unsigned int res;
     void (*func)(addr64_t pa, unsigned int count) = 0;
 
     switch (options)
@@ -1855,6 +1880,7 @@ IOReturn IOMemoryDescriptor::performOperation( IOOptionBits options,
     if (kIOMemoryThreadSafe & _flags)
 	LOCK;
 
+    res = 0x0UL;
     remaining = length = min(length, getLength() - offset);
     while (remaining)
     // (process another target segment?)
@@ -1882,8 +1908,12 @@ IOReturn IOMemoryDescriptor::performOperation( IOOptionBits options,
     return (remaining ? kIOReturnUnderrun : kIOReturnSuccess);
 }
 
+#if defined(__i386__) || defined(__x86_64__)
 extern vm_offset_t		first_avail;
 #define io_kernel_static_end	first_avail
+#else
+#error io_kernel_static_end is undefined for this architecture
+#endif
 
 static kern_return_t
 io_get_kernel_static_upl(
@@ -2365,11 +2395,14 @@ IOReturn IOGeneralMemoryDescriptor::doMap(
                     {
                         segDestAddr  = address;
                         segLen      -= offset;
+                        srcAddr     += offset;
                         mapLength    = length;
 
                         while (true)
                         {
                             vm_prot_t cur_prot, max_prot;
+
+                            if (segLen > length) segLen = length;
                             kr = mach_vm_remap(map, &segDestAddr, round_page_64(segLen), PAGE_MASK, 
                                                     VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
                                                     get_task_map(_task), trunc_page_64(srcAddr),
@@ -2430,13 +2463,10 @@ IOReturn IOGeneralMemoryDescriptor::doMap(
 
 	    pa = getPhysicalSegment( offset, &segLen, kIOMemoryMapperNone );
 
-            if( !reserved) {
-                reserved = IONew( ExpansionData, 1 );
-                if( !reserved)
-                    continue;
-            }
-            reserved->pagerContig = (1 == _rangesCount);
-	    reserved->memory = this;
+            if( !getKernelReserved())
+                continue;
+            reserved->dp.pagerContig = (1 == _rangesCount);
+	    reserved->dp.memory      = this;
 
 	    /*What cache mode do we need*/
             switch(options & kIOMapCacheMask ) {
@@ -2477,7 +2507,7 @@ IOReturn IOGeneralMemoryDescriptor::doMap(
 		    break;
             }
 
-	    flags |= reserved->pagerContig ? DEVICE_PAGER_CONTIGUOUS : 0;
+	    flags |= reserved->dp.pagerContig ? DEVICE_PAGER_CONTIGUOUS : 0;
 
             pager = device_pager_setup( (memory_object_t) 0, (uintptr_t) reserved, 
 								size, flags);
@@ -2496,11 +2526,7 @@ IOReturn IOGeneralMemoryDescriptor::doMap(
                 }
             }
 	    if( pager && sharedMem)
-		reserved->devicePager    = pager;
-	    else {
-		IODelete( reserved, ExpansionData, 1 );
-		reserved = 0;
-	    }
+		reserved->dp.devicePager    = pager;
 
         } while( false );
 
@@ -2643,6 +2669,10 @@ static kern_return_t IOMemoryDescriptorMapAlloc(vm_map_t map, void * _ref)
                     SET_MAP_MEM(MAP_MEM_COPYBACK, memEntryCacheMode);
                     break;
 
+		case kIOMapCopybackInnerCache:
+                    SET_MAP_MEM(MAP_MEM_INNERWBACK, memEntryCacheMode);
+                    break;
+
 		case kIOMapDefaultCache:
 		default:
                     SET_MAP_MEM(MAP_MEM_NOOP, memEntryCacheMode);
@@ -2783,7 +2813,7 @@ IOReturn IOMemoryDescriptor::doMap(
 	pageOffset = sourceAddr - trunc_page( sourceAddr );
 
 	if( reserved)
-	    pager = (memory_object_t) reserved->devicePager;
+	    pager = (memory_object_t) reserved->dp.devicePager;
 	else
 	    pager = MACH_PORT_NULL;
 
@@ -2839,7 +2869,7 @@ IOReturn IOMemoryDescriptor::doMap(
 		mapping->fMemory->_memEntry = me;
 	    }
 	    if (pager)
-		err = handleFault( reserved->devicePager, mapping->fAddressMap, mapping->fAddress, offset, length, options );
+		err = handleFault( pager, mapping->fAddressMap, mapping->fAddress, offset, length, options );
 	}
 	else
 	{
@@ -2871,8 +2901,8 @@ IOReturn IOMemoryDescriptor::doMap(
 
 #if DEBUG
 	if (kIOLogMapping & gIOKitDebug)
-	    IOLog("mapping(%x) desc %p @ %lx, map %p, address %qx, offset %qx, length %qx\n", 
-		    err, this, sourceAddr, mapping, address, offset, length);
+	    IOLog("mapping(%x) desc %p @ %qx, map %p, address %qx, offset %qx, length %qx\n", 
+		  err, this, (uint64_t)sourceAddr, mapping, address, offset, length);
 #endif
 
 	    if (err == KERN_SUCCESS)
@@ -2950,7 +2980,7 @@ IOReturn IOMemoryDescriptor::handleFault(
 
 
         if( pager) {
-            if( reserved && reserved->pagerContig) {
+            if( reserved && reserved->dp.pagerContig) {
                 IOPhysicalLength	allLen;
                 addr64_t		allPhys;
 
@@ -3424,8 +3454,8 @@ IOMemoryMap * IOMemoryDescriptor::createMappingInTask(
 
 #if DEBUG
     if (!result)
-	IOLog("createMappingInTask failed desc %p, addr %qx, options %lx, offset %qx, length %qx\n",
-		    this, atAddress, options, offset, length);
+	IOLog("createMappingInTask failed desc %p, addr %qx, options %x, offset %qx, length %llx\n",
+		this, atAddress, (uint32_t) options, offset, length);
 #endif
 
     return (result);
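Two of the doMap() fixes above concern the remap loop: the source address must advance by the sub-segment offset, and the first segment must be clamped to the requested length so the remap never covers more than the caller asked for. A sketch of the clipping step (names are illustrative):

    #include <stdint.h>

    static uint64_t clip_first_segment(uint64_t seg_len, uint64_t offset,
                                       uint64_t req_len, uint64_t *src_addr)
    {
        *src_addr += offset;     /* skip into the first physical segment */
        seg_len   -= offset;     /* bytes remaining in this segment */
        if (seg_len > req_len)
            seg_len = req_len;   /* never map more than was requested */
        return seg_len;
    }
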
diff --git a/iokit/Kernel/IONVRAM.cpp b/iokit/Kernel/IONVRAM.cpp
index 85ac1a2ec..51a72cfb1 100644
--- a/iokit/Kernel/IONVRAM.cpp
+++ b/iokit/Kernel/IONVRAM.cpp
@@ -228,23 +228,24 @@ bool IODTNVRAM::serializeProperties(OSSerialize *s) const
   bool                 result, hasPrivilege;
   UInt32               variablePerm;
   const OSSymbol       *key;
-  OSDictionary         *dict = 0, *tmpDict = 0;
+  OSDictionary         *dict;
   OSCollectionIterator *iter = 0;
   
   // Verify permissions.
   hasPrivilege = (kIOReturnSuccess == IOUserClient::clientHasPrivilege(current_task(), kIONVRAMPrivilege));
 
-  tmpDict = OSDictionary::withCapacity(1);
-  if (tmpDict == 0) return false;
+  dict = OSDictionary::withCapacity(1);
+  if (dict == 0) return false;
 
   if (_ofDict == 0) {
     /* No nvram. Return an empty dictionary. */
-    dict = tmpDict;
   } else {
     /* Copy properties with client privilege. */
     iter = OSCollectionIterator::withCollection(_ofDict);
-    if (iter == 0) return false;
-    
+    if (iter == 0) {
+      dict->release();
+      return false;
+    }
     while (1) {
       key = OSDynamicCast(OSSymbol, iter->getNextObject());
       if (key == 0) break;
@@ -252,15 +253,14 @@ bool IODTNVRAM::serializeProperties(OSSerialize *s) const
       variablePerm = getOFVariablePerm(key);
       if ((hasPrivilege || (variablePerm != kOFVariablePermRootOnly)) &&
 	  ( ! (variablePerm == kOFVariablePermKernelOnly && current_task() != kernel_task) )) {
-	tmpDict->setObject(key, _ofDict->getObject(key));
+	dict->setObject(key, _ofDict->getObject(key));
       }
-      dict = tmpDict;
     }
   }
 
   result = dict->serialize(s);
-  
-  if (tmpDict != 0) tmpDict->release();
+ 
+  dict->release();
   if (iter != 0) iter->release();
   
   return result;
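The serializeProperties() rework above also plugs a leak: the temporary dictionary was not released when the iterator could not be created. A minimal sketch of the release-on-every-error-path pattern (all names hypothetical):

    #include <stdbool.h>
    #include <stdlib.h>

    struct obj { int refs; };
    static struct obj *obj_create(void)
    {
        struct obj *o = calloc(1, sizeof *o);
        if (o) o->refs = 1;
        return o;
    }
    static void obj_release(struct obj *o)
    {
        if (o && --o->refs == 0) free(o);
    }

    static bool serialize(bool iter_ok)
    {
        struct obj *dict = obj_create();
        if (!dict) return false;

        struct obj *iter = iter_ok ? obj_create() : NULL;
        if (!iter) {
            obj_release(dict);   /* this release was missing before the fix */
            return false;
        }
        /* ... copy permitted entries into dict and serialize ... */
        obj_release(iter);
        obj_release(dict);
        return true;
    }
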
diff --git a/iokit/Kernel/IOPMrootDomain.cpp b/iokit/Kernel/IOPMrootDomain.cpp
index 5111a7edd..e086a456d 100644
--- a/iokit/Kernel/IOPMrootDomain.cpp
+++ b/iokit/Kernel/IOPMrootDomain.cpp
@@ -79,6 +79,9 @@ __END_DECLS
 
 #define _LOG(x...)
 
+#define DARK_WAKE_DEBUG                     1
+#define SUSPEND_PM_NOTIFICATIONS_DEBUG      1
+
 #define CHECK_THREAD_CONTEXT
 #ifdef  CHECK_THREAD_CONTEXT
 static IOWorkLoop * gIOPMWorkLoop = 0;
@@ -125,7 +128,8 @@ enum {
     kPowerEventAssertionRelease,                // 10
     kPowerEventAssertionSetLevel,               // 11
     kPowerEventQueueSleepWakeUUID,              // 12
-    kPowerEventPublishSleepWakeUUID             // 13
+    kPowerEventPublishSleepWakeUUID,            // 13
+    kPowerEventSuspendClient                    // 14
 };
 
 // For evaluatePolicy()
@@ -139,7 +143,8 @@ enum {
     kStimulusDarkWakeActivityTickle,    // 5
     kStimulusDarkWakeEntry,             // 6
     kStimulusDarkWakeReentry,           // 7
-    kStimulusDarkWakeEvaluate           // 8
+    kStimulusDarkWakeEvaluate,          // 8
+    kStimulusNoIdleSleepPreventers      // 9
 };
 
 extern "C" {
@@ -198,12 +203,15 @@ static IOPMPowerState ourPowerStates[NUM_POWER_STATES] =
 #define kIOPMRootDomainWakeTypeUser         "User"
 #define kIOPMRootDomainWakeTypeAlarm        "Alarm"
 #define kIOPMRootDomainWakeTypeNetwork      "Network"
+#define kIOPMRootDomainWakeTypeHIDActivity  "HID Activity"
 
 // Special interest that entitles the interested client from receiving
 // all system messages. Only used by powerd.
 //
 #define kIOPMSystemCapabilityInterest       "IOPMSystemCapabilityInterest"
 
+#define kPMSuspendedNotificationClients      "PMSuspendedNotificationClients"
+
 /*
  * Aggressiveness
  */
@@ -269,8 +277,8 @@ static UInt32           gWillShutdown = 0;
 static UInt32           gPagingOff = 0;
 static UInt32           gSleepWakeUUIDIsSet = false;
 static uint32_t         gAggressivesState = 0;
-static uint32_t         gDarkWakeFlags = kDarkWakeFlagHIDTickleNone;
-static bool             gRAMDiskImageBoot = false;
+static uint32_t         gDarkWakeFlags = kDarkWakeFlagHIDTickleNone | kDarkWakeFlagIgnoreDiskIOAlways;
+static PMStatsStruct    gPMStats;
 
 struct timeval gIOLastSleepTime;
 struct timeval gIOLastWakeTime;
@@ -855,17 +863,6 @@ bool IOPMrootDomain::start( IOService * nub )
 
     PE_parse_boot_argn("darkwake", &gDarkWakeFlags, sizeof(gDarkWakeFlags));
     
-    IORegistryEntry * chosenEntry = IORegistryEntry::fromPath("/chosen", gIODTPlane);
-    if (chosenEntry)
-    {
-        if (chosenEntry->getProperty("boot-ramdmg-size") &&
-            chosenEntry->getProperty("boot-ramdmg-extents"))
-        {
-            gRAMDiskImageBoot = true;
-        }
-        chosenEntry->release();
-    }
-
     queue_init(&aggressivesQueue);
     aggressivesThreadCall = thread_call_allocate(handleAggressivesFunction, this);
     aggressivesData = OSData::withCapacity(
@@ -885,7 +882,7 @@ bool IOPMrootDomain::start( IOService * nub )
     
     setProperty(kIOSleepSupportedKey, true);
 
-    bzero(&pmStats, sizeof(pmStats));
+    bzero(&gPMStats, sizeof(gPMStats));
 
     pmTracer = PMTraceWorker::tracer(this);
 
@@ -942,6 +939,8 @@ bool IOPMrootDomain::start( IOService * nub )
                     (const OSObject **) &gIOPMSettingSilentRunningKey, 1, 0);
 
     fPMSettingsDict = OSDictionary::withCapacity(5);
+    preventIdleSleepList = OSSet::withCapacity(8);
+    preventSystemSleepList = OSSet::withCapacity(2);
 
     PMinit();   // creates gIOPMWorkLoop
 
@@ -1010,8 +1009,13 @@ bool IOPMrootDomain::start( IOService * nub )
         publishFeature("DisplayDims");
     }
     if(psIterator) {
-        psIterator->release();
+        psIterator->release();        
     }
+    
+    
+    pmSuspendedCapacity = pmSuspendedSize = 0;
+    pmSuspendedPIDS = NULL;
+    
 
     sysctl_register_oid(&sysctl__kern_sleeptime);
     sysctl_register_oid(&sysctl__kern_waketime);
@@ -1030,6 +1034,126 @@ bool IOPMrootDomain::start( IOService * nub )
     return true;
 }
 
+
+
+
+void IOPMrootDomain::handleSuspendPMNotificationClient(uint32_t pid, bool doSuspend)
+{
+    ASSERT_GATED();
+    
+    int index = -1;
+    unsigned int i;
+    
+    if (!pmSuspendedPIDS) {
+        pmSuspendedCapacity = 8;
+        pmSuspendedSize = pmSuspendedCapacity * sizeof(PMNotifySuspendedStruct);
+        pmSuspendedPIDS = (PMNotifySuspendedStruct *)IOMalloc(pmSuspendedSize);
+        bzero(pmSuspendedPIDS, pmSuspendedSize);
+    }
+    
+    /* Find the existing pid in the existing array */
+
+    for (i=0; i < pmSuspendedCapacity; i++) {
+        if (pmSuspendedPIDS[i].pid == pid) {
+            index = i;
+            break;
+        }
+    }
+    
+    if (-1 == index)
+    {
+        /* Find an unused slot in the suspended pids table. */
+
+        for (i=0; i < pmSuspendedCapacity; i++) {
+            if (pmSuspendedPIDS[i].refcount == 0) {
+                break;
+            }
+        }
+    
+        if (pmSuspendedCapacity == i) 
+        {
+            /* GROW if necessary */
+
+            PMNotifySuspendedStruct *newSuspended = NULL;
+            pmSuspendedCapacity     *= 2;
+            pmSuspendedSize         = pmSuspendedCapacity * sizeof(PMNotifySuspendedStruct);
+            newSuspended            = (PMNotifySuspendedStruct *)IOMalloc(pmSuspendedSize);
+
+            bzero(newSuspended, pmSuspendedSize);
+            bcopy(pmSuspendedPIDS,  newSuspended, pmSuspendedSize/2);
+            IOFree(pmSuspendedPIDS, pmSuspendedSize/2);
+        
+            pmSuspendedPIDS = newSuspended;
+        }
+
+        index = i;
+        pmSuspendedPIDS[index].pid = pid;
+    }
+
+    if (doSuspend) {
+        pmSuspendedPIDS[index].refcount++;
+    } else {
+        pmSuspendedPIDS[index].refcount--;
+    }
+        
+    /*
+     * Publish array of suspended pids in IOPMrootDomain
+     */
+    OSArray     *publish = OSArray::withCapacity(pmSuspendedCapacity);
+
+    for (i=0; i<pmSuspendedCapacity; i++)
+    {
+        if (pmSuspendedPIDS[i].refcount > 0) {
+            OSDictionary    *suspended = OSDictionary::withCapacity(2);
+            OSNumber        *n = NULL;
+            
+            n = OSNumber::withNumber(pmSuspendedPIDS[i].pid, 32);
+            suspended->setObject("pid", n);
+            n->release();
+            
+            n = OSNumber::withNumber(pmSuspendedPIDS[i].refcount, 32);
+            suspended->setObject("refcount", n);
+            n->release();
+            
+            publish->setObject(suspended);
+            suspended->release();
+            
+        }
+    }
+    
+    if (0 != publish->getCount()) {
+        setProperty(kPMSuspendedNotificationClients, publish);
+    } else {
+        removeProperty(kPMSuspendedNotificationClients);
+    }
+    
+    publish->release();
+    
+    return;
+}
+
+bool IOPMrootDomain::pmNotificationIsSuspended(uint32_t pid)
+{
+    unsigned int index;
+    
+    for (index=0; index < pmSuspendedCapacity; index++) {
+        if (pmSuspendedPIDS[index].pid == pid) {
+            return pmSuspendedPIDS[index].refcount > 0;
+        }
+    }
+    
+    return false;
+}
+
+
+void IOPMrootDomain::suspendPMNotificationsForPID(uint32_t pid, bool doSuspend)
+{
+    if(pmPowerStateQueue) {
+        pmPowerStateQueue->submitPowerEvent(kPowerEventSuspendClient, (void *)pid, (uint64_t)doSuspend );
+    }
+    return;
+}
+
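handleSuspendPMNotificationClient() above keeps a flat {pid, refcount} table: it searches for the pid, reuses any zero-refcount slot, and doubles the capacity when full; a pid counts as suspended while its refcount stays positive. A compact sketch of that bookkeeping (illustrative types, outside the IOKit allocator):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    struct slot  { uint32_t pid; int refcount; };
    struct table { struct slot *v; unsigned cap; };

    static void adjust_suspend(struct table *t, uint32_t pid, int delta)
    {
        unsigned i;
        for (i = 0; i < t->cap && t->v[i].pid != pid; i++)
            ;                                   /* look for the pid */
        if (i == t->cap) {                      /* not present */
            for (i = 0; i < t->cap && t->v[i].refcount != 0; i++)
                ;                               /* reuse a free slot */
            if (i == t->cap) {                  /* none free: double */
                unsigned ncap = t->cap ? 2 * t->cap : 8;
                struct slot *nv = calloc(ncap, sizeof *nv);
                if (!nv) return;
                memcpy(nv, t->v, t->cap * sizeof *nv);
                free(t->v);
                t->v   = nv;
                t->cap = ncap;
            }
            t->v[i].pid = pid;
        }
        t->v[i].refcount += delta;              /* +1 suspend, -1 resume */
    }
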
 //******************************************************************************
 // setProperties
 //
@@ -1064,13 +1188,16 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj )
     const OSSymbol *hibernatefreeratio_string           = OSSymbol::withCString(kIOHibernateFreeRatioKey);
     const OSSymbol *hibernatefreetime_string            = OSSymbol::withCString(kIOHibernateFreeTimeKey);
 #endif
-
+#if SUSPEND_PM_NOTIFICATIONS_DEBUG
+    const OSSymbol *suspendPMClient_string              = OSSymbol::withCString(kPMSuspendedNotificationClients);
+#endif
+    
     if (!dict) 
     {
         return_value = kIOReturnBadArgument;
         goto exit;
     }
-
+    
     if ((b = OSDynamicCast(OSBoolean, dict->getObject(publish_simulated_battery_string))))
     {
         publishResource(publish_simulated_battery_string, kOSBooleanTrue);
@@ -1169,17 +1296,19 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj )
     {
         setProperty(kIOPMDeepSleepDelayKey, n);
     }
-    if ((b = OSDynamicCast(OSBoolean, dict->getObject(kIOPMDestroyFVKeyOnStandbyKey))))
-    {
-        setProperty(kIOPMDestroyFVKeyOnStandbyKey, b);
-    }
-    if ((b = OSDynamicCast(OSBoolean, dict->getObject(kIOPMAutoPowerOffEnabledKey))))
+
+#if SUSPEND_PM_NOTIFICATIONS_DEBUG
+    if ((n = OSDynamicCast(OSNumber, dict->getObject(suspendPMClient_string))))
     {
-        setProperty(kIOPMAutoPowerOffEnabledKey, b);
+        // Toggle the suspended status for pid n.
+        uint32_t pid_int = n->unsigned32BitValue();        
+        suspendPMNotificationsForPID(pid_int, !pmNotificationIsSuspended(pid_int));
     }
-    if ((n = OSDynamicCast(OSNumber, dict->getObject(kIOPMAutoPowerOffDelayKey))))
+#endif
+    
+    if ((b = OSDynamicCast(OSBoolean, dict->getObject(kIOPMDestroyFVKeyOnStandbyKey))))
     {
-        setProperty(kIOPMAutoPowerOffDelayKey, n);
+        setProperty(kIOPMDestroyFVKeyOnStandbyKey, b);
     }
 
     // Relay our allowed PM settings onto our registered PM clients
@@ -1238,6 +1367,9 @@ exit:
     if(hibernatefile_string) hibernatefile_string->release();
     if(hibernatefreeratio_string) hibernatefreeratio_string->release();
     if(hibernatefreetime_string) hibernatefreetime_string->release();
+#endif
+#if SUSPEND_PM_NOTIFICATIONS_DEBUG
+    if(suspendPMClient_string) suspendPMClient_string->release();
 #endif
     return return_value;
 }
@@ -1748,8 +1880,12 @@ void IOPMrootDomain::startIdleSleepTimer( uint32_t inSeconds )
         clock_interval_to_deadline(inSeconds, kSecondScale, &deadline);	
         thread_call_enter_delayed(extraSleepTimer, deadline);
         idleSleepTimerPending = true;
-        DLOG("idle timer set for %u seconds\n", inSeconds);
     }
+    else
+    {
+        thread_call_enter(extraSleepTimer);
+    }
+    DLOG("idle timer set for %u seconds\n", inSeconds);
 }
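The startIdleSleepTimer() change above makes a zero-second delay fire the timer callout immediately rather than silently leaving it unarmed. A sketch of the behavior with hypothetical helpers:

    #include <stdio.h>

    static void idle_timer_expired(void)    { printf("idle timer expired\n"); }
    static void arm_timer_after(unsigned s) { printf("idle timer set for %u seconds\n", s); }

    static void start_idle_sleep_timer(unsigned seconds)
    {
        if (seconds)
            arm_timer_after(seconds);   /* delayed expiry, as before */
        else
            idle_timer_expired();       /* new: fire at once on zero delay */
    }
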
 
 //******************************************************************************
@@ -1882,9 +2018,10 @@ IOReturn IOPMrootDomain::privateSleepSystem( uint32_t sleepReason )
         kIOPMOSSwitchHibernationKey,
         kIOPMIdleSleepKey,
         kIOPMLowPowerSleepKey,
-        kIOPMClamshellSleepKey,
         kIOPMThermalEmergencySleepKey,
-        kIOPMMaintenanceSleepKey
+        kIOPMMaintenanceSleepKey,
+        kIOPMSleepServiceExitKey,
+        kIOPMDarkWakeThermalEmergencyKey
     };
 
     PMEventDetails *details;
@@ -1899,6 +2036,9 @@ IOReturn IOPMrootDomain::privateSleepSystem( uint32_t sleepReason )
         return kIOReturnNotPermitted;
     }
 
+    if (kIOPMSleepReasonDarkWakeThermalEmergency == sleepReason)
+        messageClients(kIOPMMessageDarkWakeThermalEmergency);
+
     if (timeline)
         timeline->setSleepCycleInProgressFlag(true);
   
@@ -1906,7 +2046,6 @@ IOReturn IOPMrootDomain::privateSleepSystem( uint32_t sleepReason )
     if(pmPowerStateQueue) {
         pmPowerStateQueue->submitPowerEvent(kPowerEventPublishSleepWakeUUID, (void *)true);
     }
-
   
     // Log the beginning of system sleep.
 	details = PMEventDetails::eventDetails(kIOPMEventTypeSleep, NULL,
@@ -2077,6 +2216,8 @@ void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
             logWranglerTickle  = true;
             sleepTimerMaintenance = false;
             wranglerTickleLatched = false;
+            darkWakeThermalAlarm  = false;
+            darkWakeThermalEmergency = false;
 
             OSString * wakeType = OSDynamicCast(
                 OSString, getProperty(kIOPMRootDomainWakeTypeKey));
@@ -2195,27 +2336,17 @@ void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
 
             changePowerStateToPriv(ON_STATE);
         }   break;
-    
-        case ON_STATE: {
-            bool wasPrevented = childPreventSystemSleep;
-
-            details = PMEventDetails::eventDetails(
-                            kIOPMEventTypeWakeDone,
-                            NULL, 
-                            0, 
-                            kIOReturnSuccess);
-			
-            recordAndReleasePMEvent( details );
 
-            // Update childPreventSystemSleep flag using the capability computed
-            // by IOSevice::rebuildChildClampBits().
-
-            childPreventSystemSleep =
-                ((currentCapability() & kIOPMChildClamp2) != 0);
-
-            if (wasPrevented && !childPreventSystemSleep)
+        case ON_STATE: {
+            if (previousPowerState != ON_STATE)
             {
-                evaluatePolicy( kStimulusDarkWakeEvaluate );
+                details = PMEventDetails::eventDetails(
+                                kIOPMEventTypeWakeDone,
+                                NULL, 
+                                0, 
+                                kIOReturnSuccess);
+                
+                recordAndReleasePMEvent( details );
             }
         }   break;
     }
@@ -2225,9 +2356,6 @@ void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
 // requestPowerDomainState
 //
 // Extend implementation in IOService. Running on PM work loop thread.
-//
-// Examine children desires and initiate idle-sleep if all children are idle,
-// prevent idle and system sleep flags are not set.
 //******************************************************************************
 
 IOReturn IOPMrootDomain::requestPowerDomainState (
@@ -2235,110 +2363,102 @@ IOReturn IOPMrootDomain::requestPowerDomainState (
     IOPowerConnection * childConnection,
     unsigned long       specification )
 {
-    OSIterator          *iter;
-    OSObject            *next;
-    IOPowerConnection   *connection;
-    IOPMPowerFlags      mergedChildDesire = 0;
-    IOPMPowerFlags      editedChildDesire;
-    IOPMPowerFlags      thisDesire;
-    bool                sleepASAP = false;
+    // Idle and system sleep prevention flags affect the driver desire.
+    // Child desires are irrelevant, so they are cleared.
+
+    return super::requestPowerDomainState(0, childConnection, specification);
+}
+
+//******************************************************************************
+// updatePreventIdleSleepList
+//
+// Called by IOService on PM work loop.
+//******************************************************************************
+
+void IOPMrootDomain::updatePreventIdleSleepList(
+        IOService * service, bool addNotRemove )
+{
+    unsigned int oldCount, newCount;
 
     ASSERT_GATED();
 
-    // Disregard disk I/O (anything besides the display wrangler) as a
-    // factor in preventing idle sleep - based on a runtime setting.
+    // Disregard disk I/O (anything besides the display wrangler)
+    // as a factor preventing idle sleep, except in the case of legacy disk I/O
 
     if ((gDarkWakeFlags & kDarkWakeFlagIgnoreDiskIOAlways) &&
-        (kIOPMPreventIdleSleep & childDesire) &&
-        (childConnection != wranglerConnection))
+        addNotRemove && (service != wrangler) && (service != this))
     {
-        childDesire &= ~kIOPMPreventIdleSleep;
+        return;
     }
 
-    // Force the child's input power requirement to 0 unless the prevent
-    // idle-sleep flag is set. Nil input power flags maps to our state 0.
-    // Our power clamp (deviceDesire) clamps the lowest power state at 2.
-
-    editedChildDesire = 0;
-    if (childDesire & kIOPMPreventIdleSleep)
-        editedChildDesire |= (kIOPMPowerOn | kIOPMPreventIdleSleep);
-    if (childDesire & kIOPMPreventSystemSleep)
-        editedChildDesire |= (kIOPMPowerOn | kIOPMPreventSystemSleep);
-
-    iter = getChildIterator(gIOPowerPlane);
-    if ( iter )
+    oldCount = preventIdleSleepList->getCount();
+    if (addNotRemove)
     {
-        while ( (next = iter->getNextObject()) )
-        {
-            if ( (connection = OSDynamicCast(IOPowerConnection, next)) )
-            {
-                // Ignore child that are in the process of joining.
-				if (connection->getReadyFlag() == false)
-					continue;
+        preventIdleSleepList->setObject(service);
+        DLOG("prevent idle sleep list: %s+ (%u)\n",
+            service->getName(), preventIdleSleepList->getCount());
+    }
+    else if (preventIdleSleepList->member(service))
+    {
+        preventIdleSleepList->removeObject(service);
+        DLOG("prevent idle sleep list: %s- (%u)\n",
+            service->getName(), preventIdleSleepList->getCount());
+    }
+    newCount = preventIdleSleepList->getCount();
+    
+    if ((oldCount == 0) && (newCount != 0))
+    {
+        // Driver added to empty prevent list.
+        // Update the driver desire to prevent idle sleep.
+        // Driver desire does not prevent demand sleep.
+        
+        changePowerStateTo(ON_STATE);
+    }
+    else if ((oldCount != 0) && (newCount == 0))
+    {
+        // Last driver removed from prevent list.
+        // Drop the driver clamp to allow idle sleep.
 
-                // OR in the child's input power requirements.
-                // Is this connection attached to the child that called
-                // requestPowerDomainState()?
+        changePowerStateTo(SLEEP_STATE);
+        evaluatePolicy( kStimulusNoIdleSleepPreventers );
+    }
+}
 
-                if (connection == childConnection)
-                {
-                    thisDesire = editedChildDesire;
-                }
-                else
-                {
-                    thisDesire = 0;
-                    if (connection->getPreventIdleSleepFlag())
-                        thisDesire |= (kIOPMPowerOn | kIOPMPreventIdleSleep);
-                    if (connection->getPreventSystemSleepFlag())
-                        thisDesire |= (kIOPMPowerOn | kIOPMPreventSystemSleep);
-                }
+//******************************************************************************
+// preventSystemSleepListUpdate
+//
+// Called by IOService on PM work loop.
+//******************************************************************************
 
-                mergedChildDesire |= thisDesire;
-                if (thisDesire && (kIOLogPMRootDomain & gIOKitDebug))
-                {
-                    IOService * child =
-                        (IOService *) connection->getChildEntry(gIOPowerPlane);
-                    LOG("child %p, noIdle %d, noSleep %d - %s\n",
-                        child,
-                        ((thisDesire & kIOPMPreventIdleSleep) != 0),
-                        ((thisDesire & kIOPMPreventSystemSleep) != 0),
-                        child ? child->getName() : "?");
-                }
-            }
-        }
-        iter->release();
-    }
+void IOPMrootDomain::updatePreventSystemSleepList(
+        IOService * service, bool addNotRemove )
+{
+    unsigned int oldCount;
 
-    DLOG("mergedChildDesire 0x%lx, extraSleepDelay %ld\n",
-        mergedChildDesire, extraSleepDelay);
+    ASSERT_GATED();
+    if (this == service)
+        return;
 
-    if ( !mergedChildDesire && !systemBooting )
+    oldCount = preventSystemSleepList->getCount();
+    if (addNotRemove)
     {
-        if (!wrangler)
-        {
-            changePowerStateToPriv(ON_STATE);
-            if (idleSeconds)
-            {
-                // stay awake for at least idleSeconds
-                startIdleSleepTimer(idleSeconds);
-            }
-        }
-        else if (!extraSleepDelay && !idleSleepTimerPending && !systemDarkWake)
+        preventSystemSleepList->setObject(service);
+        DLOG("prevent system sleep list: %s+ (%u)\n",
+            service->getName(), preventSystemSleepList->getCount());
+    }
+    else if (preventSystemSleepList->member(service))
+    {
+        preventSystemSleepList->removeObject(service);
+        DLOG("prevent system sleep list: %s- (%u)\n",
+            service->getName(), preventSystemSleepList->getCount());
+
+        if ((oldCount != 0) && (preventSystemSleepList->getCount() == 0))
         {
-            sleepASAP = true;
+            // Lost all system sleep preventers.
+            // Send a stimulus if system sleep was blocked and the system is in dark wake.
+            evaluatePolicy( kStimulusDarkWakeEvaluate );
         }
     }
-
-    // Drop our power clamp to SLEEP_STATE when all children became idle,
-    // and system sleep and display sleep slider values are equal.
-
-    adjustPowerState(sleepASAP);
-
-    // If our power clamp has already dropped to SLEEP_STATE, and no child
-    // is keeping us at ON_STATE, then the following will trigger idle sleep.
-
-    return super::requestPowerDomainState(
-        editedChildDesire, childConnection, specification);
 }
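The two prevent lists above are edge-triggered: only the empty/non-empty transitions change anything, so the root domain raises its power clamp when the first preventer arrives and drops it (re-evaluating sleep) when the last one leaves. A minimal sketch with illustrative callbacks:

    #include <stdbool.h>

    static unsigned prevent_count;

    static void clamp_on(void)  { /* e.g. changePowerStateTo(ON_STATE)    */ }
    static void clamp_off(void) { /* e.g. changePowerStateTo(SLEEP_STATE) */ }

    static void update_prevent_list(bool add)
    {
        unsigned old = prevent_count;
        if (add)
            prevent_count++;
        else if (prevent_count)
            prevent_count--;

        if (old == 0 && prevent_count != 0)
            clamp_on();      /* first preventer added */
        else if (old != 0 && prevent_count == 0)
            clamp_off();     /* last preventer removed */
    }
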
 
 //******************************************************************************
@@ -2454,6 +2574,9 @@ void IOPMrootDomain::tellNoChangeDown( unsigned long stateNum )
     DLOG("tellNoChangeDown %u->%u\n",
         (uint32_t) getPowerState(), (uint32_t) stateNum);
 
+	// Sleep canceled, clear the sleep trace point.
+    tracePoint(kIOPMTracePointSystemUp);
+
     if (idleSeconds && !wrangler)
     {
         // stay awake for at least idleSeconds
@@ -2473,7 +2596,6 @@ void IOPMrootDomain::tellNoChangeDown( unsigned long stateNum )
 
 void IOPMrootDomain::tellChangeUp( unsigned long stateNum )
 {
-    OSData *publishPMStats = NULL;
 
     DLOG("tellChangeUp %u->%u\n",
         (uint32_t) getPowerState(), (uint32_t) stateNum);
@@ -2504,10 +2626,6 @@ void IOPMrootDomain::tellChangeUp( unsigned long stateNum )
         }
 
         tracePoint( kIOPMTracePointWakeApplications );
-        publishPMStats = OSData::withBytes(&pmStats, sizeof(pmStats));
-        setProperty(kIOPMSleepStatisticsKey, publishPMStats);
-        publishPMStats->release();
-        bzero(&pmStats, sizeof(pmStats));
 
         if (pmStatsAppResponses) 
         {
@@ -2730,7 +2848,12 @@ void IOPMrootDomain::handlePublishSleepWakeUUID( bool shouldPublish )
 
 IOReturn IOPMrootDomain::changePowerStateTo( unsigned long ordinal )
 {
-    return kIOReturnUnsupported;    // ignored
+    DLOG("changePowerStateTo(%lu)\n", ordinal);
+
+    if ((ordinal != ON_STATE) && (ordinal != SLEEP_STATE))
+        return kIOReturnUnsupported;
+
+    return super::changePowerStateTo(ordinal);
 }
 
 IOReturn IOPMrootDomain::changePowerStateToPriv( unsigned long ordinal )
@@ -3450,92 +3573,85 @@ void IOPMrootDomain::informCPUStateChange(
 // evaluateSystemSleepPolicy
 //******************************************************************************
 
-#define kIOPlatformSystemSleepPolicyKey     "IOPlatformSystemSleepPolicy"
-
-// Sleep flags
-enum {
-    kIOPMSleepFlagHibernate         = 0x00000001,
-    kIOPMSleepFlagSleepTimerEnable  = 0x00000002
-};
-
 struct IOPMSystemSleepPolicyEntry
 {
     uint32_t    factorMask;
     uint32_t    factorBits;
     uint32_t    sleepFlags;
     uint32_t    wakeEvents;
-} __attribute__((packed));
+};
 
 struct IOPMSystemSleepPolicyTable
 {
-    uint32_t    signature;
+    uint8_t     signature[4];
     uint16_t    version;
     uint16_t    entryCount;
     IOPMSystemSleepPolicyEntry  entries[];
-} __attribute__((packed));
+};
+
+enum {
+    kIOPMSleepFactorSleepTimerWake          = 0x00000001,
+    kIOPMSleepFactorLidOpen                 = 0x00000002,
+    kIOPMSleepFactorACPower                 = 0x00000004,
+    kIOPMSleepFactorLowBattery              = 0x00000008,
+    kIOPMSleepFactorDeepSleepNoDelay        = 0x00000010,
+    kIOPMSleepFactorDeepSleepDemand         = 0x00000020,
+    kIOPMSleepFactorDeepSleepDisable        = 0x00000040,
+    kIOPMSleepFactorUSBExternalDevice       = 0x00000080,
+    kIOPMSleepFactorBluetoothHIDDevice      = 0x00000100,
+    kIOPMSleepFactorExternalMediaMounted    = 0x00000200,
+    kIOPMSleepFactorDriverAssertBit5        = 0x00000400,   /* Reserved for ThunderBolt */
+    kIOPMSleepFactorDriverAssertBit6        = 0x00000800,
+    kIOPMSleepFactorDriverAssertBit7        = 0x00001000    /* Reserved for legacy I/O */
+};
+
+enum {
+    kSleepPhaseEarly, kSleepPhaseFinal
+};
 
-bool IOPMrootDomain::evaluateSystemSleepPolicy(
-    IOPMSystemSleepParameters * params, int sleepPhase )
+bool IOPMrootDomain::evaluateSystemSleepPolicy( IOPMSystemSleepParameters * p, int sleepPhase )
 {
     const IOPMSystemSleepPolicyTable * pt;
     OSObject *  prop = 0;
     OSData *    policyData;
-    uint64_t    currentFactors = 0;
-    uint32_t    standbyDelay;
-    uint32_t    powerOffDelay;
-    uint32_t    mismatch;
-    bool        standbyEnabled;
-    bool        powerOffEnabled;
-    bool        found = false;
-
-    // Get platform's sleep policy table
-    if (!_sleepPolicyHandler)
-    {
-        prop = getServiceRoot()->copyProperty(kIOPlatformSystemSleepPolicyKey);
-        if (!prop) goto done;
-    }
-
-    // Fetch additional settings
-    standbyEnabled = (getSleepOption(kIOPMDeepSleepDelayKey, &standbyDelay)
-        && (getProperty(kIOPMDeepSleepEnabledKey) == kOSBooleanTrue));
-    powerOffEnabled = (getSleepOption(kIOPMAutoPowerOffDelayKey, &powerOffDelay)
-        && (getProperty(kIOPMAutoPowerOffEnabledKey) == kOSBooleanTrue));
-    DLOG("standby %d delay %u, powerOff %d delay %u, hibernate %u\n",
-        standbyEnabled, standbyDelay, powerOffEnabled, powerOffDelay,
-        hibernateMode);
-
-    // pmset level overrides
-    if ((hibernateMode & kIOHibernateModeOn) == 0)
+    uint32_t    currentFactors;
+    uint32_t    deepSleepDelay = 0;
+    bool        success = false;
+
+    if (getProperty(kIOPMDeepSleepEnabledKey) != kOSBooleanTrue)
+        return false;
+
+    getSleepOption(kIOPMDeepSleepDelayKey, &deepSleepDelay);
+
+    prop = getServiceRoot()->copyProperty(kIOPlatformSystemSleepPolicyKey);
+    if (!prop)
+        return false;
+
+    policyData = OSDynamicCast(OSData, prop);
+    if (!policyData ||
+        (policyData->getLength() < sizeof(IOPMSystemSleepPolicyTable)))
     {
-        standbyEnabled  = false;
-        powerOffEnabled = false;
+        goto done;
     }
-    else if (!(hibernateMode & kIOHibernateModeSleep))
+
+    pt = (const IOPMSystemSleepPolicyTable *) policyData->getBytesNoCopy();
+    if ((pt->signature[0] != 'S') ||
+        (pt->signature[1] != 'L') ||
+        (pt->signature[2] != 'P') ||
+        (pt->signature[3] != 'T') ||
+        (pt->version      != 1)   ||
+        (pt->entryCount   == 0))
     {
-        // Force hibernate (i.e. mode 25)
-        // If standby is enabled, force standy.
-        // If poweroff is enabled, force poweroff.
-        if (standbyEnabled)
-            currentFactors |= kIOPMSleepFactorStandbyForced;
-        else if (powerOffEnabled)
-            currentFactors |= kIOPMSleepFactorAutoPowerOffForced;
-        else
-            currentFactors |= kIOPMSleepFactorHibernateForced;
+        goto done;
     }
 
-    // Current factors based on environment and assertions
-    if (sleepTimerMaintenance)
-        currentFactors |= kIOPMSleepFactorSleepTimerWake;
-    if (!clamshellClosed)
-        currentFactors |= kIOPMSleepFactorLidOpen;
-    if (acAdaptorConnected)
-        currentFactors |= kIOPMSleepFactorACPower;
-    if (lowBatteryCondition)
-        currentFactors |= kIOPMSleepFactorBatteryLow;
-    if (!standbyDelay)
-        currentFactors |= kIOPMSleepFactorStandbyNoDelay;
-    if (!standbyEnabled)
-        currentFactors |= kIOPMSleepFactorStandbyDisabled;
+    if ((policyData->getLength() - sizeof(IOPMSystemSleepPolicyTable)) !=
+        (sizeof(IOPMSystemSleepPolicyEntry) * pt->entryCount))
+    {
+        goto done;
+    }
+
+    currentFactors = 0;
     if (getPMAssertionLevel(kIOPMDriverAssertionUSBExternalDeviceBit) !=
         kIOPMDriverAssertionLevelOff)
         currentFactors |= kIOPMSleepFactorUSBExternalDevice;
@@ -3545,101 +3661,88 @@ bool IOPMrootDomain::evaluateSystemSleepPolicy(
     if (getPMAssertionLevel(kIOPMDriverAssertionExternalMediaMountedBit) !=
         kIOPMDriverAssertionLevelOff)
         currentFactors |= kIOPMSleepFactorExternalMediaMounted;
-    if (getPMAssertionLevel(kIOPMDriverAssertionReservedBit5) !=
+    if (getPMAssertionLevel(kIOPMDriverAssertionReservedBit5) !=    /* AssertionBit5 = Thunderbolt */
         kIOPMDriverAssertionLevelOff)
-        currentFactors |= kIOPMSleepFactorThunderboltDevice;
-    if (getPMAssertionLevel(kIOPMDriverAssertionReservedBit8) !=
+        currentFactors |= kIOPMSleepFactorDriverAssertBit5;
+    if (getPMAssertionLevel(kIOPMDriverAssertionReservedBit7) !=
         kIOPMDriverAssertionLevelOff)
-        currentFactors |= kIOPMSleepFactorMagicPacketWakeEnabled;
-    if (!powerOffEnabled)
-        currentFactors |= kIOPMSleepFactorAutoPowerOffDisabled;
-
-    DLOG("sleep factors 0x%llx\n", currentFactors);
-
-    // Clear the output params
-    bzero(params, sizeof(*params));
-
-    if (_sleepPolicyHandler)
-    {
-        if (!_sleepPolicyVars)
-        {
-            _sleepPolicyVars = IONew(IOPMSystemSleepPolicyVariables, 1);
-            if (!_sleepPolicyVars)
-                goto done;
-            bzero(_sleepPolicyVars, sizeof(*_sleepPolicyVars));
-        }
-        _sleepPolicyVars->signature = kIOPMSystemSleepPolicySignature;
-        _sleepPolicyVars->version   = kIOPMSystemSleepPolicyVersion;
-        if (kIOPMSleepPhase1 == sleepPhase)
-        {
-            _sleepPolicyVars->currentCapability = _currentCapability;
-            _sleepPolicyVars->highestCapability = _highestCapability;
-            _sleepPolicyVars->sleepReason   = lastSleepReason;
-            _sleepPolicyVars->hibernateMode = hibernateMode;
-            _sleepPolicyVars->standbyDelay  = standbyDelay;
-            _sleepPolicyVars->poweroffDelay = powerOffDelay;
-        }
-        _sleepPolicyVars->sleepFactors = currentFactors;
-        _sleepPolicyVars->sleepPhase   = sleepPhase;
-        
-        if ((_sleepPolicyHandler(_sleepPolicyTarget, _sleepPolicyVars, params) !=
-             kIOReturnSuccess) || (kIOPMSleepTypeInvalid == params->sleepType) ||
-             (params->sleepType >= kIOPMSleepTypeLast) ||
-             (kIOPMSystemSleepParametersVersion != params->version))
-        {
-            MSG("sleep policy handler error\n");
-            goto done;
-        }
-
-        DLOG("sleep params v%u, type %u, flags 0x%x, wake 0x%x, timer %u, poweroff %u\n",
-            params->version, params->sleepType, params->sleepFlags,
-            params->ecWakeEvents, params->ecWakeTimer, params->ecPoweroffTimer);
-        found = true;
-        goto done;
-    }
-
-    // Policy table is meaningless without standby enabled
-    if (!standbyEnabled)
-        goto done;
-
-    // Validate the sleep policy table
-    policyData = OSDynamicCast(OSData, prop);
-    if (!policyData || (policyData->getLength() <= sizeof(IOPMSystemSleepPolicyTable)))
-        goto done;
-
-    pt = (const IOPMSystemSleepPolicyTable *) policyData->getBytesNoCopy();
-    if ((pt->signature != kIOPMSystemSleepPolicySignature) ||
-        (pt->version != 1) || (0 == pt->entryCount))
-        goto done;
+        currentFactors |= kIOPMSleepFactorDriverAssertBit7;
+    if (0 == deepSleepDelay)
+        currentFactors |= kIOPMSleepFactorDeepSleepNoDelay;
+    if (!clamshellClosed)
+        currentFactors |= kIOPMSleepFactorLidOpen;
+    if (acAdaptorConnected)
+        currentFactors |= kIOPMSleepFactorACPower;
+    if (lowBatteryCondition)
+        currentFactors |= kIOPMSleepFactorLowBattery;
+    if (sleepTimerMaintenance)
+        currentFactors |= kIOPMSleepFactorSleepTimerWake;
 
-    if (((policyData->getLength() - sizeof(IOPMSystemSleepPolicyTable)) !=
-         (sizeof(IOPMSystemSleepPolicyEntry) * pt->entryCount)))
-        goto done;
+    // pmset overrides
+    if ((hibernateMode & kIOHibernateModeOn) == 0)
+        currentFactors |= kIOPMSleepFactorDeepSleepDisable;
+    else if ((hibernateMode & kIOHibernateModeSleep) == 0)
+        currentFactors |= kIOPMSleepFactorDeepSleepDemand;
+    
+    DLOG("Sleep policy %u entries, current factors 0x%x\n",
+        pt->entryCount, currentFactors);
 
     for (uint32_t i = 0; i < pt->entryCount; i++)
     {
-        const IOPMSystemSleepPolicyEntry * entry = &pt->entries[i];
-        mismatch = (((uint32_t)currentFactors ^ entry->factorBits) & entry->factorMask);
+        const IOPMSystemSleepPolicyEntry * policyEntry = &pt->entries[i];
 
-        DLOG("mask 0x%08x, bits 0x%08x, flags 0x%08x, wake 0x%08x, mismatch 0x%08x\n",
-            entry->factorMask, entry->factorBits,
-            entry->sleepFlags, entry->wakeEvents, mismatch);
-        if (mismatch)
-            continue;
+        DLOG("factor mask 0x%08x, bits 0x%08x, flags 0x%08x, wake 0x%08x\n",
+            policyEntry->factorMask, policyEntry->factorBits,
+            policyEntry->sleepFlags, policyEntry->wakeEvents);
 
-        DLOG("^ found match\n");
-        found = true;
+        if ((currentFactors ^ policyEntry->factorBits) & policyEntry->factorMask)
+            continue;   // mismatch, try next
 
-        params->version = kIOPMSystemSleepParametersVersion;
-        params->reserved1 = 1;
-        if (entry->sleepFlags & kIOPMSleepFlagHibernate)
-            params->sleepType = kIOPMSleepTypeStandby;
-        else
-            params->sleepType = kIOPMSleepTypeNormalSleep;
+        if (p)
+        {
+            p->version    = 1;
+            p->sleepFlags = policyEntry->sleepFlags;
+            p->sleepTimer = 0;
+            p->wakeEvents = policyEntry->wakeEvents;
+            if (p->sleepFlags & kIOPMSleepFlagSleepTimerEnable)
+            {
+                if (kSleepPhaseFinal == sleepPhase)
+                {
+                    clock_sec_t now_secs = gIOLastSleepTime.tv_sec;
+
+                    if (!_standbyTimerResetSeconds ||
+                        (now_secs <= _standbyTimerResetSeconds))
+                    {
+                        // Reset standby timer adjustment
+                        _standbyTimerResetSeconds = now_secs;
+                        DLOG("standby delay %u, reset %u\n",
+                            deepSleepDelay, (uint32_t) _standbyTimerResetSeconds);
+                    }
+                    else if (deepSleepDelay)
+                    {
+                        // Shorten the standby delay timer
+                        clock_sec_t elapsed = now_secs - _standbyTimerResetSeconds;
+                        if (deepSleepDelay > elapsed)
+                            deepSleepDelay -= elapsed;
+                        else
+                            deepSleepDelay = 1; // must be > 0
+
+                        DLOG("standby delay %u, elapsed %u\n",
+                            deepSleepDelay, (uint32_t) elapsed);
+                    }
+                }
+                p->sleepTimer = deepSleepDelay;
+            }
+            else if (kSleepPhaseFinal == sleepPhase)
+            {
+                // A sleep that does not enable the sleep timer will reset
+                // the standby delay adjustment.
+                _standbyTimerResetSeconds = 0;
+            }            
+        }
 
-        params->ecWakeEvents = entry->wakeEvents;
-        if (entry->sleepFlags & kIOPMSleepFlagSleepTimerEnable)
-            params->ecWakeTimer = standbyDelay;
+        DLOG("matched policy entry %u\n", i);
+        success = true;
         break;
     }
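The policy match above is an XOR-then-mask test: an entry matches when every factor bit it cares about (factorMask) carries the required value (factorBits), while bits outside the mask are ignored. A worked example using the lid-open (0x2) and AC-power (0x4) factors from the enum above:

    #include <assert.h>
    #include <stdint.h>

    static int entry_matches(uint32_t current, uint32_t mask, uint32_t bits)
    {
        return ((current ^ bits) & mask) == 0;
    }

    int main(void)
    {
        uint32_t mask = 0x2 | 0x4;  /* entry cares about lid and AC */
        uint32_t bits = 0x2;        /* requires lid open, AC absent */

        assert( entry_matches(0x2,        mask, bits)); /* lid open, battery */
        assert(!entry_matches(0x2 | 0x4,  mask, bits)); /* AC attached       */
        assert( entry_matches(0x2 | 0x10, mask, bits)); /* unmasked bit: ok  */
        return 0;
    }
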
 
@@ -3647,53 +3750,32 @@ done:
     if (prop)
         prop->release();
 
-    return found;
+    return success;
 }
 
-static IOPMSystemSleepParameters gEarlySystemSleepParams;
-
 void IOPMrootDomain::evaluateSystemSleepPolicyEarly( void )
 {
-    // Evaluate early (priority interest phase), before drivers sleep.
+    IOPMSystemSleepParameters   params;
+
+    // Evaluate sleep policy before driver sleep phase.
 
     DLOG("%s\n", __FUNCTION__);
     removeProperty(kIOPMSystemSleepParametersKey);
 
+    // Full wake resets the standby timer delay adjustment
+    if (_highestCapability & kIOPMSystemCapabilityGraphics)
+        _standbyTimerResetSeconds = 0;
+
     hibernateDisabled = false;
     hibernateMode = 0;
     getSleepOption(kIOHibernateModeKey, &hibernateMode);
 
-    // Save for late evaluation if sleep is aborted
-    bzero(&gEarlySystemSleepParams, sizeof(gEarlySystemSleepParams));
-
-    if (evaluateSystemSleepPolicy(&gEarlySystemSleepParams, kIOPMSleepPhase1))
+    if (!hibernateNoDefeat &&
+        evaluateSystemSleepPolicy(&params, kSleepPhaseEarly) &&
+        ((params.sleepFlags & kIOPMSleepFlagHibernate) == 0))
     {
-        if (!hibernateNoDefeat &&
-            (gEarlySystemSleepParams.sleepType == kIOPMSleepTypeNormalSleep))
-        {
-            // Disable hibernate setup for normal sleep
-            hibernateDisabled = true;
-        }
+        hibernateDisabled = true;
     }
-
-    // Publish IOPMSystemSleepType
-    uint32_t sleepType = gEarlySystemSleepParams.sleepType;
-    if (sleepType == kIOPMSleepTypeInvalid)
-    {
-        // no sleep policy
-        sleepType = kIOPMSleepTypeNormalSleep;
-        if (hibernateMode & kIOHibernateModeOn)
-            sleepType = (hibernateMode & kIOHibernateModeSleep) ?
-                        kIOPMSleepTypeSafeSleep : kIOPMSleepTypeHibernate;
-    }
-    else if ((sleepType == kIOPMSleepTypeStandby) &&
-             (gEarlySystemSleepParams.ecPoweroffTimer))
-    {
-        // report the lowest possible sleep state
-        sleepType = kIOPMSleepTypePowerOff;
-    }
-
-    setProperty(kIOPMSystemSleepTypeKey, sleepType, 32);
 }
 
 void IOPMrootDomain::evaluateSystemSleepPolicyFinal( void )
@@ -3701,30 +3783,27 @@ void IOPMrootDomain::evaluateSystemSleepPolicyFinal( void )
     IOPMSystemSleepParameters   params;
     OSData *                    paramsData;
 
-    // Evaluate sleep policy after sleeping drivers but before platform sleep.
+    // Evaluate sleep policy after drivers but before platform sleep.
 
     DLOG("%s\n", __FUNCTION__);
 
-    if (evaluateSystemSleepPolicy(&params, kIOPMSleepPhase2))
+    if (evaluateSystemSleepPolicy(&params, kSleepPhaseFinal))
     {
         if ((hibernateDisabled || hibernateAborted) &&
-            (params.sleepType != kIOPMSleepTypeNormalSleep))
+            (params.sleepFlags & kIOPMSleepFlagHibernate))
         {
-            // Final evaluation picked a state requiring hibernation,
-            // but hibernate setup was skipped. Retry using the early
-            // sleep parameters.
+            // Should hibernate but unable to or aborted.
+            // Arm timer for a short sleep and retry or wake fully.
 
-            bcopy(&gEarlySystemSleepParams, &params, sizeof(params));
-            params.sleepType = kIOPMSleepTypeAbortedSleep;
-            params.ecWakeTimer = 1;
+            params.sleepFlags &= ~kIOPMSleepFlagHibernate;
+            params.sleepFlags |= kIOPMSleepFlagSleepTimerEnable;
+            params.sleepTimer = 1;
             hibernateNoDefeat = true;
             DLOG("wake in %u secs for hibernateDisabled %d, hibernateAborted %d\n",
-                params.ecWakeTimer, hibernateDisabled, hibernateAborted);
+                        params.sleepTimer, hibernateDisabled, hibernateAborted);
         }
         else
-        {
             hibernateNoDefeat = false;
-        }
 
+        paramsData = OSData::withBytes(&params, sizeof(params));
         if (paramsData)
@@ -3733,28 +3812,25 @@ void IOPMrootDomain::evaluateSystemSleepPolicyFinal( void )
             paramsData->release();
         }
 
-        if (params.sleepType >= kIOPMSleepTypeHibernate)
+        if (params.sleepFlags & kIOPMSleepFlagHibernate)
         {
-            // Disable safe sleep to force the hibernate path
+            // Force hibernate
             gIOHibernateMode &= ~kIOHibernateModeSleep;
         }
     }
 }
 
 bool IOPMrootDomain::getHibernateSettings(
-    uint32_t *  hibernateModePtr,
+    uint32_t *  hibernateMode,
     uint32_t *  hibernateFreeRatio,
     uint32_t *  hibernateFreeTime )
 {
-    // Called by IOHibernateSystemSleep() after evaluateSystemSleepPolicyEarly()
-    // has updated the hibernateDisabled flag.
-
-    bool ok = getSleepOption(kIOHibernateModeKey, hibernateModePtr);
+    bool ok = getSleepOption(kIOHibernateModeKey, hibernateMode);
     getSleepOption(kIOHibernateFreeRatioKey, hibernateFreeRatio);
     getSleepOption(kIOHibernateFreeTimeKey, hibernateFreeTime);
     if (hibernateDisabled)
-        *hibernateModePtr = 0;
-    DLOG("hibernateMode 0x%x\n", *hibernateModePtr);
+        *hibernateMode = 0;
+    DLOG("hibernateMode 0x%x\n", *hibernateMode);
     return ok;
 }
 
@@ -3975,7 +4051,6 @@ void IOPMrootDomain::tagPowerPlaneService(
     if (isDisplayWrangler)
     {
         wrangler = service;
-        wranglerConnection = (IOService *) service->getParentEntry(gIOPowerPlane);
     }
 #else
     isDisplayWrangler = false;
@@ -4374,6 +4449,7 @@ void IOPMrootDomain::handleOurPowerChangeDone(
                 darkWakeToSleepASAP = false;
                 pciCantSleepValid   = false;
                 rejectWranglerTickle = false;
+                darkWakeSleepService = false;
             }
 
             // Entered dark mode.
@@ -4453,6 +4529,20 @@ void IOPMrootDomain::handleOurPowerChangeDone(
         {
             setProperty(kIOPMSystemCapabilitiesKey, _currentCapability, 64);
             tracePoint( kIOPMTracePointSystemUp, 0 );
+
+            // kIOPMDWOverTemp notification handling was postponed
+            if (darkWakeThermalAlarm)
+            {
+                if (!wranglerTickled && !darkWakeThermalEmergency &&
+                    CAP_CURRENT(kIOPMSystemCapabilityCPU) &&
+                    !CAP_CURRENT(kIOPMSystemCapabilityGraphics))
+                {
+                    darkWakeThermalEmergency = true;
+                    privateSleepSystem(kIOPMSleepReasonDarkWakeThermalEmergency);
+                    MSG("DarkWake thermal limits breached. Going to sleep!\n");
+                }
+                darkWakeThermalAlarm = false;
+            }
         }
 
         _systemTransitionType = kSystemTransitionNone;
@@ -4485,12 +4575,14 @@ void IOPMrootDomain::overridePowerChangeForUIService(
         // Activate power limiter.
 
         if ((actions->parameter & kPMActionsFlagIsDisplayWrangler) &&
-            ((_pendingCapability & kIOPMSystemCapabilityGraphics) == 0))
+            ((_pendingCapability & kIOPMSystemCapabilityGraphics) == 0) &&
+            (changeFlags & kIOPMSynchronize))
         {
             actions->parameter |= kPMActionsFlagLimitPower;
         }
         else if ((actions->parameter & kPMActionsFlagIsAudioDevice) &&
-                 ((_pendingCapability & kIOPMSystemCapabilityAudio) == 0))
+                 ((_pendingCapability & kIOPMSystemCapabilityAudio) == 0) &&
+                 (changeFlags & kIOPMSynchronize))
         {
             actions->parameter |= kPMActionsFlagLimitPower;
         }
@@ -4532,13 +4624,6 @@ void IOPMrootDomain::overridePowerChangeForUIService(
         }
     }
 
-    if (gRAMDiskImageBoot &&
-        (actions->parameter & kPMActionsFlagIsDisplayWrangler))
-    {
-        // Tag devices subject to power suppression.
-        *inOutChangeFlags |= kIOPMPowerSuppressed;
-    }
-
     if (actions->parameter & kPMActionsFlagLimitPower)
     {
         uint32_t maxPowerState = (uint32_t)(-1);
@@ -4549,7 +4634,7 @@ void IOPMrootDomain::overridePowerChangeForUIService(
 
             maxPowerState = 0;
             if ((actions->parameter & kPMActionsFlagIsDisplayWrangler) &&
-                (!gRAMDiskImageBoot || (service->getPowerState() > 0)))
+                (service->getPowerState() > 0))
             {
                 // Forces a 3->1 transition sequence
                 if (changeFlags & kIOPMDomainWillChange)
@@ -4625,6 +4710,7 @@ void IOPMrootDomain::handleActivityTickleForDisplayWrangler(
     if (!wranglerTickled &&
         ((_pendingCapability & kIOPMSystemCapabilityGraphics) == 0))
     {
+        setProperty(kIOPMRootDomainWakeTypeKey, kIOPMRootDomainWakeTypeHIDActivity);
         DLOG("display wrangler tickled\n");
         if (kIOLogPMRootDomain & gIOKitDebug)
             OSReportWithBacktrace("Dark wake display tickle");
@@ -5107,7 +5193,12 @@ bool IOPMrootDomain::checkSystemCanSleep( IOOptionBits options )
             break;          // always sleep on low battery
         }
 
-        if (childPreventSystemSleep)
+        if(darkWakeThermalEmergency)
+        {
+            break;          // always sleep on dark wake thermal emergencies
+        }
+
+        if (preventSystemSleepList->getCount() != 0)
         {
             err = 4;        // 4. child prevent system sleep clamp
             break;
@@ -5329,6 +5420,9 @@ void IOPMrootDomain::dispatchPowerEvent(
         case kPowerEventPublishSleepWakeUUID:
             handlePublishSleepWakeUUID((bool)arg0);
             break;
+        case kPowerEventSuspendClient:
+            handleSuspendPMNotificationClient((uintptr_t)arg0, (bool)arg1);
+            break;
     }
 }
 
@@ -5447,6 +5541,27 @@ void IOPMrootDomain::handlePowerNotification( UInt32 msg )
         privateSleepSystem (kIOPMSleepReasonThermalEmergency);
     }
 
+    if (msg & kIOPMDWOverTemp)
+    {
+        if (!CAP_CURRENT(kIOPMSystemCapabilityCPU) ||
+            (_systemTransitionType == kSystemTransitionSleep) ||
+            (_systemTransitionType == kSystemTransitionWake)  ||
+            (_systemTransitionType == kSystemTransitionCapability))
+        {
+            // During early wake or when system capability is changing,
+            // set flag and take action at end of transition.
+            darkWakeThermalAlarm = true;
+        }
+        else if (!wranglerTickled && !darkWakeThermalEmergency &&
+                 !CAP_CURRENT(kIOPMSystemCapabilityGraphics))
+        {
+            // System in steady state and in dark wake
+            darkWakeThermalEmergency = true;
+            privateSleepSystem(kIOPMSleepReasonDarkWakeThermalEmergency);
+            MSG("DarkWake thermal limits breached. Going to sleep!\n");
+        }
+    }
+
     /*
      * Sleep Now!
      */
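Note on the kIOPMDWOverTemp handling above: the alarm is latched (darkWakeThermalAlarm) whenever the CPU capability is down or a sleep/wake/capability transition is in flight, and the latched alarm is re-evaluated in handleOurPowerChangeDone() (see the first hunk of this file) once the transition completes; only a steady-state dark wake — CPU up, graphics down, no wrangler tickle — sleeps immediately. A standalone C++ sketch of that two-phase pattern, with simplified names and no locking:

    #include <cstdio>

    enum Transition { kNone, kSleep, kWake, kCapability };

    struct ThermalPolicy {
        Transition transition      = kNone;
        bool       cpuCapable      = true;   // CAP_CURRENT(kIOPMSystemCapabilityCPU)
        bool       graphicsUp      = false;  // dark wake: CPU up, graphics down
        bool       wranglerTickled = false;
        bool       alarmLatched    = false;  // darkWakeThermalAlarm
        bool       emergency       = false;  // darkWakeThermalEmergency

        void handleOverTemp() {
            if (!cpuCapable || transition != kNone) {
                alarmLatched = true;         // act later, at end of transition
            } else if (!wranglerTickled && !emergency && !graphicsUp) {
                emergency = true;            // steady-state dark wake: sleep now
                std::puts("dark wake over-temp: requesting sleep");
            }
        }

        void transitionDone() {              // handleOurPowerChangeDone() analogue
            transition = kNone;
            if (alarmLatched) {
                alarmLatched = false;
                handleOverTemp();            // re-run the steady-state check
            }
        }
    };

    int main() {
        ThermalPolicy p;
        p.transition = kCapability;
        p.handleOverTemp();                  // latched, no action yet
        p.transitionDone();                  // transition over: sleeps now
    }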
@@ -5477,6 +5592,7 @@ void IOPMrootDomain::handlePowerNotification( UInt32 msg )
         // Don't issue a hid tickle when lid is open and polled on wake
         if (msg & kIOPMSetValue)
         {
+            setProperty(kIOPMRootDomainWakeTypeKey, "Lid Open");
             reportUserInput();
         }
 
@@ -5562,6 +5678,11 @@ void IOPMrootDomain::handlePowerNotification( UInt32 msg )
         // This mirrors the hardware's USB wake event latch, where a latched
         // USB wake event followed by an AC attach will trigger a full wake.
         latchDisplayWranglerTickle( false );
+
+#if HIBERNATION
+        // AC presence will reset the standby timer delay adjustment.
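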
+        _standbyTimerResetSeconds = 0;
+#endif
     }
     
     /*
@@ -5644,6 +5765,7 @@ void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg )
             int displaySleep        : 1;
             int sleepDelayChanged   : 1;
             int evaluateDarkWake    : 1;
+            int adjustPowerState    : 1;
         } bit;
         uint32_t u32;
     } flags;
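The flags union above overlays one-bit action fields on a uint32_t, so evaluatePolicy() can clear every pending action with a single store and test "anything to do?" with a single compare; this patch adds the adjustPowerState bit to that set. A minimal sketch of the idiom, using a subset of the real bits:

    #include <cstdint>
    #include <cstdio>

    union PolicyFlags {                    // subset of the real bit set
        struct {
            unsigned idleSleepEnabled  : 1;
            unsigned sleepDelayChanged : 1;
            unsigned evaluateDarkWake  : 1;
            unsigned adjustPowerState  : 1;   // the bit added by this patch
        } bit;
        uint32_t u32;
    };

    int main() {
        PolicyFlags flags;
        flags.u32 = 0;                     // clear all pending actions at once
        flags.bit.adjustPowerState = 1;    // a stimulus handler sets its bit
        if (flags.u32)                     // any action pending?
            std::printf("pending actions: 0x%x\n", flags.u32);
    }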
@@ -5694,6 +5816,8 @@ void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg )
 
             if ( minutesToIdleSleep > minutesToDisplayDim )
                 minutesDelta = minutesToIdleSleep - minutesToDisplayDim;
+            else if( minutesToIdleSleep == minutesToDisplayDim )
+                minutesDelta = 1;
 
             if ((sleepSlider == 0) && (minutesToIdleSleep != 0))
                 flags.bit.idleSleepEnabled = true;
@@ -5721,8 +5845,7 @@ void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg )
             break;
 
         case kStimulusAllowSystemSleepChanged:
-            // FIXME: de-compose to change flags.
-            adjustPowerState();
+            flags.bit.adjustPowerState = true;
             break;
 
         case kStimulusDarkWakeActivityTickle:
@@ -5848,6 +5971,10 @@ void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg )
 #endif
             break;
 
+        case kStimulusNoIdleSleepPreventers:
+            flags.bit.adjustPowerState = true;
+            break;
+
     } /* switch(stimulus) */
 
     if (flags.bit.evaluateDarkWake && !wranglerTickled)
@@ -5882,7 +6009,8 @@ void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg )
                 // Parked in dark wake, a tickle will return to full wake
                 rejectWranglerTickle = false;
             }
-        } else // non-maintenance (network) dark wake
+        }
+        else // non-maintenance (network) dark wake
         {
             if (checkSystemCanSleep(true))
             {
@@ -5965,7 +6093,7 @@ void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg )
                 AbsoluteTime    now;
                 uint64_t        nanos;
                 uint32_t        minutesSinceDisplaySleep = 0;
-                uint32_t        sleepDelay;
+                uint32_t        sleepDelay = 0;
 
                 clock_get_uptime(&now);
                 if (CMP_ABSOLUTETIME(&now, &wranglerSleepTime) > 0)
@@ -5979,10 +6107,6 @@ void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg )
                 {
                     sleepDelay = extraSleepDelay - minutesSinceDisplaySleep;
                 }
-                else
-                {
-                    sleepDelay = 1; // 1 min
-                }
 
                 startIdleSleepTimer(sleepDelay * 60);
                 DLOG("display slept %u min, set idle timer to %u min\n",
@@ -5998,6 +6122,35 @@ void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg )
         restoreUserSpinDownTimeout();
         adjustPowerState();
     }
+
+    if (flags.bit.adjustPowerState)
+    {
+        bool sleepASAP = false;
+
+        if (!systemBooting && (preventIdleSleepList->getCount() == 0))
+        {
+            if (!wrangler)
+            {
+                changePowerStateToPriv(ON_STATE);
+                if (idleSeconds)
+                {
+                    // stay awake for at least idleSeconds
+                    startIdleSleepTimer(idleSeconds);
+                }
+            }
+            else if (!extraSleepDelay && !idleSleepTimerPending && !systemDarkWake)
+            {
+                sleepASAP = true;
+            }
+        }
+        if(sleepASAP)
+        {
+            lastSleepReason = kIOPMSleepReasonIdle;
+            setProperty(kRootDomainSleepReasonKey, kIOPMIdleSleepKey);
+        }
+
+        adjustPowerState(sleepASAP);
+    }
 }
 
 //******************************************************************************
@@ -6019,8 +6172,22 @@ void IOPMrootDomain::evaluateAssertions(IOPMDriverAssertionType newAssertions, I
             wrangler->setIgnoreIdleTimer( value );
         }
     }
+
     if (changedBits & kIOPMDriverAssertionCPUBit)
         evaluatePolicy(kStimulusDarkWakeEvaluate);
+
+    if (changedBits & kIOPMDriverAssertionReservedBit7) {
+        bool value = (newAssertions & kIOPMDriverAssertionReservedBit7) ? true : false;
+        if (value) {
+            DLOG("Driver assertion ReservedBit7 raised. Legacy IO preventing sleep\n");
+            updatePreventIdleSleepList(this, true);
+        }
+        else {
+            DLOG("Driver assertion ReservedBit7 dropped\n");
+            updatePreventIdleSleepList(this, false);
+        }
+
+    }
 }
 
 // MARK: -
@@ -6039,6 +6206,7 @@ void IOPMrootDomain::pmStatsRecordEvent(
     bool        stopping = eventIndex & kIOPMStatsEventStopFlag ? true:false;
     uint64_t    delta;
     uint64_t    nsec;
+    OSData *publishPMStats = NULL;
 
     eventIndex &= ~(kIOPMStatsEventStartFlag | kIOPMStatsEventStopFlag);
 
@@ -6047,24 +6215,29 @@ void IOPMrootDomain::pmStatsRecordEvent(
     switch (eventIndex) {
         case kIOPMStatsHibernateImageWrite:
             if (starting)
-                pmStats.hibWrite.start = nsec;
+                gPMStats.hibWrite.start = nsec;
             else if (stopping)
-                pmStats.hibWrite.stop = nsec;
+                gPMStats.hibWrite.stop = nsec;
 
             if (stopping) {
-                delta = pmStats.hibWrite.stop - pmStats.hibWrite.start;
+                delta = gPMStats.hibWrite.stop - gPMStats.hibWrite.start;
                 IOLog("PMStats: Hibernate write took %qd ms\n", delta/1000000ULL);
             }
             break;
         case kIOPMStatsHibernateImageRead:
             if (starting)
-                pmStats.hibRead.start = nsec;
+                gPMStats.hibRead.start = nsec;
             else if (stopping)
-                pmStats.hibRead.stop = nsec;
+                gPMStats.hibRead.stop = nsec;
 
             if (stopping) {
-                delta = pmStats.hibRead.stop - pmStats.hibRead.start;
+                delta = gPMStats.hibRead.stop - gPMStats.hibRead.start;
                 IOLog("PMStats: Hibernate read took %qd ms\n", delta/1000000ULL);
+
+                publishPMStats = OSData::withBytes(&gPMStats, sizeof(gPMStats));
+                setProperty(kIOPMSleepStatisticsKey, publishPMStats);
+                publishPMStats->release();
+                bzero(&gPMStats, sizeof(gPMStats));
             }
             break;
     }
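The hibernate-read path now snapshots the accumulated gPMStats into an immutable OSData blob, publishes it under kIOPMSleepStatisticsKey, and zeroes the live struct for the next sleep/wake cycle. A standalone model of that snapshot-publish-reset pattern (the struct layout here is illustrative, not the real PMStats layout):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    // Illustrative layout only; the real stats record hibernate
    // write/read start and stop timestamps in nanoseconds.
    struct PMStats { uint64_t hibWriteStart, hibWriteStop, hibReadStart, hibReadStop; };

    static PMStats gStats;

    // Snapshot the live stats into an immutable blob (OSData::withBytes
    // analogue), then zero the live copy (bzero analogue) for the next cycle.
    static std::vector<uint8_t> snapshotAndReset() {
        std::vector<uint8_t> blob(sizeof gStats);
        std::memcpy(blob.data(), &gStats, sizeof gStats);
        std::memset(&gStats, 0, sizeof gStats);
        return blob;
    }

    int main() {
        gStats.hibReadStop = 1234567890;   // pretend a hibernate read finished
        std::vector<uint8_t> published = snapshotAndReset();
        std::printf("published %zu bytes; live stats cleared\n", published.size());
    }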
@@ -6194,18 +6367,6 @@ IOReturn IOPMrootDomain::callPlatformFunction(
 
         return kIOReturnSuccess;
     }
-    else if (functionName &&
-             functionName->isEqualTo(kIOPMInstallSystemSleepPolicyHandlerKey))
-    {
-        if (_sleepPolicyHandler)
-            return kIOReturnExclusiveAccess;
-        if (!param1)
-            return kIOReturnBadArgument;
-        _sleepPolicyHandler = (IOPMSystemSleepPolicyHandler) param1;
-        _sleepPolicyTarget  = (void *) param2;
-        setProperty("IOPMSystemSleepPolicyHandler", kOSBooleanTrue);
-        return kIOReturnSuccess;
-    }
 
     return super::callPlatformFunction(
         functionName, waitForFunction, param1, param2, param3, param4);
@@ -6213,8 +6374,13 @@ IOReturn IOPMrootDomain::callPlatformFunction(
 
 void IOPMrootDomain::tracePoint( uint8_t point )
 {
-    if (!systemBooting)
-        pmTracer->tracePoint(point);
+    if (systemBooting) return;
+
+    pmTracer->tracePoint(point);
+
+#if	HIBERNATION
+    if (kIOPMTracePointSleepPowerPlaneDrivers == point) IOHibernateIOKitSleep();
+#endif
 }
 
 void IOPMrootDomain::tracePoint( uint8_t point, uint8_t data )
@@ -6932,7 +7098,7 @@ IOPMDriverAssertionID IOPMrootDomain::createPMAssertion(
  
     if (!pmAssertions)
         return 0;
- 
+
     ret = pmAssertions->createAssertion(whichAssertionBits, assertionLevel, ownerService, ownerDescription, &newAssertion);
 
     if (kIOReturnSuccess == ret)
diff --git a/iokit/Kernel/IOPlatformExpert.cpp b/iokit/Kernel/IOPlatformExpert.cpp
index 7800babda..03779d4b1 100644
--- a/iokit/Kernel/IOPlatformExpert.cpp
+++ b/iokit/Kernel/IOPlatformExpert.cpp
@@ -747,10 +747,16 @@ static void IOShutdownNotificationsTimedOut(
     thread_call_param_t p0, 
     thread_call_param_t p1)
 {
+#ifdef CONFIG_EMBEDDED
+    /* 30 seconds has elapsed - panic */
+    panic("Halt/Restart Timed Out");
+
+#else /* ! CONFIG_EMBEDDED */
     int type = (int)(long)p0;
 
     /* 30 seconds has elapsed - resume shutdown */
     if(gIOPlatform) gIOPlatform->haltRestart(type);
+#endif /* CONFIG_EMBEDDED */
 }
 
 
@@ -960,6 +966,41 @@ void IOPlatformExpert::registerNVRAMController(IONVRAMController * caller)
     OSString *        string = 0;
     uuid_string_t     uuid;
 
+#if CONFIG_EMBEDDED
+    entry = IORegistryEntry::fromPath( "/chosen", gIODTPlane );
+    if ( entry )
+    {
+        OSData * data1;
+
+        data1 = OSDynamicCast( OSData, entry->getProperty( "unique-chip-id" ) );
+        if ( data1 && data1->getLength( ) == 8 )
+        {
+            OSData * data2;
+
+            data2 = OSDynamicCast( OSData, entry->getProperty( "chip-id" ) );
+            if ( data2 && data2->getLength( ) == 4 )
+            {
+                SHA1_CTX     context;
+                uint8_t      digest[ SHA_DIGEST_LENGTH ];
+                const uuid_t space = { 0xA6, 0xDD, 0x4C, 0xCB, 0xB5, 0xE8, 0x4A, 0xF5, 0xAC, 0xDD, 0xB6, 0xDC, 0x6A, 0x05, 0x42, 0xB8 };
+
+                SHA1Init( &context );
+                SHA1Update( &context, space, sizeof( space ) );
+                SHA1Update( &context, data1->getBytesNoCopy( ), data1->getLength( ) );
+                SHA1Update( &context, data2->getBytesNoCopy( ), data2->getLength( ) );
+                SHA1Final( digest, &context );
+
+                digest[ 6 ] = ( digest[ 6 ] & 0x0F ) | 0x50;
+                digest[ 8 ] = ( digest[ 8 ] & 0x3F ) | 0x80;
+
+                uuid_unparse( digest, uuid );
+                string = OSString::withCString( uuid );
+            }
+        }
+
+        entry->release( );
+    }
+#else /* !CONFIG_EMBEDDED */
     entry = IORegistryEntry::fromPath( "/efi/platform", gIODTPlane );
     if ( entry )
     {
@@ -984,6 +1025,7 @@ void IOPlatformExpert::registerNVRAMController(IONVRAMController * caller)
 
         entry->release( );
     }
+#endif /* !CONFIG_EMBEDDED */
 
     if ( string == 0 )
     {
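The CONFIG_EMBEDDED branch above derives the platform UUID deterministically: it SHA-1-hashes a fixed namespace UUID together with the device tree's unique-chip-id and chip-id, then forces the RFC 4122 version nibble (0x5, name-based SHA-1) and variant bits before unparsing the first 16 digest bytes. A macOS user-space sketch of the same derivation using CommonCrypto; the input bytes are stand-ins, and only the namespace constant is taken from the code above:

    #include <CommonCrypto/CommonDigest.h>
    #include <uuid/uuid.h>
    #include <cstdio>

    int main() {
        const uuid_t space = { 0xA6,0xDD,0x4C,0xCB,0xB5,0xE8,0x4A,0xF5,
                               0xAC,0xDD,0xB6,0xDC,0x6A,0x05,0x42,0xB8 };
        const uint8_t uniqueChipId[8] = { 1,2,3,4,5,6,7,8 };   // stand-in data
        const uint8_t chipId[4]       = { 9,10,11,12 };        // stand-in data

        uint8_t digest[CC_SHA1_DIGEST_LENGTH];
        CC_SHA1_CTX ctx;
        CC_SHA1_Init(&ctx);
        CC_SHA1_Update(&ctx, space, sizeof space);
        CC_SHA1_Update(&ctx, uniqueChipId, sizeof uniqueChipId);
        CC_SHA1_Update(&ctx, chipId, sizeof chipId);
        CC_SHA1_Final(digest, &ctx);

        digest[6] = (digest[6] & 0x0F) | 0x50;   // version 5 (name-based SHA-1)
        digest[8] = (digest[8] & 0x3F) | 0x80;   // RFC 4122 variant

        uuid_string_t out;
        uuid_unparse(digest, out);               // only the first 16 bytes are used
        std::printf("platform UUID: %s\n", out);
    }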
diff --git a/iokit/Kernel/IORegistryEntry.cpp b/iokit/Kernel/IORegistryEntry.cpp
index a299d3fa1..7d0b01167 100644
--- a/iokit/Kernel/IORegistryEntry.cpp
+++ b/iokit/Kernel/IORegistryEntry.cpp
@@ -60,6 +60,8 @@ OSDefineMetaClassAndStructors(IORegistryEntry, OSObject)
 #define kIORegPlaneNameSuffixLen	(sizeof(kIORegPlaneNameSuffix) - 1)
 #define kIORegPlaneLocationSuffixLen	(sizeof(kIORegPlaneLocationSuffix) - 1)
 
+#define KASLR_IOREG_DEBUG 0
+
 static IORegistryEntry * gRegistryRoot;
 static OSDictionary * 	 gIORegistryPlanes;
 
@@ -526,6 +528,15 @@ IORegistryEntry::removeProperty( const OSSymbol * aKey)
     PUNLOCK;
 }
 
+#if KASLR_IOREG_DEBUG
+extern "C" {
+    
+bool ScanForAddrInObject(OSObject * theObject, 
+                         int indent);
+    
+}; /* extern "C" */
+#endif
+
 bool
 IORegistryEntry::setProperty( const OSSymbol * aKey, OSObject * anObject)
 {
@@ -543,7 +554,18 @@ IORegistryEntry::setProperty( const OSSymbol * aKey, OSObject * anObject)
 
     ret = getPropertyTable()->setObject( aKey, anObject );
     PUNLOCK;
-    
+
+#if KASLR_IOREG_DEBUG
+    if ( anObject && strcmp(kIOKitDiagnosticsKey, aKey->getCStringNoCopy()) != 0 ) {
+        if (ScanForAddrInObject(anObject, 0)) {
+            IOLog("%s: IORegistryEntry name %s with key \"%s\" \n",
+                  __FUNCTION__,
+                  getName(0),
+                  aKey->getCStringNoCopy() );        
+        }
+    }
+#endif
+
     return ret;
 }
 
diff --git a/iokit/Kernel/IOService.cpp b/iokit/Kernel/IOService.cpp
index f08348272..0c5dbc36c 100644
--- a/iokit/Kernel/IOService.cpp
+++ b/iokit/Kernel/IOService.cpp
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -46,6 +47,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -54,6 +56,7 @@
 
 #define LOG kprintf
 //#define LOG IOLog
+#define MATCH_DEBUG	0
 
 #include "IOServicePrivate.h"
 #include "IOKitKernelInternal.h"
@@ -119,7 +122,10 @@ const OSSymbol *		gIOConsoleSessionLoginDoneKey;
 const OSSymbol *		gIOConsoleSessionSecureInputPIDKey;
 const OSSymbol *		gIOConsoleSessionScreenLockedTimeKey;
 
-static clock_sec_t		gIOConsoleLockTime;
+clock_sec_t			gIOConsoleLockTime;
+static bool			gIOConsoleLoggedIn;
+static uint32_t			gIOScreenLockState;
+static IORegistryEntry *        gIOChosenEntry;
 
 static int			gIOResourceGenerationCount;
 
@@ -225,7 +231,6 @@ static IOLock *     gArbitrationLockQueueLock;
 bool IOService::isInactive( void ) const
     { return( 0 != (kIOServiceInactiveState & getState())); }
 
-
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 #if defined(__i386__) || defined(__x86_64__)
@@ -490,6 +495,10 @@ void IOService::detach( IOService * provider )
             _adjustBusy( -busy );
     }
 
+    if (kIOServiceInactiveState & __state[0]) {
+	getMetaClass()->removeInstance(this);
+    }
+
     unlockForArbitration();
 
     if( newProvider) {
@@ -628,7 +637,9 @@ void IOService::startMatching( IOOptionBits options )
             lockForArbitration();
             IOLockLock( gIOServiceBusyLock );
 
-            waitAgain = (prevBusy < (__state[1] & kIOServiceBusyStateMask));
+            waitAgain = ((prevBusy < (__state[1] & kIOServiceBusyStateMask))
+				       && (0 == (__state[0] & kIOServiceInactiveState)));
+
             if( waitAgain)
                 __state[1] |= kIOServiceSyncPubState | kIOServiceBusyWaiterState;
             else
@@ -661,37 +672,37 @@ IOReturn IOService::catalogNewDrivers( OSOrderedSet * newTables )
     
     while( (table = (OSDictionary *) newTables->getFirstObject())) {
 
-	LOCKWRITENOTIFY();
+        LOCKWRITENOTIFY();
         set = (OSSet *) copyExistingServices( table, 
 						kIOServiceRegisteredState,
 						kIOServiceExistingSet);
-	UNLOCKNOTIFY();
-	if( set) {
+        UNLOCKNOTIFY();
+        if( set) {
 
 #if IOMATCHDEBUG
-	    count += set->getCount();
+            count += set->getCount();
 #endif
-	    if (allSet) {
-		allSet->merge((const OSSet *) set);
-		set->release();
-	    }
-	    else
-		allSet = set;
-	}
+            if (allSet) {
+                allSet->merge((const OSSet *) set);
+                set->release();
+            }
+            else
+                allSet = set;
+        }
 
 #if IOMATCHDEBUG
-	if( getDebugFlags( table ) & kIOLogMatch)
-	    LOG("Matching service count = %ld\n", (long)count);
+        if( getDebugFlags( table ) & kIOLogMatch)
+            LOG("Matching service count = %ld\n", (long)count);
 #endif
-	newTables->removeObject(table);
+        newTables->removeObject(table);
     }
 
     if (allSet) {
-	while( (service = (IOService *) allSet->getAnyObject())) {
-	    service->startMatching(kIOServiceAsynchronous);
-	    allSet->removeObject(service);
-	}
-	allSet->release();
+        while( (service = (IOService *) allSet->getAnyObject())) {
+            service->startMatching(kIOServiceAsynchronous);
+            allSet->removeObject(service);
+        }
+        allSet->release();
     }
 
     newTables->release();
@@ -2475,13 +2486,13 @@ static SInt32 IOServiceObjectOrder( const OSObject * entry, void * ref)
     OSSymbol *		key = (OSSymbol *) ref;
     OSNumber *		offset;
 
-    if( (notify = OSDynamicCast( _IOServiceNotifier, entry)))
+    if( (dict = OSDynamicCast( OSDictionary, entry)))
+        offset = OSDynamicCast(OSNumber, dict->getObject( key ));
+    else if( (notify = OSDynamicCast( _IOServiceNotifier, entry)))
 	return( notify->priority );
 
     else if( (service = OSDynamicCast( IOService, entry)))
         offset = OSDynamicCast(OSNumber, service->getProperty( key ));
-    else if( (dict = OSDynamicCast( OSDictionary, entry)))
-        offset = OSDynamicCast(OSNumber, dict->getObject( key ));
     else {
 	assert( false );
 	offset = 0;
@@ -2602,10 +2613,6 @@ void IOService::probeCandidates( OSOrderedSet * matches )
     OSObject 		*	nextMatch = 0;
     bool			started;
     bool			needReloc = false;
-#if CONFIG_MACF_KEXT
-    OSBoolean		*	isSandbox = 0;
-    bool			useSandbox = false;
-#endif
 #if IOMATCHDEBUG
     SInt64			debugFlags;
 #endif
@@ -2667,7 +2674,7 @@ void IOService::probeCandidates( OSOrderedSet * matches )
 	    props->setCapacityIncrement(1);		
 
 	    // check the nub matches
-	    if( false == passiveMatch( props, true ))
+	    if( false == matchPassive(props, kIOServiceChangesOK | kIOServiceClassDone))
 		continue;
 
             // Check to see if driver reloc has been loaded.
@@ -2748,10 +2755,6 @@ void IOService::probeCandidates( OSOrderedSet * matches )
                 if( 0 == category)
                     category = gIODefaultMatchCategoryKey;
                 inst->setProperty( gIOMatchCategoryKey, (OSObject *) category );
-#if CONFIG_MACF_KEXT
-		isSandbox = OSDynamicCast(OSBoolean,
-                            props->getObject("IOKitForceMatch"));
-#endif
                 // attach driver instance
                 if( !(inst->attach( this )))
                         continue;
@@ -2768,21 +2771,6 @@ void IOService::probeCandidates( OSOrderedSet * matches )
     
                 newInst = inst->probe( this, &score );
                 inst->detach( this );
-#if CONFIG_MACF_KEXT
-		/*
-		 * If this is the Sandbox driver and it matched, this is a
-		 * disallowed device; toss any drivers that were already
-		 * matched.
-		 */
-		if (isSandbox && isSandbox->isTrue() && newInst != 0) {
-		    if (startDict != 0) {
-			startDict->flushCollection();
-			startDict->release();
-			startDict = 0;
-		    }
-		    useSandbox = true;
-		}
-#endif
                 if( 0 == newInst) {
 #if IOMATCHDEBUG
                     if( debugFlags & kIOLogProbe)
@@ -2821,13 +2809,6 @@ void IOService::probeCandidates( OSOrderedSet * matches )
             props->release();
             if( inst)
                 inst->release();
-#if CONFIG_MACF_KEXT
-	    /*
-	     * If we're forcing the sandbox, drop out of the loop.
-	     */
-	    if (isSandbox && isSandbox->isTrue() && useSandbox)
-		    break;
-#endif
         }
         familyMatches->release();
         familyMatches = 0;
@@ -3113,6 +3094,7 @@ void IOService::doServiceMatch( IOOptionBits options )
     SInt32		catalogGeneration;
     bool		keepGuessing = true;
     bool		reRegistered = true;
+    bool		didRegister;
 
 //    job->nub->deliverNotification( gIOPublishNotification,
 //  				kIOServiceRegisteredState, 0xffffffff );
@@ -3130,6 +3112,7 @@ void IOService::doServiceMatch( IOOptionBits options )
 	    LOCKREADNOTIFY();
             __state[1] &= ~kIOServiceNeedConfigState;
             __state[1] |= kIOServiceConfigState;
+            didRegister = (0 == (kIOServiceRegisteredState & __state[0]));
             __state[0] |= kIOServiceRegisteredState;
 
 	    keepGuessing &= (0 == (__state[0] & kIOServiceInactiveState));
@@ -3140,7 +3123,7 @@ void IOService::doServiceMatch( IOOptionBits options )
                     while((notify = (_IOServiceNotifier *)
                            iter->getNextObject())) {
 
-                        if( passiveMatch( notify->matching )
+                        if( matchPassive(notify->matching, 0)
                          && (kIOServiceNotifyEnable & notify->state))
                             matches->setObject( notify );
                     }
@@ -3149,6 +3132,9 @@ void IOService::doServiceMatch( IOOptionBits options )
             }
 
 	    UNLOCKNOTIFY();
+	    if (didRegister) {
+		getMetaClass()->addInstance(this);
+	    }
             unlockForArbitration();
 
             if (keepGuessing && matches->getCount() && (kIOReturnSuccess == getResources()))
@@ -3518,27 +3504,83 @@ void _IOServiceJob::pingConfig( _IOServiceJob * job )
     semaphore_signal( gJobsSemaphore );
 }
 
+struct IOServiceMatchContext
+{
+    OSDictionary * table;
+    OSObject *     result;
+    uint32_t	   options;
+    uint32_t	   state;
+    uint32_t	   count;
+    uint32_t       done;
+};
+
+bool IOService::instanceMatch(const OSObject * entry, void * context)
+{
+    IOServiceMatchContext * ctx = (typeof(ctx)) context;
+    IOService *    service = (typeof(service)) entry;
+    OSDictionary * table   = ctx->table;
+    uint32_t	   options = ctx->options;
+    uint32_t	   state   = ctx->state;
+    uint32_t       done;
+    bool           match;
+
+    done = 0;
+    do
+    {
+	match = ((state == (state & service->__state[0]))
+		&& (0 == (service->__state[0] & kIOServiceInactiveState)));
+	if (!match) break;
+	ctx->count += table->getCount();
+        match = service->matchInternal(table, options, &done);
+	ctx->done += done;
+    }
+    while (false);
+    if (!match)
+    	return (false);
+
+    if ((kIONotifyOnce & options) && (ctx->done == ctx->count))
+    {
+	service->retain();
+	ctx->result = service;
+	return (true);
+    }
+    else if (!ctx->result)
+    {
+	ctx->result = OSSet::withObjects((const OSObject **) &service, 1, 1);
+    }
+    else
+    {
+    	((OSSet *)ctx->result)->setObject(service);
+    }
+    return (false);
+}
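instanceMatch() is the per-object predicate behind the new fast path: instead of iterating the whole registry plane, copyExistingServices() now walks the instances tracked per metaclass (via applyToInstancesOfClassName / applyToInstances below), returning the first fully-matched service for kIONotifyOnce or accumulating partial matches into an OSSet for a second matchPassive() pass. A standalone model of that accumulate-or-stop-early shape:

    #include <cstdio>
    #include <string>
    #include <vector>

    struct Service { std::string name; bool inactive; };

    // Apply a match predicate to the tracked instances of a class,
    // stopping at the first hit when only one result is wanted (the
    // kIONotifyOnce analogue) or collecting a set otherwise.
    static const Service * applyToInstances(const std::vector<Service> & instances,
                                            const std::string & wanted,
                                            bool firstOnly,
                                            std::vector<const Service *> & results) {
        for (const Service & s : instances) {
            if (s.inactive || s.name != wanted) continue;  // state + table check
            if (firstOnly) return &s;                      // stop early
            results.push_back(&s);                         // OSSet analogue
        }
        return nullptr;
    }

    int main() {
        std::vector<Service> tracked = { {"IOUSBDevice", false}, {"IODisplay", false} };
        std::vector<const Service *> set;
        if (const Service * hit = applyToInstances(tracked, "IODisplay", true, set))
            std::printf("first match: %s\n", hit->name.c_str());
    }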
+
 // internal - call with gNotificationLock
 OSObject * IOService::copyExistingServices( OSDictionary * matching,
 		 IOOptionBits inState, IOOptionBits options )
 {
-    OSObject *		current = 0;
-    OSIterator *	iter;
-    IOService *		service;
-    OSObject *		obj;
+    OSObject *	 current = 0;
+    OSIterator * iter;
+    IOService *	 service;
+    OSObject *	 obj;
+    OSString *   str;
 
     if( !matching)
 	return( 0 );
 
-    if(true 
-      && (obj = matching->getObject(gIOProviderClassKey))
+#if MATCH_DEBUG
+    OSSerialize * s = OSSerialize::withCapacity(128);
+    matching->serialize(s);
+#endif
+
+    if((obj = matching->getObject(gIOProviderClassKey))
       && gIOResourcesKey
       && gIOResourcesKey->isEqualTo(obj)
       && (service = gIOResources))
     {
 	if( (inState == (service->__state[0] & inState))
 	  && (0 == (service->__state[0] & kIOServiceInactiveState))
-	  &&  service->passiveMatch( matching ))
+	  &&  service->matchPassive(matching, options))
 	{
 	    if( options & kIONotifyOnce)
 	    {
@@ -3546,12 +3588,69 @@ OSObject * IOService::copyExistingServices( OSDictionary * matching,
 		current = service;
 	    }
 	    else
-		current = OSSet::withObjects(
-				(const OSObject **) &service, 1, 1 );
+		current = OSSet::withObjects((const OSObject **) &service, 1, 1 );
 	}
     }
     else
     {
+    	IOServiceMatchContext ctx;
+	ctx.table   = matching;
+	ctx.state   = inState;
+	ctx.count   = 0;
+	ctx.done    = 0;
+	ctx.options = options;
+	ctx.result  = 0;
+
+	if ((str = OSDynamicCast(OSString, obj)))
+	{
+	    const OSSymbol * sym = OSSymbol::withString(str);
+	    OSMetaClass::applyToInstancesOfClassName(sym, instanceMatch, &ctx);
+	    sym->release();
+	}
+	else
+	{
+	    IOService::gMetaClass.applyToInstances(instanceMatch, &ctx);
+	}
+
+
+	current = ctx.result;
+
+	options |= kIOServiceInternalDone | kIOServiceClassDone;
+	if (current && (ctx.done != ctx.count))
+	{
+	    OSSet *
+	    source = OSDynamicCast(OSSet, current);
+	    current = 0;
+	    while ((service = (IOService *) source->getAnyObject()))
+	    {
+		if (service->matchPassive(matching, options))
+		{
+		    if( options & kIONotifyOnce)
+		    {
+			service->retain();
+			current = service;
+			break;
+		    }
+		    if( current)
+		    {
+			((OSSet *)current)->setObject( service );
+		    }
+		    else
+		    {
+			current = OSSet::withObjects(
+					(const OSObject **) &service, 1, 1 );
+		    }
+		}
+		source->removeObject(service);	    
+	    }
+	    source->release();
+	}
+    }
+
+#if MATCH_DEBUG
+    {
+	OSObject * _current = 0;
+    
 	iter = IORegistryIterator::iterateOver( gIOServicePlane,
 					    kIORegistryIterateRecursively );
 	if( iter) {
@@ -3560,24 +3659,42 @@ OSObject * IOService::copyExistingServices( OSDictionary * matching,
 		while( (service = (IOService *) iter->getNextObject())) {
 		    if( (inState == (service->__state[0] & inState))
 		    && (0 == (service->__state[0] & kIOServiceInactiveState))
-		    &&  service->passiveMatch( matching )) {
+		    &&  service->matchPassive(matching, 0)) {
     
 			if( options & kIONotifyOnce) {
 			    service->retain();
-			    current = service;
+			    _current = service;
 			    break;
 			}
-			if( current)
-			    ((OSSet *)current)->setObject( service );
+			if( _current)
+			    ((OSSet *)_current)->setObject( service );
 			else
-			    current = OSSet::withObjects(
+			    _current = OSSet::withObjects(
 					    (const OSObject **) &service, 1, 1 );
 		    }
 		}
 	    } while( !service && !iter->isValid());
 	    iter->release();
 	}
-    }
+
+
+	if ( ((current != 0) != (_current != 0)) 
+	|| (current && _current && !current->isEqualTo(_current)))
+	{
+	    OSSerialize * s1 = OSSerialize::withCapacity(128);
+	    OSSerialize * s2 = OSSerialize::withCapacity(128);
+	    current->serialize(s1);
+	    _current->serialize(s2);
+	    kprintf("**mismatch** %p %p\n%s\n%s\n%s\n", current, _current, s->text(), s1->text(), s2->text());
+	    s1->release();
+	    s2->release();
+	}
+
+	if (_current) _current->release();
+    }    
+
+    s->release();
+#endif
 
     if( current && (0 == (options & (kIONotifyOnce | kIOServiceExistingSet)))) {
 	iter = OSCollectionIterator::withCollection( (OSSet *)current );
@@ -3604,6 +3721,21 @@ OSIterator * IOService::getMatchingServices( OSDictionary * matching )
     return( iter );
 }
 
+IOService * IOService::copyMatchingService( OSDictionary * matching )
+{
+    IOService *	service;
+
+    // is a lock even needed?
+    LOCKWRITENOTIFY();
+
+    service = (IOService *) copyExistingServices( matching,
+						kIOServiceMatchedState, kIONotifyOnce );
+    
+    UNLOCKNOTIFY();
+
+    return( service );
+}
+
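copyMatchingService() gives in-kernel callers a one-shot lookup that returns the first matched, retained service. A hedged usage fragment (the class name is illustrative; the caller owns and must release both references):

    OSDictionary * matching = IOService::serviceMatching("IODisplayWrangler");
    if (matching) {
        IOService * service = IOService::copyMatchingService(matching);
        matching->release();
        if (service) {
            // ... use the matched service ...
            service->release();
        }
    }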
 struct _IOServiceMatchingNotificationHandlerRef
 {
     IOServiceNotificationHandler handler;
@@ -3911,7 +4043,7 @@ void IOService::deliverNotification( const OSSymbol * type,
         if( iter) {
             while( (notify = (_IOServiceNotifier *) iter->getNextObject())) {
 
-                if( passiveMatch( notify->matching)
+                if( matchPassive(notify->matching, 0)
                   && (kIOServiceNotifyEnable & notify->state)) {
                     if( 0 == willSend)
                         willSend = OSArray::withCapacity(8);
@@ -3950,10 +4082,18 @@ IOOptionBits IOService::getState( void ) const
 OSDictionary * IOService::serviceMatching( const OSString * name,
 			OSDictionary * table )
 {
+
+    const OSString *	str;
+
+    str = OSSymbol::withString(name);
+    if( !str)
+	return( 0 );
+
     if( !table)
 	table = OSDictionary::withCapacity( 2 );
     if( table)
-        table->setObject(gIOProviderClassKey, (OSObject *)name );
+        table->setObject(gIOProviderClassKey, (OSObject *)str );
+    str->release();
 
     return( table );
 }
@@ -4238,28 +4378,37 @@ void IOService::updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessa
     IORegistryEntry * regEntry;
     OSObject *        locked = kOSBooleanFalse;
     uint32_t          idx;
-    bool              loggedIn;
     bool              publish;
     OSDictionary *    user;
     static IOMessage  sSystemPower;
 
     regEntry = IORegistryEntry::getRegistryRoot();
 
+    if (!gIOChosenEntry)
+	gIOChosenEntry = IORegistryEntry::fromPath("/chosen", gIODTPlane);
+
     IOLockLock(gIOConsoleUsersLock);
 
     if (systemMessage)
     {
         sSystemPower = systemMessage;
+#if HIBERNATION
+	if ((kIOMessageSystemHasPoweredOn == systemMessage) && IOHibernateWasScreenLocked())
+	{
+	    locked = kOSBooleanTrue;
+	}
+#endif /* HIBERNATION */
     }
-    loggedIn = false;
+
     if (consoleUsers)
     {
         OSNumber * num = 0;
+	gIOConsoleLoggedIn = false;
 	for (idx = 0; 
 	      (user = OSDynamicCast(OSDictionary, consoleUsers->getObject(idx))); 
 	      idx++)
 	{
-	    loggedIn |= ((kOSBooleanTrue == user->getObject(gIOConsoleSessionOnConsoleKey))
+	    gIOConsoleLoggedIn |= ((kOSBooleanTrue == user->getObject(gIOConsoleSessionOnConsoleKey))
 	      		&& (kOSBooleanTrue == user->getObject(gIOConsoleSessionLoginDoneKey)));
 	    if (!num)
 	    {
@@ -4269,7 +4418,7 @@ void IOService::updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessa
         gIOConsoleLockTime = num ? num->unsigned32BitValue() : 0;
     }
 
-    if (!loggedIn 
+    if (!gIOConsoleLoggedIn 
      || (kIOMessageSystemWillSleep == sSystemPower)
      || (kIOMessageSystemPagingOff == sSystemPower))
     {
@@ -4304,6 +4453,20 @@ void IOService::updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessa
 	OSIncrementAtomic( &gIOConsoleUsersSeed );
     }
 
+#if HIBERNATION
+    if (gIOChosenEntry)
+    {
+	uint32_t screenLockState;
+
+	if (locked == kOSBooleanTrue) screenLockState = kIOScreenLockLocked;
+	else if (gIOConsoleLockTime)  screenLockState = kIOScreenLockUnlocked;
+	else                          screenLockState = kIOScreenLockNoLock;
+
+	if (screenLockState != gIOScreenLockState) gIOChosenEntry->setProperty(kIOScreenLockStateKey, &screenLockState, sizeof(screenLockState));
+	gIOScreenLockState = screenLockState;
+    }
+#endif /* HIBERNATION */
+
     IOLockUnlock(gIOConsoleUsersLock);
 
     if (publish)
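The HIBERNATION block above folds the console state into a tri-state screen-lock value and writes it into /chosen only when it differs from the last published value, so the property is not rewritten on every console-user update. A standalone model of that change-detection publish (the numeric values of the kIOScreenLock* constants here are illustrative):

    #include <cstdint>
    #include <cstdio>

    enum : uint32_t { kNoLock = 1, kUnlocked = 2, kLocked = 3 };  // illustrative values

    static uint32_t gPublishedState = 0;

    static void publishScreenLockState(bool locked, uint32_t lockTime) {
        uint32_t state = locked ? kLocked : (lockTime ? kUnlocked : kNoLock);
        if (state != gPublishedState)      // write /chosen only on change
            std::printf("setProperty(kIOScreenLockStateKey, %u)\n", state);
        gPublishedState = state;
    }

    int main() {
        publishScreenLockState(false, 0);  // no lock configured -> publish
        publishScreenLockState(false, 0);  // unchanged -> no write
        publishScreenLockState(true, 300); // locked -> publish
    }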
@@ -4455,144 +4618,188 @@ IOService * IOService::matchLocation( IOService * /* client */ )
     return( parent );
 }
 
-bool IOService::passiveMatch( OSDictionary * table, bool changesOK )
+bool IOService::matchInternal(OSDictionary * table, uint32_t options, uint32_t * did)
 {
-    IOService *		where;
     OSString *		matched;
     OSObject *		obj;
     OSString *		str;
     IORegistryEntry *	entry;
     OSNumber *		num;
-    SInt32		score;
-    OSNumber *		newPri;
     bool		match = true;
-    bool		matchParent = false;
-    UInt32		done;
-
-    assert( table );
+    bool                changesOK = (0 != (kIOServiceChangesOK & options));
+    uint32_t            count;
+    uint32_t            done;
 
-    where = this;
+    do
+    {
+	count = table->getCount();
+	done = 0;
+	str = OSDynamicCast(OSString, table->getObject(gIOProviderClassKey));
+	if (str) {
+	    done++;
+	    match = ((kIOServiceClassDone & options) || (0 != metaCast(str)));
+#if MATCH_DEBUG
+	    match = (0 != metaCast( str ));
+	    if ((kIOServiceClassDone & options) && !match) panic("classDone");
+#endif
+	    if ((!match) || (done == count)) break;
+	}
 
-    do {
-        do {
-            done = 0;
-
-            str = OSDynamicCast( OSString, table->getObject( gIOProviderClassKey));
-            if( str) {
-                done++;
-                match = (0 != where->metaCast( str ));
-                if( !match)
-                    break;
-            }
+	obj = table->getObject( gIONameMatchKey );
+	if( obj) {
+	    done++;
+	    match = compareNames( obj, changesOK ? &matched : 0 );
+	    if (!match)	break;
+	    if( changesOK && matched) {
+		// leave a hint as to which name matched
+		table->setObject( gIONameMatchedKey, matched );
+		matched->release();
+	    }
+	    if (done == count) break;
+	}
 
-            obj = table->getObject( gIONameMatchKey );
-            if( obj) {
-                done++;
-                match = where->compareNames( obj, changesOK ? &matched : 0 );
-                if( !match)
-                    break;
-                if( changesOK && matched) {
-                    // leave a hint as to which name matched
-                    table->setObject( gIONameMatchedKey, matched );
-                    matched->release();
-                }
-            }
+	str = OSDynamicCast( OSString, table->getObject( gIOLocationMatchKey ));
+	if (str)
+	{
+    	    const OSSymbol * sym;
+	    done++;
+	    match = false;
+	    sym = copyLocation();
+	    if (sym) {
+		match = sym->isEqualTo( str );
+		sym->release();
+	    }
+	    if ((!match) || (done == count)) break;
+	}
 
-            str = OSDynamicCast( OSString, table->getObject( gIOLocationMatchKey ));
-            if( str) {
+	obj = table->getObject( gIOPropertyMatchKey );
+	if( obj)
+	{
+	    OSDictionary * dict;
+	    OSDictionary * nextDict;
+	    OSIterator *   iter;
+	    done++;
+	    match = false;
+	    dict = dictionaryWithProperties();
+	    if( dict) {
+		nextDict = OSDynamicCast( OSDictionary, obj);
+		if( nextDict)
+		    iter = 0;
+		else
+		    iter = OSCollectionIterator::withCollection(
+				OSDynamicCast(OSCollection, obj));
+
+		while( nextDict
+		    || (iter && (0 != (nextDict = OSDynamicCast(OSDictionary,
+					    iter->getNextObject()))))) {
+		    match = dict->isEqualTo( nextDict, nextDict);
+		    if( match)
+			break;
+		    nextDict = 0;
+		}
+		dict->release();
+		if( iter)
+		    iter->release();
+	    }
+	    if ((!match) || (done == count)) break;
+	}
 
-                const OSSymbol * sym;
+	str = OSDynamicCast( OSString, table->getObject( gIOPathMatchKey ));
+	if( str) {
+	    done++;
+	    entry = IORegistryEntry::fromPath( str->getCStringNoCopy() );
+	    match = (this == entry);
+	    if( entry)
+		entry->release();
+	    if ((!match) || (done == count)) break;
+	}
 
-                done++;
-                match = false;
-                sym = where->copyLocation();
-                if( sym) {
-                    match = sym->isEqualTo( str );
-                    sym->release();
-                }
-                if( !match)
-                    break;
-            }
+	num = OSDynamicCast( OSNumber, table->getObject( gIORegistryEntryIDKey ));
+	if (num) {
+	    done++;
+	    match = (getRegistryEntryID() == num->unsigned64BitValue());
+	    if ((!match) || (done == count)) break;
+	}
 
-            obj = table->getObject( gIOPropertyMatchKey );
-            if( obj) {
+	num = OSDynamicCast( OSNumber, table->getObject( gIOMatchedServiceCountKey ));
+	if( num)
+	{
+	    OSIterator *	iter;
+	    IOService *		service = 0;
+	    UInt32		serviceCount = 0;
 
-                OSDictionary * dict;
-                OSDictionary * nextDict;
-                OSIterator *   iter;
+	    done++;
+	    iter = getClientIterator();
+	    if( iter) {
+		while( (service = (IOService *) iter->getNextObject())) {
+		    if( kIOServiceInactiveState & service->__state[0])
+			continue;
+		    if( 0 == service->getProperty( gIOMatchCategoryKey ))
+			continue;
+		    ++serviceCount;
+		}
+		iter->release();
+	    }
+	    match = (serviceCount == num->unsigned32BitValue());
+	    if ((!match) || (done == count)) break;
+	}
 
-                done++;
-                match = false;
-                dict = where->dictionaryWithProperties();
-                if( dict) {
-                    nextDict = OSDynamicCast( OSDictionary, obj);
-                    if( nextDict)
-                        iter = 0;
-                    else
-                        iter = OSCollectionIterator::withCollection(
-                                    OSDynamicCast(OSCollection, obj));
-
-                    while( nextDict
-                        || (iter && (0 != (nextDict = OSDynamicCast(OSDictionary,
-                                                iter->getNextObject()))))) {
-                        match = dict->isEqualTo( nextDict, nextDict);
-                        if( match)
-                            break;
-                        nextDict = 0;
-                    }
-                    dict->release();
-                    if( iter)
-                        iter->release();
-                }
-                if( !match)
-                    break;
-            }
+#define propMatch(key)					\
+	obj = table->getObject(key);			\
+	if (obj)					\
+	{						\
+	    OSObject * prop;				\
+	    done++;					\
+	    prop = copyProperty(key);			\
+	    match = obj->isEqualTo(prop);		\
+            if (prop) prop->release();			\
+	    if ((!match) || (done == count)) break;	\
+	}
+	propMatch(kIOBSDNameKey)
+	propMatch(kIOBSDMajorKey)
+	propMatch(kIOBSDMinorKey)
+	propMatch(kIOBSDUnitKey)
+#undef propMatch
+    }
+    while (false);
 
-            str = OSDynamicCast( OSString, table->getObject( gIOPathMatchKey ));
-            if( str) {
-                done++;
-                entry = IORegistryEntry::fromPath( str->getCStringNoCopy() );
-                match = (where == entry);
-                if( entry)
-                    entry->release();
-                if( !match)
-                    break;
-            }
+    if (did) *did = done;
+    return (match);
+}
 
-            num = OSDynamicCast( OSNumber, table->getObject( gIORegistryEntryIDKey ));
-            if( num) {
-		done++;
-                match = (getRegistryEntryID() == num->unsigned64BitValue());
-	    }
+bool IOService::passiveMatch( OSDictionary * table, bool changesOK )
+{
+    return (matchPassive(table, changesOK ? kIOServiceChangesOK : 0));
+}
 
-            num = OSDynamicCast( OSNumber, table->getObject( gIOMatchedServiceCountKey ));
-            if( num) {
+bool IOService::matchPassive(OSDictionary * table, uint32_t options)
+{
+    IOService *		where;
+    OSDictionary *      nextTable;
+    SInt32		score;
+    OSNumber *		newPri;
+    bool		match = true;
+    bool		matchParent = false;
+    uint32_t		count;
+    uint32_t		done;
 
-                OSIterator *	iter;
-                IOService *		service = 0;
-                UInt32		serviceCount = 0;
+    assert( table );
 
-                done++;
-                iter = where->getClientIterator();
-                if( iter) {
-                    while( (service = (IOService *) iter->getNextObject())) {
-                        if( kIOServiceInactiveState & service->__state[0])
-                            continue;
-                        if( 0 == service->getProperty( gIOMatchCategoryKey ))
-                            continue;
-                        ++serviceCount;
-                    }
-                    iter->release();
-                }
-                match = (serviceCount == num->unsigned32BitValue());
-                if( !match)
-                    break;
-            }
+#if MATCH_DEBUG 
+    OSDictionary * root = table;
+#endif
 
-            if( done == table->getCount()) {
-                // don't call family if we've done all the entries in the table
-                matchParent = false;
-                break;
+    where = this;
+    do
+    {
+        do
+        {
+	    count = table->getCount();
+	    if (!(kIOServiceInternalDone & options))
+	    {
+		match = where->matchInternal(table, options, &done);
+		// don't call family if we've done all the entries in the table
+		if ((!match) || (done == count)) break;
             }
 
             // pass in score from property table
@@ -4609,7 +4816,7 @@ bool IOService::passiveMatch( OSDictionary * table, bool changesOK )
                 break;
             }
 
-            if( changesOK) {
+            if (kIOServiceChangesOK & options) {
                 // save the score
                 newPri = OSNumber::withNumber( score, 32 );
                 if( newPri) {
@@ -4618,43 +4825,42 @@ bool IOService::passiveMatch( OSDictionary * table, bool changesOK )
                 }
             }
 
-            if( !(match = where->compareProperty( table, kIOBSDNameKey )))
-                break;
-            if( !(match = where->compareProperty( table, kIOBSDMajorKey )))
-                break;
-            if( !(match = where->compareProperty( table, kIOBSDMinorKey )))
-                break;
-            if( !(match = where->compareProperty( table, kIOBSDUnitKey )))
-                break;
-
+	    options = 0;
             matchParent = false;
 
-            obj = OSDynamicCast( OSDictionary,
+            nextTable = OSDynamicCast(OSDictionary,
                   table->getObject( gIOParentMatchKey ));
-            if( obj) {
+            if( nextTable) {
+		// look for a matching entry anywhere up to root
                 match = false;
                 matchParent = true;
-                table = (OSDictionary *) obj;
+		table = nextTable;
                 break;
             }
 
-            table = OSDynamicCast( OSDictionary,
+            table = OSDynamicCast(OSDictionary,
                     table->getObject( gIOLocationMatchKey ));
-            if( table) {
+            if (table) {
+		// look for a matching entry at matchLocation()
                 match = false;
                 where = where->getProvider();
-                if( where)
-                    where = where->matchLocation( where );
+                if (where && (where = where->matchLocation(where))) continue;
             }
+            break;
+        }
+        while (true);
+    }
+    while( matchParent && (!match) && (where = where->getProvider()) );
 
-        } while( table && where );
-
-    } while( matchParent && (where = where->getProvider()) );
-
-    if( kIOLogMatch & gIOKitDebug)
-        if( where && (where != this) )
-            LOG("match parent @ %s = %d\n",
-                        where->getName(), match );
+#if MATCH_DEBUG
+    if (where != this) 
+    {
+	OSSerialize * s = OSSerialize::withCapacity(128);
+	root->serialize(s);
+	kprintf("parent match 0x%llx, %d,\n%s\n", getRegistryEntryID(), match, s->text());
+	s->release();
+    }
+#endif
 
     return( match );
 }
diff --git a/iokit/Kernel/IOServicePM.cpp b/iokit/Kernel/IOServicePM.cpp
index 4905ec2cd..814a402ef 100644
--- a/iokit/Kernel/IOServicePM.cpp
+++ b/iokit/Kernel/IOServicePM.cpp
@@ -47,6 +47,7 @@
 #include 
 
 #include 
+#include 
 #include 
 
 // Required for notification instrumentation
@@ -90,8 +91,8 @@ static IOPMRequestQueue *    gIOPMReplyQueue    = 0;
 static IOPMWorkQueue *       gIOPMWorkQueue     = 0;
 static IOPMCompletionQueue * gIOPMFreeQueue     = 0;
 static IOPMRequest *         gIOPMRequest       = 0;
-static IOPlatformExpert *    gPlatform          = 0;
 static IOService *           gIOPMRootNode      = 0;
+static IOPlatformExpert *    gPlatform          = 0;
 
 static const OSSymbol *      gIOPMPowerClientDevice     = 0;
 static const OSSymbol *      gIOPMPowerClientDriver     = 0;
@@ -143,7 +144,7 @@ do {                                  \
 #define PM_LOCK_WAKEUP(event)       IOLockWakeup(fPMLock, event, false)
 
 #define ns_per_us                   1000
-#define k30seconds                  (30*1000000)
+#define k30Seconds                  (30*1000000)
 #define kMinAckTimeoutTicks         (10*1000000)
 #define kIOPMTardyAckSPSKey         "IOPMTardyAckSetPowerState"
 #define kIOPMTardyAckPSCKey         "IOPMTardyAckPowerStateChange"
@@ -196,6 +197,16 @@ enum {
     do { if (fPMActions.a) { \
          (fPMActions.a)(fPMActions.target, this, &fPMActions, x, y); } \
          } while (false)
+         
+static OSNumber * copyClientIDForNotification(
+    OSObject *object, 
+    IOPMInterestContext *context);
+
+static void logClientIDForNotification(
+    OSObject *object,
+    IOPMInterestContext *context, 
+    const char *logString);
+         
 
 //*********************************************************************************
 // PM machine states
@@ -747,6 +758,10 @@ void IOService::handlePMstop ( IOPMRequest * request )
 	PM_ASSERT_IN_GATE();
 	PM_LOG2("%s: %p %s start\n", getName(), this, __FUNCTION__);
 
+    // remove driver from prevent system sleep lists
+    getPMRootDomain()->updatePreventIdleSleepList(this, false);
+    getPMRootDomain()->updatePreventSystemSleepList(this, false);
+
     // remove the property
     removeProperty(kPwrMgtKey);			
 
@@ -1983,64 +1998,51 @@ void IOService::setParentInfo (
     }
 }
 
-//*********************************************************************************
-// [private] rebuildChildClampBits
-//
-// The ChildClamp bits (kIOPMChildClamp & kIOPMChildClamp2) in our capabilityFlags
-// indicate that one of our children (or grandchildren or great-grandchildren ...)
-// doesn't support idle or system sleep in its current state. Since we don't track
-// the origin of each bit, every time any child changes state we have to clear
-// these bits and rebuild them.
-//*********************************************************************************
+//******************************************************************************
+// [private] trackSystemSleepPreventers
+//******************************************************************************
 
-void IOService::rebuildChildClampBits ( void )
+void IOService::trackSystemSleepPreventers(
+    IOPMPowerStateIndex     oldPowerState,
+    IOPMPowerStateIndex     newPowerState,
+    IOPMPowerChangeFlags    changeFlags __unused )
 {
-    unsigned long		i;
-    OSIterator *		iter;
-    OSObject *			next;
-    IOPowerConnection *	connection;
-	unsigned long		powerState;
+    IOPMPowerFlags  oldCapability, newCapability;
 
-    // A child's desires has changed. We need to rebuild the child-clamp bits in
-	// our power state array. Start by clearing the bits in each power state.
-    
-    for ( i = 0; i < fNumberOfPowerStates; i++ )
-    {
-        fPowerStates[i].capabilityFlags &= ~(kIOPMChildClamp | kIOPMChildClamp2);
-    }
-
-	if (!inPlane(gIOPowerPlane))
-		return;
+    oldCapability = fPowerStates[oldPowerState].capabilityFlags &
+                    (kIOPMPreventIdleSleep | kIOPMPreventSystemSleep);
+    newCapability = fPowerStates[newPowerState].capabilityFlags &
+                    (kIOPMPreventIdleSleep | kIOPMPreventSystemSleep);
 
-    // Loop through the children. When we encounter the calling child, save the
-	// computed state as this child's desire. And set the ChildClamp bits in any
-    // of our states that some child has clamp on.
+    if (fHeadNoteChangeFlags & kIOPMInitialPowerChange)
+        oldCapability = 0;
+    if (oldCapability == newCapability)
+        return;
 
-    iter = getChildIterator(gIOPowerPlane);
-    if ( iter )
+    if ((oldCapability ^ newCapability) & kIOPMPreventIdleSleep)
     {
-        while ( (next = iter->getNextObject()) )
+#if SUPPORT_IDLE_CANCEL
+        if ((oldCapability & kIOPMPreventIdleSleep) == 0)
         {
-            if ( (connection = OSDynamicCast(IOPowerConnection, next)) )
-            {
-				if (connection->getReadyFlag() == false)
-				{
-					PM_LOG3("[%s] %s: connection not ready\n",
-						getName(), __FUNCTION__);
-					continue;
-				}
+            IOPMRequest *   cancelRequest;
 
-				powerState = connection->getDesiredDomainState();
-                if (powerState < fNumberOfPowerStates)
-                {
-                    if ( connection->getPreventIdleSleepFlag() )
-                        fPowerStates[powerState].capabilityFlags |= kIOPMChildClamp;
-                    if ( connection->getPreventSystemSleepFlag() )
-                        fPowerStates[powerState].capabilityFlags |= kIOPMChildClamp2;
-                }
+            cancelRequest = acquirePMRequest( this, kIOPMRequestTypeIdleCancel );
+            if (cancelRequest)
+            {
+                getPMRootDomain()->submitPMRequest( cancelRequest );
             }
         }
-        iter->release();
+#endif
+    
+        getPMRootDomain()->updatePreventIdleSleepList(this,
+            ((oldCapability & kIOPMPreventIdleSleep) == 0));
+    }
+
+    if ((oldCapability ^ newCapability) & kIOPMPreventSystemSleep)
+    {
+        
+        getPMRootDomain()->updatePreventSystemSleepList(this,
+            ((oldCapability & kIOPMPreventSystemSleep) == 0));
     }
 }
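trackSystemSleepPreventers() replaces the old child-clamp rebuild: it XORs the prevent-idle/prevent-system capability bits of the old and new power states and, on each edge, adds or removes the driver from the root domain's preventer lists; checkSystemCanSleep() (earlier in this patch) then only has to test preventSystemSleepList->getCount(). A standalone sketch of the edge-detection bookkeeping:

    #include <cstdint>
    #include <cstdio>
    #include <set>

    enum : uint32_t { kPreventIdle = 1u << 0, kPreventSystem = 1u << 1 };

    static std::set<int> gIdlePreventers, gSystemPreventers;

    static void trackPreventers(int driverId, uint32_t oldCaps, uint32_t newCaps) {
        if ((oldCaps ^ newCaps) & kPreventIdle) {        // edge on the idle bit
            if (newCaps & kPreventIdle) gIdlePreventers.insert(driverId);
            else                        gIdlePreventers.erase(driverId);
        }
        if ((oldCaps ^ newCaps) & kPreventSystem) {      // edge on the system bit
            if (newCaps & kPreventSystem) gSystemPreventers.insert(driverId);
            else                          gSystemPreventers.erase(driverId);
        }
    }

    int main() {
        trackPreventers(7, 0, kPreventSystem);           // driver 7 clamps system sleep
        std::printf("system-sleep preventers: %zu\n", gSystemPreventers.size());
        trackPreventers(7, kPreventSystem, 0);           // clamp released
        std::printf("system-sleep preventers: %zu\n", gSystemPreventers.size());
    }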
 
@@ -2059,7 +2061,6 @@ IOReturn IOService::requestPowerDomainState(
 	IOPMPowerFlags		outputPowerFlags;
     IOService *         child;
 	IOPMRequest *       subRequest;
-    bool                preventIdle, preventSleep; 
     bool                adjustPower = false;
 
     if (!initialized)
@@ -2082,10 +2083,6 @@ IOReturn IOService::requestPowerDomainState(
 	child = (IOService *) childConnection->getChildEntry(gIOPowerPlane);
 	assert(child);
 
-    preventIdle  = ((childRequestPowerFlags & kIOPMPreventIdleSleep) != 0);
-    preventSleep = ((childRequestPowerFlags & kIOPMPreventSystemSleep) != 0);
-    childRequestPowerFlags &= ~(kIOPMPreventIdleSleep | kIOPMPreventSystemSleep);
-
     // Merge in the power flags contributed by this power parent
     // at its current or impending power state. 
 
@@ -2156,9 +2153,7 @@ IOReturn IOService::requestPowerDomainState(
     // prevent idle/sleep flags towards the root domain.
 
     if (!childConnection->childHasRequestedPower() ||
-        (ps != childConnection->getDesiredDomainState()) ||
-        (childConnection->getPreventIdleSleepFlag() != preventIdle) ||
-        (childConnection->getPreventSystemSleepFlag() != preventSleep))
+        (ps != childConnection->getDesiredDomainState()))
         adjustPower = true;
 
 #if ENABLE_DEBUG_LOGS
@@ -2173,13 +2168,8 @@ IOReturn IOService::requestPowerDomainState(
 #endif
 
 	// Record the child's desires on the connection.
-#if SUPPORT_IDLE_CANCEL
-	bool attemptCancel = (preventIdle && !childConnection->getPreventIdleSleepFlag());
-#endif
 	childConnection->setChildHasRequestedPower();
 	childConnection->setDesiredDomainState( ps );
-	childConnection->setPreventIdleSleepFlag( preventIdle );
-	childConnection->setPreventSystemSleepFlag( preventSleep );
 
 	// Schedule a request to re-evaluate all children desires and
 	// adjust power state. Submit a request if one wasn't pending,
@@ -2197,17 +2187,6 @@ IOReturn IOService::requestPowerDomainState(
 		}
     }
 
-#if SUPPORT_IDLE_CANCEL
-	if (attemptCancel)
-	{
-		subRequest = acquirePMRequest( this, kIOPMRequestTypeIdleCancel );
-		if (subRequest)
-		{
-			submitPMRequest( subRequest );
-		}
-	}
-#endif
-
     return kIOReturnSuccess;
 }
 
@@ -3860,12 +3839,14 @@ bool IOService::notifyControllingDriver ( void )
 
     if (fInitialSetPowerState)
     {
+        fInitialSetPowerState = false;
+        fHeadNoteChangeFlags |= kIOPMInitialPowerChange;
+
         // Driver specified flag to skip the initial setPowerState()
         if (fHeadNotePowerArrayEntry->capabilityFlags & kIOPMInitialDeviceState)
         {
             return false;
         }
-        fInitialSetPowerState = false;
     }
 
     param = (DriverCallParam *) fDriverCallParamPtr;
@@ -3989,6 +3970,9 @@ void IOService::all_done ( void )
         // could our driver switch to the new state?
         if ( !( fHeadNoteChangeFlags & kIOPMNotDone) )
         {
+            trackSystemSleepPreventers(
+                fCurrentPowerState, fHeadNotePowerState, fHeadNoteChangeFlags);
+
 			// we changed, tell our parent
             requestDomainPower(fHeadNotePowerState);
 
@@ -4037,14 +4021,8 @@ void IOService::all_done ( void )
 			  ((fHeadNoteChangeFlags & kIOPMDomainDidChange)  &&
              (fCurrentPowerState < fHeadNotePowerState)))
         {
-            if ((fHeadNoteChangeFlags & kIOPMPowerSuppressed) &&
-                (fHeadNotePowerState != fCurrentPowerState) &&
-                (fHeadNotePowerState == fDesiredPowerState))
-            {
-                // Power changed, and desired power state restored.
-                // Clear any prior power desire while in suppressed state.
-                requestDomainPower(fHeadNotePowerState);
-            }
+            trackSystemSleepPreventers(
+                fCurrentPowerState, fHeadNotePowerState, fHeadNoteChangeFlags);
 
             // did power raise?
             if ( fCurrentPowerState < fHeadNotePowerState )
@@ -4221,7 +4199,6 @@ IOReturn IOService::requestDomainPower(
     IOPMPowerStateIndex ourPowerState,
     IOOptionBits        options )
 {
-    const IOPMPSEntry *             powerStateEntry;
     IOPMPowerFlags                  requestPowerFlags;
     IOPMPowerStateIndex             maxPowerState;
     IOPMRequestDomainPowerContext   context;
@@ -4236,13 +4213,7 @@ IOReturn IOService::requestDomainPower(
     // Fetch the input power flags for the requested power state.
     // Parent request is stated in terms of required power flags.
 
-	powerStateEntry = &fPowerStates[ourPowerState];
-	requestPowerFlags = powerStateEntry->inputPowerFlags;
-
-    if (powerStateEntry->capabilityFlags & (kIOPMChildClamp | kIOPMPreventIdleSleep))
-        requestPowerFlags |= kIOPMPreventIdleSleep;
-    if (powerStateEntry->capabilityFlags & (kIOPMChildClamp2 | kIOPMPreventSystemSleep))
-        requestPowerFlags |= kIOPMPreventSystemSleep;
+	requestPowerFlags = fPowerStates[ourPowerState].inputPowerFlags;
 
     // Disregard the "previous request" for power reservation.
 
@@ -5091,18 +5062,24 @@ static void logAppTimeouts ( OSObject * object, void * arg )
             (flag = context->responseArray->getObject(clientIndex)) &&
             (flag != kOSBooleanTrue))
         {
-            OSString * clientID = 0;
-            context->us->messageClient(context->messageType, object, &clientID);
-            PM_ERROR(context->errorLog, clientID ? clientID->getCStringNoCopy() : "");
+            OSString *logClientID = NULL;
+            OSNumber *clientID = copyClientIDForNotification(object, context);    
+            
+            if (clientID) {
+                logClientID = IOCopyLogNameForPID(clientID->unsigned32BitValue());
+                clientID->release();
+            }
+                
+            PM_ERROR(context->errorLog, logClientID ? logClientID->getCStringNoCopy() : "");
 
             // TODO: record message type if possible
             IOService::getPMRootDomain()->pmStatsRecordApplicationResponse(
                 gIOPMStatsApplicationResponseTimedOut,
-                clientID ? clientID->getCStringNoCopy() : "",
+                logClientID ? logClientID->getCStringNoCopy() : "",
                 0, (30*1000), -1);
 
-            if (clientID)
-                clientID->release();
+            if (logClientID)
+                logClientID->release();
         }
     }
 }
@@ -5225,7 +5202,7 @@ bool IOService::tellClientsWithResponse ( int messageType )
                 context.notifyType  = fOutOfBandParameter;
                 context.messageType = messageType;
             }
-            context.maxTimeRequested = k30seconds;
+            context.maxTimeRequested = k30Seconds;
 
             applyToInterested( gIOGeneralInterest,
 				pmTellClientWithResponse, (void *) &context );
@@ -5252,7 +5229,7 @@ bool IOService::tellClientsWithResponse ( int messageType )
             applyToInterested( gIOAppPowerStateInterest,
 				pmTellCapabilityAppWithResponse, (void *) &context );
             fNotifyClientArray = context.notifyClients;
-            context.maxTimeRequested = k30seconds;
+            context.maxTimeRequested = k30Seconds;
             break;
 
         case kNotifyCapabilityChangePriority:
@@ -5299,6 +5276,9 @@ void IOService::pmTellAppWithResponse ( OSObject * object, void * arg )
     IOPMInterestContext *   context = (IOPMInterestContext *) arg;
     IOServicePM *           pwrMgt = context->us->pwrMgt;
     uint32_t                msgIndex, msgRef, msgType;
+    OSNumber                *clientID = NULL;
+    proc_t                  proc = NULL;
+    boolean_t               proc_suspended = FALSE;
 #if LOG_APP_RESPONSE_TIMES
     AbsoluteTime            now;
 #endif
@@ -5306,19 +5286,34 @@ void IOService::pmTellAppWithResponse ( OSObject * object, void * arg )
     if (!OSDynamicCast(_IOServiceInterestNotifier, object))
         return;
 
+    if (context->us == getPMRootDomain())
+    {
+        if ((clientID = copyClientIDForNotification(object, context)))
+        {
+            uint32_t clientPID = clientID->unsigned32BitValue();
+            clientID->release();
+            proc = proc_find(clientPID);
+
+            if (proc)
+            {
+                proc_suspended = get_task_pidsuspended((task_t) proc->task);
+                proc_rele(proc);
+
+                if (proc_suspended)
+                {
+                    logClientIDForNotification(object, context, "PMTellAppWithResponse - Suspended");
+                    return;
+                }
+            }
+        }
+    }
+    
     if (context->messageFilter &&
         !context->messageFilter(context->us, object, context, 0, 0))
     {
         if (kIOLogDebugPower & gIOKitDebug)
         {
-            // Log client pid/name and client array index.
-            OSString * clientID = 0;
-            context->us->messageClient(kIOMessageCopyClientID, object, &clientID);
-            PM_LOG("%s DROP App %s, %s\n",
-                context->us->getName(),
-                getIOMessageString(context->messageType),
-                clientID ? clientID->getCStringNoCopy() : "");
-            if (clientID) clientID->release();
+            logClientIDForNotification(object, context, "DROP App");
         }
         return;
     }
@@ -5335,14 +5330,7 @@ void IOService::pmTellAppWithResponse ( OSObject * object, void * arg )
     OUR_PMLog(kPMLogAppNotify, msgType, msgRef);
     if (kIOLogDebugPower & gIOKitDebug)
     {
-        // Log client pid/name and client array index.
-        OSString * clientID = 0;
-        context->us->messageClient(kIOMessageCopyClientID, object, &clientID);
-        PM_LOG("%s MESG App(%u) %s, %s\n",
-            context->us->getName(),
-            msgIndex, getIOMessageString(msgType),
-            clientID ? clientID->getCStringNoCopy() : "");
-        if (clientID) clientID->release();
+        logClientIDForNotification(object, context, "MESG App");
     }
 
 #if LOG_APP_RESPONSE_TIMES
@@ -5433,15 +5421,12 @@ void IOService::pmTellClientWithResponse ( OSObject * object, void * arg )
     }
 
     retCode = context->us->messageClient(msgType, object, (void *) &notify, sizeof(notify));
-    if ( kIOReturnSuccess == retCode )
+
+    if (kIOReturnSuccess == retCode)
     {
-        if ( 0 == notify.returnValue )
-        {
-            // client doesn't want time to respond
+        if (0 == notify.returnValue) {
 			OUR_PMLog(kPMLogClientAcknowledge, msgRef, (uintptr_t) object);
-        }
-        else
-        {
+        } else {
             replied = kOSBooleanFalse;
             if ( notify.returnValue > context->maxTimeRequested )
             {
@@ -5458,9 +5443,7 @@ void IOService::pmTellClientWithResponse ( OSObject * object, void * arg )
                     context->maxTimeRequested = notify.returnValue;
             }
         }
-    }
-    else
-    {
+    } else {
         // not a client of ours
         // so we won't be waiting for response
 		OUR_PMLog(kPMLogClientAcknowledge, msgRef, 0);
@@ -5507,14 +5490,20 @@ void IOService::pmTellCapabilityAppWithResponse ( OSObject * object, void * arg
     if (kIOLogDebugPower & gIOKitDebug)
     {
         // Log client pid/name and client array index.
-        OSString * clientID = 0;
+        OSNumber * clientID = NULL;
+        OSString * clientIDString = NULL;
         context->us->messageClient(kIOMessageCopyClientID, object, &clientID);
+        if (clientID) {
+            clientIDString = IOCopyLogNameForPID(clientID->unsigned32BitValue());
+        }
+    
         PM_LOG("%s MESG App(%u) %s, wait %u, %s\n",
             context->us->getName(),
             msgIndex, getIOMessageString(msgType),
             (replied != kOSBooleanTrue),
-            clientID ? clientID->getCStringNoCopy() : "");
+            clientIDString ? clientIDString->getCStringNoCopy() : "");
         if (clientID) clientID->release();
+        if (clientIDString) clientIDString->release();
     }
 
     msgArg.notifyRef = msgRef;
@@ -5763,43 +5752,86 @@ static void tellKernelClientApplier ( OSObject * object, void * arg )
     }
 }
 
-//*********************************************************************************
-// [private] tellAppClientApplier
-//
-// Message a registered application.
-//*********************************************************************************
+static OSNumber * copyClientIDForNotification(
+    OSObject *object, 
+    IOPMInterestContext *context)
+{
+    OSNumber *clientID = NULL;
+    context->us->messageClient(kIOMessageCopyClientID, object, &clientID);
+    return clientID;
+}
+
+static void logClientIDForNotification(
+    OSObject *object,
+    IOPMInterestContext *context, 
+    const char *logString)
+{
+    OSString *logClientID = NULL;
+    OSNumber *clientID = copyClientIDForNotification(object, context);    
+
+    if (logString) 
+    {
+        if (clientID)
+            logClientID = IOCopyLogNameForPID(clientID->unsigned32BitValue());
+    
+        PM_LOG("%s %s %s, %s\n",
+            context->us->getName(), logString,
+            IOService::getIOMessageString(context->messageType),
+            logClientID ? logClientID->getCStringNoCopy() : "");
+
+        if (logClientID) 
+            logClientID->release();
+    }
+    
+    if (clientID) 
+        clientID->release();
+
+    return;
+}
+
 
 static void tellAppClientApplier ( OSObject * object, void * arg )
 {
     IOPMInterestContext * context = (IOPMInterestContext *) arg;
+    OSNumber            * clientID = NULL;
+    proc_t                proc = NULL;
+    boolean_t             proc_suspended = FALSE;
+    
+    if (context->us == IOService::getPMRootDomain())
+    {
+        if ((clientID = copyClientIDForNotification(object, context)))
+        {
+            uint32_t clientPID = clientID->unsigned32BitValue();
+            clientID->release();
+            proc = proc_find(clientPID);
+
+            if (proc)
+            {
+                proc_suspended = get_task_pidsuspended((task_t) proc->task);
+                proc_rele(proc);
+
+                if (proc_suspended)
+                {
+                    logClientIDForNotification(object, context, "tellAppClientApplier - Suspended");
+                    return;
+                }
+            }
+        }
+    }
 
     if (context->messageFilter &&
         !context->messageFilter(context->us, object, context, 0, 0))
     {
         if (kIOLogDebugPower & gIOKitDebug)
         {
-            // Log client pid/name and client array index.
-            OSString * clientID = 0;
-            context->us->messageClient(kIOMessageCopyClientID, object, &clientID);
-            PM_LOG("%s DROP App %s, %s\n",
-                context->us->getName(),
-                IOService::getIOMessageString(context->messageType),
-                clientID ? clientID->getCStringNoCopy() : "");
-            if (clientID) clientID->release();
+            logClientIDForNotification(object, context, "DROP App");
         }
         return;
     }
 
     if (kIOLogDebugPower & gIOKitDebug)
     {
-        // Log client pid/name and client array index.
-        OSString * clientID = 0;
-        context->us->messageClient(kIOMessageCopyClientID, object, &clientID);
-        PM_LOG("%s MESG App %s, %s\n",
-            context->us->getName(),
-            IOService::getIOMessageString(context->messageType),
-            clientID ? clientID->getCStringNoCopy() : "");
-        if (clientID) clientID->release();
+        logClientIDForNotification(object, context, "MESG App");
     }
 
     context->us->messageClient(context->messageType, object, 0);
@@ -5814,20 +5846,18 @@ bool IOService::checkForDone ( void )
     int			i = 0;
     OSObject *	theFlag;
 
-    if ( fResponseArray == NULL )
-    {
+    if (fResponseArray == NULL) {
         return true;
     }
     
-    for ( i = 0; ; i++ )
-    {
+    for (i = 0; ; i++) {
         theFlag = fResponseArray->getObject(i);
-        if ( theFlag == NULL )
-        {
+
+        if (NULL == theFlag) {
             break;
         }
-        if ( kOSBooleanTrue != theFlag ) 
-        {
+
+        if (kOSBooleanTrue != theFlag) {
             return false;
         }
     }
@@ -6778,7 +6808,6 @@ void IOService::executePMRequest( IOPMRequest * request )
 
 		case kIOPMRequestTypeAdjustPowerState:
 			fAdjustPowerScheduled = false;
-			rebuildChildClampBits();
 			adjustPowerState();
 			break;
 
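Editorial note: the hunks above converge on one shared pattern — resolve the notification client's pid to a proc, test get_task_pidsuspended(), and skip messaging clients that cannot run. A hedged sketch of that pattern lifted out of context (the standalone helper shape is illustrative, not the shipped code; the calls themselves all appear in this patch):

    // Sketch: decide whether a power-management message can be skipped
    // because the client process is pid-suspended.
    static bool clientIsSuspended(uint32_t clientPID)
    {
        bool suspended = false;
        proc_t proc = proc_find(clientPID);      // takes a reference on the proc
        if (proc) {
            // A pid-suspended task cannot acknowledge a notification, so
            // messaging it would only burn the 30-second response timeout.
            suspended = get_task_pidsuspended((task_t) proc->task);
            proc_rele(proc);                     // balance proc_find()
        }
        return suspended;
    }
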
diff --git a/iokit/Kernel/IOServicePMPrivate.h b/iokit/Kernel/IOServicePMPrivate.h
index bd2ec9234..61e88e4eb 100644
--- a/iokit/Kernel/IOServicePMPrivate.h
+++ b/iokit/Kernel/IOServicePMPrivate.h
@@ -232,7 +232,6 @@ private:
     unsigned int            InitialPowerChange:1;
     unsigned int            InitialSetPowerState:1;
     unsigned int            DeviceOverrideEnabled:1;
-    unsigned int            DeviceWasActive:1;
     unsigned int            DoNotPowerDown:1;
     unsigned int            ParentsKnowState:1;
     unsigned int            StrictTreeOrder:1;
@@ -313,7 +312,12 @@ private:
     uint32_t                OutOfBandMessage;
     uint32_t                TempClampCount;
     uint32_t                OverrideMaxPowerState;
+
+    // Protected by ActivityLock - BEGIN
     uint32_t                ActivityTickleCount;
+    uint32_t                DeviceWasActive;
+    // Protected by ActivityLock - END
+
     uint32_t                WaitReason;
     uint32_t                SavedMachineState;
     uint32_t                RootDomainState;
@@ -445,7 +449,7 @@ the ack timer is ticking every tenth of a second.
 #define kIOPMSyncNoChildNotify      0x0200  // sync root domain only, not entire tree
 #define kIOPMSyncTellPowerDown      0x0400  // send the ask/will power off messages
 #define kIOPMSyncCancelPowerDown    0x0800  // sleep cancel for maintenance wake
-#define kIOPMPowerSuppressed        0x1000  // power suppressed for dark wake
+#define kIOPMInitialPowerChange     0x1000  // set for initial power change
 
 enum {
     kDriverCallInformPreChange,
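
Editorial note: the field moves above follow a documentation convention worth naming — fields are grouped between BEGIN/END comments that state which lock guards them, and DeviceWasActive is widened from a 1-bit bitfield to a full uint32_t so updates made under ActivityLock cannot race stores to neighboring bits packed into the same word. A hedged sketch of the convention (type and field names here are illustrative stand-ins):

    struct ExamplePM {
        // Protected by ActivityLock - BEGIN
        uint32_t ActivityTickleCount;
        uint32_t DeviceWasActive;   // full word, not a bitfield: a bitfield
                                    // store is a read-modify-write of the whole
                                    // word and would race adjacent bits that
                                    // other locks (or no lock) protect
        // Protected by ActivityLock - END
        uint32_t WaitReason;        // guarded by other synchronization
    };
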
diff --git a/iokit/Kernel/IOServicePrivate.h b/iokit/Kernel/IOServicePrivate.h
index 873d47660..cba83742d 100644
--- a/iokit/Kernel/IOServicePrivate.h
+++ b/iokit/Kernel/IOServicePrivate.h
@@ -38,8 +38,11 @@
 
 // options for getExistingServices()
 enum {
-    kIONotifyOnce		= 0x00000001,
-    kIOServiceExistingSet	= 0x00000002
+    kIONotifyOnce	      = 0x00000001,
+    kIOServiceExistingSet     = 0x00000002,
+    kIOServiceChangesOK       = 0x00000004,
+    kIOServiceInternalDone    = 0x00000008,
+    kIOServiceClassDone       = 0x00000010,
 };
 
 // masks for __state[1]
@@ -56,7 +59,7 @@ enum {
     kIOServiceTermPhase2State	= 0x01000000,
     kIOServiceTermPhase3State	= 0x00800000,
     kIOServiceTermPhase1State	= 0x00400000,
-	kIOServiceTerm1WaiterState  = 0x00200000
+    kIOServiceTerm1WaiterState  = 0x00200000
 };
 
 // options for terminate()
diff --git a/iokit/Kernel/IOStatistics.cpp b/iokit/Kernel/IOStatistics.cpp
index 9235b293d..def362e60 100644
--- a/iokit/Kernel/IOStatistics.cpp
+++ b/iokit/Kernel/IOStatistics.cpp
@@ -39,7 +39,6 @@
 #include 
 
 #if IOKITSTATS
-
 bool IOStatistics::enabled = false;
 
 uint32_t IOStatistics::sequenceID = 0;
@@ -178,14 +177,10 @@ void IOStatistics::initialize()
 		return;
 	}
 
-#if DEVELOPMENT || DEBUG
-	/* Always enabled in development and debug builds. */
-#else
-	/* Only enabled in release builds if the boot argument is set. */
+	/* Only enabled if the boot argument is set. */
 	if (!(kIOStatistics & gIOKitDebug)) {
 		return;
 	}
-#endif	
 	
 	sysctl_register_oid(&sysctl__debug_iokit_statistics_general);
 	sysctl_register_oid(&sysctl__debug_iokit_statistics_workloop);
@@ -1218,7 +1213,7 @@ KextNode *IOStatistics::getKextNodeFromBacktrace(boolean_t write) {
 	vm_offset_t *scanAddr = NULL;
 	uint32_t i;
 	KextNode *found = NULL, *ke = NULL;
-
+    
 	btCount = OSBacktrace(bt, btCount);
 
 	if (write) {
@@ -1230,7 +1225,7 @@ KextNode *IOStatistics::getKextNodeFromBacktrace(boolean_t write) {
 	/* Ignore first levels */
 	scanAddr = (vm_offset_t *)&bt[btMin - 1];
 
-	for (i = 0; i < btCount; i++, scanAddr++) {
+	for (i = btMin - 1; i < btCount; i++, scanAddr++) {
 		ke = RB_ROOT(&kextAddressHead);
 		while (ke) {
 			if (*scanAddr < ke->address) {
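
Editorial note: the loop change above is an out-of-bounds fix. scanAddr already starts at &bt[btMin - 1], but the old loop counted i from 0 to btCount, advancing the pointer btCount times and reading past the end of bt. Starting i at btMin - 1 keeps the index and the pointer in lockstep. A minimal sketch of the corrected shape (array size and the btMin value are illustrative):

    vm_offset_t  bt[14];
    uint32_t     btCount = 14;      // in the real code, OSBacktrace() returns
                                    // the number of frames actually captured
    const uint32_t btMin = 3;       // leading frames to ignore

    vm_offset_t *scanAddr = &bt[btMin - 1];
    // Before: for (i = 0; i < btCount; ...) walked scanAddr from bt[btMin-1]
    // through bt[btCount + btMin - 2], i.e. past the end of the array.
    for (uint32_t i = btMin - 1; i < btCount; i++, scanAddr++) {
        // *scanAddr now stays within bt[btMin-1] .. bt[btCount-1]
    }
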
diff --git a/iokit/Kernel/IOSubMemoryDescriptor.cpp b/iokit/Kernel/IOSubMemoryDescriptor.cpp
index 3e06210fb..c82a927ee 100644
--- a/iokit/Kernel/IOSubMemoryDescriptor.cpp
+++ b/iokit/Kernel/IOSubMemoryDescriptor.cpp
@@ -61,10 +61,7 @@ bool IOSubMemoryDescriptor::initSubRange( IOMemoryDescriptor * parent,
 					IOByteCount offset, IOByteCount length,
 					IODirection direction )
 {
-    if( !parent)
-	return( false);
-
-    if( (offset + length) > parent->getLength())
+    if( parent && ((offset + length) > parent->getLength()))
 	return( false);
 
     /*
@@ -83,10 +80,15 @@ bool IOSubMemoryDescriptor::initSubRange( IOMemoryDescriptor * parent,
 	 */
 
 	_parent->release();
-	_parent = 0;
     }
 
-    parent->retain();
+    if (parent) {
+	parent->retain();
+	_tag	= parent->getTag();
+    }
+    else {
+        _tag    = 0;
+    }
     _parent	= parent;
     _start	= offset;
     _length	= length;
@@ -94,7 +96,6 @@ bool IOSubMemoryDescriptor::initSubRange( IOMemoryDescriptor * parent,
 #ifndef __LP64__
     _direction  = (IODirection) (_flags & kIOMemoryDirectionMask);
 #endif /* !__LP64__ */
-    _tag	= parent->getTag();
 
     return( true );
 }
@@ -188,6 +189,19 @@ IOMemoryMap * IOSubMemoryDescriptor::makeMapping(
 uint64_t
 IOSubMemoryDescriptor::getPreparationID( void )
 {
-    return (_parent->getPreparationID());    
+    uint64_t pID;
+
+    if (!super::getKernelReserved())
+        return (kIOPreparationIDUnsupported);    
+
+    pID = _parent->getPreparationID();
+    if (reserved->kernReserved[0] != pID)
+    {
+        reserved->kernReserved[0] = pID;
+        reserved->preparationID   = kIOPreparationIDUnprepared;
+        super::setPreparationID();
+    }
+
+    return (super::getPreparationID());    
 }
 
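Editorial note: getPreparationID() above changes from blindly forwarding the parent's ID to a cache-and-invalidate scheme — remember the parent's last ID in reserved->kernReserved[0], and when it moves, reset this descriptor's own ID so a fresh one is generated. Restated as a hedged sketch (the helper names are stand-ins for the reserved-data accessors used in the hunk):

    uint64_t SubDescriptor::preparationID()
    {
        if (!haveReserved())                      // cf. getKernelReserved()
            return kIOPreparationIDUnsupported;

        uint64_t parentID = fParent->preparationID();
        if (fCachedParentID != parentID) {        // parent was re-prepared
            fCachedParentID = parentID;           // remember its generation
            fMyID = kIOPreparationIDUnprepared;   // invalidate our own ID
            generateNewID();                      // cf. setPreparationID()
        }
        return fMyID;
    }
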
diff --git a/iokit/Kernel/IOUserClient.cpp b/iokit/Kernel/IOUserClient.cpp
index 32ce10c0f..29fb9577f 100644
--- a/iokit/Kernel/IOUserClient.cpp
+++ b/iokit/Kernel/IOUserClient.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2011 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -580,6 +580,9 @@ bool IOServiceUserNotification::init( mach_port_t port, natural_t type,
 				       void * reference, vm_size_t referenceSize,
 				       bool clientIs64 )
 {
+    if( !super::init())
+        return( false );
+
     newSet = OSArray::withCapacity( 1 );
     if( !newSet)
         return( false );
@@ -605,7 +608,7 @@ bool IOServiceUserNotification::init( mach_port_t port, natural_t type,
     pingMsg->notifyHeader.type = type;
     bcopy( reference, pingMsg->notifyHeader.reference, referenceSize );
 
-    return( super::init() );
+    return( true );
 }
 
 void IOServiceUserNotification::free( void )
@@ -622,8 +625,12 @@ void IOServiceUserNotification::free( void )
 
     super::free();
 
-    if( _pingMsg && _msgSize)
-        IOFree( _pingMsg, _msgSize);
+    if( _pingMsg && _msgSize) {
+		if (_pingMsg->msgHdr.msgh_remote_port) {
+			iokit_release_port_send(_pingMsg->msgHdr.msgh_remote_port);
+		}
+        IOFree(_pingMsg, _msgSize);
+	}
 
     if( _lastEntry)
         _lastEntry->release();
@@ -715,6 +722,8 @@ bool IOServiceMessageUserNotification::init( mach_port_t port, natural_t type,
 				void * reference, vm_size_t referenceSize, vm_size_t extraSize,
 				bool client64 )
 {
+    if( !super::init())
+        return( false );
 
     if (referenceSize > sizeof(OSAsyncReference64))
         return( false );
@@ -749,7 +758,7 @@ bool IOServiceMessageUserNotification::init( mach_port_t port, natural_t type,
     pingMsg->notifyHeader.type 		= type;
     bcopy( reference, pingMsg->notifyHeader.reference, referenceSize );
 
-    return( super::init() );
+    return( true );
 }
 
 void IOServiceMessageUserNotification::free( void )
@@ -762,8 +771,12 @@ void IOServiceMessageUserNotification::free( void )
 
     super::free();
 
-    if( _pingMsg && _msgSize)
+    if( _pingMsg && _msgSize) {
+		if (_pingMsg->msgHdr.msgh_remote_port) {
+			iokit_release_port_send(_pingMsg->msgHdr.msgh_remote_port);
+		}
         IOFree( _pingMsg, _msgSize);
+	}
 }
 
 IOReturn IOServiceMessageUserNotification::_handler( void * target, void * ref,
@@ -786,8 +799,8 @@ IOReturn IOServiceMessageUserNotification::handler( void * ref,
 
     if (kIOMessageCopyClientID == messageType)
     {
-	*((void **) messageArgument) = IOCopyLogNameForPID(owningPID);
-	return (kIOReturnSuccess);
+        *((void **) messageArgument) = OSNumber::withNumber(owningPID, 32);
+        return (kIOReturnSuccess);
     }
 
     data->messageType = messageType;
@@ -1619,6 +1632,60 @@ kern_return_t is_io_service_get_matching_services_ool(
     return( kr );
 }
 
+
+/* Routine io_service_get_matching_service */
+kern_return_t is_io_service_get_matching_service(
+	mach_port_t master_port,
+	io_string_t matching,
+	io_service_t *service )
+{
+    kern_return_t	kr;
+    OSObject *		obj;
+    OSDictionary *	dict;
+
+    if( master_port != master_device_port)
+        return( kIOReturnNotPrivileged);
+
+    obj = OSUnserializeXML( matching );
+
+    if( (dict = OSDynamicCast( OSDictionary, obj))) {
+        *service = IOService::copyMatchingService( dict );
+	kr = *service ? kIOReturnSuccess : kIOReturnNotFound;
+    } else
+	kr = kIOReturnBadArgument;
+
+    if( obj)
+        obj->release();
+
+    return( kr );
+}
+
+/* Routine io_service_get_matching_services_ool */
+kern_return_t is_io_service_get_matching_service_ool(
+	mach_port_t master_port,
+	io_buf_ptr_t matching,
+	mach_msg_type_number_t matchingCnt,
+	kern_return_t *result,
+	io_object_t *service )
+{
+    kern_return_t	kr;
+    vm_offset_t 	data;
+    vm_map_offset_t	map_data;
+
+    kr = vm_map_copyout( kernel_map, &map_data, (vm_map_copy_t) matching );
+    data = CAST_DOWN(vm_offset_t, map_data);
+
+    if( KERN_SUCCESS == kr) {
+        // must return success after vm_map_copyout() succeeds
+	*result = is_io_service_get_matching_service( master_port,
+			(char *) data, service );
+	vm_deallocate( kernel_map, data, matchingCnt );
+    }
+
+    return( kr );
+}
+
+
 static kern_return_t internal_io_service_add_notification(
 	mach_port_t master_port,
 	io_name_t notification_type,
@@ -1667,6 +1734,7 @@ static kern_return_t internal_io_service_add_notification(
 
         if( userNotify && !userNotify->init( port, userMsgType,
                                              reference, referenceSize, client64)) {
+			iokit_release_port_send(port);
             userNotify->release();
             userNotify = 0;
         }
@@ -1828,6 +1896,7 @@ static kern_return_t internal_io_service_add_interest_notification(
                                              reference, referenceSize,
 					     kIOUserNotifyMaxMessageSize,
 					     client64 )) {
+			iokit_release_port_send(port);
             userNotify->release();
             userNotify = 0;
         }
@@ -3846,72 +3915,6 @@ kern_return_t shim_io_async_method_structureI_structureO(
     return( err);
 }
 
-/* Routine io_make_matching */
-kern_return_t is_io_make_matching(
-	mach_port_t	    master_port,
-	uint32_t	    type,
-	uint32_t		options,
-        io_struct_inband_t	input,
-        mach_msg_type_number_t	inputCount,
-	io_string_t	matching )
-{
-    OSSerialize * 	s;
-    IOReturn		err = kIOReturnSuccess;
-    OSDictionary *	dict;
-
-    if( master_port != master_device_port)
-        return( kIOReturnNotPrivileged);
-
-    switch( type) {
-
-	case kIOServiceMatching:
-            dict = IOService::serviceMatching( gIOServiceKey );
-	    break;
-
-	case kIOBSDNameMatching:
-	    dict = IOBSDNameMatching( (const char *) input );
-	    break;
-
-	case kIOOFPathMatching:
-	    dict = IOOFPathMatching( (const char *) input,
-                                    matching, sizeof( io_string_t));
-	    break;
-
-	default:
-	    dict = 0;
-    }
-
-    if( !dict)
-	return( kIOReturnUnsupported);
-
-    do {
-        s = OSSerialize::withCapacity(4096);
-        if( !s) {
-            err = kIOReturnNoMemory;
-	    continue;
-	}
-        s->clearText();
-        if( !dict->serialize( s )) {
-            err = kIOReturnUnsupported;
-	    continue;
-        }
-
-        if( s->getLength() > sizeof( io_string_t)) {
-            err = kIOReturnNoMemory;
-	    continue;
-        } else
-            strlcpy(matching, s->text(), sizeof(io_string_t));
-    }
-    while( false);
-
-    if( s)
-	s->release();
-    if( dict)
-	dict->release();
-
-    return( err);
-}
-
 /* Routine io_catalog_send_data */
 kern_return_t is_io_catalog_send_data(
         mach_port_t		master_port,
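
Editorial note on the handler change earlier in this file: kIOMessageCopyClientID now hands back an OSNumber holding the owning pid rather than a ready-made OSString, and callers (see the IOServicePM.cpp logging helpers earlier in this patch) convert it on demand. A hedged sketch of the consumer side, using only calls that appear in this patch:

    OSNumber * clientID = NULL;
    service->messageClient(kIOMessageCopyClientID, notifier, &clientID);
    if (clientID) {
        // Build the "pid, name" text only when a log line is actually emitted.
        OSString * logName = IOCopyLogNameForPID(clientID->unsigned32BitValue());
        if (logName) {
            IOLog("client: %s\n", logName->getCStringNoCopy());
            logName->release();
        }
        clientID->release();
    }
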
diff --git a/iokit/Kernel/IOWorkLoop.cpp b/iokit/Kernel/IOWorkLoop.cpp
index 51045a234..d2c28b043 100644
--- a/iokit/Kernel/IOWorkLoop.cpp
+++ b/iokit/Kernel/IOWorkLoop.cpp
@@ -556,6 +556,8 @@ IOReturn IOWorkLoop::_maintRequest(void *inC, void *inD, void *, void *)
 
     case mRemoveEvent:
         if (inEvent->getWorkLoop()) {
+        	IOStatisticsDetachEventSource();
+    		
         	if (eventSourcePerformsWork(inEvent)) {
 				if (eventChain == inEvent)
 					eventChain = inEvent->getNext();
@@ -595,7 +597,6 @@ IOReturn IOWorkLoop::_maintRequest(void *inC, void *inD, void *, void *)
             inEvent->setNext(0);
             inEvent->release();
             SETP(&fFlags, kLoopRestart);
-            IOStatisticsDetachEventSource();
         }
         break;
 
diff --git a/iokit/Kernel/RootDomainUserClient.cpp b/iokit/Kernel/RootDomainUserClient.cpp
index 92097acde..75a26e7cc 100644
--- a/iokit/Kernel/RootDomainUserClient.cpp
+++ b/iokit/Kernel/RootDomainUserClient.cpp
@@ -36,6 +36,7 @@
 #include 
 #include "RootDomainUserClient.h"
 #include 
+#include 
 
 #define super IOUserClient
 
@@ -311,6 +312,7 @@ IOReturn RootDomainUserClient::externalMethod(
             
         case kPMActivityTickle:
             fOwner->reportUserInput( );
+            fOwner->setProperty(kIOPMRootDomainWakeTypeKey, "UserActivity Assertion");
             ret = kIOReturnSuccess;
             break;
             
diff --git a/iokit/bsddev/IOKitBSDInit.cpp b/iokit/bsddev/IOKitBSDInit.cpp
index feffd1a9e..35ca86d8d 100644
--- a/iokit/bsddev/IOKitBSDInit.cpp
+++ b/iokit/bsddev/IOKitBSDInit.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2011 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -155,83 +155,6 @@ OSDictionary * IOUUIDMatching( void )
     return IOService::resourceMatching( "boot-uuid-media" );
 }
 
-
-OSDictionary * IOCDMatching( void )
-{
-    OSDictionary *	dict;
-    const OSSymbol *	str;
-    
-    dict = IOService::serviceMatching( "IOMedia" );
-    if( dict == 0 ) {
-        IOLog("Unable to find IOMedia\n");
-        return 0;
-    }
-    
-    str = OSSymbol::withCString( "CD_ROM_Mode_1" );
-    if( str == 0 ) {
-        dict->release();
-        return 0;
-    }
-    
-    dict->setObject( "Content Hint", (OSObject *)str );
-    str->release();        
-    return( dict );
-}
-
-OSDictionary * IONetworkMatching(  const char * path,
-				   char * buf, int maxLen )
-{
-    OSDictionary *	matching = 0;
-    OSDictionary *	dict;
-    OSString *		str;
-    char *		comp;
-    const char *	skip;
-    int			len;
-
-    do {
-
-	len = strlen( kIODeviceTreePlane ":" );
-	maxLen -= len;
-	if( maxLen <= 0)
-	    continue;
-
-	strlcpy( buf, kIODeviceTreePlane ":", len + 1 );
-	comp = buf + len;
-
-        // remove parameters following ':' from the path
-        skip = strchr( path, ':');
-	if( !skip)
-	    continue;
-
-        len = skip - path;
-	maxLen -= len;
-	if( maxLen <= 0)
-	    continue;
-	strlcpy( comp, path, len + 1 );
-
-	matching = IOService::serviceMatching( "IONetworkInterface" );
-	if( !matching)
-	    continue;
-	dict = IOService::addLocation( matching );
-	if( !dict)
-	    continue;
-
-	str = OSString::withCString( buf );
-	if( !str)
-	    continue;
-        dict->setObject( kIOPathMatchKey, str );
-	str->release();
-
-	return( matching );
-
-    } while( false );
-
-    if( matching)
-        matching->release();
-
-    return( 0 );
-}
-
 OSDictionary * IONetworkNamePrefixMatching( const char * prefix )
 {
     OSDictionary *	 matching;
@@ -339,107 +262,6 @@ static bool IORegisterNetworkInterface( IOService * netif )
 	return ( netif->getProperty( kIOBSDNameKey ) != 0 );
 }
 
-OSDictionary * IODiskMatching( const char * path, char * buf, int maxLen )
-{
-    const char * look;
-    const char * alias;
-    char *       comp;
-    long         unit = -1;
-    long         partition = -1;
-    long		 lun = -1;
-    char         c;
-    int          len;
-
-    // scan the tail of the path for "@unit:partition"
-    do {
-        // Have to get the full path to the controller - an alias may
-        // tell us next to nothing, like "hd:8"
-        alias = IORegistryEntry::dealiasPath( &path, gIODTPlane );
-		
-        look = path + strlen( path);
-        c = ':';
-        while( look != path) {
-            if( *(--look) == c) {
-                if( c == ':') {
-                    partition = strtol( look + 1, 0, 0 );
-                    c = '@';
-                } else if( c == '@') {
-                    unit = strtol( look + 1, &comp, 16 );
-
-                    if( *comp == ',') {
-                        lun = strtol( comp + 1, 0, 16 );
-                    }
-                    
-                    c = '/';
-                } else if( c == '/') {
-                    c = 0;
-                    break;
-                }
-            }
-
-	        if( alias && (look == path)) {
-                path = alias;
-                look = path + strlen( path);
-                alias = 0;
-            }
-        }
-        if( c || unit == -1 || partition == -1)
-            continue;
-		
-        len = strlen( "{" kIOPathMatchKey "='" kIODeviceTreePlane ":" );
-        maxLen -= len;
-        if( maxLen <= 0)
-            continue;
-
-        snprintf( buf, len + 1, "{" kIOPathMatchKey "='" kIODeviceTreePlane ":" );
-        comp = buf + len;
-
-        if( alias) {
-            len = strlen( alias );
-            maxLen -= len;
-            if( maxLen <= 0)
-                continue;
-
-            strlcpy( comp, alias, len + 1 );
-            comp += len;
-        }
-
-        if ( (look - path)) {
-            len = (look - path);
-            maxLen -= len;
-            if( maxLen <= 0)
-                continue;
-
-            strlcpy( comp, path, len + 1 );
-            comp += len;
-        }
-			
-        if ( lun != -1 )
-        {
-            len = strlen( "/@hhhhhhhh,hhhhhhhh:dddddddddd';}" );
-            maxLen -= len;
-            if( maxLen <= 0)
-                continue;
-
-            snprintf( comp, len + 1, "/@%lx,%lx:%ld';}", unit, lun, partition );
-        }
-        else
-        {
-            len = strlen( "/@hhhhhhhh:dddddddddd';}" );
-            maxLen -= len;
-            if( maxLen <= 0)
-                continue;
-
-            snprintf( comp, len + 1, "/@%lx:%ld';}", unit, partition );
-        }
-		
-        return( OSDynamicCast(OSDictionary, OSUnserialize( buf, 0 )) );
-
-    } while( false );
-
-    return( 0 );
-}
-
 OSDictionary * IOOFPathMatching( const char * path, char * buf, int maxLen )
 {
     OSDictionary *	matching;
@@ -447,13 +269,6 @@ OSDictionary * IOOFPathMatching( const char * path, char * buf, int maxLen )
     char *		comp;
     int			len;
 
-    /* need to look up path, get device type,
-        call matching help based on device type */
-
-    matching = IODiskMatching( path, buf, maxLen );
-    if( matching)
-	return( matching );
-
     do {
 
 	len = strlen( kIODeviceTreePlane ":" );
@@ -490,42 +305,6 @@ OSDictionary * IOOFPathMatching( const char * path, char * buf, int maxLen )
     return( 0 );
 }
 
-IOService * IOFindMatchingChild( IOService * service )
-{
-    // find a matching child service
-    IOService * child = 0;
-    OSIterator * iter = service->getClientIterator();
-    if ( iter ) {
-        while( ( child = (IOService *) iter->getNextObject() ) ) {
-            OSDictionary * dict = OSDictionary::withCapacity( 1 );
-            if( dict == 0 ) {
-                iter->release();
-                return 0;
-            }
-            const OSSymbol * str = OSSymbol::withCString( "Apple_HFS" );
-            if( str == 0 ) {
-                dict->release();
-                iter->release();
-                return 0;
-            }
-            dict->setObject( "Content", (OSObject *)str );
-            str->release();
-            if ( child->compareProperty( dict, "Content" ) ) {
-                dict->release();
-                break;
-            }
-            dict->release();
-            IOService * subchild = IOFindMatchingChild( child );
-            if ( subchild ) {
-                child = subchild;
-                break;
-            }
-        }
-        iter->release();
-    }
-    return child;
-}
-
 static int didRam = 0;
 
 kern_return_t IOFindBSDRoot( char * rootName, unsigned int rootNameSize,
@@ -538,18 +317,15 @@ kern_return_t IOFindBSDRoot( char * rootName, unsigned int rootNameSize,
     OSString *		iostr;
     OSNumber *		off;
     OSData *		data = 0;
-    UInt32		*ramdParms = 0;
 
     UInt32		flags = 0;
     int			mnr, mjr;
-    bool		findHFSChild = false;
     const char *        mediaProperty = 0;
     char *		rdBootVar;
     enum {		kMaxPathBuf = 512, kMaxBootVar = 128 };
     char *		str;
     const char *	look = 0;
     int			len;
-    bool		forceNet = false;
     bool		debugInfoPrintedOnce = false;
     const char * 	uuidStr = NULL;
 
@@ -599,34 +375,10 @@ kern_return_t IOFindBSDRoot( char * rootName, unsigned int rootNameSize,
 		    uuidStr = NULL;
 		}
 	    }
-
-	    // else try for an OF Path
-	    data = (OSData *) regEntry->getProperty( "rootpath" );
 	    regEntry->release();
-	    if( data) continue;
-	}
-        if( (regEntry = IORegistryEntry::fromPath( "/options", gIODTPlane ))) {
-	    data = (OSData *) regEntry->getProperty( "boot-file" );
-	    regEntry->release();
-	    if( data) continue;
 	}
     } while( false );
 
-    if( data && !uuidStr)
-        look = (const char *) data->getBytesNoCopy();
-
-    if( rdBootVar[0] == '*') {
-        look = rdBootVar + 1;
-		forceNet = false;
-    } else {
-        if( (regEntry = IORegistryEntry::fromPath( "/", gIODTPlane ))) {
-            forceNet = (0 != regEntry->getProperty( "net-boot" ));
-	    	regEntry->release();
-		}
-    }
-
-
-
 //
 //	See if we have a RAMDisk property in /chosen/memory-map.  If so, make it into a device.
 //	It will become /dev/mdx, where x is 0-f. 
@@ -637,7 +389,7 @@ kern_return_t IOFindBSDRoot( char * rootName, unsigned int rootNameSize,
 		if((regEntry = IORegistryEntry::fromPath( "/chosen/memory-map", gIODTPlane ))) {	/* Find the map node */
 			data = (OSData *)regEntry->getProperty("RAMDisk");	/* Find the ram disk, if there */
 			if(data) {											/* We found one */
-
+				UInt32		*ramdParms = 0;
 				ramdParms = (UInt32 *)data->getBytesNoCopy();	/* Point to the ram disk base and size */
 				(void)mdevadd(-1, ml_static_ptovirt(ramdParms[0]) >> 12, ramdParms[1] >> 12, 0);	/* Initialize it and pass back the device number */
 			}
@@ -676,19 +428,6 @@ kern_return_t IOFindBSDRoot( char * rootName, unsigned int rootNameSize,
 		}
 	}
 
-    if( look) {
-	// from OpenFirmware path
-	IOLog("From path: \"%s\", ", look);
-
-        if (!matching) {
-            if( forceNet || (0 == strncmp( look, "enet", strlen( "enet" ))) ) {
-                matching = IONetworkMatching( look, str, kMaxPathBuf );
-            } else {
-                matching = IODiskMatching( look, str, kMaxPathBuf );
-            }
-        }
-    }
-    
       if( (!matching) && rdBootVar[0] ) {
 	// by BSD name
 	look = rdBootVar;
@@ -697,10 +436,7 @@ kern_return_t IOFindBSDRoot( char * rootName, unsigned int rootNameSize,
     
 	if ( strncmp( look, "en", strlen( "en" )) == 0 ) {
 	    matching = IONetworkNamePrefixMatching( "en" );
-	} else if ( strncmp( look, "cdrom", strlen( "cdrom" )) == 0 ) {
-            matching = IOCDMatching();
-            findHFSChild = true;
-        } else if ( strncmp( look, "uuid", strlen( "uuid" )) == 0 ) {
+	} else if ( strncmp( look, "uuid", strlen( "uuid" )) == 0 ) {
             char *uuid;
             OSString *uuidString;
 
@@ -772,25 +508,7 @@ kern_return_t IOFindBSDRoot( char * rootName, unsigned int rootNameSize,
     } while( !service);
     matching->release();
 
-    if ( service && findHFSChild ) {
-        bool waiting = true;
-        uint64_t    timeoutNS;
-
-        // wait for children services to finish registering
-        while ( waiting ) {
-            timeoutNS = ROOTDEVICETIMEOUT;
-            timeoutNS *= kSecondScale;
-            
-            if ( (service->waitQuiet(timeoutNS) ) == kIOReturnSuccess) {
-                waiting = false;
-            } else {
-                IOLog( "Waiting for child registration\n" );
-            }
-        }
-        // look for a subservice with an Apple_HFS child
-        IOService * subservice = IOFindMatchingChild( service );
-        if ( subservice ) service = subservice;
-    } else if ( service && mediaProperty ) {
+    if ( service && mediaProperty ) {
         service = (IOService *)service->getProperty(mediaProperty);
     }
 
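Editorial note: with IOCDMatching, IONetworkMatching, IODiskMatching, and IOFindMatchingChild deleted above, IOFindBSDRoot retains only two lookup strategies — the boot-uuid resource and BSD-name style matching. A hedged sketch of the surviving dictionary construction, using only APIs visible in this file:

    // Root volume published under its boot UUID (see IOUUIDMatching above):
    OSDictionary * matching = IOService::resourceMatching("boot-uuid-media");

    // Or, for network boot, match interfaces by BSD name prefix:
    // matching = IONetworkNamePrefixMatching("en");
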
diff --git a/iokit/conf/MASTER b/iokit/conf/MASTER
index f1d0f0648..3eff425fe 100644
--- a/iokit/conf/MASTER
+++ b/iokit/conf/MASTER
@@ -62,9 +62,12 @@ options		KERNOBJC	# Objective-C implementation	# 
 options		IOKITCPP	# C++ implementation		# 
 options		IOKITSTATS	# IOKit statistics		# 
 options		KDEBUG		# kernel tracing		# 
+options		IST_KDEBUG	# limited tracing		# 
+options		NO_KDEBUG	# no kernel tracing		# 
 options		NETWORKING	# kernel networking		# 
 options		CRYPTO		# want crypto code		# 
 options		CONFIG_DTRACE	# enable dtrace			# 
+options		VM_PRESSURE_EVENTS				# 
 
 options		CONFIG_SLEEP	#				# 
 
@@ -104,4 +107,5 @@ options   MACH_ASSERT				# 
 #
 options		CONFIG_MACF			# Mandatory Access Control Framework
 
-options   DEVELOPMENT                           # 
+options		DEVELOPMENT			# 
+options		DEBUG				# 
diff --git a/iokit/conf/MASTER.i386 b/iokit/conf/MASTER.i386
index ab7ff3360..b75268921 100644
--- a/iokit/conf/MASTER.i386
+++ b/iokit/conf/MASTER.i386
@@ -3,7 +3,7 @@
 #  Standard Apple Mac OS Configurations:
 #  -------- ----- ------ ---------------
 #
-#  RELEASE	= [ intel mach iokitcpp hibernation medium crypto config_dtrace config_sleep iokitstats ]
+#  RELEASE	= [ intel mach iokitcpp hibernation medium crypto config_dtrace config_sleep iokitstats vm_pressure_events ]
 #  PROFILE	= [ RELEASE profile ]
 #  DEBUG	= [ RELEASE debug ]
 #
diff --git a/iokit/conf/MASTER.x86_64 b/iokit/conf/MASTER.x86_64
index 781ce8c7c..b1fceabab 100644
--- a/iokit/conf/MASTER.x86_64
+++ b/iokit/conf/MASTER.x86_64
@@ -3,9 +3,9 @@
 #  Standard Apple Mac OS Configurations:
 #  -------- ----- ------ ---------------
 #
-#  RELEASE	= [ intel mach iokitcpp hibernation medium crypto config_dtrace config_sleep iokitstats ]
+#  RELEASE	= [ intel mach iokitcpp hibernation medium crypto config_dtrace config_sleep iokitstats vm_pressure_events ]
 #  PROFILE	= [ RELEASE profile ]
-#  DEBUG	= [ RELEASE debug ]
+#  DEBUG	= [ RELEASE debug mach_assert ]
 #
 #  EMBEDDED	= [ intel mach iokitcpp hibernation no_kextd bsmall crypto ]
 #  DEVELOPMENT	= [ EMBEDDED development ]
diff --git a/iokit/conf/Makefile b/iokit/conf/Makefile
index 7b37a4736..868b1422b 100644
--- a/iokit/conf/Makefile
+++ b/iokit/conf/Makefile
@@ -42,9 +42,11 @@ $(COMPOBJROOT)/$(IOKIT_KERNEL_CONFIG)/Makefile :  $(SOURCE)/MASTER  \
 
 do_all: $(COMPOBJROOT)/$(IOKIT_KERNEL_CONFIG)/Makefile
 	$(_v)next_source=$(subst conf/,,$(SOURCE));			\
+	next_relsource=$(subst conf/,,$(RELATIVE_SOURCE_PATH));		\
 	${MAKE} -C $(COMPOBJROOT)/$(IOKIT_KERNEL_CONFIG)	\
 		MAKEFILES=$(TARGET)/$(IOKIT_KERNEL_CONFIG)/Makefile	\
 		SOURCE=$${next_source}			\
+		RELATIVE_SOURCE_PATH=$${next_relsource}			\
 		TARGET=$(TARGET)					\
 		INCL_MAKEDEP=FALSE	\
 		KERNEL_CONFIG=$(IOKIT_KERNEL_CONFIG) \
diff --git a/iokit/conf/Makefile.i386 b/iokit/conf/Makefile.i386
index 8842b32d7..df2fbb323 100644
--- a/iokit/conf/Makefile.i386
+++ b/iokit/conf/Makefile.i386
@@ -8,6 +8,8 @@ UNCONFIGURED_HIB_FILES=	\
 
 HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS))
 
+IOHibernateRestoreKernel.o_CFLAGS_ADD += -fno-stack-protector
+
 ######################################################################
 #END    Machine dependent Makefile fragment for i386
 ######################################################################
diff --git a/iokit/conf/Makefile.x86_64 b/iokit/conf/Makefile.x86_64
index 463de5a20..39d2cc065 100644
--- a/iokit/conf/Makefile.x86_64
+++ b/iokit/conf/Makefile.x86_64
@@ -8,6 +8,8 @@ UNCONFIGURED_HIB_FILES=	\
 
 HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS))
 
+IOHibernateRestoreKernel.o_CFLAGS_ADD += -fno-stack-protector
+
 ######################################################################
 #END    Machine dependent Makefile fragment for x86_64
 ######################################################################
diff --git a/iokit/conf/files b/iokit/conf/files
index 532732d3b..90fbc0098 100644
--- a/iokit/conf/files
+++ b/iokit/conf/files
@@ -16,7 +16,6 @@ iokit/Kernel/IOHibernateRestoreKernel.c			optional hibernation
 iokit/Kernel/IOLib.cpp					optional iokitcpp
 iokit/Kernel/IOLocks.cpp				optional iokitcpp
 iokit/Kernel/IOConditionLock.cpp			optional iokitcpp
-iokit/Kernel/IOSyncer.cpp				optional iokitcpp
 
 #iokit/Kernel/IORuntime.cpp				optional iokitcpp
 iokit/Kernel/IOStartIOKit.cpp				optional iokitcpp
diff --git a/iokit/conf/files.i386 b/iokit/conf/files.i386
index 17c544f86..5f337b90a 100644
--- a/iokit/conf/files.i386
+++ b/iokit/conf/files.i386
@@ -1,4 +1,8 @@
 
+# libIOKit
+
+iokit/Kernel/IOSyncer.cpp				optional iokitcpp
+
 # Shared lock
 
 iokit/Kernel/i386/IOSharedLock.s                            standard
diff --git a/iokit/conf/files.x86_64 b/iokit/conf/files.x86_64
index 9d6ca13ee..457354b48 100644
--- a/iokit/conf/files.x86_64
+++ b/iokit/conf/files.x86_64
@@ -1,4 +1,8 @@
 
+# libIOKit
+
+iokit/Kernel/IOSyncer.cpp				optional iokitcpp
+
 # Shared lock
 
 iokit/Kernel/x86_64/IOSharedLock.s                            standard
diff --git a/kgmacros b/kgmacros
index a2c6879f8..0f5dcbc2b 100644
--- a/kgmacros
+++ b/kgmacros
@@ -306,6 +306,9 @@ document kgm
 |     showmodctl        Display info about a dtrace modctl
 |     showfbtprobe      Display info about an fbt probe given an id (traverses fbt_probetab)
 |     processortimers   Display all processor timers, noting any inconsistencies
+|	  
+|     maplocalcache     Enable local caching in GDB for improved debug speeds
+|     flushlocalcache   Disable local caching in GDB (deletes all memory regions)
 |
 | Type "help " for more specific help on a particular macro.
 | Type "show user " to see what the macro is really doing.
@@ -666,17 +669,17 @@ define showactint
 	      printf "     "
 	   end
 	   set $diskpolicy = 0
-	   if ($kgm_thread->ext_actionstate.hw_disk != 0)
-		set $diskpolicy = $kgm_thread->ext_actionstate.hw_disk
+	   if ($kgm_thread->ext_appliedstate.hw_disk != 0)
+		set $diskpolicy = $kgm_thread->ext_appliedstate.hw_disk
 	   else 
-		if ($kgm_thread->actionstate.hw_disk != 0)
-			set $diskpolicy = $kgm_thread->actionstate.hw_disk
+		if ($kgm_thread->appliedstate.hw_disk != 0)
+			set $diskpolicy = $kgm_thread->appliedstate.hw_disk
 		end
 	   end
-	   if ($kgm_thread->ext_actionstate.hw_bg != 0)
+	   if ($kgm_thread->ext_appliedstate.hw_bg != 0)
 		set $diskpolicy = 5
 	   end
-	   if ($kgm_thread->actionstate.hw_bg != 0)
+	   if ($kgm_thread->appliedstate.hw_bg != 0)
 		set $diskpolicy = 4
 	   end
 	   if ($diskpolicy == 2)
@@ -1533,13 +1536,13 @@ end
 define showipcheader
     printf "ipc_space "
     showptrhdrpad
-    printf "  is_table  "
+    printf "  is_task   "
     showptrhdrpad
-    printf "  table_next"
+    printf "  is_table  "
     showptrhdrpad
-    printf " flags ports  splaysize   "
+    printf " flags ports  table_next  "
     showptrhdrpad
-    printf "splaybase\n"
+    printf "   low_mod   high_mod\n"
 end
 
 define showipceheader
@@ -1622,29 +1625,25 @@ define showipcint
     set $kgm_is = *$kgm_isp
     showptr $arg0
     printf "  "
-    showptr $kgm_is.is_table
+    showptr $kgm_is.is_task
     printf "  "
-    showptr $kgm_is.is_table_next
+    showptr $kgm_is.is_table
     printf "  "
-    if $kgm_is.is_growing != 0
-        printf "G"
-    else
-        printf " "
-    end
-    if $kgm_is.is_fast != 0
-        printf "F"
+    if ($kgm_is.is_bits & 0x40000000) == 0
+        printf "A"
     else
         printf " "
     end
-    if $kgm_is.is_active != 0
-        printf "A  "
+    if ($kgm_is.is_bits & 0x20000000) != 0
+        printf "G   "
     else
-        printf "   "
+        printf "    "
     end
-    printf "%5d  ", $kgm_is.is_table_size + $kgm_is.is_tree_total
-    showptr $kgm_is.is_tree_total
+    printf "%5d  ", $kgm_is.is_table_size 
+    showptr $kgm_is.is_table_next
     printf "  "
-    showptr &$kgm_isp->is_tree
+    printf "%10d ", $kgm_is.is_low_mod
+    printf "%10d", $kgm_is.is_high_mod
     printf "\n"
     if $arg1 != 0
         showipceheader
@@ -1666,9 +1665,6 @@ define showipcint
             set $kgm_iindex = $kgm_iindex + 1
             set $kgm_iep = &($kgm_is.is_table[$kgm_iindex])
         end
-        if $kgm_is.is_tree_total
-            printf "Still need to write tree traversal\n"
-        end
     end
     printf "\n"
 end
@@ -1956,19 +1952,22 @@ define showprocint
 	end
 	set $ptask = (struct task *)$kgm_procp->task
 	set $diskpolicy = 0
-	if ($ptask->ext_actionstate.hw_disk != 0)
-		set $diskpolicy = $ptask->ext_actionstate.hw_disk
+	if ($ptask->ext_appliedstate.hw_disk != 0)
+		set $diskpolicy = $ptask->ext_appliedstate.hw_disk
 	else 
-		if ($ptask->actionstate.hw_disk != 0)
-			set $diskpolicy = $ptask->actionstate.hw_disk
+		if ($ptask->appliedstate.hw_disk != 0)
+			set $diskpolicy = $ptask->appliedstate.hw_disk
 		end
 	end
-	if ($ptask->ext_actionstate.hw_bg != 0)
+	if ($ptask->ext_appliedstate.hw_bg != 0)
 		set $diskpolicy = 5
 	end
-	if ($ptask->actionstate.hw_bg != 0)
+	if ($ptask->appliedstate.hw_bg != 0)
 		set $diskpolicy = 4
 	end
+	if ($ptask->ext_appliedstate.apptype == 2)
+		set $diskpolicy = 6
+	end
 	if ($diskpolicy == 2)
 		printf "PASS    "
 		set $kgm_printed = 1
@@ -1985,12 +1984,16 @@ define showprocint
 		printf "EBG_THRT"
 		set $kgm_printed = 1
 	end
+	if ($diskpolicy == 6)
+		printf "APD_THRT"
+		set $kgm_printed = 1
+	end
 	if ($kgm_printed == 0)
 	   printf "      "
 	end
 	set $kgm_wqp = (struct workqueue *)$kgm_procp->p_wqptr
 	if $kgm_wqp != 0
-	   printf "  %2d %2d %2d ", $kgm_wqp->wq_nthreads, $kgm_wqp->wq_thidlecount, $kgm_wqp->wq_itemcount
+	   printf "  %2d %2d %2d ", $kgm_wqp->wq_nthreads, $kgm_wqp->wq_thidlecount, $kgm_wqp->wq_reqcount
 	else
 	   printf "           "
 	end
@@ -2450,8 +2453,10 @@ define zprint_one
     printf "%8x ",$kgm_zone->max_size
     printf "%8d ",$kgm_zone->elem_size
     printf "%8x ",$kgm_zone->alloc_size
-	printf " %16ld ",$kgm_zone->num_allocs
-	printf "%16ld ",$kgm_zone->num_frees
+    if ($kgm_mtype != $kgm_mtype_arm) 
+        printf " %16ld ",$kgm_zone->num_allocs 
+        printf "%16ld ",$kgm_zone->num_frees
+    end
     printf "%s ",$kgm_zone->zone_name
 
     if ($kgm_zone->exhaustible)
@@ -2705,52 +2710,54 @@ define switchtoctx
 		set $pc=((struct savearea *) $arg0)->save_srr0
 		update
 	else
-	if ($kgm_mtype == $kgm_mtype_arm)
-		select 0
-		set $kdp_arm_act_counter = $kdp_arm_act_counter + 1
-		if ($kdp_arm_act_counter == 1)
-			set $r0_save   = $r0
-			set $r1_save   = $r1
-			set $r2_save   = $r2
-			set $r3_save   = $r3
-			set $r4_save   = $r4
-			set $r5_save   = $r5
-			set $r6_save   = $r6
-			set $r7_save   = $r7
-			set $r8_save   = $r8
-			set $r9_save   = $r9
-			set $r10_save  = $r10
-			set $r11_save  = $r11
-			set $r12_save  = $r12
-			set $sp_save   = $sp
-			set $lr_save   = $lr
-			set $pc_save   = $pc
-		end
-		set $kgm_statep = (struct arm_saved_state *)$arg0
-		set $r0 =  $kgm_statep->r[0]
-		set $r1 =  $kgm_statep->r[1]
-		set $r2 =  $kgm_statep->r[2]
-		set $r3 =  $kgm_statep->r[3]
-		set $r4 =  $kgm_statep->r[4]
-		set $r5 =  $kgm_statep->r[5]
-		set $r6 =  $kgm_statep->r[6]
-		set $r8 =  $kgm_statep->r[8]
-		set $r9 =  $kgm_statep->r[9]
-		set $r10 = $kgm_statep->r[10]
-		set $r11 = $kgm_statep->r[11]
-		set $r12 = $kgm_statep->r[12]
-		set $sp = $kgm_statep->sp
-		set $lr = $kgm_statep->lr
-		set $r7 =  $kgm_statep->r[7]
-		set $pc = $kgm_statep->pc
-		flushregs
-		flushstack
-		update
-	else
-		echo switchtoctx not implemented for this architecture.\n
+		if ($kgm_mtype == $kgm_mtype_arm)
+			select 0
+			set $kdp_arm_act_counter = $kdp_arm_act_counter + 1
+			if ($kdp_arm_act_counter == 1)
+				set $r0_save   = $r0
+				set $r1_save   = $r1
+				set $r2_save   = $r2
+				set $r3_save   = $r3
+				set $r4_save   = $r4
+				set $r5_save   = $r5
+				set $r6_save   = $r6
+				set $r7_save   = $r7
+				set $r8_save   = $r8
+				set $r9_save   = $r9
+				set $r10_save  = $r10
+				set $r11_save  = $r11
+				set $r12_save  = $r12
+				set $sp_save   = $sp
+				set $lr_save   = $lr
+				set $pc_save   = $pc
+			end
+			set $kgm_statep = (struct arm_saved_state *)$arg0
+			set $r0 =  $kgm_statep->r[0]
+			set $r1 =  $kgm_statep->r[1]
+			set $r2 =  $kgm_statep->r[2]
+			set $r3 =  $kgm_statep->r[3]
+			set $r4 =  $kgm_statep->r[4]
+			set $r5 =  $kgm_statep->r[5]
+			set $r6 =  $kgm_statep->r[6]
+			set $r8 =  $kgm_statep->r[8]
+			set $r9 =  $kgm_statep->r[9]
+			set $r10 = $kgm_statep->r[10]
+			set $r11 = $kgm_statep->r[11]
+			set $r12 = $kgm_statep->r[12]
+			set $sp = $kgm_statep->sp
+			set $lr = $kgm_statep->lr
+			set $r7 =  $kgm_statep->r[7]
+			set $pc = $kgm_statep->pc
+			flushregs
+			flushstack
+			update
+		else
+			echo switchtoctx not implemented for this architecture.\n
+		end
 	end
 end
 
+
 document switchtoctx  
 Syntax: switchtoctx 
| This command allows gdb to examine an execution context and dump the @@ -2896,8 +2903,8 @@ define dumpcallqueue set $kgm_i = 0 while $kgm_callentry != $kgm_callhead set $kgm_call = (struct call_entry *)$kgm_callentry - printf "0x%08x ", $kgm_call - printf "0x%08x 0x%08x ", $kgm_call->param0, $kgm_call->param1 + showptr $kgm_call + printf "0x%lx 0x%lx ", $kgm_call->param0, $kgm_call->param1 output $kgm_call->deadline printf "\t" output $kgm_call->func @@ -3151,6 +3158,12 @@ define showuserstack showactint $kgm_threadp 0 set $kgm_thread_pmap = $kgm_threadp->task->map->pmap set $kgm_thread_sp = $kgm_threadp.machine->PcbData.r[7] + showptrhdrpad + printf " " + showptr 0 + printf " " + showptr $kgm_threadp.machine->PcbData.pc + printf "\n" set kdp_pmap = $kgm_thread_pmap while ($kgm_thread_sp != 0) set $link_register = *($kgm_thread_sp + 4) @@ -3164,7 +3177,23 @@ define showuserstack end set kdp_pmap = $kgm_saved_pmap else - echo You must be connected via nanokdp to use this macro\n + set $kgm_threadp = (struct thread *)$arg0 + showactheader + showactint $kgm_threadp 0 + set $kgm_thread_sp = $kgm_threadp.machine->PcbData.r[7] + while ($kgm_thread_sp != 0) + _map_user_data_from_task $kgm_threadp->task $kgm_thread_sp 8 + set $kgm_thread_sp_window = (int *)$kgm_map_user_window + set $link_register = *($kgm_thread_sp_window + 1) + showptrhdrpad + printf " " + showptr $kgm_thread_sp + printf " " + showptr $link_register + printf "\n" + set $kgm_thread_sp = *$kgm_thread_sp_window + _unmap_user_data_from_task + end end else echo showuserstack not supported on this architecture\n @@ -3316,7 +3345,7 @@ define showuserregisters else if ($kgm_mtype == $kgm_mtype_arm) printf "ARM Thread State:\n" - set $kgm_pcb = (arm_saved_state_t *) ($kgm_threadp->machine.upcb) + set $kgm_pcb = (arm_saved_state_t *) (&$kgm_threadp->machine.PcbData) printf " r0: " showuserptr $kgm_pcb.r[0] @@ -3398,7 +3427,7 @@ define kdp-reboot # Alternatively, set *(*(unsigned **) 0x2498) = 1 # (or 0x5498 on PPC, 0xffffff8000002928 on x86_64, 0xffff049c on arm) manualhdrint $kgm_kdp_pkt_hostreboot - continue + detach end document kdp-reboot @@ -3531,9 +3560,9 @@ define getdumpinfo dumpinfoint KDP_DUMPINFO_GETINFO set $kgm_dumpinfo = (kdp_dumpinfo_reply_t *) manual_pkt.data if $kgm_dumpinfo->type & KDP_DUMPINFO_REBOOT - printf "Sysem will reboot after kernel info gets dumped.\n" + printf "System will reboot after kernel info gets dumped.\n" else - printf "Sysem will not reboot after kernel info gets dumped.\n" + printf "System will not reboot after kernel info gets dumped.\n" end if $kgm_dumpinfo->type & KDP_DUMPINFO_NORESUME printf "System will allow a re-attach after a KDP disconnect.\n" @@ -4759,7 +4788,7 @@ define readphysint # No KDP. 
Attempt to use physical memory mapping if ($kgm_mtype == $kgm_mtype_x86_64) - set $kgm_readphys_paddr_in_kva = (unsigned long long)$arg0 + (((unsigned long long)-1 << 47) | ((unsigned long long)509 << 39)) + set $kgm_readphys_paddr_in_kva = (unsigned long long)$arg0 + physmap_base else if ($kgm_mtype == $kgm_mtype_arm) set $kgm_readphys_paddr_in_kva = (unsigned long long)$arg0 - gPhysBase + gVirtBase @@ -4900,16 +4929,30 @@ document writephys64 end define addkextsyms - shell echo cd `pwd` > /tmp/gdb-cd - cd $arg0 - source kcbmacros - source /tmp/gdb-cd - set $kgm_show_kmod_syms = 1 + if ($argc <= 1) + if ($argc == 0) + printf "Adding kext symbols from in-kernel summary data.\n" + add-all-kexts + else + printf "Adding kext symbols from $arg0.\n" + shell echo cd `pwd` > /tmp/gdb-cd + cd $arg0 + source kcbmacros + source /tmp/gdb-cd + end + set $kgm_show_kmod_syms = 1 + else + printf "| Usage:\n|\n" + help addkextsyms + end end document addkextsyms -| Takes a directory of symbols for kexts generated with kextcache -y and loads them -| into gdb. +| If specified without an argument, uses gdb's add-all-kexts command to load +| kext symbols. Otherwise, takes a directory of kext symbols generated with +| kextcache -y or kcgen and loads them into gdb. +| (gdb) addkextsyms +| - or - | (gdb) addkextsyms /path/to/symboldir end @@ -6486,6 +6529,9 @@ set $RTF_BROADCAST = 0x400000 set $RTF_MULTICAST = 0x800000 set $RTF_IFSCOPE = 0x1000000 set $RTF_CONDEMNED = 0x2000000 +set $RTF_IFREF = 0x4000000 +set $RTF_PROXY = 0x8000000 +set $RTF_ROUTER = 0x10000000 set $AF_INET = 2 set $AF_INET6 = 30 @@ -6610,6 +6656,18 @@ define rtentry_prdetails if $rt->rt_flags & $RTF_IFSCOPE printf "I" end + if $rt->rt_flags & $RTF_CONDEMNED + printf "Z" + end + if $rt->rt_flags & $RTF_IFREF + printf "i" + end + if $rt->rt_flags & $RTF_PROXY + printf "Y" + end + if $rt->rt_flags & $RTF_ROUTER + printf "r" + end printf "/%s%d", $rt->rt_ifp->if_name, $rt->rt_ifp->if_unit end @@ -8312,7 +8370,7 @@ set $INP_ANONPORT=0x40 set $INP_RECVIF=0x80 set $INP_MTUDISC=0x100 set $INP_STRIPHDR=0x200 -set $INP_FAITH=0x400 +set $INP_RECV_ANYIF=0x400 set $INP_INADDR_ANY=0x800 set $INP_RECVTTL=0x1000 set $INP_UDP_NOCKSUM=0x2000 @@ -8416,8 +8474,8 @@ define _dump_inpcb if ($pcb->inp_flags & $INP_STRIPHDR) printf "striphdr " end - if ($pcb->inp_flags & $INP_FAITH) - printf "faith " + if ($pcb->inp_flags & $INP_RECV_ANYIF) + printf "recv_anyif " end if ($pcb->inp_flags & $INP_INADDR_ANY) printf "inaddr_any " @@ -9635,9 +9693,303 @@ define _pmap_walk_x86 _pml4_walk $kgm_pmap->pm_cr3 $arg1 end +define _pmap_walk_arm_level1_section + set $kgm_tte_p = $arg0 + set $kgm_tte = *$kgm_tte_p + set $kgm_vaddr = $arg1 + + # Supersection or just section? 
+ if (($kgm_tte & 0x00040000) == 0x00040000) + set $kgm_paddr = ($kgm_tte & 0xFF000000) | ($kgm_vaddr & 0x00FFFFFF) + set $kgm_paddr_isvalid = 1 + else + set $kgm_paddr = ($kgm_tte & 0xFFF00000) | ($kgm_vaddr & 0x000FFFFF) + set $kgm_paddr_isvalid = 1 + end + + if $kgm_pt_verbose >= 2 + printf "0x%08x\n\t0x%08x\n\t", (unsigned long)$kgm_tte_p, $kgm_tte + + # bit [1:0] evaluated in _pmap_walk_arm + + # B bit 2 + set $kgm_b_bit = (($kgm_tte & 0x00000004) >> 2) + + # C bit 3 + set $kgm_c_bit = (($kgm_tte & 0x00000008) >> 3) + + # XN bit 4 + if ($kgm_tte & 0x00000010) + printf "no-execute" + else + printf "execute" + end + + # Domain bit [8:5] if not supersection + if (($kgm_tte & 0x00040000) == 0x00000000) + printf " domain(%d)", (($kgm_tte & 0x000001e0) >> 5) + end + + # IMP bit 9 + printf " imp(%d)", (($kgm_tte & 0x00000200) >> 9) + + # AP bit 15 and [11:10], merged to a single 3-bit value + set $kgm_access = (($kgm_tte & 0x00000c00) >> 10) | (($kgm_tte & 0x00008000) >> 13) + if ($kgm_access == 0x0) + printf " noaccess" + end + if ($kgm_access == 0x1) + printf " supervisor(readwrite) user(noaccess)" + end + if ($kgm_access == 0x2) + printf " supervisor(readwrite) user(readonly)" + end + if ($kgm_access == 0x3) + printf " supervisor(readwrite) user(readwrite)" + end + if ($kgm_access == 0x4) + printf " noaccess(reserved)" + end + if ($kgm_access == 0x5) + printf " supervisor(readonly) user(noaccess)" + end + if ($kgm_access == 0x6) + printf " supervisor(readonly) user(readonly)" + end + if ($kgm_access == 0x7) + printf " supervisor(readonly) user(readonly)" + end + + # TEX bit [14:12] + set $kgm_tex_bits = (($kgm_tte & 0x00007000) >> 12) + + # Print TEX, C, B all together + printf " TEX:C:B(%d%d%d:%d:%d)", ($kgm_tex_bits & 0x4 ? 1 : 0), ($kgm_tex_bits & 0x2 ? 1 : 0), ($kgm_tex_bits & 0x1 ? 
1 : 0), $kgm_c_bit, $kgm_b_bit + + # S bit 16 + if ($kgm_tte & 0x00010000) + printf " shareable" + else + printf " not-shareable" + end + + # nG bit 17 + if ($kgm_tte & 0x00020000) + printf " not-global" + else + printf " global" + end + + # Supersection bit 18 + if ($kgm_tte & 0x00040000) + printf " supersection" + else + printf " section" + end + + # NS bit 19 + if ($kgm_tte & 0x00080000) + printf " no-secure" + else + printf " secure" + end + + printf "\n" + end +end + +define _pmap_walk_arm_level2 + set $kgm_tte_p = $arg0 + set $kgm_tte = *$kgm_tte_p + set $kgm_vaddr = $arg1 + + set $kgm_pte_pbase = (($kgm_tte & 0xFFFFFC00) - gPhysBase + gVirtBase) + set $kgm_pte_index = ($kgm_vaddr >> 12) & 0x000000FF + set $kgm_pte_p = &((pt_entry_t *)$kgm_pte_pbase)[$kgm_pte_index] + set $kgm_pte = *$kgm_pte_p + + # Print first level symbolically + if $kgm_pt_verbose >= 2 + printf "0x%08x\n\t0x%08x\n\t", (unsigned long)$kgm_tte_p, $kgm_tte + + # bit [1:0] evaluated in _pmap_walk_arm + + # NS bit 3 + if ($kgm_tte & 0x00000008) + printf "no-secure" + else + printf "secure" + end + + # Domain bit [8:5] + printf " domain(%d)", (($kgm_tte & 0x000001e0) >> 5) + + # IMP bit 9 + printf " imp(%d)", (($kgm_tte & 0x00000200) >> 9) + + printf "\n" + end + + if $kgm_pt_verbose >= 2 + printf "second-level table (index %d):\n", $kgm_pte_index + end + if $kgm_pt_verbose >= 3 + set $kgm_pte_loop = 0 + while $kgm_pte_loop < 256 + set $kgm_pte_p_tmp = &((pt_entry_t *)$kgm_pte_pbase)[$kgm_pte_loop] + printf "0x%08x:\t0x%08x\n", (unsigned long)$kgm_pte_p_tmp, *$kgm_pte_p_tmp + set $kgm_pte_loop = $kgm_pte_loop + 1 + end + end + + if ($kgm_pte & 0x00000003) + set $kgm_pve_p = (pv_entry_t *)($kgm_pte_pbase + 0x100*sizeof(pt_entry_t) + $kgm_pte_index*sizeof(pv_entry_t)) + if ($kgm_pve_p->shadow != 0) + set $kgm_spte = $kgm_pve_p->shadow ^ ($kgm_vaddr & ~0xFFF) + set $kgm_paddr = ($kgm_spte & 0xFFFFF000) | ($kgm_vaddr & 0xFFF) + set $kgm_paddr_isvalid = 1 + else + set $kgm_paddr = (*$kgm_pte_p & 0xFFFFF000) | ($kgm_vaddr & 0xFFF) + set $kgm_paddr_isvalid = 1 + end + else + set $kgm_paddr = 0 + set $kgm_paddr_isvalid = 0 + end + + if $kgm_pt_verbose >= 2 + printf "0x%08x\n\t0x%08x\n\t", (unsigned long)$kgm_pte_p, $kgm_pte + if (($kgm_pte & 0x00000003) == 0x00000000) + printf "invalid" + else + if (($kgm_pte & 0x00000003) == 0x00000001) + printf "large" + + # XN bit 15 + if ($kgm_pte & 0x00008000) == 0x00008000 + printf " no-execute" + else + printf " execute" + end + else + printf "small" + + # XN bit 0 + if ($kgm_pte & 0x00000001) == 0x00000001 + printf " no-execute" + else + printf " execute" + end + end + + # B bit 2 + set $kgm_b_bit = (($kgm_pte & 0x00000004) >> 2) + + # C bit 3 + set $kgm_c_bit = (($kgm_pte & 0x00000008) >> 3) + + # AP bit 9 and [5:4], merged to a single 3-bit value + set $kgm_access = (($kgm_pte & 0x00000030) >> 4) | (($kgm_pte & 0x00000200) >> 7) + if ($kgm_access == 0x0) + printf " noaccess" + end + if ($kgm_access == 0x1) + printf " supervisor(readwrite) user(noaccess)" + end + if ($kgm_access == 0x2) + printf " supervisor(readwrite) user(readonly)" + end + if ($kgm_access == 0x3) + printf " supervisor(readwrite) user(readwrite)" + end + if ($kgm_access == 0x4) + printf " noaccess(reserved)" + end + if ($kgm_access == 0x5) + printf " supervisor(readonly) user(noaccess)" + end + if ($kgm_access == 0x6) + printf " supervisor(readonly) user(readonly)" + end + if ($kgm_access == 0x7) + printf " supervisor(readonly) user(readonly)" + end + + # TEX bit [14:12] for large, [8:6] for small + if (($kgm_pte & 
0x00000003) == 0x00000001) + set $kgm_tex_bits = (($kgm_pte & 0x00007000) >> 12) + else + set $kgm_tex_bits = (($kgm_pte & 0x000001c0) >> 6) + end + + # Print TEX, C, B all together + printf " TEX:C:B(%d%d%d:%d:%d)", ($kgm_tex_bits & 0x4 ? 1 : 0), ($kgm_tex_bits & 0x2 ? 1 : 0), ($kgm_tex_bits & 0x1 ? 1 : 0), $kgm_c_bit, $kgm_b_bit + + # S bit 10 + if ($kgm_pte & 0x00000400) + printf " shareable" + else + printf " not-shareable" + end + + # nG bit 11 + if ($kgm_pte & 0x00000800) + printf " not-global" + else + printf " global" + end + + end + printf "\n" + end +end + +# See ARM ARM Section B3.3 define _pmap_walk_arm + set $kgm_pmap = (pmap_t) $arg0 + set $kgm_vaddr = $arg1 set $kgm_paddr = 0 set $kgm_paddr_isvalid = 0 + + # Shift by TTESHIFT (20) to get tte index + set $kgm_tte_index = (($kgm_vaddr - $kgm_pmap->min) >> 20) + set $kgm_tte_p = &$kgm_pmap->tte[$kgm_tte_index] + set $kgm_tte = *$kgm_tte_p + if $kgm_pt_verbose >= 2 + printf "first-level table (index %d):\n", $kgm_tte_index + end + if $kgm_pt_verbose >= 3 + set $kgm_tte_loop = 0 + while $kgm_tte_loop < 4096 + set $kgm_tte_p_tmp = &$kgm_pmap->tte[$kgm_tte_loop] + printf "0x%08x:\t0x%08x\n", (unsigned long)$kgm_tte_p_tmp, *$kgm_tte_p_tmp + set $kgm_tte_loop = $kgm_tte_loop + 1 + end + end + + if (($kgm_tte & 0x00000003) == 0x00000001) + _pmap_walk_arm_level2 $kgm_tte_p $kgm_vaddr + else + if (($kgm_tte & 0x00000003) == 0x00000002) + _pmap_walk_arm_level1_section $kgm_tte_p $kgm_vaddr + else + set $kgm_paddr = 0 + set $kgm_paddr_isvalid = 0 + if $kgm_pt_verbose >= 2 + printf "Invalid First-Level Translation Table Entry: 0x%08x\n", $kgm_tte + end + end + end + + if $kgm_pt_verbose >= 1 + if $kgm_paddr_isvalid + readphysint $kgm_paddr 32 $kgm_lcpu_self + set $kgm_value = $kgm_readphysint_result + printf "phys 0x%016llx: 0x%08x\n", $kgm_paddr, $kgm_value + else + printf "(no translation)\n" + end + end end define pmap_walk @@ -9718,7 +10070,6 @@ define zstack else printf "FREE " end - showptr zrecords[$index].z_element printf " : index %d : ztime %d -------------\n", $index, zrecords[$index].z_time @@ -9844,7 +10195,7 @@ define findelem zstack $fe_index if (zrecords[$fe_index].z_opcode == $fe_prev_op) - printf "*************** DOUBLE OP! *********************\n + printf "*************** DOUBLE OP! 
*********************\n + printf "*************** DOUBLE OP! 
*********************\n" end set $fe_prev_op = zrecords[$fe_index].z_opcode @@ -9964,12 +10315,12 @@ define _map_user_data_from_task set $kgm_pt_verbose = 0 _pmap_walk_x86 $kgm_map_user_pmap $kgm_vaddr_range1_start if $kgm_paddr_isvalid - set $kgm_paddr_range1_in_kva = $kgm_paddr + (((unsigned long long)-1 << 47) | ((unsigned long long)509 << 39)) + set $kgm_paddr_range1_in_kva = $kgm_paddr + physmap_base end if $kgm_vaddr_range2_start _pmap_walk_x86 $kgm_map_user_pmap $kgm_vaddr_range2_start if $kgm_paddr_isvalid - set $kgm_paddr_range2_in_kva = $kgm_paddr + (((unsigned long long)-1 << 47) | ((unsigned long long)509 << 39)) + set $kgm_paddr_range2_in_kva = $kgm_paddr + physmap_base end end else @@ -10185,14 +10536,19 @@ define _print_images_for_dyld_image_info set $kgm_image_info_size = 24 set $kgm_image_info_array_address = ((unsigned long long *)$kgm_dyld_all_image_infos)[1] set $kgm_dyld_load_address = ((unsigned long long *)$kgm_dyld_all_image_infos)[4] + set $kgm_dyld_all_image_infos_address_from_struct = ((unsigned long long *)$kgm_dyld_all_image_infos)[13] else set $kgm_image_info_size = 12 set $kgm_image_info_array_address = ((unsigned int *)$kgm_dyld_all_image_infos)[2] set $kgm_dyld_load_address = ((unsigned int *)$kgm_dyld_all_image_infos)[5] + set $kgm_dyld_all_image_infos_address_from_struct = ((unsigned int *)$kgm_dyld_all_image_infos)[14] end _unmap_user_data_from_task $kgm_taskp + # Account for ASLR slide before dyld can fix the structure + set $kgm_dyld_load_address = $kgm_dyld_load_address + ($kgm_dyld_all_image_infos_address - $kgm_dyld_all_image_infos_address_from_struct) + set $kgm_image_info_i = 0 while $kgm_image_info_i < $kgm_image_info_count @@ -10301,6 +10657,9 @@ define showuserdyldinfo _unmap_user_data_from_task $kgm_taskp + set $kgm_dyld_all_imfo_infos_slide = ( $kgm_dyld_all_image_infos_address - $kgm_dyld_all_image_infos_dyldAllImageInfosAddress ) + set $kgm_dyld_all_image_infos_dyldVersion_postslide = ( $kgm_dyld_all_image_infos_dyldVersion + $kgm_dyld_all_imfo_infos_slide ) + printf " version %u\n", $kgm_dyld_all_image_infos_version printf " infoArrayCount %u\n", $kgm_dyld_all_image_infos_infoArrayCount printf " infoArray " @@ -10321,7 +10680,12 @@ define showuserdyldinfo showuserptr $kgm_dyld_all_image_infos_dyldVersion printf "\n" printf " " - _print_path_for_image $kgm_dyld_all_image_infos_dyldVersion + _print_path_for_image $kgm_dyld_all_image_infos_dyldVersion_postslide + if ($kgm_dyld_all_imfo_infos_slide != 0) + printf " (currently " + showuserptr $kgm_dyld_all_image_infos_dyldVersion_postslide + printf ")" + end printf "\n" printf " errorMessage " @@ -10726,8 +11090,8 @@ define showkerneldebugbuffercpu set $kgm_cpu_number = (int) $arg0 set $kgm_entry_count = (int) $arg1 set $kgm_debugentriesfound = 0 - - if (kdebug_flags & 0x80000000) # 0x80000000 == KDBG_BFINIT + # 0x80000000 == KDBG_BFINIT + if (kd_ctrl_page.kdebug_flags & 0x80000000) showkerneldebugheader if $kgm_entry_count == 0 @@ -10740,16 +11104,17 @@ define showkerneldebugbuffercpu else set $kgm_kdbp = &kdbip[$kgm_cpu_number] set $kgm_kdsp = $kgm_kdbp->kd_list_head - while (($kgm_kdsp != 0) && ($kgm_entry_count > 0)) - if $kgm_kdsp->kds_readlast != $kgm_kdsp->kds_bufptr - set $kgm_kds_bufptr = $kgm_kdsp->kds_bufptr - while (($kgm_kds_bufptr > $kgm_kdsp->kds_readlast) && ($kgm_entry_count > 0)) + while (($kgm_kdsp.raw != 0) && ($kgm_entry_count > 0)) + set $kgm_kdsp_actual = &kd_bufs[$kgm_kdsp.buffer_index].kdsb_addr[$kgm_kdsp.offset] + if $kgm_kdsp_actual->kds_readlast != 
$kgm_kdsp_actual->kds_bufindx + set $kgm_kds_bufptr = &$kgm_kdsp_actual->kds_records[$kgm_kdsp_actual->kds_bufindx] + while (($kgm_kds_bufptr > &$kgm_kdsp_actual->kds_records[$kgm_kdsp_actual->kds_readlast]) && ($kgm_entry_count > 0)) set $kgm_kds_bufptr = $kgm_kds_bufptr - 1 set $kgm_entry_count = $kgm_entry_count - 1 showkerneldebugbufferentry $kgm_kds_bufptr end end - set $kgm_kdsp = $kgm_kdsp->kds_next + set $kgm_kdsp = $kgm_kdsp_actual->kds_next end end else @@ -10763,8 +11128,8 @@ Syntax: showkerneldebugbuffercpu end define showkerneldebugbuffer - - if (kdebug_flags & 0x80000000) # 0x80000000 == KDBG_BFINIT + # 0x80000000 == KDBG_BFINIT + if (kd_ctrl_page.kdebug_flags & 0x80000000) set $kgm_entrycount = (int) $arg0 @@ -13279,4 +13644,27 @@ Syntax: (gdb) processortimers | Print details of processor timers, noting any timer which might be suspicious end +define maplocalcache + if ($kgm_mtype == $kgm_mtype_arm) + mem 0x80000000 0xefffffff cache + set dcache-linesize-power 9 + printf "GDB memory caching enabled. Be sure to disable by calling flushlocalcache before detaching or connecting to a new device\n" + end +end + +document maplocalcache +Syntax: (gdb) maplocalcache +| Sets up memory regions for GDB to cache on read. Significantly increases debug speed over KDP +end +define flushlocalcache + if ($kgm_mtype == $kgm_mtype_arm) + delete mem + printf "GDB memory caching disabled.\n" + end +end + +document flushlocalcache +Syntax: (gdb) flushlocalcache +| Clears all memory regions +end diff --git a/libkern/Makefile b/libkern/Makefile index ff3bbec5f..67e6f4c99 100644 --- a/libkern/Makefile +++ b/libkern/Makefile @@ -22,12 +22,12 @@ EXPINC_SUBDIRS_ARM = ${EXPINC_SUBDIRS} SETUP_SUBDIRS = -COMP_SUBDIRS_I386 = conf kmod -COMP_SUBDIRS_X86_64 = conf kmod -COMP_SUBDIRS_ARM = conf kmod +COMP_SUBDIRS_I386 = conf +COMP_SUBDIRS_X86_64 = conf +COMP_SUBDIRS_ARM = conf -INST_SUBDIRS = kmod +INST_SUBDIRS = include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/libkern/OSKextLib.cpp b/libkern/OSKextLib.cpp index c782a830f..73c216fcf 100644 --- a/libkern/OSKextLib.cpp +++ b/libkern/OSKextLib.cpp @@ -351,8 +351,8 @@ finish: /********************************************************************* * Gets the vm_map for the current kext *********************************************************************/ -extern vm_offset_t sectPRELINKB; -extern int sectSizePRELINK; +extern vm_offset_t segPRELINKB; +extern unsigned long segSizePRELINK; extern int kth_started; extern vm_map_t g_kext_map; @@ -362,8 +362,8 @@ kext_get_vm_map(kmod_info_t *info) vm_map_t kext_map = NULL; /* Set the vm map */ - if ((info->address >= sectPRELINKB) && - (info->address < (sectPRELINKB + sectSizePRELINK))) + if ((info->address >= segPRELINKB) && + (info->address < (segPRELINKB + segSizePRELINK))) { kext_map = kernel_map; } else { diff --git a/libkern/c++/OSData.cpp b/libkern/c++/OSData.cpp index b7cd6852c..61b4342d2 100644 --- a/libkern/c++/OSData.cpp +++ b/libkern/c++/OSData.cpp @@ -56,6 +56,12 @@ extern int debug_container_malloc_size; #define ACCUMSIZE(s) #endif +struct OSData::ExpansionData +{ + DeallocFunction deallocFunction; + bool disableSerialization; +}; + bool OSData::initWithCapacity(unsigned int inCapacity) { if (!super::init()) @@ -191,11 +197,12 @@ void OSData::free() kfree(data, capacity); ACCUMSIZE( -capacity ); } else if (capacity == EXTERNAL) { - DeallocFunction freemem = (DeallocFunction)reserved; - if (freemem && data && length) { - freemem(data, length); - } + DeallocFunction freemem = reserved ? 
reserved->deallocFunction : NULL; + if (freemem && data && length) { + freemem(data, length); } + } + if (reserved) kfree(reserved, sizeof(ExpansionData)); super::free(); } @@ -388,12 +395,16 @@ bool OSData::serialize(OSSerialize *s) const unsigned int i; const unsigned char *p; unsigned char c; + unsigned int serializeLength; if (s->previouslySerialized(this)) return true; if (!s->addXMLStartTag(this, "data")) return false; - for (i = 0, p = (unsigned char *)data; i < length; i++, p++) { + serializeLength = length; + if (reserved && reserved->disableSerialization) serializeLength = 0; + + for (i = 0, p = (unsigned char *)data; i < serializeLength; i++, p++) { /* 3 bytes are encoded as 4 */ switch (i % 3) { case 0: @@ -431,11 +442,24 @@ bool OSData::serialize(OSSerialize *s) const return s->addXMLEndTag("data"); } -/* Note I am just using the reserved pointer here instead of allocating a whole buffer - * to hold one pointer. - */ void OSData::setDeallocFunction(DeallocFunction func) { - reserved = (ExpansionData *)func; - return; + if (!reserved) + { + reserved = (typeof(reserved)) kalloc(sizeof(ExpansionData)); + if (!reserved) return; + bzero(reserved, sizeof(ExpansionData)); + } + reserved->deallocFunction = func; +} + +void OSData::setSerializable(bool serializable) +{ + if (!reserved) + { + reserved = (typeof(reserved)) kalloc(sizeof(ExpansionData)); + if (!reserved) return; + bzero(reserved, sizeof(ExpansionData)); + } + reserved->disableSerialization = (!serializable); } diff --git a/libkern/c++/OSDictionary.cpp b/libkern/c++/OSDictionary.cpp index eaa1483df..7329f3a4e 100644 --- a/libkern/c++/OSDictionary.cpp +++ b/libkern/c++/OSDictionary.cpp @@ -68,6 +68,8 @@ bool OSDictionary::initWithCapacity(unsigned int inCapacity) int size = inCapacity * sizeof(dictEntry); +//fOptions |= kSort; + dictionary = (dictEntry *) kalloc(size); if (!dictionary) return false; @@ -170,6 +172,15 @@ bool OSDictionary::initWithDictionary(const OSDictionary *dict, if (!initWithCapacity(newCapacity)) return false; + if ((kSort & fOptions) && !(kSort & dict->fOptions)) { + for (unsigned int i = 0; i < dict->count; i++) { + if (!setObject(dict->dictionary[i].key, dict->dictionary[i].value)) { + return false; + } + } + return true; + } + count = dict->count; bcopy(dict->dictionary, dictionary, count * sizeof(dictEntry)); for (unsigned int i = 0; i < count; i++) { @@ -306,34 +317,45 @@ void OSDictionary::flushCollection() bool OSDictionary:: setObject(const OSSymbol *aKey, const OSMetaClassBase *anObject) { + unsigned int i; + bool exists; + if (!anObject || !aKey) return false; // if the key exists, replace the object - for (unsigned int i = 0; i < count; i++) { - if (aKey == dictionary[i].key) { - const OSMetaClassBase *oldObject = dictionary[i].value; - haveUpdated(); - - anObject->taggedRetain(OSTypeID(OSCollection)); - dictionary[i].value = anObject; + if (fOptions & kSort) { + i = OSSymbol::bsearch(aKey, &dictionary[0], count, sizeof(dictionary[0])); + exists = (i < count) && (aKey == dictionary[i].key); + } else for (exists = false, i = 0; i < count; i++) { + if ((exists = (aKey == dictionary[i].key))) break; + } - oldObject->taggedRelease(OSTypeID(OSCollection)); - return true; - } + if (exists) { + const OSMetaClassBase *oldObject = dictionary[i].value; + + haveUpdated(); + + anObject->taggedRetain(OSTypeID(OSCollection)); + dictionary[i].value = anObject; + + oldObject->taggedRelease(OSTypeID(OSCollection)); + return true; } // add new key, possibly extending our capacity if (count >= capacity && 
count >= ensureCapacity(count+1)) - return 0; + return false; haveUpdated(); + bcopy(&dictionary[i], &dictionary[i+1], (count - i) * sizeof(dictionary[0])); + aKey->taggedRetain(OSTypeID(OSCollection)); anObject->taggedRetain(OSTypeID(OSCollection)); - dictionary[count].key = aKey; - dictionary[count].value = anObject; + dictionary[i].key = aKey; + dictionary[i].value = anObject; count++; return true; @@ -341,24 +363,33 @@ setObject(const OSSymbol *aKey, const OSMetaClassBase *anObject) void OSDictionary::removeObject(const OSSymbol *aKey) { + unsigned int i; + bool exists; + if (!aKey) return; // if the key exists, remove the object - for (unsigned int i = 0; i < count; i++) - if (aKey == dictionary[i].key) { - dictEntry oldEntry = dictionary[i]; - haveUpdated(); + if (fOptions & kSort) { + i = OSSymbol::bsearch(aKey, &dictionary[0], count, sizeof(dictionary[0])); + exists = (i < count) && (aKey == dictionary[i].key); + } else for (exists = false, i = 0; i < count; i++) { + if ((exists = (aKey == dictionary[i].key))) break; + } + + if (exists) { + dictEntry oldEntry = dictionary[i]; - count--; - for (; i < count; i++) - dictionary[i] = dictionary[i+1]; + haveUpdated(); - oldEntry.key->taggedRelease(OSTypeID(OSCollection)); - oldEntry.value->taggedRelease(OSTypeID(OSCollection)); - return; - } + count--; + bcopy(&dictionary[i+1], &dictionary[i], (count - i) * sizeof(dictionary[0])); + + oldEntry.key->taggedRelease(OSTypeID(OSCollection)); + oldEntry.value->taggedRelease(OSTypeID(OSCollection)); + return; + } } @@ -391,13 +422,24 @@ bool OSDictionary::merge(const OSDictionary *srcDict) OSObject *OSDictionary::getObject(const OSSymbol *aKey) const { + unsigned int i; + bool exists; + if (!aKey) return 0; - // if the key exists, remove the object - for (unsigned int i = 0; i < count; i++) - if (aKey == dictionary[i].key) - return (const_cast ((const OSObject *)dictionary[i].value)); + // if the key exists, return the object + + if (fOptions & kSort) { + i = OSSymbol::bsearch(aKey, &dictionary[0], count, sizeof(dictionary[0])); + exists = (i < count) && (aKey == dictionary[i].key); + } else for (exists = false, i = 0; i < count; i++) { + if ((exists = (aKey == dictionary[i].key))) break; + } + + if (exists) { + return (const_cast ((const OSObject *)dictionary[i].value)); + } return 0; } diff --git a/libkern/c++/OSKext.cpp b/libkern/c++/OSKext.cpp index 68139f092..1e4395043 100644 --- a/libkern/c++/OSKext.cpp +++ b/libkern/c++/OSKext.cpp @@ -88,16 +88,29 @@ static OSReturn _OSDictionarySetCStringValue( OSDictionary * dict, const char * key, const char * value); +static bool _OSKextInPrelinkRebuildWindow(void); +static bool _OSKextInUnloadedPrelinkedKexts(const OSSymbol * theBundleID); // We really should add containsObject() & containsCString to OSCollection & subclasses. // So few pad slots, though.... static bool _OSArrayContainsCString(OSArray * array, const char * cString); -#if CONFIG_MACF_KEXT -static void * MACFCopyModuleDataForKext( - OSKext * theKext, - mach_msg_type_number_t * datalen); -#endif /* CONFIG_MACF_KEXT */ +#if CONFIG_KEC_FIPS +static void * GetAppleTEXTHashForKext(OSKext * theKext, OSDictionary *theInfoDict); +#endif // CONFIG_KEC_FIPS + +/* Prelinked arm kexts do not have VM entries because the method we use to + * fake an entry (see libsa/bootstrap.cpp:readPrelinkedExtensions()) does + * not work on ARM. To get around that, we must free prelinked kext + * executables with ml_static_mfree() instead of kext_free(). 
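+ *
+ * (ed. note) A minimal sketch of what the two dealloc paths reduce to;
+ * osdata_kext_free/osdata_phys_free are the names this patch passes to
+ * setDeallocFunction further down, but these bodies are assumed, not
+ * copied from the source:
+ *
+ *   static void osdata_kext_free(void *ptr, unsigned int len)
+ *   { kext_free((vm_offset_t)ptr, len); }         // VM-mapped kexts
+ *
+ *   static void osdata_phys_free(void *ptr, unsigned int len)
+ *   { ml_static_mfree((vm_offset_t)ptr, len); }   // statically mapped kexts
+ *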
+ */ +#if __i386__ || __x86_64__ +#define VM_MAPPED_KEXTS 1 +#define KASLR_KEXT_DEBUG 0 +#define KASLR_IOREG_DEBUG 0 +#else +#error Unsupported architecture +#endif #if PRAGMA_MARK #pragma mark Constants & Macros @@ -136,6 +149,9 @@ static void * MACFCopyModuleDataForKext( #define STRING_HAS_PREFIX(s, p) (strncmp((s), (p), strlen(p)) == 0) +#define REBUILD_MAX_TIME (60 * 5) // 5 minutes +#define MINIMUM_WAKEUP_SECONDS (30) + /********************************************************************* * infoDict keys for internally-stored data. Saves on ivar slots for * objects we don't keep around past boot time or during active load. @@ -263,7 +279,7 @@ kmod_info_t g_kernel_kmod_info = { /* version */ "0", // filled in in OSKext::initialize() /* reference_count */ -1, // never adjusted; kernel never unloads /* reference_list */ NULL, - /* address */ (vm_address_t)&_mh_execute_header, + /* address */ NULL, /* size */ 0, // filled in in OSKext::initialize() /* hdr_size */ 0, /* start */ 0, @@ -318,6 +334,7 @@ static unsigned int sConsiderUnloadDelay = 60; // seconds static thread_call_t sUnloadCallout = 0; static thread_call_t sDestroyLinkContextThread = 0; // one-shot, one-at-a-time thread static bool sSystemSleep = false; // true when system going to sleep +static AbsoluteTime sLastWakeTime; // last time we woke up /********************************************************************* * Backtraces can be printed at various times so we need a tight lock @@ -505,22 +522,22 @@ kxld_log_callback( #define notifyKextLoadObservers(kext, kmod_info) \ do { \ - IOStatistics::onKextLoad(kext, kmod_info); \ + IOStatistics::onKextLoad(kext, kmod_info); \ } while (0) #define notifyKextUnloadObservers(kext) \ do { \ - IOStatistics::onKextUnload(kext); \ + IOStatistics::onKextUnload(kext); \ } while (0) #define notifyAddClassObservers(kext, addedClass, flags) \ do { \ - IOStatistics::onClassAdded(kext, addedClass); \ + IOStatistics::onClassAdded(kext, addedClass); \ } while (0) #define notifyRemoveClassObservers(kext, removedClass, flags) \ do { \ - IOStatistics::onClassRemoved(kext, removedClass); \ + IOStatistics::onClassRemoved(kext, removedClass); \ } while (0) #else @@ -583,7 +600,7 @@ OSKext::initialize(void) /* Read the log flag boot-args and set the log flags. 
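 *
 * (ed. note) The hunk below fixes the size argument: PE_parse_boot_argn
 * expects the size of the destination buffer, not the length of a template
 * string.  A minimal usage sketch (bootLogFilter is the uint32_t used here):
 *
 *   uint32_t bootLogFilter = 0;
 *   if (PE_parse_boot_argn("kextlog", &bootLogFilter, sizeof(bootLogFilter))) {
 *       // "kextlog=<value>" was present on the boot command line
 *   }
 *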
*/ - if (PE_parse_boot_argn("kextlog", &bootLogFilter, sizeof("kextlog=0x00000000 "))) { + if (PE_parse_boot_argn("kextlog", &bootLogFilter, sizeof(bootLogFilter))) { sBootArgLogFilterFound = true; sKernelLogFilter = bootLogFilter; // log this if any flags are set @@ -618,6 +635,13 @@ OSKext::initialize(void) kernelStart, kernelLength); assert(kernelExecutable); +#if KASLR_KEXT_DEBUG + IOLog("kaslr: kernel start 0x%lx end 0x%lx length %lu \n", + (unsigned long)kernelStart, + (unsigned long)getlastaddr(), + kernelLength); +#endif + sKernelKext->loadTag = sNextLoadTag++; // the kernel is load tag 0 sKernelKext->bundleID = OSSymbol::withCString(kOSKextKernelIdentifier); @@ -631,6 +655,7 @@ OSKext::initialize(void) sKernelKext->flags.loaded = 1; sKernelKext->flags.started = 1; sKernelKext->flags.CPPInitialized = 0; + sKernelKext->flags.jettisonLinkeditSeg = 0; sKernelKext->kmod_info = &g_kernel_kmod_info; strlcpy(g_kernel_kmod_info.version, osrelease, @@ -693,6 +718,8 @@ OSKext::initialize(void) *timestamp = 0; timestamp = __OSAbsoluteTimePtr(&last_unloaded_timestamp); *timestamp = 0; + timestamp = __OSAbsoluteTimePtr(&sLastWakeTime); + *timestamp = 0; OSKextLog(/* kext */ NULL, kOSKextLogProgressLevel | @@ -700,7 +727,7 @@ OSKext::initialize(void) "Kext system initialized."); notifyKextLoadObservers(sKernelKext, sKernelKext->kmod_info); - + return; } @@ -726,6 +753,7 @@ OSKext::removeKextBootstrap(void) kernel_segment_command_t * seg_to_remove = NULL; + /* This must be the very first thing done by this function. */ IORecursiveLockLock(sKextLock); @@ -774,10 +802,10 @@ OSKext::removeKextBootstrap(void) * defining the lower bound for valid physical addresses. */ if (seg_to_remove && seg_to_remove->vmaddr && seg_to_remove->vmsize) { - // 04/18/11 - gab: - // overwrite memory occupied by KLD segment with random data before - // releasing it. - read_random((void *) seg_to_remove->vmaddr, seg_to_remove->vmsize); + // 04/18/11 - gab: + // overwrite memory occupied by KLD segment with random data before + // releasing it. + read_random((void *) seg_to_remove->vmaddr, seg_to_remove->vmsize); ml_static_mfree(seg_to_remove->vmaddr, seg_to_remove->vmsize); } #else @@ -831,7 +859,7 @@ OSKext::removeKextBootstrap(void) /* Copy it out. */ memcpy(seg_copy, seg_data, seg_length); - + /* Dump the booter memory. */ ml_static_mfree(seg_offset, seg_length); @@ -846,7 +874,7 @@ OSKext::removeKextBootstrap(void) (ipc_port_t)NULL, (vm_object_offset_t) 0, /* copy */ FALSE, - /* cur_protection */ VM_PROT_ALL, + /* cur_protection */ VM_PROT_READ | VM_PROT_WRITE, /* max_protection */ VM_PROT_ALL, /* inheritance */ VM_INHERIT_DEFAULT); if ((mem_result != KERN_SUCCESS) || @@ -869,20 +897,22 @@ OSKext::removeKextBootstrap(void) kmem_free(kernel_map, seg_copy_offset, seg_length); } #else /* we are not CONFIG_KXLD */ +#error CONFIG_KXLD is expected for this arch /***** * Dump the LINKEDIT segment, unless keepsyms is set. 
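 *
 * (ed. note) On this non-KXLD path the booter hands __LINKEDIT back through
 * the device tree rather than through a VM mapping.  Sketch of the pattern
 * the added lines below follow, using the same calls (SECURE_KERNEL
 * additionally zeroes the pages before the memory is returned):
 *
 *   void *paddr; int size;
 *   if (0 == IODTGetLoaderInfo("Kernel-__LINKEDIT", &paddr, &size)) {
 *       IODTFreeLoaderInfo("Kernel-__LINKEDIT", paddr, size);
 *   }
 *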
*/ if (!sKeepSymbols) { -#if __i386__ || __x86_64__ - if (seg_to_remove && seg_to_remove->vmaddr && seg_to_remove->vmsize) { - ml_static_mfree(seg_to_remove->vmaddr, seg_to_remove->vmsize); + const char *dt_segment_name = "Kernel-__LINKEDIT"; + if (0 == IODTGetLoaderInfo(dt_segment_name, + &segment_paddress, &segment_size)) { +#ifdef SECURE_KERNEL + vm_offset_t vmaddr = ml_static_ptovirt((vm_offset_t)segment_paddress); + bzero((void*)vmaddr, segment_size); +#endif + IODTFreeLoaderInfo(dt_segment_name, (void *)segment_paddress, + (int)segment_size); } -#else /* from if __arm__ */ - -#error arch -#endif /* from if __arm__ */ - } else { OSKextLog(/* kext */ NULL, kOSKextLogBasicLevel | @@ -1342,6 +1372,9 @@ OSKext::initWithPrelinkedInfoDict( if (!setInfoDictionaryAndPath(anInfoDict, kextPath)) { goto finish; } +#if KASLR_KEXT_DEBUG + IOLog("kaslr: kext %s \n", getIdentifierCString()); +#endif /* Also get the executable's bundle-relative path if present. * Don't look for an arch-specific path property. @@ -1373,9 +1406,16 @@ OSKext::initWithPrelinkedInfoDict( goto finish; } - data = (void *) (intptr_t) (addressNum->unsigned64BitValue()); + data = (void *) ((intptr_t) (addressNum->unsigned64BitValue()) + vm_kernel_slide); length = (uint32_t) (lengthNum->unsigned32BitValue()); +#if KASLR_KEXT_DEBUG + IOLog("kaslr: unslid 0x%lx slid 0x%lx length %u - prelink executable \n", + (unsigned long)VM_KERNEL_UNSLIDE(data), + (unsigned long)data, + length); +#endif + anInfoDict->removeObject(kPrelinkExecutableLoadKey); anInfoDict->removeObject(kPrelinkExecutableSizeKey); @@ -1384,7 +1424,13 @@ OSKext::initWithPrelinkedInfoDict( */ addressNum = OSDynamicCast(OSNumber, anInfoDict->getObject(kPrelinkExecutableSourceKey)); if (addressNum) { - srcData = (void *) (intptr_t) (addressNum->unsigned64BitValue()); + srcData = (void *) ((intptr_t) (addressNum->unsigned64BitValue()) + vm_kernel_slide); + +#if KASLR_KEXT_DEBUG + IOLog("kaslr: unslid 0x%lx slid 0x%lx - prelink executable source \n", + (unsigned long)VM_KERNEL_UNSLIDE(srcData), + (unsigned long)srcData); +#endif if (data != srcData) { #if __LP64__ @@ -1421,7 +1467,12 @@ OSKext::initWithPrelinkedInfoDict( getIdentifierCString()); goto finish; } + +#if VM_MAPPED_KEXTS prelinkedExecutable->setDeallocFunction(osdata_kext_free); +#else + prelinkedExecutable->setDeallocFunction(osdata_phys_free); +#endif setLinkedExecutable(prelinkedExecutable); addressNum = OSDynamicCast(OSNumber, @@ -1435,7 +1486,18 @@ OSKext::initWithPrelinkedInfoDict( goto finish; } - kmod_info = (kmod_info_t *) (intptr_t) (addressNum->unsigned64BitValue()); + if (addressNum->unsigned64BitValue() != 0) { + kmod_info = (kmod_info_t *) (intptr_t) (addressNum->unsigned64BitValue() + vm_kernel_slide); + kmod_info->address += vm_kernel_slide; +#if KASLR_KEXT_DEBUG + IOLog("kaslr: unslid 0x%lx slid 0x%lx - kmod_info \n", + (unsigned long)VM_KERNEL_UNSLIDE(kmod_info), + (unsigned long)kmod_info); + IOLog("kaslr: unslid 0x%lx slid 0x%lx - kmod_info->address \n", + (unsigned long)VM_KERNEL_UNSLIDE(kmod_info->address), + (unsigned long)kmod_info->address); + #endif + } anInfoDict->removeObject(kPrelinkKmodInfoKey); } @@ -3881,22 +3943,22 @@ OSKext::getLoadTag(void) *********************************************************************/ void OSKext::getSizeInfo(uint32_t *loadSize, uint32_t *wiredSize) { - if (linkedExecutable) { - *loadSize = linkedExecutable->getLength(); + if (linkedExecutable) { + *loadSize = linkedExecutable->getLength(); - /* If we have a kmod_info struct, calculated the wired 
size - * from that. Otherwise it's the full load size. - */ - if (kmod_info) { - *wiredSize = *loadSize - kmod_info->hdr_size; - } else { - *wiredSize = *loadSize; - } - } - else { - *wiredSize = 0; - *loadSize = 0; - } + /* If we have a kmod_info struct, calculated the wired size + * from that. Otherwise it's the full load size. + */ + if (kmod_info) { + *wiredSize = *loadSize - kmod_info->hdr_size; + } else { + *wiredSize = *loadSize; + } + } + else { + *wiredSize = 0; + *loadSize = 0; + } } /********************************************************************* @@ -3949,6 +4011,7 @@ finish: /********************************************************************* *********************************************************************/ + #if defined (__i386__) #define ARCHNAME "i386" #elif defined (__x86_64__) @@ -4521,6 +4584,222 @@ finish: return result; } +/********************************************************************* +* +*********************************************************************/ +OSReturn +OSKext::slidePrelinkedExecutable() +{ + OSReturn result = kOSKextReturnBadData; + kernel_mach_header_t * mh = NULL; + kernel_segment_command_t * seg = NULL; + kernel_segment_command_t * linkeditSeg = NULL; + kernel_section_t * sec = NULL; + char * linkeditBase = NULL; + bool haveLinkeditBase = false; + char * relocBase = NULL; + bool haveRelocBase = false; + struct dysymtab_command * dysymtab = NULL; + struct symtab_command * symtab = NULL; + kernel_nlist_t * sym = NULL; + struct relocation_info * reloc = NULL; + uint32_t i = 0; + int reloc_size; + vm_offset_t new_kextsize; + + if (linkedExecutable == NULL || vm_kernel_slide == 0) { + result = kOSReturnSuccess; + goto finish; + } + + mh = (kernel_mach_header_t *)linkedExecutable->getBytesNoCopy(); + + for (seg = firstsegfromheader(mh); seg != NULL; seg = nextsegfromheader(mh, seg)) { + seg->vmaddr += vm_kernel_slide; + +#if KASLR_KEXT_DEBUG + IOLog("kaslr: segname %s unslid 0x%lx slid 0x%lx \n", + seg->segname, + (unsigned long)VM_KERNEL_UNSLIDE(seg->vmaddr), + (unsigned long)seg->vmaddr); +#endif + + if (!haveRelocBase) { + relocBase = (char *) seg->vmaddr; + haveRelocBase = true; + } + if (!strcmp(seg->segname, "__LINKEDIT")) { + linkeditBase = (char *) seg->vmaddr - seg->fileoff; + haveLinkeditBase = true; + linkeditSeg = seg; + } + for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) { + sec->addr += vm_kernel_slide; + +#if KASLR_KEXT_DEBUG + IOLog("kaslr: sectname %s unslid 0x%lx slid 0x%lx \n", + sec->sectname, + (unsigned long)VM_KERNEL_UNSLIDE(sec->addr), + (unsigned long)sec->addr); +#endif + } + } + + dysymtab = (struct dysymtab_command *) getcommandfromheader(mh, LC_DYSYMTAB); + + symtab = (struct symtab_command *) getcommandfromheader(mh, LC_SYMTAB); + + if (symtab != NULL) { + /* Some pseudo-kexts have symbol tables without segments. + * Ignore them. 
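+ *
+ * (ed. note) Core of the loop that follows, reduced for clarity; the types
+ * come from <mach-o/nlist.h> and vm_kernel_slide is the KASLR offset:
+ *
+ *   kernel_nlist_t *sym = (kernel_nlist_t *)(linkeditBase + symtab->symoff);
+ *   for (uint32_t j = 0; j < symtab->nsyms; j++) {
+ *       if (sym[j].n_type & N_STAB)
+ *           continue;                        // debugger stabs carry no address
+ *       sym[j].n_value += vm_kernel_slide;   // rebase each defined symbol
+ *   }
+ *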
*/ + if (symtab->nsyms > 0 && haveLinkeditBase) { + sym = (kernel_nlist_t *) (linkeditBase + symtab->symoff); + for (i = 0; i < symtab->nsyms; i++) { + if (sym[i].n_type & N_STAB) { + continue; + } + sym[i].n_value += vm_kernel_slide; + +#if KASLR_KEXT_DEBUG +#define MAX_SYMS_TO_LOG 5 + if ( i < MAX_SYMS_TO_LOG ) { + IOLog("kaslr: LC_SYMTAB unslid 0x%lx slid 0x%lx \n", + (unsigned long)VM_KERNEL_UNSLIDE(sym[i].n_value), + (unsigned long)sym[i].n_value); + } +#endif + } + } + } + + if (dysymtab != NULL) { + if (dysymtab->nextrel > 0) { + OSKextLog(this, + kOSKextLogErrorLevel | kOSKextLogLoadFlag | + kOSKextLogLinkFlag, + "Sliding kext %s: External relocations found.", + getIdentifierCString()); + goto finish; + } + + if (dysymtab->nlocrel > 0) { + if (!haveLinkeditBase) { + OSKextLog(this, + kOSKextLogErrorLevel | kOSKextLogLoadFlag | + kOSKextLogLinkFlag, + "Sliding kext %s: No linkedit segment.", + getIdentifierCString()); + goto finish; + } + + if (!haveRelocBase) { + OSKextLog(this, + kOSKextLogErrorLevel | kOSKextLogLoadFlag | + kOSKextLogLinkFlag, +#if __x86_64__ + "Sliding kext %s: No writable segments.", +#else + "Sliding kext %s: No segments.", +#endif + getIdentifierCString()); + goto finish; + } + + reloc = (struct relocation_info *) (linkeditBase + dysymtab->locreloff); + reloc_size = dysymtab->nlocrel * sizeof(struct relocation_info); + + for (i = 0; i < dysymtab->nlocrel; i++) { + if ( reloc[i].r_extern != 0 + || reloc[i].r_type != 0 + || reloc[i].r_length != (sizeof(void *) == 8 ? 3 : 2) +#if __i386__ + || (reloc[i].r_address & R_SCATTERED) +#endif + ) { + OSKextLog(this, + kOSKextLogErrorLevel | kOSKextLogLoadFlag | + kOSKextLogLinkFlag, + "Sliding kext %s: Unexpected relocation found.", + getIdentifierCString()); + goto finish; + } + if (reloc[i].r_pcrel != 0) { + continue; + } + *((uintptr_t *)(relocBase + reloc[i].r_address)) += vm_kernel_slide; + +#if KASLR_KEXT_DEBUG +#define MAX_DYSYMS_TO_LOG 5 + if ( i < MAX_DYSYMS_TO_LOG ) { + IOLog("kaslr: LC_DYSYMTAB unslid 0x%lx slid 0x%lx \n", + (unsigned long)VM_KERNEL_UNSLIDE(*((uintptr_t *)(relocBase + reloc[i].r_address))), + (unsigned long)*((uintptr_t *)(relocBase + reloc[i].r_address))); + } +#endif + } + + /* We should free these relocations, not just delete the reference to them. + * Free relocations from PIE kexts. + */ + new_kextsize = round_page(kmod_info->size - reloc_size); + + if ((kmod_info->size - new_kextsize) > PAGE_SIZE) { + vm_offset_t endofkext = kmod_info->address + kmod_info->size; + vm_offset_t new_endofkext = kmod_info->address + new_kextsize; + vm_offset_t endofrelocInfo = (vm_offset_t) (((uint8_t *)reloc) + reloc_size); + int bytes_remaining = endofkext - endofrelocInfo; + OSData * new_osdata = NULL; + + /* fix up symbol offsets if they are after the dsymtab local relocs */ + if (symtab) { + if (dysymtab->locreloff < symtab->symoff){ + symtab->symoff -= reloc_size; + } + if (dysymtab->locreloff < symtab->stroff) { + symtab->stroff -= reloc_size; + } + } + if (dysymtab->locreloff < dysymtab->extreloff) { + dysymtab->extreloff -= reloc_size; + } + + /* move data behind reloc info down to new offset */ + if (endofrelocInfo < endofkext) { + memcpy(reloc, (void *)endofrelocInfo, bytes_remaining); + } + + /* Create a new OSData for the smaller kext object and reflect + * new linkedit segment size. 
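+ *
+ * (ed. note) Worked example of the shrink decision above, assuming 4 KiB
+ * pages (illustrative numbers only): with kmod_info->size = 0x8000 and
+ * reloc_size = 0x2800,
+ *
+ *   new_kextsize = round_page(0x8000 - 0x2800) = 0x6000
+ *   0x8000 - 0x6000 = 0x2000 > PAGE_SIZE
+ *
+ * so the two tail pages, [address + 0x6000, address + 0x8000), are released
+ * back to the VM once the data past the relocation records has been copied
+ * down.
+ *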
+ */ + linkeditSeg->vmsize = round_page(linkeditSeg->vmsize - reloc_size); + linkeditSeg->filesize = linkeditSeg->vmsize; + + new_osdata = OSData::withBytesNoCopy((void *)kmod_info->address, new_kextsize); + if (new_osdata) { + /* Fix up kmod info and linkedExecutable. + */ + kmod_info->size = new_kextsize; + linkedExecutable->setDeallocFunction(NULL); + linkedExecutable->release(); + linkedExecutable = new_osdata; + +#if VM_MAPPED_KEXTS + kext_free(new_endofkext, (endofkext - new_endofkext)); +#else + ml_static_mfree(new_endofkext, (endofkext - new_endofkext)); +#endif + } + } + dysymtab->nlocrel = 0; + dysymtab->locreloff = 0; + } + } + + result = kOSReturnSuccess; +finish: + return result; +} + /********************************************************************* * called only by load() *********************************************************************/ @@ -4579,6 +4858,10 @@ OSKext::loadExecutable() } if (isPrelinked()) { + result = slidePrelinkedExecutable(); + if (result != kOSReturnSuccess) { + goto finish; + } goto register_kmod; } @@ -4731,7 +5014,7 @@ OSKext::loadExecutable() * cache and invalidate the instruction cache. * I/D caches are coherent on x86 */ -#if !defined(__i386__) && !defined(__x86_64__) +#if !defined(__i386__) && !defined(__x86_64__) flush_dcache(kmod_info->address, kmod_info->size, false); invalidate_icache(kmod_info->address, kmod_info->size, false); #endif @@ -4803,7 +5086,7 @@ register_kmod: "Kext %s executable loaded; %u pages at 0x%lx (load tag %u).", kmod_info->name, (unsigned)kmod_info->size / PAGE_SIZE, - (unsigned long)kmod_info->address, + (unsigned long)VM_KERNEL_UNSLIDE(kmod_info->address), (unsigned)kmod_info->id); } @@ -4873,11 +5156,18 @@ OSKext::jettisonLinkeditSegment(void) { kernel_mach_header_t * machhdr = (kernel_mach_header_t *)kmod_info->address; kernel_segment_command_t * linkedit = NULL; + vm_offset_t start; vm_size_t linkeditsize, kextsize; - vm_offset_t linkeditaddr = 0; OSData * data = NULL; - - if (sKeepSymbols || isLibrary() || !isExecutable() || !linkedExecutable) { + +#if NO_KEXTD + /* We can free symbol tables for all embedded kexts because we don't + * support runtime kext linking. + */ + if (sKeepSymbols || !isExecutable() || !linkedExecutable || flags.jettisonLinkeditSeg) { +#else + if (sKeepSymbols || isLibrary() || !isExecutable() || !linkedExecutable || flags.jettisonLinkeditSeg) { +#endif goto finish; } @@ -4900,21 +5190,12 @@ OSKext::jettisonLinkeditSegment(void) */ linkeditsize = round_page(linkedit->vmsize); kextsize = kmod_info->size - linkeditsize; - - /* Save linkedit address as removeLinkeditHeaders() will zero it */ - linkeditaddr = trunc_page(linkedit->vmaddr); - + start = linkedit->vmaddr; + data = OSData::withBytesNoCopy((void *)kmod_info->address, kextsize); if (!data) { goto finish; } - data->setDeallocFunction(osdata_kext_free); - - /* Rewrite the Mach-O headers. - */ - if (KERN_SUCCESS != removeLinkeditHeaders(linkedit)) { - goto finish; - } /* Fix the kmod info and linkedExecutable. */ @@ -4922,75 +5203,20 @@ OSKext::jettisonLinkeditSegment(void) linkedExecutable->setDeallocFunction(NULL); linkedExecutable->release(); linkedExecutable = data; - + flags.jettisonLinkeditSeg = 1; + /* Free the linkedit segment. 
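 *
 * (ed. note) The quantities being freed here, as computed earlier in this
 * function:
 *
 *   linkeditsize = round_page(linkedit->vmsize);    // whole pages only
 *   kextsize     = kmod_info->size - linkeditsize;  // what survives
 *   start        = linkedit->vmaddr;                // first byte to free
 *
 * VM_MAPPED_KEXTS selects kext_free() for VM-backed kexts; the #else branch
 * covers statically mapped (embedded/prelinked) kexts, per the comment near
 * the top of this file.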
*/ - kext_free(linkeditaddr, linkeditsize); +#if VM_MAPPED_KEXTS + kext_free(start, linkeditsize); +#else + ml_static_mfree(start, linkeditsize); +#endif finish: return; } -/********************************************************************* -*********************************************************************/ -OSReturn -OSKext::removeLinkeditHeaders(kernel_segment_command_t *linkedit) -{ - OSReturn result = KERN_FAILURE; - kernel_mach_header_t * machhdr = (kernel_mach_header_t *)kmod_info->address; - vm_map_t kext_map; - u_char * src, * dst; - uint32_t cmdsize, ncmds; - u_int i = 0; - - kext_map = kext_get_vm_map(kmod_info); - if (!kext_map) { - result = KERN_MEMORY_ERROR; - goto finish; - } - - result = vm_map_protect(kext_map, kmod_info->address, - kmod_info->address + kmod_info->hdr_size, VM_PROT_DEFAULT, TRUE); - if (result != KERN_SUCCESS) { - goto finish; - } - - ncmds = machhdr->ncmds; - src = dst = (u_char *)(kmod_info->address + sizeof(*machhdr)); - - for (i = 0; i < ncmds; ++i, src += cmdsize) { - struct load_command * lc = (struct load_command *) src; - cmdsize = lc->cmdsize; - - switch (lc->cmd) { - case LC_SEGMENT: - case LC_SEGMENT_64: - if (src != (u_char *)linkedit) break; - /* FALLTHROUGH */ - case LC_SYMTAB: - case LC_DYSYMTAB: - bzero(src, cmdsize); - machhdr->ncmds--; - machhdr->sizeofcmds -= cmdsize; - continue; - } - - memmove(dst, src, cmdsize); - dst += cmdsize; - } - - result = vm_map_protect(kext_map, kmod_info->address, - kmod_info->address + kmod_info->hdr_size, VM_PROT_READ, TRUE); - if (result != KERN_SUCCESS) { - goto finish; - } - - result = KERN_SUCCESS; - -finish: - return result; -} - /********************************************************************* *********************************************************************/ void @@ -5037,7 +5263,7 @@ OSKext::registerKextsWithDTrace(void) } extern "C" { - extern int (*dtrace_modload)(struct kmod_info *); + extern int (*dtrace_modload)(struct kmod_info *, uint32_t); extern int (*dtrace_modunload)(struct kmod_info *); }; @@ -5050,7 +5276,13 @@ OSKext::registerWithDTrace(void) * prevent a kext from loading, so we ignore the return code. 
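 *
 * (ed. note) dtrace_modload now takes a flags word (see the prototype
 * change just above).  Shape of the call that follows, condensed:
 *
 *   uint32_t modflag = 0;
 *   if (getPropertyForHostArch("OSBundleForceDTraceInit") == kOSBooleanTrue)
 *       modflag |= KMOD_DTRACE_FORCE_INIT;
 *   (void)(*dtrace_modload)(kmod_info, modflag);
 *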
*/ if (!flags.dtraceInitialized && (dtrace_modload != NULL)) { - (void)(*dtrace_modload)(kmod_info); + uint32_t modflag = 0; + OSObject * forceInit = getPropertyForHostArch("OSBundleForceDTraceInit"); + if (forceInit == kOSBooleanTrue) { + modflag |= KMOD_DTRACE_FORCE_INIT; + } + + (void)(*dtrace_modload)(kmod_info, modflag); flags.dtraceInitialized = true; jettisonLinkeditSegment(); } @@ -5076,6 +5308,35 @@ OSKext::unregisterWithDTrace(void) /********************************************************************* * called only by loadExecutable() *********************************************************************/ +#if !VM_MAPPED_KEXTS +#error Unrecognized architecture +#else +static inline kern_return_t +OSKext_protect( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_prot_t new_prot, + boolean_t set_max) +{ + if (start == end) { // 10538581 + return(KERN_SUCCESS); + } + return vm_map_protect(map, start, end, new_prot, set_max); +} + +static inline kern_return_t +OSKext_wire( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_prot_t access_type, + boolean_t user_wire) +{ + return vm_map_wire(map, start, end, access_type, user_wire); +} +#endif + OSReturn OSKext::setVMProtections(void) { @@ -5097,14 +5358,8 @@ OSKext::setVMProtections(void) goto finish; } - /* XXX: On arm, the vme covering the prelinked kernel (really, the whole - * range from 0xc0000000 to a little over 0xe0000000) has maxprot set to 0 - * so the vm_map_protect calls below fail - * I believe this happens in the call to vm_map_enter in kmem_init but I - * need to confirm. - */ /* Protect the headers as read-only; they do not need to be wired */ - result = vm_map_protect(kext_map, kmod_info->address, + result = OSKext_protect(kext_map, kmod_info->address, kmod_info->address + kmod_info->hdr_size, VM_PROT_READ, TRUE); if (result != KERN_SUCCESS) { goto finish; @@ -5116,7 +5371,7 @@ OSKext::setVMProtections(void) start = round_page(seg->vmaddr); end = trunc_page(seg->vmaddr + seg->vmsize); - result = vm_map_protect(kext_map, start, end, seg->maxprot, TRUE); + result = OSKext_protect(kext_map, start, end, seg->maxprot, TRUE); if (result != KERN_SUCCESS) { OSKextLog(this, kOSKextLogErrorLevel | @@ -5127,7 +5382,7 @@ OSKext::setVMProtections(void) goto finish; } - result = vm_map_protect(kext_map, start, end, seg->initprot, FALSE); + result = OSKext_protect(kext_map, start, end, seg->initprot, FALSE); if (result != KERN_SUCCESS) { OSKextLog(this, kOSKextLogErrorLevel | @@ -5139,7 +5394,7 @@ OSKext::setVMProtections(void) } if (segmentShouldBeWired(seg)) { - result = vm_map_wire(kext_map, start, end, seg->initprot, FALSE); + result = OSKext_wire(kext_map, start, end, seg->initprot, FALSE); if (result != KERN_SUCCESS) { goto finish; } @@ -5226,9 +5481,9 @@ OSKext::validateKextMapping(bool startFlag) getIdentifierCString(), whichOp, whichOp, - (void *)address, - (void *)kmod_info->address, - (void *)(kmod_info->address + kmod_info->size)); + (void *)VM_KERNEL_UNSLIDE(address), + (void *)VM_KERNEL_UNSLIDE(kmod_info->address), + (void *)(VM_KERNEL_UNSLIDE(kmod_info->address) + kmod_info->size)); result = kOSKextReturnBadData; goto finish; } @@ -5249,11 +5504,12 @@ OSKext::validateKextMapping(bool startFlag) kOSKextLogLoadFlag, "Kext %s - bad %s pointer %p.", getIdentifierCString(), - whichOp, (void *)address); + whichOp, (void *)VM_KERNEL_UNSLIDE(address)); result = kOSKextReturnBadData; goto finish; } +#if VM_MAPPED_KEXTS if (!(info.protection & VM_PROT_EXECUTE)) { OSKextLog(this, 
kOSKextLogErrorLevel | @@ -5264,6 +5520,7 @@ OSKext::validateKextMapping(bool startFlag) result = kOSKextReturnBadData; goto finish; } +#endif /* Verify that the kext's segments are backed by physical memory. */ @@ -5319,10 +5576,7 @@ OSKext::start(bool startDependenciesFlag) OSReturn result = kOSReturnError; kern_return_t (* startfunc)(kmod_info_t *, void *); unsigned int i, count; - void * kmodStartData = NULL; // special handling needed -#if CONFIG_MACF_KEXT - mach_msg_type_number_t kmodStartDataCount = 0; -#endif /* CONFIG_MACF_KEXT */ + void * kmodStartData = NULL; if (isStarted() || isInterface() || isKernelComponent()) { result = kOSReturnSuccess; @@ -5394,14 +5648,6 @@ OSKext::start(bool startDependenciesFlag) } } -#if CONFIG_MACF_KEXT - /* See if the kext has any MAC framework module data in its plist. - * This is passed in as arg #2 of the kext's start routine, - * which is otherwise reserved for any other kext. - */ - kmodStartData = MACFCopyModuleDataForKext(this, &kmodStartDataCount); -#endif /* CONFIG_MACF_KEXT */ - OSKextLog(this, kOSKextLogDetailLevel | kOSKextLogLoadFlag, @@ -5415,6 +5661,20 @@ OSKext::start(bool startDependenciesFlag) if (result == KERN_SUCCESS) { #endif +#if CONFIG_KEC_FIPS + kmodStartData = GetAppleTEXTHashForKext(this, this->infoDict); + +#if 0 + if (kmodStartData) { + OSKextLog(this, + kOSKextLogErrorLevel | + kOSKextLogGeneralFlag, + "Kext %s calling module start function. kmodStartData %p. arch %s", + getIdentifierCString(), kmodStartData, ARCHNAME); + } +#endif +#endif // CONFIG_KEC_FIPS + result = startfunc(kmod_info, kmodStartData); #if !CONFIG_STATIC_CPPINIT @@ -5450,18 +5710,6 @@ OSKext::start(bool startDependenciesFlag) } finish: -#if CONFIG_MACF_KEXT - /* Free the module data for a MAC framework kext. When we start using - * param #2 we'll have to distinguish and free/release appropriately. - * - * xxx - I'm pretty sure the old codepath freed the data and that it's - * xxx - up to the kext to copy it. - */ - if (kmodStartData) { - kmem_free(kernel_map, (vm_offset_t)kmodStartData, kmodStartDataCount); - } -#endif /* CONFIG_MACF_KEXT */ - return result; } @@ -5747,6 +5995,7 @@ OSKext::unload(void) /* Unwire and free the linked executable. */ if (linkedExecutable) { +#if VM_MAPPED_KEXTS if (!isInterface()) { kernel_segment_command_t *seg = NULL; vm_map_t kext_map = kext_get_vm_map(kmod_info); @@ -5786,7 +6035,7 @@ OSKext::unload(void) seg = nextsegfromheader((kernel_mach_header_t *) kmod_info->address, seg); } } - +#endif OSSafeReleaseNULL(linkedExecutable); } @@ -5808,7 +6057,13 @@ OSKext::unload(void) * kernel cache. 
9055303 */ if (isPrelinked()) { - sUnloadedPrelinkedKexts->setObject(bundleID); + if (!_OSKextInUnloadedPrelinkedKexts(bundleID)) { + IORecursiveLockLock(sKextLock); + if (sUnloadedPrelinkedKexts) { + sUnloadedPrelinkedKexts->setObject(bundleID); + } + IORecursiveLockUnlock(sKextLock); + } } OSKextLog(this, @@ -6097,7 +6352,7 @@ finish: sConsiderUnloadsPending = false; sConsiderUnloadsExecuted = true; - (void) OSKext::considerRebuildOfPrelinkedKernel(NULL); + (void) OSKext::considerRebuildOfPrelinkedKernel(); IORecursiveLockUnlock(sKextInnerLock); IORecursiveLockUnlock(sKextLock); @@ -6177,9 +6432,11 @@ IOReturn OSKextSystemSleepOrWake(UInt32 messageType) thread_call_cancel(sUnloadCallout); } sSystemSleep = true; + AbsoluteTime_to_scalar(&sLastWakeTime) = 0; } else if (messageType == kIOMessageSystemHasPoweredOn) { sSystemSleep = false; - } + clock_get_uptime(&sLastWakeTime); + } IORecursiveLockUnlock(sKextInnerLock); return kIOReturnSuccess; @@ -6198,68 +6455,119 @@ IOReturn OSKextSystemSleepOrWake(UInt32 messageType) *********************************************************************/ /* static */ void -OSKext::considerRebuildOfPrelinkedKernel(OSString * moduleName) -{ - OSReturn checkResult = kOSReturnError; - static bool requestedPrelink = false; - OSDictionary * prelinkRequest = NULL; // must release - +OSKext::considerRebuildOfPrelinkedKernel(void) +{ + static bool requestedPrelink = false; + OSReturn checkResult = kOSReturnError; + OSDictionary * prelinkRequest = NULL; // must release + OSCollectionIterator * kextIterator = NULL; // must release + const OSSymbol * thisID = NULL; // do not release + bool doRebuild = false; + AbsoluteTime my_abstime; + UInt64 my_ns; + SInt32 delta_secs; + + /* Only one auto rebuild per boot and only on boot from prelinked kernel */ + if (requestedPrelink || !sPrelinkBoot) { + return; + } + + /* no direct return from this point */ IORecursiveLockLock(sKextLock); - /* moduleName is only passed when we see a load come in. We are only - * interested in rebuilding the kernel cache if the kext we are loading - * is not already in the original kernel cache. 9055303 + /* We need to wait for kextd to get up and running with unloads already done + * and any new startup kexts loaded. */ - if (moduleName) { - int count = sUnloadedPrelinkedKexts->getCount(); - int i; - - for (i = 0; i < count; i++) { - const OSSymbol * myBundleID; // do not release - - myBundleID = OSDynamicCast(OSSymbol, sUnloadedPrelinkedKexts->getObject(i)); - if (!myBundleID) continue; - if (moduleName->isEqualTo(myBundleID->getCStringNoCopy())) { - OSKextLog(/* kext */ NULL, - kOSKextLogDetailLevel | - kOSKextLogArchiveFlag, - "bundleID %s already in cache skipping rebuild.", - myBundleID->getCStringNoCopy()); - - /* no need to rebuild, already in kernel cache */ - goto finish; - } + if (!sConsiderUnloadsExecuted || + !sDeferredLoadSucceeded) { + goto finish; + } + + /* we really only care about boot / system start up related kexts so bail + * if we're here after REBUILD_MAX_TIME. + */ + if (!_OSKextInPrelinkRebuildWindow()) { + OSKextLog(/* kext */ NULL, + kOSKextLogArchiveFlag, + "%s prebuild rebuild has expired", + __FUNCTION__); + requestedPrelink = true; + goto finish; + } + + /* we do not want to trigger a rebuild if we get here too close to waking + * up. 
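+ *
+ * (ed. note) The guard below converts "time since last wake" into whole
+ * seconds; sLastWakeTime == 0 means no wake has been seen yet, so the
+ * delta stays primed past the threshold:
+ *
+ *   delta_secs = MINIMUM_WAKEUP_SECONDS + 1;            // assume "long ago"
+ *   if (AbsoluteTime_to_scalar(&sLastWakeTime) != 0) {
+ *       SUB_ABSOLUTETIME(&my_abstime, &sLastWakeTime);  // now - last wake
+ *       absolutetime_to_nanoseconds(my_abstime, &my_ns);
+ *       delta_secs = (SInt32)(my_ns / NSEC_PER_SEC);
+ *   }
+ *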
(see radar 10233768) + */ + IORecursiveLockLock(sKextInnerLock); + + clock_get_uptime(&my_abstime); + delta_secs = MINIMUM_WAKEUP_SECONDS + 1; + if (AbsoluteTime_to_scalar(&sLastWakeTime) != 0) { + SUB_ABSOLUTETIME(&my_abstime, &sLastWakeTime); + absolutetime_to_nanoseconds(my_abstime, &my_ns); + delta_secs = (SInt32)(my_ns / NSEC_PER_SEC); + } + IORecursiveLockUnlock(sKextInnerLock); + + if (delta_secs < MINIMUM_WAKEUP_SECONDS) { + /* too close to time of last wake from sleep */ + goto finish; + } + requestedPrelink = true; + + /* Now it's time to see if we have a reason to rebuild. We may have done + * some loads and unloads but the kernel cache didn't actually change. + * We will rebuild if any kext is not marked prelinked AND is not in our + * list of prelinked kexts that got unloaded. (see radar 9055303) + */ + kextIterator = OSCollectionIterator::withCollection(sKextsByID); + if (!kextIterator) { + goto finish; + } + + while ((thisID = OSDynamicCast(OSSymbol, kextIterator->getNextObject()))) { + OSKext * thisKext; // do not release + + thisKext = OSDynamicCast(OSKext, sKextsByID->getObject(thisID)); + if (!thisKext || thisKext->isPrelinked() || thisKext->isKernel()) { + continue; + } + + if (_OSKextInUnloadedPrelinkedKexts(thisKext->bundleID)) { + continue; } - (void) OSKext::setDeferredLoadSucceeded(); + /* kext is loaded and was not in current kernel cache so let's rebuild + */ + doRebuild = true; + OSKextLog(/* kext */ NULL, + kOSKextLogArchiveFlag, + "considerRebuildOfPrelinkedKernel %s triggered rebuild", + thisKext->bundleID->getCStringNoCopy()); + break; } - - if (!sDeferredLoadSucceeded || !sConsiderUnloadsExecuted || - sSafeBoot || requestedPrelink) - { + sUnloadedPrelinkedKexts->flushCollection(); + + if (!doRebuild) { goto finish; } - - OSKextLog(/* kext */ NULL, - kOSKextLogProgressLevel | - kOSKextLogArchiveFlag, - "Requesting build of prelinked kernel."); - + checkResult = _OSKextCreateRequest(kKextRequestPredicateRequestPrelink, - &prelinkRequest); + &prelinkRequest); if (checkResult != kOSReturnSuccess) { goto finish; } - + if (!sKernelRequests->setObject(prelinkRequest)) { goto finish; } - + OSKext::pingKextd(); - requestedPrelink = true; - + finish: IORecursiveLockUnlock(sKextLock); OSSafeRelease(prelinkRequest); + OSSafeRelease(kextIterator); + return; } @@ -7008,6 +7316,8 @@ OSKext::handleRequest( OSKext * theKext = NULL; // do not release OSBoolean * boolArg = NULL; // do not release + bool hideTheSlide = false; + IORecursiveLockLock(sKextLock); if (responseOut) { @@ -7070,13 +7380,19 @@ OSKext::handleRequest( result = kOSKextReturnNotPrivileged; if (hostPriv == HOST_PRIV_NULL) { - if (!predicate->isEqualTo(kKextRequestPredicateGetLoaded) && - !predicate->isEqualTo(kKextRequestPredicateGetKernelImage) && - !predicate->isEqualTo(kKextRequestPredicateGetKernelLoadAddress)) { - - goto finish; - } - } + if (sPrelinkBoot) { + hideTheSlide = true; + + /* must be root to use these kext requests */ + if (predicate->isEqualTo(kKextRequestPredicateGetKernelLoadAddress) ) { + OSKextLog(/* kext */ NULL, + kOSKextLogErrorLevel | + kOSKextLogIPCFlag, + "Access Failure - must be root user."); + goto finish; + } + } + } /* Get common args in anticipation of use. 
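 *
 * (ed. note) Kernel addresses reported out of this handler are now unslid
 * first.  A sketch of what VM_KERNEL_UNSLIDE amounts to, ignoring the
 * is-this-address-actually-slid range check the real macro performs:
 *
 *   #define VM_KERNEL_UNSLIDE(addr) ((vm_offset_t)(addr) - vm_kernel_slide)
 *
 * so user space sees the pre-KASLR link-time address rather than the live
 * kernel virtual address.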
*/ @@ -7218,20 +7534,13 @@ OSKext::handleRequest( kOSKextLogDebugLevel | kOSKextLogIPCFlag, "Returning kernel load address 0x%llx.", - (unsigned long long)textseg->vmaddr); + (unsigned long long) textseg->vmaddr ); + addressNum = OSNumber::withNumber((long long unsigned int)textseg->vmaddr, 8 * sizeof(long long unsigned int)); responseObject = addressNum; result = kOSReturnSuccess; - } else if (predicate->isEqualTo(kKextRequestPredicateGetKernelImage)) { - OSKextLog(/* kext */ NULL, - kOSKextLogDebugLevel | - kOSKextLogIPCFlag, - "Returning kernel image."); - responseData = OSKext::copySanitizedKernelImage(); - result = kOSReturnSuccess; - } else if (predicate->isEqualTo(kKextRequestPredicateGetKernelRequests)) { /* Hand the current sKernelRequests array to the caller @@ -7360,7 +7669,7 @@ finish: OSDictionary * OSKext::copyLoadedKextInfo( OSArray * kextIdentifiers, - OSArray * infoKeys) + OSArray * infoKeys) { OSDictionary * result = NULL; OSDictionary * kextInfo = NULL; // must release @@ -7494,11 +7803,49 @@ OSKext::copyInfo(OSArray * infoKeys) linkedExecutable->getBytesNoCopy(); if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleMachOHeadersKey)) { + kernel_mach_header_t * temp_kext_mach_hdr; + struct load_command * lcp; + headerData = OSData::withBytes(kext_mach_hdr, (u_int) (sizeof(*kext_mach_hdr) + kext_mach_hdr->sizeofcmds)); if (!headerData) { goto finish; } + + // unslide any vmaddrs we return to userspace - 10726716 + temp_kext_mach_hdr = (kernel_mach_header_t *) + headerData->getBytesNoCopy(); + if (temp_kext_mach_hdr == NULL) { + goto finish; + } + + lcp = (struct load_command *) (temp_kext_mach_hdr + 1); + for (i = 0; i < temp_kext_mach_hdr->ncmds; i++) { + if (lcp->cmd == LC_SEGMENT_KERNEL) { + kernel_segment_command_t * scp; + + scp = (kernel_segment_command_t *) lcp; + // 10543468 - if we jettisoned __LINKEDIT clear size info + if (flags.jettisonLinkeditSeg) { + if (strncmp(scp->segname, SEG_LINKEDIT, sizeof(scp->segname)) == 0) { + scp->vmsize = 0; + scp->fileoff = 0; + scp->filesize = 0; + } + } +#if 0 + OSKextLog(/* kext */ NULL, + kOSKextLogErrorLevel | + kOSKextLogGeneralFlag, + "%s: LC_SEGMENT_KERNEL segname '%s' vmaddr 0x%llX 0x%lX vmsize %llu nsects %u", + __FUNCTION__, scp->segname, scp->vmaddr, + VM_KERNEL_UNSLIDE(scp->vmaddr), + scp->vmsize, scp->nsects); +#endif + scp->vmaddr = VM_KERNEL_UNSLIDE(scp->vmaddr); + } + lcp = (struct load_command *)((caddr_t)lcp + lcp->cmdsize); + } result->setObject(kOSBundleMachOHeadersKey, headerData); } @@ -7656,6 +8003,7 @@ OSKext::copyInfo(OSArray * infoKeys) */ if (linkedExecutable /* && !isInterface() */) { loadAddress = (uint64_t)linkedExecutable->getBytesNoCopy(); + loadAddress = VM_KERNEL_UNSLIDE(loadAddress); loadSize = linkedExecutable->getLength(); /* If we have a kmod_info struct, calculated the wired size @@ -7746,10 +8094,10 @@ OSKext::copyInfo(OSArray * infoKeys) while ( (thisMetaClass = OSDynamicCast(OSMetaClass, metaClassIterator->getNextObject())) ) { - OSSafeReleaseNULL(metaClassDict); - OSSafeReleaseNULL(scratchNumber); - OSSafeReleaseNULL(metaClassName); - OSSafeReleaseNULL(superclassName); + OSSafeReleaseNULL(metaClassDict); + OSSafeReleaseNULL(scratchNumber); + OSSafeReleaseNULL(metaClassName); + OSSafeReleaseNULL(superclassName); metaClassDict = OSDictionary::withCapacity(3); if (!metaClassDict) { @@ -7821,218 +8169,8 @@ finish: return result; } -/********************************************************************/ -static struct symtab_command * getKernelSymtab(void) -{ - struct symtab_command * 
result = NULL; - struct load_command * load_cmd = NULL; - unsigned long i; - - load_cmd = (struct load_command *) - ((uintptr_t)&_mh_execute_header + sizeof(_mh_execute_header)); - for(i = 0; i < _mh_execute_header.ncmds; i++){ - if (load_cmd->cmd == LC_SYMTAB) { - result = (struct symtab_command *)load_cmd; - goto finish; - } - load_cmd = (struct load_command *) - ((uintptr_t)load_cmd + load_cmd->cmdsize); - } - -finish: - return result; -} - /********************************************************************* -*********************************************************************/ -/* static */ -OSData * -OSKext::copySanitizedKernelImage(void) -{ - OSData * result = NULL; - - kernel_mach_header_t * kernelHeader = NULL; - uint32_t sizeofcmds = 0; - - /* These start out pointing to running kernel but - * after copying point to the copied info. - */ - kernel_segment_command_t * text_seg = NULL; - kernel_segment_command_t * data_seg = NULL; - kernel_segment_command_t * linkedit_seg = NULL; - struct symtab_command * symtab_cmd = NULL; - kernel_section_t * text_const_sect = NULL; - kernel_section_t * data_const_sect = NULL; - - kern_return_t kern_result = 0; - u_long kernelCopyLength = 0; - vm_offset_t kernelCopyAddr = 0; - u_char * kernelCopy = NULL; - - vm_offset_t contentOffset = 0; - struct load_command * scan_cmd = NULL; - kernel_section_t * scan_sect = NULL; - int64_t stroff_shift = 0; - - uint32_t i; - - text_seg = getsegbyname("__TEXT"); - data_seg = getsegbyname("__DATA"); - linkedit_seg = getsegbyname("__LINKEDIT"); - symtab_cmd = getKernelSymtab(); - - text_const_sect = getsectbyname("__TEXT", "__const"); - data_const_sect = getsectbyname("__DATA", "__const"); - - if (!text_seg || !data_seg || !linkedit_seg || !symtab_cmd || - !text_const_sect || ! data_const_sect) { - - OSKextLog(/* kext */ NULL, - kOSKextLogErrorLevel | kOSKextLogIPCFlag, - "Can't provide kernel image for linking; missing component."); - goto finish; - } - - /* Figure the size of the kernel image to build. We don't use the sizes of - * the __TEXT & __DATA segments overall as we only use the __const sections, - * so add those in manually. We're going to round each part to page size - * multiples too, just to be extra cautious. - */ - sizeofcmds = text_seg->cmdsize + data_seg->cmdsize + - linkedit_seg->cmdsize + symtab_cmd->cmdsize; - kernelCopyLength = round_page(sizeof(_mh_execute_header) + sizeofcmds) + - round_page(text_const_sect->size) + - round_page(data_const_sect->size) + - round_page(linkedit_seg->filesize); - - kern_result = kmem_alloc(kernel_map, &kernelCopyAddr, kernelCopyLength); - if (kern_result != KERN_SUCCESS) { - goto finish; - } - - kernelCopy = (u_char *)kernelCopyAddr; - bzero(kernelCopy, kernelCopyLength); // ??? - is this really necessary? - - /***** - * Copy the kernel Mach header and the load commands we want. - */ - memcpy(kernelCopy, &_mh_execute_header, sizeof(_mh_execute_header)); - kernelHeader = (kernel_mach_header_t *)kernelCopy; - kernelHeader->ncmds = 0; - kernelHeader->sizeofcmds = sizeofcmds; - contentOffset = round_page(sizeof(_mh_execute_header) + sizeofcmds); - - /* __TEXT segment load command and sections. - * Note that the __TEXT segment's 'offset' and 'filesize' include - * the data from the beginning of the mach header. - * - * Don't muck with the __TEXT segment's vmsize here; - * user-space linking requires it to match what is in the running kernel. - * We'll just have to live with it not being accurate - * (not like we can run the sanitized image after all). 
- */ - scan_cmd = (struct load_command *)&kernelHeader[1]; // just past mach header - memcpy(scan_cmd, text_seg, text_seg->cmdsize); - kernelHeader->ncmds++; - text_seg = (kernel_segment_command_t *)scan_cmd; // retarget to constructed segment - text_seg->fileoff = 0; - text_seg->filesize = round_page(sizeof(_mh_execute_header) + sizeofcmds); - - scan_sect = (kernel_section_t *)(text_seg + 1); - for (i = 0; i < text_seg->nsects; i++, scan_sect++) { - if (0 == strncmp("__const", scan_sect->sectname, sizeof("__const"))) { - text_const_sect = scan_sect; // retarget to constructed section - - text_seg->filesize += scan_sect->size; - - scan_sect->offset = contentOffset; - contentOffset += scan_sect->size; - - memcpy(kernelCopy + scan_sect->offset, (void *)(uintptr_t)scan_sect->addr, - scan_sect->size); - } else { - scan_sect->addr = 0; - scan_sect->size = 0; - scan_sect->offset = contentOffset; - scan_sect->nreloc = 0; - } - } - - contentOffset = round_page(contentOffset); - - /* __DATA segment load command and sections. - * Leave the vmsize as in the running kernel here, too. - */ - scan_cmd = (struct load_command *)((uintptr_t)scan_cmd + scan_cmd->cmdsize); - memcpy(scan_cmd, data_seg, data_seg->cmdsize); - kernelHeader->ncmds++; - data_seg = (kernel_segment_command_t *)scan_cmd; // retarget to constructed segment - data_seg->fileoff = contentOffset; - data_seg->filesize = 0; - - scan_sect = (kernel_section_t *)(data_seg + 1); - for (i = 0; i < data_seg->nsects; i++, scan_sect++) { - if (0 == strncmp("__const", scan_sect->sectname, sizeof("__const"))) { - data_const_sect = scan_sect; // retarget to constructed section - - data_seg->filesize += scan_sect->size; - - scan_sect->offset = contentOffset; - contentOffset += scan_sect->size; - - memcpy(kernelCopy + scan_sect->offset, (void *)(uintptr_t)scan_sect->addr, - scan_sect->size); - } else { - scan_sect->addr = 0; - scan_sect->size = 0; - scan_sect->offset = contentOffset; - scan_sect->nreloc = 0; - } - } - - contentOffset = round_page(contentOffset); - - /* __LINKEDIT segment load command. - * Leave the vmsize as in the running kernel here, too. - */ - scan_cmd = (struct load_command *)((uintptr_t)scan_cmd + scan_cmd->cmdsize); - memcpy(scan_cmd, linkedit_seg, linkedit_seg->cmdsize); - kernelHeader->ncmds++; - linkedit_seg = (kernel_segment_command_t *)scan_cmd; // retarget to constructed segment - linkedit_seg->fileoff = contentOffset; - linkedit_seg->filesize = linkedit_seg->vmsize; - - contentOffset += round_page(linkedit_seg->vmsize); - - memcpy(kernelCopy + linkedit_seg->fileoff, (void *)(uintptr_t)linkedit_seg->vmaddr, - linkedit_seg->vmsize); - - /* __SYMTAB load command (contents shared with __LINKEDIT). - */ - scan_cmd = (struct load_command *)((uintptr_t)scan_cmd + scan_cmd->cmdsize); - memcpy(scan_cmd, symtab_cmd, symtab_cmd->cmdsize); - kernelHeader->ncmds++; - symtab_cmd = (struct symtab_command *)scan_cmd; // retarget to constructed cmd - stroff_shift = symtab_cmd->stroff - symtab_cmd->symoff; - symtab_cmd->symoff = linkedit_seg->fileoff; - symtab_cmd->stroff = symtab_cmd->symoff + stroff_shift; - - /* Wrap the thing up in an OSData. 
- */ - result = OSData::withBytesNoCopy(kernelCopy, kernelCopyLength); - if (result) { - result->setDeallocFunction(osdata_kmem_free); - kernelCopy = NULL; - } - -finish: - if (kernelCopy) kmem_free(kernel_map, kernelCopyAddr, kernelCopyLength); - - return result; -} - -/********************************************************************* -*********************************************************************/ + *********************************************************************/ /* static */ OSReturn OSKext::requestResource( @@ -8068,8 +8206,8 @@ OSKext::requestResource( OSKextLog(/* kext */ NULL, kOSKextLogErrorLevel | kOSKextLogIPCFlag, "Can't request resource %s for %s - requests to user space are disabled.", - resourceNameCString, - kextIdentifierCString); + resourceNameCString, + kextIdentifierCString); result = kOSKextReturnDisabled; goto finish; } @@ -8368,17 +8506,13 @@ OSKext::dispatchResource(OSDictionary * requestDict) if (!callbackKext) { OSKextLog(/* kext */ NULL, kOSKextLogErrorLevel | kOSKextLogIPCFlag, - "Can't invoke callback for resource request; " - "no kext loaded at callback address %p.", - callback); + "Can't invoke callback for resource request; "); goto finish; } if (!callbackKext->flags.starting && !callbackKext->flags.started) { OSKextLog(/* kext */ NULL, kOSKextLogErrorLevel | kOSKextLogIPCFlag, - "Can't invoke kext resource callback; " - "kext at callback address %p is not running.", - callback); + "Can't invoke kext resource callback; "); goto finish; } @@ -8713,6 +8847,63 @@ finish: return result; } +/********************************************************************* + * We really only care about boot / system start up related kexts. + * We return true if we're less than REBUILD_MAX_TIME since start up, + * otherwise return false. 
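+ * (REBUILD_MAX_TIME is the cutoff, expressed in seconds of uptime.)
+ *
+ * For reference, a minimal sketch of the abstime -> nanoseconds ->
+ * seconds conversion the check below relies on, using the same kernel
+ * KPIs the function itself calls; the helper name is illustrative
+ * only and is not part of this file:
+ *
+ *     static SInt32 uptime_secs(void)
+ *     {
+ *         AbsoluteTime now;
+ *         UInt64       ns;
+ *
+ *         clock_get_uptime(&now);
+ *         absolutetime_to_nanoseconds(now, &ns);
+ *         return (SInt32)(ns / NSEC_PER_SEC);
+ *     }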
+ *********************************************************************/
+bool _OSKextInPrelinkRebuildWindow(void)
+{
+    static bool     outside_the_window = false;
+    AbsoluteTime    my_abstime;
+    UInt64          my_ns;
+    SInt32          my_secs;
+
+    if (outside_the_window) {
+        return(false);
+    }
+    clock_get_uptime(&my_abstime);
+    absolutetime_to_nanoseconds(my_abstime, &my_ns);
+    my_secs = (SInt32)(my_ns / NSEC_PER_SEC);
+    if (my_secs > REBUILD_MAX_TIME) {
+        outside_the_window = true;
+        return(false);
+    }
+    return(true);
+}
+
+/*********************************************************************
+ *********************************************************************/
+bool _OSKextInUnloadedPrelinkedKexts( const OSSymbol * theBundleID )
+{
+    int     unLoadedCount, i;
+    bool    result = false;
+
+    IORecursiveLockLock(sKextLock);
+
+    if (sUnloadedPrelinkedKexts == NULL) {
+        goto finish;
+    }
+    unLoadedCount = sUnloadedPrelinkedKexts->getCount();
+    if (unLoadedCount == 0) {
+        goto finish;
+    }
+
+    for (i = 0; i < unLoadedCount; i++) {
+        const OSSymbol *    myBundleID;     // do not release
+
+        myBundleID = OSDynamicCast(OSSymbol, sUnloadedPrelinkedKexts->getObject(i));
+        if (!myBundleID) continue;
+        if (theBundleID->isEqualTo(myBundleID->getCStringNoCopy())) {
+            result = true;
+            break;
+        }
+    }
+finish:
+    IORecursiveLockUnlock(sKextLock);
+    return(result);
+}
+
 #if PRAGMA_MARK
 #pragma mark Personalities (IOKit Drivers)
 #endif
@@ -9319,6 +9510,195 @@ finish:
     return;
 }

+#if KASLR_IOREG_DEBUG
+
+#define IOLOG_INDENT( the_indention ) \
+{ \
+    int i; \
+    for ( i = 0; i < (the_indention); i++ ) { \
+        IOLog(" "); \
+    } \
+}
+
+extern vm_offset_t vm_kernel_stext;
+extern vm_offset_t vm_kernel_etext;
+extern mach_vm_offset_t kext_alloc_base;
+extern mach_vm_offset_t kext_alloc_max;
+
+bool ScanForAddrInObject(OSObject * theObject,
+                         int indent );
+
+bool ScanForAddrInObject(OSObject * theObject,
+                         int indent)
+{
+    const OSMetaClass *     myTypeID;
+    OSCollectionIterator *  myIter;
+    OSSymbol *              myKey;
+    OSObject *              myValue;
+    bool                    myResult = false;
+
+    if ( theObject == NULL ) {
+        IOLog("%s: theObject is NULL \n",
+              __FUNCTION__);
+        return myResult;
+    }
+
+    myTypeID = OSTypeIDInst(theObject);
+
+    if ( myTypeID == OSTypeID(OSDictionary) ) {
+        OSDictionary *  myDictionary;
+
+        myDictionary = OSDynamicCast(OSDictionary, theObject);
+        myIter = OSCollectionIterator::withCollection( myDictionary );
+        if ( myIter == NULL )
+            return myResult;
+        myIter->reset();
+
+        while ( (myKey = OSDynamicCast(OSSymbol, myIter->getNextObject())) ) {
+            bool    myTempResult;
+
+            myValue = myDictionary->getObject(myKey);
+            myTempResult = ScanForAddrInObject(myValue, (indent + 4));
+            if (myTempResult) {
+                // if we ever get a true result return true
+                myResult = true;
+                IOLOG_INDENT(indent);
+                IOLog("OSDictionary key \"%s\" \n", myKey->getCStringNoCopy());
+            }
+        }
+        myIter->release();
+    }
+    else if ( myTypeID == OSTypeID(OSArray) ) {
+        OSArray *   myArray;
+
+        myArray = OSDynamicCast(OSArray, theObject);
+        myIter = OSCollectionIterator::withCollection(myArray);
+        if ( myIter == NULL )
+            return myResult;
+        myIter->reset();
+
+        while ( (myValue = myIter->getNextObject()) ) {
+            bool    myTempResult;
+            myTempResult = ScanForAddrInObject(myValue, (indent + 4));
+            if (myTempResult) {
+                // if we ever get a true result return true
+                myResult = true;
+                IOLOG_INDENT(indent);
+                IOLog("OSArray: \n");
+            }
+        }
+        myIter->release();
+    }
+    else if ( myTypeID == OSTypeID(OSString) || myTypeID == OSTypeID(OSSymbol) ) {
+
+        // should we look for addresses in strings?
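+        // (If scanning strings ever proves useful, one hypothetical
+        // approach would be to parse the text as a hex value and apply
+        // the same range checks used for OSNumber below, e.g.:
+        //     UInt64 v = strtouq(myStr->getCStringNoCopy(), NULL, 16);
+        //     if (v >= kext_alloc_base && v < kext_alloc_max) { ... }
+        // where 'myStr' is the OSString cast of theObject.)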
+    }
+    else if ( myTypeID == OSTypeID(OSData) ) {
+
+        void * *        myPtrPtr;
+        unsigned int    myLen;
+        OSData *        myDataObj;
+
+        myDataObj = OSDynamicCast(OSData, theObject);
+        myPtrPtr = (void * *) myDataObj->getBytesNoCopy();
+        myLen = myDataObj->getLength();
+
+        if (myPtrPtr && myLen && myLen > 7) {
+            int     i;
+            int     myPtrCount = (myLen / sizeof(void *));
+
+            for (i = 0; i < myPtrCount; i++) {
+                UInt64 numberValue = (UInt64) *(myPtrPtr);
+
+                if ( kext_alloc_max != 0 &&
+                     numberValue >= kext_alloc_base &&
+                     numberValue < kext_alloc_max ) {
+
+                    OSKext * myKext = NULL;    // must release (looked up)
+                    // IOLog("found OSData %p in kext map %p to %p \n",
+                    //       *(myPtrPtr),
+                    //       (void *) kext_alloc_base,
+                    //       (void *) kext_alloc_max);
+
+                    myKext = OSKext::lookupKextWithAddress( (vm_address_t) *(myPtrPtr) );
+                    if (myKext) {
+                        IOLog("found addr %p from an OSData obj within kext \"%s\" \n",
+                              *(myPtrPtr),
+                              myKext->getIdentifierCString());
+                        myKext->release();
+                    }
+                    myResult = true;
+                }
+                if ( vm_kernel_etext != 0 &&
+                     numberValue >= vm_kernel_stext &&
+                     numberValue < vm_kernel_etext ) {
+                    IOLog("found addr %p from an OSData obj within kernel text segment %p to %p \n",
+                          *(myPtrPtr),
+                          (void *) vm_kernel_stext,
+                          (void *) vm_kernel_etext);
+                    myResult = true;
+                }
+                myPtrPtr++;
+            }
+        }
+    }
+    else if ( myTypeID == OSTypeID(OSBoolean) ) {
+
+        // do nothing here...
+    }
+    else if ( myTypeID == OSTypeID(OSNumber) ) {
+
+        OSNumber * number = OSDynamicCast(OSNumber, theObject);
+
+        UInt64 numberValue = number->unsigned64BitValue();
+
+        if ( kext_alloc_max != 0 &&
+             numberValue >= kext_alloc_base &&
+             numberValue < kext_alloc_max ) {
+
+            OSKext * myKext = NULL;    // must release (looked up)
+            IOLog("found OSNumber in kext map %p to %p \n",
+                  (void *) kext_alloc_base,
+                  (void *) kext_alloc_max);
+            IOLog("OSNumber 0x%08llx (%llu) \n", numberValue, numberValue);
+
+            myKext = OSKext::lookupKextWithAddress( (vm_address_t) numberValue );
+            if (myKext) {
+                IOLog("found in kext \"%s\" \n",
+                      myKext->getIdentifierCString());
+                myKext->release();
+            }
+
+            myResult = true;
+        }
+        if ( vm_kernel_etext != 0 &&
+             numberValue >= vm_kernel_stext &&
+             numberValue < vm_kernel_etext ) {
+            IOLog("found OSNumber in kernel text segment %p to %p \n",
+                  (void *) vm_kernel_stext,
+                  (void *) vm_kernel_etext);
+            IOLog("OSNumber 0x%08llx (%llu) \n", numberValue, numberValue);
+            myResult = true;
+        }
+    }
+#if 0
+    else {
+        const OSMetaClass* myMetaClass = NULL;
+
+        myMetaClass = theObject->getMetaClass();
+        if ( myMetaClass ) {
+            IOLog("class %s \n", myMetaClass->getClassName() );
+        }
+        else {
+            IOLog("Unknown object \n" );
+        }
+    }
+#endif
+
+    return myResult;
+}
+#endif // KASLR_IOREG_DEBUG
+
 }; /* extern "C" */

 #if PRAGMA_MARK
@@ -9920,8 +10300,6 @@ OSKext::updateLoadedKextSummaries(void)
         start = (vm_map_offset_t) summaryHeader;
         end = start + summarySize;

-        result = vm_map_protect(kernel_map, start, end, VM_PROT_DEFAULT, FALSE);
-        if (result != KERN_SUCCESS) goto finish;
     }

     /* Populate the summary header.
@@ -9948,8 +10326,6 @@ OSKext::updateLoadedKextSummaries(void) start = (vm_map_offset_t) summaryHeader; end = start + summarySize; - result = vm_map_protect(kernel_map, start, end, VM_PROT_READ, FALSE); - if (result != KERN_SUCCESS) goto finish; sPrevLoadedKextSummaries = gLoadedKextSummaries; sPrevLoadedKextSummariesAllocSize = sLoadedKextSummariesAllocSize; @@ -10107,426 +10483,65 @@ finish: return result; } #endif /* __i386__ */ + +#if CONFIG_KEC_FIPS + #if PRAGMA_MARK -#pragma mark MAC Framework Support +#pragma mark Kernel External Components for FIPS compliance #endif + /********************************************************************* -*********************************************************************/ -#if CONFIG_MACF_KEXT -/* MAC Framework support */ - -/* - * define IOC_DEBUG to display run-time debugging information - * #define IOC_DEBUG 1 - */ - -#ifdef IOC_DEBUG -#define DPRINTF(x) printf x -#else -#define IOC_DEBUG -#define DPRINTF(x) -#endif - -/********************************************************************* -*********************************************************************/ -static bool -MACFObjectIsPrimitiveType(OSObject * obj) -{ - const OSMetaClass * typeID = NULL; // do not release - - typeID = OSTypeIDInst(obj); - if (typeID == OSTypeID(OSString) || typeID == OSTypeID(OSNumber) || - typeID == OSTypeID(OSBoolean) || typeID == OSTypeID(OSData)) { - - return true; - } - return false; -} - -/********************************************************************* -*********************************************************************/ -static int -MACFLengthForObject(OSObject * obj) -{ - const OSMetaClass * typeID = NULL; // do not release - int len; - - typeID = OSTypeIDInst(obj); - if (typeID == OSTypeID(OSString)) { - OSString * stringObj = OSDynamicCast(OSString, obj); - len = stringObj->getLength() + 1; - } else if (typeID == OSTypeID(OSNumber)) { - len = sizeof("4294967295"); /* UINT32_MAX */ - } else if (typeID == OSTypeID(OSBoolean)) { - OSBoolean * boolObj = OSDynamicCast(OSBoolean, obj); - len = (boolObj == kOSBooleanTrue) ? 
sizeof("true") : sizeof("false"); - } else if (typeID == OSTypeID(OSData)) { - OSData * dataObj = OSDynamicCast(OSData, obj); - len = dataObj->getLength(); - } else { - len = 0; - } - return len; -} - -/********************************************************************* -*********************************************************************/ -static void -MACFInitElementFromObject( - struct mac_module_data_element * element, - OSObject * value) -{ - const OSMetaClass * typeID = NULL; // do not release - - typeID = OSTypeIDInst(value); - if (typeID == OSTypeID(OSString)) { - OSString * stringObj = OSDynamicCast(OSString, value); - element->value_type = MAC_DATA_TYPE_PRIMITIVE; - element->value_size = stringObj->getLength() + 1; - DPRINTF(("osdict: string %s size %d\n", - stringObj->getCStringNoCopy(), element->value_size)); - memcpy(element->value, stringObj->getCStringNoCopy(), - element->value_size); - } else if (typeID == OSTypeID(OSNumber)) { - OSNumber * numberObj = OSDynamicCast(OSNumber, value); - element->value_type = MAC_DATA_TYPE_PRIMITIVE; - element->value_size = sprintf(element->value, "%u", - numberObj->unsigned32BitValue()) + 1; - } else if (typeID == OSTypeID(OSBoolean)) { - OSBoolean * boolObj = OSDynamicCast(OSBoolean, value); - element->value_type = MAC_DATA_TYPE_PRIMITIVE; - if (boolObj == kOSBooleanTrue) { - strcpy(element->value, "true"); - element->value_size = 5; - } else { - strcpy(element->value, "false"); - element->value_size = 6; - } - } else if (typeID == OSTypeID(OSData)) { - OSData * dataObj = OSDynamicCast(OSData, value); - element->value_type = MAC_DATA_TYPE_PRIMITIVE; - element->value_size = dataObj->getLength(); - DPRINTF(("osdict: data size %d\n", dataObj->getLength())); - memcpy(element->value, dataObj->getBytesNoCopy(), - element->value_size); - } - return; -} - -/********************************************************************* -* This function takes an OSDictionary and returns a struct mac_module_data -* list. 
-*********************************************************************/ -static struct mac_module_data * -MACFEncodeOSDictionary(OSDictionary * dict) + * Kernel External Components for FIPS compliance (KEC_FIPS) + *********************************************************************/ +static void * +GetAppleTEXTHashForKext(OSKext * theKext, OSDictionary *theInfoDict) { - struct mac_module_data * result = NULL; // do not free - const OSMetaClass * typeID = NULL; // do not release - OSString * key = NULL; // do not release - OSCollectionIterator * keyIterator = NULL; // must release - struct mac_module_data_element * element = NULL; // do not free - unsigned int strtabsize = 0; - unsigned int listtabsize = 0; - unsigned int dicttabsize = 0; - unsigned int nkeys = 0; - unsigned int datalen = 0; - char * strtab = NULL; // do not free - char * listtab = NULL; // do not free - char * dicttab = NULL; // do not free - vm_offset_t data_addr = 0; + AppleTEXTHash_t my_ath = {1, 0, NULL}; + AppleTEXTHash_t * my_athp = NULL; // do not release + OSDictionary * textHashDict = NULL; // do not release + OSData * segmentHash = NULL; // do not release - keyIterator = OSCollectionIterator::withCollection(dict); - if (!keyIterator) { - goto finish; + if (theKext == NULL || theInfoDict == NULL) { + return(NULL); } - /* Iterate over OSModuleData to figure out total size */ - while ( (key = OSDynamicCast(OSString, keyIterator->getNextObject())) ) { - - // Get the key's value and determine its type - OSObject * value = dict->getObject(key); - if (!value) { - continue; - } - - typeID = OSTypeIDInst(value); - if (MACFObjectIsPrimitiveType(value)) { - strtabsize += MACFLengthForObject(value); - } - else if (typeID == OSTypeID(OSArray)) { - unsigned int k, cnt, nents; - OSArray * arrayObj = OSDynamicCast(OSArray, value); - - nents = 0; - cnt = arrayObj->getCount(); - for (k = 0; k < cnt; k++) { - value = arrayObj->getObject(k); - typeID = OSTypeIDInst(value); - if (MACFObjectIsPrimitiveType(value)) { - listtabsize += MACFLengthForObject(value); - nents++; - } - else if (typeID == OSTypeID(OSDictionary)) { - unsigned int dents = 0; - OSDictionary * dictObj = NULL; // do not release - OSString * dictkey = NULL; // do not release - OSCollectionIterator * dictIterator = NULL; // must release - - dictObj = OSDynamicCast(OSDictionary, value); - dictIterator = OSCollectionIterator::withCollection(dictObj); - if (!dictIterator) { - goto finish; - } - while ((dictkey = OSDynamicCast(OSString, - dictIterator->getNextObject()))) { - - OSObject * dictvalue = NULL; // do not release - - dictvalue = dictObj->getObject(dictkey); - if (!dictvalue) { - continue; - } - if (MACFObjectIsPrimitiveType(dictvalue)) { - strtabsize += MACFLengthForObject(dictvalue); - } else { - continue; /* Only handle primitive types here. */ - } - /* - * Allow for the "arraynnn/" prefix in the key length. - */ - strtabsize += dictkey->getLength() + 1; - dents++; - } - dictIterator->release(); - if (dents-- > 0) { - dicttabsize += sizeof(struct mac_module_data_list) + - dents * sizeof(struct mac_module_data_element); - nents++; - } - } - else { - continue; /* Skip everything else. 
*/ - } - } - if (nents == 0) { - continue; - } - listtabsize += sizeof(struct mac_module_data_list) + - (nents - 1) * sizeof(struct mac_module_data_element); - } else { - continue; /* skip anything else */ - } - strtabsize += key->getLength() + 1; - nkeys++; + textHashDict = OSDynamicCast(OSDictionary, theInfoDict->getObject(kAppleTextHashesKey)); + if (textHashDict == NULL) { + return(NULL); } - if (nkeys == 0) { - goto finish; + + segmentHash = OSDynamicCast(OSData, + textHashDict->getObject(ARCHNAME)); + if (segmentHash == NULL) { + return(NULL); } - /* - * Allocate and fill in the module data structures. - */ - datalen = sizeof(struct mac_module_data) + - sizeof(mac_module_data_element) * (nkeys - 1) + - strtabsize + listtabsize + dicttabsize; - DPRINTF(("osdict: datalen %d strtabsize %d listtabsize %d dicttabsize %d\n", - datalen, strtabsize, listtabsize, dicttabsize)); - if (kmem_alloc(kernel_map, &data_addr, datalen) != KERN_SUCCESS) { - goto finish; + // KEC_FIPS type kexts never unload so we don't have to clean up our + // AppleTEXTHash_t + if (kmem_alloc(kernel_map, (vm_offset_t *) &my_athp, + sizeof(AppleTEXTHash_t)) != KERN_SUCCESS) { + return(NULL); } - result = (mac_module_data *)data_addr; - result->base_addr = data_addr; - result->size = datalen; - result->count = nkeys; - strtab = (char *)&result->data[nkeys]; - listtab = strtab + strtabsize; - dicttab = listtab + listtabsize; - DPRINTF(("osdict: data_addr %p strtab %p listtab %p dicttab %p end %p\n", - data_addr, strtab, listtab, dicttab, data_addr + datalen)); - keyIterator->reset(); - nkeys = 0; - element = &result->data[0]; - DPRINTF(("osdict: element %p\n", element)); - while ( (key = OSDynamicCast(OSString, keyIterator->getNextObject())) ) { - - // Get the key's value and determine its type - OSObject * value = dict->getObject(key); - if (!value) { - continue; - } + memcpy(my_athp, &my_ath, sizeof(my_ath)); + my_athp->ath_length = segmentHash->getLength(); + if (my_athp->ath_length > 0) { + my_athp->ath_hash = (void *)segmentHash->getBytesNoCopy(); + } - /* Store key */ - DPRINTF(("osdict: element @%p\n", element)); - element->key = strtab; - element->key_size = key->getLength() + 1; - DPRINTF(("osdict: key %s size %d @%p\n", key->getCStringNoCopy(), - element->key_size, strtab)); - memcpy(element->key, key->getCStringNoCopy(), element->key_size); +#if 0 + OSKextLog(theKext, + kOSKextLogErrorLevel | + kOSKextLogGeneralFlag, + "Kext %s ath_version %d ath_length %d ath_hash %p", + theKext->getIdentifierCString(), + my_athp->ath_version, + my_athp->ath_length, + my_athp->ath_hash); +#endif - typeID = OSTypeIDInst(value); - if (MACFObjectIsPrimitiveType(value)) { - /* Store value */ - element->value = element->key + element->key_size; - DPRINTF(("osdict: primitive element value %p\n", element->value)); - MACFInitElementFromObject(element, value); - strtab += element->key_size + element->value_size; - DPRINTF(("osdict: new strtab %p\n", strtab)); - } else if (typeID == OSTypeID(OSArray)) { - unsigned int k, cnt, nents; - char *astrtab; - struct mac_module_data_list *arrayhd; - struct mac_module_data_element *ele; - OSArray *arrayObj = OSDynamicCast(OSArray, value); - - element->value = listtab; - DPRINTF(("osdict: array element value %p\n", element->value)); - element->value_type = MAC_DATA_TYPE_ARRAY; - arrayhd = (struct mac_module_data_list *)element->value; - arrayhd->type = 0; - DPRINTF(("osdict: arrayhd %p\n", arrayhd)); - nents = 0; - astrtab = strtab + element->key_size; - ele = &(arrayhd->list[0]); - cnt = 
arrayObj->getCount(); - for (k = 0; k < cnt; k++) { - value = arrayObj->getObject(k); - DPRINTF(("osdict: array ele %d @%p\n", nents, ele)); - ele->key = NULL; - ele->key_size = 0; - typeID = OSTypeIDInst(value); - if (MACFObjectIsPrimitiveType(value)) { - if (arrayhd->type != 0 && - arrayhd->type != MAC_DATA_TYPE_PRIMITIVE) { - - continue; - } - arrayhd->type = MAC_DATA_TYPE_PRIMITIVE; - ele->value = astrtab; - MACFInitElementFromObject(ele, value); - astrtab += ele->value_size; - DPRINTF(("osdict: array new astrtab %p\n", astrtab)); - } else if (typeID == OSTypeID(OSDictionary)) { - unsigned int dents; - char * dstrtab = NULL; // do not free - OSDictionary * dictObj = NULL; // do not release - OSString * dictkey = NULL; // do not release - OSCollectionIterator * dictIterator = NULL; // must release - struct mac_module_data_list * dicthd = NULL; // do not free - struct mac_module_data_element * dele = NULL; // do not free - - if (arrayhd->type != 0 && - arrayhd->type != MAC_DATA_TYPE_DICT) { - - continue; - } - dictObj = OSDynamicCast(OSDictionary, value); - dictIterator = OSCollectionIterator::withCollection(dictObj); - if (!dictIterator) { - goto finish; - } - DPRINTF(("osdict: dict\n")); - ele->value = dicttab; - ele->value_type = MAC_DATA_TYPE_DICT; - dicthd = (struct mac_module_data_list *)ele->value; - DPRINTF(("osdict: dicthd %p\n", dicthd)); - dstrtab = astrtab; - dents = 0; - while ((dictkey = OSDynamicCast(OSString, - dictIterator->getNextObject()))) { - - OSObject * dictvalue = NULL; // do not release - - dictvalue = dictObj->getObject(dictkey); - if (!dictvalue) { - continue; - } - dele = &(dicthd->list[dents]); - DPRINTF(("osdict: dict ele %d @%p\n", dents, dele)); - if (MACFObjectIsPrimitiveType(dictvalue)) { - dele->key = dstrtab; - dele->key_size = dictkey->getLength() + 1; - DPRINTF(("osdict: dictkey %s size %d @%p\n", - dictkey->getCStringNoCopy(), dictkey->getLength(), dstrtab)); - memcpy(dele->key, dictkey->getCStringNoCopy(), - dele->key_size); - dele->value = dele->key + dele->key_size; - MACFInitElementFromObject(dele, dictvalue); - dstrtab += dele->key_size + dele->value_size; - DPRINTF(("osdict: dict new dstrtab %p\n", dstrtab)); - } else { - continue; /* Only handle primitive types here. */ - } - dents++; - } - dictIterator->release(); - if (dents == 0) { - continue; - } - arrayhd->type = MAC_DATA_TYPE_DICT; - ele->value_size = sizeof(struct mac_module_data_list) + - (dents - 1) * sizeof(struct mac_module_data_element); - DPRINTF(("osdict: dict ele size %d ents %d\n", ele->value_size, dents)); - dicttab += ele->value_size; - DPRINTF(("osdict: new dicttab %p\n", dicttab)); - dicthd->count = dents; - astrtab = dstrtab; - } else { - continue; /* Skip everything else. */ - } - nents++; - ele++; - } - if (nents == 0) { - continue; - } - element->value_size = sizeof(struct mac_module_data_list) + - (nents - 1) * sizeof(struct mac_module_data_element); - listtab += element->value_size; - DPRINTF(("osdict: new listtab %p\n", listtab)); - arrayhd->count = nents; - strtab = astrtab; - DPRINTF(("osdict: new strtab %p\n", strtab)); - } else { - continue; /* skip anything else */ - } - element++; - } - DPRINTF(("result list @%p, key %p value %p\n", - result, result->data[0].key, result->data[0].value)); -finish: - if (keyIterator) keyIterator->release(); - return result; + return( (void *) my_athp ); } - -/********************************************************************* -* This function takes a plist and looks for an OSModuleData dictionary. 
-* If it is found, an encoded copy is returned. The value must be -* kmem_free()'d. -*********************************************************************/ -static void * -MACFCopyModuleDataForKext( - OSKext * theKext, - mach_msg_type_number_t * datalen) - -{ - struct mac_module_data * result = NULL; - OSDictionary * kextModuleData = NULL; // do not release - vm_map_copy_t copy = 0; - kextModuleData = OSDynamicCast(OSDictionary, - theKext->getPropertyForHostArch("OSModuleData")); - if (!kextModuleData) { - goto finish; - } - - result = MACFEncodeOSDictionary(kextModuleData); - if (!result) { - goto finish; - } - *datalen = module_data->size; +#endif // CONFIG_KEC_FIPS -finish: - return (void *)result; -} -#endif /* CONFIG_MACF_KEXT */ diff --git a/libkern/c++/OSMetaClass.cpp b/libkern/c++/OSMetaClass.cpp index 0696e2b02..009383888 100644 --- a/libkern/c++/OSMetaClass.cpp +++ b/libkern/c++/OSMetaClass.cpp @@ -95,7 +95,9 @@ static enum { static const int kClassCapacityIncrement = 40; static const int kKModCapacityIncrement = 10; static OSDictionary * sAllClassesDict; +static unsigned int sDeepestClass; IOLock * sAllClassesLock = NULL; +IOLock * sInstancesLock = NULL; /* * While loading a kext and running all its constructors to register @@ -111,6 +113,13 @@ static struct StalledData { } * sStalled; IOLock * sStalledClassesLock = NULL; + +struct ExpansionData { + OSOrderedSet * instances; + OSKext * kext; +}; + + #if PRAGMA_MARK #pragma mark OSMetaClassBase #endif /* PRAGMA_MARK */ @@ -118,6 +127,7 @@ IOLock * sStalledClassesLock = NULL; * OSMetaClassBase. *********************************************************************/ +#if APPLE_KEXT_VTABLE_PADDING /********************************************************************* * Reserved vtable functions. *********************************************************************/ @@ -139,7 +149,8 @@ void OSMetaClassBase::_RESERVEDOSMetaClassBase5() { panic("OSMetaClassBase::_RESERVEDOSMetaClassBase%d called.", 5); } void OSMetaClassBase::_RESERVEDOSMetaClassBase6() { panic("OSMetaClassBase::_RESERVEDOSMetaClassBase%d called.", 6); } - +#endif + /********************************************************************* * These used to be inline in the header but gcc didn't believe us * Now we MUST pull the inline out at least until the compiler is @@ -176,8 +187,10 @@ initialize() { sAllClassesLock = IOLockAlloc(); sStalledClassesLock = IOLockAlloc(); + sInstancesLock = IOLockAlloc(); } +#if APPLE_KEXT_VTABLE_PADDING /********************************************************************* * If you need this slot you had better setup an IOCTL style interface. * 'Cause the whole kernel world depends on OSMetaClassBase and YOU @@ -186,6 +199,7 @@ initialize() void OSMetaClassBase::_RESERVEDOSMetaClassBase7() { panic("OSMetaClassBase::_RESERVEDOSMetaClassBase%d called.", 7); } +#endif /********************************************************************* *********************************************************************/ @@ -285,6 +299,7 @@ const OSMetaClass * OSMetaClass::getMetaClass() const * OSMetaClass *********************************************************************/ +#if APPLE_KEXT_VTABLE_PADDING /********************************************************************* * Reserved functions. 
*********************************************************************/ @@ -304,6 +319,7 @@ void OSMetaClass::_RESERVEDOSMetaClass6() { panic("OSMetaClass::_RESERVEDOSMetaClass%d called", 6); } void OSMetaClass::_RESERVEDOSMetaClass7() { panic("OSMetaClass::_RESERVEDOSMetaClass%d called", 7); } +#endif /********************************************************************* *********************************************************************/ @@ -377,6 +393,9 @@ OSMetaClass::OSMetaClass( classSize = inClassSize; superClassLink = inSuperClass; + reserved = IONew(ExpansionData, 1); + bzero(reserved, sizeof(ExpansionData)); + /* Hack alert: We are just casting inClassName and storing it in * an OSString * instance variable. This may be because you can't * create C++ objects in static constructors, but I really don't know! @@ -420,7 +439,7 @@ OSMetaClass::OSMetaClass( *********************************************************************/ OSMetaClass::~OSMetaClass() { - OSKext * myKext = (OSKext *)reserved; // do not release + OSKext * myKext = reserved ? reserved->kext : 0; // do not release /* Hack alert: 'className' is a C string during early C++ init, and * is converted to a real OSSymbol only when we record the OSKext in @@ -494,7 +513,13 @@ OSMetaClass::getClassName() const if (!className) return NULL; return className->getCStringNoCopy(); } - +/********************************************************************* +*********************************************************************/ +const OSSymbol * +OSMetaClass::getClassNameSymbol() const +{ + return className; +} /********************************************************************* *********************************************************************/ unsigned int @@ -571,6 +596,7 @@ OSMetaClass::postModLoad(void * loadHandle) result = kOSMetaClassNoDicts; break; } + sAllClassesDict->setOptions(OSCollection::kSort, OSCollection::kSort); // No break; fall through @@ -605,7 +631,7 @@ OSMetaClass::postModLoad(void * loadHandle) */ IOLockLock(sAllClassesLock); for (i = 0; i < sStalled->count; i++) { - OSMetaClass * me = sStalled->classes[i]; + const OSMetaClass * me = sStalled->classes[i]; OSMetaClass * orig = OSDynamicCast(OSMetaClass, sAllClassesDict->getObject((const char *)me->className)); @@ -618,10 +644,13 @@ OSMetaClass::postModLoad(void * loadHandle) "OSMetaClass: Kext %s class %s is a duplicate;" "kext %s already has a class by that name.", sStalled->kextIdentifier, (const char *)me->className, - ((OSKext *)orig->reserved)->getIdentifierCString()); + ((OSKext *)orig->reserved->kext)->getIdentifierCString()); result = kOSMetaClassDuplicateClass; break; } + unsigned int depth = 1; + while ((me = me->superClassLink)) depth++; + if (depth > sDeepestClass) sDeepestClass = depth; } IOLockUnlock(sAllClassesLock); @@ -649,7 +678,7 @@ OSMetaClass::postModLoad(void * loadHandle) /* Do not retain the kext object here. 
*/ - me->reserved = (ExpansionData *)myKext; + me->reserved->kext = myKext; if (myKext) { result = myKext->addClass(me, sStalled->count); if (result != kOSReturnSuccess) { @@ -718,7 +747,7 @@ OSMetaClass::instanceDestructed() const } if (((int)instanceCount) < 0) { - OSKext * myKext = (OSKext *)reserved; + OSKext * myKext = reserved->kext; OSKextLog(myKext, kOSMetaClassLogSpec, // xxx - this phrasing is rather cryptic @@ -756,6 +785,138 @@ OSMetaClass::reportModInstances(const char * kextIdentifier) kOSKextLogExplicitLevel); return; } +/********************************************************************* +*********************************************************************/ + +void +OSMetaClass::addInstance(const OSObject * instance, bool super) const +{ + if (!super) IOLockLock(sInstancesLock); + + if (!reserved->instances) { + reserved->instances = OSOrderedSet::withCapacity(16); + if (superClassLink) { + superClassLink->addInstance(reserved->instances, true); + } + } + reserved->instances->setLastObject(instance); + + if (!super) IOLockUnlock(sInstancesLock); +} + +void +OSMetaClass::removeInstance(const OSObject * instance, bool super) const +{ + if (!super) IOLockLock(sInstancesLock); + + if (reserved->instances) { + reserved->instances->removeObject(instance); + if (0 == reserved->instances->getCount()) { + if (superClassLink) { + superClassLink->removeInstance(reserved->instances, true); + } + reserved->instances->release(); + reserved->instances = 0; + } + } + + if (!super) IOLockUnlock(sInstancesLock); +} + +void +OSMetaClass::applyToInstances(OSOrderedSet * set, + OSMetaClassInstanceApplierFunction applier, + void * context) +{ + enum { kLocalDepth = 24 }; + unsigned int _nextIndex[kLocalDepth]; + OSOrderedSet * _sets[kLocalDepth]; + unsigned int * nextIndex = &_nextIndex[0]; + OSOrderedSet ** sets = &_sets[0]; + OSObject * obj; + OSOrderedSet * childSet; + unsigned int maxDepth; + unsigned int idx; + unsigned int level; + bool done; + + maxDepth = sDeepestClass; + if (maxDepth > kLocalDepth) + { + nextIndex = IONew(typeof(nextIndex[0]), maxDepth); + sets = IONew(typeof(sets[0]), maxDepth); + } + done = false; + level = 0; + idx = 0; + do + { + while (!done && (obj = set->getObject(idx++))) + { + if ((childSet = OSDynamicCast(OSOrderedSet, obj))) + { + if (level >= maxDepth) panic(">maxDepth"); + sets[level] = set; + nextIndex[level] = idx; + level++; + set = childSet; + idx = 0; + break; + } + done = (*applier)(obj, context); + } + if (!obj) + { + if (!done && level) + { + level--; + set = sets[level]; + idx = nextIndex[level]; + } else done = true; + } + } + while (!done); + if (maxDepth > kLocalDepth) + { + IODelete(nextIndex, typeof(nextIndex[0]), maxDepth); + IODelete(sets, typeof(sets[0]), maxDepth); + } +} + +void +OSMetaClass::applyToInstances(OSMetaClassInstanceApplierFunction applier, + void * context) const +{ + IOLockLock(sInstancesLock); + if (reserved->instances) applyToInstances(reserved->instances, applier, context); + IOLockUnlock(sInstancesLock); +} + +void +OSMetaClass::applyToInstancesOfClassName( + const OSSymbol * name, + OSMetaClassInstanceApplierFunction applier, + void * context) +{ + OSMetaClass * meta; + OSOrderedSet * set = 0; + + IOLockLock(sAllClassesLock); + if (sAllClassesDict + && (meta = (OSMetaClass *) sAllClassesDict->getObject(name)) + && (set = meta->reserved->instances)) + { + set->retain(); + } + IOLockUnlock(sAllClassesLock); + + if (!set) return; + + IOLockLock(sInstancesLock); + applyToInstances(set, applier, context); + 
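// NOTE: 'set' was retained above while sAllClassesLock was held, so it stays valid here even if its class is unregistered during the walk. +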
IOLockUnlock(sInstancesLock); + set->release(); +} /********************************************************************* *********************************************************************/ @@ -922,7 +1083,7 @@ OSMetaClass::getSuperClass() const const OSSymbol * OSMetaClass::getKmodName() const { - OSKext * myKext = (OSKext *)reserved; + OSKext * myKext = reserved ? reserved->kext : 0; if (myKext) { return myKext->getIdentifier(); } diff --git a/libkern/c++/OSRuntime.cpp b/libkern/c++/OSRuntime.cpp index ae8faf0ef..d16fa34ce 100644 --- a/libkern/c++/OSRuntime.cpp +++ b/libkern/c++/OSRuntime.cpp @@ -548,6 +548,7 @@ void OSlibkernInit(void) // This must be called before calling OSRuntimeInitializeCPP. OSMetaClassBase::initialize(); + g_kernel_kmod_info.address = (vm_address_t) &_mh_execute_header; if (kOSReturnSuccess != OSRuntimeInitializeCPP(&g_kernel_kmod_info, 0)) { panic("OSRuntime: C++ runtime failed to initialize."); } diff --git a/libkern/c++/OSSet.cpp b/libkern/c++/OSSet.cpp index f2d5c3e8c..775253baf 100644 --- a/libkern/c++/OSSet.cpp +++ b/libkern/c++/OSSet.cpp @@ -280,6 +280,8 @@ bool OSSet::isEqualTo(const OSSet *aSet) const for ( i = 0; i < count; i++ ) { obj1 = aSet->members->getObject(i); + if (containsObject(obj1)) + continue; obj2 = members->getObject(i); if ( !obj1 || !obj2 ) return false; diff --git a/libkern/c++/OSSymbol.cpp b/libkern/c++/OSSymbol.cpp index 5f9fad84e..d2eca1bf7 100644 --- a/libkern/c++/OSSymbol.cpp +++ b/libkern/c++/OSSymbol.cpp @@ -601,3 +601,33 @@ bool OSSymbol::isEqualTo(const OSMetaClassBase *obj) const else return false; } + +unsigned int +OSSymbol::bsearch( + const void * key, + const void * array, + unsigned int arrayCount, + size_t memberSize) +{ + const void **p; + unsigned int baseIdx = 0; + unsigned int lim; + + for (lim = arrayCount; lim; lim >>= 1) + { + p = (typeof(p)) (((uintptr_t) array) + (baseIdx + (lim >> 1)) * memberSize); + if (key == *p) + { + return (baseIdx + (lim >> 1)); + } + if (key > *p) + { + // move right + baseIdx += (lim >> 1) + 1; + lim--; + } + // else move left + } + // not found, insertion point here + return (baseIdx + (lim >> 1)); +} diff --git a/libkern/c++/Tests/TestSerialization/test1/test1.xcodeproj/project.pbxproj b/libkern/c++/Tests/TestSerialization/test1/test1.xcodeproj/project.pbxproj index 3f4a3744e..3a22ef1bd 100644 --- a/libkern/c++/Tests/TestSerialization/test1/test1.xcodeproj/project.pbxproj +++ b/libkern/c++/Tests/TestSerialization/test1/test1.xcodeproj/project.pbxproj @@ -103,7 +103,11 @@ isa = PBXProject; buildConfigurationList = 1DEB91C708733DAC0010E9CD /* Build configuration list for PBXProject "test1" */; compatibilityVersion = "Xcode 3.1"; + developmentRegion = English; hasScannedForEncodings = 1; + knownRegions = ( + en, + ); mainGroup = 089C166AFE841209C02AAC07 /* test1 */; projectDirPath = ""; projectRoot = ""; diff --git a/libkern/conf/MASTER b/libkern/conf/MASTER index 783f1af08..1e25a04c3 100644 --- a/libkern/conf/MASTER +++ b/libkern/conf/MASTER @@ -51,13 +51,17 @@ # ident LIBKERN -options HIBERNATION # system hibernation # +options HIBERNATION # system hibernation # options KDEBUG # kernel tracing # +options IST_KDEBUG # limited tracing # +options NO_KDEBUG # no kernel tracing # options GPROF # kernel profiling # options LIBKERNCPP # C++ implementation # options NETWORKING # kernel networking # -options CONFIG_DTRACE # dtrace support # +options CONFIG_DTRACE # dtrace support # +options VM_PRESSURE_EVENTS # options CRYPTO # cryptographic routines # +options ALLCRYPTO # options ZLIB # 
zlib support # options IOKITSTATS # IOKit statistics # @@ -69,11 +73,22 @@ options IPSEC # IP security # options CONFIG_KXLD # kxld/runtime linking of kexts # +options CONFIG_KEC_FIPS # Kernel External Components for FIPS compliance (KEC_FIPS) # + + # Note that when adding this config option to an architecture, one MUST # add the architecture to the preprocessor test at the beginning of # libkern/kmod/cplus_{start.c,stop.c}. options CONFIG_STATIC_CPPINIT # Static library initializes kext cpp runtime # +# configurable kernel - general switch to say we are building for an +# embedded device +# +options CONFIG_EMBEDDED # + # secure_kernel - secure kernel from user programs options SECURE_KERNEL # + +options DEBUG # +options MACH_ASSERT # diff --git a/libkern/conf/MASTER.i386 b/libkern/conf/MASTER.i386 index 46f20d9ec..fab8b50c8 100644 --- a/libkern/conf/MASTER.i386 +++ b/libkern/conf/MASTER.i386 @@ -1,10 +1,10 @@ ###################################################################### # -# RELEASE = [ intel mach libkerncpp hibernation networking config_dtrace crypto zlib config_kxld config_static_cppinit iokitstats ] +# RELEASE = [ intel mach libkerncpp hibernation networking config_dtrace crypto allcrypto zlib config_kxld config_static_cppinit iokitstats vm_pressure_events ] # PROFILE = [ RELEASE profile ] -# DEBUG = [ RELEASE debug mach_kdb ] +# DEBUG = [ RELEASE debug ] # -# EMBEDDED = [ intel mach libkerncpp hibernation networking crypto zlib ] +# EMBEDDED = [ intel mach libkerncpp networking crypto zlib ] # DEVELOPMENT = [ EMBEDDED config_dtrace ] # ###################################################################### @@ -12,6 +12,4 @@ machine "i386" # cpu "i386" # -options MACH_KDB # # - options NO_NESTED_PMAP # diff --git a/libkern/conf/MASTER.x86_64 b/libkern/conf/MASTER.x86_64 index a9fd68364..311403c6f 100644 --- a/libkern/conf/MASTER.x86_64 +++ b/libkern/conf/MASTER.x86_64 @@ -1,8 +1,8 @@ ###################################################################### # -# RELEASE = [ intel mach libkerncpp hibernation networking config_dtrace crypto zlib config_kxld iokitstats ] +# RELEASE = [ intel mach libkerncpp hibernation networking config_dtrace crypto allcrypto zlib config_kxld iokitstats vm_pressure_events config_kec_fips ] # PROFILE = [ RELEASE profile ] -# DEBUG = [ RELEASE debug mach_kdb ] +# DEBUG = [ RELEASE debug mach_assert ] # # EMBEDDED = [ intel mach libkerncpp hibernation networking crypto zlib ] # DEVELOPMENT = [ EMBEDDED ] @@ -12,6 +12,4 @@ machine "x86_64" # cpu "x86_64" # -options MACH_KDB # # - options NO_NESTED_PMAP # diff --git a/libkern/conf/Makefile b/libkern/conf/Makefile index f0cf53e3d..65190ee61 100644 --- a/libkern/conf/Makefile +++ b/libkern/conf/Makefile @@ -42,9 +42,11 @@ $(COMPOBJROOT)/$(LIBKERN_KERNEL_CONFIG)/Makefile : $(SOURCE)/MASTER \ do_all: $(COMPOBJROOT)/$(LIBKERN_KERNEL_CONFIG)/Makefile $(_v)next_source=$(subst conf/,,$(SOURCE)); \ + next_relsource=$(subst conf/,,$(RELATIVE_SOURCE_PATH)); \ ${MAKE} -C $(COMPOBJROOT)/$(LIBKERN_KERNEL_CONFIG) \ MAKEFILES=$(TARGET)/$(LIBKERN_KERNEL_CONFIG)/Makefile \ SOURCE=$${next_source} \ + RELATIVE_SOURCE_PATH=$${next_relsource} \ TARGET=$(TARGET) \ INCL_MAKEDEP=FALSE \ KERNEL_CONFIG=$(LIBKERN_KERNEL_CONFIG) \ diff --git a/libkern/conf/Makefile.i386 b/libkern/conf/Makefile.i386 index f28e7a459..d75614478 100644 --- a/libkern/conf/Makefile.i386 +++ b/libkern/conf/Makefile.i386 @@ -2,12 +2,16 @@ #BEGIN Machine dependent Makefile fragment for i386 
###################################################################### +# sha1 Files to build with -DSHA1_USE_ASSEMBLY=1 +sha1.o_CFLAGS_ADD += -DSHA1_USE_ASSEMBLY=1 + # Files that must go in the __HIB segment: UNCONFIGURED_HIB_FILES= \ WKdmDecompress.o HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS)) + ###################################################################### #END Machine dependent Makefile fragment for i386 ###################################################################### diff --git a/libkern/conf/Makefile.template b/libkern/conf/Makefile.template index 9dad4c816..7d1848535 100644 --- a/libkern/conf/Makefile.template +++ b/libkern/conf/Makefile.template @@ -27,7 +27,7 @@ include $(MakeInc_def) # CFLAGS # CFLAGS+= -include meta_features.h -DLIBKERN_KERNEL_PRIVATE -DOSALLOCDEBUG=1 \ - $(CFLAGS_INLINE_CONFIG) + $(CFLAGS_INLINE_CONFIG) # zlib is 3rd party source compress.o_CWARNFLAGS_ADD = -Wno-cast-qual @@ -41,6 +41,9 @@ uncompr.o_CWARNFLAGS_ADD = -Wno-cast-qual # warnings in bison-generated code OSUnserializeXML.cpo_CXXWARNFLAGS_ADD = -Wno-uninitialized +# Runtime support functions don't interact well with LTO (9294679) +stack_protector.o_CFLAGS_ADD += $(CFLAGS_NOLTO_FLAG) + # # Directories for mig generated files # @@ -87,10 +90,10 @@ LDOBJS = $(OBJS) $(COMPONENT).filelist: $(LDOBJS) $(_v)if [ $(BUILD_MACHO_OBJ) -eq 1 ]; then \ - for hib_file in ${HIB_FILES}; \ + for hib_file in ${HIB_FILES}; \ do \ - $(SEG_HACK) __HIB $${hib_file} -o $${hib_file}__; \ - mv $${hib_file}__ $${hib_file} ; \ + $(SEG_HACK) __HIB $${hib_file} -o $${hib_file}__; \ + mv $${hib_file}__ $${hib_file} ; \ done; \ fi @echo LDFILELIST $(COMPONENT) diff --git a/libkern/conf/Makefile.x86_64 b/libkern/conf/Makefile.x86_64 index a7fda56ca..719fd1d29 100644 --- a/libkern/conf/Makefile.x86_64 +++ b/libkern/conf/Makefile.x86_64 @@ -2,9 +2,12 @@ #BEGIN Machine dependent Makefile fragment for x86_64 ###################################################################### +# sha1 Files to build with -DSHA1_USE_ASSEMBLY=1 +sha1.o_CFLAGS_ADD += -DSHA1_USE_ASSEMBLY=1 + # Files that must go in the __HIB segment: UNCONFIGURED_HIB_FILES= \ - WKdmDecompress.o + WKdmDecompress.o HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS)) diff --git a/libkern/conf/files b/libkern/conf/files index 6f3d432ac..0228f1e00 100644 --- a/libkern/conf/files +++ b/libkern/conf/files @@ -3,8 +3,8 @@ OPTIONS/libkerncpp optional libkerncpp OPTIONS/kdebug optional kdebug OPTIONS/gprof optional gprof -OPTIONS/config_dtrace optional config_dtrace -OPTIONS/hibernation optional hibernation +OPTIONS/config_dtrace optional config_dtrace +OPTIONS/hibernation optional hibernation OPTIONS/networking optional networking OPTIONS/crypto optional crypto OPTIONS/zlib optional zlib @@ -59,10 +59,15 @@ libkern/zlib/trees.c optional zlib libkern/zlib/uncompr.c optional zlib libkern/zlib/zutil.c optional zlib -libkern/crypto/md5.c optional crypto -libkern/crypto/md5.c optional networking -libkern/crypto/sha1.c optional crypto -libkern/crypto/sha1.c optional ipsec +libkern/crypto/register_crypto.c optional crypto +libkern/crypto/corecrypto_sha2.c optional crypto allcrypto +libkern/crypto/corecrypto_sha1.c optional crypto +libkern/crypto/corecrypto_sha1.c optional ipsec +libkern/crypto/corecrypto_md5.c optional crypto +libkern/crypto/corecrypto_md5.c optional networking +libkern/crypto/corecrypto_des.c optional crypto +libkern/crypto/corecrypto_aes.c optional crypto +libkern/crypto/corecrypto_aesxts.c optional crypto libkern/stack_protector.c standard @@ 
-76,9 +81,11 @@ libkern/kxld/kxld_reloc.c optional config_kxld libkern/kxld/kxld_object.c optional config_kxld libkern/kxld/kxld_sect.c optional config_kxld libkern/kxld/kxld_seg.c optional config_kxld +libkern/kxld/kxld_srcversion.c optional config_kxld libkern/kxld/kxld_sym.c optional config_kxld libkern/kxld/kxld_symtab.c optional config_kxld libkern/kxld/kxld_util.c optional config_kxld libkern/kxld/kxld_uuid.c optional config_kxld +libkern/kxld/kxld_versionmin.c optional config_kxld libkern/kxld/kxld_vtable.c optional config_kxld libkern/kxld/kxld_stubs.c standard diff --git a/libkern/conf/files.i386 b/libkern/conf/files.i386 index 18edb6e7d..2982431f7 100644 --- a/libkern/conf/files.i386 +++ b/libkern/conf/files.i386 @@ -1,8 +1,7 @@ libkern/i386/OSAtomic.s standard libkern/zlib/intel/inffastS.s optional zlib libkern/zlib/intel/adler32vec.s optional zlib -libkern/crypto/intel/sha1edp.s optional crypto # Optimized WKdm compressor -libkern/kxld/i386/WKdmCompress.s optional hibernation -libkern/kxld/i386/WKdmDecompress.s optional hibernation +libkern/kxld/i386/WKdmCompress.s optional hibernation +libkern/kxld/i386/WKdmDecompress.s optional hibernation diff --git a/libkern/conf/files.x86_64 b/libkern/conf/files.x86_64 index bc32a4846..b1f7e44fa 100644 --- a/libkern/conf/files.x86_64 +++ b/libkern/conf/files.x86_64 @@ -1,8 +1,7 @@ libkern/x86_64/OSAtomic.s standard libkern/zlib/intel/inffastS.s optional zlib libkern/zlib/intel/adler32vec.s optional zlib -libkern/crypto/intel/sha1edp.s optional crypto # Optimized WKdm compressor -libkern/kxld/i386/WKdmCompress.s optional hibernation -libkern/kxld/i386/WKdmDecompress.s optional hibernation +libkern/kxld/i386/WKdmCompress.s optional hibernation +libkern/kxld/i386/WKdmDecompress.s optional hibernation diff --git a/libkern/crypto/corecrypto_aes.c b/libkern/crypto/corecrypto_aes.c new file mode 100644 index 000000000..161715af1 --- /dev/null +++ b/libkern/crypto/corecrypto_aes.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2012 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include + +aes_rval aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1]) +{ + const struct ccmode_cbc *cbc = g_crypto_funcs->ccaes_cbc_encrypt; + + /* Make sure the context size for the mode fits in the one we have */ + if(cbc->size>sizeof(aes_encrypt_ctx)) + panic("%s: inconsistent size for AES encrypt context", __FUNCTION__); + + cccbc_init(cbc, cx[0].ctx, key_len, key); + + return aes_good; +} + +aes_rval aes_encrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigned int num_blk, + unsigned char *out_blk, aes_encrypt_ctx cx[1]) +{ + const struct ccmode_cbc *cbc = g_crypto_funcs->ccaes_cbc_encrypt; + cccbc_iv_decl(cbc->block_size, ctx_iv); + + cccbc_set_iv(cbc, ctx_iv, in_iv); + cccbc_update(cbc, cx[0].ctx, ctx_iv, num_blk, in_blk, out_blk); //Actually cbc encrypt. + + return aes_good; +} + +/* This does one block of ECB, using the CBC implementation - this allow to use the same context for both CBC and ECB */ +aes_rval aes_encrypt(const unsigned char *in_blk, unsigned char *out_blk, aes_encrypt_ctx cx[1]) +{ + return aes_encrypt_cbc(in_blk, NULL, 1, out_blk, cx); +} + +aes_rval aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1]) +{ + const struct ccmode_cbc *cbc = g_crypto_funcs->ccaes_cbc_decrypt; + + /* Make sure the context size for the mode fits in the one we have */ + if(cbc->size>sizeof(aes_decrypt_ctx)) + panic("%s: inconsistent size for AES decrypt context", __FUNCTION__); + + cccbc_init(cbc, cx[0].ctx, key_len, key); + + return aes_good; +} + +aes_rval aes_decrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigned int num_blk, + unsigned char *out_blk, aes_decrypt_ctx cx[1]) +{ + const struct ccmode_cbc *cbc = g_crypto_funcs->ccaes_cbc_decrypt; + cccbc_iv_decl(cbc->block_size, ctx_iv); + + cccbc_set_iv(cbc, ctx_iv, in_iv); + cccbc_update(cbc, cx[0].ctx, ctx_iv, num_blk, in_blk, out_blk); //Actually cbc decrypt. + + return aes_good; +} + +/* This does one block of ECB, using the CBC implementation - this allow to use the same context for both CBC and ECB */ +aes_rval aes_decrypt(const unsigned char *in_blk, unsigned char *out_blk, aes_decrypt_ctx cx[1]) +{ + return aes_decrypt_cbc(in_blk, NULL, 1, out_blk, cx); +} + +aes_rval aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]) +{ + return aes_encrypt_key(key, 16, cx); +} + +aes_rval aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]) +{ + return aes_decrypt_key(key, 16, cx); +} + + +aes_rval aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]) +{ + return aes_encrypt_key(key, 32, cx); +} + +aes_rval aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]) +{ + return aes_decrypt_key(key, 32, cx); +} diff --git a/libkern/crypto/corecrypto_aesxts.c b/libkern/crypto/corecrypto_aesxts.c new file mode 100644 index 000000000..dc0d6f40e --- /dev/null +++ b/libkern/crypto/corecrypto_aesxts.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2012 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include + +/* + * These are the interfaces required for XTS-AES support + */ + +uint32_t +xts_start(uint32_t cipher __unused, // ignored - we're doing this for xts-aes only + const uint8_t *IV __unused, // ignored + const uint8_t *key1, int keylen, + const uint8_t *key2, int tweaklen __unused, // both keys are the same size for xts + uint32_t num_rounds __unused, // ignored + uint32_t options __unused, // ignored + symmetric_xts *xts) +{ + const struct ccmode_xts *enc, *dec; + + if(!g_crypto_funcs) + panic("%s: corecrypto not registered!\n", __FUNCTION__); + + enc = g_crypto_funcs->ccaes_xts_encrypt; + dec = g_crypto_funcs->ccaes_xts_decrypt; + + if(!enc && !dec) + panic("%s: xts mode not registered? enc=%p, dec=%p\n", __FUNCTION__, enc, dec); + + /* Make sure the context size for the mode fits in the one we have */ + if((enc->size>sizeof(xts->enc)) || (dec->size>sizeof(xts->dec))) + panic("%s: inconsistent size for AES-XTS context", __FUNCTION__); + + enc->init(enc, xts->enc, keylen, key1, key2); + dec->init(dec, xts->dec, keylen, key1, key2); + + return 0; //never fails +} + +int xts_encrypt(const uint8_t *pt, unsigned long ptlen, + uint8_t *ct, + const uint8_t *iv, // this can be considered the sector IV for this use + symmetric_xts *xts) +{ + const struct ccmode_xts *xtsenc = g_crypto_funcs->ccaes_xts_encrypt; + ccxts_tweak_decl(xtsenc->tweak_size, tweak); + + if(ptlen%16) panic("xts encrypt not a multiple of block size\n"); + + xtsenc->set_tweak(xts->enc, tweak, iv); + xtsenc->xts(xts->enc, tweak, ptlen/16, pt, ct); + + return 0; //never fails +} + +int xts_decrypt(const uint8_t *ct, unsigned long ptlen, + uint8_t *pt, + const uint8_t *iv, // this can be considered the sector IV for this use + symmetric_xts *xts) +{ + const struct ccmode_xts *xtsdec = g_crypto_funcs->ccaes_xts_decrypt; + ccxts_tweak_decl(xtsdec->tweak_size, tweak); + + if(ptlen%16) panic("xts decrypt not a multiple of block size\n"); + + xtsdec->set_tweak(xts->dec, tweak, iv); + xtsdec->xts(xts->dec, tweak, ptlen/16, ct, pt); + + return 0; //never fails +} + +void xts_done(symmetric_xts *xts __unused) +{ + +} diff --git a/libkern/crypto/corecrypto_des.c b/libkern/crypto/corecrypto_des.c new file mode 100644 index 000000000..26f5ab50e --- /dev/null +++ b/libkern/crypto/corecrypto_des.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2012 Apple Computer, Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +#include +#include +#include +#include +#include + +/* Single DES ECB - used by ipv6 (esp_core.c) */ +int des_ecb_key_sched(des_cblock *key, des_ecb_key_schedule *ks) +{ + const struct ccmode_ecb *enc = g_crypto_funcs->ccdes_ecb_encrypt; + const struct ccmode_ecb *dec = g_crypto_funcs->ccdes_ecb_decrypt; + + /* Make sure the context size for the mode fits in the one we have */ + if((enc->size>sizeof(ks->enc)) || (dec->size>sizeof(ks->dec))) + panic("%s: inconsistent size for DES-ECB context", __FUNCTION__); + + enc->init(enc, ks->enc, CCDES_KEY_SIZE, key); + dec->init(dec, ks->dec, CCDES_KEY_SIZE, key); + + /* The old DES interface could return -1 or -2 for weak keys and wrong parity, + but this was disabled all the time, so we never fail here */ + return 0; +} + +/* Simple des - 1 block */ +void des_ecb_encrypt(des_cblock *in, des_cblock *out, des_ecb_key_schedule *ks, int enc) +{ + const struct ccmode_ecb *ecb = enc ? g_crypto_funcs->ccdes_ecb_encrypt : g_crypto_funcs->ccdes_ecb_decrypt; + ccecb_ctx *ctx = enc ? ks->enc : ks->dec; + + ecb->ecb(ctx, 1, in, out); +} + + +/* Triple DES ECB - used by ipv6 (esp_core.c) */ +int des3_ecb_key_sched(des_cblock *key, des3_ecb_key_schedule *ks) +{ + const struct ccmode_ecb *enc = g_crypto_funcs->cctdes_ecb_encrypt; + const struct ccmode_ecb *dec = g_crypto_funcs->cctdes_ecb_decrypt; + + /* Make sure the context size for the mode fits in the one we have */ + if((enc->size>sizeof(ks->enc)) || (dec->size>sizeof(ks->dec))) + panic("%s: inconsistent size for 3DES-ECB context", __FUNCTION__); + + enc->init(enc, ks->enc, CCDES_KEY_SIZE*3, key); + dec->init(dec, ks->dec, CCDES_KEY_SIZE*3, key); + + /* The old DES interface could return -1 or -2 for weak keys and wrong parity, + but this was disabled all the time, so we never fail here */ + return 0; +} + +/* Simple des - 1 block */ +void des3_ecb_encrypt(des_cblock *in, des_cblock *out, des3_ecb_key_schedule *ks, int enc) +{ + const struct ccmode_ecb *ecb = enc ? g_crypto_funcs->cctdes_ecb_encrypt : g_crypto_funcs->cctdes_ecb_decrypt; + ccecb_ctx *ctx = enc ? 
ks->enc : ks->dec; + + ecb->ecb(ctx, 1, in, out); +} + +/* Single DES CBC - used by nfs_gss */ +int des_cbc_key_sched(des_cblock *key, des_cbc_key_schedule *ks) +{ + const struct ccmode_cbc *enc = g_crypto_funcs->ccdes_cbc_encrypt; + const struct ccmode_cbc *dec = g_crypto_funcs->ccdes_cbc_decrypt; + + /* Make sure the context size for the mode fits in the one we have */ + if((enc->size>sizeof(ks->enc)) || (dec->size>sizeof(ks->dec))) + panic("%s: inconsistent size for DES-CBC context", __FUNCTION__); + + + cccbc_init(enc, ks->enc, CCDES_KEY_SIZE, key); + cccbc_init(dec, ks->dec, CCDES_KEY_SIZE, key); + + /* The old DES interface could return -1 or -2 for weak keys and wrong parity, + but this was disabled all the time, so we never fail here */ + return 0; +} + +/* this is normally only called with length an 8 bytes multiple */ +void +des_cbc_encrypt(des_cblock *in, des_cblock *out, int32_t length, + des_cbc_key_schedule *ks, des_cblock *iv, des_cblock *retiv, int encrypt) +{ + const struct ccmode_cbc *cbc = encrypt?g_crypto_funcs->ccdes_cbc_encrypt:g_crypto_funcs->ccdes_cbc_decrypt; + cccbc_ctx *ctx = encrypt ? ks->enc : ks->dec; + int nblocks; + cccbc_iv_decl(cbc->block_size, ctx_iv); + + assert(length%8==0); + nblocks=length/8; + + /* set the iv */ + cccbc_set_iv(cbc, ctx_iv, iv); + + cccbc_update(cbc, ctx, ctx_iv, nblocks, in, out); + + /* copy back iv */ + if(retiv) + memcpy(retiv, ctx_iv, 8); +} + +/* Triple DES CBC - used by nfs_gss */ +int des3_cbc_key_sched(des_cblock *key, des3_cbc_key_schedule *ks) +{ + const struct ccmode_cbc *enc = g_crypto_funcs->cctdes_cbc_encrypt; + const struct ccmode_cbc *dec = g_crypto_funcs->cctdes_cbc_decrypt; + + /* Make sure the context size for the mode fits in the one we have */ + if((enc->size>sizeof(ks->enc)) || (dec->size>sizeof(ks->dec))) + panic("%s: inconsistent size for 3DES-CBC context", __FUNCTION__); + + cccbc_init(enc, ks->enc, CCDES_KEY_SIZE*3, key); + cccbc_init(dec, ks->dec, CCDES_KEY_SIZE*3, key); + + /* The old DES interface could return -1 or -2 for weak keys and wrong parity, + but this was disabled all the time, so we never fail here */ + return 0; +} + +/* this is normally only called with length an 8 bytes multiple */ +void +des3_cbc_encrypt(des_cblock *in, des_cblock *out, int32_t length, + des3_cbc_key_schedule *ks, des_cblock *iv, des_cblock *retiv, int encrypt) +{ + const struct ccmode_cbc *cbc = encrypt?g_crypto_funcs->cctdes_cbc_encrypt:g_crypto_funcs->cctdes_cbc_decrypt; + cccbc_ctx *ctx = encrypt ? ks->enc : ks->dec; + int nblocks; + cccbc_iv_decl(cbc->block_size, ctx_iv); + + assert(length%8==0); + nblocks=length/8; + + /* set the iv */ + cccbc_set_iv(cbc, ctx_iv, iv); + + cccbc_update(cbc, ctx, ctx_iv, nblocks, in, out); + + /* copy back iv */ + if(retiv) + memcpy(retiv, ctx_iv, 8); +} + + +/* + * DES MAC implemented according to FIPS 113 + * http://www.itl.nist.gov/fipspubs/fip113.htm + * Only full blocks. 
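+ * The construction is plain CBC-MAC: the data is CBC-encrypted with a
+ * zero IV (the NULL iv passed to cccbc_set_iv below) and only the
+ * final ciphertext block is kept as the tag; 'cksum' carries the
+ * chained value through the loop.
+ *
+ * Hypothetical caller sketch (names illustrative), matching the
+ * signature below; len must be a multiple of 8:
+ *
+ *     des_cblock mac;
+ *     des_cbc_cksum(blocks, &mac, len, &sched);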
+ * Used by nfs-gss + */ +void +des_cbc_cksum(des_cblock *in, des_cblock *out, + int len, des_cbc_key_schedule *ks) +{ + const struct ccmode_cbc *cbc = g_crypto_funcs->ccdes_cbc_encrypt; + int nblocks; + des_cblock cksum; + cccbc_iv_decl(cbc->block_size, ctx_iv); + + assert(len%8==0); + nblocks=len/8; + + cccbc_set_iv(cbc, ctx_iv, NULL); + while(nblocks--) { + cccbc_update(cbc, ks->enc, ctx_iv, 1, in++, cksum); + } + memcpy(out, cksum, sizeof(des_cblock)); +} + + +/* Raw key helper functions */ +void des_fixup_key_parity(des_cblock *key) +{ + g_crypto_funcs->ccdes_key_set_odd_parity_fn(key, CCDES_KEY_SIZE); +} + +int des_is_weak_key(des_cblock *key) +{ + return g_crypto_funcs->ccdes_key_is_weak_fn(key, CCDES_KEY_SIZE); +} diff --git a/libkern/crypto/corecrypto_md5.c b/libkern/crypto/corecrypto_md5.c new file mode 100644 index 000000000..70225a5a8 --- /dev/null +++ b/libkern/crypto/corecrypto_md5.c @@ -0,0 +1,65 @@ + +#include +#include +#include +#include + +static uint64_t getCount(MD5_CTX *ctx) +{ + return ( (((uint64_t)ctx->count[0])<<32) | (ctx->count[1]) ); +} + +static void setCount(MD5_CTX *ctx, uint64_t count) +{ + ctx->count[0]=(uint32_t)(count>>32); + ctx->count[1]=(uint32_t)count; +} + +/* Copy a ccdigest ctx into a legacy MD5 context */ +static void DiToMD5(const struct ccdigest_info *di, struct ccdigest_ctx *di_ctx, MD5_CTX *md5_ctx) +{ + setCount(md5_ctx, ccdigest_nbits(di, di_ctx)/8+ccdigest_num(di, di_ctx)); + memcpy(md5_ctx->buffer, ccdigest_data(di, di_ctx), di->block_size); + memcpy(md5_ctx->state, ccdigest_state_ccn(di, di_ctx), di->state_size); +} + +/* Copy a legacy MD5 context into a ccdigest ctx */ +static void MD5ToDi(const struct ccdigest_info *di, MD5_CTX *md5_ctx, struct ccdigest_ctx *di_ctx) +{ + uint64_t count = getCount(md5_ctx); + + ccdigest_num(di, di_ctx)=count%di->block_size; + ccdigest_nbits(di, di_ctx)=(count-ccdigest_num(di, di_ctx))*8; + memcpy(ccdigest_data(di, di_ctx), md5_ctx->buffer, di->block_size); + memcpy(ccdigest_state_ccn(di, di_ctx), md5_ctx->state, di->state_size); +} + +void MD5Init(MD5_CTX *ctx) +{ + const struct ccdigest_info *di=g_crypto_funcs->ccmd5_di; + ccdigest_di_decl(di, di_ctx); + + g_crypto_funcs->ccdigest_init_fn(di, di_ctx); + + DiToMD5(di, di_ctx, ctx); +} + +void MD5Update(MD5_CTX *ctx, const void *data, unsigned int len) +{ + const struct ccdigest_info *di=g_crypto_funcs->ccmd5_di; + ccdigest_di_decl(di, di_ctx); + + MD5ToDi(di, ctx, di_ctx); + g_crypto_funcs->ccdigest_update_fn(di, di_ctx, len, data); + DiToMD5(di, di_ctx, ctx); +} + +void MD5Final(unsigned char digest[MD5_DIGEST_LENGTH], MD5_CTX *ctx) +{ + const struct ccdigest_info *di=g_crypto_funcs->ccmd5_di; + ccdigest_di_decl(di, di_ctx); + + MD5ToDi(di, ctx, di_ctx); + ccdigest_final(di, di_ctx, digest); +} + diff --git a/libkern/crypto/corecrypto_sha1.c b/libkern/crypto/corecrypto_sha1.c new file mode 100644 index 000000000..1513287da --- /dev/null +++ b/libkern/crypto/corecrypto_sha1.c @@ -0,0 +1,110 @@ + +#include +#include +#include +#include + + +static uint64_t getCount(SHA1_CTX *ctx) +{ + return ctx->c.b64[0]; +} + +static void setCount(SHA1_CTX *ctx, uint64_t count) +{ + ctx->c.b64[0]=count; +} + +/* Copy a ccdigest ctx into a legacy SHA1 context */ +static void DiToSHA1(const struct ccdigest_info *di, struct ccdigest_ctx *di_ctx, SHA1_CTX *sha1_ctx) +{ + setCount(sha1_ctx, ccdigest_nbits(di, di_ctx)/8+ccdigest_num(di, di_ctx)); + memcpy(sha1_ctx->m.b8, ccdigest_data(di, di_ctx), di->block_size); + memcpy(sha1_ctx->h.b8, ccdigest_state_ccn(di, di_ctx), 
di->state_size);
+}
+
+/* Copy a legacy SHA1 context into a ccdigest ctx */
+static void SHA1ToDi(const struct ccdigest_info *di, SHA1_CTX *sha1_ctx, struct ccdigest_ctx *di_ctx)
+{
+    uint64_t count = getCount(sha1_ctx);
+
+    ccdigest_num(di, di_ctx)=count%di->block_size;
+    ccdigest_nbits(di, di_ctx)=(count-ccdigest_num(di, di_ctx))*8;
+    memcpy(ccdigest_data(di, di_ctx), sha1_ctx->m.b8, di->block_size);
+    memcpy(ccdigest_state_ccn(di, di_ctx), sha1_ctx->h.b8, di->state_size);
+}
+
+void SHA1Init(SHA1_CTX *ctx)
+{
+    const struct ccdigest_info *di=g_crypto_funcs->ccsha1_di;
+    ccdigest_di_decl(di, di_ctx);
+
+    g_crypto_funcs->ccdigest_init_fn(di, di_ctx);
+
+    DiToSHA1(di, di_ctx, ctx);
+}
+
+void SHA1Update(SHA1_CTX *ctx, const void *data, size_t len)
+{
+    const struct ccdigest_info *di=g_crypto_funcs->ccsha1_di;
+    ccdigest_di_decl(di, di_ctx);
+
+    SHA1ToDi(di, ctx, di_ctx);
+    g_crypto_funcs->ccdigest_update_fn(di, di_ctx, len, data);
+    DiToSHA1(di, di_ctx, ctx);
+}
+
+void SHA1Final(void *digest, SHA1_CTX *ctx)
+{
+    const struct ccdigest_info *di=g_crypto_funcs->ccsha1_di;
+    ccdigest_di_decl(di, di_ctx);
+
+    SHA1ToDi(di, ctx, di_ctx);
+    ccdigest_final(di, di_ctx, digest);
+}
+
+#ifdef XNU_KERNEL_PRIVATE
+void SHA1UpdateUsePhysicalAddress(SHA1_CTX *ctx, const void *data, size_t len)
+{
+    /* The corecrypto-based implementation has no physical-address fast
+       path, so this falls through to the regular software update. */
+    SHA1Update(ctx, data, len);
+}
+#endif
+
+/* This is not declared in the header, but exported in libkern.exports */
+void SHA1Final_r(SHA1_CTX *context, void *digest);
+void SHA1Final_r(SHA1_CTX *context, void *digest)
+{
+    SHA1Final(digest, context);
+}
+
+
+/*
+ * This function is called by the SHA1 hardware kext during its init.
+ * It registers the function to call to perform SHA1 using the hardware.
+ */
+#include
+#include
+#include
+
+typedef kern_return_t (*InKernelPerformSHA1Func)(void *ref, const void *data, size_t dataLen, u_int32_t *inHash, u_int32_t options, u_int32_t *outHash, Boolean usePhysicalAddress);
+void sha1_hardware_hook(Boolean option, InKernelPerformSHA1Func func, void *ref);
+static void *SHA1Ref;
+static InKernelPerformSHA1Func performSHA1WithinKernelOnly;
+
+void sha1_hardware_hook(Boolean option, InKernelPerformSHA1Func func, void *ref)
+{
+    if(option) {
+        // Establish the hook. The hardware is ready.
+        OSCompareAndSwapPtr((void*)NULL, (void*)ref, (void * volatile*)&SHA1Ref);
+
+        if(!OSCompareAndSwapPtr((void *)NULL, (void *)func, (void * volatile *)&performSHA1WithinKernelOnly)) {
+            panic("sha1_hardware_hook: Called twice. Should never happen\n");
+        }
+    }
+    else {
+        // The hardware is going away. Tear down the hook.
+        performSHA1WithinKernelOnly = NULL;
+        SHA1Ref = NULL;
+    }
+}
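sha1_hardware_hook() relies on OSCompareAndSwapPtr so that only the first registration can install a function, and a second attempt panics. A sketch of the driver side under those rules; every name below is invented for illustration:

    /* Hypothetical SHA-1 accelerator kext (illustrative only). */
    static kern_return_t
    hw_sha1_perform(void *ref, const void *data, size_t dataLen, u_int32_t *inHash,
                    u_int32_t options, u_int32_t *outHash, Boolean usePhysicalAddress)
    {
        /* drive the engine: read state from inHash, write result to outHash */
        return KERN_SUCCESS;
    }

    static void hw_driver_started(void *device_state)
    {
        sha1_hardware_hook(TRUE, hw_sha1_perform, device_state);  /* engine ready */
    }

    static void hw_driver_stopping(void)
    {
        sha1_hardware_hook(FALSE, NULL, NULL);                    /* tear down */
    }

diff --git a/libkern/crypto/corecrypto_sha2.c b/libkern/crypto/corecrypto_sha2.c
new file mode 100644
index 000000000..e85479d3b
--- /dev/null
+++ b/libkern/crypto/corecrypto_sha2.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License.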
The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include
+#include
+#include
+#include
+
+void SHA256_Init(SHA256_CTX *ctx)
+{
+    const struct ccdigest_info *di;
+    di=g_crypto_funcs->ccsha256_di;
+
+    /* Make sure the context size for the digest info fits in the one we have */
+    if(ccdigest_di_size(di)>sizeof(SHA256_CTX))
+        panic("%s: inconsistent size for SHA256 context", __FUNCTION__);
+
+    g_crypto_funcs->ccdigest_init_fn(di, ctx->ctx);
+}
+
+void SHA256_Update(SHA256_CTX *ctx, const void *data, size_t len)
+{
+    const struct ccdigest_info *di;
+    di=g_crypto_funcs->ccsha256_di;
+
+    g_crypto_funcs->ccdigest_update_fn(di, ctx->ctx, len, data);
+}
+
+void SHA256_Final(void *digest, SHA256_CTX *ctx)
+{
+    const struct ccdigest_info *di;
+    di=g_crypto_funcs->ccsha256_di;
+
+    ccdigest_final(di, ctx->ctx, digest);
+}
+
+void SHA384_Init(SHA384_CTX *ctx)
+{
+    const struct ccdigest_info *di;
+    di=g_crypto_funcs->ccsha384_di;
+
+    /* Make sure the context size for the digest info fits in the one we have */
+    if(ccdigest_di_size(di)>sizeof(SHA384_CTX))
+        panic("%s: inconsistent size for SHA384 context", __FUNCTION__);
+
+    g_crypto_funcs->ccdigest_init_fn(di, ctx->ctx);
+}
+
+void SHA384_Update(SHA384_CTX *ctx, const void *data, size_t len)
+{
+    const struct ccdigest_info *di;
+    di=g_crypto_funcs->ccsha384_di;
+
+    g_crypto_funcs->ccdigest_update_fn(di, ctx->ctx, len, data);
+}
+
+
+void SHA384_Final(void *digest, SHA384_CTX *ctx)
+{
+    const struct ccdigest_info *di;
+    /* must use the SHA-384 descriptor: the SHA-512 one would write a
+       64-byte digest past the end of a 48-byte SHA-384 output buffer */
+    di=g_crypto_funcs->ccsha384_di;
+
+    ccdigest_final(di, ctx->ctx, digest);
+}
+
+void SHA512_Init(SHA512_CTX *ctx)
+{
+    const struct ccdigest_info *di;
+    di=g_crypto_funcs->ccsha512_di;
+
+    /* Make sure the context size for the digest info fits in the one we have */
+    if(ccdigest_di_size(di)>sizeof(SHA512_CTX))
+        panic("%s: inconsistent size for SHA512 context", __FUNCTION__);
+
+    g_crypto_funcs->ccdigest_init_fn(di, ctx->ctx);
+}
+
+void SHA512_Update(SHA512_CTX *ctx, const void *data, size_t len)
+{
+    const struct ccdigest_info *di;
+    di=g_crypto_funcs->ccsha512_di;
+
+    g_crypto_funcs->ccdigest_update_fn(di, ctx->ctx, len, data);
+}
+
+void SHA512_Final(void *digest, SHA512_CTX *ctx)
+{
+    const struct ccdigest_info *di;
+    di=g_crypto_funcs->ccsha512_di;
+
+    ccdigest_final(di, ctx->ctx, digest);
+}
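Each wrapper re-fetches the ccdigest_info descriptor from g_crypto_funcs on every call, so the legacy SHA256_CTX stays a plain byte container between calls. A caller sketch of the preserved one-shot API, assuming libkern's usual SHA256_DIGEST_LENGTH (32) macro; the function name is invented:

    /* Illustrative caller (not part of the patch). */
    static void digest_example(const void *msg, size_t msglen)
    {
        SHA256_CTX ctx;
        unsigned char md[SHA256_DIGEST_LENGTH];

        SHA256_Init(&ctx);
        SHA256_Update(&ctx, msg, msglen);   /* may be called any number of times */
        SHA256_Final(md, &ctx);
    }

diff --git a/libkern/crypto/intel/sha1edp.s b/libkern/crypto/intel/sha1edp.s
index 80da81a62..8c52a5e7b 100644
--- a/libkern/crypto/intel/sha1edp.s
+++ b/libkern/crypto/intel/sha1edp.s
@@ -1199,8 +1199,13 @@ void SHA1( int HASH[], int MESSAGE[] ) 0: INTERNAL_nossse3 // update W (i=16:79) and update ABCDE (i=0:63) #if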
Multiple_Blocks +#if defined(__x86_64__) add $$64, BUFFER_PTR // BUFFER_PTR+=64; sub $$1, cnt // pre-decrement cnt by 1 +#else + addl $$64, BUFFER_PTR // BUFFER_PTR+=64; + subl $$1, cnt // pre-decrement cnt by 1 +#endif jbe 1f // if cnt <= 0, branch to finish off SOFTWARE_PIPELINING_nossse3 // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15) UPDATE_ALL_HASH // update output hashes @@ -1223,8 +1228,13 @@ void SHA1( int HASH[], int MESSAGE[] ) 0: INTERNAL_ssse3 // update W (i=16:79) and update ABCDE (i=0:63) #if Multiple_Blocks +#if defined(__x86_64__) add $$64, BUFFER_PTR // BUFFER_PTR+=64; sub $$1, cnt // pre-decrement cnt by 1 +#else + addl $$64, BUFFER_PTR // BUFFER_PTR+=64; + subl $$1, cnt // pre-decrement cnt by 1 +#endif jbe 1f // if cnt <= 0, branch to finish off SOFTWARE_PIPELINING_ssse3 // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15) UPDATE_ALL_HASH // update output hashes @@ -1236,12 +1246,16 @@ void SHA1( int HASH[], int MESSAGE[] ) UPDATE_ALL_HASH // update output hashes .endm +#ifdef KERNEL #include +#else +#include +#endif .text .globl _SHA1Transform - .private_extern _SHA1Transform + //.private_extern _SHA1Transform _SHA1Transform: // detect SSSE3 and dispatch appropriate code branch diff --git a/libkern/crypto/md5.c b/libkern/crypto/md5.c deleted file mode 100644 index 46e005986..000000000 --- a/libkern/crypto/md5.c +++ /dev/null @@ -1,364 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * MD5.C - RSA Data Security, Inc., MD5 message-digest algorithm - * - * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All - * rights reserved. - * - * License to copy and use this software is granted provided that it - * is identified as the "RSA Data Security, Inc. MD5 Message-Digest - * Algorithm" in all material mentioning or referencing this software - * or this function. - * - * License is also granted to make and use derivative works provided - * that such works are identified as "derived from the RSA Data - * Security, Inc. MD5 Message-Digest Algorithm" in all material - * mentioning or referencing the derived work. - * - * RSA Data Security, Inc. 
makes no representations concerning either - * the merchantability of this software or the suitability of this - * software for any particular purpose. It is provided "as is" - * without express or implied warranty of any kind. - * - * These notices must be retained in any copies of any part of this - * documentation and/or software. - * - * This code is the same as the code published by RSA Inc. It has been - * edited for clarity and style only. - */ - -#include -#include -#include - -#define memset(x, y, z) bzero(x, z); -#define memcpy(x, y, z) bcopy(y, x, z) - -/* - * The digest algorithm interprets the input message as a sequence of 32-bit - * little-endian words. We must reverse bytes in each word on PPC and other - * big-endian platforms, but not on little-endian ones. When we can, we try - * to load each word at once. We don't quite care about alignment, since - * x86/x64 allows us to do 4-byte loads on non 4-byte aligned addresses, - * and on PPC we do 1-byte loads anyway. - * - * We could check against __LITLE_ENDIAN__ to generalize the 4-byte load - * optimization, but that might not tell us whether or not we need 4-byte - * aligned loads. Since we know that __i386__ and __x86_64__ are the two - * little-endian architectures that are not alignment-restrictive, we check - * explicitly against them below. Note that the byte-reversing code for - * big-endian will still work on little-endian, albeit much slower. - */ -#if defined(__i386__) || defined(__x86_64__) -#define FETCH_32(p) (*(const u_int32_t *)(p)) -#else -#define FETCH_32(p) \ - (((u_int32_t)*((const u_int8_t *)(p))) | \ - (((u_int32_t)*((const u_int8_t *)(p) + 1)) << 8) | \ - (((u_int32_t)*((const u_int8_t *)(p) + 2)) << 16) | \ - (((u_int32_t)*((const u_int8_t *)(p) + 3)) << 24)) -#endif /* __i386__ || __x86_64__ */ - -/* - * Encodes input (u_int32_t) into output (unsigned char). Assumes len is - * a multiple of 4. This is not compatible with memcpy(). - */ -static void -Encode(unsigned char *output, u_int32_t *input, unsigned int len) -{ - unsigned int i, j; - - for (i = 0, j = 0; j < len; i++, j += 4) { -#if defined(__i386__) || defined(__x86_64__) - *(u_int32_t *)(output + j) = input[i]; -#else - output[j] = input[i] & 0xff; - output[j + 1] = (input[i] >> 8) & 0xff; - output[j + 2] = (input[i] >> 16) & 0xff; - output[j + 3] = (input[i] >> 24) & 0xff; -#endif /* __i386__ || __x86_64__ */ - } -} - -static unsigned char PADDING[64] = { 0x80, /* zeros */ }; - -/* F, G, H and I are basic MD5 functions. */ -#define F(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) -#define G(x, y, z) ((((x) ^ (y)) & (z)) ^ (y)) -#define H(x, y, z) ((x) ^ (y) ^ (z)) -#define I(x, y, z) (((~(z)) | (x)) ^ (y)) - -/* ROTATE_LEFT rotates x left n bits. */ -#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) - -/* - * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. - * Rotation is separate from addition to prevent recomputation. 
- */ -#define FF(a, b, c, d, x, s, ac) { \ - (a) += F((b), (c), (d)) + (x) + (unsigned long long)(ac); \ - (a) = ROTATE_LEFT((a), (s)); \ - (a) += (b); \ -} - -#define GG(a, b, c, d, x, s, ac) { \ - (a) += G((b), (c), (d)) + (x) + (unsigned long long)(ac); \ - (a) = ROTATE_LEFT((a), (s)); \ - (a) += (b); \ -} - -#define HH(a, b, c, d, x, s, ac) { \ - (a) += H((b), (c), (d)) + (x) + (unsigned long long)(ac); \ - (a) = ROTATE_LEFT((a), (s)); \ - (a) += (b); \ -} - -#define II(a, b, c, d, x, s, ac) { \ - (a) += I((b), (c), (d)) + (x) + (unsigned long long)(ac); \ - (a) = ROTATE_LEFT((a), (s)); \ - (a) += (b); \ -} - -static void MD5Transform(u_int32_t, u_int32_t, u_int32_t, u_int32_t, - const u_int8_t [64], MD5_CTX *); - -/* - * MD5 initialization. Begins an MD5 operation, writing a new context. - */ -void -MD5Init(MD5_CTX *context) -{ - context->count[0] = context->count[1] = 0; - - /* Load magic initialization constants. */ - context->state[0] = 0x67452301UL; - context->state[1] = 0xefcdab89UL; - context->state[2] = 0x98badcfeUL; - context->state[3] = 0x10325476UL; -} - -/* - * MD5 block update operation. Continues an MD5 message-digest - * operation, processing another message block, and updating the - * context. - */ -void -MD5Update(MD5_CTX *context, const void *inpp, unsigned int inputLen) -{ - u_int32_t i, index, partLen; - const unsigned char *input = (const unsigned char *)inpp; - - /* Compute number of bytes mod 64 */ - index = (context->count[0] >> 3) & 0x3F; - - /* Update number of bits */ - if ((context->count[0] += (inputLen << 3)) < (inputLen << 3)) - context->count[1]++; - context->count[1] += (inputLen >> 29); - - partLen = 64 - index; - - /* Transform as many times as possible. */ - i = 0; - if (inputLen >= partLen) { - if (index != 0) { - memcpy(&context->buffer[index], input, partLen); - MD5Transform(context->state[0], context->state[1], - context->state[2], context->state[3], - context->buffer, context); - i = partLen; - } - - for (; i + 63 < inputLen; i += 64) - MD5Transform(context->state[0], context->state[1], - context->state[2], context->state[3], - &input[i], context); - - if (inputLen == i) - return; - - index = 0; - } - - /* Buffer remaining input */ - memcpy(&context->buffer[index], &input[i], inputLen - i); -} - -/* - * MD5 finalization. Ends an MD5 message-digest operation, writing the - * the message digest and zeroizing the context. - */ -void -MD5Final(unsigned char digest[MD5_DIGEST_LENGTH], MD5_CTX *context) -{ - unsigned char bits[8]; - u_int32_t index = (context->count[0] >> 3) & 0x3f; - - /* Save number of bits */ - Encode(bits, context->count, 8); - - /* Pad out to 56 mod 64. */ - MD5Update(context, PADDING, ((index < 56) ? 56 : 120) - index); - - /* Append length (before padding) */ - MD5Update(context, bits, 8); - - /* Store state in digest */ - Encode(digest, context->state, 16); - - /* Zeroize sensitive information. */ - memset(context, 0, sizeof (*context)); -} - -/* - * MD5 basic transformation. Transforms state based on block. 
- */ -static void -MD5Transform(u_int32_t a, u_int32_t b, u_int32_t c, u_int32_t d, - const u_int8_t block[64], MD5_CTX *context) -{ - /* Register (instead of array) is a win in most cases */ - register u_int32_t x0, x1, x2, x3, x4, x5, x6, x7; - register u_int32_t x8, x9, x10, x11, x12, x13, x14, x15; - - x15 = FETCH_32(block + 60); - x14 = FETCH_32(block + 56); - x13 = FETCH_32(block + 52); - x12 = FETCH_32(block + 48); - x11 = FETCH_32(block + 44); - x10 = FETCH_32(block + 40); - x9 = FETCH_32(block + 36); - x8 = FETCH_32(block + 32); - x7 = FETCH_32(block + 28); - x6 = FETCH_32(block + 24); - x5 = FETCH_32(block + 20); - x4 = FETCH_32(block + 16); - x3 = FETCH_32(block + 12); - x2 = FETCH_32(block + 8); - x1 = FETCH_32(block + 4); - x0 = FETCH_32(block + 0); - - /* Round 1 */ -#define S11 7 -#define S12 12 -#define S13 17 -#define S14 22 - FF(a, b, c, d, x0, S11, 0xd76aa478UL); /* 1 */ - FF(d, a, b, c, x1, S12, 0xe8c7b756UL); /* 2 */ - FF(c, d, a, b, x2, S13, 0x242070dbUL); /* 3 */ - FF(b, c, d, a, x3, S14, 0xc1bdceeeUL); /* 4 */ - FF(a, b, c, d, x4, S11, 0xf57c0fafUL); /* 5 */ - FF(d, a, b, c, x5, S12, 0x4787c62aUL); /* 6 */ - FF(c, d, a, b, x6, S13, 0xa8304613UL); /* 7 */ - FF(b, c, d, a, x7, S14, 0xfd469501UL); /* 8 */ - FF(a, b, c, d, x8, S11, 0x698098d8UL); /* 9 */ - FF(d, a, b, c, x9, S12, 0x8b44f7afUL); /* 10 */ - FF(c, d, a, b, x10, S13, 0xffff5bb1UL); /* 11 */ - FF(b, c, d, a, x11, S14, 0x895cd7beUL); /* 12 */ - FF(a, b, c, d, x12, S11, 0x6b901122UL); /* 13 */ - FF(d, a, b, c, x13, S12, 0xfd987193UL); /* 14 */ - FF(c, d, a, b, x14, S13, 0xa679438eUL); /* 15 */ - FF(b, c, d, a, x15, S14, 0x49b40821UL); /* 16 */ - - /* Round 2 */ -#define S21 5 -#define S22 9 -#define S23 14 -#define S24 20 - GG(a, b, c, d, x1, S21, 0xf61e2562UL); /* 17 */ - GG(d, a, b, c, x6, S22, 0xc040b340UL); /* 18 */ - GG(c, d, a, b, x11, S23, 0x265e5a51UL); /* 19 */ - GG(b, c, d, a, x0, S24, 0xe9b6c7aaUL); /* 20 */ - GG(a, b, c, d, x5, S21, 0xd62f105dUL); /* 21 */ - GG(d, a, b, c, x10, S22, 0x02441453UL); /* 22 */ - GG(c, d, a, b, x15, S23, 0xd8a1e681UL); /* 23 */ - GG(b, c, d, a, x4, S24, 0xe7d3fbc8UL); /* 24 */ - GG(a, b, c, d, x9, S21, 0x21e1cde6UL); /* 25 */ - GG(d, a, b, c, x14, S22, 0xc33707d6UL); /* 26 */ - GG(c, d, a, b, x3, S23, 0xf4d50d87UL); /* 27 */ - GG(b, c, d, a, x8, S24, 0x455a14edUL); /* 28 */ - GG(a, b, c, d, x13, S21, 0xa9e3e905UL); /* 29 */ - GG(d, a, b, c, x2, S22, 0xfcefa3f8UL); /* 30 */ - GG(c, d, a, b, x7, S23, 0x676f02d9UL); /* 31 */ - GG(b, c, d, a, x12, S24, 0x8d2a4c8aUL); /* 32 */ - - /* Round 3 */ -#define S31 4 -#define S32 11 -#define S33 16 -#define S34 23 - HH(a, b, c, d, x5, S31, 0xfffa3942UL); /* 33 */ - HH(d, a, b, c, x8, S32, 0x8771f681UL); /* 34 */ - HH(c, d, a, b, x11, S33, 0x6d9d6122UL); /* 35 */ - HH(b, c, d, a, x14, S34, 0xfde5380cUL); /* 36 */ - HH(a, b, c, d, x1, S31, 0xa4beea44UL); /* 37 */ - HH(d, a, b, c, x4, S32, 0x4bdecfa9UL); /* 38 */ - HH(c, d, a, b, x7, S33, 0xf6bb4b60UL); /* 39 */ - HH(b, c, d, a, x10, S34, 0xbebfbc70UL); /* 40 */ - HH(a, b, c, d, x13, S31, 0x289b7ec6UL); /* 41 */ - HH(d, a, b, c, x0, S32, 0xeaa127faUL); /* 42 */ - HH(c, d, a, b, x3, S33, 0xd4ef3085UL); /* 43 */ - HH(b, c, d, a, x6, S34, 0x04881d05UL); /* 44 */ - HH(a, b, c, d, x9, S31, 0xd9d4d039UL); /* 45 */ - HH(d, a, b, c, x12, S32, 0xe6db99e5UL); /* 46 */ - HH(c, d, a, b, x15, S33, 0x1fa27cf8UL); /* 47 */ - HH(b, c, d, a, x2, S34, 0xc4ac5665UL); /* 48 */ - - /* Round 4 */ -#define S41 6 -#define S42 10 -#define S43 15 -#define S44 21 - II(a, b, c, d, x0, S41, 0xf4292244UL); /* 49 */ 
-    II(d, a, b, c, x7, S42, 0x432aff97UL);  /* 50 */
-    II(c, d, a, b, x14, S43, 0xab9423a7UL); /* 51 */
-    II(b, c, d, a, x5, S44, 0xfc93a039UL);  /* 52 */
-    II(a, b, c, d, x12, S41, 0x655b59c3UL); /* 53 */
-    II(d, a, b, c, x3, S42, 0x8f0ccc92UL);  /* 54 */
-    II(c, d, a, b, x10, S43, 0xffeff47dUL); /* 55 */
-    II(b, c, d, a, x1, S44, 0x85845dd1UL);  /* 56 */
-    II(a, b, c, d, x8, S41, 0x6fa87e4fUL);  /* 57 */
-    II(d, a, b, c, x15, S42, 0xfe2ce6e0UL); /* 58 */
-    II(c, d, a, b, x6, S43, 0xa3014314UL);  /* 59 */
-    II(b, c, d, a, x13, S44, 0x4e0811a1UL); /* 60 */
-    II(a, b, c, d, x4, S41, 0xf7537e82UL);  /* 61 */
-    II(d, a, b, c, x11, S42, 0xbd3af235UL); /* 62 */
-    II(c, d, a, b, x2, S43, 0x2ad7d2bbUL);  /* 63 */
-    II(b, c, d, a, x9, S44, 0xeb86d391UL);  /* 64 */
-
-    context->state[0] += a;
-    context->state[1] += b;
-    context->state[2] += c;
-    context->state[3] += d;
-
-    /* Zeroize sensitive information. */
-    x15 = x14 = x13 = x12 = x11 = x10 = x9 = x8 = 0;
-    x7 = x6 = x5 = x4 = x3 = x2 = x1 = x0 = 0;
-}
diff --git a/libkern/crypto/register_crypto.c b/libkern/crypto/register_crypto.c
new file mode 100644
index 000000000..4f08156f7
--- /dev/null
+++ b/libkern/crypto/register_crypto.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+
+#include
+#include
+
+crypto_functions_t g_crypto_funcs = NULL;
+
+int register_crypto_functions(const crypto_functions_t funcs)
+{
+    if(g_crypto_funcs)
+        return -1;
+
+    g_crypto_funcs = funcs;
+
+    return 0;
+}
+
+
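register_crypto_functions() is the single seam between libkern and the corecrypto kext: the first registration installs the table, and every later attempt returns -1. A sketch of the registering side, assuming crypto_functions_t points to a writable struct crypto_functions and that corecrypto's ccmd5_di()/ccsha1_di() accessors supply the digest descriptors; the entry-point name is invented:

    /* Hypothetical corecrypto-kext start routine (illustrative only). */
    static struct crypto_functions kpis;    /* assumed struct tag behind crypto_functions_t */

    static kern_return_t corecrypto_kext_start(void)
    {
        kpis.ccmd5_di  = ccmd5_di();        /* digest descriptors */
        kpis.ccsha1_di = ccsha1_di();
        /* ... ccsha256/384/512 descriptors, DES/3DES mode descriptors,
           ccdigest_init_fn, ccdigest_update_fn, and so on ... */

        return (register_crypto_functions(&kpis) == 0) ? KERN_SUCCESS : KERN_FAILURE;
    }

diff --git a/libkern/crypto/sha1.c b/libkern/crypto/sha1.c
deleted file mode 100644
index b85cbec96..000000000
--- a/libkern/crypto/sha1.c
+++ /dev/null
@@ -1,515 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License.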
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * This SHA1 code is based on the basic framework from the reference - * implementation for MD5. That implementation is Copyright (C) - * 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved. - * - * License to copy and use this software is granted provided that it - * is identified as the "RSA Data Security, Inc. MD5 Message-Digest - * Algorithm" in all material mentioning or referencing this software - * or this function. - * - * License is also granted to make and use derivative works provided - * that such works are identified as "derived from the RSA Data - * Security, Inc. MD5 Message-Digest Algorithm" in all material - * mentioning or referencing the derived work. - * - * RSA Data Security, Inc. makes no representations concerning either - * the merchantability of this software or the suitability of this - * software for any particular purpose. It is provided "as is" - * without express or implied warranty of any kind. - * - * These notices must be retained in any copies of any part of this - * documentation and/or software. - * - * Based on the FIPS 180-1: Secure Hash Algorithm (SHA-1) available at - * http://www.itl.nist.gov/div897/pubs/fip180-1.htm - */ - -#include -#include -#include -#include -#define SHA1_TIMER 0 // change to nonzero to write timing stamps to profile sha1transform - -#if SHA1_TIMER -#include -#endif - -#define memset(x, y, z) bzero(x, z); -#define memcpy(x, y, z) bcopy(y, x, z) - -/* Internal mappings to the legacy sha1_ctxt structure. */ -#define state h.b32 -#define bcount c.b32 -#define buffer m.b8 - -/* - * The digest algorithm interprets the input message as a sequence of 32-bit - * big-endian words. We must reverse bytes in each word on x86/64 platforms, - * but not on big-endian ones such as PPC. For performance, we take advantage - * of the bswap instruction on x86/64 to perform byte-reversal. On PPC, we - * could do 4-byte load if the address is 4-byte aligned which should further - * improve the performance. But for code simplicity, we punt and do 1-byte - * loads instead. 
- */ -#if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__) -#define FETCH_32(p) ({ \ - register u_int32_t l = (u_int32_t)*((const u_int32_t *)(p)); \ - __asm__ __volatile__("bswap %0" : "=r" (l) : "0" (l)); \ - l; \ -}) -#else -#define FETCH_32(p) \ - (((u_int32_t)*((const u_int8_t *)(p) + 3)) | \ - (((u_int32_t)*((const u_int8_t *)(p) + 2)) << 8) | \ - (((u_int32_t)*((const u_int8_t *)(p) + 1)) << 16) | \ - (((u_int32_t)*((const u_int8_t *)(p))) << 24)) -#endif /* __i386__ || __x86_64__ */ - -/* - * Encodes input (u_int32_t) into output (unsigned char). Assumes len is - * a multiple of 4. This is not compatible with memcpy(). - */ -static void -Encode(unsigned char *output, u_int32_t *input, unsigned int len) -{ - unsigned int i, j; - - for (i = 0, j = 0; j < len; i++, j += 4) { - output[j + 3] = input[i] & 0xff; - output[j + 2] = (input[i] >> 8) & 0xff; - output[j + 1] = (input[i] >> 16) & 0xff; - output[j] = (input[i] >> 24) & 0xff; - } -} - -static unsigned char PADDING[64] = { 0x80, /* zeros */ }; - -/* Constants from FIPS 180-1 */ -#define K_00_19 0x5a827999UL -#define K_20_39 0x6ed9eba1UL -#define K_40_59 0x8f1bbcdcUL -#define K_60_79 0xca62c1d6UL - -/* F, G, H and I are basic SHA1 functions. */ -#define F(b, c, d) ((((c) ^ (d)) & (b)) ^ (d)) -#define G(b, c, d) ((b) ^ (c) ^ (d)) -#define H(b, c, d) (((b) & (c)) | (((b) | (c)) & (d))) - -/* ROTATE_LEFT rotates x left n bits. */ -#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) - -/* R, R1-R4 are macros used during each transformation round. */ -#define R(f, k, v, w, x, y, z, i) { \ - (v) = ROTATE_LEFT(w, 5) + f(x, y, z) + (v) + (i) + (k); \ - (x) = ROTATE_LEFT(x, 30); \ -} - -#define R1(v, w, x, y, z, i) R(F, K_00_19, v, w, x, y, z, i) -#define R2(v, w, x, y, z, i) R(G, K_20_39, v, w, x, y, z, i) -#define R3(v, w, x, y, z, i) R(H, K_40_59, v, w, x, y, z, i) -#define R4(v, w, x, y, z, i) R(G, K_60_79, v, w, x, y, z, i) - -/* WUPDATE represents Wt variable that gets updated for steps 16-79 */ -#define WUPDATE(p, q, r, s) { \ - (p) = ((q) ^ (r) ^ (s) ^ (p)); \ - (p) = ROTATE_LEFT(p, 1); \ -} - -#if (defined (__x86_64__) || defined (__i386__)) -extern void SHA1Transform(SHA1_CTX *, const u_int8_t *, u_int32_t Nblocks); -#else -static void SHA1Transform(SHA1_CTX *, const u_int8_t *); -#endif - -void _SHA1Update(SHA1_CTX *context, const void *inpp, size_t inputLen); - -void SHA1Final_r(SHA1_CTX *, void *); - -typedef kern_return_t (*InKernelPerformSHA1Func)(void *ref, const void *data, size_t dataLen, u_int32_t *inHash, u_int32_t options, u_int32_t *outHash, Boolean usePhysicalAddress); -void sha1_hardware_hook(Boolean option, InKernelPerformSHA1Func func, void *ref); -static void *SHA1Ref; -InKernelPerformSHA1Func performSHA1WithinKernelOnly; -#define SHA1_USE_HARDWARE_THRESHOLD 2048 //bytes - - -/* - * SHA1 initialization. Begins a SHA1 operation, writing a new context. - */ -void -SHA1Init(SHA1_CTX *context) -{ - context->bcount[0] = context->bcount[1] = 0; - context->count = 0; - - /* Load magic initialization constants. */ - context->state[0] = 0x67452301UL; - context->state[1] = 0xefcdab89UL; - context->state[2] = 0x98badcfeUL; - context->state[3] = 0x10325476UL; - context->state[4] = 0xc3d2e1f0UL; -} - -/* - * SHA1 block update operation. Continues a SHA1 message-digest - * operation, processing another message block, and updating the - * context. 
- */ -void -_SHA1Update(SHA1_CTX *context, const void *inpp, size_t inputLen) -{ - u_int32_t i, index, partLen; - const unsigned char *input = (const unsigned char *)inpp; - - if (inputLen == 0) - return; - - /* Compute number of bytes mod 64 */ - index = (context->bcount[1] >> 3) & 0x3F; - - /* Update number of bits */ - if ((context->bcount[1] += (inputLen << 3)) < (inputLen << 3)) - context->bcount[0]++; - context->bcount[0] += (inputLen >> 29); - - partLen = 64 - index; - - /* Transform as many times as possible. */ - i = 0; - if (inputLen >= partLen) { - if (index != 0) { - memcpy(&context->buffer[index], input, partLen); -#if (defined (__x86_64__) || defined (__i386__)) - SHA1Transform(context, context->buffer, 1); -#else - SHA1Transform(context, context->buffer); -#endif - i = partLen; - } - -#if SHA1_TIMER - KERNEL_DEBUG_CONSTANT(0xaa800004 | DBG_FUNC_START, 0, 0, 0, 0, 0); -#endif -#if (defined (__x86_64__) || defined (__i386__)) - { - int kk = (inputLen-i)>>6; - if (kk>0) { - SHA1Transform(context, &input[i], kk); - i += (kk<<6); - } - } -#else - for (; i + 63 < inputLen; i += 64) - SHA1Transform(context, &input[i]); -#endif - - if (inputLen == i) { -#if SHA1_TIMER - KERNEL_DEBUG_CONSTANT(0xaa800004 | DBG_FUNC_END, 0, 0, 0, 0, 0); -#endif - return; - } - - index = 0; - } - - /* Buffer remaining input */ - memcpy(&context->buffer[index], &input[i], inputLen - i); -} - - - - -/* - * This function is called by the SHA1 hardware kext during its init. - * This will register the function to call to perform SHA1 using hardware. - */ -void sha1_hardware_hook(Boolean option, InKernelPerformSHA1Func func, void *ref) -{ - if(option) { - // Establish the hook. The hardware is ready. - OSCompareAndSwapPtr((void*)NULL, (void*)ref, (void * volatile*)&SHA1Ref); - - if(!OSCompareAndSwapPtr((void *)NULL, (void *)func, (void * volatile *)&performSHA1WithinKernelOnly)) { - panic("sha1_hardware_hook: Called twice.. Should never happen\n"); - } - } - else { - // The hardware is going away. Tear down the hook. - performSHA1WithinKernelOnly = NULL; - SHA1Ref = NULL; - } -} - -static u_int32_t SHA1UpdateWithHardware(SHA1_CTX *context, const unsigned char *data, size_t dataLen, Boolean usePhysicalAddress) -{ - u_int32_t *inHashBuffer = context->state; - u_int32_t options = 0; - int result; - - result = performSHA1WithinKernelOnly(SHA1Ref, data, dataLen, inHashBuffer, options, inHashBuffer, usePhysicalAddress); - if(result != KERN_SUCCESS) { - //The hardware failed to hash for some reason. Fall back to software. - return 0; - } - - //Update the context with the total length. - /* Update number of bits */ - if ((context->bcount[1] += (dataLen << 3)) < (dataLen << 3)) - context->bcount[0]++; - context->bcount[0] += (dataLen >> 29); - return dataLen; -} - -/* - * This is function is only called in from the pagefault path or from page_copy(). - * So we assume that we can safely convert the virtual address to the physical address and use it. - * Assumptions: The passed in address(inpp) is a kernel virtual address - * and a physical page has been faulted in. - * The inputLen passed in should always be less than or equal to a page size (4096) - * and inpp should be on a page boundary. - * "performSHA1WithinKernelOnly" is initialized only when the hardware driver exists and is ready. - */ -void SHA1UpdateUsePhysicalAddress(SHA1_CTX *context, const void *inpp, size_t inputLen) -{ - Boolean usePhysicalAddress = TRUE; - if((inputLen == PAGE_SIZE) && performSHA1WithinKernelOnly) { // If hardware exists and is ready. 
- if(SHA1UpdateWithHardware(context, (const unsigned char *)inpp, inputLen, usePhysicalAddress)) - return; - //else for some reason the hardware failed.. - //fall through to software and try the hash in software. - } - //Use the software implementation since the hardware is absent or - // has not been initialized yet or inputLen != PAGE_SIZE. - _SHA1Update(context, inpp, inputLen); -} - -/* - * A wrapper around _SHA1Update() to pick between software or hardware based SHA1. - * - */ -void SHA1Update(SHA1_CTX *context, const void *inpp, size_t inputLen) -{ - const unsigned char *input = (const unsigned char *)inpp; - Boolean usePhysicalAddress = FALSE; - u_int32_t index; - - if((inputLen > SHA1_USE_HARDWARE_THRESHOLD) && performSHA1WithinKernelOnly) { - index = (context->bcount[1] >> 3) & 0x3F; - if(index != 0) { //bytes left in the context. Handle them first. - u_int32_t partLen = 64 - index; - memcpy(&context->buffer[index], input, partLen); - _SHA1Update(context, inpp, inputLen); - inputLen -= partLen; - input += partLen; - } - - u_int32_t lenForHardware = inputLen & (~0x3F); //multiple of 64 - u_int32_t bytesHashed = 0; - bytesHashed = SHA1UpdateWithHardware(context, input, lenForHardware, usePhysicalAddress); - - inputLen -= bytesHashed; - input += bytesHashed; - } - - //Fall through to the software implementation. - _SHA1Update(context, input, inputLen); -} - -/* - * For backwards compatibility, sha1_result symbol is mapped to this - * routine since it's equivalent to SHA1Final with reversed parameters. - */ -void -SHA1Final_r(SHA1_CTX *context, void *digest) -{ - SHA1Final(digest, context); -} - -/* - * SHA1 finalization. Ends an SHA1 message-digest operation, writing the - * the message digest and zeroizing the context. - */ -void -SHA1Final(void *digest, SHA1_CTX *context) -{ - unsigned char bits[8]; - u_int32_t index = (context->bcount[1] >> 3) & 0x3f; - - /* Save number of bits */ - Encode(bits, context->bcount, 8); - - /* Pad out to 56 mod 64. */ - SHA1Update(context, PADDING, ((index < 56) ? 56 : 120) - index); - - /* Append length (before padding) */ - SHA1Update(context, bits, 8); - - /* Store state in digest */ - Encode(digest, context->state, 20); - - /* Zeroize sensitive information. */ - memset(context, 0, sizeof (*context)); -} - -/* - * SHA1 basic transformation. Transforms state based on block. 
- */ -#if !(defined (__x86_64__) || defined (__i386__)) -static void -SHA1Transform(SHA1_CTX *context, const u_int8_t block[64]) -{ - /* Register (instead of array) is a win in most cases */ - register u_int32_t a, b, c, d, e; - register u_int32_t w0, w1, w2, w3, w4, w5, w6, w7; - register u_int32_t w8, w9, w10, w11, w12, w13, w14, w15; - - a = context->state[0]; - b = context->state[1]; - c = context->state[2]; - d = context->state[3]; - e = context->state[4]; - - w15 = FETCH_32(block + 60); - w14 = FETCH_32(block + 56); - w13 = FETCH_32(block + 52); - w12 = FETCH_32(block + 48); - w11 = FETCH_32(block + 44); - w10 = FETCH_32(block + 40); - w9 = FETCH_32(block + 36); - w8 = FETCH_32(block + 32); - w7 = FETCH_32(block + 28); - w6 = FETCH_32(block + 24); - w5 = FETCH_32(block + 20); - w4 = FETCH_32(block + 16); - w3 = FETCH_32(block + 12); - w2 = FETCH_32(block + 8); - w1 = FETCH_32(block + 4); - w0 = FETCH_32(block + 0); - - /* Round 1 */ - R1(e, a, b, c, d, w0); /* 0 */ - R1(d, e, a, b, c, w1); /* 1 */ - R1(c, d, e, a, b, w2); /* 2 */ - R1(b, c, d, e, a, w3); /* 3 */ - R1(a, b, c, d, e, w4); /* 4 */ - R1(e, a, b, c, d, w5); /* 5 */ - R1(d, e, a, b, c, w6); /* 6 */ - R1(c, d, e, a, b, w7); /* 7 */ - R1(b, c, d, e, a, w8); /* 8 */ - R1(a, b, c, d, e, w9); /* 9 */ - R1(e, a, b, c, d, w10); /* 10 */ - R1(d, e, a, b, c, w11); /* 11 */ - R1(c, d, e, a, b, w12); /* 12 */ - R1(b, c, d, e, a, w13); /* 13 */ - R1(a, b, c, d, e, w14); /* 14 */ - R1(e, a, b, c, d, w15); /* 15 */ - WUPDATE( w0, w13, w8, w2); R1(d, e, a, b, c, w0); /* 16 */ - WUPDATE( w1, w14, w9, w3); R1(c, d, e, a, b, w1); /* 17 */ - WUPDATE( w2, w15, w10, w4); R1(b, c, d, e, a, w2); /* 18 */ - WUPDATE( w3, w0, w11, w5); R1(a, b, c, d, e, w3); /* 19 */ - - /* Round 2 */ - WUPDATE( w4, w1, w12, w6); R2(e, a, b, c, d, w4); /* 20 */ - WUPDATE( w5, w2, w13, w7); R2(d, e, a, b, c, w5); /* 21 */ - WUPDATE( w6, w3, w14, w8); R2(c, d, e, a, b, w6); /* 22 */ - WUPDATE( w7, w4, w15, w9); R2(b, c, d, e, a, w7); /* 23 */ - WUPDATE( w8, w5, w0, w10); R2(a, b, c, d, e, w8); /* 24 */ - WUPDATE( w9, w6, w1, w11); R2(e, a, b, c, d, w9); /* 25 */ - WUPDATE(w10, w7, w2, w12); R2(d, e, a, b, c, w10); /* 26 */ - WUPDATE(w11, w8, w3, w13); R2(c, d, e, a, b, w11); /* 27 */ - WUPDATE(w12, w9, w4, w14); R2(b, c, d, e, a, w12); /* 28 */ - WUPDATE(w13, w10, w5, w15); R2(a, b, c, d, e, w13); /* 29 */ - WUPDATE(w14, w11, w6, w0); R2(e, a, b, c, d, w14); /* 30 */ - WUPDATE(w15, w12, w7, w1); R2(d, e, a, b, c, w15); /* 31 */ - WUPDATE( w0, w13, w8, w2); R2(c, d, e, a, b, w0); /* 32 */ - WUPDATE( w1, w14, w9, w3); R2(b, c, d, e, a, w1); /* 33 */ - WUPDATE( w2, w15, w10, w4); R2(a, b, c, d, e, w2); /* 34 */ - WUPDATE( w3, w0, w11, w5); R2(e, a, b, c, d, w3); /* 35 */ - WUPDATE( w4, w1, w12, w6); R2(d, e, a, b, c, w4); /* 36 */ - WUPDATE( w5, w2, w13, w7); R2(c, d, e, a, b, w5); /* 37 */ - WUPDATE( w6, w3, w14, w8); R2(b, c, d, e, a, w6); /* 38 */ - WUPDATE( w7, w4, w15, w9); R2(a, b, c, d, e, w7); /* 39 */ - - /* Round 3 */ - WUPDATE( w8, w5, w0, w10); R3(e, a, b, c, d, w8); /* 40 */ - WUPDATE( w9, w6, w1, w11); R3(d, e, a, b, c, w9); /* 41 */ - WUPDATE(w10, w7, w2, w12); R3(c, d, e, a, b, w10); /* 42 */ - WUPDATE(w11, w8, w3, w13); R3(b, c, d, e, a, w11); /* 43 */ - WUPDATE(w12, w9, w4, w14); R3(a, b, c, d, e, w12); /* 44 */ - WUPDATE(w13, w10, w5, w15); R3(e, a, b, c, d, w13); /* 45 */ - WUPDATE(w14, w11, w6, w0); R3(d, e, a, b, c, w14); /* 46 */ - WUPDATE(w15, w12, w7, w1); R3(c, d, e, a, b, w15); /* 47 */ - WUPDATE( w0, w13, w8, w2); R3(b, c, d, e, a, w0); /* 48 
*/ - WUPDATE( w1, w14, w9, w3); R3(a, b, c, d, e, w1); /* 49 */ - WUPDATE( w2, w15, w10, w4); R3(e, a, b, c, d, w2); /* 50 */ - WUPDATE( w3, w0, w11, w5); R3(d, e, a, b, c, w3); /* 51 */ - WUPDATE( w4, w1, w12, w6); R3(c, d, e, a, b, w4); /* 52 */ - WUPDATE( w5, w2, w13, w7); R3(b, c, d, e, a, w5); /* 53 */ - WUPDATE( w6, w3, w14, w8); R3(a, b, c, d, e, w6); /* 54 */ - WUPDATE( w7, w4, w15, w9); R3(e, a, b, c, d, w7); /* 55 */ - WUPDATE( w8, w5, w0, w10); R3(d, e, a, b, c, w8); /* 56 */ - WUPDATE( w9, w6, w1, w11); R3(c, d, e, a, b, w9); /* 57 */ - WUPDATE(w10, w7, w2, w12); R3(b, c, d, e, a, w10); /* 58 */ - WUPDATE(w11, w8, w3, w13); R3(a, b, c, d, e, w11); /* 59 */ - - WUPDATE(w12, w9, w4, w14); R4(e, a, b, c, d, w12); /* 60 */ - WUPDATE(w13, w10, w5, w15); R4(d, e, a, b, c, w13); /* 61 */ - WUPDATE(w14, w11, w6, w0); R4(c, d, e, a, b, w14); /* 62 */ - WUPDATE(w15, w12, w7, w1); R4(b, c, d, e, a, w15); /* 63 */ - WUPDATE( w0, w13, w8, w2); R4(a, b, c, d, e, w0); /* 64 */ - WUPDATE( w1, w14, w9, w3); R4(e, a, b, c, d, w1); /* 65 */ - WUPDATE( w2, w15, w10, w4); R4(d, e, a, b, c, w2); /* 66 */ - WUPDATE( w3, w0, w11, w5); R4(c, d, e, a, b, w3); /* 67 */ - WUPDATE( w4, w1, w12, w6); R4(b, c, d, e, a, w4); /* 68 */ - WUPDATE( w5, w2, w13, w7); R4(a, b, c, d, e, w5); /* 69 */ - WUPDATE( w6, w3, w14, w8); R4(e, a, b, c, d, w6); /* 70 */ - WUPDATE( w7, w4, w15, w9); R4(d, e, a, b, c, w7); /* 71 */ - WUPDATE( w8, w5, w0, w10); R4(c, d, e, a, b, w8); /* 72 */ - WUPDATE( w9, w6, w1, w11); R4(b, c, d, e, a, w9); /* 73 */ - WUPDATE(w10, w7, w2, w12); R4(a, b, c, d, e, w10); /* 74 */ - WUPDATE(w11, w8, w3, w13); R4(e, a, b, c, d, w11); /* 75 */ - WUPDATE(w12, w9, w4, w14); R4(d, e, a, b, c, w12); /* 76 */ - WUPDATE(w13, w10, w5, w15); R4(c, d, e, a, b, w13); /* 77 */ - WUPDATE(w14, w11, w6, w0); R4(b, c, d, e, a, w14); /* 78 */ - WUPDATE(w15, w12, w7, w1); R4(a, b, c, d, e, w15); /* 79 */ - - context->state[0] += a; - context->state[1] += b; - context->state[2] += c; - context->state[3] += d; - context->state[4] += e; - - /* Zeroize sensitive information. */ - w15 = w14 = w13 = w12 = w11 = w10 = w9 = w8 = 0; - w7 = w6 = w5 = w4 = w3 = w2 = w1 = w0 = 0; -} -#endif diff --git a/libkern/gen/OSAtomicOperations.c b/libkern/gen/OSAtomicOperations.c index 3484791d5..cfb15c5c1 100644 --- a/libkern/gen/OSAtomicOperations.c +++ b/libkern/gen/OSAtomicOperations.c @@ -54,48 +54,10 @@ enum { * Like standards, there are a lot of atomic ops to choose from! 
*/ -#if !defined(__i386__) && !defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) /* Implemented in assembly for i386 and x86_64 */ -#undef OSAddAtomic -SInt32 -OSAddAtomic(SInt32 amount, volatile SInt32 * value) -{ - SInt32 oldValue; - SInt32 newValue; - - do { - oldValue = *value; - newValue = oldValue + amount; - } while (!OSCompareAndSwap((UInt32)oldValue, - (UInt32)newValue, - (volatile UInt32 *) value)); - return oldValue; -} - -#undef OSAddAtomicLong -long -OSAddAtomicLong(long theAmount, volatile long *address) -{ -#if __LP64__ -#error Unimplemented -#else - return (long)OSAddAtomic((SInt32)theAmount, address); -#endif -} - -/* Implemented as an assembly alias for i386 */ -#undef OSCompareAndSwapPtr -Boolean OSCompareAndSwapPtr(void *oldValue, void *newValue, - void * volatile *address) -{ -#if __LP64__ - return OSCompareAndSwap64((UInt64)oldValue, (UInt64)newValue, - (volatile UInt64 *)address); #else - return OSCompareAndSwap((UInt32)oldValue, (UInt32)newValue, - (volatile UInt32 *)address); -#endif -} +#error Unsupported arch #endif #undef OSIncrementAtomic diff --git a/libkern/gen/OSDebug.cpp b/libkern/gen/OSDebug.cpp index 3e67cfff8..14ee32621 100644 --- a/libkern/gen/OSDebug.cpp +++ b/libkern/gen/OSDebug.cpp @@ -286,4 +286,3 @@ pad: #endif return frame; } - diff --git a/libkern/kernel_mach_header.c b/libkern/kernel_mach_header.c index 0edc6b64d..ebef5b5ab 100644 --- a/libkern/kernel_mach_header.c +++ b/libkern/kernel_mach_header.c @@ -59,7 +59,7 @@ getlastaddr(void) sgp = (kernel_segment_command_t *) ((uintptr_t)header + sizeof(kernel_mach_header_t)); for (i = 0; i < header->ncmds; i++){ - if ( sgp->cmd == LC_SEGMENT_KERNEL) { + if (sgp->cmd == LC_SEGMENT_KERNEL) { if (sgp->vmaddr + sgp->vmsize > last_addr) last_addr = sgp->vmaddr + sgp->vmsize; } @@ -69,32 +69,47 @@ getlastaddr(void) } /* - * Find the UUID load command in the Mach-O headers, and return - * the address of the UUID blob and size in "*size". If the - * Mach-O image is missing a UUID, NULL is returned. + * Find the specified load command in the Mach-O headers, and return + * the command. If there is no such load command, NULL is returned. */ void * -getuuidfromheader(kernel_mach_header_t *mhp, unsigned long *size) -{ - struct uuid_command *uuidp; +getcommandfromheader(kernel_mach_header_t *mhp, uint32_t cmd) { + struct load_command *lcp; unsigned long i; - uuidp = (struct uuid_command *) - ((uintptr_t)mhp + sizeof(kernel_mach_header_t)); + lcp = (struct load_command *) (mhp + 1); for(i = 0; i < mhp->ncmds; i++){ - if(uuidp->cmd == LC_UUID) { - if (size) - *size = sizeof(uuidp->uuid); - - return (void *)uuidp->uuid; + if(lcp->cmd == cmd) { + return (void *)lcp; } - uuidp = (struct uuid_command *)((uintptr_t)uuidp + uuidp->cmdsize); + lcp = (struct load_command *)((uintptr_t)lcp + lcp->cmdsize); } return NULL; } +/* + * Find the UUID load command in the Mach-O headers, and return + * the address of the UUID blob and size in "*size". If the + * Mach-O image is missing a UUID, NULL is returned. + */ +void * +getuuidfromheader(kernel_mach_header_t *mhp, unsigned long *size) +{ + struct uuid_command *cmd = (struct uuid_command *) + getcommandfromheader(mhp, LC_UUID); + + if (cmd != NULL) { + if (size) { + *size = sizeof(cmd->uuid); + } + return cmd->uuid; + } + + return NULL; +} + /* * This routine returns the a pointer to the data for the named section in the * named segment if it exist in the mach header passed to it. 
Also it returns @@ -323,68 +338,3 @@ nextsect(kernel_segment_command_t *sgp, kernel_section_t *sp) return sp+1; } - -#ifdef MACH_KDB -/* - * This routine returns the section command for the symbol table in the - * named segment for the mach_header pointer passed to it if it exist. - * Otherwise it returns zero. - */ -static struct symtab_command * -getsectcmdsymtabfromheader( - kernel_mach_header_t *mhp) -{ - kernel_segment_command_t *sgp; - unsigned long i; - - sgp = (kernel_segment_command_t *) - ((uintptr_t)mhp + sizeof(kernel_mach_header_t)); - for(i = 0; i < mhp->ncmds; i++){ - if(sgp->cmd == LC_SYMTAB) - return((struct symtab_command *)sgp); - sgp = (kernel_segment_command_t *)((uintptr_t)sgp + sgp->cmdsize); - } - return((struct symtab_command *)NULL); -} - -boolean_t getsymtab(kernel_mach_header_t *header, - vm_offset_t *symtab, - int *nsyms, - vm_offset_t *strtab, - vm_size_t *strtabsize) -{ - kernel_segment_command_t *seglink_cmd; - struct symtab_command *symtab_cmd; - - seglink_cmd = NULL; - - if((header->magic != MH_MAGIC) - && (header->magic != MH_MAGIC_64)) { /* Check if this is a valid header format */ - return (FALSE); /* Bye y'all... */ - } - - seglink_cmd = getsegbynamefromheader(header,"__LINKEDIT"); - if (seglink_cmd == NULL) { - return(FALSE); - } - - symtab_cmd = NULL; - symtab_cmd = getsectcmdsymtabfromheader(header); - if (symtab_cmd == NULL) - return(FALSE); - - *nsyms = symtab_cmd->nsyms; - if(symtab_cmd->nsyms == 0) return (FALSE); /* No symbols */ - - *strtabsize = symtab_cmd->strsize; - if(symtab_cmd->strsize == 0) return (FALSE); /* Symbol length is 0 */ - - *symtab = seglink_cmd->vmaddr + symtab_cmd->symoff - - seglink_cmd->fileoff; - - *strtab = seglink_cmd->vmaddr + symtab_cmd->stroff - - seglink_cmd->fileoff; - - return(TRUE); -} -#endif diff --git a/libkern/kmod/Makefile b/libkern/kmod/Makefile deleted file mode 100644 index 8ffce509e..000000000 --- a/libkern/kmod/Makefile +++ /dev/null @@ -1,38 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -ifneq ($(MACHINE_CONFIG), DEFAULT) -export COMPOBJROOT=$(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)_$(MACHINE_CONFIG)/$(COMPONENT) -else -export COMPOBJROOT=$(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT) -endif - -do_all: - $(_v)($(MKDIR) $(COMPOBJROOT)/kmod; \ - cd $(COMPOBJROOT)/kmod; \ - ${MAKE} MAKEFILES=$(SOURCE)/Makefile.kmod \ - TARGET=$(TARGET) \ - do_build_all \ - ) - -do_build_all: do_all - -do_install: - @echo "[ $(SOURCE) ] make do_install $(COMPONENT) $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)"; \ - ($(MKDIR) $(COMPOBJROOT)/kmod; \ - cd $(COMPOBJROOT)/kmod; \ - ${MAKE} MAKEFILES=$(SOURCE)/Makefile.kmod \ - TARGET=$(TARGET) \ - do_build_install \ - ) - -do_build_install: do_install - -include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/libkern/kmod/Makefile.kmod b/libkern/kmod/Makefile.kmod deleted file mode 100644 index 62ffd893b..000000000 --- a/libkern/kmod/Makefile.kmod +++ /dev/null @@ -1,111 +0,0 @@ -# -# Kernel Module Library code makefile -# - -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -INSTALL_DIR = $(DSTROOT)/usr/lib -KMOD_NAME = libkmod 
-KMODCPP_NAME = libkmodc++ -LIB_INSTALL_FLAGS = -p -m 444 - -# -mkernel implies -mlong-branch/-mlong-calls/-mno-red-zone as needed for -# code linked into kexts -# -fno-stack-protector is necessary for the kernel, but not for kexts -CFLAGS_KMOD = $(filter-out -O0 -O1 -O2 -O3 -O4 -Os -Oz -freorder-blocks -flto -fno-stack-protector,$(CFLAGS)) \ - -Os -mkernel -Wall - -ifneq ($(MACHINE_CONFIG), DEFAULT) -COMPOBJROOT = $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)_$(MACHINE_CONFIG)/$(COMPONENT)/kmod -INSTOBJROOT = $(OBJROOT)/$(INSTALL_TYPE)_$(ARCH_CONFIG)_$(MACHINE_CONFIG)/$(COMPONENT)/kmod -else -COMPOBJROOT = $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT)/kmod -INSTOBJROOT = $(OBJROOT)/$(INSTALL_TYPE)_$(ARCH_CONFIG)/$(COMPONENT)/kmod -endif - - -KMOD_CFILES = c_start.c c_stop.c -KMODCPP_CFILES = cplus_start.c cplus_stop.c - -KMOD_OFILES = $(KMOD_CFILES:.c=.o) -KMODCPP_OFILES = $(KMODCPP_CFILES:.c=.o) - -ALL_OFILES = $(KMOD_OFILES) $(KMODCPP_OFILES) - -$(ALL_OFILES): %.o : %.c - @echo LIBKMOD_CC $@ - $(_v)${LIBKMOD_CC} -c ${CFLAGS_KMOD} ${${join $@,_CFLAGS}} ${INCFLAGS} ${${join $@,_INCFLAGS}} -o $(COMPOBJROOT)/$(*F).o $< - -$(COMPOBJROOT)/$(KMOD_NAME).a: $(KMOD_OFILES) - @echo LIBTOOL $(notdir $@) - $(_v)$(LIBTOOL) -static -o $@ $^ $(_vstdout) 2>&1 - -$(COMPOBJROOT)/$(KMODCPP_NAME).a: $(KMODCPP_OFILES) - @echo LIBTOOL $(notdir $@) - $(_v)$(LIBTOOL) -static -o $@ $^ $(_vstdout) 2>&1 - -do_build_all: $(COMPOBJROOT)/$(KMOD_NAME).a $(COMPOBJROOT)/$(KMODCPP_NAME).a - -$(INSTALL_DIR)/%.a: $(INSTOBJROOT)/%.a - @echo Installing $< in $@; - $(_v)$(RM) $@ || true; \ - ${MKDIR} $(INSTALL_DIR) $(SYMROOT); \ - if [ $(MACHINE_CONFIG) = DEFAULT ]; then \ - allarchs=""; \ - for onearch in $(INSTALL_ARCHS); do \ - if [ $${onearch} = ARM ] ; then \ - archdir=$(OBJROOT)/$(KERNEL_CONFIG)_$${onearch}_$(DEFAULT_ARM_MACHINE_CONFIG)/$(COMPONENT); \ - else \ - archdir=$(OBJROOT)/$(KERNEL_CONFIG)_$${onearch}/$(COMPONENT); \ - fi; \ - if [ -e $${archdir}/kmod/$(*F).a ]; then \ - allarchs="$${allarchs} $${archdir}/kmod/$(*F).a"; \ - fi; \ - done; \ - cmd="$(LIPO) $${allarchs} -create -output $(SYMROOT)/$(*F).a"; \ - echo $$cmd; eval $$cmd; \ - else \ - my_counter=1; \ - my_innercounter=1; \ - outputfile=$(SYMROOT)/$(*F).a; \ - for my_config in $(TARGET_CONFIGS_UC); do \ - if [ $${my_counter} -eq 1 ]; then \ - my_counter=2; \ - my_kconfig=$${my_config}; \ - elif [ $${my_counter} -eq 2 ]; then \ - my_counter=3; \ - my_aconfig=$${my_config}; \ - else \ - my_counter=1; \ - if [ $${my_aconfig} = ARM ] ; then \ - if [ $${my_config} = DEFAULT ] ; then \ - my_config=$(DEFAULT_ARM_MACHINE_CONFIG); \ - fi; \ - fi; \ - inputfile=$(OBJROOT)/$${my_kconfig}_$${my_aconfig}_$${my_config}/$(COMPONENT)/kmod/$(*F).a; \ - if [ -e $${inputfile} ]; then \ - if [ $${my_innercounter} -eq 1 ]; then \ - my_innercounter=2; \ - cmd="$(LIPO) -create $${inputfile} -o $${outputfile}"; \ - else \ - cmd="$(LIPO) -create $${outputfile} $${inputfile} -o $${outputfile} || true"; \ - fi; \ - echo $$cmd; eval $$cmd; \ - fi; \ - fi; \ - done; \ - fi; \ - cmd="$(INSTALL) $(LIB_INSTALL_FLAGS) $(SYMROOT)/$(*F).a $@"; \ - echo $$cmd; eval $$cmd - -do_build_install: $(INSTALL_DIR)/$(KMOD_NAME).a $(INSTALL_DIR)/$(KMODCPP_NAME).a - -# include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/libkern/kmod/libkmod.xcodeproj/project.pbxproj b/libkern/kmod/libkmod.xcodeproj/project.pbxproj new file mode 100644 index 000000000..39a81212d --- /dev/null +++ b/libkern/kmod/libkmod.xcodeproj/project.pbxproj @@ -0,0 +1,482 @@ +// !$*UTF8*$! 
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 46; + objects = { + +/* Begin PBXAggregateTarget section */ + C61E2D9512F3647000FC9BCA /* All */ = { + isa = PBXAggregateTarget; + buildConfigurationList = C61E2D9612F3647000FC9BCA /* Build configuration list for PBXAggregateTarget "All" */; + buildPhases = ( + ); + dependencies = ( + C61E2D9912F364A800FC9BCA /* PBXTargetDependency */, + C61E2DA212F3650100FC9BCA /* PBXTargetDependency */, + ); + name = All; + productName = All; + }; +/* End PBXAggregateTarget section */ + +/* Begin PBXBuildFile section */ + C61E2D8012F360A200FC9BCA /* libkmodtest.h in Headers */ = {isa = PBXBuildFile; fileRef = C61E2D7F12F360A200FC9BCA /* libkmodtest.h */; }; + C61E2D8212F360A200FC9BCA /* libkmodtest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C61E2D8112F360A200FC9BCA /* libkmodtest.cpp */; }; + C61E2DAC12F3661900FC9BCA /* README in Resources */ = {isa = PBXBuildFile; fileRef = C61E2DA712F3661900FC9BCA /* README */; }; + C61E2DAD12F3672F00FC9BCA /* c_start.c in Sources */ = {isa = PBXBuildFile; fileRef = C61E2DA312F3661900FC9BCA /* c_start.c */; }; + C61E2DAE12F3672F00FC9BCA /* c_stop.c in Sources */ = {isa = PBXBuildFile; fileRef = C61E2DA412F3661900FC9BCA /* c_stop.c */; }; + C61E2DAF12F3673A00FC9BCA /* cplus_start.c in Sources */ = {isa = PBXBuildFile; fileRef = C61E2DA512F3661900FC9BCA /* cplus_start.c */; }; + C61E2DB012F3673A00FC9BCA /* cplus_stop.c in Sources */ = {isa = PBXBuildFile; fileRef = C61E2DA612F3661900FC9BCA /* cplus_stop.c */; }; +/* End PBXBuildFile section */ + +/* Begin PBXContainerItemProxy section */ + C61E2D9812F364A800FC9BCA /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = C61E2D6012F3605700FC9BCA /* Project object */; + proxyType = 1; + remoteGlobalIDString = C61E2D9112F3642100FC9BCA; + remoteInfo = libkmod; + }; + C61E2DA112F3650100FC9BCA /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = C61E2D6012F3605700FC9BCA /* Project object */; + proxyType = 1; + remoteGlobalIDString = C61E2D9D12F364C100FC9BCA; + remoteInfo = "libkmodc++"; + }; + C61E2DB112F36AC700FC9BCA /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = C61E2D6012F3605700FC9BCA /* Project object */; + proxyType = 1; + remoteGlobalIDString = C61E2D9112F3642100FC9BCA; + remoteInfo = libkmod; + }; + C61E2DB312F36ACB00FC9BCA /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = C61E2D6012F3605700FC9BCA /* Project object */; + proxyType = 1; + remoteGlobalIDString = C61E2D9D12F364C100FC9BCA; + remoteInfo = "libkmodc++"; + }; +/* End PBXContainerItemProxy section */ + +/* Begin PBXFileReference section */ + C61E2D7312F360A200FC9BCA /* libkmodtest.kext */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = libkmodtest.kext; sourceTree = BUILT_PRODUCTS_DIR; }; + C61E2D7712F360A200FC9BCA /* Kernel.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Kernel.framework; path = System/Library/Frameworks/Kernel.framework; sourceTree = SDKROOT; }; + C61E2D7A12F360A200FC9BCA /* libkmodtest-Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = "libkmodtest-Info.plist"; sourceTree = ""; }; + C61E2D7F12F360A200FC9BCA /* libkmodtest.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = libkmodtest.h; sourceTree = ""; }; + C61E2D8112F360A200FC9BCA /* libkmodtest.cpp */ = {isa = PBXFileReference; lastKnownFileType = 
sourcecode.cpp.cpp; path = libkmodtest.cpp; sourceTree = ""; }; + C61E2D9212F3642100FC9BCA /* libkmod.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libkmod.a; sourceTree = BUILT_PRODUCTS_DIR; }; + C61E2D9E12F364C100FC9BCA /* libkmodc++.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libkmodc++.a"; sourceTree = BUILT_PRODUCTS_DIR; }; + C61E2DA312F3661900FC9BCA /* c_start.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = c_start.c; sourceTree = ""; }; + C61E2DA412F3661900FC9BCA /* c_stop.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = c_stop.c; sourceTree = ""; }; + C61E2DA512F3661900FC9BCA /* cplus_start.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = cplus_start.c; sourceTree = ""; }; + C61E2DA612F3661900FC9BCA /* cplus_stop.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = cplus_stop.c; sourceTree = ""; }; + C61E2DA712F3661900FC9BCA /* README */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = README; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + C61E2D6E12F360A200FC9BCA /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; + C61E2D8F12F3642100FC9BCA /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; + C61E2D9B12F364C100FC9BCA /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + C61E2D5E12F3605700FC9BCA = { + isa = PBXGroup; + children = ( + C61E2DA312F3661900FC9BCA /* c_start.c */, + C61E2DA412F3661900FC9BCA /* c_stop.c */, + C61E2DA512F3661900FC9BCA /* cplus_start.c */, + C61E2DA612F3661900FC9BCA /* cplus_stop.c */, + C61E2DA712F3661900FC9BCA /* README */, + C61E2D7812F360A200FC9BCA /* libkmodtest */, + C61E2D7512F360A200FC9BCA /* Frameworks */, + C61E2D7412F360A200FC9BCA /* Products */, + ); + sourceTree = ""; + }; + C61E2D7412F360A200FC9BCA /* Products */ = { + isa = PBXGroup; + children = ( + C61E2D7312F360A200FC9BCA /* libkmodtest.kext */, + C61E2D9212F3642100FC9BCA /* libkmod.a */, + C61E2D9E12F364C100FC9BCA /* libkmodc++.a */, + ); + name = Products; + sourceTree = ""; + }; + C61E2D7512F360A200FC9BCA /* Frameworks */ = { + isa = PBXGroup; + children = ( + C61E2D7612F360A200FC9BCA /* Other Frameworks */, + ); + name = Frameworks; + sourceTree = ""; + }; + C61E2D7612F360A200FC9BCA /* Other Frameworks */ = { + isa = PBXGroup; + children = ( + C61E2D7712F360A200FC9BCA /* Kernel.framework */, + ); + name = "Other Frameworks"; + sourceTree = ""; + }; + C61E2D7812F360A200FC9BCA /* libkmodtest */ = { + isa = PBXGroup; + children = ( + C61E2D7F12F360A200FC9BCA /* libkmodtest.h */, + C61E2D8112F360A200FC9BCA /* libkmodtest.cpp */, + C61E2D7912F360A200FC9BCA /* Supporting Files */, + ); + path = libkmodtest; + sourceTree = ""; + }; + C61E2D7912F360A200FC9BCA /* Supporting Files */ = { + isa = PBXGroup; + children = ( + C61E2D7A12F360A200FC9BCA /* libkmodtest-Info.plist */, + ); + name = "Supporting Files"; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin 
PBXHeadersBuildPhase section */ + C61E2D6F12F360A200FC9BCA /* Headers */ = { + isa = PBXHeadersBuildPhase; + buildActionMask = 2147483647; + files = ( + C61E2D8012F360A200FC9BCA /* libkmodtest.h in Headers */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + C61E2D9012F3642100FC9BCA /* Headers */ = { + isa = PBXHeadersBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; + C61E2D9C12F364C100FC9BCA /* Headers */ = { + isa = PBXHeadersBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXHeadersBuildPhase section */ + +/* Begin PBXNativeTarget section */ + C61E2D7212F360A200FC9BCA /* libkmodtest */ = { + isa = PBXNativeTarget; + buildConfigurationList = C61E2D8512F360A200FC9BCA /* Build configuration list for PBXNativeTarget "libkmodtest" */; + buildPhases = ( + C61E2D6D12F360A200FC9BCA /* Sources */, + C61E2D6E12F360A200FC9BCA /* Frameworks */, + C61E2D6F12F360A200FC9BCA /* Headers */, + C61E2D7012F360A200FC9BCA /* Resources */, + C61E2D7112F360A200FC9BCA /* Rez */, + ); + buildRules = ( + ); + dependencies = ( + C61E2DB212F36AC700FC9BCA /* PBXTargetDependency */, + C61E2DB412F36ACB00FC9BCA /* PBXTargetDependency */, + ); + name = libkmodtest; + productName = libkmodtest; + productReference = C61E2D7312F360A200FC9BCA /* libkmodtest.kext */; + productType = "com.apple.product-type.kernel-extension"; + }; + C61E2D9112F3642100FC9BCA /* libkmod */ = { + isa = PBXNativeTarget; + buildConfigurationList = C61E2D9312F3642100FC9BCA /* Build configuration list for PBXNativeTarget "libkmod" */; + buildPhases = ( + C61E2D8E12F3642100FC9BCA /* Sources */, + C61E2D8F12F3642100FC9BCA /* Frameworks */, + C61E2D9012F3642100FC9BCA /* Headers */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = libkmod; + productName = libkmod; + productReference = C61E2D9212F3642100FC9BCA /* libkmod.a */; + productType = "com.apple.product-type.library.static"; + }; + C61E2D9D12F364C100FC9BCA /* libkmodc++ */ = { + isa = PBXNativeTarget; + buildConfigurationList = C61E2D9F12F364C100FC9BCA /* Build configuration list for PBXNativeTarget "libkmodc++" */; + buildPhases = ( + C61E2D9A12F364C100FC9BCA /* Sources */, + C61E2D9B12F364C100FC9BCA /* Frameworks */, + C61E2D9C12F364C100FC9BCA /* Headers */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = "libkmodc++"; + productName = "libkmodc++"; + productReference = C61E2D9E12F364C100FC9BCA /* libkmodc++.a */; + productType = "com.apple.product-type.library.static"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + C61E2D6012F3605700FC9BCA /* Project object */ = { + isa = PBXProject; + attributes = { + LastUpgradeCheck = 0420; + }; + buildConfigurationList = C61E2D6312F3605700FC9BCA /* Build configuration list for PBXProject "libkmod" */; + compatibilityVersion = "Xcode 3.2"; + developmentRegion = English; + hasScannedForEncodings = 0; + knownRegions = ( + en, + ); + mainGroup = C61E2D5E12F3605700FC9BCA; + productRefGroup = C61E2D7412F360A200FC9BCA /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + C61E2D9512F3647000FC9BCA /* All */, + C61E2D9112F3642100FC9BCA /* libkmod */, + C61E2D9D12F364C100FC9BCA /* libkmodc++ */, + C61E2D7212F360A200FC9BCA /* libkmodtest */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + C61E2D7012F360A200FC9BCA /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 
C61E2DAC12F3661900FC9BCA /* README in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXRezBuildPhase section */ + C61E2D7112F360A200FC9BCA /* Rez */ = { + isa = PBXRezBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXRezBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + C61E2D6D12F360A200FC9BCA /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + C61E2D8212F360A200FC9BCA /* libkmodtest.cpp in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + C61E2D8E12F3642100FC9BCA /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + C61E2DAD12F3672F00FC9BCA /* c_start.c in Sources */, + C61E2DAE12F3672F00FC9BCA /* c_stop.c in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + C61E2D9A12F364C100FC9BCA /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + C61E2DAF12F3673A00FC9BCA /* cplus_start.c in Sources */, + C61E2DB012F3673A00FC9BCA /* cplus_stop.c in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin PBXTargetDependency section */ + C61E2D9912F364A800FC9BCA /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = C61E2D9112F3642100FC9BCA /* libkmod */; + targetProxy = C61E2D9812F364A800FC9BCA /* PBXContainerItemProxy */; + }; + C61E2DA212F3650100FC9BCA /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = C61E2D9D12F364C100FC9BCA /* libkmodc++ */; + targetProxy = C61E2DA112F3650100FC9BCA /* PBXContainerItemProxy */; + }; + C61E2DB212F36AC700FC9BCA /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = C61E2D9112F3642100FC9BCA /* libkmod */; + targetProxy = C61E2DB112F36AC700FC9BCA /* PBXContainerItemProxy */; + }; + C61E2DB412F36ACB00FC9BCA /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = C61E2D9D12F364C100FC9BCA /* libkmodc++ */; + targetProxy = C61E2DB312F36ACB00FC9BCA /* PBXContainerItemProxy */; + }; +/* End PBXTargetDependency section */ + +/* Begin XCBuildConfiguration section */ + C61E2D6612F3605700FC9BCA /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ARCHS = "$(ARCHS_STANDARD_32_64_BIT)"; + }; + name = Release; + }; + C61E2D8412F360A200FC9BCA /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + CURRENT_PROJECT_VERSION = 1.0.0d1; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + INFOPLIST_FILE = "libkmodtest/libkmodtest-Info.plist"; + MODULE_NAME = com.apple.driver.libkmodtest; + MODULE_VERSION = 1.0.0d1; + PRODUCT_NAME = "$(TARGET_NAME)"; + WRAPPER_EXTENSION = kext; + }; + name = Release; + }; + C61E2D9412F3642100FC9BCA /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + COPY_PHASE_STRIP = YES; + EXECUTABLE_PREFIX = lib; + GCC_ENABLE_BUILTIN_FUNCTIONS = NO; + GCC_ENABLE_KERNEL_DEVELOPMENT = YES; + GCC_PREPROCESSOR_DEFINITIONS = ( + KERNEL, + KERNEL_PRIVATE, + DRIVER_PRIVATE, + APPLE, + NeXT, + ); + GCC_TREAT_IMPLICIT_FUNCTION_DECLARATIONS_AS_ERRORS = YES; + GCC_TREAT_WARNINGS_AS_ERRORS = YES; + GCC_USE_STANDARD_INCLUDE_SEARCHING = NO; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + HEADER_SEARCH_PATHS = ( + 
				/System/Library/Frameworks/Kernel.framework/PrivateHeaders,
+					/System/Library/Frameworks/Kernel.framework/Headers,
+				);
+				INSTALL_PATH = /usr/lib;
+				PRODUCT_NAME = kmod;
+			};
+			name = Release;
+		};
+		C61E2D9712F3647000FC9BCA /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				PRODUCT_NAME = "$(TARGET_NAME)";
+			};
+			name = Release;
+		};
+		C61E2DA012F364C100FC9BCA /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				COPY_PHASE_STRIP = YES;
+				EXECUTABLE_PREFIX = lib;
+				GCC_ENABLE_BUILTIN_FUNCTIONS = NO;
+				GCC_ENABLE_KERNEL_DEVELOPMENT = YES;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					KERNEL,
+					KERNEL_PRIVATE,
+					DRIVER_PRIVATE,
+					APPLE,
+					NeXT,
+				);
+				GCC_TREAT_IMPLICIT_FUNCTION_DECLARATIONS_AS_ERRORS = YES;
+				GCC_TREAT_WARNINGS_AS_ERRORS = YES;
+				GCC_USE_STANDARD_INCLUDE_SEARCHING = NO;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				HEADER_SEARCH_PATHS = (
+					/System/Library/Frameworks/Kernel.framework/PrivateHeaders,
+					/System/Library/Frameworks/Kernel.framework/Headers,
+				);
+				INSTALL_PATH = /usr/lib;
+				PRODUCT_NAME = "kmodc++";
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		C61E2D6312F3605700FC9BCA /* Build configuration list for PBXProject "libkmod" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				C61E2D6612F3605700FC9BCA /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		C61E2D8512F360A200FC9BCA /* Build configuration list for PBXNativeTarget "libkmodtest" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				C61E2D8412F360A200FC9BCA /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		C61E2D9312F3642100FC9BCA /* Build configuration list for PBXNativeTarget "libkmod" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				C61E2D9412F3642100FC9BCA /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		C61E2D9612F3647000FC9BCA /* Build configuration list for PBXAggregateTarget "All" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				C61E2D9712F3647000FC9BCA /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		C61E2D9F12F364C100FC9BCA /* Build configuration list for PBXNativeTarget "libkmodc++" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				C61E2DA012F364C100FC9BCA /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = C61E2D6012F3605700FC9BCA /* Project object */;
+}
diff --git a/libkern/kmod/libkmodtest/libkmodtest-Info.plist b/libkern/kmod/libkmodtest/libkmodtest-Info.plist
new file mode 100644
index 000000000..7092cb6b1
--- /dev/null
+++ b/libkern/kmod/libkmodtest/libkmodtest-Info.plist
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>English</string>
+	<key>CFBundleExecutable</key>
+	<string>${EXECUTABLE_NAME}</string>
+	<key>CFBundleIconFile</key>
+	<string></string>
+	<key>CFBundleIdentifier</key>
+	<string>com.apple.driver.${PRODUCT_NAME:rfc1034identifier}</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>${PRODUCT_NAME}</string>
+	<key>CFBundlePackageType</key>
+	<string>KEXT</string>
+	<key>CFBundleShortVersionString</key>
+	<string>1.0</string>
+	<key>CFBundleSignature</key>
+	<string>????</string>
+	<key>CFBundleVersion</key>
+	<string>1</string>
+	<key>IOKitPersonalities</key>
+	<dict>
+		<key>testlibkmod</key>
+		<dict>
+			<key>CFBundleIdentifier</key>
+			<string>com.apple.driver.${PRODUCT_NAME:rfc1034identifier}</string>
+			<key>IOClass</key>
+			<string>testlibkmod</string>
+			<key>IOMatchCategory</key>
+			<string>testlibkmod</string>
+			<key>IOProviderClass</key>
+			<string>IOResources</string>
+			<key>IOResourceMatch</key>
+			<string>IOKit</string>
+		</dict>
+	</dict>
+	<key>NSHumanReadableCopyright</key>
+	<string>Copyright © 2011 Apple, Inc. All rights reserved.</string>
+	<key>OSBundleLibraries</key>
+	<dict>
+		<key>com.apple.kpi.iokit</key>
+		<string>11.0</string>
+		<key>com.apple.kpi.libkern</key>
+		<string>11.0</string>
+	</dict>
+</dict>
+</plist>
diff --git a/bsd/net/dlil_pvt.h b/libkern/kmod/libkmodtest/libkmodtest.cpp
similarity index 71%
rename from bsd/net/dlil_pvt.h
rename to libkern/kmod/libkmodtest/libkmodtest.cpp
index 192b2726d..6886cd35f 100644
--- a/bsd/net/dlil_pvt.h
+++ b/libkern/kmod/libkmodtest/libkmodtest.cpp
@@ -25,23 +25,17 @@
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
-#ifndef DLIL_PVT_H
-#define DLIL_PVT_H
-#include
-#ifdef KERNEL_PRIVATE
-#include
-#include
+#include "libkmodtest.h"
 
-struct dlil_family_mod_str {
-	TAILQ_ENTRY(dlil_family_mod_str) dl_fam_next;
-	char *interface_family;
-	int (*add_if)(struct ifnet_ptr *ifp);
-	int (*del_if)(struct ifnet *ifp);
-	int (*add_proto)(struct ifnet *ifp, uint32_t protocol_family,
-		struct ddesc_head_str *demux_desc_head);
-	int (*del_proto)(struct ifnet *ifp, uint32_t proto_family);
-}
+#define super IOService
+OSDefineMetaClassAndStructors(testlibkmod, super);
 
-#endif /* KERNEL_PRIVATE */
-#endif
+IOService *
+testlibkmod::probe(
+	IOService *provider,
+	SInt32 *score )
+{
+	IOLog("%s\n", __PRETTY_FUNCTION__);
+	return NULL;
+}
diff --git a/libkern/kmod/libkmodtest/libkmodtest.h b/libkern/kmod/libkmodtest/libkmodtest.h
new file mode 100644
index 000000000..cd0eb4401
--- /dev/null
+++ b/libkern/kmod/libkmodtest/libkmodtest.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2011 Apple, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +class testlibkmod : public IOService { + OSDeclareDefaultStructors(testlibkmod); + + virtual IOService * probe( + IOService *provider, + SInt32 *score ); + +}; \ No newline at end of file diff --git a/libkern/kxld/Makefile b/libkern/kxld/Makefile index 0e962487b..8c322053c 100644 --- a/libkern/kxld/Makefile +++ b/libkern/kxld/Makefile @@ -18,10 +18,12 @@ ifndef SYMROOT SYMROOT=./BUILD/sym endif ifdef SRCROOT +EXTHDRSRC=$(SRCROOT)/EXTERNAL_HEADERS HDRSRC=$(SRCROOT)/libkern/libkern OBJSRC=$(SRCROOT)/libkern/kxld else SRCROOT=. +EXTHDRSRC=$(SRCROOT)/../../EXTERNAL_HEADERS HDRSRC=$(SRCROOT)/../libkern OBJSRC=$(SRCROOT) ROOT=BUILD @@ -29,9 +31,12 @@ endif ifdef RC_CFLAGS ARCHS=$(addprefix -arch , $(RC_ARCHS)) else -ARCHS=-arch i386 -arch x86_64 -arch ppc +ARCHS=-arch i386 -arch x86_64 RC_CFLAGS=$(ARCHS) -pipe endif +ifdef INSTALL_LOCATION +override DSTROOT := $(DSTROOT)/$(INSTALL_LOCATION) +endif PRODUCT_TYPE ?= DYLIB @@ -60,7 +65,7 @@ CFLAGS=-std=c99 -Wall -Wextra -Werror -pedantic -Wformat=2 -Wcast-align \ LDFLAGS=$(ARCHS) -dynamiclib -install_name $(LIBKXLD_INSTALLNAME) \ -compatibility_version $(COMPATIBILITY_VERSION) \ -current_version $(CURRENT_VERSION) -lstdc++ -INCLUDES=-I$(HDRSRC) +INCLUDES=-I$(HDRSRC) -I$(EXTHDRSRC) ifneq ($(SDKROOT),/) CFLAGS += -isysroot $(SDKROOT) @@ -68,22 +73,18 @@ ifneq ($(SDKROOT),/) endif # Tools -CC = xcrun -sdk $(SDKROOT) cc +CC = xcrun -sdk $(SDKROOT) clang CLANG_ANALYZER = clang --analyze LIBTOOL = xcrun -sdk $(SDKROOT) libtool STRIP = xcrun -sdk $(SDKROOT) strip -# Turn on -Wno-cast-align for arm since it won't build without it -ifeq ($(findstring arm, $(ARCHS)),arm) -CFLAGS+=-Wno-cast-align -endif - # Files HDR_NAMES=kxld.h kxld_types.h WKdm.h OBJ_NAMES=kxld.o kxld_array.o kxld_copyright.o kxld_demangle.o kxld_dict.o \ - kxld_kext.o kxld_object.o kxld_reloc.o kxld_sect.o kxld_seg.o \ - kxld_sym.o kxld_symtab.o kxld_util.o kxld_uuid.o kxld_vtable.o \ - WKdmCompress.o WKdmDecompress.o + kxld_kext.o kxld_object.o kxld_reloc.o kxld_sect.o kxld_seg.o \ + kxld_srcversion.o kxld_sym.o kxld_symtab.o kxld_util.o kxld_uuid.o \ + kxld_vtable.o kxld_versionmin.o WKdmCompress.o WKdmDecompress.o + HDRS=$(addprefix $(HDRSRC)/, $(HDR_NAMES)) OBJS=$(addprefix $(OBJROOT)/, $(OBJ_NAMES)) @@ -99,7 +100,7 @@ $(OBJROOT)/%.o : $(TESTSRC)/%.c SRCROOTESC=$(subst /,\/,$(SRCROOT)) OBJROOTESC=$(subst /,\/,$(OBJROOT)) SEDOBJS=sed -E 's/(^[a-z_]+)\.o/$(OBJROOTESC)\/\1\.o $(OBJROOTESC)\/\1\.d/' -SEDSRCS=sed -E 's/([a-z_]+\.[ch])/$(SRCROOTESC)\/\1/g' +SEDSRCS=sed -E 's/ ([a-z_]+\.[ch])/ $(SRCROOTESC)\/\1/g' $(OBJROOT)/%.d: $(OBJSRC)/%.c @set -e; rm -f $@; \ $(CC) $(INCLUDES) -MM $< | $(SEDOBJS) | $(SEDSRCS) > $@; diff --git a/libkern/kxld/WKdmCompress.c b/libkern/kxld/WKdmCompress.c index db2c5c05b..5109015c9 100644 --- a/libkern/kxld/WKdmCompress.c +++ b/libkern/kxld/WKdmCompress.c @@ -150,7 +150,7 @@ WKdm_compress (WK_word* src_buf, */ dict_location = (WK_word *) - (((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word)); + ((void*) (((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word))); dict_word = *dict_location; @@ -232,7 +232,7 @@ WKdm_compress (WK_word* src_buf, #endif boundary_tmp = WK_pack_2bits(tempTagsArray, - (WK_word *) next_tag, + (WK_word *) ((void *) next_tag), dest_buf + HEADER_SIZE_IN_WORDS); #ifdef WK_DEBUG diff --git a/libkern/kxld/WKdmDecompress.c b/libkern/kxld/WKdmDecompress.c index 8921ae0e9..8eaf78bd8 100644 --- a/libkern/kxld/WKdmDecompress.c +++ 
b/libkern/kxld/WKdmDecompress.c @@ -262,7 +262,7 @@ WKdm_decompress (WK_word* src_buf, WK_word missed_word = *(next_full_word++); WK_word *dict_location = (WK_word *) - (((char *) dictionary) + HASH_TO_DICT_BYTE_OFFSET(missed_word)); + ((void *) (((char *) dictionary) + HASH_TO_DICT_BYTE_OFFSET(missed_word))); *dict_location = missed_word; *next_output = missed_word; break; diff --git a/libkern/kxld/kxld.c b/libkern/kxld/kxld.c index ada1cf3cf..da3fbec7d 100644 --- a/libkern/kxld/kxld.c +++ b/libkern/kxld/kxld.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2008 Apple Inc. All rights reserved. + * Copyright (c) 2007-2008, 2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -354,8 +354,9 @@ init_kext_objects(KXLDContext *context, u_char *file, u_long size, * export its symbols by name by value. If it's indirect, just export the * C++ symbols by value. */ - for (i = 0; i < ndependencies; ++i) { kext = - kxld_array_get_item(&context->dependencies, i); kext_object = NULL; + for (i = 0; i < ndependencies; ++i) { + kext = kxld_array_get_item(&context->dependencies, i); + kext_object = NULL; interface_object = NULL; kext_object = get_object_for_file(context, dependencies[i].kext, @@ -432,7 +433,7 @@ get_object_for_file(KXLDContext *context, u_char *file, u_long size, if (!kxld_object_get_file(object)) { result = kxld_object_init_from_macho(object, file, size, name, - context->section_order, context->cputype, context->cpusubtype); + context->section_order, context->cputype, context->cpusubtype, context->flags); require_noerr(result, finish); rval = object; @@ -480,6 +481,8 @@ allocate_kext(KXLDContext *context, void *callback_data, *linked_object_alloc_out = linked_object; } + kxld_kext_set_linked_object_size(context->kext, vmsize); + /* Zero out the memory before we fill it. 
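A note on a pattern that recurs through the rest of this patch: the extra (void *) hops in the casts above exist because kxld builds with -Wcast-align (and, per the Makefile change above, clang as CC); casting a byte pointer directly to a wider pointer type triggers that warning, while routing the cast through void * marks the alignment change as deliberate. A minimal illustration with a hypothetical helper:

#include <stdint.h>

/* Hypothetical helper, not from the patch: the caller guarantees that
 * base + offset is 4-byte aligned, and the intermediate (void *) cast
 * keeps -Wcast-align quiet about the uint8_t* -> uint32_t* conversion. */
static uint32_t
read_u32(const uint8_t *base, unsigned long offset)
{
    return *(const uint32_t *)(const void *)(base + offset);
}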
We fill this buffer in a * sparse fashion, and it's simpler to clear it now rather than * track and zero any pieces we didn't touch after we've written diff --git a/libkern/kxld/kxld_kext.c b/libkern/kxld/kxld_kext.c index b2be1535a..a9ef47798 100644 --- a/libkern/kxld/kxld_kext.c +++ b/libkern/kxld/kxld_kext.c @@ -26,7 +26,6 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ #include -#include #include #include #include @@ -60,7 +59,6 @@ #include "kxld_seg.h" #include "kxld_symtab.h" #include "kxld_util.h" -#include "kxld_uuid.h" #include "kxld_vtable.h" struct symtab_command; @@ -185,9 +183,9 @@ kxld_kext_deinit(KXLDKext *kext) *******************************************************************************/ kern_return_t kxld_kext_export_symbols(const KXLDKext *kext, - struct kxld_dict *defined_symbols_by_name, - struct kxld_dict *obsolete_symbols_by_name, - struct kxld_dict *defined_cxx_symbols_by_value) + KXLDDict *defined_symbols_by_name, + KXLDDict *obsolete_symbols_by_name, + KXLDDict *defined_cxx_symbols_by_value) { kern_return_t rval = KERN_FAILURE; @@ -364,7 +362,16 @@ kxld_kext_get_vmsize(const KXLDKext *kext, { (void) kxld_object_get_vmsize(kext->kext, header_size, vmsize); } - + +/******************************************************************************* + *******************************************************************************/ +void +kxld_kext_set_linked_object_size(KXLDKext *kext, u_long vmsize) +{ + (void) kxld_object_set_linked_object_size(kext->kext, vmsize); +} + + /******************************************************************************* *******************************************************************************/ kern_return_t diff --git a/libkern/kxld/kxld_kext.h b/libkern/kxld/kxld_kext.h index f2b80c0f6..58e932684 100644 --- a/libkern/kxld/kxld_kext.h +++ b/libkern/kxld/kxld_kext.h @@ -28,7 +28,6 @@ #ifndef _KXLD_KEXT_H_ #define _KXLD_KEXT_H_ -#include #include #if KERNEL #include @@ -76,7 +75,10 @@ kern_return_t kxld_kext_export_symbols(const KXLDKext *kext, void kxld_kext_get_vmsize(const KXLDKext *kext, u_long *header_size, u_long *vmsize) __attribute__((nonnull, visibility("hidden"))); - + +void kxld_kext_set_linked_object_size(KXLDKext *kext, u_long vmsize) +__attribute__((nonnull, visibility("hidden"))); + kern_return_t kxld_kext_export_linked_object(const KXLDKext *kext, u_char *linked_object, kxld_addr_t *kmod_info) __attribute__((nonnull, visibility("hidden"))); diff --git a/libkern/kxld/kxld_object.c b/libkern/kxld/kxld_object.c index 24b589912..752518b7a 100644 --- a/libkern/kxld/kxld_object.c +++ b/libkern/kxld/kxld_object.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 2009, 2011-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,23 +26,29 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ #include - -#include -#include -#include #include #if KERNEL #include + #include #include #include #else /* !KERNEL */ + /* Get machine.h from the kernel source so we can support all platforms + * that the kernel supports. Otherwise we're at the mercy of the host. 
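Two of the kxld headers introduced in this region, kxld_versionmin.h and kxld_srcversion.h (their includes and the matching KXLDversionmin/KXLDsrcversion object fields appear just below), wrap load commands whose version numbers are bit-packed: LC_VERSION_MIN_MACOSX/LC_VERSION_MIN_IPHONEOS and LC_SOURCE_VERSION. A sketch of the decoding, following the mach-o/loader.h layouts (the helper itself is illustrative):

#include <stdio.h>
#include <mach-o/loader.h>

/* version_min_command packs X.Y.Z as 16.8.8 bits;
 * source_version_command packs A.B.C.D.E as 24.10.10.10.10 bits. */
static void
dump_versions(const struct version_min_command *vm,
    const struct source_version_command *sv)
{
    printf("min OS: %u.%u.%u\n", vm->version >> 16,
        (vm->version >> 8) & 0xff, vm->version & 0xff);
    printf("source: %llu.%llu.%llu.%llu.%llu\n",
        sv->version >> 40, (sv->version >> 30) & 0x3ff,
        (sv->version >> 20) & 0x3ff, (sv->version >> 10) & 0x3ff,
        sv->version & 0x3ff);
}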
+ */ + #include "../../osfmk/mach/machine.h" + #include #include #include #include #endif /* KERNEL */ +#include +#include +#include + #define DEBUG_ASSERT_COMPONENT_NAME_STRING "kxld" #include @@ -51,9 +57,11 @@ #include "kxld_reloc.h" #include "kxld_sect.h" #include "kxld_seg.h" +#include "kxld_srcversion.h" #include "kxld_symtab.h" #include "kxld_util.h" #include "kxld_uuid.h" +#include "kxld_versionmin.h" #include "kxld_vtable.h" #include "kxld_object.h" @@ -75,9 +83,12 @@ struct kxld_object { KXLDArray locrelocs; KXLDRelocator relocator; KXLDuuid uuid; + KXLDversionmin versionmin; + KXLDsrcversion srcversion; KXLDSymtab *symtab; struct dysymtab_command *dysymtab_hdr; kxld_addr_t link_addr; + u_long output_buffer_size; boolean_t is_kernel; boolean_t is_final_image; boolean_t is_linked; @@ -85,6 +96,9 @@ struct kxld_object { #if KXLD_USER_OR_OBJECT KXLDArray *section_order; #endif +#if KXLD_PIC_KEXTS + boolean_t include_kaslr_relocs; +#endif #if !KERNEL enum NXByteOrder host_order; enum NXByteOrder target_order; @@ -129,6 +143,11 @@ static kern_return_t init_from_object(KXLDObject *object); static kern_return_t process_relocs_from_sections(KXLDObject *object); #endif /* KXLD_USER_OR_OBJECT */ +#if KXLD_PIC_KEXTS +static boolean_t target_supports_slideable_kexts(const KXLDObject *object); +#endif /* KXLD_PIC_KEXTS */ + + static kern_return_t export_macho_header(const KXLDObject *object, u_char *buf, u_int ncmds, u_long *header_offset, u_long header_size); #if KXLD_USER_OR_ILP32 @@ -183,7 +202,7 @@ kxld_object_sizeof(void) kern_return_t kxld_object_init_from_macho(KXLDObject *object, u_char *file, u_long size, const char *name, KXLDArray *section_order __unused, - cpu_type_t cputype, cpu_subtype_t cpusubtype) + cpu_type_t cputype, cpu_subtype_t cpusubtype, KXLDFlags flags __unused) { kern_return_t rval = KERN_FAILURE; KXLDSeg * seg = NULL; @@ -198,6 +217,10 @@ kxld_object_init_from_macho(KXLDObject *object, u_char *file, u_long size, #if KXLD_USER_OR_OBJECT object->section_order = section_order; #endif +#if KXLD_PIC_KEXTS + object->include_kaslr_relocs = ((flags & kKXLDFlagIncludeRelocs) == kKXLDFlagIncludeRelocs); +#endif + /* Find the local architecture */ rval = get_target_machine_info(object, cputype, cpusubtype); @@ -231,10 +254,10 @@ kxld_object_init_from_macho(KXLDObject *object, u_char *file, u_long size, */ if (kxld_object_is_32_bit(object)) { - struct mach_header *mach_hdr = (struct mach_header *) object->file; + struct mach_header *mach_hdr = (struct mach_header *) ((void *) object->file); object->filetype = mach_hdr->filetype; } else { - struct mach_header_64 *mach_hdr = (struct mach_header_64 *) object->file; + struct mach_header_64 *mach_hdr = (struct mach_header_64 *) ((void *) object->file); object->filetype = mach_hdr->filetype; } @@ -273,7 +296,12 @@ kxld_object_init_from_macho(KXLDObject *object, u_char *file, u_long size, seg = kxld_object_get_seg_by_name(object, SEG_LINKEDIT); if (seg) { (void) kxld_seg_populate_linkedit(seg, object->symtab, - kxld_object_is_32_bit(object)); + kxld_object_is_32_bit(object) +#if KXLD_PIC_KEXTS + , &object->locrelocs, &object->extrelocs, + target_supports_slideable_kexts(object) +#endif + ); } } @@ -344,9 +372,6 @@ get_target_machine_info(KXLDObject *object, cpu_type_t cputype __unused, case CPU_TYPE_I386: object->cpusubtype = CPU_SUBTYPE_I386_ALL; break; - case CPU_TYPE_POWERPC: - object->cpusubtype = CPU_SUBTYPE_POWERPC_ALL; - break; case CPU_TYPE_X86_64: object->cpusubtype = CPU_SUBTYPE_X86_64_ALL; break; @@ -368,9 +393,6 @@ 
get_target_machine_info(KXLDObject *object, cpu_type_t cputype __unused, case CPU_TYPE_X86_64: object->target_order = NX_LittleEndian; break; - case CPU_TYPE_POWERPC: - object->target_order = NX_BigEndian; - break; default: rval = KERN_NOT_SUPPORTED; kxld_log(kKxldLogLinking, kKxldLogErr, @@ -393,7 +415,7 @@ get_macho_slice_for_arch(KXLDObject *object, u_char *file, u_long size) kern_return_t rval = KERN_FAILURE; struct mach_header *mach_hdr = NULL; #if !KERNEL - struct fat_header *fat = (struct fat_header *) file; + struct fat_header *fat = (struct fat_header *) ((void *) file); struct fat_arch *archs = (struct fat_arch *) &fat[1]; boolean_t swap = FALSE; #endif /* KERNEL */ @@ -462,7 +484,7 @@ get_macho_slice_for_arch(KXLDObject *object, u_char *file, u_long size) } require_noerr(rval, finish); - mach_hdr = (struct mach_header *) object->file; + mach_hdr = (struct mach_header *) ((void *) object->file); require_action(object->cputype == mach_hdr->cputype, finish, rval=KERN_FAILURE; kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogTruncatedMachO)); @@ -484,6 +506,8 @@ init_from_final_linked_image(KXLDObject *object, u_int *filetype_out, struct load_command *cmd_hdr = NULL; struct symtab_command *symtab_hdr = NULL; struct uuid_command *uuid_hdr = NULL; + struct version_min_command *versionmin_hdr = NULL; + struct source_version_command *source_version_hdr = NULL; u_long base_offset = 0; u_long offset = 0; u_long sect_offset = 0; @@ -504,7 +528,7 @@ init_from_final_linked_image(KXLDObject *object, u_int *filetype_out, offset = base_offset; for (i = 0; i < ncmds; ++i, offset += cmd_hdr->cmdsize) { - cmd_hdr = (struct load_command *) (object->file + offset); + cmd_hdr = (struct load_command *) ((void *) (object->file + offset)); switch(cmd_hdr->cmd) { #if KXLD_USER_OR_ILP32 @@ -525,7 +549,7 @@ init_from_final_linked_image(KXLDObject *object, u_int *filetype_out, case LC_SEGMENT_64: { struct segment_command_64 *seg_hdr = - (struct segment_command_64 *) cmd_hdr; + (struct segment_command_64 *) ((void *) cmd_hdr); /* Ignore segments with no vm size */ if (!seg_hdr->vmsize) continue; @@ -554,7 +578,7 @@ init_from_final_linked_image(KXLDObject *object, u_int *filetype_out, offset = base_offset; for (i = 0; i < ncmds; ++i, offset += cmd_hdr->cmdsize) { - cmd_hdr = (struct load_command *) (object->file + offset); + cmd_hdr = (struct load_command *) ((void *) (object->file + offset)); seg = NULL; switch(cmd_hdr->cmd) { @@ -580,7 +604,7 @@ init_from_final_linked_image(KXLDObject *object, u_int *filetype_out, case LC_SEGMENT_64: { struct segment_command_64 *seg_hdr = - (struct segment_command_64 *) cmd_hdr; + (struct segment_command_64 *) ((void *) cmd_hdr); /* Ignore segments with no vm size */ if (!seg_hdr->vmsize) continue; @@ -601,26 +625,44 @@ init_from_final_linked_image(KXLDObject *object, u_int *filetype_out, uuid_hdr = (struct uuid_command *) cmd_hdr; kxld_uuid_init_from_macho(&object->uuid, uuid_hdr); break; + case LC_VERSION_MIN_MACOSX: + case LC_VERSION_MIN_IPHONEOS: + versionmin_hdr = (struct version_min_command *) cmd_hdr; + kxld_versionmin_init_from_macho(&object->versionmin, versionmin_hdr); + break; + case LC_SOURCE_VERSION: + source_version_hdr = (struct source_version_command *) (void *) cmd_hdr; + kxld_srcversion_init_from_macho(&object->srcversion, source_version_hdr); + break; case LC_DYSYMTAB: object->dysymtab_hdr = (struct dysymtab_command *) cmd_hdr; rval = kxld_reloc_create_macho(&object->extrelocs, &object->relocator, - (struct relocation_info *) (object->file + 
object->dysymtab_hdr->extreloff), + (struct relocation_info *) ((void *) (object->file + object->dysymtab_hdr->extreloff)), object->dysymtab_hdr->nextrel); require_noerr(rval, finish); rval = kxld_reloc_create_macho(&object->locrelocs, &object->relocator, - (struct relocation_info *) (object->file + object->dysymtab_hdr->locreloff), + (struct relocation_info *) ((void *) (object->file + object->dysymtab_hdr->locreloff)), object->dysymtab_hdr->nlocrel); require_noerr(rval, finish); break; case LC_UNIXTHREAD: - /* Don't need to do anything with UNIXTHREAD for the kernel */ + case LC_MAIN: + /* Don't need to do anything with UNIXTHREAD or MAIN for the kernel */ require_action(kxld_object_is_kernel(object), finish, rval=KERN_FAILURE; kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO - "LC_UNIXTHREAD segment is not valid in a kext.")); + "LC_UNIXTHREAD/LC_MAIN segment is not valid in a kext.")); + break; + case LC_CODE_SIGNATURE: + case LC_DYLD_INFO: + case LC_DYLD_INFO_ONLY: + case LC_FUNCTION_STARTS: + case LC_DATA_IN_CODE: + case LC_DYLIB_CODE_SIGN_DRS: + /* Various metadata that might be stored in the linkedit segment */ break; default: rval=KERN_FAILURE; @@ -695,7 +737,7 @@ init_from_execute(KXLDObject *object) kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO)); #endif - KXLD_3264_FUNC(kxld_object_is_32_bit(object), rval, + KXLD_3264_FUNC(kxld_object_is_32_bit(object), rval, kxld_symtab_init_from_macho_32, kxld_symtab_init_from_macho_64, object->symtab, symtab_hdr, object->file, kernel_linkedit_seg); require_noerr(rval, finish); @@ -736,11 +778,9 @@ finish: /******************************************************************************* *******************************************************************************/ static boolean_t -target_supports_bundle(const KXLDObject *object) +target_supports_bundle(const KXLDObject *object __unused) { - return (object->cputype == CPU_TYPE_I386 || - object->cputype == CPU_TYPE_X86_64 || - object->cputype == CPU_TYPE_ARM); + return TRUE; } /******************************************************************************* @@ -782,9 +822,7 @@ finish: *******************************************************************************/ static boolean_t target_supports_object(const KXLDObject *object) { - return (object->cputype == CPU_TYPE_POWERPC || - object->cputype == CPU_TYPE_I386 || - object->cputype == CPU_TYPE_ARM); + return (object->cputype == CPU_TYPE_I386); } /******************************************************************************* @@ -825,7 +863,7 @@ init_from_object(KXLDObject *object) */ for (; i < ncmds; ++i, offset += cmd_hdr->cmdsize) { - cmd_hdr = (struct load_command *) (object->file + offset); + cmd_hdr = (struct load_command *) ((void *) (object->file + offset)); switch(cmd_hdr->cmd) { #if KXLD_USER_OR_ILP32 @@ -861,7 +899,7 @@ init_from_object(KXLDObject *object) case LC_SEGMENT_64: { struct segment_command_64 *seg_hdr = - (struct segment_command_64 *) cmd_hdr; + (struct segment_command_64 *) ((void *) cmd_hdr); /* Ignore segments with no vm size */ if (!seg_hdr->vmsize) continue; @@ -900,8 +938,21 @@ init_from_object(KXLDObject *object) kxld_uuid_init_from_macho(&object->uuid, uuid_hdr); break; case LC_UNIXTHREAD: - /* Don't need to do anything with UNIXTHREAD */ + case LC_MAIN: + /* Don't need to do anything with UNIXTHREAD or MAIN */ break; + case LC_CODE_SIGNATURE: + case LC_DYLD_INFO: + case LC_DYLD_INFO_ONLY: + case LC_FUNCTION_STARTS: + case LC_DATA_IN_CODE: + case LC_DYLIB_CODE_SIGN_DRS: + /* Various 
metadata that might be stored in the linkedit segment */ + break; + case LC_VERSION_MIN_MACOSX: + case LC_VERSION_MIN_IPHONEOS: + case LC_SOURCE_VERSION: + /* Not supported for object files, fall through */ default: rval = KERN_FAILURE; kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO @@ -964,7 +1015,7 @@ finish: static u_long get_macho_cmd_data_32(u_char *file, u_long offset, u_int *filetype, u_int *ncmds) { - struct mach_header *mach_hdr = (struct mach_header *) (file + offset); + struct mach_header *mach_hdr = (struct mach_header *) ((void *) (file + offset)); if (filetype) *filetype = mach_hdr->filetype; if (ncmds) *ncmds = mach_hdr->ncmds; @@ -980,7 +1031,7 @@ get_macho_cmd_data_32(u_char *file, u_long offset, u_int *filetype, u_int *ncmds static u_long get_macho_cmd_data_64(u_char *file, u_long offset, u_int *filetype, u_int *ncmds) { - struct mach_header_64 *mach_hdr = (struct mach_header_64 *) (file + offset); + struct mach_header_64 *mach_hdr = (struct mach_header_64 *) ((void *) (file + offset)); if (filetype) *filetype = mach_hdr->filetype; if (ncmds) *ncmds = mach_hdr->ncmds; @@ -997,28 +1048,39 @@ get_macho_header_size(const KXLDObject *object) KXLDSeg *seg = NULL; u_long header_size = 0; u_int i = 0; + boolean_t object_is_32_bit = kxld_object_is_32_bit(object); check(object); /* Mach, segment, symtab, and UUID headers */ - if (kxld_object_is_32_bit(object)) { - header_size += sizeof(struct mach_header); - } else { - header_size += sizeof(struct mach_header_64); - } + header_size += object_is_32_bit ? sizeof(struct mach_header) : sizeof(struct mach_header_64); for (i = 0; i < object->segs.nitems; ++i) { seg = kxld_array_get_item(&object->segs, i); - header_size += kxld_seg_get_macho_header_size(seg, kxld_object_is_32_bit(object)); + header_size += kxld_seg_get_macho_header_size(seg, object_is_32_bit); } header_size += kxld_symtab_get_macho_header_size(); +#if KXLD_PIC_KEXTS + if (target_supports_slideable_kexts(object)) { + header_size += kxld_reloc_get_macho_header_size(); + } +#endif /* KXLD_PIC_KEXTS */ + if (object->uuid.has_uuid) { header_size += kxld_uuid_get_macho_header_size(); } + if (object->versionmin.has_versionmin) { + header_size += kxld_versionmin_get_macho_header_size(); + } + + if (object->srcversion.has_srcversion) { + header_size += kxld_srcversion_get_macho_header_size(); + } + return header_size; } @@ -1033,11 +1095,48 @@ get_macho_data_size(const KXLDObject *object) check(object); + /* total all segment vmsize values */ for (i = 0; i < object->segs.nitems; ++i) { seg = kxld_array_get_item(&object->segs, i); data_size += (u_long) kxld_seg_get_vmsize(seg); } +#if KXLD_PIC_KEXTS + { + /* ensure that when we eventually emit the final linked object, + * appending the __DYSYMTAB data after the __LINKEDIT data will + * not overflow the space allocated for the __LINKEDIT segment + */ + + u_long seg_vmsize = 0; + u_long symtab_size = 0; + u_long reloc_size = 0; + + /* get current __LINKEDIT sizes */ + seg = kxld_object_get_seg_by_name(object, SEG_LINKEDIT); + seg_vmsize = (u_long) kxld_seg_get_vmsize(seg); + + /* get size of symbol table data that will eventually be dumped + * into the __LINKEDIT segment + */ + symtab_size = kxld_symtab_get_macho_data_size(object->symtab, kxld_object_is_32_bit(object)); + + if (target_supports_slideable_kexts(object)) { + /* get size of __DYSYMTAB relocation entries */ + reloc_size = kxld_reloc_get_macho_data_size(&object->locrelocs, &object->extrelocs); + } + + /* combine, and ensure they'll both fit within the 
page(s) + * allocated for the __LINKEDIT segment. If they'd overflow, + * increase the vmsize appropriately so no overflow will occur + */ + if ((symtab_size + reloc_size) > seg_vmsize) { + u_long overflow = (symtab_size + reloc_size) - seg_vmsize; + data_size += round_page(overflow); + } + } +#endif // KXLD_PIC_KEXTS + return data_size; } @@ -1395,7 +1494,7 @@ set_is_object_linked(KXLDObject *object) } if (object->is_final_image) { - object->is_linked = !object->extrelocs.nitems && !object->locrelocs.nitems; + object->is_linked = !object->extrelocs.nitems; return; } @@ -1442,6 +1541,8 @@ void kxld_object_clear(KXLDObject *object __unused) kxld_array_reset(&object->locrelocs); kxld_relocator_clear(&object->relocator); kxld_uuid_clear(&object->uuid); + kxld_versionmin_clear(&object->versionmin); + kxld_srcversion_clear(&object->srcversion); if (object->symtab) kxld_symtab_clear(object->symtab); @@ -1570,8 +1671,7 @@ kxld_object_target_supports_strict_patching(const KXLDObject *object) { check(object); - return (object->cputype != CPU_TYPE_I386 && - object->cputype != CPU_TYPE_POWERPC); + return (object->cputype != CPU_TYPE_I386); } /******************************************************************************* @@ -1581,8 +1681,7 @@ kxld_object_target_supports_common_symbols(const KXLDObject *object) { check(object); - return (object->cputype == CPU_TYPE_I386 || - object->cputype == CPU_TYPE_POWERPC); + return (object->cputype == CPU_TYPE_I386); } /******************************************************************************* @@ -1605,6 +1704,15 @@ kxld_object_get_vmsize(const KXLDObject *object, u_long *header_size, } +/******************************************************************************* + *******************************************************************************/ +void +kxld_object_set_linked_object_size(KXLDObject *object, u_long vmsize) +{ + object->output_buffer_size = vmsize; /* cache this for use later */ + return; +} + /******************************************************************************* *******************************************************************************/ kern_return_t @@ -1619,6 +1727,7 @@ kxld_object_export_linked_object(const KXLDObject *object, u_long data_offset = 0; u_int ncmds = 0; u_int i = 0; + boolean_t is_32bit_object = kxld_object_is_32_bit(object); check(object); check(linked_object); @@ -1627,36 +1736,74 @@ kxld_object_export_linked_object(const KXLDObject *object, header_size = get_macho_header_size(object); data_offset = (object->is_final_image) ? 
header_size : round_page(header_size); - size = data_offset + get_macho_data_size(object); + size = object->output_buffer_size; /* Copy data to the file */ - ncmds = object->segs.nitems + (object->uuid.has_uuid == TRUE) + 1 /* linkedit */; + ncmds = object->segs.nitems + 1 /* LC_SYMTAB */; - rval = export_macho_header(object, linked_object, ncmds, - &header_offset, header_size); +#if KXLD_PIC_KEXTS + /* don't write out a DYSYMTAB segment for targets that can't digest it + */ + if (target_supports_slideable_kexts(object)) { + ncmds++; /* dysymtab */ + } +#endif /* KXLD_PIC_KEXTS */ + + if (object->uuid.has_uuid == TRUE) { + ncmds++; + } + + if (object->versionmin.has_versionmin == TRUE) { + ncmds++; + } + + if (object->srcversion.has_srcversion == TRUE) { + ncmds++; + } + + rval = export_macho_header(object, linked_object, ncmds, &header_offset, header_size); require_noerr(rval, finish); for (i = 0; i < object->segs.nitems; ++i) { seg = kxld_array_get_item(&object->segs, i); - rval = kxld_seg_export_macho_to_vm(seg, linked_object, &header_offset, - header_size, size, object->link_addr, kxld_object_is_32_bit(object)); + rval = kxld_seg_export_macho_to_vm(seg, linked_object, &header_offset, + header_size, size, object->link_addr, is_32bit_object); require_noerr(rval, finish); } seg = kxld_object_get_seg_by_name(object, SEG_LINKEDIT); data_offset = (u_long) (seg->link_addr - object->link_addr); + rval = kxld_symtab_export_macho(object->symtab, linked_object, &header_offset, - header_size, &data_offset, size, kxld_object_is_32_bit(object)); + header_size, &data_offset, size, is_32bit_object); require_noerr(rval, finish); +#if KXLD_PIC_KEXTS + if (target_supports_slideable_kexts(object)) { + rval = kxld_reloc_export_macho(&object->relocator, &object->locrelocs, + &object->extrelocs, linked_object, &header_offset, header_size, + &data_offset, size); + require_noerr(rval, finish); + } +#endif /* KXLD_PIC_KEXTS */ + if (object->uuid.has_uuid) { - rval = kxld_uuid_export_macho(&object->uuid, linked_object, - &header_offset, header_size); + rval = kxld_uuid_export_macho(&object->uuid, linked_object, &header_offset, header_size); + require_noerr(rval, finish); + } + + if (object->versionmin.has_versionmin) { + rval = kxld_versionmin_export_macho(&object->versionmin, linked_object, &header_offset, header_size); require_noerr(rval, finish); } + if (object->srcversion.has_srcversion) { + rval = kxld_srcversion_export_macho(&object->srcversion, linked_object, &header_offset, header_size); + require_noerr(rval, finish); + } + #if !KERNEL unswap_macho(linked_object, object->host_order, object->target_order); #endif /* KERNEL */ @@ -1706,7 +1853,7 @@ export_macho_header_32(const KXLDObject *object, u_char *buf, u_int ncmds, require_action(sizeof(*mach) <= header_size - *header_offset, finish, rval=KERN_FAILURE); - mach = (struct mach_header *) (buf + *header_offset); + mach = (struct mach_header *) ((void *) (buf + *header_offset)); mach->magic = MH_MAGIC; mach->cputype = object->cputype; @@ -1741,7 +1888,7 @@ export_macho_header_64(const KXLDObject *object, u_char *buf, u_int ncmds, require_action(sizeof(*mach) <= header_size - *header_offset, finish, rval=KERN_FAILURE); - mach = (struct mach_header_64 *) (buf + *header_offset); + mach = (struct mach_header_64 *) ((void *) (buf + *header_offset)); mach->magic = MH_MAGIC_64; mach->cputype = object->cputype; @@ -1965,17 +2112,11 @@ process_symbol_pointers(KXLDObject *object) */ sect = kxld_object_get_sect_by_name(object, SEG_DATA, SECT_SYM_PTRS); - if (!sect) 
{ + if (!sect || !(sect->flags & S_NON_LAZY_SYMBOL_POINTERS)) { rval = KERN_SUCCESS; goto finish; } - require_action(sect->flags & S_NON_LAZY_SYMBOL_POINTERS, - finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO - "Section %s,%s does not have S_NON_LAZY_SYMBOL_POINTERS flag.", - SEG_DATA, SECT_SYM_PTRS)); - /* Calculate the table offset and number of entries in the section */ if (kxld_object_is_32_bit(object)) { @@ -1989,7 +2130,8 @@ process_symbol_pointers(KXLDObject *object) require_action(firstsym + nsyms <= object->dysymtab_hdr->nindirectsyms, finish, rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO)); + kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogMalformedMachO + "firstsym + nsyms > object->dysymtab_hdr->nindirectsyms")); /* Iterate through the indirect symbol table and fill in the section of * symbol pointers. There are three cases: @@ -2001,7 +2143,7 @@ process_symbol_pointers(KXLDObject *object) * action is required. */ - symidx = (int32_t *) (object->file + object->dysymtab_hdr->indirectsymoff); + symidx = (int32_t *) ((void *) (object->file + object->dysymtab_hdr->indirectsymoff)); symidx += firstsym; symptr = sect->data; for (i = 0; i < nsyms; ++i, ++symidx, symptr+=symptrsize) { @@ -2088,10 +2230,10 @@ static void add_to_ptr(u_char *symptr, kxld_addr_t val, boolean_t is_32_bit) { if (is_32_bit) { - uint32_t *ptr = (uint32_t *) symptr; + uint32_t *ptr = (uint32_t *) ((void *) symptr); *ptr += (uint32_t) val; } else { - uint64_t *ptr = (uint64_t *) symptr; + uint64_t *ptr = (uint64_t *) ((void *) symptr); *ptr += (uint64_t) val; } } @@ -2146,7 +2288,7 @@ populate_kmod_info(KXLDObject *object) kmodsect = kxld_array_get_item(&object->sects, kmodsym->sectnum); kmod_offset = (u_long) (kmodsym->base_addr - kmodsect->base_addr); - kmod_info = (kmod_info_t *) (kmodsect->data + kmod_offset); + kmod_info = (kmod_info_t *) ((void *) (kmodsect->data + kmod_offset)); if (kxld_object_is_32_bit(object)) { kmod_info_32_v1_t *kmod = (kmod_info_32_v1_t *) (kmod_info); @@ -2183,3 +2325,16 @@ finish: return rval; } +#if KXLD_PIC_KEXTS +/******************************************************************************* + *******************************************************************************/ +static boolean_t +target_supports_slideable_kexts(const KXLDObject *object) +{ + check(object); + + return ( object->cputype != CPU_TYPE_I386 + && object->include_kaslr_relocs + ); +} +#endif /* KXLD_PIC_KEXTS */ diff --git a/libkern/kxld/kxld_object.h b/libkern/kxld/kxld_object.h index 5b6b5064d..ab78f200e 100644 --- a/libkern/kxld/kxld_object.h +++ b/libkern/kxld/kxld_object.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 2009, 2012 Apple Inc. All rights reserved. 
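The KXLD_PIC_KEXTS code above (target_supports_slideable_kexts() and the LC_DYSYMTAB export path) is what makes kexts slideable for kernel ASLR: the retained local relocations record every absolute pointer in the linked image. Under simplifying assumptions (64-bit pointers only, none of the validation a real loader performs), applying a slide would look roughly like this sketch:

#include <stdint.h>
#include <mach-o/reloc.h>

/* Sketch under stated assumptions: each retained local relocation's
 * r_address locates an absolute pointer within the linked image, so
 * rebasing is a matter of adding the slide to each such slot. */
static void
slide_linked_kext(uint8_t *image, uint64_t slide,
    const struct relocation_info *relocs, uint32_t nlocrel)
{
    uint32_t i;

    for (i = 0; i < nlocrel; ++i) {
        uint64_t *slot = (uint64_t *)(void *)(image + relocs[i].r_address);
        *slot += slide;
    }
}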
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -28,7 +28,6 @@ #ifndef _KXLD_OBJECT_H_ #define _KXLD_OBJECT_H_ -#include #include #if KERNEL #include @@ -56,7 +55,7 @@ size_t kxld_object_sizeof(void) kern_return_t kxld_object_init_from_macho(KXLDObject *object, u_char *file, u_long size, const char *name, struct kxld_array *section_order, - cpu_type_t cputype, cpu_subtype_t cpusubtype) + cpu_type_t cputype, cpu_subtype_t cpusubtype, KXLDFlags flags) __attribute__((nonnull(1,2,4) visibility("hidden"))); void kxld_object_clear(KXLDObject *object) @@ -121,6 +120,9 @@ void kxld_object_get_vmsize(const KXLDObject *object, u_long *header_size, u_long *vmsize) __attribute__((nonnull, visibility("hidden"))); +void kxld_object_set_linked_object_size(KXLDObject *object, u_long vmsize) + __attribute__((nonnull, visibility("hidden"))); + /* This will be the same size as kxld_kext_get_vmsize */ kern_return_t kxld_object_export_linked_object(const KXLDObject *object, u_char *linked_object) diff --git a/libkern/kxld/kxld_reloc.c b/libkern/kxld/kxld_reloc.c index 4867c8c78..583b5bc5f 100644 --- a/libkern/kxld/kxld_reloc.c +++ b/libkern/kxld/kxld_reloc.c @@ -27,14 +27,19 @@ */ #include #include -#include #include #if KERNEL #include + #include #else - #include #include + #include + + /* Get machine.h from the kernel source so we can support all platforms + * that the kernel supports. Otherwise we're at the mercy of the host. + */ + #include "../../osfmk/mach/machine.h" #endif #define DEBUG_ASSERT_COMPONENT_NAME_STRING "kxld" @@ -51,11 +56,15 @@ #include "kxld_util.h" #include "kxld_vtable.h" +#if KXLD_PIC_KEXTS +/* This will try to pull in mach/machine.h, so it has to come after the + * explicit include above. + */ +#include +#endif + /* include target-specific relocation prototypes */ #include -#if KXLD_USER_OR_PPC -#include -#endif #if KXLD_USER_OR_X86_64 #include #endif @@ -101,7 +110,7 @@ #if KXLD_USER_OR_I386 static boolean_t generic_reloc_has_pair(u_int _type) __attribute__((const)); -static boolean_t generic_reloc_is_pair(u_int _type, u_int _prev_type) +static u_int generic_reloc_get_pair_type(u_int _prev_type) __attribute__((const)); static boolean_t generic_reloc_has_got(u_int _type) __attribute__((const)); @@ -111,23 +120,10 @@ static kern_return_t generic_process_reloc(const KXLDRelocator *relocator, kxld_addr_t pair_target, boolean_t swap); #endif /* KXLD_USER_OR_I386 */ -#if KXLD_USER_OR_PPC -static boolean_t ppc_reloc_has_pair(u_int _type) - __attribute__((const)); -static boolean_t ppc_reloc_is_pair(u_int _type, u_int _prev_type) - __attribute__((const)); -static boolean_t ppc_reloc_has_got(u_int _type) - __attribute__((const)); -static kern_return_t ppc_process_reloc(const KXLDRelocator *relocator, - u_char *instruction, u_int length, u_int pcrel, kxld_addr_t base_pc, - kxld_addr_t link_pc, kxld_addr_t link_disp, u_int type, kxld_addr_t target, - kxld_addr_t pair_target, boolean_t swap); -#endif /* KXLD_USER_OR_PPC */ - #if KXLD_USER_OR_X86_64 static boolean_t x86_64_reloc_has_pair(u_int _type) __attribute__((const)); -static boolean_t x86_64_reloc_is_pair(u_int _type, u_int _prev_type) +static u_int x86_64_reloc_get_pair_type(u_int _prev_type) __attribute__((const)); static boolean_t x86_64_reloc_has_got(u_int _type) __attribute__((const)); @@ -142,7 +138,7 @@ static kern_return_t calculate_displacement_x86_64(uint64_t target, #if KXLD_USER_OR_ARM static boolean_t arm_reloc_has_pair(u_int _type) __attribute__((const)); -static boolean_t arm_reloc_is_pair(u_int _type, u_int _prev_type) 
+static u_int arm_reloc_get_pair_type(u_int _prev_type) __attribute__((const)); static boolean_t arm_reloc_has_got(u_int _type) __attribute__((const)); @@ -179,6 +175,13 @@ static kern_return_t get_target_by_address_lookup(kxld_addr_t *target, static kern_return_t check_for_direct_pure_virtual_call( const KXLDRelocator *relocator, u_long offset); +#if KXLD_PIC_KEXTS +static u_long get_macho_data_size_for_array(const KXLDArray *relocs); + +static kern_return_t export_macho_for_array(const KXLDRelocator *relocator, + const KXLDArray *relocs, struct relocation_info **dstp); +#endif /* KXLD_PIC_KEXTS */ + /******************************************************************************* *******************************************************************************/ kern_return_t @@ -189,46 +192,39 @@ kxld_relocator_init(KXLDRelocator *relocator, u_char *file, kern_return_t rval = KERN_FAILURE; check(relocator); - + switch(cputype) { #if KXLD_USER_OR_I386 case CPU_TYPE_I386: relocator->reloc_has_pair = generic_reloc_has_pair; - relocator->reloc_is_pair = generic_reloc_is_pair; + relocator->reloc_get_pair_type = generic_reloc_get_pair_type; relocator->reloc_has_got = generic_reloc_has_got; relocator->process_reloc = generic_process_reloc; relocator->function_align = 0; relocator->is_32_bit = TRUE; + relocator->may_scatter = TRUE; break; #endif /* KXLD_USER_OR_I386 */ -#if KXLD_USER_OR_PPC - case CPU_TYPE_POWERPC: - relocator->reloc_has_pair = ppc_reloc_has_pair; - relocator->reloc_is_pair = ppc_reloc_is_pair; - relocator->reloc_has_got = ppc_reloc_has_got; - relocator->process_reloc = ppc_process_reloc; - relocator->function_align = 0; - relocator->is_32_bit = TRUE; - break; -#endif /* KXLD_USER_OR_PPC */ #if KXLD_USER_OR_X86_64 case CPU_TYPE_X86_64: relocator->reloc_has_pair = x86_64_reloc_has_pair; - relocator->reloc_is_pair = x86_64_reloc_is_pair; + relocator->reloc_get_pair_type = x86_64_reloc_get_pair_type; relocator->reloc_has_got = x86_64_reloc_has_got; relocator->process_reloc = x86_64_process_reloc; relocator->function_align = 0; relocator->is_32_bit = FALSE; + relocator->may_scatter = FALSE; break; #endif /* KXLD_USER_OR_X86_64 */ #if KXLD_USER_OR_ARM case CPU_TYPE_ARM: relocator->reloc_has_pair = arm_reloc_has_pair; - relocator->reloc_is_pair = arm_reloc_is_pair; + relocator->reloc_get_pair_type = arm_reloc_get_pair_type; relocator->reloc_has_got = arm_reloc_has_got; relocator->process_reloc = arm_process_reloc; relocator->function_align = 1; relocator->is_32_bit = TRUE; + relocator->may_scatter = FALSE; break; #endif /* KXLD_USER_OR_ARM */ default: @@ -293,8 +289,8 @@ kxld_reloc_create_macho(KXLDArray *relocarray, const KXLDRelocator *relocator, * symbols. 
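The reloc_is_pair to reloc_get_pair_type change visible in these prototypes inverts the question each backend answers: instead of validating a (type, previous type) pairing, the backend now reports which relocation type must follow a given one, and the caller compares. A hypothetical sketch of the x86-64 flavor (the real definitions sit further down kxld_reloc.c, beyond this excerpt):

#include <mach-o/x86_64/reloc.h>

/* Sketch only: on x86-64, only SUBTRACTOR relocations carry a paired
 * entry, and that pair must be an UNSIGNED relocation. */
static unsigned int
x86_64_reloc_get_pair_type_sketch(unsigned int prev_type)
{
    return (prev_type == X86_64_RELOC_SUBTRACTOR)
        ? X86_64_RELOC_UNSIGNED : (unsigned int)-1;
}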
*/ - if (!(src->r_address & R_SCATTERED) && !(src->r_extern) && - (R_ABS == src->r_symbolnum)) + if (!(relocator->may_scatter && (src->r_address & R_SCATTERED)) && + !(src->r_extern) && (R_ABS == src->r_symbolnum)) { continue; } @@ -306,7 +302,7 @@ kxld_reloc_create_macho(KXLDArray *relocarray, const KXLDRelocator *relocator, * Extern -> Symbolnum by Index */ reloc = kxld_array_get_item(relocarray, reloc_index++); - if (src->r_address & R_SCATTERED) { + if (relocator->may_scatter && (src->r_address & R_SCATTERED)) { reloc->address = scatsrc->r_address; reloc->pcrel = scatsrc->r_pcrel; reloc->length = scatsrc->r_length; @@ -337,16 +333,18 @@ kxld_reloc_create_macho(KXLDArray *relocarray, const KXLDRelocator *relocator, src = srcs + i; scatsrc = (const struct scattered_relocation_info *) src; - if (src->r_address & R_SCATTERED) { - require_action(relocator->reloc_is_pair( - scatsrc->r_type, reloc->reloc_type), + if (relocator->may_scatter && (src->r_address & R_SCATTERED)) { + require_action(relocator->reloc_get_pair_type( + reloc->reloc_type) == scatsrc->r_type, finish, rval=KERN_FAILURE); + reloc->pair_address= scatsrc->r_address; reloc->pair_target = scatsrc->r_value; reloc->pair_target_type = KXLD_TARGET_LOOKUP; } else { - require_action(relocator->reloc_is_pair(src->r_type, - reloc->reloc_type), finish, rval=KERN_FAILURE); - + require_action(relocator->reloc_get_pair_type( + reloc->reloc_type) == scatsrc->r_type, + finish, rval=KERN_FAILURE); + reloc->pair_address = scatsrc->r_address; if (src->r_extern) { reloc->pair_target = src->r_symbolnum; reloc->pair_target_type = KXLD_TARGET_SYMBOLNUM; @@ -384,7 +382,6 @@ count_relocatable_relocs(const KXLDRelocator *relocator, { u_int num_nonpair_relocs = 0; u_int i = 0; - u_int prev_type = 0; const struct relocation_info *reloc = NULL; const struct scattered_relocation_info *sreloc = NULL; @@ -394,7 +391,6 @@ count_relocatable_relocs(const KXLDRelocator *relocator, /* Loop over all of the relocation entries */ num_nonpair_relocs = 1; - prev_type = relocs->r_type; for (i = 1; i < nrelocs; ++i) { reloc = relocs + i; @@ -405,18 +401,14 @@ count_relocatable_relocs(const KXLDRelocator *relocator, sreloc = (const struct scattered_relocation_info *) reloc; num_nonpair_relocs += - (!relocator->reloc_is_pair(sreloc->r_type, prev_type)); - - prev_type = sreloc->r_type; + !relocator->reloc_has_pair(sreloc->r_type); } else { /* A normal relocation entry is relocatable if it is not a pair and * if it is not a section-based relocation for an absolute symbol. 
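The may_scatter flag consulted above encodes that only the i386 backend can encounter scattered relocation entries; per this patch, x86-64 and ARM set it FALSE so the R_SCATTERED test is skipped for object formats that never use scattered entries. The on-disk distinction being gated is just the high bit of the entry's first word:

#include <mach-o/reloc.h>

/* Sketch: when the R_SCATTERED bit is set, the same entry bytes must be
 * reinterpreted as a struct scattered_relocation_info, whose r_address
 * field is narrower (24 bits) and whose r_value records the referenced
 * address directly. */
static int
reloc_is_scattered(const struct relocation_info *ri)
{
    return (ri->r_address & R_SCATTERED) != 0;
}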
*/ num_nonpair_relocs += - !(relocator->reloc_is_pair(reloc->r_type, prev_type) + !(relocator->reloc_has_pair(reloc->r_type) || (0 == reloc->r_extern && R_ABS == reloc->r_symbolnum)); - - prev_type = reloc->r_type; } } @@ -444,13 +436,13 @@ kxld_relocator_has_pair(const KXLDRelocator *relocator, u_int r_type) /******************************************************************************* *******************************************************************************/ -boolean_t -kxld_relocator_is_pair(const KXLDRelocator *relocator, u_int r_type, +u_int +kxld_relocator_get_pair_type(const KXLDRelocator *relocator, u_int prev_r_type) { check(relocator); - return relocator->reloc_is_pair(r_type, prev_r_type); + return relocator->reloc_get_pair_type(prev_r_type); } /******************************************************************************* @@ -538,6 +530,81 @@ finish: return reloc; } +#if KXLD_PIC_KEXTS +/******************************************************************************* +*******************************************************************************/ +u_long +kxld_reloc_get_macho_header_size() +{ + return sizeof(struct dysymtab_command); +} + +/******************************************************************************* +*******************************************************************************/ +u_long +kxld_reloc_get_macho_data_size(const KXLDArray *locrelocs, + const KXLDArray *extrelocs) +{ + u_long rval = 0; + + rval += get_macho_data_size_for_array(locrelocs); + rval += get_macho_data_size_for_array(extrelocs); + + return (rval); +} + +/******************************************************************************* +*******************************************************************************/ +kern_return_t +kxld_reloc_export_macho(const KXLDRelocator *relocator, + const KXLDArray *locrelocs, const KXLDArray *extrelocs, + u_char *buf, u_long *header_offset, u_long header_size, + u_long *data_offset, u_long size) +{ + kern_return_t rval = KERN_FAILURE; + struct dysymtab_command *dysymtabhdr = NULL; + struct relocation_info *start = NULL; + struct relocation_info *dst = NULL; + u_long count = 0; + u_long data_size = 0; + + check(locrelocs); + check(extrelocs); + check(buf); + check(header_offset); + check(data_offset); + + require_action(sizeof(*dysymtabhdr) <= header_size - *header_offset, finish, rval=KERN_FAILURE); + dysymtabhdr = (struct dysymtab_command *) ((void *) (buf + *header_offset)); + *header_offset += sizeof(*dysymtabhdr); + + data_size = kxld_reloc_get_macho_data_size(locrelocs, extrelocs); + require_action((*data_offset + data_size) <= size, finish, rval=KERN_FAILURE); + + start = dst = (struct relocation_info *) ((void *) (buf + *data_offset)); + + rval = export_macho_for_array(relocator, locrelocs, &dst); + require_noerr(rval, finish); + + rval = export_macho_for_array(relocator, extrelocs, &dst); + require_noerr(rval, finish); + + count = dst - start; + + memset(dysymtabhdr, 0, sizeof(*dysymtabhdr)); + dysymtabhdr->cmd = LC_DYSYMTAB; + dysymtabhdr->cmdsize = (uint32_t) sizeof(*dysymtabhdr); + dysymtabhdr->locreloff = (uint32_t) *data_offset; + dysymtabhdr->nlocrel = (uint32_t) count; + + *data_offset += count * sizeof(struct relocation_info); + + rval = KERN_SUCCESS; +finish: + return rval; +} +#endif /* KXLD_PIC_KEXTS */ + /******************************************************************************* *******************************************************************************/ kxld_addr_t @@ -564,7 +631,7 @@ 
get_pointer_at_addr_32(const KXLDRelocator *relocator, check(relocator); - addr = *(const uint32_t *) (data + offset); + addr = *(const uint32_t *) ((void *) (data + offset)); #if !KERNEL if (relocator->swap) { addr = OSSwapInt32(addr); @@ -586,7 +653,7 @@ get_pointer_at_addr_64(const KXLDRelocator *relocator, check(relocator); - addr = *(const uint64_t *) (data + offset); + addr = *(const uint64_t *) ((void *) (data + offset)); #if !KERNEL if (relocator->swap) { addr = OSSwapInt64(addr); @@ -600,8 +667,7 @@ get_pointer_at_addr_64(const KXLDRelocator *relocator, /******************************************************************************* *******************************************************************************/ void -kxld_relocator_set_vtables(KXLDRelocator *relocator, - const struct kxld_dict *vtables) +kxld_relocator_set_vtables(KXLDRelocator *relocator, const KXLDDict *vtables) { relocator->vtables = vtables; } @@ -627,7 +693,7 @@ align_raw_function_address(const KXLDRelocator *relocator, kxld_addr_t value) *******************************************************************************/ kern_return_t kxld_relocator_process_sect_reloc(KXLDRelocator *relocator, - const KXLDReloc *reloc, const struct kxld_sect *sect) + const KXLDReloc *reloc, const KXLDSect *sect) { kern_return_t rval = KERN_FAILURE; u_char *instruction = NULL; @@ -910,165 +976,184 @@ finish: return rval; } -#if KXLD_USER_OR_I386 +#if KXLD_PIC_KEXTS /******************************************************************************* *******************************************************************************/ -static boolean_t -generic_reloc_has_pair(u_int _type) +static u_long +get_macho_data_size_for_array(const KXLDArray *relocs) { - enum reloc_type_generic type = _type; + const KXLDReloc *reloc = NULL; + u_int i = 0; + u_long size = 0; - return (type == GENERIC_RELOC_SECTDIFF || - type == GENERIC_RELOC_LOCAL_SECTDIFF); -} + check(relocs); -/******************************************************************************* -*******************************************************************************/ -static boolean_t -generic_reloc_is_pair(u_int _type, u_int _prev_type __unused) -{ - enum reloc_type_generic type = _type; + for (i = 0; i < relocs->nitems; ++i) { + reloc = kxld_array_get_item(relocs, i); + if (!reloc->pcrel) { + size += sizeof(struct relocation_info); + if(reloc->pair_target_type != KXLD_TARGET_NONE) { + size += sizeof(struct relocation_info); + } + } + } - return (type == GENERIC_RELOC_PAIR); + return size; } /******************************************************************************* *******************************************************************************/ -static boolean_t generic_reloc_has_got(u_int _type __unused) -{ - return FALSE; -} - -/******************************************************************************* -*******************************************************************************/ -static kern_return_t -generic_process_reloc(const KXLDRelocator *relocator, u_char *instruction, - u_int length, u_int pcrel, kxld_addr_t _base_pc, kxld_addr_t _link_pc, - kxld_addr_t _link_disp __unused, u_int _type, kxld_addr_t _target, - kxld_addr_t _pair_target, boolean_t swap __unused) +static kern_return_t +export_macho_for_array(const KXLDRelocator *relocator, + const KXLDArray *relocs, struct relocation_info **dstp) { kern_return_t rval = KERN_FAILURE; - uint32_t base_pc = (uint32_t) _base_pc; - uint32_t link_pc = (uint32_t) _link_pc; - uint32_t *instr_addr = NULL; - uint32_t 
instr_data = 0; - uint32_t target = (uint32_t) _target; - uint32_t pair_target = (uint32_t) _pair_target; - enum reloc_type_generic type = _type; + const KXLDReloc *reloc = NULL; + struct relocation_info *dst = NULL; + struct scattered_relocation_info *scatdst = NULL; + u_int i = 0; - check(instruction); - require_action(length == 2, finish, rval=KERN_FAILURE); + dst = *dstp; - if (pcrel) target = target + base_pc - link_pc; - - instr_addr = (uint32_t *)instruction; - instr_data = *instr_addr; + for (i = 0; i < relocs->nitems; ++i) { + reloc = kxld_array_get_item(relocs, i); + scatdst = (struct scattered_relocation_info *) dst; -#if !KERNEL - if (swap) instr_data = OSSwapInt32(instr_data); -#endif + if (reloc->pcrel) { + continue; + } - rval = check_for_direct_pure_virtual_call(relocator, instr_data); - require_noerr(rval, finish); + switch (reloc->target_type) { + case KXLD_TARGET_LOOKUP: + scatdst->r_address = reloc->address; + scatdst->r_pcrel = reloc->pcrel; + scatdst->r_length = reloc->length; + scatdst->r_type = reloc->reloc_type; + scatdst->r_value = reloc->target; + scatdst->r_scattered = 1; + break; + case KXLD_TARGET_SECTNUM: + dst->r_address = reloc->address; + dst->r_pcrel = reloc->pcrel; + dst->r_length = reloc->length; + dst->r_type = reloc->reloc_type; + dst->r_symbolnum = reloc->target + 1; + dst->r_extern = 0; + break; + case KXLD_TARGET_SYMBOLNUM: + /* Assume that everything will be slid together; otherwise, + * there is no sensible value for the section number. + */ + dst->r_address = reloc->address; + dst->r_pcrel = reloc->pcrel; + dst->r_length = reloc->length; + dst->r_type = reloc->reloc_type; + dst->r_symbolnum = 1; + dst->r_extern = 0; + break; + default: + rval = KERN_FAILURE; + goto finish; + } - switch (type) { - case GENERIC_RELOC_VANILLA: - instr_data += target; - break; - case GENERIC_RELOC_SECTDIFF: - case GENERIC_RELOC_LOCAL_SECTDIFF: - instr_data = instr_data + target - pair_target; - break; - case GENERIC_RELOC_PB_LA_PTR: - rval = KERN_FAILURE; - goto finish; - case GENERIC_RELOC_PAIR: - default: - rval = KERN_FAILURE; - goto finish; + ++dst; + + if(reloc->pair_target_type != KXLD_TARGET_NONE) { + ++i; + require_action(i < relocs->nitems, finish, rval=KERN_FAILURE); + scatdst = (struct scattered_relocation_info *) dst; + switch (reloc->pair_target_type) { + case KXLD_TARGET_LOOKUP: + scatdst->r_address = reloc->pair_address; + scatdst->r_pcrel = reloc->pcrel; + scatdst->r_length = reloc->length; + scatdst->r_type = relocator->reloc_get_pair_type(reloc->reloc_type); + scatdst->r_value = reloc->pair_target; + scatdst->r_scattered = 1; + break; + case KXLD_TARGET_SECTNUM: + dst->r_address = reloc->pair_address; + dst->r_pcrel = reloc->pcrel; + dst->r_length = reloc->length; + dst->r_type = relocator->reloc_get_pair_type(reloc->reloc_type); + dst->r_symbolnum = reloc->pair_target + 1; + dst->r_extern = 0; + break; + case KXLD_TARGET_SYMBOLNUM: + dst->r_address = reloc->pair_address; + dst->r_pcrel = reloc->pcrel; + dst->r_length = reloc->length; + dst->r_type = relocator->reloc_get_pair_type(reloc->reloc_type); + dst->r_symbolnum = 1; + dst->r_extern = 0; + break; + default: + rval = KERN_FAILURE; + goto finish; + } + ++dst; + } } -#if !KERNEL - if (swap) instr_data = OSSwapInt32(instr_data); -#endif - - *instr_addr = instr_data; - rval = KERN_SUCCESS; - finish: + *dstp = dst; return rval; } -#endif /* KXLD_USER_OR_I386 */ +#endif /* KXLD_PIC_KEXTS */ -#if KXLD_USER_OR_PPC +#if KXLD_USER_OR_I386 
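The KXLD_PIC_KEXTS machinery that closes just above is what makes prelinked kexts slidable: export_macho_for_array() re-emits kxld's internal KXLDReloc records as Mach-O relocation entries (scattered entries for lookup-style targets), and kxld_reloc_export_macho() then describes them with an LC_DYSYMTAB load command via locreloff/nlocrel. Below is a minimal, self-contained sketch of that output format; MiniReloc, export_local_relocs, and the buffer layout are illustrative assumptions, not kxld API.

/* Sketch: serialize relocation records as Mach-O scattered relocations and
 * describe them with an LC_DYSYMTAB command, the shape the PIC-kext path
 * above produces.  Illustrative only.
 */
#include <stdint.h>
#include <string.h>
#include <mach-o/loader.h>
#include <mach-o/reloc.h>

typedef struct {
    uint32_t address;   /* fixup location within the linked image */
    uint32_t value;     /* target address the fixup currently points at */
    uint8_t  type;      /* architecture-specific relocation type */
    uint8_t  length;    /* log2 of field size: 2 = 32-bit, 3 = 64-bit */
} MiniReloc;

static void
export_local_relocs(const MiniReloc *relocs, uint32_t nrelocs,
    uint8_t *buf, uint32_t data_offset, struct dysymtab_command *dysymtab)
{
    struct scattered_relocation_info *dst =
        (struct scattered_relocation_info *)(void *)(buf + data_offset);
    uint32_t i;

    for (i = 0; i < nrelocs; ++i) {
        dst[i].r_scattered = 1;                 /* carries a value, not a symbol index */
        dst[i].r_address   = relocs[i].address; /* 24-bit bitfield */
        dst[i].r_pcrel     = 0;
        dst[i].r_length    = relocs[i].length;
        dst[i].r_type      = relocs[i].type;
        dst[i].r_value     = relocs[i].value;
    }

    /* Everything is recorded as "local" relocations, mirroring nlocrel above. */
    memset(dysymtab, 0, sizeof(*dysymtab));
    dysymtab->cmd       = LC_DYSYMTAB;
    dysymtab->cmdsize   = (uint32_t)sizeof(*dysymtab);
    dysymtab->locreloff = data_offset;
    dysymtab->nlocrel   = nrelocs;
}

Note how the real code skips pc-relative entries entirely: a uniform slide moves the fixup site and its target by the same amount, so their difference is invariant and nothing needs to be recorded.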
/******************************************************************************* *******************************************************************************/ static boolean_t -ppc_reloc_has_pair(u_int _type) +generic_reloc_has_pair(u_int _type) { - enum reloc_type_ppc type = _type; + enum reloc_type_generic type = _type; - switch(type) { - case PPC_RELOC_HI16: - case PPC_RELOC_LO16: - case PPC_RELOC_HA16: - case PPC_RELOC_LO14: - case PPC_RELOC_JBSR: - case PPC_RELOC_SECTDIFF: - return TRUE; - default: - return FALSE; - } + return (type == GENERIC_RELOC_SECTDIFF || + type == GENERIC_RELOC_LOCAL_SECTDIFF); } /******************************************************************************* *******************************************************************************/ -static boolean_t -ppc_reloc_is_pair(u_int _type, u_int _prev_type __unused) +static u_int +generic_reloc_get_pair_type(u_int _prev_type __unused) { - enum reloc_type_ppc type = _type; - - return (type == PPC_RELOC_PAIR); + return GENERIC_RELOC_PAIR; } /******************************************************************************* *******************************************************************************/ -static boolean_t ppc_reloc_has_got(u_int _type __unused) +static boolean_t generic_reloc_has_got(u_int _type __unused) { return FALSE; } /******************************************************************************* *******************************************************************************/ -static kern_return_t -ppc_process_reloc(const KXLDRelocator *relocator __unused, u_char *instruction, +static kern_return_t +generic_process_reloc(const KXLDRelocator *relocator, u_char *instruction, u_int length, u_int pcrel, kxld_addr_t _base_pc, kxld_addr_t _link_pc, kxld_addr_t _link_disp __unused, u_int _type, kxld_addr_t _target, - kxld_addr_t _pair_target __unused, boolean_t swap __unused) + kxld_addr_t _pair_target, boolean_t swap __unused) { kern_return_t rval = KERN_FAILURE; - uint32_t *instr_addr = NULL; - uint32_t instr_data = 0; uint32_t base_pc = (uint32_t) _base_pc; uint32_t link_pc = (uint32_t) _link_pc; + uint32_t *instr_addr = NULL; + uint32_t instr_data = 0; uint32_t target = (uint32_t) _target; uint32_t pair_target = (uint32_t) _pair_target; - int32_t addend = 0; - int32_t displacement = 0; - uint32_t difference = 0; - uint32_t br14_disp_sign = 0; - enum reloc_type_ppc type = _type; + enum reloc_type_generic type = _type; check(instruction); - require_action(length == 2 || length == 3, finish, - rval=KERN_FAILURE); + require_action(length == 2, finish, rval=KERN_FAILURE); - if (pcrel) displacement = target + base_pc - link_pc; + if (pcrel) target = target + base_pc - link_pc; - instr_addr = (uint32_t *)instruction; + instr_addr = (uint32_t *) ((void *) instruction); instr_data = *instr_addr; - + #if !KERNEL if (swap) instr_data = OSSwapInt32(instr_data); #endif @@ -1077,100 +1162,17 @@ ppc_process_reloc(const KXLDRelocator *relocator __unused, u_char *instruction, require_noerr(rval, finish); switch (type) { - case PPC_RELOC_VANILLA: - require_action(!pcrel, finish, rval=KERN_FAILURE); - + case GENERIC_RELOC_VANILLA: instr_data += target; break; - case PPC_RELOC_BR14: - require_action(pcrel, finish, rval=KERN_FAILURE); - - addend = BR14D(instr_data); - displacement += SIGN_EXTEND(addend, BR14_NBITS_DISPLACEMENT); - difference = ABSOLUTE_VALUE(displacement); - require_action(difference < BR14_LIMIT, finish, - rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogRelocationOverflow)); - - - 
br14_disp_sign = BIT15(instr_data); - instr_data = BR14I(instr_data) | BR14D(displacement); - - /* If this is a predicted conditional branch (signified by an - * instruction length of 3) that is not branch-always, and the sign of - * the displacement is different after relocation, then flip the y-bit - * to preserve the branch prediction - */ - if ((length == 3) && - IS_COND_BR_INSTR(instr_data) && - IS_NOT_ALWAYS_TAKEN(instr_data) && - (BIT15(instr_data) != br14_disp_sign)) - { - FLIP_PREDICT_BIT(instr_data); - } - break; - case PPC_RELOC_BR24: - require_action(pcrel, finish, rval=KERN_FAILURE); - - addend = BR24D(instr_data); - displacement += SIGN_EXTEND(addend, BR24_NBITS_DISPLACEMENT); - difference = ABSOLUTE_VALUE(displacement); - require_action(difference < BR24_LIMIT, finish, - rval=KERN_FAILURE; - kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogRelocationOverflow)); - - instr_data = BR24I(instr_data) | BR24D(displacement); - break; - case PPC_RELOC_HI16: - require_action(!pcrel, finish, rval=KERN_FAILURE); - - target += LO16S(instr_data) | LO16(pair_target); - instr_data = HI16(instr_data) | HI16S(target); - break; - case PPC_RELOC_LO16: - require_action(!pcrel, finish, rval=KERN_FAILURE); - - target += LO16S(pair_target) | LO16(instr_data); - instr_data = HI16(instr_data) | LO16(target); - break; - case PPC_RELOC_HA16: - require_action(!pcrel, finish, rval=KERN_FAILURE); - - instr_data -= BIT15(pair_target) ? 1 : 0; - target += LO16S(instr_data) | LO16(pair_target); - instr_data = HI16(instr_data) | HI16S(target); - instr_data += BIT15(target) ? 1 : 0; - break; - case PPC_RELOC_JBSR: - require_action(!pcrel, finish, rval=KERN_FAILURE); - - /* The generated code as written branches to an island that loads the - * absolute address of the target. If we can branch to the target - * directly with less than 24 bits of displacement, we modify the branch - * instruction to do so which avoids the cost of the island. 
- */ - - displacement = target + pair_target - link_pc; - difference = ABSOLUTE_VALUE(displacement); - if (difference < BR24_LIMIT) { - instr_data = BR24I(instr_data) | BR24D(displacement); - } - break; - case PPC_RELOC_SECTDIFF: - require_action(!pcrel, finish, rval=KERN_FAILURE); - + case GENERIC_RELOC_SECTDIFF: + case GENERIC_RELOC_LOCAL_SECTDIFF: instr_data = instr_data + target - pair_target; break; - case PPC_RELOC_LO14: - case PPC_RELOC_PB_LA_PTR: - case PPC_RELOC_HI16_SECTDIFF: - case PPC_RELOC_LO16_SECTDIFF: - case PPC_RELOC_HA16_SECTDIFF: - case PPC_RELOC_LO14_SECTDIFF: - case PPC_RELOC_LOCAL_SECTDIFF: + case GENERIC_RELOC_PB_LA_PTR: rval = KERN_FAILURE; goto finish; - case PPC_RELOC_PAIR: + case GENERIC_RELOC_PAIR: default: rval = KERN_FAILURE; goto finish; @@ -1183,11 +1185,11 @@ ppc_process_reloc(const KXLDRelocator *relocator __unused, u_char *instruction, *instr_addr = instr_data; rval = KERN_SUCCESS; -finish: +finish: return rval; } -#endif /* KXLD_USER_OR_PPC */ +#endif /* KXLD_USER_OR_I386 */ #if KXLD_USER_OR_X86_64 /******************************************************************************* @@ -1202,13 +1204,10 @@ x86_64_reloc_has_pair(u_int _type) /******************************************************************************* *******************************************************************************/ -static boolean_t -x86_64_reloc_is_pair(u_int _type, u_int _prev_type) +static u_int +x86_64_reloc_get_pair_type(u_int _prev_type __unused) { - enum reloc_type_x86_64 type = _type; - enum reloc_type_x86_64 prev_type = _prev_type; - - return (x86_64_reloc_has_pair(prev_type) && type == X86_64_RELOC_UNSIGNED); + return X86_64_RELOC_UNSIGNED; } /******************************************************************************* @@ -1246,7 +1245,7 @@ x86_64_process_reloc(const KXLDRelocator *relocator __unused, u_char *instructio finish, rval=KERN_FAILURE); if (length == 2) { - instr32p = (int32_t *) instruction; + instr32p = (int32_t *) ((void *) instruction); instr32 = *instr32p; #if !KERNEL @@ -1348,7 +1347,7 @@ x86_64_process_reloc(const KXLDRelocator *relocator __unused, u_char *instructio *instr32p = instr32; } else { - instr64p = (uint64_t *) instruction; + instr64p = (uint64_t *) ((void *) instruction); instr64 = *instr64p; #if !KERNEL @@ -1437,12 +1436,10 @@ arm_reloc_has_pair(u_int _type) /******************************************************************************* *******************************************************************************/ -static boolean_t -arm_reloc_is_pair(u_int _type, u_int _prev_type __unused) +static u_int +arm_reloc_get_pair_type(u_int _prev_type __unused) { - enum reloc_type_arm type = _type; - - return (type == ARM_RELOC_PAIR); + return ARM_RELOC_PAIR; } /******************************************************************************* @@ -1476,7 +1473,7 @@ arm_process_reloc(const KXLDRelocator *relocator __unused, u_char *instruction, if (pcrel) displacement = target + base_pc - link_pc; - instr_addr = (uint32_t *)instruction; + instr_addr = (uint32_t *) ((void *) instruction); instr_data = *instr_addr; #if !KERNEL @@ -1535,4 +1532,3 @@ finish: } #endif /* KXLD_USER_OR_ARM */ - diff --git a/libkern/kxld/kxld_reloc.h b/libkern/kxld/kxld_reloc.h index 40a610d1a..695e708fd 100644 --- a/libkern/kxld/kxld_reloc.h +++ b/libkern/kxld/kxld_reloc.h @@ -49,7 +49,7 @@ typedef struct kxld_relocator KXLDRelocator; typedef struct kxld_reloc KXLDReloc; typedef boolean_t (*RelocHasPair)(u_int r_type); -typedef boolean_t (*RelocIsPair)(u_int 
r_type, u_int prev_r_type); +typedef u_int (*RelocGetPairType)(u_int prev_r_type); typedef boolean_t (*RelocHasGot)(u_int r_type); typedef kern_return_t(*ProcessReloc)(const KXLDRelocator *relocator, u_char *instruction, u_int length, u_int pcrel, kxld_addr_t base_pc, @@ -58,7 +58,7 @@ typedef kern_return_t(*ProcessReloc)(const KXLDRelocator *relocator, struct kxld_relocator { RelocHasPair reloc_has_pair; - RelocIsPair reloc_is_pair; + RelocGetPairType reloc_get_pair_type; RelocHasGot reloc_has_got; ProcessReloc process_reloc; const struct kxld_symtab *symtab; @@ -69,10 +69,12 @@ struct kxld_relocator { u_int function_align; /* Power of two alignment of functions */ boolean_t is_32_bit; boolean_t swap; + boolean_t may_scatter; }; struct kxld_reloc { u_int address; + u_int pair_address; u_int target; u_int pair_target; u_int target_type:3; @@ -104,7 +106,7 @@ void kxld_relocator_clear(KXLDRelocator *relocator) boolean_t kxld_relocator_has_pair(const KXLDRelocator *relocator, u_int r_type) __attribute__((pure, nonnull,visibility("hidden"))); -boolean_t kxld_relocator_is_pair(const KXLDRelocator *relocator, u_int r_type, +u_int kxld_relocator_get_pair_type(const KXLDRelocator *relocator, u_int last_r_type) __attribute__((pure, nonnull,visibility("hidden"))); @@ -127,6 +129,21 @@ KXLDReloc * kxld_reloc_get_reloc_by_offset(const struct kxld_array *relocs, kxld_addr_t offset) __attribute__((pure, nonnull, visibility("hidden"))); +#if KXLD_PIC_KEXTS +u_long kxld_reloc_get_macho_header_size(void) + __attribute__((pure, visibility("hidden"))); + +u_long kxld_reloc_get_macho_data_size(const struct kxld_array *locrelocs, + const struct kxld_array *extrelocs) + __attribute__((pure, nonnull, visibility("hidden"))); + +kern_return_t kxld_reloc_export_macho(const KXLDRelocator *relocator, + const struct kxld_array *locrelocs, const struct kxld_array *extrelocs, + u_char *buf, u_long *header_offset, u_long header_size, + u_long *data_offset, u_long size) + __attribute__((nonnull, visibility("hidden"))); +#endif /* KXLD_PIC_KEXTS */ + /******************************************************************************* * Modifiers *******************************************************************************/ diff --git a/libkern/kxld/kxld_sect.c b/libkern/kxld/kxld_sect.c index d00d6596d..a89e3f693 100644 --- a/libkern/kxld/kxld_sect.c +++ b/libkern/kxld/kxld_sect.c @@ -58,7 +58,7 @@ kxld_sect_init_from_macho_32(KXLDSect *sect, u_char *macho, u_long *sect_offset, u_int sectnum, const KXLDRelocator *relocator) { kern_return_t rval = KERN_FAILURE; - struct section *src = (struct section *) (macho + *sect_offset); + struct section *src = (struct section *) ((void *) (macho + *sect_offset)); struct relocation_info *relocs = NULL; check(sect); @@ -82,7 +82,7 @@ kxld_sect_init_from_macho_32(KXLDSect *sect, u_char *macho, u_long *sect_offset, sect->data = NULL; } - relocs = (struct relocation_info *) (macho + src->reloff); + relocs = (struct relocation_info *) ((void *) (macho + src->reloff)); rval = kxld_reloc_create_macho(§->relocs, relocator, relocs, src->nreloc); @@ -106,7 +106,7 @@ kxld_sect_init_from_macho_64(KXLDSect *sect, u_char *macho, u_long *sect_offset, u_int sectnum, const KXLDRelocator *relocator) { kern_return_t rval = KERN_FAILURE; - struct section_64 *src = (struct section_64 *) (macho + *sect_offset); + struct section_64 *src = (struct section_64 *) ((void *) (macho + *sect_offset)); struct relocation_info *relocs = NULL; check(sect); @@ -130,7 +130,7 @@ kxld_sect_init_from_macho_64(KXLDSect 
*sect, u_char *macho, u_long *sect_offset, sect->data = NULL; } - relocs = (struct relocation_info *) (macho + src->reloff); + relocs = (struct relocation_info *) ((void *) (macho + src->reloff)); rval = kxld_reloc_create_macho(§->relocs, relocator, relocs, src->nreloc); @@ -430,11 +430,11 @@ export_macho(const KXLDSect *sect, u_char *buf, u_long offset, u_long bufsize) case S_LITERAL_POINTERS: case S_COALESCED: case S_16BYTE_LITERALS: + case S_SYMBOL_STUBS: memcpy(buf + offset, sect->data, (size_t)sect->size); break; case S_ZEROFILL: /* sect->data should be NULL, so we'll never get here */ case S_LAZY_SYMBOL_POINTERS: - case S_SYMBOL_STUBS: case S_GB_ZEROFILL: case S_INTERPOSING: case S_DTRACE_DOF: @@ -467,7 +467,7 @@ sect_export_macho_header_32(const KXLDSect *sect, u_char *buf, require_action(sizeof(*secthdr) <= header_size - *header_offset, finish, rval=KERN_FAILURE); - secthdr = (struct section *) (buf + *header_offset); + secthdr = (struct section *) ((void *) (buf + *header_offset)); *header_offset += sizeof(*secthdr); /* Initalize header */ @@ -507,7 +507,7 @@ sect_export_macho_header_64(const KXLDSect *sect, u_char *buf, require_action(sizeof(*secthdr) <= header_size - *header_offset, finish, rval=KERN_FAILURE); - secthdr = (struct section_64 *) (buf + *header_offset); + secthdr = (struct section_64 *) ((void *) (buf + *header_offset)); *header_offset += sizeof(*secthdr); /* Initalize header */ diff --git a/libkern/kxld/kxld_sect.h b/libkern/kxld/kxld_sect.h index 2f655b4af..96d0b1b35 100644 --- a/libkern/kxld/kxld_sect.h +++ b/libkern/kxld/kxld_sect.h @@ -28,7 +28,6 @@ #ifndef _KXLD_SECT_H_ #define _KXLD_SECT_H_ -#include #include #if KERNEL #include diff --git a/libkern/kxld/kxld_seg.c b/libkern/kxld/kxld_seg.c index ba14b4917..ca3d2fb4f 100644 --- a/libkern/kxld/kxld_seg.c +++ b/libkern/kxld/kxld_seg.c @@ -39,6 +39,7 @@ #define DEBUG_ASSERT_COMPONENT_NAME_STRING "kxld" #include +#include "kxld_reloc.h" #include "kxld_sect.h" #include "kxld_seg.h" #include "kxld_symtab.h" @@ -494,6 +495,8 @@ kxld_seg_get_macho_header_size(const KXLDSeg *seg, boolean_t is_32_bit) /******************************************************************************* *******************************************************************************/ +/* This is no longer used, but may be useful some day... 
*/ +#if 0 u_long kxld_seg_get_macho_data_size(const KXLDSeg *seg) { @@ -511,6 +514,7 @@ kxld_seg_get_macho_data_size(const KXLDSeg *seg) return round_page(size); } +#endif /******************************************************************************* *******************************************************************************/ @@ -535,9 +539,9 @@ kxld_seg_export_macho_to_file_buffer(const KXLDSeg *seg, u_char *buf, u_long base_data_offset = *data_offset; u_int i = 0; struct segment_command *hdr32 = - (struct segment_command *) (buf + *header_offset); + (struct segment_command *) ((void *) (buf + *header_offset)); struct segment_command_64 *hdr64 = - (struct segment_command_64 *) (buf + *header_offset); + (struct segment_command_64 *) ((void *) (buf + *header_offset)); check(seg); check(buf); @@ -634,7 +638,7 @@ seg_export_macho_header_32(const KXLDSeg *seg, u_char *buf, require_action(sizeof(*seghdr) <= header_size - *header_offset, finish, rval=KERN_FAILURE); - seghdr = (struct segment_command *) (buf + *header_offset); + seghdr = (struct segment_command *) ((void *) (buf + *header_offset)); *header_offset += sizeof(*seghdr); seghdr->cmd = LC_SEGMENT; @@ -674,7 +678,7 @@ seg_export_macho_header_64(const KXLDSeg *seg, u_char *buf, require_action(sizeof(*seghdr) <= header_size - *header_offset, finish, rval=KERN_FAILURE); - seghdr = (struct segment_command_64 *) (buf + *header_offset); + seghdr = (struct segment_command_64 *) ((void *) (buf + *header_offset)); *header_offset += sizeof(*seghdr); seghdr->cmd = LC_SEGMENT_64; @@ -752,8 +756,7 @@ kxld_seg_finish_init(KXLDSeg *seg) } /* XXX Cross architecture linking will fail if the page size ever differs - * from 4096. (As of this writing, we're fine on ppc, i386, x86_64, and - * arm.) + * from 4096. (As of this writing, we're fine on i386, x86_64, and arm). */ seg->vmsize = round_page(maxaddr + maxsize - seg->base_addr); } @@ -807,9 +810,24 @@ kxld_seg_relocate(KXLDSeg *seg, kxld_addr_t link_addr) /******************************************************************************* *******************************************************************************/ void -kxld_seg_populate_linkedit(KXLDSeg *seg, - const KXLDSymtab *symtab, boolean_t is_32_bit) +kxld_seg_populate_linkedit(KXLDSeg *seg, const KXLDSymtab *symtab, boolean_t is_32_bit +#if KXLD_PIC_KEXTS + , const KXLDArray *locrelocs + , const KXLDArray *extrelocs + , boolean_t target_supports_slideable_kexts +#endif /* KXLD_PIC_KEXTS */ + ) { - seg->vmsize = round_page(kxld_symtab_get_macho_data_size(symtab, is_32_bit)); + u_long size = 0; + + size += kxld_symtab_get_macho_data_size(symtab, is_32_bit); + +#if KXLD_PIC_KEXTS + if (target_supports_slideable_kexts) { + size += kxld_reloc_get_macho_data_size(locrelocs, extrelocs); + } +#endif /* KXLD_PIC_KEXTS */ + + seg->vmsize = round_page(size); } diff --git a/libkern/kxld/kxld_seg.h b/libkern/kxld/kxld_seg.h index ab5abcdc6..1d863bf02 100644 --- a/libkern/kxld/kxld_seg.h +++ b/libkern/kxld/kxld_seg.h @@ -100,8 +100,11 @@ kxld_size_t kxld_seg_get_vmsize(const KXLDSeg *seg) u_long kxld_seg_get_macho_header_size(const KXLDSeg *seg, boolean_t is_32_bit) __attribute__((pure, nonnull, visibility("hidden"))); +#if 0 +/* This is no longer used, but may be useful some day... 
*/ u_long kxld_seg_get_macho_data_size(const KXLDSeg *seg) __attribute__((pure, nonnull, visibility("hidden"))); +#endif kern_return_t kxld_seg_export_macho_to_file_buffer(const KXLDSeg *seg, u_char *buf, @@ -134,8 +137,14 @@ void kxld_seg_set_vm_protections(KXLDSeg *seg, boolean_t strict_protections) void kxld_seg_relocate(KXLDSeg *seg, kxld_addr_t link_addr) __attribute__((nonnull, visibility("hidden"))); -void kxld_seg_populate_linkedit(KXLDSeg *seg, - const struct kxld_symtab *symtab, boolean_t is_32_bit) +void kxld_seg_populate_linkedit(KXLDSeg *seg, const struct kxld_symtab *symtab, + boolean_t is_32_bit +#if KXLD_PIC_KEXTS + , const struct kxld_array *locrelocs + , const struct kxld_array *extrelocs + , boolean_t target_supports_slideable_kexts +#endif /* KXLD_PIC_KEXTS */ + ) __attribute__((nonnull, visibility("hidden"))); #endif /* _KXLD_SEG_H_ */ diff --git a/libkern/kxld/kxld_srcversion.c b/libkern/kxld/kxld_srcversion.c new file mode 100644 index 000000000..c6d4462d8 --- /dev/null +++ b/libkern/kxld/kxld_srcversion.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include + +#define DEBUG_ASSERT_COMPONENT_NAME_STRING "kxld" +#include + +#include "kxld_util.h" +#include "kxld_srcversion.h" + +/******************************************************************************* + *******************************************************************************/ +void +kxld_srcversion_init_from_macho(KXLDsrcversion *srcversion, struct source_version_command *src) +{ + check(srcversion); + check(src); + + srcversion->version = src->version; + srcversion->has_srcversion = TRUE; +} + +/******************************************************************************* + *******************************************************************************/ +void +kxld_srcversion_clear(KXLDsrcversion *srcversion) +{ + bzero(srcversion, sizeof(*srcversion)); +} + +/******************************************************************************* + *******************************************************************************/ +u_long +kxld_srcversion_get_macho_header_size(void) +{ + return sizeof(struct source_version_command); +} + +/******************************************************************************* + *******************************************************************************/ +kern_return_t +kxld_srcversion_export_macho(const KXLDsrcversion *srcversion, u_char *buf, + u_long *header_offset, u_long header_size) +{ + kern_return_t rval = KERN_FAILURE; + struct source_version_command *srcversionhdr = NULL; + + check(srcversion); + check(buf); + check(header_offset); + + require_action(sizeof(*srcversionhdr) <= header_size - *header_offset, finish, + rval=KERN_FAILURE); + srcversionhdr = (struct source_version_command *) ((void *) (buf + *header_offset)); + *header_offset += sizeof(*srcversionhdr); + + srcversionhdr->cmd = LC_SOURCE_VERSION; + srcversionhdr->cmdsize = (uint32_t) sizeof(*srcversionhdr); + srcversionhdr->version = srcversion->version; + + rval = KERN_SUCCESS; + +finish: + return rval; +} + diff --git a/libkern/kxld/kxld_srcversion.h b/libkern/kxld/kxld_srcversion.h new file mode 100644 index 000000000..b6cdf387b --- /dev/null +++ b/libkern/kxld/kxld_srcversion.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _KXLD_SRCVERSION_H_ +#define _KXLD_SRCVERSION_H_ + +#include +#if KERNEL +#include +#else +#include "kxld_types.h" +#endif + +struct source_version_command; +typedef struct kxld_srcversion KXLDsrcversion; + +struct kxld_srcversion { + uint64_t version; + boolean_t has_srcversion; +}; + +/******************************************************************************* + * Constructors and destructors + *******************************************************************************/ + +void kxld_srcversion_init_from_macho(KXLDsrcversion *srcversion, struct source_version_command *src) +__attribute__((nonnull, visibility("hidden"))); + +void kxld_srcversion_clear(KXLDsrcversion *srcversion) +__attribute__((nonnull, visibility("hidden"))); + +/******************************************************************************* + * Accessors + *******************************************************************************/ + +u_long kxld_srcversion_get_macho_header_size(void) +__attribute__((pure, visibility("hidden"))); + +kern_return_t +kxld_srcversion_export_macho(const KXLDsrcversion *srcversion, u_char *buf, + u_long *header_offset, u_long header_size) +__attribute__((pure, nonnull, visibility("hidden"))); + +#endif /* _KXLD_SRCVERSION_H_ */ diff --git a/libkern/kxld/kxld_sym.c b/libkern/kxld/kxld_sym.c index 2e9cb16e9..d82cd5cce 100644 --- a/libkern/kxld/kxld_sym.c +++ b/libkern/kxld/kxld_sym.c @@ -856,7 +856,7 @@ kxld_sym_export_macho_32(const KXLDSym *sym, u_char *_nl, char *strtab, u_long *stroff, u_long strsize) { kern_return_t rval = KERN_FAILURE; - struct nlist *nl = (struct nlist *) _nl; + struct nlist *nl = (struct nlist *) ((void *) _nl); char *str = NULL; long bytes = 0; @@ -897,7 +897,7 @@ kxld_sym_export_macho_64(const KXLDSym *sym, u_char *_nl, char *strtab, u_long *stroff, u_long strsize) { kern_return_t rval = KERN_FAILURE; - struct nlist_64 *nl = (struct nlist_64 *) _nl; + struct nlist_64 *nl = (struct nlist_64 *) ((void *) _nl); char *str = NULL; long bytes = 0; diff --git a/libkern/kxld/kxld_sym.h b/libkern/kxld/kxld_sym.h index 69cb8cbf7..81fe4a4ab 100644 --- a/libkern/kxld/kxld_sym.h +++ b/libkern/kxld/kxld_sym.h @@ -28,7 +28,6 @@ #ifndef _KXLD_SYMBOL_H_ #define _KXLD_SYMBOL_H_ -#include #include #if KERNEL #include diff --git a/libkern/kxld/kxld_symtab.c b/libkern/kxld/kxld_symtab.c index 6700774f4..c5ce51740 100644 --- a/libkern/kxld/kxld_symtab.c +++ b/libkern/kxld/kxld_symtab.c @@ -114,7 +114,7 @@ init_macho(KXLDSymtab *symtab, struct symtab_command *src, boolean_t is_32_bit __unused) { kern_return_t rval = KERN_FAILURE; - u_long symoff; + u_long symoff; u_char * macho_or_linkedit = macho; check(symtab); @@ -128,7 +128,7 @@ init_macho(KXLDSymtab *symtab, struct symtab_command *src, /* Initialize the string table */ - if (kernel_linkedit_seg) { + if (kernel_linkedit_seg) { /* If initing the kernel file in memory, we can't trust * the symtab offsets directly, because the kernel file has been mapped @@ -146,13 +146,13 @@ init_macho(KXLDSymtab *symtab, struct symtab_command *src, * the base of the linkedit segment. 
*/ - symoff = (u_long)(src->symoff - kernel_linkedit_seg->fileoff); - symtab->strings = (char *)(uintptr_t)kernel_linkedit_seg->base_addr + + symoff = (u_long)(src->symoff - kernel_linkedit_seg->fileoff); + symtab->strings = (char *)(uintptr_t)kernel_linkedit_seg->base_addr + src->stroff - kernel_linkedit_seg->fileoff; macho_or_linkedit = (u_char *)(uintptr_t)kernel_linkedit_seg->base_addr; - } else { - symoff = (u_long)src->symoff; - symtab->strings = (char *) (macho + src->stroff); + } else { + symoff = (u_long)src->symoff; + symtab->strings = (char *) (macho + src->stroff); } symtab->strsize = src->strsize; @@ -185,7 +185,7 @@ init_syms_32(KXLDSymtab *symtab, u_char *macho, u_long offset, u_int nsyms) kern_return_t rval = KERN_FAILURE; KXLDSym *sym = NULL; u_int i = 0; - struct nlist *src_syms = (struct nlist *) (macho + offset); + struct nlist *src_syms = (struct nlist *) ((void *) (macho + offset)); for (i = 0; i < nsyms; ++i) { sym = kxld_array_get_item(&symtab->syms, i); @@ -212,7 +212,7 @@ init_syms_64(KXLDSymtab *symtab, u_char *macho, u_long offset, u_int nsyms) kern_return_t rval = KERN_FAILURE; KXLDSym *sym = NULL; u_int i = 0; - struct nlist_64 *src_syms = (struct nlist_64 *) (macho + offset); + struct nlist_64 *src_syms = (struct nlist_64 *) ((void *) (macho + offset)); for (i = 0; i < nsyms; ++i) { sym = kxld_array_get_item(&symtab->syms, i); @@ -421,6 +421,8 @@ kxld_symtab_get_macho_data_size(const KXLDSymtab *symtab, boolean_t is_32_bit) size += nsyms * sizeof(struct nlist_64); } + size = (size + 7) & ~7; + return size; } @@ -448,7 +450,7 @@ kxld_symtab_export_macho(const KXLDSymtab *symtab, u_char *buf, require_action(sizeof(*symtabhdr) <= header_size - *header_offset, finish, rval=KERN_FAILURE); - symtabhdr = (struct symtab_command *) (buf + *header_offset); + symtabhdr = (struct symtab_command *) ((void *) (buf + *header_offset)); *header_offset += sizeof(*symtabhdr); /* Initialize the symbol table header */ @@ -501,6 +503,8 @@ kxld_symtab_export_macho(const KXLDSymtab *symtab, u_char *buf, /* Update the data offset */ *data_offset += (symtabhdr->nsyms * nlistsize) + stroff; + *data_offset = (*data_offset + 7) & ~7; + rval = KERN_SUCCESS; finish: diff --git a/libkern/kxld/kxld_symtab.h b/libkern/kxld/kxld_symtab.h index a5a038756..ff4b557c5 100644 --- a/libkern/kxld/kxld_symtab.h +++ b/libkern/kxld/kxld_symtab.h @@ -28,7 +28,6 @@ #ifndef _KXLD_SYMTAB_H_ #define _KXLD_SYMTAB_H_ -#include #include #if KERNEL #include diff --git a/libkern/kxld/kxld_util.c b/libkern/kxld/kxld_util.c index 2f7a10643..67d838fe8 100644 --- a/libkern/kxld/kxld_util.c +++ b/libkern/kxld/kxld_util.c @@ -271,7 +271,7 @@ validate_and_swap_macho_32(u_char *file, u_long size ) { kern_return_t rval = KERN_FAILURE; - struct mach_header *mach_hdr = (struct mach_header *) file; + struct mach_header *mach_hdr = (struct mach_header *) ((void *) file); struct load_command *load_hdr = NULL; struct segment_command *seg_hdr = NULL; struct section *sects = NULL; @@ -325,7 +325,7 @@ validate_and_swap_macho_32(u_char *file, u_long size for(i = 0; i < mach_hdr->ncmds; ++i, offset += cmdsize) { /* Get the load command and size */ - load_hdr = (struct load_command *) (file + offset); + load_hdr = (struct load_command *) ((void *) (file + offset)); cmd = load_hdr->cmd; cmdsize = load_hdr->cmdsize; @@ -382,7 +382,7 @@ validate_and_swap_macho_32(u_char *file, u_long size kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogTruncatedMachO)); /* Swap the relocation entries */ - relocs = (struct relocation_info *) (file + 
sects[j].reloff); + relocs = (struct relocation_info *) ((void *) (file + sects[j].reloff)); #if !KERNEL if (swap) { swap_relocation_info(relocs, sects[j].nreloc, @@ -412,7 +412,7 @@ validate_and_swap_macho_32(u_char *file, u_long size #if !KERNEL /* Swap the symbol table entries */ - symtab = (struct nlist *) (file + symtab_hdr->symoff); + symtab = (struct nlist *) ((void *) (file + symtab_hdr->symoff)); if (swap) swap_nlist(symtab, symtab_hdr->nsyms, host_order); #endif /* !KERNEL */ @@ -442,7 +442,7 @@ validate_and_swap_macho_64(u_char *file, u_long size ) { kern_return_t rval = KERN_FAILURE; - struct mach_header_64 *mach_hdr = (struct mach_header_64 *) file; + struct mach_header_64 *mach_hdr = (struct mach_header_64 *) ((void *) file); struct load_command *load_hdr = NULL; struct segment_command_64 *seg_hdr = NULL; struct section_64 *sects = NULL; @@ -495,7 +495,7 @@ validate_and_swap_macho_64(u_char *file, u_long size /* Validate and potentially swap the load commands */ for(i = 0; i < mach_hdr->ncmds; ++i, offset += cmdsize) { /* Get the load command and size */ - load_hdr = (struct load_command *) (file + offset); + load_hdr = (struct load_command *) ((void *) (file + offset)); cmd = load_hdr->cmd; cmdsize = load_hdr->cmdsize; @@ -513,7 +513,7 @@ validate_and_swap_macho_64(u_char *file, u_long size switch(cmd) { case LC_SEGMENT_64: /* Get and swap the segment header */ - seg_hdr = (struct segment_command_64 *) load_hdr; + seg_hdr = (struct segment_command_64 *) ((void *) load_hdr); #if !KERNEL if (swap) swap_segment_command_64(seg_hdr, host_order); #endif /* !KERNEL */ @@ -551,7 +551,7 @@ validate_and_swap_macho_64(u_char *file, u_long size kxld_log(kKxldLogLinking, kKxldLogErr, kKxldLogTruncatedMachO)); /* Swap the relocation entries */ - relocs = (struct relocation_info *) (file + sects[j].reloff); + relocs = (struct relocation_info *) ((void *) (file + sects[j].reloff)); #if !KERNEL if (swap) { swap_relocation_info(relocs, sects[j].nreloc, @@ -581,7 +581,7 @@ validate_and_swap_macho_64(u_char *file, u_long size #if !KERNEL /* Swap the symbol table entries */ - symtab = (struct nlist_64 *) (file + symtab_hdr->symoff); + symtab = (struct nlist_64 *) ((void *) (file + symtab_hdr->symoff)); if (swap) swap_nlist_64(symtab, symtab_hdr->nsyms, host_order); #endif /* !KERNEL */ @@ -607,7 +607,7 @@ finish: void unswap_macho(u_char *file, enum NXByteOrder host_order, enum NXByteOrder target_order) { - struct mach_header *hdr = (struct mach_header *) file; + struct mach_header *hdr = (struct mach_header *) ((void *) file); if (!hdr) return; @@ -624,7 +624,7 @@ static void unswap_macho_32(u_char *file, enum NXByteOrder host_order, enum NXByteOrder target_order) { - struct mach_header *mach_hdr = (struct mach_header *) file; + struct mach_header *mach_hdr = (struct mach_header *) ((void *) file); struct load_command *load_hdr = NULL; struct segment_command *seg_hdr = NULL; struct section *sects = NULL; @@ -641,7 +641,7 @@ unswap_macho_32(u_char *file, enum NXByteOrder host_order, offset = sizeof(*mach_hdr); for(i = 0; i < mach_hdr->ncmds; ++i, offset += size) { - load_hdr = (struct load_command *) (file + offset); + load_hdr = (struct load_command *) ((void *) (file + offset)); cmd = load_hdr->cmd; size = load_hdr->cmdsize; @@ -659,7 +659,7 @@ unswap_macho_32(u_char *file, enum NXByteOrder host_order, break; case LC_SYMTAB: symtab_hdr = (struct symtab_command *) load_hdr; - symtab = (struct nlist*) (file + symtab_hdr->symoff); + symtab = (struct nlist*) ((void *) (file + symtab_hdr->symoff)); 
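The validate_and_swap_macho_32/64 routines above share one defensive pattern worth calling out: never read past a load-command header before proving it fits, and never trust cmdsize or an interior file offset (symoff, stroff, reloff) until it has been bounds-checked against the buffer. A compilable sketch of that walk, under assumed names (walk_load_commands_64 is not a kxld function):

#include <stdint.h>
#include <stddef.h>
#include <mach-o/loader.h>

static int
walk_load_commands_64(const uint8_t *file, size_t size)
{
    const struct mach_header_64 *mh =
        (const struct mach_header_64 *)(const void *)file;
    size_t offset = sizeof(*mh);
    uint32_t i;

    if (size < sizeof(*mh) || mh->magic != MH_MAGIC_64) return -1;

    for (i = 0; i < mh->ncmds; ++i) {
        const struct load_command *lc;

        if (offset + sizeof(*lc) > size) return -1;      /* header must fit */
        lc = (const struct load_command *)(const void *)(file + offset);

        if (lc->cmdsize < sizeof(*lc) || lc->cmdsize > size - offset)
            return -1;                                   /* body must fit too */

        if (lc->cmd == LC_SYMTAB) {
            const struct symtab_command *st =
                (const struct symtab_command *)(const void *)lc;
            if (st->symoff > size || st->stroff > size)
                return -1;                               /* check before use */
        }
        offset += lc->cmdsize;
    }
    return 0;
}

The double cast through (void *), which this patch adds throughout, is the same idiom in miniature: it steps from a byte pointer to a more strictly aligned structure pointer without changing the generated code, apparently to quiet cast-alignment warnings.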
swap_nlist(symtab, symtab_hdr->nsyms, target_order); swap_symtab_command(symtab_hdr, target_order); @@ -680,7 +680,7 @@ static void unswap_macho_64(u_char *file, enum NXByteOrder host_order, enum NXByteOrder target_order) { - struct mach_header_64 *mach_hdr = (struct mach_header_64 *) file; + struct mach_header_64 *mach_hdr = (struct mach_header_64 *) ((void *) file); struct load_command *load_hdr = NULL; struct segment_command_64 *seg_hdr = NULL; struct section_64 *sects = NULL; @@ -697,13 +697,13 @@ unswap_macho_64(u_char *file, enum NXByteOrder host_order, offset = sizeof(*mach_hdr); for(i = 0; i < mach_hdr->ncmds; ++i, offset += size) { - load_hdr = (struct load_command *) (file + offset); + load_hdr = (struct load_command *) ((void *) (file + offset)); cmd = load_hdr->cmd; size = load_hdr->cmdsize; switch(cmd) { case LC_SEGMENT_64: - seg_hdr = (struct segment_command_64 *) load_hdr; + seg_hdr = (struct segment_command_64 *) ((void *) load_hdr); sects = (struct section_64 *) &seg_hdr[1]; /* We don't need to unswap relocations because this function is @@ -715,7 +715,7 @@ unswap_macho_64(u_char *file, enum NXByteOrder host_order, break; case LC_SYMTAB: symtab_hdr = (struct symtab_command *) load_hdr; - symtab = (struct nlist_64 *) (file + symtab_hdr->symoff); + symtab = (struct nlist_64 *) ((void *) (file + symtab_hdr->symoff)); swap_nlist_64(symtab, symtab_hdr->nsyms, target_order); swap_symtab_command(symtab_hdr, target_order); diff --git a/libkern/kxld/kxld_util.h b/libkern/kxld/kxld_util.h index 9d5720f04..0eb0f2f7a 100644 --- a/libkern/kxld/kxld_util.h +++ b/libkern/kxld/kxld_util.h @@ -28,13 +28,18 @@ #ifndef _KXLD_UTIL_H_ #define _KXLD_UTIL_H_ -#include #include #if KERNEL #include + #include #else #include #include "kxld_types.h" + + /* Get machine.h from the kernel source so we can support all platforms + * that the kernel supports. Otherwise we're at the mercy of the host. + */ + #include "../../osfmk/mach/machine.h" #endif /* 64-bit helpers */ diff --git a/libkern/kxld/kxld_uuid.c b/libkern/kxld/kxld_uuid.c index ce64c343e..66f32a0fa 100644 --- a/libkern/kxld/kxld_uuid.c +++ b/libkern/kxld/kxld_uuid.c @@ -78,7 +78,7 @@ kxld_uuid_export_macho(const KXLDuuid *uuid, u_char *buf, require_action(sizeof(*uuidhdr) <= header_size - *header_offset, finish, rval=KERN_FAILURE); - uuidhdr = (struct uuid_command *) (buf + *header_offset); + uuidhdr = (struct uuid_command *) ((void *) (buf + *header_offset)); *header_offset += sizeof(*uuidhdr); uuidhdr->cmd = LC_UUID; diff --git a/libkern/kxld/kxld_versionmin.c b/libkern/kxld/kxld_versionmin.c new file mode 100644 index 000000000..9b4753c4b --- /dev/null +++ b/libkern/kxld/kxld_versionmin.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include + +#define DEBUG_ASSERT_COMPONENT_NAME_STRING "kxld" +#include + +#include "kxld_util.h" +#include "kxld_versionmin.h" + +/******************************************************************************* +*******************************************************************************/ +void +kxld_versionmin_init_from_macho(KXLDversionmin *versionmin, struct version_min_command *src) +{ + check(versionmin); + check(src); + check((src->cmd == LC_VERSION_MIN_MACOSX) || (src->cmd == LC_VERSION_MIN_IPHONEOS)); + + switch (src->cmd) { + case LC_VERSION_MIN_MACOSX: + versionmin->platform = kKxldVersionMinMacOSX; + break; + case LC_VERSION_MIN_IPHONEOS: + versionmin->platform = kKxldVersionMiniPhoneOS; + break; + } + + versionmin->version = src->version; + versionmin->has_versionmin = TRUE; +} + +/******************************************************************************* +*******************************************************************************/ +void +kxld_versionmin_clear(KXLDversionmin *versionmin) +{ + bzero(versionmin, sizeof(*versionmin)); +} + +/******************************************************************************* +*******************************************************************************/ +u_long +kxld_versionmin_get_macho_header_size(void) +{ + return sizeof(struct version_min_command); +} + +/******************************************************************************* +*******************************************************************************/ +kern_return_t +kxld_versionmin_export_macho(const KXLDversionmin *versionmin, u_char *buf, + u_long *header_offset, u_long header_size) +{ + kern_return_t rval = KERN_FAILURE; + struct version_min_command *versionminhdr = NULL; + + check(versionmin); + check(buf); + check(header_offset); + + require_action(sizeof(*versionminhdr) <= header_size - *header_offset, finish, + rval=KERN_FAILURE); + versionminhdr = (struct version_min_command *) ((void *) (buf + *header_offset)); + bzero(versionminhdr, sizeof(*versionminhdr)); + *header_offset += sizeof(*versionminhdr); + + switch (versionmin->platform) { + case kKxldVersionMinMacOSX: + versionminhdr->cmd = LC_VERSION_MIN_MACOSX; + break; + case kKxldVersionMiniPhoneOS: + versionminhdr->cmd = LC_VERSION_MIN_IPHONEOS; + break; + } + versionminhdr->cmdsize = (uint32_t) sizeof(*versionminhdr); + versionminhdr->version = versionmin->version; + versionminhdr->sdk = 0; + + rval = KERN_SUCCESS; + +finish: + return rval; +} + diff --git a/libkern/kxld/kxld_versionmin.h b/libkern/kxld/kxld_versionmin.h new file mode 100644 index 000000000..3ebcac665 --- /dev/null +++ b/libkern/kxld/kxld_versionmin.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _KXLD_VERSIONMIN_H_ +#define _KXLD_VERSIONMIN_H_ + +#include +#if KERNEL + #include +#else + #include "kxld_types.h" +#endif + +struct version_min_command; +typedef struct kxld_versionmin KXLDversionmin; + +enum kxld_versionmin_platforms { + kKxldVersionMinMacOSX, + kKxldVersionMiniPhoneOS +}; + +struct kxld_versionmin { + enum kxld_versionmin_platforms platform; + uint32_t version; + boolean_t has_versionmin; +}; + +/******************************************************************************* +* Constructors and destructors +*******************************************************************************/ + +void kxld_versionmin_init_from_macho(KXLDversionmin *versionmin, struct version_min_command *src) + __attribute__((nonnull, visibility("hidden"))); + +void kxld_versionmin_clear(KXLDversionmin *versionmin) + __attribute__((nonnull, visibility("hidden"))); + +/******************************************************************************* +* Accessors +*******************************************************************************/ + +u_long kxld_versionmin_get_macho_header_size(void) + __attribute__((pure, visibility("hidden"))); + +kern_return_t +kxld_versionmin_export_macho(const KXLDversionmin *versionmin, u_char *buf, + u_long *header_offset, u_long header_size) + __attribute__((pure, nonnull, visibility("hidden"))); + +#endif /* _KXLD_VERSIONMIN_H_ */ + diff --git a/libkern/kxld/kxld_vtable.c b/libkern/kxld/kxld_vtable.c index e792d3842..24408145b 100644 --- a/libkern/kxld/kxld_vtable.c +++ b/libkern/kxld/kxld_vtable.c @@ -495,6 +495,7 @@ kxld_vtable_patch(KXLDVTable *vtable, const KXLDVTable *super_vtable, symtab = kxld_object_get_symtab(object); require_action(!vtable->is_patched, finish, rval=KERN_SUCCESS); + require_action(super_vtable->is_patched, finish, rval=KERN_FAILURE); require_action(vtable->entries.nitems >= super_vtable->entries.nitems, finish, rval=KERN_FAILURE; kxld_log(kKxldLogPatching, kKxldLogErr, kKxldLogMalformedVTable, diff --git a/libkern/libkern/Makefile b/libkern/libkern/Makefile index 2d86d6882..55954e985 100644 --- a/libkern/libkern/Makefile +++ b/libkern/libkern/Makefile @@ -15,13 +15,11 @@ INSTINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS} \ i386 INSTINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS} \ i386 -INSTINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS} \ - arm + EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} -EXPINC_SUBDIRS_ARM = 
${INSTINC_SUBDIRS_ARM} DATAFILES = \ OSAtomic.h \ @@ -43,7 +41,7 @@ PRIVATE_DATAFILES = \ OSKextLibPrivate.h \ kext_request_keys.h \ mkext.h \ - prelink.h \ + prelink.h \ WKdm.h INSTALL_MI_LIST = \ @@ -77,7 +75,8 @@ EXPORT_MI_LIST = \ ${PRIVATE_DATAFILES} \ kernel_mach_header.h \ kxld.h \ - kxld_types.h + kxld_types.h \ + stack_protector.h EXPORT_MI_GEN_LIST = version.h diff --git a/libkern/libkern/OSAtomic.h b/libkern/libkern/OSAtomic.h index d585c4175..98e0eb99b 100644 --- a/libkern/libkern/OSAtomic.h +++ b/libkern/libkern/OSAtomic.h @@ -85,8 +85,6 @@ extern Boolean OSCompareAndSwap64( #endif /* defined(__i386__) || defined(__x86_64__) */ -#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) - /*! * @function OSAddAtomic64 * @@ -130,8 +128,6 @@ inline static SInt64 OSDecrementAtomic64(volatile SInt64 * address) return OSAddAtomic64(-1LL, address); } -#endif /* defined(__i386__) || defined(__x86_64__) || defined(__arm__) */ - #if XNU_KERNEL_PRIVATE /* Not to be included in headerdoc. * diff --git a/libkern/libkern/OSKextLib.h b/libkern/libkern/OSKextLib.h index 6ecc3548d..34e7544ac 100644 --- a/libkern/libkern/OSKextLib.h +++ b/libkern/libkern/OSKextLib.h @@ -346,6 +346,15 @@ __BEGIN_DECLS */ #define kIOPersonalityPublisherKey "IOPersonalityPublisher" +#if CONFIG_KEC_FIPS +/* + * @define kAppleTextHashesKey + * @abstract A dictionary containing hashes for corecrypto kext. + */ +#define kAppleTextHashesKey "AppleTextHashes" +#endif + + #if PRAGMA_MARK /********************************************************************/ @@ -916,6 +925,25 @@ extern const void * gOSKextUnresolved; #define OSKextSymbolIsResolved(weak_sym) \ (&(weak_sym) != gOSKextUnresolved) + +#if CONFIG_KEC_FIPS + +#if PRAGMA_MARK +#pragma mark - +/********************************************************************/ +#pragma mark Kernel External Components for FIPS compliance +/********************************************************************/ +#endif + +// Kernel External Components for FIPS compliance (KEC_FIPS) +// WARNING - ath_hash is owned by the kernel, do not free +typedef struct AppleTEXTHash { + const int ath_version; // version of this structure (value is 1) + int ath_length; // length of hash data + void * ath_hash; // hash extracted from AppleTextHashes dict +} AppleTEXTHash_t; +#endif // CONFIG_KEC_FIPS + #endif /* KERNEL */ __END_DECLS diff --git a/libkern/libkern/OSKextLibPrivate.h b/libkern/libkern/OSKextLibPrivate.h index 53fbc3921..15f85461d 100644 --- a/libkern/libkern/OSKextLibPrivate.h +++ b/libkern/libkern/OSKextLibPrivate.h @@ -72,6 +72,28 @@ typedef uint8_t OSKextExcludeLevel; */ #define kOSBundleHelperKey "OSBundleHelper" +/*! + * @define kOSBundleDeveloperOnlyKey + * @abstract A boolean value indicating whether the kext should only load on + * Developer devices. + */ +#define kOSBundleDeveloperOnlyKey "OSBundleDeveloperOnly" + + +/*! + * @define kAppleSecurityExtensionKey + * @abstract A boolean value indicating whether the kext registers + * MACF hooks. + */ +#define kAppleSecurityExtensionKey "AppleSecurityExtension" + +/*! + * @define kAppleKernelExternalComponentKey + * @abstract A boolean value indicating whether the kext is vending kernel + * KPI, and needs special loading behavior. 
diff --git a/libkern/libkern/OSKextLibPrivate.h b/libkern/libkern/OSKextLibPrivate.h
index 53fbc3921..15f85461d 100644
--- a/libkern/libkern/OSKextLibPrivate.h
+++ b/libkern/libkern/OSKextLibPrivate.h
@@ -72,6 +72,28 @@ typedef uint8_t OSKextExcludeLevel;
  */
 #define kOSBundleHelperKey "OSBundleHelper"
 
+/*!
+ * @define kOSBundleDeveloperOnlyKey
+ * @abstract A boolean value indicating whether the kext should only load on
+ *           Developer devices.
+ */
+#define kOSBundleDeveloperOnlyKey "OSBundleDeveloperOnly"
+
+
+/*!
+ * @define kAppleSecurityExtensionKey
+ * @abstract A boolean value indicating whether the kext registers
+ *           MACF hooks.
+ */
+#define kAppleSecurityExtensionKey "AppleSecurityExtension"
+
+/*!
+ * @define kAppleKernelExternalComponentKey
+ * @abstract A boolean value indicating whether the kext is vending kernel
+ *           KPI, and needs special loading behavior.
+ */
+#define kAppleKernelExternalComponentKey "AppleKernelExternalComponent"
+
 // properties found in the registry root
 #define kOSKernelCPUTypeKey "OSKernelCPUType"
 #define kOSKernelCPUSubtypeKey "OSKernelCPUSubtype"
diff --git a/libkern/libkern/OSTypes.h b/libkern/libkern/OSTypes.h
index 6a03a2740..0945952bb 100644
--- a/libkern/libkern/OSTypes.h
+++ b/libkern/libkern/OSTypes.h
@@ -89,7 +89,7 @@ typedef struct wide {
 
 typedef SInt32 OSStatus;
 
-#if defined(__LP64__) && defined(KERNEL)
+#if (defined(__LP64__) || defined (__arm__)) && defined(KERNEL)
 #ifndef ABSOLUTETIME_SCALAR_TYPE
 #define ABSOLUTETIME_SCALAR_TYPE 1
 #endif
diff --git a/libkern/libkern/WKdm.h b/libkern/libkern/WKdm.h
index f88b9971b..68977ce8a 100644
--- a/libkern/libkern/WKdm.h
+++ b/libkern/libkern/WKdm.h
@@ -68,11 +68,11 @@ typedef unsigned int WK_word;
 /* the next few are used during compression to write the header */
 #define SET_QPOS_AREA_START(compr_dest_buf,qpos_start_addr)  \
-	(compr_dest_buf[1] = (unsigned int)(qpos_start_addr - compr_dest_buf))
+	(compr_dest_buf[1] = (WK_word)(qpos_start_addr - compr_dest_buf))
 #define SET_LOW_BITS_AREA_START(compr_dest_buf,lb_start_addr)   \
-	(compr_dest_buf[2] = (unsigned int)(lb_start_addr - compr_dest_buf))
+	(compr_dest_buf[2] = (WK_word)(lb_start_addr - compr_dest_buf))
 #define SET_LOW_BITS_AREA_END(compr_dest_buf,lb_end_addr)   \
-	(compr_dest_buf[3] = (unsigned int)(lb_end_addr - compr_dest_buf))
+	(compr_dest_buf[3] = (WK_word)(lb_end_addr - compr_dest_buf))
 
 /* the next few are only use during decompression to read the header */
 #define TAGS_AREA_START(decomp_src_buf)   \
diff --git a/libkern/libkern/c++/Makefile b/libkern/libkern/c++/Makefile
index 8045763a1..9b7738bd7 100644
--- a/libkern/libkern/c++/Makefile
+++ b/libkern/libkern/c++/Makefile
@@ -12,7 +12,6 @@ INSTINC_SUBDIRS_I386 =
 
 INSTINC_SUBDIRS_X86_64 =
 
-INSTINC_SUBDIRS_ARM =
 
 EXPINC_SUBDIRS = ${INSTINC_SUBDIRS}
 
@@ -20,7 +19,6 @@ EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386}
 
 EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64}
 
-EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM}
 
 DATAFILES = \
 	OSArray.h \
diff --git a/libkern/libkern/c++/OSCollection.h b/libkern/libkern/c++/OSCollection.h
index ea0acb648..adb7cbf8b 100644
--- a/libkern/libkern/c++/OSCollection.h
+++ b/libkern/libkern/c++/OSCollection.h
@@ -102,7 +102,11 @@ protected:
      */
     unsigned int updateStamp;
 
+#ifdef XNU_KERNEL_PRIVATE
+protected:
+#else
 private:
+#endif /* XNU_KERNEL_PRIVATE */
     /* Reserved for future use.  (Internal use only)  */
     // ExpansionData * reserved;
     unsigned int fOptions;
@@ -230,6 +234,7 @@ public:
      */
     typedef enum {
         kImmutable = 0x00000001,
+        kSort      = 0x00000002,
         kMASK      = (unsigned) -1
     } _OSCollectionFlags;
 
diff --git a/libkern/libkern/c++/OSData.h b/libkern/libkern/c++/OSData.h
index 2b4159604..11c3a3d10 100644
--- a/libkern/libkern/c++/OSData.h
+++ b/libkern/libkern/c++/OSData.h
@@ -81,7 +81,7 @@ protected:
     unsigned int capacity;
     unsigned int capacityIncrement;
 
-    struct ExpansionData { };
+    struct ExpansionData;
 
    /* Reserved for future use.  (Internal use only)  */
     ExpansionData * reserved;
 
@@ -711,6 +711,9 @@ public:
         unsigned char byte,
         unsigned int  numBytes);
 
+
+    void setSerializable(bool serializable);
+
 #ifdef XNU_KERNEL_PRIVATE
     /* Available within xnu source only */
 public:
diff --git a/libkern/libkern/c++/OSKext.h b/libkern/libkern/c++/OSKext.h
index d3f0fa232..c8d5edd0c 100644
--- a/libkern/libkern/c++/OSKext.h
+++ b/libkern/libkern/c++/OSKext.h
@@ -245,6 +245,7 @@ private:
         unsigned int delayAutounload:1;    // for development
 
         unsigned int CPPInitialized:1;
+        unsigned int jettisonLinkeditSeg:1;
     } flags;
 
 #if PRAGMA_MARK
@@ -388,9 +389,9 @@ private:
     static void recordIdentifierRequest(
         OSString * kextIdentifier);
 
+    virtual OSReturn slidePrelinkedExecutable(void);
     virtual OSReturn loadExecutable(void);
     virtual void     jettisonLinkeditSegment(void);
-    virtual OSReturn removeLinkeditHeaders(kernel_segment_command_t *linkedit);
     static  void     considerDestroyingLinkContext(void);
     virtual OSData * getExecutable(void);
     virtual void     setLinkedExecutable(OSData * anExecutable);
@@ -437,8 +438,6 @@ private:
         OSArray * keys = NULL);
     virtual OSDictionary * copyInfo(OSArray * keys = NULL);
 
-    static OSData * copySanitizedKernelImage(void);
-
     /* Logging to user space.
      */
     static OSKextLogSpec setUserSpaceLogFilter(
@@ -573,7 +572,7 @@ public:
     static void     flushNonloadedKexts(Boolean flushPrelinkedKexts);
     static void     setKextdActive(Boolean active = true);
     static void     setDeferredLoadSucceeded(Boolean succeeded = true);
-    static void     considerRebuildOfPrelinkedKernel(OSString * moduleName);
+    static void     considerRebuildOfPrelinkedKernel(void);
 
     virtual bool    setAutounloadEnabled(bool flag);
 
diff --git a/libkern/libkern/c++/OSMetaClass.h b/libkern/libkern/c++/OSMetaClass.h
index cb2f9896a..fe211c724 100644
--- a/libkern/libkern/c++/OSMetaClass.h
+++ b/libkern/libkern/c++/OSMetaClass.h
@@ -39,6 +39,9 @@ class OSString;
 class OSSymbol;
 class OSDictionary;
 class OSSerialize;
+#ifdef XNU_KERNEL_PRIVATE
+class OSOrderedSet;
+#endif
 
 /*!
 @@ -54,8 +57,26 @@ class OSSerialize;
 /*! @parseOnly */
 #define APPLE_KEXT_COMPATIBILITY
 
+#ifdef XNU_KERNEL_PRIVATE
+
+#ifdef CONFIG_EMBEDDED
+#define APPLE_KEXT_VTABLE_PADDING 0
+#else /* CONFIG_EMBEDDED */
 /*! @parseOnly */
 #define APPLE_KEXT_VTABLE_PADDING 1
+#endif /* CONFIG_EMBEDDED */
+
+#else /* XNU_KERNEL_PRIVATE */
+#include <TargetConditionals.h>
+
+#if TARGET_OS_EMBEDDED
+#define APPLE_KEXT_VTABLE_PADDING 0
+#else /* TARGET_OS_EMBEDDED */
+/*! @parseOnly */
+#define APPLE_KEXT_VTABLE_PADDING 1
+#endif /* TARGET_OS_EMBEDDED */
+
+#endif /* XNU_KERNEL_PRIVATE */
 
 #if defined(__LP64__)
 /*! @parseOnly */
@@ -64,16 +85,6 @@ class OSSerialize;
 #define APPLE_KEXT_LEGACY_ABI 1
 #endif
 
-#if APPLE_KEXT_VTABLE_PADDING
-/*! @parseOnly */
-#define APPLE_KEXT_PAD_METHOD virtual
-/*! @parseOnly */
-#define APPLE_KEXT_PAD_IMPL(index) gMetaClass.reservedCalled(index)
-#else
-#define APPLE_KEXT_PAD_METHOD static
-#define APPLE_KEXT_PAD_IMPL(index)
-#endif
-
 #if defined(__LP64__)
 /*! @parseOnly */
 #define APPLE_KEXT_COMPATIBILITY_VIRTUAL
@@ -329,6 +340,7 @@ _ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void))
 }
 
 #else /* !APPLE_KEXT_LEGACY_ABI */
+#if defined(__i386__) || defined(__x86_64__)
 
 // Slightly less arcane and slightly less evil code to do
 // the same for kexts compiled with the standard Itanium C++
@@ -361,6 +373,9 @@ _ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void))
     }
 }
 
+#else
+#error Unknown architecture.
+#endif /* __arm__ */ #endif /* !APPLE_KEXT_LEGACY_ABI */ @@ -745,15 +760,22 @@ protected: const int freeWhen) const = 0; private: +#if APPLE_KEXT_VTABLE_PADDING // Virtual Padding virtual void _RESERVEDOSMetaClassBase3(); virtual void _RESERVEDOSMetaClassBase4(); virtual void _RESERVEDOSMetaClassBase5(); virtual void _RESERVEDOSMetaClassBase6(); virtual void _RESERVEDOSMetaClassBase7(); +#endif } APPLE_KEXT_COMPATIBILITY; +#ifdef XNU_KERNEL_PRIVATE +typedef bool (*OSMetaClassInstanceApplierFunction)(const OSObject * instance, + void * context); +#endif /* XNU_KERNEL_PRIVATE */ + /*! * @class OSMetaClass * @@ -848,10 +870,8 @@ private: // Can never be allocated must be created at compile time static void * operator new(size_t size); - struct ExpansionData { }; - /* Reserved for future use. (Internal use only) */ - ExpansionData *reserved; + struct ExpansionData *reserved; /* superClass Handle to the superclass's meta class. */ const OSMetaClass *superClassLink; @@ -1474,7 +1494,6 @@ public: */ const OSMetaClass * getSuperClass() const; - /*! * @function getKmodName * @@ -1501,6 +1520,7 @@ public: * Returns the name of the C++ class managed by this metaclass. */ const char * getClassName() const; + const OSSymbol * getClassNameSymbol() const; /*! @@ -1531,6 +1551,21 @@ public: */ virtual OSObject * alloc() const = 0; +#ifdef XNU_KERNEL_PRIVATE + void addInstance(const OSObject * instance, bool super = false) const; + void removeInstance(const OSObject * instance, bool super = false) const; + void applyToInstances(OSMetaClassInstanceApplierFunction applier, + void * context) const; + static void applyToInstancesOfClassName( + const OSSymbol * name, + OSMetaClassInstanceApplierFunction applier, + void * context); +private: + static void applyToInstances(OSOrderedSet * set, + OSMetaClassInstanceApplierFunction applier, + void * context); +public: +#endif /* Not to be included in headerdoc. * @@ -1939,9 +1974,13 @@ public: * @link OSMetaClassDeclareReservedUsed * OSMetaClassDeclareReservedUsed@/link. */ +#if APPLE_KEXT_VTABLE_PADDING #define OSMetaClassDeclareReservedUnused(className, index) \ private: \ - APPLE_KEXT_PAD_METHOD void _RESERVED ## className ## index () + virtual void _RESERVED ## className ## index () +#else +#define OSMetaClassDeclareReservedUnused(className, index) +#endif /*! @@ -2001,9 +2040,13 @@ public: * @link OSMetaClassDefineReservedUsed * OSMetaClassDefineReservedUsed@/link. */ +#if APPLE_KEXT_VTABLE_PADDING #define OSMetaClassDefineReservedUnused(className, index) \ void className ::_RESERVED ## className ## index () \ - { APPLE_KEXT_PAD_IMPL(index); } + { gMetaClass.reservedCalled(index); } +#else +#define OSMetaClassDefineReservedUnused(className, index) +#endif /*! diff --git a/libkern/libkern/c++/OSObject.h b/libkern/libkern/c++/OSObject.h index b33ed3c47..a24f30e98 100644 --- a/libkern/libkern/c++/OSObject.h +++ b/libkern/libkern/c++/OSObject.h @@ -36,6 +36,10 @@ HISTORY #include +#if defined(__clang__) +#pragma clang diagnostic ignored "-Woverloaded-virtual" +#endif + class OSSymbol; class OSString; diff --git a/libkern/libkern/c++/OSSymbol.h b/libkern/libkern/c++/OSSymbol.h index 5067423a9..d3ae9e1e1 100644 --- a/libkern/libkern/c++/OSSymbol.h +++ b/libkern/libkern/c++/OSSymbol.h @@ -366,13 +366,21 @@ public: virtual bool isEqualTo(const OSMetaClassBase * anObject) const; +#ifdef XNU_KERNEL_PRIVATE /* OSRuntime only INTERNAL API - DO NOT USE */ /* Not to be included in headerdoc. */ // xx-review: this should be removed from the symbol set. 
+    static void checkForPageUnload(
+        void * startAddr,
+        void * endAddr);
+
+    static unsigned int bsearch(
+        const void * key,
+        const void * array,
+        unsigned int arrayCount,
+        size_t       memberSize);
+#endif /* XNU_KERNEL_PRIVATE */
 
     OSMetaClassDeclareReservedUnused(OSSymbol, 0);
     OSMetaClassDeclareReservedUnused(OSSymbol, 1);
diff --git a/libkern/libkern/crypto/Makefile b/libkern/libkern/crypto/Makefile
index 38aaa055e..1b8cc587c 100644
--- a/libkern/libkern/crypto/Makefile
+++ b/libkern/libkern/crypto/Makefile
@@ -16,11 +16,15 @@ EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386}
 
 DATAFILES = md5.h sha1.h
 
+PRIVATE_DATAFILES = register_crypto.h sha2.h des.h aes.h aesxts.h
+
 INSTALL_KF_MI_LIST = ${DATAFILES}
 
+INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES}
+
 INSTALL_MI_DIR = libkern/crypto
 
-EXPORT_MI_LIST = ${DATAFILES}
+EXPORT_MI_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} crypto_internal.h
 
 EXPORT_MI_DIR = libkern/crypto
 
diff --git a/libkern/libkern/crypto/aes.h b/libkern/libkern/crypto/aes.h
new file mode 100644
index 000000000..dc7a16c6e
--- /dev/null
+++ b/libkern/libkern/crypto/aes.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _AES_H
+#define _AES_H
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+#include <corecrypto/ccmode.h>
+#include <corecrypto/ccn.h>
+
+#define AES_BLOCK_SIZE  16  /* the AES block size in bytes */
+
+//Unholy HACK: this works because we know the size of the context for every
+//possible corecrypto implementation is less than this.
+#define AES_CBC_CTX_MAX_SIZE (ccn_sizeof_size(sizeof(void *)) + ccn_sizeof_size(AES_BLOCK_SIZE) + ccn_sizeof_size(64*4))
+
+typedef struct{
+	cccbc_ctx_decl(AES_CBC_CTX_MAX_SIZE, ctx);
+} aes_decrypt_ctx;
+
+typedef struct{
+	cccbc_ctx_decl(AES_CBC_CTX_MAX_SIZE, ctx);
+} aes_encrypt_ctx;
+
+typedef struct
+{
+	aes_decrypt_ctx decrypt;
+	aes_encrypt_ctx encrypt;
+} aes_ctx;
+
+
+/* For compatibility with old APIs */
+#define aes_ret		int
+#define aes_good	0
+#define aes_error	-1
+#define aes_rval	aes_ret
+
+
+
+/* Key lengths in the range 16 <= key_len <= 32 are given in bytes, */
+/* those in the range 128 <= key_len <= 256 are given in bits       */
+
+aes_rval aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1]);
+aes_rval aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]);
+aes_rval aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]);
+
+#if defined (__i386__) || defined (__x86_64__)
+aes_rval aes_encrypt(const unsigned char *in, unsigned char *out, aes_encrypt_ctx cx[1]);
+#endif
+
+aes_rval aes_encrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigned int num_blk,
+			 unsigned char *out_blk, aes_encrypt_ctx cx[1]);
+
+
+aes_rval aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1]);
+aes_rval aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]);
+aes_rval aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]);
+
+#if defined (__i386__) || defined (__x86_64__)
+aes_rval aes_decrypt(const unsigned char *in, unsigned char *out, aes_decrypt_ctx cx[1]);
+#endif
+
+aes_rval aes_decrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigned int num_blk,
+			 unsigned char *out_blk, aes_decrypt_ctx cx[1]);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
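A short usage sketch for the CBC interface above (kernel-side caller; the key, IV, and buffers are illustrative, and error handling is minimal):

    #include <libkern/crypto/aes.h>

    /* Encrypt two 16-byte blocks with AES-128 in CBC mode.
     * Note that aes_encrypt_cbc() counts whole blocks, not bytes. */
    static int encrypt_two_blocks(const unsigned char key[16],
                                  const unsigned char iv[AES_BLOCK_SIZE],
                                  const unsigned char in[2 * AES_BLOCK_SIZE],
                                  unsigned char out[2 * AES_BLOCK_SIZE])
    {
        aes_encrypt_ctx ctx;

        if (aes_encrypt_key128(key, &ctx) != aes_good) {
            return aes_error;
        }
        return aes_encrypt_cbc(in, iv, 2 /* num_blk */, out, &ctx);
    }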
diff --git a/libkern/libkern/crypto/aesxts.h b/libkern/libkern/crypto/aesxts.h
new file mode 100644
index 000000000..ad1da4310
--- /dev/null
+++ b/libkern/libkern/crypto/aesxts.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _CRYPTO_AESXTS_H
+#define _CRYPTO_AESXTS_H
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+#include <stdint.h>
+#include <corecrypto/ccmode.h>
+#include <corecrypto/ccn.h>
+
+//Unholy HACK: this works because we know the size of the context for every
+//possible corecrypto implementation is less than this.
+#define AES_XTS_CTX_MAX_SIZE (ccn_sizeof_size(3*sizeof(void *)) + 2*ccn_sizeof_size(128*4) + ccn_sizeof_size(16))
+
+typedef struct {
+	ccxts_ctx_decl(AES_XTS_CTX_MAX_SIZE, enc);
+	ccxts_ctx_decl(AES_XTS_CTX_MAX_SIZE, dec);
+} symmetric_xts;
+
+
+/*
+ * These are the interfaces required for XTS-AES support
+ */
+
+uint32_t
+xts_start(uint32_t cipher, // ignored - we're doing this for xts-aes only
+	  const uint8_t *IV, // ignored
+	  const uint8_t *key1, int keylen,
+	  const uint8_t *key2, int tweaklen, // both keys are the same size for xts
+	  uint32_t num_rounds, // ignored
+	  uint32_t options, // ignored
+	  symmetric_xts *xts);
+
+int xts_encrypt(const uint8_t *pt, unsigned long ptlen,
+		uint8_t *ct,
+		const uint8_t *tweak, // this can be considered the sector IV for this use
+		symmetric_xts *xts);
+
+int xts_decrypt(const uint8_t *ct, unsigned long ptlen,
+		uint8_t *pt,
+		const uint8_t *tweak, // this can be considered the sector IV for this use
+		symmetric_xts *xts);
+
+void xts_done(symmetric_xts *xts);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
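A usage sketch for the XTS interface above, encrypting one 512-byte sector with the tweak as the per-sector IV. The zero-on-success return convention and the 16-byte key length are assumptions for illustration:

    #include <libkern/crypto/aesxts.h>

    /* Encrypt a 512-byte sector with AES-XTS. The ignored parameters
     * (cipher, IV, num_rounds, options) are passed as zeros/NULL. */
    static int encrypt_sector(const uint8_t key1[16], const uint8_t key2[16],
                              const uint8_t tweak[16],
                              const uint8_t pt[512], uint8_t ct[512])
    {
        symmetric_xts xts;
        int err;

        if (xts_start(0, NULL, key1, 16, key2, 16, 0, 0, &xts) != 0) {
            return -1;                  /* key schedule setup failed */
        }
        err = xts_encrypt(pt, 512, ct, tweak, &xts);
        xts_done(&xts);                 /* always release the context */
        return err;
    }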
diff --git a/libkern/libkern/crypto/crypto_internal.h b/libkern/libkern/crypto/crypto_internal.h
new file mode 100644
index 000000000..82c98b151
--- /dev/null
+++ b/libkern/libkern/crypto/crypto_internal.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/* To access the corecrypto functions */
+#ifndef _CRYPTO_CRYPTO_INTERNAL_H_
+#define _CRYPTO_CRYPTO_INTERNAL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <libkern/crypto/register_crypto.h>
+
+extern crypto_functions_t g_crypto_funcs;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_CRYPTO_CRYPTO_INTERNAL_H_*/
diff --git a/libkern/libkern/crypto/des.h b/libkern/libkern/crypto/des.h
new file mode 100644
index 000000000..960e60e04
--- /dev/null
+++ b/libkern/libkern/crypto/des.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _CRYPTO_DES_H
+#define _CRYPTO_DES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <corecrypto/ccdes.h>
+#include <corecrypto/ccmode.h>
+#include <corecrypto/ccn.h>
+
+/* must be a 32-bit quantity */
+#define DES_LONG u_int32_t
+
+typedef unsigned char des_cblock[8];
+
+/* Unholy hack: this is currently the size for the only implementation of DES in corecrypto */
+#define DES_ECB_CTX_MAX_SIZE (64*4)
+#define DES_CBC_CTX_MAX_SIZE (ccn_sizeof_size(sizeof(struct ccmode_ecb)) + ccn_sizeof_size(CCDES_BLOCK_SIZE) + ccn_sizeof_size(DES_ECB_CTX_MAX_SIZE))
+#define DES3_ECB_CTX_MAX_SIZE (64*4*3)
+#define DES3_CBC_CTX_MAX_SIZE (ccn_sizeof_size(sizeof(struct ccmode_ecb)) + ccn_sizeof_size(CCDES_BLOCK_SIZE) + ccn_sizeof_size(DES3_ECB_CTX_MAX_SIZE))
+
+
+typedef struct{
+	ccecb_ctx_decl(DES_ECB_CTX_MAX_SIZE, enc);
+	ccecb_ctx_decl(DES_ECB_CTX_MAX_SIZE, dec);
+} des_ecb_key_schedule;
+
+typedef struct{
+	cccbc_ctx_decl(DES_CBC_CTX_MAX_SIZE, enc);
+	cccbc_ctx_decl(DES_CBC_CTX_MAX_SIZE, dec);
+} des_cbc_key_schedule;
+
+typedef struct{
+	ccecb_ctx_decl(DES3_ECB_CTX_MAX_SIZE, enc);
+	ccecb_ctx_decl(DES3_ECB_CTX_MAX_SIZE, dec);
+} des3_ecb_key_schedule;
+
+typedef struct{
+	cccbc_ctx_decl(DES3_CBC_CTX_MAX_SIZE, enc);
+	cccbc_ctx_decl(DES3_CBC_CTX_MAX_SIZE, dec);
+} des3_cbc_key_schedule;
+
+/* Only here for backward compatibility with smb kext */
+typedef des_ecb_key_schedule des_key_schedule[1];
+#define des_set_key des_ecb_key_sched
+
+#define DES_ENCRYPT 1
+#define DES_DECRYPT 0
+
+
+/* Single DES ECB - 1 block */
+int des_ecb_key_sched(des_cblock *key, des_ecb_key_schedule *ks);
+void des_ecb_encrypt(des_cblock *in, des_cblock *out, des_ecb_key_schedule *ks, int encrypt);
+
+/* Triple DES ECB - 1 block */
+int des3_ecb_key_sched(des_cblock *key, des3_ecb_key_schedule *ks);
+void des3_ecb_encrypt(des_cblock *block, des_cblock *, des3_ecb_key_schedule *ks, int encrypt);
+
+/* Single DES CBC */
+int des_cbc_key_sched(des_cblock *key, des_cbc_key_schedule *ks);
+void des_cbc_encrypt(des_cblock *in, des_cblock *out, int32_t len,
+		     des_cbc_key_schedule *ks, des_cblock *iv, des_cblock *retiv, int encrypt);
+
+/* Triple DES CBC */
+int des3_cbc_key_sched(des_cblock *key, des3_cbc_key_schedule *ks);
+void des3_cbc_encrypt(des_cblock *in, des_cblock *out, int32_t len,
+		      des3_cbc_key_schedule *ks, des_cblock *iv, des_cblock *retiv, int encrypt);
+
+/* Single DES CBC-MAC */
+void des_cbc_cksum(des_cblock *in, des_cblock *out, int len, des_cbc_key_schedule *ks);
+
+void des_fixup_key_parity(des_cblock *key);
+int des_is_weak_key(des_cblock *key);
+// int des_set_key(des_cblock *, des_key_schedule); // Unsupported KPI.
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
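A usage sketch for the single-DES ECB interface above (DES survives here for legacy consumers such as the SMB kext; the zero-on-success key-schedule convention is an assumption):

    #include <libkern/crypto/des.h>

    /* Encrypt one 8-byte block with DES-ECB. */
    static int encrypt_block(des_cblock *key, des_cblock *in, des_cblock *out)
    {
        des_ecb_key_schedule ks;

        if (des_ecb_key_sched(key, &ks) != 0) {
            return -1;                          /* bad or weak key */
        }
        des_ecb_encrypt(in, out, &ks, DES_ENCRYPT);
        return 0;
    }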
diff --git a/libkern/libkern/crypto/register_crypto.h b/libkern/libkern/crypto/register_crypto.h
new file mode 100644
index 000000000..d6647dba5
--- /dev/null
+++ b/libkern/libkern/crypto/register_crypto.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _CRYPTO_REGISTER_CRYPTO_H_
+#define _CRYPTO_REGISTER_CRYPTO_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <corecrypto/ccdigest.h>
+#include <corecrypto/cchmac.h>
+#include <corecrypto/ccmode.h>
+#include <corecrypto/ccrc4.h>
+
+/* Function types */
+
+/* digests */
+typedef void (*ccdigest_init_fn_t)(const struct ccdigest_info *di, ccdigest_ctx_t ctx);
+typedef void (*ccdigest_update_fn_t)(const struct ccdigest_info *di, ccdigest_ctx_t ctx,
+				     unsigned long len, const void *data);
+typedef void (*ccdigest_final_fn_t)(const struct ccdigest_info *di, ccdigest_ctx_t ctx,
+				    void *digest);
+typedef void (*ccdigest_fn_t)(const struct ccdigest_info *di, unsigned long len,
+			      const void *data, void *digest);
+
+/* hmac */
+typedef void (*cchmac_init_fn_t)(const struct ccdigest_info *di, cchmac_ctx_t ctx,
+				 unsigned long key_len, const void *key);
+typedef void (*cchmac_update_fn_t)(const struct ccdigest_info *di, cchmac_ctx_t ctx,
+				   unsigned long data_len, const void *data);
+typedef void (*cchmac_final_fn_t)(const struct ccdigest_info *di, cchmac_ctx_t ctx,
+				  unsigned char *mac);
+
+typedef void (*cchmac_fn_t)(const struct ccdigest_info *di, unsigned long key_len,
+			    const void *key, unsigned long data_len, const void *data,
+			    unsigned char *mac);
+
+/* pbkdf2 */
+typedef void (*ccpbkdf2_hmac_fn_t)(const struct ccdigest_info *di,
+				   unsigned long passwordLen, const void *password,
+				   unsigned long saltLen, const void *salt,
+				   unsigned long iterations,
+				   unsigned long dkLen, void *dk);
+
+/* des weak key testing */
+typedef int (*ccdes_key_is_weak_fn_t)(void *key, unsigned long length);
+typedef void (*ccdes_key_set_odd_parity_fn_t)(void *key, unsigned long length);
+
+
+typedef void (*ccpad_xts_decrypt_fn_t)(const struct ccmode_xts *xts, ccxts_ctx *ctx,
+				       unsigned long nbytes, const void *in, void *out);
+
+typedef void (*ccpad_xts_encrypt_fn_t)(const struct ccmode_xts *xts, ccxts_ctx *ctx,
+				       unsigned long nbytes, const void *in, void *out);
+
+
+typedef struct crypto_functions {
+	/* digests common functions */
+	ccdigest_init_fn_t ccdigest_init_fn;
+	ccdigest_update_fn_t ccdigest_update_fn;
+	ccdigest_final_fn_t ccdigest_final_fn;
+	ccdigest_fn_t ccdigest_fn;
+	/* digest implementations */
+	const struct ccdigest_info * ccmd5_di;
+	const struct ccdigest_info * ccsha1_di;
+	const struct ccdigest_info * ccsha256_di;
+	const struct ccdigest_info * ccsha384_di;
+	const struct ccdigest_info * ccsha512_di;
+
+	/* hmac common function */
+	cchmac_init_fn_t cchmac_init_fn;
+	cchmac_update_fn_t cchmac_update_fn;
+	cchmac_final_fn_t cchmac_final_fn;
+	cchmac_fn_t cchmac_fn;
+
+	/* ciphers modes implementations */
+	/* AES, ecb, cbc and xts */
+	const struct ccmode_ecb *ccaes_ecb_encrypt;
+	const struct ccmode_ecb *ccaes_ecb_decrypt;
+	const struct ccmode_cbc *ccaes_cbc_encrypt;
+	const struct ccmode_cbc *ccaes_cbc_decrypt;
+	const struct ccmode_xts *ccaes_xts_encrypt;
+	const struct ccmode_xts *ccaes_xts_decrypt;
+	/* DES, ecb and cbc */
+	const struct ccmode_ecb *ccdes_ecb_encrypt;
+	const struct ccmode_ecb *ccdes_ecb_decrypt;
+	const struct ccmode_cbc *ccdes_cbc_encrypt;
+	const struct ccmode_cbc *ccdes_cbc_decrypt;
+	/* Triple DES, ecb and cbc */
+	const struct ccmode_ecb *cctdes_ecb_encrypt;
+	const struct ccmode_ecb *cctdes_ecb_decrypt;
+	const struct ccmode_cbc *cctdes_cbc_encrypt;
+	const struct ccmode_cbc *cctdes_cbc_decrypt;
+	/* RC4 */
+	const struct ccrc4_info *ccrc4_info;
+	/* Blowfish - ECB only */
+	const struct ccmode_ecb *ccblowfish_ecb_encrypt;
+	const struct ccmode_ecb *ccblowfish_ecb_decrypt;
+	/* CAST - ECB only */
+	const struct ccmode_ecb *cccast_ecb_encrypt;
+	const struct ccmode_ecb *cccast_ecb_decrypt;
+	/* DES key helper functions */
+	ccdes_key_is_weak_fn_t ccdes_key_is_weak_fn;
+	ccdes_key_set_odd_parity_fn_t ccdes_key_set_odd_parity_fn;
+	/* XTS padding functions */
+	ccpad_xts_encrypt_fn_t ccpad_xts_encrypt_fn;
+	ccpad_xts_decrypt_fn_t ccpad_xts_decrypt_fn;
+} *crypto_functions_t;
+
+int register_crypto_functions(const crypto_functions_t funcs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_CRYPTO_REGISTER_CRYPTO_H_*/
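The registration call above is how the corecrypto kernel external component hands its function table to the kernel at load time. A skeletal registrant might look like the following sketch; the table is only partially filled for brevity, kec_register is a hypothetical entry point, and ccsha1_di() is assumed to be corecrypto's SHA-1 descriptor accessor:

    #include <libkern/crypto/register_crypto.h>
    #include <corecrypto/ccsha1.h>

    static struct crypto_functions kec_functions;

    /* Fill in the table and hand it to the kernel. Members left NULL
     * are simply unavailable to kernel callers. */
    static int kec_register(void)
    {
        kec_functions.ccsha1_di = ccsha1_di();
        /* ... remaining digest, HMAC, and cipher-mode entries ... */
        return register_crypto_functions(&kec_functions);
    }

Note that crypto_functions_t is itself a pointer type, so &kec_functions satisfies the register_crypto_functions() signature directly.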
diff --git a/libkern/libkern/crypto/sha2.h b/libkern/libkern/crypto/sha2.h
new file mode 100644
index 000000000..7908f7ef8
--- /dev/null
+++ b/libkern/libkern/crypto/sha2.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _CRYPTO_SHA2_H__
+#define _CRYPTO_SHA2_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <corecrypto/ccsha2.h>
+
+/*** SHA-256/384/512 Various Length Definitions ***********************/
+#define SHA256_BLOCK_LENGTH		CCSHA256_BLOCK_SIZE
+#define SHA256_DIGEST_LENGTH		CCSHA256_OUTPUT_SIZE
+#define SHA256_DIGEST_STRING_LENGTH	(SHA256_DIGEST_LENGTH * 2 + 1)
+#define SHA384_BLOCK_LENGTH		CCSHA512_BLOCK_SIZE
+#define SHA384_DIGEST_LENGTH		CCSHA384_OUTPUT_SIZE
+#define SHA384_DIGEST_STRING_LENGTH	(SHA384_DIGEST_LENGTH * 2 + 1)
+#define SHA512_BLOCK_LENGTH		CCSHA512_BLOCK_SIZE
+#define SHA512_DIGEST_LENGTH		CCSHA512_OUTPUT_SIZE
+#define SHA512_DIGEST_STRING_LENGTH	(SHA512_DIGEST_LENGTH * 2 + 1)
+
+typedef struct {
+	ccdigest_ctx_decl(CCSHA256_STATE_SIZE, CCSHA256_BLOCK_SIZE, ctx);
+} SHA256_CTX;
+
+typedef struct SHA512_CTX {
+	ccdigest_ctx_decl(CCSHA512_STATE_SIZE, CCSHA512_BLOCK_SIZE, ctx);
+} SHA512_CTX;
+
+typedef SHA512_CTX SHA384_CTX;
+
+/*** SHA-256/384/512 Function Prototypes ******************************/
+
+void SHA256_Init(SHA256_CTX *ctx);
+void SHA256_Update(SHA256_CTX *ctx, const void *data, size_t len);
+void SHA256_Final(void *digest, SHA256_CTX *ctx);
+
+void SHA384_Init(SHA384_CTX *ctx);
+void SHA384_Update(SHA384_CTX *ctx, const void *data, size_t len);
+void SHA384_Final(void *digest, SHA384_CTX *ctx);
+
+void SHA512_Init(SHA512_CTX *ctx);
+void SHA512_Update(SHA512_CTX *ctx, const void *data, size_t len);
+void SHA512_Final(void *digest, SHA512_CTX *ctx);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _CRYPTO_SHA2_H__ */
diff --git a/libkern/libkern/kernel_mach_header.h b/libkern/libkern/kernel_mach_header.h
index 6588b9b09..59218993e 100644
--- a/libkern/libkern/kernel_mach_header.h
+++ b/libkern/libkern/kernel_mach_header.h
@@ -45,6 +45,8 @@ extern "C" {
 
 #include <mach/mach_types.h>
 #include <mach-o/loader.h>
+#include <mach-o/nlist.h>
+#include <mach/vm_types.h>
 
 #if !KERNEL
 #error this header for kernel use only
@@ -55,15 +57,19 @@ extern "C" {
 typedef struct mach_header_64	kernel_mach_header_t;
 typedef struct segment_command_64 kernel_segment_command_t;
 typedef struct section_64	kernel_section_t;
+typedef struct nlist_64		kernel_nlist_t;
 
-#define LC_SEGMENT_KERNEL	LC_SEGMENT_64
+#define MH_MAGIC_KERNEL		MH_MAGIC_64
+#define LC_SEGMENT_KERNEL	LC_SEGMENT_64
 
 #else
 
 typedef struct mach_header	kernel_mach_header_t;
 typedef struct segment_command	kernel_segment_command_t;
 typedef struct section		kernel_section_t;
+typedef struct nlist		kernel_nlist_t;
 
+#define MH_MAGIC_KERNEL		MH_MAGIC
 #define LC_SEGMENT_KERNEL	LC_SEGMENT
 #define SECT_CONSTRUCTOR	"__constructor"
 #define SECT_DESTRUCTOR		"__destructor"
@@ -95,16 +101,9 @@ kernel_section_t *getsectbynamefromheader(
 void *getsectdatafromheader(kernel_mach_header_t *, const char *, const char *, unsigned long *);
 kernel_section_t *firstsect(kernel_segment_command_t *sgp);
 kernel_section_t *nextsect(kernel_segment_command_t *sgp, kernel_section_t *sp);
+void *getcommandfromheader(kernel_mach_header_t *, uint32_t);
 void *getuuidfromheader(kernel_mach_header_t *, unsigned long *);
 
-#if MACH_KDB
-boolean_t getsymtab(kernel_mach_header_t *header,
-		    vm_offset_t *symtab,
-		    int *nsyms,
-		    vm_offset_t *strtab,
-		    vm_size_t *strtabsize);
-#endif
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/libkern/libkern/kext_request_keys.h b/libkern/libkern/kext_request_keys.h
index 6b908f133..66b50e6c2 100644
--- a/libkern/libkern/kext_request_keys.h
+++ b/libkern/libkern/kext_request_keys.h
@@ -88,16 +88,6 @@ extern "C" {
  */
 #define kKextRequestPredicateGetLoaded "Get Loaded 
Kext Info" -/* Predicate: Get Kernel Image - * Argument: None - * Response: Raw bytes + length containing the sanitized image of the kernel. - * Op result: OSReturn indicating any errors in processing (see OSKextLib.h) - * - * Retrieves a sanitized image of the running kernel for use in generating - * debug symbols in user space. - */ -#define kKextRequestPredicateGetKernelImage "Get Kernel Image" - /* Predicate: Get Kernel Load Address * Argument: None * Response: OSNumber containing kernel load address. diff --git a/libkern/libkern/kxld_types.h b/libkern/libkern/kxld_types.h index 1578b5859..0aad7abbe 100644 --- a/libkern/libkern/kxld_types.h +++ b/libkern/libkern/kxld_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2008 Apple Inc. All rights reserved. + * Copyright (c) 2007-2008, 2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -47,11 +47,6 @@ #define KXLD_USER_OR_LP64 1 #endif -/* For ppc-specific linking code */ -#if (!KERNEL || __ppc__) - #define KXLD_USER_OR_PPC 1 -#endif - /* For i386-specific linking code */ #if (!KERNEL || __i386__) #define KXLD_USER_OR_I386 1 @@ -68,30 +63,33 @@ #endif /* For linking code specific to architectures that support common symbols */ -#if (!KERNEL || __i386__ || __ppc__) +#if (!KERNEL || __i386__) #define KXLD_USER_OR_COMMON 1 #endif /* For linking code specific to architectures that support strict patching */ -#if (!KERNEL || !(__i386__ || __ppc__)) +#if (!KERNEL || !__i386__) #define KXLD_USER_OR_STRICT_PATCHING 1 #endif /* For linking code specific to architectures that use MH_OBJECT */ -#if (!KERNEL || __i386__ || __ppc__ || __arm__) +#if (!KERNEL || __i386__) #define KXLD_USER_OR_OBJECT 1 #endif /* For linking code specific to architectures that use MH_KEXT_BUNDLE */ -#if (!KERNEL || __i386__ || __x86_64__ || __arm__) - #define KXLD_USER_OR_BUNDLE 1 -#endif +#define KXLD_USER_OR_BUNDLE 1 /* We no longer need to generate our own GOT for any architectures, but the code * required to do this will be saved inside this macro. */ #define KXLD_USER_OR_GOT 0 +/* for building the dysymtab command generation into the dylib */ +#if (!KERNEL) + #define KXLD_PIC_KEXTS 1 +#endif + /******************************************************************************* * Types *******************************************************************************/ @@ -117,6 +115,7 @@ typedef uint64_t kxld_size_t; /* Flags for general linker behavior */ enum kxld_flags { kKxldFlagDefault = 0x0, + kKXLDFlagIncludeRelocs = 0x01 }; typedef enum kxld_flags KXLDFlags; diff --git a/libkern/libkern/machine/Makefile b/libkern/libkern/machine/Makefile index e4d4ce152..c5f944fa0 100644 --- a/libkern/libkern/machine/Makefile +++ b/libkern/libkern/machine/Makefile @@ -12,7 +12,6 @@ INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = -INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} @@ -20,7 +19,6 @@ EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} -EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} DATAFILES = \ OSByteOrder.h diff --git a/libsyscall/custom/__psynch_cvwait.s b/libkern/libkern/stack_protector.h similarity index 80% rename from libsyscall/custom/__psynch_cvwait.s rename to libkern/libkern/stack_protector.h index c5d69ce8c..d2d3c82b2 100644 --- a/libsyscall/custom/__psynch_cvwait.s +++ b/libkern/libkern/stack_protector.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2007 Apple Inc. All rights reserved. + * Copyright (c) 2012 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,16 +25,15 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* Copyright (c) 1992 NeXT Computer, Inc. All rights reserved. */ -#include "SYS.h" +#ifndef _STACK_PROTECTOR_H +#define _STACK_PROTECTOR_H -#define __SYSCALL_32BIT_ARG_BYTES 40 +/* Set up by machine-dependent code early in boot */ +extern unsigned long __stack_chk_guard; -#if defined(__i386__) || defined(__x86_64__) +/* Called as a consequence on stack corruption */ +extern void __stack_chk_fail(void); -__SYSCALL(___psynch_cvwait, psynch_cvwait, 8) +#endif // _STACK_PROTECTOR_H_ -#else -#error Unsupported architecture -#endif diff --git a/libkern/libkern/tree.h b/libkern/libkern/tree.h index 3a26162bd..7865f359e 100644 --- a/libkern/libkern/tree.h +++ b/libkern/libkern/tree.h @@ -334,9 +334,9 @@ struct name { \ #define RB_PLACEHOLDER NULL #define RB_ENTRY(type) \ struct { \ + struct type *rbe_parent; /* parent element */ \ struct type *rbe_left; /* left element */ \ struct type *rbe_right; /* right element */ \ - struct type *rbe_parent; /* parent element */ \ } #define RB_COLOR_MASK (uintptr_t)0x1 diff --git a/libkern/stack_protector.c b/libkern/stack_protector.c index dad8a7e2e..21c4d06ae 100644 --- a/libkern/stack_protector.c +++ b/libkern/stack_protector.c @@ -1,55 +1,35 @@ /* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2008-2012 Apple Inc. All rights reserved. * - * %Begin-Header% - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, and the entire permission notice in its entirety, - * including the disclaimer of warranties. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote - * products derived from this software without specific prior - * written permission. + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF - * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT - * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE - * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH - * DAMAGE. - * %End-Header% + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/******************************************************************************* -* NOTE: This implementation of the stack check routines required by the GCC -* -fstack-protector flag is only safe for kernel extensions. -*******************************************************************************/ - -#include -#include +#include #include -long __stack_chk_guard[8]; -void __stack_chk_fail(void); - -static void __guard_setup(void) __attribute__((constructor)); - -static void -__guard_setup(void) -{ - /* Cannot report failure. */ - read_random(__stack_chk_guard, sizeof(__stack_chk_guard)); -} +unsigned long __stack_chk_guard = 0UL; void __stack_chk_fail(void) diff --git a/libkern/uuid/Makefile b/libkern/uuid/Makefile index 13a3f1969..f20633ddb 100644 --- a/libkern/uuid/Makefile +++ b/libkern/uuid/Makefile @@ -13,7 +13,6 @@ INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ -INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = \ @@ -21,7 +20,7 @@ EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ -EXPINC_SUBDIRS_ARM = \ + # uuid.h is now installed by bsd/uuid/Makefile DATAFILES = \ diff --git a/libkern/uuid/uuid.c b/libkern/uuid/uuid.c index ffc5c8059..217b6b667 100644 --- a/libkern/uuid/uuid.c +++ b/libkern/uuid/uuid.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2010 Apple Inc. All rights reserved. + * Copyright (c) 2004-2011 Apple Inc. All rights reserved. 
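The rewritten stack_protector.c above replaces the old constructor-based guard array with a single guard word that machine-dependent code seeds early in boot. For context, this is roughly what compiler-emitted -fstack-protector instrumentation does with that guard; the check below is illustrative C, while the real store and compare are emitted by the compiler in each protected function's prologue and epilogue:

    extern unsigned long __stack_chk_guard;
    extern void __stack_chk_fail(void);

    /* A copy of the guard is placed on the stack between the locals and
     * the return address on entry, then re-checked on exit; a mismatch
     * means the frame was smashed and __stack_chk_fail() panics. */
    void example_protected_function(const char *src)
    {
        volatile unsigned long canary = __stack_chk_guard;  /* prologue */
        char buf[32];
        unsigned int i;

        for (i = 0; src[i] != '\0' && i < sizeof(buf) - 1; i++) {
            buf[i] = src[i];            /* body writes into the buffer */
        }
        buf[i] = '\0';

        if (canary != __stack_chk_guard) {                  /* epilogue */
            __stack_chk_fail();
        }
    }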
* * %Begin-Header% * Redistribution and use in source and binary forms, with or without @@ -40,9 +40,7 @@ #include #include -#include -#include -#include +extern int uuid_get_ethernet(u_int8_t *); UUID_DEFINE(UUID_NULL, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); @@ -50,25 +48,8 @@ static void read_node(uint8_t *node) { #if NETWORKING - struct ifnet *ifp; - struct sockaddr_dl *sdl; - - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - ifnet_lock_shared(ifp); - IFA_LOCK_SPIN(ifp->if_lladdr); - sdl = (struct sockaddr_dl *)ifp->if_lladdr->ifa_addr; - if (sdl->sdl_type == IFT_ETHER) { - memcpy(node, LLADDR(sdl), 6); - IFA_UNLOCK(ifp->if_lladdr); - ifnet_lock_done(ifp); - ifnet_head_done(); - return; - } - IFA_UNLOCK(ifp->if_lladdr); - ifnet_lock_done(ifp); - } - ifnet_head_done(); + if (uuid_get_ethernet(node) == 0) + return; #endif /* NETWORKING */ read_random(node, 6); diff --git a/libkern/x86_64/OSAtomic.s b/libkern/x86_64/OSAtomic.s index 30713ef3d..f3a7e617c 100644 --- a/libkern/x86_64/OSAtomic.s +++ b/libkern/x86_64/OSAtomic.s @@ -32,6 +32,12 @@ .globl _OSCompareAndSwap _OSCompareAndSwap: #;oldValue, newValue, ptr +#if DEBUG + test $3, %rdx + jz 1f + ud2 +1: +#endif movl %edi, %eax lock cmpxchgl %esi, (%rdx) #; CAS (eax is an implicit operand) @@ -48,6 +54,12 @@ _OSCompareAndSwap: #;oldValue, newValue, ptr _OSCompareAndSwap64: _OSCompareAndSwapPtr: #;oldValue, newValue, ptr +#if DEBUG + test $7, %rdx + jz 1f + ud2 +1: +#endif movq %rdi, %rax lock cmpxchgq %rsi, (%rdx) #; CAS (rax is an implicit operand) @@ -63,6 +75,12 @@ _OSCompareAndSwapPtr: #;oldValue, newValue, ptr .globl _OSAddAtomic64 _OSAddAtomic64: _OSAddAtomicLong: +#if DEBUG + test $7, %rsi + jz 1f + ud2 +1: +#endif lock xaddq %rdi, (%rsi) #; Atomic exchange and add movq %rdi, %rax; @@ -75,6 +93,12 @@ _OSAddAtomicLong: .globl _OSAddAtomic _OSAddAtomic: +#if DEBUG + test $3, %rsi + jz 1f + ud2 +1: +#endif lock xaddl %edi, (%rsi) #; Atomic exchange and add movl %edi, %eax; diff --git a/libkern/zlib/zutil.h b/libkern/zlib/zutil.h index 14c2f30d7..c12296b5c 100644 --- a/libkern/zlib/zutil.h +++ b/libkern/zlib/zutil.h @@ -280,7 +280,7 @@ extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */ #endif /* Diagnostic functions */ -#ifdef DEBUG +#if defined(DEBUG) && !defined(KERNEL) # include extern int z_verbose; extern void z_error OF((char *m)); @@ -298,7 +298,7 @@ extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */ # define Tracec(c,x) # define Tracecv(c,x) #endif - +#undef DEBUG #ifndef NO_ZCFUNCS voidpf zcalloc OF((voidpf opaque, unsigned items, unsigned size)); diff --git a/libsa/bootstrap.cpp b/libsa/bootstrap.cpp index 9ad023c1a..f24547c95 100644 --- a/libsa/bootstrap.cpp +++ b/libsa/bootstrap.cpp @@ -40,6 +40,10 @@ extern "C" { #include #include +#if __x86_64__ +#define KASLR_KEXT_DEBUG 0 +#endif + #if PRAGMA_MARK #pragma mark Bootstrap Declarations #endif @@ -100,20 +104,6 @@ static const char * sKernelComponentNames[] = { "com.apple.driver.AppleNMI", "com.apple.iokit.IOSystemManagementFamily", "com.apple.iokit.ApplePlatformFamily", - -#if defined(__i386__) || defined(__arm__) - /* These ones are not supported on x86_64 or any newer platforms. - * They must be version 7.9.9; check by "com.apple.kernel.", with - * the trailing period; "com.apple.kernel" always represents the - * current kernel version. 
- */ - "com.apple.kernel.6.0", - "com.apple.kernel.bsd", - "com.apple.kernel.iokit", - "com.apple.kernel.libkern", - "com.apple.kernel.mach", -#endif - NULL }; @@ -142,6 +132,7 @@ private: OSData * deviceTreeData); OSReturn loadKernelComponentKexts(void); + void loadKernelExternalComponents(void); void readBuiltinPersonalities(void); void loadSecurityExtensions(void); @@ -207,6 +198,7 @@ KLDBootstrap::readStartupExtensions(void) } loadKernelComponentKexts(); + loadKernelExternalComponents(); readBuiltinPersonalities(); OSKext::sendAllKextPersonalitiesToCatalog(); @@ -234,7 +226,7 @@ KLDBootstrap::readPrelinkedExtensions( void * prelinkData = NULL; // see code vm_size_t prelinkLength = 0; -#if !__LP64__ && !defined(__arm__) +#if __i386__ vm_map_offset_t prelinkDataMapOffset = 0; void * prelinkCopy = NULL; // see code kern_return_t mem_result = KERN_SUCCESS; @@ -246,6 +238,9 @@ KLDBootstrap::readPrelinkedExtensions( OSNumber * prelinkCountObj = NULL; // must release u_int i = 0; +#if NO_KEXTD + bool developerDevice; +#endif OSKextLog(/* kext */ NULL, kOSKextLogProgressLevel | @@ -260,18 +255,62 @@ KLDBootstrap::readPrelinkedExtensions( "Can't find prelinked kexts' text segment."); goto finish; } + +#if KASLR_KEXT_DEBUG + unsigned long scratchSize; + vm_offset_t scratchAddr; + + IOLog("kaslr: prelinked kernel address info: \n"); + + scratchAddr = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__TEXT", &scratchSize); + IOLog("kaslr: start 0x%lx end 0x%lx length %lu for __TEXT \n", + (unsigned long)scratchAddr, + (unsigned long)(scratchAddr + scratchSize), + scratchSize); + + scratchAddr = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__DATA", &scratchSize); + IOLog("kaslr: start 0x%lx end 0x%lx length %lu for __DATA \n", + (unsigned long)scratchAddr, + (unsigned long)(scratchAddr + scratchSize), + scratchSize); + + scratchAddr = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LINKEDIT", &scratchSize); + IOLog("kaslr: start 0x%lx end 0x%lx length %lu for __LINKEDIT \n", + (unsigned long)scratchAddr, + (unsigned long)(scratchAddr + scratchSize), + scratchSize); + + scratchAddr = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__KLD", &scratchSize); + IOLog("kaslr: start 0x%lx end 0x%lx length %lu for __KLD \n", + (unsigned long)scratchAddr, + (unsigned long)(scratchAddr + scratchSize), + scratchSize); + + scratchAddr = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_TEXT", &scratchSize); + IOLog("kaslr: start 0x%lx end 0x%lx length %lu for __PRELINK_TEXT \n", + (unsigned long)scratchAddr, + (unsigned long)(scratchAddr + scratchSize), + scratchSize); + + scratchAddr = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_INFO", &scratchSize); + IOLog("kaslr: start 0x%lx end 0x%lx length %lu for __PRELINK_INFO \n", + (unsigned long)scratchAddr, + (unsigned long)(scratchAddr + scratchSize), + scratchSize); +#endif prelinkData = (void *) prelinkTextSegment->vmaddr; prelinkLength = prelinkTextSegment->vmsize; -#if !__LP64__ && !__arm__ - /* XXX: arm's pmap implementation doesn't seem to let us do this */ - +#if __i386__ /* To enable paging and write/execute protections on the kext * executables, we need to copy them out of the booter-created * memory, reallocate that space with VM, then prelinkCopy them back in. - * This isn't necessary on LP64 because kexts have their own VM - * region on that architecture model. + * + * This isn't necessary on x86_64 because kexts have their own VM + * region for that architecture. 
+ * + * XXX: arm's pmap implementation doesn't seem to let us do this. */ mem_result = kmem_alloc(kernel_map, (vm_offset_t *)&prelinkCopy, @@ -323,7 +362,7 @@ KLDBootstrap::readPrelinkedExtensions( memcpy(prelinkData, prelinkCopy, prelinkLength); kmem_free(kernel_map, (vm_offset_t)prelinkCopy, prelinkLength); -#endif /* !__LP64__ && !__arm__*/ +#endif /* __i386__ */ /* Unserialize the info dictionary from the prelink info section. */ @@ -345,6 +384,22 @@ KLDBootstrap::readPrelinkedExtensions( goto finish; } +#if NO_KEXTD + /* Check if we should keep developer kexts around. Default: + * Release: No + * Development: Yes + * Debug : Yes + * TODO: Check DeviceTree instead of a boot-arg + */ +#if DEVELOPMENT + developerDevice = true; +#else + developerDevice = false; +#endif + + PE_parse_boot_argn("developer", &developerDevice, sizeof(developerDevice)); +#endif /* NO_KEXTD */ + infoDictArray = OSDynamicCast(OSArray, prelinkInfoDict->getObject(kPrelinkInfoDictionaryKey)); if (!infoDictArray) { @@ -365,6 +420,34 @@ KLDBootstrap::readPrelinkedExtensions( continue; } +#if NO_KEXTD + /* If we're not on a developer device, skip and free developer kexts. + */ + if (developerDevice == false) { + OSBoolean *devOnlyBool = OSDynamicCast(OSBoolean, + infoDict->getObject(kOSBundleDeveloperOnlyKey)); + if (devOnlyBool == kOSBooleanTrue) { + OSString *bundleID = OSDynamicCast(OSString, + infoDict->getObject(kCFBundleIdentifierKey)); + if (bundleID) { + OSKextLog(NULL, kOSKextLogWarningLevel | kOSKextLogGeneralFlag, + "Kext %s not loading on non-dev device.", bundleID->getCStringNoCopy()); + } + + OSNumber *addressNum = OSDynamicCast(OSNumber, + infoDict->getObject(kPrelinkExecutableLoadKey)); + OSNumber *lengthNum = OSDynamicCast(OSNumber, + infoDict->getObject(kPrelinkExecutableSizeKey)); + if (addressNum && lengthNum) { +#error Pick the right way to free prelinked data on this arch + } + + infoDictArray->removeObject(i--); + continue; + } + } +#endif /* NO_KEXTD */ + /* Create the kext for the entry, then release it, because the * kext system keeps them around until explicitly removed. * Any creation/registration failures are already logged for us. @@ -394,12 +477,13 @@ KLDBootstrap::readPrelinkedExtensions( "%u prelinked kexts", infoDictArray->getCount()); -#if __LP64__ - /* On LP64 systems, kexts are copied to their own special VM region - * during OSKext init time, so we can free the whole segment now. +#if CONFIG_KEXT_BASEMENT + /* On CONFIG_KEXT_BASEMENT systems, kexts are copied to their own + * special VM region during OSKext init time, so we can free the whole + * segment now. */ ml_static_mfree((vm_offset_t) prelinkData, prelinkLength); -#endif /* __LP64__ */ +#endif /* __x86_64__ */ /* Free the prelink info segment, we're done with it. */ @@ -665,7 +749,7 @@ KLDBootstrap::loadSecurityExtensions(void) } isSecurityKext = OSDynamicCast(OSBoolean, - theKext->getPropertyForHostArch("AppleSecurityExtension")); + theKext->getPropertyForHostArch(kAppleSecurityExtensionKey)); if (isSecurityKext && isSecurityKext->isTrue()) { OSKextLog(/* kext */ NULL, kOSKextLogStepLevel | @@ -724,6 +808,80 @@ KLDBootstrap::loadKernelComponentKexts(void) return result; } +/********************************************************************* +* Ensure that Kernel External Components are loaded early in boot, +* before other kext personalities get sent to the IOCatalogue. 
These +* kexts are treated specially because they may provide the implementation +* for kernel-vended KPI, so they must register themselves before +* general purpose IOKit probing begins. +*********************************************************************/ + +#define COM_APPLE_KEC "com.apple.kec." + +void +KLDBootstrap::loadKernelExternalComponents(void) +{ + OSDictionary * extensionsDict = NULL; // must release + OSCollectionIterator * keyIterator = NULL; // must release + OSString * bundleID = NULL; // don't release + OSKext * theKext = NULL; // don't release + OSBoolean * isKernelExternalComponent = NULL; // don't release + + OSKextLog(/* kext */ NULL, + kOSKextLogStepLevel | + kOSKextLogLoadFlag, + "Loading Kernel External Components."); + + extensionsDict = OSKext::copyKexts(); + if (!extensionsDict) { + return; + } + + keyIterator = OSCollectionIterator::withCollection(extensionsDict); + if (!keyIterator) { + OSKextLog(/* kext */ NULL, + kOSKextLogErrorLevel | + kOSKextLogGeneralFlag, + "Failed to allocate iterator for Kernel External Components."); + goto finish; + } + + while ((bundleID = OSDynamicCast(OSString, keyIterator->getNextObject()))) { + + const char * bundle_id = bundleID->getCStringNoCopy(); + + /* Skip extensions whose bundle IDs don't start with "com.apple.kec.". + */ + if (!bundle_id || + (strncmp(bundle_id, COM_APPLE_KEC, CONST_STRLEN(COM_APPLE_KEC)) != 0)) { + + continue; + } + + theKext = OSDynamicCast(OSKext, extensionsDict->getObject(bundleID)); + if (!theKext) { + continue; + } + + isKernelExternalComponent = OSDynamicCast(OSBoolean, + theKext->getPropertyForHostArch(kAppleKernelExternalComponentKey)); + if (isKernelExternalComponent && isKernelExternalComponent->isTrue()) { + OSKextLog(/* kext */ NULL, + kOSKextLogStepLevel | + kOSKextLogLoadFlag, + "Loading kernel external component %s.", bundleID->getCStringNoCopy()); + OSKext::loadKextWithIdentifier(bundleID->getCStringNoCopy(), + /* allowDefer */ false); + } + } + +finish: + OSSafeRelease(keyIterator); + OSSafeRelease(extensionsDict); + + return; +} + /********************************************************************* *********************************************************************/ void diff --git a/libsa/conf/MASTER b/libsa/conf/MASTER index 99865aa3e..a47e74813 100644 --- a/libsa/conf/MASTER +++ b/libsa/conf/MASTER @@ -51,16 +51,32 @@ # ident LIBSA -options KDEBUG # kernel tracing # +options KDEBUG # kernel tracing # +options IST_KDEBUG # limited tracing # +options NO_KDEBUG # no kernel tracing # + options GPROF # kernel profiling # options CONFIG_NOLIBKLD # kernel linker # options MALLOC_RESET_GC # options CONFIG_DTRACE # +options VM_PRESSURE_EVENTS # options CONFIG_NO_PANIC_STRINGS # options CONFIG_NO_PRINTF_STRINGS # options CONFIG_NO_KPRINTF_STRINGS # options CONFIG_KXLD # kxld/runtime linking of kexts # + +options DEVELOPMENT # dev kernel # + +# configurable kernel - general switch to say we are building for an +# embedded device +# +options CONFIG_EMBEDDED # + +# CONFIG_KEXT_BASEMENT - alloc post boot loaded kexts after prelinked kexts +# +options CONFIG_KEXT_BASEMENT # # + diff --git a/libsa/conf/MASTER.i386 b/libsa/conf/MASTER.i386 index 448133126..8e7b8008c 100644 --- a/libsa/conf/MASTER.i386 +++ b/libsa/conf/MASTER.i386 @@ -1,6 +1,6 @@ ###################################################################### # -# RELEASE = [ intel mach libkerncpp config_dtrace config_kxld ] +# RELEASE = [ intel mach libkerncpp config_dtrace config_kxld vm_pressure_events ] # PROFILE = [ RELEASE profile 
] # DEBUG = [ RELEASE debug ] # diff --git a/libsa/conf/MASTER.x86_64 b/libsa/conf/MASTER.x86_64 index 89c745125..c4edf238f 100644 --- a/libsa/conf/MASTER.x86_64 +++ b/libsa/conf/MASTER.x86_64 @@ -1,6 +1,6 @@ ###################################################################### # -# RELEASE = [ intel mach libkerncpp config_dtrace config_kxld ] +# RELEASE = [ intel mach libkerncpp config_dtrace config_kxld vm_pressure_events config_kext_basement ] # PROFILE = [ RELEASE profile ] # DEBUG = [ RELEASE debug ] # diff --git a/libsa/conf/Makefile b/libsa/conf/Makefile index b463b2528..45981d362 100644 --- a/libsa/conf/Makefile +++ b/libsa/conf/Makefile @@ -42,9 +42,11 @@ $(COMPOBJROOT)/$(LIBSA_KERNEL_CONFIG)/Makefile : $(SOURCE)/MASTER \ do_all: $(COMPOBJROOT)/$(LIBSA_KERNEL_CONFIG)/Makefile $(_v)next_source=$(subst conf/,,$(SOURCE)); \ + next_relsource=$(subst conf/,,$(RELATIVE_SOURCE_PATH)); \ ${MAKE} -C $(COMPOBJROOT)/$(LIBSA_KERNEL_CONFIG) \ MAKEFILES=$(TARGET)/$(LIBSA_KERNEL_CONFIG)/Makefile \ SOURCE=$${next_source} \ + RELATIVE_SOURCE_PATH=$${next_relsource} \ TARGET=$(TARGET) \ INCL_MAKEDEP=FALSE \ KERNEL_CONFIG=$(LIBSA_KERNEL_CONFIG) \ diff --git a/libsa/libsa/Makefile b/libsa/libsa/Makefile index 4554d46ba..196b8f326 100644 --- a/libsa/libsa/Makefile +++ b/libsa/libsa/Makefile @@ -10,12 +10,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = INSTINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS} INSTINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS} -INSTINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS} EXPINC_SUBDIRS = EXPINC_SUBDIRS_I386 = ${EXPINC_SUBDIRS} EXPINC_SUBDIRS_X86_64 = ${EXPINC_SUBDIRS} -EXPINC_SUBDIRS_ARM = ${EXPINC_SUBDIRS} INSTALL_MI_LIST = diff --git a/libsyscall/Libsyscall.xcconfig b/libsyscall/Libsyscall.xcconfig index 8881d5028..3f9e34bdb 100644 --- a/libsyscall/Libsyscall.xcconfig +++ b/libsyscall/Libsyscall.xcconfig @@ -10,15 +10,20 @@ PUBLIC_HEADERS_FOLDER_PATH = /usr/include/mach PUBLIC_HEADERS_FOLDER_PATH[sdk=iphoneos*] = /usr/include/mach PUBLIC_HEADERS_FOLDER_PATH[sdk=iphonesimulator*] = $(SDKROOT)/usr/include/mach PUBLIC_HEADERS_FOLDER_PATH[sdk=macosx*] = /usr/include/mach +PRIVATE_HEADERS_FOLDER_PATH = /usr/local/include +PRIVATE_HEADERS_FOLDER_PATH[sdk=iphoneos*] = /usr/local/include +PRIVATE_HEADERS_FOLDER_PATH[sdk=iphonesimulator*] = $(SDKROOT)/usr/local/include +PRIVATE_HEADERS_FOLDER_PATH[sdk=macosx*] = /usr/local/include EXECUTABLE_PREFIX = libsystem_ PRODUCT_NAME = kernel ALWAYS_SEARCH_USER_PATHS = NO +ORDER_FILE[sdk=iphoneos*] = $(SDKROOT)/$(APPLE_INTERNAL_DIR)/OrderFiles/libsystem_kernel.order OTHER_CFLAGS = -fdollars-in-identifiers -no-cpp-precomp -fno-common -fno-stack-protector -pipe -DLIBSYSCALL_INTERFACE -D__DARWIN_VERS_1050=1 OTHER_CFLAGS[sdk=macosx*] = $(inherited) -DSYSCALL_PRE1050 OTHER_CFLAGS[sdk=macosx*][arch=x86_64] = $(inherited) -DNO_SYSCALL_LEGACY OTHER_CFLAGS[sdk=iphoneos*] = $(inherited) -DNO_SYSCALL_LEGACY GCC_PREPROCESSOR_DEFINITIONS = CF_OPEN_SOURCE CF_EXCLUDE_CSTD_HEADERS DEBUG _FORTIFY_SOURCE=0 -HEADER_SEARCH_PATHS = /System/Library/Frameworks/System.framework/PrivateHeaders $(PROJECT_DIR)/mach $(PROJECT_DIR)/wrappers +HEADER_SEARCH_PATHS = $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders $(PROJECT_DIR)/mach $(PROJECT_DIR)/wrappers WARNING_CFLAGS = -Wmost GCC_TREAT_WARNINGS_AS_ERRORS = YES GCC_WARN_ABOUT_MISSING_NEWLINE = YES diff --git a/libsyscall/Libsyscall.xcodeproj/project.pbxproj b/libsyscall/Libsyscall.xcodeproj/project.pbxproj index 6310cd437..c391bcf32 100644 --- a/libsyscall/Libsyscall.xcodeproj/project.pbxproj +++ 
b/libsyscall/Libsyscall.xcodeproj/project.pbxproj @@ -33,6 +33,7 @@ /* End PBXAggregateTarget section */ /* Begin PBXBuildFile section */ + 030B179B135377B400DAD1F0 /* open_dprotected_np.c in Sources */ = {isa = PBXBuildFile; fileRef = 030B179A135377B400DAD1F0 /* open_dprotected_np.c */; }; 240BAC4C1214770F000A1719 /* memcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = 24B028D511FF4FBB00CA64A9 /* memcpy.c */; }; 2419382B12135FF6003CDE41 /* chmod.c in Sources */ = {isa = PBXBuildFile; fileRef = 2419382A12135FF6003CDE41 /* chmod.c */; }; 242AB66611EBDC1200107336 /* errno.c in Sources */ = {isa = PBXBuildFile; fileRef = 242AB66511EBDC1200107336 /* errno.c */; }; @@ -86,6 +87,8 @@ 24B8C2621237F53900D36CC3 /* remove-counter.c in Sources */ = {isa = PBXBuildFile; fileRef = 24B8C2611237F53900D36CC3 /* remove-counter.c */; }; 24D1158311E671B20063D54D /* SYS.h in Headers */ = {isa = PBXBuildFile; fileRef = 24D1157411E671B20063D54D /* SYS.h */; }; 24E4782712088267009A384D /* _libc_funcptr.c in Sources */ = {isa = PBXBuildFile; fileRef = 24E47824120881DF009A384D /* _libc_funcptr.c */; }; + 291D3C281354FDD100D46061 /* mach_port.c in Sources */ = {isa = PBXBuildFile; fileRef = 291D3C261354FDD100D46061 /* mach_port.c */; }; + 291D3C291354FDD100D46061 /* mach_vm.c in Sources */ = {isa = PBXBuildFile; fileRef = 291D3C271354FDD100D46061 /* mach_vm.c */; }; C99A4F501305B2BD0054B7B7 /* __get_cpu_capabilities.s in Sources */ = {isa = PBXBuildFile; fileRef = C99A4F4E1305B1B70054B7B7 /* __get_cpu_capabilities.s */; }; C99A4F531305B43F0054B7B7 /* init_cpu_capabilities.c in Sources */ = {isa = PBXBuildFile; fileRef = C99A4F511305B43F0054B7B7 /* init_cpu_capabilities.c */; }; C9D9BD17114B00600000D8B9 /* vm_map_compat.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCC2114B00600000D8B9 /* vm_map_compat.c */; }; @@ -113,7 +116,6 @@ C9D9BD2F114B00600000D8B9 /* vm_task.h in Headers */ = {isa = PBXBuildFile; fileRef = C9D9BCE3114B00600000D8B9 /* vm_task.h */; settings = {ATTRIBUTES = (Public, ); }; }; C9D9BD30114B00600000D8B9 /* host_priv.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCE4114B00600000D8B9 /* host_priv.defs */; }; C9D9BD31114B00600000D8B9 /* host_security.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCE5114B00600000D8B9 /* host_security.defs */; }; - C9D9BD34114B00600000D8B9 /* ledger.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCE9114B00600000D8B9 /* ledger.defs */; }; C9D9BD35114B00600000D8B9 /* lock_set.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCEA114B00600000D8B9 /* lock_set.defs */; }; C9D9BD36114B00600000D8B9 /* mach_error_string.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCEB114B00600000D8B9 /* mach_error_string.c */; }; C9D9BD37114B00600000D8B9 /* mach_error.c in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BCEC114B00600000D8B9 /* mach_error.c */; }; @@ -143,6 +145,7 @@ C9D9BD57114B00600000D8B9 /* task.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BD0F114B00600000D8B9 /* task.defs */; }; C9D9BD58114B00600000D8B9 /* thread_act.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BD10114B00600000D8B9 /* thread_act.defs */; }; C9D9BD59114B00600000D8B9 /* vm_map.defs in Sources */ = {isa = PBXBuildFile; fileRef = C9D9BD11114B00600000D8B9 /* vm_map.defs */; }; + EE3F605A149A6D66003BAEBA /* getaudit.c in Sources */ = {isa = PBXBuildFile; fileRef = EE3F6059149A6D66003BAEBA /* getaudit.c */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ @@ -170,6 +173,7 @@ /* End PBXContainerItemProxy section 
*/ /* Begin PBXFileReference section */ + 030B179A135377B400DAD1F0 /* open_dprotected_np.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = open_dprotected_np.c; sourceTree = ""; }; 240D716711933ED300556E97 /* mach_install_mig.sh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.sh; path = mach_install_mig.sh; sourceTree = ""; }; 2419382A12135FF6003CDE41 /* chmod.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = chmod.c; sourceTree = ""; }; 2427FA821200BCF800EF7A1F /* compat-symlinks.sh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.sh; path = "compat-symlinks.sh"; sourceTree = ""; }; @@ -252,6 +256,8 @@ 24D1159811E672270063D54D /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = ""; }; 24D1159911E6723E0063D54D /* create-syscalls.pl */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.perl; path = "create-syscalls.pl"; sourceTree = ""; }; 24E47824120881DF009A384D /* _libc_funcptr.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = _libc_funcptr.c; sourceTree = ""; }; + 291D3C261354FDD100D46061 /* mach_port.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = mach_port.c; path = mach/mach_port.c; sourceTree = ""; }; + 291D3C271354FDD100D46061 /* mach_vm.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = mach_vm.c; path = mach/mach_vm.c; sourceTree = ""; }; C99A4F4E1305B1B70054B7B7 /* __get_cpu_capabilities.s */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.asm; path = __get_cpu_capabilities.s; sourceTree = ""; }; C99A4F511305B43F0054B7B7 /* init_cpu_capabilities.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = init_cpu_capabilities.c; sourceTree = ""; }; C9D9BCBF114B00600000D8B9 /* .open_source_exclude */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = .open_source_exclude; sourceTree = ""; }; @@ -287,7 +293,6 @@ C9D9BCE3114B00600000D8B9 /* vm_task.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vm_task.h; sourceTree = ""; }; C9D9BCE4114B00600000D8B9 /* host_priv.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = host_priv.defs; sourceTree = ""; }; C9D9BCE5114B00600000D8B9 /* host_security.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = host_security.defs; sourceTree = ""; }; - C9D9BCE9114B00600000D8B9 /* ledger.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = ledger.defs; sourceTree = ""; }; C9D9BCEA114B00600000D8B9 /* lock_set.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = lock_set.defs; sourceTree = ""; }; C9D9BCEB114B00600000D8B9 /* mach_error_string.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_error_string.c; sourceTree = ""; }; C9D9BCEC114B00600000D8B9 /* mach_error.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_error.c; sourceTree = ""; }; @@ -319,6 +324,7 @@ C9D9BD11114B00600000D8B9 /* vm_map.defs */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.mig; path = vm_map.defs; 
sourceTree = ""; }; C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = Libsyscall.xcconfig; sourceTree = ""; }; D2AAC0630554660B00DB518D /* libsystem_kernel.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libsystem_kernel.a; sourceTree = BUILT_PRODUCTS_DIR; }; + EE3F6059149A6D66003BAEBA /* getaudit.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = getaudit.c; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -344,6 +350,8 @@ 08FB7794FE84155DC02AAC07 /* mach */ = { isa = PBXGroup; children = ( + 291D3C261354FDD100D46061 /* mach_port.c */, + 291D3C271354FDD100D46061 /* mach_vm.c */, C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */, 24D1158911E672270063D54D /* Platforms */, 24D1156511E671B20063D54D /* custom */, @@ -383,7 +391,6 @@ C9D9BCD8114B00600000D8B9 /* mach */, C9D9BCE4114B00600000D8B9 /* host_priv.defs */, C9D9BCE5114B00600000D8B9 /* host_security.defs */, - C9D9BCE9114B00600000D8B9 /* ledger.defs */, C9D9BCEA114B00600000D8B9 /* lock_set.defs */, C9D9BCEB114B00600000D8B9 /* mach_error_string.c */, C9D9BCEC114B00600000D8B9 /* mach_error.c */, @@ -461,6 +468,7 @@ C99A4F4E1305B1B70054B7B7 /* __get_cpu_capabilities.s */, 247A08B211F8B05900E4693F /* _libkernel_init.h */, 247A08B311F8B05900E4693F /* _libkernel_init.c */, + 030B179A135377B400DAD1F0 /* open_dprotected_np.c */, 24E47824120881DF009A384D /* _libc_funcptr.c */, 24A7C5CB11FF973C007669EB /* _errno.h */, C99A4F511305B43F0054B7B7 /* init_cpu_capabilities.c */, @@ -498,6 +506,7 @@ 24A7C5AF11FF8DA6007669EB /* bind.c */, 248BA01C121C56BF008C073F /* connect.c */, 24A7C5B111FF8DA6007669EB /* getattrlist.c */, + EE3F6059149A6D66003BAEBA /* getaudit.c */, 24A7C5B211FF8DA6007669EB /* getpeername.c */, 24A7C5B311FF8DA6007669EB /* getsockname.c */, 24A7C5B411FF8DA6007669EB /* lchown.c */, @@ -697,9 +706,9 @@ buildConfigurationList = 1DEB914A08733D8E0010E9CD /* Build configuration list for PBXNativeTarget "Libmach" */; buildPhases = ( D2AAC0600554660B00DB518D /* Headers */, + 2487545E11629934000975E0 /* Install Headers */, D2AAC0610554660B00DB518D /* Sources */, D289988505E68E00004EDB86 /* Frameworks */, - 2487545E11629934000975E0 /* Install Headers */, ); buildRules = ( ); @@ -819,7 +828,6 @@ C9D9BD22114B00600000D8B9 /* exc.defs in Sources */, C9D9BD30114B00600000D8B9 /* host_priv.defs in Sources */, C9D9BD31114B00600000D8B9 /* host_security.defs in Sources */, - C9D9BD34114B00600000D8B9 /* ledger.defs in Sources */, C9D9BD35114B00600000D8B9 /* lock_set.defs in Sources */, C9D9BD38114B00600000D8B9 /* mach_host.defs in Sources */, C9D9BD3D114B00600000D8B9 /* mach_port.defs in Sources */, @@ -898,6 +906,10 @@ 24B8C2621237F53900D36CC3 /* remove-counter.c in Sources */, C99A4F501305B2BD0054B7B7 /* __get_cpu_capabilities.s in Sources */, C99A4F531305B43F0054B7B7 /* init_cpu_capabilities.c in Sources */, + 030B179B135377B400DAD1F0 /* open_dprotected_np.c in Sources */, + 291D3C281354FDD100D46061 /* mach_port.c in Sources */, + 291D3C291354FDD100D46061 /* mach_vm.c in Sources */, + EE3F605A149A6D66003BAEBA /* getaudit.c in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/libsyscall/Platforms/MacOSX/i386/syscall.map b/libsyscall/Platforms/MacOSX/i386/syscall.map index bdfa11aac..656bd8fdb 100644 --- a/libsyscall/Platforms/MacOSX/i386/syscall.map +++ b/libsyscall/Platforms/MacOSX/i386/syscall.map @@ -60,6 
+60,7 @@ _recvmsg$NOCANCEL$UNIX2003 ___recvmsg_nocancel _recvmsg$UNIX2003 ___recvmsg _select$DARWIN_EXTSN ___select _select$DARWIN_EXTSN$NOCANCEL ___select_nocancel +_sem_open ___sem_open _sem_wait ___sem_wait_nocancel _sem_wait$NOCANCEL$UNIX2003 ___sem_wait_nocancel _sem_wait$UNIX2003 ___sem_wait diff --git a/libsyscall/Platforms/MacOSX/x86_64/syscall.map b/libsyscall/Platforms/MacOSX/x86_64/syscall.map index b8cb6b1e1..d717a159c 100644 --- a/libsyscall/Platforms/MacOSX/x86_64/syscall.map +++ b/libsyscall/Platforms/MacOSX/x86_64/syscall.map @@ -45,6 +45,7 @@ _open ___open _recvfrom ___recvfrom _recvmsg ___recvmsg _semctl ___semctl +_sem_open ___sem_open _sendmsg ___sendmsg _sendto ___sendto _setattrlist ___setattrlist diff --git a/libsyscall/custom/SYS.h b/libsyscall/custom/SYS.h index a16f358d8..675fede15 100644 --- a/libsyscall/custom/SYS.h +++ b/libsyscall/custom/SYS.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2007 Apple Inc. All rights reserved. + * Copyright (c) 1999-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -48,6 +48,8 @@ #include +/* Binary compatibility stubs for syscalls that no longer exist */ + #ifndef SYS_setquota #define SYS_setquota 148 #endif @@ -86,14 +88,14 @@ LEAF(_##name, 0) ;\ 2: #if defined(__SYSCALL_32BIT_ARG_BYTES) && ((__SYSCALL_32BIT_ARG_BYTES >= 4) && (__SYSCALL_32BIT_ARG_BYTES <= 20)) -#define UNIX_SYSCALL_NONAME(name, nargs) \ +#define UNIX_SYSCALL_NONAME(name, nargs, cerror) \ movl $(SYS_##name | (__SYSCALL_32BIT_ARG_BYTES << I386_SYSCALL_ARG_BYTES_SHIFT)), %eax ;\ UNIX_SYSCALL_SYSENTER ;\ jnb 2f ;\ BRANCH_EXTERN(cerror) ;\ 2: #else /* __SYSCALL_32BIT_ARG_BYTES < 4 || > 20 */ -#define UNIX_SYSCALL_NONAME(name, nargs) \ +#define UNIX_SYSCALL_NONAME(name, nargs, cerror) \ .globl cerror ;\ movl $ SYS_##name, %eax ;\ UNIX_SYSCALL_SYSENTER ;\ @@ -107,19 +109,23 @@ LEAF(_##name, 0) ;\ movl $ SYS_##name, %eax ;\ UNIX_SYSCALL_TRAP ;\ jnb 2f ;\ - BRANCH_EXTERN(cerror) ;\ + BRANCH_EXTERN(cerror_nocancel) ;\ 2: -#define PSEUDO(pseudo, name, nargs) \ +#define PSEUDO(pseudo, name, nargs, cerror) \ LEAF(pseudo, 0) ;\ - UNIX_SYSCALL_NONAME(name, nargs) + UNIX_SYSCALL_NONAME(name, nargs, cerror) #define PSEUDO_INT(pseudo, name, nargs) \ LEAF(pseudo, 0) ;\ UNIX_SYSCALL_INT_NONAME(name, nargs) +#define __SYSCALL2(pseudo, name, nargs, cerror) \ + PSEUDO(pseudo, name, nargs, cerror) ;\ + ret + #define __SYSCALL(pseudo, name, nargs) \ - PSEUDO(pseudo, name, nargs) ;\ + PSEUDO(pseudo, name, nargs, cerror) ;\ ret #define __SYSCALL_INT(pseudo, name, nargs) \ @@ -144,7 +150,7 @@ LEAF(_##name, 0) ;\ BRANCH_EXTERN(cerror) ;\ 2: -#define UNIX_SYSCALL_NONAME(name, nargs) \ +#define UNIX_SYSCALL_NONAME(name, nargs, cerror) \ .globl cerror ;\ movl $ SYSCALL_CONSTRUCT_UNIX(SYS_##name), %eax ;\ UNIX_SYSCALL_SYSCALL ;\ @@ -152,14 +158,19 @@ LEAF(_##name, 0) ;\ BRANCH_EXTERN(cerror) ;\ 2: -#define PSEUDO(pseudo, name, nargs) \ +#define PSEUDO(pseudo, name, nargs, cerror) \ LEAF(pseudo, 0) ;\ - UNIX_SYSCALL_NONAME(name, nargs) + UNIX_SYSCALL_NONAME(name, nargs, cerror) + +#define __SYSCALL2(pseudo, name, nargs, cerror) \ + PSEUDO(pseudo, name, nargs, cerror) ;\ + ret #define __SYSCALL(pseudo, name, nargs) \ - PSEUDO(pseudo, name, nargs) ;\ + PSEUDO(pseudo, name, nargs, cerror) ;\ ret #else #error Unsupported architecture #endif + diff --git a/libsyscall/custom/__getpid.s b/libsyscall/custom/__getpid.s index 48c85313c..2768d9b82 100644 --- a/libsyscall/custom/__getpid.s +++ b/libsyscall/custom/__getpid.s @@ -32,9 +32,9 @@ .data .private_extern 
__current_pid -__current_pid: +L__current_pid_addr: + __current_pid: .long 0 -L__current_pid_addr = __current_pid #if defined(__DYNAMIC__) #define GET_CURRENT_PID \ @@ -61,7 +61,7 @@ LEAF(___getpid, 0) jle 1f ret 1: - UNIX_SYSCALL_NONAME(getpid, 0) + UNIX_SYSCALL_NONAME(getpid, 0, cerror_nocancel) movl %eax, %edx xorl %eax, %eax GET_CURRENT_PID @@ -88,7 +88,7 @@ LEAF(___getpid, 0) jle 1f ret 1: - UNIX_SYSCALL_NONAME(getpid, 0) + UNIX_SYSCALL_NONAME(getpid, 0, cerror_nocancel) movl %eax, %edx xorl %eax, %eax leaq __current_pid(%rip), %rcx diff --git a/libsyscall/custom/__gettimeofday.s b/libsyscall/custom/__gettimeofday.s index 1dbf19c77..0076f49ce 100644 --- a/libsyscall/custom/__gettimeofday.s +++ b/libsyscall/custom/__gettimeofday.s @@ -48,7 +48,7 @@ LABEL(___gettimeofday) * This syscall is special cased: the timeval is returned in rax:rdx. */ LABEL(___gettimeofday) - UNIX_SYSCALL_NONAME(gettimeofday,0) + UNIX_SYSCALL_NONAME(gettimeofday,0,cerror_nocancel) movq %rax, (%rdi) movl %edx, 8(%rdi) xorl %eax, %eax diff --git a/libsyscall/custom/__pipe.s b/libsyscall/custom/__pipe.s index 0131d476d..d375dddbd 100644 --- a/libsyscall/custom/__pipe.s +++ b/libsyscall/custom/__pipe.s @@ -40,7 +40,7 @@ PSEUDO_INT(___pipe, pipe, 0) #elif defined(__x86_64__) -PSEUDO(___pipe, pipe, 0) +PSEUDO(___pipe, pipe, 0, cerror_nocancel) movl %eax, (%rdi) movl %edx, 4(%rdi) xorl %eax, %eax diff --git a/libsyscall/custom/__ptrace.s b/libsyscall/custom/__ptrace.s index 9eae221f2..bdcbec9fb 100644 --- a/libsyscall/custom/__ptrace.s +++ b/libsyscall/custom/__ptrace.s @@ -36,7 +36,7 @@ LEAF(___ptrace, 0) xorl %eax,%eax REG_TO_EXTERN(%eax,_errno) -UNIX_SYSCALL_NONAME(ptrace, 4) + UNIX_SYSCALL_NONAME(ptrace, 4, cerror) ret #elif defined(__x86_64__) @@ -47,7 +47,7 @@ LEAF(___ptrace, 0) xorq %rax,%rax PICIFY(_errno) movl %eax,(%r11) -UNIX_SYSCALL_NONAME(ptrace, 4) + UNIX_SYSCALL_NONAME(ptrace, 4, cerror) ret #else diff --git a/libsyscall/custom/custom.s b/libsyscall/custom/custom.s index b9d46ba13..a6a4f8bb8 100644 --- a/libsyscall/custom/custom.s +++ b/libsyscall/custom/custom.s @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2007 Apple Inc. All rights reserved. + * Copyright (c) 1999-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -35,14 +35,19 @@ .globl _errno LABEL(cerror) - REG_TO_EXTERN(%eax, _errno) + movl $0,%ecx + jmp 1f +LABEL(cerror_nocancel) + movl $1,%ecx +1: REG_TO_EXTERN(%eax, _errno) mov %esp,%edx andl $0xfffffff0,%esp subl $16,%esp - movl %edx,4(%esp) + movl %edx,8(%esp) + movl %ecx,4(%esp) movl %eax,(%esp) CALL_EXTERN(_cthread_set_errno_self) - movl 4(%esp),%esp + movl 8(%esp),%esp movl $-1,%eax movl $-1,%edx /* in case a 64-bit value is returned */ ret @@ -57,7 +62,13 @@ LABEL(__sysenter_trap) .globl _errno LABEL(cerror) - PICIFY(_errno) /* address -> %r11 */ + /* cancelable syscall, for arg1 to _cthread_set_errno_self */ + movq $0,%rsi + jmp 1f +LABEL(cerror_nocancel) + /* non-cancelable, see above. 
*/ + movq $1,%rsi +1: PICIFY(_errno) /* address -> %r11 */ movl %eax,(%r11) mov %rsp,%rdx andq $-16,%rsp diff --git a/libsyscall/mach/mach/mach_interface.h b/libsyscall/mach/mach/mach_interface.h index af939cb3b..b0f7a01c9 100644 --- a/libsyscall/mach/mach/mach_interface.h +++ b/libsyscall/mach/mach/mach_interface.h @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include diff --git a/libsyscall/mach/mach_msg.c b/libsyscall/mach/mach_msg.c index d8b094119..87f6cb573 100644 --- a/libsyscall/mach/mach_msg.c +++ b/libsyscall/mach/mach_msg.c @@ -318,6 +318,7 @@ mach_msg_destroy(mach_msg_header_t *msg) /* * Just skip it. */ + dsc = &daddr->out_of_line; daddr = (mach_msg_descriptor_t *)(dsc + 1); break; } diff --git a/libsyscall/mach/mach_port.c b/libsyscall/mach/mach_port.c new file mode 100644 index 000000000..954d45e56 --- /dev/null +++ b/libsyscall/mach/mach_port.c @@ -0,0 +1,483 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
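A note on the mach_msg.c hunk above: the one added line (dsc = &daddr->out_of_line) is the whole fix. mach_msg_destroy() walks the descriptor array by computing (mach_msg_descriptor_t *)(dsc + 1), and in the "just skip it" case dsc was never re-based on the current descriptor, so the cursor advanced from a stale pointer. Because every member of the mach_msg_descriptor_t union starts at the same address, taking &daddr->out_of_line sizes the step correctly. A minimal sketch of the corrected walk, reduced to the skip case only (the real function also destroys port rights and out-of-line memory per descriptor type):

#include <mach/message.h>

static void
walk_descriptors(mach_msg_body_t *body)
{
	mach_msg_descriptor_t *daddr = (mach_msg_descriptor_t *)(body + 1);
	mach_msg_size_t i;

	for (i = 0; i < body->msgh_descriptor_count; i++) {
		/*
		 * Re-base on the current descriptor before stepping past
		 * it; omitting this assignment was the bug fixed above,
		 * so (dsc + 1) advanced from a previous iteration's value.
		 */
		mach_msg_ool_descriptor_t *dsc = &daddr->out_of_line;
		daddr = (mach_msg_descriptor_t *)(dsc + 1);
	}
}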
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + +kern_return_t +mach_port_names( + ipc_space_t task, + mach_port_name_array_t *names, + mach_msg_type_number_t *namesCnt, + mach_port_type_array_t *types, + mach_msg_type_number_t *typesCnt) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_names(task, names, namesCnt, types, + typesCnt); + + return (rv); +} + +kern_return_t +mach_port_type( + ipc_space_t task, + mach_port_name_t name, + mach_port_type_t *ptype) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_type(task, name, ptype); + + return (rv); +} + +kern_return_t +mach_port_rename( + ipc_space_t task, + mach_port_name_t old_name, + mach_port_name_t new_name) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_rename(task, old_name, new_name); + + return (rv); +} + +kern_return_t +mach_port_allocate_name( + ipc_space_t task, + mach_port_right_t right, + mach_port_name_t name) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_allocate_name(task, right, name); + + return (rv); +} + +kern_return_t +mach_port_allocate( + ipc_space_t task, + mach_port_right_t right, + mach_port_name_t *name) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_allocate_trap(task, right, name); + + if (rv == MACH_SEND_INVALID_DEST) + rv = _kernelrpc_mach_port_allocate(task, right, name); + + return (rv); +} + +kern_return_t +mach_port_destroy( + ipc_space_t task, + mach_port_name_t name) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_destroy_trap(task, name); + + if (rv == MACH_SEND_INVALID_DEST) + rv = _kernelrpc_mach_port_destroy(task, name); + + return (rv); +} + +kern_return_t +mach_port_deallocate( + ipc_space_t task, + mach_port_name_t name) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_deallocate_trap(task, name); + + if (rv == MACH_SEND_INVALID_DEST) + rv = _kernelrpc_mach_port_deallocate(task,name); + + return (rv); +} + +kern_return_t +mach_port_get_refs( + ipc_space_t task, + mach_port_name_t name, + mach_port_right_t right, + mach_port_urefs_t *refs) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_get_refs(task, name, right, refs); + + return (rv); +} + +kern_return_t +mach_port_mod_refs( + ipc_space_t task, + mach_port_name_t name, + mach_port_right_t right, + mach_port_delta_t delta) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_mod_refs_trap(task, name, right, delta); + + if (rv == MACH_SEND_INVALID_DEST) + rv = _kernelrpc_mach_port_mod_refs(task, name, right, delta); + + return (rv); +} + +kern_return_t +mach_port_set_mscount( + ipc_space_t task, + mach_port_name_t name, + mach_port_mscount_t mscount) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_set_mscount(task, name, mscount); + + return (rv); +} + +kern_return_t +mach_port_get_set_status( + ipc_space_t task, + mach_port_name_t name, + mach_port_name_array_t *members, + mach_msg_type_number_t *membersCnt) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_get_set_status(task, name, members, + membersCnt); + + return (rv); +} + +kern_return_t +mach_port_move_member( + ipc_space_t task, + mach_port_name_t member, + mach_port_name_t after) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_move_member_trap(task, member, after); + + if (rv == MACH_SEND_INVALID_DEST) + rv = _kernelrpc_mach_port_move_member(task, member, after); + + return (rv); +} + +kern_return_t +mach_port_request_notification( + ipc_space_t task, + mach_port_name_t name, + mach_msg_id_t msgid, + mach_port_mscount_t sync, + mach_port_t notify, + mach_msg_type_name_t notifyPoly, + 
mach_port_t *previous) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_request_notification(task, name, msgid, + sync, notify, notifyPoly, previous); + + return (rv); +} + +kern_return_t +mach_port_insert_right( + ipc_space_t task, + mach_port_name_t name, + mach_port_t poly, + mach_msg_type_name_t polyPoly) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_insert_right_trap(task, name, poly, polyPoly); + + if (rv == MACH_SEND_INVALID_DEST) + rv = _kernelrpc_mach_port_insert_right(task, name, poly, + polyPoly); + + return (rv); +} + +kern_return_t +mach_port_extract_right( + ipc_space_t task, + mach_port_name_t name, + mach_msg_type_name_t msgt_name, + mach_port_t *poly, + mach_msg_type_name_t *polyPoly) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_extract_right(task, name, msgt_name, + poly, polyPoly); + + return (rv); +} + +kern_return_t +mach_port_set_seqno( + ipc_space_t task, + mach_port_name_t name, + mach_port_seqno_t seqno) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_set_seqno(task, name, seqno); + + return (rv); +} + +kern_return_t +mach_port_get_attributes( + ipc_space_t task, + mach_port_name_t name, + mach_port_flavor_t flavor, + mach_port_info_t port_info_out, + mach_msg_type_number_t *port_info_outCnt) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_get_attributes(task, name, flavor, + port_info_out, port_info_outCnt); + + return (rv); +} + +kern_return_t +mach_port_set_attributes( + ipc_space_t task, + mach_port_name_t name, + mach_port_flavor_t flavor, + mach_port_info_t port_info, + mach_msg_type_number_t port_infoCnt) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_set_attributes(task, name, flavor, + port_info, port_infoCnt); + + return (rv); +} + +kern_return_t +mach_port_allocate_qos( + ipc_space_t task, + mach_port_right_t right, + mach_port_qos_t *qos, + mach_port_name_t *name) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_allocate_qos(task, right, qos, name); + + return (rv); +} + +kern_return_t +mach_port_allocate_full( + ipc_space_t task, + mach_port_right_t right, + mach_port_t proto, + mach_port_qos_t *qos, + mach_port_name_t *name) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_allocate_full(task, right, proto, qos, name); + + return (rv); +} + +kern_return_t +task_set_port_space( + ipc_space_t task, + int table_entries) +{ + kern_return_t rv; + + rv = _kernelrpc_task_set_port_space(task, table_entries); + + return (rv); +} + +kern_return_t +mach_port_get_srights( + ipc_space_t task, + mach_port_name_t name, + mach_port_rights_t *srights) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_get_srights(task, name, srights); + + return (rv); +} + +kern_return_t +mach_port_space_info( + ipc_space_t task, + ipc_info_space_t *space_info, + ipc_info_name_array_t *table_info, + mach_msg_type_number_t *table_infoCnt, + ipc_info_tree_name_array_t *tree_info, + mach_msg_type_number_t *tree_infoCnt) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_space_info(task, space_info, table_info, + table_infoCnt, tree_info, tree_infoCnt); + + return (rv); +} + +kern_return_t +mach_port_dnrequest_info( + ipc_space_t task, + mach_port_name_t name, + unsigned *dnr_total, + unsigned *dnr_used) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_dnrequest_info(task, name, dnr_total, + dnr_used); + + return (rv); +} + +kern_return_t +mach_port_kernel_object( + ipc_space_t task, + mach_port_name_t name, + unsigned *object_type, + unsigned *object_addr) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_kernel_object(task, name, + 
object_type, object_addr); + + return (rv); +} + +kern_return_t +mach_port_insert_member( + ipc_space_t task, + mach_port_name_t name, + mach_port_name_t pset) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_insert_member_trap(task, name, pset); + + if (rv == MACH_SEND_INVALID_DEST) + rv = _kernelrpc_mach_port_insert_member(task, name, pset); + + return (rv); +} + +kern_return_t +mach_port_extract_member( + ipc_space_t task, + mach_port_name_t name, + mach_port_name_t pset) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_extract_member_trap(task, name, pset); + + if (rv == MACH_SEND_INVALID_DEST) + rv = _kernelrpc_mach_port_extract_member(task, name, pset); + + return (rv); +} + +kern_return_t +mach_port_get_context( + ipc_space_t task, + mach_port_name_t name, + mach_port_context_t *context) +{ + kern_return_t rv; + mach_vm_address_t wide_context; + + rv = _kernelrpc_mach_port_get_context(task, name, &wide_context); + + if (rv == KERN_SUCCESS) { + *context = (mach_port_context_t)wide_context; + } + + return (rv); +} + +kern_return_t +mach_port_set_context( + ipc_space_t task, + mach_port_name_t name, + mach_port_context_t context) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_set_context(task, name, context); + + return (rv); +} + +kern_return_t +mach_port_kobject( + ipc_space_t task, + mach_port_name_t name, + natural_t *object_type, + mach_vm_address_t *object_addr) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_kobject(task, name, object_type, object_addr); + + return (rv); +} diff --git a/libsyscall/mach/mach_vm.c b/libsyscall/mach/mach_vm.c new file mode 100644 index 000000000..2db383021 --- /dev/null +++ b/libsyscall/mach/mach_vm.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Make sure we don't accidentally include the external definitions of + * the routines we're interposing on below. 
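Stepping back from the individual wrappers: every routine in the new mach_port.c, and in the mach_vm.c that begins here, has one of two shapes. Where a _kernelrpc_*_trap exists, the wrapper tries the trap first and falls back to the MIG-generated RPC only when the trap returns MACH_SEND_INVALID_DEST; judging from the fallback logic, that is what the trap path reports when it cannot service the request locally (for instance, a task port other than the caller's own), though the patch itself does not spell this out. Everything else is a plain pass-through to the corresponding _kernelrpc_* routine. The #define/#undef lines just below pre-define the MIG user-header include guards (_vm_map_user_, _mach_vm_user_) so the external prototypes drop out and these interposing definitions can replace them; the #include targets on those lines were lost in this copy of the patch. A condensed sketch of the trap-with-fallback shape, reusing names from the hunks above (prototypes restated rather than assuming which Mach header exports them):

#include <mach/mach.h>

/* Restated from the patch; normally these come from the Mach headers. */
extern kern_return_t _kernelrpc_mach_port_deallocate_trap(ipc_space_t task,
    mach_port_name_t name);
extern kern_return_t _kernelrpc_mach_port_deallocate(ipc_space_t task,
    mach_port_name_t name);

kern_return_t
deallocate_with_fallback(ipc_space_t task, mach_port_name_t name)
{
	kern_return_t rv;

	/* Fast path: a Mach trap, no message construction or copyin. */
	rv = _kernelrpc_mach_port_deallocate_trap(task, name);

	/* Slow path: the full MIG RPC, which sends a real Mach message. */
	if (rv == MACH_SEND_INVALID_DEST)
		rv = _kernelrpc_mach_port_deallocate(task, name);

	return (rv);
}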
+ */ +#define _vm_map_user_ +#define _mach_vm_user_ +#include +#include +#undef _vm_map_user_ +#include +#undef _mach_vm_user_ +#include + +kern_return_t +mach_vm_allocate( + mach_port_name_t target, + mach_vm_address_t *address, + mach_vm_size_t size, + int flags) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_vm_allocate_trap(target, address, size, flags); + + if (rv == MACH_SEND_INVALID_DEST) + rv = _kernelrpc_mach_vm_allocate(target, address, size, flags); + + return (rv); +} + +kern_return_t +mach_vm_deallocate( + mach_port_name_t target, + mach_vm_address_t address, + mach_vm_size_t size) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_vm_deallocate_trap(target, address, size); + + if (rv == MACH_SEND_INVALID_DEST) + rv = _kernelrpc_mach_vm_deallocate(target, address, size); + + return (rv); +} + +kern_return_t +mach_vm_protect( + mach_port_name_t task, + mach_vm_address_t address, + mach_vm_size_t size, + boolean_t set_maximum, + vm_prot_t new_protection) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_vm_protect_trap(task, address, size, set_maximum, + new_protection); + + if (rv == MACH_SEND_INVALID_DEST) + rv = _kernelrpc_mach_vm_protect(task, address, size, + set_maximum, new_protection); + + return (rv); +} + +kern_return_t +vm_allocate( + mach_port_name_t task, + vm_address_t *address, + vm_size_t size, + int flags) +{ + kern_return_t rv; + mach_vm_address_t mach_addr; + + mach_addr = (mach_vm_address_t)*address; + rv = mach_vm_allocate(task, &mach_addr, size, flags); +#if defined(__LP64__) + *address = mach_addr; +#else + *address = (vm_address_t)(mach_addr & ((vm_address_t)-1)); +#endif + + return (rv); +} + +kern_return_t +vm_deallocate( + mach_port_name_t task, + vm_address_t address, + vm_size_t size) +{ + kern_return_t rv; + + rv = mach_vm_deallocate(task, address, size); + + return (rv); +} + +kern_return_t +vm_protect( + mach_port_name_t task, + vm_address_t address, + vm_size_t size, + boolean_t set_maximum, + vm_prot_t new_protection) +{ + kern_return_t rv; + + rv = mach_vm_protect(task, address, size, set_maximum, new_protection); + + return (rv); +} diff --git a/libsyscall/mach/string.h b/libsyscall/mach/string.h index 9b20980cf..39a02753a 100644 --- a/libsyscall/mach/string.h +++ b/libsyscall/mach/string.h @@ -56,6 +56,5 @@ int _mach_vsnprintf(char *buffer, int length, const char *fmt, va_list ap); // Actually in memcpy.c but MIG likes to include string.h void *memcpy(void *dst0, const void *src0, size_t length); -int memcmp(const void *s1, const void *s2, size_t n); #endif /* _STRING_H_ */ diff --git a/libsyscall/mach/vm_map.defs b/libsyscall/mach/vm_map.defs index c9aefb3c6..15a60542c 100644 --- a/libsyscall/mach/vm_map.defs +++ b/libsyscall/mach/vm_map.defs @@ -25,15 +25,11 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#if !LIBSYSCALL_INTERFACE && (defined(__LP64__) || defined(__arm__)) +#if !LIBSYSCALL_INTERFACE && defined(__LP64__) /* * In an LP64 environment, the traditional Mach VM interface names are * really just a second instance of the "wide" Mach VM interfaces. * - * For ARM, which doesn't support two address space sizes, use the "wide" - * interfaces as well, to reduce the amount of duplicate code compiled - * into the kernel. - * * The _MACH_VM_PUBLISH_AS_LOCAL_ flag triggers mach_vm.defs to export * the local names instead. 
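The tail of mach_vm.c above shows the file's other idiom: the legacy vm_allocate/vm_deallocate/vm_protect entry points are thin shims over their wide mach_vm_* counterparts, widening the caller's vm_address_t on the way in and narrowing the 64-bit result on the way out. On LP64 the two types have the same width; on ILP32 the mask against (vm_address_t)-1 makes the truncation explicit instead of leaving it to the implicit conversion. mach_port_get_context() earlier does the same narrowing from mach_vm_address_t to mach_port_context_t. The shim shape in isolation, restating vm_allocate from above with comments added:

#include <mach/mach.h>
#include <mach/mach_vm.h>

kern_return_t
vm_allocate_shim(mach_port_name_t task, vm_address_t *address,
    vm_size_t size, int flags)
{
	kern_return_t rv;
	mach_vm_address_t mach_addr;

	/* Widen the caller's address to the 64-bit interface type. */
	mach_addr = (mach_vm_address_t)*address;
	rv = mach_vm_allocate(task, &mach_addr, size, flags);
#if defined(__LP64__)
	*address = mach_addr;	/* same width; lossless */
#else
	/* Narrow explicitly: keep only the bits vm_address_t can hold. */
	*address = (vm_address_t)(mach_addr & ((vm_address_t)-1));
#endif
	return (rv);
}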
* diff --git a/libsyscall/wrappers/__get_cpu_capabilities.s b/libsyscall/wrappers/__get_cpu_capabilities.s index 12e9c7652..f03e44420 100644 --- a/libsyscall/wrappers/__get_cpu_capabilities.s +++ b/libsyscall/wrappers/__get_cpu_capabilities.s @@ -46,4 +46,6 @@ __get_cpu_capabilities: movl _COMM_PAGE_CPU_CAPABILITIES, %eax ret +#else +#error Unsupported architecture #endif diff --git a/libsyscall/wrappers/cancelable/fcntl-base.c b/libsyscall/wrappers/cancelable/fcntl-base.c index 2f48a42c1..589af9a69 100644 --- a/libsyscall/wrappers/cancelable/fcntl-base.c +++ b/libsyscall/wrappers/cancelable/fcntl-base.c @@ -45,15 +45,15 @@ fcntl(int fd, int cmd, ...) case F_PREALLOCATE: case F_SETSIZE: case F_RDADVISE: - case F_READBOOTSTRAP: - case F_WRITEBOOTSTRAP: case F_LOG2PHYS: + case F_LOG2PHYS_EXT: case F_GETPATH: case F_GETPATH_MTMINFO: case F_PATHPKG_CHECK: case F_OPENFROM: case F_UNLINKFROM: case F_ADDSIGS: + case F_ADDFILESIGS: arg = va_arg(ap, void *); break; default: diff --git a/libsyscall/wrappers/legacy/getaudit.c b/libsyscall/wrappers/legacy/getaudit.c new file mode 100644 index 000000000..087053212 --- /dev/null +++ b/libsyscall/wrappers/legacy/getaudit.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include + +/* + * Wrappers for the legacy getaudit() and setaudit() syscalls. + */ + +int +getaudit(struct auditinfo *ainfo) +{ + int err; + auditinfo_addr_t aia; + + if ((err = getaudit_addr(&aia, sizeof(aia))) != 0) + return (err); + + ainfo->ai_auid = aia.ai_auid; + ainfo->ai_mask = aia.ai_mask; + ainfo->ai_termid.port = aia.ai_termid.at_port; + ainfo->ai_termid.machine = aia.ai_termid.at_addr[0]; + ainfo->ai_asid = aia.ai_asid; + + return (0); +} + +int +setaudit(const struct auditinfo *ainfo) +{ + int err; + struct auditinfo *ai = (struct auditinfo *)ainfo; + auditinfo_addr_t aia; + + /* Get the current ai_flags so they are preserved. */ + if ((err = getaudit_addr(&aia, sizeof(aia))) != 0) + return (err); + + aia.ai_auid = ai->ai_auid; + aia.ai_mask = ai->ai_mask; + aia.ai_termid.at_port = ai->ai_termid.port; + aia.ai_termid.at_type = AU_IPv4; + aia.ai_termid.at_addr[0] = ai->ai_termid.machine; + aia.ai_asid = ai->ai_asid; + + if ((err = setaudit_addr(&aia, sizeof(aia))) != 0) + return (err); + + /* The session ID may have been assigned by kernel so copy that back. 
*/ + ai->ai_asid = aia.ai_asid; + + return (0); +} diff --git a/libsyscall/wrappers/memcpy.c b/libsyscall/wrappers/memcpy.c index c9af35506..5bce5933a 100644 --- a/libsyscall/wrappers/memcpy.c +++ b/libsyscall/wrappers/memcpy.c @@ -124,20 +124,3 @@ bcopy(const void *s1, void *s2, size_t n) { memcpy(s2, s1, n); } - -/* - * Compare memory regions. - */ -__private_extern__ int -memcmp(const void *s1, const void *s2, size_t n) -{ - if (n != 0) { - const unsigned char *p1 = s1, *p2 = s2; - - do { - if (*p1++ != *p2++) - return (*--p1 - *--p2); - } while (--n != 0); - } - return (0); -} diff --git a/libsyscall/wrappers/open_dprotected_np.c b/libsyscall/wrappers/open_dprotected_np.c new file mode 100644 index 000000000..afd213d78 --- /dev/null +++ b/libsyscall/wrappers/open_dprotected_np.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include + +int __open_dprotected_np(const char* path, int flags, int class, int dpflags, int mode); + +int open_dprotected_np(const char *path, int flags, int class, int dpflags, ...) { + int mode = 0; + + if (flags & O_CREAT) { + va_list ap; + va_start(ap, dpflags); + mode = va_arg(ap, int); + va_end(ap); + } + return (__open_dprotected_np(path, flags, class, dpflags, mode)); +} + diff --git a/libsyscall/xcodescripts/create-syscalls.pl b/libsyscall/xcodescripts/create-syscalls.pl index 68366de86..85c282f0b 100755 --- a/libsyscall/xcodescripts/create-syscalls.pl +++ b/libsyscall/xcodescripts/create-syscalls.pl @@ -126,6 +126,25 @@ my %Symbols = ( }, ); +# An explicit list of cancelable syscalls. For creating stubs that call the +# cancellable version of cerror. 
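Two asides on this stretch of the patch before the cancelable list below. First, the open_dprotected_np() wrapper added above follows the open(2) convention for its trailing variadic argument: the creation mode is read from the va_list only when O_CREAT is set, so callers without O_CREAT simply omit it. A hypothetical usage sketch (the path, protection class value, and dpflags are illustrative only, and the prototype is restated locally rather than assuming which SDK header exports it):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Restated from the wrapper above; header location not assumed. */
int open_dprotected_np(const char *path, int flags, int class, int dpflags, ...);

int
main(void)
{
	/* The 0600 mode is consumed only because O_CREAT is present. */
	int fd = open_dprotected_np("/tmp/example.dat", O_CREAT | O_RDWR,
	    2 /* illustrative protection class */, 0 /* dpflags */, 0600);
	if (fd < 0) {
		perror("open_dprotected_np");
		return 1;
	}
	close(fd);
	return 0;
}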
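Second, the @Cancelable list that follows ties together the SYS.h and custom.s hunks earlier in the patch: create-syscalls.pl now emits every stub through __SYSCALL2() with an explicit error-handler label, cerror for syscalls on the list and cerror_nocancel for the rest, and custom.s gives the two labels a common tail that differs only in a flag handed to _cthread_set_errno_self (the assembly loads 0 at the cerror entry and 1 at the cerror_nocancel entry). Rendered as C, the common tail behaves roughly like the sketch below; the two-argument _cthread_set_errno_self signature is inferred from that register setup, and the function itself is Libsystem-internal:

/* Inferred from custom.s above; not a public interface. */
extern void _cthread_set_errno_self(int error, int nocancel);

static long
cerror_common(int error, int nocancel)
{
	/*
	 * Record errno for the calling thread; the flag says whether the
	 * failure arrived via a cancellation point (0) or not (1).
	 */
	_cthread_set_errno_self(error, nocancel);
	return (-1);	/* the asm also sets the high half for 64-bit returns */
}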
+my @Cancelable = qw/ + accept access aio_suspend + close connect + fcntl fdatasync fpathconf fstat fsync + getlogin + ioctl + link lseek lstat + msgrcv msgsnd msync + open + pathconf poll posix_spawn pread pwrite + read readv recvfrom recvmsg rename + __semwait_signal __sigwait + select sem_wait semop sendmsg sendto sigsuspend stat symlink sync + unlink + wait4 waitid write writev +/; + sub usage { die "Usage: $MyName syscalls.master custom-directory platforms-directory out-directory\n"; } @@ -216,7 +235,7 @@ sub checkForCustomStubs { $$sym{is_custom} = $source; if (!$$sym{is_private}) { foreach my $subarch (@Architectures) { - (my $arch = $subarch) =~ s/arm(.*)/arm/; + (my $arch = $subarch) =~ s/arm(v.*)/arm/; $$sym{aliases}{$arch} = [] unless $$sym{aliases}{$arch}; push(@{$$sym{aliases}{$arch}}, $$sym{asm_sym}); } @@ -237,7 +256,7 @@ sub readAliases { my @a = (); for my $arch (@Architectures) { - (my $new_arch = $arch) =~ s/arm(.*)/arm/g; + (my $new_arch = $arch) =~ s/arm(v.*)/arm/g; push(@a, $new_arch) unless grep { $_ eq $new_arch } @a; } @@ -294,18 +313,22 @@ sub writeStubForSymbol { my @conditions; for my $subarch (@Architectures) { - (my $arch = $subarch) =~ s/arm(.*)/arm/; + (my $arch = $subarch) =~ s/arm(v.*)/arm/; push(@conditions, "defined(__${arch}__)") unless grep { $_ eq $arch } @{$$symbol{except}}; } + + my %is_cancel; + for (@Cancelable) { $is_cancel{$_} = 1 }; print $f "#define __SYSCALL_32BIT_ARG_BYTES $$symbol{bytes}\n"; print $f "#include \"SYS.h\"\n\n"; if (scalar(@conditions)) { + my $nc = ($is_cancel{$$symbol{syscall}} ? "cerror" : "cerror_nocancel"); printf $f "#if " . join(" || ", @conditions) . "\n"; - printf $f "__SYSCALL(%s, %s, %d)\n", $$symbol{asm_sym}, $$symbol{syscall}, $$symbol{nargs}; + printf $f "__SYSCALL2(%s, %s, %d, %s)\n", $$symbol{asm_sym}, $$symbol{syscall}, $$symbol{nargs}, $nc; if (!$$symbol{is_private} && (scalar(@conditions) < scalar(@Architectures))) { printf $f "#else\n"; - printf $f "__SYSCALL(%s, %s, %d)\n", "__".$$symbol{asm_sym}, $$symbol{syscall}, $$symbol{nargs}; + printf $f "__SYSCALL2(%s, %s, %d, %s)\n", "__".$$symbol{asm_sym}, $$symbol{syscall}, $$symbol{nargs}, $nc; } printf $f "#endif\n\n"; } else { @@ -318,7 +341,7 @@ sub writeAliasesForSymbol { my ($f, $symbol) = @_; foreach my $subarch (@Architectures) { - (my $arch = $subarch) =~ s/arm(.*)/arm/; + (my $arch = $subarch) =~ s/arm(v.*)/arm/; next unless scalar($$symbol{aliases}{$arch}); diff --git a/libsyscall/xcodescripts/mach_install_mig.sh b/libsyscall/xcodescripts/mach_install_mig.sh index 068bc30ad..3f98c487d 100755 --- a/libsyscall/xcodescripts/mach_install_mig.sh +++ b/libsyscall/xcodescripts/mach_install_mig.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/sh -x # # Copyright (c) 2010 Apple Inc. All rights reserved. 
# @@ -31,17 +31,22 @@ cd $OBJROOT # check if we're building for the simulator -[ "$RC_ProjectName" == "Libmach_Sim" ] && DSTROOT="$DSTROOT$SDKROOT" +if [ "${RC_ProjectName%_Sim}" != "${RC_ProjectName}" ] ; then + DSTROOT="${DSTROOT}${SDKROOT}" +fi MIG=`xcrun -sdk "$SDKROOT" -find mig` MIGCC=`xcrun -sdk "$SDKROOT" -find cc` export MIGCC MIG_DEFINES="-DLIBSYSCALL_INTERFACE" MIG_HEADER_DST="$DSTROOT/usr/include/mach" +MIG_PRIVATE_HEADER_DST="$DSTROOT/usr/local/include/mach" SERVER_HEADER_DST="$DSTROOT/usr/include/servers" # from old Libsystem makefiles MACHINE_ARCH=`echo $ARCHS | cut -d' ' -f 1` SRC="$SRCROOT/mach" +MIG_INTERNAL_HEADER_DST="$DERIVED_SOURCES_DIR/mach" +MIG_PRIVATE_DEFS_INCFLAGS="-I${SDKROOT}/System/Library/Frameworks/System.framework/PrivateHeaders" MIGS="clock.defs clock_priv.defs @@ -49,18 +54,32 @@ MIGS="clock.defs exc.defs host_priv.defs host_security.defs - ledger.defs lock_set.defs - mach_port.defs mach_host.defs - mach_vm.defs + mach_port.defs processor.defs processor_set.defs vm_map.defs" +MIGS_PRIVATE="" + +MIGS_DUAL_PUBLIC_PRIVATE="" + +if [[ "$PLATFORM" = "iPhoneOS" || "$RC_ProjectName" = "Libsyscall_headers_Sim" ]] +then + MIGS_PRIVATE="mach_vm.defs" +else + MIGS+=" mach_vm.defs" +fi + + MIGS_ARCH="thread_act.defs task.defs" +MIGS_INTERNAL="mach_port.defs + mach_vm.defs + vm_map.defs" + SERVER_HDRS="key_defs.h ls_defs.h netname_defs.h @@ -79,11 +98,33 @@ $MIG -arch $MACHINE_ARCH -header "$SERVER_HEADER_DST/netname.h" $SRC/servers/net mkdir -p $MIG_HEADER_DST -for mig in $MIGS; do +for mig in $MIGS $MIGS_DUAL_PUBLIC_PRIVATE; do MIG_NAME=`basename $mig .defs` $MIG -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_HEADER_DST/$MIG_NAME.h" $MIG_DEFINES $SRC/$mig done +mkdir -p $MIG_PRIVATE_HEADER_DST + +for mig in $MIGS_PRIVATE $MIGS_DUAL_PUBLIC_PRIVATE; do + MIG_NAME=`basename $mig .defs` + $MIG -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_PRIVATE_HEADER_DST/$MIG_NAME.h" $MIG_DEFINES $MIG_PRIVATE_DEFS_INCFLAGS $SRC/$mig + if [ ! -e "$MIG_HEADER_DST/$MIG_NAME.h" ]; then + echo "#error $MIG_NAME.h unsupported." > "$MIG_HEADER_DST/$MIG_NAME.h" + fi +done + + +# special headers used just for building Libsyscall +# Note: not including -DLIBSYSCALL_INTERFACE to mig so we'll get the proper +# 'internal' version of the headers being built + +mkdir -p $MIG_INTERNAL_HEADER_DST + +for mig in $MIGS_INTERNAL; do + MIG_NAME=`basename $mig .defs` + $MIG -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_INTERNAL_HEADER_DST/${MIG_NAME}_internal.h" $SRC/$mig +done + ARCHS=`echo $ARCHS | sed -e 's/armv./arm/g'` for arch in $ARCHS; do MIG_ARCH_DST="$MIG_HEADER_DST/$arch" diff --git a/lldbmacros.py b/lldbmacros.py new file mode 100644 index 000000000..9278abb61 --- /dev/null +++ b/lldbmacros.py @@ -0,0 +1,184 @@ +import lldb +import re +import getopt + +# Note: This module will eventually contain loads of macros. So please bear with the Macro/EndMacro comments + + +# Global functions +def findGlobal(variable): + return lldb.target.FindGlobalVariables(variable, 0).GetValueAtIndex(0) + +def findGlobalValue(variable): + return findGlobal(variable).GetValue() + +def readMemberUnsigned(variable,member): + return variable.GetChildMemberWithName(member).GetValueAsUnsigned(0) + +def readMemberSigned(variable,member): + return variable.GetChildMemberWithName(member).GetValueAsSigned() + +def readMemberString(variable,member): + return str(variable.GetChildMemberWithName(member).GetSummary()).strip('"') + + + +class Output : + """ + An output handler for all commands.
Use Output.printString to direct all macro output via the handler. + Currently this provides these capabilities + -o path/to/filename + The output of this command execution will be saved to file. Parser information or errors will + not be sent to file though. e.g. /tmp/output.txt + -s filter_string + the "filter_string" param is parsed as a python regex expression and each line of output + will be printed/saved only if it matches the expression. + The command header will not be filtered in any case. + """ + STDOUT =1 + FILEOUT =2 + FILTER =False + + def __init__(self): + self.out = Output.STDOUT + self.fname=None + self.fhandle=None + self.FILTER=False + + def printString(self, s): + """ Handler for all commands output. By default just print to stdout """ + if self.FILTER and not self.reg.search(s): return + if self.out == Output.STDOUT: print s + elif self.out == Output.FILEOUT : self.fhandle.write(s+"\n") + + def printHeader(self,s): + if self.out == Output.STDOUT: print s + elif self.out == Output.FILEOUT: self.fhandle.write(s+"\n") + + def done(self): + """ closes any open files. report on any errors """ + if self.fhandle != None : + self.fhandle.close() + + def setOptions(self,args): + """ parse the arguments passed to the command + param : args => [] of (typically args.split()) + """ + opts=() + try: + opts,args = getopt.getopt(args,'o:s:',[]) + except getopt.GetoptError,err: + print str(err) + #continue with processing + for o,a in opts : + if o == "-o" and len(a) > 0: + self.fname=a.strip() + self.fhandle=open(self.fname,"w") + self.out = Output.FILEOUT + print "saving results in file ",str(a) + elif o == "-s" and len(a) > 0: + self.reg = re.compile(a.strip(),re.MULTILINE|re.DOTALL) + self.FILTER=True + print "showing results for regex:",a.strip() + else : + print "Error: unknown option ",o,a + + +# Interface function for showallkexts command +def showallkexts_command(debugger, args, result, lldb_dict): + kext_summary_header = findGlobal("gLoadedKextSummaries") + result.Printf(_summarizeallkexts(kext_summary_header)) + return None + +# Interface function for loaded kext summary formatter +def showallkexts_summary(kext_summary_header, lldb_dict): + return "\n" + _summarizeallkexts(kext_summary_header) + +# Internal function for walking kext summaries +def _summarizeallkexts(kext_summary_header): + summary = "ID Address Size Version Name\n" + summaries = kext_summary_header.GetChildMemberWithName("summaries") + count = int(kext_summary_header.GetChildMemberWithName("numSummaries").GetValue()) + for i in range(0, count): + summary += summaries.GetChildAtIndex(i, lldb.eNoDynamicValues, True).GetSummary() + "\n" + return summary + +# Macro: memstats +def memstats_command(debugger,args,result,lldb_dict): + stream = Output() + stream.setOptions(args.split()) + memstats(stream) + stream.done() + +def memstats(ostream): + ostream.printString ( "kern_memorystatus_level: {0}".format(findGlobalValue("kern_memorystatus_level")) ) + ostream.printString ( "vm_page_throttled_count: {0}".format(findGlobalValue("vm_page_throttled_count")) ) + ostream.printString ( "vm_page_active_count: {0}".format(findGlobalValue("vm_page_active_count")) ) + ostream.printString ( "vm_page_inactive_count: {0}".format(findGlobalValue("vm_page_inactive_count")) ) + ostream.printString ( "vm_page_wire_count: {0}".format(findGlobalValue("vm_page_wire_count")) ) + ostream.printString ( "vm_page_free_count: {0}".format(findGlobalValue("vm_page_free_count")) ) + ostream.printString ( "vm_page_purgeable_count: 
{0}".format(findGlobalValue("vm_page_purgeable_count")) ) + ostream.printString ( "vm_page_inactive_target: {0}".format(findGlobalValue("vm_page_inactive_target")) ) + ostream.printString ( "vm_page_free_target: {0}".format(findGlobalValue("vm_page_free_target")) ) + ostream.printString ( "insue_ptepages_count: {0}".format(findGlobalValue("inuse_ptepages_count")) ) + ostream.printString ( "vm_page_free_reserved: {0}".format(findGlobalValue("vm_page_free_reserved")) ) +# EndMacro: memstats + + +# Macro: zprint +def zprint_command(debugger,args,result,lldb_dict): + stream = Output() + stream.setOptions(args.split()) + _zprint(stream) + stream.done() + +def _zprint(ostream): + """Display info about memory zones""" + ostream.printHeader ( "{0: ^20s} {1: >5s} {2: >12s} {3: >12s} {4: >7s} {5: >8s} {6: >9s} {7: >8s} {8: <20s} {9} ".format('ZONE', 'COUNT', 'TOT_SZ', 'MAX_SZ', 'ELT_SZ', 'ALLOC_SZ', 'TOT_ALLOC', 'TOT_FREE', 'NAME','') ) + format_string = '{0: >#020x} {1: >5d} {2: >12d} {3: >12d} {4: >7d} {5: >8d} {6: >9d} {7: >8d} {8: <20s} {9}' + zone_ptr = findGlobal("first_zone"); + + while zone_ptr.GetValueAsUnsigned() != 0 : + addr = zone_ptr.GetValueAsUnsigned() + count = readMemberUnsigned(zone_ptr, "count") + cur_size = readMemberUnsigned(zone_ptr, "cur_size") + max_size = readMemberUnsigned(zone_ptr, "max_size") + elem_size = readMemberUnsigned(zone_ptr, "elem_size") + alloc_size = readMemberUnsigned(zone_ptr, "alloc_size") + num_allocs = readMemberUnsigned(zone_ptr, "num_allocs") + num_frees = readMemberUnsigned(zone_ptr, "num_frees") + name = str(readMemberString(zone_ptr, "zone_name")) + markings="" + if str(zone_ptr.GetChildMemberWithName("exhaustible").GetValue()) == '1' : markings+="H" + if str(zone_ptr.GetChildMemberWithName("collectable").GetValue()) == '1' : markings+="C" + if str(zone_ptr.GetChildMemberWithName("expandable").GetValue()) == '1' : markings+="X" + if str(zone_ptr.GetChildMemberWithName("noencrypt").GetValue()) == '1' : markings+="$" + + ostream.printString(format_string.format(addr, count, cur_size, max_size, elem_size, alloc_size, num_allocs, num_frees, name, markings)) + + zone_ptr = zone_ptr.GetChildMemberWithName("next_zone") + return None +# EndMacro: zprint + + +# Macro: showioalloc +def showioalloc_command(debugger,args,result,lldb_dict): + stream = Output() + stream.setOptions(args.split()) + _showioalloc(stream) + stream.done() + +def _showioalloc(ostream): + ivars_size = findGlobal("debug_ivars_size").GetValueAsUnsigned() + container_malloc_size = findGlobal("debug_container_malloc_size").GetValueAsUnsigned() + iomalloc_size = findGlobal("debug_iomalloc_size").GetValueAsUnsigned() + iomallocpageable_size = findGlobal("debug_iomallocpageable_size").GetValueAsUnsigned() + + ostream.printString("Instance allocation = {0:#0x} = {1:d} K".format(ivars_size, (int)(ivars_size/1024))) + ostream.printString("Container allocation = {0:#0x} = {1:d} K".format(container_malloc_size,(int)(container_malloc_size/1024))) + ostream.printString("IOMalloc allocation = {0:#0x} = {1:d} K".format(iomalloc_size,(int)(iomalloc_size/1024))) + ostream.printString("Pageable allocation = {0:#0x} = {1:d} K".format(iomallocpageable_size,(int)(iomallocpageable_size/1024))) + return None +# EndMacro: showioalloc + + diff --git a/makedefs/MakeInc.cmd b/makedefs/MakeInc.cmd index 5fea21d30..12f5203b1 100644 --- a/makedefs/MakeInc.cmd +++ b/makedefs/MakeInc.cmd @@ -25,6 +25,19 @@ endif SDKROOT ?= / HOST_SDKROOT ?= / +HOST_SPARSE_SDKROOT ?= / + +# SDKROOT may be passed as a shorthand like 
"iphoneos.internal". We +# must resolve these to a full path and override SDKROOT. + +ifeq ($(SDKROOT_RESOLVED),) +ifeq ($(SDKROOT),/) +export SDKROOT_RESOLVED := / +else +export SDKROOT_RESOLVED := $(shell xcodebuild -sdk $(SDKROOT) -version Path | head -1) +endif +endif +override SDKROOT = $(SDKROOT_RESOLVED) ifeq ($(PLATFORM),) export PLATFORM := $(shell xcodebuild -sdk $(SDKROOT) -version PlatformPath | head -1 | sed 's,^.*/\([^/]*\)\.platform$$,\1,') @@ -33,20 +46,25 @@ ifeq ($(PLATFORM),) endif endif +ifeq ($(PLATFORM),iPhoneOS) + DEVELOPER_DIR ?= $(shell xcode-select -print-path) + export HOST_SPARSE_SDKROOT := $(DEVELOPER_DIR)/SDKs/iPhoneHostSideTools.sparse.sdk +endif + # CC/CXX get defined by make(1) by default, so we can't check them # against the empty string to see if they haven't been set ifeq ($(origin CC),default) ifneq ($(findstring iPhone,$(PLATFORM)),) - export CC := $(shell $(XCRUN) -sdk $(SDKROOT) -find gcc-4.2) + export CC := $(shell $(XCRUN) -sdk $(SDKROOT) -find clang) else - export CC := $(shell $(XCRUN) -sdk $(SDKROOT) -find cc) + export CC := $(shell $(XCRUN) -sdk $(SDKROOT) -find clang) endif endif ifeq ($(origin CXX),default) ifneq ($(findstring iPhone,$(PLATFORM)),) - export CXX := $(shell $(XCRUN) -sdk $(SDKROOT) -find g++-4.2) + export CXX := $(shell $(XCRUN) -sdk $(SDKROOT) -find clang++) else - export CXX := $(shell $(XCRUN) -sdk $(SDKROOT) -find c++) + export CXX := $(shell $(XCRUN) -sdk $(SDKROOT) -find clang++) endif endif ifeq ($(MIG),) @@ -55,9 +73,6 @@ endif ifeq ($(MIGCC),) export MIGCC := $(CC) endif -ifeq ($(RELPATH),) - export RELPATH := $(shell $(XCRUN) -sdk $(SDKROOT) -find relpath) -endif ifeq ($(STRIP),) export STRIP := $(shell $(XCRUN) -sdk $(SDKROOT) -find strip) endif @@ -73,9 +88,6 @@ endif ifeq ($(UNIFDEF),) export UNIFDEF := $(shell $(XCRUN) -sdk $(SDKROOT) -find unifdef) endif -ifeq ($(DECOMMENT),) - export DECOMMENT := $(shell $(XCRUN) -sdk $(SDKROOT) -find decomment) -endif ifeq ($(DSYMUTIL),) export DSYMUTIL := $(shell $(XCRUN) -sdk $(SDKROOT) -find dsymutil) endif @@ -94,6 +106,9 @@ endif # Platform-specific tools ifneq ($(findstring iPhone,$(PRODUCT)),) +ifeq ($(EMBEDDED_DEVICE_MAP),) + export EMBEDDED_DEVICE_MAP := $(shell $(XCRUN) -sdk $(SDKROOT) -find embedded_device_map) +endif ifeq ($(IPHONEOS_OPTIMIZE),) export IPHONEOS_OPTIMIZE := $(shell $(XCRUN) -sdk $(SDKROOT) -find iphoneos-optimize) endif @@ -102,10 +117,11 @@ endif # Scripts or tools we build ourselves SEG_HACK := $(OBJROOT)/SETUP/setsegname/setsegname KEXT_CREATE_SYMBOL_SET := $(OBJROOT)/SETUP/kextsymboltool/kextsymboltool +DECOMMENT := $(OBJROOT)/SETUP/decomment/decomment NEWVERS = $(SRCROOT)/config/newvers.pl +MD := $(OBJROOT)/SETUP/md/md # Standard BSD tools -MD = /usr/bin/md RM = /bin/rm -f CP = /bin/cp MV = /bin/mv @@ -113,6 +129,7 @@ LN = /bin/ln -fs CAT = /bin/cat MKDIR = /bin/mkdir -p FIND = /usr/bin/find +XARGS = /usr/bin/xargs INSTALL = /usr/bin/install TAR = /usr/bin/gnutar BASENAME = /usr/bin/basename @@ -120,6 +137,9 @@ TR = /usr/bin/tr # Platform-specific tools ifeq (iPhoneOS,$(PLATFORM)) +ifeq ($(EMBEDDED_DEVICE_MAP),) + export EMBEDDED_DEVICE_MAP := $(shell $(XCRUN) -sdk $(SDKROOT) -find embedded_device_map || echo /usr/bin/true) +endif ifeq ($(IPHONEOS_OPTIMIZE),) export IPHONEOS_OPTIMIZE := $(shell $(XCRUN) -sdk $(SDKROOT) -find iphoneos-optimize || echo /usr/bin/true) endif @@ -144,13 +164,4 @@ ifeq ($(HOST_CODESIGN),) export HOST_CODESIGN := $(shell $(XCRUN) -sdk $(HOST_SDKROOT) -find codesign) endif -# -# Command to build libkmod.a/libkmodc++.a, 
which are -# linked into kext binaries, and should be built as if -# they followed system-wide policies -# -ifeq ($(LIBKMOD_CC),) - export LIBKMOD_CC := $(shell $(XCRUN) -sdk $(SDKROOT) -find cc) -endif - # vim: set ft=make: diff --git a/makedefs/MakeInc.def b/makedefs/MakeInc.def index 92d80379f..4e49fe6a7 100644 --- a/makedefs/MakeInc.def +++ b/makedefs/MakeInc.def @@ -1,4 +1,5 @@ export SOURCE=$(shell /bin/pwd) +export RELATIVE_SOURCE_PATH ?= . # # gnumake 3.77 support @@ -25,11 +26,11 @@ export COMPONENT_LIST = osfmk bsd libkern iokit pexpert libsa security export COMPONENT_LIST_UC := $(shell printf "%s" "$(COMPONENT_LIST)" | $(TR) a-z A-Z) endif ifndef COMPONENT -export COMPONENT := $(firstword $(subst /, ,$(shell $(RELPATH) $(SRCROOT) $(SOURCE)))) +export COMPONENT := $(firstword $(subst /, ,$(RELATIVE_SOURCE_PATH))) export COMPONENT_IMPORT_LIST := $(filter-out $(COMPONENT),$(COMPONENT_LIST)) else ifeq ($(COMPONENT), .) -export COMPONENT := $(firstword $(subst /, ,$(shell $(RELPATH) $(SRCROOT) $(SOURCE)))) +export COMPONENT := $(if $(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(firstword $(subst /, ,$(RELATIVE_SOURCE_PATH)))) export COMPONENT_IMPORT_LIST := $(filter-out $(COMPONENT),$(COMPONENT_LIST)) endif endif @@ -42,9 +43,13 @@ endif ifndef ARCH_CONFIGS ifdef RC_ARCHS -export ARCH_CONFIGS := $(shell printf "%s" "$(RC_ARCHS)" | $(TR) a-z A-Z | sed -e 's/ARMV./ARM/g') +export ARCH_CONFIGS := $(shell printf "%s" "$(RC_ARCHS)" | $(TR) a-z A-Z | sed -E 's/ARMV[0-9][A-Z]?/ARM/g') else -export ARCH_CONFIGS := $(shell arch | $(TR) a-z A-Z | sed -e 's/ARMV./ARM/g') +ifeq ($(PLATFORM),iPhoneOS) + export ARCH_CONFIGS := ARM +else + export ARCH_CONFIGS := $(shell arch | $(TR) a-z A-Z | sed -E 's/ARMV[0-9][A-Z]?/ARM/g') +endif endif endif ifdef ARCH_CONFIG @@ -104,10 +109,12 @@ export MACHINE_CONFIG = DEFAULT endif +# +# Machine Configuration options +# +export SUPPORTED_I386_MACHINE_CONFIGS := DEFAULT +export SUPPORTED_X86_64_MACHINE_CONFIGS := DEFAULT -ifndef SUPPORTED_MACHINE_CONFIGS -export SUPPORTED_MACHINE_CONFIGS = DEFAULT -endif # # Target configuration options. NOTE - target configurations will @@ -117,10 +124,10 @@ endif # kernel configuration, the second is the architecture configuration, # and the third is the machine configuration. You may pass in as # many groups of configurations as you wish. Each item passed in is -# seperated by whitespace. +# separated by whitespace. # # Example: -# TARGET_CONFIGS="release x86_64 default debug i386 default release arm MX31ADS" +# TARGET_CONFIGS="release x86_64 default debug i386 default release arm S5l8920X" # Parameters may be in upper or lower case (they are converted to upper). # # "default" parameter is a special case. 
It means use the default value for @@ -138,9 +145,17 @@ ifdef TARGET_CONFIGS export MACHINE_CONFIG = $(word 3, $(TARGET_CONFIGS_UC)) export DEFAULT_KERNEL_CONFIG = $(word 1, $(TARGET_CONFIGS_UC)) else - # generate TARGET_CONFIGS using KERNEL_CONFIGS and ARCH_CONFIGS and MACHINE_CONFIG (which defaults to "DEFAULT") - temp_list = $(foreach my_kern_config, $(KERNEL_CONFIGS), $(my_kern_config) arch_slot $(MACHINE_CONFIG)) - export TARGET_CONFIGS = $(strip $(foreach my_arch_config, $(ARCH_CONFIGS), $(subst arch_slot,$(my_arch_config),$(temp_list)))) + + ifneq ($(filter %_embedded,$(MAKECMDGOALS)),) +# generate set of standard embedded configs + export TARGET_CONFIGS = $(TARGET_CONFIGS_EMBEDDED) + else ifneq ($(filter %_devicemap,$(MAKECMDGOALS)),) + DEVICEMAP_PLATFORMS = $(shell $(EMBEDDED_DEVICE_MAP) -query SELECT DISTINCT Platform FROM Targets | $(TR) [:lower:] [:upper:]) + export TARGET_CONFIGS = $(foreach my_kernel_config,$(KERNEL_CONFIGS_EMBEDDED),$(foreach my_arch,$(ARCH_CONFIGS),$(foreach my_machine_config,$(filter $(DEVICEMAP_PLATFORMS),$(SUPPORTED_$(my_arch)_MACHINE_CONFIGS)),$(my_kernel_config) $(my_arch) $(my_machine_config) ))) + else +# generate TARGET_CONFIGS using KERNEL_CONFIGS and ARCH_CONFIGS and MACHINE_CONFIG (which defaults to "DEFAULT") + export TARGET_CONFIGS = $(strip $(foreach my_arch_config, $(ARCH_CONFIGS), $(foreach my_kern_config, $(KERNEL_CONFIGS), $(my_kern_config) $(my_arch_config) $(MACHINE_CONFIG)))) + endif export TARGET_CONFIGS_UC := $(shell printf "%s" "$(TARGET_CONFIGS)" | $(TR) a-z A-Z) export MACHINE_CONFIG = $(word 3, $(TARGET_CONFIGS_UC)) export DEFAULT_KERNEL_CONFIG = $(word 1, $(TARGET_CONFIGS_UC)) @@ -166,10 +181,12 @@ endif endif ifneq ($(MACHINE_CONFIG),) -ifeq ($(filter $(MACHINE_CONFIG),$(SUPPORTED_MACHINE_CONFIGS)),) +ifneq ($(ARCH_CONFIG),) +ifeq ($(filter $(MACHINE_CONFIG),$(SUPPORTED_$(ARCH_CONFIG)_MACHINE_CONFIGS)),) $(error Unsupported MACHINE_CONFIG $(MACHINE_CONFIG)) endif endif +endif ifneq ($(PLATFORM),) ifeq ($(filter $(PLATFORM),$(SUPPORTED_PLATFORMS)),) @@ -180,7 +197,7 @@ endif # # Kernel Configuration to install # -# supported install architecture : I386 X86_64 ARM +# supported install architecture : I386 X86_64 # export INSTALL_TYPE = $(DEFAULT_KERNEL_CONFIG) @@ -194,6 +211,22 @@ ifeq ($(INSTALL_ARCH_DEFAULT),) $(error Could not determine INSTALL_ARCH_DEFAULT) endif +# +# Deployment target flag +# +ifndef DEPLOYMENT_TARGET_FLAGS +SDKVERSION=$(shell xcodebuild -sdk $(SDKROOT) -version SDKVersion | head -1) +ifeq ($(PLATFORM),MacOSX) + export DEPLOYMENT_TARGET_FLAGS := -mmacosx-version-min=$(SDKVERSION) +else ifeq ($(PLATFORM),iPhoneOS) + export DEPLOYMENT_TARGET_FLAGS := -miphoneos-version-min=$(SDKVERSION) +else ifeq ($(PLATFORM),iPhoneSimulator) + export DEPLOYMENT_TARGET_FLAGS := +else + export DEPLOYMENT_TARGET_FLAGS := +endif +endif + # # Standard defines list # @@ -231,8 +264,8 @@ CXXWARNFLAGS_STD = \ -Wcast-qual -Wwrite-strings -Wswitch -Wcast-align -Wchar-subscripts \ -Wredundant-decls -Wextra-tokens -# Certain warnings are non-fatal (8474835) -CXXWARNFLAGS_STD += -Wno-error=cast-align +# Certain warnings are non-fatal (8474835, 9000888) +CXXWARNFLAGS_STD += -Wno-error=cast-align -Wno-error=overloaded-virtual # Can be overridden in Makefile.template or Makefile.$arch export CXXWARNFLAGS ?= $(CXXWARNFLAGS_STD) @@ -253,11 +286,6 @@ endif ARCH_FLAGS_I386 = -arch i386 ARCH_FLAGS_X86_64 = -arch x86_64 -ARCH_FLAGS_ARM = $($(addsuffix $(MACHINE_CONFIG),ARCH_FLAGS_ARM_)) - -ARCH_FLAGS_ALL_I386 = $(ARCH_FLAGS_I386) 
-ARCH_FLAGS_ALL_X86_64 = $(ARCH_FLAGS_X86_64) -ARCH_FLAGS_ALL_ARM = -arch arm # @@ -274,10 +302,9 @@ export DSYMBUILDDIR = ./Contents/Resources/DWARF/ # We must not use -fno-keep-inline-functions, or it will remove the dtrace # probes from the kernel. # -export CFLAGS_GEN = -static $(DEBUG_CFLAGS) -nostdinc \ - -freorder-blocks \ - -fno-builtin -fno-common -msoft-float \ - -fsigned-bitfields -fno-stack-protector $(OTHER_CFLAGS) +export CFLAGS_GEN = $(DEBUG_CFLAGS) -nostdinc \ + -freorder-blocks -fno-builtin -fno-common \ + -fsigned-bitfields $(OTHER_CFLAGS) ifeq ($(BUILD_STABS),1) export CFLAGS_GEN += -gstabs+ @@ -291,15 +318,15 @@ endif export CFLAGS_RELEASE = export CFLAGS_DEVELOPMENT = -export CFLAGS_DEBUG = -export CFLAGS_PROFILE = -pg +export CFLAGS_DEBUG = -fstack-protector-all +export CFLAGS_PROFILE = -pg -export CFLAGS_I386 = -Di386 -DI386 -D__I386__ \ - -DPAGE_SIZE_FIXED +export CFLAGS_I386 = -static -Di386 -DI386 -D__I386__ \ + -DPAGE_SIZE_FIXED -msoft-float \ + -integrated-as export CFLAGS_X86_64 = -Dx86_64 -DX86_64 -D__X86_64__ -DLP64 \ - -DPAGE_SIZE_FIXED -mkernel -export CFLAGS_ARM = -Darm -DARM -D__ARM__ -DPAGE_SIZE_FIXED \ - -fno-strict-aliasing -fno-keep-inline-functions + -DPAGE_SIZE_FIXED -mkernel -msoft-float \ + -integrated-as ifeq (-arch armv7,$(ARCH_FLAGS_ARM)) @@ -308,12 +335,7 @@ endif ifeq (-arch armv6,$(ARCH_FLAGS_ARM)) CFLAGS_ARM += -mthumb endif -ifeq (-arch armv5,$(ARCH_FLAGS_ARM)) -CFLAGS_ARM += -mno-thumb -endif -ifeq (-arch xscale,$(ARCH_FLAGS_ARM)) -CFLAGS_ARM += -mthumb -endif + export CFLAGS_RELEASEI386 = -O2 export CFLAGS_DEVELOPMENTI386 = -O2 @@ -328,7 +350,7 @@ export CFLAGS_PROFILEX86_64 = -O2 export CFLAGS_RELEASEARM = -O2 export CFLAGS_DEVELOPMENTARM = -O2 -export CFLAGS_DEBUGARM = -O2 +export CFLAGS_DEBUGARM = -O0 export CFLAGS_PROFILEARM = -O2 export CFLAGS = $(CFLAGS_GEN) \ @@ -337,6 +359,7 @@ export CFLAGS = $(CFLAGS_GEN) \ $($(addsuffix $(ARCH_CONFIG),CFLAGS_)) \ $($(addsuffix $(KERNEL_CONFIG),CFLAGS_)) \ $($(addsuffix $(ARCH_CONFIG), $(addsuffix $(KERNEL_CONFIG),CFLAGS_))) \ + $(DEPLOYMENT_TARGET_FLAGS) \ $(DEFINES) # @@ -344,30 +367,13 @@ export CFLAGS = $(CFLAGS_GEN) \ # OTHER_CXXFLAGS = - -CXXFLAGS_GEN = -fno-rtti -fno-exceptions -fcheck-new -fapple-kext \ - $(OTHER_CXXFLAGS) + +CXXFLAGS_GEN = -fapple-kext $(OTHER_CXXFLAGS) CXXFLAGS = $(CXXFLAGS_GEN) \ $($(addsuffix $(ARCH_CONFIG),CXXFLAGS_)) \ $($(addsuffix $(KERNEL_CONFIG),CXXFLAGS_)) - -# -# Support for LLVM Link Time Optimization (LTO) -# - -ifeq ($(BUILD_LTO),1) -export CFLAGS_GEN += -flto -export CXXFLAGS_GEN += -flto -export BUILD_MACHO_OBJ = 0 -export BUILD_LTO = 1 -else -export BUILD_MACHO_OBJ = 1 -export BUILD_LTO = 0 -endif - - # # Assembler command # @@ -377,7 +383,7 @@ S_KCC = $(CC) # # Default SFLAGS # -export SFLAGS_GEN = -static -D__ASSEMBLER__ $(OTHER_CFLAGS) +export SFLAGS_GEN = -D__ASSEMBLER__ $(OTHER_CFLAGS) export SFLAGS_RELEASE = export SFLAGS_DEVELOPMENT = @@ -385,16 +391,18 @@ export SFLAGS_DEBUG = export SFLAGS_PROFILE = export SFLAGS_I386 = $(CFLAGS_I386) -export SFLAGS_ARM = $(CFLAGS_ARM) export SFLAGS_X86_64 = $(CFLAGS_X86_64) + export SFLAGS = $(SFLAGS_GEN) \ $($(addsuffix $(MACHINE_CONFIG),MACHINE_FLAGS_)) \ $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) \ $($(addsuffix $(ARCH_CONFIG),SFLAGS_)) \ $($(addsuffix $(KERNEL_CONFIG),SFLAGS_)) \ + $(DEPLOYMENT_TARGET_FLAGS) \ $(DEFINES) + # # Linker command # @@ -403,19 +411,22 @@ LD = $(KC++) -nostdlib # # Default LDFLAGS # - export LDFLAGS_KERNEL_GEN = \ - -static \ -nostdlib \ -fapple-kext \ -Wl,-e,__start \ 
-Wl,-sectalign,__TEXT,__text,0x1000 \ + -Wl,-sectalign,__TEXT,initcode,0x1000 \ -Wl,-sectalign,__DATA,__common,0x1000 \ -Wl,-sectalign,__DATA,__bss,0x1000 \ -Wl,-sectcreate,__PRELINK_TEXT,__text,/dev/null \ -Wl,-sectcreate,__PRELINK_STATE,__kernel,/dev/null \ -Wl,-sectcreate,__PRELINK_STATE,__kexts,/dev/null \ - -Wl,-sectcreate,__PRELINK_INFO,__info,/dev/null + -Wl,-sectcreate,__PRELINK_INFO,__info,/dev/null \ + -Wl,-new_linker \ + -Wl,-pagezero_size,0x0 \ + -Wl,-version_load_command \ + -Wl,-function_starts # Availability of DWARF allows DTrace CTF (compressed type format) to be constructed. # ctf_insert creates the CTF section. It needs reserved padding in the @@ -431,8 +442,6 @@ export LDFLAGS_KERNEL_DEBUG = export LDFLAGS_KERNEL_PROFILE = export LDFLAGS_KERNEL_RELEASEI386 = \ - -Wl,-new_linker \ - -Wl,-pagezero_size,0x0 \ -Wl,-segaddr,__INITPT,0x00100000 \ -Wl,-segaddr,__INITGDT,0x00106000 \ -Wl,-segaddr,__SLEEP,0x00107000 \ @@ -444,34 +453,58 @@ export LDFLAGS_KERNEL_DEBUGI386 = $(LDFLAGS_KERNEL_RELEASEI386) export LDFLAGS_KERNEL_DEVELOPMENTI386 = $(LDFLAGS_KERNEL_RELEASEI386) export LDFLAGS_KERNEL_PROFILEI386 = $(LDFLAGS_KERNEL_RELEASEI386) -# Keep these constants in sync with the *_SEG_BASE definitions in i386/pmap.h -export LDFLAGS_KERNEL_RELEASEX86_64 = \ - -Wl,-new_linker \ - -Wl,-pagezero_size,0x0 \ - -Wl,-segaddr,__INITPT,0xffffff8000100000 \ - -Wl,-segaddr,__INITGDT,0xffffff8000106000 \ - -Wl,-segaddr,__SLEEP,0xffffff8000107000 \ - -Wl,-segaddr,__HIB,0xffffff8000108000 \ - -Wl,-image_base,0xffffff8000200000 \ - -Wl,-seg_page_size,__TEXT,0x200000 +# KASLR static slide config: +ifndef SLIDE +SLIDE=0x00 +endif +KERNEL_MIN_ADDRESS := 0xffffff8000000000 +KERNEL_BASE_OFFSET := 0x100000 +KERNEL_STATIC_SLIDE := $(shell printf "0x%016x" \ + $$[ $(SLIDE) << 21 ]) +KERNEL_STATIC_BASE := $(shell printf "0x%016x" \ + $$[ $(KERNEL_MIN_ADDRESS) + $(KERNEL_BASE_OFFSET) ]) +KERNEL_HIB_SECTION_BASE := $(shell printf "0x%016x" \ + $$[ $(KERNEL_STATIC_BASE) + $(KERNEL_STATIC_SLIDE) ]) +KERNEL_TEXT_BASE := $(shell printf "0x%016x" \ + $$[ $(KERNEL_HIB_SECTION_BASE) + 0x100000 ]) + +export LDFLAGS_KERNEL_RELEASEX86_64 = \ + -Wl,-pie \ + -Wl,-segaddr,__HIB,$(KERNEL_HIB_SECTION_BASE) \ + -Wl,-image_base,$(KERNEL_TEXT_BASE) \ + -Wl,-seg_page_size,__TEXT,0x200000 \ + -Wl,-sectalign,__DATA,__const,0x1000 \ + -Wl,-sectalign,__DATA,__sysctl_set,0x1000 \ + -Wl,-sectalign,__HIB,__bootPT,0x1000 \ + -Wl,-sectalign,__HIB,__desc,0x1000 \ + -Wl,-sectalign,__HIB,__data,0x1000 \ + -Wl,-sectalign,__HIB,__text,0x1000 \ + -Wl,-sectalign,__HIB,__const,0x1000 \ + -Wl,-sectalign,__HIB,__bss,0x1000 \ + -Wl,-sectalign,__HIB,__common,0x1000 \ + +# Define KERNEL_BASE_OFFSET so known at compile time: +export CFLAGS_X86_64 += -DKERNEL_BASE_OFFSET=$(KERNEL_BASE_OFFSET) export LDFLAGS_KERNEL_DEBUGX86_64 = $(LDFLAGS_KERNEL_RELEASEX86_64) export LDFLAGS_KERNEL_DEVELOPMENTX86_64 = $(LDFLAGS_KERNEL_RELEASEX86_64) export LDFLAGS_KERNEL_PROFILEX86_64 = $(LDFLAGS_KERNEL_RELEASEX86_64) export LDFLAGS_KERNEL_RELEASEARM = \ - -Wl,-new_linker \ - -Wl,-pagezero_size,0x0 \ + -Wl,-pie \ + -Wl,-static \ -Wl,-image_base,0x80001000 \ -Wl,-exported_symbols_list,$(TARGET)/kernel-kpi.exp export LDFLAGS_KERNEL_DEVELOPMENTARM = \ - -Wl,-new_linker \ - -Wl,-pagezero_size,0x0 \ + -Wl,-pie \ + -Wl,-static \ -Wl,-image_base,0x80001000 export LDFLAGS_KERNEL_DEBUGARM = $(LDFLAGS_KERNEL_DEVELOPMENTARM) +# Offset image base by page to have iBoot load kernel TEXT correctly. +# First page is used for various purposes : sleep token, reset vector. 
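The KASLR block in the hunk above derives the kernel's fixed link addresses from a boot-set SLIDE index. A minimal shell sketch of the same arithmetic, useful for sanity-checking a chosen slide (the SLIDE value here is illustrative, and POSIX $((...)) stands in for the older $[...] form the makefile uses; both assume 64-bit shell arithmetic):

    SLIDE=0x10                                    # example slide index
    KERNEL_MIN_ADDRESS=0xffffff8000000000
    KERNEL_BASE_OFFSET=0x100000
    slide=$(( SLIDE << 21 ))                      # 0x10 << 21 = 0x02000000 (32 MB)
    static_base=$(( KERNEL_MIN_ADDRESS + KERNEL_BASE_OFFSET ))
    hib_base=$(( static_base + slide ))           # goes to -Wl,-segaddr,__HIB
    text_base=$(( hib_base + 0x100000 ))          # goes to -Wl,-image_base
    printf '__HIB  at 0x%016x\n' "$hib_base"      # 0xffffff8002100000
    printf '__TEXT at 0x%016x\n' "$text_base"     # 0xffffff8002200000

Each SLIDE increment moves the image by 2 MB (1 << 21), which lines up with the -Wl,-seg_page_size,__TEXT,0x200000 alignment the same hunk passes to the linker.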
export LDFLAGS_KERNEL = $(LDFLAGS_KERNEL_GEN) \ $($(addsuffix $(MACHINE_CONFIG),MACHINE_FLAGS_)) \ @@ -479,7 +512,7 @@ export LDFLAGS_KERNEL = $(LDFLAGS_KERNEL_GEN) \ $($(addsuffix $(ARCH_CONFIG),LDFLAGS_KERNEL_)) \ $($(addsuffix $(KERNEL_CONFIG),LDFLAGS_KERNEL_)) \ $($(addsuffix $(ARCH_CONFIG), $(addsuffix $(KERNEL_CONFIG),LDFLAGS_KERNEL_))) \ - + $(DEPLOYMENT_TARGET_FLAGS) # # Default runtime libraries to be linked with the kernel @@ -501,7 +534,40 @@ export INCFLAGS = $(INCFLAGS_LOCAL) $(INCFLAGS_GEN) $(INCFLAGS_IMPORT) $(INCFLA # # Default MIGFLAGS # -export MIGFLAGS = $(DEFINES) $(INCFLAGS) $($(addsuffix $(ARCH_CONFIG),CFLAGS_)) $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) +export MIGFLAGS = $(DEFINES) $(INCFLAGS) $($(addsuffix $(ARCH_CONFIG),CFLAGS_)) \ + $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) \ + $(DEPLOYMENT_TARGET_FLAGS) + +# +# Support for LLVM Link Time Optimization (LTO) +# + +ifeq ($(BUILD_LTO),1) +export CFLAGS_GEN += -flto +export CXXFLAGS_GEN += -flto +export LDFLAGS_KERNEL_GEN += -Wl,-object_path_lto,$(TARGET)/lto.o +export CFLAGS_NOLTO_FLAG = -fno-lto +export BUILD_MACHO_OBJ = 0 +export BUILD_LTO = 1 +else +export CFLAGS_NOLTO_FLAG = +export BUILD_MACHO_OBJ = 1 +export BUILD_LTO = 0 +endif + +# +# Support for LLVM Integrated Assembler with clang driver +# +ifeq ($(BUILD_INTEGRATED_ASSEMBLER),1) +export SFLAGS_GEN += -integrated-as +export CFLAGS_GEN += -integrated-as +export CXXFLAGS_GEN += -integrated-as +export SFLAGS_NOINTEGRATEDAS_FLAGS = -no-integrated-as +export CFLAGS_NOINTEGRATEDAS_FLAGS = -no-integrated-as +else +export SFLAGS_NOINTEGRATEDAS_FLAGS = +export CFLAGS_NOINTEGRATEDAS_FLAGS = +endif # # Default VPATH @@ -564,7 +630,7 @@ KINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) -UKERNEL_PRIVATE # -# Compononent Header file destinations +# Component Header file destinations # EXPDIR = EXPORT_HDRS/$(COMPONENT) @@ -583,7 +649,6 @@ export STRIP_FLAGS = $($(addsuffix $(KERNEL_CONFIG),STRIP_FLAGS_)) # export DSYMUTIL_FLAGS_I386 = --arch=i386 export DSYMUTIL_FLAGS_X86_64 = --arch=x86_64 -export DSYMUTIL_FLAGS_ARM = --arch=arm export DSYMUTIL_FLAGS = $($(addsuffix $(ARCH_CONFIG),DSYMUTIL_FLAGS_)) diff --git a/makedefs/MakeInc.dir b/makedefs/MakeInc.dir index b4b594cd6..12191a3c2 100644 --- a/makedefs/MakeInc.dir +++ b/makedefs/MakeInc.dir @@ -3,7 +3,7 @@ # .PHONY: installhdrs -ifeq ($(RC_ProjectName),Libsyscall) +ifeq ($(findstring Libsyscall,$(RC_ProjectName)),Libsyscall) installhdrs: cd libsyscall ; \ sdk="$(SDKROOT)" ; \ @@ -19,8 +19,11 @@ installhdrs: else ifeq ($(findstring libkxld,$(RC_ProjectName)),libkxld) installhdrs: make -C libkern/kxld/ installhdrs +else ifeq ($(RC_ProjectName),libkmod) +installhdrs: +# nothing to do else # xnu, xnu_debug, or xnu_headers_Sim -installhdrs: exporthdrs installhdrs_mi installhdrs_md +installhdrs: exporthdrs installhdrs_mi installhdrs_md setup @echo "[ $(SRCROOT) ] make installhdrs installing Kernel.framework" $(_v)kincpath=$(DSTROOT)/$(KINCDIR); \ krespath=$(DSTROOT)/$(KRESDIR); \ @@ -53,6 +56,9 @@ ifeq (iPhoneOS,$(PLATFORM)) endif endif +.PHONY: installhdrs_embedded installhdrs_devicemap +installhdrs_embedded installhdrs_devicemap: installhdrs + # # Install header files order # @@ -62,9 +68,8 @@ endif # # Install machine independent header files # -installhdrs_mi: - $(_v)rel_path=$(shell $(RELPATH) $(SRCROOT) $(SOURCE)); \ - kernel_config=$(INSTALL_TYPE); \ +installhdrs_mi: setup + $(_v)kernel_config=$(INSTALL_TYPE); \ machine_config=$(MACHINE_CONFIG); \ arch_config=$(INSTALL_ARCH_DEFAULT); \ if [ 
$${arch_config} = ARM ] ; then \ @@ -72,27 +77,34 @@ installhdrs_mi: machine_config=$(DEFAULT_ARM_MACHINE_CONFIG); \ fi; \ fi; \ + if [ $${arch_config} = L4_ARM ] ; then \ + if [ $${machine_config} = DEFAULT ] ; then \ + machine_config=$(DEFAULT_L4_ARM_MACHINE_CONFIG); \ + fi; \ + fi; \ if [ $${machine_config} = DEFAULT ] ; then \ - installinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}/$${rel_path}; \ + installinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}/${RELATIVE_SOURCE_PATH}; \ else \ - installinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/$${rel_path}; \ + installinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/${RELATIVE_SOURCE_PATH}; \ fi; \ [ -d $${installinc_dir} ] || $(MKDIR) $${installinc_dir}; \ ${MAKE} ${MAKEJOBS} -C $${installinc_dir} \ KERNEL_CONFIG=$${kernel_config} \ ARCH_CONFIG=$${arch_config} \ + MACHINE_CONFIG=$${machine_config} \ MAKEFILES=${SOURCE}/Makefile \ SOURCE=${SOURCE}/ \ + RELATIVE_SOURCE_PATH=. \ TARGET=$${installinc_dir}/ \ build_installhdrs_mi; \ # # Install machine dependent kernel header files +# Uses hack for machine_config, which is not threaded through properly. # -installhdrs_md: - $(_v)rel_path=$(shell $(RELPATH) $(SRCROOT) $(SOURCE)); \ - kernel_config=$(INSTALL_TYPE); \ - machine_config=$(MACHINE_CONFIG); \ +installhdrs_md: setup + $(_v)kernel_config=$(INSTALL_TYPE); \ + machine_config=$(MACHINE_CONFIG); \ for arch_config in $(INSTALL_ARCHS); \ do \ if [ $${arch_config} = ARM ] ; then \ @@ -100,17 +112,24 @@ installhdrs_md: machine_config=$(DEFAULT_ARM_MACHINE_CONFIG); \ fi; \ fi; \ + if [ $${arch_config} = L4_ARM ] ; then \ + if [ $${machine_config} = DEFAULT ] ; then \ + machine_config=$(DEFAULT_L4_ARM_MACHINE_CONFIG); \ + fi; \ + fi; \ if [ $${machine_config} = DEFAULT ] ; then \ - installinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}/$${rel_path}; \ + installinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}/${RELATIVE_SOURCE_PATH}; \ else \ - installinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/$${rel_path}; \ + installinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/${RELATIVE_SOURCE_PATH}; \ fi; \ [ -d $${installinc_dir} ] || $(MKDIR) $${installinc_dir}; \ ${MAKE} ${MAKEJOBS} -C $${installinc_dir} \ KERNEL_CONFIG=$${kernel_config} \ ARCH_CONFIG=$${arch_config} \ + MACHINE_CONFIG=$${machine_config} \ MAKEFILES=${SOURCE}/Makefile \ SOURCE=${SOURCE}/ \ + RELATIVE_SOURCE_PATH=. 
\ TARGET=$${installinc_dir}/ \ build_installhdrs_md; \ done; @@ -130,6 +149,7 @@ $(BUILD_INSTALLHDRS_MI_SUBDIRS_TARGETS): ${MAKE} -C $${installinc_subdir} \ MAKEFILES=$(SOURCE)$${installinc_subdir}/Makefile \ SOURCE=$(SOURCE)$${installinc_subdir}/ \ + RELATIVE_SOURCE_PATH=$(RELATIVE_SOURCE_PATH)/$${installinc_subdir} \ TARGET=$(TARGET)$${installinc_subdir}/ \ build_installhdrs_mi; @@ -151,6 +171,7 @@ $(BUILD_INSTALLHDRS_MD_SUBDIRS_TARGETS): ${MAKE} -C $${installinc_subdir} \ MAKEFILES=$(SOURCE)$${installinc_subdir}/Makefile \ SOURCE=$(SOURCE)$${installinc_subdir}/ \ + RELATIVE_SOURCE_PATH=$(RELATIVE_SOURCE_PATH)/$${installinc_subdir} \ TARGET=$(TARGET)$${installinc_subdir}/ \ build_installhdrs_md; @@ -176,26 +197,32 @@ exporthdrs: exporthdrs_mi exporthdrs_md do_exporthdrs_mi: exporthdrs_mi: - $(_v)rel_path=$(shell $(RELPATH) $(SRCROOT) $(SOURCE)); \ - kernel_config=$(INSTALL_TYPE); \ - machine_config=$(MACHINE_CONFIG); \ + $(_v)kernel_config=$(INSTALL_TYPE); \ arch_config=$(INSTALL_ARCH_DEFAULT); \ + machine_config=DEFAULT; \ if [ $${arch_config} = ARM ] ; then \ if [ $${machine_config} = DEFAULT ] ; then \ machine_config=$(DEFAULT_ARM_MACHINE_CONFIG); \ fi; \ fi; \ + if [ $${arch_config} = L4_ARM ] ; then \ + if [ $${machine_config} = DEFAULT ] ; then \ + machine_config=$(DEFAULT_L4_ARM_MACHINE_CONFIG);\ + fi; \ + fi; \ if [ $${machine_config} = DEFAULT ] ; then \ - exportinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}/$${rel_path}; \ + exportinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}/${RELATIVE_SOURCE_PATH}; \ else \ - exportinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/$${rel_path}; \ + exportinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/${RELATIVE_SOURCE_PATH}; \ fi; \ [ -d $${exportinc_dir} ] || $(MKDIR) $${exportinc_dir}; \ ${MAKE} ${MAKEJOBS} -C $${exportinc_dir} \ KERNEL_CONFIG=$${kernel_config} \ ARCH_CONFIG=$${arch_config} \ + MACHINE_CONFIG=$${machine_config} \ MAKEFILES=${SOURCE}/Makefile \ SOURCE=${SOURCE}/ \ + RELATIVE_SOURCE_PATH=. \ TARGET=$${exportinc_dir}/ \ build_exporthdrs_mi; \ @@ -209,8 +236,7 @@ exporthdrs_mi: # set is the kernel configuration. The second item in the set is the architecture and the # third item is the machine configuration. There may be multiple sets to build. exporthdrs_md: - $(_v)rel_path=$(shell $(RELPATH) $(SRCROOT) $(SOURCE)); \ - my_counter=1; \ + $(_v)my_counter=1; \ for my_config in $(TARGET_CONFIGS_UC); \ do \ if [ $${my_counter} -eq 1 ] ; then \ @@ -238,17 +264,24 @@ exporthdrs_md: machine_config=$(DEFAULT_ARM_MACHINE_CONFIG); \ fi; \ fi; \ + if [ $${arch_config} = L4_ARM ] ; then \ + if [ $${machine_config} = DEFAULT ] ; then \ + machine_config=$(DEFAULT_L4_ARM_MACHINE_CONFIG); \ + fi; \ + fi; \ if [ $${machine_config} = DEFAULT ] ; then \ - exportinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}/$${rel_path}; \ + exportinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}/${RELATIVE_SOURCE_PATH}; \ else \ - exportinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/$${rel_path}; \ + exportinc_dir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/${RELATIVE_SOURCE_PATH}; \ fi; \ [ -d $${exportinc_dir} ] || $(MKDIR) $${exportinc_dir}; \ ${MAKE} ${MAKEJOBS} -C $${exportinc_dir} \ KERNEL_CONFIG=$${kernel_config} \ ARCH_CONFIG=$${arch_config} \ + MACHINE_CONFIG=$${machine_config} \ MAKEFILES=${SOURCE}/Makefile \ SOURCE=${SOURCE}/ \ + RELATIVE_SOURCE_PATH=. 
\ TARGET=$${exportinc_dir}/ \ build_exporthdrs_md; \ fi; \ @@ -270,6 +303,7 @@ $(BUILD_EXPORTHDRS_MI_SUBDIRS_TARGETS): ${MAKE} -C $${exportinc_subdir} \ MAKEFILES=$(SOURCE)$${exportinc_subdir}/Makefile \ SOURCE=$(SOURCE)$${exportinc_subdir}/ \ + RELATIVE_SOURCE_PATH=$(RELATIVE_SOURCE_PATH)/$${exportinc_subdir} \ TARGET=$(TARGET)$${exportinc_subdir}/ \ build_exporthdrs_mi; @@ -291,6 +325,7 @@ $(BUILD_EXPORTHDRS_MD_SUBDIRS_TARGETS): ${MAKE} -C $${exportinc_subdir} \ MAKEFILES=$(SOURCE)$${exportinc_subdir}/Makefile \ SOURCE=$(SOURCE)$${exportinc_subdir}/ \ + RELATIVE_SOURCE_PATH=$(RELATIVE_SOURCE_PATH)/$${exportinc_subdir} \ TARGET=$(TARGET)$${exportinc_subdir}/ \ build_exporthdrs_md; @@ -303,16 +338,17 @@ build_exporthdrs_md: $(BUILD_EXPORTHDRS_MD_SUBDIRS_TARGETS) .PHONY: setup setup: - $(_v)rel_path=$(shell $(RELPATH) $(SRCROOT) $(SOURCE)); \ - kernel_config=$(INSTALL_TYPE); \ + $(_v)kernel_config=$(INSTALL_TYPE); \ arch_config=$(INSTALL_ARCH_DEFAULT); \ - setup_subdir=${OBJROOT}/$${rel_path}; \ + setup_subdir=${OBJROOT}/$${RELATIVE_SOURCE_PATH}; \ [ -d $${setup_subdir} ] || $(MKDIR) $${setup_subdir}; \ ${MAKE} ${MAKEJOBS} -C $${setup_subdir} \ KERNEL_CONFIG=$${kernel_config} \ ARCH_CONFIG=$${arch_config} \ + MACHINE_CONFIG=$${machine_config} \ MAKEFILES=${SOURCE}/Makefile \ SOURCE=${SOURCE}/ \ + RELATIVE_SOURCE_PATH=. \ TARGET=$${setup_subdir}/ \ build_setup; @@ -328,6 +364,7 @@ $(BUILD_SETUP_SUBDIRS_TARGETS): ${MAKE} -C $${setup_subdir} \ MAKEFILES=${SOURCE}/$${setup_subdir}/Makefile \ SOURCE=${SOURCE}/$${setup_subdir}/ \ + RELATIVE_SOURCE_PATH=${RELATIVE_SOURCE_PATH}/$${setup_subdir} \ TARGET=${TARGET}/$${setup_subdir}/ \ build_setup; @@ -347,11 +384,11 @@ build_setup: $(BUILD_SETUP_SUBDIRS_TARGETS) ifeq ($(RC_ProjectName),Libsyscall) all: cd libsyscall ; \ - sdk="$(SDKROOT)" ; \ + sdk="$(SDKROOT)" ; \ if [ $${sdk} = / ] ; then \ - sdk="" ; \ - fi; \ - xcrun -sdk "$(SDKROOT)" xcodebuild install \ + sdk="" ; \ + fi; \ + xcrun -sdk "$(SDKROOT)" xcodebuild install \ "SRCROOT=$(SRCROOT)/libsyscall" \ "OBJROOT=$(OBJROOT)" \ "SYMROOT=$(SYMROOT)" \ @@ -363,7 +400,20 @@ all: else ifeq ($(RC_ProjectName),libkxld_host) all: make -C libkern/kxld/ install PRODUCT_TYPE=ARCHIVE -else ifeq ($(RC_ProjectName),xnu_headers_Sim) +else ifeq ($(RC_ProjectName),libkmod) +all: + cd libkern/kmod ; \ + sdk="$(SDKROOT)" ; \ + if [ $${sdk} = / ] ; then \ + sdk="" ; \ + fi; \ + xcrun -sdk "$(SDKROOT)" xcodebuild install \ + "SRCROOT=$(SRCROOT)/libkern/kmod" \ + "OBJROOT=$(OBJROOT)" \ + "SYMROOT=$(SYMROOT)" \ + "DSTROOT=$(DSTROOT)" \ + "SDKROOT=$${sdk}" +else ifeq ($(findstring _headers_Sim,$(RC_ProjectName)),_headers_Sim) # Libsyscall/xnu _headers_Sim all: exporthdrs else # xnu or xnu_debug ifeq ($(COMPONENT), .) 
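The Libsyscall and new libkmod "all" recipes above share one idiom: an SDKROOT of "/" (build against the running system) must be handed to xcodebuild as an empty SDK string. Pulled out of the recipes as a standalone shell sketch (the libkern/kmod path is the one the hunk uses; the other variables are whatever the surrounding make invocation exported):

    sdk="$SDKROOT"
    if [ "$sdk" = "/" ]; then
        sdk=""      # "/" means the boot system; pass no SDK override on to xcodebuild
    fi
    xcrun -sdk "$SDKROOT" xcodebuild install \
        "SRCROOT=$SRCROOT/libkern/kmod" \
        "OBJROOT=$OBJROOT" "SYMROOT=$SYMROOT" "DSTROOT=$DSTROOT" \
        "SDKROOT=$sdk"

Note also the guard change from ifeq ($(RC_ProjectName),Libsyscall) to ifeq ($(findstring Libsyscall,$(RC_ProjectName)),Libsyscall): findstring returns the needle when it occurs anywhere in the haystack, so the branch now also covers derived project names along the lines of a Libsyscall_headers_Sim variant.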
@@ -394,10 +444,15 @@ endif machine_config=$(DEFAULT_ARM_MACHINE_CONFIG); \ fi; \ fi; \ + if [ $${arch_config} = L4_ARM ] ; then \ + if [ $${machine_config} = DEFAULT ] ; then \ + machine_config=$(DEFAULT_L4_ARM_MACHINE_CONFIG); \ + fi; \ + fi; \ if [ $${machine_config} = DEFAULT ] ; then \ - build_subdir=${OBJROOT}/$${kernel_config}_$${arch_config}/$${rel_path}; \ + build_subdir=${OBJROOT}/$${kernel_config}_$${arch_config}/${RELATIVE_SOURCE_PATH}; \ else \ - build_subdir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/$${rel_path}; \ + build_subdir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/${RELATIVE_SOURCE_PATH}; \ fi; \ [ -d $${build_subdir} ] || $(MKDIR) $${build_subdir}; \ ${MAKE} ${MAKEJOBS} -C $${build_subdir} \ @@ -406,11 +461,15 @@ endif MACHINE_CONFIG=$${machine_config} \ MAKEFILES=${SOURCE}/Makefile \ SOURCE=${SOURCE}/ \ + RELATIVE_SOURCE_PATH=${RELATIVE_SOURCE_PATH} \ build_all; \ fi; \ done; endif +.PHONY: all_embedded all_devicemap +all_embedded all_devicemap: all + # # Build all architectures for all Configuration/Architecture options # @@ -431,6 +490,7 @@ $(BUILD_ALL_SUBDIRS_TARGETS): ${MAKE} -C $${comp_subdir} \ MAKEFILES=${SOURCE}/$${comp_subdir}/Makefile \ SOURCE=${SOURCE}$${comp_subdir}/ \ + RELATIVE_SOURCE_PATH=$(RELATIVE_SOURCE_PATH)/$${comp_subdir} \ TARGET=$${TARGET} \ build_all; @@ -448,6 +508,7 @@ build_all: $(BUILD_ALL_SUBDIRS_TARGETS) ${MAKE} -C $${comp_subdir} \ MAKEFILES=${SOURCE}/$${comp_subdir}/Makefile \ SOURCE=${SOURCE}$${comp_subdir}/ \ + RELATIVE_SOURCE_PATH=$(RELATIVE_SOURCE_PATH)/$${comp_subdir} \ TARGET=$${TARGET} \ build_all; \ done; @@ -484,6 +545,11 @@ mach_kernel: machine_config=$(DEFAULT_ARM_MACHINE_CONFIG); \ fi; \ fi; \ + if [ $${arch_config} = L4_ARM ] ; then \ + if [ $${machine_config} = DEFAULT ] ; then \ + machine_config=$(DEFAULT_L4_ARM_MACHINE_CONFIG); \ + fi; \ + fi; \ if [ $${machine_config} = DEFAULT ] ; then \ build_subdir=${OBJROOT}/$${kernel_config}_$${arch_config}; \ else \ @@ -521,15 +587,17 @@ build_mach_kernel: # Install kernel header files based on RC_ARCHS # install: installhdrs all installman installmachinekernels -ifeq ($(RC_ProjectName),Libsyscall) +ifeq ($(findstring Libsyscall,$(RC_ProjectName)),Libsyscall) # nothing to do else ifeq ($(findstring libkxld,$(RC_ProjectName)),libkxld) # nothing to do, work performed in "all" action -else ifeq ($(RC_ProjectName),xnu_headers_Sim) +else ifeq ($(RC_ProjectName),libkmod) +# nothing to do, work performed in "all" action +else ifeq ($(findstring _headers_Sim,$(RC_ProjectName)),_headers_Sim) # nothing to do else # xnu or xnu_debug - $(_v)rel_path=$(shell $(RELPATH) $(SRCROOT) $(SOURCE)); \ - machine_config=$(MACHINE_CONFIG); \ +# A bit of a hack for machine_config: machine configs aren't really threaded through properly. 
+ $(_v)machine_config=$(MACHINE_CONFIG); \ for kernel_config in $(INSTALL_TYPE); \ do \ for arch_config in $(INSTALL_ARCHS); \ @@ -539,10 +607,15 @@ else # xnu or xnu_debug machine_config=$(DEFAULT_ARM_MACHINE_CONFIG); \ fi; \ fi; \ + if [ $${arch_config} = L4_ARM ] ; then \ + if [ $${machine_config} = DEFAULT ] ; then \ + machine_config=$(DEFAULT_L4_ARM_MACHINE_CONFIG); \ + fi; \ + fi; \ if [ $${machine_config} = DEFAULT ] ; then \ - install_subdir=${OBJROOT}/$${kernel_config}_$${arch_config}/$${rel_path}; \ + install_subdir=${OBJROOT}/$${kernel_config}_$${arch_config}/${RELATIVE_SOURCE_PATH}; \ else \ - install_subdir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/$${rel_path}; \ + install_subdir=${OBJROOT}/$${kernel_config}_$${arch_config}_$${machine_config}/${RELATIVE_SOURCE_PATH}; \ fi; \ [ -d $${install_subdir} ] || $(MKDIR) $${install_subdir}; \ ${MAKE} ${MAKEJOBS} -C $${install_subdir} \ @@ -551,6 +624,7 @@ else # xnu or xnu_debug MACHINE_CONFIG=$${machine_config} \ MAKEFILES=${SOURCE}/Makefile \ SOURCE=${SOURCE}/ \ + RELATIVE_SOURCE_PATH=${RELATIVE_SOURCE_PATH} \ build_install; \ done; \ done; @@ -565,6 +639,9 @@ ifeq ($(RC_ProjectName),xnu_debug) endif endif +.PHONY: install_embedded install_devicemap +install_embedded install_devicemap: install + installmachinekernels: @echo "[ $(SOURCE) ] make installmachinekernels"; \ my_counter=1; \ @@ -596,6 +673,7 @@ installmachinekernels: MACHINE_CONFIG=$${machine_config} \ MAKEFILES=${SOURCE}/Makefile \ SOURCE=${SOURCE}/ \ + RELATIVE_SOURCE_PATH=${RELATIVE_SOURCE_PATH} \ TARGET=$${build_subdir}/ \ do_build_install; \ fi; \ @@ -627,6 +705,7 @@ $(BUILD_INSTALL_SUBDIRS_TARGETS): KERNEL_CONFIG=$${kernel_config} \ MAKEFILES=${SOURCE}/$${install_subdir}/Makefile \ SOURCE=${SOURCE}$${install_subdir}/ \ + RELATIVE_SOURCE_PATH=${RELATIVE_SOURCE_PATH}/$${install_subdir} \ TARGET=$${TARGET} \ build_install; @@ -697,10 +776,12 @@ TAGS: cscope.files .PHONY: installman installman: -ifeq ($(RC_ProjectName),Libsyscall) +ifeq ($(findstring Libsyscall,$(RC_ProjectName)),Libsyscall) # nothing to do else ifeq ($(findstring libkxld,$(RC_ProjectName)),libkxld) # nothing to do +else ifeq ($(RC_ProjectName),libkmod) +# nothing to do else ifeq ($(findstring xnu_,$(RC_ProjectName)),xnu_) installman: # nothing to do @@ -710,6 +791,7 @@ else # xnu [ -d $$manpath ] || $(MKDIR) $$manpath; \ ${MAKE} ${MAKEJOBS} MAKEFILES=${SOURCE}/Makefile \ SOURCE=${SOURCE}/ \ + RELATIVE_SOURCE_PATH=${RELATIVE_SOURCE_PATH} \ TARGET=${DSTROOT}/ \ build_installman ${SRCROOT}/config/compress-man-pages.pl ${DSTROOT}/${MANDIR} @@ -726,6 +808,7 @@ $(BUILD_INSTALLMAN_SUBDIRS_TARGETS): ${MAKE} -C $${installman_subdir} -r \ MAKEFILES=$(SOURCE)$${installman_subdir}/Makefile \ SOURCE=$(SOURCE)$${installman_subdir}/ \ + RELATIVE_SOURCE_PATH=$(RELATIVE_SOURCE_PATH)/$${installman_subdir} \ TARGET=$(TARGET)$${installman_subdir}/ \ build_installman; diff --git a/makedefs/MakeInc.rule b/makedefs/MakeInc.rule index b2d7e3af3..d4e5e5fee 100644 --- a/makedefs/MakeInc.rule +++ b/makedefs/MakeInc.rule @@ -51,9 +51,9 @@ ifndef INSTALL_KF_MD_GEN_LIST endif ifneq ($(MACHINE_CONFIG), DEFAULT) - OBJPATH = $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)_$(MACHINE_CONFIG) + export OBJPATH = $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)_$(MACHINE_CONFIG) else - OBJPATH = $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG) + export OBJPATH = $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG) endif INSTALL_MI_GEN_FILES = $(addprefix $(DSTROOT)/$(INCDIR)/$(INSTALL_MI_DIR)/, $(INSTALL_MI_GEN_LIST)) @@ -513,13 +513,14 @@ endif 
# Compilation rules to generate .o from .s # -S_RULE_1A=$(_v)${S_KCC} -c -MD ${SFLAGS} -DASSEMBLER ${INCFLAGS} ${$@_INCFLAGS} +S_RULE_1A=$(_v)${S_KCC} -c ${SFLAGS} -MD -DASSEMBLER ${$@_SFLAGS_ADD} ${INCFLAGS} ${$@_INCFLAGS} S_RULE_1B=$*.s S_RULE_2=@echo AS $@ S_RULE_3= # # Compilation rules to generate .o from .c for normal files +# C_RULE_1A=$(_v)${KCC} -c ${filter-out ${$@_CFLAGS_RM}, ${CFLAGS} ${CWARNFLAGS}} -MD ${$@_CFLAGS_ADD} ${$@_CWARNFLAGS_ADD} ${INCFLAGS} ${$@_INCFLAGS} C_RULE_1B=$*.c C_RULE_2=@echo CC $@ @@ -527,11 +528,19 @@ ifeq ($(BUILD_MACHO_OBJ),0) C_RULE_3= else ifeq ($(BUILD_STABS),1) C_RULE_3= +else ifeq ($(BUILD_DWARF),1) +C_RULE_3=$(_v)${CTFCONVERT} -l xnu -v -o $@.ctf $@ > /dev/null && $(CTFSCRUB) `cat $(SRCROOT)/config/DtraceIgnored.symbols` $@.ctf || true; else -C_RULE_3=$(_v)${CTFCONVERT} -l xnu -v -o $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$@.ctf $@ > /dev/null && $(CTFSCRUB) `cat $(SRCROOT)/config/DtraceIgnored.symbols` $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$@.ctf || true; +C_RULE_3= endif C_RULE_4= +ifeq ($(ARCH_CONFIG),ARM) +ifeq ($(KERNEL_CONFIG),RELEASE) +C_RULE_3= +endif +endif + # # Compilation rules to generate .o from .c for driver files # @@ -553,10 +562,17 @@ ifeq ($(BUILD_MACHO_OBJ),0) P_RULE_4= else ifeq ($(BUILD_STABS),1) P_RULE_4= +else ifeq ($(BUILD_DWARF),1) +P_RULE_4=$(_v)${CTFCONVERT} -l xnu -v -o $@.ctf $@ > /dev/null && $(CTFSCRUB) `cat $(SRCROOT)/config/DtraceIgnored.symbols` $@.ctf || true; else -P_RULE_4=$(_v)${CTFCONVERT} -l xnu -v -o $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$@.ctf $@ > /dev/null && $(CTFSCRUB) `cat $(SRCROOT)/config/DtraceIgnored.symbols` $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$@.ctf || true; +P_RULE_4= endif +ifeq ($(ARCH_CONFIG),ARM) +ifeq ($(KERNEL_CONFIG),RELEASE) +P_RULE_4= +endif +endif setup_build_all: @@ -582,6 +598,7 @@ $(TARGET)/mach_kernel: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LI $(_v)if [ $(BUILD_DWARF) -eq 1 ]; then \ echo DSYMUTIL mach_kernel.sys; \ $(DSYMUTIL) $(DSYMUTIL_FLAGS) $(TARGET)/mach_kernel.sys -o $(TARGET)/mach_kernel.sys.dSYM > /dev/null; \ + $(MKDIR) $(TARGET)/mach_kernel.sys.dSYM/$(DSYMRESDIR); \ $(INSTALL) $(INSTALL_FLAGS) $(SRCROOT)/kgmacros $(TARGET)/mach_kernel.sys.dSYM/$(DSYMRESDIR)/kgmacros; \ fi; $(_v)if [ $(MACHINE_CONFIG) != DEFAULT ] ; then \ @@ -591,19 +608,31 @@ $(TARGET)/mach_kernel: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LI fi; @echo STRIP mach_kernel $(_v)$(STRIP) $(STRIP_FLAGS) $(TARGET)/mach_kernel.sys -o $(TARGET)/mach_kernel - $(_v)if [ $(BUILD_MACHO_OBJ) -eq 1 -a $(BUILD_DWARF) -eq 1 ]; then \ + + $(_v)kernel_config=$(KERNEL_CONFIG); \ + onearch=$(ARCH_CONFIG); \ + skip_ctf=FALSE; \ + if [ $${kernel_config} = RELEASE ]; then \ + if [[ $${onearch} = ARM ]]; then \ + skip_ctf=TRUE; \ + echo "Skipping CTF processing"; \ + fi \ + fi; \ + if [ $${skip_ctf} = FALSE ]; then \ + if [ $(BUILD_MACHO_OBJ) -eq 1 -a $(BUILD_DWARF) -eq 1 ]; then \ echo CTFMERGE mach_kernel; \ $(FIND) $(OBJPATH)/ -name \*.ctf -size 0 \ -exec $(RM) -rf {} \; ; \ - $(CTFMERGE) -l xnu -o $(TARGET)/mach_kernel \ - -Z $(TARGET)/mach_kernel.ctfdata \ - $(OBJPATH)/*/$(KERNEL_CONFIG)/*.*o.ctf || true; \ + $(FIND) $(OBJPATH)/ -name \*.ctf | \ + $(XARGS) $(CTFMERGE) -l xnu -o $(TARGET)/mach_kernel \ + -Z $(TARGET)/mach_kernel.ctfdata || true; \ echo CTFINSERT mach_kernel; \ $(CTFINSERT) $(TARGET)/mach_kernel \ $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) $(TARGET)/mach_kernel.ctfdata \ -o $(TARGET)/mach_kernel || true; \ $(RM) -f $(TARGET)/mach_kernel.ctfdata > /dev/null || 
true; \ fi; \ + fi; \ version.o: $(OBJPATH)/version.c ${C_RULE_1A}$< @@ -632,6 +661,7 @@ build_mach_kernel_exports: $(_v)${MAKE} \ MAKEFILES=${SOURCE}/config/Makefile \ SOURCE=${SOURCE}/config \ + RELATIVE_SOURCE_PATH=${RELATIVE_SOURCE_PATH}/config \ TARGET=$${TARGET} \ build_mach_kernel_exports; @@ -663,11 +693,19 @@ $(INSTALL_KERNEL_FILE_FILES): $(TARGET)/mach_kernel force_kernel_file_install fi INSTALL_KERNEL_FILESYS_FILES = $(addprefix $(SYMROOT)$(INSTALL_KERNEL_DIR), $(INSTALL_KERNEL_FILE)) +ifeq ($(PLATFORM),iPhoneOS) +INSTALL_KERNEL_FILESYS_FILES += $(addprefix $(DSTROOT)$(INSTALL_KERNEL_SYM_DIR), $(INSTALL_KERNEL_FILE)) +endif force_kernel_filesys_install: $(INSTALL_KERNEL_FILESYS_FILES): $(TARGET)/mach_kernel.sys force_kernel_filesys_install @echo Installing $< in $@; +ifeq ($(PLATFORM),iPhoneOS) + $(_v)if [ ! -e $(DSTROOT)$(INSTALL_KERNEL_SYM_DIR) ]; then \ + $(MKDIR) $(DSTROOT)$(INSTALL_KERNEL_SYM_DIR); \ + fi; +endif $(_v)if [ ! -e $(SYMROOT)$(INSTALL_KERNEL_DIR) ]; then \ $(MKDIR) $(SYMROOT)$(INSTALL_KERNEL_DIR); \ fi; \ diff --git a/osfmk/Makefile b/osfmk/Makefile index b61d3bc7d..a864f850b 100644 --- a/osfmk/Makefile +++ b/osfmk/Makefile @@ -21,7 +21,8 @@ INSTINC_SUBDIRS = \ vm \ libsa \ kdp \ - pmc + pmc \ + kperf INSTINC_SUBDIRS_I386 = \ mach \ i386 @@ -29,9 +30,6 @@ INSTINC_SUBDIRS_X86_64 = \ mach \ i386 \ x86_64 -INSTINC_SUBDIRS_ARM = \ - mach \ - arm EXPINC_SUBDIRS = \ mach \ @@ -50,7 +48,8 @@ EXPINC_SUBDIRS = \ libsa \ kdp \ console \ - pmc + pmc \ + kperf EXPINC_SUBDIRS_I386 = \ mach \ @@ -59,9 +58,6 @@ EXPINC_SUBDIRS_X86_64 = \ mach \ i386 \ x86_64 -EXPINC_SUBDIRS_ARM = \ - mach \ - arm SETUP_SUBDIRS = diff --git a/osfmk/chud/chud_glue.c b/osfmk/chud/chud_glue.c index 26f1a70ce..a721a7313 100644 --- a/osfmk/chud/chud_glue.c +++ b/osfmk/chud/chud_glue.c @@ -26,3 +26,13 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ + +void *chudxnu_platform_ptr(void); + +void * +chudxnu_platform_ptr(void) +{ + return (void *)0; +} + + diff --git a/osfmk/chud/chud_thread.c b/osfmk/chud/chud_thread.c index 97c07757b..6cc0eb0e9 100644 --- a/osfmk/chud/chud_thread.c +++ b/osfmk/chud/chud_thread.c @@ -550,3 +550,73 @@ chudxnu_thread_set_marked(thread_t thread, boolean_t new_value) return FALSE; } +/* XXX: good thing this code is experimental... */ + +/* external handler */ +extern void (*chudxnu_thread_ast_handler)(thread_t); +void (*chudxnu_thread_ast_handler)(thread_t) = NULL; + +/* AST callback to dispatch to AppleProfile */ +extern void chudxnu_thread_ast(thread_t); +void +chudxnu_thread_ast(thread_t thread) +{ + /* atomicness for kdebug events */ + void (*handler)(thread_t) = chudxnu_thread_ast_handler; + if( handler ) + handler( thread ); + + thread->t_chud = 0; +} + + + +/* Get and set bits on the thread and trigger an AST handler */ +void chudxnu_set_thread_ast( thread_t thread ); +void +chudxnu_set_thread_ast( thread_t thread ) +{ + /* FIXME: only call this on current thread from an interrupt handler for now... */ + if( thread != current_thread() ) + panic( "unsafe AST set" ); + + act_set_kperf(thread); +} + +/* get and set the thread bits */ +extern uint32_t chudxnu_get_thread_bits( thread_t thread ); +extern void chudxnu_set_thread_bits( thread_t thread, uint32_t bits ); + +uint32_t +chudxnu_get_thread_bits( thread_t thread ) +{ + return thread->t_chud; +} + +void +chudxnu_set_thread_bits( thread_t thread, uint32_t bits ) +{ + thread->t_chud = bits; +} + +/* get and set thread dirty bits. so CHUD can track whether the thread + * has been dispatched since it last looked. 
caller must hold the + * thread lock + */ +boolean_t +chudxnu_thread_get_dirty(thread_t thread) +{ + if( thread->c_switch != thread->chud_c_switch ) + return TRUE; + else + return FALSE; +} + +void +chudxnu_thread_set_dirty(thread_t thread, boolean_t makedirty) +{ + if( makedirty ) + thread->chud_c_switch = thread->c_switch - 1; + else + thread->chud_c_switch = thread->c_switch; +} diff --git a/osfmk/chud/chud_xnu.h b/osfmk/chud/chud_xnu.h index 2e8168577..7d2c56f67 100644 --- a/osfmk/chud/chud_xnu.h +++ b/osfmk/chud/chud_xnu.h @@ -99,6 +99,9 @@ enum { extern int chudxnu_thread_get_scheduler_state(thread_t thread); +extern boolean_t chudxnu_thread_get_dirty(thread_t thread); +extern void chudxnu_thread_set_dirty(thread_t thread, boolean_t); + #if 0 #pragma mark **** memory **** #endif diff --git a/osfmk/chud/i386/chud_osfmk_callback_i386.c b/osfmk/chud/i386/chud_osfmk_callback_i386.c index aa576cbc7..c92dbb7fd 100644 --- a/osfmk/chud/i386/chud_osfmk_callback_i386.c +++ b/osfmk/chud/i386/chud_osfmk_callback_i386.c @@ -103,6 +103,9 @@ chudxnu_cpu_alloc(boolean_t boot_processor) mpqueue_init(&chud_proc_info->cpu_request_queue, &chud_request_lck_grp, &chud_request_lck_attr); + /* timer_call_cancel() can be called before first usage, so init here: */ + timer_call_setup(&(chud_proc_info->cpu_timer_call), NULL, NULL); + return (void *)chud_proc_info; } diff --git a/osfmk/chud/i386/chud_thread_i386.c b/osfmk/chud/i386/chud_thread_i386.c index a8edff8fa..6b8a4e873 100644 --- a/osfmk/chud/i386/chud_thread_i386.c +++ b/osfmk/chud/i386/chud_thread_i386.c @@ -45,6 +45,16 @@ #include #include + +static uint64_t +chudxnu_vm_unslide( uint64_t ptr, int kaddr ) +{ + if( !kaddr ) + return ptr; + + return VM_KERNEL_UNSLIDE(ptr); +} + #if 0 #pragma mark **** thread state **** #endif @@ -236,7 +246,7 @@ static kern_return_t do_backtrace32( if(ct >= max_idx) return KERN_RESOURCE_SHORTAGE; // no frames traced - frames[ct++] = currPC; + frames[ct++] = chudxnu_vm_unslide(currPC, supervisor); // build a backtrace of this 32 bit state. while(VALID_STACK_ADDRESS(supervisor, currFP, kernStackMin, kernStackMax)) { @@ -279,7 +289,7 @@ static kern_return_t do_backtrace32( prevFP = (uint64_t) tmpWord; // promote 32 bit address if(prevFP) { - frames[ct++] = currPC; + frames[ct++] = chudxnu_vm_unslide(currPC, supervisor); prevPC = currPC; } if(prevFP < currFP) { @@ -314,7 +324,7 @@ static kern_return_t do_backtrace64( if(*start_idx >= max_idx) return KERN_RESOURCE_SHORTAGE; // no frames traced - frames[ct++] = currPC; + frames[ct++] = chudxnu_vm_unslide(currPC, supervisor); // build a backtrace of this 32 bit state. 
while(VALID_STACK_ADDRESS64(supervisor, currFP, kernStackMin, kernStackMax)) { @@ -355,7 +365,7 @@ static kern_return_t do_backtrace64( } if(VALID_STACK_ADDRESS64(supervisor, prevFP, kernStackMin, kernStackMax)) { - frames[ct++] = currPC; + frames[ct++] = chudxnu_vm_unslide(currPC, supervisor); prevPC = currPC; } if(prevFP < currFP) { @@ -412,7 +422,7 @@ static kern_return_t do_kernel_backtrace( return KERN_FAILURE; } - frames[ct++] = (uint64_t)currPC; + frames[ct++] = chudxnu_vm_unslide((uint64_t)currPC, 1); // build a backtrace of this kernel state #if __LP64__ @@ -454,7 +464,7 @@ static kern_return_t do_kernel_backtrace( #else if(VALID_STACK_ADDRESS(TRUE, prevFP, kernStackMin, kernStackMax)) { #endif - frames[ct++] = (uint64_t)currPC; + frames[ct++] = chudxnu_vm_unslide((uint64_t)currPC, 1); prevPC = currPC; } if(prevFP <= currFP) { diff --git a/osfmk/conf/MASTER b/osfmk/conf/MASTER index e34f671cb..ad71e25cc 100644 --- a/osfmk/conf/MASTER +++ b/osfmk/conf/MASTER @@ -113,6 +113,18 @@ options CONFIG_ZLEAKS # Live zone leak debugging # # options ZONE_ALIAS_ADDR # # + + +# +# CONFIG_TASK_ZONE_INFO allows per-task zone information to be extracted +# Primarily useful for xnu debug and development. +# +options CONFIG_TASK_ZONE_INFO # +# +# CONFIG_DEBUGGER_FOR_ZONE_INFO restricts zone info so that it is only +# available when the kernel is being debugged. +# +options CONFIG_DEBUGGER_FOR_ZONE_INFO # # # XPR_DEBUG enables the gathering of data through the XPR macros inserted # into various subsystems. This option is normally only enabled for @@ -130,9 +142,11 @@ options XPR_DEBUG # # options MACH_LDEBUG # # # -# +# configuration option for full, partial, or no kernel debug event tracing # -options KDEBUG # kernel tracing # +options KDEBUG # kernel tracing # +options IST_KDEBUG # limited tracing # +options NO_KDEBUG # no kernel tracing # # # CONFIG_DTRACE enables code needed to support DTrace. Currently this is @@ -193,6 +207,11 @@ options HIBERNATION # # # options CONFIG_SLEEP # # +# CONFIG_KEXT_BASEMENT - alloc post boot loaded kexts after prelinked kexts +# +options CONFIG_KEXT_BASEMENT # # + + # # configurable kernel related resources (CONFIG_THREAD_MAX needs to stay in # sync with bsd/conf/MASTER until we fix the config system... todo XXX @@ -209,6 +228,13 @@ options CONFIG_ZONE_MAP_MIN=12582912 # options CONFIG_ZONE_MAP_MIN=6291456 # options CONFIG_ZONE_MAP_MIN=1048576 # +# Sizes must be a power of two for the zhash to +# be able to just mask off bits instead of mod +options CONFIG_ZLEAK_ALLOCATION_MAP_NUM=16384 # +options CONFIG_ZLEAK_ALLOCATION_MAP_NUM=8192 # +options CONFIG_ZLEAK_TRACE_MAP_NUM=8192 # +options CONFIG_ZLEAK_TRACE_MAP_NUM=4096 # + # # configurable kernel - use these options to strip strings from panic # and printf calls. 
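The "power of two" comment on the CONFIG_ZLEAK_*_MAP_NUM options above is the standard hash-table trick: when the table size is a power of two, h % size and h & (size - 1) select the same bucket, so the zone-leak hash can use a single AND instead of a division. A quick shell check (hash value arbitrary):

    h=0xBEEF5EED
    size=8192                       # one of the CONFIG_ZLEAK_TRACE_MAP_NUM values
    echo $(( h % size ))            # 7917
    echo $(( h & (size - 1) ))      # 7917 -- same bucket, no divide

The equivalence holds only because 8192 and 16384 are powers of two, which is exactly the invariant the MASTER comment warns maintainers to preserve.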
@@ -260,12 +286,30 @@ options CONFIG_SCHED_FIXEDPRIORITY # options CONFIG_SCHED_GRRR_CORE # options CONFIG_SCHED_IDLE_IN_PLACE # +options CONFIG_GZALLOC # +# +# enable per-process memory priority tracking +# +options CONFIG_MEMORYSTATUS # # -# freeze - support app hibernation, used on embedded +# enable jetsam - used on embedded # -options CONFIG_FREEZE # +options CONFIG_JETSAM # +# +# enable freezing of suspended processes - used on embedded +# +options CONFIG_FREEZE # options CHECK_CS_VALIDATION_BITMAP # +# +# Enable dispatch of memory pressure events from the vm_pageout_garbage_collect thread +# +options VM_PRESSURE_EVENTS # + +# Enable allocation of contiguous physical memory through vm_map_enter_cpm() +options VM_CPM # + +options CONFIG_SKIP_PRECISE_USER_KERNEL_TIME # diff --git a/osfmk/conf/MASTER.i386 b/osfmk/conf/MASTER.i386 index 42b4294e1..2240533b9 100644 --- a/osfmk/conf/MASTER.i386 +++ b/osfmk/conf/MASTER.i386 @@ -9,13 +9,13 @@ # Standard Apple MacOS X Configurations: # -------- ---- -------- --------------- # -# RELEASE = [ medium intel pc iokit mach_pe mach mach_kdp config_serial_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation config_sleep crypto config_dtrace config_mca config_vmx config_mtrr config_lapic config_counters zleaks config_sched_traditional config_sched_proto config_sched_grrr config_sched_fixedpriority mach_pagemap config_sched_idle_in_place ] -# DEBUG= [ RELEASE osf_debug debug mach_kdb mach_assert] +# RELEASE = [ medium intel pc iokit mach_pe mach mach_kdp config_serial_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation config_sleep crypto config_dtrace config_mca config_vmx config_mtrr config_lapic config_counters zleaks config_sched_traditional config_sched_proto config_sched_grrr config_sched_fixedpriority mach_pagemap vm_pressure_events config_sched_idle_in_place memorystatus ] +# DEBUG= [ RELEASE osf_debug debug mach_assert task_zone_info ] # PROFILE = [ RELEASE profile ] # # EMBEDDED_BASE = [ bsmall intel pc iokit mach_pe mach mach_kdp config_serial_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation config_sleep crypto ] # EMBEDDED = [ EMBEDDED_BASE no_printf_str no_kprintf_str no_kdebug ] -# DEVELOPMENT = [ EMBEDDED_BASE mach_assert config_dtrace config_counters ] +# DEVELOPMENT = [ EMBEDDED_BASE mach_assert config_dtrace config_counters task_zone_info ] # ###################################################################### # @@ -50,8 +50,6 @@ options MACH_BSD options IOKIT # # options MACH_PE # # -options DDB # Inline debugger # -options MACH_KDB # # options MACH_KDP # KDP # options CONFIG_SERIAL_KDP # KDP over serial # options PAE diff --git a/osfmk/conf/MASTER.x86_64 b/osfmk/conf/MASTER.x86_64 index 993fa17ab..27b9ce5e5 100644 --- a/osfmk/conf/MASTER.x86_64 +++ b/osfmk/conf/MASTER.x86_64 @@ -9,12 +9,12 @@ # Standard Apple MacOS X Configurations: # -------- ---- -------- --------------- # -# RELEASE = [ medium intel pc iokit mach_pe mach mach_kdp config_serial_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation config_sleep crypto config_dtrace config_mca config_vmx config_mtrr config_lapic config_counters zleaks config_sched_traditional config_sched_proto config_sched_grrr config_sched_fixedpriority mach_pagemap config_sched_idle_in_place ] -# DEBUG = [ RELEASE osf_debug 
debug mach_assert ] +# RELEASE = [ medium intel pc iokit mach_pe mach mach_kdp config_serial_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation config_sleep crypto config_dtrace config_mca config_vmx config_mtrr config_lapic config_counters zleaks config_gzalloc config_sched_traditional config_sched_proto config_sched_grrr config_sched_fixedpriority mach_pagemap vm_pressure_events config_sched_idle_in_place kperf memorystatus config_kext_basement ] +# DEBUG = [ RELEASE osf_debug debug mach_assert task_zone_info ] # # EMBEDDED_BASE = [ bsmall intel pc iokit mach_pe mach mach_kdp config_serial_kdp event vol hd pst gdb fixpri simple_clock mkernserv uxpr kernstack ipc_compat ipc_debug fb mk30 mk30_i386 hibernation config_sleep crypto ] # EMBEDDED = [ EMBEDDED_BASE no_printf_str no_kprintf_str no_kdebug ] -# DEVELOPMENT = [ EMBEDDED_BASE mach_assert config_counters ] +# DEVELOPMENT = [ EMBEDDED_BASE mach_assert config_counters task_zone_info ] # ###################################################################### # @@ -51,6 +51,7 @@ options MACH_PE # # options MACH_KDP # KDP # options CONFIG_SERIAL_KDP # KDP over serial # +# options KPERF # # options PAE options X86_64 options DISPATCH_COUNTS diff --git a/osfmk/conf/Makefile b/osfmk/conf/Makefile index 330f94ab6..439807979 100644 --- a/osfmk/conf/Makefile +++ b/osfmk/conf/Makefile @@ -48,9 +48,11 @@ $(COMPOBJROOT)/$(OSFMK_KERNEL_CONFIG)/platforms.h: $(COMPOBJROOT)/$(OSFMK_KERNEL do_all: $(COMPOBJROOT)/$(OSFMK_KERNEL_CONFIG)/Makefile \ $(COMPOBJROOT)/$(OSFMK_KERNEL_CONFIG)/platforms.h $(_v)next_source=$(subst conf/,,$(SOURCE)); \ + next_relsource=$(subst conf/,,$(RELATIVE_SOURCE_PATH)); \ ${MAKE} -C $(COMPOBJROOT)/$(OSFMK_KERNEL_CONFIG) \ MAKEFILES=$(TARGET)/$(OSFMK_KERNEL_CONFIG)/Makefile \ SOURCE=$${next_source} \ + RELATIVE_SOURCE_PATH=$${next_relsource} \ TARGET=$(TARGET) \ INCL_MAKEDEP=FALSE \ KERNEL_CONFIG=$(OSFMK_KERNEL_CONFIG) \ diff --git a/osfmk/conf/Makefile.i386 b/osfmk/conf/Makefile.i386 index e232c0e32..852d8ad0a 100644 --- a/osfmk/conf/Makefile.i386 +++ b/osfmk/conf/Makefile.i386 @@ -4,24 +4,14 @@ CWARNFLAGS = $(CWARNFLAGS_STD) -Wshorten-64-to-32 -# Objects that don't compile cleanly: -OBJS_NO_WERROR= \ - db_macro.o \ - db_print.o \ - db_sym.o \ - db_variables.o \ - db_disasm.o \ - db_interface.o \ - db_trace.o - -$(foreach file,$(OBJS_NO_WERROR),$(eval $(call add_perfile_cflags,$(file),-Wno-error))) - # Files that must go in the __HIB segment: UNCONFIGURED_HIB_FILES= \ hibernate_restore.o HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS)) +hibernate_restore.o_CFLAGS_ADD += -fno-stack-protector + ###################################################################### #END Machine dependent Makefile fragment for i386 ###################################################################### diff --git a/osfmk/conf/Makefile.x86_64 b/osfmk/conf/Makefile.x86_64 index 768a50845..2a4eb03ff 100644 --- a/osfmk/conf/Makefile.x86_64 +++ b/osfmk/conf/Makefile.x86_64 @@ -11,6 +11,9 @@ UNCONFIGURED_HIB_FILES= \ HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS)) +hibernate_restore.o_CFLAGS_ADD += -fno-stack-protector +hibernate_bootstrap.o_CFLAGS_ADD += -fno-stack-protector + ###################################################################### #END Machine dependent Makefile fragment for x86_64 ###################################################################### diff --git a/osfmk/conf/files b/osfmk/conf/files index 7a97e71c6..19b3b0550 100644 --- a/osfmk/conf/files 
+++ b/osfmk/conf/files @@ -44,8 +44,6 @@ OPTIONS/mach_cluster_stats optional mach_cluster_stats OPTIONS/mach_counters optional mach_counters OPTIONS/mach_ipc_debug optional mach_ipc_debug OPTIONS/mach_ipc_test optional mach_ipc_test -OPTIONS/mach_kdb optional mach_kdb -OPTIONS/mach_kgdb optional mach_kgdb OPTIONS/mach_kdp optional mach_kdp OPTIONS/config_serial_kdp optional config_serial_kdp OPTIONS/mach_kprof optional mach_kprof @@ -57,7 +55,6 @@ OPTIONS/advisory_pageout optional advisory_pageout OPTIONS/mach_vm_debug optional mach_vm_debug OPTIONS/mach_page_hash_stats optional mach_page_hash_stats OPTIONS/mig_debug optional mig_debug -OPTIONS/stat_time optional stat_time OPTIONS/time_stamp optional time_stamp OPTIONS/xpr_debug optional xpr_debug OPTIONS/bootstrap_symbols optional bootstrap_symbols @@ -104,27 +101,6 @@ osfmk/default_pager/dp_memory_object.c standard ./UserNotification/UNDReplyServer.c standard osfmk/UserNotification/KUNCUserNotifications.c standard -osfmk/ddb/db_access.c optional mach_kdb -osfmk/ddb/db_break.c optional mach_kdb -osfmk/ddb/db_command.c optional mach_kdb -osfmk/ddb/db_cond.c optional mach_kdb -osfmk/ddb/db_examine.c optional mach_kdb -osfmk/ddb/db_expr.c optional mach_kdb -osfmk/ddb/db_ext_symtab.c standard -osfmk/ddb/db_input.c optional mach_kdb -osfmk/ddb/db_lex.c optional mach_kdb -osfmk/ddb/db_macro.c optional mach_kdb -osfmk/ddb/db_output.c optional mach_kdb -osfmk/ddb/db_print.c optional mach_kdb -osfmk/ddb/db_run.c optional mach_kdb -osfmk/ddb/db_sym.c optional mach_kdb -osfmk/ddb/db_task_thread.c optional mach_kdb -osfmk/ddb/db_trap.c optional mach_kdb -osfmk/ddb/db_variables.c optional mach_kdb -osfmk/ddb/db_watch.c optional mach_kdb -osfmk/ddb/db_write_cmd.c optional mach_kdb - -osfmk/ddb/tr.c optional mach_tr osfmk/kdp/kdp.c optional mach_kdp osfmk/kdp/kdp_udp.c optional mach_kdp osfmk/kdp/kdp_serial.c optional config_serial_kdp @@ -139,10 +115,10 @@ osfmk/ipc/ipc_port.c standard osfmk/ipc/ipc_pset.c standard osfmk/ipc/ipc_right.c standard osfmk/ipc/ipc_space.c standard -osfmk/ipc/ipc_splay.c standard osfmk/ipc/ipc_table.c standard osfmk/ipc/ipc_labelh.c standard osfmk/ipc/mach_debug.c standard +osfmk/ipc/mach_kernelrpc.c standard osfmk/ipc/mach_msg.c standard osfmk/ipc/mach_port.c standard osfmk/ipc/mig_log.c optional mig_debug @@ -167,7 +143,6 @@ osfmk/kern/ipc_tt.c standard osfmk/kern/kalloc.c standard osfmk/kern/ledger.c standard osfmk/kern/locks.c standard -osfmk/kern/mach_clock.c standard osfmk/kern/machine.c standard osfmk/kern/mk_sp.c standard osfmk/kern/mk_timer.c standard @@ -202,6 +177,7 @@ osfmk/kern/timer_call.c standard osfmk/kern/wait_queue.c standard osfmk/kern/xpr.c optional xpr_debug osfmk/kern/zalloc.c standard +osfmk/kern/gzalloc.c optional config_gzalloc osfmk/kern/bsd_kern.c optional mach_bsd osfmk/kern/hibernate.c optional hibernation osfmk/pmc/pmc.c standard @@ -212,7 +188,6 @@ osfmk/pmc/pmc.c standard ./mach/exc_server.c optional mach_bsd ./mach/host_priv_server.c standard ./mach/host_security_server.c standard -./mach/ledger_server.c standard ./mach/lock_set_server.c standard ./mach/mach_exc_user.c standard ./mach/mach_exc_server.c optional mach_bsd @@ -289,4 +264,14 @@ osfmk/chud/chud_memory.c standard osfmk/chud/chud_osfmk_callback.c standard osfmk/chud/chud_thread.c standard +# Kernel performance monitoring +osfmk/kperf/kperf.c optional kperf +osfmk/kperf/action.c optional kperf +osfmk/kperf/callstack.c optional kperf +osfmk/kperf/pet.c optional kperf +osfmk/kperf/filter.c optional kperf +# osfmk/kperf/kperfbsd.c 
optional kperf	# bsd/conf/files
+osfmk/kperf/threadinfo.c	optional kperf
+osfmk/kperf/timetrigger.c	optional kperf
+
 osfmk/console/serial_general.c	standard
diff --git a/osfmk/conf/files.i386 b/osfmk/conf/files.i386
index 8c2864527..fb2610ce3 100644
--- a/osfmk/conf/files.i386
+++ b/osfmk/conf/files.i386
@@ -11,7 +11,6 @@
 OPTIONS/debug			optional debug
 OPTIONS/gprof			optional gprof
-OPTIONS/db_machine_commands	optional db_machine_commands
 OPTIONS/dynamic_num_nodes	optional dynamic_num_nodes
 OPTIONS/vtoc_compat		optional vtoc_compat
 OPTIONS/fddi			optional fddi
 
@@ -27,8 +26,6 @@
 osfmk/i386/pmap_common.c	standard
 osfmk/i386/pal_routines.c	optional pal_i386
 osfmk/i386/pal_routines_asm.s	optional pal_i386
-osfmk/ddb/db_aout.c	optional mach_kdb
-
 osfmk/i386/bsd_i386.c	optional mach_bsd
 osfmk/i386/bsd_i386_native.c	optional mach_bsd
 osfmk/i386/machdep_call.c	optional mach_bsd
@@ -40,9 +37,6 @@ osfmk/i386/cpu.c	standard
 osfmk/i386/cpuid.c	standard
 osfmk/i386/cpu_threads.c	standard
 osfmk/i386/cpu_topology.c	standard
-osfmk/i386/db_disasm.c	optional mach_kdb
-osfmk/i386/db_interface.c	optional mach_kdb
-osfmk/i386/db_trace.c	optional mach_kdb
 osfmk/i386/etimer.c	standard
 osfmk/i386/fpu.c	standard
 osfmk/i386/gdt.c	standard
diff --git a/osfmk/conf/files.x86_64 b/osfmk/conf/files.x86_64
index a147f68de..88d449c78 100644
--- a/osfmk/conf/files.x86_64
+++ b/osfmk/conf/files.x86_64
@@ -13,7 +13,6 @@
 OPTIONS/debug			optional debug
 OPTIONS/gprof			optional gprof
-OPTIONS/db_machine_commands	optional db_machine_commands
 OPTIONS/dynamic_num_nodes	optional dynamic_num_nodes
 OPTIONS/vtoc_compat		optional vtoc_compat
 OPTIONS/fddi			optional fddi
 
@@ -34,6 +33,12 @@ osfmk/i386/bsd_i386.c	optional mach_bsd
 osfmk/i386/bsd_i386_native.c	optional mach_bsd
 osfmk/i386/machdep_call.c	optional mach_bsd
 
+# Order is important here for __HIB section
+osfmk/x86_64/boot_pt.c	standard
+osfmk/i386/mp_desc.c	standard
+osfmk/i386/gdt.c	standard
+osfmk/x86_64/start.s	standard
+
 osfmk/x86_64/bcopy.s	standard
 osfmk/x86_64/bzero.s	standard
 osfmk/i386/cpu.c	standard
@@ -42,10 +47,8 @@ osfmk/i386/cpu_threads.c	standard
 osfmk/i386/cpu_topology.c	standard
 osfmk/i386/etimer.c	standard
 osfmk/i386/fpu.c	standard
-osfmk/i386/gdt.c	standard
 osfmk/i386/i386_lock.s	standard
 osfmk/i386/i386_init.c	standard
-osfmk/i386/idle_pt.c	standard
 osfmk/i386/i386_vm_init.c	standard
 osfmk/i386/io_map.c	standard
 osfmk/i386/ktss.c	standard
@@ -54,15 +57,13 @@ osfmk/x86_64/loose_ends.c	standard
 osfmk/x86_64/copyio.c	standard
 osfmk/i386/locks_i386.c	standard
 osfmk/x86_64/locore.s	standard
-osfmk/x86_64/start.s	standard
-osfmk/x86_64/lowmem_vectors.s	standard
+osfmk/x86_64/lowmem_vectors.c	standard
 osfmk/x86_64/cswitch.s	standard
 osfmk/i386/machine_routines.c	standard
 osfmk/x86_64/machine_routines_asm.s	standard
 osfmk/i386/machine_check.c	optional config_mca
 osfmk/i386/machine_task.c	standard
 osfmk/x86_64/mcount.s	optional profile
-osfmk/i386/mp_desc.c	standard
 #osfmk/x86_64/ntoh.s	standard
 osfmk/i386/pcb.c	standard
 osfmk/i386/pcb_native.c	standard
@@ -129,6 +130,8 @@ osfmk/i386/vmx/vmx_shims.c	optional config_vmx
 
 #osfmk/OPTIONS/ec		optional ec
 #osfmk/OPTIONS/hi_res_clock	optional hi_res_clock
 
+# Kernel performance monitoring
+osfmk/kperf/x86_64/kperf_mp.c	optional kperf
 osfmk/i386/startup64.c	standard
 osfmk/x86_64/idt64.s	standard
diff --git a/osfmk/console/i386/serial_console.c b/osfmk/console/i386/serial_console.c
index 2af1a9553..7cac3d45d 100644
--- a/osfmk/console/i386/serial_console.c
+++ b/osfmk/console/i386/serial_console.c
@@ -269,13 +269,6 @@ void
 cnputc(char c)
 {
 	console_buf_t *cbp;
-#if	MACH_KDB
-	/* Bypass locking/buffering if in debugger */
-	if (kdb_cpu == cpu_number()) {
-		_cnputc(c);
-		return;
-	}
-#endif	/* MACH_KDB */
 	mp_disable_preemption();
 	cbp = (console_buf_t *) current_cpu_datap()->cpu_console_buf;
 	if (cbp == NULL) {
diff --git a/osfmk/console/serial_general.c b/osfmk/console/serial_general.c
index d51e98dab..8551fd6b7 100644
--- a/osfmk/console/serial_general.c
+++ b/osfmk/console/serial_general.c
@@ -32,7 +32,6 @@
  * @APPLE_FREE_COPYRIGHT@
  */
 
-#include
 #include
 #include
 #include
diff --git a/osfmk/console/video_console.c b/osfmk/console/video_console.c
index 0afd153b7..3dd1a2eca 100644
--- a/osfmk/console/video_console.c
+++ b/osfmk/console/video_console.c
@@ -1281,6 +1281,8 @@ gc_show_cursor(unsigned int xx, unsigned int yy)
 static void
 gc_update_color(int color, boolean_t fore)
 {
+	assert(gc_ops.update_color);
+
 	gc_color_code = COLOR_CODE_SET(gc_color_code, color, fore);
 	gc_ops.update_color(color, fore);
 }
@@ -2465,8 +2467,6 @@ vc_progress_task(__unused void *arg0, __unused void *arg)
 
     if( vc_progress_enable) {
 
-        KERNEL_DEBUG_CONSTANT(0x7020008, vc_progress_count, 0, 0, 0, 0);
-
         vc_progress_count++;
         if( vc_progress_count >= vc_progress->count) {
             vc_progress_count = 0;
@@ -2579,7 +2579,14 @@ initialize_screen(PE_Video * boot_vinfo, unsigned int op)
 #else
 	    new_vinfo.v_type = 0;
 #endif
-	    new_vinfo.v_scale = boot_vinfo->v_scale;
+	    unsigned int scale = (unsigned int)boot_vinfo->v_scale;
+	    if (scale == kPEScaleFactor1x )
+		new_vinfo.v_scale = kPEScaleFactor1x;
+	    else if (scale == kPEScaleFactor2x)
+		new_vinfo.v_scale = kPEScaleFactor2x;
+	    else /* Scale factor not set, default to 1x */
+		new_vinfo.v_scale = kPEScaleFactor1x;
+
 	}
 
 	if (!lastVideoMapped)
@@ -2749,6 +2756,8 @@ initialize_screen(PE_Video * boot_vinfo, unsigned int op)
 	    gc_acquired = FALSE;
 	    gc_desire_text = FALSE;
 	    gc_enable( FALSE );
+	    if ( gc_graphics_boot == FALSE ) break;
+
 	    vc_progress_set( FALSE, 0 );
 #if !CONFIG_EMBEDDED
 	    vc_enable_progressmeter( FALSE );
diff --git a/osfmk/console/video_console.h b/osfmk/console/video_console.h
index 26c6081fc..368e94da2 100644
--- a/osfmk/console/video_console.h
+++ b/osfmk/console/video_console.h
@@ -58,19 +58,19 @@ void video_scroll_down( void *start,  /* HIGH addr */
 
 struct vc_info
 {
-	unsigned int	v_height;	/* pixels */
-	unsigned int	v_width;	/* pixels */
-	unsigned int	v_depth;
-	unsigned int	v_rowbytes;
-	unsigned long	v_baseaddr;
-	unsigned int	v_type;
-	char		v_name[32];
-	uint64_t	v_physaddr;
-	unsigned int	v_rows;		/* characters */
-	unsigned int	v_columns;	/* characters */
-	unsigned int	v_rowscanbytes;	/* Actualy number of bytes used for display per row*/
-	unsigned int	v_scale;
-	unsigned int	v_reserved[4];
+	unsigned int	v_height;	/* pixels */
+	unsigned int	v_width;	/* pixels */
+	unsigned int	v_depth;
+	unsigned int	v_rowbytes;
+	unsigned long	v_baseaddr;
+	unsigned int	v_type;
+	char		v_name[32];
+	uint64_t	v_physaddr;
+	unsigned int	v_rows;		/* characters */
+	unsigned int	v_columns;	/* characters */
+	unsigned int	v_rowscanbytes;	/* Actualy number of bytes used for display per row*/
+	unsigned int	v_scale;
+	unsigned int	v_reserved[4];
 };
 
 struct vc_progress_element {
diff --git a/osfmk/ddb/Makefile b/osfmk/ddb/Makefile
deleted file mode 100644
index b0689e4fb..000000000
--- a/osfmk/ddb/Makefile
+++ /dev/null
@@ -1,27 +0,0 @@
-export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
-export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
-export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
-export 
MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -MIG_DEFS = \ - -MIG_HDRS = \ - -DATAFILES = \ - -MIGINCLUDES = \ - -EXPORT_MI_LIST = ${DATAFILES} ${_MIG_HDRS_} ${MIGINCLUDES} - -EXPORT_MI_DIR = ddb - -.ORDER: ${_MIG_HDRS_} ${MIGINCLUDES} - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/osfmk/ddb/db_access.c b/osfmk/ddb/db_access.c deleted file mode 100644 index fb0512bac..000000000 --- a/osfmk/ddb/db_access.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ -#include -#include /* type definitions */ -#include -#include -#include -#include - - - -/* - * Access unaligned data items on aligned (longword) - * boundaries. 
- */ - -int db_access_level = DB_ACCESS_LEVEL; - -db_expr_t -db_get_task_value( - db_addr_t addr, - register int size, - boolean_t is_signed, - task_t task) -{ - char data[sizeof(db_expr_t)]; - register db_expr_t value; - register int i; - uint64_t signx; - - if(size == 0) return 0; - - db_read_bytes((vm_offset_t)addr, size, data, task); - - value = 0; -#if BYTE_MSF - for (i = 0; i < size; i++) -#else /* BYTE_LSF */ - for (i = size - 1; i >= 0; i--) -#endif - { - value = (value << 8) + (data[i] & 0xFF); - } - - if(!is_signed) return value; - - signx = 0xFFFFFFFFFFFFFFFFULL << ((size << 3) - 1); - - if(value & signx) value |= signx; /* Add 1s to front if sign bit is on */ - - return (value); -} - -void -db_put_task_value( - db_addr_t addr, - register int size, - register db_expr_t value, - task_t task) -{ - char data[sizeof(db_expr_t)]; - register int i; - -#if BYTE_MSF - for (i = size - 1; i >= 0; i--) -#else /* BYTE_LSF */ - for (i = 0; i < size; i++) -#endif - { - data[i] = value & 0xFF; - value >>= 8; - } - - db_write_bytes((vm_offset_t)addr, size, data, task); -} - -db_expr_t -db_get_value( - db_addr_t addr, - int size, - boolean_t is_signed) -{ - return(db_get_task_value(addr, size, is_signed, TASK_NULL)); -} - -void -db_put_value( - db_addr_t addr, - int size, - db_expr_t value) -{ - db_put_task_value(addr, size, value, TASK_NULL); -} diff --git a/osfmk/ddb/db_access.h b/osfmk/ddb/db_access.h deleted file mode 100644 index 518a1b648..000000000 --- a/osfmk/ddb/db_access.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ -/* - * Data access functions for debugger. - */ - -#ifndef _DDB_DB_ACCESS_H_ -#define _DDB_DB_ACCESS_H_ - -#include -#include -#include - -/* implementation dependent access capability */ -#define DB_ACCESS_KERNEL 0 /* only kernel space */ -#define DB_ACCESS_CURRENT 1 /* kernel or current task space */ -#define DB_ACCESS_ANY 2 /* any space */ - -#ifndef DB_ACCESS_LEVEL -#define DB_ACCESS_LEVEL DB_ACCESS_KERNEL -#endif /* DB_ACCESS_LEVEL */ - -#ifndef DB_VALID_KERN_ADDR -#define DB_VALID_KERN_ADDR(addr) ((addr) >= VM_MIN_KERNEL_ADDRESS \ - && (addr) < VM_MAX_KERNEL_ADDRESS) -#define DB_VALID_ADDRESS(addr,user) ((user != 0) ^ DB_VALID_KERN_ADDR(addr)) -#define DB_PHYS_EQ(task1,addr1,task2,addr2) 0 -#define DB_CHECK_ACCESS(addr,size,task) db_is_current_space(task) -#endif /* DB_VALID_KERN_ADDR */ - -extern int db_access_level; - - - -/* Prototypes for functions exported by ddb/db_access.c. - */ -db_expr_t db_get_task_value( - db_addr_t addr, - register int size, - boolean_t is_signed, - task_t task); - -void db_put_task_value( - db_addr_t addr, - register int size, - register db_expr_t value, - task_t task); - -db_expr_t db_get_value( - db_addr_t addr, - int size, - boolean_t is_signed); - -void db_put_value( - db_addr_t addr, - int size, - db_expr_t value); - -#endif /* !_DDB_DB_ACCESS_H_ */ diff --git a/osfmk/ddb/db_aout.c b/osfmk/ddb/db_aout.c deleted file mode 100644 index a6e48c3ee..000000000 --- a/osfmk/ddb/db_aout.c +++ /dev/null @@ -1,961 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ -/* - * Symbol table routines for a.out format files. - */ -#include -#include -#include -#include /* data types */ -#include -#include /* For strcpy(), strcmp() */ -#include -#include /* For db_printf() */ -#include - -#ifndef DB_NO_AOUT - -#include /* a.out symbol table */ -#include - -#include - -#define private static - -private int aout_db_order_symbols(char *, char *); -private int aout_db_compare_symbols(char *, char *); -private boolean_t aout_db_is_filename(char *); -private boolean_t aout_db_eq_name(struct nlist *, char *, int); - -/* - * An a.out symbol table as loaded into the kernel debugger: - * - * symtab -> size of symbol entries, in bytes - * sp -> first symbol entry - * ... - * ep -> last symbol entry + 1 - * strtab == start of string table - * size of string table in bytes, - * including this word - * -> strings - */ - -/* - * Find pointers to the start and end of the symbol entries, - * given a pointer to the start of the symbol table. 
- */ -#define db_get_aout_symtab(symtab, sp, ep) \ - (sp = (struct nlist *)(((vm_offset_t *)(symtab)) + 1), \ - ep = (struct nlist *)((char *)sp + *((int *)(symtab)))) - -char *db_sorting_sym_end; - -private int -aout_db_order_symbols( - char *s1, - char *s2) -{ - struct nlist *sym1 = (struct nlist *) s1; - struct nlist *sym2 = (struct nlist *) s2; - - if (sym1->n_value != sym2->n_value) - return (sym1->n_value - sym2->n_value); - else { - return (sym1->n_un.n_name - sym2->n_un.n_name); - } -} - -private int -aout_db_compare_symbols( - char *sym1, - char *sym2) -{ - return (((struct nlist *) sym1)->n_value - - ((struct nlist *) sym2)->n_value); -} - -int db_sorting_limit = 50000; - -boolean_t -aout_db_sym_init( - char * symtab, /* pointer to start of symbol table */ - __unused char *esymtab, /* pointer to end of string table, - for checking - may be rounded up to - integer boundary */ - const char *name, - char * task_addr) /* use for this task only */ -{ - struct nlist *sym_start, *sym_end, *dbsym_start, *dbsym_end; - struct nlist *sp; - char *strtab, *dbstrtab; - long db_strlen; - char *estrtab, *dbestrtab; - unsigned long minsym = ~0; - unsigned long maxsym = 0; - boolean_t sorted; - boolean_t sorting; - int nsyms; - - - if (!getsymtab((kernel_mach_header_t *)symtab, - (vm_offset_t *)&sym_start, &nsyms, - (vm_offset_t *)&strtab, (vm_size_t *)&db_strlen)) { - return(FALSE); - } - sym_end = sym_start + nsyms; - estrtab = strtab + db_strlen; - -/* - * We haven't actually started up VM yet, so we can just steal some pages to - * make a working copy of the symbols and strings - */ - - dbsym_start = (struct nlist *)pmap_steal_memory(((unsigned int)sym_end - (unsigned int)sym_start + 4096) & -4096); /* Get space for symbols */ - dbstrtab = (char *)pmap_steal_memory(((unsigned int)estrtab - (unsigned int)strtab + 4096) & -4096); /* Get space for strings */ - - bcopy((char *)sym_start, (char *)dbsym_start, (unsigned int)sym_end - (unsigned int)sym_start); /* Copy symbols */ - bcopy(strtab, dbstrtab, (unsigned int)estrtab - (unsigned int)strtab); /* Copy strings */ - - dbsym_end = dbsym_start + nsyms; - dbestrtab = dbstrtab + db_strlen; - - sorting = ((dbsym_end - dbsym_start) < db_sorting_limit); - - for (sp = dbsym_start; sp < dbsym_end; sp++) { - register long strx; - strx = sp->n_un.n_strx; - if (strx != 0) { - if (strx > db_strlen) { - sp->n_un.n_name = 0; - continue; - } - sp->n_un.n_name = dbstrtab + strx; - } - if (sp->n_type != N_ABS) { - if (sp->n_value > 0 && sp->n_value < minsym) - minsym = sp->n_value; - if (sp->n_value > maxsym) - maxsym = sp->n_value; - } - } - - if (maxsym < minsym) - minsym = maxsym = 0; - - if (sorting) { - db_qsort((char *) dbsym_start, dbsym_end - dbsym_start, - sizeof(struct nlist), aout_db_order_symbols); - sorted = TRUE; - } else - sorted = FALSE; - - if (db_add_symbol_table(SYMTAB_AOUT, - (char*)dbsym_start, - (char*)dbsym_end, - name, - 0, - task_addr, - minsym, - maxsym, - sorted)) - { - /* Successfully added symbol table */ - - pmap_protect(kernel_pmap, - (vm_offset_t) dbsym_start, (vm_offset_t) dbsym_end, - VM_PROT_READ|VM_PROT_WRITE); - pmap_protect(kernel_pmap, - (vm_offset_t) dbstrtab, (vm_offset_t) dbestrtab, - VM_PROT_READ|VM_PROT_WRITE); - return TRUE; - } - return FALSE; -} - -/* - * This KLUDGE offsets the n_values of a copied symbol table - */ -void db_clone_offsetXXX(char *, long); -void -db_clone_offsetXXX(char * symtab, long offset) -{ - register struct nlist *sym_start, *sym_end, *sp; - - db_get_aout_symtab((int *)symtab, sym_start, 
sym_end); - - for (sp = sym_start; sp < sym_end; sp++) - if (sp->n_type != N_ABS) - sp->n_value += offset; -} -/* end KLUDGE */ - -/* - * check file name or not (check xxxx.x pattern) - */ -private boolean_t -aout_db_is_filename(char *name) -{ - while (*name) { - if (*name == '.') { - if (name[1]) - return(TRUE); - } - name++; - } - return(FALSE); -} - -/* - * special name comparison routine with a name in the symbol table entry - */ -private boolean_t -aout_db_eq_name( - struct nlist *sp, - char *name, - int incomplete) -{ - register char *s1, *s2; - - s1 = sp->n_un.n_name; - s2 = name; -#ifndef __NO_UNDERSCORES__ - if (*s1 == '_' && *s2 && *s2 != '_') - s1++; -#endif /* __NO_UNDERSCORES__ */ - while (*s2) { - if (*s1++ != *s2++) { - /* - * check .c .o file name comparison case - */ - if (*s2 == 0 && sp->n_un.n_name <= s1 - 2 - && s1[-2] == '.' && s1[-1] == 'o') - return(TRUE); - return(FALSE); - } - } - if (incomplete) - return(TRUE); - /* - * do special check for - * xxx:yyy for N_FUN - * xxx.ttt for N_DATA and N_BSS - */ - return(*s1 == 0 || (*s1 == ':' && sp->n_type == N_FUN) || - (*s1 == '.' && (sp->n_type == N_DATA || sp->n_type == N_BSS))); -} - -/* - * search a symbol table with name and type - * fp(in,out): last found text file name symbol entry - */ -private struct nlist * -aout_db_search_name( - struct nlist *sp, - struct nlist *ep, - char *name, - int type, - struct nlist **fp, - int incomplete) -{ - struct nlist *file_sp = *fp; - struct nlist *found_sp = 0; - - for ( ; sp < ep; sp++) { - if (sp->n_other) - sp->n_other = 0; - if (sp->n_type == N_TEXT && aout_db_is_filename(sp->n_un.n_name)) - *fp = sp; - if (type) { - if (sp->n_type == type) { - /* dwm_debug: b26 name, mk6 added last param */ - if (aout_db_eq_name(sp, name, 0)) - return(sp); - } - if (sp->n_type == N_SO) - *fp = sp; - continue; - } - if (sp->n_type & N_STAB) - continue; - if (sp->n_un.n_name && aout_db_eq_name(sp, name, incomplete)) { - /* - * In case of qaulified search by a file, - * return it immediately with some check. - * Otherwise, search external one - */ - if (file_sp) { - if ((file_sp == *fp) || (sp->n_type & N_EXT)) - return(sp); - } else if ((sp->n_type & N_EXT) || - (incomplete && !aout_db_is_filename(sp->n_un.n_name))) - return(sp); - else - found_sp = sp; - } - } - return(found_sp); -} - -/* - * Print sorted possible completions for a symbol. - * Use n_other field to mark completion symbols in order - * to speed up sort. 
- */ -int -aout_db_qualified_print_completion( - db_symtab_t *stab, - char *sym) -{ - struct nlist *sp; - struct nlist *sp1; - struct nlist *ep; - struct nlist *ep1 = NULL; - struct nlist *fp = 0; - int symlen; - int nsym = 0; - struct nlist *cur; - struct nlist *new; - char *fname; - int func; - int line; - - sp = aout_db_search_name((struct nlist *)stab->start, - (struct nlist *)stab->end, - sym, 0, &fp, 1); - if (sp == (struct nlist *)0) - return 0; - - symlen = strlen(sym); - cur = sp; - while (cur) { - if (strncmp(cur->n_un.n_name, sym, symlen) == 0) - cur->n_other = 1; - else - cur->n_other = 2; - ep = cur; - cur = aout_db_search_name(cur + 1, (struct nlist *)stab->end, - sym, 0, &fp, 1); - } - - sp1 = sp; - for (;;) { - new = cur = sp; - while (++cur <= ep) - if (cur->n_other) { - if (sp1 == sp) - sp1 = cur; - if (strncmp(&cur->n_un.n_name[cur->n_other - 1], - &new->n_un.n_name[new->n_other - 1], - symlen) < 0) - new = cur; - else - ep1 = cur; - } - - func = line = 0; - if ((new->n_type & N_EXT) == 0) { - for (cur = new - 1; cur > (struct nlist *)stab->start; cur--) { - if (cur->n_type == N_SO || - (stab->sorted && cur->n_value < new->n_value)) - break; - if (line == 0 && - cur->n_type == N_SLINE && - cur->n_value == new->n_value) - line = cur->n_desc; - if (func == 0 && - cur->n_type == N_FUN && - cur->n_value == new->n_value) - func = 1; - } - - if (cur->n_type == N_SO) - fname = cur->n_un.n_name; - else - fname = (char *)0; - - if (line == 0 || func == 0) - for (cur = new + 1; - cur < (struct nlist *)stab->end; cur++) { - if (cur->n_type == N_SO || - (stab->sorted && cur->n_value > new->n_value)) - break; - if (line == 0 && - cur->n_type == N_SLINE && - cur->n_value == new->n_value) { - line = cur->n_desc; - if (func) - break; - } - if (func == 0 && - cur->n_type == N_FUN && - cur->n_value == new->n_value) { - func = 1; - if (line) - break; - } - } - } else { - fname = (char *)0; - for (cur = new - 1; cur > (struct nlist *)stab->start; cur--) { - if (cur->n_type == N_SO || - (stab->sorted && cur->n_value < new->n_value)) - break; - if (func == 0 && - cur->n_type == N_FUN && - cur->n_value == new->n_value) - func = 1; - } - if (func == 0) - for (cur = new + 1; - cur < (struct nlist *)stab->end; cur++) { - if (cur->n_type == N_SO || - (stab->sorted && cur->n_value > new->n_value)) - break; - if (cur->n_type == N_FUN && - cur->n_value == new->n_value) { - func = 1; - break; - } - } - } - - db_sym_print_completion(stab, &new->n_un.n_name[new->n_other - 1], - func, fname, line); - nsym++; - new->n_other = 0; - - if (new == sp) { - if (sp1 == sp) - break; - sp = sp1; - } else if (new == sp1) - sp1 = sp; - - if (new == ep) - ep = ep1; - } - return nsym; -} - -/* - * search a (possibly incomplete) symbol with file, func and line qualification - */ -private int -aout_db_qualified_search( - db_symtab_t *stab, - char *file, - char *sym, - int line, - db_sym_t *ret, - char **name, - int *len) -{ - register struct nlist *sp = (struct nlist *)stab->start; - struct nlist *ep = (struct nlist *)stab->end; - struct nlist *fp = 0; - struct nlist *found_sp; - unsigned long func_top; - boolean_t in_file; - int nsym = 0; - int i; - char *p; - - if (file == 0 && sym == 0) - return(0); - if (file) { - if ((sp = aout_db_search_name(sp, ep, file, N_TEXT, &fp, 0)) == 0) - return(0); - } - if (sym) { - for (;;) { - sp = aout_db_search_name(sp, ep, sym, (line > 0)? 
N_FUN: 0, &fp, - (ret == (db_sym_t *)0)); - if (sp == 0) - return(nsym); - if (ret) - break; - - if (strncmp(sp->n_un.n_name, sym, strlen(sym)) == 0) - p = sp->n_un.n_name; - else - p = &sp->n_un.n_name[1]; - - if (*name == (char *)0) { - *name = p; - *len = strlen(p); - } else { - for (i = 0; i < *len; i++) - if ((*name)[i] != p[i]) { - *len = i; - break; - } - } - - nsym++; - sp++; - } - } - if (line > 0) { - if (file && !aout_db_eq_name(fp, file, 0)) - return(0); - found_sp = 0; - if (sp->n_type == N_FUN) { - /* - * qualfied by function name - * search backward because line number entries - * for the function are above it in this case. - */ - func_top = sp->n_value; - if (stab->sorted) { - /* symbols with the same value may have been mixed up */ - do { - sp++; - } while (sp->n_value == func_top); - } - for (sp--; sp >= (struct nlist *)stab->start; sp--) { - if (sp->n_type != N_SLINE) - continue; - if (sp->n_value < func_top) - break; - if (sp->n_desc <= line) { - if (found_sp == 0 || found_sp->n_desc < sp->n_desc) - found_sp = sp; - if (sp->n_desc == line) - break; - } - } - if (sp->n_type != N_SLINE || sp->n_value < func_top) - return(0); - } else { - /* - * qualified by only file name - * search forward in this case - */ - in_file = TRUE; - if (stab->sorted) { - /* symbols with the same value may have been mixed up */ - func_top = sp->n_value; - do { - sp--; - } while (sp->n_value == func_top); - } - for (sp++; sp < ep; sp++) { - if (sp->n_type == N_TEXT - && aout_db_is_filename(sp->n_un.n_name)) - break; /* enter into another file */ - if (sp->n_type == N_SOL) { - in_file = aout_db_eq_name(sp, file, 0); - continue; - } - if (!in_file || sp->n_type != N_SLINE) - continue; - if (sp->n_desc <= line) { - if (found_sp == 0 || found_sp->n_desc < sp->n_desc) - found_sp = sp; - if (sp->n_desc == line) - break; - } - } - } - sp = found_sp; - } - *ret = (db_sym_t) sp; - return(1); -} - -/* - * lookup symbol by name - */ -db_sym_t -aout_db_lookup( - db_symtab_t *stab, - char * symstr) -{ - return(db_sym_parse_and_lookup(aout_db_qualified_search, stab, symstr)); -} - -/* - * lookup (possibly incomplete) symbol by name - */ -int -aout_db_lookup_incomplete( - db_symtab_t *stab, - char * symstr, - char ** name, - int *len, - int *toadd) -{ - return(db_sym_parse_and_lookup_incomplete(aout_db_qualified_search, - stab, symstr, name, len, toadd)); -} - -/* - * Display possible completion for the symbol - */ -int -aout_db_print_completion(stab, symstr) - db_symtab_t *stab; - char * symstr; -{ - - return(db_sym_parse_and_print_completion(aout_db_qualified_print_completion, - stab, symstr)); -} - -db_sym_t -aout_db_search_symbol( - db_symtab_t *symtab, - db_addr_t off, - db_strategy_t strategy, - db_expr_t *diffp) /* in/out */ -{ - db_expr_t diff = *diffp; - register struct nlist *symp = 0; - struct nlist *sp, *ep, *cp; - boolean_t first_pass = FALSE; - - sp = (struct nlist *)symtab->start; - ep = (struct nlist *)symtab->end; - - if (symtab->sorted) { - struct nlist target; - - target.n_value = (vm_offset_t)off; - target.n_un.n_name = (char *) 0; - target.n_other = (char) 0; - db_qsort_limit_search((char *)&target, (char **)&sp, (char **)&ep, - sizeof(struct nlist), aout_db_compare_symbols); - first_pass = TRUE; - } - - try_again: - for (cp = ep-1; cp >= sp; cp--) { - if (cp->n_un.n_name == 0) - continue; - if ((cp->n_type & N_STAB) != 0) - continue; - if (strategy == DB_STGY_XTRN && (cp->n_type & N_EXT) == 0) - continue; - if (off >= cp->n_value) { - if (off - cp->n_value < diff) { - diff = off - 
cp->n_value; - symp = cp; - if (diff == 0 && (cp->n_type & N_EXT)) - break; - } - else if (off - cp->n_value == diff) { - if (symp == 0) - symp = cp; - else if ((symp->n_type & N_EXT) == 0 && - (cp->n_type & N_EXT) != 0) - symp = cp; /* pick the external symbol */ - } - } - } - if (symp == 0) { - if (first_pass) { - first_pass = FALSE; - sp = (struct nlist *) symtab->start; - goto try_again; - } - *diffp = off; - } - else { - *diffp = diff; - } - return ((db_sym_t)symp); -} - -/* - * Return the name and value for a symbol. - */ -void -aout_db_symbol_values( - db_sym_t sym, - char **namep, - db_expr_t *valuep) -{ - register struct nlist *sp; - - sp = (struct nlist *)sym; - if (namep) - *namep = sp->n_un.n_name; - if (valuep) - *valuep = sp->n_value; -} - -#define X_DB_MAX_DIFF 8 /* maximum allowable diff at the end of line */ -extern unsigned int db_search_maxoff; /* maximum acceptable offset */ - -/* - * search symbol by value - */ -db_sym_t -aout_db_search_by_addr( - db_symtab_t *stab, - db_addr_t addr, - char **file, - char **func, - int *line, - db_expr_t *diff, - int *args) -{ - struct nlist *sp, *cp; - register struct nlist *line_sp, *func_sp, *file_sp, *line_func; - unsigned long func_diff, line_diff; - boolean_t found_line = FALSE; - struct nlist *ep = (struct nlist *)stab->end; - boolean_t first_pass = FALSE; - - /* - * 92-May-16 - * Added init of these two... not sure if it's correct, but - * can't be worse than random values.... -- jfriedl@omron.co.jp - */ - func_diff = line_diff = /*HUGE*/0x0fffffff; - - line_sp = func_sp = file_sp = line_func = 0; - *file = *func = 0; - *line = 0; - *args = -1; - - sp = (struct nlist *)stab->start; - if (stab->sorted) { - struct nlist target; - - target.n_value = (vm_offset_t)addr; - target.n_un.n_name = (char *) 0; - target.n_other = (char) 0; - db_qsort_limit_search((char *)&target, (char **)&sp, - (char **)&ep, sizeof(struct nlist), - aout_db_compare_symbols); - first_pass = TRUE; - } - - for (cp = sp; cp < ep; cp++) { - switch(cp->n_type) { - case N_SLINE: - if (cp->n_value <= addr) { - if (line_sp == 0 || line_diff >= addr - cp->n_value) { - if (line_func) - line_func = 0; - line_sp = cp; - line_diff = (unsigned long)(addr - cp->n_value); - } - } - if (cp->n_value >= addr && line_sp) - found_line = TRUE; - continue; - case N_FUN: - if ((found_line || (line_sp && line_diff < X_DB_MAX_DIFF)) - && line_func == 0) - line_func = cp; - continue; - case N_SO: - if (cp->n_value > addr) - continue; - if (file_sp == 0 || file_sp->n_value <= cp->n_value) - file_sp = cp; - continue; - case N_TEXT: - if (aout_db_is_filename(cp->n_un.n_name)) { - if (cp->n_value > addr) - continue; - if (file_sp == 0 || file_sp->n_value <= cp->n_value) - file_sp = cp; - } else if (cp->n_value <= addr && - (func_sp == 0 || func_diff > addr - cp->n_value)) { - func_sp = cp; - func_diff = (unsigned long)(addr - cp->n_value); - } - continue; - case N_TEXT|N_EXT: - if (cp->n_value <= addr && - (func_sp == 0 || func_diff >= addr - cp->n_value)) { - func_sp = cp; - func_diff = (unsigned long)(addr - cp->n_value); - if (func_diff == 0 && file_sp && func_sp && line_sp == 0) - break; - } - default: - if (stab->sorted) { - if ((cp->n_value > addr) && - (cp->n_value - addr > db_search_maxoff)) - break; - } - continue; - } - break; - } - if (first_pass && (!file_sp || !line_sp || !func_sp)) { - first_pass = FALSE; - cp = sp; - sp = (struct nlist *)stab->start; - for (; cp >= sp; cp--) { - switch(cp->n_type) { - case N_SLINE: - if (line_sp) - found_line = TRUE; - continue; - case 
N_FUN: - if ((found_line || (line_sp && line_diff < X_DB_MAX_DIFF)) - && line_func == 0) - line_func = cp; - continue; - case N_SO: - if (file_sp == 0) - file_sp = cp; - continue; - case N_TEXT: - if (aout_db_is_filename(cp->n_un.n_name)) { - if (file_sp == 0) - file_sp = cp; - } else if (func_sp == 0) { - func_sp = cp; - func_diff = (unsigned long)(addr - cp->n_value); - } - continue; - case N_TEXT|N_EXT: - if (func_sp == 0) { - func_sp = cp; - func_diff = (unsigned long)(addr - cp->n_value); - if (func_diff == 0 && file_sp && func_sp - && line_sp == 0) - break; - } - default: - if (line_sp && file_sp && - addr - cp->n_value > db_search_maxoff) - break; - continue; - } - break; - } - } -#if 0 -/* - * XXX - barbou@gr.osf.org - * I don't know if that code is useful to something, but it makes the -gline - * option of gcc useless. - */ - if (line_sp) { - if (line_func == 0 || func_sp == 0 - || line_func->n_value != func_sp->n_value) - line_sp = 0; - } -#else - if (line_sp && !found_line) { - line_sp = 0; - } -#endif - *diff = 0; - if (file_sp) { - *diff = addr - file_sp->n_value; - *file = file_sp->n_un.n_name; - } - if (line_sp) { - *diff = addr - line_sp->n_value; - *line = line_sp->n_desc; - } - if (func_sp) { - *diff = addr - func_sp->n_value; - *func = (func_sp->n_un.n_name[0] == '_')? - func_sp->n_un.n_name + 1: func_sp->n_un.n_name; - if (line_func && (line_func->n_desc & 0x4000)) - *args = line_func->n_desc & 0x3ff; - } - return((db_sym_t) func_sp); -} - -/* - * Find filename and lineno within, given the current pc. - */ -boolean_t -aout_db_line_at_pc( - db_symtab_t *stab, - __unused db_sym_t sym, - char **file, - int *line, - db_expr_t pc) -{ - char *func; - db_expr_t diff; - boolean_t found; - int args; - - found = (aout_db_search_by_addr(stab, (unsigned)pc, file, &func, line, - &diff, &args) - != DB_SYM_NULL); - return(found && func && *file); -} - -/* - * Initialization routine for a.out files. - */ -void -aout_db_init(void) -{ - aout_db_sym_init((char *) &_mh_execute_header, - (char *)0, "mach", (char *)0); -} - -#endif /* DB_NO_AOUT */ diff --git a/osfmk/ddb/db_aout.h b/osfmk/ddb/db_aout.h deleted file mode 100644 index 3a1ab3836..000000000 --- a/osfmk/ddb/db_aout.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Symbol table routines for a.out format files. - */ - -#ifndef _DDB_DB_AOUT_H_ -#define _DDB_DB_AOUT_H_ - -#include /* data types */ -#include /* db_symtab_t */ - -boolean_t aout_db_sym_init(char *, char *, const char *, char *); - -db_sym_t aout_db_lookup( - db_symtab_t *stab, - char * symstr); - -int aout_db_lookup_incomplete( - db_symtab_t *stab, - char * symstr, - char ** name, - int *len, - int *toadd); - -int aout_db_print_completion( - db_symtab_t *stab, - char * symstr); - -db_sym_t aout_db_search_symbol( - db_symtab_t *symtab, - db_addr_t off, - db_strategy_t strategy, - db_expr_t *diffp); /* in/out */ - -void aout_db_symbol_values( - db_sym_t sym, - char **namep, - db_expr_t *valuep); - -db_sym_t aout_db_search_by_addr( - db_symtab_t *stab, - db_addr_t addr, - char **file, - char **func, - int *line, - db_expr_t *diff, - int *args); - -boolean_t aout_db_line_at_pc( - db_symtab_t *stab, - db_sym_t sym, - char **file, - int *line, - db_expr_t pc); - -int aout_db_qualified_print_completion( - db_symtab_t *stab, - char *sym); - -void aout_db_init(void); - -#endif /* !_DDB_DB_AOUT_H_ */ diff --git a/osfmk/ddb/db_break.c b/osfmk/ddb/db_break.c deleted file mode 100644 index 38c4e232a..000000000 --- a/osfmk/ddb/db_break.c +++ /dev/null @@ -1,816 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ - -/* - * Breakpoints. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* For db_printf() */ -#include -#include - -#define NBREAKPOINTS 100 -#define NTHREAD_LIST (NBREAKPOINTS*3) - -struct db_breakpoint db_break_table[NBREAKPOINTS]; -db_breakpoint_t db_next_free_breakpoint = &db_break_table[0]; -db_breakpoint_t db_free_breakpoints = 0; -db_breakpoint_t db_breakpoint_list = 0; - -static struct db_thread_breakpoint db_thread_break_list[NTHREAD_LIST]; -static db_thread_breakpoint_t db_free_thread_break_list = 0; -static boolean_t db_thread_break_init = FALSE; -static int db_breakpoint_number = 0; - -/* Prototypes for functions local to this file. XXX -- should be static! - */ -static int db_add_thread_breakpoint( - register db_breakpoint_t bkpt, - vm_offset_t task_thd, - int count, - boolean_t task_bpt); - -static int db_delete_thread_breakpoint( - register db_breakpoint_t bkpt, - vm_offset_t task_thd); - -static db_thread_breakpoint_t db_find_thread_breakpoint( - db_breakpoint_t bkpt, - thread_t thr_act); - -static void db_force_delete_breakpoint( - db_breakpoint_t bkpt, - vm_offset_t task_thd, - boolean_t is_task); - -db_breakpoint_t db_breakpoint_alloc(void); - -void db_breakpoint_free(register db_breakpoint_t bkpt); - -void db_delete_breakpoint( - task_t task, - db_addr_t addr, - vm_offset_t task_thd); - -void -db_delete_all_breakpoints( - task_t task); - -void db_list_breakpoints(void); - - - -db_breakpoint_t -db_breakpoint_alloc(void) -{ - register db_breakpoint_t bkpt; - - if ((bkpt = db_free_breakpoints) != 0) { - db_free_breakpoints = bkpt->link; - return (bkpt); - } - if (db_next_free_breakpoint == &db_break_table[NBREAKPOINTS]) { - db_printf("All breakpoints used.\n"); - return (0); - } - bkpt = db_next_free_breakpoint; - db_next_free_breakpoint++; - - return (bkpt); -} - -void -db_breakpoint_free(register db_breakpoint_t bkpt) -{ - bkpt->link = db_free_breakpoints; - db_free_breakpoints = bkpt; -} - -static int -db_add_thread_breakpoint( - register db_breakpoint_t bkpt, - vm_offset_t task_thd, - int count, - boolean_t task_bpt) -{ - register db_thread_breakpoint_t tp; - - if (db_thread_break_init == FALSE) { - for (tp = db_thread_break_list; - tp < &db_thread_break_list[NTHREAD_LIST-1]; tp++) - tp->tb_next = tp+1; - tp->tb_next = 0; - db_free_thread_break_list = db_thread_break_list; - db_thread_break_init = TRUE; - } - if (db_free_thread_break_list == 0) - return (-1); - tp = db_free_thread_break_list; - db_free_thread_break_list = tp->tb_next; - tp->tb_is_task = task_bpt; - tp->tb_task_thd = task_thd; - tp->tb_count = count; - tp->tb_init_count = count; - tp->tb_cond = 0; - tp->tb_number = ++db_breakpoint_number; - tp->tb_next = bkpt->threads; - bkpt->threads = tp; - return(0); -} - -static int -db_delete_thread_breakpoint( - register db_breakpoint_t bkpt, - vm_offset_t task_thd) -{ - register db_thread_breakpoint_t tp; - register db_thread_breakpoint_t *tpp; - - if (task_thd == 0) { - /* delete all the thread-breakpoints */ - - for (tpp = &bkpt->threads; (tp = *tpp) != 0; tpp = &tp->tb_next) - db_cond_free(tp); - - *tpp = db_free_thread_break_list; - db_free_thread_break_list = bkpt->threads; - bkpt->threads = 0; - return 0; - } else { - /* delete the specified thread-breakpoint */ - - for (tpp = &bkpt->threads; (tp = *tpp) != 0; tpp = &tp->tb_next) - if (tp->tb_task_thd == task_thd) { - db_cond_free(tp); - *tpp = tp->tb_next; - tp->tb_next = db_free_thread_break_list; - db_free_thread_break_list = tp; - return 0; - } - - return -1; 
/* not found */ - } -} - -static db_thread_breakpoint_t -db_find_thread_breakpoint( - db_breakpoint_t bkpt, - thread_t thr_act) -{ - register db_thread_breakpoint_t tp; - register task_t task = - (thr_act == THREAD_NULL) - ? TASK_NULL : thr_act->task; - - for (tp = bkpt->threads; tp; tp = tp->tb_next) { - if (tp->tb_is_task) { - if (tp->tb_task_thd == (vm_offset_t)task) - break; - continue; - } - if (tp->tb_task_thd == (vm_offset_t)thr_act || tp->tb_task_thd == 0) - break; - } - return(tp); -} - -db_thread_breakpoint_t -db_find_thread_breakpoint_here( - task_t task, - db_addr_t addr) -{ - db_breakpoint_t bkpt; - - bkpt = db_find_breakpoint(task, (db_addr_t)addr); - if (bkpt == 0) - return(0); - return(db_find_thread_breakpoint(bkpt, current_thread())); -} - -db_thread_breakpoint_t -db_find_breakpoint_number( - int num, - db_breakpoint_t *bkptp) -{ - register db_thread_breakpoint_t tp; - register db_breakpoint_t bkpt; - - for (bkpt = db_breakpoint_list; bkpt != 0; bkpt = bkpt->link) { - for (tp = bkpt->threads; tp; tp = tp->tb_next) { - if (tp->tb_number == num) { - if (bkptp) - *bkptp = bkpt; - return(tp); - } - } - } - return(0); -} - -static void -db_force_delete_breakpoint( - db_breakpoint_t bkpt, - vm_offset_t task_thd, - boolean_t is_task) -{ - db_printf("deleted a stale breakpoint at "); - if (bkpt->task == TASK_NULL || db_lookup_task(bkpt->task) >= 0) - db_task_printsym(bkpt->address, DB_STGY_PROC, bkpt->task); - else - db_printf("%#X", bkpt->address); - if (bkpt->task) - db_printf(" in task %X", bkpt->task); - if (task_thd) - db_printf(" for %s %X", (is_task)? "task": "thr_act", task_thd); - db_printf("\n"); - db_delete_thread_breakpoint(bkpt, task_thd); -} - -void -db_check_breakpoint_valid(void) -{ - register db_thread_breakpoint_t tbp, tbp_next; - register db_breakpoint_t bkpt, *bkptp; - - bkptp = &db_breakpoint_list; - for (bkpt = *bkptp; bkpt; bkpt = *bkptp) { - if (bkpt->task != TASK_NULL) { - if (db_lookup_task(bkpt->task) < 0) { - db_force_delete_breakpoint(bkpt, 0, FALSE); - *bkptp = bkpt->link; - db_breakpoint_free(bkpt); - continue; - } - } else { - for (tbp = bkpt->threads; tbp; tbp = tbp_next) { - tbp_next = tbp->tb_next; - if (tbp->tb_task_thd == 0) - continue; - if ((tbp->tb_is_task && - db_lookup_task((task_t)(tbp->tb_task_thd)) < 0) || - (!tbp->tb_is_task && - db_lookup_act((thread_t)(tbp->tb_task_thd)) < 0)) { - db_force_delete_breakpoint(bkpt, - tbp->tb_task_thd, tbp->tb_is_task); - } - } - if (bkpt->threads == 0) { - db_put_task_value(bkpt->address, BKPT_SIZE, - bkpt->bkpt_inst, bkpt->task); - *bkptp = bkpt->link; - db_breakpoint_free(bkpt); - continue; - } - } - bkptp = &bkpt->link; - } -} - -void -db_set_breakpoint( - task_t task, - db_addr_t addr, - int count, - thread_t thr_act, - boolean_t task_bpt) -{ - register db_breakpoint_t bkpt; - db_breakpoint_t alloc_bkpt = 0; - vm_offset_t task_thd; - - bkpt = db_find_breakpoint(task, addr); - if (bkpt) { - if (thr_act == THREAD_NULL - || db_find_thread_breakpoint(bkpt, thr_act)) { - db_printf("Already set.\n"); - return; - } - } else { - if (!DB_CHECK_ACCESS((vm_offset_t)addr, BKPT_SIZE, task)) { - if (task) { - db_printf("Warning: non-resident page for breakpoint at %llX", - (unsigned long long)addr); - db_printf(" in task %lX.\n", task); - } else { - db_printf("Cannot set breakpoint at %llX in kernel space.\n", - (unsigned long long)addr); - return; - } - } - alloc_bkpt = bkpt = db_breakpoint_alloc(); - if (bkpt == 0) { - db_printf("Too many breakpoints.\n"); - return; - } - bkpt->task = task; - bkpt->flags = 
(task && thr_act == THREAD_NULL)? - (BKPT_USR_GLOBAL|BKPT_1ST_SET): 0; - bkpt->address = addr; - bkpt->threads = 0; - } - if (db_breakpoint_list == 0) - db_breakpoint_number = 0; - task_thd = (task_bpt) ? (vm_offset_t)(thr_act->task) - : (vm_offset_t)thr_act; - if (db_add_thread_breakpoint(bkpt, task_thd, count, task_bpt) < 0) { - if (alloc_bkpt) - db_breakpoint_free(alloc_bkpt); - db_printf("Too many thread_breakpoints.\n"); - } else { - db_printf("set breakpoint #%x\n", db_breakpoint_number); - if (alloc_bkpt) { - bkpt->link = db_breakpoint_list; - db_breakpoint_list = bkpt; - } - } -} - -void -db_delete_breakpoint( - task_t task, - db_addr_t addr, - vm_offset_t task_thd) -{ - register db_breakpoint_t bkpt; - register db_breakpoint_t *prev; - - for (prev = &db_breakpoint_list; (bkpt = *prev) != 0; - prev = &bkpt->link) { - if ((bkpt->task == task - || (task != TASK_NULL && (bkpt->flags & BKPT_USR_GLOBAL))) - && bkpt->address == addr) - break; - } - if (bkpt && (bkpt->flags & BKPT_SET_IN_MEM)) { - db_printf("cannot delete it now.\n"); - return; - } - if (bkpt == 0 - || db_delete_thread_breakpoint(bkpt, task_thd) < 0) { - db_printf("Not set.\n"); - return; - } - if (bkpt->threads == 0) { - *prev = bkpt->link; - db_breakpoint_free(bkpt); - } -} - -db_breakpoint_t -db_find_breakpoint( - task_t task, - db_addr_t addr) -{ - register db_breakpoint_t bkpt; - - for (bkpt = db_breakpoint_list; bkpt != 0; bkpt = bkpt->link) { - if ((bkpt->task == task - || (task != TASK_NULL && (bkpt->flags & BKPT_USR_GLOBAL))) - && bkpt->address == addr) - return (bkpt); - } - return (0); -} - -boolean_t -db_find_breakpoint_here( - task_t task, - db_addr_t addr) -{ - register db_breakpoint_t bkpt; - - for (bkpt = db_breakpoint_list; bkpt != 0; bkpt = bkpt->link) { - if ((bkpt->task == task - || (task != TASK_NULL && (bkpt->flags & BKPT_USR_GLOBAL))) - && bkpt->address == addr) - return(TRUE); - if ((bkpt->flags & BKPT_USR_GLOBAL) == 0 && - DB_PHYS_EQ(task, (vm_offset_t)addr, bkpt->task, (vm_offset_t)bkpt->address)) - return (TRUE); - } - return(FALSE); -} - -boolean_t db_breakpoints_inserted = TRUE; - -void -db_set_breakpoints(void) -{ - register db_breakpoint_t bkpt; - register task_t task; - db_expr_t inst; - thread_t cur_act = current_thread(); - task_t cur_task = - (cur_act) ? - cur_act->task : TASK_NULL; - boolean_t inserted = TRUE; - - if (!db_breakpoints_inserted) { - for (bkpt = db_breakpoint_list; bkpt != 0; bkpt = bkpt->link) { - if (bkpt->flags & BKPT_SET_IN_MEM) - continue; - task = bkpt->task; - if (bkpt->flags & BKPT_USR_GLOBAL) { - if ((bkpt->flags & BKPT_1ST_SET) == 0) { - if (cur_task == TASK_NULL) - continue; - task = cur_task; - } else - bkpt->flags &= ~BKPT_1ST_SET; - } - if (DB_CHECK_ACCESS((vm_offset_t)bkpt->address, BKPT_SIZE, task)) { - inst = db_get_task_value(bkpt->address, BKPT_SIZE, FALSE, - task); - if (inst == BKPT_SET(inst)) - continue; - bkpt->bkpt_inst = (vm_size_t)inst; - db_put_task_value(bkpt->address, - BKPT_SIZE, - BKPT_SET(bkpt->bkpt_inst), task); - bkpt->flags |= BKPT_SET_IN_MEM; - } else { - inserted = FALSE; - } - } - db_breakpoints_inserted = inserted; - } -} - -void -db_clear_breakpoints(void) -{ - register db_breakpoint_t bkpt, *bkptp; - register task_t task; - db_expr_t inst; - thread_t cur_act = current_thread(); - task_t cur_task = (cur_act) ? 
- cur_act->task: TASK_NULL; - - if (db_breakpoints_inserted) { - bkptp = &db_breakpoint_list; - for (bkpt = *bkptp; bkpt; bkpt = *bkptp) { - task = bkpt->task; - if (bkpt->flags & BKPT_USR_GLOBAL) { - if (cur_task == TASK_NULL) { - bkptp = &bkpt->link; - continue; - } - task = cur_task; - } - if ((bkpt->flags & BKPT_SET_IN_MEM) - && DB_CHECK_ACCESS((vm_offset_t)bkpt->address, BKPT_SIZE, task)) { - inst = db_get_task_value(bkpt->address, BKPT_SIZE, FALSE, - task); - if (inst != BKPT_SET(inst)) { - if (bkpt->flags & BKPT_USR_GLOBAL) { - bkptp = &bkpt->link; - continue; - } - db_force_delete_breakpoint(bkpt, 0, FALSE); - *bkptp = bkpt->link; - db_breakpoint_free(bkpt); - continue; - } - db_put_task_value(bkpt->address, BKPT_SIZE, - bkpt->bkpt_inst, task); - bkpt->flags &= ~BKPT_SET_IN_MEM; - } - bkptp = &bkpt->link; - } - db_breakpoints_inserted = FALSE; - } -} - -/* - * Set a temporary breakpoint. - * The instruction is changed immediately, - * so the breakpoint does not have to be on the breakpoint list. - */ -db_breakpoint_t -db_set_temp_breakpoint( - task_t task, - db_addr_t addr) -{ - register db_breakpoint_t bkpt; - - bkpt = db_breakpoint_alloc(); - if (bkpt == 0) { - db_printf("Too many breakpoints.\n"); - return 0; - } - bkpt->task = task; - bkpt->address = addr; - bkpt->flags = BKPT_TEMP; - bkpt->threads = 0; - if (db_add_thread_breakpoint(bkpt, 0, 1, FALSE) < 0) { - if (bkpt) - db_breakpoint_free(bkpt); - db_printf("Too many thread_breakpoints.\n"); - return 0; - } - bkpt->bkpt_inst = (vm_size_t)db_get_task_value(bkpt->address, BKPT_SIZE, - FALSE, task); - db_put_task_value(bkpt->address, BKPT_SIZE, - BKPT_SET(bkpt->bkpt_inst), task); - return bkpt; -} - -void -db_delete_temp_breakpoint( - task_t task, - db_breakpoint_t bkpt) -{ - db_put_task_value(bkpt->address, BKPT_SIZE, bkpt->bkpt_inst, task); - db_delete_thread_breakpoint(bkpt, 0); - db_breakpoint_free(bkpt); -} - -/* - * List breakpoints. - */ -void -db_list_breakpoints(void) -{ - register db_breakpoint_t bkpt; - - if (db_breakpoint_list == 0) { - db_printf("No breakpoints set\n"); - return; - } - - db_printf(" No Space Task.Act Cnt Address(Cond)\n"); - for (bkpt = db_breakpoint_list; - bkpt != 0; - bkpt = bkpt->link) - { - register db_thread_breakpoint_t tp; - int task_id; - int act_id; - - if (bkpt->threads) { - for (tp = bkpt->threads; tp; tp = tp->tb_next) { - db_printf("%3d ", tp->tb_number); - if (bkpt->flags & BKPT_USR_GLOBAL) - db_printf("user "); - else if (bkpt->task == TASK_NULL) - db_printf("kernel "); - else if ((task_id = db_lookup_task(bkpt->task)) < 0) - db_printf("%0*X ", 2*sizeof(vm_offset_t), bkpt->task); - else - db_printf("task%-3d ", task_id); - if (tp->tb_task_thd == 0) { - db_printf("all "); - } else { - if (tp->tb_is_task) { - task_id = db_lookup_task((task_t)(tp->tb_task_thd)); - if (task_id < 0) - db_printf("%0*X ", 2*sizeof(vm_offset_t), - tp->tb_task_thd); - else - db_printf("task%03d ", task_id); - } else { - thread_t thd = (thread_t)(tp->tb_task_thd); - task_id = db_lookup_task(thd->task); - act_id = db_lookup_task_act(thd->task, thd); - if (task_id < 0 || act_id < 0) - db_printf("%0*X ", 2*sizeof(vm_offset_t), - tp->tb_task_thd); - else - db_printf("task%03d.%-3d ", task_id, act_id); - } - } - db_printf("%3d ", tp->tb_init_count); - db_task_printsym(bkpt->address, DB_STGY_PROC, bkpt->task); - if (tp->tb_cond > 0) { - db_printf("("); - db_cond_print(tp); - db_printf(")"); - } - db_printf("\n"); - } - } else { - if (bkpt->task == TASK_NULL) - db_printf(" ? 
kernel "); - else - db_printf("%*X ", 2*sizeof(vm_offset_t), bkpt->task); - db_printf("(?) "); - db_task_printsym(bkpt->address, DB_STGY_PROC, bkpt->task); - db_printf("\n"); - } - } -} - -void -db_delete_all_breakpoints( - task_t task) -{ - register db_breakpoint_t bkpt; - - bkpt = db_breakpoint_list; - while ( bkpt != 0 ) { - if (bkpt->task == task || - (task != TASK_NULL && (bkpt->flags & BKPT_USR_GLOBAL))) { - db_delete_breakpoint(task, bkpt->address, 0); - bkpt = db_breakpoint_list; - } - else - bkpt = bkpt->link; - - } -} - -/* Delete breakpoint */ -void -db_delete_cmd(void) -{ - register int n; - thread_t thr_act; - vm_offset_t task_thd; - boolean_t user_global = FALSE; - boolean_t task_bpt = FALSE; - boolean_t user_space = FALSE; - boolean_t thd_bpt = FALSE; - db_expr_t addr; - int t; - - t = db_read_token(); - if (t == tSLASH) { - t = db_read_token(); - if (t != tIDENT) { - db_printf("Bad modifier \"%s\"\n", db_tok_string); - db_error(0); - } - user_global = db_option(db_tok_string, 'U'); - user_space = (user_global)? TRUE: db_option(db_tok_string, 'u'); - task_bpt = db_option(db_tok_string, 'T'); - thd_bpt = db_option(db_tok_string, 't'); - if (task_bpt && user_global) - db_error("Cannot specify both 'T' and 'U' option\n"); - t = db_read_token(); - } - - if ( t == tSTAR ) { - db_printf("Delete ALL breakpoints\n"); - db_delete_all_breakpoints( (task_t)task_bpt ); - return; - } - - if (t == tHASH) { - db_thread_breakpoint_t tbp; - db_breakpoint_t bkpt = 0; - - if (db_read_token() != tNUMBER) { - db_printf("Bad break point number #%s\n", db_tok_string); - db_error(0); - } - if ((tbp = db_find_breakpoint_number((int)db_tok_number, &bkpt)) == 0) { - db_printf("No such break point #%d\n", db_tok_number); - db_error(0); - } - db_delete_breakpoint(bkpt->task, bkpt->address, tbp->tb_task_thd); - return; - } - db_unread_token(t); - if (!db_expression(&addr)) { - /* - * We attempt to pick up the user_space indication from db_dot, - * so that a plain "d" always works. - */ - addr = (db_expr_t)db_dot; - if (!user_space && !DB_VALID_ADDRESS(addr, FALSE)) - user_space = TRUE; - } - if (!DB_VALID_ADDRESS(addr, user_space)) { - db_printf("Address %#llX is not in %s space\n", (unsigned long long)addr, - (user_space)? "user": "kernel"); - db_error(0); - } - if (thd_bpt || task_bpt) { - for (n = 0; db_get_next_act(&thr_act, n); n++) { - if (thr_act == THREAD_NULL) - db_error("No active thr_act\n"); - if (task_bpt) { - if (thr_act->task == TASK_NULL) - db_error("No task\n"); - task_thd = (vm_offset_t) (thr_act->task); - } else - task_thd = (user_global)? 0: (vm_offset_t) thr_act; - db_delete_breakpoint(db_target_space(thr_act, user_space), - (db_addr_t)addr, task_thd); - } - } else { - db_delete_breakpoint(db_target_space(THREAD_NULL, user_space), - (db_addr_t)addr, 0); - } -} - -/* Set breakpoint with skip count */ -#include - -void -db_breakpoint_cmd(db_expr_t addr, __unused boolean_t have_addr, db_expr_t count, - char *modif) -{ - register int n; - thread_t thr_act; - boolean_t user_global = db_option(modif, 'U'); - boolean_t task_bpt = db_option(modif, 'T'); - boolean_t user_space; - - if (count == (uint64_t)-1) - count = 1; -#if 0 /* CHECKME */ - if (!task_bpt && db_option(modif,'t')) - task_bpt = TRUE; -#endif - - if (task_bpt && user_global) - db_error("Cannot specify both 'T' and 'U'\n"); - user_space = (user_global)? 
TRUE: db_option(modif, 'u'); - if (user_space && db_access_level < DB_ACCESS_CURRENT) - db_error("User space break point is not supported\n"); - if ((!task_bpt || !user_space) && - !DB_VALID_ADDRESS(addr, user_space)) { - /* if the user has explicitly specified user space, - do not insert a breakpoint into the kernel */ - if (user_space) - db_error("Invalid user space address\n"); - user_space = TRUE; - db_printf("%#llX is in user space\n", (unsigned long long)addr); -#ifdef ppc - db_printf("kernel is from %#X to %#x\n", VM_MIN_KERNEL_ADDRESS, vm_last_addr); -#else - db_printf("kernel is from %#X to %#x\n", VM_MIN_KERNEL_ADDRESS, VM_MAX_KERNEL_ADDRESS); -#endif - } - if (db_option(modif, 't') || task_bpt) { - for (n = 0; db_get_next_act(&thr_act, n); n++) { - if (thr_act == THREAD_NULL) - db_error("No active thr_act\n"); - if (task_bpt && thr_act->task == TASK_NULL) - db_error("No task\n"); - if (db_access_level <= DB_ACCESS_CURRENT && user_space - && thr_act->task != db_current_space()) - db_error("Cannot set break point in inactive user space\n"); - db_set_breakpoint(db_target_space(thr_act, user_space), - (db_addr_t)addr, (int)count, - (user_global)? THREAD_NULL: thr_act, - task_bpt); - } - } else { - db_set_breakpoint(db_target_space(THREAD_NULL, user_space), - (db_addr_t)addr, - (int)count, THREAD_NULL, FALSE); - } -} - -/* list breakpoints */ -void -db_listbreak_cmd(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, __unused char *modif) -{ - db_list_breakpoints(); -} diff --git a/osfmk/ddb/db_break.h b/osfmk/ddb/db_break.h deleted file mode 100644 index f456b2517..000000000 --- a/osfmk/ddb/db_break.h +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:47 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.18.3 1995/01/06 19:10:05 devrcs - * mk6 CR668 - 1.3b26 merge - * 64bit cleanup, prototypes. 
- * [1994/10/14 03:39:52 dwm] - * - * Revision 1.1.18.2 1994/09/23 01:18:04 ezf - * change marker to not FREE - * [1994/09/22 21:09:24 ezf] - * - * Revision 1.1.18.1 1994/06/11 21:11:29 bolinger - * Merge up to NMK17.2. - * [1994/06/11 20:03:39 bolinger] - * - * Revision 1.1.16.1 1994/04/11 09:34:32 bernadat - * Moved db_breakpoint struct declaration from db_break.c - * to here. - * [94/03/16 bernadat] - * - * Revision 1.1.12.2 1994/03/17 22:35:24 dwm - * The infamous name change: thread_activation + thread_shuttle = thread. - * [1994/03/17 21:25:41 dwm] - * - * Revision 1.1.12.1 1994/01/12 17:50:30 dwm - * Coloc: initial restructuring to follow Utah model. - * [1994/01/12 17:13:00 dwm] - * - * Revision 1.1.4.4 1993/07/27 18:26:51 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:10:59 elliston] - * - * Revision 1.1.4.3 1993/06/07 22:06:31 jeffc - * CR9176 - ANSI C violations: trailing tokens on CPP - * directives, extra semicolons after decl_ ..., asm keywords - * [1993/06/07 18:57:06 jeffc] - * - * Revision 1.1.4.2 1993/06/02 23:10:21 jeffc - * Added to OSF/1 R1.3 from NMK15.0. - * [1993/06/02 20:55:49 jeffc] - * - * Revision 1.1 1992/09/30 02:24:12 robert - * Initial revision - * - * $EndLog$ - */ -/* CMU_HIST */ -/* - * Revision 2.6 91/10/09 15:58:03 af - * Revision 2.5.3.1 91/10/05 13:05:04 jeffreyh - * Added db_thread_breakpoint structure, and added task and threads - * field to db_breakpoint structure. Some status flags were also - * added to keep track user space break point correctly. - * [91/08/29 tak] - * - * Revision 2.5.3.1 91/10/05 13:05:04 jeffreyh - * Added db_thread_breakpoint structure, and added task and threads - * field to db_breakpoint structure. Some status flags were also - * added to keep track user space break point correctly. - * [91/08/29 tak] - * - * Revision 2.5 91/05/14 15:32:35 mrt - * Correcting copyright - * - * Revision 2.4 91/02/05 17:06:06 mrt - * Changed to new Mach copyright - * [91/01/31 16:17:10 mrt] - * - * Revision 2.3 90/10/25 14:43:40 rwd - * Added map field to breakpoints. - * [90/10/18 rpd] - * - * Revision 2.2 90/08/27 21:50:00 dbg - * Modularized typedef names. - * [90/08/20 af] - * Add external defintions. - * [90/08/07 dbg] - * Created. - * [90/07/25 dbg] - * - */ -/* CMU_ENDHIST */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. 
Golub, Carnegie Mellon University - * Date: 7/90 - */ -#ifndef _DDB_DB_BREAK_H_ -#define _DDB_DB_BREAK_H_ - -#include -#include -#include -#include - -/* - * thread list at the same breakpoint address - */ -struct db_thread_breakpoint { - vm_offset_t tb_task_thd; /* target task or thread */ - boolean_t tb_is_task; /* task qualified */ - short tb_number; /* breakpoint number */ - short tb_init_count; /* skip count(initial value) */ - short tb_count; /* current skip count */ - short tb_cond; /* break condition */ - struct db_thread_breakpoint *tb_next; /* next chain */ -}; -typedef struct db_thread_breakpoint *db_thread_breakpoint_t; - -/* - * Breakpoint. - */ -struct db_breakpoint { - task_t task; /* target task */ - db_addr_t address; /* set here */ - db_thread_breakpoint_t threads; /* thread */ - int flags; /* flags: */ -#define BKPT_SINGLE_STEP 0x2 /* to simulate single step */ -#define BKPT_TEMP 0x4 /* temporary */ -#define BKPT_USR_GLOBAL 0x8 /* global user space break point */ -#define BKPT_SET_IN_MEM 0x10 /* break point is set in memory */ -#define BKPT_1ST_SET 0x20 /* 1st time set of user global bkpt */ - vm_size_t bkpt_inst; /* saved instruction at bkpt */ - struct db_breakpoint *link; /* link in in-use or free chain */ -}; - -typedef struct db_breakpoint *db_breakpoint_t; - - -/* - * Prototypes for functions exported by this module. - */ - -db_thread_breakpoint_t db_find_thread_breakpoint_here( - task_t task, - db_addr_t addr); - -void db_check_breakpoint_valid(void); - -void db_set_breakpoint( - task_t task, - db_addr_t addr, - int count, - thread_t thr_act, - boolean_t task_bpt); - -db_breakpoint_t db_find_breakpoint( - task_t task, - db_addr_t addr); - -boolean_t db_find_breakpoint_here( - task_t task, - db_addr_t addr); - -db_thread_breakpoint_t db_find_breakpoint_number( - int num, - db_breakpoint_t *bkptp); - -void db_set_breakpoints(void); - -void db_clear_breakpoints(void); - -db_breakpoint_t db_set_temp_breakpoint( - task_t task, - db_addr_t addr); - -void db_delete_temp_breakpoint( - task_t task, - db_breakpoint_t bkpt); - -void db_delete_cmd(void); - -void db_breakpoint_cmd( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif); - -void db_listbreak_cmd(db_expr_t, boolean_t, db_expr_t, char *); - -#endif /* !_DDB_DB_BREAK_H_ */ diff --git a/osfmk/ddb/db_coff.h b/osfmk/ddb/db_coff.h deleted file mode 100644 index 57f3b7ddb..000000000 --- a/osfmk/ddb/db_coff.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - * - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:47 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.2.1 1995/02/23 16:34:08 alanl - * Initial file creation. - * [95/02/06 sjs] - * - * $EndLog$ - */ - -#ifndef _DDB_DB_COFF_H_ -#define _DDB_DB_COFF_H_ - -#define DB_NO_AOUT 1 - - -/* - * Symbol table routines for COFF format files. - */ - -boolean_t coff_db_sym_init( - char * symtab, - char * esymtab, - const char * name, - char * task_addr); - -db_sym_t coff_db_lookup( - db_symtab_t *stab, - char * symstr); - -int coff_db_lookup_incomplete( - db_symtab_t *stab, - char * symstr, - char ** name, - int *len, - int *toadd); - -int coff_db_print_completion( - db_symtab_t *stab, - char * symstr); - -db_sym_t coff_db_search_symbol( - db_symtab_t *symtab, - db_addr_t off, - db_strategy_t strategy, - db_expr_t *diffp); /* in/out */ - -void coff_db_symbol_values( - db_sym_t sym, - char **namep, - db_expr_t *valuep); - -db_sym_t coff_db_search_by_addr( - db_symtab_t *stab, - db_addr_t addr, - char **file, - char **func, - int *line, - db_expr_t *diff, - int *args); - -boolean_t coff_db_line_at_pc( - db_symtab_t *stab, - db_sym_t sym, - char **file, - int *line, - db_expr_t pc); - -int coff_db_qualified_print_completion( - db_symtab_t *stab, - char *sym); - -void coff_db_init(void); - -#endif /* !_DDB_DB_COFF_H_ */ diff --git a/osfmk/ddb/db_command.c b/osfmk/ddb/db_command.c deleted file mode 100644 index 7e21b12b4..000000000 --- a/osfmk/ddb/db_command.c +++ /dev/null @@ -1,930 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ - -/* - * Command dispatcher. - */ -#include - -#include -#include -#include - -#if defined(__alpha) -# include -# if KDEBUG -# include -# endif -#endif /* defined(__alpha) */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include /* For db_stack_trace_cmd(). */ -#include /* For db_show_one_zone, db_show_all_zones. */ -#include /* For db_show_all_slocks(). */ - -#if NORMA_VM -#include -#endif /* NORMA_VM */ - -/* - * Exported global variables - */ -boolean_t db_cmd_loop_done; -jmp_buf_t *db_recover = 0; -db_addr_t db_dot; -db_addr_t db_last_addr; -db_addr_t db_prev; -db_addr_t db_next; - -/* - * if 'ed' style: 'dot' is set at start of last item printed, - * and '+' points to next line. - * Otherwise: 'dot' points to next item, '..' points to last. - */ -boolean_t db_ed_style = TRUE; - -/* - * Results of command search. - */ -#define CMD_UNIQUE 0 -#define CMD_FOUND 1 -#define CMD_NONE 2 -#define CMD_AMBIGUOUS 3 -#define CMD_HELP 4 - -/* Prototypes for functions local to this file. XXX -- should be static! - */ - -void db_command( - struct db_command **last_cmdp, /* IN_OUT */ - db_expr_t *last_countp, /* IN_OUT */ - char *last_modifp, /* IN_OUT */ - struct db_command *cmd_table); - -void db_help_cmd(void); - -void db_fncall(void); - -void db_cmd_list(struct db_command *table); - -int db_cmd_search( - char * name, - struct db_command * table, - struct db_command ** cmdp); /* out */ - -void db_command_list( - struct db_command **last_cmdp, /* IN_OUT */ - db_expr_t *last_countp, /* IN_OUT */ - char *last_modifp, /* IN_OUT */ - struct db_command *cmd_table); - -/* - * Search for command prefix. 
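- * Result semantics (a summary of the CMD_* codes above, plus a
- * hypothetical call shown only for illustration): CMD_UNIQUE means an
- * exact name match, CMD_FOUND a unique prefix match, CMD_AMBIGUOUS a
- * prefix shared by several entries, and CMD_HELP a prefix of "help".
- *
- *	struct db_command *cmd;
- *	if (db_cmd_search("br", db_command_table, &cmd) == CMD_FOUND)
- *		(*cmd->fcn)(addr, have_addr, count, modif);	/* "break" */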
- */ -int -db_cmd_search( - char * name, - struct db_command * table, - struct db_command ** cmdp) /* out */ -{ - struct db_command *cmd; - int result = CMD_NONE; - - for (cmd = table; cmd->name != 0; cmd++) { - register char *lp; - const char *rp; - register int c; - - lp = name; - rp = cmd->name; - while ((c = *lp) == *rp) { - if (c == 0) { - /* complete match */ - *cmdp = cmd; - return (CMD_UNIQUE); - } - lp++; - rp++; - } - if (c == 0) { - /* end of name, not end of command - - partial match */ - if (result == CMD_FOUND) { - result = CMD_AMBIGUOUS; - /* but keep looking for a full match - - this lets us match single letters */ - } - else { - *cmdp = cmd; - result = CMD_FOUND; - } - } - } - if (result == CMD_NONE) { - /* check for 'help' */ - if (!strncmp(name, "help", strlen(name))) - result = CMD_HELP; - } - return (result); -} - -void -db_cmd_list(struct db_command *table) -{ - struct db_command *new; - struct db_command *old; - struct db_command *cur; - unsigned int l; - unsigned int len; - - len = 1; - for (cur = table; cur->name != 0; cur++) - if ((l = strlen(cur->name)) >= len) - len = l + 1; - - old = (struct db_command *)0; - for (;;) { - new = (struct db_command *)0; - for (cur = table; cur->name != 0; cur++) - if ((new == (struct db_command *)0 || - strncmp(cur->name, new->name, strlen(cur->name)) < 0) && - (old == (struct db_command *)0 || - strncmp(cur->name, old->name, strlen(cur->name)) > 0)) - new = cur; - if (new == (struct db_command *)0) - return; - db_reserve_output_position(len); - db_printf("%-*s", len, new->name); - old = new; - } -} - -void -db_command( - struct db_command **last_cmdp, /* IN_OUT */ - db_expr_t *last_countp, /* IN_OUT */ - char *last_modifp, /* IN_OUT */ - struct db_command *cmd_table) -{ - struct db_command *cmd; - int t; - char modif[TOK_STRING_SIZE]; - char *modifp = &modif[0]; - db_expr_t addr, count; - boolean_t have_addr = FALSE; - int result; - - t = db_read_token(); - if (t == tEOL || t == tSEMI_COLON) { - /* empty line repeats last command, at 'next' */ - cmd = *last_cmdp; - count = *last_countp; - modifp = last_modifp; - addr = (db_expr_t)db_next; - have_addr = FALSE; - if (t == tSEMI_COLON) - db_unread_token(t); - } - else if (t == tEXCL) { - db_fncall(); - return; - } - else if (t != tIDENT) { - db_printf("?\n"); - db_flush_lex(); - return; - } - else { - /* - * Search for command - */ - while (cmd_table) { - result = db_cmd_search(db_tok_string, - cmd_table, - &cmd); - switch (result) { - case CMD_NONE: - if (db_exec_macro(db_tok_string) == 0) - return; - db_printf("No such command \"%s\"\n", db_tok_string); - db_flush_lex(); - return; - case CMD_AMBIGUOUS: - db_printf("Ambiguous\n"); - db_flush_lex(); - return; - case CMD_HELP: - db_cmd_list(cmd_table); - db_flush_lex(); - return; - default: - break; - } - if ((cmd_table = cmd->more) != 0) { - t = db_read_token(); - if (t != tIDENT) { - db_cmd_list(cmd_table); - db_flush_lex(); - return; - } - } - } - - if ((cmd->flag & CS_OWN) == 0) { - /* - * Standard syntax: - * command [/modifier] [addr] [,count] - */ - t = db_read_token(); - if (t == tSLASH) { - t = db_read_token(); - if (t != tIDENT) { - db_printf("Bad modifier \"/%s\"\n", db_tok_string); - db_flush_lex(); - return; - } - strlcpy(modif, db_tok_string, TOK_STRING_SIZE); - } - else { - db_unread_token(t); - modif[0] = '\0'; - } - - if (db_expression(&addr)) { - db_dot = (db_addr_t) addr; - db_last_addr = db_dot; - have_addr = TRUE; - } - else { - addr = (db_expr_t) db_dot; - have_addr = FALSE; - } - t = db_read_token(); - if (t 
== tCOMMA) { - if (!db_expression(&count)) { - db_printf("Count missing after ','\n"); - db_flush_lex(); - return; - } - } - else { - db_unread_token(t); - count = -1; - } - } - } - if (cmd != 0) { - /* - * Execute the command. - */ - (*cmd->fcn)(addr, have_addr, count, modifp); - - if (cmd->flag & CS_SET_DOT) { - /* - * If command changes dot, set dot to - * previous address displayed (if 'ed' style). - */ - if (db_ed_style) { - db_dot = db_prev; - } - else { - db_dot = db_next; - } - } - else { - /* - * If command does not change dot, - * set 'next' location to be the same. - */ - db_next = db_dot; - } - } - *last_cmdp = cmd; - *last_countp = count; - strlcpy(last_modifp, modifp, TOK_STRING_SIZE); -} - -void -db_command_list( - struct db_command **last_cmdp, /* IN_OUT */ - db_expr_t *last_countp, /* IN_OUT */ - char *last_modifp, /* IN_OUT */ - struct db_command *cmd_table) -{ - do { - db_command(last_cmdp, last_countp, last_modifp, cmd_table); - db_skip_to_eol(); - } while (db_read_token() == tSEMI_COLON && db_cmd_loop_done == 0); -} - - -extern void db_system_stats(void); - -struct db_command db_show_all_cmds[] = { - { - .name = "acts", - .fcn = db_show_all_acts, - }, - { - .name = "spaces", - .fcn = db_show_all_spaces, - }, - { - .name = "tasks", - .fcn = db_show_all_acts, - }, - /* temporary alias for sanity preservation */ - { - .name ="threads", - db_show_all_acts, - }, - { - .name = "zones", - .fcn = db_show_all_zones, - }, - { - .name = "vmtask", - .fcn = db_show_all_task_vm, - }, - { - .name = (const char *)NULL, - }, -}; - -/* XXX */ - -extern void db_show_thread_log(void); -extern void db_show_etap_log(db_expr_t, int, db_expr_t, char *); - -struct db_command db_show_cmds[] = { - { - .name = "all", - .more = db_show_all_cmds - }, - { - .name = "registers", - .fcn = db_show_regs, - }, - { - .name = "variables", - .fcn = db_show_variable, - .flag = CS_OWN, - }, - { - .name = "breaks", - .fcn = db_listbreak_cmd, - }, - { - .name = "watches", - .fcn = db_listwatch_cmd, - }, - { - .name = "task", - .fcn = db_show_one_task, - }, - { - .name = "act", - .fcn = db_show_one_act, - }, - { - .name = "shuttle", - .fcn = db_show_shuttle, - }, -#if 0 - { - .name = "thread", - .fcn = db_show_one_thread, - }, -#endif - { - .name = "vmtask", - .fcn = db_show_one_task_vm, - }, - { - .name = "macro", - .fcn = (db_func)db_show_macro, - .flag = CS_OWN, - }, - { - .name = "runq", - .fcn = (db_func)db_show_runq, - }, - { - .name = "map", - .fcn = (db_func)vm_map_print, - }, - { - .name = "object", - .fcn = vm_object_print, - }, - { - .name = "page", - .fcn = (db_func)vm_page_print, - }, - { - .name = "copy", - .fcn = (db_func)vm_map_copy_print, - }, - { - .name = "port", - .fcn = (db_func)ipc_port_print, - }, - { - .name = "pset", - .fcn = (db_func)ipc_pset_print, - }, - { - .name = "kmsg", - .fcn = (db_func)ipc_kmsg_print, - }, - { - .name = "msg", - .fcn = (db_func)ipc_msg_print, - }, - { - .name = "ipc_port", - .fcn = db_show_port_id, - }, -#if NORMA_VM - { - .name = "xmm_obj", - .fcn = (db_func)xmm_obj_print, - }, - { - .name = "xmm_reply", - .fcn = (db_func)xmm_reply_print, - }, -#endif /* NORMA_VM */ - { - .name = "space", - .fcn = db_show_one_space, - }, - { - .name = "system", - .fcn = (db_func)db_system_stats, - }, - { - .name = "zone", - .fcn = db_show_one_zone, - }, - { - .name = "lock", - .fcn = (db_func)db_show_one_lock, - }, - { - .name = "simple_lock", - .fcn = (db_func)db_show_one_simple_lock, - }, - { - .name = "thread_log", - (db_func)db_show_thread_log, - }, - { - .name = "shuttle", 
- .fcn = db_show_shuttle, - }, - { - .name = (const char *)NULL, - }, -}; - -#define db_switch_cpu kdb_on - -struct db_command db_command_table[] = { -#if DB_MACHINE_COMMANDS - /* this must be the first entry, if it exists */ - { - .name = "machine", - }, -#endif /* DB_MACHINE_COMMANDS */ - { - .name = "print", - .fcn = (db_func)db_print_cmd, - .flag = CS_OWN, - }, - { - .name = "examine", - .fcn = db_examine_cmd, - .flag = CS_MORE|CS_SET_DOT, - }, - { - .name = "x", - .fcn = db_examine_cmd, - .flag = CS_MORE|CS_SET_DOT, - }, - { - .name = "xf", - .fcn = db_examine_forward, - .flag = CS_SET_DOT, - }, - { - .name = "xb", - .fcn = db_examine_backward, - .flag = CS_SET_DOT, - }, - { - .name = "search", - .fcn = (db_func)db_search_cmd, - .flag = CS_OWN|CS_SET_DOT, - }, - { - .name = "set", - .fcn = (db_func)db_set_cmd, - .flag = CS_OWN, - }, - { - .name = "write", - .fcn = db_write_cmd, - .flag = CS_MORE|CS_SET_DOT, - }, - { - .name = "w", - .fcn = db_write_cmd, - .flag = CS_MORE|CS_SET_DOT, - }, - { - .name = "delete", - .fcn = (db_func)db_delete_cmd, - .flag = CS_OWN, - }, - { - .name = "d", - .fcn = (db_func)db_delete_cmd, - .flag = CS_OWN, - }, - { - .name = "break", - .fcn = db_breakpoint_cmd, - .flag = CS_MORE, - }, - { - .name = "dwatch", - .fcn = db_deletewatch_cmd, - .flag = CS_MORE, - }, - { - .name = "watch", - .fcn = db_watchpoint_cmd, - .flag = CS_MORE, - }, - { - .name = "step", - .fcn = db_single_step_cmd, - }, - { - .name = "s", - .fcn = db_single_step_cmd, - }, - { - .name = "continue", - .fcn = db_continue_cmd, - }, - { - .name = "c", - .fcn = db_continue_cmd, - }, - { - .name = "gdb", - .fcn = db_continue_gdb, - }, - { - .name = "until", - .fcn = db_trace_until_call_cmd, - }, - - /* As per request of DNoveck, CR1550, leave this disabled */ -#if 0 /* until CR1440 is fixed, to avoid toe-stubbing */ - { - .name = "next", - .fcn = db_trace_until_matching_cmd, - }, -#endif - { - .name = "match", - .fcn = db_trace_until_matching_cmd, - }, - { - .name = "trace", - .fcn = db_stack_trace_cmd, - }, - { - .name = "cond", - .fcn = (db_func)db_cond_cmd, - .flag = CS_OWN, - }, - { - .name = "call", - .fcn = (db_func)db_fncall, - .flag = CS_OWN, - }, - { - .name = "macro", - .fcn = (db_func)db_def_macro_cmd, - .flag = CS_OWN, - }, - { - .name = "dmacro", - .fcn = (db_func)db_del_macro_cmd, - .flag = CS_OWN, - }, - { - .name = "show", - .more = db_show_cmds - }, - { - .name = "cpu", - .fcn = (db_func)db_switch_cpu, - }, - { - .name = "dr", - .fcn = db_display_real, - .flag = CS_MORE|CS_SET_DOT, - }, - { - .name = "di", - .fcn = db_display_iokit, - .flag = CS_MORE, - }, - { - .name = "dk", - .fcn = db_display_kmod, - .flag = CS_MORE, - }, - - { - .name = "reboot", - (db_func)db_reboot, - }, - { - .name = "ms", - .fcn = db_msr, - .flag = CS_MORE, - }, - { - .name = "cp", - .fcn = db_cpuid, - .flag = CS_MORE, - }, - { - .name = "da", - .fcn = db_apic, - .flag = CS_MORE, - }, - { - .name = (const char *)NULL, - }, -}; - -/* this function should be called to install the machine dependent - commands. 
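(It fills in the "machine" slot reserved at db_command_table[0] when DB_MACHINE_COMMANDS is configured; a platform passes its own sub-table, e.g. db_machine_commands_install(my_md_cmds), where my_md_cmds is a hypothetical platform-defined struct db_command array.)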
It should be called before the debugger is enabled */ -void db_machine_commands_install(struct db_command *ptr) -{ - db_command_table[0].more = ptr; - return; -} - - -struct db_command *db_last_command = 0; -db_expr_t db_last_count = 0; -char db_last_modifier[TOK_STRING_SIZE] = { '\0' }; - -void -db_help_cmd(void) -{ - struct db_command *cmd = db_command_table; - - while (cmd->name != 0) { - db_printf("%-12s", cmd->name); - db_end_line(); - cmd++; - } -} - -int (*ddb_display)(void); - -extern int db_output_line; -extern int db_macro_level; - -void -db_command_loop(void) -{ - jmp_buf_t db_jmpbuf; - jmp_buf_t *prev = db_recover; - - /* - * Initialize 'prev' and 'next' to dot. - */ - db_prev = db_dot; - db_next = db_dot; - - if (ddb_display) - (*ddb_display)(); - - db_cmd_loop_done = 0; - while (!db_cmd_loop_done) { - (void) _setjmp(db_recover = &db_jmpbuf); - db_macro_level = 0; - if (db_print_position() != 0) - db_printf("\n"); - db_output_line = 0; - db_indent = 0; - db_reset_more(); - db_output_prompt(); - - (void) db_read_line("!!"); - db_command_list(&db_last_command, &db_last_count, - db_last_modifier, db_command_table); - } - - db_recover = prev; -} - -boolean_t -db_exec_cmd_nest( - const char *cmd, - int size) -{ - struct db_lex_context lex_context; - - db_cmd_loop_done = 0; - if (cmd) { - db_save_lex_context(&lex_context); - db_switch_input(cmd, size); - } - db_command_list(&db_last_command, &db_last_count, - db_last_modifier, db_command_table); - if (cmd) - db_restore_lex_context(&lex_context); - return(db_cmd_loop_done == 0); -} - -void -db_error(const char *s) -{ - db_macro_level = 0; - if (db_recover) { - if (s > (char *)1) - db_printf(s); - db_flush_lex(); - _longjmp(db_recover, (s == (char *)1) ? 2 : 1); - } - else - { - if (s > (char *)1) - db_printf(s); - panic("db_error"); - } -} - - -/* - * Call random function: - * !expr(arg,arg,arg) - */ -void -db_fncall(void) -{ - db_expr_t fn_addr; -#define MAXARGS 11 - uint32_t args[MAXARGS]; - db_expr_t argwork; - int nargs = 0; - uint32_t retval; - uint32_t (*func)(uint32_t, ...); - int t; - - if (!db_expression(&fn_addr)) { - db_printf("Bad function \"%s\"\n", db_tok_string); - db_flush_lex(); - return; - } - func = (uint32_t (*) (uint32_t, ...))(unsigned long)fn_addr; - - t = db_read_token(); - if (t == tLPAREN) { - if (db_expression(&argwork)) { - args[nargs] = (uint32_t)argwork; - nargs++; - while ((t = db_read_token()) == tCOMMA) { - if (nargs == MAXARGS) { - db_printf("Too many arguments\n"); - db_flush_lex(); - return; - } - if (!db_expression(&argwork)) { - db_printf("Argument missing\n"); - db_flush_lex(); - return; - } - args[nargs] = (uint32_t)argwork; - nargs++; - } - db_unread_token(t); - } - if (db_read_token() != tRPAREN) { - db_printf("?\n"); - db_flush_lex(); - return; - } - } - while (nargs < MAXARGS) { - args[nargs++] = 0; - } - - retval = (*func)(args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9] ); - db_printf(" %#n\n", retval); -} - -boolean_t -db_option( - const char *modif, - int option) -{ - const char *p; - - for (p = modif; *p; p++) - if (*p == option) - return(TRUE); - return(FALSE); -} diff --git a/osfmk/ddb/db_command.h b/osfmk/ddb/db_command.h deleted file mode 100644 index 102dd97ea..000000000 --- a/osfmk/ddb/db_command.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ -/* - * Command loop declarations. - */ - -#ifndef _DDB_DB_COMMAND_H_ -#define _DDB_DB_COMMAND_H_ - -#include -#include - -typedef void (*db_func)(db_expr_t, boolean_t, db_expr_t, char *); - -/* - * Command table - */ -struct db_command { - const char *name; /* command name */ - db_func fcn; /* function to call */ - int flag; /* extra info: */ -#define CS_OWN 0x1 /* non-standard syntax */ -#define CS_MORE 0x2 /* standard syntax, but may have other - words at end */ -#define CS_SET_DOT 0x100 /* set dot after command */ - struct db_command *more; /* another level of command */ -}; - - -extern db_addr_t db_dot; /* current location */ -extern db_addr_t db_last_addr; /* last explicit address typed */ -extern db_addr_t db_prev; /* last address examined - or written */ -extern db_addr_t db_next; /* next address to be examined - or written */ - - -/* Prototypes for functions exported by this module. 
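- * A minimal usage sketch (an assumption about the intended call
- * pattern, not text from the original header): the trap handler
- * enters db_command_loop(), while scripted evaluation -- breakpoint
- * conditions, macros -- re-enters through db_exec_cmd_nest():
- *
- *	if (!db_exec_cmd_nest("show registers", 14))
- *		return;		/* nested loop was told to continue */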
- */ - -void db_command_loop(void); - -void db_machine_commands_install(struct db_command *ptr); - -boolean_t db_exec_cmd_nest( - const char *cmd, - int size); - -void db_error(const char *s); - -boolean_t db_option( - const char *modif, - int option); - -#endif /* !_DDB_DB_COMMAND_H_ */ diff --git a/osfmk/ddb/db_cond.c b/osfmk/ddb/db_cond.c deleted file mode 100644 index 3209a22c8..000000000 --- a/osfmk/ddb/db_cond.c +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:47 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.2.18.1 1997/03/27 18:46:29 barbou - * ri-osc CR1558: enable use of breakpoint counts even when no - * condition given. - * [1995/09/20 15:24:24 bolinger] - * [97/02/25 barbou] - * - * Revision 1.2.6.2 1996/01/09 19:15:34 devrcs - * Change 'register c' to 'register int c'. - * [1995/12/01 21:42:00 jfraser] - * - * Merged '64-bit safe' changes from DEC alpha port. - * [1995/11/21 18:02:54 jfraser] - * - * Revision 1.2.6.1 1994/09/23 01:18:27 ezf - * change marker to not FREE - * [1994/09/22 21:09:37 ezf] - * - * Revision 1.2.2.4 1993/08/11 20:37:33 elliston - * Add ANSI Prototypes. CR #9523. - * [1993/08/11 03:32:57 elliston] - * - * Revision 1.2.2.3 1993/07/27 18:26:59 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:11:12 elliston] - * - * Revision 1.2.2.2 1993/06/09 02:19:53 gm - * Added to OSF/1 R1.3 from NMK15.0. - * [1993/06/02 20:56:04 jeffc] - * - * Revision 1.2 1993/04/19 16:01:51 devrcs - * Changes from mk78: - * Changed errant call of db_error in db_cond_cmd() to db_printf/db_error. - * [92/05/20 jfriedl] - * [93/02/02 bruel] - * - * Revision 1.1 1992/09/30 02:00:58 robert - * Initial revision - * - * $EndLog$ - */ -/* CMU_HIST */ -/* - * Revision 2.2 91/10/09 15:59:09 af - * Revision 2.1.3.1 91/10/05 13:05:38 jeffreyh - * Created to support conditional break point and command execution. - * [91/08/29 tak] - * - * Revision 2.1.3.1 91/10/05 13:05:38 jeffreyh - * Created to support conditional break point and command execution. 
- * [91/08/29 tak] - * - */ -/* CMU_ENDHIST */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include /* For db_printf() */ - -#define DB_MAX_COND 10 /* maximum conditions to be set */ - -int db_ncond_free = DB_MAX_COND; /* free condition */ -struct db_cond { - int c_size; /* size of cond */ - char c_cond_cmd[DB_LEX_LINE_SIZE]; /* cond & cmd */ -} db_cond[DB_MAX_COND]; - -void -db_cond_free(db_thread_breakpoint_t bkpt) -{ - if (bkpt->tb_cond > 0) { - db_cond[bkpt->tb_cond-1].c_size = 0; - db_ncond_free++; - bkpt->tb_cond = 0; - } -} - -extern jmp_buf_t *db_recover; - -boolean_t -db_cond_check(db_thread_breakpoint_t bkpt) -{ - register struct db_cond *cp; - db_expr_t value; - int t; - jmp_buf_t db_jmpbuf; - - if (bkpt->tb_cond <= 0) { /* no condition */ - if (--(bkpt->tb_count) > 0) - return(FALSE); - bkpt->tb_count = bkpt->tb_init_count; - return(TRUE); - } - db_dot = PC_REGS(DDB_REGS); - db_prev = db_dot; - db_next = db_dot; - if (_setjmp(db_recover = &db_jmpbuf)) { - /* - * in case of error, return true to enter interactive mode - */ - return(TRUE); - } - - /* - * switch input, and evalutate condition - */ - cp = &db_cond[bkpt->tb_cond - 1]; - db_switch_input(cp->c_cond_cmd, cp->c_size); - if (!db_expression(&value)) { - db_printf("error: condition evaluation error\n"); - return(TRUE); - } - if (value == 0 || --(bkpt->tb_count) > 0) - return(FALSE); - - /* - * execute a command list if exist - */ - bkpt->tb_count = bkpt->tb_init_count; - if ((t = db_read_token()) != tEOL) { - db_unread_token(t); - return(db_exec_cmd_nest(0, 0)); - } - return(TRUE); -} - -void -db_cond_print(db_thread_breakpoint_t bkpt) -{ - register char *p, *ep; - register struct db_cond *cp; - - if (bkpt->tb_cond <= 0) - return; - cp = &db_cond[bkpt->tb_cond-1]; - p = cp->c_cond_cmd; - ep = p + cp->c_size; - while (p < ep) { - if (*p == '\n' || *p == 0) - break; - db_putchar(*p++); - } -} - -void -db_cond_cmd(void) -{ - register int c; - register struct db_cond *cp; - register char *p; - db_expr_t value; - db_thread_breakpoint_t bkpt; - - if (db_read_token() != tHASH || db_read_token() != tNUMBER) { - db_printf("# expected instead of \"%s\"\n", db_tok_string); - db_error(0); - return; - } - if ((bkpt = db_find_breakpoint_number((int)db_tok_number, 0)) == 0) { - db_printf("No such break point #%d\n", db_tok_number); - db_error(0); - return; - } - /* - * if the break point already has a condition, free it first - */ - if (bkpt->tb_cond > 0) { - cp = 
&db_cond[bkpt->tb_cond - 1]; - db_cond_free(bkpt); - } else { - if (db_ncond_free <= 0) { - db_error("Too many conditions\n"); - return; - } - for (cp = db_cond; cp < &db_cond[DB_MAX_COND]; cp++) - if (cp->c_size == 0) - break; - if (cp >= &db_cond[DB_MAX_COND]) - panic("bad db_cond_free"); - } - for (c = db_read_char(); c == ' ' || c == '\t'; c = db_read_char()); - for (p = cp->c_cond_cmd; c >= 0; c = db_read_char()) - *p++ = c; - /* - * switch to saved data and call db_expression to check the condition. - * If no condition is supplied, db_expression will return false. - * In this case, clear previous condition of the break point. - * If condition is supplied, set the condition to the permanent area. - * Note: db_expression will not return here, if the condition - * expression is wrong. - */ - db_switch_input(cp->c_cond_cmd, p - cp->c_cond_cmd); - if (!db_expression(&value)) { - /* since condition is already freed, do nothing */ - db_flush_lex(); - return; - } - db_flush_lex(); - db_ncond_free--; - cp->c_size = p - cp->c_cond_cmd; - bkpt->tb_cond = (cp - db_cond) + 1; -} diff --git a/osfmk/ddb/db_examine.c b/osfmk/ddb/db_examine.c deleted file mode 100644 index 6ed841857..000000000 --- a/osfmk/ddb/db_examine.c +++ /dev/null @@ -1,747 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
- * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ -#include /* For strlcpy() */ -#include -#include - -#include -#include -#include -#include -#include -#include -#include /* For db_option() */ -#include -#include -#include -#include -#include - -#define db_act_to_task(thr_act) ((thr_act)? thr_act->task: TASK_NULL) - -char db_examine_format[TOK_STRING_SIZE] = "x"; -int db_examine_count = 1; -db_addr_t db_examine_prev_addr = 0; -thread_t db_examine_act = THREAD_NULL; - -extern int db_max_width; - - -/* Prototypes for functions local to this file. XXX -- should be static! - */ -int db_xcdump( - db_addr_t addr, - int size, - int count, - task_t task); - -int db_examine_width( - int size, - int *items, - int *remainder); - -extern char db_last_modifier[]; - -/* - * Examine (print) data. - */ -void -db_examine_cmd(db_expr_t addr, __unused boolean_t have_addr, db_expr_t count, - char *modif) -{ - thread_t thr_act; - - if (modif[0] != '\0') - strlcpy(db_examine_format, modif, TOK_STRING_SIZE); - - if (count == (db_expr_t)-1) - count = 1; - db_examine_count = (int)count; - if (db_option(modif, 't')) { - if (modif == db_last_modifier) - thr_act = db_examine_act; - else if (!db_get_next_act(&thr_act, 0)) - return; - } else - if (db_option(modif,'u')) - thr_act = current_thread(); - else - thr_act = THREAD_NULL; - - db_examine_act = thr_act; - db_examine((db_addr_t) addr, db_examine_format, (int)count, - db_act_to_task(thr_act)); -} - -void -db_examine_forward(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, __unused char *modif) -{ - db_examine(db_next, db_examine_format, db_examine_count, - db_act_to_task(db_examine_act)); -} - -void -db_examine_backward(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, __unused char *modif) -{ - db_examine(db_examine_prev_addr - (db_next - db_examine_prev_addr), - db_examine_format, db_examine_count, - db_act_to_task(db_examine_act)); -} - -int -db_examine_width( - int size, - int *items, - int *remainder) -{ - int sz; - int entry; - int width; - - width = size * 2 + 1; - sz = (db_max_width - (sizeof (void *) * 2 + 4)) / width; - for (entry = 1; (entry << 1) < sz; entry <<= 1) - continue; - - sz = sizeof (void *) * 2 + 4 + entry * width; - while (sz + entry < db_max_width) { - width++; - sz += entry; - } - *remainder = (db_max_width - sz + 1) / 2; - *items = entry; - return width; -} - -void -db_examine( - db_addr_t addr, - char * fmt, /* format string */ - int count, /* repeat count */ - task_t task) -{ - int c; - db_expr_t value; - int size; - int width; - int leader; - int items; - int nitems = 0; - char * fp; - db_addr_t next_addr = 0; - int sz; - - db_examine_prev_addr = addr; - while (--count >= 0) { - fp = fmt; - size = sizeof(int); - width = db_examine_width(size, &items, &leader); - while ((c = *fp++) != 0) { - switch (c) { - case 'b': - size = sizeof(char); - width = db_examine_width(size, &items, &leader); - break; - case 'h': - size = sizeof(short); - width = db_examine_width(size, &items, &leader); - break; - case 'l': - size = sizeof(int); - width = db_examine_width(size, &items, 
&leader); - break; - case 'q': - size = sizeof(long); - width = db_examine_width(size, &items, &leader); - break; - case 'a': /* address */ - case 'A': /* function address */ - /* always forces a new line */ - if (db_print_position() != 0) - db_printf("\n"); - db_prev = addr; - next_addr = addr + 4; - db_task_printsym(addr, - (c == 'a')?DB_STGY_ANY:DB_STGY_PROC, - task); - db_printf(":\t"); - break; - case 'm': - db_next = db_xcdump(addr, size, count+1, task); - return; - case 't': - case 'u': - break; - default: - restart: - /* Reset next_addr in case we are printing in - multiple formats. */ - next_addr = addr; - if (db_print_position() == 0) { - /* If we hit a new symbol, print it */ - const char * name; - db_addr_t off; - - db_find_task_sym_and_offset(addr,&name,&off,task); - if (off == 0) - db_printf("\r%s:\n", name); - db_printf("%#lln: ", (unsigned long long)addr); - for (sz = 0; sz < leader; sz++) - db_putchar(' '); - db_prev = addr; - nitems = items; - } - - switch (c) { - case 'p': /* Addrs rendered symbolically. */ - if( size == sizeof(void *) ) { - const char *symName; - db_addr_t offset; - - items = 1; - value = db_get_task_value( next_addr, - sizeof(db_expr_t), FALSE, task ); - db_find_task_sym_and_offset( value, - &symName, &offset, task); - db_printf("\n\t*%8llX(%8llX) = %s", - (unsigned long long)next_addr, (unsigned long long)value, symName ); - if( offset ) { - db_printf("+%llX", (unsigned long long)offset ); - } - next_addr += size; - } - break; - case 'r': /* signed, current radix */ - for (sz = size, next_addr = addr; - sz >= (signed)sizeof (db_expr_t); - sz -= sizeof (db_expr_t)) { - if (nitems-- == 0) { - db_putchar('\n'); - goto restart; - } - value = db_get_task_value(next_addr, - sizeof (db_expr_t), - TRUE,task); - db_printf("%-*llr", width, (unsigned long long)value); - next_addr += sizeof (db_expr_t); - } - if (sz > 0) { - if (nitems-- == 0) { - db_putchar('\n'); - goto restart; - } - value = db_get_task_value(next_addr, sz, - TRUE, task); - db_printf("%-*llR", width, (unsigned long long)value); - next_addr += sz; - } - break; - case 'X': /* unsigned hex */ - case 'x': /* unsigned hex */ - for (sz = size, next_addr = addr; - sz >= (signed)sizeof (db_expr_t); - sz -= sizeof (db_expr_t)) { - if (nitems-- == 0) { - db_putchar('\n'); - goto restart; - } - value = db_get_task_value(next_addr, - sizeof (db_expr_t), - FALSE,task); - if ( c == 'X') - db_printf("%0*llX ", 2*size, (unsigned long long)value); - else - db_printf("%-*llx", width, (unsigned long long)value); - next_addr += sizeof (db_expr_t); - } - if (sz > 0) { - if (nitems-- == 0) { - db_putchar('\n'); - goto restart; - } - value = db_get_task_value(next_addr, sz, - FALSE, task); - if ( c == 'X') - db_printf("%0*llX ", 2*size, (unsigned long long)value); - else - db_printf("%-*llX", width, (unsigned long long)value); - next_addr += sz; - } - break; - case 'z': /* signed hex */ - for (sz = size, next_addr = addr; - sz >= (signed)sizeof (db_expr_t); - sz -= sizeof (db_expr_t)) { - if (nitems-- == 0) { - db_putchar('\n'); - goto restart; - } - value = db_get_task_value(next_addr, - sizeof (db_expr_t), - TRUE, task); - db_printf("%-*llz", width, (unsigned long long)value); - next_addr += sizeof (db_expr_t); - } - if (sz > 0) { - if (nitems-- == 0) { - db_putchar('\n'); - goto restart; - } - value = db_get_task_value(next_addr,sz, - TRUE,task); - db_printf("%-*llZ", width, (unsigned long long)value); - next_addr += sz; - } - break; - case 'd': /* signed decimal */ - for (sz = size, next_addr = addr; - sz >= 
(signed)sizeof (db_expr_t); - sz -= sizeof (db_expr_t)) { - if (nitems-- == 0) { - db_putchar('\n'); - goto restart; - } - value = db_get_task_value(next_addr, - sizeof (db_expr_t), - TRUE,task); - db_printf("%-*lld", width, (unsigned long long)value); - next_addr += sizeof (db_expr_t); - } - if (sz > 0) { - if (nitems-- == 0) { - db_putchar('\n'); - goto restart; - } - value = db_get_task_value(next_addr, sz, - TRUE, task); - db_printf("%-*llD", width, (unsigned long long)value); - next_addr += sz; - } - break; - case 'U': /* unsigned decimal */ - case 'u': - for (sz = size, next_addr = addr; - sz >= (signed)sizeof (db_expr_t); - sz -= sizeof (db_expr_t)) { - if (nitems-- == 0) { - db_putchar('\n'); - goto restart; - } - value = db_get_task_value(next_addr, - sizeof (db_expr_t), - FALSE,task); - db_printf("%-*llu", width, (unsigned long long)value); - next_addr += sizeof (db_expr_t); - } - if (sz > 0) { - if (nitems-- == 0) { - db_putchar('\n'); - goto restart; - } - value = db_get_task_value(next_addr, sz, - FALSE, task); - db_printf("%-*llU", width, (unsigned long long)value); - next_addr += sz; - } - break; - case 'o': /* unsigned octal */ - for (sz = size, next_addr = addr; - sz >= (signed)sizeof (db_expr_t); - sz -= sizeof (db_expr_t)) { - if (nitems-- == 0) { - db_putchar('\n'); - goto restart; - } - value = db_get_task_value(next_addr, - sizeof (db_expr_t), - FALSE,task); - db_printf("%-*llo", width, (unsigned long long)value); - next_addr += sizeof (db_expr_t); - } - if (sz > 0) { - if (nitems-- == 0) { - db_putchar('\n'); - goto restart; - } - value = db_get_task_value(next_addr, sz, - FALSE, task); - db_printf("%-*llo", width, (unsigned long long)value); - next_addr += sz; - } - break; - case 'c': /* character */ - for (sz = 0, next_addr = addr; - sz < size; - sz++, next_addr++) { - value = db_get_task_value(next_addr,1, - FALSE,task); - if ((value >= ' ' && value <= '~') || - value == '\n' || - value == '\t') - db_printf("%llc", (unsigned long long)value); - else - db_printf("\\%03llo", (unsigned long long)value); - } - break; - case 's': /* null-terminated string */ - size = 0; - for (;;) { - value = db_get_task_value(next_addr,1, - FALSE,task); - next_addr += 1; - size++; - if (value == 0) - break; - if (value >= ' ' && value <= '~') - db_printf("%llc", (unsigned long long)value); - else - db_printf("\\%03llo", (unsigned long long)value); - } - break; - case 'i': /* instruction */ - next_addr = db_disasm(addr, FALSE, task); - size = (int)(next_addr - addr); - break; - case 'I': /* instruction, alternate form */ - next_addr = db_disasm(addr, TRUE, task); - size = (int)(next_addr - addr); - break; - default: - break; - } - if (db_print_position() != 0) - db_end_line(); - break; - } - } - addr = next_addr; - } - db_next = addr; -} - -/* - * Print value. 
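- * db_print_cmd handles:  print[/fmt] { expr | "string" } ...
- * Each expression is rendered in db_print_format, a sticky global,
- * so a modifier persists until changed.  Illustrative uses
- * (hypothetical operands):
- *
- *	print/x db_max_width		unsigned hex
- *	print/a db_dot			symbol name + offset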
- */ -char db_print_format = 'x'; - -void -db_print_cmd(void) -{ - db_expr_t value; - int t; - task_t task = TASK_NULL; - - if ((t = db_read_token()) == tSLASH) { - if (db_read_token() != tIDENT) { - db_printf("Bad modifier \"/%s\"\n", db_tok_string); - db_error(0); - /* NOTREACHED */ - } - if (db_tok_string[0]) - db_print_format = db_tok_string[0]; - if (db_option(db_tok_string, 't')) { - if (db_default_act) - task = db_default_act->task; - if (db_print_format == 't') - db_print_format = db_tok_string[1]; - } - } else - db_unread_token(t); - - for ( ; ; ) { - t = db_read_token(); - if (t == tSTRING) { - db_printf("%s", db_tok_string); - continue; - } - db_unread_token(t); - if (!db_expression(&value)) - break; - switch (db_print_format) { - case 'a': - case 'A': - db_task_printsym((db_addr_t)value, - (db_print_format == 'a') ? DB_STGY_ANY: - DB_STGY_PROC, - task); - break; - case 'r': - db_printf("%11llr", (unsigned long long)value); - break; - case 'X': - db_printf("%016llX", (unsigned long long)value); - break; - case 'x': - db_printf("%016llx", (unsigned long long)value); - break; - case 'z': - db_printf("%16llz", (unsigned long long)value); - break; - case 'd': - db_printf("%11lld", (unsigned long long)value); - break; - case 'u': - db_printf("%11llu", (unsigned long long)value); - break; - case 'o': - db_printf("%16llo", (unsigned long long)value); - break; - case 'c': - value = value & 0xFF; - if (value >= ' ' && value <= '~') - db_printf("%llc", (unsigned long long)value); - else - db_printf("\\%03llo", (unsigned long long)value); - break; - default: - db_printf("Unknown format %c\n", db_print_format); - db_print_format = 'x'; - db_error(0); - } - } -} - -void -db_print_loc( - db_addr_t loc, - task_t task) -{ - db_task_printsym(loc, DB_STGY_PROC, task); -} - -void -db_print_inst( - db_addr_t loc, - task_t task) -{ - (void) db_disasm(loc, TRUE, task); -} - -void -db_print_loc_and_inst( - db_addr_t loc, - task_t task) -{ - db_task_printsym(loc, DB_STGY_PROC, task); - db_printf(":\t"); - (void) db_disasm(loc, TRUE, task); -} - -/* - * Search for a value in memory. 
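- * An illustrative session (hypothetical values) of the general form
- * given on the Syntax line below:
- *
- *	search/l 0x2000 0xdeadbeef,0x20
- *
- * scans 0x20 longwords from 0x2000 for 0xdeadbeef; an omitted mask
- * defaults to ~0 (exact match), and db_next is left at the address
- * reported, so "xf" can pick up from the hit.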
- * Syntax: search [/bhl] addr value [mask] [,count] [thread] - */ -void -db_search_cmd(void) -{ - int t; - db_addr_t addr; - int size = 0; - db_expr_t value; - db_expr_t mask; - db_addr_t count; - thread_t thr_act; - boolean_t thread_flag = FALSE; - register char *p; - - t = db_read_token(); - if (t == tSLASH) { - t = db_read_token(); - if (t != tIDENT) { - bad_modifier: - db_printf("Bad modifier \"/%s\"\n", db_tok_string); - db_flush_lex(); - return; - } - - for (p = db_tok_string; *p; p++) { - switch(*p) { - case 'b': - size = sizeof(char); - break; - case 'h': - size = sizeof(short); - break; - case 'l': - size = sizeof(long); - break; - case 't': - thread_flag = TRUE; - break; - default: - goto bad_modifier; - } - } - } else { - db_unread_token(t); - size = sizeof(int); - } - - if (!db_expression((db_expr_t *) &addr)) { - db_printf("Address missing\n"); - db_flush_lex(); - return; - } - - if (!db_expression(&value)) { - db_printf("Value missing\n"); - db_flush_lex(); - return; - } - - if (!db_expression(&mask)) - mask = ~0; - - t = db_read_token(); - if (t == tCOMMA) { - if (!db_expression((db_expr_t *) &count)) { - db_printf("Count missing\n"); - db_flush_lex(); - return; - } - } else { - db_unread_token(t); - count = -1; /* effectively forever */ - } - if (thread_flag) { - if (!db_get_next_act(&thr_act, 0)) - return; - } else - thr_act = THREAD_NULL; - - db_search(addr, size, value, mask, (unsigned int)count, db_act_to_task(thr_act)); -} - -void -db_search( - db_addr_t addr, - int size, - db_expr_t value, - db_expr_t mask, - unsigned int count, - task_t task) -{ - while (count-- != 0) { - db_prev = addr; - if ((db_get_task_value(addr,size,FALSE,task) & mask) == value) - break; - addr += size; - } - db_printf("0x%llx: ", (unsigned long long)addr); - db_next = addr; -} - -#define DB_XCDUMP_NC 16 - -int -db_xcdump( - db_addr_t addr, - int size, - int count, - task_t task) -{ - register int i, n; - db_expr_t value; - int bcount; - db_addr_t off; - const char *name; - char data[DB_XCDUMP_NC]; - - db_find_task_sym_and_offset(addr, &name, &off, task); - for (n = count*size; n > 0; n -= bcount) { - db_prev = addr; - if (off == 0) { - db_printf("%s:\n", name); - off = -1; - } - db_printf("%0*llX:%s", 2*sizeof(db_addr_t),(unsigned long long) addr, - (size != 1) ? " " : "" ); - bcount = ((n > DB_XCDUMP_NC)? DB_XCDUMP_NC: n); - if (trunc_page(addr) != trunc_page(addr+bcount-1)) { - db_addr_t next_page_addr = trunc_page(addr+bcount-1); - if (!DB_CHECK_ACCESS((vm_offset_t)next_page_addr, (int)sizeof(int), task)) - bcount = (int)(next_page_addr - addr); - } - db_read_bytes((vm_offset_t)addr, bcount, data, task); - for (i = 0; i < bcount && off != 0; i += size) { - if (i % 4 == 0) - db_printf(" "); - value = db_get_task_value(addr, size, FALSE, task); - db_printf("%0*llX ", size*2, (unsigned long long)value); - addr += size; - db_find_task_sym_and_offset(addr, &name, &off, task); - } - db_printf("%*s", - ((DB_XCDUMP_NC-i)/size)*(size*2+1)+(DB_XCDUMP_NC-i)/4, - ""); - bcount = i; - db_printf("%s*", (size != 1)? " ": ""); - for (i = 0; i < bcount; i++) { - value = data[i]; - db_printf("%llc", (value >= ' ' && value <= '~')? (unsigned long long)value: (unsigned long long)'.'); - } - db_printf("*\n"); - } - return((int)addr); -} diff --git a/osfmk/ddb/db_examine.h b/osfmk/ddb/db_examine.h deleted file mode 100644 index 08f026d2c..000000000 --- a/osfmk/ddb/db_examine.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.7.2 1996/01/09 19:15:43 devrcs - * Function prototypes for db_print_loc() & db_print_inst(). - * [1995/12/01 21:42:06 jfraser] - * - * Merged '64-bit safe' changes from DEC alpha port. - * [1995/11/21 18:03:03 jfraser] - * - * Revision 1.1.7.1 1994/09/23 01:18:55 ezf - * change marker to not FREE - * [1994/09/22 21:09:49 ezf] - * - * Revision 1.1.2.4 1993/09/17 21:34:33 robert - * change marker to OSF_FREE_COPYRIGHT - * [1993/09/17 21:27:11 robert] - * - * Revision 1.1.2.3 1993/08/11 22:12:10 elliston - * Add ANSI Prototypes. CR #9523. - * [1993/08/11 03:33:11 elliston] - * - * Revision 1.1.2.2 1993/07/27 18:27:12 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:11:28 elliston] - * - * $EndLog$ - */ - -#ifndef _DDB_DB_EXAMINE_H_ -#define _DDB_DB_EXAMINE_H_ - -#include -#include - -/* Prototypes for functions exported by this module. - */ - -void db_examine_cmd(db_expr_t, boolean_t, db_expr_t, char *); - -void db_examine_forward(db_expr_t, boolean_t, db_expr_t, char *); - -void db_examine_backward(db_expr_t, boolean_t, db_expr_t, char *); - -void db_examine( - db_addr_t addr, - char * fmt, /* format string */ - int count, /* repeat count */ - task_t task); - -void db_print_cmd(void); - -void db_print_loc( - db_addr_t loc, - task_t task); - -void -db_print_inst( - db_addr_t loc, - task_t task); - -void db_print_loc_and_inst( - db_addr_t loc, - task_t task); - -void db_search_cmd(void); - -void db_search( - db_addr_t addr, - int size, - db_expr_t value, - db_expr_t mask, - unsigned int count, - task_t task); - -#endif /* !_DDB_DB_EXAMINE_H_ */ diff --git a/osfmk/ddb/db_expr.c b/osfmk/ddb/db_expr.c deleted file mode 100644 index 9c3962bc7..000000000 --- a/osfmk/ddb/db_expr.c +++ /dev/null @@ -1,482 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ - -#include -#include -#include -#include -#include -#include -#include /* For db_printf() */ -#include -#include -#include - - - -/* Prototypes for functions local to this file. XXX -- should be static! 
- */ -boolean_t db_term(db_expr_t *valuep); -boolean_t db_unary(db_expr_t *valuep); -boolean_t db_mult_expr(db_expr_t *valuep); -boolean_t db_add_expr(db_expr_t *valuep); -boolean_t db_shift_expr(db_expr_t *valuep); -boolean_t db_logical_relation_expr(db_expr_t *valuep); -boolean_t db_logical_and_expr(db_expr_t *valuep); -boolean_t db_logical_or_expr(db_expr_t *valuep); - - -/* try to interpret unknown symbols as hexadecimal constants */ -int db_allow_unprefixed_hexa = 1; - -boolean_t -db_term(db_expr_t *valuep) -{ - int t; - boolean_t valid_symbol = FALSE; - boolean_t valid_hexa = FALSE; - - switch(t = db_read_token()) { - case tIDENT: - if (db_value_of_name(db_tok_string, valuep)) { - valid_symbol = TRUE; - } - if (db_allow_unprefixed_hexa && db_radix == 16 && - db_tok_string[0]) { - char *cp; - db_expr_t value; - - value = 0; - valid_hexa = TRUE; - for (cp = db_tok_string; *cp; cp++) { - if (*cp >= 'a' && *cp <= 'f') { - value = value * 16 + 10 + (*cp - 'a'); - } else if (*cp >= 'A' && *cp <= 'F') { - value = value * 16 + 10 + (*cp - 'A'); - } else if (*cp >= '0' && *cp <= '9') { - value = value * 16 + (*cp - '0'); - } else { - valid_hexa = FALSE; - break; - } - } - if (valid_hexa) { - if (valid_symbol) { - db_printf("Ambiguous constant %x used as a symbol\n", - value); - } else { - *valuep = value; - } - } - } - if (!valid_symbol && !valid_hexa) { - db_printf("Symbol \"%s\" not found\n", db_tok_string); - db_error(0); - /*NOTREACHED*/ - } - return (TRUE); - case tNUMBER: - *valuep = /*(db_expr_t)*/db_tok_number; - return (TRUE); - case tDOT: - *valuep = (db_expr_t)db_dot; - return (TRUE); - case tDOTDOT: - *valuep = (db_expr_t)db_prev; - return (TRUE); - case tPLUS: - *valuep = (db_expr_t) db_next; - return (TRUE); - case tQUOTE: - *valuep = (db_expr_t)db_last_addr; - return (TRUE); - case tDOLLAR: - if (!db_get_variable(valuep)) - return (FALSE); - return (TRUE); - case tLPAREN: - if (!db_expression(valuep)) { - db_error("Unmached ()s\n"); - /*NOTREACHED*/ - } - t = db_read_token(); - if (t != tRPAREN) { - db_printf("')' expected at \"%s...\"\n", db_tok_string); - db_error(0); - /*NOTREACHED*/ - } - return (TRUE); - case tSTRING: - { - static int db_tok_offset = 0; - char *sp, *cp; - - sp = (char *)db_tok_string + db_tok_offset; - *valuep = *(int *)sp; - for (cp = sp; - *cp && cp < sp + sizeof (int); - cp++); - if (cp == sp + sizeof (int) && *cp) { - db_tok_offset += sizeof (int); - db_unread_token(t); - } else { - db_tok_offset = 0; - } - return (TRUE); - } - default: - db_unread_token(t); - return (FALSE); - } -} - -int -db_size_option( - char *modif, - boolean_t *u_option, - boolean_t *t_option) -{ - register char *p; - int size = sizeof(int); - - *u_option = FALSE; - *t_option = FALSE; - for (p = modif; *p; p++) { - switch(*p) { - case 'b': - size = sizeof(char); - break; - case 'h': - size = sizeof(short); - break; - case 'l': - size = sizeof(long); - break; - case 'u': - *u_option = TRUE; - break; - case 't': - *t_option = TRUE; - break; - } - } - return(size); -} - -boolean_t -db_unary(db_expr_t *valuep) -{ - int t; - int size; - boolean_t u_opt, t_opt; - task_t task; - - t = db_read_token(); - if (t == tMINUS) { - if (!db_unary(valuep)) { - db_error("Expression syntax error after '-'\n"); - /*NOTREACHED*/ - } - *valuep = -*valuep; - return (TRUE); - } - if (t == tSTAR) { - /* indirection */ - if (!db_unary(valuep)) { - db_error("Expression syntax error after '*'\n"); - /*NOTREACHED*/ - } - task = TASK_NULL; - size = sizeof(db_addr_t); - u_opt = FALSE; - t = db_read_token(); - 
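db_term() above tries an identifier first as a symbol and, when db_radix is 16, also as a bare hexadecimal constant, warning when both interpretations succeed ("deadbeef" could be either). The digit loop in isolation, under a hypothetical helper name:

    #include <stdio.h>

    /* Decide whether a token is a valid bare hex constant, the way
     * db_term() does when db_allow_unprefixed_hexa is set and
     * db_radix == 16.  Returns 1 and stores the value on success;
     * db_term() additionally warns if the token is also a symbol. */
    static int
    parse_unprefixed_hex(const char *tok, unsigned long *out)
    {
        unsigned long value = 0;

        if (*tok == '\0')
            return 0;
        for (const char *cp = tok; *cp; cp++) {
            if (*cp >= '0' && *cp <= '9')
                value = value * 16 + (unsigned long)(*cp - '0');
            else if (*cp >= 'a' && *cp <= 'f')
                value = value * 16 + 10 + (unsigned long)(*cp - 'a');
            else if (*cp >= 'A' && *cp <= 'F')
                value = value * 16 + 10 + (unsigned long)(*cp - 'A');
            else
                return 0;                    /* not a hex digit */
        }
        *out = value;
        return 1;
    }

    int
    main(void)
    {
        const char *toks[] = { "deadbeef", "fob", "123" };
        unsigned long v;

        for (int i = 0; i < 3; i++) {
            if (parse_unprefixed_hex(toks[i], &v))
                printf("%-8s -> 0x%lx\n", toks[i], v);
            else
                printf("%-8s -> not hex (symbol lookup only)\n", toks[i]);
        }
        return 0;
    }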
if (t == tIDENT && db_tok_string[0] == ':') { - size = db_size_option(&db_tok_string[1], &u_opt, &t_opt); - if (t_opt) - task = db_default_task; - } else - db_unread_token(t); - *valuep = db_get_task_value((db_addr_t)*valuep, size, !u_opt, task); - return (TRUE); - } - if (t == tEXCL) { - if (!db_unary(valuep)) { - db_error("Expression syntax error after '!'\n"); - /*NOTREACHED*/ - } - *valuep = (!(*valuep)); - return (TRUE); - } - db_unread_token(t); - return (db_term(valuep)); -} - -boolean_t -db_mult_expr(db_expr_t *valuep) -{ - db_expr_t lhs, rhs; - int t; - char c; - - if (!db_unary(&lhs)) - return (FALSE); - - t = db_read_token(); - while (t == tSTAR || t == tSLASH || t == tPCT || t == tHASH - || t == tBIT_AND) { - c = db_tok_string[0]; - if (!db_term(&rhs)) { - db_printf("Expression syntax error after '%c'\n", c); - db_error(0); - /*NOTREACHED*/ - } - switch(t) { - case tSTAR: - lhs *= rhs; - break; - case tBIT_AND: - lhs &= rhs; - break; - default: - if (rhs == 0) { - db_error("Divide by 0\n"); - /*NOTREACHED*/ - } - if (t == tSLASH) - lhs /= rhs; - else if (t == tPCT) - lhs %= rhs; - else - lhs = ((lhs+rhs-1)/rhs)*rhs; - } - t = db_read_token(); - } - db_unread_token(t); - *valuep = lhs; - return (TRUE); -} - -boolean_t -db_add_expr(db_expr_t *valuep) -{ - db_expr_t lhs, rhs; - int t; - char c; - - if (!db_mult_expr(&lhs)) - return (FALSE); - - t = db_read_token(); - while (t == tPLUS || t == tMINUS || t == tBIT_OR) { - c = db_tok_string[0]; - if (!db_mult_expr(&rhs)) { - db_printf("Expression syntax error after '%c'\n", c); - db_error(0); - /*NOTREACHED*/ - } - if (t == tPLUS) - lhs += rhs; - else if (t == tMINUS) - lhs -= rhs; - else - lhs |= rhs; - t = db_read_token(); - } - db_unread_token(t); - *valuep = lhs; - return (TRUE); -} - -boolean_t -db_shift_expr(db_expr_t *valuep) -{ - db_expr_t lhs, rhs; - int t; - - if (!db_add_expr(&lhs)) - return (FALSE); - - t = db_read_token(); - while (t == tSHIFT_L || t == tSHIFT_R) { - if (!db_add_expr(&rhs)) { - db_printf("Expression syntax error after \"%s\"\n", - (t == tSHIFT_L)? 
"<<": ">>"); - db_error(0); - /*NOTREACHED*/ - } - if ((int64_t)rhs < 0) { - db_error("Negative shift amount\n"); - /*NOTREACHED*/ - } - if (t == tSHIFT_L) - lhs <<= rhs; - else { - /* Shift right is unsigned */ - lhs = (uint64_t) lhs >> rhs; - } - t = db_read_token(); - } - db_unread_token(t); - *valuep = lhs; - return (TRUE); -} - -boolean_t -db_logical_relation_expr(db_expr_t *valuep) -{ - db_expr_t lhs, rhs; - int t; - char op[3]; - - if (!db_shift_expr(&lhs)) - return(FALSE); - - t = db_read_token(); - while (t == tLOG_EQ || t == tLOG_NOT_EQ - || t == tGREATER || t == tGREATER_EQ - || t == tLESS || t == tLESS_EQ) { - op[0] = db_tok_string[0]; - op[1] = db_tok_string[1]; - op[2] = 0; - if (!db_shift_expr(&rhs)) { - db_printf("Expression syntax error after \"%s\"\n", op); - db_error(0); - /*NOTREACHED*/ - } - switch(t) { - case tLOG_EQ: - lhs = (lhs == rhs); - break; - case tLOG_NOT_EQ: - lhs = (lhs != rhs); - break; - case tGREATER: - lhs = (lhs > rhs); - break; - case tGREATER_EQ: - lhs = (lhs >= rhs); - break; - case tLESS: - lhs = (lhs < rhs); - break; - case tLESS_EQ: - lhs = (lhs <= rhs); - break; - } - t = db_read_token(); - } - db_unread_token(t); - *valuep = lhs; - return (TRUE); -} - -boolean_t -db_logical_and_expr(db_expr_t *valuep) -{ - db_expr_t lhs, rhs; - int t; - - if (!db_logical_relation_expr(&lhs)) - return(FALSE); - - t = db_read_token(); - while (t == tLOG_AND) { - if (!db_logical_relation_expr(&rhs)) { - db_error("Expression syntax error after \"&&\"\n"); - /*NOTREACHED*/ - } - lhs = (lhs && rhs); - t = db_read_token(); - } - db_unread_token(t); - *valuep = lhs; - return (TRUE); -} - -boolean_t -db_logical_or_expr(db_expr_t *valuep) -{ - db_expr_t lhs, rhs; - int t; - - if (!db_logical_and_expr(&lhs)) - return(FALSE); - - t = db_read_token(); - while (t == tLOG_OR) { - if (!db_logical_and_expr(&rhs)) { - db_error("Expression syntax error after \"||\"\n"); - /*NOTREACHED*/ - } - lhs = (lhs || rhs); - t = db_read_token(); - } - db_unread_token(t); - *valuep = lhs; - return (TRUE); -} - -int -db_expression(db_expr_t *valuep) -{ - return (db_logical_or_expr(valuep)); -} diff --git a/osfmk/ddb/db_ext_symtab.c b/osfmk/ddb/db_ext_symtab.c deleted file mode 100644 index 4cb7c02ae..000000000 --- a/osfmk/ddb/db_ext_symtab.c +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ - -/* - */ -#include -#include - -#include -#include -#include -#include -#include /* prototype */ - -#if MACH_KDB && MACH_DEBUG -#include -#include -#include -#include -#include -#endif - -/* - * Loads a symbol table for an external file into the kernel debugger. - * The symbol table data is an array of characters. It is assumed that - * the caller and the kernel debugger agree on its format. - - * This has never and will never be supported on MacOS X. The only reason I don't remove - * it entirely is that it is an exported symbol. - */ -kern_return_t -host_load_symbol_table( - __unused host_priv_t host_priv, - __unused task_t task, - __unused char * name, - __unused pointer_t symtab, - __unused mach_msg_type_number_t symtab_count) -{ - return KERN_FAILURE; -} diff --git a/osfmk/ddb/db_input.c b/osfmk/ddb/db_input.c deleted file mode 100644 index 650f05bb6..000000000 --- a/osfmk/ddb/db_input.c +++ /dev/null @@ -1,821 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
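host_load_symbol_table() below survives purely because it is an exported symbol: every parameter is marked __unused and the body fails unconditionally. The same keep-the-symbol stub pattern in portable C, with generic types and ENOTSUP standing in for the Mach types and KERN_FAILURE:

    #include <errno.h>
    #include <stdio.h>

    /* Keep an exported entry point linkable for existing callers,
     * but have it refuse unconditionally.  UNUSED is spelled for
     * GCC/Clang; xnu uses the __unused keyword. */
    #define UNUSED __attribute__((unused))

    static int
    legacy_load_symbol_table(UNUSED void *host, UNUSED void *task,
        UNUSED char *name, UNUSED void *symtab, UNUSED unsigned count)
    {
        return ENOTSUP;          /* never supported, never will be */
    }

    int
    main(void)
    {
        printf("stub returned %d\n",
            legacy_load_symbol_table(NULL, NULL, NULL, NULL, 0));
        return 0;
    }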
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.3.10.2 1994/09/23 01:19:37 ezf - * change marker to not FREE - * [1994/09/22 21:10:05 ezf] - * - * Revision 1.3.10.1 1994/06/11 21:11:48 bolinger - * Merge up to NMK17.2. - * [1994/06/11 20:01:41 bolinger] - * - * Revision 1.3.8.2 1994/02/11 14:21:41 paire - * Added string.h header file for strlen declaration. - * [94/02/09 paire] - * - * Revision 1.3.8.1 1994/02/08 10:57:55 bernadat - * Added db_auto_completion variable. - * [93/08/17 paire] - * - * Added support of symbol completion by typing '\t'. - * [93/08/14 paire] - * [94/02/07 bernadat] - * - * Revision 1.3.2.4 1993/08/11 20:37:51 elliston - * Add ANSI Prototypes. CR #9523. - * [1993/08/11 03:33:21 elliston] - * - * Revision 1.3.2.3 1993/07/27 18:27:30 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:12:01 elliston] - * - * Revision 1.3.2.2 1993/06/09 02:20:13 gm - * CR9176 - ANSI C violations: trailing tokens on CPP - * directives, extra semicolons after decl_ ..., asm keywords - * [1993/06/07 18:57:14 jeffc] - * - * Added to OSF/1 R1.3 from NMK15.0. - * [1993/06/02 20:56:26 jeffc] - * - * Revision 1.3 1993/04/19 16:02:17 devrcs - * Replaced ^R (redraw) with ^L [barbou@gr.osf.org] - * - * Added ^R and ^S commands for history search commands - * ^U does not erase end of the line anymore. (only erases - * from the beginning of the line to current position). - * [barbou@gr.osf.org] - * - * ^C now erases the entire line. [barbou@gr.osf.org] - * [92/12/03 bernadat] - * - * Fixed history management: Do not store repeated typed - * command. Null terminate current command in case it is a - * substring of the last command. - * [92/10/02 bernadat] - * - * Revision 1.2 1992/11/25 01:04:24 robert - * integrate changes for norma_14 below - * - * Philippe Bernadat (bernadat) at gr.osf.org 02-Oct-92 - * Fixed history management: Do not store repeated typed - * command. Null terminate current command in case it is a - * substring of the last command. - * [1992/11/20 00:56:07 robert] - * - * integrate changes below for norma_14 - * [1992/11/13 19:21:34 robert] - * - * Revision 1.1 1992/09/30 02:01:08 robert - * Initial revision - * - * $EndLog$ - */ -/* CMU_HIST */ -/* - * Revision 2.7.3.2 92/09/15 17:14:26 jeffreyh - * Fixed history code. (Only one char. out of 2 was checked to - * compare to last command) - * [barbou@gr.osf.org] - * - * Revision 2.7.3.1 92/03/03 16:13:30 jeffreyh - * Pick up changes from TRUNK - * [92/02/26 10:59:36 jeffreyh] - * - * Revision 2.8 92/02/19 15:07:44 elf - * Added delete_line (Ctrl-U). - * [92/02/17 kivinen] - * - * Added command line history. Ctrl-P = previous, Ctrl-N = next. If - * DB_HISTORY_SIZE is 0 then command history is disabled. - * [92/02/17 kivinen] - * - * Revision 2.7 91/10/09 16:00:03 af - * Revision 2.6.2.1 91/10/05 13:06:12 jeffreyh - * Fixed incorrect db_lbuf_end setting. - * [91/08/29 tak] - * - * Revision 2.6.2.1 91/10/05 13:06:12 jeffreyh - * Fixed incorrect db_lbuf_end setting. - * [91/08/29 tak] - * - * Revision 2.6 91/07/09 23:15:49 danner - * Add include of machine/db_machdep.h to allow machine-specific - * overrides via defines. 
- * [91/07/08 danner] - * - * Revision 2.5 91/05/14 15:34:03 mrt - * Correcting copyright - * - * Revision 2.4 91/02/14 14:41:53 mrt - * Add input line editing. - * [90/11/11 dbg] - * - * Revision 2.3 91/02/05 17:06:32 mrt - * Changed to new Mach copyright - * [91/01/31 16:18:13 mrt] - * - * Revision 2.2 90/08/27 21:51:03 dbg - * Reduce lint. - * [90/08/07 dbg] - * Created. - * [90/07/25 dbg] - * - */ -/* CMU_ENDHIST */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef DB_HISTORY_SIZE -#define DB_HISTORY_SIZE 4000 -#endif /* DB_HISTORY_SIZE */ - -/* - * Character input and editing. - */ - -/* - * We don't track output position while editing input, - * since input always ends with a new-line. We just - * reset the line position at the end. - */ -char * db_lbuf_start; /* start of input line buffer */ -char * db_lbuf_end; /* end of input line buffer */ -char * db_lc; /* current character */ -char * db_le; /* one past last character */ -int db_completion; /* number of incomplete symbols matched */ -int db_auto_completion = 10; /* number of line to display without asking */ -#if DB_HISTORY_SIZE != 0 -char db_history[DB_HISTORY_SIZE]; /* start of history buffer */ -int db_history_size = DB_HISTORY_SIZE;/* size of history buffer */ -char * db_history_curr = db_history; /* start of current line */ -char * db_history_last = db_history; /* start of last line */ -char * db_history_prev = (char *) 0; /* start of previous line */ -int db_hist_unmodified = 0; /* unmodified line from history */ -int db_hist_search = 0; /* are we in hist search mode ? */ -char db_hist_search_string[DB_LEX_LINE_SIZE];/* the string to look for */ -int db_hist_ignore_dups = 0; /* don't duplicate commands in hist */ -#endif - -#define CTRL(c) ((c) & 0x1f) -#define isspace(c) ((c) == ' ' || (c) == '\t') -#define BLANK ' ' -#define BACKUP '\b' - - - -/* Prototypes for functions local to this file. XXX -- should be static! 
- */ -void db_putstring(const char *s, int count); - -void db_putnchars( - int c, - int count); - -void db_delete( - int n, - int bwd); - -void db_delete_line(void); - -boolean_t db_hist_substring( - char *string, - char *substring); - -boolean_t db_inputchar(int c); - -extern jmp_buf_t *db_recover; - -void -db_putstring(const char *s, int count) -{ - while (--count >= 0) - cnputc(*s++); -} - -void -db_putnchars( - int c, - int count) -{ - while (--count >= 0) - cnputc(c); -} - -/* - * Delete N characters, forward or backward - */ -#define DEL_FWD 0 -#define DEL_BWD 1 -void -db_delete( - int n, - int bwd) -{ - register char *p; - - if (bwd) { - db_lc -= n; - db_putnchars(BACKUP, n); - } - for (p = db_lc; p < db_le-n; p++) { - *p = *(p+n); - cnputc(*p); - } - db_putnchars(BLANK, n); - db_putnchars(BACKUP, db_le - db_lc); - db_le -= n; -} - -void -db_delete_line(void) -{ - db_delete(db_le - db_lc, DEL_FWD); - db_delete(db_lc - db_lbuf_start, DEL_BWD); - db_le = db_lc = db_lbuf_start; -} - -#if DB_HISTORY_SIZE != 0 -#define INC_DB_CURR() \ - do { \ - db_history_curr++; \ - if (db_history_curr > \ - db_history + db_history_size - 1) \ - db_history_curr = db_history; \ - } while (0) -#define DEC_DB_CURR() \ - do { \ - db_history_curr--; \ - if (db_history_curr < db_history) \ - db_history_curr = db_history + \ - db_history_size - 1; \ - } while (0) -#endif - -/* returs TRUE if "substring" is a substring of "string" */ -boolean_t -db_hist_substring( - char *string, - char *substring) -{ - register char *cp1, *cp2; - - cp1 = string; - while (*cp1) - cp1++; - cp2 = substring; - while (*cp2) - cp2++; - - while (cp2 > substring) { - cp1--; cp2--; - } - - while (cp1 >= string) { - register char *cp3; - - cp2 = substring; - cp3 = cp1; - while (*cp2 && *cp2 == *cp3) { - cp2++; cp3++; - } - if (*cp2 == '\0') { - return TRUE; - } - cp1--; - } - return FALSE; -} - -/* returns TRUE at end-of-line */ -boolean_t -db_inputchar(int c) -{ - char *sym; - char *start; - char *restart; - jmp_buf_t db_jmpbuf; - jmp_buf_t *local_prev; - char *p; - int len; - - switch(db_completion) { - case -1: - db_putchar('\n'); - local_prev = db_recover; - if (_setjmp(db_recover = &db_jmpbuf) == 0 && - (c == 'y' || c == ' ' || c == '\t')) - db_print_completion(db_tok_string); - db_recover = local_prev; - db_completion = 0; - db_reset_more(); - db_output_prompt(); - if (db_le > db_lbuf_start) { - for (start = db_lbuf_start; start < db_le; start++) - db_putchar(*start); - db_putnchars(BACKUP, db_le - db_lc); - } - return(FALSE); - - case 0: - break; - - default: - if (c == '\t') { - db_printf("\nThere are %d possibilities. ", db_completion); - db_printf("Do you really wish to see them all [n] ? 
"); - db_force_whitespace(); - db_completion = -1; - db_reset_more(); - return(FALSE); - } - db_completion = 0; - break; - } - - switch (c) { - case '\t': - /* symbol completion */ - if (db_lc == db_lbuf_start || db_auto_completion == 0) - break; - if (db_le == db_lbuf_end) { - cnputc('\007'); - break; - } - start = db_lc - 1; - while (start >= db_lbuf_start && - ((*start >= 'A' && *start <= 'Z') || - (*start >= 'a' && *start <= 'z') || - (*start >= '0' && *start <= '9') || - *start == '_' || *start == ':')) - start--; - if (start == db_lc - 1) - break; - if (start > db_lbuf_start && *start == '$') { - cnputc('\007'); - break; - } - sym = db_tok_string; - restart = ++start; - do { - *sym++ = *start++; - } while (start != db_lc && - sym != db_tok_string + sizeof(db_tok_string)); - if (sym == db_tok_string + sizeof(db_tok_string)) { - cnputc('\007'); - break; - } - *sym = '\0'; - db_completion = db_lookup_incomplete(db_tok_string, - sizeof(db_tok_string)); - if (db_completion == 0) { - /* symbol unknown */ - cnputc('\007'); - break; - } - - len = strlen(db_tok_string) - (start - restart); - if (db_completion == 1 && - (db_le == db_lc || - ((db_le > db_lc) && *db_lc != ' '))) - len++; - for (p = db_le - 1; p >= db_lc; p--) - *(p + len) = *p; - db_le += len; - for (sym = &db_tok_string[start - restart]; - *sym != '\0'; sym++) - *db_lc++ = *sym; - - if (db_completion == 1 || db_completion > db_auto_completion) { - for (sym = &db_tok_string[start - restart]; - *sym != '\0'; sym++) - cnputc(*sym); - if (db_completion == 1) { - if (db_le == db_lc || - ((db_le > db_lc) && *db_lc != ' ')) { - cnputc(' '); - *db_lc++ = ' '; - } - db_completion = 0; - } - db_putstring(db_lc, db_le - db_lc); - db_putnchars(BACKUP, db_le - db_lc); - } - - if (db_completion > 1) { - cnputc('\007'); - if (db_completion <= db_auto_completion) { - db_putchar('\n'); - db_print_completion(db_tok_string); - db_completion = 0; - db_reset_more(); - db_output_prompt(); - if (db_le > db_lbuf_start) { - for (start = db_lbuf_start; start < db_le; start++) - db_putchar(*start); - db_putnchars(BACKUP, db_le - db_lc); - } - } - } - break; - - case CTRL('b'): - /* back up one character */ - if (db_lc > db_lbuf_start) { - cnputc(BACKUP); - db_lc--; - } - break; - case CTRL('f'): - /* forward one character */ - if (db_lc < db_le) { - cnputc(*db_lc); - db_lc++; - } - break; - case CTRL('a'): - /* beginning of line */ - while (db_lc > db_lbuf_start) { - cnputc(BACKUP); - db_lc--; - } - break; - case CTRL('e'): - /* end of line */ - while (db_lc < db_le) { - cnputc(*db_lc); - db_lc++; - } - break; - case CTRL('h'): - case 0177: - /* erase previous character */ - if (db_lc > db_lbuf_start) - db_delete(1, DEL_BWD); - break; - case CTRL('d'): - /* erase next character */ - if (db_lc < db_le) - db_delete(1, DEL_FWD); - break; - case CTRL('k'): - /* delete to end of line */ - if (db_lc < db_le) - db_delete(db_le - db_lc, DEL_FWD); - break; - case CTRL('u'): - /* delete to beginning of line */ - if (db_lc > db_lbuf_start) - db_delete(db_lc - db_lbuf_start, DEL_BWD); - break; - case CTRL('t'): - /* twiddle last 2 characters */ - if (db_lc >= db_lbuf_start + 2) { - c = db_lc[-2]; - db_lc[-2] = db_lc[-1]; - db_lc[-1] = c; - cnputc(BACKUP); - cnputc(BACKUP); - cnputc(db_lc[-2]); - cnputc(db_lc[-1]); - } - break; - case CTRL('c'): - case CTRL('g'): - db_delete_line(); -#if DB_HISTORY_SIZE != 0 - db_history_curr = db_history_last; - if (c == CTRL('g') && db_hist_search) { - for (p = db_hist_search_string, db_le = db_lbuf_start; - *p; ) { - *db_le++ = 
*p++; - } - db_lc = db_le; - *db_le = '\0'; - db_putstring(db_lbuf_start, db_le - db_lbuf_start); - } -#endif - break; -#if DB_HISTORY_SIZE != 0 - case CTRL('r'): - if (db_hist_search++ == 0) { - /* starting an history lookup */ - register char *cp1, *cp2; - for (cp1 = db_lbuf_start, cp2 = db_hist_search_string; - cp1 < db_le; - cp1++, cp2++) - *cp2 = *cp1; - *cp2 = '\0'; - db_hist_search++; - } - /* FALL THROUGH */ - case CTRL('p'): - { - char * old_history_curr = db_history_curr; - - if (db_hist_unmodified++ == 0) - db_hist_unmodified++; - DEC_DB_CURR(); - while (db_history_curr != db_history_last) { - DEC_DB_CURR(); - if (*db_history_curr == '\0') { - INC_DB_CURR(); - if (db_hist_search <= 1) { - if (*db_history_curr == '\0') - cnputc('\007'); - else - DEC_DB_CURR(); - break; - } - if (*db_history_curr == '\0') { - cnputc('\007'); - db_history_curr = old_history_curr; - DEC_DB_CURR(); - break; - } - if (db_history_curr != db_history_last && - db_hist_substring(db_history_curr, - db_hist_search_string)) { - DEC_DB_CURR(); - break; - } - DEC_DB_CURR(); - } - } - if (db_history_curr == db_history_last) { - cnputc('\007'); - db_history_curr = old_history_curr; - } else { - INC_DB_CURR(); - db_delete_line(); - for (p = db_history_curr, db_le = db_lbuf_start; - *p; ) { - *db_le++ = *p++; - if (p == db_history + db_history_size) { - p = db_history; - } - } - db_lc = db_le; - *db_le = '\0'; - db_putstring(db_lbuf_start, db_le - db_lbuf_start); - } - break; - } - case CTRL('s'): - if (db_hist_search++ == 0) { - /* starting an history lookup */ - register char *cp1, *cp2; - for (cp1 = db_lbuf_start, cp2 = db_hist_search_string; - cp1 < db_le; - cp1++, cp2++) - *cp2 = *cp1; - *cp2 = '\0'; - db_hist_search++; - } - /* FALL THROUGH */ - case CTRL('n'): - { - char *old_history_curr = db_history_curr; - - if (db_hist_unmodified++ == 0) - db_hist_unmodified++; - while (db_history_curr != db_history_last) { - if (*db_history_curr == '\0') { - if (db_hist_search <= 1) - break; - INC_DB_CURR(); - if (db_history_curr != db_history_last && - db_hist_substring(db_history_curr, - db_hist_search_string)) { - DEC_DB_CURR(); - break; - } - DEC_DB_CURR(); - } - INC_DB_CURR(); - } - if (db_history_curr != db_history_last) { - INC_DB_CURR(); - if (db_history_curr != db_history_last) { - db_delete_line(); - for (p = db_history_curr, - db_le = db_lbuf_start; *p;) { - *db_le++ = *p++; - if (p == db_history + - db_history_size) { - p = db_history; - } - } - db_lc = db_le; - *db_le = '\0'; - db_putstring(db_lbuf_start, - db_le - db_lbuf_start); - } else { - cnputc('\007'); - db_history_curr = old_history_curr; - } - } else { - cnputc('\007'); - db_history_curr = old_history_curr; - } - break; - } -#endif - /* refresh the command line */ - case CTRL('l'): - db_putstring("^L\n", 3); - if (db_le > db_lbuf_start) { - db_putstring(db_lbuf_start, db_le - db_lbuf_start); - db_putnchars(BACKUP, db_le - db_lc); - } - break; - case '\n': - case '\r': -#if DB_HISTORY_SIZE != 0 - /* Check if it same than previous line */ - if (db_history_prev) { - char *pc; - - /* Is it unmodified */ - for (p = db_history_prev, pc = db_lbuf_start; - pc != db_le && *p;) { - if (*p != *pc) - break; - if (++p == db_history + db_history_size) { - p = db_history; - } - if (++pc == db_history + db_history_size) { - pc = db_history; - } - } - if (!*p && pc == db_le) { - /* Repeted previous line, not saved */ - db_history_curr = db_history_last; - *db_le++ = c; - db_hist_search = 0; - db_hist_unmodified = 0; - return (TRUE); - } - } - if (db_le != 
db_lbuf_start && - (db_hist_unmodified == 0 || !db_hist_ignore_dups)) { - db_history_prev = db_history_last; - for (p = db_lbuf_start; p != db_le; p++) { - *db_history_last++ = *p; - if (db_history_last == db_history + - db_history_size) { - db_history_last = db_history; - } - } - *db_history_last++ = '\0'; - } - db_history_curr = db_history_last; -#endif - *db_le++ = c; - db_hist_search = 0; - db_hist_unmodified = 0; - return (TRUE); - default: - if (db_le == db_lbuf_end) { - cnputc('\007'); - } - else if (c >= ' ' && c <= '~') { - for (p = db_le; p > db_lc; p--) - *p = *(p-1); - *db_lc++ = c; - db_le++; - cnputc(c); - db_putstring(db_lc, db_le - db_lc); - db_putnchars(BACKUP, db_le - db_lc); - } - break; - } - if (db_hist_search) - db_hist_search--; - if (db_hist_unmodified) - db_hist_unmodified--; - return (FALSE); -} - -int -db_readline( - char * lstart, - int lsize) -{ - db_force_whitespace(); /* synch output position */ - - db_lbuf_start = lstart; - db_lbuf_end = lstart + lsize - 1; - db_lc = lstart; - db_le = lstart; - - while (!db_inputchar(cngetc())) - continue; - - db_putchar('\n'); /* synch output position */ - - *db_le = 0; - return (db_le - db_lbuf_start); -} - -void -db_check_interrupt(void) -{ - register int c; - - c = cnmaygetc(); - switch (c) { - case -1: /* no character */ - return; - - case CTRL('c'): - db_error((char *)0); - /*NOTREACHED*/ - - case CTRL('s'): - do { - c = cnmaygetc(); - if (c == CTRL('c')) - db_error((char *)0); - } while (c != CTRL('q')); - break; - - default: - /* drop on floor */ - break; - } -} diff --git a/osfmk/ddb/db_input.h b/osfmk/ddb/db_input.h deleted file mode 100644 index 107bfcdc8..000000000 --- a/osfmk/ddb/db_input.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
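db_check_interrupt() above polls the console between bursts of output: ^C aborts through db_error(), and ^S pauses by spinning until ^Q arrives, still honoring a ^C typed during the pause. The same control flow with a scripted stand-in for cnmaygetc(); the function names and the input script are illustrative:

    #include <stdio.h>

    #define CTRL(c) ((c) & 0x1f)    /* same definition as db_input.c */

    /* Stand-in for cnmaygetc(): next pending character, or -1 when
     * none is waiting.  The script ends every ^S with a ^Q so the
     * pause loop below always terminates. */
    static int
    maygetc(void)
    {
        static const int script[] =
            { CTRL('s'), 'x', CTRL('q'), CTRL('c'), -1 };
        static unsigned i = 0;
        return (i < sizeof script / sizeof script[0]) ? script[i++] : -1;
    }

    /* Same shape as the deleted db_check_interrupt(): ^C aborts,
     * ^S spins until ^Q (aborting if ^C arrives meanwhile), anything
     * else is dropped on the floor. */
    static int                       /* 1 means "abort requested" */
    check_interrupt(void)
    {
        int c = maygetc();

        switch (c) {
        case -1:
            return 0;                /* no character pending */
        case CTRL('c'):
            return 1;
        case CTRL('s'):
            do {
                c = maygetc();
                if (c == CTRL('c'))
                    return 1;
            } while (c != CTRL('q'));
            return 0;
        default:
            return 0;
        }
    }

    int
    main(void)
    {
        for (int n = 0; n < 3; n++)
            printf("poll %d -> %s\n", n,
                check_interrupt() ? "abort" : "ok");
        return 0;
    }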
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.6.1 1994/09/23 01:19:48 ezf - * change marker to not FREE - * [1994/09/22 21:10:10 ezf] - * - * Revision 1.1.2.3 1993/09/17 21:34:37 robert - * change marker to OSF_FREE_COPYRIGHT - * [1993/09/17 21:27:17 robert] - * - * Revision 1.1.2.2 1993/07/27 18:27:36 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:12:08 elliston] - * - * $EndLog$ - */ - -#ifndef _DDB_DB_INPUT_H_ -#define _DDB_DB_INPUT_H_ - -/* Prototypes for functions exported by this module. - */ - -int db_readline( - char * lstart, - int lsize); - -void db_check_interrupt(void); - -#endif /* !_DDB_DB_INPUT_H_ */ diff --git a/osfmk/ddb/db_lex.c b/osfmk/ddb/db_lex.c deleted file mode 100644 index 50975857d..000000000 --- a/osfmk/ddb/db_lex.c +++ /dev/null @@ -1,575 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.11.3 1996/01/09 19:15:49 devrcs - * Change 'register foo' to 'register int foo'. - * [1995/12/01 21:42:12 jfraser] - * - * Merged '64-bit safe' changes from DEC alpha port. - * [1995/11/21 18:03:11 jfraser] - * - * Revision 1.1.11.2 1995/01/06 19:10:21 devrcs - * mk6 CR668 - 1.3b26 merge - * * Revision 1.1.4.6 1994/05/06 18:39:20 tmt - * Merged osc1.3dec/shared with osc1.3b19 - * Merge Alpha changes into osc1.312b source code. - * String protos. - * 64bit cleanup. - * Cleanup to quiet gcc warnings. - * * End1.3merge - * [1994/11/04 08:49:35 dwm] - * - * Revision 1.1.11.1 1994/09/23 01:19:59 ezf - * change marker to not FREE - * [1994/09/22 21:10:14 ezf] - * - * Revision 1.1.4.4 1993/08/11 20:37:55 elliston - * Add ANSI Prototypes. CR #9523. - * [1993/08/11 03:33:26 elliston] - * - * Revision 1.1.4.3 1993/07/27 18:27:38 elliston - * Add ANSI prototypes. CR #9523. 
- * [1993/07/27 18:12:13 elliston] - * - * Revision 1.1.4.2 1993/06/02 23:11:27 jeffc - * Added to OSF/1 R1.3 from NMK15.0. - * [1993/06/02 20:56:32 jeffc] - * - * Revision 1.1 1992/09/30 02:01:10 robert - * Initial revision - * - * $EndLog$ - */ -/* CMU_HIST */ -/* - * Revision 2.5 91/10/09 16:00:20 af - * Revision 2.4.3.1 91/10/05 13:06:25 jeffreyh - * Added relational operator tokens and string constant etc. - * Added input switching functions for macro and conditional command. - * Moved skip_to_eol() from db_command.c and added db_last_lp to print - * skipped input data as a warning message. - * Added last input repetition support to db_read_line. - * Changed db_lex() to always set db_tok_string for error message. - * [91/08/29 tak] - * - * Revision 2.4.3.1 91/10/05 13:06:25 jeffreyh - * Added relational operator tokens and string constant etc. - * Added input switching functions for macro and conditional command. - * Moved skip_to_eol() from db_command.c and added db_last_lp to print - * skipped input data as a warning message. - * Added last input repetition support to db_read_line. - * Changed db_lex() to always set db_tok_string for error message. - * [91/08/29 tak] - * - * Revision 2.4 91/05/14 15:34:23 mrt - * Correcting copyright - * - * Revision 2.3 91/02/05 17:06:36 mrt - * Changed to new Mach copyright - * [91/01/31 16:18:20 mrt] - * - * Revision 2.2 90/08/27 21:51:10 dbg - * Add 'dotdot' token. - * [90/08/22 dbg] - * - * Allow backslash to quote any character into an identifier. - * Allow colon in identifier for symbol table qualification. - * [90/08/16 dbg] - * Reduce lint. - * [90/08/07 dbg] - * Created. - * [90/07/25 dbg] - * - */ -/* CMU_ENDHIST */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ -/* - * Lexical analyzer. - */ -#include /* For strlcpy(), strlcmp(), strlen() */ -#include -#include -#include -#include /* For db_printf() */ - -char db_line[DB_LEX_LINE_SIZE]; -char db_last_line[DB_LEX_LINE_SIZE]; -const char *db_lp, *db_endlp; -const char *db_last_lp; -int db_look_char = 0; -db_expr_t db_look_token = 0; - - -/* Prototypes for functions local to this file. XXX -- should be static! 
- */ -void db_flush_line(void); -void db_unread_char(int c); - - -int -db_read_line(const char *repeat_last) -{ - int i; - - i = db_readline(db_line, sizeof(db_line)); - if (i == 0) - return (0); /* EOI */ - if (repeat_last) { - if (strncmp(db_line, repeat_last, strlen(repeat_last)) == 0) { - strlcpy(db_line, db_last_line, DB_LEX_LINE_SIZE); - db_printf("%s", db_line); - i = strlen(db_line); - } else if (db_line[0] != '\n' && db_line[0] != 0) - strlcpy(db_last_line, db_line, DB_LEX_LINE_SIZE); - } - db_lp = db_line; - db_endlp = db_lp + i; - db_last_lp = db_lp; - db_look_char = 0; - db_look_token = 0; - return (i); -} - -void -db_flush_line(void) -{ - db_lp = db_line; - db_last_lp = db_lp; - db_endlp = db_line; -} - -void -db_switch_input(const char *buffer, int size) -{ - db_lp = buffer; - db_last_lp = db_lp; - db_endlp = buffer + size; - db_look_char = 0; - db_look_token = 0; -} - -void -db_save_lex_context(register struct db_lex_context *lp) -{ - lp->l_ptr = db_lp; - lp->l_eptr = db_endlp; - lp->l_char = db_look_char; - lp->l_token = (int)db_look_token; -} - -void -db_restore_lex_context(register struct db_lex_context *lp) -{ - db_lp = lp->l_ptr; - db_last_lp = db_lp; - db_endlp = lp->l_eptr; - db_look_char = lp->l_char; - db_look_token = lp->l_token; -} - -int -db_read_char(void) -{ - int c; - - if (db_look_char != 0) { - c = db_look_char; - db_look_char = 0; - } - else if (db_lp >= db_endlp) - c = -1; - else - c = *db_lp++; - return (c); -} - -void -db_unread_char(int c) -{ - db_look_char = c; -} - -void -db_unread_token(int t) -{ - db_look_token = t; -} - -int -db_read_token(void) -{ - int t; - - if (db_look_token) { - t = (int)db_look_token; - db_look_token = 0; - } - else { - db_last_lp = db_lp; - if (db_look_char) - db_last_lp--; - t = db_lex(); - } - return (t); -} - -db_expr_t db_tok_number; -char db_tok_string[TOK_STRING_SIZE]; - -db_expr_t db_radix = 16; - -void -db_flush_lex(void) -{ - db_flush_line(); - db_look_char = 0; - db_look_token = 0; -} - -#define DB_DISP_SKIP 40 /* number of chars to display skip */ - -void -db_skip_to_eol(void) -{ - register int skip; - register int t; - register int n; - const char *p; - - t = db_read_token(); - p = db_last_lp; - for (skip = 0; t != tEOL && t != tSEMI_COLON && t != tEOF; skip++) - t = db_read_token(); - if (t == tSEMI_COLON) - db_unread_token(t); - if (skip != 0) { - while (p < db_last_lp && (*p == ' ' || *p == '\t')) - p++; - db_printf("Warning: Skipped input data \""); - for (n = 0; n < DB_DISP_SKIP && p < db_last_lp; n++) - db_printf("%c", *p++); - if (n >= DB_DISP_SKIP) - db_printf("...."); - db_printf("\"\n"); - } -} - -int -db_lex(void) -{ - register char *cp; - register int c; - - c = db_read_char(); - while (c <= ' ' || c > '~') { - if (c == '\n' || c == -1) - return (tEOL); - c = db_read_char(); - } - - cp = db_tok_string; - *cp++ = c; - - if (c >= '0' && c <= '9') { - /* number */ - int r, digit; - - if (c > '0') - r = (int)db_radix; - else { - c = db_read_char(); - if (c == 'O' || c == 'o') - r = 8; - else if (c == 'T' || c == 't') - r = 10; - else if (c == 'X' || c == 'x') - r = 16; - else { - cp--; - r = (int)db_radix; - db_unread_char(c); - } - c = db_read_char(); - *cp++ = c; - } - db_tok_number = 0; - for (;;) { - if (c >= '0' && c <= ((r == 8) ? 
'7' : '9')) - digit = c - '0'; - else if (r == 16 && ((c >= 'A' && c <= 'F') || - (c >= 'a' && c <= 'f'))) { - if (c >= 'a') - digit = c - 'a' + 10; - else - digit = c - 'A' + 10; - } - else - break; - db_tok_number = db_tok_number * r + digit; - c = db_read_char(); - if (cp < &db_tok_string[sizeof(db_tok_string)-1]) - *cp++ = c; - } - cp[-1] = 0; - if ((c >= '0' && c <= '9') || - (c >= 'A' && c <= 'Z') || - (c >= 'a' && c <= 'z') || - (c == '_')) - { - db_printf("Bad character '%c' after number %s\n", - c, db_tok_string); - db_error(0); - db_flush_lex(); - return (tEOF); - } - db_unread_char(c); - return (tNUMBER); - } - if ((c >= 'A' && c <= 'Z') || - (c >= 'a' && c <= 'z') || - c == '_' || c == '\\' || c == ':') - { - /* identifier */ - if (c == '\\') { - c = db_read_char(); - if (c == '\n' || c == -1) - db_error("Bad '\\' at the end of line\n"); - cp[-1] = c; - } - while (1) { - c = db_read_char(); - if ((c >= 'A' && c <= 'Z') || - (c >= 'a' && c <= 'z') || - (c >= '0' && c <= '9') || - c == '_' || c == '\\' || c == ':' || c == '.') - { - if (c == '\\') { - c = db_read_char(); - if (c == '\n' || c == -1) - db_error("Bad '\\' at the end of line\n"); - } - *cp++ = c; - if (cp == db_tok_string+sizeof(db_tok_string)) { - db_error("String too long\n"); - db_flush_lex(); - return (tEOF); - } - continue; - } - else { - *cp = '\0'; - break; - } - } - db_unread_char(c); - return (tIDENT); - } - - *cp = 0; - switch (c) { - case '+': - return (tPLUS); - case '-': - return (tMINUS); - case '.': - c = db_read_char(); - if (c == '.') { - *cp++ = c; - *cp = 0; - return (tDOTDOT); - } - db_unread_char(c); - return (tDOT); - case '*': - return (tSTAR); - case '/': - return (tSLASH); - case '=': - c = db_read_char(); - if (c == '=') { - *cp++ = c; - *cp = 0; - return(tLOG_EQ); - } - db_unread_char(c); - return (tEQ); - case '%': - return (tPCT); - case '#': - return (tHASH); - case '(': - return (tLPAREN); - case ')': - return (tRPAREN); - case ',': - return (tCOMMA); - case '\'': - return (tQUOTE); - case '"': - /* string */ - cp = db_tok_string; - c = db_read_char(); - while (c != '"' && c > 0 && c != '\n') { - if (cp >= &db_tok_string[sizeof(db_tok_string)-1]) { - db_error("Too long string\n"); - db_flush_lex(); - return (tEOF); - } - if (c == '\\') { - c = db_read_char(); - switch(c) { - case 'n': - c = '\n'; break; - case 't': - c = '\t'; break; - case '\\': - case '"': - break; - default: - db_printf("Bad escape sequence '\\%c'\n", c); - db_error(0); - db_flush_lex(); - return (tEOF); - } - } - *cp++ = c; - c = db_read_char(); - } - *cp = 0; - if (c != '"') { - db_error("Non terminated string constant\n"); - db_flush_lex(); - return (tEOF); - } - return (tSTRING); - case '$': - return (tDOLLAR); - case '!': - c = db_read_char(); - if (c == '=') { - *cp++ = c; - *cp = 0; - return(tLOG_NOT_EQ); - } - db_unread_char(c); - return (tEXCL); - case '&': - c = db_read_char(); - if (c == '&') { - *cp++ = c; - *cp = 0; - return(tLOG_AND); - } - db_unread_char(c); - return(tBIT_AND); - case '|': - c = db_read_char(); - if (c == '|') { - *cp++ = c; - *cp = 0; - return(tLOG_OR); - } - db_unread_char(c); - return(tBIT_OR); - case '<': - c = db_read_char(); - *cp++ = c; - *cp = 0; - if (c == '<') - return (tSHIFT_L); - if (c == '=') - return (tLESS_EQ); - cp[-1] = 0; - db_unread_char(c); - return(tLESS); - break; - case '>': - c = db_read_char(); - *cp++ = c; - *cp = 0; - if (c == '>') - return (tSHIFT_R); - if (c == '=') - return (tGREATER_EQ); - cp[-1] = 0; - db_unread_char(c); - return (tGREATER); - break; - 
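The number-lexing path above accepts "0o", "0t", and "0x" prefixes for octal, decimal, and hex, falls back to db_radix (16 by default) otherwise, and accumulates digits as value = value*r + digit. A standalone version of that rule, without db_lex()'s token buffer or its trailing-junk check:

    #include <stdio.h>

    /* Parse a ddb-style number: a leading "0o"/"0t"/"0x" forces the
     * radix, otherwise `radix` applies.  Simplified from db_lex():
     * no overflow detection, stops at the first non-digit. */
    static unsigned long
    parse_number(const char *s, int radix)
    {
        int r = radix;
        unsigned long value = 0;

        if (s[0] == '0' && (s[1] == 'o' || s[1] == 'O')) { r = 8;  s += 2; }
        else if (s[0] == '0' && (s[1] == 't' || s[1] == 'T')) { r = 10; s += 2; }
        else if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) { r = 16; s += 2; }

        for (; *s; s++) {
            int digit;
            if (*s >= '0' && *s <= '9')
                digit = *s - '0';
            else if (r == 16 && *s >= 'a' && *s <= 'f')
                digit = *s - 'a' + 10;
            else if (r == 16 && *s >= 'A' && *s <= 'F')
                digit = *s - 'A' + 10;
            else
                break;
            value = value * r + digit;
        }
        return value;
    }

    int
    main(void)
    {
        printf("0t100 = %lu\n", parse_number("0t100", 16));  /* 100 */
        printf("0o100 = %lu\n", parse_number("0o100", 16));  /* 64  */
        printf("  100 = %lu\n", parse_number("100",   16));  /* 256 */
        return 0;
    }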
case ';': - return (tSEMI_COLON); - case '?': - return (tQUESTION); - case -1: - strlcpy(db_tok_string, "", TOK_STRING_SIZE); - return (tEOF); - } - db_printf("Bad character '%c'\n", c); - db_flush_lex(); - return (tEOF); -} diff --git a/osfmk/ddb/db_lex.h b/osfmk/ddb/db_lex.h deleted file mode 100644 index 2b327246f..000000000 --- a/osfmk/ddb/db_lex.h +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.11.2 1995/01/06 19:10:24 devrcs - * mk6 CR668 - 1.3b26 merge - * 64bit cleanup - * [1994/10/14 03:39:54 dwm] - * - * Revision 1.1.11.1 1994/09/23 01:20:10 ezf - * change marker to not FREE - * [1994/09/22 21:10:18 ezf] - * - * Revision 1.1.4.3 1993/07/27 18:27:40 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:12:19 elliston] - * - * Revision 1.1.4.2 1993/06/02 23:11:33 jeffc - * Added to OSF/1 R1.3 from NMK15.0. - * [1993/06/02 20:56:37 jeffc] - * - * Revision 1.1 1992/09/30 02:24:17 robert - * Initial revision - * - * $EndLog$ - */ -/* CMU_HIST */ -/* - * Revision 2.5 91/10/09 16:00:48 af - * Revision 2.4.3.1 91/10/05 13:06:34 jeffreyh - * Added db_lex_context structure and some routine declarations - * for macro and conditinal command. - * Added relational operator tokens etc. for condition expression. - * Changed TOK_STRING_SIZE from 120 to 64, and defined - * DB_LEX_LINE_SIZE as 256 which was previously embedded - * in db_lex.c as 120. - * [91/08/29 tak] - * Revision 2.4.1 91/07/15 09:30:00 tak - * Added db_lex_context for macro support - * Added some lexical constants to support logical expression etc. - * [91/05/15 13:55:00 tak] - * - * Revision 2.4.3.1 91/10/05 13:06:34 jeffreyh - * Added db_lex_context structure and some routine declarations - * for macro and conditinal command. - * Added relational operator tokens etc. for condition expression. - * Changed TOK_STRING_SIZE from 120 to 64, and defined - * DB_LEX_LINE_SIZE as 256 which was previously embedded - * in db_lex.c as 120. 
- * [91/08/29 tak] - * - * Revision 2.4.1 91/07/15 09:30:00 tak - * Added db_lex_context for macro support - * Added some lexical constants to support logical expression etc. - * [91/05/15 13:55:00 tak] - * - * Revision 2.4 91/05/14 15:34:38 mrt - * Correcting copyright - * - * Revision 2.3 91/02/05 17:06:41 mrt - * Changed to new Mach copyright - * [91/01/31 16:18:28 mrt] - * - * Revision 2.2 90/08/27 21:51:16 dbg - * Add 'dotdot' token. - * [90/08/22 dbg] - * Export db_flush_lex. - * [90/08/07 dbg] - * Created. - * [90/07/25 dbg] - * - */ -/* CMU_ENDHIST */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ -/* - * Lexical analyzer. - */ - -#ifndef _DDB_DB_LEX_H_ -#define _DDB_DB_LEX_H_ - -#include /* For db_expr_t */ - -#define TOK_STRING_SIZE 64 -#define DB_LEX_LINE_SIZE 256 - -struct db_lex_context { - int l_char; /* peek char */ - int l_token; /* peek token */ - const char *l_ptr; /* line pointer */ - const char *l_eptr; /* line end pointer */ -}; - -extern db_expr_t db_tok_number; -extern char db_tok_string[TOK_STRING_SIZE]; -extern db_expr_t db_radix; - -#define tEOF (-1) -#define tEOL 1 -#define tNUMBER 2 -#define tIDENT 3 -#define tPLUS 4 -#define tMINUS 5 -#define tDOT 6 -#define tSTAR 7 -#define tSLASH 8 -#define tEQ 9 -#define tLPAREN 10 -#define tRPAREN 11 -#define tPCT 12 -#define tHASH 13 -#define tCOMMA 14 -#define tQUOTE 15 -#define tDOLLAR 16 -#define tEXCL 17 -#define tSHIFT_L 18 -#define tSHIFT_R 19 -#define tDOTDOT 20 -#define tSEMI_COLON 21 -#define tLOG_EQ 22 -#define tLOG_NOT_EQ 23 -#define tLESS 24 -#define tLESS_EQ 25 -#define tGREATER 26 -#define tGREATER_EQ 27 -#define tBIT_AND 28 -#define tBIT_OR 29 -#define tLOG_AND 30 -#define tLOG_OR 31 -#define tSTRING 32 -#define tQUESTION 33 - -/* Prototypes for functions exported by this module. - */ -int db_read_line(const char *); - -void db_switch_input(const char *, int); - -void db_save_lex_context(struct db_lex_context *lp); - -void db_restore_lex_context(struct db_lex_context *lp); - -int db_read_char(void); - -void db_unread_token(int t); - -int db_read_token(void); - -void db_flush_lex(void); - -void db_skip_to_eol(void); - -int db_lex(void); - -#endif /* !_DDB_DB_LEX_H_ */ diff --git a/osfmk/ddb/db_macro.c b/osfmk/ddb/db_macro.c deleted file mode 100644 index 9e9e6cf98..000000000 --- a/osfmk/ddb/db_macro.c +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. 
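struct db_lex_context above captures exactly the state needed to suspend one input line and run another: the line pointers plus the one-slot character and token pushback. A sketch of how db_save_lex_context() and db_restore_lex_context() bracket nested input such as a macro body; the global and helper names mirror ddb's but are local to this example:

    #include <stdio.h>

    /* Mirrors struct db_lex_context: enough lexer state to suspend
     * the current line, lex another buffer, and resume. */
    struct lex_context {
        const char *ptr;      /* current position  (db_lp)         */
        const char *eptr;     /* end of the buffer (db_endlp)      */
        int         ch;       /* pushed-back char  (db_look_char)  */
        int         tok;      /* pushed-back token (db_look_token) */
    };

    static const char *lp, *endlp;
    static int look_char, look_token;

    static void
    save_ctx(struct lex_context *c)
    {
        c->ptr = lp;        c->eptr = endlp;
        c->ch  = look_char; c->tok  = look_token;
    }

    static void
    restore_ctx(const struct lex_context *c)
    {
        lp = c->ptr;        endlp = c->eptr;
        look_char = c->ch;  look_token = c->tok;
    }

    static void
    switch_input(const char *buf, int size)  /* like db_switch_input() */
    {
        lp = buf;       endlp = buf + size;
        look_char = 0;  look_token = 0;
    }

    int
    main(void)
    {
        struct lex_context saved;

        switch_input("outer command", 13);
        lp += 6;                         /* pretend we lexed "outer " */

        save_ctx(&saved);                /* suspend for a macro body */
        switch_input("macro body", 10);
        printf("running: %.*s\n", (int)(endlp - lp), lp);

        restore_ctx(&saved);             /* resume where we left off */
        printf("resumed: %.*s\n", (int)(endlp - lp), lp);
        return 0;
    }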
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -#include -#include /* For strncmp(), strlcpy() */ - -#include -#include -#include -#include -#include -#include /* For db_printf() */ -#include -#include - -/* - * debugger macro support - */ - -#define DB_NUSER_MACRO 10 /* max user macros */ - -int db_macro_free = DB_NUSER_MACRO; -struct db_user_macro { - char m_name[TOK_STRING_SIZE]; - char m_lbuf[DB_LEX_LINE_SIZE]; - int m_size; -} db_user_macro[DB_NUSER_MACRO]; - -int db_macro_level = -1; -db_expr_t db_macro_args[DB_MACRO_LEVEL][DB_MACRO_NARGS]; - - -/* Prototypes for functions local to this file. 
- */ -static struct db_user_macro *db_lookup_macro(char *name); - - -static struct db_user_macro * -db_lookup_macro(char *name) -{ - register struct db_user_macro *mp; - - for (mp = db_user_macro; mp < &db_user_macro[DB_NUSER_MACRO]; mp++) { - if (mp->m_name[0] == 0) - continue; - if (strncmp(mp->m_name, name, TOK_STRING_SIZE) == 0) - return(mp); - } - return(0); -} - -void -db_def_macro_cmd(void) -{ - register char *p; - register int c; - register struct db_user_macro *mp, *ep; - - if (db_read_token() != tIDENT) { - db_printf("Bad macro name \"%s\"\n", db_tok_string); - db_error(0); - /* NOTREACHED */ - } - if ((mp = db_lookup_macro(db_tok_string)) == 0) { - if (db_macro_free <= 0) - db_error("Too many macros\n"); - /* NOTREACHED */ - ep = &db_user_macro[DB_NUSER_MACRO]; - for (mp = db_user_macro; mp < ep && mp->m_name[0]; mp++); - if (mp >= ep) - db_error("ddb: internal error(macro)\n"); - /* NOTREACHED */ - db_macro_free--; - strlcpy(mp->m_name, db_tok_string, TOK_STRING_SIZE); - } - for (c = db_read_char(); c == ' ' || c == '\t'; c = db_read_char()); - for (p = mp->m_lbuf; c > 0; c = db_read_char()) - *p++ = c; - *p = 0; - mp->m_size = p - mp->m_lbuf; -} - -void -db_del_macro_cmd(void) -{ - struct db_user_macro *mp = NULL; - - if (db_read_token() != tIDENT - || (mp = db_lookup_macro(db_tok_string)) == 0) { - db_printf("No such macro \"%s\"\n", db_tok_string); - db_error(0); - /* NOTREACHED */ - } - mp->m_name[0] = 0; - db_macro_free++; -} - -void -db_show_macro(void) -{ - register struct db_user_macro *mp; - int t; - char *name = 0; - - if ((t = db_read_token()) == tIDENT) - name = db_tok_string; - else - db_unread_token(t); - for (mp = db_user_macro; mp < &db_user_macro[DB_NUSER_MACRO]; mp++) { - if (mp->m_name[0] == 0) - continue; - if (name && strncmp(mp->m_name, name, TOK_STRING_SIZE)) - continue; - db_printf("%s: %s", mp->m_name, mp->m_lbuf); - } -} - -int -db_exec_macro(char *name) -{ - register struct db_user_macro *mp; - register int n; - - if ((mp = db_lookup_macro(name)) == 0) - return(-1); - if (db_macro_level+1 >= DB_MACRO_LEVEL) { - db_macro_level = -1; - db_error("Too many macro nest\n"); - /* NOTREACHED */ - } - for (n = 0; - n < DB_MACRO_NARGS && - db_expression(&db_macro_args[db_macro_level+1][n]); - n++); - while (n < DB_MACRO_NARGS) - db_macro_args[db_macro_level+1][n++] = 0; - db_macro_level++; - db_exec_cmd_nest(mp->m_lbuf, mp->m_size); - db_macro_level--; - return(0); -} - -int -db_arg_variable(__unused struct db_variable *vp, db_expr_t *valuep, int flag, - db_var_aux_param_t ap) -{ - db_expr_t value; - char *name; - db_addr_t offset; - - if (flag == DB_VAR_SHOW) { - value = db_macro_args[ap->hidden_level][ap->suffix[0]-1]; - db_printf("%#lln", (unsigned long long)value); - db_find_xtrn_task_sym_and_offset(value, &name, &offset, TASK_NULL); - if (name != (char *)0 && offset <= db_maxoff && offset != value) { - db_printf("\t%s", name); - if (offset != 0) - db_printf("+%#llr", (unsigned long long)offset); - } - return(0); - } - - if (ap->level != 1 || ap->suffix[0] < 1 || - ap->suffix[0] > DB_MACRO_NARGS) { - db_error("Bad $arg variable\n"); - /* NOTREACHED */ - } - if (flag == DB_VAR_GET) - *valuep = db_macro_args[db_macro_level][ap->suffix[0]-1]; - else - db_macro_args[db_macro_level][ap->suffix[0]-1] = *valuep; - return(0); -} diff --git a/osfmk/ddb/db_macro.h b/osfmk/ddb/db_macro.h deleted file mode 100644 index 2a21e44de..000000000 --- a/osfmk/ddb/db_macro.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. 
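The db_exec_macro()/db_arg_variable() pair above gives each nested macro invocation its own row of argument slots: arguments are evaluated into the row one level above the current one, the level is bumped for the body, and it is restored on the way out, so $arg1..$argN always resolve against the innermost macro. A compact sketch of that stack, with the depth limit assumed rather than taken from db_macro.h:

    #include <stdio.h>

    #define MACRO_LEVEL 5    /* assumed nesting limit (DB_MACRO_LEVEL) */
    #define MACRO_NARGS 10   /* ddb's DB_MACRO_NARGS */

    static long macro_args[MACRO_LEVEL][MACRO_NARGS];
    static int  macro_level = -1;       /* -1: no macro is executing */

    /* $argN of the macro currently executing, as in db_arg_variable(). */
    static long arg(int n)
    {
        return macro_args[macro_level][n - 1];
    }

    /* Shape of db_exec_macro(): stash arguments one level up, bump the
     * level around the body, restore it afterwards. */
    static void exec_macro(long arg1, int depth)
    {
        if (macro_level + 1 >= MACRO_LEVEL)
            return;                     /* "Too many macro nest" */
        macro_args[macro_level + 1][0] = arg1;
        macro_level++;
        printf("level %d: $arg1 = %ld\n", macro_level, arg(1));
        if (depth > 0)
            exec_macro(arg1 * 2, depth - 1);  /* a macro invoking a macro */
        macro_level--;
    }

    int main(void)
    {
        exec_macro(7, 2);
        return 0;
    }

Running it prints $arg1 as 7, 14, and 28 at levels 0, 1, and 2: each nesting level sees only its own arguments.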
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.6.1 1994/09/23 01:20:28 ezf - * change marker to not FREE - * [1994/09/22 21:10:28 ezf] - * - * Revision 1.1.2.3 1993/09/17 21:34:39 robert - * change marker to OSF_FREE_COPYRIGHT - * [1993/09/17 21:27:20 robert] - * - * Revision 1.1.2.2 1993/07/27 18:27:48 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:12:28 elliston] - * - * $EndLog$ - */ - -#ifndef _DDB_DB_MACRO_H_ -#define _DDB_DB_MACRO_H_ - -#include -#include - -/* Prototypes for functions exported by this module. - */ -void db_def_macro_cmd(void); - -void db_del_macro_cmd(void); - -void db_show_macro(void); - -int db_exec_macro(char *name); - -int db_arg_variable( - struct db_variable *vp, - db_expr_t *valuep, - int flag, - db_var_aux_param_t ap); - -#endif /* !_DDB_DB_MACRO_H_ */ diff --git a/osfmk/ddb/db_output.c b/osfmk/ddb/db_output.c deleted file mode 100644 index 69bfeeaef..000000000 --- a/osfmk/ddb/db_output.c +++ /dev/null @@ -1,345 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ - -/* - * Printf and character output for debugger. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Character output - tracks position in line. - * To do this correctly, we should know how wide - * the output device is - then we could zero - * the line position when the output device wraps - * around to the start of the next line. - * - * Instead, we count the number of spaces printed - * since the last printing character so that we - * don't print trailing spaces. This avoids most - * of the wraparounds. - */ - -#ifndef DB_MAX_LINE -#define DB_MAX_LINE 43 /* maximum line */ -#define DB_MAX_WIDTH 132 /* maximum width */ -#endif /* DB_MAX_LINE */ - -#define DB_MIN_MAX_WIDTH 20 /* minimum max width */ -#define DB_MIN_MAX_LINE 3 /* minimum max line */ -#define CTRL(c) ((c) & 0xff) - -int db_output_position = 0; /* output column */ -int db_output_line = 0; /* output line number */ -int db_last_non_space = 0; /* last non-space character */ -int db_last_gen_return = 0; /* last character generated return */ -int db_auto_wrap = 1; /* auto wrap at end of line ? */ -int db_tab_stop_width = 8; /* how wide are tab stops? */ -#define NEXT_TAB(i) \ - ((((i) + db_tab_stop_width) / db_tab_stop_width) * db_tab_stop_width) -int db_max_line = DB_MAX_LINE; /* output max lines */ -int db_max_width = DB_MAX_WIDTH; /* output line width */ - - -/* Prototypes for functions local to this file. XXX -- should be static! - */ -static void db_more(void); -void db_advance_output_position(int new_output_position, - int blank); - - -/* - * Force pending whitespace. 
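NEXT_TAB() rounds the current column up to the next multiple of db_tab_stop_width; that is what lets the whitespace-flushing code below cover runs of pending blanks with tab characters instead of individual spaces. The arithmetic in isolation, as a sketch with the default width of 8 hard-coded:

    #include <stdio.h>

    /* Same rounding as ddb's NEXT_TAB(): advance to the next multiple of
     * the tab stop width. */
    static int next_tab(int pos, int width)
    {
        return ((pos + width) / width) * width;
    }

    int main(void)
    {
        int positions[] = { 0, 1, 7, 8, 9 };

        for (unsigned i = 0; i < sizeof positions / sizeof positions[0]; i++)
            printf("col %2d -> next tab stop %2d\n",
                   positions[i], next_tab(positions[i], 8));
        return 0;
    }

Columns 0, 1, and 7 all advance to 8; columns 8 and 9 advance to 16, so a column already on a stop still moves to the next one, as the division-then-multiplication form guarantees.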
- */ -void -db_force_whitespace(void) -{ - register int last_print, next_tab; - - last_print = db_last_non_space; - while (last_print < db_output_position) { - next_tab = NEXT_TAB(last_print); - if (next_tab <= db_output_position) { - cnputc('\t'); - last_print = next_tab; - } - else { - cnputc(' '); - last_print++; - } - } - db_last_non_space = db_output_position; -} - -void -db_reset_more() -{ - db_output_line = 0; -} - -static void -db_more(void) -{ - const char *p; - boolean_t quit_output = FALSE; - - for (p = "--db_more--"; *p; p++) - cnputc(*p); - switch(cngetc()) { - case ' ': - db_output_line = 0; - break; - case 'q': - case CTRL('c'): - db_output_line = 0; - quit_output = TRUE; - break; - default: - db_output_line--; - break; - } - p = "\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b"; - while (*p) - cnputc(*p++); - if (quit_output) { - db_error((char *) 0); - /* NOTREACHED */ - } -} - -void -db_advance_output_position(int new_output_position, - int blank) -{ - if (db_max_width >= DB_MIN_MAX_WIDTH - && new_output_position >= db_max_width) { - /* auto new line */ - if (!db_auto_wrap || blank) - cnputc('\n'); - db_output_position = 0; - db_last_non_space = 0; - db_last_gen_return = 1; - db_output_line++; - } else { - db_output_position = new_output_position; - } -} - -boolean_t -db_reserve_output_position(int increment) -{ - if (db_max_width >= DB_MIN_MAX_WIDTH - && db_output_position + increment >= db_max_width) { - /* auto new line */ - if (!db_auto_wrap || db_last_non_space != db_output_position) - cnputc('\n'); - db_output_position = 0; - db_last_non_space = 0; - db_last_gen_return = 1; - db_output_line++; - return TRUE; - } - return FALSE; -} - -/* - * Output character. Buffer whitespace. - */ -void -db_putchar(char c) -{ - if (db_max_line >= DB_MIN_MAX_LINE && db_output_line >= db_max_line-1) - db_more(); - if (c > ' ' && c <= '~') { - /* - * Printing character. - * If we have spaces to print, print them first. - * Use tabs if possible. - */ - db_force_whitespace(); - cnputc(c); - db_last_gen_return = 0; - db_advance_output_position(db_output_position+1, 0); - db_last_non_space = db_output_position; - } - else if (c == '\n') { - /* Return */ - if (db_last_gen_return) { - db_last_gen_return = 0; - } else { - cnputc(c); - db_output_position = 0; - db_last_non_space = 0; - db_output_line++; - db_check_interrupt(); - } - } - else if (c == '\t') { - /* assume tabs every 8 positions */ - db_advance_output_position(NEXT_TAB(db_output_position), 1); - } - else if (c == ' ') { - /* space */ - db_advance_output_position(db_output_position+1, 1); - } - else if (c == '\007') { - /* bell */ - cnputc(c); - } - /* other characters are assumed non-printing */ -} - -/* - * Return output position - */ -int -db_print_position(void) -{ - return (db_output_position); -} - -/* - * End line if too long. - */ -void -db_end_line(void) -{ - if (db_output_position >= db_max_width-1) { - /* auto new line */ - if (!db_auto_wrap) - cnputc('\n'); - db_output_position = 0; - db_last_non_space = 0; - db_last_gen_return = 1; - db_output_line++; - } -} - -/* - * Printing - */ - -void -db_printf(const char *fmt, ...) -{ - va_list listp; - - va_start(listp, fmt); - _doprnt(fmt, &listp, db_putchar, (int)db_radix); - va_end(listp); -} - -/* alternate name */ - -void -kdbprintf(const char *fmt, ...) -{ - va_list listp; - - va_start(listp, fmt); - _doprnt(fmt, &listp, db_putchar, (int)db_radix); - va_end(listp); -} - -int db_indent = 0; - -/* - * Printing (to console) with indentation. 
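The paging in db_putchar()/db_more() above counts output lines and pauses before a screenful scrolls away: space continues with a fresh count, 'q' aborts the command, and anything else buys one more line. A stdio sketch of that pacing (illustrative only, since the real code talks to the console driver and can abort via db_error()):

    #include <stdio.h>

    #define MAX_LINE 43     /* ddb's default DB_MAX_LINE */

    static int out_line = 0;    /* lines printed since the last prompt */

    static void put_line(const char *s)
    {
        if (out_line >= MAX_LINE - 1) {
            fputs("--db_more--", stdout);
            if (getchar() == ' ')
                out_line = 0;   /* space: another full screen */
            else
                out_line--;     /* anything else: just one more line */
        }
        puts(s);
        out_line++;
    }

    int main(void)
    {
        char buf[32];

        for (int i = 0; i < 100; i++) {
            snprintf(buf, sizeof buf, "line %d", i);
            put_line(buf);
        }
        return 0;
    }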
- */ -void -iprintf(const char *fmt, ...) -{ - va_list listp; - register int i; - - for (i = db_indent; i > 0; ){ - if (i >= 8) { - kdbprintf("\t"); - i -= 8; - } - else { - kdbprintf(" "); - i--; - } - } - - va_start(listp, fmt); - _doprnt(fmt, &listp, db_putchar, (int)db_radix); - va_end(listp); -} - -void -db_output_prompt(void) -{ - db_printf("db%s", (db_default_act) ? "t": ""); - db_printf("{%d}", cpu_number()); - db_printf("> "); -} - diff --git a/osfmk/ddb/db_print.c b/osfmk/ddb/db_print.c deleted file mode 100644 index d773823d0..000000000 --- a/osfmk/ddb/db_print.c +++ /dev/null @@ -1,931 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ - -/* - * Miscellaneous printing. - */ -#include - -#include /* For strlen() */ -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for db_vm() */ - -#include -#include - -#include -#include -#include -#include -#include -#include /* For db_printf() */ -#include - -#if TASK_SWAPPER -#include -#endif /* TASK_SWAPPER */ - -/* Prototypes for functions local to this file. 
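iprintf() above burns the current db_indent down with tab characters while eight or more columns remain, then finishes with single spaces before formatting. The same loop in a standalone user-space form (ind_printf is an invented name; vprintf stands in for the kernel's _doprnt):

    #include <stdarg.h>
    #include <stdio.h>

    static int indent = 0;

    /* Mirrors iprintf(): tabs while >= 8 columns of indent remain, then
     * single spaces, then the formatted text. */
    static void ind_printf(const char *fmt, ...)
    {
        va_list ap;
        int i = indent;

        while (i > 0) {
            if (i >= 8) { putchar('\t'); i -= 8; }
            else        { putchar(' ');  i--;    }
        }
        va_start(ap, fmt);
        vprintf(fmt, ap);
        va_end(ap);
    }

    int main(void)
    {
        ind_printf("top\n");
        indent = 10;                /* one tab plus two spaces */
        ind_printf("nested\n");
        return 0;
    }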
XXX -- should be static! - */ - -char *db_act_stat( - register thread_t thr_act, - char *status); - -char *db_act_swap_stat( - register thread_t thr_act, - char *status); - -void db_print_task( - task_t task, - int task_id, - int flag); - -void db_reset_print_entry( - void); - -void db_print_one_entry( - ipc_entry_t entry, - int index, - mach_port_name_t name, - boolean_t is_pset, - ipc_space_t space); - -int db_port_iterate( - thread_t thr_act, - boolean_t is_pset, - boolean_t do_output); - -ipc_port_t db_lookup_port( - thread_t thr_act, - int id); - -void db_print_act( - thread_t thr_act, - int act_id, - int flag); - -void db_print_space( - task_t task, - int task_id, - int flag); - -void db_print_task_vm( - task_t task, - int task_id, - boolean_t title, - char *modif); - -void db_system_stats(void); - - -void -db_show_regs(db_expr_t addr, boolean_t have_addr, __unused db_expr_t count, - char *modif) -{ - register struct db_variable *regp; - db_expr_t value; - db_addr_t offset; - char * name; - register int i; - struct db_var_aux_param aux_param; - task_t task = TASK_NULL; - - aux_param.modif = modif; - aux_param.thr_act = THREAD_NULL; - if (db_option(modif, 't')) { - if (have_addr) { - if (!db_check_act_address_valid((thread_t)(unsigned long)addr)) - return; - aux_param.thr_act = (thread_t)(unsigned long)addr; - } else - aux_param.thr_act = db_default_act; - if (aux_param.thr_act != THREAD_NULL) - task = aux_param.thr_act->task; - } - for (regp = db_regs; regp < db_eregs; regp++) { - if (regp->max_level > 1) { - db_printf("bad multi-suffixed register %s\n", regp->name); - continue; - } - aux_param.level = regp->max_level; - for (i = regp->low; i <= regp->high; i++) { - aux_param.suffix[0] = i; - db_read_write_variable(regp, &value, DB_VAR_GET, &aux_param); - if (regp->max_level > 0) - db_printf("%s%d%*s", regp->name, i, - 12-strlen(regp->name)-((i<10)?1:2), ""); - else - db_printf("%-12s", regp->name); - db_printf("%#*llN", 2+2*sizeof(db_expr_t), (unsigned long long)value); - db_find_xtrn_task_sym_and_offset((db_addr_t)value, &name, - &offset, task); - if (name != 0 && offset <= db_maxoff && offset != value) { - db_printf("\t%s", name); - if (offset != 0) - db_printf("+%#llr", (unsigned long long)offset); - } - db_printf("\n"); - } - } -} - -#define OPTION_LONG 0x001 /* long print option */ -#define OPTION_USER 0x002 /* print ps-like stuff */ -#define OPTION_INDENT 0x100 /* print with indent */ -#define OPTION_THREAD_TITLE 0x200 /* print thread title */ -#define OPTION_TASK_TITLE 0x400 /* print thread title */ - -#ifndef DB_TASK_NAME -#define DB_TASK_NAME(task) /* no task name */ -#define DB_TASK_NAME_TITLE "" /* no task name */ -#endif /* DB_TASK_NAME */ - -#ifndef db_act_fp_used -#define db_act_fp_used(thr_act) FALSE -#endif - -char * -db_act_stat( - register thread_t thr_act, - char *status) -{ - register char *p = status; - - if (!thr_act->active) { - *p++ = 'D', - *p++ = 'y', - *p++ = 'i', - *p++ = 'n', - *p++ = 'g'; - *p++ = ' '; - } else { - thread_t athread = thr_act; - - *p++ = (athread->state & TH_RUN) ? 'R' : '.'; - *p++ = (athread->state & TH_WAIT) ? 'W' : '.'; - *p++ = (athread->state & TH_SUSP) ? 'S' : '.'; - *p++ = (!athread->kernel_stack) ? 'O' : '.'; - *p++ = (athread->state & TH_UNINT) ? 'N' : '.'; - /* show if the FPU has been used */ - *p++ = db_act_fp_used(thr_act) ? 
'F' : '.'; - } - *p++ = 0; - return(status); -} - -char * -db_act_swap_stat(__unused thread_t thr_act, char *status) -{ - register char *p = status; - *p++ = 0; - - return status; -} - -const char *policy_list[] = { "TS", "RR", "??", "FF", "??", "??", "??", "BE"}; - -void -db_print_act( - thread_t thr_act, - int act_id, - int flag) -{ - thread_t athread; - char status[8]; - char swap_status[3]; - const char *indent = ""; - int policy; - - if (!thr_act) { - db_printf("db_print_act(NULL)!\n"); - return; - } - - athread = thr_act; - if (flag & OPTION_USER) { - - if (flag & OPTION_LONG) { - if (flag & OPTION_INDENT) - indent = " "; - if (flag & OPTION_THREAD_TITLE) { - db_printf("%s ID: ACT STAT SW STACK SHUTTLE", indent); - db_printf(" SUS PRI WAIT_FUNC\n"); - } - policy = ((athread && (athread->sched_mode == TH_MODE_TIMESHARE))? 1: 2); - db_printf("%s%3d%c %0*X %s %s %0*X %0*X %3d %3d/%s ", - indent, act_id, - (thr_act == current_thread())? '#': ':', - 2*sizeof(vm_offset_t), thr_act, - db_act_stat(thr_act, status), - db_act_swap_stat(thr_act, swap_status), - 2*sizeof(vm_offset_t), (athread ?athread->kernel_stack:0), - 2*sizeof(vm_offset_t), athread, - thr_act->suspend_count, - (athread ? athread->sched_pri : 999), /* XXX */ - policy_list[policy-1]); - if (athread) { - /* no longer TH_SWAP, no continuation to print */ - if (athread->state & TH_WAIT) - db_task_printsym((db_addr_t)athread->wait_event, - DB_STGY_ANY, kernel_task); - } - db_printf("\n"); - } else { - if (act_id % 3 == 0) { - if (flag & OPTION_INDENT) - db_printf("\n "); - } else - db_printf(" "); - db_printf("%3d%c(%0*X,%s)", act_id, - (thr_act == current_thread())? '#': ':', - 2*sizeof(vm_offset_t), thr_act, - db_act_stat(thr_act, status)); - } - } else { - if (flag & OPTION_INDENT) - db_printf(" %3d (%0*X) ", act_id, - 2*sizeof(vm_offset_t), thr_act); - else - db_printf("(%0*X) ", 2*sizeof(vm_offset_t), thr_act); - if (athread) { - db_printf("%c%c%c%c%c", - (athread->state & TH_RUN) ? 'R' : ' ', - (athread->state & TH_WAIT) ? 'W' : ' ', - (athread->state & TH_SUSP) ? 'S' : ' ', - (athread->state & TH_UNINT)? 'N' : ' ', - db_act_fp_used(thr_act) ? 'F' : ' '); - if (!athread->kernel_stack) { - if (athread->continuation) { - db_printf("("); - db_task_printsym((db_addr_t)(unsigned long)athread->continuation, - DB_STGY_ANY, kernel_task); - db_printf(")"); - } else { - db_printf("(handoff)"); - } - } - if (athread->state & TH_WAIT) { - db_printf(" "); - db_task_printsym((db_addr_t)athread->wait_event, - DB_STGY_ANY, kernel_task); - } - } else - db_printf("Empty"); - db_printf("\n"); - } -} - -void -db_print_task( - task_t task, - int task_id, - int flag) -{ - thread_t thr_act; - int act_id; - char sstate; - - if (flag & OPTION_USER) { - if (flag & OPTION_TASK_TITLE) { - db_printf(" ID: TASK MAP THD SUS PR SW %s", - DB_TASK_NAME_TITLE); - if ((flag & OPTION_LONG) == 0) - db_printf(" ACTS"); - db_printf("\n"); - } -#if TASK_SWAPPER - switch ((int) task->swap_state) { - case TASK_SW_IN: - sstate = 'I'; - break; - case TASK_SW_OUT: - sstate = 'O'; - break; - case TASK_SW_GOING_OUT: - sstate = 'G'; - break; - case TASK_SW_COMING_IN: - sstate = 'C'; - break; - case TASK_SW_UNSWAPPABLE: - sstate = 'U'; - break; - default: - sstate = '?'; - break; - } -#else /* TASK_SWAPPER */ - sstate = 'I'; -#endif /* TASK_SWAPPER */ - /*** ??? 
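db_act_stat() above encodes a thread's condition one column at a time, a character per condition with '.' for "clear". A sketch of that encoding; the bit values below are assumed for the demo, since the real TH_* definitions live in the scheduler headers:

    #include <stdio.h>

    /* Thread state bits, values assumed for the sketch. */
    #define TH_RUN   0x01
    #define TH_WAIT  0x02
    #define TH_SUSP  0x04
    #define TH_UNINT 0x08

    /* One column per condition, '.' when clear, as in db_act_stat(). */
    static char *state_str(int state, int has_stack, char buf[8])
    {
        char *p = buf;

        *p++ = (state & TH_RUN)   ? 'R' : '.';
        *p++ = (state & TH_WAIT)  ? 'W' : '.';
        *p++ = (state & TH_SUSP)  ? 'S' : '.';
        *p++ = !has_stack         ? 'O' : '.';   /* no kernel stack */
        *p++ = (state & TH_UNINT) ? 'N' : '.';
        *p = '\0';
        return buf;
    }

    int main(void)
    {
        char buf[8];

        printf("%s\n", state_str(TH_RUN, 1, buf));             /* R.... */
        printf("%s\n", state_str(TH_WAIT | TH_UNINT, 0, buf)); /* .W.ON */
        return 0;
    }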
fix me ***/ - db_printf("%3d: %0*X %0*X %3d %3d %2d %c ", - task_id, 2*sizeof(vm_offset_t), task, - 2*sizeof(vm_offset_t), task->map, - task->thread_count, - task->suspend_count, - task->priority, - sstate); - DB_TASK_NAME(task); - if (flag & OPTION_LONG) { - if (flag & OPTION_TASK_TITLE) - flag |= OPTION_THREAD_TITLE; - db_printf("\n"); - } else if (task->thread_count <= 1) - flag &= ~OPTION_INDENT; - act_id = 0; - queue_iterate(&task->threads, thr_act, thread_t, task_threads) { - db_print_act(thr_act, act_id, flag); - flag &= ~OPTION_THREAD_TITLE; - act_id++; - } - if ((flag & OPTION_LONG) == 0) - db_printf("\n"); - } else { - if (flag & OPTION_LONG) { - if (flag & OPTION_TASK_TITLE) { - db_printf(" TASK ACT\n"); - if (task->thread_count > 1) - flag |= OPTION_THREAD_TITLE; - } - } - db_printf("%3d (%0*X): ", task_id, 2*sizeof(vm_offset_t), task); - if (task->thread_count == 0) { - db_printf("no threads\n"); - } else { - if (task->thread_count > 1) { - db_printf("%d threads: \n", task->thread_count); - flag |= OPTION_INDENT; - } else - flag &= ~OPTION_INDENT; - act_id = 0; - queue_iterate(&task->threads, thr_act, - thread_t, task_threads) { - db_print_act(thr_act, act_id++, flag); - flag &= ~OPTION_THREAD_TITLE; - } - } - } -} - -void -db_print_space(task_t task, int task_id, __unused int flag) -{ - ipc_space_t space; - thread_t act = (thread_t)queue_first(&task->threads); - int count; - - count = 0; - space = task->itk_space; - if (act) - count = db_port_iterate(act, FALSE, FALSE); - db_printf("%3d: %08x %08x %08x %sactive %d\n", - task_id, task, space, task->map, - space->is_active? "":"!", count); -} - -void -db_print_task_vm(task_t task, int task_id, boolean_t title, - __unused char *modif) -{ - vm_map_t map; - pmap_t pmap; - vm_size_t size; - long resident; - long wired; - - if (title) { - db_printf("id task map pmap virtual rss pg rss mem wir pg wir mem\n"); - } - - map = task->map; - pmap = vm_map_pmap(map); - - size = db_vm_map_total_size((unsigned long)map); - resident = pmap->stats.resident_count; - wired = pmap->stats.wired_count; - - db_printf("%2d %08x %08x %08x %7dK %6d %6dK %6d %6dK\n", - task_id, - task, - map, - pmap, - size / 1024, - resident, (resident * PAGE_SIZE) / 1024, - wired, (wired * PAGE_SIZE) / 1024); -} - - -void -db_show_one_task_vm(db_expr_t addr, boolean_t have_addr, - __unused db_expr_t count, char *modif) -{ - thread_t thread; - task_t task; - int task_id; - - if (have_addr == FALSE) { - if ((thread = db_default_act) == THREAD_NULL) { - if ((thread = current_thread()) == THREAD_NULL) { - db_printf("no thread.\n"); - return; - } - } - task = thread->task; - } else { - task = (task_t)(unsigned long)addr; - } - - task_id = db_lookup_task(task); - if (task_id < 0) { - db_printf("0x%x is not a task_t\n", addr); - return; - } - - db_print_task_vm(task, task_id, TRUE, modif); -} - -void -db_show_all_task_vm(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, char *modif) -{ - task_t task; - int task_id; - boolean_t title = TRUE; - - task_id = 0; - queue_iterate(&tasks, task, task_t, tasks) { - db_print_task_vm(task, task_id, title, modif); - title = FALSE; - task_id++; - } -} - -void -db_show_all_acts(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, char *modif) -{ - task_t task; - int task_id; - int flag; - - flag = OPTION_TASK_TITLE|OPTION_INDENT; - if (db_option(modif, 'u')) - flag |= OPTION_USER; - if (db_option(modif, 'l')) - flag |= OPTION_LONG; - - task_id = 0; - queue_iterate(&tasks, task, 
task_t, tasks) { - db_print_task(task, task_id, flag); - flag &= ~OPTION_TASK_TITLE; - task_id++; - if ((flag & (OPTION_LONG|OPTION_INDENT)) == OPTION_INDENT) - db_printf("\n"); - } -} - -void -db_show_one_space(db_expr_t addr, boolean_t have_addr, - __unused db_expr_t count, char *modif) -{ - int flag; - int task_id; - task_t task; - - flag = OPTION_TASK_TITLE; - if (db_option(modif, 'u')) - flag |= OPTION_USER; - if (db_option(modif, 'l')) - flag |= OPTION_LONG; - - if (!have_addr) { - task = db_current_task(); - if (task == TASK_NULL) { - db_error("No task\n"); - /*NOTREACHED*/ - } - } else - task = (task_t)(unsigned long)addr; - - if ((task_id = db_lookup_task(task)) < 0) { - db_printf("bad task address 0x%llx\n", (unsigned long long)addr); - db_error(0); - /*NOTREACHED*/ - } - - db_printf(" ID: TASK SPACE MAP COUNT\n"); - db_print_space(task, task_id, flag); -} - -void -db_show_all_spaces(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, char *modif) -{ - task_t task; - int task_id = 0; - int flag; - - flag = OPTION_TASK_TITLE|OPTION_INDENT; - if (db_option(modif, 'u')) - flag |= OPTION_USER; - if (db_option(modif, 'l')) - flag |= OPTION_LONG; - - db_printf(" ID: TASK SPACE MAP COUNT\n"); - queue_iterate(&tasks, task, task_t, tasks) { - db_print_space(task, task_id, flag); - task_id++; - } -} - -db_addr_t -db_task_from_space( - ipc_space_t space, - int *task_id) -{ - task_t task; - int tid = 0; - - queue_iterate(&tasks, task, task_t, tasks) { - if (task->itk_space == space) { - *task_id = tid; - return (db_addr_t)(unsigned long)task; - } - tid++; - } - *task_id = 0; - return (0); -} - -void -db_show_one_act(db_expr_t addr, boolean_t have_addr, __unused db_expr_t count, - char *modif) -{ - int flag; - int act_id; - thread_t thr_act; - - flag = OPTION_THREAD_TITLE; - if (db_option(modif, 'u')) - flag |= OPTION_USER; - if (db_option(modif, 'l')) - flag |= OPTION_LONG; - - if (!have_addr) { - thr_act = current_thread(); - if (thr_act == THREAD_NULL) { - db_error("No thr_act\n"); - /*NOTREACHED*/ - } - } else - thr_act = (thread_t)(unsigned long)addr; - - if ((act_id = db_lookup_act(thr_act)) < 0) { - db_printf("bad thr_act address %#llX\n", (unsigned long long)addr); - db_error(0); - /*NOTREACHED*/ - } - - if (flag & OPTION_USER) { - db_printf("TASK%d(%0*X):\n", - db_lookup_task(thr_act->task), - 2*sizeof(vm_offset_t), thr_act->task); - db_print_act(thr_act, act_id, flag); - } else { - db_printf("task %d(%0*Xx): thr_act %d", - db_lookup_task(thr_act->task), - 2*sizeof(vm_offset_t), thr_act->task, act_id); - db_print_act(thr_act, act_id, flag); - } - if (db_option(modif, 'i') && - (thr_act->state & TH_WAIT) && - thr_act->kernel_stack == 0) { - - db_printf("Wait State: option 0x%x\n", - thr_act->ith_option); - } -} - -void -db_show_one_task(db_expr_t addr, boolean_t have_addr, - __unused db_expr_t count, char *modif) -{ - int flag; - int task_id; - task_t task; - - flag = OPTION_TASK_TITLE|OPTION_INDENT; - if (db_option(modif, 'u')) - flag |= OPTION_USER; - if (db_option(modif, 'l')) - flag |= OPTION_LONG; - - if (!have_addr) { - task = db_current_task(); - if (task == TASK_NULL) { - db_error("No task\n"); - /*NOTREACHED*/ - } - } else - task = (task_t)(unsigned long)addr; - - if ((task_id = db_lookup_task(task)) < 0) { - db_printf("bad task address 0x%llX\n", (unsigned long long)addr); - db_error(0); - /*NOTREACHED*/ - } - - db_print_task(task, task_id, flag); -} - -void -db_show_shuttle(db_expr_t addr, boolean_t have_addr, __unused db_expr_t count, - 
__unused char *modif) -{ - thread_t thread; - - if (have_addr) - thread = (thread_t)(unsigned long)addr; - else { - thread = current_thread(); - if (thread == THREAD_NULL) { - db_error("No thread\n"); - /*NOTREACHED*/ - } - } - db_printf("thread %x:\n", thread); - printf(" $task%d.%d(%x)", db_lookup_task(thread->task), - db_lookup_act(thread), thread); - db_printf("\n"); -} - -int -db_port_kmsg_count( - ipc_port_t port) -{ - return (port->ip_messages.imq_msgcount); -} - -static int db_print_ent_cnt = 0; - -void db_reset_print_entry( - void) -{ - db_print_ent_cnt = 0; -} - -void -db_print_one_entry(ipc_entry_t entry, int index, mach_port_name_t name, - boolean_t is_pset, __unused ipc_space_t space) -{ - ipc_port_t aport = (ipc_port_t)entry->ie_object; - ipc_entry_bits_t bits; - - bits = entry->ie_bits; - if (is_pset && !aport->ip_pset_count) - return; - if (db_print_ent_cnt && db_print_ent_cnt % 2 == 0) - db_printf("\n"); - if (!name) - db_printf("\t%s%d[%x]", - !is_pset && aport->ip_pset_count ? "pset" : "port", - index, - MACH_PORT_MAKE(index, IE_BITS_GEN(bits))); - else - db_printf("\t%s[%x]", - !is_pset && aport->ip_pset_count ? "pset" : "port", - name); - if (!is_pset) { - db_printf("(%s,%x,%d)", - (bits & MACH_PORT_TYPE_RECEIVE)? "r": - (bits & MACH_PORT_TYPE_SEND)? "s": "S", - aport, - db_port_kmsg_count(aport)); - db_print_ent_cnt++; - } - else { - db_printf("(%s,%x,set_count=%d,%d)", - (bits & MACH_PORT_TYPE_RECEIVE)? "r": - (bits & MACH_PORT_TYPE_SEND)? "s": "S", - aport, - aport->ip_pset_count, - db_port_kmsg_count(aport)); - db_print_ent_cnt++; - } -} - -int -db_port_iterate( - thread_t thr_act, - boolean_t is_pset, - boolean_t do_output) -{ - ipc_entry_t entry; - ipc_tree_entry_t tentry; - int index; - int size; - int count; - ipc_space_t space; - - count = 0; - space = thr_act->task->itk_space; - entry = space->is_table; - size = space->is_table_size; - db_reset_print_entry(); - for (index = 0; index < size; ++index, ++entry) { - if (entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) { - if (do_output) - db_print_one_entry(entry, - index, MACH_PORT_NULL, is_pset, space); - ++count; - } - } - for (tentry = ipc_splay_traverse_start(&space->is_tree); - tentry != ITE_NULL; - tentry = ipc_splay_traverse_next(&space->is_tree, FALSE)) { - entry = &tentry->ite_entry; - if (entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) { - if (do_output) - db_print_one_entry(entry, - 0, tentry->ite_name, is_pset, space); - ++count; - } - } - return (count); -} - -ipc_port_t -db_lookup_port( - thread_t thr_act, - int id) -{ - register ipc_space_t space; - register ipc_entry_t entry; - - if (thr_act == THREAD_NULL) - return(0); - space = thr_act->task->itk_space; - if (id < 0 || (unsigned)id >= space->is_table_size) - return(0); - entry = &space->is_table[id]; - if (entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) - return((ipc_port_t)entry->ie_object); - return(0); -} - -void -db_show_port_id(db_expr_t addr, boolean_t have_addr, __unused db_expr_t count, - char *modif) -{ - thread_t thr_act; - - if (!have_addr) { - thr_act = current_thread(); - if (thr_act == THREAD_NULL) { - db_error("No thr_act\n"); - /*NOTREACHED*/ - } - } else - thr_act = (thread_t)(unsigned long)addr; - if (db_lookup_act(thr_act) < 0) { - db_printf("Bad thr_act address 0x%llX\n", addr); - db_error(0); - /*NOTREACHED*/ - } - if (db_port_iterate(thr_act, db_option(modif,'s'), TRUE)) - db_printf("\n"); -} - -extern void db_sched(void); -/* - * Useful system state when the world has hung. 
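db_lookup_port() above is a model of defensive table access for a debugger poking at live IPC state: validate the index against the table size first, then check the rights bits before trusting the object pointer. The same shape in miniature (the rights mask and entry layout are stand-ins, not the real Mach IPC definitions):

    #include <stdio.h>

    #define PORT_RIGHTS_MASK 0x7    /* assumed mask for the sketch */

    struct entry {
        unsigned bits;
        void *object;
    };

    /* Bounds check, then rights check, then hand back the object. */
    static void *lookup(struct entry *table, unsigned size, int id)
    {
        if (id < 0 || (unsigned)id >= size)
            return NULL;
        if (table[id].bits & PORT_RIGHTS_MASK)
            return table[id].object;
        return NULL;
    }

    int main(void)
    {
        int dummy;
        struct entry t[4] = { {0, NULL}, {0x1, &dummy}, {0, NULL}, {0, NULL} };

        printf("id 1 -> %p\n", lookup(t, 4, 1));
        printf("id 9 -> %p\n", lookup(t, 4, 9));   /* out of range: NULL */
        return 0;
    }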
- */ -void -db_system_stats(void) -{ - db_sched(); - iprintf("\n"); - db_vm(); - iprintf("\n"); - iprintf("\n"); - db_printf("current_{thread/task} 0x%x 0x%x\n", - current_thread(),current_task()); -} - -void db_show_one_runq(run_queue_t runq); - -void -db_show_runq(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, __unused char *modif) -{ - processor_t proc; - run_queue_t runq; - boolean_t showedany = FALSE; - - for (proc = processor_list; proc != PROCESSOR_NULL; proc = proc->processor_list) { - runq = &proc->runq; - if (runq->count > 0) { - db_printf("PROCESSOR %x IN SET %x\n", proc, proc->processor_set); - db_show_one_runq(runq); - showedany = TRUE; - } - } - if (rt_runq.count > 0) { - db_printf("REAL TIME\n"); - db_show_one_runq(runq); - showedany = TRUE; - } - if (!showedany) - db_printf("No runnable threads\n"); -} - -void -db_show_one_runq( - run_queue_t runq) -{ - int i, task_id, thread_id; - queue_t q; - thread_t thread; - task_t task; - - printf("PRI TASK.ACTIVATION\n"); - for (i = runq->highq, q = runq->queues + i; i >= 0; i--, q--) { - if (!queue_empty(q)) { - db_printf("%3d:", i); - queue_iterate(q, thread, thread_t, links) { - task = thread->task; - task_id = db_lookup_task(task); - thread_id = db_lookup_task_act(task, thread); - db_printf(" %d.%d", task_id, thread_id); - } - db_printf("\n"); - } - } -} diff --git a/osfmk/ddb/db_print.h b/osfmk/ddb/db_print.h deleted file mode 100644 index f02d6979a..000000000 --- a/osfmk/ddb/db_print.h +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.31.1 1997/03/27 18:46:44 barbou - * ri-osc CR1566: Add db_show_one_thread() prototype. [dwm] - * [1995/08/28 15:47:07 bolinger] - * [97/02/25 barbou] - * - * Revision 1.1.16.6 1995/02/23 21:43:39 alanl - * Merge with DIPC2_SHARED. 
- * [1995/01/05 13:30:16 alanl] - * - * Revision 1.1.21.2 1994/12/09 22:11:02 dwm - * mk6 CR801 - merge up from nmk18b4 to nmk18b7 - * * Rev 1.1.16.4 1994/10/11 16:36:02 emcmanus - * Added db_show_shuttle() and db_show_runq() prototypes. - * [1994/12/09 20:36:53 dwm] - * - * Revision 1.1.21.1 1994/11/10 06:06:47 dwm - * mk6 CR764 - s/spinlock/simple_lock/ (name change only) - * [1994/11/10 05:24:14 dwm] - * - * Revision 1.1.16.3 1994/09/23 01:21:01 ezf - * change marker to not FREE - * [1994/09/22 21:10:46 ezf] - * - * Revision 1.1.16.2 1994/09/16 15:30:07 emcmanus - * Add prototype for db_show_subsystem. - * [1994/09/16 15:29:05 emcmanus] - * - * Revision 1.1.16.1 1994/06/11 21:12:10 bolinger - * Merge up to NMK17.2. - * [1994/06/11 20:04:06 bolinger] - * - * Revision 1.1.18.2 1994/12/06 19:43:09 alanl - * Intel merge, Oct 94 code drop. - * Added prototypes for db_show_{one,all}_task_vm - * [94/11/28 mmp] - * - * Revision 1.1.18.1 1994/08/05 19:35:57 mmp - * Remove duplicate prototype for db_show_port_id. - * [1994/08/05 19:31:44 mmp] - * - * Revision 1.1.10.3 1994/04/15 18:41:54 paire - * Changed db_task_from_space prototype. - * [94/03/31 paire] - * - * Revision 1.1.10.2 1994/03/07 16:37:54 paire - * Added ANSI prototype for db_port_kmsg_count routine. - * [94/02/15 paire] - * - * Revision 1.1.10.1 1994/02/08 10:58:27 bernadat - * Added db_show_one_space - * db_show_all_spaces - * db_sys - * prototypes - * [94/02/07 bernadat] - * - * Revision 1.1.2.3 1993/09/17 21:34:40 robert - * change marker to OSF_FREE_COPYRIGHT - * [1993/09/17 21:27:24 robert] - * - * Revision 1.1.2.2 1993/07/27 18:28:01 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:12:43 elliston] - * - * $EndLog$ - */ - -#ifndef _DDB_DB_PRINT_H_ -#define _DDB_DB_PRINT_H_ - -#include -#include - -/* Prototypes for functions exported by this module. 
- */ -void db_show_regs( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char *modif); - -void db_show_all_acts( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif); - -void db_show_one_act( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif); - -void db_show_one_thread( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif); - -void db_show_one_task( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif); - -void db_show_shuttle( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif); - -void db_show_port_id( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif); - -void db_show_one_task_vm( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char *modif); - -void db_show_all_task_vm( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char *modif); - -void db_show_one_space( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif); - -void db_show_all_spaces( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif); - -void db_sys(void); - -int db_port_kmsg_count( - ipc_port_t port); - -db_addr_t db_task_from_space( - ipc_space_t space, - int *task_id); - -void db_show_one_simple_lock( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif); - -void db_show_runq( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif); - -void db_show_one_lock(lock_t *); - -#endif /* !_DDB_DB_PRINT_H_ */ diff --git a/osfmk/ddb/db_run.c b/osfmk/ddb/db_run.c deleted file mode 100644 index 6c7d5be98..000000000 --- a/osfmk/ddb/db_run.c +++ /dev/null @@ -1,541 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. 
- * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ - -/* - * Commands to run process. - */ -#include -#include - -#include -#include -#include -#include -#include -#include -#include /* For db_printf() */ -#include -#include -#include - -#include - -boolean_t db_sstep_print; -int db_loop_count; -int db_call_depth; - -int db_inst_count; -int db_last_inst_count; -int db_load_count; -int db_store_count; -int db_max_inst_count = 1000; - -#ifndef db_set_single_step -void db_set_task_single_step( - register db_regs_t *regs, - task_t task); -#else -#define db_set_task_single_step(regs,task) db_set_single_step(regs) -#endif -#ifndef db_clear_single_step -void db_clear_task_single_step( - db_regs_t *regs, - task_t task); -#else -#define db_clear_task_single_step(regs,task) db_clear_single_step(regs) -#endif - -extern jmp_buf_t *db_recover; -boolean_t db_step_again(void); - -static db_addr_t db_stop_pc; -boolean_t -db_stop_at_pc( - boolean_t *is_breakpoint, - task_t task, - task_t space) -{ - register db_thread_breakpoint_t bkpt; - - db_clear_task_single_step(DDB_REGS, space); - db_clear_breakpoints(); - db_clear_watchpoints(); - db_stop_pc = PC_REGS(DDB_REGS); - -#ifdef FIXUP_PC_AFTER_BREAK - if (*is_breakpoint) { - /* - * Breakpoint trap. Fix up the PC if the - * machine requires it. - */ - FIXUP_PC_AFTER_BREAK - db_stop_pc = PC_REGS(DDB_REGS); - } -#endif - - /* - * Now check for a breakpoint at this address. 
- */ - bkpt = db_find_thread_breakpoint_here(space, db_stop_pc); - if (bkpt) { - if (db_cond_check(bkpt)) { - *is_breakpoint = TRUE; - return (TRUE); /* stop here */ - } - } - *is_breakpoint = FALSE; - - if (db_run_mode == STEP_INVISIBLE) { - db_run_mode = STEP_CONTINUE; - return (FALSE); /* continue */ - } - if (db_run_mode == STEP_COUNT) { - return (FALSE); /* continue */ - } - if (db_run_mode == STEP_ONCE) { - if (--db_loop_count > 0) { - if (db_sstep_print) { - db_print_loc_and_inst(db_stop_pc, task); - } - return (FALSE); /* continue */ - } - } - if (db_run_mode == STEP_RETURN) { - jmp_buf_t *prev; - jmp_buf_t db_jmpbuf; - /* WARNING: the following assumes an instruction fits an int */ - db_expr_t ins; - - ins = db_get_task_value(db_stop_pc, sizeof(int), FALSE, space); - - /* continue until matching return */ - - prev = db_recover; - if (_setjmp(db_recover = &db_jmpbuf) == 0) { - if (!inst_trap_return(ins) && - (!inst_return(ins) || --db_call_depth != 0)) { - if (db_sstep_print) { - if (inst_call(ins) || inst_return(ins)) { - register int i; - - db_printf("[after %6d /%4d] ", - db_inst_count, - db_inst_count - db_last_inst_count); - db_last_inst_count = db_inst_count; - for (i = db_call_depth; --i > 0; ) - db_printf(" "); - db_print_loc_and_inst(db_stop_pc, task); - db_printf("\n"); - } - } - if (inst_call(ins)) - db_call_depth++; - db_recover = prev; - if (db_step_again()) - return (FALSE); /* continue */ - } - } - db_recover = prev; - } - if (db_run_mode == STEP_CALLT) { - /* WARNING: the following assumes an instruction fits an int */ - db_expr_t ins; - ins = db_get_task_value(db_stop_pc, sizeof(int), FALSE, space); - - /* continue until call or return */ - - if (!inst_call(ins) && - !inst_return(ins) && - !inst_trap_return(ins)) { - if (db_step_again()) - return (FALSE); /* continue */ - } - } - if (db_find_breakpoint_here(space, db_stop_pc)) - return(FALSE); - db_run_mode = STEP_NONE; - return (TRUE); -} - -void -db_restart_at_pc( - boolean_t watchpt, - task_t task) -{ - db_addr_t pc = PC_REGS(DDB_REGS); -#ifdef SOFTWARE_SSTEP - db_addr_t brpc; -#endif - - - if ((db_run_mode == STEP_COUNT) || - (db_run_mode == STEP_RETURN) || - (db_run_mode == STEP_CALLT)) { - db_expr_t ins; - - /* - * We are about to execute this instruction, - * so count it now. - */ - - ins = db_get_task_value(pc, sizeof(int), FALSE, task); - db_inst_count++; - db_load_count += db_inst_load((unsigned long)ins); - db_store_count += db_inst_store((unsigned long)ins); -#ifdef SOFTWARE_SSTEP - /* Account for instructions in delay slots */ - brpc = next_instr_address(pc,1,task); - if ((brpc != pc) && (inst_branch(ins) || inst_call(ins))) { - /* Note: this ~assumes an instruction <= sizeof(int) */ - ins = db_get_task_value(brpc, sizeof(int), FALSE, task); - db_inst_count++; - db_load_count += db_inst_load(ins); - db_store_count += db_inst_store(ins); - } -#endif /* SOFTWARE_SSTEP */ - } - - if (db_run_mode == STEP_CONTINUE) { - if (watchpt || db_find_breakpoint_here(task, pc)) { - /* - * Step over breakpoint/watchpoint. - */ - db_run_mode = STEP_INVISIBLE; - db_set_task_single_step(DDB_REGS, task); - } else { - db_set_breakpoints(); - db_set_watchpoints(); - } - } else { - db_set_task_single_step(DDB_REGS, task); - } -} - -/* - * 'n' and 'u' commands might never return. - * Limit the maximum number of steps. - */ - -boolean_t -db_step_again(void) -{ - if (db_inst_count && !(db_inst_count%db_max_inst_count)) { - char c; - db_printf("%d instructions, continue ? 
(y/n) ", - db_inst_count); - c = cngetc(); - db_printf("\n"); - if(c == 'n') - return(FALSE); - } - return(TRUE); -} - -void -db_single_step(db_regs_t *regs, __unused task_t task) -{ - if (db_run_mode == STEP_CONTINUE) { - db_run_mode = STEP_INVISIBLE; - db_set_task_single_step(regs, task); - } -} - -#ifdef SOFTWARE_SSTEP -/* - * Software implementation of single-stepping. - * If your machine does not have a trace mode - * similar to the vax or sun ones you can use - * this implementation, done for the mips. - * Just define the above conditional and provide - * the functions/macros defined below. - * - * extern boolean_t - * inst_branch(), returns true if the instruction might branch - * extern unsigned - * branch_taken(), return the address the instruction might - * branch to - * db_getreg_val(); return the value of a user register, - * as indicated in the hardware instruction - * encoding, e.g. 8 for r8 - * - * next_instr_address(pc,bd,task) returns the address of the first - * instruction following the one at "pc", - * which is either in the taken path of - * the branch (bd==1) or not. This is - * for machines (mips) with branch delays. - * - * A single-step may involve at most 2 breakpoints - - * one for branch-not-taken and one for branch taken. - * If one of these addresses does not already have a breakpoint, - * we allocate a breakpoint and save it here. - * These breakpoints are deleted on return. - */ -db_breakpoint_t db_not_taken_bkpt = 0; -db_breakpoint_t db_taken_bkpt = 0; - -db_breakpoint_t -db_find_temp_breakpoint( - task_t task, - db_addr_t addr) -{ - if (db_taken_bkpt && (db_taken_bkpt->address == addr) && - db_taken_bkpt->task == task) - return db_taken_bkpt; - if (db_not_taken_bkpt && (db_not_taken_bkpt->address == addr) && - db_not_taken_bkpt->task == task) - return db_not_taken_bkpt; - return 0; -} - -void -db_set_task_single_step( - register db_regs_t *regs, - task_t task) -{ - db_addr_t pc = PC_REGS(regs), brpc; - register unsigned int inst; - register boolean_t unconditional; - - /* - * User was stopped at pc, e.g. the instruction - * at pc was not executed. - */ - inst = db_get_task_value(pc, sizeof(int), FALSE, task); - if (inst_branch(inst) || inst_call(inst)) { - extern db_expr_t getreg_val(); /* XXX -- need prototype! */ - - brpc = branch_taken(inst, pc, getreg_val, (unsigned char*)regs); - if (brpc != pc) { /* self-branches are hopeless */ - db_taken_bkpt = db_set_temp_breakpoint(task, brpc); - } else - db_taken_bkpt = 0; - pc = next_instr_address(pc,1,task); - } else - pc = next_instr_address(pc,0,task); - - /* - * check if this control flow instruction is an - * unconditional transfer - */ - - unconditional = inst_unconditional_flow_transfer(inst); - - /* - We only set the sequential breakpoint if previous instruction was not - an unconditional change of flow of control. If the previous instruction - is an unconditional change of flow of control, setting a breakpoint in the - next sequential location may set a breakpoint in data or in another routine, - which could screw up either the program or the debugger. - (Consider, for instance, that the next sequential instruction is the - start of a routine needed by the debugger.) 
- */ - if (!unconditional && db_find_breakpoint_here(task, pc) == 0 && - (db_taken_bkpt == 0 || db_taken_bkpt->address != pc)) { - db_not_taken_bkpt = db_set_temp_breakpoint(task, pc); - } else - db_not_taken_bkpt = 0; -} - -void -db_clear_task_single_step( - db_regs_t *regs, - task_t task) -{ - if (db_taken_bkpt != 0) { - db_delete_temp_breakpoint(task, db_taken_bkpt); - db_taken_bkpt = 0; - } - if (db_not_taken_bkpt != 0) { - db_delete_temp_breakpoint(task, db_not_taken_bkpt); - db_not_taken_bkpt = 0; - } -} - -#endif /* SOFTWARE_SSTEP */ - -extern int db_cmd_loop_done; - -/* single-step */ -void -db_single_step_cmd(__unused db_expr_t addr, __unused boolean_t have_addr, - db_expr_t count, char *modif) -{ - boolean_t print = FALSE; - - if (count == (db_expr_t)-1) - count = 1; - - if (modif[0] == 'p') - print = TRUE; - - db_run_mode = STEP_ONCE; - db_loop_count = (typeof(db_loop_count))count; - db_sstep_print = print; - db_inst_count = 0; - db_last_inst_count = 0; - db_load_count = 0; - db_store_count = 0; - - db_cmd_loop_done = 1; -} - -/* trace and print until call/return */ -void -db_trace_until_call_cmd(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, char *modif) -{ - boolean_t print = FALSE; - - if (modif[0] == 'p') - print = TRUE; - - db_run_mode = STEP_CALLT; - db_sstep_print = print; - db_inst_count = 0; - db_last_inst_count = 0; - db_load_count = 0; - db_store_count = 0; - - db_cmd_loop_done = 1; -} - -void -db_trace_until_matching_cmd(__unused db_expr_t addr, - __unused boolean_t have_addr, - __unused db_expr_t count, - char *modif) -{ - boolean_t print = FALSE; - - if (modif[0] == 'p') - print = TRUE; - - db_run_mode = STEP_RETURN; - db_call_depth = 1; - db_sstep_print = print; - db_inst_count = 0; - db_last_inst_count = 0; - db_load_count = 0; - db_store_count = 0; - - db_cmd_loop_done = 1; -} - -/* continue */ -void -db_continue_cmd(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, __unused char *modif) -{ - /* - * Though "cont/c" works fairly well, it's not really robust - * enough to use in arbitrary situations, so disable it. - * (Doesn't seem cost-effective to debug and fix what ails - * it.) - */ -#if 0 - if (modif[0] == 'c') - db_run_mode = STEP_COUNT; - else - db_run_mode = STEP_CONTINUE; -#else - db_run_mode = STEP_CONTINUE; -#endif - db_inst_count = 0; - db_last_inst_count = 0; - db_load_count = 0; - db_store_count = 0; - - db_cmd_loop_done = 1; -} - - -/* - * Switch to gdb - */ -static void -db_to_gdb(void) -{ - switch_debugger = 1; -} - -/* gdb */ -void -db_continue_gdb(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, __unused char *modif) -{ - db_to_gdb(); - db_run_mode = STEP_CONTINUE; - db_inst_count = 0; - db_last_inst_count = 0; - db_load_count = 0; - db_store_count = 0; - - db_cmd_loop_done = 1; -} - - -boolean_t -db_in_single_step(void) -{ - return(db_run_mode != STEP_NONE && db_run_mode != STEP_CONTINUE); -} - diff --git a/osfmk/ddb/db_run.h b/osfmk/ddb/db_run.h deleted file mode 100644 index c568f0dec..000000000 --- a/osfmk/ddb/db_run.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
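db_trace_until_matching_cmd() above arms STEP_RETURN with db_call_depth = 1; db_stop_at_pc() then raises the depth on every call and lowers it on every return, stopping only when the depth drains back to zero. The bookkeeping in isolation, with instructions reduced to tags for the demo:

    #include <stdio.h>

    /* Stand-ins for inst_call()/inst_return() classifications. */
    enum kind { PLAIN, CALL, RET };

    int main(void)
    {
        enum kind trace[] = { PLAIN, CALL, PLAIN, CALL, RET, PLAIN, RET, RET };
        int call_depth = 1;     /* as set by db_trace_until_matching_cmd() */

        for (unsigned i = 0; i < sizeof trace / sizeof trace[0]; i++) {
            if (trace[i] == CALL)
                call_depth++;
            else if (trace[i] == RET && --call_depth == 0) {
                printf("matching return at instruction %u\n", i);
                return 0;
            }
        }
        printf("ran off the end\n");
        return 0;
    }

The two nested calls are skipped over; only the final return, the one matching the frame where tracing began, stops the loop.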
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ - -/* - */ - -#ifndef _DDB_DB_RUN_H_ -#define _DDB_DB_RUN_H_ - -#include -#include -#include - - -/* Prototypes for functions exported by this module. - */ - -boolean_t db_stop_at_pc( - boolean_t *is_breakpoint, - task_t task, - task_t space); - -void db_restart_at_pc( - boolean_t watchpt, - task_t task); - -void db_single_step( - db_regs_t *regs, - task_t task); - -void db_single_step_cmd(db_expr_t, boolean_t, db_expr_t, char *); - -void db_trace_until_call_cmd(db_expr_t, boolean_t, db_expr_t, char *); - -void db_trace_until_matching_cmd(db_expr_t, boolean_t, db_expr_t, char *); - -void db_continue_cmd(db_expr_t, boolean_t, db_expr_t, char *); - -void db_continue_gdb(db_expr_t, boolean_t, db_expr_t, char *); - -boolean_t db_in_single_step(void); - -#endif /* !_DDB_DB_RUN_H_ */ diff --git a/osfmk/ddb/db_sym.c b/osfmk/ddb/db_sym.c deleted file mode 100644 index 1e118054e..000000000 --- a/osfmk/ddb/db_sym.c +++ /dev/null @@ -1,1502 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ - -#include -#include /* For strcpy(), strcmp() */ -#include -#include /* For printf() */ -#include -#include -#include -#include -#include /* For db_printf() */ - -#include /* vm_map_t */ - -/* - * Multiple symbol tables - * - * mach, bootstrap, name_server, default_pager, unix, 1 spare - */ -#define MAXNOSYMTABS 6 - -db_symtab_t db_symtabs[MAXNOSYMTABS]; -int db_nsymtab = 0; - -db_symtab_t *db_last_symtab; - -unsigned long db_maxoff = 0x4000; -extern char end; -unsigned long db_maxval = (unsigned long)&end; -natural_t db_minval = 0x1000; - -/* Prototypes for functions local to this file. XXX -- should be static! 
- */ -static char *db_qualify( - char *sym, - register char *symtabname); - -boolean_t db_eqname( - char *src, - char *dst, - unsigned c); - -boolean_t db_symbol_is_ambiguous(char *name); - -void db_shorten_filename(char **filenamep); - -void qsort_swap( - register int *a, - register int *b, - register int size); - -void qsort_rotate( - register int *a, - register int *b, - register int *c, - register int size); - -void qsort_recur( - char *left, - char *right, - int eltsize, - int (*compfun)(char *, char *)); - -void qsort_checker( - char *table, - int nbelts, - int eltsize, - int (*compfun)(char *, char *)); - -void bubble_sort( - char *table, - int nbelts, - int eltsize, - int (*compfun)(char *, char *)); - -int no_print_completion( - db_symtab_t *stab, - char *symstr ); -int no_lookup_incomplete( - db_symtab_t *stab, - char *symstr, - char **name, - int *len, - int *toadd); - -/* - * Initialization routine for ddb. - */ -void -ddb_init(void) -{ - X_db_init(); - db_machdep_init(); -} - -extern vm_map_t kernel_map; -/* - * Add symbol table, with given name, to list of symbol tables. - */ -boolean_t -db_add_symbol_table( - int type, - char *start, - char *db_end, - const char *name, - char *ref, - char *map_pointer, - unsigned long minsym, - unsigned long maxsym, - boolean_t sorted) -{ - register db_symtab_t *st; - - if (db_nsymtab >= MAXNOSYMTABS) - return (FALSE); - - st = &db_symtabs[db_nsymtab]; - st->type = type; - st->start = start; - st->end = db_end; - st->private = ref; - if (map_pointer == (char *)kernel_map || - (VM_MIN_KERNEL_ADDRESS - VM_MAX_ADDRESS > 0 && - minsym - VM_MIN_KERNEL_ADDRESS > 0)) - st->map_pointer = 0; - else - st->map_pointer = map_pointer; - strlcpy(st->name, name, sizeof (st->name)); - st->minsym = minsym; - st->maxsym = maxsym; - if (maxsym == 0) - st->sorted = FALSE; - else { - st->sorted = sorted; - if (db_maxval < maxsym + db_maxoff) - db_maxval = maxsym + db_maxoff; - } - db_nsymtab++; - - return (TRUE); -} - -/* - * db_qualify("vm_map", "ux") returns "ux::vm_map". - * - * Note: return value points to static data whose content is - * overwritten by each call... but in practice this seems okay. - */ -static char * -db_qualify( - char *symname, - register char *symtabname) -{ - static char tmp[256]; - register char *s; - - s = tmp; - while ((*s++ = *symtabname++)) { - ; - } - s[-1] = ':'; - *s++ = ':'; - while ((*s++ = *symname++)) { - ; - } - return tmp; -} - - -boolean_t -db_eqname( - char *src, - char *dst, - unsigned c) -{ - if (!strcmp(src, dst)) - return (TRUE); - if (src[0] == (char)c) - return (!strcmp(src+1,dst)); - return (FALSE); -} - -boolean_t -db_value_of_name( - const char *name, - db_expr_t *valuep) -{ - db_sym_t sym; - - sym = db_lookup(name); - if (sym == DB_SYM_NULL) - return (FALSE); - db_symbol_values(0, sym, &name, valuep); - return (TRUE); -} - -/* - * Display list of possible completions for a symbol. - */ -void -db_print_completion( - char *symstr) -{ - register int i; - int symtab_start = 0; - int symtab_end = db_nsymtab; - register char *cp; - - /* - * Look for, remove, and remember any symbol table specifier. - */ - for (cp = symstr; *cp; cp++) { - if (*cp == ':' && cp[1] == ':') { - *cp = '\0'; - for (i = 0; i < db_nsymtab; i++) { - if (! strcmp(symstr, db_symtabs[i].name)) { - symtab_start = i; - symtab_end = i + 1; - break; - } - } - *cp = ':'; - if (i == db_nsymtab) - return; - symstr = cp+2; - } - } - - /* - * Look in the specified set of symbol tables. - * Return on first match. 
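db_eqname above encodes a small but important convention: a.out-era toolchains prepend a marker character (classically '_') to C identifiers, so a user typing printf should match the table entry _printf. A self-contained equivalent, with the test harness invented for illustration:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Match src against dst, tolerating one leading marker character
 * that the toolchain prepends to C identifiers. */
static bool eqname(const char *src, const char *dst, char marker)
{
    if (strcmp(src, dst) == 0)
        return true;                      /* exact match              */
    if (src[0] == marker)
        return strcmp(src + 1, dst) == 0; /* "_name" matches "name"   */
    return false;
}

int main(void)
{
    printf("%d\n", eqname("_printf", "printf", '_')); /* 1 */
    printf("%d\n", eqname("printf",  "printf", '_')); /* 1 */
    printf("%d\n", eqname("_printf", "puts",   '_')); /* 0 */
    return 0;
}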
- */ - for (i = symtab_start; i < symtab_end; i++) { - if (X_db_print_completion(&db_symtabs[i], symstr)) - break; - } -} - -/* - * Lookup a (perhaps incomplete) symbol. - * If the symbol has a qualifier (e.g., ux::vm_map), - * then only the specified symbol table will be searched; - * otherwise, all symbol tables will be searched. - */ -int -db_lookup_incomplete( - char *symstr, - int symlen) -{ - register int i; - int symtab_start = 0; - int symtab_end = db_nsymtab; - register char *cp; - int nsym = 0; - char *name = (char *)0; - int len; - int toadd; - - /* - * Look for, remove, and remember any symbol table specifier. - */ - for (cp = symstr; *cp; cp++) { - if (*cp == ':' && cp[1] == ':') { - *cp = '\0'; - for (i = 0; i < db_nsymtab; i++) { - if (! strcmp(symstr, db_symtabs[i].name)) { - symtab_start = i; - symtab_end = i + 1; - break; - } - } - *cp = ':'; - if (i == db_nsymtab) - return 0; - symstr = cp+2; - } - } - - /* - * Look in the specified set of symbol tables. - * Return on first match. - */ - for (i = symtab_start; i < symtab_end; i++) { - nsym = X_db_lookup_incomplete(&db_symtabs[i], symstr, - &name, &len, &toadd); - if (nsym > 0) { - if (toadd > 0) { - len = strlen(symstr); - if (len + toadd >= symlen) - return 0; - bcopy(&name[len], &symstr[len], toadd); - symstr[len + toadd] = '\0'; - } - break; - } - } - return nsym; -} - -/* - * Lookup a symbol. - * If the symbol has a qualifier (e.g., ux::vm_map), - * then only the specified symbol table will be searched; - * otherwise, all symbol tables will be searched. - */ -db_sym_t -db_lookup(const char *symstr) -{ - db_sym_t sp; - int i; - int symtab_start = 0; - int symtab_end = db_nsymtab; - char *cp; - - /* - * Look for, remove, and remember any symbol table specifier. - */ - for (cp = symstr; *cp; cp++) { - if (*cp == ':' && cp[1] == ':') { - *cp = '\0'; - for (i = 0; i < db_nsymtab; i++) { - if (! strcmp(symstr, db_symtabs[i].name)) { - symtab_start = i; - symtab_end = i + 1; - break; - } - } - *cp = ':'; - if (i == db_nsymtab) - db_error("Invalid symbol table name\n"); - symstr = cp+2; - } - } - - /* - * Look in the specified set of symbol tables. - * Return on first match. - */ - for (i = symtab_start; i < symtab_end; i++) { - if ((sp = X_db_lookup(&db_symtabs[i], symstr))) { - db_last_symtab = &db_symtabs[i]; - return sp; - } - } - return 0; -} - -/* - * Print a symbol completion - */ -void -db_sym_print_completion( - db_symtab_t *stab, - char *name, - int function, - char *fname, - int line) -{ - if (stab != db_symtabs) - db_printf("%s::", stab->name); - db_printf(name); - if (function) { - db_putchar('('); - db_putchar(')'); - } - if (fname) { - db_printf(" [static from %s", fname); - if (line > 0) - db_printf(":%d", line); - db_putchar(']'); - } - db_putchar('\n'); -} - -/* - * Common utility routine to parse a symbol string into a file - * name, a (possibly incomplete) symbol name without line number. - * This routine is called from aout_db_print_completion if the object - * dependent handler supports qualified search with a file name. - * It parses the symbol string, and call an object dependent routine - * with parsed file name and symbol name. 
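db_lookup, db_lookup_incomplete, and db_print_completion all repeat the same prologue: scan for a "tab::symbol" qualifier and, if one is present, restrict the search to that single table. A sketch of the shared idiom, assuming a simple array of table names in place of db_symtabs; unlike the originals, it uses strncmp rather than temporarily writing a NUL into the input, so the caller's string is never mutated.

#include <stdio.h>
#include <string.h>

static const char *symtab_names[] = { "mach", "bootstrap", "unix" };
#define NSYMTAB (int)(sizeof symtab_names / sizeof symtab_names[0])

/* Parse an optional "tab::" prefix.  On return, [*lo, *hi) is the
 * range of tables to search (one table if qualified, all if not),
 * and the result points at the bare symbol name.  Returns NULL for
 * an unknown table name. */
static const char *parse_qualifier(const char *symstr, int *lo, int *hi)
{
    const char *sep = strstr(symstr, "::");

    *lo = 0;
    *hi = NSYMTAB;
    if (sep == NULL)
        return symstr;                       /* unqualified */
    for (int i = 0; i < NSYMTAB; i++) {
        size_t n = strlen(symtab_names[i]);
        if ((size_t)(sep - symstr) == n &&
            strncmp(symstr, symtab_names[i], n) == 0) {
            *lo = i;
            *hi = i + 1;
            return sep + 2;                  /* skip "::" */
        }
    }
    return NULL;                             /* no such table */
}

int main(void)
{
    int lo, hi;
    const char *name = parse_qualifier("unix::vm_map", &lo, &hi);

    if (name)
        printf("%s in tables [%d,%d)\n", name, lo, hi); /* vm_map in [2,3) */
    return 0;
}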
- */ -int -db_sym_parse_and_print_completion( - int (*func)(db_symtab_t *, - char *), - db_symtab_t *symtab, - char *symstr) -{ - register char *p; - register int n; - char *sym_name; - char *component[2]; - int nsym; - - /* - * disassemble the symbol into components: [file_name:]symbol - */ - component[0] = symstr; - component[1] = 0; - for (p = symstr, n = 1; *p; p++) { - if (*p == ':') { - if (n == 2) - break; - *p = 0; - component[n++] = p+1; - } - } - if (*p == 0) { - if (n == 1) { - sym_name = component[0]; - } else { - sym_name = component[1]; - } - nsym = func(symtab, sym_name); - } else - nsym = 0; - if (n == 2) - component[1][-1] = ':'; - return nsym; -} - -/* - * Common utility routine to parse a symbol string into a file - * name, a (possibly incomplete) symbol name without line number. - * This routine is called from X_db_lookup_incomplete if the object - * dependent handler supports qualified search with a file name. - * It parses the symbol string, and call an object dependent routine - * with parsed file name and symbol name. - */ -int -db_sym_parse_and_lookup_incomplete( - int (*func)(db_symtab_t *, - char *, - char *, - int, - db_sym_t*, - char **, - int *), - db_symtab_t *symtab, - char *symstr, - char **name, - int *len, - int *toadd) -{ - register char *p; - register int n; - char *file_name = 0; - char *sym_name = 0; - char *component[2]; - int nsym = 0; - - /* - * disassemble the symbol into components: [file_name:]symbol - */ - component[0] = symstr; - component[1] = 0; - for (p = symstr, n = 1; *p; p++) { - if (*p == ':') { - if (n == 2) - break; - *p = 0; - component[n++] = p+1; - } - } - if (*p == 0) { - if (n == 1) { - file_name = 0; - sym_name = component[0]; - } else { - file_name = component[0]; - sym_name = component[1]; - } - nsym = func(symtab, file_name, sym_name, 0, (db_sym_t *)0, - name, len); - if (nsym > 0) - *toadd = *len - strlen(sym_name); - } - if (n == 2) - component[1][-1] = ':'; - return(nsym); -} - -/* - * Common utility routine to parse a symbol string into a file - * name, a symbol name and line number. - * This routine is called from aout_db_lookup if the object dependent - * handler supports qualified search with a file name or a line number. - * It parses the symbol string, and call an object dependent routine - * with parsed file name, symbol name and line number. 
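The toadd protocol used by the incomplete-lookup path above is easy to miss: the back end reports the full candidate name plus how many characters the user's buffer is missing, and the caller splices exactly that suffix in, refusing if the buffer would overflow. A sketch under those assumptions (buf must already hold a true prefix of full, as db_lookup_incomplete guarantees):

#include <stdio.h>
#include <string.h>

/* Append the completion suffix of `full` onto `buf`, which already
 * holds a prefix of `full`.  Mirrors the bounds check in
 * db_lookup_incomplete: fail rather than overflow.  Returns 1 on
 * success, 0 if buf cannot hold the completed name plus its NUL. */
static int complete_into(char *buf, size_t buflen, const char *full)
{
    size_t len = strlen(buf);
    size_t toadd = strlen(full) - len;   /* characters still missing */

    if (len + toadd >= buflen)
        return 0;
    memcpy(buf + len, full + len, toadd);
    buf[len + toadd] = '\0';
    return 1;
}

int main(void)
{
    char buf[32] = "vm_ma";

    if (complete_into(buf, sizeof buf, "vm_map_lookup"))
        printf("completed to: %s\n", buf);   /* vm_map_lookup */
    return 0;
}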
- */ -db_sym_t -db_sym_parse_and_lookup( - int (*func)(db_symtab_t *, char *, char *, int, - db_sym_t*, char **, int *), - db_symtab_t *symtab, - char *symstr) -{ - register char *p; - register int n; - int n_name; - int line_number; - char *file_name = 0; - char *sym_name = 0; - char *component[3]; - db_sym_t found = DB_SYM_NULL; - - /* - * disassemble the symbol into components: - * [file_name:]symbol[:line_nubmer] - */ - component[0] = symstr; - component[1] = component[2] = 0; - for (p = symstr, n = 1; *p; p++) { - if (*p == ':') { - if (n >= 3) - break; - *p = 0; - component[n++] = p+1; - } - } - if (*p != 0) - goto out; - line_number = 0; - n_name = n; - p = component[n-1]; - if (*p >= '0' && *p <= '9') { - if (n == 1) - goto out; - for (line_number = 0; *p; p++) { - if (*p < '0' || *p > '9') - goto out; - line_number = line_number*10 + *p - '0'; - } - n_name--; - } else if (n >= 3) - goto out; - if (n_name == 1) { - for (p = component[0]; *p && *p != '.'; p++); - if (*p == '.') { - file_name = component[0]; - sym_name = 0; - } else { - file_name = 0; - sym_name = component[0]; - } - } else { - file_name = component[0]; - sym_name = component[1]; - } - (void) func(symtab, file_name, sym_name, line_number, &found, - (char **)0, (int *)0); - -out: - while (--n >= 1) - component[n][-1] = ':'; - return(found); -} - -/* - * Does this symbol name appear in more than one symbol table? - * Used by db_symbol_values to decide whether to qualify a symbol. - */ -boolean_t db_qualify_ambiguous_names = TRUE; - -boolean_t -db_symbol_is_ambiguous(char *name) -{ - register int i; - register - boolean_t found_once = FALSE; - - if (!db_qualify_ambiguous_names) - return FALSE; - - for (i = 0; i < db_nsymtab; i++) { - if (X_db_lookup(&db_symtabs[i], name)) { - if (found_once) - return TRUE; - found_once = TRUE; - } - } - return FALSE; -} - -/* - * Find the closest symbol to val, and return its name - * and the difference between val and the symbol found. - */ -unsigned int db_search_maxoff = 0x4000; -db_sym_t -db_search_task_symbol( - register db_addr_t val, - db_strategy_t strategy, - db_addr_t *offp, /* better be unsigned */ - task_t task) -{ - db_addr_t diff, newdiff; - register int i; - db_symtab_t *sp; - db_sym_t ret = DB_SYM_NULL, sym; - vm_map_t map_for_val; - - if (task == TASK_NULL) - task = db_current_task(); - map_for_val = (task == TASK_NULL)? VM_MAP_NULL: task->map; -again: - newdiff = diff = -1; - db_last_symtab = 0; - for (sp = &db_symtabs[0], i = 0; - i < db_nsymtab; - sp++, i++) { - if ((((vm_map_t)sp->map_pointer == VM_MAP_NULL) || - ((vm_map_t)sp->map_pointer == map_for_val)) && - ((sp->maxsym == 0) || - ((val >= (db_addr_t)sp->minsym) && - (val <= (db_addr_t)sp->maxsym)))) { - sym = X_db_search_symbol(sp, val, strategy, - (db_expr_t *)&newdiff); - if (newdiff < diff) { - db_last_symtab = sp; - diff = newdiff; - ret = sym; - if (diff <= db_search_maxoff) - break; - } - } - } - if (ret == DB_SYM_NULL && map_for_val != VM_MAP_NULL) { - map_for_val = VM_MAP_NULL; - goto again; - } - *offp = diff; - return ret; -} - -/* - * Find the closest symbol to val, and return its name - * and the difference between val and the symbol found. - * Also return the filename and linenumber if available. 
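db_sym_parse_and_lookup above accepts the grammar [file_name:]symbol[:line_number]: a trailing all-digit component is a line number, and a lone component is taken as a file name when it contains a dot, otherwise as a symbol. A compact sketch of the same classification, parsing into caller buffers instead of mutating the input in place; all names below are invented for illustration, and two simplifications relative to the original are flagged in comments.

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Split "file.c:sym:42"-style strings into file, symbol, and line. */
static void parse_symspec(const char *spec, char *file, char *sym, int *line)
{
    char parts[3][64] = { "", "", "" };
    int n = 0;

    file[0] = sym[0] = '\0';
    *line = 0;

    /* Split on ':' into at most three components. */
    for (const char *p = spec; n < 3; n++) {
        const char *colon = strchr(p, ':');
        size_t len = colon ? (size_t)(colon - p) : strlen(p);

        if (len >= sizeof parts[0])
            len = sizeof parts[0] - 1;
        memcpy(parts[n], p, len);
        parts[n][len] = '\0';
        if (!colon) { n++; break; }
        p = colon + 1;
    }

    /* A trailing numeric component is a line number.  (Checking the
     * first digit suffices here; the original validated every
     * character and rejected mixed components.) */
    if (n > 1 && isdigit((unsigned char)parts[n - 1][0])) {
        *line = atoi(parts[n - 1]);
        n--;
    }
    if (n == 1) {
        /* One name: a '.' marks a file, otherwise a symbol. */
        if (strchr(parts[0], '.')) strcpy(file, parts[0]);
        else strcpy(sym, parts[0]);
    } else {
        /* A third non-numeric component is silently dropped here;
         * the original failed the whole lookup instead. */
        strcpy(file, parts[0]);
        strcpy(sym, parts[1]);
    }
}

int main(void)
{
    char file[64], sym[64];
    int line;

    parse_symspec("db_sym.c:db_lookup:120", file, sym, &line);
    printf("file=%s sym=%s line=%d\n", file, sym, line);
    return 0;
}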
- */ -db_sym_t -db_search_task_symbol_and_line( - register db_addr_t val, - __unused db_strategy_t strategy, - db_expr_t *offp, - char **filenamep, - int *linenump, - task_t task, - int *argsp) -{ - db_addr_t diff, newdiff; - register int i; - db_symtab_t *sp; - db_sym_t ret = DB_SYM_NULL, sym; - vm_map_t map_for_val; - char *func; - char *filename; - int linenum; - int args; - - if (task == TASK_NULL) - task = db_current_task(); - map_for_val = (task == TASK_NULL)? VM_MAP_NULL: task->map; - *filenamep = (char *) 0; - *linenump = 0; - *argsp = -1; - again: - filename = (char *) 0; - linenum = 0; - newdiff = diff = ~0UL; - db_last_symtab = 0; - for (sp = &db_symtabs[0], i = 0; - i < db_nsymtab; - sp++, i++) { - if ((((vm_map_t)sp->map_pointer == VM_MAP_NULL) || - ((vm_map_t)sp->map_pointer == map_for_val)) && - ((sp->maxsym == 0) || - ((val >= (db_addr_t)sp->minsym) && - (val <= (db_addr_t)sp->maxsym)))) { - - sym = X_db_search_by_addr(sp, val, &filename, &func, - &linenum, (db_expr_t *)&newdiff, - &args); - if (sym && newdiff < diff) { - db_last_symtab = sp; - diff = newdiff; - ret = sym; - *filenamep = filename; - *linenump = linenum; - *argsp = args; - if (diff <= db_search_maxoff) - break; - } - } - } - if (ret == DB_SYM_NULL && map_for_val != VM_MAP_NULL) { - map_for_val = VM_MAP_NULL; - goto again; - } - *offp = diff; - if (*filenamep) - db_shorten_filename(filenamep); - return ret; -} - -/* - * Return name and value of a symbol - */ -void -db_symbol_values( - db_symtab_t *stab, - db_sym_t sym, - const char **namep, - db_expr_t *valuep) -{ - db_expr_t value; - char *name; - - if (sym == DB_SYM_NULL) { - *namep = 0; - return; - } - if (stab == 0) - stab = db_last_symtab; - - X_db_symbol_values(stab, sym, &name, &value); - - if (db_symbol_is_ambiguous(name)) { - *namep = db_qualify(name, db_last_symtab->name); - } else { - *namep = name; - } - if (valuep) - *valuep = value; -} - - -/* - * Print the closest symbol to a value - * - * After matching the symbol according to the given strategy - * we print it in the name+offset format, provided the symbol's - * value is close enough (e.g. smaller than db_maxoff). - * We also attempt to print [filename:linenum] when applicable - * (e.g. for procedure names). - * - * If we could not find a reasonable name+offset representation, - * then we just print the value in hex. Small values might get - * bogus symbol associations, e.g. 3 might get some absolute - * value like _INCLUDE_VERSION or something, therefore we do - * not accept symbols whose value is zero (and use plain hex). - */ - -void -db_task_printsym( - db_addr_t off, - db_strategy_t strategy, - task_t task) -{ - db_expr_t d; - char *filename; - char *name; - db_expr_t value; - int linenum; - db_sym_t cursym; - - if (off >= db_maxval || off < db_minval) { - db_printf("%#lln", (unsigned long long)off); - return; - } - cursym = db_search_task_symbol(off, strategy, &d, task); - - db_symbol_values(0, cursym, &name, &value); - if (name == 0 || d >= db_maxoff || value == 0) { - db_printf("%#lln", (unsigned long long)off); - return; - } - db_printf("%s", name); - if (d) - db_printf("+%llx", (unsigned long long)d); - if (strategy == DB_STGY_PROC) { - if (db_line_at_pc(cursym, &filename, &linenum, off)) { - db_printf(" [%s", filename); - if (linenum > 0) - db_printf(":%d", linenum); - db_printf("]"); - } - } -} - -/* - * Return symbol name for a given offset and - * change the offset to be relative to this symbol. - * Very useful for xpr, when you want to log offsets - * in a user-friendly way.
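Both search routines above implement the same policy: among tables whose address range covers val (and whose map matches), keep the candidate with the smallest val-minus-symbol difference, and stop early once the difference is within db_search_maxoff. Reduced to a single sorted table, the core is just a nearest-preceding-entry scan; a sketch, with the table contents invented for illustration:

#include <stdio.h>

struct sym { unsigned long addr; const char *name; };

/* Illustrative symbol table, sorted by address. */
static const struct sym syms[] = {
    { 0x1000, "_start" },
    { 0x140c, "main"   },
    { 0x1a80, "helper" },
};
#define NSYMS (sizeof syms / sizeof syms[0])

/* Find the symbol at or below addr; *offp receives addr - sym.
 * Returns NULL if addr precedes every symbol (caller prints hex). */
static const struct sym *nearest_symbol(unsigned long addr, unsigned long *offp)
{
    const struct sym *best = NULL;

    for (size_t i = 0; i < NSYMS && syms[i].addr <= addr; i++)
        best = &syms[i];
    if (best)
        *offp = addr - best->addr;
    return best;
}

int main(void)
{
    unsigned long off;
    const struct sym *s = nearest_symbol(0x1a90, &off);

    /* Print name+offset only when the offset is plausible, as
     * db_task_printsym does with its db_maxoff cutoff; otherwise
     * the raw address would be printed instead. */
    if (s && off < 0x4000)
        printf("%s+0x%lx\n", s->name, off);   /* helper+0x10 */
    return 0;
}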
- */ - -char null_sym[] = ""; - -char * -db_get_sym(db_expr_t *off) -{ - db_sym_t cursym; - db_expr_t value; - char *name; - db_addr_t d; - - cursym = db_search_symbol(*off, DB_STGY_ANY, &d); - db_symbol_values(0, cursym, &name, &value); - if (name) - *off = d; - else - name = null_sym; - return(name); -} - -void -db_printsym( - db_expr_t off, - db_strategy_t strategy) -{ - db_task_printsym(off, strategy, TASK_NULL); -} - -int db_short_filename = 1; - -void -db_shorten_filename(char **filenamep) -{ - char *cp, *cp_slash; - - if (! *filenamep) - return; - for (cp = cp_slash = *filenamep; *cp; cp++) { - if (*cp == '/') - cp_slash = cp; - } - if (*cp_slash == '/') - *filenamep = cp_slash+1; -} - -int -db_task_getlinenum( - db_expr_t off, - task_t task) -{ - db_addr_t d; - char *filename; - char *name; - db_expr_t value; - int linenum; - db_sym_t cursym; - db_strategy_t strategy = DB_STGY_PROC; - - if (off >= db_maxval || off < db_minval) { - db_printf("%#lln", (unsigned long long)off); - return(-1); - } - cursym = db_search_task_symbol(off, strategy, &d, task); - - db_symbol_values(0, cursym, &name, &value); - if (name == 0 || d >= db_maxoff || value == 0) { - return(-1); - } - if (db_line_at_pc(cursym, &filename, &linenum, off)) - return(linenum); - else - return(-1); -} - -boolean_t -db_line_at_pc( - db_sym_t sym, - char **filename, - int *linenum, - db_expr_t pc) -{ - boolean_t result; - - if (db_last_symtab == 0) - return FALSE; - if (X_db_line_at_pc( db_last_symtab, sym, filename, linenum, pc)) { - if (db_short_filename) - db_shorten_filename(filename); - result = TRUE; - } else - result = FALSE; - return(result); -} - -int qsort_check = 0; - -void -db_qsort( - char *table, - int nbelts, - int eltsize, - int (*compfun)(char *, char *)) -{ - if (nbelts <= 0 || eltsize <= 0 || compfun == 0) { - printf("qsort: invalid parameters\n"); - return; - } - qsort_recur(table, table + nbelts * eltsize, eltsize, compfun); - - if (qsort_check) - qsort_checker(table, nbelts, eltsize, compfun); -} - -void -qsort_swap( - register int *a, - register int *b, - register int size) -{ - register int temp; - char *aa, *bb; - char ctemp; - - for (; size >= (signed)sizeof (int); size -= sizeof (int), a++, b++) { - temp = *a; - *a = *b; - *b = temp; - } - aa = (char *)a; - bb = (char *)b; - for (; size > 0; size--, aa++, bb++) { - ctemp = *aa; - *aa = *bb; - *bb = ctemp; - } -} - -/* rotate the three elements to the left */ -void -qsort_rotate( - register int *a, - register int *b, - register int *c, - register int size) -{ - register int temp; - char *aa, *bb, *cc; - char ctemp; - - for (; size >= (signed)sizeof(int); - size -= sizeof(int), a++, b++, c++) { - temp = *a; - *a = *c; - *c = *b; - *b = temp; - } - aa = (char *)a; - bb = (char *)b; - cc = (char *)c; - for (; size > 0; size--, aa++, bb++, cc++) { - ctemp = *aa; - *aa = *cc; - *cc = *bb; - *bb = ctemp; - } -} - -void -qsort_recur( - char *left, - char *right, - int eltsize, - int (*compfun)(char *, char *)) -{ - char *i, *j; - char *sameleft, *sameright; - - top: - if (left + eltsize - 1 >= right) { - return; - } - - /* partition element (reference for "same"ness */ - sameleft = left + (((right - left) / eltsize) / 2) * eltsize; - sameright = sameleft; - - i = left; - j = right - eltsize; - - again: - while (i < sameleft) { - int comp; - - comp = (*compfun)(i, sameleft); - if (comp == 0) { - /* - * Move to the "same" partition. 
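qsort_swap and qsort_rotate above are element-size-generic: they move int-sized words while the remaining size permits, then finish byte by byte, so any element size works without a temporary buffer. The same idiom, self-contained; note that the deleted code casts char* to int* without an alignment check, so the sketch likewise assumes suitably aligned elements, as the debugger's callers guaranteed.

#include <stdio.h>

/* Swap two elements of `size` bytes: whole ints first, then any
 * remaining tail bytes.  Assumes a and b are int-aligned. */
static void swap_elems(void *a, void *b, int size)
{
    int *ia = a, *ib = b;

    for (; size >= (int)sizeof(int); size -= (int)sizeof(int)) {
        int t = *ia; *ia++ = *ib; *ib++ = t;
    }
    char *ca = (char *)ia, *cb = (char *)ib;
    for (; size > 0; size--) {
        char t = *ca; *ca++ = *cb; *cb++ = t;
    }
}

int main(void)
{
    /* Records with an int word plus a short tail (padding may round
     * the struct size up; the swap handles whatever size it is). */
    struct rec { int key; char tag[2]; } x = {1, "ab"}, y = {2, "cd"};

    swap_elems(&x, &y, (int)sizeof x);
    printf("%d %.2s / %d %.2s\n", x.key, x.tag, y.key, y.tag); /* 2 cd / 1 ab */
    return 0;
}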
- */ - /* - * Shift the left part of the "same" partition to - * the left, so that "same" elements stay in their - * original order. - */ - sameleft -= eltsize; - qsort_swap((int *) i, (int *) sameleft, eltsize); - } else if (comp < 0) { - /* - * Stay in the "left" partition. - */ - i += eltsize; - } else { - /* - * Should be moved to the "right" partition. - * Wait until the next loop finds an appropriate - * place to store this element. - */ - break; - } - } - - while (j > sameright) { - int comp; - - comp = (*compfun)(sameright, j); - if (comp == 0) { - /* - * Move to the right of the "same" partition. - */ - sameright += eltsize; - qsort_swap((int *) sameright, (int *) j, eltsize); - } else if (comp > 0) { - /* - * Move to the "left" partition. - */ - if (i == sameleft) { - /* - * Unfortunately, the "left" partition - * has already been fully processed, so - * we have to shift the "same" partition - * to the right to free a "left" element. - * This is done by moving the leftest same - * to the right of the "same" partition. - */ - sameright += eltsize; - qsort_rotate((int *) sameleft, (int*) sameright, - (int *) j, eltsize); - sameleft += eltsize; - i = sameleft; - } else { - /* - * Swap with the "left" partition element - * waiting to be moved to the "right" - * partition. - */ - qsort_swap((int *) i, (int *) j, eltsize); - j -= eltsize; - /* - * Go back to the 1st loop. - */ - i += eltsize; - goto again; - } - } else { - /* - * Stay in the "right" partition. - */ - j -= eltsize; - } - } - - if (i != sameleft) { - /* - * The second loop completed (the"right" partition is ok), - * but we have to go back to the first loop, and deal with - * the element waiting for a place in the "right" partition. - * Let's shift the "same" zone to the left. - */ - sameleft -= eltsize; - qsort_rotate((int *) sameright, (int *) sameleft, (int *) i, - eltsize); - sameright -= eltsize; - j = sameright; - /* - * Go back to 1st loop. - */ - goto again; - } - - /* - * The partitions are correct now. Recur on the smallest side only. - */ - if (sameleft - left >= right - (sameright + eltsize)) { - qsort_recur(sameright + eltsize, right, eltsize, compfun); - /* - * The "right" partition is now completely sorted. - * The "same" partition is OK, so... - * Ignore them, and start the loops again on the - * "left" partition. - */ - right = sameleft; - goto top; - } else { - qsort_recur(left, sameleft, eltsize, compfun); - /* - * The "left" partition is now completely sorted. - * The "same" partition is OK, so ... - * Ignore them, and start the loops again on the - * "right" partition. 
- */ - left = sameright + eltsize; - goto top; - } -} - -void -qsort_checker( - char *table, - int nbelts, - int eltsize, - int (*compfun)(char *, char *)) -{ - char *curr, *prev, *last; - - prev = table; - curr = prev + eltsize; - last = table + (nbelts * eltsize); - - while (prev < last) { - if ((*compfun)(prev, curr) > 0) { - printf("**** qsort_checker: error between 0x%x and 0x%x!!!\n", prev, curr); - break; - } - prev = curr; - curr += eltsize; - } - printf("qsort_checker: OK\n"); -} - -int qsort_search_debug = 0; - -void -db_qsort_limit_search( - char *target, - char **start, - char **db_end, - int eltsize, - int (*compfun)(char *, char *)) -{ - register char *left, *right; - char *oleft, *oright, *part; - int nbiter = 0; - int comp; - - oleft = left = *start; - oright = right = *db_end; - part = (char *) 0; - - while (left < right) { - nbiter++; - part = left + (((right - left) / eltsize) / 2) * eltsize; - comp = (*compfun)(target, part); - if (comp > 0) { - oleft = left; - oright = right; - left = part; - if (left == oleft) - break; - if (qsort_search_debug > 1) - printf(" [ Moved left from 0x%x to 0x%x]\n", - oleft, left); - } else if (comp < 0) { - oright = right; - oleft = left; - right = part; - if (qsort_search_debug > 1) - printf(" [ Moved right from 0x%x to 0x%x]\n", - oright, right); - } else { - if (qsort_search_debug > 1) - printf(" [ FOUND! left=0x%x right=0x%x]\n", - left, right); - for (left = part; - left > *start && (*compfun)(left, part) == 0; - left -= eltsize); - for (right = part + eltsize; - right < *db_end && (*compfun)(right, part) == 0; - right += eltsize); - oright = right; - oleft = left; - break; - } - } - - if (qsort_search_debug) - printf("[ Limited from %x-%x to %x-%x in %d iters ]\n", - *start, *db_end, oleft, oright, nbiter); - *start = oleft; - *db_end = oright; -} - -void -bubble_sort( - char *table, - int nbelts, - int eltsize, - int (*compfun)(char *, char *)) -{ - boolean_t sorted; - char *b_end; - register char *p; - - b_end = table + ((nbelts-1) * eltsize); - do { - sorted = TRUE; - for (p = table; p < b_end; p += eltsize) { - if ((*compfun)(p, p + eltsize) > 0) { - qsort_swap((int *) p, (int *) (p + eltsize), - eltsize); - sorted = FALSE; - } - } - } while (sorted == FALSE); - - if (qsort_check) - qsort_checker(table, nbelts, eltsize, compfun); -} - -vm_offset_t vm_min_inks_addr = VM_MAX_KERNEL_ADDRESS; - -void -db_install_inks( - vm_offset_t base) -{ - /* save addr to demarcate kernel/inks boundary (1st time only) */ - if (vm_min_inks_addr == VM_MAX_KERNEL_ADDRESS) { - vm_min_inks_addr = base; - db_qualify_ambiguous_names = TRUE; - } -} - -extern void db_clone_offsetXXX(char *, long); - -void -db_clone_symtabXXX( - char *clonee, /* which symtab to clone */ - char *cloner, /* in-kernel-server name */ - vm_offset_t base) /* base address of cloner */ -{ - db_symtab_t *st, *st_src; - char * memp; - vm_size_t size; - long offset; - - if (db_nsymtab >= MAXNOSYMTABS) { - db_printf("db_clone_symtab: Too Many Symbol Tables\n"); - return; - } - - db_install_inks(base); - - st = &db_symtabs[db_nsymtab]; /* destination symtab */ - if ((st_src = db_symtab_cloneeXXX(clonee)) == 0) { - db_printf("db_clone_symtab: clonee (%s) not found\n", clonee); - return; - } - /* alloc new symbols */ - size = (vm_size_t)(st_src->end - st_src->private); - memp = (char *)kalloc( round_page(size) ); - if (!memp) { - db_printf("db_clone_symtab: no memory for symtab\n"); - return; - } - - *st = *st_src; /* bulk copy src -> dest */ - strlcpy(st->name, cloner, sizeof 
(st->name)); /* new name */ - st->private = memp; /* copy symbols */ - bcopy((const char *)st_src->private, st->private, size); - st->start = memp + sizeof(int); /* fixup pointers to symtab */ - st->end = memp + *(int *)memp; - st->map_pointer = 0; /* no map because kernel-loaded */ - - /* Offset symbols, leaving strings pointing into st_src */ - offset = base - st_src->minsym; - st->minsym += offset; - st->maxsym += offset; - db_clone_offsetXXX(memp, offset); - db_nsymtab++; - - db_printf( "[ cloned symbol table for %s: range 0x%x to 0x%x %s]\n", - st->name, st->minsym, st->maxsym, - st->sorted ? "(sorted) " : ""); - db_maxval = (unsigned int)st->maxsym + db_maxoff; -} - -db_symtab_t * -db_symtab_cloneeXXX( - char *clonee) -{ - db_symtab_t *st, *st_src; - - st = &db_symtabs[db_nsymtab]; /* destination symtab */ - for (st_src = &db_symtabs[0]; st_src < st; ++st_src) - if (!strcmp(clonee, st_src->name)) - break; - return ((st_src < st) ? st_src : 0); -} - -/* - * Switch into symbol-table specific routines - */ - -#if !defined(__alpha) && !defined(INTEL860) -#define DB_NO_COFF -#endif - -#ifndef DB_NO_AOUT -#include -#endif - -#ifndef DB_NO_COFF -#include -#endif - -static void no_init(void) - -{ - db_printf("Non-existent code for ddb init\n"); -} - -static boolean_t -no_sym_init(__unused char *nstart, __unused char *nend, const char *name, - __unused char *task_addr) -{ - db_printf("Non-existent code for init of symtab %s\n", name); - return FALSE; -} - -static db_sym_t -no_lookup(__unused db_symtab_t *stab, char *symstr) -{ - db_printf("Bogus lookup of symbol %s\n", symstr); - return DB_SYM_NULL; -} - -static db_sym_t -no_search(__unused db_symtab_t *stab, db_addr_t off, - __unused db_strategy_t strategy, __unused db_expr_t *diffp) -{ - db_printf("Bogus search for offset %#llXn", (unsigned long long)off); - return DB_SYM_NULL; -} - -static boolean_t -no_line_at_pc(__unused db_symtab_t *stab, __unused db_sym_t sym, - __unused char **file, __unused int *line, db_expr_t pc) -{ - db_printf("Bogus search for pc %#llX\n", (unsigned long long)pc); - return FALSE; -} - -static void -no_symbol_values(__unused db_sym_t sym, char **namep, db_expr_t *valuep) -{ - db_printf("Bogus symbol value resolution\n"); - if (namep) *namep = NULL; - if (valuep) *valuep = 0; -} - -static db_sym_t -no_search_by_addr(__unused db_symtab_t *stab, db_addr_t off, - __unused char **file, __unused char **func, - __unused int *line, __unused db_expr_t *diffp, - __unused int *args) -{ - db_printf("Bogus search for address %#llX\n", (unsigned long long)off); - return DB_SYM_NULL; -} - -int -no_print_completion(__unused db_symtab_t *stab, __unused char *symstr) -{ - db_printf("Bogus print completion: not supported\n"); - return 0; -} - -int -no_lookup_incomplete(__unused db_symtab_t *stab, - __unused char *symstr, __unused char **name, - __unused int *len, __unused int *toadd) -{ - db_printf("Bogus lookup incomplete: not supported\n"); - return 0; -} - -#define NONE \ - { \ - .init = no_init, \ - .sym_init = no_sym_init, \ - .lookup = no_lookup, \ - .search_symbol = no_search, \ - .line_at_pc = no_line_at_pc, \ - .symbol_values = no_symbol_values, \ - .search_by_addr = no_search_by_addr, \ - .print_completion = no_print_completion, \ - .lookup_incomplete = no_lookup_incomplete, \ - } - -struct db_sym_switch x_db[] = { - - /* BSD a.out format (really, sdb/dbx(1) symtabs) */ -#ifdef DB_NO_AOUT - NONE, -#else /* DB_NO_AOUT */ - { - .init = aout_db_init, - .sym_init = aout_db_sym_init, - .lookup = aout_db_lookup, - .search_symbol 
= aout_db_search_symbol, - .line_at_pc = aout_db_line_at_pc, - .symbol_values = aout_db_symbol_values, - .search_by_addr = aout_db_search_by_addr, - .print_completion = aout_db_print_completion, - .lookup_incomplete = aout_db_lookup_incomplete, - }, -#endif /* DB_NO_AOUT */ - -#ifdef DB_NO_COFF - NONE, -#else /* DB_NO_COFF */ - { - .init = coff_db_init, - .sym_init = coff_db_sym_init, - .lookup = coff_db_lookup, - .search_symbol = coff_db_search_symbol, - .line_at_pc = coff_db_line_at_pc, - .symbol_values = coff_db_symbol_values, - .search_by_addr = coff_db_search_by_addr, - .print_completion = coff_db_print_completion, - .lookup_incomplete = coff_db_lookup_incomplete, - }, -#endif /* DB_NO_COFF */ - - /* Machdep, not inited here */ - NONE -}; diff --git a/osfmk/ddb/db_sym.h b/osfmk/ddb/db_sym.h deleted file mode 100644 index 3749e758e..000000000 --- a/osfmk/ddb/db_sym.h +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. 
- */ -/* - */ -/* - * Author: Alessandro Forin, Carnegie Mellon University - * Date: 8/90 - */ - -#ifndef _DDB_DB_SYM_H_ -#define _DDB_DB_SYM_H_ - -#include -#include -#include -#include - -/* - * This module can handle multiple symbol tables, - * of multiple types, at the same time - */ -#define SYMTAB_NAME_LEN 32 - -typedef struct { - int type; -#define SYMTAB_AOUT 0 -#define SYMTAB_COFF 1 -#define SYMTAB_MACHDEP 2 - char *start; /* symtab location */ - char *end; - char *private; /* optional machdep pointer */ - char *map_pointer; /* symbols are for this map only, - if not null */ - char name[SYMTAB_NAME_LEN]; - /* symtab name */ - unsigned long minsym; /* lowest symbol value */ - unsigned long maxsym; /* highest symbol value */ - boolean_t sorted; /* is this table sorted ? */ -} db_symtab_t; - -extern db_symtab_t *db_last_symtab; /* where last symbol was found */ - -/* - * Symbol representation is specific to the symtab style: - * BSD compilers use dbx' nlist, other compilers might use - * a different one - */ -typedef void * db_sym_t; /* opaque handle on symbols */ -#define DB_SYM_NULL ((db_sym_t)0) - -/* - * Non-stripped symbol tables will have duplicates, for instance - * the same string could match a parameter name, a local var, a - * global var, etc. - * We are most concern with the following matches. - */ -typedef int db_strategy_t; /* search strategy */ - -#define DB_STGY_ANY 0 /* anything goes */ -#define DB_STGY_XTRN 1 /* only external symbols */ -#define DB_STGY_PROC 2 /* only procedures */ - -extern boolean_t db_qualify_ambiguous_names; - /* if TRUE, check across symbol tables - * for multiple occurrences of a name. - * Might slow down quite a bit */ - -extern unsigned long db_maxoff; - -/* Prototypes for functions exported by this module. 
- */ -extern boolean_t db_add_symbol_table( - int type, - char *start, - char *end, - const char *name, - char *ref, - char *map_pointer, - unsigned long minsym, - unsigned long maxsym, - boolean_t sorted); - -extern void db_install_inks( - vm_offset_t base); - -extern boolean_t db_value_of_name( - const char *name, - db_expr_t *valuep); - -extern db_sym_t db_lookup(const char *symstr); - -extern char * db_get_sym( - db_expr_t * off); - -extern db_sym_t db_sym_parse_and_lookup( - int (*func)(db_symtab_t *, - char *, - char *, - int, - db_sym_t*, - char **, - int *), - db_symtab_t *symtab, - char *symstr); - -extern int db_sym_parse_and_lookup_incomplete( - int (*func)(db_symtab_t *, - char *, - char *, - int, - db_sym_t*, - char **, - int *), - db_symtab_t *symtab, - char *symstr, - char **name, - int *len, - int *toadd); - -extern int db_sym_parse_and_print_completion( - int (*func)(db_symtab_t *, - char *), - db_symtab_t *symtab, - char *symstr); - -extern db_sym_t db_search_task_symbol( - db_addr_t val, - db_strategy_t strategy, - db_addr_t *offp, - task_t task); - -extern db_sym_t db_search_task_symbol_and_line( - db_addr_t val, - db_strategy_t strategy, - db_expr_t *offp, - char **filenamep, - int *linenump, - task_t task, - int *argsp); - -extern void db_symbol_values( - db_symtab_t *stab, - db_sym_t sym, - const char **namep, - db_expr_t *valuep); - -extern void db_task_printsym( - db_expr_t off, - db_strategy_t strategy, - task_t task); - -extern void db_printsym( - db_expr_t off, - db_strategy_t strategy); - -extern boolean_t db_line_at_pc( - db_sym_t sym, - char **filename, - int *linenum, - db_expr_t pc); - -extern void db_qsort( - char *table, - int nbelts, - int eltsize, - int (*compfun)(char *, char *)); - -extern void db_qsort_limit_search( - char *target, - char **start, - char **end, - int eltsize, - int (*compfun)(char *, char *)); - -extern void db_sym_print_completion( - db_symtab_t *stab, - char *name, - int function, - char *fname, - int line); - -extern void db_print_completion( - char *symstr); - -extern int db_lookup_incomplete( - char *symstr, - int symlen); - -extern void ddb_init(void); - -extern void db_machdep_init(void); - -extern void db_clone_symtabXXX(char *, char *, vm_offset_t); - -extern db_symtab_t *db_symtab_cloneeXXX(char *); - -extern int db_task_getlinenum( db_expr_t, task_t); - -/* Some convenience macros. - */ -#define db_find_sym_and_offset(val,namep,offp) \ - db_symbol_values(0, db_search_symbol(val,DB_STGY_ANY,offp),namep,0) - /* find name&value given approx val */ - -#define db_find_xtrn_sym_and_offset(val,namep,offp) \ - db_symbol_values(0, db_search_symbol(val,DB_STGY_XTRN,offp),namep,0) - /* ditto, but no locals */ - -#define db_find_task_sym_and_offset(val,namep,offp,task) \ - db_symbol_values(0, db_search_task_symbol(val,DB_STGY_ANY,offp,task), \ - namep, 0) /* find name&value given approx val */ - -#define db_find_xtrn_task_sym_and_offset(val,namep,offp,task) \ - db_symbol_values(0, db_search_task_symbol(val,DB_STGY_XTRN,offp,task), \ - namep,0) /* ditto, but no locals */ - -#define db_search_symbol(val,strgy,offp) \ - db_search_task_symbol(val,strgy,offp,0) - /* find symbol in current task */ - -/* - * Symbol table switch, defines the interface - * to symbol-table specific routines. 
- */ - -extern struct db_sym_switch { - - void (*init)(void); - - boolean_t (*sym_init)( - char *start, - char *end, - const char *name, - char *task_addr - ); - - db_sym_t (*lookup)( - db_symtab_t *stab, - char *symstr - ); - db_sym_t (*search_symbol)( - db_symtab_t *stab, - db_addr_t off, - db_strategy_t strategy, - db_expr_t *diffp - ); - - boolean_t (*line_at_pc)( - db_symtab_t *stab, - db_sym_t sym, - char **file, - int *line, - db_expr_t pc - ); - - void (*symbol_values)( - db_sym_t sym, - char **namep, - db_expr_t *valuep - ); - db_sym_t (*search_by_addr)( - db_symtab_t *stab, - db_addr_t off, - char **file, - char **func, - int *line, - db_expr_t *diffp, - int *args - ); - - int (*print_completion)( - db_symtab_t *stab, - char *symstr - ); - - int (*lookup_incomplete)( - db_symtab_t *stab, - char *symstr, - char **name, - int *len, - int *toadd - ); -} x_db[]; - -#ifndef symtab_type -#define symtab_type(s) SYMTAB_AOUT -#endif - -#define X_db_init() x_db[symtab_type(s)].init() -#define X_db_sym_init(s,e,n,t) x_db[symtab_type(s)].sym_init(s,e,n,t) -#define X_db_lookup(s,n) x_db[(s)->type].lookup(s,n) -#define X_db_search_symbol(s,o,t,d) x_db[(s)->type].search_symbol(s,o,t,d) -#define X_db_line_at_pc(s,p,f,l,a) x_db[(s)->type].line_at_pc(s,p,f,l,a) -#define X_db_symbol_values(s,p,n,v) x_db[(s)->type].symbol_values(p,n,v) -#define X_db_search_by_addr(s,a,f,c,l,d,r) \ - x_db[(s)->type].search_by_addr(s,a,f,c,l,d,r) -#define X_db_print_completion(s,p) x_db[(s)->type].print_completion(s,p) -#define X_db_lookup_incomplete(s,p,n,l,t) \ - x_db[(s)->type].lookup_incomplete(s,p,n,l,t) - -#endif /* !_DDB_DB_SYM_H_ */ diff --git a/osfmk/ddb/db_task_thread.c b/osfmk/ddb/db_task_thread.c deleted file mode 100644 index 7e7420c14..000000000 --- a/osfmk/ddb/db_task_thread.c +++ /dev/null @@ -1,337 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. 
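The x_db switch that closes db_sym.h above is the classic C substitute for virtual dispatch: an array of function-pointer structs indexed by the table's type tag, with macros hiding the indexing. A minimal sketch of the pattern, with toy format names standing in for the aout/coff/machdep slots:

#include <stdio.h>

enum fmt { FMT_A, FMT_B, NFMT };

struct symtab { enum fmt type; const char *name; };

/* Per-format operations, analogous to struct db_sym_switch. */
struct sym_ops {
    void (*describe)(const struct symtab *st);
};

static void describe_a(const struct symtab *st)
{ printf("%s: format A\n", st->name); }

static void describe_b(const struct symtab *st)
{ printf("%s: format B\n", st->name); }

static const struct sym_ops ops[NFMT] = {
    [FMT_A] = { .describe = describe_a },
    [FMT_B] = { .describe = describe_b },
};

/* Dispatch macro in the spirit of X_db_lookup and friends. */
#define ST_DESCRIBE(st) ops[(st)->type].describe(st)

int main(void)
{
    struct symtab st = { FMT_B, "kernel" };

    ST_DESCRIBE(&st);   /* kernel: format B */
    return 0;
}

Filling unsupported slots with a table of loud stubs, as the NONE initializer does above, keeps every index valid so the dispatch macros never need a NULL check.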
- * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include /* For db_printf() */ -#include - -/* - * Following constants are used to prevent infinite loop of task - * or thread search due to the incorrect list. - */ -#define DB_MAX_TASKID 0x10000 /* max # of tasks */ -#define DB_MAX_THREADID 0x10000 /* max # of threads in a task */ -#define DB_MAX_PSETS 0x10000 /* max # of processor sets */ - -task_t db_default_task = TASK_NULL; /* default target task */ -thread_t db_default_act = THREAD_NULL; /* default target thr_act */ - - - -/* Prototypes for functions local to this file. - */ -task_t db_lookup_task_id(register int task_id); - -static thread_t db_lookup_act_id( - task_t task, - register int thread_id); - - - -/* - * search valid task queue, and return the queue position as the task id - */ -int -db_lookup_task(task_t target_task) -{ - register task_t task; - register int task_id; - - task_id = 0; - if (queue_first(&tasks) == 0) - return(-1); - queue_iterate(&tasks, task, task_t, tasks) { - if (target_task == task) - return(task_id); - if (task_id++ >= DB_MAX_TASKID) - return(-1); - } - return(-1); -} - -/* - * search thread queue of the task, and return the queue position - */ -int -db_lookup_task_act( - task_t task, - thread_t target_act) -{ - register thread_t thr_act; - register int act_id; - - act_id = 0; - if (queue_first(&task->threads) == 0) - return(-1); - queue_iterate(&task->threads, thr_act, thread_t, task_threads) { - if (target_act == thr_act) - return(act_id); - if (act_id++ >= DB_MAX_THREADID) - return(-1); - } - return(-1); -} - -/* - * search thr_act queue of every valid task, and return the queue position - * as the thread id. 
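db_lookup_task and its siblings turn a queue position into a stable debugger id, and, because ddb may be walking a corrupted kernel, every traversal is capped (DB_MAX_TASKID, DB_MAX_THREADID) so a cyclic or trashed list cannot hang the debugger. The same defensive walk over a plain linked list, as a sketch:

#include <stdio.h>

#define MAX_ID 0x10000   /* walk cap, as with DB_MAX_TASKID */

struct node { struct node *next; };

/* Return the position of target in the list, or -1 if it is absent
 * or the list is longer (or more circular) than MAX_ID nodes. */
static int list_index(const struct node *head, const struct node *target)
{
    int id = 0;

    for (const struct node *n = head; n != NULL; n = n->next) {
        if (n == target)
            return id;
        if (id++ >= MAX_ID)
            return -1;   /* corrupt or cyclic list: give up */
    }
    return -1;
}

int main(void)
{
    struct node c = { 0 }, b = { &c }, a = { &b };

    printf("%d\n", list_index(&a, &c));  /* 2 */
    return 0;
}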
- */ -int -db_lookup_act(thread_t target_act) -{ - register int act_id; - register task_t task; - register int ntask = 0; - - if (queue_first(&tasks) == 0) - return(-1); - queue_iterate(&tasks, task, task_t, tasks) { - if (ntask++ > DB_MAX_TASKID) - return(-1); - if (task->thread_count == 0) - continue; - act_id = db_lookup_task_act(task, target_act); - if (act_id >= 0) - return(act_id); - } - return(-1); -} - -/* - * check the address is a valid thread address - */ -int force_act_lookup = 0; -boolean_t -db_check_act_address_valid(thread_t thr_act) -{ - if (!force_act_lookup && db_lookup_act(thr_act) < 0) { - db_printf("Bad thr_act address 0x%x\n", thr_act); - db_flush_lex(); - return(FALSE); - } else - return(TRUE); -} - -/* - * convert task_id(queue postion) to task address - */ -task_t -db_lookup_task_id(int task_id) -{ - register task_t task; - - if (task_id > DB_MAX_TASKID) - return(TASK_NULL); - if (queue_first(&tasks) == 0) - return(TASK_NULL); - queue_iterate(&tasks, task, task_t, tasks) { - if (task_id-- <= 0) - return(task); - } - return(TASK_NULL); -} - -/* - * convert (task_id, act_id) pair to thr_act address - */ -static thread_t -db_lookup_act_id( - task_t task, - register int act_id) -{ - register thread_t thr_act; - - - if (act_id > DB_MAX_THREADID) - return(THREAD_NULL); - if (queue_first(&task->threads) == 0) - return(THREAD_NULL); - queue_iterate(&task->threads, thr_act, thread_t, task_threads) { - if (act_id-- <= 0) - return(thr_act); - } - return(THREAD_NULL); -} - -/* - * get next parameter from a command line, and check it as a valid - * thread address - */ -boolean_t -db_get_next_act( - thread_t *actp, - int position) -{ - db_expr_t value; - thread_t thr_act; - - *actp = THREAD_NULL; - if (db_expression(&value)) { - thr_act = (thread_t)(unsigned long)value; - if (!db_check_act_address_valid(thr_act)) { - db_flush_lex(); - return(FALSE); - } - } else if (position <= 0) { - thr_act = db_default_act; - } else - return(FALSE); - *actp = thr_act; - return(TRUE); -} - -/* - * check the default thread is still valid - * ( it is called in entering DDB session ) - */ -void -db_init_default_act(void) -{ - if (db_lookup_act(db_default_act) < 0) { - db_default_act = THREAD_NULL; - db_default_task = TASK_NULL; - } else - db_default_task = db_default_act->task; -} - -/* - * set or get default thread which is used when /t or :t option is specified - * in the command line - */ -int -db_set_default_act(__unused struct db_variable *vp, db_expr_t *valuep, - int flag, __unused db_var_aux_param_t ap) -{ - thread_t thr_act; - int task_id; - int act_id; - - if (flag == DB_VAR_SHOW) { - db_printf("%#n", db_default_act); - task_id = db_lookup_task(db_default_task); - if (task_id != -1) { - act_id = db_lookup_act(db_default_act); - if (act_id != -1) { - db_printf(" (task%d.%d)", task_id, act_id); - } - } - return(0); - } - - if (flag != DB_VAR_SET) { - *valuep = (db_expr_t)(unsigned long)db_default_act; - return(0); - } - thr_act = (thread_t)(unsigned long)*valuep; - if (thr_act != THREAD_NULL && !db_check_act_address_valid(thr_act)) - db_error(0); - /* NOTREACHED */ - db_default_act = thr_act; - if (thr_act) - db_default_task = thr_act->task; - return(0); -} - -/* - * convert $taskXXX[.YYY] type DDB variable to task or thread address - */ -int -db_get_task_act(__unused struct db_variable *vp, db_expr_t *valuep, int flag, - db_var_aux_param_t ap) -{ - task_t task; - thread_t thr_act; - int task_id; - - if (flag == DB_VAR_SHOW) { - db_printf("%#n", db_default_task); - task_id = 
db_lookup_task(db_default_task); - if (task_id != -1) - db_printf(" (task%d)", task_id); - return(0); - } - - if (flag != DB_VAR_GET) { - db_error("Cannot set to $task variable\n"); - /* NOTREACHED */ - } - if ((task = db_lookup_task_id(ap->suffix[0])) == TASK_NULL) { - db_printf("no such task($task%d)\n", ap->suffix[0]); - db_error(0); - /* NOTREACHED */ - } - if (ap->level <= 1) { - *valuep = (db_expr_t)(unsigned long)task; - return(0); - } - if ((thr_act = db_lookup_act_id(task, ap->suffix[1])) == THREAD_NULL){ - db_printf("no such thr_act($task%d.%d)\n", - ap->suffix[0], ap->suffix[1]); - db_error(0); - /* NOTREACHED */ - } - *valuep = (db_expr_t)(unsigned long)thr_act; - return(0); -} diff --git a/osfmk/ddb/db_task_thread.h b/osfmk/ddb/db_task_thread.h deleted file mode 100644 index 12f9ac6b4..000000000 --- a/osfmk/ddb/db_task_thread.h +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. 
- */ -/* - */ - -#ifndef _DDB_DB_TASK_THREAD_H_ -#define _DDB_DB_TASK_THREAD_H_ - -#include -#include -#include -#include /* For db_var_aux_param_t */ - -/* - * On behalf of kernel-loaded tasks, distinguish between current task - * (=> symbol table) and current address space (=> where [e.g.] - * breakpoints are set). From ddb's perspective, kernel-loaded tasks - * can retain their own symbol tables, but share the kernel's address - * space. - */ -#define db_current_task() \ - ((current_thread())? current_thread()->task: TASK_NULL) -#define db_current_space() \ - ((current_thread())?\ - current_thread()->task: TASK_NULL) -#define db_target_space(thr_act, user_space) \ - ((!(user_space) || ((thr_act)))?\ - TASK_NULL: \ - (thr_act)? \ - (thr_act)->task: db_current_space()) -#define db_is_current_space(task) \ - ((task) == TASK_NULL || (task) == db_current_space()) - -extern task_t db_default_task; /* default target task */ -extern thread_t db_default_act; /* default target thr_act */ - - -/* Prototypes for functions exported by this module. - */ - -int db_lookup_act(thread_t target_act); - -int db_lookup_task(task_t target_task); - -int db_lookup_task_act( - task_t task, - thread_t target_act); - -boolean_t db_check_act_address_valid(thread_t thr_act); - -boolean_t db_get_next_act( - thread_t *actp, - int position); - -void db_init_default_act(void); - -int db_set_default_act( - struct db_variable *vp, - db_expr_t *valuep, - int flag, - db_var_aux_param_t ap); - -int db_get_task_act( - struct db_variable *vp, - db_expr_t *valuep, - int flag, - db_var_aux_param_t ap); - -#endif /* !_DDB_DB_TASK_THREAD_H_ */ diff --git a/osfmk/ddb/db_trap.c b/osfmk/ddb/db_trap.c deleted file mode 100644 index 759649b82..000000000 --- a/osfmk/ddb/db_trap.c +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. 
- * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ - -/* - * Trap entry point to kernel debugger. - */ -#include -#include -#include -#include -#include -#include -#include -#include /* For db_printf() */ -#include -#include -#include -#include - -extern jmp_buf_t *db_recover; - -extern int db_inst_count; -extern int db_load_count; -extern int db_store_count; - -static task_t task_space; -static task_t task; -void -db_task_trap(__unused int type, __unused int code, boolean_t user_space) -{ - jmp_buf_t db_jmpbuf; - jmp_buf_t *prev; - boolean_t bkpt; - boolean_t watchpt; - - task = db_current_task(); - task_space = db_target_space(current_thread(), user_space); - bkpt = IS_BREAKPOINT_TRAP(type, code); - watchpt = IS_WATCHPOINT_TRAP(type, code); - - /* - * Note: we look up PC values in an address space (task_space), - * but print symbols using a (task-specific) symbol table, found - * using task. - */ - - /* Elided since walking the thread/task lists before setting up - * safe recovery points is incorrect, and could - * potentially cause us to loop and fault indefinitely. - */ -#if 0 - db_init_default_act(); -#endif - db_check_breakpoint_valid(); - - if (db_stop_at_pc(&bkpt, task, task_space)) { - if (db_inst_count) { - db_printf("After %d instructions (%d loads, %d stores),\n", - db_inst_count, db_load_count, db_store_count); - } - if (bkpt) - db_printf("Breakpoint at "); - else if (watchpt) - db_printf("Watchpoint at "); - else - db_printf("Stopped at "); - db_dot = PC_REGS(DDB_REGS); - - prev = db_recover; - if (_setjmp(db_recover = &db_jmpbuf) == 0) { -#if defined(__alpha) - db_print_loc(db_dot, task_space); - db_printf("\n\t"); - db_print_inst(db_dot, task_space); -#else /* !defined(__alpha) */ - db_print_loc_and_inst(db_dot, task); -#endif /* defined(__alpha) */ - } else - db_printf("Trouble printing location %#llX.\n", (unsigned long long)db_dot); - db_recover = prev; - - db_command_loop(); - } - - db_restart_at_pc(watchpt, task_space); -} diff --git a/osfmk/ddb/db_trap.h b/osfmk/ddb/db_trap.h deleted file mode 100644 index 79554c85b..000000000 --- a/osfmk/ddb/db_trap.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
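The _setjmp dance in db_task_trap above is the debugger's fault barrier: the previous recovery buffer is saved, a new one is armed around the risky print (which may dereference bad memory), and a fault longjmps back so ddb reports trouble instead of recursing into itself. A user-space sketch of the save/arm/restore pattern; the fault() call is an illustrative stand-in for a real memory fault routed through the kernel's db_recover hook.

#include <setjmp.h>
#include <stdio.h>

static jmp_buf *recover;   /* plays the role of ddb's db_recover */

/* Anything that might fault calls this instead of crashing. */
static void fault(const char *what)
{
    if (recover) {
        printf("fault while %s\n", what);
        longjmp(*recover, 1);
    }
}

/* A "print" that may hit bad memory; here it only simulates it. */
static void risky_print(int bad)
{
    if (bad)
        fault("printing location");
    printf("location printed fine\n");
}

static void report(int bad)
{
    jmp_buf buf;
    jmp_buf *prev = recover;   /* save the enclosing barrier */

    if (setjmp(buf) == 0) {
        recover = &buf;        /* arm */
        risky_print(bad);
    } else {
        printf("Trouble printing location.\n");
    }
    recover = prev;            /* always restore, fault or not */
}

int main(void)
{
    report(0);
    report(1);
    return 0;
}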
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.6.1 1994/09/23 01:22:27 ezf - * change marker to not FREE - * [1994/09/22 21:11:20 ezf] - * - * Revision 1.1.2.4 1993/09/17 21:34:42 robert - * change marker to OSF_FREE_COPYRIGHT - * [1993/09/17 21:27:27 robert] - * - * Revision 1.1.2.3 1993/08/03 18:21:39 rod - * ANSI prototypes: prototype thread_kdb_return(). CR #9523. - * [1993/08/03 13:06:06 rod] - * - * Revision 1.1.2.2 1993/07/27 18:28:24 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:13:18 elliston] - * - * $EndLog$ - */ - -#ifndef _DDB_DB_TRAP_H_ -#define _DDB_DB_TRAP_H_ - -#include - - -/* Prototypes for functions exported by this module. - */ - -void db_task_trap( - int type, - int code, - boolean_t user_space); - -void db_trap( - int type, - int code); - -/* Other exported prototypes - */ - -void thread_kdb_return(void); - -#endif /* !_DDB_DB_TRAP_H_ */ diff --git a/osfmk/ddb/db_variables.c b/osfmk/ddb/db_variables.c deleted file mode 100644 index f30e5cad4..000000000 --- a/osfmk/ddb/db_variables.c +++ /dev/null @@ -1,716 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ - -#include -#include /* For strcpy() */ - -#include -#include -#include -#include -#include -#include -#include -#include /* For db_printf() */ - -extern db_expr_t db_max_width; -extern db_expr_t db_tab_stop_width; -extern db_expr_t db_max_line; -extern db_expr_t db_auto_wrap; -extern db_expr_t db_macro_level; -extern db_expr_t db_auto_completion; - -#define DB_NWORK 32 /* number of work variable */ - -db_expr_t db_work[DB_NWORK]; /* work variable */ - -struct db_variable db_vars[] = { - { - .name = "maxoff", - .valuep = (db_expr_t*)&db_maxoff, - }, - { - .name = "autowrap", - .valuep = &db_auto_wrap, - }, - { - .name = "completion", - .valuep = &db_auto_completion, - }, - { - .name = "maxwidth", - .valuep = &db_max_width, - }, - { - .name = "radix", - .valuep = &db_radix, - }, - { - .name = "tabstops", - .valuep = &db_tab_stop_width, - }, - { - .name = "lines", - .valuep = &db_max_line, - }, - { - .name = "thr_act", - .fcn = db_set_default_act, - }, - { - .name = "task", - .fcn = db_get_task_act, - .min_level = 1, - .max_level = 2, - .low = -1, - .high = -1, - }, - { - .name = "work", - .valuep = &db_work[0], - .min_level = 1, - .max_level = 1, - .high = DB_NWORK - 1, - }, - { - .name = "arg", - .fcn = db_arg_variable, - .min_level = 1, - .max_level = 1, - .low = 1, - .high = DB_MACRO_NARGS, - .hidden_level = 1, - .hidden_low = 0, - .hidden_high = DB_MACRO_LEVEL - 1, - .hidden_levelp = (int *)&db_macro_level, - }, -}; -struct db_variable *db_evars = db_vars + sizeof(db_vars)/sizeof(db_vars[0]); - - - -/* Prototypes for functions local to this file. - */ - -static const char *db_get_suffix(const char *, short *); - -static boolean_t db_cmp_variable_name(struct db_variable *, const char *, - db_var_aux_param_t); - -static int db_find_variable( - struct db_variable **varp, - db_var_aux_param_t ap); - -void db_list_variable(void); - -static const char * -db_get_suffix(const char *suffix, short *suffix_value) -{ - register int value; - - for (value = 0; *suffix && *suffix != '.' 
&& *suffix != ':'; suffix++) { - if (*suffix < '0' || *suffix > '9') - return(0); - value = value*10 + *suffix - '0'; - } - *suffix_value = value; - if (*suffix == '.') - suffix++; - return(suffix); -} - -static boolean_t -db_cmp_variable_name(struct db_variable *vp, const char *name, - db_var_aux_param_t ap) -{ - const char *var_np, *np; - int level; - - for (np = name, var_np = vp->name; *var_np; ) { - if (*np++ != *var_np++) - return(FALSE); - } - for (level = 0; *np && *np != ':' && level < vp->max_level; level++){ - if ((np = db_get_suffix(np, &ap->suffix[level])) == 0) - return(FALSE); - } - if ((*np && *np != ':') || level < vp->min_level - || (level > 0 && (ap->suffix[0] < vp->low - || (vp->high >= 0 && ap->suffix[0] > vp->high)))) - return(FALSE); - strlcpy(ap->modif, (*np)? np+1: "", TOK_STRING_SIZE); - ap->thr_act = (db_option(ap->modif, 't')?db_default_act: THREAD_NULL); - ap->level = level; - ap->hidden_level = -1; - return(TRUE); -} - -static int -db_find_variable( - struct db_variable **varp, - db_var_aux_param_t ap) -{ - int t; - struct db_variable *vp; - - t = db_read_token(); - if (t == tIDENT) { - for (vp = db_vars; vp < db_evars; vp++) { - if (db_cmp_variable_name(vp, db_tok_string, ap)) { - *varp = vp; - return (1); - } - } - for (vp = db_regs; vp < db_eregs; vp++) { - if (db_cmp_variable_name(vp, db_tok_string, ap)) { - *varp = vp; - return (1); - } - } -#if defined(ALTERNATE_REGISTER_DEFS) - for (vp = db_altregs; vp < db_ealtregs; vp++) { - if (db_cmp_variable_name(vp, db_tok_string, ap)) { - *varp = vp; - return (1); - } - } -#endif /* defined(ALTERNATE_REGISTER_DEFS) */ - } - db_printf("Unknown variable \"$%s\"\n", db_tok_string); - db_error(0); - return (0); -} - -int -db_get_variable(db_expr_t *valuep) -{ - struct db_variable *vp; - struct db_var_aux_param aux_param; - char modif[TOK_STRING_SIZE]; - - aux_param.modif = modif; - if (!db_find_variable(&vp, &aux_param)) - return (0); - - db_read_write_variable(vp, valuep, DB_VAR_GET, &aux_param); - - return (1); -} - -void -db_read_write_variable( - struct db_variable *vp, - db_expr_t *valuep, - int rw_flag, - db_var_aux_param_t ap) -{ - int (*func)(struct db_variable*, db_expr_t*,int, db_var_aux_param_t) - = vp->fcn; - struct db_var_aux_param aux_param; - db_expr_t old_value; - - if (ap == 0) { - ap = &aux_param; - ap->modif = NULL; - ap->level = 0; - ap->thr_act = THREAD_NULL; - } - if (rw_flag == DB_VAR_SET && vp->precious) - db_read_write_variable(vp, &old_value, DB_VAR_GET, ap); - if (func == FCN_NULL) { - if (rw_flag == DB_VAR_SET) - vp->valuep[(ap->level)? (ap->suffix[0] - vp->low): 0] = *valuep; - else - *valuep = vp->valuep[(ap->level)? 
(ap->suffix[0] - vp->low): 0]; - } else - (*func)(vp, valuep, rw_flag, ap); - if (rw_flag == DB_VAR_SET && vp->precious) - db_printf("\t$%s:%s<%#x>\t%#8lln\t=\t%#8lln\n", vp->name, - ap->modif, ap->thr_act, (unsigned long long)old_value, (unsigned long long)*valuep); -} - -void -db_list_variable(void) -{ - register struct db_variable *new; - register struct db_variable *old; - register struct db_variable *cur; - unsigned int l; - unsigned int len; - short i; - unsigned int j; - - len = 1; - for (cur = db_vars; cur < db_evars; cur++) { - if (cur->min_level > 0 || cur->max_level > 0) { - j = 3 * (cur->max_level - cur->min_level + 1) - 1; - if (cur->max_level > cur->min_level) - j += 2; - } else - j = 0; - if ((l = strlen(cur->name) + j) >= len) - len = l + 1; - } - - old = (struct db_variable *)0; - for (;;) { - new = (struct db_variable *)0; - for (cur = db_vars; cur < db_evars; cur++) - if ((new == (struct db_variable *)0 || - strcmp(cur->name, new->name) < 0) && - (old == (struct db_variable *)0 || - strcmp(cur->name, old->name) > 0)) - new = cur; - if (new == (struct db_variable *)0) - return; - db_reserve_output_position(len); - db_printf(new->name); - j = strlen(new->name); - if (new->min_level > 0) { - db_putchar('?'); - db_putchar('?'); - j += 2; - for (i = new->min_level - 1; i > 0; i--) { - db_putchar('.'); - db_putchar('?'); - db_putchar('?'); - j += 3; - } - if (new->max_level > new->min_level) { - db_putchar('['); - db_putchar('.'); - db_putchar('?'); - db_putchar('?'); - j += 4; - } - i = new->min_level + 1; - } else { - if (new->max_level > new->min_level) { - db_putchar('['); - j++; - } - i = new->min_level; - } - while (i++ < new->max_level) { - db_putchar('.'); - db_putchar('?'); - db_putchar('?'); - j += 3; - } - if (new->max_level > new->min_level) { - db_putchar(']'); - j++; - } - while (j++ < len) - db_putchar(' '); - old = new; - } -} - -void -db_set_cmd(void) -{ - db_expr_t value; - int t; - struct db_variable *vp; - struct db_var_aux_param aux_param; - char modif[TOK_STRING_SIZE]; - - aux_param.modif = modif; - t = db_read_token(); - if (t == tIDENT && strcmp("help", db_tok_string) == 0) { - db_list_variable(); - return; - } - if (t != tDOLLAR) { - db_error("Variable name should be prefixed with $\n"); - return; - } - if (!db_find_variable(&vp, &aux_param)) { - db_error("Unknown variable\n"); - return; - } - - t = db_read_token(); - if (t != tEQ) - db_unread_token(t); - - if (!db_expression(&value)) { - db_error("No value\n"); - return; - } - if ((t = db_read_token()) == tSEMI_COLON) - db_unread_token(t); - else if (t != tEOL) - db_error("?\n"); - - db_read_write_variable(vp, &value, DB_VAR_SET, &aux_param); -} - -void -db_show_one_variable(void) -{ - struct db_variable *cur; - unsigned int len; - unsigned int sl; - unsigned int slen = 0; - short h = 0; - short i; - unsigned short j; - short k; - short low; - int hidden_level = 0; - struct db_var_aux_param aux_param; - const char *p = NULL, *q; - char *name; - db_addr_t offset = 0; - - for (cur = db_vars; cur < db_evars; cur++) - if (db_cmp_variable_name(cur, db_tok_string, &aux_param)) - break; - if (cur == db_evars) { - for (cur = db_vars; cur < db_evars; cur++) { - for (q = cur->name, p = db_tok_string; *q && *p == *q; p++,q++) - continue; - if (*q == '\0') - break; - } - if (cur == db_evars) { - db_error("Unknown variable\n"); - return; - } - - for (i = 0; *p && *p != ':' && i < cur->max_level; i++, p = q) - if ((q = db_get_suffix(p, &aux_param.suffix[i])) == 0) - break; - aux_param.level = i; - if ((*p && *p != 
':') || - (i > 0 && (aux_param.suffix[0] < cur->low || - (cur->high >= 0 && - aux_param.suffix[0] > cur->high)))) { - db_error("Unknown variable format\n"); - return; - } - - strlcpy(aux_param.modif, *p ? p + 1 : "", TOK_STRING_SIZE); - aux_param.thr_act = (db_option(aux_param.modif, 't') ? - db_default_act : THREAD_NULL); - } - - if (cur->hidden_level) - if (*cur->hidden_levelp >= cur->hidden_low && - *cur->hidden_levelp <= cur->hidden_high) { - hidden_level = 1; - aux_param.hidden_level = h = *cur->hidden_levelp; - } else { - hidden_level = 0; - aux_param.hidden_level = h = cur->hidden_low; - slen = 1; - for (k = aux_param.level > 0 ? aux_param.suffix[0] : cur->high; - k > 9; k /= 10) - slen++; - } - else - aux_param.hidden_level = -1; - - if ((cur->min_level == 0 && !cur->hidden_level) || cur->high < 0) - j = 0; - else { - if (cur->min_level > 0) { - j = 1; - for (k = aux_param.level > 0 ? - aux_param.suffix[0] : cur->high; k > 9; k /= 10) - j++; - } else - j = 0; - if (cur->hidden_level && hidden_level == 0) { - j += 3; - for (k = aux_param.hidden_level >= 0 ? - aux_param.hidden_level : cur->hidden_high; k > 9; k /= 10) - j++; - } - } - len = strlen(cur->name) + j; - i = low = aux_param.level > 0 ? aux_param.suffix[0] : cur->low; - - for (;;) { - db_printf(cur->name); - j = strlen(cur->name); - if (cur->high >= 0) { - if (cur->min_level > 0) { - db_printf("%d", i); - j++; - for (k = i; k > 9; k /= 10) - j++; - } - if (cur->hidden_level && hidden_level == 0) { - sl = 1; - for (k = i; k > 9; k /= 10) - sl++; - while (sl++ < slen) { - db_putchar(' '); - j++; - } - db_printf("[%d]", h); - j += 3; - for (k = h; k > 9; k /= 10) - j++; - } - } - - while (j++ < len) - db_putchar(' '); - db_putchar(':'); - db_putchar(' '); - - if (cur->fcn) { - aux_param.suffix[0] = i; - (*cur->fcn)(cur, (db_expr_t *)0, DB_VAR_SHOW, &aux_param); - } else { - db_printf("%#lln", (unsigned long long)*(cur->valuep + i)); - db_find_xtrn_task_sym_and_offset(*(cur->valuep + i), &name, - &offset, TASK_NULL); - if (name != (char *)0 && offset <= db_maxoff && - offset != *(cur->valuep + i)) { - db_printf("\t%s", name); - if (offset != 0) - db_printf("+%#llr", (unsigned long long)offset); - } - } - db_putchar('\n'); - if (cur->high < 0) - break; - if (aux_param.level > 0 || i++ == cur->high) { - if (!cur->hidden_level || - hidden_level == 0 || - h++ == cur->hidden_high) - break; - aux_param.hidden_level = h; - i = low; - } - } -} - -void -db_show_variable(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, __unused char *modif) -{ - struct db_variable *cur; - unsigned int l; - unsigned int len; - unsigned int sl; - unsigned int slen; - short h = 0; - short i; - unsigned short j; - short k; - int t; - int t1; - struct db_var_aux_param aux_param; - char *name; - db_addr_t offset; - - switch(t = db_read_token()) { - case tEOL: - case tEOF: - case tSEMI_COLON: - break; - - case tDOLLAR: - t1 = db_read_token(); - if (t1 == tIDENT) { - db_show_one_variable(); - return; - } - db_error("Not a variable name after $\n"); - db_unread_token(t); - return; - - default: - db_error("Variable name should be prefixed with $\n"); - db_unread_token(t); - return; - } - db_unread_token(t); - - slen = len = 1; - for (cur = db_vars; cur < db_evars; cur++) { - if ((cur->min_level == 0 && !cur->hidden_level) || cur->high < 0) - j = 0; - else { - if (cur->min_level > 0) { - j = 1; - for (k = cur->high; k > 9; k /= 10) - j++; - } else - j = 0; - if (cur->hidden_level && - (*cur->hidden_levelp < cur->hidden_low || - 
*cur->hidden_levelp > cur->hidden_high)) { - j += 3; - for (k = cur->hidden_high; k > 9; k /= 10) - j++; - } - } - if ((l = strlen(cur->name) + j) >= len) - len = l + 1; - } - - aux_param.modif = NULL; - aux_param.level = 1; - aux_param.thr_act = THREAD_NULL; - - for (cur = db_vars; cur < db_evars; cur++) { - i = cur->low; - if (cur->hidden_level) { - if (*cur->hidden_levelp >= cur->hidden_low && - *cur->hidden_levelp <= cur->hidden_high) { - h = cur->hidden_low - 1; - aux_param.hidden_level = *cur->hidden_levelp; - } else { - h = cur->hidden_low; - aux_param.hidden_level = cur->hidden_low; - } - slen = 1; - for (k = cur->high; k > 9; k /= 10) - slen++; - } else - aux_param.hidden_level = -1; - - if (cur != db_vars && cur->high >= 0 && - (cur->min_level > 0 || cur->hidden_level)) - db_putchar('\n'); - - for (;;) { - db_printf(cur->name); - j = strlen(cur->name); - if (cur->high >= 0) { - if (cur->min_level > 0) { - db_printf("%d", i); - j++; - for (k = i; k > 9; k /= 10) - j++; - } - if (cur->hidden_level && h >= cur->hidden_low) { - sl = 1; - for (k = i; k > 9; k /= 10) - sl++; - while (sl++ < slen) { - db_putchar(' '); - j++; - } - db_printf("[%d]", h); - j += 3; - for (k = h; k > 9; k /= 10) - j++; - } - } - while (j++ < len) - db_putchar(' '); - db_putchar(':'); - db_putchar(' '); - - if (cur->fcn) { - aux_param.suffix[0] = i; - (*cur->fcn)(cur, (db_expr_t *)0, DB_VAR_SHOW, &aux_param); - } else { - db_printf("%#lln", (unsigned long long)*(cur->valuep + i)); - db_find_xtrn_task_sym_and_offset(*(cur->valuep + i), &name, - &offset, TASK_NULL); - if (name != (char *)0 && offset <= db_maxoff && - offset != *(cur->valuep + i)) { - db_printf("\t%s", name); - if (offset != 0) - db_printf("+%#llr", (unsigned long long)offset); - } - } - db_putchar('\n'); - if (cur->high < 0) - break; - if (i++ == cur->high) { - if (!cur->hidden_level || h++ == cur->hidden_high) - break; - aux_param.hidden_level = h; - i = cur->low; - } - } - } -} - -/* - * given a name of a machine register, return a variable pointer to it. - */ -db_variable_t -db_find_reg_name( - char *s) -{ - register db_variable_t regp; - - if ( s == (char *)0 ) - return DB_VAR_NULL; - - for (regp = db_regs; regp < db_eregs; regp++) { - if ( strcmp( s, regp->name) == 0 ) - return regp; - } - return DB_VAR_NULL; -} diff --git a/osfmk/ddb/db_variables.h b/osfmk/ddb/db_variables.h deleted file mode 100644 index 3ff52cf16..000000000 --- a/osfmk/ddb/db_variables.h +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
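Everything in db_variables.c above is driven by the db_vars[] table: a variable either exposes raw storage through valuep or routes access through an accessor fcn invoked with DB_VAR_GET/DB_VAR_SET, and db_read_write_variable dispatches on whichever is present. A stripped-down sketch of that table-driven dispatch, with hypothetical variables standing in for ddb's:

    #include <stdio.h>
    #include <string.h>

    enum { VAR_GET, VAR_SET };

    struct variable {
        const char *name;
        long *valuep;                                /* direct storage, or */
        int (*fcn)(struct variable *, long *, int);  /* accessor function */
    };

    static long radix = 16;

    /* Accessor-mediated variable: clamps writes to a sane range. */
    static long lines = 24;
    static int lines_fcn(struct variable *vp, long *valuep, int rw)
    {
        (void)vp;
        if (rw == VAR_SET)
            lines = (*valuep < 0) ? 0 : *valuep;
        else
            *valuep = lines;
        return 0;
    }

    static struct variable vars[] = {
        { "radix", &radix, NULL },
        { "lines", NULL, lines_fcn },
    };

    static int rw_variable(const char *name, long *valuep, int rw)
    {
        for (size_t i = 0; i < sizeof vars / sizeof vars[0]; i++) {
            struct variable *vp = &vars[i];
            if (strcmp(vp->name, name) != 0)
                continue;
            if (vp->fcn)                     /* function-mediated access */
                return vp->fcn(vp, valuep, rw);
            if (rw == VAR_SET)               /* plain word access */
                *vp->valuep = *valuep;
            else
                *valuep = *vp->valuep;
            return 0;
        }
        return -1;                           /* unknown variable */
    }

    int main(void)
    {
        long v = 10;
        rw_variable("radix", &v, VAR_SET);
        v = -5;
        rw_variable("lines", &v, VAR_SET);
        rw_variable("lines", &v, VAR_GET);
        printf("lines clamped to %ld\n", v);   /* prints 0 */
        return 0;
    }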
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.2.17.5 1996/01/09 19:16:39 devrcs - * Define alternate register definitions. - * [1995/12/01 21:42:46 jfraser] - * - * Merged '64-bit safe' changes from DEC alpha port. - * [1995/11/21 18:04:00 jfraser] - * - * Revision 1.2.17.4 1995/02/23 21:44:00 alanl - * Merged with DIPC2_SHARED. - * [1995/01/05 13:36:23 alanl] - * - * Revision 1.2.20.2 1994/10/14 03:47:19 dwm - * mk6 CR668 - 1.3b26 merge - * 64bit cleanup - * [1994/10/14 03:40:00 dwm] - * - * Revision 1.2.17.2 1994/09/23 01:22:42 ezf - * change marker to not FREE - * [1994/09/22 21:11:29 ezf] - * - * Revision 1.2.17.1 1994/06/11 21:12:42 bolinger - * Merge up to NMK17.2. - * [1994/06/11 20:04:23 bolinger] - * - * Revision 1.2.22.1 1994/12/06 19:43:29 alanl - * Intel merge, Oct 94 code drop. - * Define DB_VAR_NULL. - * Add prototype for db_find_reg_name. - * [94/11/23 mmp] - * - * Revision 1.2.15.1 1994/02/08 10:59:16 bernadat - * Added db_show_one_variable & db_show_variable prototypes - * - * Got DB_MACRO_LEVEL and DB_MACRO_NARGS macros from . - * Added new fields (hidden_xxx) into struct db_variable and into - * struct db_var_aux_param. - * Added DB_VAR_SHOW for showing variables. - * [93/08/12 paire] - * [94/02/07 bernadat] - * - * Revision 1.2.4.3 1993/07/27 18:28:29 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:13:26 elliston] - * - * Revision 1.2.4.2 1993/06/09 02:21:06 gm - * Added to OSF/1 R1.3 from NMK15.0. - * [1993/06/02 20:57:48 jeffc] - * - * Revision 1.2 1993/04/19 16:03:36 devrcs - * New field used to display old register values with 'set' command - * [barbou@gr.osf.org] - * [92/12/03 bernadat] - * - * Revision 1.1 1992/09/30 02:24:26 robert - * Initial revision - * - * $EndLog$ - */ -/* CMU_HIST */ -/* - * Revision 2.5 91/10/09 16:04:17 af - * Revision 2.4.3.1 91/10/05 13:08:42 jeffreyh - * Added suffix related field to db_variable structure. - * Added macro definitions of db_{read,write}_variable. - * [91/08/29 tak] - * - * Revision 2.4.3.1 91/10/05 13:08:42 jeffreyh - * Added suffix related field to db_variable structure. - * Added macro definitions of db_{read,write}_variable. - * [91/08/29 tak] - * - * Revision 2.4 91/05/14 15:37:12 mrt - * Correcting copyright - * - * Revision 2.3 91/02/05 17:07:23 mrt - * Changed to new Mach copyright - * [91/01/31 16:19:54 mrt] - * - * Revision 2.2 90/08/27 21:53:40 dbg - * Modularized typedef name. Documented the calling sequence of - * the (optional) access function of a variable. Now the valuep - * field can be made opaque, eg be an offset that fcn() resolves. - * [90/08/20 af] - * - * Created. - * [90/07/25 dbg] - * - */ -/* CMU_ENDHIST */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. 
- * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ - -#ifndef _DDB_DB_VARIABLES_H_ -#define _DDB_DB_VARIABLES_H_ - -#include -#include /* For db_expr_t */ - - -#define DB_VAR_LEVEL 3 /* maximum number of suffix level */ - -/* - * auxiliary parameters passed to a variable handler - */ -struct db_var_aux_param { - char *modif; /* option strings, must be TOK_STRING_SIZE */ - short level; /* number of levels */ - short hidden_level; /* hidden level */ - short suffix[DB_VAR_LEVEL]; /* suffix */ - thread_t thr_act; /* target thr_act */ -}; - -typedef struct db_var_aux_param *db_var_aux_param_t; - - -/* - * Debugger variables. - */ -struct db_variable { - const char *name; /* Name of variable */ - db_expr_t *valuep; /* pointer to value of variable */ - /* function to call when reading/writing */ - int (*fcn)(struct db_variable *,db_expr_t *,int,db_var_aux_param_t); - short min_level; /* number of minimum suffix levels */ - short max_level; /* number of maximum suffix levels */ - short low; /* low value of level 1 suffix */ - short high; /* high value of level 1 suffix */ - boolean_t hidden_level; /* is there a hidden suffix level ? */ - short hidden_low; /* low value of hidden level */ - short hidden_high; /* high value of hidden level */ - int *hidden_levelp; /* value of current hidden level */ - boolean_t precious; /* print old value when affecting ? */ -#define DB_VAR_GET 0 -#define DB_VAR_SET 1 -#define DB_VAR_SHOW 2 -}; - -typedef struct db_variable *db_variable_t; - -#define DB_VAR_NULL (db_variable_t)0 - -#define FCN_NULL ((int (*)(struct db_variable *, \ - db_expr_t *, \ - int, \ - db_var_aux_param_t)) 0) - -#define DB_VAR_LEVEL 3 /* maximum number of suffix level */ -#define DB_MACRO_LEVEL 5 /* max macro nesting */ -#define DB_MACRO_NARGS 10 /* max args per macro */ - -#define db_read_variable(vp, valuep) \ - db_read_write_variable(vp, valuep, DB_VAR_GET, 0) -#define db_write_variable(vp, valuep) \ - db_read_write_variable(vp, valuep, DB_VAR_SET, 0) - - -extern struct db_variable db_vars[]; /* debugger variables */ -extern struct db_variable *db_evars; -extern struct db_variable db_regs[]; /* machine registers */ -extern struct db_variable *db_eregs; - -#if defined(ALTERNATE_REGISTER_DEFS) - -extern struct db_variable db_altregs[]; /* alternate machine regs */ -extern struct db_variable *db_ealtregs; - -#endif /* defined(ALTERNATE_REGISTER_DEFS) */ - -/* Prototypes for functions exported by this module. 
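The min_level/max_level/low/high fields of struct db_variable above implement numeric suffixes: a name like $work5 selects element 5 of the variable's backing array, with low/high bounding the accepted index. A tiny sketch of that suffix lookup, simplified to a single level (the real db_get_suffix also handles dot-separated multi-level suffixes and a trailing :modifier string):

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    #define NWORK 32
    static long work[NWORK];     /* like ddb's $work0 .. $work31 */

    /* Parse "work<N>" and return &work[N], or NULL if malformed/out of range. */
    static long *lookup_work(const char *name)
    {
        int n = 0;
        if (strncmp(name, "work", 4) != 0)
            return NULL;
        for (name += 4; isdigit((unsigned char)*name); name++)
            n = n * 10 + (*name - '0');
        return (*name == '\0' && n < NWORK) ? &work[n] : NULL;
    }

    int main(void)
    {
        long *p = lookup_work("work7");
        if (p != NULL) {
            *p = 42;
            printf("work7 = %ld\n", work[7]);
        }
        return 0;
    }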
- */ - -int db_get_variable(db_expr_t *valuep); - -void db_read_write_variable( - struct db_variable *vp, - db_expr_t *valuep, - int rw_flag, - db_var_aux_param_t ap); - -void db_set_cmd(void); - -void db_show_one_variable(void); - -void db_show_variable(db_expr_t, boolean_t, db_expr_t, char *); - -db_variable_t db_find_reg_name(char *s); - -#endif /* !_DDB_DB_VARIABLES_H_ */ diff --git a/osfmk/ddb/db_watch.c b/osfmk/ddb/db_watch.c deleted file mode 100644 index f4a740028..000000000 --- a/osfmk/ddb/db_watch.c +++ /dev/null @@ -1,366 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: Richard P. Draves, Carnegie Mellon University - * Date: 10/90 - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include /* For db_printf() */ -#include /* For db_single_step() */ - -/* - * Watchpoints. 
- */ - -boolean_t db_watchpoints_inserted = TRUE; - -#define NWATCHPOINTS 100 -struct db_watchpoint db_watch_table[NWATCHPOINTS]; -db_watchpoint_t db_next_free_watchpoint = &db_watch_table[0]; -db_watchpoint_t db_free_watchpoints = 0; -db_watchpoint_t db_watchpoint_list = 0; - -extern vm_map_t kernel_map; - - - -/* Prototypes for functions local to this file. XXX -- should be static. - */ - -db_watchpoint_t db_watchpoint_alloc(void); - -void db_watchpoint_free(register db_watchpoint_t watch); - -void db_set_watchpoint( - task_t task, - db_addr_t addr, - vm_size_t size); - -void db_delete_watchpoint( - task_t task, - db_addr_t addr); - -static int db_get_task( - char *modif, - task_t *taskp, - db_addr_t addr); - -void db_list_watchpoints(void); - - - -db_watchpoint_t -db_watchpoint_alloc(void) -{ - register db_watchpoint_t watch; - - if ((watch = db_free_watchpoints) != 0) { - db_free_watchpoints = watch->link; - return (watch); - } - if (db_next_free_watchpoint == &db_watch_table[NWATCHPOINTS]) { - db_printf("All watchpoints used.\n"); - return (0); - } - watch = db_next_free_watchpoint; - db_next_free_watchpoint++; - - return (watch); -} - -void -db_watchpoint_free(register db_watchpoint_t watch) -{ - watch->link = db_free_watchpoints; - db_free_watchpoints = watch; -} - -void -db_set_watchpoint( - task_t task, - db_addr_t addr, - vm_size_t size) -{ - register db_watchpoint_t watch; - - /* - * Should we do anything fancy with overlapping regions? - */ - - for (watch = db_watchpoint_list; watch != 0; watch = watch->link) { - if (watch->task == task && - (watch->loaddr == addr) && - (watch->hiaddr == addr+size)) { - db_printf("Already set.\n"); - return; - } - } - - watch = db_watchpoint_alloc(); - if (watch == 0) { - db_printf("Too many watchpoints.\n"); - return; - } - - watch->task = task; - watch->loaddr = addr; - watch->hiaddr = addr+size; - - watch->link = db_watchpoint_list; - db_watchpoint_list = watch; - - db_watchpoints_inserted = FALSE; -} - -void -db_delete_watchpoint( - task_t task, - db_addr_t addr) -{ - register db_watchpoint_t watch; - register db_watchpoint_t *prev; - - for (prev = &db_watchpoint_list; (watch = *prev) != 0; - prev = &watch->link) { - if (watch->task == task && - (watch->loaddr <= addr) && - (addr < watch->hiaddr)) { - *prev = watch->link; - db_watchpoint_free(watch); - return; - } - } - - db_printf("Not set.\n"); -} - -void -db_list_watchpoints(void) -{ - register db_watchpoint_t watch; - int task_id; - - if (db_watchpoint_list == 0) { - db_printf("No watchpoints set\n"); - return; - } - - db_printf("Space Address Size\n"); - for (watch = db_watchpoint_list; watch != 0; watch = watch->link) { - if (watch->task == TASK_NULL) - db_printf("kernel "); - else { - task_id = db_lookup_task(watch->task); - if (task_id < 0) - db_printf("%*X", 2*sizeof(vm_offset_t), watch->task); - else - db_printf("task%-3d ", task_id); - } - db_printf(" %*X %X\n", 2*sizeof(vm_offset_t), watch->loaddr, - watch->hiaddr - watch->loaddr); - } -} - -static int -db_get_task( - char *modif, - task_t *taskp, - db_addr_t addr) -{ - task_t task = TASK_NULL; - db_expr_t value; - boolean_t user_space; - - user_space = db_option(modif, 'T'); - if (user_space) { - if (db_expression(&value)) { - task = (task_t)(unsigned long)value; - if (db_lookup_task(task) < 0) { - db_printf("bad task address %X\n", task); - return(-1); - } - } else { - task = db_default_task; - if (task == TASK_NULL) { - if ((task = db_current_task()) == TASK_NULL) { - db_printf("no task\n"); - return(-1); - } - } - } - } - if 
(!DB_VALID_ADDRESS(addr, user_space)) { - db_printf("Address %#X is not in %s space\n", addr, - (user_space)? "user": "kernel"); - return(-1); - } - *taskp = task; - return(0); -} - -/* Delete watchpoint */ -void -db_deletewatch_cmd(db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, char *modif) -{ - task_t task; - - if (db_get_task(modif, &task, addr) < 0) - return; - db_delete_watchpoint(task, addr); -} - -/* Set watchpoint */ -void -db_watchpoint_cmd(db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, char *modif) -{ - vm_size_t size; - db_expr_t value; - task_t task; - - if (db_get_task(modif, &task, addr) < 0) - return; - if (db_expression(&value)) - size = (vm_size_t) value; - else - size = sizeof(int); - db_set_watchpoint(task, addr, size); -} - -/* list watchpoints */ -void -db_listwatch_cmd(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, __unused char *modif) -{ - db_list_watchpoints(); -} - -void -db_set_watchpoints(void) -{ - register db_watchpoint_t watch; - vm_map_t map; - - if (!db_watchpoints_inserted) { - for (watch = db_watchpoint_list; watch != 0; watch = watch->link) { - map = (watch->task)? watch->task->map: kernel_map; - pmap_protect(map->pmap, - vm_map_trunc_page(watch->loaddr), - vm_map_round_page(watch->hiaddr), - VM_PROT_READ); - } - db_watchpoints_inserted = TRUE; - } -} - -void -db_clear_watchpoints(void) -{ - db_watchpoints_inserted = FALSE; -} - -boolean_t -db_find_watchpoint( - vm_map_t map, - db_addr_t addr, - db_regs_t *regs) -{ - register db_watchpoint_t watch; - db_watchpoint_t found = 0; - register task_t task_space; - - task_space = (vm_map_pmap(map) == kernel_pmap)? - TASK_NULL: db_current_space(); - for (watch = db_watchpoint_list; watch != 0; watch = watch->link) { - if (watch->task == task_space) { - if ((watch->loaddr <= addr) && (addr < watch->hiaddr)) - return (TRUE); - else if ((trunc_page(watch->loaddr) <= addr) && - (addr < round_page(watch->hiaddr))) - found = watch; - } - } - - /* - * We didn't hit exactly on a watchpoint, but we are - * in a protected region. We want to single-step - * and then re-protect. - */ - - if (found) { - db_watchpoints_inserted = FALSE; - db_single_step(regs, task_space); - } - - return (FALSE); -} diff --git a/osfmk/ddb/db_watch.h b/osfmk/ddb/db_watch.h deleted file mode 100644 index 6ec420e36..000000000 --- a/osfmk/ddb/db_watch.h +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
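db_set_watchpoints and db_find_watchpoint above implement watchpoints by page protection rather than debug registers: pmap_protect drops each watched page to VM_PROT_READ, any write to the page traps into the debugger, and a write that merely shares a page with a watchpoint is replayed by single-stepping and re-protecting. The arm/fault/unprotect part of that cycle can be sketched in user space with mprotect and a fault handler (a sketch only: the single-step/re-protect half is omitted, and macOS reports this fault as SIGBUS while Linux uses SIGSEGV):

    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static char *page;
    static size_t pagesize;

    /* Fault handler: report the hit, unprotect, let the write retry. */
    static void on_fault(int sig, siginfo_t *si, void *ctx)
    {
        (void)sig; (void)si; (void)ctx;
        static const char msg[] = "watchpoint hit\n";
        write(2, msg, sizeof msg - 1);            /* async-signal-safe */
        mprotect(page, pagesize, PROT_READ | PROT_WRITE);
    }

    int main(void)
    {
        struct sigaction sa;

        pagesize = (size_t)sysconf(_SC_PAGESIZE);
        page = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANON, -1, 0);
        if (page == MAP_FAILED)
            return 1;

        memset(&sa, 0, sizeof sa);
        sa.sa_sigaction = on_fault;
        sa.sa_flags = SA_SIGINFO;
        sigaction(SIGSEGV, &sa, NULL);
        sigaction(SIGBUS, &sa, NULL);

        mprotect(page, pagesize, PROT_READ);      /* arm the watchpoint */
        page[5] = 'x';                            /* traps; handler disarms */
        printf("write completed: %c\n", page[5]);
        return 0;
    }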
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.6.1 1994/09/23 01:23:04 ezf - * change marker to not FREE - * [1994/09/22 21:11:39 ezf] - * - * Revision 1.1.2.4 1993/07/27 18:28:34 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:13:34 elliston] - * - * Revision 1.1.2.3 1993/06/07 22:07:00 jeffc - * CR9176 - ANSI C violations: trailing tokens on CPP - * directives, extra semicolons after decl_ ..., asm keywords - * [1993/06/07 18:57:38 jeffc] - * - * Revision 1.1.2.2 1993/06/02 23:13:21 jeffc - * Added to OSF/1 R1.3 from NMK15.0. - * [1993/06/02 20:57:59 jeffc] - * - * Revision 1.1 1992/09/30 02:24:28 robert - * Initial revision - * - * $EndLog$ - */ -/* CMU_HIST */ -/* - * Revision 2.5 91/10/09 16:04:47 af - * Revision 2.4.3.1 91/10/05 13:09:14 jeffreyh - * Changed "map" field of db_watchpoint structure to "task", - * and also changed paramters of function declarations. - * [91/08/29 tak] - * - * Revision 2.4.3.1 91/10/05 13:09:14 jeffreyh - * Changed "map" field of db_watchpoint structure to "task", - * and also changed paramters of function declarations. - * [91/08/29 tak] - * - * Revision 2.4 91/05/14 15:37:46 mrt - * Correcting copyright - * - * Revision 2.3 91/02/05 17:07:31 mrt - * Changed to new Mach copyright - * [91/01/31 16:20:09 mrt] - * - * Revision 2.2 90/10/25 14:44:21 rwd - * Generalized the watchpoint support. - * [90/10/16 rwd] - * Created. - * [90/10/16 rpd] - * - */ -/* CMU_ENDHIST */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 10/90 - */ - -#ifndef _DDB_DB_WATCH_H_ -#define _DDB_DB_WATCH_H_ - -#include -#include -#include - -/* - * Watchpoint. 
- */ - -typedef struct db_watchpoint { - task_t task; /* in this map */ - db_addr_t loaddr; /* from this address */ - db_addr_t hiaddr; /* to this address */ - struct db_watchpoint *link; /* link in in-use or free chain */ -} *db_watchpoint_t; - - - -/* Prototypes for functions exported by this module. - */ - -void db_deletewatch_cmd(db_expr_t, boolean_t, db_expr_t, char *); - -void db_watchpoint_cmd(db_expr_t, boolean_t, db_expr_t, char *); - -void db_listwatch_cmd(db_expr_t, boolean_t, db_expr_t, char *); - -void db_clear_watchpoints(void); - -void db_set_watchpoints(void); - -boolean_t db_find_watchpoint( - vm_map_t map, - db_addr_t addr, - db_regs_t *regs); - -#endif /* !_DDB_DB_WATCH_H_ */ diff --git a/osfmk/ddb/db_write_cmd.c b/osfmk/ddb/db_write_cmd.c deleted file mode 100644 index 00b00e513..000000000 --- a/osfmk/ddb/db_write_cmd.c +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. 
Golub, Carnegie Mellon University - * Date: 7/90 - */ - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include /* For db_printf() */ - -/* - * Write to file. - */ -void -db_write_cmd(db_expr_t address, __unused boolean_t have_addr, - __unused db_expr_t count, char *modif) -{ - register db_addr_t addr; - register db_expr_t old_value; - db_expr_t new_value; - register int size; - boolean_t wrote_one = FALSE; - boolean_t t_opt, u_opt; - thread_t thr_act; - task_t task; - - addr = (db_addr_t) address; - - size = db_size_option(modif, &u_opt, &t_opt); - - if (t_opt) - { - if (!db_get_next_act(&thr_act, 0)) - return; - task = thr_act->task; - } - else - task = db_current_space(); - - /* if user space is not explicitly specified, - look in the kernel */ - if (!u_opt) - task = TASK_NULL; - - if (!DB_VALID_ADDRESS(addr, u_opt)) { - db_printf("Bad address 0x%llx\n", (unsigned long long)addr); - return; - } - - while (db_expression(&new_value)) { - old_value = db_get_task_value(addr, size, FALSE, task); - db_task_printsym(addr, DB_STGY_ANY, task); - db_printf("\t\t%#8lln\t=\t%#8lln\n", (unsigned long long)old_value, (unsigned long long)new_value); - db_put_task_value(addr, size, new_value, task); - addr += size; - - wrote_one = TRUE; - } - - if (!wrote_one) - db_error("Nothing written.\n"); - - db_next = addr; - db_prev = addr - size; -} diff --git a/osfmk/ddb/db_write_cmd.h b/osfmk/ddb/db_write_cmd.h deleted file mode 100644 index 1987c278b..000000000 --- a/osfmk/ddb/db_write_cmd.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.6.1 1994/09/23 01:23:27 ezf - * change marker to not FREE - * [1994/09/22 21:11:46 ezf] - * - * Revision 1.1.2.3 1993/09/17 21:34:44 robert - * change marker to OSF_FREE_COPYRIGHT - * [1993/09/17 21:27:30 robert] - * - * Revision 1.1.2.2 1993/07/27 18:28:41 elliston - * Add ANSI prototypes. CR #9523. 
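db_write_cmd above is essentially a poke loop: for each expression parsed it fetches the old value at the current address, stores the new one, echoes both, and advances by the operand size, finally leaving db_next/db_prev set so the command can be continued. A small sketch of that loop over an ordinary byte buffer (hypothetical helpers; the little-endian accessors stand in for db_get_task_value/db_put_task_value):

    #include <stdio.h>

    /* Write values at successive offsets of buf, echoing old and new. */
    static void write_words(unsigned char *buf, size_t addr,
                            const unsigned *vals, size_t n, size_t size)
    {
        for (size_t i = 0; i < n; i++, addr += size) {
            unsigned old = 0, nv = vals[i];
            for (size_t b = 0; b < size; b++)     /* fetch old value */
                old |= (unsigned)buf[addr + b] << (8 * b);
            for (size_t b = 0; b < size; b++)     /* store new value */
                buf[addr + b] = (unsigned char)(nv >> (8 * b));
            printf("%#zx:\t%#x\t=\t%#x\n", addr, old, nv);
        }
    }

    int main(void)
    {
        unsigned char mem[16] = {0};
        unsigned vals[] = { 0xdead, 0xbeef };
        write_words(mem, 0, vals, 2, sizeof(short));
        return 0;
    }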
- * [1993/07/27 18:13:41 elliston] - * - * $EndLog$ - */ -#ifndef _DDB_DB_WRITE_CMD_H_ -#define _DDB_DB_WRITE_CMD_H_ - -#include - -/* Prototypes for functions exported by this module. - */ -void db_write_cmd( - db_expr_t address, - boolean_t have_addr, - db_expr_t count, - char * modif); - -#endif /* !_DDB_DB_WRITE_CMD_H_ */ diff --git a/osfmk/ddb/makedis.c b/osfmk/ddb/makedis.c deleted file mode 100644 index a33bf216e..000000000 --- a/osfmk/ddb/makedis.c +++ /dev/null @@ -1,2386 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.2.1 1997/03/27 18:46:52 barbou - * Created. - * [1997/03/27 13:58:42 barbou] - * - * $EndLog$ - */ - -/* makedis.c - make a disassembler. */ - -/* , - By Eamonn McManus , April 1995. - Copyright 1995 by Eamonn McManus. Non-commercial use is permitted. */ - -/* DESCRIPTION - - This program generates a disassembler in C from a file describing the - opcodes of the machine in question. Lines in the description file are - either comments beginning with #, or contain three fields, with the - first two being terminated by space and the third containing the rest - of the line. Long logical lines can be split onto several physical - lines by ending each one except the last with a \. A logical line - can also be split immediately after a |. Unlike \, | is considered - part of the logical line. Leading spaces on continuation lines - following either \ or | are ignored. - - Here is a concise description of the meanings of the three fields. - Examples later will make it clearer what they are used for. - - The first field of the three is a function name. This will produce - a function or array of the same name in the C output, so it should - not conflict with other identifiers or C keywords. By default the - function named returns a string (a (char *) in C), but if the first - field is preceded by %, the function returns an unsigned long - integer. - - The second field describes the arguments of the function. 
It consists - of two parts, either but not both of which may be omitted. The first - part is a string which is a bitmask describing the first argument of - the function. Each character of the string represents one bit, - with the least significant bit being the last. A character can be - 0 or 1, representing that constant value, or a letter, representing - part of a bitfield. A given bitfield consists of all of the - contiguous bits containing the same letter. Upper and lower case - letters are considered different. - - The second part of the second field is a list of parameters - describing the parameters of the function, or the parameters after - the first if the bitfield part was present. The list is contained - in parentheses () and the individual parameters are separated by - commas. Spaces are not allowed. Each parameter name is a single - letter, optionally preceded by %. The parameter is an unsigned - long integer if % is present, otherwise a string. Again, upper and - lower case parameter names are different. - - The third field describes the value of the function. If a bitmask - is present in the second field and it contains constant bits (0s or - 1s), then the third field is the value of the function only in the - case where its first argument contains matching values in those bit - positions. There can be many different lines naming the same - function but with different bitpatterns. The generated C code will - arrange to return the value corresponding to the pattern that - matches the actual first argument of the function when it is - called. This argument should not have bits set in positions beyond - those present in the bitpattern. - - It is only allowed for two different lines to name the same function - if there is a bitstring in the second field. It is not allowed for - two such lines to specify exactly the same constant bit values. But - it is allowed for a line to have all the same constant bit values as - another plus some extra constant values. In this case the more - specific line applies when all of its constant bits match, and - otherwise the less specific line applies. - - Apart from the contents of the bitstring, the second field must be - identical on every line referring to a given function, and the - bitstring must always be of the same length. - - For string-valued functions, the third field is the string value. - For integer-valued functions, it is a C integer expression - generating the value. In both cases there may be several special - values: - - - A $ followed by a single letter is replaced by the value of the - argument or bitfield with that name. The value of a bitfield is - shifted as if that bitfield were in the least-significant bit - position. Thus, a single-bit field always has value 0 or 1. - - - A $ followed by the name of a function and an argument list in - parentheses () is replaced by the value returned by the function - with those arguments. An integer value cannot be inserted into a - string without being converted by a function, nor can a string - value be used in an integer expression. - - - A $ followed by a bitstring enclosed in [] is replaced by the - value of that bitstring. The bitstring has the same syntax as in - the second field, described above. Each contiguous sequence of - the same repeated letter in the bitstring is replaced by the - value of the argument or bitfield-argument with that name, - shifted into the appropriate position. 
- - - A list of strings, separated by |, enclosed in - {}, and followed by an integer expression enclosed in [], is - replaced by the string in the list whose number matches the value - of the expression. The first string in the list is numbered 0. - If there is no string corresponding to the value of the - expression, the behaviour is undefined. The strings in the list - may themselves contain $ or {} operations. - - - A \ followed by any character is replaced by that - character, without regard to any meaning it may usually have. - This is used to obtain strings containing characters such as - {, $, or \. The use of backslash to split long logical - lines takes precedence over this use, so \\ should not appear - at the end of a line. - - The third field may also be a lone colon ":", in which case the - function is assumed to be defined externally and only a function - declaration (prototype) is generated. - - - EXAMPLES - - Here are some examples from the description file for the Z80 - microprocessor. This processor has 8-bit opcodes which are - disassembled by a generated function "inst" which looks like this: - - typedef unsigned long bits; - char *inst(bits code) {...} - - The simplest sort of line in the description file is one that looks - like this: - - inst 01110110 halt - - The first field names the function, "inst". The second field - implies that that function has exactly one argument which is an - integer, and that this line specifies the value of the function - when this integer has the binary value 01110110 (hex 0x76). This - value will be the string "halt". - - A more complex line is one looking like this: - - inst 001aa111 {daa|cpl|scf|ccf}[$a] - - This line is compatible with the previous one, because it has the - same number of bits and the constant bits are different. It - specifies the value of inst when its argument looks like - 001aa111, i.e., for the binary values - 00100111, - 00101111, - 00110111, and - 00111111. The value of $a for these four values will be - respectively binary 00, 01, 10, 11, i.e., 0 to 3. The - corresponding values of the inst function will be "daa", "cpl", - "scf", and "ccf". - - The description defines a helper function "reg8" like this: - - reg8 rrr {b|c|d|e|h|l|(hl)|a}[$r] - - This simply selects one of the eight strings between {} depending - on the value of the argument, which is assumed to be a three-bit - value. This could just as easily have been written: - - reg8 (%r) {b|c|d|e|h|l|(hl)|a}[$r] - - The generated C code is the same -- in each case makedis realises - that the function can be represented by an array rather than - compiling a C function. - - The reg8 function is used in lines like this one: - - inst 01rrrsss ld $reg8($r),$reg8($s) - - Thus if the argument to inst is - 01010011 - then $r is 010 (2) and $s is 011 (3). Since reg8(2) is "d" and - reg8(3) is "e", the value of inst with this argument will be the - string "ld d,e". - - Note that the opcode for "halt" given above matches this pattern, - but because the bitpattern for "halt" is more specific (has more - constant bits) it is the one chosen when the argument is 01110110. - - The description also uses an external C function "hexprint" defined - like this: - - char *hexprint(bits digits, bits n) { - char *p = dis_alloc(digits + 1); - sprintf(p, "%0*lx", (int) digits, n); - return p; - } - - The value of this function is a string containing the number n - spelt out in hex with "digits" digits. 
In the description - file this function is declared like this: - - hexprint (%w,%n) : - - The names of the parameters are not important in this case as long - as they are letters and are different from each other. - - The hexprint function is used in lines like this one: - - inst 11vvv111 rst $hexprint(2,$v << 3) - - If the argument to inst is - 11011111 - then $v is 011 (3) and the arguments to hexprint are 2 and (3 << 3), - i.e., 0x18. So the value of inst with this argument will be the - string "rst 18". - - Instead of writing $v << 3, it would be possible to write - $[00vvv000]. For instance when $v is binary 011, this becomes - 00011000. The leading 0s could be omitted. - - The $[...] operation is particularly useful for moving bits around. - For instance, the HP PA-RISC opcodes contain bits assigned to - apparently random parts of the instruction word. One of the helper - functions in its description file looks like this: - - im21l aaaaabbccddddddddddde l'$hex($[edddddddddddbbaaaaacc00000000000]) - - So 111110011000000000001 produces 10000000000000111111100000000000. - - The $[...] operation can also be used to spell out binary constants, - since C has no syntax for this. - - - ...More to come... */ - -/* To do: - - More error detection, e.g., bitstring or arg not used in entry. - - Better error recovery -- nearly all errors are currently fatal. - - Clean up type handling, which is somewhat haphazard. It works but there - is stuff that is surely redundant. - - Make generated functions void by default, with $ prefix to indicate - string-value. In a void function, instead of returning a string (or - integer) it would be output via a user-supplied function. - - Further optimise and tidy generated code, e.g.: arrays of one-character - strings could be replaced by arrays of characters; switches with just - one case could be replaced by ifs. - */ - -#include <assert.h> -#include <ctype.h> -#include <errno.h> -#include <stddef.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#define MAXfunction 32 /* Max function name length. */ -#define MAXBITS 32 /* Max bitstring length. */ -typedef unsigned long bits; -enum type {T_ERROR, T_UNKNOWN, T_INTEGER, T_STRING}; -const char *const typename[] = {"error", "unknown", "integer", "string"}; -enum walkstringop {COUNTARRAYS, DECLAREARRAYS, COMPILEARRAYS}; -char *bitstype = "unsigned long"; - -int maxfunctionname, maxargwidth; -char *progname = "makedis"; -char **global_argv; -char *filename; -char *headerfilename; -FILE *headerfile; -int lineno; -int indentation; -int debug, dump, warnings; - -/* componentbits has a 1 bit for every possible number of strings we may want - to concatenate together at some stage. A separate C function is compiled - for each such case. 
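   For example, when some value is the concatenation of two strings, the
   generated output contains a helper equivalent to this sketch (reconstructed
   from outputconcat() below rather than copied from real output):

        char *dis_concat2(char *p0, char *p1)
        {
            void *p;
            int len = strlen(p0) + strlen(p1);
            p = dis_alloc(len + 1);            /* bump allocator from concatpreamble */
            return strcat(strcpy(p, p0), p1);
        }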
*/ -bits componentbits; - - -struct entry; -struct arg; -struct string; -struct functioncall; -struct array; -struct bits; -struct bitsplice; - - -int main(int argc, char **argv); -int makedis(FILE *f, char *fname); -struct function *findfunction(char *function); -int parseextern(struct function *fp, FILE *f); -struct function *makefunction(char *function); -int parsebits(struct function *fp, char *bitstring, int nbits); -int parseentrybits(struct entry *ep, char *bitstring, int nbits, int issplice); -int parsecontrol(char *name, char *value); -int parseargs(struct function *fp, FILE *f, int *cp); -int parsestring(struct function *fp, char *str); -enum type makestring(struct function *fp, struct string **stringlink, - char **stringp, char *magic, enum type targettype); -int parsedollar(struct function *fp, char **stringp, struct string *sp); -int parsebitsplice(struct function *fp, char *bitstring, int nbits, - struct string *sp); -int findvariable(struct function *fp, int name, struct string *sp); -int parsefunctioncall(struct function *fp, char *start, char **stringp, - struct string *sp); -int parsearray(struct function *fp, char **stringp, struct string *sp, - enum type t); -void dumpfunctions(void); -void dumpfunction(struct function *fp); -void showentry(FILE *f, struct function *fp, struct entry *ep, bits highlight); -void showbits(FILE *f, struct entry *ep, int nbits, bits highlight); -void showargs(FILE *f, struct arg *ap, int fieldwidth); -void showstring(FILE *f, struct string *sp); -void showstringelement(FILE *f, struct string *sp); -void showfunctioncall(FILE *f, struct functioncall *fcp); -void showarray(FILE *f, struct array *ap); -int outputfunctions(void); -void outputidentity(FILE *f); -int outputdeclarations(void); -void outputconcats(void); -void outputconcat(int n); -void outputconcatheader(FILE *f, int n); -void findarrays(void); -int checkfixedlength(struct array *ap); -int outputfunction(struct function *fp); -void functionarray(struct function *fp); -void functionheader(FILE *f, struct function *fp); -int simplearray(struct array *ap); -void compiletype(FILE *f, enum type *tp); -int functionswitch(struct function *fp, bits mask, bits value); -int compilestring(int assignto, struct string *sp, enum type type); -int compilecheckedstring(int assignto, struct string *sp, enum type type); -void compileassign(int assignto); -void compiletemp(int tempno); -void compiletext(char *s); -int compileconcat(struct string *sp, enum type type); -int compilenull(enum type type); -int compilesimple(struct string *sp, enum type type); -int compilearrayref(struct array *ap); -int compilefunctioncall(struct string *sp); -int walkstring(struct string *sp, enum walkstringop op, int tempno); -int compilearray(struct array *ap); -void compilesimplearray(enum type *tp, char *name, int num, struct array *ap); -void declarearray(struct array *ap); -void compilebitstring(struct bits *bp); -void compilebitsplice(struct bitsplice *splicep); -int bitcount(bits x); -bits allbitsset(int nbits); -void findent(FILE *f); -void indent(void); -void *xrealloc(char *oldp, size_t size); -void *xmalloc(size_t size); -void *xstrdup(char *s); -int prematureeof(void); - - -int main(int argc, char **argv) { - int i; - FILE *f; - - global_argv = argv; - if (argc > 0) - progname = argv[0]; - for (i = 1; i < argc && argv[i][0] == '-'; i++) { - switch (argv[i][1]) { - case 'h': - if (++i >= argc) - goto Usage; - headerfilename = argv[i]; break; - case 'd': - debug = 1; break; - case 'D': - dump = 1; break; - case 
'w': - warnings = 1; break; - default: -Usage: - fprintf(stderr, "Usage: %s [file]\n", progname); - return 1; - } - } - if (i == argc) - return makedis(stdin, ""); - if (i + 1 != argc) - goto Usage; - if ((f = fopen(argv[i], "r")) == NULL) { - fprintf(stderr, "%s: %s: %s\n", progname, argv[i], strerror(errno)); - return 1; - } - return makedis(f, argv[i]); -} - - -int makedis(FILE *f, char *fname) { - int c, i; - char function[MAXfunction], bitstring[MAXBITS]; - static char *string = NULL; - int stringlen = 0; - struct function *fp; - - filename = fname; - lineno = 1; - /* Loop for every line in the description. */ - while (1) { - /* Ignore initial spaces and newlines. */ - while (isspace(c = getc(f))) - if (c == '\n') - lineno++; - if (c == EOF) - break; - - /* Ignore comments. # only allowed at start of line. */ - if (c == '#') { - while ((c = getc(f)) != '\n') - if (c == EOF) - return prematureeof(); - lineno++; - continue; - } - - /* Read function name, terminated by space. */ - for (i = 0; i < sizeof function && !isspace(c); i++, c = getc(f)) { - if (c == EOF) - return prematureeof(); - function[i] = c; - } - if (i >= sizeof function) { - fprintf(stderr, "%s: %s(%d): function name is too long: %.*s\n", - progname, filename, lineno, i, function); - return 1; - } - function[i] = '\0'; - - /* Skip to next field. */ - while (isspace(c) && c != '\n') - c = getc(f); - - /* If not a control statement, read bitstring and/or arguments. */ - if (function[0] == ':') - fp = 0; /* Silence gcc. */ - else { - fp = makefunction(function); - if (fp == NULL) - return 1; - - /* Read optional bitstring. */ - for (i = 0; i < sizeof bitstring && isalnum(c); i++, c = getc(f)) { - if (c == EOF) - return prematureeof(); - bitstring[i] = c; - } - if (isalnum(c)) { - fprintf(stderr, "%s: %s(%d): bit string is too long: %.*s\n", - progname, filename, lineno, i, bitstring); - return 1; - } - if (parsebits(fp, bitstring, i) != 0) - return 1; - - /* Read optional arguments. */ - if (parseargs(fp, f, &c) != 0) - return 1; - - /* Skip to next field. */ - while (isspace(c) && c != '\n') - c = getc(f); - - /* : indicates an external (C) function. */ - if (c == ':') { - if (parseextern(fp, f) != 0) - return 1; - continue; - } - } - - /* Read associated text. */ - i = 0; - while (1) { - for ( ; c != '\n'; i++, c = getc(f)) { - if (c == EOF) - return prematureeof(); - if (i >= stringlen) { - stringlen = stringlen * 2 + 16; - string = xrealloc(string, stringlen); - } - string[i] = c; - } - lineno++; - if (i > 0) { - switch (string[i - 1]) { - case '\\': - i--; - /* Fall in... */ - case '|': - while (isspace(c = getc(f)) && c != '\n') ; - continue; - } - } - break; - } - if (i >= stringlen) { - stringlen = stringlen * 2 + 16; - string = xrealloc(string, stringlen); - } - string[i] = '\0'; - - /* Parse the line just read. */ - if (function[0] == ':') { - if (parsecontrol(function + 1, string) != 0) - return 1; - } else { - if (parsestring(fp, string) != 0) - return 1; - } - } - if (dump) - dumpfunctions(); - return outputfunctions(); -} - - -/* A function in the description file. nbits and nargs are -1 until the - real values are known. */ -struct function { - struct function *next; - char *name; - enum type type; - int nbits; /* Number of bits in the bitpattern, 0 if none. */ - int nargs; /* Number of (x,y,...) parameters, 0 if none. */ - char isarray; /* Will be represented by a C array. */ - int fixedlength; /* If a C array, will be a char [][N] not a char *[]. 
*/ - struct entry *first, *last; - /* Links to the value(s) supplied. */ - struct arg *args; /* List of (x,y,...) names and types. */ -}; -struct function *functions; - - -/* Find the function with the given name. If not found, create a structure - for it, fill it out with a template, and return that. */ -struct function *findfunction(char *name) { - struct function *fp; - - for (fp = functions; fp != NULL; fp = fp->next) { - if (strcmp(fp->name, name) == 0) - return fp; - } - if (strlen(name) > maxfunctionname) - maxfunctionname = strlen(name); - fp = xmalloc(sizeof *fp); - fp->next = functions; - functions = fp; - fp->name = xstrdup(name); - fp->type = T_UNKNOWN; - fp->nbits = fp->nargs = -1; /* nbits will be set correctly later. */ - fp->isarray = 0; - fp->first = fp->last = NULL; - return fp; -} - - -/* Parse an external (C) function declaration. This will look something like: - malloc (%s) : - We're called just after seeing the ':'. - Return 0 if parsing is successful, 1 otherwise. */ -int parseextern(struct function *fp, FILE *f) { - int c; - - if ((c = getc(f)) != '\n') { - fprintf(stderr, - "%s: %s(%d): extern declaration should be a lone `:'\n", - progname, filename, lineno); - return 1; - } - if (fp->nbits != 0) { - fprintf(stderr, - "%s: %s(%d): extern functions should not have bitstrings\n", - progname, filename, lineno); - return 1; - } - free(fp->first); - fp->first = fp->last = NULL; - return 0; -} - - -/* A value supplied for a function (the third field in a description line). - In general there can be any number of such values, differing in the - bitpattern supplied. The mask and value fields describe the constant - bits in the bitpattern: mask indicates which bits they are and value - indicates the values of those bits. So this entry matches - ((x & mask) == value). */ -struct entry { - struct entry *next; - bits mask, value; - struct bits *bits; /* List of named bitfields. */ - struct string *string; /* Value of function when bitpattern matched. */ - char done; /* This entry has already been compiled. */ -}; - - -/* We've just seen a definition of function "name". Make a structure for it - if necessary, and a template entry that will describe the value given here. - */ -struct function *makefunction(char *name) { - struct function *fp; - struct entry *ep = xmalloc(sizeof *ep); - enum type type; - - if (name[0] == '%') { - name++; - type = T_INTEGER; - } else - type = T_STRING; - fp = findfunction(name); - if (fp->type == T_UNKNOWN) - fp->type = type; - else if (fp->type != type) { - fprintf(stderr, "%s: %s(%d): function %s previously declared as %s, " - "here as %s\n", progname, filename, lineno, name, - typename[fp->type], typename[type]); - return NULL; - } - ep->next = NULL; - ep->bits = NULL; - ep->done = 0; - if (fp->first != NULL) - fp->last->next = ep; - else - fp->first = ep; - fp->last = ep; - return fp; -} - - -/* A named bitfield within the bitpattern of a function entry, or within a - $[...] bitsplice. The mask covers the bitfield and the shift says how - many 0 bits there are after the last 1 in the mask. */ -struct bits { - struct bits *next; - int shift; - bits mask; - char name; -}; - - -/* Parse the bitstring supplied for the given function. nbits says how many - bits there are; it can legitimately be 0. Return value is 0 on success. 
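   For example, parsing the bitstring "001aa111" records mask 11100111 and
   value 00100111 for the constant bits, plus a single struct bits node for
   the field 'a' with mask 00011000 and shift 3, so that the generated
   expression ((code & 0x18) >> 3) recovers $a.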
*/ -int parsebits(struct function *fp, char *bitstring, int nbits) { - if (fp->nbits < 0) - fp->nbits = nbits; - else if (fp->nbits != nbits) { - fprintf(stderr, "%s: %s(%d): bit string of length %d;\n", - progname, filename, lineno, nbits); - fprintf(stderr, " function %s has bit strings of length %d\n", - fp->name, fp->nbits); - return 1; - } - return parseentrybits(fp->last, bitstring, nbits, 0); -} - - -/* Parse a bitstring that is the pattern for a function entry or that is in a - $[...] bitsplice. Put the result in ep. Return value is 0 on success. */ -int parseentrybits(struct entry *ep, char *bitstring, int nbits, int issplice) { - int i, j; - char bit; - bits mask, value, entrymask; - struct bits *bp; - - mask = value = 0; - for (i = 0; i < nbits; i++) { - bit = bitstring[nbits - 1 - i]; - switch (bit) { - case '1': - value |= 1 << i; - /* Fall in... */ - case '0': - mask |= 1 << i; - continue; - } - if (!isalpha(bit)) { - fprintf(stderr, "%s: %s(%d): invalid character in bitstring: %c\n", - progname, filename, lineno, bit); - return 1; - } - if (!issplice) { - for (bp = ep->bits; bp != NULL; bp = bp->next) { - if (bp->name == bit) { - fprintf(stderr, - "%s: %s(%d): bitstring name %c used twice\n", - progname, filename, lineno, bit); - return 1; - } - } - } - entrymask = 1 << i; - for (j = i + 1; j < nbits && bitstring[nbits - 1 - j] == bit; j++) - entrymask |= 1 << j; - bp = xmalloc(sizeof *bp); - bp->shift = i; - bp->mask = entrymask; - bp->name = bit; - bp->next = ep->bits; - ep->bits = bp; - i = j - 1; - } - ep->mask = mask; - ep->value = value; - return 0; -} - - -/* Parse a control line. This looks something like: - :bitstype unsigned int - in which case we will be called with name "bitstype" and - value "unsigned int". */ -int parsecontrol(char *name, char *value) { - if (strcmp(name, "bitstype") == 0) - bitstype = xstrdup(value); - else { - fprintf(stderr, "%s: %s(%d): unrecognised control keyword %s\n", - progname, filename, lineno, name); - return 1; - } - return 0; -} - - -/* A parameter to a function, e.g., x in: - %f aaa(%x) $a + $x */ -struct arg { - struct arg *next; - enum type type; - char name; -}; - - -/* Parse the parameters (x,y,...) to a function and put the result in fp. - The entry that is being built is fp->last. cp points to the opening - (; if it does not point to a ( then there are no parameters. If - this is the first entry for the function, fp->nargs will be -1 and - we will build up an argument list. Otherwise, fp->nargs will be - >= 0 and we will only check that the arguments here are consistent - with what went before. Return value is 0 on success. 
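   For example, the list "(%n,s)" declares an integer parameter n and a
   string parameter s; any later line for the same function must then spell
   its parameter list exactly the same way.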
*/ -int parseargs(struct function *fp, FILE *f, int *cp) { - struct arg **arglink, *ap; - struct bits *bp; - int nargs, width; - char name; - enum type t; - - arglink = &fp->args; - width = nargs = 0; - if (*cp == '(') { - *cp = getc(f); - if (*cp != ')') { - width = 1; - while (1) { - nargs++; - width += 2; - if (fp->nargs >= 0 && nargs > fp->nargs) { - fprintf(stderr, - "%s: %s(%d): %d arg(s) instead of %d for %s\n", - progname, filename, lineno, nargs, fp->nargs, - fp->name); - return 1; - } - t = T_STRING; - if (*cp == '%') { - width++; - t = T_INTEGER; - *cp = getc(f); - } - name = *cp; - if (!isalpha(name)) { - fprintf(stderr, - "%s: %s(%d): argument should be letter: %c\n", - progname, filename, lineno, name); - return 1; - } - for (bp = fp->last->bits; bp != NULL; bp = bp->next) { - if (bp->name == name) { - fprintf(stderr, - "%s: %s(%d): %c is a bitstring and an arg\n", - progname, filename, lineno, name); - return 1; - } - } - if (fp->nargs >= 0) { - if ((*arglink)->name != name) { - fprintf(stderr, - "%s: %s(%d): arg %d of %s is %c not %c\n", - progname, filename, lineno, nargs, fp->name, - (*arglink)->name, name); - return 1; - } - if ((*arglink)->type != t) { - fprintf(stderr, - "%s: %s(%d): arg %c of %s: inconsistent type\n", - progname, filename, lineno, name, fp->name); - return 1; - } - } else { - for (ap = fp->args; ap != *arglink; ap = ap->next) { - if (ap->name == name) { - fprintf(stderr, - "%s: %s(%d): argument name %c used twice\n", - progname, filename, lineno, name); - return 1; - } - } - *arglink = xmalloc(sizeof **arglink); - (*arglink)->name = name; - (*arglink)->type = t; - } - arglink = &(*arglink)->next; - *cp = getc(f); - if (*cp == ')') - break; - if (*cp != ',') { - fprintf(stderr, - "%s: %s(%d): bad character in argument list: %c\n" - " (arguments must be single letters)\n", - progname, filename, lineno, *cp); - return 1; - } - *cp = getc(f); - } - } - *cp = getc(f); - } - if (fp->nargs < 0) { - fp->nargs = nargs; - width += fp->nbits; - if (width > maxargwidth) - maxargwidth = width; - } else if (fp->nargs != nargs) { - fprintf(stderr, "%s: %s(%d): argument list of length %d;\n", - progname, filename, lineno, nargs); - fprintf(stderr, " function %s has argument lists of length %d\n", - fp->name, fp->nargs); - return 1; - } - *arglink = NULL; - return 0; -} - - -/* Parse the string describing the value of this entry for our - function. Return 0 on success. */ -int parsestring(struct function *fp, char *str) { - enum type t; - - t = makestring(fp, &fp->last->string, &str, NULL, fp->type); - if (t == T_ERROR) - return 1; - if (fp->type != t && t != T_UNKNOWN) { - fprintf(stderr, "%s: %s(%d): function %s has inconsistent types\n", - progname, filename, lineno, fp->name); - return 1; - } - return 0; -} - - -/* A parsed representation of the whole string describing a value of a - function, or certain strings within that (e.g., array indices). This is a - linked list of substrings whose type is given by the type field. */ -struct string { - struct string *next; - enum elementtype { - S_TEXT, S_BITSTRING, S_BITSPLICE, S_PARAMETER, S_FUNCTIONCALL, S_ARRAY - } type; - union value { /* The fields here correspond to the enum values. */ - char *text; /* plain text */ - struct bits *bits; /* $x where x is a bitfield */ - struct bitsplice *bitsplice; /* $[...] */ - struct arg *parameter; /* $x where x is a parameter */ - struct functioncall *functioncall; /* $func(...) */ - struct array *array; /* {...}[...] 
*/ - } value; -}; - -/* The representation of a function call $func(...) in the description of a - function value. */ -struct functioncall { - struct function *function; - struct stringlist *args; -}; - -/* The representation of an array selection {...|...}[...] in the description - of a function value. tempno is used when constructing a C variable name - that will contain the strings or numbers in an array. */ -struct array { - struct string *index; /* what's between [...] */ - struct stringlist *elements; /* what's between {...} */ - enum type type; /* the type of each element */ - int tempno; -}; - -/* A list of strings, being the list of arguments in a function call or the - list of elements of an array. This is a linked list of linked lists. */ -struct stringlist { - struct stringlist *next; - enum type type; - struct string *string; -}; - - -/* The following are the only characters with special meaning at the top level - of parsing of a function value. When parsing arrays or function calls, - other characters become special. */ -#define MAKESTRING_MAGIC "${"/*}*/ - - -/* Parse a function return-value string or substring and make a struct string - list for it. The string starts at *stringp and ends at a \0 or at any - character in the `magic' string other than { or $. *stringp is updated - to point to the terminating character. The parsed representation is put - at *stringlink. `fp' is the function whose return value is being parsed. - `targettype' is the expected type of the result, if known. - The return value is the actual type. */ -enum type makestring(struct function *fp, struct string **stringlink, - char **stringp, char *magic, enum type targettype) { - char *p, *q; - struct string *sp, **firststringlink; - int n, components; - int parenlevel = 0; - enum type t = targettype, newt; - - if (magic == NULL) - magic = MAKESTRING_MAGIC; - p = *stringp; - firststringlink = stringlink; - components = 0; - while (*p != '\0') { - sp = xmalloc(sizeof *sp); - q = p; - n = 0; - do { - if (strchr(magic, *q) != NULL) { - if (*q != ')' || parenlevel == 0) - break; - } - switch (*q) { - case '(': - parenlevel++; break; - case ')': - parenlevel--; break; - case '\\': - if (q[1] != '\0') - q++; - break; - } - n++; - } while (*++q != '\0'); - if (n > 0) { - sp->type = S_TEXT; - sp->value.text = q = xmalloc(n + 1); - do { - if (*p == '\\') - p++; - *q++ = *p++; - } while (--n > 0); - *q = '\0'; - newt = t; - } else if (*p == '$') { - if (parsedollar(fp, &p, sp) != 0) - return T_ERROR; - switch (sp->type) { - case S_BITSTRING: - case S_BITSPLICE: - newt = T_INTEGER; - break; - case S_PARAMETER: - newt = sp->value.parameter->type; - break; - case S_FUNCTIONCALL: - newt = sp->value.functioncall->function->type; - break; - default: - fprintf(stderr, "makestring type %d\n", sp->type); - abort(); - } - } else if (*p == '{'/*}*/) { - if (parsearray(fp, &p, sp, t) != 0) - return T_ERROR; - newt = sp->value.array->type; - } else { - free(sp); - break; - } - if (t == T_UNKNOWN) - t = newt; - else if (newt != T_UNKNOWN && t != newt) { - if (stringlink == firststringlink) { - fprintf(stderr, "%s: %s(%d): expected %s type:\n", progname, - filename, lineno, typename[t]); - showstringelement(stderr, sp); - return T_ERROR; - } - *stringlink = NULL; - fprintf(stderr, "%s: %s(%d): mixed types in string:\n", - progname, filename, lineno); - showstring(stderr, *firststringlink); - fprintf(stderr, " -- %s\n", typename[t]); - showstringelement(stderr, sp); - fprintf(stderr, " -- %s\n", typename[newt]); - return T_ERROR; 
- } - *stringlink = sp; - stringlink = &sp->next; - components++; - } - *stringlink = NULL; - *stringp = p; - if (components >= MAXBITS) { - fprintf(stderr, "%s: %s(%d): excessively complicated string\n", - progname, filename, lineno); - return T_ERROR; - } - componentbits |= 1 << components; - return t; -} - - -/* Parse a $ operation at **stringp and update *stringp to point past it. - `fp' is the function whose return value is being parsed. The parsed - item will be put at *sp. Return 0 on success, nonzero on error. */ -int parsedollar(struct function *fp, char **stringp, struct string *sp) { - char *p, *start; - - p = *stringp; - assert(*p == '$'); - start = ++p; - if (*p == '[') - p++; - while (isalnum(*p) || *p == '_') - p++; - if (*start == '[') { - if (*p != ']') { - fprintf(stderr, "%s: %s(%d): missing ] or bad character in $[\n", - progname, filename, lineno); - return 1; - } - *stringp = p + 1; - return parsebitsplice(fp, start + 1, p - start - 1, sp); - } - if (p == start) { - fprintf(stderr, "%s: %s(%d): missing identifier after $\n", progname, - filename, lineno); - return 1; - } - if (p == start + 1) { - if (findvariable(fp, *start, sp) != 0) - return 1; - } else { - if (parsefunctioncall(fp, start, &p, sp) != 0) - return 1; - } - *stringp = p; - return 0; -} - - -/* The representation of a $[...] bitsplice. It is parsed into a - struct entry just as if it were a bitfield parameter, then analysed - into a chain of struct bitsplicebits. These in conjunction with - the constant portion of the struct entry will allow the bitsplice to - be compiled. Each bitsplicebits element represents either a numeric - argument to the current function, in which case it will be shifted - into place; or a bitfield name from the bitfield description of the - current function, in which case it will be shifted by the difference - between the position of the bitfield in the argument and the position - it occurs in the bitsplice. `shift' indicates how much to shift left - the associated value; if it is negative the value is shifted right. - For instance, in a function like this: - %oh xx00(%y) $[yyxx] - the bitsplicebits for y will have shift = 2 and value.arg pointing to y, - and those for x will have shift = -2 and value.mask = binary 1100. - As an optimisation, contiguous bitfields that are also contiguous in the - bitsplice will be combined. For instance: - %oh xxyy00 $[0xxyy0] - will compile the same code as: - %oh zzzz00 $[0zzzz0]. - As another optimisation, a bitfield that occupies the entire bitstring - for a function will be treated like a parameter in that it will not be - masked in the bitsplice. For instance: - %oh xxxxxx $[0xxxxxx0] - will compile the same code as: - %oh (%x) $[0xxxxxx0]. 
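   Concretely, for the %oh xx00(%y) $[yyxx] example above, compilebitsplice()
   below will emit an expression equivalent to

        (y << 2 | (code & 0xc) >> 2)

   (reconstructed from the shifts just described, not from captured output).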
*/ -struct bitsplice { - struct entry entry; - int nbits; - struct bitsplicebits *splice; -}; -struct bitsplicebits { - struct bitsplicebits *next; - int shift; - enum elementtype type; - union { - struct arg *arg; - bits mask; - } value; -}; - - -int parsebitsplice(struct function *fp, char *bitstring, int nbits, - struct string *sp) { - struct bitsplice *splicep; - struct bitsplicebits *bsp, *lastbsp, **bspp; - struct bits *bp; - int shift, nfrombits, ntobits; - bits allbits, b; - - splicep = xmalloc(sizeof *splicep); - splicep->nbits = nbits; - if (parseentrybits(&splicep->entry, bitstring, nbits, 1) != 0) - return 1; - bspp = &splicep->splice; - lastbsp = NULL; - for (bp = splicep->entry.bits; bp != NULL; bp = bp->next) { - if (findvariable(fp, bp->name, sp) != 0) - return 1; - shift = bp->shift; - if (sp->type == S_BITSTRING) { - nfrombits = bitcount(sp->value.bits->mask); - ntobits = bitcount(bp->mask); - if (warnings) { - if (nfrombits != ntobits) { - fprintf(stderr, "%s: %s(%d): warning: " - "bitstring $%c %ser than its place " - "in bitsplice\n", - progname, filename, lineno, bp->name, - (nfrombits > ntobits) ? "bigg" : "small"); - } - } - shift -= sp->value.bits->shift; - - /* See if this bitfield can be combined with a previous contiguous - bitfield. */ - if (lastbsp != NULL && lastbsp->type == S_BITSTRING - && lastbsp->shift == shift) { - lastbsp->value.mask |= sp->value.bits->mask; - continue; - } - } else { - assert(sp->type == S_PARAMETER); - if (sp->value.parameter->type != T_INTEGER) { - fprintf(stderr, - "%s: %s(%d): variable %c in $[...] should be integer\n", - progname, filename, lineno, sp->value.parameter->name); - return 1; - } - } - *bspp = bsp = xmalloc(sizeof *bsp); - bsp->type = sp->type; - bsp->shift = shift; - if (sp->type == S_PARAMETER) - bsp->value.arg = sp->value.parameter; - else - bsp->value.mask = sp->value.bits->mask; - bspp = &bsp->next; - lastbsp = bsp; - } - *bspp = NULL; - - /* Look for a spliced element that is the entire bitstring argument to - this function and therefore doesn't need to be masked. */ - allbits = allbitsset(fp->nbits); - for (bsp = splicep->splice; bsp != NULL; bsp = bsp->next) { - if (bsp->type == S_BITSTRING) { - for (b = bsp->value.mask; b != 0 && !(b & 1); b >>= 1) ; - if (b == allbits) - bsp->value.mask = 0; - } - } - sp->type = S_BITSPLICE; - sp->value.bitsplice = splicep; - return 0; -} - - -int findvariable(struct function *fp, int name, struct string *sp) { - struct bits *bp; - struct arg *ap; - - for (bp = fp->last->bits; bp != NULL; bp = bp->next) { - if (bp->name == name) { - sp->type = S_BITSTRING; - sp->value.bits = bp; - return 0; - } - } - for (ap = fp->args; ap != NULL; ap = ap->next) { - if (ap->name == name) { - sp->type = S_PARAMETER; - sp->value.parameter = ap; - return 0; - } - } - fprintf(stderr, "%s: %s(%d): undefined parameter %c\n", progname, filename, - lineno, name); - return 1; -} - - -int parsefunctioncall(struct function *fp, char *start, char **stringp, - struct string *sp) { - char *p; - struct functioncall *fcp; - struct stringlist **arglink, *arg; - enum type t; - - p = *stringp; - if (*p != '(') { - fprintf(stderr, "%s: %s(%d): missing ( after function %.*s\n", progname, - filename, lineno, (int)(p - start), start); - return 1; - } - sp->type = S_FUNCTIONCALL; - sp->value.functioncall = fcp = xmalloc(sizeof *fcp); - *p = '\0'; /* Ugly. 
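   The name is temporarily NUL-terminated so findfunction() can treat it as
   a C string; the original '(' is restored immediately afterwards.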
*/ - fcp->function = findfunction(start); - *p = '('; - arglink = &fcp->args; - if (*++p != ')') { - while (1) { - arg = xmalloc(sizeof *arg); - t = makestring(fp, &arg->string, &p, MAKESTRING_MAGIC ",)", - T_UNKNOWN); - if (t == T_ERROR) - return 1; - arg->type = t; - *arglink = arg; - arglink = &arg->next; - if (*p == ')') - break; - assert(*p == ','); - p++; - } - } - *arglink = NULL; - assert(*p == ')'); - *stringp = p + 1; - return 0; -} - - -int parsearray(struct function *fp, char **stringp, struct string *sp, - enum type t) { - char *p; - struct array *ap; - struct stringlist **elementlink, *element; - - p = *stringp; - assert(*p == '{'/*}*/); - sp->type = S_ARRAY; - sp->value.array = ap = xmalloc(sizeof *ap); - ap->tempno = -1; - elementlink = &ap->elements; - ap->type = t; - if (*++p != /*{*/'}') { - while (1) { - element = xmalloc(sizeof *element); - t = makestring(fp, &element->string, &p, - MAKESTRING_MAGIC /*{*/"|}", t); - if (t == T_ERROR) - return 1; - element->type = t; - if (ap->type == T_UNKNOWN) - ap->type = t; - else if (t != T_UNKNOWN && ap->type != t) { - fprintf(stderr, "%s: %s(%d): mixed types in array:\n", - progname, filename, lineno); - showstring(stderr, ap->elements->string); - fprintf(stderr, " -- %s\n", typename[ap->type]); - showstring(stderr, element->string); - fprintf(stderr, " -- %s\n", typename[t]); - return 1; - } - *elementlink = element; - elementlink = &element->next; - if (*p == /*{*/'}') - break; - assert(*p == '|'); - p++; - } - } - *elementlink = NULL; - assert(*p == /*{*/'}'); - if (*++p != '[') { - fprintf(stderr, "%s: %s(%d): missing [index] after array\n", - progname, filename, lineno); - return 1; - } - ++p; - t = makestring(fp, &ap->index, &p, MAKESTRING_MAGIC "]", T_INTEGER); - if (t == T_ERROR) - return 1; - if (t == T_STRING) { - fprintf(stderr, "%s: %s(%d): array index cannot be string:\n", - progname, filename, lineno); - showstring(stderr, ap->index); - return 1; - } - if (*p != ']') { - fprintf(stderr, "%s: %s(%d): [ without ]\n", progname, filename, - lineno); - return 1; - } - *stringp = p + 1; - return 0; -} - - -void dumpfunctions() { - struct function *fp; - - for (fp = functions; fp != NULL; fp = fp->next) - dumpfunction(fp); -} - - -void dumpfunction(struct function *fp) { - struct entry *ep; - - for (ep = fp->first; ep != NULL; ep = ep->next) - showentry(stderr, fp, ep, 0); -} - - -/* Entries are not shown exactly as they would be input, since \ would - need to be provided before some characters such as $ or {. But the - characters "|},]" pose a problem since a \ is only needed in certain - contexts and is annoying otherwise. It's not worth doing this right, - since it's only used for error messages. 
*/ -void showentry(FILE *f, struct function *fp, struct entry *ep, bits highlight) { - if (fp->type == T_INTEGER) - putc('%', f); - fprintf(f, "%-*s ", maxfunctionname + 1, fp->name); - if (fp->nbits == 0 && fp->nargs == 0) - fprintf(f, "%-*s", maxargwidth, "()"); - else { - showbits(f, ep, fp->nbits, 0); - showargs(f, fp->args, maxargwidth - fp->nbits); - } - putc(' ', f); - showstring(f, ep->string); - putc('\n', f); - if (highlight != 0) { - fprintf(f, "%-*s ", maxfunctionname + 1, ""); - showbits(f, ep, fp->nbits, highlight); - putc('\n', f); - } -} - - -void showbits(FILE *f, struct entry *ep, int nbits, bits highlight) { - struct bits *bp; - bits i, value; - char zero, one; - - if (nbits == 0) - return; - i = 1 << (nbits - 1); - bp = ep->bits; - if (highlight) { - value = highlight; - zero = ' '; - one = '^'; - } else { - value = ep->value; - zero = '0'; - one = '1'; - } - do { - if (highlight != 0 || (ep->mask & i)) { - putc((value & i) ? one : zero, f); - i >>= 1; - } else { - assert(bp != NULL && (bp->mask & i)); - do { - putc(bp->name, f); - i >>= 1; - } while (bp->mask & i); - bp = bp->next; - } - } while (i != 0); -} - - -void showargs(FILE *f, struct arg *ap, int fieldwidth) { - int width; - int lastc; - int isint; - - if (ap == NULL) - width = 0; - else { - width = 1; - lastc = '('; - do { - isint = (ap->type == T_INTEGER); - fprintf(f, "%c%s%c", lastc, isint ? "%" : "", ap->name); - width += 2 + isint; - ap = ap->next; - lastc = ','; - } while (ap != NULL); - putc(')', f); - } - fprintf(f, "%-*s", fieldwidth - width, ""); -} - - -void showstring(FILE *f, struct string *sp) { - for ( ; sp != NULL; sp = sp->next) - showstringelement(f, sp); -} - - -void showstringelement(FILE *f, struct string *sp) { - struct bitsplice *bsp; - - switch (sp->type) { - case S_TEXT: - fputs(sp->value.text, f); - break; - case S_BITSTRING: - fprintf(f, "$%c", sp->value.bits->name); - break; - case S_BITSPLICE: - fprintf(f, "$["); - bsp = sp->value.bitsplice; - showbits(f, &bsp->entry, bsp->nbits, 0); - fprintf(f, "]"); - break; - case S_PARAMETER: - fprintf(f, "$%c", sp->value.parameter->name); - break; - case S_FUNCTIONCALL: - showfunctioncall(f, sp->value.functioncall); - break; - case S_ARRAY: - showarray(f, sp->value.array); - break; - default: - fprintf(stderr, "showstring case %d\n", sp->type); - abort(); - } -} - - -void showfunctioncall(FILE *f, struct functioncall *fcp) { - struct stringlist *sp; - char *last; - - fprintf(f, "$%s(", fcp->function->name); - last = ""; - for (sp = fcp->args; sp != NULL; sp = sp->next) { - fputs(last, f); - last = ","; - showstring(f, sp->string); - } - putc(')', f); -} - - -void showarray(FILE *f, struct array *ap) { - struct stringlist *sp; - char *last; - - putc('{'/*}*/, f); - last = ""; - for (sp = ap->elements; sp != NULL; sp = sp->next) { - fputs(last, f); - last = "|"; - showstring(f, sp->string); - } - fputs(/*{*/"}[", f); - showstring(f, ap->index); - putc(']', f); -} - - -const char commonpreamble[] = "\ -typedef %s bits;\n\ -\n\ -"; - -const char concatpreamble[] = "\ -static char *dis_buf;\n\ -static int dis_bufindex, dis_buflen;\n\ -\n\ -void *dis_alloc(size_t size)\n\ -{\n\ - void *p;\n\ - int newindex = dis_bufindex + size;\n\ - if (newindex > dis_buflen) {\n\ - dis_buflen = newindex * 4;\n\ - dis_buf = malloc(dis_buflen);\n\ - /* We can't use realloc because there might be pointers extant into\n\ - the old buffer. So we waste the memory of the old buffer. We\n\ - should soon reach an adequate buffer size and stop leaking. 
*/\n\ - if (dis_buf == 0) {\n\ - perror(\"malloc\");\n\ - exit(1);\n\ - }\n\ - dis_bufindex = 0;\n\ - }\n\ - p = dis_buf + dis_bufindex;\n\ - dis_bufindex = newindex;\n\ - return p;\n\ -}\n\ -\n\ -void dis_done()\n\ -{\n\ - dis_bufindex = 0;\n\ -}\n\ -\n\ -"; - -const char concatdeclarations[] = "\ -#include <stdio.h>\n\ -#include <stdlib.h>\n\ -#include <string.h>\n\ -\n\ -extern void *dis_realloc(void *p, size_t size); /* User-provided. */\n\ -void *dis_alloc(size_t size);\n\ -void dis_done(void);\n\ -"; - -const char nonconcatpreamble[] = "\ -void dis_done() {}\n\ -"; - - -int outputfunctions() { - struct function *fp; - - outputidentity(stdout); - if (headerfilename != NULL) { - if ((headerfile = fopen(headerfilename, "w")) == NULL) { - fprintf(stderr, "%s: create %s: %s\n", progname, headerfilename, - strerror(errno)); - return 1; - } - outputidentity(headerfile); - fprintf(headerfile, commonpreamble, bitstype); - printf("\n#include \"%s\"\n", headerfilename); - } else - printf(commonpreamble, bitstype); - findarrays(); - if (outputdeclarations() != 0) - return 1; - outputconcats(); - for (fp = functions; fp != NULL; fp = fp->next) { - if (fp->isarray) - functionarray(fp); - } - for (fp = functions; fp != NULL; fp = fp->next) { - if (fp->first != NULL && !fp->isarray) { - if (outputfunction(fp) != 0) - return 1; - } - } - return 0; -} - - -void outputidentity(FILE *f) { - char **p; - - fprintf(f, "/*\n * This file was generated by:\n *"); - for (p = global_argv; *p != NULL; p++) - fprintf(f, " %s", *p); - fprintf(f, "\n */\n\n"); -} - - -int outputdeclarations() { - FILE *f = headerfile ? headerfile : stdout; - struct function *fp; - - for (fp = functions; fp != NULL; fp = fp->next) { - if (fp->type != T_UNKNOWN) { - if (fp->isarray) { - fprintf(f, "extern "); - if (fp->fixedlength > 0) - fprintf(f, "char %s[][%d]", fp->name, fp->fixedlength); - else { - compiletype(f, &fp->type); - fprintf(f, "%s[]", fp->name); - } - } else - functionheader(f, fp); - fprintf(f, ";\n"); - } - } - return 0; -} - - -void outputconcats() { - int i; - - if (componentbits & ~3) { - fputs(concatdeclarations, headerfile ? 
headerfile : stdout); - fputs(concatpreamble, stdout); - } else - fputs(nonconcatpreamble, stdout); - for (i = 2; i < MAXBITS; i++) { - if (componentbits & (1 << i)) - outputconcat(i); - } -} - - -void outputconcat(int n) { - int i; - char *last; - - assert(n > 1); - if (headerfile) { - outputconcatheader(headerfile, n); - fprintf(headerfile, ";\n"); - } - outputconcatheader(stdout, n); - printf("\n{\n void *p;\n int len = "); - last = ""; - for (i = 0; i < n; i++) { - printf("%sstrlen(p%d)", last, i); - last = " + "; - } - printf(";\n p = dis_alloc(len + 1);\n return "); - for (i = 1; i < n; i++) - printf("strcat("); - printf("strcpy(p, p0)"); - for (i = 1; i < n; i++) - printf(", p%d)", i); - printf(";\n}\n\n"); -} - - -void outputconcatheader(FILE *f, int n) { - int i; - char *last = ""; - - fprintf(f, "char *dis_concat%d(", n); - for (i = 0; i < n; i++) { - fprintf(f, "%schar *p%d", last, i); - last = ", "; - } - fprintf(f, ")"); -} - - -void findarrays() { - struct function *fp; - struct entry *ep; - struct string *estr, *indexstr; - struct bits *bp; - - for (fp = functions; fp != NULL; fp = fp->next) { - if (fp->nbits > 0 && fp->nargs > 0) - continue; - if (fp->nargs > 1) - continue; - ep = fp->first; - if (ep == NULL || ep->next != NULL) - continue; - estr = ep->string; - if (estr == NULL || estr->next != NULL || estr->type != S_ARRAY) - continue; - indexstr = estr->value.array->index; - if (indexstr->next != NULL) - continue; - if (fp->nbits > 0) { - bp = ep->bits; - if (bp == NULL || bp->next != NULL || bp->shift != 0) - continue; - if (bp->mask != allbitsset(fp->nbits)) - continue; - if (indexstr->type != S_BITSTRING || indexstr->value.bits != bp) - continue; - } else { - if (indexstr->type != S_PARAMETER - || indexstr->value.parameter != fp->args) - continue; - } - if (!simplearray(estr->value.array)) - continue; - fp->isarray = 1; - fp->fixedlength = - (fp->type == T_INTEGER) ? 0 : checkfixedlength(estr->value.array); - } -} - - -int checkfixedlength(struct array *ap) { - int len, maxlen, wasted, n; - struct stringlist *lp; - - maxlen = 0; - for (lp = ap->elements; lp != NULL; lp = lp->next) { - if (lp->string == NULL) - continue; - assert(lp->string->type == S_TEXT); - len = strlen(lp->string->value.text); - if (len > maxlen) - maxlen = len; - } - for (wasted = n = 0, lp = ap->elements; lp != NULL; n++, lp = lp->next) { - if (lp->string == NULL) - continue; - wasted += maxlen - strlen(lp->string->value.text); - } - if (wasted < n * sizeof(char *)) /* Should be target's sizeof. 
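   For the Z80 reg8 table, for instance, maxlen is 4 (for "(hl)"), so the
   padded form wastes 21 bytes against 32 bytes of pointers on a 32-bit
   target, and the array is emitted along the lines of

        char reg8[][5] = {"b", "c", "d", "e", "h", "l", "(hl)", "a"};

   (illustrative; the real output lists one element per line).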
*/ - return maxlen + 1; - return 0; -} - - -int outputfunction(struct function *fp) { - printf("\n"); - functionheader(stdout, fp); - printf("\n{\n"/*}*/); - switch (functionswitch(fp, 0, 0)) { - case -1: - return 1; - case 0: - if (warnings) { - fprintf(stderr, "%s: warning: not all cases of %s covered\n", - progname, fp->name); - } - } - printf(/*{*/"}\n"); - return 0; -} - - -void functionarray(struct function *fp) { - struct array *ap; - - ap = fp->first->string->value.array; - printf("\n"); - compilesimplearray(&fp->type, fp->name, 0, ap); -} - - -void functionheader(FILE *f, struct function *fp) { - char *last; - struct arg *ap; - - compiletype(f, &fp->type); - fprintf(f, "%s(", fp->name); - last = ""; - if (fp->nbits > 0) { - fprintf(f, "bits code"); - last = ", "; - } - for (ap = fp->args; ap != NULL; ap = ap->next) { - fprintf(f, "%s", last); - compiletype(f, &ap->type); - putc(ap->name, f); - last = ", "; - } - if (*last == '\0') - fprintf(f, "void"); - putc(')', f); -} - - -int simplearray(struct array *ap) { - struct stringlist *lp; - - for (lp = ap->elements; lp != NULL; lp = lp->next) { - if (lp->string != NULL - && (lp->string->next != NULL || lp->string->type != S_TEXT)) - break; - } - return (lp == NULL); -} - - -void compiletype(FILE *f, enum type *tp) { - switch (*tp) { - case T_UNKNOWN: - *tp = T_STRING; - /* Fall in... */ - case T_STRING: - fprintf(f, "char *"); - break; - case T_INTEGER: - fprintf(f, "bits "); - break; - default: - fprintf(stderr, "compiletype type %d\n", *tp); - abort(); - } -} - - -/* Generate code for entries in function fp whose bitstring b satisfies - the constraint (b & mask) == value. Return 1 if generated switch - always does `return', 0 if not, -1 on error. - The algorithm is as follows. Scan the eligible entries to find the - largest set of bits not in the passed-in mask which always have a - constant value (are not variable). One `default' entry is allowed - all of whose bits are variable. For each value of the constant bits, - generate a `switch' case and invoke the function recursively with - that value included in the constraint parameters. The recursion - stops when no set of constant bits is found, perhaps because the - mask parameter has all bits set. - This algorithm could be improved. Currently it will fail if there - are input lines "xxyy", "00xx" and "yy00", each of which is default with - respect to the others. The correct behaviour would then be to select - a bit that is sometimes constant and deal with those cases first. - But this problem has not yet arisen in real life. */ -int functionswitch(struct function *fp, bits mask, bits value) { - struct entry *ep, *defaultcase; - bits allbits, constbits, missingcases; - int nhits, ncases, nconstbits, alwaysreturns; - - indentation++; - allbits = allbitsset(fp->nbits); - constbits = allbits & ~mask; - if (debug) { - findent(stderr); - fprintf(stderr, - "functionswitch(%s): (x & 0x%lx) == 0x%lx; const == 0x%lx\n", - fp->name, mask, value, constbits); - } - defaultcase = NULL; - ncases = nhits = 0; - alwaysreturns = 1; - for (ep = fp->first; ep != NULL; ep = ep->next) { - /* If this is not one of the entries under consideration, skip. */ - if (ep->done - || (ep->mask & mask) != mask || (ep->value & mask) != value) - continue; - if (debug) { - findent(stderr); - showentry(stderr, fp, ep, 0); - } - /* If this entry has no constant bits in the still-variable portion, - it's the default. 
*/ - if ((constbits & ep->mask) == 0) { - if (defaultcase != NULL) { - fprintf(stderr, - "%s: function %s: unable to distinguish between:\n", - progname, fp->name); - showentry(stderr, fp, defaultcase, 0); - showentry(stderr, fp, ep, 0); - return -1; - } - defaultcase = ep; - if (debug) { - findent(stderr); - fprintf(stderr, "^^ default case\n"); - } - } else { - if (debug && (constbits & ~ep->mask)) { - findent(stderr); - fprintf(stderr, "const now 0x%lx\n", constbits & ep->mask); - } - constbits &= ep->mask; - nhits++; - } - } - if (nhits > 0) { - indent(); - if (constbits == allbits) - printf("switch (code) {\n"/*}*/); - else - printf("switch (code & 0x%lx) {\n"/*}*/, constbits); - for (ep = fp->first; ep != NULL; ep = ep->next) { - /* If this is not one of the entries under consideration, skip. */ - if ((ep->mask & mask) != mask || (ep->value & mask) != value) - continue; - if (ep->done || ep == defaultcase) - continue; - ncases++; - indent(); - printf("case 0x%lx:\n", ep->value & constbits); - switch (functionswitch(fp, mask | constbits, - value | (ep->value & constbits))) { - case -1: - return -1; - case 0: - alwaysreturns = 0; - indentation++; indent(); indentation--; - printf("break;\n"); - } - } - indent(); - printf(/*{*/"}\n"); - } - nconstbits = bitcount(constbits); - missingcases = ((nconstbits == MAXBITS) ? 0 : 1 << nconstbits) - ncases; - if (alwaysreturns) { - switch (missingcases) { - case 0: - if (defaultcase != NULL) { - fprintf(stderr, "%s: warning: redundant entry:\n", progname); - showentry(stderr, fp, defaultcase, 0); - defaultcase = NULL; - } - break; - case 1: - if (defaultcase != NULL && nconstbits != 0) { - fprintf(stderr, - "%s: warning: variable bit(s) could be constant:\n", - progname); - showentry(stderr, fp, defaultcase, constbits); - break; - } - /* Fall in... */ - default: - alwaysreturns = 0; - } - } - if (defaultcase != NULL) { - /* If defaultcase has some constant bits of its own, recursion will - check that they have the required value. 
*/ - if ((defaultcase->mask & ~mask) == 0) { - alwaysreturns = 1; - if (compilestring(-1, defaultcase->string, fp->type) != 0) - return -1; - defaultcase->done = 1; - } else { - indentation--; - alwaysreturns = functionswitch(fp, mask, value); - indentation++; - } - } - indentation--; - return alwaysreturns; -} - - -int compilestring(int assignto, struct string *sp, enum type type) { - int tempno; - - tempno = walkstring(sp, COUNTARRAYS, assignto); - if (tempno > assignto) { - indent(); - printf("{\n"/*}*/); - indentation++; - (void) walkstring(sp, DECLAREARRAYS, assignto); - if (walkstring(sp, COMPILEARRAYS, assignto) < 0) - return 1; - } - if (compilecheckedstring(assignto, sp, type) != 0) - return 1; - if (tempno > assignto) { - indentation--; - indent(); - printf(/*{*/"}\n"); - } - return 0; -} - - -int compilecheckedstring(int assignto, struct string *sp, enum type type) { - compileassign(assignto); - if (compileconcat(sp, type) != 0) - return 1; - printf(";\n"); - return 0; -} - - -void compileassign(int assignto) { - indent(); - if (assignto < 0) - printf("return "); - else { - compiletemp(assignto); - printf(" = "); - } -} - - -void compiletemp(int tempno) { - printf("t__%d", tempno); -} - - -void compiletext(char *s) { - putchar('"'); - if (s != NULL) { - for ( ; *s != '\0'; s++) { - switch (*s) { - case '"': - case '\\': - putchar('\\'); - } - putchar(*s); - } - } - putchar('"'); -} - - -int compileconcat(struct string *sp, enum type type) { - int elements; - struct string *sp1; - char *last; - - if (sp == NULL) - return compilenull(type); - if (sp->next == NULL) - return compilesimple(sp, type); - if (type != T_INTEGER) { - for (elements = 0, sp1 = sp; sp1 != NULL; elements++, sp1 = sp1->next) ; - printf("dis_concat%d(", elements); - } - last = ""; - for (sp1 = sp; sp1 != NULL; sp1 = sp1->next) { - printf("%s", last); - if (type != T_INTEGER) - last = ", "; - if (sp1->type == S_ARRAY) - compilearrayref(sp1->value.array); - else - if (compilesimple(sp1, type) != 0) - return 1; - } - if (type != T_INTEGER) - printf(")"); - return 0; -} - - -int compilenull(enum type type) { - if (type == T_INTEGER) { - fprintf(stderr, "%s: empty integer expression\n", progname); - return 1; - } - printf("\"\""); - return 0; -} - - -int compilesimple(struct string *sp, enum type type) { - if (sp == NULL) - return compilenull(type); - switch (sp->type) { - case S_TEXT: - if (type == T_INTEGER) - printf("%s", sp->value.text); - else - compiletext(sp->value.text); - break; - case S_BITSTRING: - compilebitstring(sp->value.bits); - break; - case S_BITSPLICE: - compilebitsplice(sp->value.bitsplice); - break; - case S_PARAMETER: - putchar(sp->value.parameter->name); - break; - case S_FUNCTIONCALL: - return compilefunctioncall(sp); - case S_ARRAY: - if (compilearrayref(sp->value.array) != 0) - return 1; - break; - default: - fprintf(stderr, "compilesimple case %d", sp->type); - abort(); - } - return 0; -} - - -int compilearrayref(struct array *ap) { - compiletemp(ap->tempno); - if (simplearray(ap)) { - printf("["); - if (compileconcat(ap->index, T_INTEGER) != 0) - return 1; - printf("]"); - } - return 0; -} - - -int compilefunctioncall(struct string *sp) { - struct function *fp; - struct stringlist *actualp; - struct arg *formalp; - char *last; - int nbits; - enum type formaltype; - - assert(sp->type == S_FUNCTIONCALL); - fp = sp->value.functioncall->function; - printf("%s%c", fp->name, fp->isarray ? 
'[' : '('); - last = ""; - nbits = fp->nbits; - formalp = fp->args; - actualp = sp->value.functioncall->args; - while (actualp != NULL) { - if (nbits > 0) { - nbits = 0; - formaltype = T_INTEGER; - } else { - if (formalp == NULL) { - fprintf(stderr, "%s: too many arguments to %s:\n", progname, - fp->name); - showstring(stderr, sp); - putc('\n', stderr); - return 1; - } - formaltype = formalp->type; - formalp = formalp->next; - } - if (actualp->type != T_UNKNOWN && actualp->type != formaltype) { - fprintf(stderr, "%s: argument to %s has the wrong type:\n", - progname, fp->name); - showstring(stderr, actualp->string); - putc('\n', stderr); - return 1; - } - printf("%s", last); - last = ", "; - if (compileconcat(actualp->string, formaltype) != 0) - return 1; - actualp = actualp->next; - } - putchar(fp->isarray ? ']' : ')'); - return 0; -} - - -int walkstring(struct string *sp, enum walkstringop op, int tempno) { - struct stringlist *lp; - struct array *ap; - - for ( ; sp != NULL; sp = sp->next) { - switch (sp->type) { - case S_ARRAY: - ap = sp->value.array; - for (lp = ap->elements; lp != NULL; lp = lp->next) - tempno = walkstring(lp->string, op, tempno); - tempno = walkstring(ap->index, op, tempno); - ap->tempno = ++tempno; - switch (op) { - case DECLAREARRAYS: - if (simplearray(ap)) { - indent(); - printf("static "); - compilesimplearray(&ap->type, NULL, tempno, ap); - } else - declarearray(ap); - break; - case COMPILEARRAYS: - if (!simplearray(ap)) - if (compilearray(ap) != 0) - return -1; - break; - default: - break; - } - break; - case S_FUNCTIONCALL: - for (lp = sp->value.functioncall->args; lp != NULL; lp = lp->next) - tempno = walkstring(lp->string, op, tempno); - break; - default: - break; - } - } - return tempno; -} - - -int compilearray(struct array *ap) { - struct stringlist *ep; - int i; - - indent(); - printf("switch ("); - if (compileconcat(ap->index, T_INTEGER) != 0) - return 1; - printf(") {\n"/*}*/); - for (i = 0, ep = ap->elements; ep != NULL; i++, ep = ep->next) { - indent(); - printf("case %d:\n", i); - indentation++; - if (compilecheckedstring(ap->tempno, ep->string, ap->type) != 0) - return 1; - indent(); - printf("break;\n"); - indentation--; - } - indent(); - printf(/*{*/"}\n"); - return 0; -} - - -void compilesimplearray(enum type *tp, char *name, int num, struct array *ap) { - struct stringlist *lp; - int fixedlength; - - fixedlength = (*tp == T_INTEGER) ? 
0 : checkfixedlength(ap); - if (fixedlength > 0) - printf("char "); - else - compiletype(stdout, tp); - if (name != NULL) - printf("%s", name); - else - compiletemp(num); - printf("[]"); - if (fixedlength > 0) - printf("[%d]", fixedlength); - printf(" = {\n"/*}*/); - indentation++; - for (lp = ap->elements; lp != NULL; lp = lp->next) { - indent(); - compilesimple(lp->string, lp->type); - printf(",\n"); - } - indentation--; - indent(); - printf(/*{*/"};\n"); -} - - -void declarearray(struct array *ap) { - indent(); - compiletype(stdout, &ap->type); - compiletemp(ap->tempno); - printf(";\n"); -} - - -void compilebitstring(struct bits *bp) { - printf("("); - if (bp->shift != 0) - printf("("); - printf("code & 0x%lx", bp->mask); - if (bp->shift != 0) - printf(") >> %d", bp->shift); - printf(")"); -} - - -void compilebitsplice(struct bitsplice *splicep) { - struct bitsplicebits *bsp; - char *last = ""; - - printf("("); - for (bsp = splicep->splice; bsp != NULL; bsp = bsp->next) { - printf("%s", last); - last = " | "; - if (bsp->type == S_PARAMETER) - putchar(bsp->value.arg->name); - else { - assert(bsp->type == S_BITSTRING); - if (bsp->value.mask == 0) - printf("code"); - else - printf("(code & 0x%lx)", bsp->value.mask); - } - if (bsp->shift > 0) - printf(" << %d", bsp->shift); - else if (bsp->shift < 0) - printf(" >> %d", -bsp->shift); - } - if (splicep->entry.value != 0) - printf("%s0x%lx", last, splicep->entry.value); - printf(")"); -} - - -int bitcount(bits x) { - int nbits; - - for (nbits = 0; x != 0; x >>= 1) { - if (x & 1) - nbits++; - } - return nbits; -} - - -bits allbitsset(int nbits) { - return (nbits == MAXBITS) ? ~0 : (1 << nbits) - 1; -} - - -void findent(FILE *f) { - int i; - - for (i = 1; i < indentation; i += 2) - putc('\t', f); - if (i == indentation) - fputs(" ", f); -} - - -void indent() { - findent(stdout); -} - - -void *xrealloc(char *oldp, size_t size) { - void *p; - - if (oldp == NULL) - p = malloc(size); - else - p = realloc(oldp, size); - if (p == NULL) { - fprintf(stderr, "%s: allocate of %d bytes failed: %s\n", progname, - (int) size, strerror(errno)); - exit(1); - } - return p; -} - - -void *xmalloc(size_t size) { - return xrealloc(NULL, size); -} - - -void *xstrdup(char *s) { - char *p; - size_t i = strlen(s) + 1; - - p = xmalloc(i); - strlcpy(p, s, i); - return p; -} - - -int prematureeof() { - fprintf(stderr, "%s: %s(%d): premature end of file\n", progname, filename, - lineno); - return 1; -} diff --git a/osfmk/ddb/nlist.h b/osfmk/ddb/nlist.h deleted file mode 100644 index a677d771a..000000000 --- a/osfmk/ddb/nlist.h +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.11.2 1995/01/06 19:11:11 devrcs - * mk6 CR668 - 1.3b26 merge - * Add padding for alpha, make n_other unsigned, - * fix erroneous def of N_FN. - * [1994/10/14 03:40:03 dwm] - * - * Revision 1.1.11.1 1994/09/23 01:23:37 ezf - * change marker to not FREE - * [1994/09/22 21:11:49 ezf] - * - * Revision 1.1.4.3 1993/07/27 18:28:42 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:13:44 elliston] - * - * Revision 1.1.4.2 1993/06/02 23:13:34 jeffc - * Added to OSF/1 R1.3 from NMK15.0. - * [1993/06/02 20:58:08 jeffc] - * - * Revision 1.1 1992/09/30 02:24:29 robert - * Initial revision - * - * $EndLog$ - */ -/* CMU_HIST */ -/* - * Revision 2.4 91/05/14 15:38:20 mrt - * Correcting copyright - * - * Revision 2.3 91/02/05 17:07:42 mrt - * Changed to new Mach copyright - * [91/01/31 16:20:26 mrt] - * - * 11-Aug-88 David Golub (dbg) at Carnegie-Mellon University - * Added n_un, n_strx definitions for kernel debugger (from - * a.out.h). - * - */ -/* CMU_ENDHIST */ -/* - * Mach Operating System - * Copyright (c) 1991 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * nlist.h - symbol table entry structure for an a.out file - * derived from FSF's a.out.gnu.h - * - */ - -#ifndef _DDB_NLIST_H_ -#define _DDB_NLIST_H_ - -struct nlist { - union n_un { - char *n_name; /* symbol name */ - long n_strx; /* index into file string table */ - } n_un; - unsigned char n_type; /* type flag, i.e. N_TEXT etc; see below */ - unsigned char n_other; /* unused */ - short n_desc; /* see */ -#if defined(__alpha) - int n_pad; /* alignment, used to carry framesize info */ -#endif - vm_offset_t n_value; /* value of this symbol (or sdb offset) */ -}; - -/* - * Simple values for n_type. 
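   A consumer typically decodes the field with the masks defined below,
   along these lines (an illustrative fragment, not part of this header):

        int is_external = (n_type & N_EXT) != 0;   /* external bit */
        int kind        = n_type & N_TYPE;         /* N_UNDF, N_TEXT, N_DATA, ... */
        int is_stab     = (n_type & N_STAB) != 0;  /* SDB/stab debugging entry */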
- */ -#define N_UNDF 0 /* undefined */ -#define N_ABS 2 /* absolute */ -#define N_TEXT 4 /* text */ -#define N_DATA 6 /* data */ -#define N_BSS 8 /* bss */ -#define N_FN 0x1e /* file name symbol */ -#define N_EXT 1 /* external bit, or'ed in */ -#define N_TYPE 0x1e /* mask for all the type bits */ -#define N_STAB 0xe0 /* if any of these bits set, a SDB entry */ - -#endif /* !_DDB_NLIST_H_ */ diff --git a/osfmk/ddb/orig/db_print.c b/osfmk/ddb/orig/db_print.c deleted file mode 100644 index 7e91ec96d..000000000 --- a/osfmk/ddb/orig/db_print.c +++ /dev/null @@ -1,1373 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.2 1998/04/29 17:35:25 mburg - * MK7.3 merger - * - * Revision 1.2.85.1 1998/02/03 09:24:09 gdt - * Merge up to MK7.3 - * [1998/02/03 09:10:24 gdt] - * - * Revision 1.2.81.1 1997/03/27 18:46:38 barbou - * ri-osc CR1565 - clean up db_print_act, removing old !USER code - * which had gotten stale (the option made little sense here anyway). - * Added routine db_show_one_thread() to take either act/shuttle and - * do something sensible. [dwm] Also rationalize plain, /u and /l - * output for "show act", "show task" and "show all acts". - * [1995/08/28 15:47:00 bolinger] - * [97/02/25 barbou] - * - * Revision 1.2.31.13 1996/01/09 19:16:02 devrcs - * Alpha kdebug Changes: - * Correct various header spacing to account for 64-bit addresses. - * Modify db_show_all_*() functions, so the can be called from kdebug. - * ( There's no way to call with "char *modif", so added NULL check. ) - * Changed db_error() calls to DB_ERROR() macro, so we return on error - * on Alpha (we gotta return to kdebug). - * Changed declarations of 'register foo' to 'register int foo'. - * [1995/12/01 21:42:20 jfraser] - * - * Merged '64-bit safe' changes from DEC alpha port. - * [1995/11/21 18:03:24 jfraser] - * - * Revision 1.2.31.12 1995/10/09 17:03:30 devrcs - * Merge forward. - * [1995/08/24 20:56:42 watkins] - * - * Revision 1.2.59.1 1995/08/04 17:03:17 watkins - * Change to stack per shuttle model. 
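The N_* values opening this hunk partition n_type into a stab mask (N_STAB), a type field (N_TYPE), and the or'ed-in external bit (N_EXT). In the spirit of nm(1), a hedged sketch that classifies one n_type byte using exactly those constants:

    /* Constants as defined in the deleted header. */
    #define N_UNDF 0x00
    #define N_ABS  0x02
    #define N_TEXT 0x04
    #define N_DATA 0x06
    #define N_BSS  0x08
    #define N_EXT  0x01
    #define N_TYPE 0x1e
    #define N_STAB 0xe0

    static char
    symbol_class(unsigned char n_type)
    {
        if (n_type & N_STAB)              /* any stab bit => debugger entry */
            return '-';
        switch (n_type & N_TYPE) {        /* strip N_EXT before comparing */
        case N_UNDF: return (n_type & N_EXT) ? 'U' : '?';
        case N_ABS:  return 'A';
        case N_TEXT: return 'T';
        case N_DATA: return 'D';
        case N_BSS:  return 'B';
        default:     return '?';
        }
    }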
- * [1995/07/19 20:26:13 watkins] - * - * Revision 1.2.31.11 1995/09/18 19:08:49 devrcs - * Merge forward. - * [1995/08/24 20:56:42 watkins] - * - * Revision 1.2.59.1 1995/08/04 17:03:17 watkins - * Change to stack per shuttle model. - * [1995/07/19 20:26:13 watkins] - * - * Revision 1.2.31.10 1995/05/19 15:43:04 bernadat - * Fixed db_print_act for empty activations. - * Let thread swapping be configurable. - * [95/05/19 bernadat] - * - * Revision 1.2.31.9 1995/05/14 18:10:25 dwm - * ri-osc CR1304 - merge (nmk19_latest - nmk19b1) diffs into mainline. - * mk6 CR938 - restore mach_msg hot path - * remove use of now-defunct fields in thread [mmp,dwm] - * [1995/05/14 17:25:05 dwm] - * - * Revision 1.2.31.8 1995/04/07 18:53:00 barbou - * VM Merge - Task Swapper. - * Renamed TH_SWAPPED to TH_STACK_HANDOFF and swap_func to continuation - * to resolve name conflict. - * From kernel/kdb/kdb_mach.c: - * Put in changes for swapping. - * [1991/11/21 20:32:15 mmp] - * [94/07/27 barbou] - * [95/03/08 barbou] - * - * Revision 1.2.31.7 1995/02/28 01:58:38 dwm - * mk6 CR1120 - Merge mk6pro_shared into cnmk_shared - * * Rev1.2.43.1 1995/01/27 22:01:26 bolinger - * * Fix ri-osc CR977: Make "show space" and "show ipc_port" give - * * accurate count of ports active in IPC space. Make "show ipc_port" - * * output task-visible port name. - * [1995/02/28 01:12:46 dwm] - * - * Revision 1.2.31.6 1995/02/23 21:43:34 alanl - * Fix db_show_one_task_vm for thread_act_ts. - * [95/01/09 rwd] - * - * Merged with DIPC2_SHARED. - * [95/01/04 alanl] - * - * Revision 1.2.31.5 1995/01/10 04:49:52 devrcs - * mk6 CR801 - merge up from nmk18b4 to nmk18b7 - * Fix "sh thr/ul"; no cont. to print, fix pri/policy format. - * * Rev 1.2.31.4 1994/10/11 16:35:58 emcmanus - * Added "show runq" and "show shuttle". - * [1994/12/09 20:36:49 dwm] - * - * mk6 CR668 - 1.3b26 merge - * * Revision 1.2.8.6 1994/05/06 18:39:37 tmt - * Merged osc1.3dec/shared with osc1.3b19 - * Merge Alpha changes into osc1.312b source code. - * 64bit cleanup. - * * End1.3merge - * [1994/11/04 08:49:52 dwm] - * - * Revision 1.2.31.3 1994/09/23 01:20:51 ezf - * change marker to not FREE - * [1994/09/22 21:10:41 ezf] - * - * Revision 1.2.31.2 1994/06/14 17:21:05 bolinger - * Merge up to NMK17.2. - * [1994/06/14 17:20:35 bolinger] - * - * Revision 1.2.23.4 1994/04/15 18:41:31 paire - * Changed interface of db_task_from_space routine. - * [94/03/31 paire] - * - * Revision 1.2.23.3 1994/03/07 16:37:48 paire - * Merge with Intel R1_1 - * Change from NMK14.10 [1993/11/15 16:06:21 rwd] - * - * Enhanced pretty print routine and added db_task_from_space. - * Change from NMK14.10 [93/09/24 sjs] - * [94/02/21 paire] - * - * Exported ANSI prototype of db_port_kmsg_count routine. - * Added header file include for the declaration of db_norma_ipc routine. - * [94/02/15 paire] - * - * Revision 1.2.23.2 1994/02/11 14:21:58 paire - * Added new vm_print.h header file for db_vm declaration. - * [94/02/09 paire] - * - * Revision 1.2.23.1 1994/02/08 10:58:19 bernadat - * print out msgcount for each port in db_port_iterate - * Change from NORMA_MK14.6(August 93) [1993/07/27 12:35:17 mmp] - * - * Removed defintion of db_maxoff (got from ). - * [93/08/12 paire] - * - * Show ipc_space_remote msg counts only if NORMA_IPC is on - * [93/07/21 bernadat] - * - * Add /s option to "show ipc_port" to pick out port sets. - * Change from NORMA_MK14.6 [1993/02/17 16:29:54 dwm] - * [93/07/16 bernadat] - * [94/02/07 bernadat] - * - * Revision 1.2.20.8 1994/06/08 19:11:15 dswartz - * Preemption merge. 
- * [1994/06/08 19:10:18 dswartz] - * - * Revision 1.2.20.7 1994/04/30 21:28:24 bolinger - * Thread control ops synchronization: now that TH_SUSP is back, - * enable ddb to show it when printing thread state. - * [1994/04/28 21:55:42 bolinger] - * - * Revision 1.2.20.6 1994/03/17 22:35:31 dwm - * The infamous name change: thread_activation + thread_shuttle = thread. - * [1994/03/17 21:25:46 dwm] - * - * Revision 1.2.20.5 1994/01/26 15:43:37 bolinger - * Move kernel_stack from thread to activation. - * [1994/01/25 21:53:11 bolinger] - * - * Revision 1.2.20.4 1994/01/12 17:50:44 dwm - * Coloc: initial restructuring to follow Utah model. - * [1994/01/12 17:13:12 dwm] - * - * Revision 1.2.20.3 1993/11/18 18:11:47 dwm - * Coloc: remove continuations entirely; they are incompatible - * with migration, and their volume is obfuscatory. - * [1993/11/18 18:06:27 dwm] - * - * Revision 1.2.20.2 1993/10/12 16:38:50 dwm - * CoLoc: neuter continuations, ifdef USE_CONTINUATIONS. - * [1993/10/12 16:14:46 dwm] - * - * Revision 1.2.8.4 1993/08/11 20:38:06 elliston - * Add ANSI Prototypes. CR #9523. - * [1993/08/11 03:33:51 elliston] - * - * Revision 1.2.8.3 1993/07/27 18:27:55 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:12:39 elliston] - * - * Revision 1.2.8.2 1993/06/09 02:20:35 gm - * CR9176 - ANSI C violations: trailing tokens on CPP - * directives, extra semicolons after decl_ ..., asm keywords - * [1993/06/07 18:57:22 jeffc] - * - * Removed a '#if MACH_FIXPRI' which somehow survived the purge. CR #9131. - * [1993/05/11 20:56:00 dswartz] - * - * Revision 1.2 1993/04/19 16:02:50 devrcs - * Added printout of thread scheduling policy to long form - * of thread display. - * [93/01/28 jat] - * - * Changes from mk78: - * Removed unused variable from db_show_regs(). - * [92/05/16 jfriedl] - * Converted some db_printsyms to db_task_printsyms. - * [92/04/10 danner] - * Changed db_print_thread so that both display formats - * show the floating-point-used status of the thread. - * [92/03/16 rpd] - * [93/02/02 bruel] - * - * Revision 1.1 1992/09/30 02:01:18 robert - * Initial revision - * - * $EndLog$ - */ -/* CMU_HIST */ -/* - * Revision 2.11.3.2 92/04/08 15:43:10 jeffreyh - * Added i option to show thread. This gives wait state information. - * [92/04/08 sjs] - * - * Revision 2.11.3.1 92/03/03 16:13:34 jeffreyh - * Pick up changes from TRUNK - * [92/02/26 11:00:01 jeffreyh] - * - * Revision 2.13 92/02/20 18:34:28 elf - * Fixed typo. - * [92/02/20 elf] - * - * Revision 2.12 92/02/19 15:07:47 elf - * Added db_thread_fp_used, to avoid machine-dependent conditionals. - * [92/02/19 rpd] - * - * Added 'F' flag to db_thread_stat showing if the thread has a valid - * FPU context. Tested on i386 and pmax. - * [92/02/17 kivinen] - * - * Revision 2.11 91/11/12 11:50:32 rvb - * Added OPTION_USER ("/u") to db_show_all_threads, db_show_one_thread, - * db_show_one_task. Without it, we display old-style information. - * [91/10/31 rpd] - * - * Revision 2.10 91/10/09 16:01:48 af - * Supported "show registers" for non current thread. - * Changed display format of thread and task information. - * Changed "show thread" to print current thread information - * if no thread is specified. - * Added "show_one_task" for "show task" command. - * Added IPC port print routines for "show ipc_port" command. - * [91/08/29 tak] - * - * Revision 2.9 91/08/03 18:17:19 jsb - * In db_print_thread, if the thread is swapped and there is a - * continuation function, print the function name in parentheses - * instead of '(swapped)'. 
- * [91/07/04 09:59:27 jsb] - * - * Revision 2.8 91/07/31 17:30:43 dbg - * Revise scheduling state machine. - * [91/07/30 16:43:42 dbg] - * - * Revision 2.7 91/07/09 23:15:57 danner - * Fixed a few printf that should be db_printfs. - * [91/07/08 danner] - * - * Revision 2.6 91/05/14 15:35:25 mrt - * Correcting copyright - * - * Revision 2.5 91/02/05 17:06:53 mrt - * Changed to new Mach copyright - * [91/01/31 16:18:56 mrt] - * - * Revision 2.4 90/10/25 14:43:54 rwd - * Changed db_show_regs to print unsigned. - * [90/10/19 rpd] - * Generalized the watchpoint support. - * [90/10/16 rwd] - * - * Revision 2.3 90/09/09 23:19:52 rpd - * Avoid totally incorrect guesses of symbol names for small values. - * [90/08/30 17:39:08 af] - * - * Revision 2.2 90/08/27 21:51:49 dbg - * Insist that 'show thread' be called with an explicit address. - * [90/08/22 dbg] - * - * Fix type for db_maxoff. - * [90/08/20 dbg] - * - * Do not dereference the "valuep" field of a variable directly, - * call the new db_read/write_variable functions instead. - * Reflected changes in symbol lookup functions. - * [90/08/20 af] - * Reduce lint. - * [90/08/10 14:33:44 dbg] - * - * Created. - * [90/07/25 dbg] - * - */ -/* CMU_ENDHIST */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 7/90 - */ - -/* - * Miscellaneous printing. - */ -#include -#include - -#include /* For strlen() */ -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for db_vm() */ - -#include -#include - -#include -#include -#include -#include -#include -#include /* For db_printf() */ -#include - -#include -#include /*** ??? fix so this can be removed ***/ - -#if TASK_SWAPPER -#include -#endif /* TASK_SWAPPER */ - -/* Prototypes for functions local to this file. XXX -- should be static! 
- */ - -char *db_act_stat( - register thread_act_t thr_act, - char *status); - -char *db_act_swap_stat( - register thread_act_t thr_act, - char *status); - -void db_print_task( - task_t task, - int task_id, - int flag); - -void db_reset_print_entry( - void); - -void db_print_one_entry( - ipc_entry_t entry, - int index, - mach_port_name_t name, - boolean_t is_pset); - -int db_port_iterate( - thread_act_t thr_act, - boolean_t is_pset, - boolean_t do_output); - -ipc_port_t db_lookup_port( - thread_act_t thr_act, - int id); - -static void db_print_port_id( - int id, - ipc_port_t port, - unsigned bits, - int n); - -void db_print_act( - thread_act_t thr_act, - int act_id, - int flag); - -void db_print_space( - task_t task, - int task_id, - int flag); - -void db_print_task_vm( - task_t task, - int task_id, - boolean_t title, - char *modif); - -void db_system_stats(void); - - -void -db_show_regs( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char *modif) -{ - register struct db_variable *regp; - db_expr_t value; - db_addr_t offset; - char * name; - register int i; - struct db_var_aux_param aux_param; - task_t task = TASK_NULL; - - aux_param.modif = modif; - aux_param.thr_act = THR_ACT_NULL; - if (db_option(modif, 't')) { - if (have_addr) { - if (!db_check_act_address_valid((thread_act_t)addr)) - return; - aux_param.thr_act = (thread_act_t)addr; - } else - aux_param.thr_act = db_default_act; - if (aux_param.thr_act != THR_ACT_NULL) - task = aux_param.thr_act->task; - } - for (regp = db_regs; regp < db_eregs; regp++) { - if (regp->max_level > 1) { - db_printf("bad multi-suffixed register %s\n", regp->name); - continue; - } - aux_param.level = regp->max_level; - for (i = regp->low; i <= regp->high; i++) { - aux_param.suffix[0] = i; - db_read_write_variable(regp, &value, DB_VAR_GET, &aux_param); - if (regp->max_level > 0) - db_printf("%s%d%*s", regp->name, i, - 12-strlen(regp->name)-((i<10)?1:2), ""); - else - db_printf("%-12s", regp->name); - db_printf("%#*N", 2+2*sizeof(vm_offset_t), value); - db_find_xtrn_task_sym_and_offset((db_addr_t)value, &name, - &offset, task); - if (name != 0 && offset <= db_maxoff && offset != value) { - db_printf("\t%s", name); - if (offset != 0) - db_printf("+%#r", offset); - } - db_printf("\n"); - } - } -} - -#define OPTION_LONG 0x001 /* long print option */ -#define OPTION_USER 0x002 /* print ps-like stuff */ -#define OPTION_INDENT 0x100 /* print with indent */ -#define OPTION_THREAD_TITLE 0x200 /* print thread title */ -#define OPTION_TASK_TITLE 0x400 /* print thread title */ - -#ifndef DB_TASK_NAME -#define DB_TASK_NAME(task) /* no task name */ -#define DB_TASK_NAME_TITLE "" /* no task name */ -#endif /* DB_TASK_NAME */ - -#ifndef db_act_fp_used -#define db_act_fp_used(thr_act) FALSE -#endif - -char * -db_act_stat( - register thread_act_t thr_act, - char *status) -{ - register char *p = status; - - if (!thr_act->active) { - *p++ = 'D', - *p++ = 'y', - *p++ = 'i', - *p++ = 'n', - *p++ = 'g'; - *p++ = ' '; - } else if (!thr_act->thread) { - *p++ = 'E', - *p++ = 'm', - *p++ = 'p', - *p++ = 't', - *p++ = 'y'; - *p++ = ' '; - } else { - thread_t athread = thr_act->thread; - - *p++ = (athread->state & TH_RUN) ? 'R' : '.'; - *p++ = (athread->state & TH_WAIT) ? 'W' : '.'; - *p++ = (athread->state & TH_SUSP) ? 'S' : '.'; - *p++ = (athread->state & TH_SWAPPED_OUT) ? 'O' : '.'; - *p++ = (athread->state & TH_UNINT) ? 'N' : '.'; - /* show if the FPU has been used */ - *p++ = db_act_fp_used(thr_act) ? 
'F' : '.'; - } - *p++ = 0; - return(status); -} - -char * -db_act_swap_stat( - register thread_act_t thr_act, - char *status) -{ - register char *p = status; - -#if THREAD_SWAPPER - switch (thr_act->swap_state & TH_SW_STATE) { - case TH_SW_UNSWAPPABLE: - *p++ = 'U'; - break; - case TH_SW_IN: - *p++ = 'I'; - break; - case TH_SW_GOING_OUT: - *p++ = 'G'; - break; - case TH_SW_WANT_IN: - *p++ = 'W'; - break; - case TH_SW_OUT: - *p++ = 'O'; - break; - case TH_SW_COMING_IN: - *p++ = 'C'; - break; - default: - *p++ = '?'; - break; - } - *p++ = (thr_act->swap_state & TH_SW_TASK_SWAPPING) ? 'T' : '.'; -#endif /* THREAD_SWAPPER */ - *p++ = 0; - - return status; -} - -char *policy_list[] = { "TS", "RR", "??", "FF", - "??", "??", "??", "BE"}; - -void -db_print_act( - thread_act_t thr_act, - int act_id, - int flag) -{ - thread_t athread; - char status[8]; - char swap_status[3]; - char *indent = ""; - int policy; - - if (!thr_act) { - db_printf("db_print_act(NULL)!\n"); - return; - } - - athread = thr_act->thread; - if (flag & OPTION_USER) { - - if (flag & OPTION_LONG) { - if (flag & OPTION_INDENT) - indent = " "; - if (flag & OPTION_THREAD_TITLE) { - db_printf("%s ID: ACT STAT SW STACK SHUTTLE", indent); - db_printf(" SUS PRI WAIT_FUNC\n"); - } - policy = (athread ? athread->policy : 2); - db_printf("%s%3d%c %0*X %s %s %0*X %0*X %3d %3d/%s ", - indent, act_id, - (thr_act == current_act())? '#': ':', - 2*sizeof(vm_offset_t), thr_act, - db_act_stat(thr_act, status), - db_act_swap_stat(thr_act, swap_status), - 2*sizeof(vm_offset_t), (athread ?athread->kernel_stack:0), - 2*sizeof(vm_offset_t), athread, - thr_act->suspend_count, - (athread ? athread->sched_pri : 999), /* XXX */ - policy_list[policy-1]); - if (athread) { - /* no longer TH_SWAP, no continuation to print */ - if (athread->state & TH_WAIT) - db_task_printsym((db_addr_t)athread->wait_event, - DB_STGY_ANY, kernel_task); - } - db_printf("\n"); - } else { - if (act_id % 3 == 0) { - if (flag & OPTION_INDENT) - db_printf("\n "); - } else - db_printf(" "); - db_printf("%3d%c(%0*X,%s)", act_id, - (thr_act == current_act())? '#': ':', - 2*sizeof(vm_offset_t), thr_act, - db_act_stat(thr_act, status)); - } - } else { - if (flag & OPTION_INDENT) - db_printf(" %3d (%0*X) ", act_id, - 2*sizeof(vm_offset_t), thr_act); - else - db_printf("(%0*X) ", 2*sizeof(vm_offset_t), thr_act); - if (athread) { - db_printf("%c%c%c%c%c", - (athread->state & TH_RUN) ? 'R' : ' ', - (athread->state & TH_WAIT) ? 'W' : ' ', - (athread->state & TH_SUSP) ? 'S' : ' ', - (athread->state & TH_UNINT)? 'N' : ' ', - db_act_fp_used(thr_act) ? 
'F' : ' '); - /* Obsolete TH_STACK_HANDOFF code, left for now; might enhance - * to print out safe_points instead */ - if (athread->state & TH_STACK_HANDOFF) { - if (athread->continuation) { - db_printf("("); - db_task_printsym((db_addr_t)athread->continuation, - DB_STGY_ANY, kernel_task); - db_printf(")"); - } else { - db_printf("(handoff)"); - } - } - if (athread->state & TH_WAIT) { - db_printf(" "); - db_task_printsym((db_addr_t)athread->wait_event, - DB_STGY_ANY, kernel_task); - } - } else - db_printf("Empty"); - db_printf("\n"); - } -} - -void -db_print_task( - task_t task, - int task_id, - int flag) -{ - thread_act_t thr_act; - int act_id; - char sstate; - - if (flag & OPTION_USER) { - if (flag & OPTION_TASK_TITLE) { - db_printf(" ID: TASK MAP THD RES SUS PR SW %s", - DB_TASK_NAME_TITLE); - if ((flag & OPTION_LONG) == 0) - db_printf(" ACTS"); - db_printf("\n"); - } -#if TASK_SWAPPER - switch ((int) task->swap_state) { - case TASK_SW_IN: - sstate = 'I'; - break; - case TASK_SW_OUT: - sstate = 'O'; - break; - case TASK_SW_GOING_OUT: - sstate = 'G'; - break; - case TASK_SW_COMING_IN: - sstate = 'C'; - break; - case TASK_SW_UNSWAPPABLE: - sstate = 'U'; - break; - default: - sstate = '?'; - break; - } -#else /* TASK_SWAPPER */ - sstate = 'I'; -#endif /* TASK_SWAPPER */ - /*** ??? fix me ***/ - db_printf("%3d: %0*X %0*X %3d %3d %3d %2d %c ", - task_id, 2*sizeof(vm_offset_t), task, - 2*sizeof(vm_offset_t), task->map, - task->thr_act_count, task->res_act_count, - task->suspend_count, - ((mk_sp_attributes_t)(task->sp_attributes))->priority, - sstate); - DB_TASK_NAME(task); - if (flag & OPTION_LONG) { - if (flag & OPTION_TASK_TITLE) - flag |= OPTION_THREAD_TITLE; - db_printf("\n"); - } else if (task->thr_act_count <= 1) - flag &= ~OPTION_INDENT; - act_id = 0; - queue_iterate(&task->thr_acts, thr_act, thread_act_t, thr_acts) { - db_print_act(thr_act, act_id, flag); - flag &= ~OPTION_THREAD_TITLE; - act_id++; - } - if ((flag & OPTION_LONG) == 0) - db_printf("\n"); - } else { - if (flag & OPTION_LONG) { - if (flag & OPTION_TASK_TITLE) { - db_printf(" TASK ACT\n"); - if (task->thr_act_count > 1) - flag |= OPTION_THREAD_TITLE; - } - } - db_printf("%3d (%0*X): ", task_id, 2*sizeof(vm_offset_t), task); - if (task->thr_act_count == 0) { - db_printf("no threads\n"); - } else { - if (task->thr_act_count > 1) { - db_printf("%d threads: \n", task->thr_act_count); - flag |= OPTION_INDENT; - } else - flag &= ~OPTION_INDENT; - act_id = 0; - queue_iterate(&task->thr_acts, thr_act, - thread_act_t, thr_acts) { - db_print_act(thr_act, act_id++, flag); - flag &= ~OPTION_THREAD_TITLE; - } - } - } -} - -void -db_print_space( - task_t task, - int task_id, - int flag) -{ - ipc_space_t space; - thread_act_t act = (thread_act_t)queue_first(&task->thr_acts); - int count; - - count = 0; - space = task->itk_space; - if (act) - count = db_port_iterate(act, FALSE, FALSE); - db_printf("%3d: %08x %08x %08x %sactive %d\n", - task_id, task, space, task->map, - space->is_active? 
"":"!", count); -} - -void -db_print_task_vm( - task_t task, - int task_id, - boolean_t title, - char *modif) -{ - vm_map_t map; - pmap_t pmap; - vm_size_t size; - long resident; - long wired; - - if (title) { - db_printf("id task map pmap virtual rss pg rss mem wir pg wir mem\n"); - } - - map = task->map; - pmap = vm_map_pmap(map); - - size = db_vm_map_total_size(map); - resident = pmap->stats.resident_count; - wired = pmap->stats.wired_count; - - db_printf("%2d %08x %08x %08x %7dK %6d %6dK %6d %6dK\n", - task_id, - task, - map, - pmap, - size / 1024, - resident, (resident * PAGE_SIZE) / 1024, - wired, (wired * PAGE_SIZE) / 1024); -} - - -void -db_show_one_task_vm( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char *modif) -{ - thread_act_t thread; - task_t task; - int task_id; - - if (have_addr == FALSE) { - if ((thread = db_default_act) == THR_ACT_NULL) { - if ((thread = current_act()) == THR_ACT_NULL) { - db_printf("no thread.\n"); - return; - } - } - task = thread->task; - } else { - task = (task_t) addr; - } - - task_id = db_lookup_task(task); - if (task_id < 0) { - db_printf("0x%x is not a task_t\n", addr); - return; - } - - db_print_task_vm(task, task_id, TRUE, modif); -} - -void -db_show_all_task_vm( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char *modif) -{ - task_t task; - int task_id; - boolean_t title = TRUE; - processor_set_t pset; - - task_id = 0; - queue_iterate(&all_psets, pset, processor_set_t, all_psets) { - queue_iterate(&pset->tasks, task, task_t, pset_tasks) { - db_print_task_vm(task, task_id, title, modif); - title = FALSE; - task_id++; - } - } -} - -void -db_show_all_acts( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif) -{ - task_t task; - int task_id; - int flag; - processor_set_t pset; - - flag = OPTION_TASK_TITLE|OPTION_INDENT; - if (db_option(modif, 'u')) - flag |= OPTION_USER; - if (db_option(modif, 'l')) - flag |= OPTION_LONG; - - task_id = 0; - queue_iterate(&all_psets, pset, processor_set_t, all_psets) { - queue_iterate(&pset->tasks, task, task_t, pset_tasks) { - db_print_task(task, task_id, flag); - flag &= ~OPTION_TASK_TITLE; - task_id++; - if ((flag & (OPTION_LONG|OPTION_INDENT)) == OPTION_INDENT) - db_printf("\n"); - } - } -} - -void -db_show_one_space( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif) -{ - int flag; - int task_id; - task_t task; - - flag = OPTION_TASK_TITLE; - if (db_option(modif, 'u')) - flag |= OPTION_USER; - if (db_option(modif, 'l')) - flag |= OPTION_LONG; - - if (!have_addr) { - task = db_current_task(); - if (task == TASK_NULL) { - db_error("No task\n"); - /*NOTREACHED*/ - } - } else - task = (task_t) addr; - - if ((task_id = db_lookup_task(task)) < 0) { - db_printf("bad task address 0x%x\n", addr); - db_error(0); - /*NOTREACHED*/ - } - - db_printf(" ID: TASK SPACE MAP COUNT\n"); - db_print_space(task, task_id, flag); -} - -void -db_show_all_spaces( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif) -{ - task_t task; - int task_id = 0; - int flag; - processor_set_t pset; - - flag = OPTION_TASK_TITLE|OPTION_INDENT; - if (db_option(modif, 'u')) - flag |= OPTION_USER; - if (db_option(modif, 'l')) - flag |= OPTION_LONG; - - db_printf(" ID: TASK SPACE MAP COUNT\n"); - queue_iterate(&all_psets, pset, processor_set_t, all_psets) { - queue_iterate(&pset->tasks, task, task_t, pset_tasks) { - db_print_space(task, task_id, flag); - task_id++; - } - } -} - -db_addr_t -db_task_from_space( - ipc_space_t space, - int *task_id) -{ - 
task_t task; - int tid = 0; - processor_set_t pset; - - queue_iterate(&all_psets, pset, processor_set_t, all_psets) { - queue_iterate(&pset->tasks, task, task_t, pset_tasks) { - if (task->itk_space == space) { - *task_id = tid; - return (db_addr_t)task; - } - tid++; - } - } - *task_id = 0; - return (0); -} - -void -db_show_one_act( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif) -{ - int flag; - int act_id; - thread_act_t thr_act; - - flag = OPTION_THREAD_TITLE; - if (db_option(modif, 'u')) - flag |= OPTION_USER; - if (db_option(modif, 'l')) - flag |= OPTION_LONG; - - if (!have_addr) { - thr_act = current_act(); - if (thr_act == THR_ACT_NULL) { - db_error("No thr_act\n"); - /*NOTREACHED*/ - } - } else - thr_act = (thread_act_t) addr; - - if ((act_id = db_lookup_act(thr_act)) < 0) { - db_printf("bad thr_act address %#x\n", addr); - db_error(0); - /*NOTREACHED*/ - } - - if (flag & OPTION_USER) { - db_printf("TASK%d(%0*X):\n", - db_lookup_task(thr_act->task), - 2*sizeof(vm_offset_t), thr_act->task); - db_print_act(thr_act, act_id, flag); - } else { - db_printf("task %d(%0*Xx): thr_act %d", - db_lookup_task(thr_act->task), - 2*sizeof(vm_offset_t), thr_act->task, act_id); - db_print_act(thr_act, act_id, flag); - } - if (db_option(modif, 'i') && thr_act->thread && - (thr_act->thread->state & TH_WAIT) && - thr_act->thread->kernel_stack == 0) { - - db_printf("Wait State: option 0x%x\n", - thr_act->thread->ith_option); - } -} - -void -db_show_one_task( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif) -{ - int flag; - int task_id; - task_t task; - - flag = OPTION_TASK_TITLE|OPTION_INDENT; - if (db_option(modif, 'u')) - flag |= OPTION_USER; - if (db_option(modif, 'l')) - flag |= OPTION_LONG; - - if (!have_addr) { - task = db_current_task(); - if (task == TASK_NULL) { - db_error("No task\n"); - /*NOTREACHED*/ - } - } else - task = (task_t) addr; - - if ((task_id = db_lookup_task(task)) < 0) { - db_printf("bad task address 0x%x\n", addr); - db_error(0); - /*NOTREACHED*/ - } - - db_print_task(task, task_id, flag); -} - -void -db_show_shuttle( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif) -{ - thread_shuttle_t shuttle; - thread_act_t thr_act; - - if (have_addr) - shuttle = (thread_shuttle_t) addr; - else { - thr_act = current_act(); - if (thr_act == THR_ACT_NULL) { - db_error("No thr_act\n"); - /*NOTREACHED*/ - } - shuttle = thr_act->thread; - if (shuttle == THREAD_NULL) { - db_error("No shuttle associated with current thr_act\n"); - /*NOTREACHED*/ - } - } - db_printf("shuttle %x:\n", shuttle); - if (shuttle->top_act == THR_ACT_NULL) - db_printf(" no activations\n"); - else { - db_printf(" activations:"); - for (thr_act = shuttle->top_act; thr_act != THR_ACT_NULL; - thr_act = thr_act->lower) { - if (thr_act != shuttle->top_act) - printf(" from"); - printf(" $task%d.%d(%x)", db_lookup_task(thr_act->task), - db_lookup_act(thr_act), thr_act); - } - db_printf("\n"); - } -} - -#define db_pset_kmsg_count(port) \ - (ipc_list_count((port)->ip_pset->ips_messages.imq_messages.ikmq_base)) - -int -db_port_kmsg_count( - ipc_port_t port) -{ - return (port->ip_pset ? 
db_pset_kmsg_count(port) : port->ip_msgcount); -} - -static int db_print_ent_cnt = 0; - -void db_reset_print_entry( - void) -{ - db_print_ent_cnt = 0; -} - -void -db_print_one_entry( - ipc_entry_t entry, - int index, - mach_port_t name, - boolean_t is_pset) -{ - ipc_port_t aport = (ipc_port_t)entry->ie_object; - unsigned bits = entry->ie_bits; - - if (is_pset && !aport->ip_pset) - return; - if (db_print_ent_cnt && db_print_ent_cnt % 2 == 0) - db_printf("\n"); - if (!name) - db_printf("\t%s%d[%x]", - !is_pset && aport->ip_pset ? "pset" : "port", - index, - MACH_PORT_MAKE(index, IE_BITS_GEN(bits))); - else - db_printf("\t%s[%x]", - !is_pset && aport->ip_pset ? "pset" : "port", - name); - if (!is_pset) { - db_printf("(%s,%x,%d)", - (bits & MACH_PORT_TYPE_RECEIVE)? "r": - (bits & MACH_PORT_TYPE_SEND)? "s": "S", - aport, - db_port_kmsg_count(aport)); - db_print_ent_cnt++; - } - else { - db_printf("(%s,%x,set=%x,%d)", - (bits & MACH_PORT_TYPE_RECEIVE)? "r": - (bits & MACH_PORT_TYPE_SEND)? "s": "S", - aport, - aport->ip_pset, - db_pset_kmsg_count(aport)); - db_print_ent_cnt++; - } -} - -int -db_port_iterate( - thread_act_t thr_act, - boolean_t is_pset, - boolean_t do_output) -{ - ipc_entry_t entry; - ipc_tree_entry_t tentry; - int index; - int size; - int count; - ipc_space_t space; - - count = 0; - space = thr_act->task->itk_space; - entry = space->is_table; - size = space->is_table_size; - db_reset_print_entry(); - for (index = 0; index < size; ++index, ++entry) { - if (entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) { - if (do_output) - db_print_one_entry(entry, - index, (mach_port_t)0, is_pset); - ++count; - } - } - for (tentry = ipc_splay_traverse_start(&space->is_tree); - tentry != ITE_NULL; - tentry = ipc_splay_traverse_next(&space->is_tree, FALSE)) { - entry = &tentry->ite_entry; - if (entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) { - if (do_output) - db_print_one_entry(entry, - 0, tentry->ite_name, is_pset); - ++count; - } - } - return (count); -} - -ipc_port_t -db_lookup_port( - thread_act_t thr_act, - int id) -{ - register ipc_space_t space; - register ipc_entry_t entry; - - if (thr_act == THR_ACT_NULL) - return(0); - space = thr_act->task->itk_space; - if (id < 0 || id >= space->is_table_size) - return(0); - entry = &space->is_table[id]; - if (entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) - return((ipc_port_t)entry->ie_object); - return(0); -} - -static void -db_print_port_id( - int id, - ipc_port_t port, - unsigned bits, - int n) -{ - if (n != 0 && n % 3 == 0) - db_printf("\n"); - db_printf("\tport%d(%s,%x)", id, - (bits & MACH_PORT_TYPE_RECEIVE)? "r": - (bits & MACH_PORT_TYPE_SEND)? "s": "S", port); -} - -void -db_show_port_id( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif) -{ - thread_act_t thr_act; - - if (!have_addr) { - thr_act = current_act(); - if (thr_act == THR_ACT_NULL) { - db_error("No thr_act\n"); - /*NOTREACHED*/ - } - } else - thr_act = (thread_act_t) addr; - if (db_lookup_act(thr_act) < 0) { - db_printf("Bad thr_act address 0x%x\n", addr); - db_error(0); - /*NOTREACHED*/ - } - if (db_port_iterate(thr_act, db_option(modif,'s'), TRUE)) - db_printf("\n"); -} - -/* - * Useful system state when the world has hung. 
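db_port_iterate() above counts a task's port rights in two passes: a linear walk of the IPC space's is_table, then a traversal of the overflow splay tree, testing each entry's ie_bits against MACH_PORT_TYPE_PORT_RIGHTS. A hedged sketch of the table half with simplified stand-in types (the real ipc_entry layout and right-bit values are the kernel's, not these):

    /* Stand-ins; the kernel's ipc_entry and right bits differ. */
    #define PORT_RIGHTS_MASK 0x7u
    struct entry { unsigned ie_bits; void *ie_object; };

    /* Count table entries holding any port right (splay-tree pass omitted). */
    static int count_port_rights(const struct entry *table, int size)
    {
        int count = 0;
        for (int i = 0; i < size; i++)
            if (table[i].ie_bits & PORT_RIGHTS_MASK)
                count++;
        return count;
    }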
- */ -void -db_system_stats() -{ - extern void db_device(void); - extern void db_sched(void); -#if DIPC - extern void db_dipc_stats(void); - extern void db_show_kkt(void); -#endif /* DIPC */ - - db_sched(); - iprintf("\n"); - db_vm(); - iprintf("\n"); - db_device(); -#if DIPC - iprintf("\n"); - db_dipc_stats(); - iprintf("\n"); - db_show_kkt(); -#endif /* DIPC */ - iprintf("\n"); - db_printf("current_{thread/task} 0x%x 0x%x\n", - current_thread(),current_task()); -} - -void db_show_one_runq(run_queue_t runq); - -void -db_show_runq( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char * modif) -{ - processor_set_t pset; - processor_t proc; - run_queue_t runq; - boolean_t showedany = FALSE; - - queue_iterate(&all_psets, pset, processor_set_t, all_psets) { - runq = &pset->runq; - if (runq->count > 0) { - db_printf("PROCESSOR SET %x\n", pset); - db_show_one_runq(runq); - showedany = TRUE; - } - } - if (!showedany) - db_printf("No runnable threads\n"); -} - -void -db_show_one_runq( - run_queue_t runq) -{ - int i, task_id, thr_act_id; - queue_t q; - thread_act_t thr_act; - thread_t thread; - task_t task; - - printf("PRI TASK.ACTIVATION\n"); - for (i = runq->low, q = runq->runq + i; i < NRQS; i++, q++) { - if (!queue_empty(q)) { - db_printf("%3d:", i); - queue_iterate(q, thread, thread_t, links) { - thr_act = thread->top_act; - task = thr_act->task; - task_id = db_lookup_task(task); - thr_act_id = db_lookup_task_act(task, thr_act); - db_printf(" %d.%d", task_id, thr_act_id); - } - db_printf("\n"); - } - } -} diff --git a/osfmk/ddb/stab.h b/osfmk/ddb/stab.h deleted file mode 100644 index 514ffb0a9..000000000 --- a/osfmk/ddb/stab.h +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
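db_show_one_runq() in the file deleted above treats a run queue as an array of per-priority thread lists, scanning from runq->low (the lowest occupied priority) up to NRQS and printing each thread as a task.activation pair. The same shape over a simplified structure, as a hedged sketch (NRQS and the node type are stand-ins):

    #include <stdio.h>

    #define NRQS 128   /* stand-in for the kernel's run-queue depth */

    struct thread_node { struct thread_node *next; int task_id, act_id; };
    struct run_queue   { int low; struct thread_node *runq[NRQS]; };

    static void show_one_runq(const struct run_queue *rq)
    {
        for (int pri = rq->low; pri < NRQS; pri++) {
            if (rq->runq[pri] == NULL)
                continue;
            printf("%3d:", pri);   /* one line per occupied priority */
            for (const struct thread_node *t = rq->runq[pri]; t; t = t->next)
                printf(" %d.%d", t->task_id, t->act_id);
            printf("\n");
        }
    }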
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:48 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:09 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.11.2 1995/01/06 19:11:14 devrcs - * mk6 CR668 - 1.3b26 merge - * added N_FRAME, an extension to aout symtabs - * for machines with non-self-describing frame formats - * [1994/10/14 03:40:05 dwm] - * - * Revision 1.1.11.1 1994/09/23 01:23:47 ezf - * change marker to not FREE - * [1994/09/22 21:11:53 ezf] - * - * Revision 1.1.4.3 1993/07/27 18:28:44 elliston - * Add ANSI prototypes. CR #9523. - * [1993/07/27 18:13:49 elliston] - * - * Revision 1.1.4.2 1993/06/02 23:13:40 jeffc - * Added to OSF/1 R1.3 from NMK15.0. - * [1993/06/02 20:58:12 jeffc] - * - * Revision 1.1 1992/09/30 02:24:31 robert - * Initial revision - * - * $EndLog$ - */ -/* CMU_HIST */ -/* - * Revision 2.2 91/10/09 16:05:28 af - * Revision 2.1 91/10/05 13:02:42 jeffreyh - * Created. - * - * Revision 2.1.1.1 91/10/05 13:03:14 jeffreyh - * Initial MK63 checkin - * - * Revision 2.1.1.1 91/07/31 13:14:49 jeffreyh - * Created from BSD network release #2 - * [91/07/31 jeffreyh] - * - * - */ -/* CMU_ENDHIST */ -/*- - * Copyright (c) 1991 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)stab.h 5.2 (Berkeley) 4/4/91 - */ -/* - */ - -#ifndef _DDB_DB_STAB_H_ -#define _DDB_DB_STAB_H_ - -/* - * The following are symbols used by various debuggers and by the Pascal - * compiler. Each of them must have one (or more) of the bits defined by - * the N_STAB mask set. 
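As the header's own comment says further down, every stab code carries at least one bit of the N_STAB mask (0xe0); that is what lets a symbol reader separate debugger records from ordinary symbols before ever looking at the specific codes listed below:

    #define N_STAB 0xe0

    /* True iff the entry is a debugger (stab) record; e.g. N_SLINE 0x44
     * has bit 0x40 set, so 0x44 & 0xe0 != 0. */
    static int is_stab(unsigned char n_type)
    {
        return (n_type & N_STAB) != 0;
    }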
- */ - -#define N_GSYM 0x20 /* global symbol */ -#define N_FNAME 0x22 /* F77 function name */ -#define N_FUN 0x24 /* procedure name */ -#define N_STSYM 0x26 /* data segment variable */ -#define N_LCSYM 0x28 /* bss segment variable */ -#define N_MAIN 0x2a /* main function name */ -#define N_PC 0x30 /* global Pascal symbol */ -#define N_FRAME 0x34 /* stack frame descriptor */ -#define N_RSYM 0x40 /* register variable */ -#define N_SLINE 0x44 /* text segment line number */ -#define N_DSLINE 0x46 /* data segment line number */ -#define N_BSLINE 0x48 /* bss segment line number */ -#define N_SSYM 0x60 /* structure/union element */ -#define N_SO 0x64 /* main source file name */ -#define N_LSYM 0x80 /* stack variable */ -#define N_BINCL 0x82 /* include file beginning */ -#define N_SOL 0x84 /* included source file name */ -#define N_PSYM 0xa0 /* parameter variable */ -#define N_EINCL 0xa2 /* include file end */ -#define N_ENTRY 0xa4 /* alternate entry point */ -#define N_LBRAC 0xc0 /* left bracket */ -#define N_EXCL 0xc2 /* deleted include file */ -#define N_RBRAC 0xe0 /* right bracket */ -#define N_BCOMM 0xe2 /* begin common */ -#define N_ECOMM 0xe4 /* end common */ -#define N_ECOML 0xe8 /* end common (local name) */ -#define N_LENG 0xfe /* length of preceding entry */ - -#endif /* !_DDB_DB_STAB_H_ */ diff --git a/osfmk/default_pager/default_pager.c b/osfmk/default_pager/default_pager.c index 5b2ee7b41..21d3667eb 100644 --- a/osfmk/default_pager/default_pager.c +++ b/osfmk/default_pager/default_pager.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -63,7 +63,6 @@ #include "default_pager_internal.h" #include #include -#include #include #include #include @@ -282,22 +281,11 @@ start_def_pager( __unused char *bs_device ) */ /* MACH_PORT_FACE security_port; - MACH_PORT_FACE root_ledger_wired; - MACH_PORT_FACE root_ledger_paged; */ __unused static char here[] = "main"; -/* - default_pager_host_port = ipc_port_make_send(realhost.host_priv_self); - master_device_port = ipc_port_make_send(master_device_port); - root_ledger_wired = ipc_port_make_send(root_wired_ledger_port); - root_ledger_paged = ipc_port_make_send(root_paged_ledger_port); - security_port = ipc_port_make_send(realhost.host_security_self); -*/ - - #if NORMA_VM norma_mk = 1; #else diff --git a/osfmk/default_pager/default_pager_internal.h b/osfmk/default_pager/default_pager_internal.h index 904643741..225ac2378 100644 --- a/osfmk/default_pager/default_pager_internal.h +++ b/osfmk/default_pager/default_pager_internal.h @@ -297,6 +297,7 @@ typedef struct backing_store *backing_store_t; #ifdef MACH_KERNEL #define BS_LOCK_INIT(bs) lck_mtx_init(&(bs)->bs_lock, &default_pager_lck_grp, &default_pager_lck_attr) +#define BS_LOCK_DESTROY(bs) lck_mtx_destroy(&(bs)->bs_lock, &default_pager_lck_grp) #define BS_LOCK(bs) lck_mtx_lock(&(bs)->bs_lock) #define BS_UNLOCK(bs) lck_mtx_unlock(&(bs)->bs_lock) @@ -309,6 +310,7 @@ extern struct backing_store_list_head backing_store_list; extern int backing_store_release_trigger_disable; #define BSL_LOCK_INIT() lck_mtx_init(&backing_store_list.bsl_lock, &default_pager_lck_grp, &default_pager_lck_attr) +#define BSL_LOCK_DESTROY() lck_mtx_destroy(&backing_store_list.bsl_lock, &default_pager_lck_grp) #define BSL_LOCK() lck_mtx_lock(&backing_store_list.bsl_lock) #define BSL_UNLOCK() lck_mtx_unlock(&backing_store_list.bsl_lock) @@ -365,6 +367,7 @@ typedef struct 
paging_segment *paging_segment_t; #define PAGING_SEGMENT_NULL ((paging_segment_t) 0) #define PS_LOCK_INIT(ps) lck_mtx_init(&(ps)->ps_lock, &default_pager_lck_grp, &default_pager_lck_attr) +#define PS_LOCK_DESTROY(ps) lck_mtx_destroy(&(ps)->ps_lock, &default_pager_lck_grp) #define PS_LOCK(ps) lck_mtx_lock(&(ps)->ps_lock) #define PS_UNLOCK(ps) lck_mtx_unlock(&(ps)->ps_lock) @@ -386,7 +389,8 @@ extern int paging_segment_count; /* number of active paging segments */ extern int paging_segment_max; /* highest used paging segment index */ extern int ps_select_array[DEFAULT_PAGER_BACKING_STORE_MAXPRI+1]; -#define PSL_LOCK_INIT() lck_mtx_init(&paging_segments_lock, &default_pager_lck_grp, &default_pager_lck_attr) +#define PSL_LOCK_INIT() lck_mtx_init(&paging_segments_lock, &default_pager_lck_grp, &default_pager_lck_attr) +#define PSL_LOCK_DESTROY() lck_mtx_destroy(&paging_segments_lock, &default_pager_lck_grp) #define PSL_LOCK() lck_mtx_lock(&paging_segments_lock) #define PSL_UNLOCK() lck_mtx_unlock(&paging_segments_lock) @@ -523,6 +527,7 @@ typedef struct vstruct_alias { } vstruct_alias_t; #define DPT_LOCK_INIT(lock) lck_mtx_init(&(lock), &default_pager_lck_grp, &default_pager_lck_attr) +#define DPT_LOCK_DESTROY(lock) lck_mtx_destroy(&(lock), &default_pager_lck_grp) #define DPT_LOCK(lock) lck_mtx_lock(&(lock)) #define DPT_UNLOCK(lock) lck_mtx_unlock(&(lock)) #define DPT_SLEEP(lock, e, i) lck_mtx_sleep(&(lock), LCK_SLEEP_DEFAULT, (event_t)(e), i) @@ -533,6 +538,7 @@ typedef struct vstruct_alias { #define VS_UNLOCK(vs) hw_lock_unlock(&(vs)->vs_lock) #define VS_MAP_LOCK_TYPE lck_mtx_t #define VS_MAP_LOCK_INIT(vs) lck_mtx_init(&(vs)->vs_map_lock, &default_pager_lck_grp, &default_pager_lck_attr) +#define VS_MAP_LOCK_DESTROY(vs) lck_mtx_destroy(&(vs)->vs_map_lock, &default_pager_lck_grp) #define VS_MAP_LOCK(vs) lck_mtx_lock(&(vs)->vs_map_lock) #define VS_MAP_TRY_LOCK(vs) lck_mtx_try_lock(&(vs)->vs_map_lock) #define VS_MAP_UNLOCK(vs) lck_mtx_unlock(&(vs)->vs_map_lock) @@ -660,6 +666,7 @@ extern lck_grp_t default_pager_lck_grp; extern lck_attr_t default_pager_lck_attr; #define VSL_LOCK_INIT() lck_mtx_init(&vstruct_list.vsl_lock, &default_pager_lck_grp, &default_pager_lck_attr) +#define VSL_LOCK_DESTROY() lck_mtx_destroy(&vstruct_list.vsl_lock, &default_pager_lck_grp) #define VSL_LOCK() lck_mtx_lock(&vstruct_list.vsl_lock) #define VSL_LOCK_TRY() lck_mtx_try_lock(&vstruct_list.vsl_lock) #define VSL_UNLOCK() lck_mtx_unlock(&vstruct_list.vsl_lock) diff --git a/osfmk/default_pager/dp_backing_store.c b/osfmk/default_pager/dp_backing_store.c index 9fcf6a2bd..2b97a2122 100644 --- a/osfmk/default_pager/dp_backing_store.c +++ b/osfmk/default_pager/dp_backing_store.c @@ -151,6 +151,7 @@ void vs_free_async(struct vs_async *vsa); /* forward */ #define VS_ASYNC_LOCK() lck_mtx_lock(&default_pager_async_lock) #define VS_ASYNC_UNLOCK() lck_mtx_unlock(&default_pager_async_lock) #define VS_ASYNC_LOCK_INIT() lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr) +#define VS_ASYNC_LOCK_DESTROY() lck_mtx_destroy(&default_pager_async_lock, &default_pager_lck_grp) #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock) /* * Paging Space Hysteresis triggers and the target notification port @@ -603,7 +604,10 @@ default_pager_backing_store_create( } else { ipc_port_dealloc_kernel((MACH_PORT_FACE)(port)); + + BS_LOCK_DESTROY(bs); kfree(bs, sizeof (struct backing_store)); + return KERN_RESOURCE_SHORTAGE; } @@ -1001,6 +1005,7 @@ restart: /* * Free the backing store structure. 
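The new BS/BSL/PS/PSL/DPT/VS_MAP/VSL *_LOCK_DESTROY macros above give every lck_mtx_init() a matching lck_mtx_destroy() against the same lock group, and the dp_backing_store.c hunks that follow invoke them on each error path before kfree(); without the destroy, the lock group's accounting leaks along with the mutex state. A hedged in-kernel sketch of the pattern (the my_obj structure and setup_ok flag are illustrative; the group/attr externs are the ones declared above):

    #include <kern/locks.h>
    #include <kern/kalloc.h>

    extern lck_grp_t  default_pager_lck_grp;    /* as declared above */
    extern lck_attr_t default_pager_lck_attr;

    struct my_obj { lck_mtx_t lock; /* ... payload ... */ };

    static struct my_obj *
    my_obj_create(boolean_t setup_ok)
    {
        struct my_obj *o = (struct my_obj *) kalloc(sizeof (*o));
        if (o == NULL)
            return NULL;
        lck_mtx_init(&o->lock, &default_pager_lck_grp, &default_pager_lck_attr);
        if (!setup_ok) {
            /* teardown must mirror init before freeing the memory */
            lck_mtx_destroy(&o->lock, &default_pager_lck_grp);
            kfree(o, sizeof (*o));
            return NULL;
        }
        return o;
    }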
*/ + BS_LOCK_DESTROY(bs); kfree(bs, sizeof *bs); return KERN_SUCCESS; @@ -1110,6 +1115,7 @@ default_pager_add_segment( PS_LOCK_INIT(ps); ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls)); if (!ps->ps_bmap) { + PS_LOCK_DESTROY(ps); kfree(ps, sizeof *ps); BS_UNLOCK(bs); return KERN_RESOURCE_SHORTAGE; @@ -1131,6 +1137,8 @@ default_pager_add_segment( if ((error = ps_enter(ps)) != 0) { kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls)); + + PS_LOCK_DESTROY(ps); kfree(ps, sizeof *ps); BS_UNLOCK(bs); return KERN_RESOURCE_SHORTAGE; @@ -1876,6 +1884,8 @@ ps_vstruct_dealloc( bs_commit(- vs->vs_size); + VS_MAP_LOCK_DESTROY(vs); + zfree(vstruct_zone, vs); } @@ -1886,8 +1896,6 @@ ps_vstruct_reclaim( boolean_t reclaim_backing_store) { unsigned int i, j; -// spl_t s; - unsigned int request_flags; struct vs_map *vsmap; boolean_t vsmap_all_clear, vsimap_all_clear; struct vm_object_fault_info fault_info; @@ -1895,15 +1903,6 @@ ps_vstruct_reclaim( unsigned int vsmap_size; kern_return_t kr; - request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE; - if (reclaim_backing_store) { -#if USE_PRECIOUS - request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE; -#else /* USE_PRECIOUS */ - request_flags |= UPL_REQUEST_SET_DIRTY; -#endif /* USE_PRECIOUS */ - } - VS_MAP_LOCK(vs); fault_info.cluster_size = VM_SUPER_CLUSTER; @@ -1912,6 +1911,7 @@ ps_vstruct_reclaim( fault_info.lo_offset = 0; fault_info.hi_offset = ptoa_32(vs->vs_size << vs->vs_clshift); fault_info.io_sync = reclaim_backing_store; + fault_info.batch_pmap_op = FALSE; /* * If this is an indirect structure, then we walk through the valid @@ -2937,7 +2937,17 @@ pvs_cluster_read( i = pages_in_cl; } else { i = 1; - request_flags |= UPL_NOBLOCK; + + /* + * if the I/O cluster size == PAGE_SIZE, we don't want to set + * the UPL_NOBLOCK since we may be trying to recover from a + * previous partial pagein I/O that occurred because we were low + * on memory and bailed early in order to honor the UPL_NOBLOCK... + * since we're only asking for a single page, we can block w/o fear + * of tying up pages while waiting for more to become available + */ + if (fault_info == NULL || ((vm_object_fault_info_t)fault_info)->cluster_size > PAGE_SIZE) + request_flags |= UPL_NOBLOCK; } again: @@ -2975,7 +2985,8 @@ again: memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset, PAGE_SIZE, PAGE_SIZE, &upl, NULL, &page_list_count, - request_flags); + request_flags | UPL_SET_INTERNAL); + upl_range_needed(upl, 0, 1); if (clmap.cl_error) upl_abort(upl, UPL_ABORT_ERROR); @@ -3480,10 +3491,23 @@ vs_cluster_write( * Ignore any non-present pages at the end of the * UPL. 
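The pvs_cluster_read() hunk above stops setting UPL_NOBLOCK for single-page (cluster_size == PAGE_SIZE) requests: as its new comment explains, a one-page retry after a partial NOBLOCK pagein can safely block for memory, since a single page cannot tie up a large run of pages while waiting for more to become available. A hedged sketch of just that flag decision (names follow the hunk; types and flag values are simplified stand-ins):

    #include <stddef.h>

    #define UPL_NOBLOCK  0x1u    /* stand-in value; the real flag is the kernel's */
    #define PAGE_SIZE    4096u

    struct fault_info { unsigned cluster_size; };

    /* Only multi-page cluster reads may refuse to block for free pages. */
    static unsigned request_flags_for(const struct fault_info *fi, unsigned flags)
    {
        if (fi == NULL || fi->cluster_size > PAGE_SIZE)
            flags |= UPL_NOBLOCK;
        return flags;
    }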
*/ - for (page_index = upl->size / vm_page_size; page_index > 0;) - if (UPL_PAGE_PRESENT(pl, --page_index)) + for (page_index = upl->size / vm_page_size; page_index > 0;) { + if (UPL_PAGE_PRESENT(pl, --page_index)) { + page_index++; break; - num_of_pages = page_index + 1; + } + } + if (page_index == 0) { + /* + * no pages in the UPL + * abort and return + */ + upl_abort(upl, 0); + upl_deallocate(upl); + + return KERN_SUCCESS; + } + num_of_pages = page_index; base_index = (upl_offset_in_object % cl_size) / PAGE_SIZE; @@ -3601,17 +3625,6 @@ vs_cluster_write( ps_offset[seg_index] + seg_offset, transfer_size, flags); - } else { - boolean_t empty = FALSE; - upl_abort_range(upl, - first_dirty * vm_page_size, - num_dirty * vm_page_size, - UPL_ABORT_NOTIFY_EMPTY, - &empty); - if (empty) { - assert(page_index == num_of_pages); - upl_deallocate(upl); - } } } @@ -4251,6 +4264,7 @@ default_pager_add_file( PS_LOCK_INIT(ps); ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls)); if (!ps->ps_bmap) { + PS_LOCK_DESTROY(ps); kfree(ps, sizeof *ps); BS_UNLOCK(bs); return KERN_RESOURCE_SHORTAGE; @@ -4273,6 +4287,7 @@ default_pager_add_file( if ((error = ps_enter(ps)) != 0) { kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls)); + PS_LOCK_DESTROY(ps); kfree(ps, sizeof *ps); BS_UNLOCK(bs); return KERN_RESOURCE_SHORTAGE; @@ -4300,7 +4315,7 @@ default_pager_add_file( * online but not activated (till it's needed the next time). */ #if CONFIG_FREEZE - if (!vm_freeze_enabled) + if (!memorystatus_freeze_enabled) #endif { ps = paging_segments[EMERGENCY_PSEG_INDEX]; @@ -4482,7 +4497,7 @@ default_pager_triggers( __unused MACH_PORT_FACE default_pager, /* High and low water signals aren't applicable when freeze is */ /* enabled, so release the trigger ports here and return */ /* KERN_FAILURE. 
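The vs_cluster_write() rewrite at the top of this span fixes the backwards scan for the last present UPL page: the old code computed num_of_pages = page_index + 1 even when the scan found nothing, while the new loop converts the index of the last present page into a count (the page_index++ before break) and aborts and deallocates the UPL outright when no page is present. The same logic over a plain boolean array, as a hedged sketch:

    #include <stdbool.h>

    /* Return the count of pages up to and including the last present one,
     * or 0 if none are present (caller should abort the empty UPL). */
    static int trailing_present_count(const bool *present, int npages)
    {
        int page_index;
        for (page_index = npages; page_index > 0;) {
            if (present[--page_index]) {
                page_index++;   /* index of last present page -> count */
                break;
            }
        }
        return page_index;      /* 0 means no present pages at all */
    }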
*/ - if (vm_freeze_enabled) { + if (memorystatus_freeze_enabled) { if (IP_VALID( trigger_port )){ ipc_port_release_send( trigger_port ); } @@ -4500,7 +4515,7 @@ default_pager_triggers( __unused MACH_PORT_FACE default_pager, } else if (flags == LO_WAT_ALERT) { release = max_pages_trigger_port; #if CONFIG_FREEZE - if (vm_freeze_enabled) { + if (memorystatus_freeze_enabled) { if (IP_VALID( trigger_port )){ ipc_port_release_send( trigger_port ); } diff --git a/osfmk/default_pager/dp_memory_object.c b/osfmk/default_pager/dp_memory_object.c index e122e7711..83c24fe6f 100644 --- a/osfmk/default_pager/dp_memory_object.c +++ b/osfmk/default_pager/dp_memory_object.c @@ -350,8 +350,8 @@ default_pager_add( pset = default_pager_external_set; } - ipc_port_make_sonce(mem_obj); ip_lock(mem_obj); /* unlocked in nsrequest below */ + ipc_port_make_sonce_locked(mem_obj); ipc_port_nsrequest(mem_obj, sync, mem_obj, &previous); } diff --git a/osfmk/device/device.defs b/osfmk/device/device.defs index 458d89540..dda80916e 100644 --- a/osfmk/device/device.defs +++ b/osfmk/device/device.defs @@ -355,13 +355,15 @@ routine io_iterator_is_valid( out is_valid : boolean_t ); -routine io_make_matching( +skip; +/* was routine io_make_matching( master_port : mach_port_t; in of_type : uint32_t; in options : uint32_t; in input : io_struct_inband_t; out matching : io_string_t ); +*/ routine io_catalog_send_data( master_port : mach_port_t; @@ -695,6 +697,20 @@ routine io_connect_method_var_output( out var_output : io_buf_ptr_t, physicalcopy ); +routine io_service_get_matching_service( + master_port : mach_port_t; + in matching : io_string_t; + out service : io_object_t + ); + +routine io_service_get_matching_service_ool( + master_port : mach_port_t; + in matching : io_buf_ptr_t, physicalcopy; + out result : kern_return_t; + out service : io_object_t + ); + + #endif /* IOKIT */ /* vim: set ft=c : */ diff --git a/osfmk/device/device_init.c b/osfmk/device/device_init.c index 05e76bc0e..b935e14c6 100644 --- a/osfmk/device/device_init.c +++ b/osfmk/device/device_init.c @@ -81,6 +81,11 @@ ipc_port_t master_device_port; +lck_grp_attr_t * dev_lck_grp_attr; +lck_grp_t * dev_lck_grp; +lck_attr_t * dev_lck_attr; +lck_mtx_t iokit_obj_to_port_binding_lock; + void device_service_create(void) { @@ -92,6 +97,16 @@ device_service_create(void) kernel_set_special_port(host_priv_self(), HOST_IO_MASTER_PORT, ipc_port_make_send(master_device_port)); + /* allocate device lock group attribute and group */ + dev_lck_grp_attr= lck_grp_attr_alloc_init(); + dev_lck_grp = lck_grp_alloc_init("device", dev_lck_grp_attr); + + /* Allocate device lock attribute */ + dev_lck_attr = lck_attr_alloc_init(); + + /* Initialize the IOKit object to port binding lock */ + lck_mtx_init(&iokit_obj_to_port_binding_lock, dev_lck_grp, dev_lck_attr); + #if 0 ds_init(); net_io_init(); diff --git a/osfmk/device/iokit_rpc.c b/osfmk/device/iokit_rpc.c index 5c5f8b742..1b6c4d4ec 100644 --- a/osfmk/device/iokit_rpc.c +++ b/osfmk/device/iokit_rpc.c @@ -25,10 +25,7 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include #include -#include - #include #include #include @@ -116,6 +113,9 @@ extern void iokit_retain_port( ipc_port_t port ); extern void iokit_release_port( ipc_port_t port ); extern void iokit_release_port_send( ipc_port_t port ); +extern void iokit_lock_port(ipc_port_t port); +extern void iokit_unlock_port(ipc_port_t port); + extern kern_return_t iokit_switch_object_port( ipc_port_t port, io_object_t obj, ipc_kobject_type_t type ); /* @@ -145,7 +145,7 @@ 
iokit_lookup_object_port( if (!IP_VALID(port)) return (NULL); - ip_lock(port); + iokit_lock_port(port); if (ip_active(port) && (ip_kotype(port) == IKOT_IOKIT_OBJECT)) { obj = (io_object_t) port->ip_kobject; iokit_add_reference( obj ); @@ -153,7 +153,7 @@ iokit_lookup_object_port( else obj = NULL; - ip_unlock(port); + iokit_unlock_port(port); return( obj ); } @@ -167,7 +167,7 @@ iokit_lookup_connect_port( if (!IP_VALID(port)) return (NULL); - ip_lock(port); + iokit_lock_port(port); if (ip_active(port) && (ip_kotype(port) == IKOT_IOKIT_CONNECT)) { obj = (io_object_t) port->ip_kobject; iokit_add_reference( obj ); @@ -175,7 +175,7 @@ iokit_lookup_connect_port( else obj = NULL; - ip_unlock(port); + iokit_unlock_port(port); return( obj ); } @@ -192,14 +192,19 @@ iokit_lookup_connect_ref(io_object_t connectRef, ipc_space_t space) kr = ipc_object_translate(space, CAST_MACH_PORT_TO_NAME(connectRef), MACH_PORT_RIGHT_SEND, (ipc_object_t *)&port); if (kr == KERN_SUCCESS) { - assert(IP_VALID(port)); - - if (ip_active(port) && (ip_kotype(port) == IKOT_IOKIT_CONNECT)) { - obj = (io_object_t) port->ip_kobject; - iokit_add_reference(obj); - } - - ip_unlock(port); + assert(IP_VALID(port)); + + ip_reference(port); + ip_unlock(port); + + iokit_lock_port(port); + if (ip_active(port) && (ip_kotype(port) == IKOT_IOKIT_CONNECT)) { + obj = (io_object_t) port->ip_kobject; + iokit_add_reference(obj); + } + iokit_unlock_port(port); + + ip_release(port); } } @@ -230,6 +235,20 @@ iokit_release_port_send( ipc_port_t port ) ipc_port_release_send( port ); } +extern lck_mtx_t iokit_obj_to_port_binding_lock; + +EXTERN void +iokit_lock_port( __unused ipc_port_t port ) +{ + lck_mtx_lock(&iokit_obj_to_port_binding_lock); +} + +EXTERN void +iokit_unlock_port( __unused ipc_port_t port ) +{ + lck_mtx_unlock(&iokit_obj_to_port_binding_lock); +} + /* * Get the port for a device. * Consumes a device reference; produces a naked send right. @@ -298,9 +317,10 @@ iokit_alloc_object_port( io_object_t obj, ipc_kobject_type_t type ) ipc_kobject_set( port, (ipc_kobject_t) obj, type); /* Request no-senders notifications on the port. */ - notify = ipc_port_make_sonce( port); ip_lock( port); + notify = ipc_port_make_sonce_locked( port); ipc_port_nsrequest( port, 1, notify, ¬ify); + /* port unlocked */ assert( notify == IP_NULL); gIOKitPortCount++; @@ -326,7 +346,9 @@ iokit_destroy_object_port( ipc_port_t port ) EXTERN kern_return_t iokit_switch_object_port( ipc_port_t port, io_object_t obj, ipc_kobject_type_t type ) { + iokit_lock_port(port); ipc_kobject_set( port, (ipc_kobject_t) obj, type); + iokit_unlock_port(port); return( KERN_SUCCESS); } @@ -388,7 +410,7 @@ iokit_no_senders( mach_no_senders_notification_t * notification ) // convert a port to io_object_t. if( IP_VALID(port)) { - ip_lock(port); + iokit_lock_port(port); if( ip_active(port)) { obj = (io_object_t) port->ip_kobject; type = ip_kotype( port ); @@ -398,7 +420,7 @@ iokit_no_senders( mach_no_senders_notification_t * notification ) else obj = NULL; } - ip_unlock(port); + iokit_unlock_port(port); if( obj ) { @@ -406,11 +428,15 @@ iokit_no_senders( mach_no_senders_notification_t * notification ) if( KERN_SUCCESS != iokit_client_died( obj, port, type, &mscount )) { - /* Re-request no-senders notifications on the port. 
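Two locking changes run through the iokit_rpc.c hunks above. First, object lookups now serialize on the single iokit_obj_to_port_binding_lock rather than the per-port ip_lock, so iokit_switch_object_port() (which now takes the same mutex) cannot swap a port's kobject out from under a concurrent lookup; in iokit_lookup_connect_ref() a port reference is taken before ip_lock is dropped, keeping the port alive across the lock switch. Second, send-once rights for no-senders notifications are created with ipc_port_make_sonce_locked() while the port lock is already held, so the make-sonce and the nsrequest registration happen under one lock hold. A hedged in-kernel sketch of the second pattern (the wrapper function is illustrative; the ipc_port_* calls are the kernel's):

    #include <ipc/ipc_port.h>
    #include <kern/assert.h>

    /* Arm a no-senders notification atomically with respect to the port. */
    static void arm_no_senders(ipc_port_t port, mach_port_mscount_t sync)
    {
        ipc_port_t notify;

        ip_lock(port);                                /* hold the port lock... */
        notify = ipc_port_make_sonce_locked(port);    /* ...while making sonce */
        ipc_port_nsrequest(port, sync, notify, &notify); /* unlocks the port */
        assert(notify == IP_NULL);                    /* no stale notify port */
    }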
*/ - notify = ipc_port_make_sonce( port); - ip_lock( port); - ipc_port_nsrequest( port, mscount + 1, notify, &notify); - assert( notify == IP_NULL); + /* Re-request no-senders notifications on the port (if still active) */ + ip_lock(port); + if (ip_active(port)) { + notify = ipc_port_make_sonce_locked(port); + ipc_port_nsrequest( port, mscount + 1, notify, &notify); + /* port unlocked */ + if ( notify != IP_NULL) + ipc_port_release_sonce(notify); + } } iokit_remove_reference( obj ); } @@ -478,6 +504,9 @@ kern_return_t IOMapPages(vm_map_t map, mach_vm_address_t va, mach_vm_address_t p case kIOMapCopybackCache: flags = VM_WIMG_COPYBACK; break; + case kIOMapCopybackInnerCache: + flags = VM_WIMG_INNERWBACK; + break; } pmap_set_cache_attributes(pagenum, flags); @@ -540,7 +569,7 @@ kern_return_t IOProtectCacheMode(vm_map_t __unused map, mach_vm_address_t __unus { ppnum_t ppnum = pmap_find_phys(pmap, va + off); if (ppnum) - pmap_enter(pmap, va + off, ppnum, prot, flags, TRUE); + pmap_enter(pmap, va + off, ppnum, prot, VM_PROT_NONE, flags, TRUE); } return (KERN_SUCCESS); diff --git a/osfmk/device/subrs.c b/osfmk/device/subrs.c index 105edff0f..4956a3b40 100644 --- a/osfmk/device/subrs.c +++ b/osfmk/device/subrs.c @@ -187,6 +187,7 @@ strcmp( * comparison runs for at most "n" characters. */ +// ARM implementation in ../arm/strncmp.s int strncmp( const char *s1, @@ -285,6 +286,7 @@ strcpy( * to the "to" string. */ +// ARM implementation in ../arm/strncpy.s char * strncpy( char *s1, @@ -378,6 +380,8 @@ atoi_term( * outputs: * length of s or max; whichever is smaller */ + +// ARM implementation in ../arm/strnlen.s size_t strnlen(const char *s, size_t max) { const char *es = s + max, *p = s; @@ -484,6 +488,8 @@ strlcat(char *dst, const char *src, size_t siz) * will be copied. Always NUL terminates (unless siz == 0). * Returns strlen(src); if retval >= siz, truncation occurred.
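The contract restated just above makes truncation detectable from strlcpy()'s return value alone. A minimal user-level usage sketch (hypothetical buffer and input, not part of the patch):

    #include <string.h>

    static int
    copy_name(char dst[16], const char *src)
    {
        /* strlcpy always NUL-terminates (for non-zero sizes) and returns
         * strlen(src), so a result >= the buffer size means truncation */
        if (strlcpy(dst, src, 16) >= 16)
            return -1;      /* src did not fit in 15 bytes plus the NUL */
        return 0;
    }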
*/ + +// ARM implementation in ../arm/strlcpy.s size_t strlcpy(char *dst, const char *src, size_t siz) { @@ -565,4 +571,3 @@ strprefix(register const char *s1, register const char *s2) } return (1); } - diff --git a/osfmk/gssd/gssd_mach.defs b/osfmk/gssd/gssd_mach.defs index abe5ffe08..0c0a650e1 100644 --- a/osfmk/gssd/gssd_mach.defs +++ b/osfmk/gssd/gssd_mach.defs @@ -53,7 +53,7 @@ gssd_mach 999; serverprefix svc_; -routine mach_gss_init_sec_context( +Routine mach_gss_init_sec_context( server : mach_port_t; in mech : gssd_mechtype; in intoken : gssd_byte_buffer; @@ -64,6 +64,7 @@ routine mach_gss_init_sec_context( in gssd_flags : uint32_t; inout context : gssd_ctx; inout cred_handle : gssd_cred; + ServerAuditToken atoken : audit_token_t; out ret_flags : uint32_t; out key : gssd_byte_buffer, dealloc; out outtoken : gssd_byte_buffer, dealloc; @@ -78,6 +79,7 @@ routine mach_gss_accept_sec_context( in gssd_flags : uint32_t; inout context : gssd_ctx; inout cred_handle : gssd_cred; + ServerAuditToken atoken : audit_token_t; out flags : uint32_t; out uid : uint32_t; out gids : gssd_gid_list; @@ -93,7 +95,8 @@ simpleroutine mach_gss_log_error( in uid : uint32_t; in source : gssd_string; in major_stat : uint32_t; - in minor_stat : uint32_t + in minor_stat : uint32_t; + ServerAuditToken atoken : audit_token_t ); routine mach_gss_init_sec_context_v2( @@ -109,6 +112,7 @@ routine mach_gss_init_sec_context_v2( inout gssd_flags : uint32_t; inout context : gssd_ctx; inout cred_handle : gssd_cred; + ServerAuditToken atoken : audit_token_t; out ret_flags : uint32_t; out key : gssd_byte_buffer, dealloc; out outtoken : gssd_byte_buffer, dealloc; @@ -125,6 +129,7 @@ routine mach_gss_accept_sec_context_v2( inout gssd_flags : uint32_t; inout context : gssd_ctx; inout cred_handle : gssd_cred; + ServerAuditToken atoken : audit_token_t; out flags : uint32_t; out uid : uint32_t; out gids : gssd_gid_list; @@ -139,6 +144,7 @@ routine mach_gss_hold_cred( in mech : gssd_mechtype; in nt : gssd_nametype; in princ : gssd_byte_buffer; + ServerAuditToken atoken : audit_token_t; out major_stat : uint32_t; out minor_stat : uint32_t ); @@ -148,6 +154,15 @@ routine mach_gss_unhold_cred( in mech : gssd_mechtype; in nt : gssd_nametype; in princ : gssd_byte_buffer; + ServerAuditToken atoken : audit_token_t; out major_stat : uint32_t; out minor_stat : uint32_t ); + +routine mach_gss_lookup( + server : mach_port_t; + in uid : uint32_t; + in asid : int32_t; + ServerAuditToken atoken : audit_token_t; + out gssd_session_port : mach_port_t +); diff --git a/osfmk/gssd/gssd_mach_types.h b/osfmk/gssd/gssd_mach_types.h index e3bde951a..6015ea89c 100644 --- a/osfmk/gssd/gssd_mach_types.h +++ b/osfmk/gssd/gssd_mach_types.h @@ -32,12 +32,29 @@ #define MAX_DISPLAY_STR 128 #define MAX_PRINC_STR 1024 -typedef enum gssd_mechtype { GSSD_NO_MECH = -1, GSSD_KRB5_MECH = 0, - GSSD_SPNEGO_MECH, GSSD_NTLM_MECH } gssd_mechtype; -typedef enum gssd_nametype { GSSD_STRING_NAME = 0, GSSD_EXPORT, - GSSD_ANONYMOUS, GSSD_HOSTBASED, GSSD_USER, GSSD_MACHINE_UID, - GSSD_STRING_UID, GSSD_KRB5_PRINCIPAL, GSSD_KRB5_REFERRAL, - GSSD_NTLM_PRINCIPAL, GSSD_NTLM_BLOB} gssd_nametype; +typedef enum gssd_mechtype { + GSSD_NO_MECH = -1, + GSSD_KRB5_MECH = 0, + GSSD_SPNEGO_MECH, + GSSD_NTLM_MECH, + GSSD_IAKERB_MECH +} gssd_mechtype; + +typedef enum gssd_nametype { + GSSD_STRING_NAME = 0, + GSSD_EXPORT, + GSSD_ANONYMOUS, + GSSD_HOSTBASED, + GSSD_USER, + GSSD_MACHINE_UID, + GSSD_STRING_UID, + GSSD_KRB5_PRINCIPAL, + GSSD_KRB5_REFERRAL, + GSSD_NTLM_PRINCIPAL, + GSSD_NTLM_BLOB, + 
GSSD_UUID +} gssd_nametype; + typedef char *gssd_string; typedef char *gssd_dstring; typedef uint8_t *gssd_byte_buffer; diff --git a/osfmk/i386/AT386/model_dep.c b/osfmk/i386/AT386/model_dep.c index 44b991579..8a95f3f53 100644 --- a/osfmk/i386/AT386/model_dep.c +++ b/osfmk/i386/AT386/model_dep.c @@ -67,7 +67,6 @@ */ #include -#include #include @@ -102,9 +101,6 @@ #include #include /* inb() */ #include -#if MACH_KDB -#include -#endif /* MACH_KDB */ #include #include @@ -116,6 +112,7 @@ #include #include +#include #include #include @@ -130,7 +127,6 @@ static void machine_conf(void); -extern int default_preemption_rate; extern int max_unsafe_quanta; extern int max_poll_quanta; extern unsigned int panic_is_inited; @@ -151,12 +147,6 @@ volatile int panic_double_fault_cpu = -1; #error unsupported architecture #endif -#ifdef __LP64__ -typedef struct nlist_64 kernel_nlist_t; -#else -typedef struct nlist kernel_nlist_t; -#endif - typedef struct _cframe_t { struct _cframe_t *prev; uintptr_t caller; @@ -205,30 +195,6 @@ machine_startup(void) #endif hw_lock_init(&pbtlock); /* initialize print backtrace lock */ -#if MACH_KDB - /* - * Initialize KDB - */ -#if DB_MACHINE_COMMANDS - db_machine_commands_install(ppc_db_commands); -#endif /* DB_MACHINE_COMMANDS */ - ddb_init(); - - if (boot_arg & DB_KDB) - current_debugger = KDB_CUR_DB; - - /* - * Cause a breakpoint trap to the debugger before proceeding - * any further if the proper option bit was specified in - * the boot flags. - */ - if (halt_in_debugger && (current_debugger == KDB_CUR_DB)) { - Debugger("inline call to debugger(machine_startup)"); - halt_in_debugger = 0; - active_debugger =1; - } -#endif /* MACH_KDB */ - if (PE_parse_boot_argn("preempt", &boot_arg, sizeof (boot_arg))) { default_preemption_rate = boot_arg; } @@ -690,6 +656,11 @@ hibernate_newruntime_map(void * map, vm_size_t map_size, uint32_t system_table_o void machine_init(void) { +#if __x86_64__ + /* Now with VM up, switch to dynamically allocated cpu data */ + cpu_data_realloc(); +#endif + /* Ensure panic buffer is initialized. */ debug_log_init(); @@ -805,12 +776,20 @@ machine_halt_cpu(void) { * writing, this is routine is chained through AppleSMC-> * AppleACPIPlatform */ - if (PE_halt_restart) (*PE_halt_restart)(kPERestartCPU); pmCPUHalt(PM_HALT_DEBUG); } +void +DebuggerWithContext( + __unused unsigned int reason, + __unused void *ctx, + const char *message) +{ + Debugger(message); +} + void Debugger( const char *message) diff --git a/osfmk/i386/Diagnostics.c b/osfmk/i386/Diagnostics.c index f9fd283bc..ddbb4b8ec 100644 --- a/osfmk/i386/Diagnostics.c +++ b/osfmk/i386/Diagnostics.c @@ -36,9 +36,8 @@ * Author: Bill Angell, Apple * Date: 10/auht-five * - * Random diagnostics + * Random diagnostics, augmented Derek Kumar 2011 * - * Try to keep the x86 selectors in-sync with the ppc selectors. 
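Stepping back to the gssd_mach.defs changes above: each routine now carries a ServerAuditToken argument, so the user-space gssd receives a kernel-filled audit_token_t identifying the caller instead of having to trust a caller-supplied uid. A sketch of how the server side might unpack it; the helper name is hypothetical, and audit_token_to_au32() is the libbsm accessor, an assumption rather than anything this patch touches:

    #include <mach/mach.h>
    #include <bsm/libbsm.h>

    /* pull the caller's effective uid and pid out of the kernel-supplied
     * token; the remaining fields are left unrequested (NULL) */
    static void
    caller_identity(audit_token_t atoken, uid_t *euid, pid_t *pid)
    {
        audit_token_to_au32(atoken, NULL /* auid */, euid, NULL /* egid */,
            NULL /* ruid */, NULL /* rgid */, pid, NULL /* asid */,
            NULL /* tid */);
    }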
* */ @@ -70,14 +69,11 @@ #include #include #include - -extern uint64_t lastNapClear; +#include diagWork dgWork; -uint64_t lastNapClear = 0ULL; uint64_t lastRuptClear = 0ULL; - int diagCall64(x86_saved_state_t * state) { @@ -96,6 +92,7 @@ diagCall64(x86_saved_state_t * state) switch (selector) { /* Select the routine */ case dgRuptStat: /* Suck Interruption statistics */ + (void) ml_set_interrupts_enabled(TRUE); data = regs->rsi; /* Get the number of processors */ if (data == 0) { /* If no location is specified for data, clear all @@ -133,82 +130,30 @@ diagCall64(x86_saved_state_t * state) curpos = curpos + (256 * sizeof(uint32_t) + 8); /* Point to next out put * slot */ } + return 1; break; - - default: /* Handle invalid ones */ - return 0; /* Return an exception */ - +#if DEBUG + case dgGzallocTest: + { + (void) ml_set_interrupts_enabled(TRUE); + unsigned *ptr = (unsigned *)kalloc(1024); + kfree(ptr, 1024); + *ptr = 0x42; } + break; +#endif - return 1; /* Normal non-ast check return */ -} - - -int -diagCall(x86_saved_state_t * state) -{ - uint32_t stk, curpos, i, j; - uint32_t selector, data; - int err; - uint64_t currNap, durNap; - x86_saved_state32_t *regs; - - assert(is_saved_state32(state)); - regs = saved_state32(state); - - if (!(dgWork.dgFlags & enaDiagSCs)) - return 0; /* If not enabled, cause an exception */ - - stk = regs->uesp; /* Point to the stack */ - err = copyin((user_addr_t) (stk + 4), (char *) &selector, sizeof(uint32_t)); /* Get the selector */ - if (err) { - return 0; /* Failed to fetch stack */ +#if defined(__x86_64__) + case dgPermCheck: + { + (void) ml_set_interrupts_enabled(TRUE); + return pmap_permissions_verify(kernel_pmap, kernel_map, 0, ~0ULL); } - switch (selector) { /* Select the routine */ - case dgRuptStat: /* Suck Interruption statistics */ - - err = copyin((user_addr_t) (stk + 8), (char *) &data, sizeof(uint32_t)); /* Get the selector */ - - if (data == 0) {/* If number of processors is 0, clear all - * counts */ - for (i = 0; i < real_ncpus; i++) { /* Cycle through - * processors */ - for (j = 0; j < 256; j++) - cpu_data_ptr[i]->cpu_hwIntCnt[j] = 0; - } - - lastRuptClear = mach_absolute_time(); /* Get the time of clear */ - return 1; /* Normal return */ - } - - (void) copyout((char *) &real_ncpus, data, sizeof(real_ncpus)); /* Copy out number of - * processors */ - - currNap = mach_absolute_time(); /* Get the time now */ - durNap = currNap - lastRuptClear; /* Get the last interval - * duration */ - if (durNap == 0) - durNap = 1; /* This is a very short time, make it - * bigger */ - - curpos = (uint32_t)(data + sizeof(real_ncpus)); /* Point to the next - * available spot */ - - for (i = 0; i < real_ncpus; i++) { /* Move 'em all out */ - (void) copyout((char *) &durNap, curpos, 8); /* Copy out the time - * since last clear */ - (void) copyout((char *) &cpu_data_ptr[i]->cpu_hwIntCnt, curpos + 8, 256 * sizeof(uint32_t)); /* Copy out interrupt - * data for this - * processor */ - curpos = (uint32_t)(curpos + (256 * sizeof(uint32_t) + 8)); /* Point to next out put - * slot */ - } - - break; + break; +#endif /* __x86_64__*/ default: /* Handle invalid ones */ return 0; /* Return an exception */ - } return 1; /* Normal non-ast check return */ diff --git a/osfmk/i386/Diagnostics.h b/osfmk/i386/Diagnostics.h index f5281c604..3a2ffc17a 100644 --- a/osfmk/i386/Diagnostics.h +++ b/osfmk/i386/Diagnostics.h @@ -35,7 +35,6 @@ /* * Here are the Diagnostic interface interfaces * Lovingly crafted by Bill Angell using traditional methods - * Keep selectors in sync with the 
PPC version where possible. */ #ifdef KERNEL_PRIVATE @@ -46,7 +45,6 @@ #error This file is not useful on non-Intel #endif -int diagCall(x86_saved_state_t *regs); int diagCall64(x86_saved_state_t *regs); #define diagSCnum 0x00006000 @@ -68,7 +66,7 @@ int diagCall64(x86_saved_state_t *regs); #define dgCPNull 14 #define dgPerfMon 15 #define dgMapPage 16 -#define dgScom 17 +#define dgGzallocTest 17 #define dgBind 18 #define dgPproc 19 #define dgAcntg 20 @@ -77,28 +75,20 @@ int diagCall64(x86_saved_state_t *regs); #define dgWar 23 #define dgNapStat 24 #define dgRuptStat 25 - +#define dgPermCheck 26 typedef struct diagWork { /* Diagnostic work area */ unsigned int dgLock; /* Lock if needed */ unsigned int dgFlags; /* Flags */ #define enaExpTrace 0x00000001 -#define enaExpTraceb 31 #define enaUsrFCall 0x00000002 -#define enaUsrFCallb 30 #define enaUsrPhyMp 0x00000004 -#define enaUsrPhyMpb 29 #define enaDiagSCs 0x00000008 -#define enaDiagSCsb 28 #define enaDiagDM 0x00000010 -#define enaDiagSDMb 27 #define enaDiagEM 0x00000020 -#define enaDiagEMb 26 #define enaDiagTrap 0x00000040 -#define enaDiagTrapb 25 #define enaNotifyEM 0x00000080 -#define enaNotifyEMb 24 unsigned int dgMisc0; unsigned int dgMisc1; diff --git a/osfmk/i386/asm.h b/osfmk/i386/asm.h index a51f8ae92..a3a0d524b 100644 --- a/osfmk/i386/asm.h +++ b/osfmk/i386/asm.h @@ -61,13 +61,6 @@ #include #endif /* _KERNEL */ -#ifdef MACH_KERNEL -#include -#else /* !MACH_KERNEL */ -#define MACH_KDB 0 -#endif /* !MACH_KERNEL */ - - #if defined(MACH_KERNEL) || defined(_KERNEL) #include #endif /* MACH_KERNEL || _KERNEL */ @@ -284,17 +277,7 @@ #define Lgmemload(lab,reg) movl Lgotoff(lab),reg #define Lgmemstore(reg,lab,tmp) movl reg,Lgotoff(lab) -#ifdef ASSEMBLER -#if MACH_KDB -#include -/* - * This pseudo-assembler line is added so that there will be at least - * one N_SO entry in the symbol stable to define the current file name. - */ -#endif /* MACH_KDB */ - -#else /* NOT ASSEMBLER */ - +#ifndef ASSEMBLER /* These defines are here for .c files that wish to reference global symbols * within __asm__ statements. */ diff --git a/osfmk/i386/asm64.h b/osfmk/i386/asm64.h index 08afac97a..0f9213e7d 100644 --- a/osfmk/i386/asm64.h +++ b/osfmk/i386/asm64.h @@ -28,6 +28,7 @@ /* Helper macros for 64-bit mode switching */ +#if __i386__ /* * Long jump to 64-bit space from 32-bit compatibility mode. */ @@ -60,3 +61,32 @@ .code32 ;\ 4: +#else + +/* + * Long jump to 64-bit space from 32-bit compatibility mode. + * Effected, in fact, by a long return .. + * - we push the 64-bit kernel code selector KERNEL64_CS + * - call .+1 to get EIP on stack + * - adjust return address after lret + * - lret to return to next instruction but 64-bit mode. + */ +#define ENTER_64BIT_MODE() \ + push $KERNEL64_CS ;\ + call 1f ;\ +1: addl $(2f-1b), (%esp) ;\ + lret ;\ +2: .code64 + +/* + * Long jump to 32-bit compatibility mode from 64-bit space. + * Effected by long return similar to ENTER_64BIT_MODE. + */ +#define ENTER_COMPAT_MODE() \ + call 3f ;\ +3: addq $(4f-3b), (%rsp) ;\ + movl $KERNEL32_CS, 4(%rsp) ;\ + lret ;\ +4: .code32 + +#endif diff --git a/osfmk/i386/bsd_i386.c b/osfmk/i386/bsd_i386.c index 57b222a14..7ae3c29f1 100644 --- a/osfmk/i386/bsd_i386.c +++ b/osfmk/i386/bsd_i386.c @@ -146,6 +146,24 @@ thread_userstack( return (KERN_SUCCESS); } +/* + * thread_userstackdefault: + * + * Return the default stack location for the + * thread, if otherwise unknown. 
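Before the patch moves on to bsd_i386.c below, note the layout behind dgRuptStat (selector 25 above): diagCall64() copies out the processor count, then for each CPU an 8-byte interval since the last clear followed by 256 32-bit per-vector interrupt counts. A user-side parsing sketch for that buffer; how the buffer gets filled (the diagnostic system call itself) is left out:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void
    print_rupt_stats(const char *buf)   /* buf: memory filled for dgRuptStat */
    {
        uint32_t ncpus;
        memcpy(&ncpus, buf, sizeof(ncpus));

        const char *p = buf + sizeof(ncpus);
        for (uint32_t cpu = 0; cpu < ncpus; cpu++) {
            uint64_t interval;      /* mach_absolute_time units since clear */
            uint32_t counts[256];   /* one count per interrupt vector */

            memcpy(&interval, p, sizeof(interval));
            memcpy(counts, p + sizeof(interval), sizeof(counts));
            p += sizeof(interval) + sizeof(counts);

            printf("cpu %u: interval %llu, vector 0xdd count %u\n",
                cpu, (unsigned long long)interval, counts[0xdd]);
        }
    }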
+ */ +kern_return_t +thread_userstackdefault( + thread_t thread, + mach_vm_offset_t *default_user_stack) +{ + if (thread_is_64bit(thread)) { + *default_user_stack = VM_USRSTACK64; + } else { + *default_user_stack = VM_USRSTACK32; + } + return (KERN_SUCCESS); +} kern_return_t thread_entrypoint( @@ -229,7 +247,7 @@ machdep_syscall(x86_saved_state_t *state) int args[machdep_call_count]; int trapno; int nargs; - machdep_call_t *entry; + const machdep_call_t *entry; x86_saved_state32_t *regs; assert(is_saved_state32(state)); @@ -311,7 +329,7 @@ void machdep_syscall64(x86_saved_state_t *state) { int trapno; - machdep_call_t *entry; + const machdep_call_t *entry; x86_saved_state64_t *regs; assert(is_saved_state64(state)); @@ -391,11 +409,24 @@ mach_call_arg_munger32(uint32_t sp, int nargs, int call_number, struct mach_call case 2: args->arg2 = args32[1]; case 1: args->arg1 = args32[0]; } - if (call_number == 90) { + if (call_number == 10) { + /* munge the mach_vm_size_t for mach_vm_allocate() */ + args->arg3 = (((uint64_t)(args32[2])) | ((((uint64_t)(args32[3]))<<32))); + args->arg4 = args32[4]; + } else if (call_number == 12) { + /* munge the mach_vm_address_t and mach_vm_size_t for mach_vm_deallocate() */ + args->arg2 = (((uint64_t)(args32[1])) | ((((uint64_t)(args32[2]))<<32))); + args->arg3 = (((uint64_t)(args32[3])) | ((((uint64_t)(args32[4]))<<32))); + } else if (call_number == 14) { + /* munge the mach_vm_address_t and mach_vm_size_t for mach_vm_protect() */ + args->arg2 = (((uint64_t)(args32[1])) | ((((uint64_t)(args32[2]))<<32))); + args->arg3 = (((uint64_t)(args32[3])) | ((((uint64_t)(args32[4]))<<32))); + args->arg4 = args32[5]; + args->arg5 = args32[6]; + } else if (call_number == 90) { /* munge_l for mach_wait_until_trap() */ args->arg1 = (((uint64_t)(args32[0])) | ((((uint64_t)(args32[1]))<<32))); - } - if (call_number == 93) { + } else if (call_number == 93) { /* munge_wl for mk_timer_arm_trap() */ args->arg2 = (((uint64_t)(args32[1])) | ((((uint64_t)(args32[2]))<<32))); } @@ -460,15 +491,19 @@ mach_call_munger(x86_saved_state_t *state) #ifdef MACH_BSD mach_kauth_cred_uthread_update(); #endif - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_EXCP_SC, (call_number)) | DBG_FUNC_START, - args.arg1, args.arg2, args.arg3, args.arg4, 0); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_EXCP_SC, (call_number)) | DBG_FUNC_START, + args.arg1, args.arg2, args.arg3, args.arg4, 0); retval = mach_call(&args); DEBUG_KPRINT_SYSCALL_MACH("mach_call_munger: retval=0x%x\n", retval); - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_EXCP_SC,(call_number)) | DBG_FUNC_END, - retval, 0, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_EXCP_SC,(call_number)) | DBG_FUNC_END, + retval, 0, 0, 0, 0); + regs->eax = retval; throttle_lowpri_io(TRUE); @@ -497,10 +532,9 @@ mach_call_munger64(x86_saved_state_t *state) "mach_call_munger64: code=%d(%s)\n", call_number, mach_syscall_name_table[call_number]); - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_EXCP_SC, - (call_number)) | DBG_FUNC_START, - regs->rdi, regs->rsi, - regs->rdx, regs->r10, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_EXCP_SC,(call_number)) | DBG_FUNC_START, + regs->rdi, regs->rsi, regs->rdx, regs->r10, 0); if (call_number < 0 || call_number >= mach_trap_count) { i386_exception(EXC_SYSCALL, regs->rax, 1); @@ -535,9 +569,9 @@ mach_call_munger64(x86_saved_state_t *state) DEBUG_KPRINT_SYSCALL_MACH( "mach_call_munger64: retval=0x%llx\n", regs->rax); - 
KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_EXCP_SC, - (call_number)) | DBG_FUNC_END, - regs->rax, 0, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_EXCP_SC,(call_number)) | DBG_FUNC_END, + regs->rax, 0, 0, 0, 0); throttle_lowpri_io(TRUE); diff --git a/osfmk/i386/commpage/commpage.c b/osfmk/i386/commpage/commpage.c index 375abc7c1..7076ff533 100644 --- a/osfmk/i386/commpage/commpage.c +++ b/osfmk/i386/commpage/commpage.c @@ -74,9 +74,14 @@ extern commpage_descriptor* commpage_64_routines[]; extern vm_map_t commpage32_map; // the shared submap, set up in vm init extern vm_map_t commpage64_map; // the shared submap, set up in vm init +extern vm_map_t commpage_text32_map; // the shared submap, set up in vm init +extern vm_map_t commpage_text64_map; // the shared submap, set up in vm init + char *commPagePtr32 = NULL; // virtual addr in kernel map of 32-bit commpage char *commPagePtr64 = NULL; // ...and of 64-bit commpage +char *commPageTextPtr32 = NULL; // virtual addr in kernel map of 32-bit commpage +char *commPageTextPtr64 = NULL; // ...and of 64-bit commpage uint32_t _cpu_capabilities = 0; // define the capability vector int noVMX = 0; /* if true, do not set kHasAltivec in ppc _cpu_capabilities */ @@ -105,22 +110,24 @@ decl_simple_lock_data(static,commpage_active_cpus_lock); static void* commpage_allocate( vm_map_t submap, // commpage32_map or commpage_map64 - size_t area_used ) // _COMM_PAGE32_AREA_USED or _COMM_PAGE64_AREA_USED + size_t area_used, // _COMM_PAGE32_AREA_USED or _COMM_PAGE64_AREA_USED + vm_prot_t uperm) { vm_offset_t kernel_addr = 0; // address of commpage in kernel map vm_offset_t zero = 0; vm_size_t size = area_used; // size actually populated vm_map_entry_t entry; ipc_port_t handle; + kern_return_t kr; if (submap == NULL) panic("commpage submap is null"); - if (vm_map(kernel_map,&kernel_addr,area_used,0,VM_FLAGS_ANYWHERE,NULL,0,FALSE,VM_PROT_ALL,VM_PROT_ALL,VM_INHERIT_NONE)) - panic("cannot allocate commpage"); + if ((kr = vm_map(kernel_map,&kernel_addr,area_used,0,VM_FLAGS_ANYWHERE,NULL,0,FALSE,VM_PROT_ALL,VM_PROT_ALL,VM_INHERIT_NONE))) + panic("cannot allocate commpage %d", kr); - if (vm_map_wire(kernel_map,kernel_addr,kernel_addr+area_used,VM_PROT_DEFAULT,FALSE)) - panic("cannot wire commpage"); + if ((kr = vm_map_wire(kernel_map,kernel_addr,kernel_addr+area_used,VM_PROT_DEFAULT,FALSE))) + panic("cannot wire commpage: %d", kr); /* * Now that the object is created and wired into the kernel map, mark it so that no delay @@ -130,19 +137,19 @@ commpage_allocate( * * JMM - What we really need is a way to create it like this in the first place. */ - if (!vm_map_lookup_entry( kernel_map, vm_map_trunc_page(kernel_addr), &entry) || entry->is_sub_map) - panic("cannot find commpage entry"); + if (!(kr = vm_map_lookup_entry( kernel_map, vm_map_trunc_page(kernel_addr), &entry) || entry->is_sub_map)) + panic("cannot find commpage entry %d", kr); entry->object.vm_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; - if (mach_make_memory_entry( kernel_map, // target map + if ((kr = mach_make_memory_entry( kernel_map, // target map &size, // size kernel_addr, // offset (address in kernel map) - VM_PROT_ALL, // map it RWX + uperm, // protections as specified &handle, // this is the object handle we get - NULL )) // parent_entry (what is this?) - panic("cannot make entry for commpage"); + NULL ))) // parent_entry (what is this?) 
+ panic("cannot make entry for commpage %d", kr); - if (vm_map_64( submap, // target map (shared submap) + if ((kr = vm_map_64( submap, // target map (shared submap) &zero, // address (map into 1st page in submap) area_used, // size 0, // mask @@ -150,19 +157,18 @@ commpage_allocate( handle, // port is the memory entry we just made 0, // offset (map 1st page in memory entry) FALSE, // copy - VM_PROT_READ|VM_PROT_EXECUTE, // cur_protection (R-only in user map) - VM_PROT_READ|VM_PROT_EXECUTE, // max_protection - VM_INHERIT_SHARE )) // inheritance - panic("cannot map commpage"); + uperm, // cur_protection (R-only in user map) + uperm, // max_protection + VM_INHERIT_SHARE ))) // inheritance + panic("cannot map commpage %d", kr); ipc_port_release(handle); - - // Initialize the text section of the commpage with INT3 - char *commpage_ptr = (char*)(intptr_t)kernel_addr; - vm_size_t i; - for( i = _COMM_PAGE_TEXT_START - _COMM_PAGE_START_ADDRESS; i < size; i++ ) - // This is the hex for the X86 opcode INT3 - commpage_ptr[i] = 0xCC; + /* Make the kernel mapping non-executable. This cannot be done + * at the time of map entry creation as mach_make_memory_entry + * cannot handle disjoint permissions at this time. + */ + kr = vm_protect(kernel_map, kernel_addr, area_used, FALSE, VM_PROT_READ | VM_PROT_WRITE); + assert (kr == KERN_SUCCESS); return (void*)(intptr_t)kernel_addr; // return address in kernel map } @@ -331,21 +337,20 @@ commpage_populate_one( char ** kernAddressPtr, // &commPagePtr32 or &commPagePtr64 size_t area_used, // _COMM_PAGE32_AREA_USED or _COMM_PAGE64_AREA_USED commpage_address_t base_offset, // will become commPageBaseOffset - commpage_descriptor** commpage_routines, // list of routine ptrs for this commpage commpage_time_data** time_data, // &time_data32 or &time_data64 - const char* signature ) // "commpage 32-bit" or "commpage 64-bit" + const char* signature, // "commpage 32-bit" or "commpage 64-bit" + vm_prot_t uperm) { uint8_t c1; short c2; int c4; uint64_t c8; uint32_t cfamily; - commpage_descriptor **rd; short version = _COMM_PAGE_THIS_VERSION; next = 0; cur_routine = 0; - commPagePtr = (char *)commpage_allocate( submap, (vm_size_t) area_used ); + commPagePtr = (char *)commpage_allocate( submap, (vm_size_t) area_used, uperm ); *kernAddressPtr = commPagePtr; // save address either in commPagePtr32 or 64 commPageBaseOffset = base_offset; @@ -380,12 +385,6 @@ commpage_populate_one( cfamily = cpuid_info()->cpuid_cpufamily; commpage_stuff(_COMM_PAGE_CPUFAMILY, &cfamily, 4); - for( rd = commpage_routines; *rd != NULL ; rd++ ) - commpage_stuff_routine(*rd); - - if (!matched) - panic("commpage no match on last routine"); - if (next > _COMM_PAGE_END) panic("commpage overflow: next = 0x%08x, commPagePtr = 0x%p", next, commPagePtr); @@ -408,9 +407,9 @@ commpage_populate( void ) &commPagePtr32, _COMM_PAGE32_AREA_USED, _COMM_PAGE32_BASE_ADDRESS, - commpage_32_routines, &time_data32, - "commpage 32-bit"); + "commpage 32-bit", + VM_PROT_READ); #ifndef __LP64__ pmap_commpage32_init((vm_offset_t) commPagePtr32, _COMM_PAGE32_BASE_ADDRESS, _COMM_PAGE32_AREA_USED/INTEL_PGBYTES); @@ -422,9 +421,9 @@ commpage_populate( void ) &commPagePtr64, _COMM_PAGE64_AREA_USED, _COMM_PAGE32_START_ADDRESS, /* commpage address are relative to 32-bit commpage placement */ - commpage_64_routines, &time_data64, - "commpage 64-bit"); + "commpage 64-bit", + VM_PROT_READ); #ifndef __LP64__ pmap_commpage64_init((vm_offset_t) commPagePtr64, _COMM_PAGE64_BASE_ADDRESS, _COMM_PAGE64_AREA_USED/INTEL_PGBYTES); @@ -437,6 
+436,63 @@ commpage_populate( void ) rtc_nanotime_init_commpage(); } +/* Fill in the common routines during kernel initialization. + * This is called before user-mode code is running. + */ +void commpage_text_populate( void ){ + commpage_descriptor **rd; + + next =0; + cur_routine=0; + commPagePtr = (char *) commpage_allocate(commpage_text32_map, (vm_size_t) _COMM_PAGE_TEXT_AREA_USED, VM_PROT_READ | VM_PROT_EXECUTE); + commPageTextPtr32 = commPagePtr; + + char *cptr = commPagePtr; + int i=0; + for(; i< _COMM_PAGE_TEXT_AREA_USED; i++){ + cptr[i]=0xCC; + } + + commPageBaseOffset = _COMM_PAGE_TEXT_START; + for (rd = commpage_32_routines; *rd != NULL; rd++) { + commpage_stuff_routine(*rd); + } + if (!matched) + panic(" commpage_text no match for last routine "); + +#ifndef __LP64__ + pmap_commpage32_init((vm_offset_t) commPageTextPtr32, _COMM_PAGE_TEXT_START, + _COMM_PAGE_TEXT_AREA_USED/INTEL_PGBYTES); +#endif + + if (_cpu_capabilities & k64Bit) { + next =0; + cur_routine=0; + commPagePtr = (char *) commpage_allocate(commpage_text64_map, (vm_size_t) _COMM_PAGE_TEXT_AREA_USED, VM_PROT_READ | VM_PROT_EXECUTE); + commPageTextPtr64 = commPagePtr; + + cptr=commPagePtr; + for(i=0; i<_COMM_PAGE_TEXT_AREA_USED; i++){ + cptr[i]=0xCC; + } + + for (rd = commpage_64_routines; *rd !=NULL; rd++) { + commpage_stuff_routine(*rd); + } + +#ifndef __LP64__ + pmap_commpage64_init((vm_offset_t) commPageTextPtr64, _COMM_PAGE_TEXT_START, + _COMM_PAGE_TEXT_AREA_USED/INTEL_PGBYTES); +#endif + } + + if (!matched) + panic(" commpage_text no match for last routine "); + + if (next > _COMM_PAGE_TEXT_END) + panic("commpage text overflow: next=0x%08x, commPagePtr=%p", next, commPagePtr); + +} /* Update commpage nanotime information. Note that we interleave * setting the 32- and 64-bit commpages, in order to keep nanotime more @@ -618,13 +674,16 @@ commpage_update_active_cpus(void) simple_unlock(&commpage_active_cpus_lock); } +extern user32_addr_t commpage_text32_location; +extern user64_addr_t commpage_text64_location; /* Check to see if a given address is in the Preemption Free Zone (PFZ) */ uint32_t commpage_is_in_pfz32(uint32_t addr32) { - if ( (addr32 >= _COMM_PAGE_PFZ_START) && (addr32 < _COMM_PAGE_PFZ_END)) { + if ( (addr32 >= (commpage_text32_location + _COMM_TEXT_PFZ_START_OFFSET)) + && (addr32 < (commpage_text32_location+_COMM_TEXT_PFZ_END_OFFSET))) { return 1; } else @@ -634,8 +693,8 @@ commpage_is_in_pfz32(uint32_t addr32) uint32_t commpage_is_in_pfz64(addr64_t addr64) { - if ( (addr64 >= _COMM_PAGE_32_TO_64(_COMM_PAGE_PFZ_START)) - && (addr64 < _COMM_PAGE_32_TO_64(_COMM_PAGE_PFZ_END))) { + if ( (addr64 >= (commpage_text64_location + _COMM_TEXT_PFZ_START_OFFSET)) + && (addr64 < (commpage_text64_location + _COMM_TEXT_PFZ_END_OFFSET))) { return 1; } else diff --git a/osfmk/i386/commpage/fifo_queues.s b/osfmk/i386/commpage/fifo_queues.s index fa2bbf82c..e994ae945 100644 --- a/osfmk/i386/commpage/fifo_queues.s +++ b/osfmk/i386/commpage/fifo_queues.s @@ -56,6 +56,9 @@ * But we still must take a spinlock to serialize, and in case of page faults. */ +/* Work around 10062261 with a dummy non-local symbol */ +fifo_queue_dummy_symbol: + /* * typedef volatile struct { * void *opaque1; <-- ptr to first queue element or null diff --git a/osfmk/i386/commpage/pthreads.s b/osfmk/i386/commpage/pthreads.s index 1794228ff..a7226180b 100644 --- a/osfmk/i386/commpage/pthreads.s +++ b/osfmk/i386/commpage/pthreads.s @@ -82,6 +82,8 @@ * are located in the PFZ. 
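The commpage_allocate()/commpage_text_populate() changes above split the commpage into a read-only data page and a separate user-executable text page: the pages are entered into the user submap at the caller-specified uperm, and the kernel's own alias is then demoted to read/write with vm_protect(), since mach_make_memory_entry() cannot hand out disjoint permissions in one step. Condensed to the two protection-relevant calls (kernel-internal sketch; the vm_map_64 flags argument is abbreviated from the original call):

    /* user submap: VM_PROT_READ for the data page,
     * VM_PROT_READ | VM_PROT_EXECUTE for the text page */
    kr = vm_map_64(submap, &zero, area_used, 0 /* mask */, VM_FLAGS_FIXED,
            handle, 0, FALSE, uperm /* cur */, uperm /* max */,
            VM_INHERIT_SHARE);

    /* kernel map: writable for population, no longer executable */
    kr = vm_protect(kernel_map, kernel_addr, area_used, FALSE /* cur */,
            VM_PROT_READ | VM_PROT_WRITE);

commpage_text_populate() also pre-fills the text page with 0xCC (INT3) before stuffing the routines, so a stray jump into an unpopulated slot traps instead of executing leftover bytes.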
*/ +/* Work around 10062261 with a dummy non-local symbol */ +pthreads_dummy_symbol: /* Internal routine to handle pthread mutex lock operation. This is in the PFZ. * %edi == ptr to LVAL/UVAL pair diff --git a/osfmk/i386/cpu_capabilities.h b/osfmk/i386/cpu_capabilities.h index eee6a8173..3cf464e34 100644 --- a/osfmk/i386/cpu_capabilities.h +++ b/osfmk/i386/cpu_capabilities.h @@ -96,20 +96,20 @@ int _NumCPUs( void ) * Because Mach VM cannot map the last page of an address space, we don't use it. */ -#define _COMM_PAGE32_AREA_LENGTH ( 2 * 4096 ) /* reserved length of entire comm area */ +#define _COMM_PAGE32_AREA_LENGTH ( 1 * 4096 ) /* reserved length of entire comm area */ #define _COMM_PAGE32_BASE_ADDRESS ( 0xffff0000 ) /* base address of allocated memory */ #define _COMM_PAGE32_START_ADDRESS ( _COMM_PAGE32_BASE_ADDRESS ) /* address traditional commpage code starts on */ -#define _COMM_PAGE32_AREA_USED ( 2 * 4096 ) /* this is the amt actually allocated */ +#define _COMM_PAGE32_AREA_USED ( 1 * 4096 ) /* this is the amt actually allocated */ #define _COMM_PAGE32_SIGS_OFFSET 0x8000 /* offset to routine signatures */ -#define _COMM_PAGE64_AREA_LENGTH ( 2 * 1024 * 1024 ) /* reserved length of entire comm area (2MB) */ +#define _COMM_PAGE64_AREA_LENGTH ( 1 * 4096 ) /* reserved length of entire comm area (2MB) */ #ifdef __ASSEMBLER__ #define _COMM_PAGE64_BASE_ADDRESS ( 0x00007fffffe00000 ) /* base address of allocated memory */ #else /* __ASSEMBLER__ */ #define _COMM_PAGE64_BASE_ADDRESS ( 0x00007fffffe00000ULL ) /* base address of allocated memory */ #endif /* __ASSEMBLER__ */ #define _COMM_PAGE64_START_ADDRESS ( _COMM_PAGE64_BASE_ADDRESS ) /* address traditional commpage code starts on */ -#define _COMM_PAGE64_AREA_USED ( 2 * 4096 ) /* this is the amt actually populated */ +#define _COMM_PAGE64_AREA_USED ( 1 * 4096 ) /* this is the amt actually populated */ /* no need for an Objective-C area on Intel */ #define _COMM_PAGE32_OBJC_SIZE 0ULL @@ -183,6 +183,8 @@ int _NumCPUs( void ) #define _COMM_PAGE_GTOD_NS_BASE (_COMM_PAGE_START_ADDRESS+0x070) /* used by gettimeofday() */ #define _COMM_PAGE_GTOD_SEC_BASE (_COMM_PAGE_START_ADDRESS+0x078) /* used by gettimeofday() */ +#define _COMM_PAGE_END (_COMM_PAGE_START_ADDRESS+0xfff) /* end of common page */ + /* Warning: kernel commpage.h has a matching c typedef for the following. They must be kept in sync. */ /* These offsets are from _COMM_PAGE_TIME_DATA_START */ @@ -199,22 +201,40 @@ int _NumCPUs( void ) /* When new jump table entries are added, corresponding symbols should be added below */ /* New slots should be allocated with at least 16-byte alignment. 
Some like bcopy require */ /* 32-byte alignment, and should be aligned as such in the assembly source before they are relocated */ -#define _COMM_PAGE_TEXT_START (_COMM_PAGE_START_ADDRESS+0x080) /* start of text section */ +#define _COMM_PAGE_TEXT_START (_COMM_PAGE_START_ADDRESS+0x1000) +#define _COMM_PAGE32_TEXT_START (_COMM_PAGE32_BASE_ADDRESS+0x1000) /* start of text section */ +#define _COMM_PAGE64_TEXT_START (_COMM_PAGE64_BASE_ADDRESS+0x1000) +#define _COMM_PAGE_TEXT_AREA_USED ( 1 * 4096 ) +#define _COMM_PAGE_TEXT_AREA_LENGTH ( 1 * 4096 ) +#define _PFZ32_SLIDE_RANGE ( 14 ) /* pages between 0xfffff000 and _COMM_PAGE32_TEXT_START */ +#define _PFZ64_SLIDE_RANGE ( 510 ) /* pages between 0x00007ffffffff000 and _COMM_PAGE64_TEXT_START */ + +/* setup start offset in the commpage text region for each jump table entry + * the Comm Page Offset is shortened to _COMM_TEXT_[label]_OFFSET + */ -#define _COMM_PAGE_PREEMPT (_COMM_PAGE_START_ADDRESS+0x5a0) /* used by PFZ code */ -#define _COMM_PAGE_BACKOFF (_COMM_PAGE_START_ADDRESS+0x1600) /* called from PFZ */ +#define _COMM_TEXT_PREEMPT_OFFSET (0x5a0) /* called from withing pfz */ +#define _COMM_TEXT_BACKOFF_OFFSET (0x600) /* called from PFZ */ +#define _COMM_TEXT_PFZ_START_OFFSET (0xc00) /* offset for Preemption Free Zone */ +#define _COMM_TEXT_PFZ_ENQUEUE_OFFSET (0xc00) /* internal FIFO enqueue */ +#define _COMM_TEXT_PFZ_DEQUEUE_OFFSET (0xc80) /* internal FIFO dequeue */ +#define _COMM_TEXT_PFZ_MUTEX_LOCK_OFFSET (0xd00) /* internal pthread_mutex_lock() */ +#define _COMM_TEXT_UNUSED_OFFSET (0xd80) /* end of routines in text page */ +#define _COMM_TEXT_PFZ_END_OFFSET (0xfff) /* offset for end of PFZ */ -#define _COMM_PAGE_PFZ_START (_COMM_PAGE_START_ADDRESS+0x1c00) /* start of Preemption Free Zone */ -#define _COMM_PAGE_PFZ_ENQUEUE (_COMM_PAGE_START_ADDRESS+0x1c00) /* internal routine for FIFO enqueue */ -#define _COMM_PAGE_PFZ_DEQUEUE (_COMM_PAGE_START_ADDRESS+0x1c80) /* internal routine for FIFO dequeue */ -#define _COMM_PAGE_PFZ_MUTEX_LOCK (_COMM_PAGE_START_ADDRESS+0x1d00) /* internal routine for pthread_mutex_lock() */ +#define _COMM_PAGE_PREEMPT (_COMM_PAGE_TEXT_START+_COMM_TEXT_PREEMPT_OFFSET) +#define _COMM_PAGE_BACKOFF (_COMM_PAGE_TEXT_START+_COMM_TEXT_BACKOFF_OFFSET) -#define _COMM_PAGE_UNUSED6 (_COMM_PAGE_START_ADDRESS+0x1d80) /* unused space for PFZ code up to 0x1fff */ +#define _COMM_PAGE_PFZ_START (_COMM_PAGE_TEXT_START+_COMM_PAGE_PFZ_START_OFFSET) -#define _COMM_PAGE_PFZ_END (_COMM_PAGE_START_ADDRESS+0x1fff) /* end of Preemption Free Zone */ +#define _COMM_PAGE_PFZ_ENQUEUE (_COMM_PAGE_TEXT_START+_COMM_TEXT_PFZ_ENQUEUE_OFFSET) +#define _COMM_PAGE_PFZ_DEQUEUE (_COMM_PAGE_TEXT_START+_COMM_TEXT_PFZ_DEQUEUE_OFFSET) +#define _COMM_PAGE_PFZ_MUTEX_LOCK (_COMM_PAGE_TEXT_START+_COMM_TEXT_PFZ_MUTEX_LOCK_OFFSET) -#define _COMM_PAGE_END (_COMM_PAGE_START_ADDRESS+0x1fff) /* end of common page - insert new stuff here */ +#define _COMM_PAGE_UNUSED6 (_COMM_PAGE_TEXT_START+_COMM_TEXT_UNUSED_OFFSET) +#define _COMM_PAGE_PFZ_END (_COMM_PAGE_TEXT_START+_COMM_TEXT_PFZ_END_OFFSET) +#define _COMM_PAGE_TEXT_END (_COMM_PAGE_TEXT_START+_COMM_TEXT_PFZ_END_OFFSET) /* end of common text page */ /* _COMM_PAGE_COMPARE_AND_SWAP{32,64}B are not used on x86 and are * maintained here for source compatability. 
These will be removed at diff --git a/osfmk/i386/cpu_data.h b/osfmk/i386/cpu_data.h index 22de8b2b0..3473fbd3a 100644 --- a/osfmk/i386/cpu_data.h +++ b/osfmk/i386/cpu_data.h @@ -75,9 +75,6 @@ typedef struct rtclock_timer { typedef struct { struct i386_tss *cdi_ktss; -#if MACH_KDB - struct i386_tss *cdi_dbtss; -#endif /* MACH_KDB */ struct __attribute__((packed)) { uint16_t size; struct fake_descriptor *ptr; @@ -97,9 +94,6 @@ typedef enum { typedef struct { struct x86_64_tss *cdi_ktss; -#if MACH_KDB - struct x86_64_tss *cdi_dbtss; -#endif /* MACH_KDB */ struct __attribute__((packed)) { uint16_t size; void *ptr; @@ -194,15 +188,6 @@ typedef struct cpu_data struct fake_descriptor *cpu_ldtp; cpu_desc_index_t cpu_desc_index; int cpu_ldt; -#ifdef MACH_KDB - /* XXX Untested: */ - int cpu_db_pass_thru; - vm_offset_t cpu_db_stacks; - void *cpu_kdb_saved_state; - spl_t cpu_kdb_saved_ipl; - int cpu_kdb_is_slave; - int cpu_kdb_active; -#endif /* MACH_KDB */ boolean_t cpu_iflag; boolean_t cpu_boot_complete; int cpu_hibernate; @@ -235,8 +220,10 @@ typedef struct cpu_data * validity flag. */ pal_rtc_nanotime_t *cpu_nanotime; /* Nanotime info */ +#if CONFIG_COUNTERS thread_t csw_old_thread; thread_t csw_new_thread; +#endif /* CONFIG COUNTERS */ #if defined(__x86_64__) uint32_t cpu_pmap_pcid_enabled; pcid_t cpu_active_pcid; @@ -409,5 +396,6 @@ cpu_datap(int cpu) } extern cpu_data_t *cpu_data_alloc(boolean_t is_boot_cpu); +extern void cpu_data_realloc(void); #endif /* I386_CPU_DATA */ diff --git a/osfmk/i386/cpu_threads.c b/osfmk/i386/cpu_threads.c index a29bfda26..c36eb89b6 100644 --- a/osfmk/i386/cpu_threads.c +++ b/osfmk/i386/cpu_threads.c @@ -64,10 +64,10 @@ decl_simple_lock_data(, x86_topo_lock); static struct cpu_cache { int level; int type; } cpu_caches [LCACHE_MAX] = { - [L1D] { 1, CPU_CACHE_TYPE_DATA }, - [L1I] { 1, CPU_CACHE_TYPE_INST }, - [L2U] { 2, CPU_CACHE_TYPE_UNIF }, - [L3U] { 3, CPU_CACHE_TYPE_UNIF }, + [L1D] = { 1, CPU_CACHE_TYPE_DATA }, + [L1I] = { 1, CPU_CACHE_TYPE_INST }, + [L2U] = { 2, CPU_CACHE_TYPE_UNIF }, + [L3U] = { 3, CPU_CACHE_TYPE_UNIF }, }; static boolean_t diff --git a/osfmk/i386/cpuid.c b/osfmk/i386/cpuid.c index 8050d75d6..abb0b94ed 100644 --- a/osfmk/i386/cpuid.c +++ b/osfmk/i386/cpuid.c @@ -29,21 +29,10 @@ * @OSF_COPYRIGHT@ */ #include -#include #include #include #include -#if MACH_KDB -#include -#include -#include -#include -#include -#include -#include -#include -#endif static boolean_t cpuid_dbg #if DEBUG @@ -633,7 +622,7 @@ cpuid_set_generic_info(i386_cpu_info_t *info_p) DBG(" features : 0x%016llx\n", info_p->cpuid_features); DBG(" extfeatures : 0x%016llx\n", info_p->cpuid_extfeatures); DBG(" logical_per_package : %d\n", info_p->cpuid_logical_per_package); - DBG(" microcode_version : 0x%08x\n", info_p->cpuid_microcode_version); + DBG(" microcode_version : 0x%08x\n", info_p->cpuid_microcode_version); /* Fold in the Invariant TSC feature bit, if present */ if (info_p->cpuid_max_ext >= 0x80000007) { @@ -1089,49 +1078,70 @@ cpuid_leaf7_features(void) { return cpuid_info()->cpuid_leaf7_features; } - -#if MACH_KDB +static i386_vmm_info_t *_cpuid_vmm_infop = NULL; +static i386_vmm_info_t _cpuid_vmm_info; -/* - * Display the cpuid - * * - * cp - */ -void -db_cpuid(__unused db_expr_t addr, - __unused int have_addr, - __unused db_expr_t count, - __unused char *modif) +static void +cpuid_init_vmm_info(i386_vmm_info_t *info_p) { + uint32_t reg[4]; + uint32_t max_vmm_leaf; - uint32_t i, mid; - uint32_t cpid[4]; + bzero(info_p, sizeof(*info_p)); - do_cpuid(0, cpid); /* Get the first 
cpuid which is the number of - * basic ids */ - db_printf("%08X - %08X %08X %08X %08X\n", - 0, cpid[eax], cpid[ebx], cpid[ecx], cpid[edx]); + if (!cpuid_vmm_present()) + return; - mid = cpid[eax]; /* Set the number */ - for (i = 1; i <= mid; i++) { /* Dump 'em out */ - do_cpuid(i, cpid); /* Get the next */ - db_printf("%08X - %08X %08X %08X %08X\n", - i, cpid[eax], cpid[ebx], cpid[ecx], cpid[edx]); + DBG("cpuid_init_vmm_info(%p)\n", info_p); + + /* do cpuid 0x40000000 to get VMM vendor */ + cpuid_fn(0x40000000, reg); + max_vmm_leaf = reg[eax]; + bcopy((char *)&reg[ebx], &info_p->cpuid_vmm_vendor[0], 4); + bcopy((char *)&reg[ecx], &info_p->cpuid_vmm_vendor[4], 4); + bcopy((char *)&reg[edx], &info_p->cpuid_vmm_vendor[8], 4); + info_p->cpuid_vmm_vendor[12] = '\0'; + + if (0 == strcmp(info_p->cpuid_vmm_vendor, CPUID_VMM_ID_VMWARE)) { + /* VMware identification string: kb.vmware.com/kb/1009458 */ + info_p->cpuid_vmm_family = CPUID_VMM_FAMILY_VMWARE; + } else { + info_p->cpuid_vmm_family = CPUID_VMM_FAMILY_UNKNOWN; } - db_printf("\n"); - - do_cpuid(0x80000000, cpid); /* Get the first extended cpuid which - * is the number of extended ids */ - db_printf("%08X - %08X %08X %08X %08X\n", - 0x80000000, cpid[eax], cpid[ebx], cpid[ecx], cpid[edx]); - - mid = cpid[eax]; /* Set the number */ - for (i = 0x80000001; i <= mid; i++) { /* Dump 'em out */ - do_cpuid(i, cpid); /* Get the next */ - db_printf("%08X - %08X %08X %08X %08X\n", - i, cpid[eax], cpid[ebx], cpid[ecx], cpid[edx]); + + /* VMM generic leaves: https://lkml.org/lkml/2008/10/1/246 */ + if (max_vmm_leaf >= 0x40000010) { + cpuid_fn(0x40000010, reg); + + info_p->cpuid_vmm_tsc_frequency = reg[eax]; + info_p->cpuid_vmm_bus_frequency = reg[ebx]; } + + DBG(" vmm_vendor : %s\n", info_p->cpuid_vmm_vendor); + DBG(" vmm_family : %u\n", info_p->cpuid_vmm_family); + DBG(" vmm_bus_frequency : %u\n", info_p->cpuid_vmm_bus_frequency); + DBG(" vmm_tsc_frequency : %u\n", info_p->cpuid_vmm_tsc_frequency); } -#endif +boolean_t +cpuid_vmm_present(void) +{ + return (cpuid_features() & CPUID_FEATURE_VMM) ? TRUE : FALSE; +} + +i386_vmm_info_t * +cpuid_vmm_info(void) +{ + if (_cpuid_vmm_infop == NULL) { + cpuid_init_vmm_info(&_cpuid_vmm_info); + _cpuid_vmm_infop = &_cpuid_vmm_info; + } + return _cpuid_vmm_infop; +} + +uint32_t +cpuid_vmm_family(void) +{ + return cpuid_vmm_info()->cpuid_vmm_family; +} diff --git a/osfmk/i386/cpuid.h b/osfmk/i386/cpuid.h index bc7fae019..1bc3e2927 100644 --- a/osfmk/i386/cpuid.h +++ b/osfmk/i386/cpuid.h @@ -44,6 +44,8 @@ #define CPUID_VID_INTEL "GenuineIntel" #define CPUID_VID_AMD "AuthenticAMD" +#define CPUID_VMM_ID_VMWARE "VMwareVMware" + #define CPUID_STRING_UNKNOWN "Unknown CPU Typ" #define _Bit(n) (1ULL << n) @@ -112,8 +114,12 @@ #define CPUID_FEATURE_OSXSAVE _HBit(27) /* XGETBV/XSETBV instructions */ #define CPUID_FEATURE_AVX1_0 _HBit(28) /* AVX 1.0 instructions */ #define CPUID_FEATURE_VMM _HBit(31) /* VMM (Hypervisor) present */ -#define CPUID_FEATURE_RDRAND _HBit(29) /* RDRAND instruction */ -#define CPUID_FEATURE_F16C _HBit(30) /* Float16 convert instructions */ +#define CPUID_FEATURE_SEGLIM64 _HBit(11) /* 64-bit segment limit checking */ +#define CPUID_FEATURE_PCID _HBit(17) /* ASID-PCID support */ +#define CPUID_FEATURE_TSCTMR _HBit(24) /* TSC deadline timer */ +#define CPUID_FEATURE_AVX1_0 _HBit(28) /* AVX 1.0 instructions */ +#define CPUID_FEATURE_F16C _HBit(29) /* Float16 convert instructions */ +#define CPUID_FEATURE_RDRAND _HBit(30) /* RDRAND instruction */ /* * Leaf 7, subleaf 0 additional features.
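cpuid_init_vmm_info() above reads the hypervisor vendor string from CPUID leaf 0x40000000, gated on the CPUID_FEATURE_VMM bit (bit 31 of leaf 1's ECX). The same probe in a self-contained user-level form, using the GCC/Clang <cpuid.h> helper as an illustration of the protocol, not of how xnu itself issues CPUID:

    #include <cpuid.h>
    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
        unsigned int eax, ebx, ecx, edx;
        char vendor[13];

        __cpuid(1, eax, ebx, ecx, edx);
        if (!(ecx & (1u << 31)))        /* CPUID_FEATURE_VMM clear: bare metal */
            return 0;

        __cpuid(0x40000000, eax, ebx, ecx, edx);
        memcpy(vendor + 0, &ebx, 4);    /* vendor id lives in EBX:ECX:EDX */
        memcpy(vendor + 4, &ecx, 4);
        memcpy(vendor + 8, &edx, 4);
        vendor[12] = '\0';
        printf("hypervisor: %s\n", vendor);   /* e.g. CPUID_VMM_ID_VMWARE */
        return 0;
    }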
@@ -162,6 +168,9 @@ #define CPUID_MODEL_IVYBRIDGE 0x3A +#define CPUID_VMM_FAMILY_UNKNOWN 0x0 +#define CPUID_VMM_FAMILY_VMWARE 0x1 + #ifndef ASSEMBLER #include #include @@ -337,6 +346,15 @@ typedef struct { uint32_t cpuid_leaf7_features; } i386_cpu_info_t; +#ifdef MACH_KERNEL_PRIVATE +typedef struct { + char cpuid_vmm_vendor[16]; + uint32_t cpuid_vmm_family; + uint32_t cpuid_vmm_bus_frequency; + uint32_t cpuid_vmm_tsc_frequency; +} i386_vmm_info_t; +#endif + #ifdef __cplusplus extern "C" { #endif @@ -364,6 +382,12 @@ extern i386_cpu_info_t *cpuid_info(void); extern void cpuid_set_info(void); +#ifdef MACH_KERNEL_PRIVATE +extern boolean_t cpuid_vmm_present(void); +extern i386_vmm_info_t *cpuid_vmm_info(void); +extern uint32_t cpuid_vmm_family(void); +#endif + #ifdef __cplusplus } #endif diff --git a/osfmk/i386/db_disasm.c b/osfmk/i386/db_disasm.c deleted file mode 100644 index c68d65237..000000000 --- a/osfmk/i386/db_disasm.c +++ /dev/null @@ -1,1826 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:36 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:25:37 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.2.8.3 1996/07/31 09:43:35 paire - * Merged with nmk20b7_shared (1.2.11.1) - * [96/06/10 paire] - * - * Revision 1.2.11.1 1996/05/14 13:49:36 paire - * Added support for new cmpxchg8b, cpuid, rdtsc, rdwmr, rsm and wrmsr - * Pentium instructions - * [95/11/23 paire] - * - * Revision 1.2.8.2 1994/09/23 01:50:45 ezf - * change marker to not FREE - * [1994/09/22 21:21:17 ezf] - * - * Revision 1.2.8.1 1994/09/16 15:26:28 emcmanus - * Only skip over GAS-inserted NOPs after branches if they are really - * NOPs; this depends at least on assembler options. - * [1994/09/16 15:26:03 emcmanus] - * - * Revision 1.2.6.3 1994/02/19 15:40:34 bolinger - * For load/store counting, mark all varieties of "call" as writing - * memory. - * [1994/02/15 20:25:18 bolinger] - * - * Revision 1.2.6.2 1994/02/14 21:46:49 dwm - * Warning repair - * [1994/02/14 21:46:14 dwm] - * - * Revision 1.2.6.1 1994/02/12 23:26:05 bolinger - * Implement load/store counting for ddb "until" command. 
- * [1994/02/12 03:34:55 bolinger] - * - * Revision 1.2.2.3 1993/08/09 19:39:21 dswartz - * Add ANSI prototypes - CR#9523 - * [1993/08/06 17:44:13 dswartz] - * - * Revision 1.2.2.2 1993/06/09 02:27:29 gm - * Added to OSF/1 R1.3 from NMK15.0. - * [1993/06/02 21:03:54 jeffc] - * - * Revision 1.2 1993/04/19 16:12:57 devrcs - * Print file names and lineno on branch instructions. - * [barbou@gr.osf.org] - * [92/12/03 bernadat] - * - * Revision 1.1 1992/09/30 02:02:19 robert - * Initial revision - * - * $EndLog$ - */ -/* CMU_HIST */ -/* - * Revision 2.5.3.1 92/03/03 16:14:27 jeffreyh - * Pick up changes from TRUNK - * [92/02/26 11:05:06 jeffreyh] - * - * Revision 2.6 92/01/03 20:05:00 dbg - * Add a switch to disassemble 16-bit code. - * Fix spelling of 'lods' opcodes. - * [91/10/30 dbg] - * - * Revision 2.5 91/10/09 16:05:58 af - * Supported disassemble of non current task by passing task parameter. - * [91/08/29 tak] - * - * Revision 2.4 91/05/14 16:05:04 mrt - * Correcting copyright - * - * Revision 2.3 91/02/05 17:11:03 mrt - * Changed to new Mach copyright - * [91/02/01 17:31:03 mrt] - * - * Revision 2.2 90/08/27 21:55:56 dbg - * Fix register operand for move to/from control/test/debug - * register instructions. Add i486 instructions. - * [90/08/27 dbg] - * - * Import db_sym.h. Print instruction displacements in - * current radix (signed). Change calling sequence of - * db_disasm. - * [90/08/21 dbg] - * Fix includes. - * [90/08/08 dbg] - * Created. - * [90/07/25 dbg] - * - */ -/* CMU_ENDHIST */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ - -/* - * Instruction disassembler. - */ - -#include -#include - -#include -#include -#include - -#include -#include - -struct i_addr { - int is_reg; /* if reg, reg number is in 'disp' */ - int disp; - char * base; - char * index; - int ss; -}; - -/* Forward */ - -extern db_addr_t db_read_address( - db_addr_t loc, - int short_addr, - int regmodrm, - struct i_addr * addrp, - task_t task); -extern void db_print_address( - char * seg, - int size, - struct i_addr *addrp, - task_t task); -extern db_addr_t db_disasm_esc( - db_addr_t loc, - int inst, - int short_addr, - int size, - char * seg, - task_t task); - -/* - * Switch to disassemble 16-bit code. 
- */ -boolean_t db_disasm_16 = FALSE; - -/* - * Size attributes - */ -#define BYTE 0 -#define WORD 1 -#define LONG 2 -#define QUAD 3 -#define SNGL 4 -#define DBLR 5 -#define EXTR 6 -#define SDEP 7 -#define NONE 8 - -/* - * Addressing modes - */ -#define E 1 /* general effective address */ -#define Eind 2 /* indirect address (jump, call) */ -#define Ew 3 /* address, word size */ -#define Eb 4 /* address, byte size */ -#define R 5 /* register, in 'reg' field */ -#define Rw 6 /* word register, in 'reg' field */ -#define Ri 7 /* register in instruction */ -#define S 8 /* segment reg, in 'reg' field */ -#define Si 9 /* segment reg, in instruction */ -#define A 10 /* accumulator */ -#define BX 11 /* (bx) */ -#define CL 12 /* cl, for shifts */ -#define DX 13 /* dx, for IO */ -#define SI 14 /* si */ -#define DI 15 /* di */ -#define CR 16 /* control register */ -#define DR 17 /* debug register */ -#define TR 18 /* test register */ -#define I 19 /* immediate, unsigned */ -#define Is 20 /* immediate, signed */ -#define Ib 21 /* byte immediate, unsigned */ -#define Ibs 22 /* byte immediate, signed */ -#define Iw 23 /* word immediate, unsigned */ -#define Il 24 /* long immediate */ -#define O 25 /* direct address */ -#define Db 26 /* byte displacement from EIP */ -#define Dl 27 /* long displacement from EIP */ -#define o1 28 /* constant 1 */ -#define o3 29 /* constant 3 */ -#define OS 30 /* immediate offset/segment */ -#define ST 31 /* FP stack top */ -#define STI 32 /* FP stack */ -#define X 33 /* extended FP op */ -#define XA 34 /* for 'fstcw %ax' */ - -struct inst { - char * i_name; /* name */ - short i_has_modrm; /* has regmodrm byte */ - short i_size; /* operand size */ - int i_mode; /* addressing modes */ - char * i_extra; /* pointer to extra opcode table */ -}; - -#define op1(x) (x) -#define op2(x,y) ((x)|((y)<<8)) -#define op3(x,y,z) ((x)|((y)<<8)|((z)<<16)) - -struct finst { - char * f_name; /* name for memory instruction */ - int f_size; /* size for memory instruction */ - int f_rrmode; /* mode for rr instruction */ - char * f_rrname; /* name for rr instruction - (or pointer to table) */ -}; - -char * db_Grp6[] = { - "sldt", - "str", - "lldt", - "ltr", - "verr", - "verw", - "", - "" -}; - -char * db_Grp7[] = { - "sgdt", - "sidt", - "lgdt", - "lidt", - "smsw", - "", - "lmsw", - "invlpg" -}; - -char * db_Grp8[] = { - "", - "", - "", - "", - "bt", - "bts", - "btr", - "btc" -}; - -struct inst db_inst_0f0x[] = { -/*00*/ { "", TRUE, NONE, op1(Ew), (char *)db_Grp6 }, -/*01*/ { "", TRUE, NONE, op1(Ew), (char *)db_Grp7 }, -/*02*/ { "lar", TRUE, LONG, op2(E,R), 0 }, -/*03*/ { "lsl", TRUE, LONG, op2(E,R), 0 }, -/*04*/ { "", FALSE, NONE, 0, 0 }, -/*05*/ { "", FALSE, NONE, 0, 0 }, -/*06*/ { "clts", FALSE, NONE, 0, 0 }, -/*07*/ { "", FALSE, NONE, 0, 0 }, - -/*08*/ { "invd", FALSE, NONE, 0, 0 }, -/*09*/ { "wbinvd",FALSE, NONE, 0, 0 }, -/*0a*/ { "", FALSE, NONE, 0, 0 }, -/*0b*/ { "", FALSE, NONE, 0, 0 }, -/*0c*/ { "", FALSE, NONE, 0, 0 }, -/*0d*/ { "", FALSE, NONE, 0, 0 }, -/*0e*/ { "", FALSE, NONE, 0, 0 }, -/*0f*/ { "", FALSE, NONE, 0, 0 }, -}; - -struct inst db_inst_0f2x[] = { -/*20*/ { "mov", TRUE, LONG, op2(CR,E), 0 }, /* use E for reg */ -/*21*/ { "mov", TRUE, LONG, op2(DR,E), 0 }, /* since mod == 11 */ -/*22*/ { "mov", TRUE, LONG, op2(E,CR), 0 }, -/*23*/ { "mov", TRUE, LONG, op2(E,DR), 0 }, -/*24*/ { "mov", TRUE, LONG, op2(TR,E), 0 }, -/*25*/ { "", FALSE, NONE, 0, 0 }, -/*26*/ { "mov", TRUE, LONG, op2(E,TR), 0 }, -/*27*/ { "", FALSE, NONE, 0, 0 }, - -/*28*/ { "", FALSE, NONE, 0, 0 }, -/*29*/ { "", 
FALSE, NONE, 0, 0 }, -/*2a*/ { "", FALSE, NONE, 0, 0 }, -/*2b*/ { "", FALSE, NONE, 0, 0 }, -/*2c*/ { "", FALSE, NONE, 0, 0 }, -/*2d*/ { "", FALSE, NONE, 0, 0 }, -/*2e*/ { "", FALSE, NONE, 0, 0 }, -/*2f*/ { "", FALSE, NONE, 0, 0 }, -}; -struct inst db_inst_0f3x[] = { -/*30*/ { "rdtsc", FALSE, NONE, 0, 0 }, -/*31*/ { "rdmsr", FALSE, NONE, 0, 0 }, -/*32*/ { "wrmsr", FALSE, NONE, 0, 0 }, -/*33*/ { "", FALSE, NONE, 0, 0 }, -/*34*/ { "", FALSE, NONE, 0, 0 }, -/*35*/ { "", FALSE, NONE, 0, 0 }, -/*36*/ { "", FALSE, NONE, 0, 0 }, -/*37*/ { "", FALSE, NONE, 0, 0 }, - -/*38*/ { "", FALSE, NONE, 0, 0 }, -/*39*/ { "", FALSE, NONE, 0, 0 }, -/*3a*/ { "", FALSE, NONE, 0, 0 }, -/*3b*/ { "", FALSE, NONE, 0, 0 }, -/*3c*/ { "", FALSE, NONE, 0, 0 }, -/*3d*/ { "", FALSE, NONE, 0, 0 }, -/*3e*/ { "", FALSE, NONE, 0, 0 }, -/*3f*/ { "", FALSE, NONE, 0, 0 }, -}; - -struct inst db_inst_0f8x[] = { -/*80*/ { "jo", FALSE, NONE, op1(Dl), 0 }, -/*81*/ { "jno", FALSE, NONE, op1(Dl), 0 }, -/*82*/ { "jb", FALSE, NONE, op1(Dl), 0 }, -/*83*/ { "jnb", FALSE, NONE, op1(Dl), 0 }, -/*84*/ { "jz", FALSE, NONE, op1(Dl), 0 }, -/*85*/ { "jnz", FALSE, NONE, op1(Dl), 0 }, -/*86*/ { "jbe", FALSE, NONE, op1(Dl), 0 }, -/*87*/ { "jnbe", FALSE, NONE, op1(Dl), 0 }, - -/*88*/ { "js", FALSE, NONE, op1(Dl), 0 }, -/*89*/ { "jns", FALSE, NONE, op1(Dl), 0 }, -/*8a*/ { "jp", FALSE, NONE, op1(Dl), 0 }, -/*8b*/ { "jnp", FALSE, NONE, op1(Dl), 0 }, -/*8c*/ { "jl", FALSE, NONE, op1(Dl), 0 }, -/*8d*/ { "jnl", FALSE, NONE, op1(Dl), 0 }, -/*8e*/ { "jle", FALSE, NONE, op1(Dl), 0 }, -/*8f*/ { "jnle", FALSE, NONE, op1(Dl), 0 }, -}; - -struct inst db_inst_0f9x[] = { -/*90*/ { "seto", TRUE, NONE, op1(Eb), 0 }, -/*91*/ { "setno", TRUE, NONE, op1(Eb), 0 }, -/*92*/ { "setb", TRUE, NONE, op1(Eb), 0 }, -/*93*/ { "setnb", TRUE, NONE, op1(Eb), 0 }, -/*94*/ { "setz", TRUE, NONE, op1(Eb), 0 }, -/*95*/ { "setnz", TRUE, NONE, op1(Eb), 0 }, -/*96*/ { "setbe", TRUE, NONE, op1(Eb), 0 }, -/*97*/ { "setnbe",TRUE, NONE, op1(Eb), 0 }, - -/*98*/ { "sets", TRUE, NONE, op1(Eb), 0 }, -/*99*/ { "setns", TRUE, NONE, op1(Eb), 0 }, -/*9a*/ { "setp", TRUE, NONE, op1(Eb), 0 }, -/*9b*/ { "setnp", TRUE, NONE, op1(Eb), 0 }, -/*9c*/ { "setl", TRUE, NONE, op1(Eb), 0 }, -/*9d*/ { "setnl", TRUE, NONE, op1(Eb), 0 }, -/*9e*/ { "setle", TRUE, NONE, op1(Eb), 0 }, -/*9f*/ { "setnle",TRUE, NONE, op1(Eb), 0 }, -}; - -struct inst db_inst_0fax[] = { -/*a0*/ { "push", FALSE, NONE, op1(Si), 0 }, -/*a1*/ { "pop", FALSE, NONE, op1(Si), 0 }, -/*a2*/ { "cpuid", FALSE, NONE, 0, 0 }, -/*a3*/ { "bt", TRUE, LONG, op2(E,R), 0 }, -/*a4*/ { "shld", TRUE, LONG, op3(Ib,E,R), 0 }, -/*a5*/ { "shld", TRUE, LONG, op3(CL,E,R), 0 }, -/*a6*/ { "", FALSE, NONE, 0, 0 }, -/*a7*/ { "", FALSE, NONE, 0, 0 }, - -/*a8*/ { "push", FALSE, NONE, op1(Si), 0 }, -/*a9*/ { "pop", FALSE, NONE, op1(Si), 0 }, -/*aa*/ { "rsm", FALSE, NONE, 0, 0 }, -/*ab*/ { "bts", TRUE, LONG, op2(E,R), 0 }, -/*ac*/ { "shrd", TRUE, LONG, op3(Ib,E,R), 0 }, -/*ad*/ { "shrd", TRUE, LONG, op3(CL,E,R), 0 }, -/*a6*/ { "", FALSE, NONE, 0, 0 }, -/*a7*/ { "imul", TRUE, LONG, op2(E,R), 0 }, -}; - -struct inst db_inst_0fbx[] = { -/*b0*/ { "", FALSE, NONE, 0, 0 }, -/*b1*/ { "", FALSE, NONE, 0, 0 }, -/*b2*/ { "lss", TRUE, LONG, op2(E, R), 0 }, -/*b3*/ { "bts", TRUE, LONG, op2(R, E), 0 }, -/*b4*/ { "lfs", TRUE, LONG, op2(E, R), 0 }, -/*b5*/ { "lgs", TRUE, LONG, op2(E, R), 0 }, -/*b6*/ { "movzb", TRUE, LONG, op2(E, R), 0 }, -/*b7*/ { "movzw", TRUE, LONG, op2(E, R), 0 }, - -/*b8*/ { "", FALSE, NONE, 0, 0 }, -/*b9*/ { "", FALSE, NONE, 0, 0 }, -/*ba*/ { "", TRUE, LONG, op2(Is, E), 
(char *)db_Grp8 }, -/*bb*/ { "btc", TRUE, LONG, op2(R, E), 0 }, -/*bc*/ { "bsf", TRUE, LONG, op2(E, R), 0 }, -/*bd*/ { "bsr", TRUE, LONG, op2(E, R), 0 }, -/*be*/ { "movsb", TRUE, LONG, op2(E, R), 0 }, -/*bf*/ { "movsw", TRUE, LONG, op2(E, R), 0 }, -}; - -struct inst db_inst_0fcx[] = { -/*c0*/ { "xadd", TRUE, BYTE, op2(R, E), 0 }, -/*c1*/ { "xadd", TRUE, LONG, op2(R, E), 0 }, -/*c2*/ { "", FALSE, NONE, 0, 0 }, -/*c3*/ { "", FALSE, NONE, 0, 0 }, -/*c4*/ { "", FALSE, NONE, 0, 0 }, -/*c5*/ { "", FALSE, NONE, 0, 0 }, -/*c6*/ { "", FALSE, NONE, 0, 0 }, -/*c7*/ { "cmpxchg8b", FALSE, NONE, op1(E), 0 }, -/*c8*/ { "bswap", FALSE, LONG, op1(Ri), 0 }, -/*c9*/ { "bswap", FALSE, LONG, op1(Ri), 0 }, -/*ca*/ { "bswap", FALSE, LONG, op1(Ri), 0 }, -/*cb*/ { "bswap", FALSE, LONG, op1(Ri), 0 }, -/*cc*/ { "bswap", FALSE, LONG, op1(Ri), 0 }, -/*cd*/ { "bswap", FALSE, LONG, op1(Ri), 0 }, -/*ce*/ { "bswap", FALSE, LONG, op1(Ri), 0 }, -/*cf*/ { "bswap", FALSE, LONG, op1(Ri), 0 }, -}; - -struct inst db_inst_0fdx[] = { -/*c0*/ { "cmpxchg",TRUE, BYTE, op2(R, E), 0 }, -/*c1*/ { "cmpxchg",TRUE, LONG, op2(R, E), 0 }, -/*c2*/ { "", FALSE, NONE, 0, 0 }, -/*c3*/ { "", FALSE, NONE, 0, 0 }, -/*c4*/ { "", FALSE, NONE, 0, 0 }, -/*c5*/ { "", FALSE, NONE, 0, 0 }, -/*c6*/ { "", FALSE, NONE, 0, 0 }, -/*c7*/ { "", FALSE, NONE, 0, 0 }, -/*c8*/ { "", FALSE, NONE, 0, 0 }, -/*c9*/ { "", FALSE, NONE, 0, 0 }, -/*ca*/ { "", FALSE, NONE, 0, 0 }, -/*cb*/ { "", FALSE, NONE, 0, 0 }, -/*cc*/ { "", FALSE, NONE, 0, 0 }, -/*cd*/ { "", FALSE, NONE, 0, 0 }, -/*ce*/ { "", FALSE, NONE, 0, 0 }, -/*cf*/ { "", FALSE, NONE, 0, 0 }, -}; - -struct inst *db_inst_0f[] = { - db_inst_0f0x, - 0, - db_inst_0f2x, - db_inst_0f3x, - 0, - 0, - 0, - 0, - db_inst_0f8x, - db_inst_0f9x, - db_inst_0fax, - db_inst_0fbx, - db_inst_0fcx, - db_inst_0fdx, - 0, - 0 -}; - -char * db_Esc92[] = { - "fnop", "", "", "", "", "", "", "" -}; -char * db_Esc93[] = { - "", "", "", "", "", "", "", "" -}; -char * db_Esc94[] = { - "fchs", "fabs", "", "", "ftst", "fxam", "", "" -}; -char * db_Esc95[] = { - "fld1", "fldl2t","fldl2e","fldpi","fldlg2","fldln2","fldz","" -}; -char * db_Esc96[] = { - "f2xm1","fyl2x","fptan","fpatan","fxtract","fprem1","fdecstp", - "fincstp" -}; -char * db_Esc97[] = { - "fprem","fyl2xp1","fsqrt","fsincos","frndint","fscale","fsin","fcos" -}; - -char * db_Esca4[] = { - "", "fucompp","", "", "", "", "", "" -}; - -char * db_Escb4[] = { - "", "", "fnclex","fninit","", "", "", "" -}; - -char * db_Esce3[] = { - "", "fcompp","", "", "", "", "", "" -}; - -char * db_Escf4[] = { - "fnstsw","", "", "", "", "", "", "" -}; - -struct finst db_Esc8[] = { -/*0*/ { "fadd", SNGL, op2(STI,ST), 0 }, -/*1*/ { "fmul", SNGL, op2(STI,ST), 0 }, -/*2*/ { "fcom", SNGL, op2(STI,ST), 0 }, -/*3*/ { "fcomp", SNGL, op2(STI,ST), 0 }, -/*4*/ { "fsub", SNGL, op2(STI,ST), 0 }, -/*5*/ { "fsubr", SNGL, op2(STI,ST), 0 }, -/*6*/ { "fdiv", SNGL, op2(STI,ST), 0 }, -/*7*/ { "fdivr", SNGL, op2(STI,ST), 0 }, -}; - -struct finst db_Esc9[] = { -/*0*/ { "fld", SNGL, op1(STI), 0 }, -/*1*/ { "", NONE, op1(STI), "fxch" }, -/*2*/ { "fst", SNGL, op1(X), (char *)db_Esc92 }, -/*3*/ { "fstp", SNGL, op1(X), (char *)db_Esc93 }, -/*4*/ { "fldenv", NONE, op1(X), (char *)db_Esc94 }, -/*5*/ { "fldcw", NONE, op1(X), (char *)db_Esc95 }, -/*6*/ { "fnstenv",NONE, op1(X), (char *)db_Esc96 }, -/*7*/ { "fnstcw", NONE, op1(X), (char *)db_Esc97 }, -}; - -struct finst db_Esca[] = { -/*0*/ { "fiadd", WORD, 0, 0 }, -/*1*/ { "fimul", WORD, 0, 0 }, -/*2*/ { "ficom", WORD, 0, 0 }, -/*3*/ { "ficomp", WORD, 0, 0 }, -/*4*/ { "fisub", WORD, 
op1(X), (char *)db_Esca4 }, -/*5*/ { "fisubr", WORD, 0, 0 }, -/*6*/ { "fidiv", WORD, 0, 0 }, -/*7*/ { "fidivr", WORD, 0, 0 } -}; - -struct finst db_Escb[] = { -/*0*/ { "fild", WORD, 0, 0 }, -/*1*/ { "", NONE, 0, 0 }, -/*2*/ { "fist", WORD, 0, 0 }, -/*3*/ { "fistp", WORD, 0, 0 }, -/*4*/ { "", WORD, op1(X), (char *)db_Escb4 }, -/*5*/ { "fld", EXTR, 0, 0 }, -/*6*/ { "", WORD, 0, 0 }, -/*7*/ { "fstp", EXTR, 0, 0 }, -}; - -struct finst db_Escc[] = { -/*0*/ { "fadd", DBLR, op2(ST,STI), 0 }, -/*1*/ { "fmul", DBLR, op2(ST,STI), 0 }, -/*2*/ { "fcom", DBLR, op2(ST,STI), 0 }, -/*3*/ { "fcomp", DBLR, op2(ST,STI), 0 }, -/*4*/ { "fsub", DBLR, op2(ST,STI), "fsubr" }, -/*5*/ { "fsubr", DBLR, op2(ST,STI), "fsub" }, -/*6*/ { "fdiv", DBLR, op2(ST,STI), "fdivr" }, -/*7*/ { "fdivr", DBLR, op2(ST,STI), "fdiv" }, -}; - -struct finst db_Escd[] = { -/*0*/ { "fld", DBLR, op1(STI), "ffree" }, -/*1*/ { "", NONE, 0, 0 }, -/*2*/ { "fst", DBLR, op1(STI), 0 }, -/*3*/ { "fstp", DBLR, op1(STI), 0 }, -/*4*/ { "frstor", NONE, op1(STI), "fucom" }, -/*5*/ { "", NONE, op1(STI), "fucomp" }, -/*6*/ { "fnsave", NONE, 0, 0 }, -/*7*/ { "fnstsw", NONE, 0, 0 }, -}; - -struct finst db_Esce[] = { -/*0*/ { "fiadd", LONG, op2(ST,STI), "faddp" }, -/*1*/ { "fimul", LONG, op2(ST,STI), "fmulp" }, -/*2*/ { "ficom", LONG, 0, 0 }, -/*3*/ { "ficomp", LONG, op1(X), (char *)db_Esce3 }, -/*4*/ { "fisub", LONG, op2(ST,STI), "fsubrp" }, -/*5*/ { "fisubr", LONG, op2(ST,STI), "fsubp" }, -/*6*/ { "fidiv", LONG, op2(ST,STI), "fdivrp" }, -/*7*/ { "fidivr", LONG, op2(ST,STI), "fdivp" }, -}; - -struct finst db_Escf[] = { -/*0*/ { "fild", LONG, 0, 0 }, -/*1*/ { "", LONG, 0, 0 }, -/*2*/ { "fist", LONG, 0, 0 }, -/*3*/ { "fistp", LONG, 0, 0 }, -/*4*/ { "fbld", NONE, op1(XA), (char *)db_Escf4 }, -/*5*/ { "fld", QUAD, 0, 0 }, -/*6*/ { "fbstp", NONE, 0, 0 }, -/*7*/ { "fstp", QUAD, 0, 0 }, -}; - -struct finst *db_Esc_inst[] = { - db_Esc8, db_Esc9, db_Esca, db_Escb, - db_Escc, db_Escd, db_Esce, db_Escf -}; - -char * db_Grp1[] = { - "add", - "or", - "adc", - "sbb", - "and", - "sub", - "xor", - "cmp" -}; - -char * db_Grp2[] = { - "rol", - "ror", - "rcl", - "rcr", - "shl", - "shr", - "shl", - "sar" -}; - -struct inst db_Grp3[] = { - { "test", TRUE, NONE, op2(I,E), 0 }, - { "test", TRUE, NONE, op2(I,E), 0 }, - { "not", TRUE, NONE, op1(E), 0 }, - { "neg", TRUE, NONE, op1(E), 0 }, - { "mul", TRUE, NONE, op2(E,A), 0 }, - { "imul", TRUE, NONE, op2(E,A), 0 }, - { "div", TRUE, NONE, op2(E,A), 0 }, - { "idiv", TRUE, NONE, op2(E,A), 0 }, -}; - -struct inst db_Grp4[] = { - { "inc", TRUE, BYTE, op1(E), 0 }, - { "dec", TRUE, BYTE, op1(E), 0 }, - { "", TRUE, NONE, 0, 0 }, - { "", TRUE, NONE, 0, 0 }, - { "", TRUE, NONE, 0, 0 }, - { "", TRUE, NONE, 0, 0 }, - { "", TRUE, NONE, 0, 0 }, - { "", TRUE, NONE, 0, 0 } -}; - -struct inst db_Grp5[] = { - { "inc", TRUE, LONG, op1(E), 0 }, - { "dec", TRUE, LONG, op1(E), 0 }, - { "call", TRUE, NONE, op1(Eind),0 }, - { "lcall", TRUE, NONE, op1(Eind),0 }, - { "jmp", TRUE, NONE, op1(Eind),0 }, - { "ljmp", TRUE, NONE, op1(Eind),0 }, - { "push", TRUE, LONG, op1(E), 0 }, - { "", TRUE, NONE, 0, 0 } -}; - -struct inst db_inst_table[256] = { -/*00*/ { "add", TRUE, BYTE, op2(R, E), 0 }, -/*01*/ { "add", TRUE, LONG, op2(R, E), 0 }, -/*02*/ { "add", TRUE, BYTE, op2(E, R), 0 }, -/*03*/ { "add", TRUE, LONG, op2(E, R), 0 }, -/*04*/ { "add", FALSE, BYTE, op2(Is, A), 0 }, -/*05*/ { "add", FALSE, LONG, op2(Is, A), 0 }, -/*06*/ { "push", FALSE, NONE, op1(Si), 0 }, -/*07*/ { "pop", FALSE, NONE, op1(Si), 0 }, - -/*08*/ { "or", TRUE, BYTE, op2(R, E), 0 }, -/*09*/ { 
"or", TRUE, LONG, op2(R, E), 0 }, -/*0a*/ { "or", TRUE, BYTE, op2(E, R), 0 }, -/*0b*/ { "or", TRUE, LONG, op2(E, R), 0 }, -/*0c*/ { "or", FALSE, BYTE, op2(I, A), 0 }, -/*0d*/ { "or", FALSE, LONG, op2(I, A), 0 }, -/*0e*/ { "push", FALSE, NONE, op1(Si), 0 }, -/*0f*/ { "", FALSE, NONE, 0, 0 }, - -/*10*/ { "adc", TRUE, BYTE, op2(R, E), 0 }, -/*11*/ { "adc", TRUE, LONG, op2(R, E), 0 }, -/*12*/ { "adc", TRUE, BYTE, op2(E, R), 0 }, -/*13*/ { "adc", TRUE, LONG, op2(E, R), 0 }, -/*14*/ { "adc", FALSE, BYTE, op2(Is, A), 0 }, -/*15*/ { "adc", FALSE, LONG, op2(Is, A), 0 }, -/*16*/ { "push", FALSE, NONE, op1(Si), 0 }, -/*17*/ { "pop", FALSE, NONE, op1(Si), 0 }, - -/*18*/ { "sbb", TRUE, BYTE, op2(R, E), 0 }, -/*19*/ { "sbb", TRUE, LONG, op2(R, E), 0 }, -/*1a*/ { "sbb", TRUE, BYTE, op2(E, R), 0 }, -/*1b*/ { "sbb", TRUE, LONG, op2(E, R), 0 }, -/*1c*/ { "sbb", FALSE, BYTE, op2(Is, A), 0 }, -/*1d*/ { "sbb", FALSE, LONG, op2(Is, A), 0 }, -/*1e*/ { "push", FALSE, NONE, op1(Si), 0 }, -/*1f*/ { "pop", FALSE, NONE, op1(Si), 0 }, - -/*20*/ { "and", TRUE, BYTE, op2(R, E), 0 }, -/*21*/ { "and", TRUE, LONG, op2(R, E), 0 }, -/*22*/ { "and", TRUE, BYTE, op2(E, R), 0 }, -/*23*/ { "and", TRUE, LONG, op2(E, R), 0 }, -/*24*/ { "and", FALSE, BYTE, op2(I, A), 0 }, -/*25*/ { "and", FALSE, LONG, op2(I, A), 0 }, -/*26*/ { "", FALSE, NONE, 0, 0 }, -/*27*/ { "aaa", FALSE, NONE, 0, 0 }, - -/*28*/ { "sub", TRUE, BYTE, op2(R, E), 0 }, -/*29*/ { "sub", TRUE, LONG, op2(R, E), 0 }, -/*2a*/ { "sub", TRUE, BYTE, op2(E, R), 0 }, -/*2b*/ { "sub", TRUE, LONG, op2(E, R), 0 }, -/*2c*/ { "sub", FALSE, BYTE, op2(Is, A), 0 }, -/*2d*/ { "sub", FALSE, LONG, op2(Is, A), 0 }, -/*2e*/ { "", FALSE, NONE, 0, 0 }, -/*2f*/ { "das", FALSE, NONE, 0, 0 }, - -/*30*/ { "xor", TRUE, BYTE, op2(R, E), 0 }, -/*31*/ { "xor", TRUE, LONG, op2(R, E), 0 }, -/*32*/ { "xor", TRUE, BYTE, op2(E, R), 0 }, -/*33*/ { "xor", TRUE, LONG, op2(E, R), 0 }, -/*34*/ { "xor", FALSE, BYTE, op2(I, A), 0 }, -/*35*/ { "xor", FALSE, LONG, op2(I, A), 0 }, -/*36*/ { "", FALSE, NONE, 0, 0 }, -/*37*/ { "daa", FALSE, NONE, 0, 0 }, - -/*38*/ { "cmp", TRUE, BYTE, op2(R, E), 0 }, -/*39*/ { "cmp", TRUE, LONG, op2(R, E), 0 }, -/*3a*/ { "cmp", TRUE, BYTE, op2(E, R), 0 }, -/*3b*/ { "cmp", TRUE, LONG, op2(E, R), 0 }, -/*3c*/ { "cmp", FALSE, BYTE, op2(Is, A), 0 }, -/*3d*/ { "cmp", FALSE, LONG, op2(Is, A), 0 }, -/*3e*/ { "", FALSE, NONE, 0, 0 }, -/*3f*/ { "aas", FALSE, NONE, 0, 0 }, - -/*40*/ { "inc", FALSE, LONG, op1(Ri), 0 }, -/*41*/ { "inc", FALSE, LONG, op1(Ri), 0 }, -/*42*/ { "inc", FALSE, LONG, op1(Ri), 0 }, -/*43*/ { "inc", FALSE, LONG, op1(Ri), 0 }, -/*44*/ { "inc", FALSE, LONG, op1(Ri), 0 }, -/*45*/ { "inc", FALSE, LONG, op1(Ri), 0 }, -/*46*/ { "inc", FALSE, LONG, op1(Ri), 0 }, -/*47*/ { "inc", FALSE, LONG, op1(Ri), 0 }, - -/*48*/ { "dec", FALSE, LONG, op1(Ri), 0 }, -/*49*/ { "dec", FALSE, LONG, op1(Ri), 0 }, -/*4a*/ { "dec", FALSE, LONG, op1(Ri), 0 }, -/*4b*/ { "dec", FALSE, LONG, op1(Ri), 0 }, -/*4c*/ { "dec", FALSE, LONG, op1(Ri), 0 }, -/*4d*/ { "dec", FALSE, LONG, op1(Ri), 0 }, -/*4e*/ { "dec", FALSE, LONG, op1(Ri), 0 }, -/*4f*/ { "dec", FALSE, LONG, op1(Ri), 0 }, - -/*50*/ { "push", FALSE, LONG, op1(Ri), 0 }, -/*51*/ { "push", FALSE, LONG, op1(Ri), 0 }, -/*52*/ { "push", FALSE, LONG, op1(Ri), 0 }, -/*53*/ { "push", FALSE, LONG, op1(Ri), 0 }, -/*54*/ { "push", FALSE, LONG, op1(Ri), 0 }, -/*55*/ { "push", FALSE, LONG, op1(Ri), 0 }, -/*56*/ { "push", FALSE, LONG, op1(Ri), 0 }, -/*57*/ { "push", FALSE, LONG, op1(Ri), 0 }, - -/*58*/ { "pop", FALSE, LONG, op1(Ri), 0 }, -/*59*/ { "pop", FALSE, 
LONG, op1(Ri), 0 }, -/*5a*/ { "pop", FALSE, LONG, op1(Ri), 0 }, -/*5b*/ { "pop", FALSE, LONG, op1(Ri), 0 }, -/*5c*/ { "pop", FALSE, LONG, op1(Ri), 0 }, -/*5d*/ { "pop", FALSE, LONG, op1(Ri), 0 }, -/*5e*/ { "pop", FALSE, LONG, op1(Ri), 0 }, -/*5f*/ { "pop", FALSE, LONG, op1(Ri), 0 }, - -/*60*/ { "pusha", FALSE, LONG, 0, 0 }, -/*61*/ { "popa", FALSE, LONG, 0, 0 }, -/*62*/ { "bound", TRUE, LONG, op2(E, R), 0 }, -/*63*/ { "arpl", TRUE, NONE, op2(Ew,Rw), 0 }, - -/*64*/ { "", FALSE, NONE, 0, 0 }, -/*65*/ { "", FALSE, NONE, 0, 0 }, -/*66*/ { "", FALSE, NONE, 0, 0 }, -/*67*/ { "", FALSE, NONE, 0, 0 }, - -/*68*/ { "push", FALSE, LONG, op1(I), 0 }, -/*69*/ { "imul", TRUE, LONG, op3(I,E,R), 0 }, -/*6a*/ { "push", FALSE, LONG, op1(Ib), 0 }, -/*6b*/ { "imul", TRUE, LONG, op3(Ibs,E,R),0 }, -/*6c*/ { "ins", FALSE, BYTE, op2(DX, DI), 0 }, -/*6d*/ { "ins", FALSE, LONG, op2(DX, DI), 0 }, -/*6e*/ { "outs", FALSE, BYTE, op2(SI, DX), 0 }, -/*6f*/ { "outs", FALSE, LONG, op2(SI, DX), 0 }, - -/*70*/ { "jo", FALSE, NONE, op1(Db), 0 }, -/*71*/ { "jno", FALSE, NONE, op1(Db), 0 }, -/*72*/ { "jb", FALSE, NONE, op1(Db), 0 }, -/*73*/ { "jnb", FALSE, NONE, op1(Db), 0 }, -/*74*/ { "jz", FALSE, NONE, op1(Db), 0 }, -/*75*/ { "jnz", FALSE, NONE, op1(Db), 0 }, -/*76*/ { "jbe", FALSE, NONE, op1(Db), 0 }, -/*77*/ { "jnbe", FALSE, NONE, op1(Db), 0 }, - -/*78*/ { "js", FALSE, NONE, op1(Db), 0 }, -/*79*/ { "jns", FALSE, NONE, op1(Db), 0 }, -/*7a*/ { "jp", FALSE, NONE, op1(Db), 0 }, -/*7b*/ { "jnp", FALSE, NONE, op1(Db), 0 }, -/*7c*/ { "jl", FALSE, NONE, op1(Db), 0 }, -/*7d*/ { "jnl", FALSE, NONE, op1(Db), 0 }, -/*7e*/ { "jle", FALSE, NONE, op1(Db), 0 }, -/*7f*/ { "jnle", FALSE, NONE, op1(Db), 0 }, - -/*80*/ { "", TRUE, BYTE, op2(I, E), (char *)db_Grp1 }, -/*81*/ { "", TRUE, LONG, op2(I, E), (char *)db_Grp1 }, -/*82*/ { "", TRUE, BYTE, op2(Is,E), (char *)db_Grp1 }, -/*83*/ { "", TRUE, LONG, op2(Ibs,E), (char *)db_Grp1 }, -/*84*/ { "test", TRUE, BYTE, op2(R, E), 0 }, -/*85*/ { "test", TRUE, LONG, op2(R, E), 0 }, -/*86*/ { "xchg", TRUE, BYTE, op2(R, E), 0 }, -/*87*/ { "xchg", TRUE, LONG, op2(R, E), 0 }, - -/*88*/ { "mov", TRUE, BYTE, op2(R, E), 0 }, -/*89*/ { "mov", TRUE, LONG, op2(R, E), 0 }, -/*8a*/ { "mov", TRUE, BYTE, op2(E, R), 0 }, -/*8b*/ { "mov", TRUE, LONG, op2(E, R), 0 }, -/*8c*/ { "mov", TRUE, NONE, op2(S, Ew), 0 }, -/*8d*/ { "lea", TRUE, LONG, op2(E, R), 0 }, -/*8e*/ { "mov", TRUE, NONE, op2(Ew, S), 0 }, -/*8f*/ { "pop", TRUE, LONG, op1(E), 0 }, - -/*90*/ { "nop", FALSE, NONE, 0, 0 }, -/*91*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 }, -/*92*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 }, -/*93*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 }, -/*94*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 }, -/*95*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 }, -/*96*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 }, -/*97*/ { "xchg", FALSE, LONG, op2(A, Ri), 0 }, - -/*98*/ { "cbw", FALSE, SDEP, 0, "cwde" }, /* cbw/cwde */ -/*99*/ { "cwd", FALSE, SDEP, 0, "cdq" }, /* cwd/cdq */ -/*9a*/ { "lcall", FALSE, NONE, op1(OS), 0 }, -/*9b*/ { "wait", FALSE, NONE, 0, 0 }, -/*9c*/ { "pushf", FALSE, LONG, 0, 0 }, -/*9d*/ { "popf", FALSE, LONG, 0, 0 }, -/*9e*/ { "sahf", FALSE, NONE, 0, 0 }, -/*9f*/ { "lahf", FALSE, NONE, 0, 0 }, - -/*a0*/ { "mov", FALSE, BYTE, op2(O, A), 0 }, -/*a1*/ { "mov", FALSE, LONG, op2(O, A), 0 }, -/*a2*/ { "mov", FALSE, BYTE, op2(A, O), 0 }, -/*a3*/ { "mov", FALSE, LONG, op2(A, O), 0 }, -/*a4*/ { "movs", FALSE, BYTE, op2(SI,DI), 0 }, -/*a5*/ { "movs", FALSE, LONG, op2(SI,DI), 0 }, -/*a6*/ { "cmps", FALSE, BYTE, op2(SI,DI), 0 }, -/*a7*/ { "cmps", FALSE, LONG, 
op2(SI,DI), 0 }, - -/*a8*/ { "test", FALSE, BYTE, op2(I, A), 0 }, -/*a9*/ { "test", FALSE, LONG, op2(I, A), 0 }, -/*aa*/ { "stos", FALSE, BYTE, op1(DI), 0 }, -/*ab*/ { "stos", FALSE, LONG, op1(DI), 0 }, -/*ac*/ { "lods", FALSE, BYTE, op1(SI), 0 }, -/*ad*/ { "lods", FALSE, LONG, op1(SI), 0 }, -/*ae*/ { "scas", FALSE, BYTE, op1(DI), 0 }, -/*af*/ { "scas", FALSE, LONG, op1(DI), 0 }, - -/*b0*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, -/*b1*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, -/*b2*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, -/*b3*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, -/*b4*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, -/*b5*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, -/*b6*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, -/*b7*/ { "mov", FALSE, BYTE, op2(I, Ri), 0 }, - -/*b8*/ { "mov", FALSE, LONG, op2(I, Ri), 0 }, -/*b9*/ { "mov", FALSE, LONG, op2(I, Ri), 0 }, -/*ba*/ { "mov", FALSE, LONG, op2(I, Ri), 0 }, -/*bb*/ { "mov", FALSE, LONG, op2(I, Ri), 0 }, -/*bc*/ { "mov", FALSE, LONG, op2(I, Ri), 0 }, -/*bd*/ { "mov", FALSE, LONG, op2(I, Ri), 0 }, -/*be*/ { "mov", FALSE, LONG, op2(I, Ri), 0 }, -/*bf*/ { "mov", FALSE, LONG, op2(I, Ri), 0 }, - -/*c0*/ { "", TRUE, BYTE, op2(Ib, E), (char *)db_Grp2 }, -/*c1*/ { "", TRUE, LONG, op2(Ib, E), (char *)db_Grp2 }, -/*c2*/ { "ret", FALSE, NONE, op1(Iw), 0 }, -/*c3*/ { "ret", FALSE, NONE, 0, 0 }, -/*c4*/ { "les", TRUE, LONG, op2(E, R), 0 }, -/*c5*/ { "lds", TRUE, LONG, op2(E, R), 0 }, -/*c6*/ { "mov", TRUE, BYTE, op2(I, E), 0 }, -/*c7*/ { "mov", TRUE, LONG, op2(I, E), 0 }, - -/*c8*/ { "enter", FALSE, NONE, op2(Ib, Iw), 0 }, -/*c9*/ { "leave", FALSE, NONE, 0, 0 }, -/*ca*/ { "lret", FALSE, NONE, op1(Iw), 0 }, -/*cb*/ { "lret", FALSE, NONE, 0, 0 }, -/*cc*/ { "int", FALSE, NONE, op1(o3), 0 }, -/*cd*/ { "int", FALSE, NONE, op1(Ib), 0 }, -/*ce*/ { "into", FALSE, NONE, 0, 0 }, -/*cf*/ { "iret", FALSE, NONE, 0, 0 }, - -/*d0*/ { "", TRUE, BYTE, op2(o1, E), (char *)db_Grp2 }, -/*d1*/ { "", TRUE, LONG, op2(o1, E), (char *)db_Grp2 }, -/*d2*/ { "", TRUE, BYTE, op2(CL, E), (char *)db_Grp2 }, -/*d3*/ { "", TRUE, LONG, op2(CL, E), (char *)db_Grp2 }, -/*d4*/ { "aam", TRUE, NONE, 0, 0 }, -/*d5*/ { "aad", TRUE, NONE, 0, 0 }, -/*d6*/ { "", FALSE, NONE, 0, 0 }, -/*d7*/ { "xlat", FALSE, BYTE, op1(BX), 0 }, - -/*d8*/ { "", TRUE, NONE, 0, (char *)db_Esc8 }, -/*d9*/ { "", TRUE, NONE, 0, (char *)db_Esc9 }, -/*da*/ { "", TRUE, NONE, 0, (char *)db_Esca }, -/*db*/ { "", TRUE, NONE, 0, (char *)db_Escb }, -/*dc*/ { "", TRUE, NONE, 0, (char *)db_Escc }, -/*dd*/ { "", TRUE, NONE, 0, (char *)db_Escd }, -/*de*/ { "", TRUE, NONE, 0, (char *)db_Esce }, -/*df*/ { "", TRUE, NONE, 0, (char *)db_Escf }, - -/*e0*/ { "loopne",FALSE, NONE, op1(Db), 0 }, -/*e1*/ { "loope", FALSE, NONE, op1(Db), 0 }, -/*e2*/ { "loop", FALSE, NONE, op1(Db), 0 }, -/*e3*/ { "jcxz", FALSE, SDEP, op1(Db), "jecxz" }, -/*e4*/ { "in", FALSE, BYTE, op2(Ib, A), 0 }, -/*e5*/ { "in", FALSE, LONG, op2(Ib, A) , 0 }, -/*e6*/ { "out", FALSE, BYTE, op2(A, Ib), 0 }, -/*e7*/ { "out", FALSE, LONG, op2(A, Ib) , 0 }, - -/*e8*/ { "call", FALSE, NONE, op1(Dl), 0 }, -/*e9*/ { "jmp", FALSE, NONE, op1(Dl), 0 }, -/*ea*/ { "ljmp", FALSE, NONE, op1(OS), 0 }, -/*eb*/ { "jmp", FALSE, NONE, op1(Db), 0 }, -/*ec*/ { "in", FALSE, BYTE, op2(DX, A), 0 }, -/*ed*/ { "in", FALSE, LONG, op2(DX, A) , 0 }, -/*ee*/ { "out", FALSE, BYTE, op2(A, DX), 0 }, -/*ef*/ { "out", FALSE, LONG, op2(A, DX) , 0 }, - -/*f0*/ { "", FALSE, NONE, 0, 0 }, -/*f1*/ { "", FALSE, NONE, 0, 0 }, -/*f2*/ { "", FALSE, NONE, 0, 0 }, -/*f3*/ { "", FALSE, NONE, 0, 0 }, -/*f4*/ { "hlt", FALSE, NONE, 0, 0 },
-/*f5*/ { "cmc", FALSE, NONE, 0, 0 }, -/*f6*/ { "", TRUE, BYTE, 0, (char *)db_Grp3 }, -/*f7*/ { "", TRUE, LONG, 0, (char *)db_Grp3 }, - -/*f8*/ { "clc", FALSE, NONE, 0, 0 }, -/*f9*/ { "stc", FALSE, NONE, 0, 0 }, -/*fa*/ { "cli", FALSE, NONE, 0, 0 }, -/*fb*/ { "sti", FALSE, NONE, 0, 0 }, -/*fc*/ { "cld", FALSE, NONE, 0, 0 }, -/*fd*/ { "std", FALSE, NONE, 0, 0 }, -/*fe*/ { "", TRUE, NONE, 0, (char *)db_Grp4 }, -/*ff*/ { "", TRUE, NONE, 0, (char *)db_Grp5 }, -}; - -struct inst db_bad_inst = - { "???", FALSE, NONE, 0, 0 } -; - -#define f_mod(byte) ((byte)>>6) -#define f_reg(byte) (((byte)>>3)&0x7) -#define f_rm(byte) ((byte)&0x7) - -#define sib_ss(byte) ((byte)>>6) -#define sib_index(byte) (((byte)>>3)&0x7) -#define sib_base(byte) ((byte)&0x7) - -char * db_index_reg_16[8] = { - "%bx,%si", - "%bx,%di", - "%bp,%si", - "%bp,%di", - "%si", - "%di", - "%bp", - "%bx" -}; - -char * db_reg[3][8] = { - "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh", - "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di", - "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi" -}; - -char * db_seg_reg[8] = { - "%es", "%cs", "%ss", "%ds", "%fs", "%gs", "", "" -}; - -/* - * lengths for size attributes - */ -int db_lengths[] = { - 1, /* BYTE */ - 2, /* WORD */ - 4, /* LONG */ - 8, /* QUAD */ - 4, /* SNGL */ - 8, /* DBLR */ - 10, /* EXTR */ -}; - -#define get_value_inc(result, loc, size, is_signed, task) \ - result = db_get_task_value((loc), (size), (is_signed), (task)); \ - (loc) += (size); - -/* - * Read address at location and return updated location. - */ -db_addr_t -db_read_address( - db_addr_t loc, - int short_addr, - int regmodrm, - struct i_addr *addrp, /* out */ - task_t task) -{ - int mod, rm, sib, index, ss, disp; - - mod = f_mod(regmodrm); - rm = f_rm(regmodrm); - - if (mod == 3) { - addrp->is_reg = TRUE; - addrp->disp = rm; - return (loc); - } - addrp->is_reg = FALSE; - addrp->index = 0; - - if (short_addr) { - addrp->index = 0; - addrp->ss = 0; - switch (mod) { - case 0: - if (rm == 6) { - get_value_inc(disp, loc, 2, TRUE, task); - addrp->disp = disp; - addrp->base = 0; - } - else { - addrp->disp = 0; - addrp->base = db_index_reg_16[rm]; - } - break; - case 1: - get_value_inc(disp, loc, 1, TRUE, task); - addrp->disp = disp; - addrp->base = db_index_reg_16[rm]; - break; - case 2: - get_value_inc(disp, loc, 2, TRUE, task); - addrp->disp = disp; - addrp->base = db_index_reg_16[rm]; - break; - } - } - else { - if (mod != 3 && rm == 4) { - get_value_inc(sib, loc, 1, FALSE, task); - rm = sib_base(sib); - index = sib_index(sib); - if (index != 4) - addrp->index = db_reg[LONG][index]; - addrp->ss = sib_ss(sib); - } - - switch (mod) { - case 0: - if (rm == 5) { - get_value_inc(addrp->disp, loc, 4, FALSE, task); - addrp->base = 0; - } - else { - addrp->disp = 0; - addrp->base = db_reg[LONG][rm]; - } - break; - - case 1: - get_value_inc(disp, loc, 1, TRUE, task); - addrp->disp = disp; - addrp->base = db_reg[LONG][rm]; - break; - - case 2: - get_value_inc(disp, loc, 4, FALSE, task); - addrp->disp = disp; - addrp->base = db_reg[LONG][rm]; - break; - } - } - return (loc); -} - -void -db_print_address( - char * seg, - int size, - struct i_addr *addrp, - task_t task) -{ - if (addrp->is_reg) { - db_printf("%s", db_reg[size][addrp->disp]); - return; - } - - if (seg) { - db_printf("%s:", seg); - } - - if (addrp->base != 0 || addrp->index != 0) { - db_printf("%#n", addrp->disp); - db_printf("("); - if (addrp->base) - db_printf("%s", addrp->base); - if (addrp->index) - db_printf(",%s,%d", addrp->index, 1<ss); - 
db_printf(")"); - } else - db_task_printsym((db_addr_t)addrp->disp, DB_STGY_ANY, task); -} - -/* - * Disassemble floating-point ("escape") instruction - * and return updated location. - */ -db_addr_t -db_disasm_esc( - db_addr_t loc, - int inst, - int short_addr, - int size, - char * seg, - task_t task) -{ - int regmodrm; - struct finst *fp; - int mod; - struct i_addr address; - char * name; - - get_value_inc(regmodrm, loc, 1, FALSE, task); - fp = &db_Esc_inst[inst - 0xd8][f_reg(regmodrm)]; - mod = f_mod(regmodrm); - if (mod != 3) { - /* - * Normal address modes. - */ - loc = db_read_address(loc, short_addr, regmodrm, &address, task); - db_printf(fp->f_name); - switch(fp->f_size) { - case SNGL: - db_printf("s"); - break; - case DBLR: - db_printf("l"); - break; - case EXTR: - db_printf("t"); - break; - case WORD: - db_printf("s"); - break; - case LONG: - db_printf("l"); - break; - case QUAD: - db_printf("q"); - break; - default: - break; - } - db_printf("\t"); - db_print_address(seg, BYTE, &address, task); - } - else { - /* - * 'reg-reg' - special formats - */ - switch (fp->f_rrmode) { - case op2(ST,STI): - name = (fp->f_rrname) ? fp->f_rrname : fp->f_name; - db_printf("%s\t%%st,%%st(%d)",name,f_rm(regmodrm)); - break; - case op2(STI,ST): - name = (fp->f_rrname) ? fp->f_rrname : fp->f_name; - db_printf("%s\t%%st(%d),%%st",name, f_rm(regmodrm)); - break; - case op1(STI): - name = (fp->f_rrname) ? fp->f_rrname : fp->f_name; - db_printf("%s\t%%st(%d)",name, f_rm(regmodrm)); - break; - case op1(X): - db_printf("%s", ((char **)fp->f_rrname)[f_rm(regmodrm)]); - break; - case op1(XA): - db_printf("%s\t%%ax", - ((char **)fp->f_rrname)[f_rm(regmodrm)]); - break; - default: - db_printf(""); - break; - } - } - - return (loc); -} - -/* - * Disassemble instruction at 'loc'. 'altfmt' specifies an - * (optional) alternate format. Return address of start of - * next instruction. 
- */ -db_addr_t -db_disasm( - db_addr_t loc, - boolean_t altfmt, - task_t task) -{ - int inst; - int size; - int short_addr; - char * seg; - struct inst * ip; - char * i_name; - int i_size; - int i_mode; - int regmodrm; - boolean_t first; - int displ; - int prefix; - int imm; - int imm2; - int len; - struct i_addr address; - char *filename; - int linenum; - - get_value_inc(inst, loc, 1, FALSE, task); - if (db_disasm_16) { - short_addr = TRUE; - size = WORD; - } - else { - short_addr = FALSE; - size = LONG; - } - seg = 0; - - /* - * Get prefixes - */ - prefix = TRUE; - do { - switch (inst) { - case 0x66: /* data16 */ - if (size == LONG) - size = WORD; - else - size = LONG; - break; - case 0x67: - short_addr = !short_addr; - break; - case 0x26: - seg = "%es"; - break; - case 0x36: - seg = "%ss"; - break; - case 0x2e: - seg = "%cs"; - break; - case 0x3e: - seg = "%ds"; - break; - case 0x64: - seg = "%fs"; - break; - case 0x65: - seg = "%gs"; - break; - case 0xf0: - db_printf("lock "); - break; - case 0xf2: - db_printf("repne "); - break; - case 0xf3: - db_printf("repe "); /* XXX repe VS rep */ - break; - default: - prefix = FALSE; - break; - } - if (prefix) { - get_value_inc(inst, loc, 1, FALSE, task); - } - } while (prefix); - - if (inst >= 0xd8 && inst <= 0xdf) { - loc = db_disasm_esc(loc, inst, short_addr, size, seg, task); - db_printf("\n"); - return (loc); - } - - if (inst == 0x0f) { - get_value_inc(inst, loc, 1, FALSE, task); - ip = db_inst_0f[inst>>4]; - if (ip == 0) { - ip = &db_bad_inst; - } - else { - ip = &ip[inst&0xf]; - } - } - else - ip = &db_inst_table[inst]; - - if (ip->i_has_modrm) { - get_value_inc(regmodrm, loc, 1, FALSE, task); - loc = db_read_address(loc, short_addr, regmodrm, &address, task); - } - - i_name = ip->i_name; - i_size = ip->i_size; - i_mode = ip->i_mode; - - if (ip->i_extra == (char *)db_Grp1 || - ip->i_extra == (char *)db_Grp2 || - ip->i_extra == (char *)db_Grp6 || - ip->i_extra == (char *)db_Grp7 || - ip->i_extra == (char *)db_Grp8) { - i_name = ((char **)ip->i_extra)[f_reg(regmodrm)]; - } - else if (ip->i_extra == (char *)db_Grp3) { - ip = (struct inst *)ip->i_extra; - ip = &ip[f_reg(regmodrm)]; - i_name = ip->i_name; - i_mode = ip->i_mode; - } - else if (ip->i_extra == (char *)db_Grp4 || - ip->i_extra == (char *)db_Grp5) { - ip = (struct inst *)ip->i_extra; - ip = &ip[f_reg(regmodrm)]; - i_name = ip->i_name; - i_mode = ip->i_mode; - i_size = ip->i_size; - } - - if (i_size == SDEP) { - if (size == WORD) - db_printf(i_name); - else - db_printf(ip->i_extra); - } - else { - db_printf(i_name); - if (i_size != NONE) { - if (i_size == BYTE) { - db_printf("b"); - size = BYTE; - } - else if (i_size == WORD) { - db_printf("w"); - size = WORD; - } - else if (size == WORD) - db_printf("w"); - else - db_printf("l"); - } - } - db_printf("\t"); - for (first = TRUE; - i_mode != 0; - i_mode >>= 8, first = FALSE) - { - if (!first) - db_printf(","); - - switch (i_mode & 0xFF) { - - case E: - db_print_address(seg, size, &address, task); - break; - - case Eind: - db_printf("*"); - db_print_address(seg, size, &address, task); - break; - - case Ew: - db_print_address(seg, WORD, &address, task); - break; - - case Eb: - db_print_address(seg, BYTE, &address, task); - break; - - case R: - db_printf("%s", db_reg[size][f_reg(regmodrm)]); - break; - - case Rw: - db_printf("%s", db_reg[WORD][f_reg(regmodrm)]); - break; - - case Ri: - db_printf("%s", db_reg[size][f_rm(inst)]); - break; - - case S: - db_printf("%s", db_seg_reg[f_reg(regmodrm)]); - break; - - case Si: - db_printf("%s", 
db_seg_reg[f_reg(inst)]); - break; - - case A: - db_printf("%s", db_reg[size][0]); /* acc */ - break; - - case BX: - if (seg) - db_printf("%s:", seg); - db_printf("(%s)", short_addr ? "%bx" : "%ebx"); - break; - - case CL: - db_printf("%%cl"); - break; - - case DX: - db_printf("%%dx"); - break; - - case SI: - if (seg) - db_printf("%s:", seg); - db_printf("(%s)", short_addr ? "%si" : "%esi"); - break; - - case DI: - db_printf("%%es:(%s)", short_addr ? "%di" : "%edi"); - break; - - case CR: - db_printf("%%cr%d", f_reg(regmodrm)); - break; - - case DR: - db_printf("%%dr%d", f_reg(regmodrm)); - break; - - case TR: - db_printf("%%tr%d", f_reg(regmodrm)); - break; - - case I: - len = db_lengths[size]; - get_value_inc(imm, loc, len, FALSE, task);/* unsigned */ - db_printf("$%#n", imm); - break; - - case Is: - len = db_lengths[size]; - get_value_inc(imm, loc, len, TRUE, task); /* signed */ - db_printf("$%#r", imm); - break; - - case Ib: - get_value_inc(imm, loc, 1, FALSE, task); /* unsigned */ - db_printf("$%#n", imm); - break; - - case Ibs: - get_value_inc(imm, loc, 1, TRUE, task); /* signed */ - db_printf("$%#r", imm); - break; - - case Iw: - get_value_inc(imm, loc, 2, FALSE, task); /* unsigned */ - db_printf("$%#n", imm); - break; - - case Il: - get_value_inc(imm, loc, 4, FALSE, task); - db_printf("$%#n", imm); - break; - - case O: - if (short_addr) { - get_value_inc(displ, loc, 2, TRUE, task); - } - else { - get_value_inc(displ, loc, 4, TRUE, task); - } - if (seg) - db_printf("%s:%#r",seg, displ); - else - db_task_printsym((db_addr_t)displ, DB_STGY_ANY, task); - break; - - case Db: - get_value_inc(displ, loc, 1, TRUE, task); - if (short_addr) { - /* offset only affects low 16 bits */ - displ = (loc & 0xffff0000) - | ((loc + displ) & 0xffff); - } - else - displ = displ + loc; - db_task_printsym((db_addr_t)displ,DB_STGY_ANY,task); - if (db_line_at_pc(0, &filename, &linenum, displ)) { - db_printf(" [%s", filename); - if (linenum > 0) - db_printf(":%d", linenum); - db_printf("]"); - } - break; - - case Dl: - if (short_addr) { - get_value_inc(displ, loc, 2, TRUE, task); - /* offset only affects low 16 bits */ - displ = (loc & 0xffff0000) - | ((loc + displ) & 0xffff); - } - else { - get_value_inc(displ, loc, 4, TRUE, task); - displ = displ + loc; - } - db_task_printsym((db_addr_t)displ, DB_STGY_ANY, task); - if (db_line_at_pc(0, &filename, &linenum, displ)) { - db_printf(" [%s", filename); - if (linenum > 0) - db_printf(":%d", linenum); - db_printf("]"); - } - break; - - case o1: - db_printf("$1"); - break; - - case o3: - db_printf("$3"); - break; - - case OS: - if (short_addr) { - get_value_inc(imm, loc, 2, FALSE, task); /* offset */ - } - else { - get_value_inc(imm, loc, 4, FALSE, task); /* offset */ - } - get_value_inc(imm2, loc, 2, FALSE, task); /* segment */ - db_printf("$%#n,%#n", imm2, imm); - break; - } - } - - if (altfmt == 0 && !db_disasm_16) { - if (inst == 0xe9 || inst == 0xeb) { /* jmp, Dl or Db */ - /* - * GAS pads to longword boundary after unconditional jumps. - */ - while (loc & (4-1)) { - get_value_inc(inst, loc, 0, FALSE, task); - if (inst != 0x90) /* nop */ - break; - loc++; - } - } - } - db_printf("\n"); - return (loc); -} - -/* - * Classify instructions by whether they read or write memory. 
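 *
 * Each opcode indexes a byte in db_ldstrtab[] (db_ldstrtab0f[] after
 * a 0f escape): the low bits say load and/or store, and the DBLS_*
 * high bits say how to refine that from the bytes that follow. Two
 * worked examples against the tables below: opcode 0x00, "add %r,E",
 * carries 0x12 = DBLS_MODRM|DBLS_STORE, so it counts as a store only
 * when the mod field of its mod r/m byte is not 3 (register-direct);
 * and db_inst_load(0x0004458b), i.e. "mov 4(%ebp),%eax" (8b 45 04,
 * packed low byte first), finds 0x11 = DBLS_MODRM|DBLS_LOAD, sees
 * mod(0x45) == 1, and returns 1.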
- */ - -#define DBLS_LOAD 0x01 /* instruction reads from memory */ -#define DBLS_STORE 0x02 /* instruction writes to memory */ - -#define DBLS_MODRM 0x10 /* instruction uses mod r/m byte */ -#define DBLS_SECOND 0x20 /* instruction does two operations */ -#define DBLS_ESCAPE 0x40 /* escape to two-byte opcodes */ -#define DBLS_SWREG 0x80 /* need to switch on reg bits of mod r/m */ - -#define DBLS_MODS 0xf0 -#define DBLS_LMASK (DBLS_MODS|DBLS_LOAD) -#define DBLS_SMASK (DBLS_MODS|DBLS_STORE) - -char db_ldstrtab[] = { - 0x12, 0x12, 0x11, 0x11, 0x00, 0x00, 0x02, 0x01, - 0x12, 0x12, 0x11, 0x11, 0x00, 0x00, 0x02, 0x40, - 0x12, 0x12, 0x11, 0x11, 0x00, 0x00, 0x02, 0x01, - 0x12, 0x12, 0x11, 0x11, 0x00, 0x00, 0x02, 0x01, - 0x12, 0x12, 0x11, 0x11, 0x00, 0x00, 0x00, 0x00, - 0x12, 0x12, 0x11, 0x11, 0x00, 0x00, 0x00, 0x00, - 0x12, 0x12, 0x11, 0x11, 0x00, 0x00, 0x00, 0x00, - 0x12, 0x12, 0x11, 0x11, 0x00, 0x00, 0x00, 0x00, - - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x01, 0x21, 0x13, 0x00, 0x00, 0x00, 0x00, - 0x02, 0x11, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - - 0x12, 0x12, 0x00, 0x12, 0x11, 0x11, 0x13, 0x13, - 0x12, 0x12, 0x11, 0x11, 0x12, 0x00, 0x11, 0x03, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x02, 0x00, 0x02, 0x01, 0x00, 0x00, - 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x21, 0x21, - 0x00, 0x00, 0x02, 0x02, 0x01, 0x01, 0x01, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - - 0x13, 0x13, 0x00, 0x00, 0x01, 0x01, 0x12, 0x12, - 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x13, 0x13, 0x13, 0x13, 0x00, 0x00, 0x00, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x13, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x13, -}; - -unsigned char db_ldstrtab0f[] = { - 0x80, 0x80, 0x11, 0x11, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, - 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, - 0x02, 0x01, 0x00, 0x11, 0x13, 0x13, 0x00, 0x00, - 0x02, 0x01, 0x12, 0x13, 0x13, 0x13, 0x00, 0x11, - 0x00, 0x00, 0x01, 0x13, 0x01, 0x01, 0x11, 0x11, - 0x00, 0x00, 0x80, 0x13, 0x13, 0x13, 0x11, 0x11, - - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -}; - -int db_inst_swreg(boolean_t, unsigned long, unsigned char); - -/* - * Given four bytes of instruction (stored as an int, not an - * array of characters), compute if the instruction reads - * memory. - */ -int -db_inst_load( - unsigned long insw) -{ - unsigned char insb, bits; - - insb = insw & 0xff; - insw >>= 8; - bits = db_ldstrtab[insb]; - if (!(bits & DBLS_LOAD)) - return (0); - while (1) { - switch (bits & DBLS_MODS) { - case 0: - return (1); - case DBLS_MODRM: - insb = insw & 0xff; - return ((insb & 0xc0) != 0xc0); - case DBLS_SECOND|DBLS_MODRM: - insb = insw & 0xff; - return ((insb & 0xc0) != 0xc0 ? 2 : 0); - case DBLS_SECOND: - return (2); - case DBLS_ESCAPE: - insb = insw & 0xff; - insw >>= 8; - bits = db_ldstrtab0f[insb]; - break; - case DBLS_SWREG: - return (db_inst_swreg(TRUE, insw, insb)); - default: - panic ("db_inst_load: unknown mod bits"); - } - } -} - -/* - * Given four bytes of instruction (stored as an int, not an - * array of characters), compute if the instruction writes - * memory. - */ -int -db_inst_store( - unsigned long insw) -{ - unsigned char insb, bits; - - insb = insw & 0xff; - insw >>= 8; - bits = db_ldstrtab[insb]; - if (!(bits & DBLS_STORE)) - return (0); - while (1) { - switch (bits & DBLS_MODS) { - case 0: - return (1); - case DBLS_MODRM: - insb = insw & 0xff; - return ((insb & 0xc0) != 0xc0); - case DBLS_SECOND|DBLS_MODRM: - insb = insw & 0xff; - return ((insb & 0xc0) != 0xc0 ? 2 : 0); - case DBLS_SECOND: - return (2); - case DBLS_ESCAPE: - insb = insw & 0xff; - insw >>= 8; - bits = db_ldstrtab0f[insb]; - break; - case DBLS_SWREG: - return (db_inst_swreg(FALSE, insw, insb)); - default: - panic ("db_inst_store: unknown mod bits"); - } - } -} - -/* - * Parse a mod r/m byte to see if extended opcode reads - * or writes memory. - */ -int -db_inst_swreg( - boolean_t isload, - unsigned long insw, - unsigned char insb) -{ - unsigned char modrm = insw & 0xff; - - switch (insb) { - case 0x00: - switch (modrm & 0x38) { - case 0x00: - case 0x08: - case 0x10: - case 0x18: - return ((modrm & 0xc0) != 0xc0); - } - break; - case 0x01: - switch (modrm & 0x38) { - case 0x00: - case 0x08: - case 0x10: - case 0x18: - return ((modrm & 0xc0) != 0xc0 ? 2 : 0); - case 0x20: - case 0x30: - return ((modrm & 0xc0) != 0xc0); - } - break; - case 0xba: - if (isload) - return ((modrm & 0xc0) != 0xc0); - switch (modrm & 0x38) { - case 0x28: - case 0x30: - case 0x38: - return ((modrm & 0xc0) != 0xc0); - } - break; - } - return (0); -} diff --git a/osfmk/i386/db_gcc_aout.c b/osfmk/i386/db_gcc_aout.c deleted file mode 100644 index 508146b96..000000000 --- a/osfmk/i386/db_gcc_aout.c +++ /dev/null @@ -1,687 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * COPYRIGHT NOTICE - * - * Copyright (c) 1990, 1991, 1992, 1993 Open Software Foundation, Inc. - * - * Permission is hereby granted to use, copy, modify and freely distribute - * the software in this file and its documentation for any purpose without - * fee, provided that the above copyright notice appears in all copies and - * that both the copyright notice and this permission notice appear in - * supporting documentation. Further, provided that the name of Open - * Software Foundation, Inc. ("OSF") not be used in advertising or - * publicity pertaining to distribution of the software without prior - * written permission from OSF. OSF makes no representations about the - * suitability of this software for any purpose. It is provided "as is" - * without express or implied warranty. - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:36 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:25:37 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.2.2.3 1994/01/28 17:23:00 chasb - * Expand Copyrights - * [1994/01/27 19:40:16 chasb] - * - * Revision 1.2.2.2 1993/06/09 02:27:36 gm - * Added to OSF/1 R1.3 from NMK15.0. - * [1993/06/02 21:04:03 jeffc] - * - * Revision 1.2 1993/04/19 16:13:10 devrcs - * pick up file_io.h from bootstrap directory - * [1993/02/27 15:01:09 david] - * - * Added new arguments and a missing one to db_add_symbol_table - * [barbou@gr.osf.org] - * [92/12/03 bernadat] - * - * Added gcc symbol table handling based on db_aout.c (Revsion 2.4) - * [91/07/31 tak] - * - * Revision 1.1 1992/09/30 02:02:23 robert - * Initial revision - * - * $EndLog$ - */ -/* CMU_HIST */ -/* - * Revision 2.1 91/07/31 13:13:51 jeffreyh - * Created. - * - * 31-Jul-91 Jeffrey Heller (tak) at Open Software Foundation - * Added gcc symbol table handling based on db_aout.c (Revsion 2.4) - * - */ -/* CMU_ENDHIST */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * Symbol table routines for a.out format files. - */ - -#include -#include /* data types */ -#include - -#ifdef DB_GCC_AOUT - -#include /* a.out symbol table */ -#include - -/* - * An a.out symbol table as loaded into the kernel debugger: - * - * symtab -> size of symbol entries, in bytes - * sp -> first symbol entry - * ... - * ep -> last symbol entry + 1 - * strtab == start of string table - * size of string table in bytes, - * including this word - * -> strings - */ - -/* - * Find pointers to the start and end of the symbol entries, - * given a pointer to the start of the symbol table. - */ -#define db_get_aout_symtab(symtab, sp, ep) \ - (sp = (struct nlist *)((symtab) + 1), \ - ep = (struct nlist *)((char *)sp + *(symtab))) - -X_db_sym_init(symtab, esymtab, name) - int * symtab; /* pointer to start of symbol table */ - char * esymtab; /* pointer to end of string table, - for checking - rounded up to integer - boundary */ - char * name; -{ - register struct nlist *sym_start, *sym_end; - register struct nlist *sp; - register char * strtab; - register int strlen; - - db_get_aout_symtab(symtab, sym_start, sym_end); - - strtab = (char *)sym_end; - strlen = *(int *)strtab; - - if (strtab + ((strlen + sizeof(int) - 1) & ~(sizeof(int)-1)) - != esymtab) - { - db_printf("[ %s symbol table not valid ]\n", name); - return; - } - - db_printf("[ preserving %#x bytes of %s symbol table ]\n", - esymtab - (char *)symtab, name); - - for (sp = sym_start; sp < sym_end; sp++) { - register int strx; - strx = sp->n_un.n_strx; - if (strx != 0) { - if (strx > strlen) { - db_printf("Bad string table index (%#x)\n", strx); - sp->n_un.n_name = 0; - continue; - } - sp->n_un.n_name = strtab + strx; - } - } - - db_add_symbol_table(sym_start, sym_end, name, (char *)symtab, - 0, 0, 0, FALSE); -} - -/* - * check file name or not (check xxxx.x pattern) - */ -boolean_t -X_db_is_filename(name) - register char *name; -{ - while (*name) { - if (*name == '.') { - if (name[1]) - return(TRUE); - } - name++; - } - return(FALSE); -} - -/* - * special name comparison routine with a name in the symbol table entry - */ -boolean_t -X_db_eq_name(sp, name) - struct nlist *sp; - char *name; -{ - register char *s1, *s2; - - s1 = sp->n_un.n_name; - s2 = name; - if (*s1 == '_' && *s2 && *s2 != '_') - s1++; - while (*s2) { - if (*s1++ != *s2++) { - /* - * check .c .o file name comparison case - */ - if (*s2 == 0 && sp->n_un.n_name <= s1 - 2 - && s1[-2] == '.' && s1[-1] == 'o') - return(TRUE); - return(FALSE); - } - } - /* - * do special check for - * xxx:yyy for N_FUN - * xxx.ttt for N_DATA and N_BSS - */ - return(*s1 == 0 || (*s1 == ':' && sp->n_type == N_FUN) || - (*s1 == '.' 
&& (sp->n_type == N_DATA || sp->n_type == N_BSS))); -} - -/* - * search a symbol table with name and type - * fp(in,out): last found text file name symbol entry - */ -struct nlist * -X_db_search_name(sp, ep, name, type, fp) - register struct nlist *sp; - struct nlist *ep; - char *name; - int type; - struct nlist **fp; -{ - struct nlist *file_sp = *fp; - struct nlist *found_sp = 0; - - for ( ; sp < ep; sp++) { - if (sp->n_type == N_TEXT && X_db_is_filename(sp->n_un.n_name)) - *fp = sp; - if (type) { - if (sp->n_type == type) { - if (X_db_eq_name(sp, name)) - return(sp); - } - if (sp->n_type == N_SO) - *fp = sp; - continue; - } - if (sp->n_type & N_STAB) - continue; - if (sp->n_un.n_name && X_db_eq_name(sp, name)) { - /* - * In case of qaulified search by a file, - * return it immediately with some check. - * Otherwise, search external one - */ - if (file_sp) { - if ((file_sp == *fp) || (sp->n_type & N_EXT)) - return(sp); - } else if (sp->n_type & N_EXT) - return(sp); - else - found_sp = sp; - } - } - return(found_sp); -} - -/* - * search a symbol with file, func and line qualification - */ -struct nlist * -X_db_qualified_search(stab, file, sym, line) - db_symtab_t *stab; - char *file; - char *sym; - int line; -{ - register struct nlist *sp = (struct nlist *)stab->start; - struct nlist *ep = (struct nlist *)stab->end; - struct nlist *fp = 0; - struct nlist *found_sp; - unsigned func_top; - boolean_t in_file; - - if (file == 0 && sym == 0) - return(0); - if (file) { - if ((sp = X_db_search_name(sp, ep, file, N_TEXT, &fp)) == 0) - return(0); - } - if (sym) { - sp = X_db_search_name(sp, ep, sym, (line > 0)? N_FUN: 0, &fp); - if (sp == 0) - return(0); - } - if (line > 0) { - if (file && !X_db_eq_name(fp, file)) - return(0); - found_sp = 0; - if (sp->n_type == N_FUN) { - /* - * qualfied by function name - * search backward because line number entries - * for the function are above it in this case. 
- */ - func_top = sp->n_value; - for (sp--; sp >= (struct nlist *)stab->start; sp--) { - if (sp->n_type != N_SLINE) - continue; - if (sp->n_value < func_top) - break; - if (sp->n_desc <= line) { - if (found_sp == 0 || found_sp->n_desc < sp->n_desc) - found_sp = sp; - if (sp->n_desc == line) - break; - } - } - if (sp->n_type != N_SLINE || sp->n_value < func_top) - return(0); - } else { - /* - * qualified by only file name - * search forward in this case - */ - in_file = TRUE; - for (sp++; sp < ep; sp++) { - if (sp->n_type == N_TEXT - && X_db_is_filename(sp->n_un.n_name)) - break; /* enter into another file */ - if (sp->n_type == N_SOL) { - in_file = X_db_eq_name(sp, file); - continue; - } - if (!in_file || sp->n_type != N_SLINE) - continue; - if (sp->n_desc <= line) { - if (found_sp == 0 || found_sp->n_desc < sp->n_desc) - found_sp = sp; - if (sp->n_desc == line) - break; - } - } - } - sp = found_sp; - } - return(sp); -} - -/* - * lookup symbol by name - */ -db_sym_t -X_db_lookup(stab, symstr) - db_symtab_t *stab; - char * symstr; -{ - register char *p; - register n; - int n_name; - int line_number; - char *file_name = 0; - char *sym_name = 0; - char *component[3]; - struct nlist *found = 0; - - /* - * disassemble component: [file_name:]symbol[:line_nubmer] - */ - component[0] = symstr; - component[1] = component[2] = 0; - for (p = symstr, n = 1; *p; p++) { - if (*p == ':') { - if (n >= 3) - break; - *p = 0; - component[n++] = p+1; - } - } - if (*p != 0) - goto out; - line_number = 0; - n_name = n; - p = component[n-1]; - if (*p >= '0' && *p <= '9') { - if (n == 1) - goto out; - for (line_number = 0; *p; p++) { - if (*p < '0' || *p > '9') - goto out; - line_number = line_number*10 + *p - '0'; - } - n_name--; - } else if (n >= 3) - goto out; - if (n_name == 1) { - if (X_db_is_filename(component[0])) { - file_name = component[0]; - sym_name = 0; - } else { - file_name = 0; - sym_name = component[0]; - } - } else { - file_name = component[0]; - sym_name = component[1]; - } - found = X_db_qualified_search(stab, file_name, sym_name, line_number); - -out: - while (--n > 1) - component[n][-1] = ':'; - return((db_sym_t) found); -} - -db_sym_t -X_db_search_symbol(symtab, off, strategy, diffp) - db_symtab_t * symtab; - register - db_addr_t off; - db_strategy_t strategy; - db_expr_t *diffp; /* in/out */ -{ - register unsigned int diff = *diffp; - register struct nlist *symp = 0; - register struct nlist *sp, *ep; - - sp = (struct nlist *)symtab->start; - ep = (struct nlist *)symtab->end; - - for (; sp < ep; sp++) { - if (sp->n_un.n_name == 0) - continue; - if ((sp->n_type & N_STAB) != 0) - continue; - if (off >= sp->n_value) { - if (off - sp->n_value < diff) { - diff = off - sp->n_value; - symp = sp; - if (diff == 0 && (sp->n_type & N_EXT)) - break; - } - else if (off - sp->n_value == diff) { - if (symp == 0) - symp = sp; - else if ((symp->n_type & N_EXT) == 0 && - (sp->n_type & N_EXT) != 0) - symp = sp; /* pick the external symbol */ - } - } - } - if (symp == 0) { - *diffp = off; - } - else { - *diffp = diff; - } - return ((db_sym_t)symp); -} - -/* - * Return the name and value for a symbol. 
- */ -void -X_db_symbol_values(sym, namep, valuep) - db_sym_t sym; - char **namep; - db_expr_t *valuep; -{ - register struct nlist *sp; - - sp = (struct nlist *)sym; - if (namep) - *namep = sp->n_un.n_name; - if (valuep) - *valuep = sp->n_value; -} - -#define X_DB_MAX_DIFF 8 /* maximum allowable diff at the end of line */ - -/* - * search symbol by value - */ -X_db_search_by_addr(stab, addr, file, func, line, diff) - db_symtab_t *stab; - register unsigned addr; - char **file; - char **func; - int *line; - unsigned *diff; -{ - register struct nlist *sp; - register struct nlist *line_sp, *func_sp, *file_sp, *line_func; - register func_diff, line_diff; - boolean_t found_line = FALSE; - struct nlist *ep = (struct nlist *)stab->end; - - line_sp = func_sp = file_sp = line_func = 0; - *file = *func = 0; - *line = 0; - for (sp = (struct nlist *)stab->start; sp < ep; sp++) { - switch(sp->n_type) { - case N_SLINE: - if (sp->n_value <= addr) { - if (line_sp == 0 || line_diff >= addr - sp->n_value) { - if (line_func) - line_func = 0; - line_sp = sp; - line_diff = addr - sp->n_value; - } - } - if (sp->n_value >= addr && line_sp) - found_line = TRUE; - continue; - case N_FUN: - if ((found_line || (line_sp && line_diff < X_DB_MAX_DIFF)) - && line_func == 0) - line_func = sp; - continue; - case N_TEXT: - if (X_db_is_filename(sp->n_un.n_name)) { - if (sp->n_value > addr) - continue; - if (file_sp == 0 || file_sp->n_value < sp->n_value) - file_sp = sp; - } else if (sp->n_value <= addr && - (func_sp == 0 || func_diff > addr - sp->n_value)) { - func_sp = sp; - func_diff = addr - sp->n_value; - } - continue; - case N_TEXT|N_EXT: - if (sp->n_value <= addr && - (func_sp == 0 || func_diff >= addr - sp->n_value)) { - func_sp = sp; - func_diff = addr - sp->n_value; - if (func_diff == 0 && file_sp && func_sp) - break; - } - default: - continue; - } - break; - } - if (line_sp) { - if (line_func == 0 || func_sp == 0 - || line_func->n_value != func_sp->n_value) - line_sp = 0; - } - if (file_sp) { - *diff = addr - file_sp->n_value; - *file = file_sp->n_un.n_name; - } - if (func_sp) { - *diff = addr - func_sp->n_value; - *func = (func_sp->n_un.n_name[0] == '_')? - func_sp->n_un.n_name + 1: func_sp->n_un.n_name; - } - if (line_sp) { - *diff = addr - line_sp->n_value; - *line = line_sp->n_desc; - } - return(file_sp || func_sp || line_sp); -} - -/* ARGSUSED */ -boolean_t -X_db_line_at_pc(stab, sym, file, line, pc) - db_symtab_t *stab; - db_sym_t sym; - char **file; - int *line; - db_expr_t pc; -{ - char *func; - unsigned diff; - boolean_t found; - - found = X_db_search_by_addr(stab,(unsigned)pc,file,&func,line,&diff); - return(found && func && *file); -} - -/* - * Initialization routine for a.out files. - */ -kdb_init() -{ - extern char *esym; - extern int end; - - if (esym > (char *)&end) { - X_db_sym_init((int *)&end, esym, "mach"); - } -} - -/* - * Read symbol table from file. 
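 * The buffer built here follows the layout X_db_sym_init() parses:
 * a leading int giving the size of the nlist entries in bytes, then
 * the entries themselves, then the string table, whose own first int
 * is its size.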
- * (should be somewhere else) - */ -#include -#include - -read_symtab_from_file(fp, symtab_name) - struct file *fp; - char * symtab_name; -{ - vm_size_t resid; - kern_return_t result; - vm_offset_t symoff; - vm_size_t symsize; - vm_offset_t stroff; - vm_size_t strsize; - vm_size_t table_size; - vm_offset_t symtab; - - if (!get_symtab(fp, &symoff, &symsize)) { - boot_printf("[ error %d reading %s file header ]\n", - result, symtab_name); - return; - } - - stroff = symoff + symsize; - result = read_file(fp, (vm_offset_t)stroff, - (vm_offset_t)&strsize, sizeof(strsize), &resid); - if (result || resid) { - boot_printf("[ no valid symbol table present for %s ]\n", - symtab_name); - return; - } - - table_size = sizeof(int) + symsize + strsize; - table_size = (table_size + sizeof(int)-1) & ~(sizeof(int)-1); - - result = kmem_alloc_kobject(kernel_map, &symtab, table_size); - if (result) { - boot_printf("[ error %d allocating space for %s symbol table ]\n", - result, symtab_name); - return; - } - - *(int *)symtab = symsize; - - result = read_file(fp, symoff, - symtab + sizeof(int), symsize, &resid); - if (result || resid) { - boot_printf("[ error %d reading %s symbol table ]\n", - result, symtab_name); - return; - } - - result = read_file(fp, stroff, - symtab + sizeof(int) + symsize, strsize, &resid); - if (result || resid) { - boot_printf("[ error %d reading %s string table ]\n", - result, symtab_name); - return; - } - - X_db_sym_init((int *)symtab, - (char *)(symtab + table_size), - symtab_name); - -} - -#endif /* DB_GCC_AOUT */ diff --git a/osfmk/i386/db_interface.c b/osfmk/i386/db_interface.c deleted file mode 100644 index 9e76b5406..000000000 --- a/osfmk/i386/db_interface.c +++ /dev/null @@ -1,1027 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. 
- * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ - -/* - * Interface to new debugger. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -int db_active = 0; -x86_saved_state32_t *i386_last_saved_statep; -x86_saved_state32_t i386_nested_saved_state; -unsigned i386_last_kdb_sp; -db_regs_t ddb_regs; /* register state */ - -extern thread_t db_default_act; -extern pt_entry_t *DMAP1; -extern caddr_t DADDR1; - -#if MACH_MP_DEBUG -extern int masked_state_cnt[]; -#endif /* MACH_MP_DEBUG */ - -/* - * Enter KDB through a keyboard trap. - * We show the registers as of the keyboard interrupt - * instead of those at its call to KDB. - */ -struct int_regs { - int gs; - int fs; - int edi; - int esi; - int ebp; - int ebx; - x86_saved_state32_t *is; -}; - -extern char * trap_type[]; -extern int TRAP_TYPES; - -/* Forward */ - -extern void kdbprinttrap( - int type, - int code, - int *pc, - int sp); -extern void kdb_kentry( - struct int_regs *int_regs); -extern int db_user_to_kernel_address( - task_t task, - vm_offset_t addr, - unsigned *kaddr, - int flag); -extern void db_write_bytes_user_space( - vm_offset_t addr, - int size, - char *data, - task_t task); -extern int db_search_null( - task_t task, - unsigned *svaddr, - unsigned evaddr, - unsigned *skaddr, - int flag); -extern int kdb_enter(int); -extern void kdb_leave(void); -extern void lock_kdb(void); -extern void unlock_kdb(void); - -/* - * kdb_trap - field a TRACE or BPT trap - */ - - -extern jmp_buf_t *db_recover; - -/* - * Translate the state saved in a task state segment into an - * exception frame. Since we "know" we always want the state - * in a ktss, we hard-wire that in, rather than indexing the gdt - * with tss_sel to derive a pointer to the desired tss. 
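 * (The frame has no slots of its own for esp0 and ss0, so they are
 * stashed in the cr2/err and trapno fields; see the XXX notes below.)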
- */ - -/* - * Code used to synchronize kdb among all cpus, one active at a time, switch - * from one to another using cpu #cpu - */ - -decl_simple_lock_data(, kdb_lock) /* kdb lock */ - -#define db_simple_lock_init(l, e) hw_lock_init(&((l)->interlock)) -#define db_simple_lock_try(l) hw_lock_try(&((l)->interlock)) -#define db_simple_unlock(l) hw_lock_unlock(&((l)->interlock)) - -int kdb_cpu = -1; /* current cpu running kdb */ -int kdb_debug = 1; -volatile unsigned int cpus_holding_bkpts; /* counter for number of cpus - * holding breakpoints - */ -extern boolean_t db_breakpoints_inserted; - -void -db_tss_to_frame( - int tss_sel, - x86_saved_state32_t *regs) -{ - extern struct i386_tss ktss; - int mycpu = cpu_number(); - struct i386_tss *tss; - - tss = cpu_datap(mycpu)->cpu_desc_index.cdi_ktss; /* XXX */ - - /* - * ddb will overwrite whatever's in esp, so put esp0 elsewhere, too. - */ - regs->cr2 = tss->esp0; - regs->efl = tss->eflags; - regs->eip = tss->eip; - regs->trapno = tss->ss0; /* XXX */ - regs->err = tss->esp0; /* XXX */ - regs->eax = tss->eax; - regs->ecx = tss->ecx; - regs->edx = tss->edx; - regs->ebx = tss->ebx; - regs->uesp = tss->esp; - regs->ebp = tss->ebp; - regs->esi = tss->esi; - regs->edi = tss->edi; - regs->es = tss->es; - regs->ss = tss->ss; - regs->cs = tss->cs; - regs->ds = tss->ds; - regs->fs = tss->fs; - regs->gs = tss->gs; -} - -/* - * Compose a call to the debugger from the saved state in regs. (No - * reason not to do this in C.) - */ -boolean_t -db_trap_from_asm( - x86_saved_state32_t *regs) -{ - int code; - int type; - - type = regs->trapno; - code = regs->err; - return (kdb_trap(type, code, regs)); -} - -int -kdb_trap( - int type, - int code, - x86_saved_state32_t *regs) -{ - extern char etext; - boolean_t trap_from_user; - spl_t s; - int previous_console_device; - - s = splhigh(); - - previous_console_device = switch_to_serial_console(); - - db_printf("kdb_trap(): type %d, code %d, regs->eip 0x%x\n", type, code, regs->eip); - switch (type) { - case T_DEBUG: /* single_step */ - { - extern int dr_addr[]; - int addr; - uint32_t status; - - __asm__ volatile ("movl %%dr6, %0" : "=r" (status)); - - if (status & 0xf) { /* hmm hdw break */ - addr = status & 0x8 ? dr_addr[3] : - status & 0x4 ? dr_addr[2] : - status & 0x2 ? dr_addr[1] : - dr_addr[0]; - regs->efl |= EFL_RF; - db_single_step_cmd(addr, 0, 1, "p"); - } - } - case T_INT3: /* breakpoint */ - case T_WATCHPOINT: /* watchpoint */ - case -1: /* keyboard interrupt */ - break; - - default: - if (db_recover) { - i386_nested_saved_state = *regs; - db_printf("Caught "); - if (type < 0 || type > TRAP_TYPES) - db_printf("type %d", type); - else - db_printf("%s", trap_type[type]); - db_printf(" trap, code = %x, pc = %x\n", - code, regs->eip); - splx(s); - db_error(""); - /*NOTREACHED*/ - } - kdbprinttrap(type, code, (int *)&regs->eip, regs->uesp); - } - - disable_preemption(); - - current_cpu_datap()->cpu_kdb_saved_ipl = s; - current_cpu_datap()->cpu_kdb_saved_state = regs; - - i386_last_saved_statep = regs; - i386_last_kdb_sp = (unsigned) &type; - - if (!kdb_enter(regs->eip)) - goto kdb_exit; - - /* Should switch to kdb's own stack here.
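 * (Running on the interrupted thread's stack leaves the debugger
 * exposed to overflowing it; a dedicated stack would avoid that.)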
*/ - - if (!IS_USER_TRAP(regs, &etext)) { - bzero((char *)&ddb_regs, sizeof (ddb_regs)); - *(struct x86_saved_state32_from_kernel *)&ddb_regs = - *(struct x86_saved_state32_from_kernel *)regs; - trap_from_user = FALSE; - } - else { - ddb_regs = *regs; - trap_from_user = TRUE; - } - if (!trap_from_user) { - /* - * Kernel mode - esp and ss not saved - */ - ddb_regs.uesp = (int)®s->uesp; /* kernel stack pointer */ - ddb_regs.ss = KERNEL_DS; - } - - db_active++; - db_task_trap(type, code, trap_from_user); - db_active--; - - regs->eip = ddb_regs.eip; - regs->efl = ddb_regs.efl; - regs->eax = ddb_regs.eax; - regs->ecx = ddb_regs.ecx; - regs->edx = ddb_regs.edx; - regs->ebx = ddb_regs.ebx; - - if (trap_from_user) { - /* - * user mode - saved esp and ss valid - */ - regs->uesp = ddb_regs.uesp; /* user stack pointer */ - regs->ss = ddb_regs.ss & 0xffff; /* user stack segment */ - } - - regs->ebp = ddb_regs.ebp; - regs->esi = ddb_regs.esi; - regs->edi = ddb_regs.edi; - regs->es = ddb_regs.es & 0xffff; - regs->cs = ddb_regs.cs & 0xffff; - regs->ds = ddb_regs.ds & 0xffff; - regs->fs = ddb_regs.fs & 0xffff; - regs->gs = ddb_regs.gs & 0xffff; - - if ((type == T_INT3) && - (db_get_task_value(regs->eip, - BKPT_SIZE, - FALSE, - db_target_space(current_thread(), - trap_from_user)) - == BKPT_INST)) - regs->eip += BKPT_SIZE; - - switch_to_old_console(previous_console_device); -kdb_exit: - kdb_leave(); - - current_cpu_datap()->cpu_kdb_saved_state = 0; - - enable_preemption(); - - splx(s); - - /* Allow continue to upper layers of exception handling if - * trap was not a debugging trap. - */ - - if (trap_from_user && type != T_DEBUG && type != T_INT3 - && type != T_WATCHPOINT) - return 0; - else - return (1); -} - -/* - * Enter KDB through a keyboard trap. - * We show the registers as of the keyboard interrupt - * instead of those at its call to KDB. 
- */
-
-spl_t kdb_oldspl;
-
-void
-kdb_kentry(
-    struct int_regs *int_regs)
-{
-    extern char etext;
-    boolean_t trap_from_user;
-    x86_saved_state32_t *is = int_regs->is;
-    x86_saved_state32_t regs;
-    spl_t s;
-
-    s = splhigh();
-    kdb_oldspl = s;
-
-    if (IS_USER_TRAP(is, &etext))
-    {
-        regs.uesp = ((int *)(is+1))[0];
-        regs.ss = ((int *)(is+1))[1];
-    }
-    else {
-        regs.ss = KERNEL_DS;
-        regs.uesp = (int)(is+1);
-    }
-    regs.efl = is->efl;
-    regs.cs = is->cs;
-    regs.eip = is->eip;
-    regs.eax = is->eax;
-    regs.ecx = is->ecx;
-    regs.edx = is->edx;
-    regs.ebx = int_regs->ebx;
-    regs.ebp = int_regs->ebp;
-    regs.esi = int_regs->esi;
-    regs.edi = int_regs->edi;
-    regs.ds = is->ds;
-    regs.es = is->es;
-    regs.fs = int_regs->fs;
-    regs.gs = int_regs->gs;
-
-    disable_preemption();
-
-    current_cpu_datap()->cpu_kdb_saved_state = &regs;
-
-    if (!kdb_enter(regs.eip))
-        goto kdb_exit;
-
-    bcopy((char *)&regs, (char *)&ddb_regs, sizeof (ddb_regs));
-    trap_from_user = IS_USER_TRAP(&ddb_regs, &etext);
-
-    db_active++;
-    db_task_trap(-1, 0, trap_from_user);
-    db_active--;
-
-    if (trap_from_user) {
-        ((int *)(is+1))[0] = ddb_regs.uesp;
-        ((int *)(is+1))[1] = ddb_regs.ss & 0xffff;
-    }
-    is->efl = ddb_regs.efl;
-    is->cs = ddb_regs.cs & 0xffff;
-    is->eip = ddb_regs.eip;
-    is->eax = ddb_regs.eax;
-    is->ecx = ddb_regs.ecx;
-    is->edx = ddb_regs.edx;
-    int_regs->ebx = ddb_regs.ebx;
-    int_regs->ebp = ddb_regs.ebp;
-    int_regs->esi = ddb_regs.esi;
-    int_regs->edi = ddb_regs.edi;
-    is->ds = ddb_regs.ds & 0xffff;
-    is->es = ddb_regs.es & 0xffff;
-    int_regs->fs = ddb_regs.fs & 0xffff;
-    int_regs->gs = ddb_regs.gs & 0xffff;
-
-kdb_exit:
-    kdb_leave();
-    current_cpu_datap()->cpu_kdb_saved_state = 0;
-
-    enable_preemption();
-
-    splx(s);
-}
-
-/*
- * Print trap reason.
- */
-
-void
-kdbprinttrap(
-    int type,
-    int code,
-    int *pc,
-    int sp)
-{
-    printf("kernel: ");
-    if (type < 0 || type > TRAP_TYPES)
-        db_printf("type %d", type);
-    else
-        db_printf("%s", trap_type[type]);
-    db_printf(" trap, code=%x eip@%x = %x esp=%x\n",
-        code, pc, *(int *)pc, sp);
-    db_run_mode = STEP_CONTINUE;
-}
-
-int
-db_user_to_kernel_address(
-    task_t task,
-    vm_offset_t addr,
-    unsigned *kaddr,
-    int flag)
-{
-    register pt_entry_t *ptp;
-    vm_offset_t src;
-
-    /*
-     * Must not be preempted while using the pte pointer passed
-     * back, since it's been mapped through a per-cpu window.
-     */
-    mp_disable_preemption();
-
-    ptp = pmap_pte(task->map->pmap, (vm_map_offset_t)addr);
-    if (ptp == PT_ENTRY_NULL || (*ptp & INTEL_PTE_VALID) == 0) {
-        if (flag) {
-            db_printf("\nno memory is assigned to address %08x\n", addr);
-            db_error(0);
-            /* NOTREACHED */
-        }
-        mp_enable_preemption();
-        return(-1);
-    }
-    src = (vm_offset_t)pte_to_pa(*ptp);
-    mp_enable_preemption();
-
-    *(int *) DMAP1 = INTEL_PTE_VALID | INTEL_PTE_RW | (src & PG_FRAME) |
-        INTEL_PTE_REF | INTEL_PTE_MOD;
-#if defined(I386_CPU)
-    if (cpu_class == CPUCLASS_386) {
-        invltlb();
-    } else
-#endif
-    {
-        invlpg((u_int)DADDR1);
-    }
-
-    *kaddr = (unsigned)DADDR1 + (addr & PAGE_MASK);
-
-    return(0);
-}
-
-/*
- * Read bytes from kernel address space for debugger.
- */ - -void -db_read_bytes( - vm_offset_t addr, - int size, - char *data, - task_t task) -{ - register char *src; - register int n; - unsigned kern_addr; - - src = (char *)addr; - if (task == kernel_task || task == TASK_NULL) { - while (--size >= 0) { - if (addr++ > VM_MAX_KERNEL_ADDRESS) { - db_printf("\nbad address %x\n", addr); - db_error(0); - /* NOTREACHED */ - } - *data++ = *src++; - } - return; - } - while (size > 0) { - if (db_user_to_kernel_address(task, addr, &kern_addr, 1) < 0) - return; - src = (char *)kern_addr; - n = intel_trunc_page(addr+INTEL_PGBYTES) - addr; - if (n > size) - n = size; - size -= n; - addr += n; - while (--n >= 0) - *data++ = *src++; - } -} - -/* - * Write bytes to kernel address space for debugger. - */ - -void -db_write_bytes( - vm_offset_t addr, - int size, - char *data, - task_t task) -{ - register char *dst; - - register pt_entry_t *ptep0 = 0; - pt_entry_t oldmap0 = 0; - vm_offset_t addr1; - register pt_entry_t *ptep1 = 0; - pt_entry_t oldmap1 = 0; - extern char etext; - - if (task && task != kernel_task) { - db_write_bytes_user_space(addr, size, data, task); - return; - } - - - if (addr >= VM_MIN_KERNEL_LOADED_ADDRESS) { - db_write_bytes_user_space(addr, size, data, kernel_task); - return; - } - - if (addr >= VM_MIN_KERNEL_ADDRESS && - addr <= (vm_offset_t)&etext) - { - ptep0 = pmap_pte(kernel_pmap, (vm_map_offset_t)addr); - oldmap0 = *ptep0; - *ptep0 |= INTEL_PTE_WRITE; - - addr1 = i386_trunc_page(addr + size - 1); - if (i386_trunc_page(addr) != addr1) { - /* data crosses a page boundary */ - - ptep1 = pmap_pte(kernel_pmap, (vm_map_offset_t)addr1); - oldmap1 = *ptep1; - *ptep1 |= INTEL_PTE_WRITE; - } - flush_tlb(); - } - - dst = (char *)addr; - - while (--size >= 0) { - if (addr++ > VM_MAX_KERNEL_ADDRESS) { - db_printf("\nbad address %x\n", addr); - db_error(0); - /* NOTREACHED */ - } - *dst++ = *data++; - } - - if (ptep0) { - *ptep0 = oldmap0; - if (ptep1) { - *ptep1 = oldmap1; - } - flush_tlb(); - } -} - -void -db_write_bytes_user_space( - vm_offset_t addr, - int size, - char *data, - task_t task) -{ - register char *dst; - register int n; - unsigned kern_addr; - - while (size > 0) { - if (db_user_to_kernel_address(task, addr, &kern_addr, 1) < 0) - return; - dst = (char *)kern_addr; - n = intel_trunc_page(addr+INTEL_PGBYTES) - addr; - if (n > size) - n = size; - size -= n; - addr += n; - while (--n >= 0) - *dst++ = *data++; - } -} - -boolean_t -db_check_access( - vm_offset_t addr, - int size, - task_t task) -{ - register n; - unsigned kern_addr; - - if (task == kernel_task || task == TASK_NULL) { - if (kernel_task == TASK_NULL) - return(TRUE); - task = kernel_task; - } else if (task == TASK_NULL) { - if (current_thread() == THREAD_NULL) - return(FALSE); - task = current_thread()->task; - } - while (size > 0) { - if (db_user_to_kernel_address(task, addr, &kern_addr, 0) < 0) - return(FALSE); - n = intel_trunc_page(addr+INTEL_PGBYTES) - addr; - if (n > size) - n = size; - size -= n; - addr += n; - } - return(TRUE); -} - -boolean_t -db_phys_eq( - task_t task1, - vm_offset_t addr1, - task_t task2, - vm_offset_t addr2) -{ - unsigned kern_addr1, kern_addr2; - - if ((addr1 & (INTEL_PGBYTES-1)) != (addr2 & (INTEL_PGBYTES-1))) - return(FALSE); - if (task1 == TASK_NULL) { - if (current_thread() == THREAD_NULL) - return(FALSE); - task1 = current_thread()->task; - } - if (db_user_to_kernel_address(task1, addr1, &kern_addr1, 0) < 0 || - db_user_to_kernel_address(task2, addr2, &kern_addr2, 0) < 0) - return(FALSE); - return(kern_addr1 == kern_addr2); -} - 
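The deleted ddb copy routines above (db_read_bytes, db_write_bytes_user_space, db_check_access) all share one chunking idiom: each copy is clamped to the next page boundary, so a single db_user_to_kernel_address() translation stays valid for the whole chunk. Below is a minimal standalone sketch of that idiom, assuming a hosted C environment; PGBYTES, TRUNC_PAGE and the xlate callback are illustrative stand-ins for xnu's INTEL_PGBYTES, intel_trunc_page() and db_user_to_kernel_address(), not the real kernel interfaces.

#include <stddef.h>
#include <string.h>

#define PGBYTES       4096UL                    /* stand-in for INTEL_PGBYTES */
#define TRUNC_PAGE(a) ((a) & ~(PGBYTES - 1))    /* stand-in for intel_trunc_page() */

/*
 * Stand-in for db_user_to_kernel_address(): map the page containing
 * 'uaddr' and return a directly addressable pointer to the byte at
 * 'uaddr', or NULL if no memory is assigned to that address.
 */
typedef const void *(*translate_fn)(unsigned long uaddr);

static int
copy_from_pages(unsigned long uaddr, char *dst, size_t size, translate_fn xlate)
{
    while (size > 0) {
        const void *src = xlate(uaddr);
        if (src == NULL)
            return -1;          /* unmapped page: give up, as the deleted
                                 * routines do */

        /* Clamp the chunk to the end of the current page, mirroring
         * n = intel_trunc_page(addr + INTEL_PGBYTES) - addr above. */
        size_t n = TRUNC_PAGE(uaddr + PGBYTES) - uaddr;
        if (n > size)
            n = size;

        memcpy(dst, src, n);
        dst   += n;
        uaddr += n;
        size  -= n;
    }
    return 0;
}

The clamp yields a full page only when uaddr is page-aligned and the remainder of the current page otherwise, which is exactly the computation the deleted code performs before each per-page translation.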
-#define DB_USER_STACK_ADDR (VM_MIN_KERNEL_ADDRESS)
-#define DB_NAME_SEARCH_LIMIT (DB_USER_STACK_ADDR-(INTEL_PGBYTES*3))
-
-int
-db_search_null(
-    task_t task,
-    unsigned *svaddr,
-    unsigned evaddr,
-    unsigned *skaddr,
-    int flag)
-{
-    register unsigned vaddr;
-    register unsigned *kaddr;
-
-    kaddr = (unsigned *)*skaddr;
-    for (vaddr = *svaddr; vaddr > evaddr; vaddr -= sizeof(unsigned)) {
-        if (vaddr % INTEL_PGBYTES == 0) {
-            vaddr -= sizeof(unsigned);
-            if (db_user_to_kernel_address(task, vaddr, skaddr, 0) < 0)
-                return(-1);
-            kaddr = (unsigned *)*skaddr;
-        } else {
-            vaddr -= sizeof(unsigned);
-            kaddr--;
-        }
-        if ((*kaddr == 0) ^ (flag == 0)) {
-            *svaddr = vaddr;
-            *skaddr = (unsigned)kaddr;
-            return(0);
-        }
-    }
-    return(-1);
-}
-
-void
-db_task_name(
-    task_t task)
-{
-    register char *p;
-    register n;
-    unsigned vaddr, kaddr;
-
-    vaddr = DB_USER_STACK_ADDR;
-    kaddr = 0;
-
-    /*
-     * skip nulls at the end
-     */
-    if (db_search_null(task, &vaddr, DB_NAME_SEARCH_LIMIT, &kaddr, 0) < 0) {
-        db_printf(DB_NULL_TASK_NAME);
-        return;
-    }
-    /*
-     * search start of args
-     */
-    if (db_search_null(task, &vaddr, DB_NAME_SEARCH_LIMIT, &kaddr, 1) < 0) {
-        db_printf(DB_NULL_TASK_NAME);
-        return;
-    }
-
-    n = DB_TASK_NAME_LEN-1;
-    p = (char *)kaddr + sizeof(unsigned);
-    for (vaddr += sizeof(int); vaddr < DB_USER_STACK_ADDR && n > 0;
-            vaddr++, p++, n--) {
-        if (vaddr % INTEL_PGBYTES == 0) {
-            (void)db_user_to_kernel_address(task, vaddr, &kaddr, 0);
-            p = (char*)kaddr;
-        }
-        db_printf("%c", (*p < ' ' || *p > '~')? ' ': *p);
-    }
-    while (n-- >= 0) /* compare with >= 0 for one more space */
-        db_printf(" ");
-}
-
-void
-db_machdep_init(void)
-{
-    int c;
-
-    db_simple_lock_init(&kdb_lock, 0);
-#if MACH_KDB /* this only works for legacy 32-bit machines */
-    for (c = 0; c < real_ncpus; ++c) {
-        if (c == master_cpu) {
-            master_dbtss.esp0 = (int)(db_task_stack_store +
-                (INTSTACK_SIZE * (c + 1)) - sizeof (natural_t));
-            master_dbtss.esp = master_dbtss.esp0;
-            master_dbtss.eip = (int)&db_task_start;
-            /*
-             * The TSS for the debugging task on each slave CPU
-             * is set up in cpu_desc_init().
-             */
-        }
-    }
-#endif
-}
-
-/*
- * Called when entering kdb:
- * Takes kdb lock. If we were called remotely (slave state) we just
- * wait for kdb_cpu to be equal to cpu_number(). Otherwise enter kdb if
- * not active on another cpu.
- * If db_pass_thru[cpu_number()] > 0, then kdb can't stop now.
- */ - -int -kdb_enter(int pc) -{ - int my_cpu; - int retval; - - disable_preemption(); - - my_cpu = cpu_number(); - - if (current_cpu_datap()->cpu_db_pass_thru) { - retval = 0; - goto kdb_exit; - } - - current_cpu_datap()->cpu_kdb_active++; - - lock_kdb(); - - db_printf("kdb_enter(): cpu_number %d, kdb_cpu %d\n", my_cpu, kdb_cpu); - - if (db_breakpoints_inserted) - cpus_holding_bkpts++; - - if (kdb_cpu == -1 && !current_cpu_datap()->cpu_kdb_is_slave) { - kdb_cpu = my_cpu; - db_printf("Signaling other processors..\n"); - remote_kdb(); /* stop other cpus */ - retval = 1; - } else if (kdb_cpu == my_cpu) - retval = 1; - else - retval = 0; - -kdb_exit: - enable_preemption(); - - return (retval); -} - -void -kdb_leave(void) -{ - int my_cpu; - boolean_t wait = FALSE; - - disable_preemption(); - - my_cpu = cpu_number(); - - if (db_run_mode == STEP_CONTINUE) { - wait = TRUE; - kdb_cpu = -1; - } - if (db_breakpoints_inserted) - cpus_holding_bkpts--; - if (current_cpu_datap()->cpu_kdb_is_slave) - current_cpu_datap()->cpu_kdb_is_slave--; - if (kdb_debug) - db_printf("kdb_leave: cpu %d, kdb_cpu %d, run_mode %d pc %x (%x) holds %d\n", - my_cpu, kdb_cpu, db_run_mode, - ddb_regs.eip, *(int *)ddb_regs.eip, - cpus_holding_bkpts); - clear_kdb_intr(); - unlock_kdb(); - current_cpu_datap()->cpu_kdb_active--; - - mp_kdb_exit(); - - enable_preemption(); - - if (wait) { - while(cpus_holding_bkpts); - } -} - -void -lock_kdb(void) -{ - int my_cpu; - register i; - - disable_preemption(); - - my_cpu = cpu_number(); - - for(;;) { - if (kdb_cpu != -1 && kdb_cpu != my_cpu) { - continue; - } - if (db_simple_lock_try(&kdb_lock)) { - if (kdb_cpu == -1 || kdb_cpu == my_cpu) - break; - db_simple_unlock(&kdb_lock); - } - } - - enable_preemption(); -} - -#if TIME_STAMP -extern unsigned old_time_stamp; -#endif /* TIME_STAMP */ - -void -unlock_kdb(void) -{ - db_simple_unlock(&kdb_lock); -#if TIME_STAMP - old_time_stamp = 0; -#endif /* TIME_STAMP */ -} - - -#ifdef __STDC__ -#define KDB_SAVE(type, name) extern type name; type name##_save = name -#define KDB_RESTORE(name) name = name##_save -#else /* __STDC__ */ -#define KDB_SAVE(type, name) extern type name; type name/**/_save = name -#define KDB_RESTORE(name) name = name/**/_save -#endif /* __STDC__ */ - -#define KDB_SAVE_CTXT() \ - KDB_SAVE(int, db_run_mode); \ - KDB_SAVE(boolean_t, db_sstep_print); \ - KDB_SAVE(int, db_loop_count); \ - KDB_SAVE(int, db_call_depth); \ - KDB_SAVE(int, db_inst_count); \ - KDB_SAVE(int, db_last_inst_count); \ - KDB_SAVE(int, db_load_count); \ - KDB_SAVE(int, db_store_count); \ - KDB_SAVE(boolean_t, db_cmd_loop_done); \ - KDB_SAVE(jmp_buf_t *, db_recover); \ - KDB_SAVE(db_addr_t, db_dot); \ - KDB_SAVE(db_addr_t, db_last_addr); \ - KDB_SAVE(db_addr_t, db_prev); \ - KDB_SAVE(db_addr_t, db_next); \ - KDB_SAVE(db_regs_t, ddb_regs); - -#define KDB_RESTORE_CTXT() \ - KDB_RESTORE(db_run_mode); \ - KDB_RESTORE(db_sstep_print); \ - KDB_RESTORE(db_loop_count); \ - KDB_RESTORE(db_call_depth); \ - KDB_RESTORE(db_inst_count); \ - KDB_RESTORE(db_last_inst_count); \ - KDB_RESTORE(db_load_count); \ - KDB_RESTORE(db_store_count); \ - KDB_RESTORE(db_cmd_loop_done); \ - KDB_RESTORE(db_recover); \ - KDB_RESTORE(db_dot); \ - KDB_RESTORE(db_last_addr); \ - KDB_RESTORE(db_prev); \ - KDB_RESTORE(db_next); \ - KDB_RESTORE(ddb_regs); - -/* - * switch to another cpu - */ - -void -kdb_on( - int cpu) -{ - KDB_SAVE_CTXT(); - if (cpu < 0 || cpu >= real_ncpus || !cpu_datap(cpu)->cpu_kdb_active) - return; - db_set_breakpoints(); - db_set_watchpoints(); - kdb_cpu = cpu; - 
unlock_kdb(); - lock_kdb(); - db_clear_breakpoints(); - db_clear_watchpoints(); - KDB_RESTORE_CTXT(); - if (kdb_cpu == -1) {/* someone continued */ - kdb_cpu = cpu_number(); - db_continue_cmd(0, 0, 0, ""); - } -} - -/* - * system reboot - */ - -extern void kdp_machine_reboot(void); - -void db_reboot( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char *modif) -{ - kdp_machine_reboot(); -} diff --git a/osfmk/i386/db_machdep.h b/osfmk/i386/db_machdep.h deleted file mode 100644 index e57dfca36..000000000 --- a/osfmk/i386/db_machdep.h +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ - -#ifndef _I386_DB_MACHDEP_H_ -#define _I386_DB_MACHDEP_H_ - -/* - * Machine-dependent defines for new kernel debugger. 
- */ - -#include -#include -#include -#ifdef __i386__ -#include /* for thread_status */ -#include -#include -#include -#endif - -typedef addr64_t db_addr_t; /* address - unsigned */ -typedef uint64_t db_expr_t; /* expression */ - -#ifdef __i386__ -typedef struct x86_saved_state32 db_regs_t; -extern db_regs_t ddb_regs; /* register state */ -#define DDB_REGS (&ddb_regs) -extern int db_active; /* ddb is active */ - -#define PC_REGS(regs) ((db_addr_t)(regs)->eip) - -#define BKPT_INST 0xcc /* breakpoint instruction */ -#define BKPT_SIZE (1) /* size of breakpoint inst */ -#define BKPT_SET(inst) (BKPT_INST) - -#define FIXUP_PC_AFTER_BREAK ddb_regs.eip -= 1; - -#define db_clear_single_step(regs) ((regs)->efl &= ~EFL_TF) -#define db_set_single_step(regs) ((regs)->efl |= EFL_TF) - -#define IS_BREAKPOINT_TRAP(type, code) ((type) == T_INT3) -#define IS_WATCHPOINT_TRAP(type, code) ((type) == T_WATCHPOINT) - -#define I_CALL 0xe8 -#define I_CALLI 0xff -#define I_RET 0xc3 -#define I_IRET 0xcf - -#define inst_trap_return(ins) (((ins)&0xff) == I_IRET) -#define inst_return(ins) (((ins)&0xff) == I_RET) -#define inst_call(ins) (((ins)&0xff) == I_CALL || \ - (((ins)&0xff) == I_CALLI && \ - ((ins)&0x3800) == 0x1000)) - -int db_inst_load(unsigned long); -int db_inst_store(unsigned long); - -/* access capability and access macros */ - -#define DB_ACCESS_LEVEL 2 /* access any space */ -#define DB_CHECK_ACCESS(addr,size,task) \ - db_check_access(addr,size,task) -#define DB_PHYS_EQ(task1,addr1,task2,addr2) \ - db_phys_eq(task1,addr1,task2,addr2) -#define DB_VALID_KERN_ADDR(addr) (1) -#define DB_VALID_ADDRESS(addr,user) \ - ((!(user) && DB_VALID_KERN_ADDR(addr)) || \ - ((user) && (addr) < VM_MAX_ADDRESS)) - -/* - * Given pointer to i386_saved_state, determine if it represents - * a thread executing in user space. - */ -#define IS_USER_TRAP(regs, etext) (((regs)->cs & 3) != 0) - -extern boolean_t db_check_access( - vm_offset_t addr, - int size, - task_t task); -extern boolean_t db_phys_eq( - task_t task1, - vm_offset_t addr1, - task_t task2, - vm_offset_t addr2); -extern db_addr_t db_disasm( - db_addr_t loc, - boolean_t altfmt, - task_t task); -extern void db_read_bytes( - vm_offset_t addr, - int size, - char *data, - task_t task); -extern void db_write_bytes( - vm_offset_t addr, - int size, - char *data, - task_t task); -extern void db_stack_trace_cmd( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char *modif); -extern void db_reboot( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char *modif); - -extern void db_display_kmod(db_expr_t addr, boolean_t have_addr, - db_expr_t count, char *modif); -extern void db_display_real(db_expr_t addr, boolean_t have_addr, - db_expr_t count, char *modif); -extern void db_display_iokit(db_expr_t addr, boolean_t have_addr, - db_expr_t count, char * modif); -extern void db_cpuid(db_expr_t addr, boolean_t have_addr, db_expr_t count, - char *modif); -extern void db_msr(db_expr_t addr, boolean_t have_addr, db_expr_t count, - char *modif); -extern void db_apic(db_expr_t addr, boolean_t have_addr, db_expr_t count, - char *modif); - -/* macros for printing OS server dependent task name */ - -#define DB_TASK_NAME(task) db_task_name(task) -#define DB_TASK_NAME_TITLE "COMMAND " -#define DB_TASK_NAME_LEN 23 -#define DB_NULL_TASK_NAME "? 
" - -extern void db_task_name( - task_t task); - -/* macro for checking if a thread has used floating-point */ - -#define db_act_fp_used(act) (act && act->machine.ifps) - -extern void db_tss_to_frame( - int tss_sel, - x86_saved_state32_t *regs); -extern int kdb_trap( - int type, - int code, - x86_saved_state32_t *regs); -extern boolean_t db_trap_from_asm( - x86_saved_state32_t *regs); -extern void kdb_on( - int cpu); - -#if MACH_KDB -extern void db_chkpmgr(void); -#endif /* MACH_KDB */ -extern void db_pmgr(db_expr_t addr, int have_addr, db_expr_t count, char * modif); -extern void db_nap(db_expr_t addr, int have_addr, db_expr_t count, char * modif); -#endif /* __i386__ */ - -#endif /* _I386_DB_MACHDEP_H_ */ diff --git a/osfmk/i386/db_trace.c b/osfmk/i386/db_trace.c deleted file mode 100644 index 136418ea2..000000000 --- a/osfmk/i386/db_trace.c +++ /dev/null @@ -1,876 +0,0 @@ -/* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. 
- */ -/* - */ - -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -extern jmp_buf_t *db_recover; -struct x86_kernel_state ddb_null_kregs; -extern kmod_info_t *kmod; - - -/* - * Stack trace. - */ - -#define INKERNELSTACK(va, th) 1 - -#define DB_NUMARGS_MAX 5 - -struct i386_frame { - struct i386_frame *f_frame; - int f_retaddr; - int f_arg0; -}; - -#define TRAP 1 -#define INTERRUPT 2 -#define SYSCALL 3 - -db_addr_t db_user_trap_symbol_value = 0; -db_addr_t db_kernel_trap_symbol_value = 0; -db_addr_t db_interrupt_symbol_value = 0; -db_addr_t db_return_to_iret_symbol_value = 0; -db_addr_t db_syscall_symbol_value = 0; -boolean_t db_trace_symbols_found = FALSE; - -struct i386_kregs { - char *name; - unsigned int offset; -} i386_kregs[] = { - { "ebx", (unsigned int)(&((struct x86_kernel_state *)0)->k_ebx) }, - { "esp", (unsigned int)(&((struct x86_kernel_state *)0)->k_esp) }, - { "ebp", (unsigned int)(&((struct x86_kernel_state *)0)->k_ebp) }, - { "edi", (unsigned int)(&((struct x86_kernel_state *)0)->k_edi) }, - { "esi", (unsigned int)(&((struct x86_kernel_state *)0)->k_esi) }, - { "eip", (unsigned int)(&((struct x86_kernel_state *)0)->k_eip) }, - { 0 } -}; - -/* Forward */ - -extern unsigned int * db_lookup_i386_kreg( - char *name, - int *kregp); -extern int db_i386_reg_value( - struct db_variable * vp, - db_expr_t * val, - int flag, - db_var_aux_param_t ap); -extern void db_find_trace_symbols(void); -extern int db_numargs( - struct i386_frame *fp, - task_t task); -extern void db_nextframe( - struct i386_frame **lfp, - struct i386_frame **fp, - db_addr_t *ip, - int frame_type, - thread_t thr_act); -extern int _setjmp( - jmp_buf_t * jb); - -/* - * Machine register set. 
- */ -struct db_variable db_regs[] = { - { "cs", (unsigned int *)&ddb_regs.cs, db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 }, - { "ds", (unsigned int *)&ddb_regs.ds, db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 }, - { "es", (unsigned int *)&ddb_regs.es, db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 }, - { "fs", (unsigned int *)&ddb_regs.fs, db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 }, - { "gs", (unsigned int *)&ddb_regs.gs, db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 }, - { "ss", (unsigned int *)&ddb_regs.ss, db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 }, - { "eax",(unsigned int *)&ddb_regs.eax, db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 }, - { "ecx",(unsigned int *)&ddb_regs.ecx, db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 }, - { "edx",(unsigned int *)&ddb_regs.edx, db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 }, - { "ebx",(unsigned int *)&ddb_regs.ebx, db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 }, - { "esp",(unsigned int *)&ddb_regs.uesp,db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 }, - { "ebp",(unsigned int *)&ddb_regs.ebp, db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 }, - { "esi",(unsigned int *)&ddb_regs.esi, db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 }, - { "edi",(unsigned int *)&ddb_regs.edi, db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 }, - { "eip",(unsigned int *)&ddb_regs.eip, db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 }, - { "efl",(unsigned int *)&ddb_regs.efl, db_i386_reg_value, 0, 0, 0, 0, TRUE, 0, 0, (int *)0, 0 } -}; -struct db_variable *db_eregs = db_regs + sizeof(db_regs)/sizeof(db_regs[0]); - -unsigned int * -db_lookup_i386_kreg( - char *name, - int *kregp) -{ - register struct i386_kregs *kp; - - for (kp = i386_kregs; kp->name; kp++) { - if (strcmp(name, kp->name) == 0) - return((unsigned int *)((int)kregp + kp->offset)); - } - return(0); -} - -int -db_i386_reg_value( - struct db_variable *vp, - db_expr_t *valuep, - int flag, - db_var_aux_param_t ap) -{ - extern char etext; - unsigned int *dp = 0; - db_expr_t null_reg = 0; - register thread_t thr_act = ap->thr_act; - - if (db_option(ap->modif, 'u')) { - if (thr_act == THREAD_NULL) { - if ((thr_act = current_thread()) == THREAD_NULL) - db_error("no user registers\n"); - } - if (thr_act == current_thread()) { - if (IS_USER_TRAP(&ddb_regs, &etext)) - dp = vp->valuep; - } - } else { - if (thr_act == THREAD_NULL || thr_act == current_thread()) { - dp = vp->valuep; - } else { - if (thr_act && - (thr_act->continuation != THREAD_CONTINUE_NULL) && - thr_act->kernel_stack) { - int cpu; - - for (cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu_datap(cpu)->cpu_running == TRUE && - cpu_datap(cpu)->cpu_active_thread == thr_act && cpu_datap(cpu)->cpu_kdb_saved_state) { - dp = (unsigned int *) (((unsigned int)cpu_datap(cpu)->cpu_kdb_saved_state) + - (((unsigned int) vp->valuep) - - (unsigned int) &ddb_regs)); - break; - } - } - if (dp == 0 && thr_act) - dp = db_lookup_i386_kreg(vp->name, - (unsigned int *)(STACK_IKS(thr_act->kernel_stack))); - if (dp == 0) - dp = &null_reg; - } else if (thr_act && - (thr_act->continuation != THREAD_CONTINUE_NULL)) { - /* only EIP is valid */ - if (vp->valuep == (unsigned int *) &ddb_regs.eip) { - dp = (unsigned int *)(&thr_act->continuation); - } else { - dp = &null_reg; - } - } - } - } - if (dp == 0) { - int cpu; - - if (!db_option(ap->modif, 'u')) { - for (cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu_datap(cpu)->cpu_running == TRUE && - 
cpu_datap(cpu)->cpu_active_thread == thr_act && cpu_datap(cpu)->cpu_kdb_saved_state) { - dp = (unsigned int *) (((unsigned int)cpu_datap(cpu)->cpu_kdb_saved_state) + - (((unsigned int) vp->valuep) - - (unsigned int) &ddb_regs)); - break; - } - } - } - if (dp == 0) { - if (!thr_act) - db_error("no pcb\n"); - dp = (unsigned int *)((unsigned int)(thr_act->machine.iss) + - ((unsigned int)vp->valuep - (unsigned int)&ddb_regs)); - } - } - if (flag == DB_VAR_SET) - *dp = *valuep; - else - *valuep = *dp; - return(0); -} - -void -db_find_trace_symbols(void) -{ - db_expr_t value; - boolean_t found_some; - - found_some = FALSE; - if (db_value_of_name(CC_SYM_PREFIX "user_trap", &value)) { - db_user_trap_symbol_value = (db_addr_t) value; - found_some = TRUE; - } - if (db_value_of_name(CC_SYM_PREFIX "kernel_trap", &value)) { - db_kernel_trap_symbol_value = (db_addr_t) value; - found_some = TRUE; - } - if (db_value_of_name(CC_SYM_PREFIX "interrupt", &value)) { - db_interrupt_symbol_value = (db_addr_t) value; - found_some = TRUE; - } - if (db_value_of_name(CC_SYM_PREFIX "return_to_iret", &value)) { - db_return_to_iret_symbol_value = (db_addr_t) value; - found_some = TRUE; - } - if (db_value_of_name(CC_SYM_PREFIX "syscall", &value)) { - db_syscall_symbol_value = (db_addr_t) value; - found_some = TRUE; - } - if (found_some) - db_trace_symbols_found = TRUE; -} - -/* - * Figure out how many arguments were passed into the frame at "fp". - */ -int db_numargs_default = 5; - -int -db_numargs( - struct i386_frame *fp, - task_t task) -{ - int *argp; - int inst; - int args; - extern char etext; - - argp = (int *)db_get_task_value((int)&fp->f_retaddr, 4, FALSE, task); - if (argp < (int *)VM_MIN_KERNEL_ADDRESS || (char *)argp > &etext) - args = db_numargs_default; - else if (!DB_CHECK_ACCESS((int)argp, 4, task)) - args = db_numargs_default; - else { - inst = db_get_task_value((int)argp, 4, FALSE, task); - if ((inst & 0xff) == 0x59) /* popl %ecx */ - args = 1; - else if ((inst & 0xffff) == 0xc483) /* addl %n, %esp */ - args = ((inst >> 16) & 0xff) / 4; - else - args = db_numargs_default; - } - return (args); -} - -struct interrupt_frame { - struct i386_frame *if_frame; /* point to next frame */ - int if_retaddr; /* return address to _interrupt */ - int if_unit; /* unit number */ - int if_spl; /* saved spl */ - int if_iretaddr; /* _return_to_{iret,iret_i} */ - int if_edx; /* old sp(iret) or saved edx(iret_i) */ - int if_ecx; /* saved ecx(iret_i) */ - int if_eax; /* saved eax(iret_i) */ - int if_eip; /* saved eip(iret_i) */ - int if_cs; /* saved cs(iret_i) */ - int if_efl; /* saved efl(iret_i) */ -}; - -extern const char *trap_type[]; -extern int TRAP_TYPES; - -/* - * Figure out the next frame up in the call stack. - * For trap(), we print the address of the faulting instruction and - * proceed with the calling frame. We return the ip that faulted. - * If the trap was caused by jumping through a bogus pointer, then - * the next line in the backtrace will list some random function as - * being called. It should get the argument list correct, though. - * It might be possible to dig out from the next frame up the name - * of the function that faulted, but that could get hairy. - */ -void -db_nextframe( - struct i386_frame **lfp, /* in/out */ - struct i386_frame **fp, /* in/out */ - db_addr_t *ip, /* out */ - int frame_type, /* in */ - thread_t thr_act) /* in */ -{ - x86_saved_state32_t *iss32; - struct interrupt_frame *ifp; - task_t task = (thr_act != THREAD_NULL)? 
thr_act->task: TASK_NULL; - - switch(frame_type) { - case TRAP: - /* - * We know that trap() has 1 argument and we know that - * it is an (strcut x86_saved_state32_t *). - */ - iss32 = (x86_saved_state32_t *) - db_get_task_value((int)&((*fp)->f_arg0),4,FALSE,task); - - if (iss32->trapno >= 0 && iss32->trapno < TRAP_TYPES) { - db_printf(">>>>> %s trap at ", - trap_type[iss32->trapno]); - } else { - db_printf(">>>>> trap (number %d) at ", - iss32->trapno & 0xffff); - } - db_task_printsym(iss32->eip, DB_STGY_PROC, task); - db_printf(" <<<<<\n"); - *fp = (struct i386_frame *)iss32->ebp; - *ip = (db_addr_t)iss32->eip; - break; - - case INTERRUPT: - if (*lfp == 0) { - db_printf(">>>>> interrupt <<<<<\n"); - goto miss_frame; - } - db_printf(">>>>> interrupt at "); - ifp = (struct interrupt_frame *)(*lfp); - *fp = ifp->if_frame; - if (ifp->if_iretaddr == db_return_to_iret_symbol_value) { - *ip = ((x86_saved_state32_t *)ifp->if_edx)->eip; - } else - *ip = (db_addr_t)ifp->if_eip; - db_task_printsym(*ip, DB_STGY_PROC, task); - db_printf(" <<<<<\n"); - break; - - case SYSCALL: - if (thr_act != THREAD_NULL) { - iss32 = (x86_saved_state32_t *)thr_act->machine.iss; - - *ip = (db_addr_t)(iss32->eip); - *fp = (struct i386_frame *)(iss32->ebp); - } - break; - - default: /* falling down for unknown case */ -miss_frame: - *ip = (db_addr_t) - db_get_task_value((int)&(*fp)->f_retaddr, 4, FALSE, task); - *lfp = *fp; - *fp = (struct i386_frame *) - db_get_task_value((int)&(*fp)->f_frame, 4, FALSE, task); - break; - } -} - -void -db_stack_trace_cmd( - db_expr_t addr, - boolean_t have_addr, - db_expr_t count, - char *modif) -{ - struct i386_frame *frame, *lastframe; - x86_saved_state32_t *iss32; - int *argp; - db_addr_t callpc, lastcallpc; - int frame_type; - boolean_t kernel_only = TRUE; - boolean_t trace_thread = FALSE; - boolean_t trace_all_threads = FALSE; - int thcount = 0; - char *filename; - int linenum; - task_t task; - thread_t th, top_act; - int user_frame; - int frame_count; - jmp_buf_t *prev; - jmp_buf_t db_jmp_buf; - queue_entry_t act_list; - - if (!db_trace_symbols_found) - db_find_trace_symbols(); - - { - register char *cp = modif; - register char c; - - while ((c = *cp++) != 0) { - if (c == 't') - trace_thread = TRUE; - if (c == 'T') { - trace_all_threads = TRUE; - trace_thread = TRUE; - } - if (c == 'u') - kernel_only = FALSE; - } - } - - if (trace_all_threads) { - if (!have_addr && !trace_thread) { - have_addr = TRUE; - trace_thread = TRUE; - act_list = &(current_task()->threads); - addr = (db_expr_t) queue_first(act_list); - } else if (trace_thread) { - if (have_addr) { - if (!db_check_act_address_valid((thread_t)addr)) { - if (db_lookup_task((task_t)addr) == -1) - return; - act_list = &(((task_t)addr)->threads); - addr = (db_expr_t) queue_first(act_list); - } else { - act_list = &(((thread_t)addr)->task->threads); - thcount = db_lookup_task_act(((thread_t)addr)->task, - (thread_t)addr); - } - } else { - th = db_default_act; - if (th == THREAD_NULL) - th = current_thread(); - if (th == THREAD_NULL) { - db_printf("no active thr_act\n"); - return; - } - have_addr = TRUE; - act_list = &th->task->threads; - addr = (db_expr_t) queue_first(act_list); - } - } - } - - if (count == -1) - count = 65535; - -next_thread: - top_act = THREAD_NULL; - - user_frame = 0; - frame_count = count; - - if (!have_addr && !trace_thread) { - frame = (struct i386_frame *)ddb_regs.ebp; - callpc = (db_addr_t)ddb_regs.eip; - th = current_thread(); - task = (th != THREAD_NULL)? 
th->task: TASK_NULL; - db_printf("thread 0x%x, current_thread() is 0x%x, ebp is 0x%x, eip is 0x%x\n", th, current_thread(), ddb_regs.ebp, ddb_regs.eip); - } else if (trace_thread) { - if (have_addr) { - th = (thread_t) addr; - if (!db_check_act_address_valid(th)) { - return; - } - } else { - th = db_default_act; - if (th == THREAD_NULL) - th = current_thread(); - if (th == THREAD_NULL) { - db_printf("no active thread\n"); - return; - } - } - if (trace_all_threads) - db_printf("---------- Thread 0x%x (#%d of %d) ----------\n", - addr, thcount, th->task->thread_count); - - next_activation: - user_frame = 0; -// kprintf("th is %x, current_thread() is %x, ddb_regs.ebp is %x ddb_regs.eip is %x\n", th, current_thread(), ddb_regs.ebp, ddb_regs.eip); - task = th->task; - if (th == current_thread()) { - frame = (struct i386_frame *)ddb_regs.ebp; - callpc = (db_addr_t)ddb_regs.eip; - } else { - if (!th) { - db_printf("thread has no shuttle\n"); - - goto thread_done; - } - else if ( (th->continuation != THREAD_CONTINUE_NULL) || - th->kernel_stack == 0) { - - db_printf("Continuation "); - db_task_printsym((db_expr_t)th->continuation, - DB_STGY_PROC, task); - db_printf("\n"); - - iss32 = (x86_saved_state32_t *)th->machine.iss; - - frame = (struct i386_frame *) (iss32->ebp); - callpc = (db_addr_t) (iss32->eip); - - } else { - int cpu; - - for (cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu_datap(cpu)->cpu_running == TRUE && - cpu_datap(cpu)->cpu_active_thread == th && - cpu_datap(cpu)->cpu_kdb_saved_state) { - break; - } - } - if (top_act != THREAD_NULL) { - /* - * Trying to get the backtrace of an activation - * which is not the top_most one in the RPC chain: - * use the activation's pcb. - */ - iss32 = (x86_saved_state32_t *)th->machine.iss; - - frame = (struct i386_frame *) (iss32->ebp); - callpc = (db_addr_t) (iss32->eip); - } else { - if (cpu == real_ncpus) { - register struct x86_kernel_state *iks; - int r; - - iks = STACK_IKS(th->kernel_stack); - prev = db_recover; - if ((r = _setjmp(db_recover = &db_jmp_buf)) == 0) { - frame = (struct i386_frame *) (iks->k_ebp); - callpc = (db_addr_t) (iks->k_eip); - } else { - /* - * The kernel stack has probably been - * paged out (swapped out activation). - */ - db_recover = prev; - if (r == 2) /* 'q' from db_more() */ - db_error(0); - db_printf("\n", - iks); - goto thread_done; - } - db_recover = prev; - } else { - db_printf(">>>>> active on cpu %d <<<<<\n", - cpu); - - iss32 = (x86_saved_state32_t *)cpu_datap(cpu)->cpu_kdb_saved_state; - - frame = (struct i386_frame *) (iss32->ebp); - callpc = (db_addr_t) (iss32->eip); - } - } - } - } - } else { - frame = (struct i386_frame *)addr; - th = (db_default_act)? db_default_act: current_thread(); - task = (th != THREAD_NULL)? th->task: TASK_NULL; - callpc = (db_addr_t)db_get_task_value((int)&frame->f_retaddr, - 4, - FALSE, - (user_frame) ? task : 0); - } - - if (!INKERNELSTACK((unsigned)frame, th)) { - db_printf(">>>>> user space <<<<<\n"); - if (kernel_only) - goto thread_done; - user_frame++; - } - - lastframe = 0; - lastcallpc = (db_addr_t) 0; - while (frame_count-- && frame != 0) { - int narg = DB_NUMARGS_MAX; - char * name; - db_expr_t offset; - db_addr_t call_func = 0; - int r; - db_addr_t off; - - db_symbol_values(NULL, - db_search_task_symbol_and_line( - callpc, - DB_STGY_XTRN, - &offset, - &filename, - &linenum, - (user_frame) ? task : 0, - &narg), - &name, (db_expr_t *)&call_func); - if ( name == NULL) { - db_find_task_sym_and_offset(callpc, - &name, &off, (user_frame) ? 
task : 0); - offset = (db_expr_t) off; - } - - if (user_frame == 0) { - if (call_func && call_func == db_user_trap_symbol_value || - call_func == db_kernel_trap_symbol_value) { - frame_type = TRAP; - narg = 1; - } else if (call_func && - call_func == db_interrupt_symbol_value) { - frame_type = INTERRUPT; - goto next_frame; - } else if (call_func && call_func == db_syscall_symbol_value) { - frame_type = SYSCALL; - goto next_frame; - } else { - frame_type = 0; - prev = db_recover; - if ((r = _setjmp(db_recover = &db_jmp_buf)) == 0) { - if (narg < 0) - narg = db_numargs(frame, - (user_frame) ? task : 0); - db_recover = prev; - } else { - db_recover = prev; - goto thread_done; - } - } - } else { - frame_type = 0; - prev = db_recover; - if ((r = _setjmp(db_recover = &db_jmp_buf)) == 0) { - if (narg < 0) - narg = db_numargs(frame, - (user_frame) ? task : 0); - db_recover = prev; - } else { - db_recover = prev; - goto thread_done; - } - } - - if (name == 0 || offset > db_maxoff) { - db_printf("0x%x 0x%x(", frame, callpc); - offset = 0; - } else - db_printf("0x%x %s(", frame, name); - - argp = &frame->f_arg0; - while (narg > 0) { - int value; - - prev = db_recover; - if ((r = _setjmp(db_recover = &db_jmp_buf)) == 0) { - value = db_get_task_value((int)argp, - 4, - FALSE, - (user_frame) ? task : 0); - } else { - db_recover = prev; - if (r == 2) /* 'q' from db_more() */ - db_error(0); - db_printf("... )"); - if (offset) - db_printf("+%x", offset); - if (filename) { - db_printf(" [%s", filename); - if (linenum > 0) - db_printf(":%d", linenum); - db_printf("]"); - } - db_printf("\n"); - goto thread_done; - } - db_recover = prev; - db_printf("%x", value); - argp++; - if (--narg != 0) - db_printf(","); - } - if (narg < 0) - db_printf("..."); - db_printf(")"); - if (offset) { - db_printf("+%x", offset); - } - if (filename) { - db_printf(" [%s", filename); - if (linenum > 0) - db_printf(":%d", linenum); - db_printf("]"); - } - db_printf("\n"); - -next_frame: - lastcallpc = callpc; - db_nextframe(&lastframe, &frame, &callpc, frame_type, - (user_frame) ? th : THREAD_NULL); - - if (frame == 0) { - if (th->task_threads.prev != THREAD_NULL) { - if (top_act == THREAD_NULL) - top_act = th; - th = th->task_threads.prev; - db_printf(">>>>> next activation 0x%x ($task%d.%d) <<<<<\n", - th, - db_lookup_task(th->task), - db_lookup_task_act(th->task, th)); - goto next_activation; - } - /* end of chain */ - break; - } - if (!INKERNELSTACK(lastframe, th) || - !INKERNELSTACK((unsigned)frame, th)) - user_frame++; - if (user_frame == 1) { - db_printf(">>>>> user space <<<<<\n"); - if (kernel_only) - break; - } - if (frame <= lastframe) { - if ((INKERNELSTACK(lastframe, th) && - !INKERNELSTACK(frame, th))) - continue; - db_printf("Bad frame pointer: 0x%x\n", frame); - break; - } - } - -thread_done: - if (trace_all_threads) { - if (top_act != THREAD_NULL) - th = top_act; - th = (thread_t) queue_next(&th->task_threads); - if (! 
queue_end(act_list, (queue_entry_t) th)) { - db_printf("\n"); - addr = (db_expr_t) th; - thcount++; - goto next_thread; - - } - } -} - -extern mach_vm_size_t kdp_machine_vm_read(mach_vm_address_t, caddr_t, mach_vm_size_t); -extern boolean_t kdp_trans_off; -/* - * Print out 256 bytes of real storage - * - * dr [entaddr] - */ -void -db_display_real(db_expr_t addr, boolean_t have_addr, db_expr_t count, - char *modif) -{ - int i; - unsigned int xbuf[8]; - unsigned read_result = 0; -/* Print 256 bytes */ - for(i=0; i<8; i++) { - -/* - * Do a physical read using kdp_machine_vm_read(), rather than replicating the same - * facility - */ - kdp_trans_off = 1; - read_result = kdp_machine_vm_read(addr, &xbuf[0], 32); - kdp_trans_off = 0; - - if (read_result != 32) - db_printf("Unable to read address\n"); - else - db_printf("%016llX %08X %08X %08X %08X %08X %08X %08X %08X\n", addr, /* Print a line */ - xbuf[0], xbuf[1], xbuf[2], xbuf[3], - xbuf[4], xbuf[5], xbuf[6], xbuf[7]); - addr = addr + 0x00000020; /* Point to next address */ - } - db_next = addr; -} - -/* - * Displays all of the kmods in the system. - * - * dk - */ -void -db_display_kmod(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, __unused char *modif) -{ - - kmod_info_t *kmd; - unsigned int strt, end; - - kmd = kmod; /* Start at the start */ - - db_printf("info addr start - end name ver\n"); - - while (kmd) { /* Dump 'em all */ - strt = (unsigned int) kmd->address + kmd->hdr_size; - end = (unsigned int) kmd->address + kmd->size; - db_printf("%08X %08X %08X - %08X: %s, %s\n", - kmd, kmd->address, strt, end, kmd->name, kmd->version); - kmd = kmd->next; - } -} - -void -db_display_iokit(__unused db_expr_t addr, __unused boolean_t have_addr, - __unused db_expr_t count, __unused char *modif) -{ -} diff --git a/osfmk/i386/etimer.c b/osfmk/i386/etimer.c index c196f8b9f..c834962ef 100644 --- a/osfmk/i386/etimer.c +++ b/osfmk/i386/etimer.c @@ -91,9 +91,11 @@ etimer_intr(int user_mode, */ latency = (int32_t) (abstime - MAX(mytimer->deadline, mytimer->when_set)); - KERNEL_DEBUG_CONSTANT( + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, DECR_TRAP_LATENCY | DBG_FUNC_NONE, - -latency, rip, user_mode, 0, 0); + -latency, + ((user_mode != 0) ? rip : VM_KERNEL_UNSLIDE(rip)), + user_mode, 0, 0); mytimer->has_expired = TRUE; /* Remember that we popped */ mytimer->deadline = timer_queue_expire(&mytimer->queue, abstime); @@ -106,11 +108,11 @@ etimer_intr(int user_mode, /* is it time for power management state change? 
*/ if ((pmdeadline = pmCPUGetDeadline(pp)) && (pmdeadline <= abstime)) { - KERNEL_DEBUG_CONSTANT( + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, DECR_PM_DEADLINE | DBG_FUNC_START, 0, 0, 0, 0, 0); pmCPUDeadline(pp); - KERNEL_DEBUG_CONSTANT( + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, DECR_PM_DEADLINE | DBG_FUNC_END, 0, 0, 0, 0, 0); } @@ -180,7 +182,7 @@ etimer_resync_deadlines(void) /* Record non-PM deadline for latency tool */ if (deadline != pmdeadline) { - KERNEL_DEBUG_CONSTANT( + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, DECR_SET_DEADLINE | DBG_FUNC_NONE, decr, 2, deadline, (uint32_t)(deadline >> 32), 0); @@ -227,7 +229,7 @@ mpqueue_head_t * timer_queue_assign( uint64_t deadline) { - cpu_data_t *cdp = current_cpu_datap(); + cpu_data_t *cdp = current_cpu_datap(); mpqueue_head_t *queue; if (cdp->cpu_running) { @@ -239,7 +241,7 @@ timer_queue_assign( else queue = &cpu_datap(master_cpu)->rtclock_timer.queue; - return queue; + return (queue); } void @@ -260,7 +262,7 @@ timer_queue_cancel( * deadline so that it's timer queue can be moved to another processor. * This target processor should be the least idle (most busy) -- * currently this is the primary processor for the calling thread's package. - * Locking restrictions demand that the target cpu must be the boot cpu. + * Locking restrictions demand that the target cpu must be the boot cpu. */ uint32_t etimer_queue_migrate(int target_cpu) @@ -273,7 +275,7 @@ etimer_queue_migrate(int target_cpu) assert(target_cpu != cdp->cpu_number); assert(target_cpu == master_cpu); - KERNEL_DEBUG_CONSTANT( + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, DECR_TIMER_MIGRATE | DBG_FUNC_START, target_cpu, cdp->rtclock_timer.deadline, (cdp->rtclock_timer.deadline >>32), @@ -297,7 +299,7 @@ etimer_queue_migrate(int target_cpu) setPop(EndOfAllTime); } - KERNEL_DEBUG_CONSTANT( + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, DECR_TIMER_MIGRATE | DBG_FUNC_END, target_cpu, ntimers_moved, 0, 0, 0); diff --git a/osfmk/i386/fpu.c b/osfmk/i386/fpu.c index 478eb2b4e..84f860b5b 100644 --- a/osfmk/i386/fpu.c +++ b/osfmk/i386/fpu.c @@ -415,13 +415,11 @@ fpu_module_init(void) 64 * fp_register_state_size, "x86 fpsave state"); -#if ZONE_DEBUG /* To maintain the required alignment, disable * zone debugging for this zone as that appends * 16 bytes to each element. */ - zone_debug_disable(ifps_zone); -#endif + zone_change(ifps_zone, Z_ALIGNMENT_REQUIRED, TRUE); /* Determine MXCSR reserved bits and configure initial FPU state*/ configure_mxcsr_capability_mask(&initial_fp_state); } @@ -1014,7 +1012,7 @@ fp_setvalid(boolean_t value) { } } -__private_extern__ boolean_t +boolean_t ml_fpu_avx_enabled(void) { return (fpu_YMM_present == TRUE); } diff --git a/osfmk/i386/gdt.c b/osfmk/i386/gdt.c index 7677f2488..ae40e4f01 100644 --- a/osfmk/i386/gdt.c +++ b/osfmk/i386/gdt.c @@ -62,7 +62,13 @@ */ #include -struct real_descriptor master_gdt[GDTSZ] __attribute__ ((section("__INITGDT,__data")))= { +struct real_descriptor master_gdt[GDTSZ] +#if __x86_64__ + __attribute__((section("__HIB,__desc"))) +#else + __attribute__((section("__INITGDT,__DATA"))) +#endif + __attribute__((aligned(CPU_CACHE_SIZE))) = { [SEL_TO_INDEX(KERNEL32_CS)] = MAKE_REAL_DESCRIPTOR( /* kernel 32-bit code */ 0, 0xfffff, diff --git a/osfmk/i386/genassym.c b/osfmk/i386/genassym.c index bb77d38a2..0c7f1f595 100644 --- a/osfmk/i386/genassym.c +++ b/osfmk/i386/genassym.c @@ -55,9 +55,7 @@ */ #include -#include #include -#include /* * Pass field offsets to assembly code. 
@@ -369,10 +367,6 @@ main( DECLARE("INTEL_PTE_INVALID", INTEL_PTE_INVALID); DECLARE("NPGPTD", NPGPTD); #if defined(__x86_64__) - DECLARE("INITPT_SEG_BASE",INITPT_SEG_BASE); - DECLARE("INITGDT_SEG_BASE",INITGDT_SEG_BASE); - DECLARE("SLEEP_SEG_BASE",SLEEP_SEG_BASE); - DECLARE("PROT_MODE_GDT_SIZE",PROT_MODE_GDT_SIZE); DECLARE("KERNEL_PML4_INDEX",KERNEL_PML4_INDEX); #endif DECLARE("IDTSZ", IDTSZ); @@ -390,9 +384,6 @@ main( #ifdef __i386__ DECLARE("DF_TSS", DF_TSS); DECLARE("MC_TSS", MC_TSS); -#if MACH_KDB - DECLARE("DEBUG_TSS", DEBUG_TSS); -#endif /* MACH_KDB */ DECLARE("CPU_DATA_GS", CPU_DATA_GS); #endif /* __i386__ */ DECLARE("SYSENTER_CS", SYSENTER_CS); @@ -515,17 +506,11 @@ main( offsetof(cpu_data_t *, cpu_tlb_invalid_global)); #endif /* x86_64 */ DECLARE("enaExpTrace", enaExpTrace); - DECLARE("enaExpTraceb", enaExpTraceb); DECLARE("enaUsrFCall", enaUsrFCall); - DECLARE("enaUsrFCallb", enaUsrFCallb); DECLARE("enaUsrPhyMp", enaUsrPhyMp); - DECLARE("enaUsrPhyMpb", enaUsrPhyMpb); DECLARE("enaDiagSCs", enaDiagSCs); - DECLARE("enaDiagSCsb", enaDiagSCsb); DECLARE("enaDiagEM", enaDiagEM); - DECLARE("enaDiagEMb", enaDiagEMb); DECLARE("enaNotifyEM", enaNotifyEM); - DECLARE("enaNotifyEMb", enaNotifyEMb); DECLARE("dgLock", offsetof(struct diagWork *, dgLock)); DECLARE("dgFlags", offsetof(struct diagWork *, dgFlags)); DECLARE("dgMisc1", offsetof(struct diagWork *, dgMisc1)); @@ -592,13 +577,11 @@ main( DECLARE("TIMER_HIGH", offsetof(struct timer *, high_bits)); DECLARE("TIMER_HIGHCHK", offsetof(struct timer *, high_bits_check)); #endif -#if !STAT_TIME DECLARE("TIMER_TSTAMP", offsetof(struct timer *, tstamp)); DECLARE("THREAD_TIMER", offsetof(struct processor *, processor_data.thread_timer)); -#endif DECLARE("KERNEL_TIMER", offsetof(struct processor *, processor_data.kernel_timer)); DECLARE("SYSTEM_TIMER", diff --git a/osfmk/i386/hibernate_restore.c b/osfmk/i386/hibernate_restore.c index 47a5b9c7a..f04a56c4a 100644 --- a/osfmk/i386/hibernate_restore.c +++ b/osfmk/i386/hibernate_restore.c @@ -31,7 +31,7 @@ #include -extern pd_entry_t BootstrapPTD[2048]; +extern pd_entry_t BootPTD[2048]; // src is virtually mapped, not page aligned, // dst is a physical 4k page aligned ptr, len is one 4K page @@ -82,9 +82,9 @@ pal_hib_map(uintptr_t virt, uint64_t phys) index = (virt >> I386_LPGSHIFT); virt += (uintptr_t)(phys & I386_LPGMASK); phys = ((phys & ~((uint64_t)I386_LPGMASK)) | INTEL_PTE_PS | INTEL_PTE_VALID | INTEL_PTE_WRITE); - BootstrapPTD[index] = phys; + BootPTD[index] = phys; invlpg(virt); - BootstrapPTD[index + 1] = (phys + I386_LPGBYTES); + BootPTD[index + 1] = (phys + I386_LPGBYTES); invlpg(virt + I386_LPGBYTES); return (virt); diff --git a/osfmk/i386/hpet.c b/osfmk/i386/hpet.c index 994ba06b5..f8fd9832d 100644 --- a/osfmk/i386/hpet.c +++ b/osfmk/i386/hpet.c @@ -58,16 +58,6 @@ #include #include #include -#if MACH_KDB -#include -#include -#include -#include -#include -#include -#include -#include -#endif /* MACH_KDB */ /* Decimal powers: */ #define kilo (1000ULL) @@ -296,10 +286,6 @@ hpet_init(void) hpet2bus = tmrCvt(hpetCvtt2n, busFCvtn2t); DBG(" CVT: HPET to BUS = %08X.%08X\n", (uint32_t)(hpet2bus >> 32), (uint32_t)hpet2bus); - -#if MACH_KDB - db_display_hpet((hpetReg_t *)hpetArea); /* (BRINGUP) */ -#endif } /* @@ -484,64 +470,3 @@ rdHPET(void) return (((uint64_t) high) << 32) | low; } - -#if MACH_KDB - -#define HI32(x) ((uint32_t)(((x) >> 32) & 0xFFFFFFFF)) -#define LO32(x) ((uint32_t)((x) & 0xFFFFFFFF)) - -/* - * Displays HPET memory mapped area - * hp - */ -void -db_hpet(__unused db_expr_t addr, 
__unused int have_addr, __unused db_expr_t count, __unused char *modif) -{ - - db_display_hpet((hpetReg_t *) hpetArea); /* Dump out the HPET - * stuff */ - return; -} - -void -db_display_hpet(hpetReg_t *hpt) -{ - uint64_t cmain; - - cmain = hpt->MAIN_CNT; /* Get the main timer */ - - /* General capabilities */ - db_printf(" GCAP_ID = %08X.%08X\n", - HI32(hpt->GCAP_ID), LO32(hpt->GCAP_ID)); - /* General configuration */ - db_printf(" GEN_CONF = %08X.%08X\n", - HI32(hpt->GEN_CONF), LO32(hpt->GEN_CONF)); - /* General Interrupt status */ - db_printf("GINTR_STA = %08X.%08X\n", - HI32(hpt->GINTR_STA), LO32(hpt->GINTR_STA)); - /* Main counter */ - db_printf(" MAIN_CNT = %08X.%08X\n", - HI32(cmain), LO32(cmain)); - /* Timer 0 config and cap */ - db_printf("TIM0_CONF = %08X.%08X\n", - HI32(hpt->TIM0_CONF), LO32(hpt->TIM0_CONF)); - /* Timer 0 comparator */ - db_printf("TIM0_COMP = %08X.%08X\n", - HI32(hpt->TIM0_COMP), LO32(hpt->TIM0_COMP)); - /* Timer 1 config and cap */ - db_printf("TIM0_CONF = %08X.%08X\n", - HI32(hpt->TIM1_CONF), LO32(hpt->TIM1_CONF)); - /* Timer 1 comparator */ - db_printf("TIM1_COMP = %08X.%08X\n", - HI32(hpt->TIM1_COMP), LO32(hpt->TIM1_COMP)); - /* Timer 2 config and cap */ - db_printf("TIM2_CONF = %08X.%08X\n", - HI32(hpt->TIM2_CONF), LO32(hpt->TIM2_CONF)); - /* Timer 2 comparator */ - db_printf("TIM2_COMP = %08X.%08X\n", - HI32(hpt->TIM2_COMP), LO32(hpt->TIM2_COMP)); - - db_printf("\nHPET Frequency = %d.%05dMHz\n", - (uint32_t) (hpetFreq / 1000000), (uint32_t) (hpetFreq % 1000000)); -} -#endif diff --git a/osfmk/i386/i386_init.c b/osfmk/i386/i386_init.c index 560a88ffc..39102c926 100644 --- a/osfmk/i386/i386_init.c +++ b/osfmk/i386/i386_init.c @@ -55,7 +55,6 @@ */ #include -#include #include @@ -102,9 +101,6 @@ #include /* LcksOpts */ #ifdef __i386__ #include -#if MACH_KDB -#include -#endif #endif #if DEBUG #include @@ -115,9 +111,6 @@ #else #define DBG(x...) #endif -#if MACH_KDB -#include -#endif /* MACH_KDB */ int debug_task; @@ -128,14 +121,15 @@ extern const char version[]; extern const char version_variant[]; extern int nx_enabled; -#ifdef __x86_64__ -extern void *low_eintstack; -#endif +uint64_t physmap_base, physmap_max; -void *KPTphys; +pd_entry_t *KPTphys; pd_entry_t *IdlePTD; #ifdef __i386__ pd_entry_t *IdlePDPT64; +#else +pdpt_entry_t *IdlePDPT; +pml4_entry_t *IdlePML4; #endif char *physfree; @@ -200,8 +194,11 @@ x86_64_post_sleep(uint64_t new_cr3) // NPHYSMAP is determined by the maximum supported RAM size plus 4GB to account // the PCI hole (which is less 4GB but not more). -// Compile-time guard: -extern int maxphymapsupported[NPHYSMAP <= PTE_PER_PAGE ? 1 : -1]; +/* Compile-time guard: NPHYSMAP is capped to 256GiB, accounting for + * randomisation + */ +extern int maxphymapsupported[NPHYSMAP <= (PTE_PER_PAGE/2) ? 1 : -1]; + static void physmap_init(void) { @@ -210,31 +207,71 @@ physmap_init(void) pt_entry_t entries[PTE_PER_PAGE]; } * physmapL2 = ALLOCPAGES(NPHYSMAP); - uintptr_t i; - for(i=0;i> PAGE_SHIFT)); + + /* IdlePTD */ + fillkpt(IdlePTD, + INTEL_PTE_WRITE, (uintptr_t)ID_MAP_VTOP(KPTphys), 0, NKPT); + + // IdlePDPT entries + fillkpt(IdlePDPT, + INTEL_PTE_WRITE, (uintptr_t)ID_MAP_VTOP(IdlePTD), 0, NPGPTD); + + // IdlePML4 single entry for kernel space. + fillkpt(IdlePML4 + KERNEL_PML4_INDEX, + INTEL_PTE_WRITE, (uintptr_t)ID_MAP_VTOP(IdlePDPT), 0, 1); + + postcode(VSTART_PHYSMAP_INIT); -#ifdef __x86_64__ physmap_init(); -#else + + postcode(VSTART_DESC_ALIAS_INIT); + + descriptor_alias_init(); + + postcode(VSTART_SET_CR3); + + // Switch to the page tables.. 
+ set_cr3_raw((uintptr_t)ID_MAP_VTOP(IdlePML4)); + +} + +#else /* __x86_64__ */ + +static void +Idle_PTs_init(void) +{ + /* Allocate the "idle" kernel page tables: */ + KPTphys = ALLOCPAGES(NKPT); /* level 1 */ + IdlePTD = ALLOCPAGES(NPGPTD); /* level 2 */ + IdlePDPT64 = ALLOCPAGES(1); // Recursive mapping of PTEs fillkpt(IdlePTD, INTEL_PTE_WRITE, (uintptr_t)IdlePTD, PTDPTDI, NPGPTD); // commpage fillkpt(IdlePTD, INTEL_PTE_WRITE|INTEL_PTE_USER, (uintptr_t)ALLOCPAGES(1), _COMM_PAGE32_BASE_ADDRESS >> PDESHIFT,1); -#endif + // Fill the lowest level with everything up to physfree fillkpt(KPTphys, - INTEL_PTE_WRITE, 0, 0, (int)(((uintptr_t)physfree) >> PAGE_SHIFT)); + INTEL_PTE_WRITE, 0, 0, (int)(((uintptr_t)physfree) >> PAGE_SHIFT)); // Rewrite the 2nd-lowest level to point to pages of KPTphys. // This was previously filled statically by idle_pt.c, and thus // must be done after the KPTphys fill since IdlePTD is in use fillkpt(IdlePTD, - INTEL_PTE_WRITE, (uintptr_t)ID_MAP_VTOP(KPTphys), 0, NKPT); + INTEL_PTE_WRITE, (uintptr_t)ID_MAP_VTOP(KPTphys), 0, NKPT); // IdlePDPT entries -#ifdef __i386__ fillkpt(IdlePDPT, 0, (uintptr_t)IdlePTD, 0, NPGPTD); -#else - fillkpt(IdlePDPT, INTEL_PTE_WRITE, (uintptr_t)ID_MAP_VTOP(IdlePTD), 0, NPGPTD); -#endif + + postcode(VSTART_SET_CR3); // Flush the TLB now we're done rewriting the page tables.. set_cr3_raw(get_cr3_raw()); } +#endif /* * vstart() is called in the natural mode (64bit for K64, 32 for K32) @@ -294,7 +369,7 @@ vstart(vm_offset_t boot_args_start) { boolean_t is_boot_cpu = !(boot_args_start == 0); int cpu; - uint32_t lphysfree; + uint32_t lphysfree; postcode(VSTART_ENTRY); @@ -320,14 +395,8 @@ vstart(vm_offset_t boot_args_start) kernelBootArgs, &kernelBootArgs->ksize, &kernelBootArgs->kaddr); -#ifdef __x86_64__ - /* enable NX/XD, boot processor */ - if (cpuid_extfeatures() & CPUID_EXTFEATURE_XD) { - wrmsr64(MSR_IA32_EFER, rdmsr64(MSR_IA32_EFER) | MSR_IA32_EFER_NXE); - DBG("vstart() NX/XD enabled\n"); - } -#endif - postcode(PSTART_PAGE_TABLES); + + postcode(VSTART_IDLE_PTS_INIT); Idle_PTs_init(); @@ -348,17 +417,16 @@ vstart(vm_offset_t boot_args_start) PE_init_platform(FALSE, kernelBootArgs); postcode(PE_INIT_PLATFORM_D); } else { +#ifdef __x86_64__ + /* Switch to kernel's page tables (from the Boot PTs) */ + set_cr3_raw((uintptr_t)ID_MAP_VTOP(IdlePML4)); +#endif /* Find our logical cpu number */ cpu = lapic_to_cpu[(LAPIC_READ(ID)>>LAPIC_ID_SHIFT) & LAPIC_ID_MASK]; DBG("CPU: %d, GSBASE initial value: 0x%llx\n", cpu, rdmsr64(MSR_IA32_GS_BASE)); -#ifdef __x86_64__ - if (cpuid_extfeatures() & CPUID_EXTFEATURE_XD) { - wrmsr64(MSR_IA32_EFER, rdmsr64(MSR_IA32_EFER) | MSR_IA32_EFER_NXE); - DBG("vstart() NX/XD enabled, non-boot\n"); - } -#endif } + postcode(VSTART_CPU_DESC_INIT); #ifdef __x86_64__ if(is_boot_cpu) cpu_desc_init64(cpu_datap(cpu)); @@ -368,16 +436,12 @@ vstart(vm_offset_t boot_args_start) cpu_desc_init(cpu_datap(cpu)); cpu_desc_load(cpu_datap(cpu)); #endif + postcode(VSTART_CPU_MODE_INIT); if (is_boot_cpu) cpu_mode_init(current_cpu_datap()); /* cpu_mode_init() will be * invoked on the APs * via i386_init_slave() */ -#ifdef __x86_64__ - /* Done with identity mapping */ - IdlePML4[0] = 0; -#endif - postcode(VSTART_EXIT); #ifdef __i386__ if (cpuid_extfeatures() & CPUID_EXTFEATURE_XD) { @@ -391,26 +455,9 @@ vstart(vm_offset_t boot_args_start) i386_init_slave(); /*NOTREACHED*/ #else - /* We need to switch to a new per-cpu stack, but we must do this atomically with - * the call to ensure the compiler doesn't assume anything about the stack before - 
* e.g. tail-call optimisations - */ - if (is_boot_cpu) - { - asm volatile( - "mov %1, %%rdi;" - "mov %0, %%rsp;" - "call _i386_init;" : : "r" - (cpu_datap(cpu)->cpu_int_stack_top), "r" (boot_args_start)); - } - else - { - asm volatile( - "mov %0, %%rsp;" - "call _i386_init_slave;" : : "r" - (cpu_datap(cpu)->cpu_int_stack_top)); - } - /*NOTREACHED*/ + x86_init_wrapper(is_boot_cpu ? (uintptr_t) i386_init + : (uintptr_t) i386_init_slave, + cpu_datap(cpu)->cpu_int_stack_top); #endif } @@ -555,6 +602,7 @@ do_init_slave(boolean_t fast_restart) assert(!ml_get_interrupts_enabled()); cpu_mode_init(current_cpu_datap()); + pmap_cpu_init(); #if CONFIG_MCA mca_cpu_init(); @@ -587,14 +635,6 @@ do_init_slave(boolean_t fast_restart) cpu_thread_init(); /* not strictly necessary */ -#ifdef __x86_64__ - /* Re-zero the identity-map for the idle PT's. This MUST be done before - * cpu_running is set so that other slaves can set up their own - * identity-map */ - if (!fast_restart) - IdlePML4[0] = 0; -#endif - cpu_init(); /* Sets cpu_running which starter cpu waits for */ slave_main(init_param); diff --git a/osfmk/i386/i386_lock.s b/osfmk/i386/i386_lock.s index 0f7bdba3a..6b34073c6 100644 --- a/osfmk/i386/i386_lock.s +++ b/osfmk/i386/i386_lock.s @@ -54,8 +54,7 @@ * When performance isn't the only concern, it's * nice to build stack frames... */ -#define BUILD_STACK_FRAMES (GPROF || \ - ((MACH_LDEBUG) && MACH_KDB)) +#define BUILD_STACK_FRAMES (GPROF) #if BUILD_STACK_FRAMES @@ -360,7 +359,7 @@ LEAF_ENTRY(hw_lock_init) /* - * void hw_lock_byte_init(uint8_t *) + * void hw_lock_byte_init(volatile uint8_t *) * * Initialize a hardware byte lock. */ @@ -454,7 +453,6 @@ LEAF_ENTRY(hw_lock_to) lfence rdtsc /* read cyclecount into %edx:%eax */ - lfence addl %ecx,%eax /* fetch and timeout */ adcl $0,%edx /* add carry */ mov %edx,%ecx @@ -464,7 +462,6 @@ LEAF_ENTRY(hw_lock_to) push %r9 lfence rdtsc /* read cyclecount into %edx:%eax */ - lfence shlq $32, %rdx orq %rdx, %rax /* load 64-bit quantity into %rax */ addq %rax, %rsi /* %rsi is the timeout expiry */ @@ -498,7 +495,6 @@ LEAF_ENTRY(hw_lock_to) mov %edx,%edi /* Save %edx */ lfence rdtsc /* cyclecount into %edx:%eax */ - lfence xchg %edx,%edi /* cyclecount into %edi:%eax */ cmpl %ecx,%edi /* compare high-order 32-bits */ jb 4b /* continue spinning if less, or */ @@ -510,7 +506,6 @@ LEAF_ENTRY(hw_lock_to) #else lfence rdtsc /* cyclecount into %edx:%eax */ - lfence shlq $32, %rdx orq %rdx, %rax /* load 64-bit quantity into %rax */ cmpq %rsi, %rax /* compare to timeout */ @@ -708,7 +703,7 @@ Entry(lck_rw_try_lock_shared) LOCKSTAT_LABEL(_lck_rw_try_lock_shared_lockstat_patch_point) ret /* Fall thru when patched, counting on lock pointer in LCK_RW_REGISTER */ - LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, LCK_RW_REGISTER) + LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, LCK_RW_REGISTER) #endif movl $1, %eax /* return TRUE */ ret @@ -784,7 +779,7 @@ Entry(lck_rw_lock_exclusive) LOCKSTAT_LABEL(_lck_rw_lock_exclusive_lockstat_patch_point) ret /* Fall thru when patched, counting on lock pointer in LCK_RW_REGISTER */ - LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, LCK_RW_REGISTER) + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, LCK_RW_REGISTER) #endif ret 2: @@ -828,7 +823,7 @@ Entry(lck_rw_try_lock_exclusive) LOCKSTAT_LABEL(_lck_rw_try_lock_exclusive_lockstat_patch_point) ret /* Fall thru when patched, counting on lock pointer in LCK_RW_REGISTER */ - LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, LCK_RW_REGISTER) + LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, 
LCK_RW_REGISTER) #endif movl $1, %eax /* return TRUE */ ret @@ -889,7 +884,7 @@ Entry(lck_rw_lock_shared_to_exclusive) LOCKSTAT_LABEL(_lck_rw_lock_shared_to_exclusive_lockstat_patch_point) ret /* Fall thru when patched, counting on lock pointer in LCK_RW_REGISTER */ - LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, LCK_RW_REGISTER) + LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, LCK_RW_REGISTER) #endif movl $1, %eax /* return success */ ret @@ -1419,14 +1414,14 @@ mutex_interlock_destroyed_str: * lck_mtx_convert_spin() */ NONLEAF_ENTRY(lck_mtx_lock_spin_always) - LOAD_LMTX_REG(B_ARG0) /* fetch lock pointer */ - jmp Llmls_avoid_check - + LOAD_LMTX_REG(B_ARG0) /* fetch lock pointer */ + jmp Llmls_avoid_check + NONLEAF_ENTRY(lck_mtx_lock_spin) LOAD_LMTX_REG(B_ARG0) /* fetch lock pointer */ CHECK_PREEMPTION_LEVEL() -Llmls_avoid_check: +Llmls_avoid_check: mov M_STATE(LMTX_REG), LMTX_C_REG32 test $(M_ILOCKED_MSK | M_MLOCKED_MSK), LMTX_C_REG32 /* is the interlock or mutex held */ jnz Llmls_slow @@ -2265,27 +2260,42 @@ LEAF_ENTRY(bit_unlock) * Atomic primitives, prototyped in kern/simple_lock.h */ LEAF_ENTRY(hw_atomic_add) +#if MACH_LDEBUG + test $3, %rdi + jz 1f + ud2 +1: +#endif movl %esi, %eax /* Load addend */ - lock - xaddl %eax, (%rdi) /* Atomic exchange and add */ + lock xaddl %eax, (%rdi) /* Atomic exchange and add */ addl %esi, %eax /* Calculate result */ LEAF_RET LEAF_ENTRY(hw_atomic_sub) +#if MACH_LDEBUG + test $3, %rdi + jz 1f + ud2 +1: +#endif negl %esi movl %esi, %eax - lock - xaddl %eax, (%rdi) /* Atomic exchange and add */ + lock xaddl %eax, (%rdi) /* Atomic exchange and add */ addl %esi, %eax /* Calculate result */ LEAF_RET LEAF_ENTRY(hw_atomic_or) +#if MACH_LDEBUG + test $3, %rdi + jz 1f + ud2 +1: +#endif movl (%rdi), %eax 1: movl %esi, %edx /* Load mask */ orl %eax, %edx - lock - cmpxchgl %edx, (%rdi) /* Atomic CAS */ + lock cmpxchgl %edx, (%rdi) /* Atomic CAS */ jne 1b movl %edx, %eax /* Result */ LEAF_RET @@ -2295,18 +2305,29 @@ LEAF_ENTRY(hw_atomic_or) */ LEAF_ENTRY(hw_atomic_or_noret) +#if MACH_LDEBUG + test $3, %rdi + jz 1f + ud2 +1: +#endif lock orl %esi, (%rdi) /* Atomic OR */ LEAF_RET LEAF_ENTRY(hw_atomic_and) +#if MACH_LDEBUG + test $3, %rdi + jz 1f + ud2 +1: +#endif movl (%rdi), %eax 1: movl %esi, %edx /* Load mask */ andl %eax, %edx - lock - cmpxchgl %edx, (%rdi) /* Atomic CAS */ + lock cmpxchgl %edx, (%rdi) /* Atomic CAS */ jne 1b movl %edx, %eax /* Result */ LEAF_RET @@ -2316,8 +2337,13 @@ LEAF_ENTRY(hw_atomic_and) */ LEAF_ENTRY(hw_atomic_and_noret) - lock - andl %esi, (%rdi) /* Atomic OR */ +#if MACH_LDEBUG + test $3, %rdi + jz 1f + ud2 +1: +#endif + lock andl %esi, (%rdi) /* Atomic AND */ LEAF_RET #endif /* !__i386__ */ diff --git a/osfmk/i386/i386_lowmem.h b/osfmk/i386/i386_lowmem.h index 1e571cd59..97fa06012 100644 --- a/osfmk/i386/i386_lowmem.h +++ b/osfmk/i386/i386_lowmem.h @@ -32,7 +32,9 @@ #ifdef __APPLE_API_PRIVATE -/* The kernel is linked at VM_MIN_KERNEL_ADDRESS + 0x100000 */ +/* + * The kernel better be statically linked at VM_MIN_KERNEL_ADDRESS + 0x100000 + */ #define I386_KERNEL_IMAGE_BASE_PAGE 0x100 #if defined(__i386__) diff --git a/osfmk/i386/i386_vm_init.c b/osfmk/i386/i386_vm_init.c index 866dfa1fb..9a9735c5c 100644 --- a/osfmk/i386/i386_vm_init.c +++ b/osfmk/i386/i386_vm_init.c @@ -55,7 +55,6 @@ */ #include -#include #include @@ -79,18 +78,33 @@ #include #include #include +#ifdef __x86_64__ +#include +#else #include +#endif #include #include #include + vm_size_t mem_size = 0; pmap_paddr_t first_avail = 0;/* first after page tables */
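/*
 * [Editor's sketch -- not part of the patch; hw_atomic_add_sketch is a
 * hypothetical name, the real entry points being the assembly routines
 * above.] The MACH_LDEBUG blocks added to the hw_atomic_* routines in
 * i386_lock.s execute "test $3, %rdi; jz 1f; ud2": if the target of a
 * lock-prefixed read-modify-write is not 4-byte aligned, a debug kernel
 * takes an invalid-opcode fault -- presumably to flag suspect pointers
 * and costly split-lock operations early. A C rendering of the guard,
 * using a compiler builtin in place of "lock xaddl; addl":
 */
#include <stdint.h>

static inline uint32_t
hw_atomic_add_sketch(volatile uint32_t *dest, uint32_t delt)
{
#if MACH_LDEBUG
	if (((uintptr_t)dest & 3) != 0)
		__builtin_trap();	/* plays the role of ud2 */
#endif
	/* Builtin standing in for the lock-prefixed exchange-and-add */
	return __sync_add_and_fetch(dest, delt);
}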
uint64_t max_mem; /* Size of physical memory (bytes), adjusted by maxmem */ uint64_t mem_actual; -uint64_t sane_size = 0; /* Memory size to use for defaults calculations */ +uint64_t sane_size = 0; /* Memory size for defaults calculations */ + +/* + * KASLR parameters + */ +ppnum_t vm_kernel_base_page; +vm_offset_t vm_kernel_base; +vm_offset_t vm_kernel_top; +vm_offset_t vm_kernel_stext; +vm_offset_t vm_kernel_etext; +vm_offset_t vm_kernel_slide; #define MAXLORESERVE (32 * 1024 * 1024) @@ -112,21 +126,23 @@ vm_offset_t virtual_avail, virtual_end; static pmap_paddr_t avail_remaining; vm_offset_t static_memory_end = 0; -vm_offset_t sHIB, eHIB, stext, etext, sdata, edata, end; +vm_offset_t sHIB, eHIB, stext, etext, sdata, edata, sconstdata, econstdata, end; /* * _mh_execute_header is the mach_header for the currently executing kernel */ -void *sectTEXTB; unsigned long sectSizeTEXT; -void *sectDATAB; unsigned long sectSizeDATA; -void *sectOBJCB; unsigned long sectSizeOBJC; -void *sectLINKB; unsigned long sectSizeLINK; -void *sectPRELINKB; unsigned long sectSizePRELINK; -void *sectHIBB; unsigned long sectSizeHIB; -void *sectINITPTB; unsigned long sectSizeINITPT; +vm_offset_t segTEXTB; unsigned long segSizeTEXT; +vm_offset_t segDATAB; unsigned long segSizeDATA; +vm_offset_t segLINKB; unsigned long segSizeLINK; +vm_offset_t segPRELINKB; unsigned long segSizePRELINK; +vm_offset_t segHIBB; unsigned long segSizeHIB; +vm_offset_t sectCONSTB; unsigned long sectSizeConst; -kernel_segment_command_t *segTEXT; -kernel_section_t *cursectTEXT, *lastsectTEXT; +boolean_t doconstro_override = FALSE; + +static kernel_segment_command_t *segTEXT, *segDATA; +static kernel_section_t *cursectTEXT, *lastsectTEXT; +static kernel_section_t *sectDCONST; extern uint64_t firmware_Conventional_bytes; extern uint64_t firmware_RuntimeServices_bytes; @@ -138,8 +154,19 @@ extern uint64_t firmware_Unusable_bytes; extern uint64_t firmware_other_bytes; uint64_t firmware_MMIO_bytes; +/* + * Linker magic to establish the highest address in the kernel. + * This is replicated from libsa which marks last_kernel_symbol + * but that's not visible from here in osfmk. + */ +__asm__(".zerofill __LAST, __last, _kernel_top, 0"); +extern void *kernel_top; + #if DEBUG #define PRINT_PMAP_MEMORY_TABLE +#define DBG(x...) kprintf(x) +#else +#define DBG(x...) #endif /* DEBUG */ /* * Basic VM initialization. @@ -164,64 +191,124 @@ i386_vm_init(uint64_t maxmem, uint32_t mbuf_reserve = 0; boolean_t mbuf_override = FALSE; boolean_t coalescing_permitted; -#if DEBUG - kprintf("Boot args revision: %d version: %d", - args->Revision, args->Version); - kprintf(" commandline: \""); - for(i=0; iCommandLine[i]); - kprintf("\"\n"); -#endif + vm_kernel_base_page = i386_btop(args->kaddr); +#ifdef __x86_64__ + vm_offset_t base_address; + vm_offset_t static_base_address; + + /* + * Establish the KASLR parameters. + */ + static_base_address = ml_static_ptovirt(KERNEL_BASE_OFFSET); + base_address = ml_static_ptovirt(args->kaddr); + vm_kernel_slide = base_address - static_base_address; + if (args->kslide) { + kprintf("KASLR slide: 0x%016lx dynamic\n", vm_kernel_slide); + if (vm_kernel_slide != ((vm_offset_t)args->kslide)) + panic("Kernel base inconsistent with slide - rebased?"); + } else { + /* No slide relative to on-disk symbols */ + kprintf("KASLR slide: 0x%016lx static and ignored\n", + vm_kernel_slide); + vm_kernel_slide = 0; + } + /* + * Zero out local relocations to avoid confusing kxld. 
+ * TODO: might be better to move this code to OSKext::initialize + */ + if (_mh_execute_header.flags & MH_PIE) { + struct load_command *loadcmd; + uint32_t cmd; + + loadcmd = (struct load_command *)((uintptr_t)&_mh_execute_header + + sizeof (_mh_execute_header)); + + for (cmd = 0; cmd < _mh_execute_header.ncmds; cmd++) { + if (loadcmd->cmd == LC_DYSYMTAB) { + struct dysymtab_command *dysymtab; + + dysymtab = (struct dysymtab_command *)loadcmd; + dysymtab->nlocrel = 0; + dysymtab->locreloff = 0; + kprintf("Hiding local relocations\n"); + break; + } + loadcmd = (struct load_command *)((uintptr_t)loadcmd + loadcmd->cmdsize); + } + } + +#endif // __x86_64__ + /* * Now retrieve addresses for end, edata, and etext * from MACH-O headers. */ - - sectTEXTB = (void *) getsegdatafromheader( - &_mh_execute_header, "__TEXT", §SizeTEXT); - sectDATAB = (void *) getsegdatafromheader( - &_mh_execute_header, "__DATA", §SizeDATA); - sectOBJCB = (void *) getsegdatafromheader( - &_mh_execute_header, "__OBJC", §SizeOBJC); - sectLINKB = (void *) getsegdatafromheader( - &_mh_execute_header, "__LINKEDIT", §SizeLINK); - sectHIBB = (void *)getsegdatafromheader( - &_mh_execute_header, "__HIB", §SizeHIB); - sectINITPTB = (void *)getsegdatafromheader( - &_mh_execute_header, "__INITPT", §SizeINITPT); - sectPRELINKB = (void *) getsegdatafromheader( - &_mh_execute_header, "__PRELINK_TEXT", §SizePRELINK); - - segTEXT = getsegbynamefromheader(&_mh_execute_header, "__TEXT"); + segTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, + "__TEXT", &segSizeTEXT); + segDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, + "__DATA", &segSizeDATA); + segLINKB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, + "__LINKEDIT", &segSizeLINK); + segHIBB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, + "__HIB", &segSizeHIB); + segPRELINKB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, + "__PRELINK_TEXT", &segSizePRELINK); + segTEXT = getsegbynamefromheader(&_mh_execute_header, + "__TEXT"); + segDATA = getsegbynamefromheader(&_mh_execute_header, + "__DATA"); + sectDCONST = getsectbynamefromheader(&_mh_execute_header, + "__DATA", "__const"); cursectTEXT = lastsectTEXT = firstsect(segTEXT); /* Discover the last TEXT section within the TEXT segment */ while ((cursectTEXT = nextsect(segTEXT, cursectTEXT)) != NULL) { lastsectTEXT = cursectTEXT; } - sHIB = (vm_offset_t) sectHIBB; - eHIB = (vm_offset_t) sectHIBB + sectSizeHIB; + sHIB = segHIBB; + eHIB = segHIBB + segSizeHIB; /* Zero-padded from ehib to stext if text is 2M-aligned */ - stext = (vm_offset_t) sectTEXTB; + stext = segTEXTB; +#ifdef __x86_64__ + lowGlo.lgStext = stext; +#endif etext = (vm_offset_t) round_page_64(lastsectTEXT->addr + lastsectTEXT->size); /* Zero-padded from etext to sdata if text is 2M-aligned */ - sdata = (vm_offset_t) sectDATAB; - edata = (vm_offset_t) sectDATAB + sectSizeDATA; - -#if DEBUG - kprintf("sectTEXTB = %p\n", sectTEXTB); - kprintf("sectDATAB = %p\n", sectDATAB); - kprintf("sectOBJCB = %p\n", sectOBJCB); - kprintf("sectLINKB = %p\n", sectLINKB); - kprintf("sectHIBB = %p\n", sectHIBB); - kprintf("sectPRELINKB = %p\n", sectPRELINKB); - kprintf("eHIB = %p\n", (void *) eHIB); - kprintf("stext = %p\n", (void *) stext); - kprintf("etext = %p\n", (void *) etext); - kprintf("sdata = %p\n", (void *) sdata); - kprintf("edata = %p\n", (void *) edata); -#endif + sdata = segDATAB; + edata = segDATAB + segSizeDATA; + + sectCONSTB = (vm_offset_t) sectDCONST->addr; + sectSizeConst = sectDCONST->size; + sconstdata = 
sectCONSTB; + econstdata = sectCONSTB + sectSizeConst; + + if (sectSizeConst & PAGE_MASK) { + kernel_section_t *ns = nextsect(segDATA, sectDCONST); + if (ns && !(ns->addr & PAGE_MASK)) + doconstro_override = TRUE; + } else + doconstro_override = TRUE; + + DBG("segTEXTB = %p\n", (void *) segTEXTB); + DBG("segDATAB = %p\n", (void *) segDATAB); + DBG("segLINKB = %p\n", (void *) segLINKB); + DBG("segHIBB = %p\n", (void *) segHIBB); + DBG("segPRELINKB = %p\n", (void *) segPRELINKB); + DBG("sHIB = %p\n", (void *) sHIB); + DBG("eHIB = %p\n", (void *) eHIB); + DBG("stext = %p\n", (void *) stext); + DBG("etext = %p\n", (void *) etext); + DBG("sdata = %p\n", (void *) sdata); + DBG("edata = %p\n", (void *) edata); + DBG("sconstdata = %p\n", (void *) sconstdata); + DBG("econstdata = %p\n", (void *) econstdata); + DBG("kernel_top = %p\n", (void *) &kernel_top); + + vm_kernel_base = sHIB; + vm_kernel_top = (vm_offset_t) &kernel_top; + vm_kernel_stext = stext; + vm_kernel_etext = etext; vm_set_page_size(); @@ -328,10 +415,10 @@ i386_vm_init(uint64_t maxmem, break; } -#if DEBUG - kprintf("EFI region %d: type %u/%d, base 0x%x, top 0x%x\n", - i, mptr->Type, pmap_type, base, top); -#endif + DBG("EFI region %d: type %u/%d, base 0x%x, top 0x%x %s\n", + i, mptr->Type, pmap_type, base, top, + (mptr->Attribute&EFI_MEMORY_KERN_RESERVED)? "RESERVED" : + (mptr->Attribute&EFI_MEMORY_RUNTIME)? "RUNTIME" : ""); if (maxpg) { if (base >= maxpg) @@ -384,7 +471,7 @@ i386_vm_init(uint64_t maxmem, if ((mptr->Attribute & EFI_MEMORY_KERN_RESERVED) && - (top < I386_KERNEL_IMAGE_BASE_PAGE)) { + (top < vm_kernel_base_page)) { pmptr->alloc = pmptr->base; pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count; } @@ -518,7 +605,7 @@ i386_vm_init(uint64_t maxmem, if ( (maxmem > (uint64_t)first_avail) && (maxmem < sane_size)) { ppnum_t discarded_pages = (ppnum_t)((sane_size - maxmem) >> I386_PGSHIFT); ppnum_t highest_pn = 0; - ppnum_t cur_alloc = 0; + ppnum_t cur_end = 0; uint64_t pages_to_use; unsigned cur_region = 0; @@ -532,15 +619,15 @@ i386_vm_init(uint64_t maxmem, pages_to_use = avail_remaining; while (cur_region < pmap_memory_region_count && pages_to_use) { - for (cur_alloc = pmap_memory_regions[cur_region].alloc; - cur_alloc < pmap_memory_regions[cur_region].end && pages_to_use; - cur_alloc++) { - if (cur_alloc > highest_pn) - highest_pn = cur_alloc; + for (cur_end = pmap_memory_regions[cur_region].base; + cur_end < pmap_memory_regions[cur_region].end && pages_to_use; + cur_end++) { + if (cur_end > highest_pn) + highest_pn = cur_end; pages_to_use--; } if (pages_to_use == 0) - pmap_memory_regions[cur_region].end = cur_alloc; + pmap_memory_regions[cur_region].end = cur_end; cur_region++; } diff --git a/osfmk/i386/idle_pt.c b/osfmk/i386/idle_pt.c index 663375acf..4110c212e 100644 --- a/osfmk/i386/idle_pt.c +++ b/osfmk/i386/idle_pt.c @@ -28,28 +28,18 @@ #include #define PML4_PROT (INTEL_PTE_VALID | INTEL_PTE_WRITE) -pml4_entry_t IdlePML4[PTE_PER_PAGE] __attribute__((section("__INITPT, __data"))) = { -#ifdef __x86_64__ - [ 0] - = ((uint64_t)(INITPT_SEG_BASE + PAGE_SIZE) | PML4_PROT), -#if KERNEL_PML4_INDEX != 0 - [KERNEL_PML4_INDEX] - = ((uint64_t)(INITPT_SEG_BASE + PAGE_SIZE) | PML4_PROT), -#endif -#endif - }; +pml4_entry_t IdlePML4[PTE_PER_PAGE] + __attribute__((section("__INITPT, __data"))) = { +}; -#if defined(__x86_64__) -#define PDPT_PROT (INTEL_PTE_VALID | INTEL_PTE_WRITE) -#elif defined(__i386__) #define PDPT_PROT (INTEL_PTE_VALID) -#endif -pdpt_entry_t IdlePDPT[PTE_PER_PAGE] 
__attribute__((section("__INITPT, __data"))) = { - [0] = ((uint64_t)(INITPT_SEG_BASE + 2*PAGE_SIZE) | PDPT_PROT), - [1] = ((uint64_t)(INITPT_SEG_BASE + 3*PAGE_SIZE) | PDPT_PROT), - [2] = ((uint64_t)(INITPT_SEG_BASE + 4*PAGE_SIZE) | PDPT_PROT), - [3] = ((uint64_t)(INITPT_SEG_BASE + 5*PAGE_SIZE) | PDPT_PROT), - }; +pdpt_entry_t IdlePDPT[PTE_PER_PAGE] + __attribute__((section("__INITPT, __data"))) = { + [0] = ((uint64_t)(INITPT_SEG_BASE + 2*PAGE_SIZE) | PDPT_PROT), + [1] = ((uint64_t)(INITPT_SEG_BASE + 3*PAGE_SIZE) | PDPT_PROT), + [2] = ((uint64_t)(INITPT_SEG_BASE + 4*PAGE_SIZE) | PDPT_PROT), + [3] = ((uint64_t)(INITPT_SEG_BASE + 5*PAGE_SIZE) | PDPT_PROT), +}; #if NPGPTD != 4 #error Please update idle_pt.c to reflect the new value of NPGPTD @@ -74,7 +64,8 @@ pdpt_entry_t IdlePDPT[PTE_PER_PAGE] __attribute__((section("__INITPT, __data"))) #define FOR_0_TO_2047(x) L11(x,2047) -pd_entry_t BootstrapPTD[2048] __attribute__((section("__INITPT, __data"))) = { +pd_entry_t BootPTD[2048] + __attribute__((section("__INITPT, __data"))) = { FOR_0_TO_2047(ID_MAP_2MEG) }; #endif /* MACHINE_BOOTSTRAPPTD */ diff --git a/osfmk/i386/idt.s b/osfmk/i386/idt.s index 362b783a4..d56556364 100644 --- a/osfmk/i386/idt.s +++ b/osfmk/i386/idt.s @@ -57,7 +57,6 @@ */ #include #include -#include #include #include #include @@ -76,8 +75,6 @@ #define LO_UNIX_SCALL EXT(lo_unix_scall32) #define LO_MACH_SCALL EXT(lo_mach_scall32) #define LO_MDEP_SCALL EXT(lo_mdep_scall32) -#define LO_DIAG_SCALL EXT(lo_diag_scall32) - #define HI_DATA(lo_addr) ( (EXT(lo_addr) - EXT(hi_remap_data)) + HIGH_IDT_BASE ) #define HI_TEXT(lo_text) ( (EXT(lo_text) - EXT(hi_remap_text)) + HIGH_MEM_BASE ) @@ -155,8 +152,6 @@ Entry(name) ;\ * Extra-special interrupt code. Note that no offset may be * specified in a task gate descriptor, so name is ignored. */ -#define EXCEP_TASK(n,name) \ - IDT_BASE_ENTRY_TG(0,DEBUG_TSS,K_TASK_GATE) /* Double-fault fatal handler */ #define DF_FATAL_TASK(n,name) \ @@ -208,19 +203,11 @@ EXCEP_USR(0x04,t_into) EXCEP_USR(0x05,t_bounds) EXCEPTION(0x06,t_invop) EXCEPTION(0x07,t_nofpu) -#if MACH_KDB -EXCEP_TASK(0x08,db_task_dbl_fault) -#else DF_FATAL_TASK(0x08,df_task_start) -#endif EXCEPTION(0x09,a_fpu_over) EXCEPTION(0x0a,a_inv_tss) EXCEP_SPC(0x0b,hi_segnp) -#if MACH_KDB -EXCEP_TASK(0x0c,db_task_stk_fault) -#else EXCEP_ERR(0x0c,t_stack_fault) -#endif EXCEP_SPC(0x0d,hi_gen_prot) EXCEP_SPC(0x0e,hi_page_fault) EXCEPTION(0x0f,t_trap_0f) @@ -346,8 +333,7 @@ EXCEP_USR(0x7f, t_dtrace_ret) EXCEP_SPC_USR(0x80,hi_unix_scall) EXCEP_SPC_USR(0x81,hi_mach_scall) EXCEP_SPC_USR(0x82,hi_mdep_scall) -EXCEP_SPC_USR(0x83,hi_diag_scall) - +INTERRUPT(0x83) INTERRUPT(0x84) INTERRUPT(0x85) INTERRUPT(0x86) @@ -606,14 +592,6 @@ Entry(hi_mdep_scall) jmp enter_lohandler -Entry(hi_diag_scall) - pushl %eax // Save sselector - pushl $0 // Clear trap number slot - pusha // save the general registers - movl $(LO_DIAG_SCALL),%ebx // Get the function down low to transfer to - jmp enter_lohandler // Leap to it... 
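/*
 * [Editor's note -- commentary, not part of the patch.] This hunk and the
 * related ones above retire the 32-bit diagnostics syscall: the IDT entry
 * for vector 0x83 changes from EXCEP_SPC_USR(0x83,hi_diag_scall) to a
 * plain INTERRUPT(0x83), and the hi_diag_scall/lo_diag_scall32 glue that
 * funneled the trap into diagCall() is deleted. A user-mode invocation,
 * which was roughly
 *
 *	movl	$selector, %eax		# diag selector (sketch; per the
 *					# removed code, %eax carried it)
 *	int	$0x83
 *
 * now lands in the common interrupt path instead of the diagnostics
 * handler. The 64-bit idt64.s counterpart is removed further below.
 */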
- - /* * sysenter entry point * Requires user code to set up: @@ -1222,32 +1200,6 @@ Entry(lo_mdep_scall32) */ -Entry(lo_diag_scall32) - TIME_TRAP_UENTRY - - movl %gs:CPU_KERNEL_STACK,%edi - xchgl %edi,%esp /* switch to kernel stack */ - movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ - movl TH_TASK(%ecx),%ebx /* point to current task */ - - /* Check for active vtimers in the current task */ - TASK_VTIMER_CHECK(%ebx, %ecx) - - pushl %edi /* push pbc stack for later */ - - CCALL1(diagCall, %edi) // Call diagnostics - - cli // Disable interruptions just in case - popl %esp // Get back the original stack - cmpl $0,%eax // What kind of return is this? - jne EXT(return_to_user) // Normal return, do not check asts... - - CCALL5(i386_exception, $EXC_SYSCALL, $0x6000, $0, $1, $0) - // pass what would be the diag syscall - // error return - cause an exception - /* no return */ - - LEXT(return_to_user) TIME_TRAP_UEXIT jmp ret_to_user @@ -1267,69 +1219,3 @@ Entry(df_task_start) Entry(mc_task_start) CCALL1(panic_machine_check32, $(T_MACHINE_CHECK)) hlt - -#if MACH_KDB -#include -#define CX(addr,reg) addr(,reg,4) -#if 0 -/* - * Note that the per-fault entry points are not currently - * functional. The only way to make them work would be to - * set up separate TSS's for each fault type, which doesn't - * currently seem worthwhile. (The offset part of a task - * gate is always ignored.) So all faults that task switch - * currently resume at db_task_start. - */ -/* - * Double fault (Murphy's point) - error code (0) on stack - */ -Entry(db_task_dbl_fault) - popl %eax - movl $(T_DOUBLE_FAULT),%ebx - jmp db_task_start -/* - * Segment not present - error code on stack - */ -Entry(db_task_seg_np) - popl %eax - movl $(T_SEGMENT_NOT_PRESENT),%ebx - jmp db_task_start -/* - * Stack fault - error code on (current) stack - */ -Entry(db_task_stk_fault) - popl %eax - movl $(T_STACK_FAULT),%ebx - jmp db_task_start -/* - * General protection fault - error code on stack - */ -Entry(db_task_gen_prot) - popl %eax - movl $(T_GENERAL_PROTECTION),%ebx - jmp db_task_start -#endif /* 0 */ -/* - * The entry point where execution resumes after last-ditch debugger task - * switch. - */ -Entry(db_task_start) - movl %esp,%edx - subl $(ISS32_SIZE),%edx - movl %edx,%esp /* allocate x86_saved_state on stack */ - movl %eax,R32_ERR(%esp) - movl %ebx,R32_TRAPNO(%esp) - pushl %edx - CPU_NUMBER(%edx) - movl CX(EXT(master_dbtss),%edx),%edx - movl TSS_LINK(%edx),%eax - pushl %eax /* pass along selector of previous TSS */ - call EXT(db_tss_to_frame) - popl %eax /* get rid of TSS selector */ - call EXT(db_trap_from_asm) - addl $0x4,%esp - /* - * And now...? - */ - iret /* ha, ha, ha... 
*/ -#endif /* MACH_KDB */ diff --git a/osfmk/i386/idt64.s b/osfmk/i386/idt64.s index 4d91cb82f..fd488ebd9 100644 --- a/osfmk/i386/idt64.s +++ b/osfmk/i386/idt64.s @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -48,7 +47,6 @@ #define LO_UNIX_SCALL EXT(lo_unix_scall) #define LO_MACH_SCALL EXT(lo_mach_scall) #define LO_MDEP_SCALL EXT(lo_mdep_scall) -#define LO_DIAG_SCALL EXT(lo_diag_scall) #define LO_DOUBLE_FAULT EXT(lo_df64) #define LO_MACHINE_CHECK EXT(lo_mc64) @@ -162,19 +160,11 @@ EXCEP64_USR(0x04,t64_into) EXCEP64_USR(0x05,t64_bounds) EXCEPTION64(0x06,t64_invop) EXCEPTION64(0x07,t64_nofpu) -#if MACH_KDB -EXCEP64_IST(0x08,db_task_dbl_fault64,1) -#else EXCEP64_IST(0x08,hi64_double_fault,1) -#endif EXCEPTION64(0x09,a64_fpu_over) EXCEPTION64(0x0a,a64_inv_tss) EXCEP64_SPC(0x0b,hi64_segnp) -#if MACH_KDB -EXCEP64_IST(0x0c,db_task_stk_fault64,1) -#else EXCEP64_SPC(0x0c,hi64_stack_fault) -#endif EXCEP64_SPC(0x0d,hi64_gen_prot) EXCEP64_SPC(0x0e, hi64_page_fault) EXCEPTION64(0x0f,t64_trap_0f) @@ -300,8 +290,7 @@ EXCEP64_USR(0x7f, t64_dtrace_ret) EXCEP64_SPC_USR(0x80,hi64_unix_scall) EXCEP64_SPC_USR(0x81,hi64_mach_scall) EXCEP64_SPC_USR(0x82,hi64_mdep_scall) -EXCEP64_SPC_USR(0x83,hi64_diag_scall) - +INTERRUPT64(0x83) INTERRUPT64(0x84) INTERRUPT64(0x85) INTERRUPT64(0x86) @@ -616,7 +605,7 @@ EXT(ret32_set_gs): add $(ISC32_OFFSET)+8+8+8, %rsp /* pop compat frame + trapno, trapfn and error */ - cmp $(SYSENTER_CS),ISF64_CS-8-8-8(%rsp) + cmpl $(SYSENTER_CS),ISF64_CS-8-8-8(%rsp) /* test for fast entry/exit */ je L_fast_exit EXT(ret32_iret): @@ -630,7 +619,7 @@ L_fast_exit: pop %rcx /* user return esp */ .code32 sti /* interrupts enabled after sysexit */ - sysexit /* 32-bit sysexit */ + .byte 0x0f,0x35 /* 32-bit sysexit */ .code64 L_64bit_return: @@ -731,14 +720,6 @@ L_mdep_scall_continue: jmp L_32bit_enter_check -Entry(hi64_diag_scall) - swapgs /* switch to kernel gs (cpu_data) */ -L_diag_scall_continue: - push %rax /* save system call number */ - push $(LO_DIAG_SCALL) - push $(DIAG_INT) - jmp L_32bit_enter_check - Entry(hi64_syscall) swapgs /* Kapow! get per-cpu data area */ L_syscall_continue: @@ -1605,34 +1586,6 @@ Entry(lo_mdep_scall) * always returns through thread_exception_return */ - -Entry(lo_diag_scall) - TIME_TRAP_UENTRY - - movl %gs:CPU_KERNEL_STACK,%edi - xchgl %edi,%esp /* switch to kernel stack */ - movl %gs:CPU_ACTIVE_THREAD,%ecx /* get current thread */ - movl TH_TASK(%ecx),%ebx /* point to current task */ - - /* Check for active vtimers in the current task */ - TASK_VTIMER_CHECK(%ebx, %ecx) - - pushl %edi /* push pbc stack for later */ - - CCALL1(diagCall, %edi) // Call diagnostics - - cli // Disable interruptions just in case - cmpl $0,%eax // What kind of return is this? - je 1f // - branch if bad (zero) - popl %esp // Get back the original stack - jmp return_to_user // Normal return, do not check asts... 
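/*
 * [Editor's note -- commentary, not part of the patch.] Two small changes
 * in the idt64.s hunks above deserve a gloss. ".byte 0x0f,0x35" is the raw
 * two-byte encoding of SYSEXIT; hand-encoding it presumably sidesteps
 * assembler handling of the mnemonic inside a .code32 island of a 64-bit
 * translation unit, where it could be rejected or given a different
 * operand-size form. Likewise "cmp" becomes "cmpl" because an immediate
 * compared against a memory operand implies no operand size, so the
 * suffix pins the comparison at 32 bits.
 */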
-1: - CCALL5(i386_exception, $EXC_SYSCALL, $0x6000, $0, $1, $0) - // pass what would be the diag syscall - // error return - cause an exception - /* no return */ - - return_to_user: TIME_TRAP_UEXIT jmp ret_to_user diff --git a/osfmk/i386/ktss.c b/osfmk/i386/ktss.c index 973fc395a..a0275828c 100644 --- a/osfmk/i386/ktss.c +++ b/osfmk/i386/ktss.c @@ -64,7 +64,6 @@ */ #include #include -#include #ifdef __i386__ struct i386_tss master_ktss @@ -194,41 +193,4 @@ struct i386_tss master_mctss so no bitmap */ }; -#if MACH_KDB - -struct i386_tss master_dbtss - __attribute__ ((section ("__DESC, master_dbtss"))) - __attribute__ ((aligned (4096))) = { - 0, /* back link */ - 0, /* esp0 */ - KERNEL_DS, /* ss0 */ - 0, /* esp1 */ - 0, /* ss1 */ - 0, /* esp2 */ - 0, /* ss2 */ - (int) IdlePDPT, /* cr3 */ - 0, /* eip */ - 0, /* eflags */ - 0, /* eax */ - 0, /* ecx */ - 0, /* edx */ - 0, /* ebx */ - 0, /* esp */ - 0, /* ebp */ - 0, /* esi */ - 0, /* edi */ - KERNEL_DS, /* es */ - KERNEL32_CS, /* cs */ - KERNEL_DS, /* ss */ - KERNEL_DS, /* ds */ - KERNEL_DS, /* fs */ - KERNEL_DS, /* gs */ - KERNEL_LDT, /* ldt */ - 0, /* trace_trap */ - 0x0FFF /* IO bitmap offset - - beyond end of TSS segment, - so no bitmap */ -}; - -#endif /* MACH_KDB */ -#endif +#endif /* __i386__ */ diff --git a/osfmk/i386/lapic.h b/osfmk/i386/lapic.h index 9d6f53abb..378f4d7eb 100644 --- a/osfmk/i386/lapic.h +++ b/osfmk/i386/lapic.h @@ -215,8 +215,8 @@ typedef uint32_t lapic_timer_count_t; */ #define LAPIC_PERFCNT_INTERRUPT 0xF -#define LAPIC_TIMER_INTERRUPT 0xE -#define LAPIC_INTERPROCESSOR_INTERRUPT 0xD +#define LAPIC_INTERPROCESSOR_INTERRUPT 0xE +#define LAPIC_TIMER_INTERRUPT 0xD #define LAPIC_THERMAL_INTERRUPT 0xC #define LAPIC_ERROR_INTERRUPT 0xB #define LAPIC_SPURIOUS_INTERRUPT 0xA diff --git a/osfmk/i386/lapic_native.c b/osfmk/i386/lapic_native.c index 7142be269..3e6991974 100644 --- a/osfmk/i386/lapic_native.c +++ b/osfmk/i386/lapic_native.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2010 Apple Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -60,10 +60,6 @@ #include #endif -#if MACH_KDB -#include -#endif - #include #if MP_DEBUG @@ -132,13 +128,19 @@ legacy_init(void) panic("legacy_init: vm_map_find_entry FAILED (err=%d)", result); } vm_map_unlock(kernel_map); -/* Map in the local APIC non-cacheable, as recommended by Intel - * in section 8.4.1 of the "System Programming Guide". - */ + + /* + * Map in the local APIC non-cacheable, as recommended by Intel + * in section 8.4.1 of the "System Programming Guide". + * In fact, this is redundant because EFI will have assigned an + * MTRR physical range containing the local APIC's MMIO space as + * UC and this will override the default PAT setting. + */ pmap_enter(pmap_kernel(), lapic_vbase, (ppnum_t) i386_btop(lapic_pbase), VM_PROT_READ|VM_PROT_WRITE, + VM_PROT_NONE, VM_WIMG_IO, TRUE); } @@ -258,7 +260,6 @@ static const char *TMR_str[] = { "Periodic", "TSC-Deadline", "Illegal" - "Illegal" }; void @@ -359,26 +360,6 @@ lapic_dump(void) kprintf("\n"); } -#if MACH_KDB -/* - * Displays apic junk - * - * da - */ -void -db_apic(__unused db_expr_t addr, - __unused int have_addr, - __unused db_expr_t count, - __unused char *modif) -{ - - lapic_dump(); - - return; -} - -#endif - boolean_t lapic_probe(void) { @@ -550,7 +531,6 @@ lapic_config_timer( /* * Configure TSC-deadline timer mode. The lapic interrupt is always unmasked. 
*/ -__private_extern__ void lapic_config_tsc_deadline_timer(void) { @@ -582,7 +562,6 @@ lapic_set_timer_fast( LAPIC_WRITE(TIMER_INITIAL_COUNT, initial_count); } -__private_extern__ void lapic_set_tsc_deadline_timer(uint64_t deadline) { @@ -590,7 +569,6 @@ lapic_set_tsc_deadline_timer(uint64_t deadline) wrmsr64(MSR_IA32_TSC_DEADLINE, deadline); } -__private_extern__ uint64_t lapic_get_tsc_deadline_timer(void) { @@ -917,3 +895,4 @@ lapic_disable_timer(void) lvt_timer = LAPIC_READ(LVT_TIMER); } } + diff --git a/osfmk/i386/locks.h b/osfmk/i386/locks.h index a0409d257..065dfecea 100644 --- a/osfmk/i386/locks.h +++ b/osfmk/i386/locks.h @@ -123,9 +123,9 @@ extern void lck_mtx_unlock_wakeup_x86(lck_mtx_t *mutex, int prior_lock_state); extern void lck_mtx_lock_mark_destroyed(lck_mtx_t *mutex); extern int lck_mtx_lock_grab_mutex(lck_mtx_t *mutex); -extern void hw_lock_byte_init(uint8_t *lock_byte); -extern void hw_lock_byte_lock(uint8_t *lock_byte); -extern void hw_lock_byte_unlock(uint8_t *lock_byte); +extern void hw_lock_byte_init(volatile uint8_t *lock_byte); +extern void hw_lock_byte_lock(volatile uint8_t *lock_byte); +extern void hw_lock_byte_unlock(volatile uint8_t *lock_byte); typedef struct { unsigned int type; @@ -191,7 +191,7 @@ typedef struct __lck_mtx_ext_t__ lck_mtx_ext_t; #pragma pack(1) /* Make sure the structure stays as we defined it */ typedef struct _lck_rw_t_internal_ { volatile uint16_t lck_rw_shared_count; /* No. of accepted readers */ - uint8_t lck_rw_interlock; /* Interlock byte */ + volatile uint8_t lck_rw_interlock; /* Interlock byte */ volatile uint8_t lck_rw_priv_excl:1, /* Writers prioritized if set */ lck_rw_want_upgrade:1, /* Read-to-write upgrade waiting */ diff --git a/osfmk/i386/locks_i386.c b/osfmk/i386/locks_i386.c index 048dc704d..a274e0e40 100644 --- a/osfmk/i386/locks_i386.c +++ b/osfmk/i386/locks_i386.c @@ -61,7 +61,6 @@ * Locking primitives implementation */ -#include #include #include @@ -77,12 +76,6 @@ #include #include -#if MACH_KDB -#include -#include -#include -#include -#endif /* MACH_KDB */ #include /* machine_timeout_suspended() */ #include #include @@ -123,12 +116,6 @@ unsigned int LcksOpts=0; /* Forwards */ -#if MACH_KDB -void db_print_simple_lock( - simple_lock_t addr); -#endif /* MACH_KDB */ - - #if USLOCK_DEBUG /* * Perform simple lock checks. @@ -2102,66 +2089,3 @@ lck_mtx_lock_wait_x86 ( } #endif } - - -#if MACH_KDB - -void -db_show_one_lock( - lock_t *lock) -{ - db_printf("Read_count = 0x%x, %swant_upgrade, %swant_write, ", - lock->lck_rw_shared_count, - lock->lck_rw_want_upgrade ? "" : "!", - lock->lck_rw_want_write ? "" : "!"); - db_printf("%swaiting, %scan_sleep\n", - (lock->lck_r_waiting || lock->lck_w_waiting) ? "" : "!", - lock->lck_rw_can_sleep ? "" : "!"); - db_printf("Interlock:\n"); - db_show_one_simple_lock((db_expr_t) ((vm_offset_t)simple_lock_addr(lock->lck_rw_interlock)), - TRUE, (db_expr_t)0, (char *)0); -} - -/* - * Routines to print out simple_locks and mutexes in a nicely-formatted - * fashion. 
- */ - -const char *simple_lock_labels = "ENTRY ILK THREAD DURATION CALLER"; - -void -db_show_one_simple_lock ( - db_expr_t addr, - boolean_t have_addr, - __unused db_expr_t count, - __unused char * modif) -{ - simple_lock_t saddr = (simple_lock_t) ((vm_offset_t) addr); - - if (saddr == (simple_lock_t)0 || !have_addr) { - db_error ("No simple_lock\n"); - } -#if USLOCK_DEBUG - else if (saddr->lock_type != USLOCK_TAG) - db_error ("Not a simple_lock\n"); -#endif /* USLOCK_DEBUG */ - - db_printf ("%s\n", simple_lock_labels); - db_print_simple_lock (saddr); -} - -void -db_print_simple_lock ( - simple_lock_t addr) -{ - - db_printf ("%08x %3d", addr, *hw_lock_addr(addr->interlock)); -#if USLOCK_DEBUG - db_printf (" %08x", addr->debug.lock_thread); - db_printf (" %08x ", addr->debug.duration[1]); - db_printsym ((int)addr->debug.lock_pc, DB_STGY_ANY); -#endif /* USLOCK_DEBUG */ - db_printf ("\n"); -} - -#endif /* MACH_KDB */ diff --git a/osfmk/i386/locore.s b/osfmk/i386/locore.s index 6e8e3d3a2..911439764 100644 --- a/osfmk/i386/locore.s +++ b/osfmk/i386/locore.s @@ -56,10 +56,7 @@ #include #include -#include -#include #include -#include #include #include @@ -391,18 +388,6 @@ rdmsr_fail: RECOVERY_SECTION RECOVER_TABLE_END - .data -dr_msk: - .long ~0x000f0003 - .long ~0x00f0000c - .long ~0x0f000030 - .long ~0xf00000c0 -ENTRY(dr_addr) - .long 0,0,0,0 - .long 0,0,0,0 - - .text - /* * ffs(mask) */ diff --git a/osfmk/i386/loose_ends.c b/osfmk/i386/loose_ends.c index ee59d599a..fe0a6aabd 100644 --- a/osfmk/i386/loose_ends.c +++ b/osfmk/i386/loose_ends.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -641,7 +641,7 @@ flush_dcache64(addr64_t addr, unsigned count, int phys) dcache_incoherent_io_flush64(addr, count); } else { - uint32_t linesize = cpuid_info()->cache_linesize; + uint64_t linesize = cpuid_info()->cache_linesize; addr64_t bound = (addr + count + linesize - 1) & ~(linesize - 1); __mfence(); while (addr < bound) { @@ -710,6 +710,20 @@ kdp_register_callout(void) } #endif +/* + * Return a uniformly distributed 64-bit random number. + * + * This interface should have minimal dependencies on kernel + * services, and thus be available very early in the life + * of the kernel. But as a result, it may not be very random + * on all platforms. 
+ */ +uint64_t +early_random(void) +{ + return (ml_early_random()); +} + #if !CONFIG_VMX int host_vmxon(boolean_t exclusive __unused) { diff --git a/osfmk/i386/lowmem_vectors.s b/osfmk/i386/lowmem_vectors.s index 45b74dd10..c1b8a2e18 100644 --- a/osfmk/i386/lowmem_vectors.s +++ b/osfmk/i386/lowmem_vectors.s @@ -56,7 +56,6 @@ */ #include -#include #include #include diff --git a/osfmk/i386/machdep_call.c b/osfmk/i386/machdep_call.c index 9152b8741..454eb1b02 100644 --- a/osfmk/i386/machdep_call.c +++ b/osfmk/i386/machdep_call.c @@ -42,7 +42,7 @@ extern kern_return_t kern_invalid(void); -machdep_call_t machdep_call_table[] = { +const machdep_call_t machdep_call_table[] = { MACHDEP_CALL_ROUTINE(kern_invalid,0), MACHDEP_CALL_ROUTINE(kern_invalid,0), MACHDEP_CALL_ROUTINE(kern_invalid,0), @@ -51,7 +51,7 @@ machdep_call_t machdep_call_table[] = { MACHDEP_BSD_CALL_ROUTINE(i386_set_ldt,3), MACHDEP_BSD_CALL_ROUTINE(i386_get_ldt,3), }; -machdep_call_t machdep_call_table64[] = { +const machdep_call_t machdep_call_table64[] = { MACHDEP_CALL_ROUTINE(kern_invalid,0), MACHDEP_CALL_ROUTINE(kern_invalid,0), MACHDEP_CALL_ROUTINE(kern_invalid,0), diff --git a/osfmk/i386/machdep_call.h b/osfmk/i386/machdep_call.h index 63cbf08cb..3b6d9fbe9 100644 --- a/osfmk/i386/machdep_call.h +++ b/osfmk/i386/machdep_call.h @@ -63,8 +63,8 @@ typedef struct { int bsd_style; } machdep_call_t; -extern machdep_call_t machdep_call_table[]; -extern machdep_call_t machdep_call_table64[]; +extern const machdep_call_t machdep_call_table[]; +extern const machdep_call_t machdep_call_table64[]; extern int machdep_call_count; diff --git a/osfmk/i386/machine_check.c b/osfmk/i386/machine_check.c index 77681d340..64b0c9b8d 100644 --- a/osfmk/i386/machine_check.c +++ b/osfmk/i386/machine_check.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. + * Copyright (c) 2007-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -36,6 +36,17 @@ #include #include +/* + * At the time of the machine-check exception, all hardware-threads panic. + * Each thread saves the state of its MCA registers to its per-cpu data area. + * + * State reporting is serialized so one thread dumps all valid state for all + * threads to the panic log. This may entail spinning waiting for other + * threads to complete saving state to memory. A timeout applies to this wait + * -- in particular, a 3-strikes timeout may prevent a thread from taking + * part is the affair. + */ + #define IF(bool,str) ((bool) ? (str) : "") static boolean_t mca_initialized = FALSE; @@ -60,6 +71,8 @@ typedef struct { } mca_mci_bank_t; typedef struct mca_state { + boolean_t mca_is_saved; + boolean_t mca_is_valid; /* some state is valid */ ia32_mcg_ctl_t mca_mcg_ctl; ia32_mcg_status_t mca_mcg_status; mca_mci_bank_t mca_error_bank[0]; @@ -206,6 +219,7 @@ mca_save_state(mca_state_t *mca_state) rdmsr64(IA32_MCi_MISC(i)) : 0ULL; bank->mca_mci_addr = (bank->mca_mci_status.bits.addrv)? 
rdmsr64(IA32_MCi_ADDR(i)) : 0ULL; + mca_state->mca_is_valid = TRUE; } /* @@ -213,7 +227,9 @@ mca_save_state(mca_state_t *mca_state) * and don't care about races */ if (x86_package()->mca_state == NULL) - x86_package()->mca_state = mca_state; + x86_package()->mca_state = mca_state; + + mca_state->mca_is_saved = TRUE; } void @@ -358,9 +374,9 @@ mca_dump_bank_mc8(mca_state_t *state, int i) static const char *mca_threshold_status[] = { [THRESHOLD_STATUS_NO_TRACKING] = "No tracking", - [THRESHOLD_STATUS_GREEN] = "Green", - [THRESHOLD_STATUS_YELLOW] = "Yellow", - [THRESHOLD_STATUS_RESERVED] = "Reserved" + [THRESHOLD_STATUS_GREEN] = "Green", + [THRESHOLD_STATUS_YELLOW] = "Yellow", + [THRESHOLD_STATUS_RESERVED] = "Reserved" }; static void @@ -423,41 +439,24 @@ mca_dump_bank(mca_state_t *state, int i) } static void -mca_dump_error_banks(mca_state_t *state) +mca_cpu_dump_error_banks(mca_state_t *state) { unsigned int i; + if (!state->mca_is_valid) + return; + kdb_printf("MCA error-reporting registers:\n"); for (i = 0; i < mca_error_bank_count; i++ ) { - if (i == 8) { + if (i == 8 && state == x86_package()->mca_state) { /* * Fatal Memory Error */ - /* Dump MC8 for local package */ + /* Dump MC8 for this package */ kdb_printf(" Package %d logged:\n", x86_package()->ppkg_num); mca_dump_bank_mc8(state, 8); - - /* If there's other packages, report their MC8s */ - x86_pkg_t *pkg; - uint64_t deadline; - for (pkg = x86_pkgs; pkg != NULL; pkg = pkg->next) { - if (pkg == x86_package()) - continue; - deadline = mach_absolute_time() + LockTimeOut; - while (pkg->mca_state == NULL && - mach_absolute_time() < deadline) - cpu_pause(); - if (pkg->mca_state) { - kdb_printf(" Package %d logged:\n", - pkg->ppkg_num); - mca_dump_bank_mc8(pkg->mca_state, 8); - } else { - kdb_printf(" Package %d timed out!\n", - pkg->ppkg_num); - } - } continue; } mca_dump_bank(state, i); @@ -467,8 +466,9 @@ mca_dump_error_banks(mca_state_t *state) void mca_dump(void) { - ia32_mcg_status_t status; - mca_state_t *mca_state = current_cpu_datap()->cpu_mca_state; + mca_state_t *mca_state = current_cpu_datap()->cpu_mca_state; + uint64_t deadline; + unsigned int i = 0; /* * Capture local MCA registers to per-cpu data. @@ -476,8 +476,7 @@ mca_dump(void) mca_save_state(mca_state); /* - * Serialize in case of multiple simultaneous machine-checks. - * Only the first caller is allowed to dump MCA registers, + * Serialize: the first caller controls dumping MCA registers, * other threads spin meantime. */ simple_lock(&mca_lock); @@ -490,12 +489,24 @@ mca_dump(void) mca_dump_state = DUMPING; simple_unlock(&mca_lock); + /* + * Wait for all other hardware threads to save their state. + * Or timeout. 
+ */ + deadline = mach_absolute_time() + LockTimeOut; + while (mach_absolute_time() < deadline && i < real_ncpus) { + if (!cpu_datap(i)->cpu_mca_state->mca_is_saved) { + cpu_pause(); + continue; + } + i += 1; + } + /* * Report machine-check capabilities: */ kdb_printf( - "Machine-check capabilities (cpu %d) 0x%016qx:\n", - cpu_number(), ia32_mcg_cap.u64); + "Machine-check capabilities 0x%016qx:\n", ia32_mcg_cap.u64); mca_report_cpu_info(); @@ -512,19 +523,32 @@ mca_dump(void) " %d extended MSRs present\n", mca_extended_MSRs_count); /* - * Report machine-check status: + * Dump all processor state: */ - status.u64 = rdmsr64(IA32_MCG_STATUS); - kdb_printf( - "Machine-check status 0x%016qx:\n%s%s%s", status.u64, - IF(status.bits.ripv, " restart IP valid\n"), - IF(status.bits.eipv, " error IP valid\n"), - IF(status.bits.mcip, " machine-check in progress\n")); + for (i = 0; i < real_ncpus; i++) { + mca_state_t *mcsp = cpu_datap(i)->cpu_mca_state; + ia32_mcg_status_t status; + + kdb_printf("Processor %d: ", i); + if (mcsp == NULL || + mcsp->mca_is_saved == FALSE || + mcsp->mca_mcg_status.u64 == 0) { + kdb_printf("no machine-check status reported\n"); + continue; + } + if (!mcsp->mca_is_valid) { + kdb_printf("no valid machine-check state\n"); + continue; + } + status = mcsp->mca_mcg_status; + kdb_printf( + "machine-check status 0x%016qx:\n%s%s%s", status.u64, + IF(status.bits.ripv, " restart IP valid\n"), + IF(status.bits.eipv, " error IP valid\n"), + IF(status.bits.mcip, " machine-check in progress\n")); - /* - * Dump error-reporting registers: - */ - mca_dump_error_banks(mca_state); + mca_cpu_dump_error_banks(mcsp); + } /* * Dump any extended machine state: @@ -539,3 +563,15 @@ mca_dump(void) /* Update state to release any other threads. */ mca_dump_state = DUMPED; } + + +extern void mca_exception_panic(void); +extern void mtrr_lapic_cached(void); +void mca_exception_panic(void) +{ +#if DEBUG + mtrr_lapic_cached(); +#else + kprintf("mca_exception_panic() requires DEBUG build\n"); +#endif +} diff --git a/osfmk/i386/machine_routines.c b/osfmk/i386/machine_routines.c index b7d3f559a..22eae0159 100644 --- a/osfmk/i386/machine_routines.c +++ b/osfmk/i386/machine_routines.c @@ -47,17 +47,8 @@ #include #include #include +#include #include -#if MACH_KDB -#include -#include -#include -#include -#include -#include -#include -#include -#endif #if DEBUG #define DBG(x...) 
kprintf("DBG: " x) @@ -73,6 +64,7 @@ unsigned int LockTimeOut; unsigned int LockTimeOutTSC; unsigned int MutexSpin; uint64_t LastDebuggerEntryAllowance; +uint64_t delay_spin_threshold; extern uint64_t panic_restart_timeout; @@ -129,13 +121,13 @@ ml_static_mfree( { addr64_t vaddr_cur; ppnum_t ppn; - + uint32_t freed_pages = 0; assert(vaddr >= VM_MIN_KERNEL_ADDRESS); assert((vaddr & (PAGE_SIZE-1)) == 0); /* must be page aligned */ for (vaddr_cur = vaddr; - vaddr_cur < round_page_64(vaddr+size); + vaddr_cur < round_page_64(vaddr+size); vaddr_cur += PAGE_SIZE) { ppn = pmap_find_phys(kernel_pmap, vaddr_cur); if (ppn != (vm_offset_t)NULL) { @@ -146,10 +138,18 @@ ml_static_mfree( kernel_pmap->stats.resident_count; } pmap_remove(kernel_pmap, vaddr_cur, vaddr_cur+PAGE_SIZE); - vm_page_create(ppn,(ppn+1)); - vm_page_wire_count--; + assert(pmap_valid_page(ppn)); + + if (IS_MANAGED_PAGE(ppn)) { + vm_page_create(ppn,(ppn+1)); + vm_page_wire_count--; + freed_pages++; + } } } +#if DEBUG + kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn); +#endif } @@ -553,6 +553,22 @@ ml_init_lock_timeout(void) interrupt_latency_tracker_setup(); } +/* + * Threshold above which we should attempt to block + * instead of spinning for clock_delay_until(). + */ +void +ml_init_delay_spin_threshold(void) +{ + nanoseconds_to_absolutetime(10ULL * NSEC_PER_USEC, &delay_spin_threshold); +} + +boolean_t +ml_delay_should_spin(uint64_t interval) +{ + return (interval < delay_spin_threshold) ? TRUE : FALSE; +} + /* * This is called from the machine-independent routine cpu_up() * to perform machine-dependent info updates. Defer to cpu_thread_init(). @@ -683,45 +699,3 @@ kernel_preempt_check(void) boolean_t machine_timeout_suspended(void) { return (virtualized || pmap_tlb_flush_timeout || spinlock_timed_out || panic_active() || mp_recent_debugger_activity()); } - -#if MACH_KDB - -/* - * Display the global msrs - * * - * ms - */ -void -db_msr(__unused db_expr_t addr, - __unused int have_addr, - __unused db_expr_t count, - __unused char *modif) -{ - - uint32_t i, msrlow, msrhigh; - - /* Try all of the first 4096 msrs */ - for (i = 0; i < 4096; i++) { - if (!rdmsr_carefully(i, &msrlow, &msrhigh)) { - db_printf("%08X - %08X.%08X\n", i, msrhigh, msrlow); - } - } - - /* Try all of the 4096 msrs at 0x0C000000 */ - for (i = 0; i < 4096; i++) { - if (!rdmsr_carefully(0x0C000000 | i, &msrlow, &msrhigh)) { - db_printf("%08X - %08X.%08X\n", - 0x0C000000 | i, msrhigh, msrlow); - } - } - - /* Try all of the 4096 msrs at 0xC0000000 */ - for (i = 0; i < 4096; i++) { - if (!rdmsr_carefully(0xC0000000 | i, &msrlow, &msrhigh)) { - db_printf("%08X - %08X.%08X\n", - 0xC0000000 | i, msrhigh, msrlow); - } - } -} - -#endif diff --git a/osfmk/i386/machine_routines.h b/osfmk/i386/machine_routines.h index 42f77f6c4..65e28b742 100644 --- a/osfmk/i386/machine_routines.h +++ b/osfmk/i386/machine_routines.h @@ -80,6 +80,9 @@ void ml_install_interrupt_handler( void ml_get_timebase(unsigned long long *timestamp); void ml_init_lock_timeout(void); +void ml_init_delay_spin_threshold(void); + +boolean_t ml_delay_should_spin(uint64_t interval); vm_offset_t ml_static_ptovirt( @@ -303,7 +306,9 @@ void ml_get_csw_threads(thread_t * /*old*/, thread_t * /*new*/); __END_DECLS #ifdef XNU_KERNEL_PRIVATE + boolean_t ml_fpu_avx_enabled(void); + void interrupt_latency_tracker_setup(void); void interrupt_reset_latency_stats(void); void interrupt_populate_latency_stats(char *, unsigned); diff --git 
a/osfmk/i386/machine_routines_asm.s b/osfmk/i386/machine_routines_asm.s index 0e3d9fb68..c4176cdf5 100644 --- a/osfmk/i386/machine_routines_asm.s +++ b/osfmk/i386/machine_routines_asm.s @@ -309,3 +309,6 @@ Entry(call_continuation) call EXT(thread_terminate) +Entry(ml_early_random) + xor %eax, %eax + ret diff --git a/osfmk/i386/machine_task.c b/osfmk/i386/machine_task.c index c05d69bef..3c2d93ffd 100644 --- a/osfmk/i386/machine_task.c +++ b/osfmk/i386/machine_task.c @@ -229,6 +229,31 @@ machine_task_get_state(task_t task, } } +/* + * This is called when a task is terminated, and also on exec(). + * Clear machine-dependent state that is stored on the task. + */ +void +machine_task_terminate(task_t task) +{ + if (task) { + user_ldt_t user_ldt; + void *task_debug; + + user_ldt = task->i386_ldt; + if (user_ldt != 0) { + task->i386_ldt = 0; + user_ldt_free(user_ldt); + } + + task_debug = task->task_debug; + if (task_debug != NULL) { + task->task_debug = NULL; + zfree(ids_zone, task_debug); + } + } +} + /* * Set initial default state on a thread as stored in the MACHINE_TASK data. * Note: currently only debug state is supported. diff --git a/osfmk/i386/misc_protos.h b/osfmk/i386/misc_protos.h index 474733e96..4186c623b 100644 --- a/osfmk/i386/misc_protos.h +++ b/osfmk/i386/misc_protos.h @@ -37,9 +37,11 @@ struct boot_args; struct cpu_data; +extern boolean_t virtualized; + extern void vstart(vm_offset_t); extern void i386_init(void); - +extern void x86_init_wrapper(uintptr_t, uintptr_t) __attribute__((noreturn)); extern void i386_vm_init( uint64_t, boolean_t, @@ -171,4 +173,5 @@ extern void act_machine_switch_pcb(thread_t old, thread_t new); #define FULL_SLAVE_INIT (NULL) #define FAST_SLAVE_INIT ((void *)(uintptr_t)1) +uint64_t ml_early_random(void); #endif /* _I386_MISC_PROTOS_H_ */ diff --git a/osfmk/i386/mp.c b/osfmk/i386/mp.c index f4221f964..b66399d2d 100644 --- a/osfmk/i386/mp.c +++ b/osfmk/i386/mp.c @@ -29,7 +29,6 @@ */ #include -#include #include #include #include @@ -79,16 +78,6 @@ #include #include -#if MACH_KDB -#include -#include -#include -#include -#include -#include -#include -#include -#endif #if MP_DEBUG #define PAUSE delay(1000000) @@ -110,12 +99,6 @@ void slave_boot_init(void); void i386_cpu_IPI(int cpu); -#if MACH_KDB -static void mp_kdb_wait(void); -volatile boolean_t mp_kdb_trap = FALSE; -volatile long mp_kdb_ncpus = 0; -#endif - static void mp_kdp_wait(boolean_t flush, boolean_t isNMI); static void mp_rendezvous_action(void); static void mp_broadcast_action(void); @@ -246,8 +229,12 @@ smp_init(void) mp_cpus_call_cpu_init(); if (PE_parse_boot_argn("TSC_sync_margin", - &TSC_sync_margin, sizeof(TSC_sync_margin))) + &TSC_sync_margin, sizeof(TSC_sync_margin))) { kprintf("TSC sync Margin 0x%x\n", TSC_sync_margin); + } else if (cpuid_vmm_present()) { + kprintf("TSC sync margin disabled\n"); + TSC_sync_margin = 0; + } smp_initialized = TRUE; cpu_prewarm_init(); @@ -394,12 +381,6 @@ start_cpu(void *arg) } } -extern char prot_mode_gdt[]; -extern char slave_boot_base[]; -extern char real_mode_bootstrap_base[]; -extern char real_mode_bootstrap_end[]; -extern char slave_boot_end[]; - kern_return_t intel_startCPU( int slot_num) @@ -476,9 +457,6 @@ cpu_signal_handler(x86_saved_state_t *regs) { int my_cpu; volatile int *my_word; -#if MACH_KDB && MACH_ASSERT - int i=100; -#endif /* MACH_KDB && MACH_ASSERT */ SCHED_STATS_IPI(current_processor()); @@ -492,10 +470,6 @@ cpu_signal_handler(x86_saved_state_t *regs) cpu_data_ptr[my_cpu]->cpu_prior_signals = *my_word; do { -#if MACH_KDB && 
MACH_ASSERT - if (i-- <= 0) - Debugger("cpu_signal_handler: signals did not clear"); -#endif /* MACH_KDB && MACH_ASSERT */ #if MACH_KDP if (i_bit(MP_KDP, my_word)) { DBGLOG(cpu_handle,my_cpu,MP_KDP); @@ -521,14 +495,6 @@ cpu_signal_handler(x86_saved_state_t *regs) DBGLOG(cpu_handle,my_cpu,MP_AST); i_bit_clear(MP_AST, my_word); ast_check(cpu_to_processor(my_cpu)); -#if MACH_KDB - } else if (i_bit(MP_KDB, my_word)) { - - i_bit_clear(MP_KDB, my_word); - current_cpu_datap()->cpu_kdb_is_slave++; - mp_kdb_wait(); - current_cpu_datap()->cpu_kdb_is_slave--; -#endif /* MACH_KDB */ } else if (i_bit(MP_RENDEZVOUS, my_word)) { DBGLOG(cpu_handle,my_cpu,MP_RENDEZVOUS); i_bit_clear(MP_RENDEZVOUS, my_word); @@ -894,79 +860,95 @@ typedef struct { void *arg1; /* routine's 2nd arg */ volatile long *countp; /* completion counter */ } mp_call_t; - + + +typedef struct { + queue_head_t queue; + decl_simple_lock_data(, lock); +} mp_call_queue_t; #define MP_CPUS_CALL_BUFS_PER_CPU MAX_CPUS -static queue_head_t mp_cpus_call_freelist; -static queue_head_t mp_cpus_call_queue[MAX_CPUS]; -/* - * The free list and the per-cpu call queues are protected by the following - * lock which is taken wil interrupts disabled. - */ -decl_simple_lock_data(,mp_cpus_call_lock); +static mp_call_queue_t mp_cpus_call_freelist; +static mp_call_queue_t mp_cpus_call_head[MAX_CPUS]; static inline boolean_t -mp_call_lock(void) +mp_call_head_lock(mp_call_queue_t *cqp) { boolean_t intrs_enabled; intrs_enabled = ml_set_interrupts_enabled(FALSE); - simple_lock(&mp_cpus_call_lock); + simple_lock(&cqp->lock); return intrs_enabled; } static inline boolean_t -mp_call_is_locked(void) +mp_call_head_is_locked(mp_call_queue_t *cqp) { return !ml_get_interrupts_enabled() && - hw_lock_held((hw_lock_t)&mp_cpus_call_lock); + hw_lock_held((hw_lock_t)&cqp->lock); } static inline void -mp_call_unlock(boolean_t intrs_enabled) +mp_call_head_unlock(mp_call_queue_t *cqp, boolean_t intrs_enabled) { - simple_unlock(&mp_cpus_call_lock); + simple_unlock(&cqp->lock); ml_set_interrupts_enabled(intrs_enabled); } static inline mp_call_t * mp_call_alloc(void) { - mp_call_t *callp; + mp_call_t *callp = NULL; + boolean_t intrs_enabled; + mp_call_queue_t *cqp = &mp_cpus_call_freelist; + + intrs_enabled = mp_call_head_lock(cqp); + if (!queue_empty(&cqp->queue)) + queue_remove_first(&cqp->queue, callp, typeof(callp), link); + mp_call_head_unlock(cqp, intrs_enabled); - assert(mp_call_is_locked()); - if (queue_empty(&mp_cpus_call_freelist)) - return NULL; - queue_remove_first(&mp_cpus_call_freelist, callp, typeof(callp), link); return callp; } static inline void mp_call_free(mp_call_t *callp) { - assert(mp_call_is_locked()); - queue_enter_first(&mp_cpus_call_freelist, callp, typeof(callp), link); + boolean_t intrs_enabled; + mp_call_queue_t *cqp = &mp_cpus_call_freelist; + + intrs_enabled = mp_call_head_lock(cqp); + queue_enter_first(&cqp->queue, callp, typeof(callp), link); + mp_call_head_unlock(cqp, intrs_enabled); } static inline mp_call_t * -mp_call_dequeue(queue_t call_queue) +mp_call_dequeue_locked(mp_call_queue_t *cqp) { - mp_call_t *callp; + mp_call_t *callp = NULL; - assert(mp_call_is_locked()); - if (queue_empty(call_queue)) - return NULL; - queue_remove_first(call_queue, callp, typeof(callp), link); + assert(mp_call_head_is_locked(cqp)); + if (!queue_empty(&cqp->queue)) + queue_remove_first(&cqp->queue, callp, typeof(callp), link); return callp; } +static inline void +mp_call_enqueue_locked( + mp_call_queue_t *cqp, + mp_call_t *callp) +{ + queue_enter(&cqp->queue, callp, 
typeof(callp), link); +} + /* Called on the boot processor to initialize global structures */ static void mp_cpus_call_init(void) { + mp_call_queue_t *cqp = &mp_cpus_call_freelist; + DBG("mp_cpus_call_init()\n"); - simple_lock_init(&mp_cpus_call_lock, 0); - queue_init(&mp_cpus_call_freelist); + simple_lock_init(&cqp->lock, 0); + queue_init(&cqp->queue); } /* @@ -977,19 +959,18 @@ mp_cpus_call_init(void) static void mp_cpus_call_cpu_init(void) { - boolean_t intrs_enabled; int i; + mp_call_queue_t *cqp = &mp_cpus_call_head[cpu_number()]; mp_call_t *callp; - if (mp_cpus_call_queue[cpu_number()].next != NULL) + if (cqp->queue.next != NULL) return; /* restart/wake case: called already */ - queue_init(&mp_cpus_call_queue[cpu_number()]); + simple_lock_init(&cqp->lock, 0); + queue_init(&cqp->queue); for (i = 0; i < MP_CPUS_CALL_BUFS_PER_CPU; i++) { callp = (mp_call_t *) kalloc(sizeof(mp_call_t)); - intrs_enabled = mp_call_lock(); mp_call_free(callp); - mp_call_unlock(intrs_enabled); } DBG("mp_cpus_call_init() done on cpu %d\n", cpu_number()); @@ -1002,56 +983,30 @@ mp_cpus_call_cpu_init(void) static void mp_cpus_call_action(void) { - queue_t cpu_head; + mp_call_queue_t *cqp; boolean_t intrs_enabled; mp_call_t *callp; mp_call_t call; assert(!ml_get_interrupts_enabled()); - cpu_head = &mp_cpus_call_queue[cpu_number()]; - intrs_enabled = mp_call_lock(); - while ((callp = mp_call_dequeue(cpu_head)) != NULL) { + cqp = &mp_cpus_call_head[cpu_number()]; + intrs_enabled = mp_call_head_lock(cqp); + while ((callp = mp_call_dequeue_locked(cqp)) != NULL) { /* Copy call request to the stack to free buffer */ call = *callp; mp_call_free(callp); if (call.func != NULL) { - mp_call_unlock(intrs_enabled); + mp_call_head_unlock(cqp, intrs_enabled); KERNEL_DEBUG_CONSTANT( TRACE_MP_CPUS_CALL_ACTION, call.func, call.arg0, call.arg1, call.countp, 0); call.func(call.arg0, call.arg1); - (void) mp_call_lock(); + (void) mp_call_head_lock(cqp); } if (call.countp != NULL) atomic_incl(call.countp, 1); } - mp_call_unlock(intrs_enabled); -} - -static boolean_t -mp_call_queue( - int cpu, - void (*action_func)(void *, void *), - void *arg0, - void *arg1, - volatile long *countp) -{ - queue_t cpu_head = &mp_cpus_call_queue[cpu]; - mp_call_t *callp; - - assert(mp_call_is_locked()); - callp = mp_call_alloc(); - if (callp == NULL) - return FALSE; - - callp->func = action_func; - callp->arg0 = arg0; - callp->arg1 = arg1; - callp->countp = countp; - - queue_enter(cpu_head, callp, typeof(callp), link); - - return TRUE; + mp_call_head_unlock(cqp, intrs_enabled); } /* @@ -1085,19 +1040,19 @@ mp_cpus_call( } static void -mp_cpus_call_wait(boolean_t intrs_enabled, - long mp_cpus_signals, - volatile long *mp_cpus_calls) +mp_cpus_call_wait(boolean_t intrs_enabled, + long mp_cpus_signals, + volatile long *mp_cpus_calls) { - queue_t cpu_head; + mp_call_queue_t *cqp; - cpu_head = &mp_cpus_call_queue[cpu_number()]; + cqp = &mp_cpus_call_head[cpu_number()]; while (*mp_cpus_calls < mp_cpus_signals) { if (!intrs_enabled) { - if (!queue_empty(cpu_head)) + /* Sniffing w/o locking */ + if (!queue_empty(&cqp->queue)) mp_cpus_call_action(); - handle_pending_TLB_flushes(); } cpu_pause(); @@ -1124,7 +1079,7 @@ mp_cpus_call1( KERNEL_DEBUG_CONSTANT( TRACE_MP_CPUS_CALL | DBG_FUNC_START, - cpus, mode, action_func, arg0, arg1); + cpus, mode, VM_KERNEL_UNSLIDE(action_func), arg0, arg1); if (!smp_initialized) { if ((cpus & CPUMASK_SELF) == 0) @@ -1158,7 +1113,8 @@ mp_cpus_call1( if (mode == SYNC && action_func != NULL) { KERNEL_DEBUG_CONSTANT( 
TRACE_MP_CPUS_CALL_LOCAL, - action_func, arg0, arg1, 0, 0); + VM_KERNEL_UNSLIDE(action_func), + arg0, arg1, 0, 0); action_func(arg0, arg1); } } else { @@ -1166,41 +1122,52 @@ mp_cpus_call1( * Here to queue a call to cpu and IPI. * Spinning for request buffer unless NOSYNC. */ + mp_call_t *callp = NULL; + mp_call_queue_t *cqp = &mp_cpus_call_head[cpu]; + queue_call: - intrs_enabled = mp_call_lock(); + if (callp == NULL) + callp = mp_call_alloc(); + intrs_enabled = mp_call_head_lock(cqp); if (!cpu_datap(cpu)->cpu_running) { - mp_call_unlock(intrs_enabled); + mp_call_head_unlock(cqp, intrs_enabled); continue; } if (mode == NOSYNC) { - if (!mp_call_queue(cpu, action_func, arg0, arg1, - NULL)) { + if (callp == NULL) { cpus_notcalled |= cpu_to_cpumask(cpu); - mp_call_unlock(intrs_enabled); + mp_call_head_unlock(cqp, intrs_enabled); KERNEL_DEBUG_CONSTANT( TRACE_MP_CPUS_CALL_NOBUF, cpu, 0, 0, 0, 0); continue; } + callp->countp = NULL; } else { - if (!mp_call_queue(cpu, action_func, arg0, arg1, - &mp_cpus_calls)) { - mp_call_unlock(intrs_enabled); + if (callp == NULL) { + mp_call_head_unlock(cqp, intrs_enabled); KERNEL_DEBUG_CONSTANT( TRACE_MP_CPUS_CALL_NOBUF, cpu, 0, 0, 0, 0); if (!intrs_enabled) { - mp_cpus_call_action(); + /* Sniffing w/o locking */ + if (!queue_empty(&cqp->queue)) + mp_cpus_call_action(); handle_pending_TLB_flushes(); } cpu_pause(); goto queue_call; } + callp->countp = &mp_cpus_calls; } + callp->func = action_func; + callp->arg0 = arg0; + callp->arg1 = arg1; + mp_call_enqueue_locked(cqp, callp); mp_cpus_signals++; cpus_called |= cpu_to_cpumask(cpu); i386_signal_cpu(cpu, MP_CALL, ASYNC); - mp_call_unlock(intrs_enabled); + mp_call_head_unlock(cqp, intrs_enabled); if (mode == SYNC) { mp_cpus_call_wait(intrs_enabled, mp_cpus_signals, &mp_cpus_calls); } @@ -1211,7 +1178,7 @@ mp_cpus_call1( if (mode != SYNC && call_self ) { KERNEL_DEBUG_CONSTANT( TRACE_MP_CPUS_CALL_LOCAL, - action_func, arg0, arg1, 0, 0); + VM_KERNEL_UNSLIDE(action_func), arg0, arg1, 0, 0); if (action_func != NULL) { ml_set_interrupts_enabled(FALSE); action_func(arg0, arg1); @@ -1423,7 +1390,10 @@ mp_kdp_enter(void) * "unsafe-to-interrupt" points such as the trampolines, * but neither do we want to lose state by waiting too long. */ - tsc_timeout = rdtsc64() + (ncpus * 1000 * 1000); + tsc_timeout = rdtsc64() + (ncpus * 1000 * 1000 * 10ULL); + + if (virtualized) + tsc_timeout = ~0ULL; while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) { /* @@ -1454,7 +1424,7 @@ mp_kdp_enter(void) cpu_NMI_interrupt(cpu); } - DBG("mp_kdp_enter() %lu processors done %s\n", + DBG("mp_kdp_enter() %u processors done %s\n", (int)mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? "OK" : "timed out"); postcode(MP_KDP_ENTER); @@ -1596,104 +1566,6 @@ cause_ast_check( } } -#if MACH_KDB -/* - * invoke kdb on slave processors - */ - -void -remote_kdb(void) -{ - unsigned int my_cpu = cpu_number(); - unsigned int cpu; - int kdb_ncpus; - uint64_t tsc_timeout = 0; - - mp_kdb_trap = TRUE; - mp_kdb_ncpus = 1; - for (kdb_ncpus = 1, cpu = 0; cpu < real_ncpus; cpu++) { - if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running) - continue; - kdb_ncpus++; - i386_signal_cpu(cpu, MP_KDB, ASYNC); - } - DBG("remote_kdb() waiting for (%d) processors to suspend\n",kdb_ncpus); - - tsc_timeout = rdtsc64() + (kdb_ncpus * 100 * 1000 * 1000); - - while (mp_kdb_ncpus != kdb_ncpus && rdtsc64() < tsc_timeout) { - /* - * a TLB shootdown request may be pending... this would result in the requesting - * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it. 
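The mp_cpus_call1() rework above replaces the single global call lock with a per-cpu mp_call_queue_t whose simple_lock is taken with interrupts disabled, and it now allocates the call buffer before taking the target queue's lock, looping back through queue_call only when no buffer is available. A minimal userspace analogue of that allocate-outside, enqueue-inside pattern (pthread mutexes standing in for simple_lock; the interrupt state and the IPI are elided; all names are illustrative, not the kernel's):

    #include <pthread.h>
    #include <stdlib.h>

    struct call {
            struct call *next;
            void (*func)(void *, void *);
            void *arg0, *arg1;
    };

    struct call_queue {
            pthread_mutex_t lock;
            struct call *head;
    };

    /* Allocate outside the lock, link in under it; the kernel would
     * follow up with i386_signal_cpu(cpu, MP_CALL, ASYNC). */
    static int
    enqueue_call(struct call_queue *cq, void (*func)(void *, void *),
                 void *arg0, void *arg1)
    {
            struct call *c = malloc(sizeof(*c));   /* mp_call_alloc() analogue */
            if (c == NULL)
                    return 0;               /* caller records "not called" or retries */
            c->func = func;
            c->arg0 = arg0;
            c->arg1 = arg1;
            pthread_mutex_lock(&cq->lock);         /* mp_call_head_lock() analogue */
            c->next = cq->head;
            cq->head = c;
            pthread_mutex_unlock(&cq->lock);       /* mp_call_head_unlock() analogue */
            return 1;
    }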
- * Process it, so it can now enter mp_kdp_wait() - */ - handle_pending_TLB_flushes(); - - cpu_pause(); - } - DBG("mp_kdp_enter() %lu processors done %s\n", - mp_kdb_ncpus, (mp_kdb_ncpus == kdb_ncpus) ? "OK" : "timed out"); -} - -static void -mp_kdb_wait(void) -{ - DBG("mp_kdb_wait()\n"); - - /* If an I/O port has been specified as a debugging aid, issue a read */ - panic_io_port_read(); - - atomic_incl(&mp_kdb_ncpus, 1); - while (mp_kdb_trap) { - /* - * a TLB shootdown request may be pending... this would result in the requesting - * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it. - * Process it, so it can now enter mp_kdp_wait() - */ - handle_pending_TLB_flushes(); - - cpu_pause(); - } - atomic_decl((volatile long *)&mp_kdb_ncpus, 1); - DBG("mp_kdb_wait() done\n"); -} - -/* - * Clear kdb interrupt - */ - -void -clear_kdb_intr(void) -{ - mp_disable_preemption(); - i_bit_clear(MP_KDB, ¤t_cpu_datap()->cpu_signals); - mp_enable_preemption(); -} - -void -mp_kdb_exit(void) -{ - DBG("mp_kdb_exit()\n"); - atomic_decl((volatile long *)&mp_kdb_ncpus, 1); - mp_kdb_trap = FALSE; - __asm__ volatile("mfence"); - - while (mp_kdb_ncpus > 0) { - /* - * a TLB shootdown request may be pending... this would result in the requesting - * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it. - * Process it, so it can now enter mp_kdp_wait() - */ - handle_pending_TLB_flushes(); - - cpu_pause(); - } - - DBG("mp_kdb_exit() done\n"); -} - -#endif /* MACH_KDB */ - void slave_machine_init(void *param) { @@ -1718,54 +1590,6 @@ int cpu_number(void) return get_cpu_number(); } -#if MACH_KDB -#include - -#define TRAP_DEBUG 0 /* Must match interrupt.s and spl.s */ - - -#if TRAP_DEBUG -#define MTRAPS 100 -struct mp_trap_hist_struct { - unsigned char type; - unsigned char data[5]; -} trap_hist[MTRAPS], *cur_trap_hist = trap_hist, - *max_trap_hist = &trap_hist[MTRAPS]; - -void db_trap_hist(void); - -/* - * SPL: - * 1: new spl - * 2: old spl - * 3: new tpr - * 4: old tpr - * INT: - * 1: int vec - * 2: old spl - * 3: new spl - * 4: post eoi tpr - * 5: exit tpr - */ - -void -db_trap_hist(void) -{ - int i,j; - for(i=0;i=cur_trap_hist)?"*":" ", - (trap_hist[i].type == 1)?"SPL":"INT"); - for(j=0;j<5;j++) - db_printf(" %02x", trap_hist[i].data[j]); - db_printf("\n"); - } - -} -#endif /* TRAP_DEBUG */ -#endif /* MACH_KDB */ - static void cpu_prewarm_init() { diff --git a/osfmk/i386/mp.h b/osfmk/i386/mp.h index 6974ef256..faa84df7d 100644 --- a/osfmk/i386/mp.h +++ b/osfmk/i386/mp.h @@ -114,9 +114,6 @@ extern void mp_kdp_enter(void); extern void mp_kdp_exit(void); extern boolean_t mp_recent_debugger_activity(void); -#if MACH_KDB -extern void mp_kdb_exit(void); -#endif /* * All cpu rendezvous: diff --git a/osfmk/i386/mp_desc.c b/osfmk/i386/mp_desc.c index 2421dc734..fd4003f20 100644 --- a/osfmk/i386/mp_desc.c +++ b/osfmk/i386/mp_desc.c @@ -73,14 +73,15 @@ #include #include #include +#if defined(__i386__) +#include +#endif /* i386 */ #if CONFIG_MCA #include #endif #include -#include - #ifdef __x86_64__ #define K_INTR_GATE (ACC_P|ACC_PL_K|ACC_INTR_GATE) #define U_INTR_GATE (ACC_P|ACC_PL_U|ACC_INTR_GATE) @@ -146,10 +147,11 @@ }, #define USER_TRAP_SPC USER_TRAP - // Declare the table using the macros we just set up -struct fake_descriptor64 master_idt64[IDTSZ] __attribute__ ((aligned (4096))) = { +struct fake_descriptor64 master_idt64[IDTSZ] + __attribute__ ((section("__HIB,__desc"))) + __attribute__ ((aligned(PAGE_SIZE))) = { #include "../x86_64/idt_table.h" }; #endif @@ -163,7 +165,7 @@ 
struct fake_descriptor64 master_idt64[IDTSZ] __attribute__ ((aligned (4096))) = /* * First cpu`s interrupt stack. */ -extern uint32_t low_intstack[]; /* bottom */ +extern uint32_t low_intstack[]; /* bottom */ extern uint32_t low_eintstack[]; /* top */ /* @@ -243,12 +245,14 @@ struct fake_descriptor cpudata_desc_pattern = { ACC_P|ACC_PL_K|ACC_DATA_W }; +#if NCOPY_WINDOWS > 0 struct fake_descriptor userwindow_desc_pattern = { (unsigned int) 0, ((NBPDE * NCOPY_WINDOWS) / PAGE_SIZE) - 1, SZ_32 | SZ_G, ACC_P|ACC_PL_U|ACC_DATA_W }; +#endif struct fake_descriptor physwindow_desc_pattern = { (unsigned int) 0, @@ -433,14 +437,6 @@ cpu_desc_init(cpu_data_t *cdp) *(struct fake_descriptor *) &master_gdt[sel_idx(KERNEL_TSS)] = temp_fake_desc; -#if MACH_KDB - temp_fake_desc = tss_desc_pattern; - temp_fake_desc.offset = (vm_offset_t) &master_dbtss; - fix_desc(&temp_fake_desc, 1); - *(struct fake_descriptor *) &master_gdt[sel_idx(DEBUG_TSS)] = - temp_fake_desc; -#endif - temp_fake_desc = cpudata_desc_pattern; temp_fake_desc.offset = (vm_offset_t) &cpu_data_master; fix_desc(&temp_fake_desc, 1); @@ -508,13 +504,6 @@ cpu_desc_init(cpu_data_t *cdp) bcopy((char *)master_gdt, (char *)cdt->gdt, sizeof(master_gdt)); bcopy((char *)master_ldt, (char *)cdp->cpu_ldtp, sizeof(master_ldt)); bzero((char *)&cdt->ktss, sizeof(struct i386_tss)); -#if MACH_KDB - cdi->cdi_dbtss = (struct i386_tss *) (cpu_hi_desc + - offsetof(cpu_desc_table_t, dbtss)); - bcopy((char *)&master_dbtss, - (char *)&cdt->dbtss, - sizeof(struct i386_tss)); -#endif /* MACH_KDB */ /* * Fix up the entries in the GDT to point to @@ -535,17 +524,6 @@ cpu_desc_init(cpu_data_t *cdp) cdt->gdt[sel_idx(CPU_DATA_GS)].offset = (vm_offset_t) cdp; fix_desc(&cdt->gdt[sel_idx(CPU_DATA_GS)], 1); -#if MACH_KDB /* this only works for legacy 32-bit machines */ - cdt->gdt[sel_idx(DEBUG_TSS)] = tss_desc_pattern; - cdt->gdt[sel_idx(DEBUG_TSS)].offset = (vm_offset_t) cdi->cdi_dbtss; - fix_desc(&cdt->gdt[sel_idx(DEBUG_TSS)], 1); - - cdt->dbtss.esp0 = (int)(db_task_stack_store + - (INTSTACK_SIZE * (cdp->cpu_number + 1)) - sizeof (natural_t)); - cdt->dbtss.esp = cdt->dbtss.esp0; - cdt->dbtss.eip = (int)&db_task_start; -#endif /* MACH_KDB */ - cdt->ktss.ss0 = KERNEL_DS; cdt->ktss.io_bit_map_offset = 0x0FFF; /* no IO bitmap */ @@ -568,11 +546,15 @@ cpu_desc_init64(cpu_data_t *cdp) */ cdi->cdi_ktss = (void *)&master_ktss64; cdi->cdi_sstk = (vm_offset_t) &master_sstk.top; +#if __x86_64__ + cdi->cdi_gdt.ptr = (void *)MASTER_GDT_ALIAS; + cdi->cdi_idt.ptr = (void *)MASTER_IDT_ALIAS; +#else cdi->cdi_gdt.ptr = (void *)master_gdt; cdi->cdi_idt.ptr = (void *)master_idt64; +#endif cdi->cdi_ldt = (struct fake_descriptor *) master_ldt; - /* Replace the expanded LDTs and TSS slots in the GDT */ kernel_ldt_desc64.offset64 = UBER64(&master_ldt); *(struct fake_descriptor64 *) &master_gdt[sel_idx(KERNEL_LDT)] = @@ -592,7 +574,11 @@ cpu_desc_init64(cpu_data_t *cdp) /* * Set the double-fault stack as IST1 in the 64-bit TSS */ +#if __x86_64__ + master_ktss64.ist1 = (uintptr_t) low_eintstack; +#else master_ktss64.ist1 = UBER64((uintptr_t) df_task_stack_end); +#endif } else { cpu_desc_table64_t *cdt = (cpu_desc_table64_t *) cdp->cpu_desc_tablep; @@ -601,8 +587,12 @@ cpu_desc_init64(cpu_data_t *cdp) * heap (cpu_desc_table). * LDT descriptors are mapped into a separate area. 
*/ - cdi->cdi_gdt.ptr = (struct fake_descriptor *)cdt->gdt; +#if __x86_64__ + cdi->cdi_idt.ptr = (void *)MASTER_IDT_ALIAS; +#else cdi->cdi_idt.ptr = (void *)cdt->idt; +#endif + cdi->cdi_gdt.ptr = (struct fake_descriptor *)cdt->gdt; cdi->cdi_ktss = (void *)&cdt->ktss; cdi->cdi_sstk = (vm_offset_t)&cdt->sstk.top; cdi->cdi_ldt = cdp->cpu_ldtp; @@ -610,7 +600,9 @@ cpu_desc_init64(cpu_data_t *cdp) /* * Copy the tables */ +#if !__x86_64__ bcopy((char *)master_idt64, (char *)cdt->idt, sizeof(master_idt64)); +#endif bcopy((char *)master_gdt, (char *)cdt->gdt, sizeof(master_gdt)); bcopy((char *)master_ldt, (char *)cdp->cpu_ldtp, sizeof(master_ldt)); bcopy((char *)&master_ktss64, (char *)&cdt->ktss, sizeof(struct x86_64_tss)); @@ -662,8 +654,8 @@ cpu_desc_load(cpu_data_t *cdp) cdi->cdi_idt.size = 0x1000 + cdp->cpu_number; cdi->cdi_gdt.size = sizeof(struct real_descriptor)*GDTSZ - 1; - lgdt((unsigned long *) &cdi->cdi_gdt); - lidt((unsigned long *) &cdi->cdi_idt); + lgdt((uintptr_t *) &cdi->cdi_gdt); + lidt((uintptr_t *) &cdi->cdi_idt); lldt(KERNEL_LDT); set_tr(KERNEL_TSS); @@ -703,19 +695,18 @@ cpu_desc_load64(cpu_data_t *cdp) /* Load the GDT, LDT, IDT and TSS */ cdi->cdi_gdt.size = sizeof(struct real_descriptor)*GDTSZ - 1; cdi->cdi_idt.size = 0x1000 + cdp->cpu_number; - lgdt((unsigned long *) &cdi->cdi_gdt); - lidt((unsigned long *) &cdi->cdi_idt); + lgdt((uintptr_t *) &cdi->cdi_gdt); + lidt((uintptr_t *) &cdi->cdi_idt); lldt(KERNEL_LDT); set_tr(KERNEL_TSS); - /* Stuff the pre-cpu data area into the MSR and swapgs to activate */ - wrmsr64(MSR_IA32_KERNEL_GS_BASE, (unsigned long)cdp); + /* Stuff the kernel per-cpu data area address into the MSRs */ + wrmsr64(MSR_IA32_GS_BASE, (uintptr_t) cdp); + wrmsr64(MSR_IA32_KERNEL_GS_BASE, (uintptr_t) cdp); + #if GPROF // Hack to enable mcount to work on K64 __asm__ volatile("mov %0, %%gs" : : "rm" ((unsigned short)(KERNEL_DS))); #endif - swapgs(); - - cpu_mode_init(cdp); #endif } @@ -784,7 +775,7 @@ cpu_data_alloc(boolean_t is_boot_cpu) if (is_boot_cpu) { assert(real_ncpus == 1); - cdp = &cpu_data_master; + cdp = cpu_datap(0); if (cdp->cpu_processor == NULL) { simple_lock_init(&ncpus_lock, 0); cdp->cpu_processor = cpu_processor_alloc(TRUE); @@ -826,6 +817,7 @@ cpu_data_alloc(boolean_t is_boot_cpu) * Allocate descriptor table: * Size depends on cpu mode. */ + ret = kmem_alloc(kernel_map, (vm_offset_t *) &cdp->cpu_desc_tablep, cdp->cpu_is64bit ? sizeof(cpu_desc_table64_t) @@ -1040,7 +1032,7 @@ cpu_physwindow_init(int cpu) * pte pointer we're interested in actually * exists in the page table */ - pmap_expand(kernel_pmap, phys_window); + pmap_expand(kernel_pmap, phys_window, PMAP_EXPAND_OPTIONS_NONE); cdp->cpu_physwindow_base = phys_window; cdp->cpu_physwindow_ptep = vtopte(phys_window); @@ -1072,8 +1064,50 @@ cpu_mode_init(cpu_data_t *cdp) #else fast_syscall_init64(cdp); #endif - - /* Call for per-cpu pmap mode initialization */ - pmap_cpu_init(); } +#if __x86_64__ +/* + * Allocate a new interrupt stack for the boot processor from the + * heap rather than continue to use the statically allocated space. + * Also switch to a dynamically allocated cpu data area. 
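Two details of the descriptor-loading hunks above are easy to miss. First, lgdt()/lidt() consume a 10-byte pseudo-descriptor (a 16-bit limit followed by the base), which is what the cdi_gdt/cdi_idt size-and-pointer pairs describe and why the argument is passed by address. Second, cpu_desc_load64() now writes both MSR_IA32_GS_BASE and MSR_IA32_KERNEL_GS_BASE rather than writing one and executing swapgs. A hedged sketch of the operand layout only, not the kernel's actual declaration:

    #include <stdint.h>

    /* The 10-byte memory operand consumed by LGDT/LIDT in long mode. */
    struct pseudo_descriptor {
            uint16_t limit;     /* table size in bytes, minus one */
            uint64_t base;      /* linear address of the table */
    } __attribute__((packed));

    static inline void
    load_idt(const struct pseudo_descriptor *pd)
    {
            /* Privileged: only meaningful in kernel context. */
            __asm__ volatile("lidt %0" : : "m" (*pd));
    }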
+ */ +void +cpu_data_realloc(void) +{ + int ret; + vm_offset_t stack; + cpu_data_t *cdp; + boolean_t istate; + + ret = kmem_alloc(kernel_map, &stack, INTSTACK_SIZE); + if (ret != KERN_SUCCESS) { + panic("cpu_data_realloc() stack alloc, ret=%d\n", ret); + } + bzero((void*) stack, INTSTACK_SIZE); + stack += INTSTACK_SIZE; + + ret = kmem_alloc(kernel_map, (vm_offset_t *) &cdp, sizeof(cpu_data_t)); + if (ret != KERN_SUCCESS) { + panic("cpu_data_realloc() cpu data alloc, ret=%d\n", ret); + } + + /* Copy old contents into new area and make fix-ups */ + bcopy((void *) &cpu_data_master, (void*) cdp, sizeof(cpu_data_t)); + cdp->cpu_this = cdp; + cdp->cpu_int_stack_top = stack; + timer_call_initialize_queue(&cdp->rtclock_timer.queue); + + kprintf("Reallocated master cpu data: %p, interrupt stack top: %p\n", + (void *) cdp, (void *) stack); + + /* + * With interrupts disabled commmit the new areas. + */ + istate = ml_set_interrupts_enabled(FALSE); + cpu_data_ptr[0] = cdp; + wrmsr64(MSR_IA32_GS_BASE, (uintptr_t) cdp); + wrmsr64(MSR_IA32_KERNEL_GS_BASE, (uintptr_t) cdp); + (void) ml_set_interrupts_enabled(istate); +} +#endif /* __x86_64__ */ diff --git a/osfmk/i386/mp_desc.h b/osfmk/i386/mp_desc.h index 97b04c9cb..3b8ef7ea1 100644 --- a/osfmk/i386/mp_desc.h +++ b/osfmk/i386/mp_desc.h @@ -60,7 +60,6 @@ #ifndef _I386_MP_DESC_H_ #define _I386_MP_DESC_H_ -#include #include __BEGIN_DECLS @@ -80,19 +79,18 @@ __BEGIN_DECLS /* * The descriptor tables are together in a structure * allocated one per processor (except for the boot processor). - * Note that dbtss could be conditionalized on MACH_KDB, but - * doing so increases misconfiguration risk. */ typedef struct cpu_desc_table { struct fake_descriptor idt[IDTSZ] __attribute__ ((aligned (16))); struct fake_descriptor gdt[GDTSZ] __attribute__ ((aligned (16))); struct i386_tss ktss __attribute__ ((aligned (16))); - struct i386_tss dbtss __attribute__ ((aligned (16))); struct sysenter_stack sstk; } cpu_desc_table_t; typedef struct cpu_desc_table64 { +#if !__x86_64__ struct fake_descriptor64 idt[IDTSZ] __attribute__ ((aligned (16))); +#endif struct fake_descriptor gdt[GDTSZ] __attribute__ ((aligned (16))); struct x86_64_tss ktss __attribute__ ((aligned (16))); struct sysenter_stack sstk __attribute__ ((aligned (16))); @@ -103,7 +101,6 @@ typedef struct cpu_desc_table64 { #define current_idt() (current_cpu_datap()->cpu_desc_index.cdi_idt.ptr) #define current_ldt() (current_cpu_datap()->cpu_desc_index.cdi_ldt) #define current_ktss() (current_cpu_datap()->cpu_desc_index.cdi_ktss) -#define current_dbtss() (current_cpu_datap()->cpu_desc_index.cdi_dbtss) #define current_sstk() (current_cpu_datap()->cpu_desc_index.cdi_sstk) #define current_ktss64() ((struct x86_64_tss *) current_ktss()) diff --git a/osfmk/i386/mp_native.c b/osfmk/i386/mp_native.c index 73d3b1ca0..ea3799780 100644 --- a/osfmk/i386/mp_native.c +++ b/osfmk/i386/mp_native.c @@ -110,17 +110,5 @@ i386_cpu_IPI(int cpu) } #endif /* MP_DEBUG */ -#if MACH_KDB -#ifdef MP_DEBUG - if(!trappedalready && (cpu_datap(cpu)->cpu_signals & 6)) { /* (BRINGUP) */ - if(kdb_cpu != cpu_number()) { - trappedalready = 1; - panic("i386_cpu_IPI: sending enter debugger signal (%08X) to cpu %d and I do not own debugger, owner = %08X\n", - cpu_datap(cpu)->cpu_signals, cpu, kdb_cpu); - } - } -#endif /* MP_DEBUG */ -#endif - lapic_send_ipi(cpu, LAPIC_VECTOR(INTERPROCESSOR)); } diff --git a/osfmk/i386/mtrr.c b/osfmk/i386/mtrr.c index 63a19c6a2..0978551b2 100644 --- a/osfmk/i386/mtrr.c +++ b/osfmk/i386/mtrr.c @@ -1,5 +1,5 @@ /* - * 
Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -34,6 +34,7 @@ #include #include #include +#include struct mtrr_var_range { uint64_t base; /* in IA32_MTRR_PHYSBASE format */ @@ -62,6 +63,7 @@ decl_simple_lock_data(static, mtrr_lock); #define MTRR_LOCK() simple_lock(&mtrr_lock); #define MTRR_UNLOCK() simple_unlock(&mtrr_lock); +//#define MTRR_DEBUG 1 #if MTRR_DEBUG #define DBG(x...) kprintf(x) #else @@ -692,3 +694,62 @@ pat_init(void) } ml_set_interrupts_enabled(istate); } + +#if DEBUG +void +mtrr_lapic_cached(void); +void +mtrr_lapic_cached(void) +{ + boolean_t istate; + uint32_t lo; + uint32_t hi; + uint64_t lapic_pbase; + uint64_t base; + uint64_t length; + uint32_t type; + unsigned int i; + + /* Find the local APIC physical base address */ + rdmsr(MSR_IA32_APIC_BASE, lo, hi); + lapic_pbase = (lo & MSR_IA32_APIC_BASE_BASE); + + DBG("mtrr_lapic_cached() on cpu %d, lapic_pbase: 0x%016llx\n", + get_cpu_number(), lapic_pbase); + + istate = ml_set_interrupts_enabled(FALSE); + + /* + * Search for the variable range MTRR mapping the lapic. + * Flip its type to WC and return. + */ + for (i = 0; i < mtrr_state.var_count; i++) { + if (!(mtrr_state.var_range[i].mask & IA32_MTRR_PHYMASK_VALID)) + continue; + base = mtrr_state.var_range[i].base & IA32_MTRR_PHYSBASE_MASK; + type = (uint32_t)(mtrr_state.var_range[i].base & IA32_MTRR_PHYSBASE_TYPE); + length = MASK_TO_LEN(mtrr_state.var_range[i].mask); + DBG("%d: base: 0x%016llx size: 0x%016llx type: %d\n", + i, base, length, type); + if (base <= lapic_pbase && + lapic_pbase <= base + length - PAGE_SIZE) { + DBG("mtrr_lapic_cached() matched var: %d\n", i); + mtrr_state.var_range[i].base &=~IA32_MTRR_PHYSBASE_TYPE; + mtrr_state.var_range[i].base |= MTRR_TYPE_WRITECOMBINE; + ml_set_interrupts_enabled(istate); + } + } + + /* + * In case we didn't find a covering variable range, + * we slam WC into the default memory type. 
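The matching loop in mtrr_lapic_cached() above follows the IA-32 variable-range MTRR convention: an address hits range i when (addr & mask) == (base & mask), and for a power-of-two range the PHYSMASK field implies the length. A sketch of what a MASK_TO_LEN-style helper computes, assuming a 36-bit physical address width (the real macro and the detected width live elsewhere in mtrr.c):

    #include <stdint.h>

    #define PHYS_BITS   36                          /* assumed MAXPHYADDR */
    #define ADDR_MASK   ((1ULL << PHYS_BITS) - 1)

    /* For an S-byte, S-aligned range, mask = ~(S - 1) truncated to
     * PHYS_BITS bits, so the length is 2^PHYS_BITS minus the mask. */
    static inline uint64_t
    mask_to_len(uint64_t mask)
    {
            return (1ULL << PHYS_BITS) - (mask & ADDR_MASK & ~0xFFFULL);
    }

    static inline int
    mtrr_covers(uint64_t base, uint64_t mask, uint64_t addr)
    {
            return ((addr & mask) == (base & mask));
    }

For example, a 1 MB range yields mask = 2^36 - 2^20 (low 12 bits clear), and mask_to_len() returns 0x100000.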
+ */ + mtrr_state.MTRRdefType = MTRR_TYPE_WRITECOMBINE; + + mtrr_update_cpu(); + + ml_set_interrupts_enabled(istate); + + return; +} +#endif /* DEBUG */ diff --git a/osfmk/i386/pal_hibernate.h b/osfmk/i386/pal_hibernate.h index a1fefe4e5..13d48fbe4 100644 --- a/osfmk/i386/pal_hibernate.h +++ b/osfmk/i386/pal_hibernate.h @@ -36,7 +36,7 @@ #define IMAGE_AREA (BITMAP_AREA - HIB_MAP_SIZE) #define IMAGE2_AREA (IMAGE_AREA - HIB_MAP_SIZE) -#define HIB_BASE sectINITPTB +#define HIB_BASE segHIBB #define HIB_ENTRYPOINT acpi_wake_prot_entry uintptr_t pal_hib_map(uintptr_t v, uint64_t p); diff --git a/osfmk/i386/pal_routines.c b/osfmk/i386/pal_routines.c index 34e5bd0a5..cb83084e6 100644 --- a/osfmk/i386/pal_routines.c +++ b/osfmk/i386/pal_routines.c @@ -64,12 +64,7 @@ struct pal_apic_table *apic_table = NULL; decl_simple_lock_data(static , pal_efi_lock); #ifdef __x86_64__ -#define PML4_PROT (INTEL_PTE_VALID | INTEL_PTE_WRITE) -#define INIT_PDPT_BASE (INITPT_SEG_BASE + PAGE_SIZE) -static pml4_entry_t IDPML4[PTE_PER_PAGE] __attribute__ ((aligned (4096))) = { - [0] = (uint64_t)(INIT_PDPT_BASE | PML4_PROT), - [KERNEL_PML4_INDEX] = (uint64_t)(INIT_PDPT_BASE | PML4_PROT), -}; +static pml4_entry_t IDPML4[PTE_PER_PAGE] __attribute__ ((aligned (4096))); uint64_t pal_efi_saved_cr0; uint64_t pal_efi_saved_cr3; #endif @@ -168,6 +163,14 @@ pal_efi_call_in_64bit_mode(uint64_t func, return KERN_NOT_SUPPORTED; } + if (func < VM_MIN_KERNEL_ADDRESS) { + /* + * EFI Runtime Services must be mapped in our address + * space at an appropriate location. + */ + return KERN_INVALID_ADDRESS; + } + _pal_efi_call_in_64bit_mode_asm(func, efi_reg, stack_contents, @@ -245,6 +248,8 @@ pal_efi_call_in_32bit_mode(uint32_t func, MARK_CPU_IDLE(cpu_number()); pal_efi_saved_cr3 = get_cr3_raw(); pal_efi_saved_cr0 = get_cr0(); + IDPML4[KERNEL_PML4_INDEX] = IdlePML4[KERNEL_PML4_INDEX]; + IDPML4[0] = IdlePML4[KERNEL_PML4_INDEX]; clear_ts(); set_cr3_raw((uint64_t) ID_MAP_VTOP(IDPML4)); diff --git a/osfmk/i386/pcb.c b/osfmk/i386/pcb.c index caf0c68db..a70348b33 100644 --- a/osfmk/i386/pcb.c +++ b/osfmk/i386/pcb.c @@ -155,24 +155,9 @@ set_thread_state64(thread_t thread, x86_thread_state64_t *ts); static inline void machine_pmc_cswitch(thread_t /* old */, thread_t /* new */); -static inline boolean_t -machine_thread_pmc_eligible(thread_t); - static inline void pmc_swi(thread_t /* old */, thread_t /*new */); -static inline boolean_t -machine_thread_pmc_eligible(thread_t t) { - /* - * NOTE: Task-level reservations are propagated to child threads via - * thread_create_internal. Any mutation of task reservations forces a - * recalculate of t_chud (for the pmc flag) for all threads in that task. - * Consequently, we can simply check the current thread's flag against - * THREAD_PMC_FLAG. If the result is non-zero, we SWI for a PMC switch. - */ - return (t != NULL) ? ((t->t_chud & THREAD_PMC_FLAG) ? TRUE : FALSE) : FALSE; -} - static inline void pmc_swi(thread_t old, thread_t new) { current_cpu_datap()->csw_old_thread = old; @@ -182,7 +167,7 @@ pmc_swi(thread_t old, thread_t new) { static inline void machine_pmc_cswitch(thread_t old, thread_t new) { - if (machine_thread_pmc_eligible(old) || machine_thread_pmc_eligible(new)) { + if (pmc_thread_eligible(old) || pmc_thread_eligible(new)) { pmc_swi(old, new); } } @@ -1675,27 +1660,6 @@ machine_set_current_thread(thread_t thread) current_cpu_datap()->cpu_active_thread = thread; } -/* - * This is called when a task is terminated, and also on exec(). 
- * Clear machine-dependent state that is stored on the task. - */ -void -machine_thread_terminate_self(void) -{ - task_t self_task = current_task(); - if (self_task) { - user_ldt_t user_ldt = self_task->i386_ldt; - if (user_ldt != 0) { - self_task->i386_ldt = 0; - user_ldt_free(user_ldt); - } - - if (self_task->task_debug != NULL) { - zfree(ids_zone, self_task->task_debug); - self_task->task_debug = NULL; - } - } -} /* * Perform machine-dependent per-thread initializations @@ -2130,3 +2094,11 @@ copy_debug_state64( target->dr6 = src->dr6; target->dr7 = src->dr7; } + +boolean_t is_useraddr64_canonical(uint64_t addr64); + +boolean_t +is_useraddr64_canonical(uint64_t addr64) +{ + return IS_USERADDR64_CANONICAL(addr64); +} diff --git a/osfmk/i386/phys.c b/osfmk/i386/phys.c index bfbb48d4b..4db5983c1 100644 --- a/osfmk/i386/phys.c +++ b/osfmk/i386/phys.c @@ -244,9 +244,6 @@ __private_extern__ void ml_copy_phys(addr64_t src64, addr64_t dst64, vm_size_t b src = (void *) ((uintptr_t)src_map->prv_CADDR | ((uint32_t)src64 & INTEL_OFFMASK)); dst = (void *) ((uintptr_t)dst_map->prv_CADDR | ((uint32_t)dst64 & INTEL_OFFMASK)); #elif defined(__x86_64__) - src = PHYSMAP_PTOV(src64); - dst = PHYSMAP_PTOV(dst64); - addr64_t debug_pa = 0; /* If either destination or source are outside the @@ -256,10 +253,15 @@ __private_extern__ void ml_copy_phys(addr64_t src64, addr64_t dst64, vm_size_t b if (physmap_enclosed(src64) == FALSE) { src = (void *)(debugger_window_kva | (src64 & INTEL_OFFMASK)); + dst = PHYSMAP_PTOV(dst64); debug_pa = src64 & PG_FRAME; } else if (physmap_enclosed(dst64) == FALSE) { + src = PHYSMAP_PTOV(src64); dst = (void *)(debugger_window_kva | (dst64 & INTEL_OFFMASK)); debug_pa = dst64 & PG_FRAME; + } else { + src = PHYSMAP_PTOV(src64); + dst = PHYSMAP_PTOV(dst64); } /* DRK: debugger only routine, we don't bother checking for an * identical mapping. diff --git a/osfmk/i386/pmCPU.c b/osfmk/i386/pmCPU.c index 1f064b614..b22749df7 100644 --- a/osfmk/i386/pmCPU.c +++ b/osfmk/i386/pmCPU.c @@ -66,7 +66,6 @@ static boolean_t earlyTopology = FALSE; static uint64_t earlyMaxBusDelay = DELAY_UNSET; static uint64_t earlyMaxIntDelay = DELAY_UNSET; - /* * Initialize the Cstate change code. */ @@ -97,8 +96,8 @@ machine_idle(void) /* * Handle case where ml_set_maxbusdelay() or ml_set_maxintdelay() * were called prior to the CPU PM kext being registered. We do - * this here since we know at this point since it'll be at idle - * where the decision using these values will be used. + * this here since we know at this point the values will be first + * used since idle is where the decisions using these values is made. 
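The is_useraddr64_canonical() helper added at the end of the pcb.c hunk encodes the x86-64 canonical-address rule: bits 63:47 must be a sign-extension of bit 47, and user-space addresses occupy only the non-negative half. A hedged restatement (IS_USERADDR64_CANONICAL itself is defined elsewhere in xnu):

    #include <stdbool.h>
    #include <stdint.h>

    /* Canonical: bits 63:47 are a sign-extension of bit 47. */
    static inline bool
    va_is_canonical(uint64_t va)
    {
            int64_t sva = (int64_t)va;
            return ((sva >> 47) == 0) || ((sva >> 47) == -1);
    }

    /* User canonical: the non-negative half of the address space. */
    static inline bool
    va_is_user_canonical(uint64_t va)
    {
            return (va < 0x0000800000000000ULL);
    }

The signed shift assumes arithmetic right-shift of negative values, which every mainstream compiler targeting x86-64 provides.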
*/ if (earlyMaxBusDelay != DELAY_UNSET) ml_set_maxbusdelay((uint32_t)(earlyMaxBusDelay & 0xFFFFFFFF)); diff --git a/osfmk/i386/pmap.c b/osfmk/i386/pmap.c index c31ab2d8a..2e0594487 100644 --- a/osfmk/i386/pmap.c +++ b/osfmk/i386/pmap.c @@ -89,7 +89,6 @@ */ #include -#include #include #include @@ -100,6 +99,7 @@ #include #include #include +#include #include #include @@ -132,13 +132,6 @@ #include #include -#if MACH_KDB -#include -#include -#include -#include -#endif /* MACH_KDB */ - #include #include @@ -227,53 +220,9 @@ unsigned int last_managed_page = 0; uint64_t pde_mapped_size; -/* - * Locking and TLB invalidation - */ +const boolean_t pmap_disable_kheap_nx = TRUE; +const boolean_t pmap_disable_kstack_nx = TRUE; -/* - * Locking Protocols: (changed 2/2007 JK) - * - * There are two structures in the pmap module that need locking: - * the pmaps themselves, and the per-page pv_lists (which are locked - * by locking the pv_lock_table entry that corresponds to the pv_head - * for the list in question.) Most routines want to lock a pmap and - * then do operations in it that require pv_list locking -- however - * pmap_remove_all and pmap_copy_on_write operate on a physical page - * basis and want to do the locking in the reverse order, i.e. lock - * a pv_list and then go through all the pmaps referenced by that list. - * - * The system wide pmap lock has been removed. Now, paths take a lock - * on the pmap before changing its 'shape' and the reverse order lockers - * (coming in by phys ppn) take a lock on the corresponding pv and then - * retest to be sure nothing changed during the window before they locked - * and can then run up/down the pv lists holding the list lock. This also - * lets the pmap layer run (nearly completely) interrupt enabled, unlike - * previously. 
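The locking-protocol comment deleted above is worth keeping in mind when reading the pv-list changes later in this patch: forward lockers take the pmap lock first, while reverse-order lockers arriving by physical page take the pv lock and then retest that nothing changed in the window before the lock was held. A generic sketch of that lock-then-revalidate idiom (pthread mutexes standing in for the kernel locks; structure names invented):

    #include <pthread.h>

    struct pmap;                             /* opaque here */

    struct mapping {
            struct pmap  *pmap;
            unsigned long va;
    };

    /* Reverse-order locker: snapshot unlocked, lock the pv side,
     * then re-check the snapshot before trusting it. */
    static void
    pv_side_operation(struct mapping *m, pthread_mutex_t *pv_lock)
    {
            for (;;) {
                    struct mapping snap = *m;        /* unlocked snapshot */
                    pthread_mutex_lock(pv_lock);
                    if (m->pmap == snap.pmap && m->va == snap.va)
                            break;                   /* unchanged: proceed locked */
                    pthread_mutex_unlock(pv_lock);   /* raced: retry */
            }
            /* ... walk or modify the pv list here ... */
            pthread_mutex_unlock(pv_lock);
    }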
- */ - - -/* - * PV locking - */ - -#define LOCK_PVH(index) { \ - mp_disable_preemption(); \ - lock_pvh_pai(index); \ -} - -#define UNLOCK_PVH(index) { \ - unlock_pvh_pai(index); \ - mp_enable_preemption(); \ -} - -/* - * PV hash locking - */ - -#define LOCK_PV_HASH(hash) lock_hash_hash(hash) - -#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash) #if USLOCK_DEBUG extern int max_lock_loops; @@ -322,18 +271,6 @@ int pt_fake_zone_index = -1; extern long NMIPI_acks; -static inline void -PMAP_ZINFO_SALLOC(vm_size_t bytes) -{ - current_thread()->tkm_shared.alloc += bytes; -} - -static inline void -PMAP_ZINFO_SFREE(vm_size_t bytes) -{ - current_thread()->tkm_shared.free += (bytes); -} - addr64_t kernel64_cr3; boolean_t no_shared_cr3 = FALSE; /* -no_shared_cr3 boot arg */ @@ -557,7 +494,7 @@ pmap_map( ps = PAGE_SIZE; while (start_addr < end_addr) { pmap_enter(kernel_pmap, (vm_map_offset_t)virt, - (ppnum_t) i386_btop(start_addr), prot, flags, FALSE); + (ppnum_t) i386_btop(start_addr), prot, VM_PROT_NONE, flags, FALSE); virt += ps; start_addr += ps; } @@ -644,9 +581,6 @@ pmap_init_high_shared(void) vm_offset_t haddr; spl_t s; -#if MACH_KDB - struct i386_tss *ttss; -#endif cpu_desc_index_t * cdi = &cpu_data_master.cpu_desc_index; @@ -696,17 +630,6 @@ pmap_init_high_shared(void) fix_desc(&temp_fake_desc, 1); *(struct fake_descriptor *) &master_gdt[sel_idx(KERNEL_TSS)] = temp_fake_desc; kprintf("KTSS: 0x%x, ",haddr); -#if MACH_KDB - /* remap dbtss up high and put new high addr into gdt */ - haddr = pmap_high_shared_remap(HIGH_FIXED_DBTSS, - (vm_offset_t) &master_dbtss, 1); - temp_fake_desc = tss_desc_pattern; - temp_fake_desc.offset = (vm_offset_t) haddr; - fix_desc(&temp_fake_desc, 1); - *(struct fake_descriptor *)&master_gdt[sel_idx(DEBUG_TSS)] = temp_fake_desc; - ttss = (struct i386_tss *)haddr; - kprintf("DBTSS: 0x%x, ",haddr); -#endif /* MACH_KDB */ /* remap dftss up high and put new high addr into gdt */ haddr = pmap_high_shared_remap(HIGH_FIXED_DFTSS, @@ -742,7 +665,7 @@ pmap_bootstrap( boolean_t IA32e) { vm_offset_t va; - int i; + unsigned i; pdpt_entry_t *pdpt; spl_t s; @@ -1133,7 +1056,7 @@ pmap_lowmem_finalize(void) /* * Update wired memory statistics for early boot pages */ - PMAP_ZINFO_PALLOC(bootstrap_wired_pages * PAGE_SIZE); + PMAP_ZINFO_PALLOC(kernel_pmap, bootstrap_wired_pages * PAGE_SIZE); /* * Free all pages in pmap regions below the base: @@ -1375,11 +1298,12 @@ pmap_is_empty( */ pmap_t pmap_create( - vm_map_size_t sz, - boolean_t is_64bit) + ledger_t ledger, + vm_map_size_t sz, + boolean_t is_64bit) { - pmap_t p; - int i; + pmap_t p; + unsigned i; vm_offset_t va; vm_size_t size; pdpt_entry_t *pdpt; @@ -1410,6 +1334,8 @@ pmap_create( p->stats.resident_count = 0; p->stats.resident_max = 0; p->stats.wired_count = 0; + ledger_reference(ledger); + p->ledger = ledger; p->ref_count = 1; p->nx_enabled = 1; p->pm_shared = FALSE; @@ -1437,7 +1363,7 @@ pmap_create( va = (vm_offset_t)p->dirbase; p->pdirbase = kvtophys(va); - PMAP_ZINFO_SALLOC(NBPTD); + PMAP_ZINFO_SALLOC(p,NBPTD); template = INTEL_PTE_VALID; for (i = 0; i< NPGPTD; i++, pdpt++ ) { @@ -1463,7 +1389,7 @@ pmap_create( OSAddAtomic(1, &inuse_ptepages_count); OSAddAtomic64(1, &alloc_ptepages_count); - PMAP_ZINFO_SALLOC(PAGE_SIZE); + PMAP_ZINFO_SALLOC(p, PAGE_SIZE); /* allocate the vm_objs to hold the pdpt, pde and pte pages */ @@ -1485,7 +1411,7 @@ pmap_create( if (!is_64bit) { while ((pdp = pmap64_pde(p, (uint64_t)HIGH_MEM_BASE)) == PD_ENTRY_NULL) { splx(s); - pmap_expand_pdpt(p, (uint64_t)HIGH_MEM_BASE); /* need room for another 
pde entry */ + pmap_expand_pdpt(p, (uint64_t)HIGH_MEM_BASE, PMAP_EXPAND_OPTIONS_NONE); /* need room for another pde entry */ s = splhigh(); } pmap_store_pte(pdp, high_shared_pde); @@ -1559,7 +1485,7 @@ pmap_set_4GB_pagezero(pmap_t p) PMAP_LOCK(p); while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) { PMAP_UNLOCK(p); - pmap_expand_pml4(p, 0x0); + pmap_expand_pml4(p, 0x0, PMAP_EXPAND_OPTIONS_NONE); PMAP_LOCK(p); } kern_pdptp = kernel_pmap->pm_pdpt; @@ -1585,8 +1511,10 @@ pmap_clear_4GB_pagezero(pmap_t p) p->pm_task_map = TASK_MAP_64BIT; istate = ml_set_interrupts_enabled(FALSE); + if (current_cpu_datap()->cpu_task_map == TASK_MAP_64BIT_SHARED) - current_cpu_datap()->cpu_task_map = TASK_MAP_64BIT; + current_cpu_datap()->cpu_task_map = TASK_MAP_64BIT; + pmap_load_kernel_cr3(); user_pdptp = pmap64_pdpt(p, 0x0); @@ -1665,10 +1593,10 @@ pmap_destroy( */ if (!cpu_64bit) { OSAddAtomic(-p->pm_obj->resident_page_count, &inuse_ptepages_count); - PMAP_ZINFO_PFREE(p->pm_obj->resident_page_count * PAGE_SIZE); + PMAP_ZINFO_PFREE(p, p->pm_obj->resident_page_count * PAGE_SIZE); kmem_free(kernel_map, (vm_offset_t)p->dirbase, NBPTD); - PMAP_ZINFO_SFREE(NBPTD); + PMAP_ZINFO_SFREE(p, NBPTD); zfree(pdpt_zone, (void *)p->pm_hold); @@ -1679,7 +1607,7 @@ pmap_destroy( /* free 64 bit mode structs */ kmem_free(kernel_map, (vm_offset_t)p->pm_hold, PAGE_SIZE); - PMAP_ZINFO_SFREE(PAGE_SIZE); + PMAP_ZINFO_SFREE(p, PAGE_SIZE); inuse_ptepages += p->pm_obj_pml4->resident_page_count; vm_object_deallocate(p->pm_obj_pml4); @@ -1691,8 +1619,9 @@ pmap_destroy( vm_object_deallocate(p->pm_obj); OSAddAtomic(-(inuse_ptepages+1), &inuse_ptepages_count); - PMAP_ZINFO_PFREE(inuse_ptepages * PAGE_SIZE); + PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE); } + ledger_dereference(p->ledger); zfree(pmap_zone, p); @@ -1785,34 +1714,33 @@ pmap_protect( while (spte < epte) { - if (*spte & INTEL_PTE_VALID) { - - if (prot & VM_PROT_WRITE) - pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_WRITE)); - else - pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_WRITE)); + if (*spte & INTEL_PTE_VALID) { + if (prot & VM_PROT_WRITE) + pmap_update_pte(spte, 0, INTEL_PTE_WRITE); + else + pmap_update_pte(spte, INTEL_PTE_WRITE, 0); - if (set_NX == TRUE) - pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_NX)); - else - pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_NX)); + if (set_NX == TRUE) + pmap_update_pte(spte,0, INTEL_PTE_NX); + else + pmap_update_pte(spte, INTEL_PTE_NX, 0); - num_found++; - } - spte++; + num_found++; + } + spte++; } } sva = lva; } if (num_found) { - PMAP_UPDATE_TLBS(map, orig_sva, eva); + PMAP_UPDATE_TLBS(map, orig_sva, eva); } PMAP_UNLOCK(map); PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END, - 0, 0, 0, 0, 0); + 0, 0, 0, 0, 0); } @@ -1830,7 +1758,7 @@ pmap_map_block( uint32_t page; for (page = 0; page < size; page++) { - pmap_enter(pmap, va, pa, prot, attr, TRUE); + pmap_enter(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE); va += PAGE_SIZE; pa++; } @@ -1863,10 +1791,11 @@ pmap_extract( return (paddr); } -void +kern_return_t pmap_expand_pml4( pmap_t map, - vm_map_offset_t vaddr) + vm_map_offset_t vaddr, + __unused unsigned int options) { register vm_page_t m; register pmap_paddr_t pa; @@ -1907,7 +1836,7 @@ pmap_expand_pml4( OSAddAtomic(1, &inuse_ptepages_count); OSAddAtomic64(1, &alloc_ptepages_count); - PMAP_ZINFO_PALLOC(PAGE_SIZE); + PMAP_ZINFO_PALLOC(map, PAGE_SIZE); /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */ vm_object_lock(map->pm_obj_pml4); @@ -1923,8 +1852,8 @@ pmap_expand_pml4( VM_PAGE_FREE(m); 
OSAddAtomic(-1, &inuse_ptepages_count); - PMAP_ZINFO_PFREE(PAGE_SIZE); - return; + PMAP_ZINFO_PFREE(map, PAGE_SIZE); + return KERN_SUCCESS; } pmap_set_noencrypt(pn); @@ -1949,14 +1878,11 @@ pmap_expand_pml4( PMAP_UNLOCK(map); - return; - + return KERN_SUCCESS; } -void -pmap_expand_pdpt( - pmap_t map, - vm_map_offset_t vaddr) +kern_return_t +pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, __unused unsigned int options) { register vm_page_t m; register pmap_paddr_t pa; @@ -1970,7 +1896,7 @@ pmap_expand_pdpt( spl = splhigh(); while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) { splx(spl); - pmap_expand_pml4(map, vaddr); /* need room for another pdpt entry */ + pmap_expand_pml4(map, vaddr, PMAP_EXPAND_OPTIONS_NONE); /* need room for another pdpt entry */ spl = splhigh(); } splx(spl); @@ -2000,7 +1926,7 @@ pmap_expand_pdpt( OSAddAtomic(1, &inuse_ptepages_count); OSAddAtomic64(1, &alloc_ptepages_count); - PMAP_ZINFO_PALLOC(PAGE_SIZE); + PMAP_ZINFO_PALLOC(map, PAGE_SIZE); /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */ vm_object_lock(map->pm_obj_pdpt); @@ -2016,8 +1942,8 @@ pmap_expand_pdpt( VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); - PMAP_ZINFO_PFREE(PAGE_SIZE); - return; + PMAP_ZINFO_PFREE(map, PAGE_SIZE); + return KERN_SUCCESS; } pmap_set_noencrypt(pn); @@ -2042,8 +1968,7 @@ pmap_expand_pdpt( PMAP_UNLOCK(map); - return; - + return KERN_SUCCESS; } @@ -2063,10 +1988,11 @@ pmap_expand_pdpt( * has been expanded enough. * (We won't loop forever, since page tables aren't shrunk.) */ -void +kern_return_t pmap_expand( pmap_t map, - vm_map_offset_t vaddr) + vm_map_offset_t vaddr, + __unused unsigned int options) { pt_entry_t *pdp; register vm_page_t m; @@ -2084,7 +2010,7 @@ pmap_expand( spl = splhigh(); while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) { splx(spl); - pmap_expand_pdpt(map, vaddr); /* need room for another pde entry */ + pmap_expand_pdpt(map, vaddr, PMAP_EXPAND_OPTIONS_NONE); /* need room for another pde entry */ spl = splhigh(); } splx(spl); @@ -2115,7 +2041,7 @@ pmap_expand( OSAddAtomic(1, &inuse_ptepages_count); OSAddAtomic64(1, &alloc_ptepages_count); - PMAP_ZINFO_PALLOC(PAGE_SIZE); + PMAP_ZINFO_PALLOC(map, PAGE_SIZE); /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */ vm_object_lock(map->pm_obj); @@ -2132,8 +2058,8 @@ pmap_expand( VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); - PMAP_ZINFO_PFREE(PAGE_SIZE); - return; + PMAP_ZINFO_PFREE(map, PAGE_SIZE); + return KERN_SUCCESS; } pmap_set_noencrypt(pn); @@ -2162,7 +2088,7 @@ pmap_expand( PMAP_UNLOCK(map); - return; + return KERN_SUCCESS; } @@ -2287,7 +2213,7 @@ pmap_collect( VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); - PMAP_ZINFO_PFREE(PAGE_SIZE); + PMAP_ZINFO_PFREE(map, PAGE_SIZE); } PMAP_LOCK(p); @@ -2379,105 +2305,6 @@ kern_return_t dtrace_copyio_postflight(__unused addr64_t va) } #endif /* CONFIG_DTRACE */ -#if MACH_KDB - -/* show phys page mappings and attributes */ - -extern void db_show_page(pmap_paddr_t pa); - -#if 0 -void -db_show_page(pmap_paddr_t pa) -{ - pv_entry_t pv_h; - int pai; - char attr; - - pai = pa_index(pa); - pv_h = pai_to_pvh(pai); - - attr = pmap_phys_attributes[pai]; - printf("phys page %llx ", pa); - if (attr & PHYS_MODIFIED) - printf("modified, "); - if (attr & PHYS_REFERENCED) - printf("referenced, "); - if (pv_h->pmap || pv_h->next) - printf(" mapped at\n"); - else - printf(" not mapped\n"); - for (; pv_h; pv_h = pv_h->next) - if (pv_h->pmap) - printf("%llx in pmap %p\n", pv_h->va, pv_h->pmap); -} -#endif - -#endif 
/* MACH_KDB */ - -#if MACH_KDB -#if 0 -void db_kvtophys(vm_offset_t); -void db_show_vaddrs(pt_entry_t *); - -/* - * print out the results of kvtophys(arg) - */ -void -db_kvtophys( - vm_offset_t vaddr) -{ - db_printf("0x%qx", kvtophys(vaddr)); -} - -/* - * Walk the pages tables. - */ -void -db_show_vaddrs( - pt_entry_t *dirbase) -{ - pt_entry_t *ptep, *pdep, tmp; - unsigned int x, y, pdecnt, ptecnt; - - if (dirbase == 0) { - dirbase = kernel_pmap->dirbase; - } - if (dirbase == 0) { - db_printf("need a dirbase...\n"); - return; - } - dirbase = (pt_entry_t *) (int) ((unsigned long) dirbase & ~INTEL_OFFMASK); - - db_printf("dirbase: 0x%x\n", dirbase); - - pdecnt = ptecnt = 0; - pdep = &dirbase[0]; - for (y = 0; y < NPDEPG; y++, pdep++) { - if (((tmp = *pdep) & INTEL_PTE_VALID) == 0) { - continue; - } - pdecnt++; - ptep = (pt_entry_t *) ((unsigned long)(*pdep) & ~INTEL_OFFMASK); - db_printf("dir[%4d]: 0x%x\n", y, *pdep); - for (x = 0; x < NPTEPG; x++, ptep++) { - if (((tmp = *ptep) & INTEL_PTE_VALID) == 0) { - continue; - } - ptecnt++; - db_printf(" tab[%4d]: 0x%x, va=0x%x, pa=0x%x\n", - x, - *ptep, - (y << 22) | (x << 12), - *ptep & ~INTEL_OFFMASK); - } - } - - db_printf("total: %d tables, %d page table entries.\n", pdecnt, ptecnt); - -} -#endif -#endif /* MACH_KDB */ - #include #if MACH_VM_DEBUG #include @@ -2620,7 +2447,7 @@ pmap_cpu_alloc(boolean_t is_boot_cpu) spl_t s; s = splhigh(); while ((pte = pmap_pte(kernel_pmap, (vm_map_offset_t)address)) == 0) - pmap_expand(kernel_pmap, (vm_map_offset_t)address); + pmap_expand(kernel_pmap, (vm_map_offset_t)address, PMAP_EXPAND_OPTIONS_NONE); * (int *) pte = 0; cp->mapwindow[i].prv_CADDR = (caddr_t) address; cp->mapwindow[i].prv_CMAP = pte; @@ -2929,7 +2756,7 @@ dump_4GB_pdpt(pmap_t p) spl = splhigh(); while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) { splx(spl); - pmap_expand_pml4(p, 0x0); + pmap_expand_pml4(p, 0x0, PMAP_EXPAND_OPTIONS_NONE); spl = splhigh(); } kern_pdptp = kernel_pmap->pm_pdpt; diff --git a/osfmk/i386/pmap.h b/osfmk/i386/pmap.h index a168562c9..5cc91b6e2 100644 --- a/osfmk/i386/pmap.h +++ b/osfmk/i386/pmap.h @@ -107,17 +107,16 @@ #endif /* ASSEMBLER */ -#define NPGPTD 4 -#define PDESHIFT 21 -#define PTEMASK 0x1ff -#define PTEINDX 3 - -#define PTESHIFT 12 +#define NPGPTD 4ULL +#define PDESHIFT 21ULL +#define PTEMASK 0x1ffULL +#define PTEINDX 3ULL +#define PTESHIFT 12ULL +#ifdef __i386__ #define INITPT_SEG_BASE 0x100000 -#define INITGDT_SEG_BASE 0x106000 -#define SLEEP_SEG_BASE 0x107000 +#endif #ifdef __x86_64__ #define LOW_4GB_MASK ((vm_offset_t)0x00000000FFFFFFFFUL) @@ -135,7 +134,7 @@ #define NBPTD (NPGPTD << PAGE_SHIFT) #define NPDEPTD (NBPTD / (sizeof (pd_entry_t))) #define NPDEPG (PAGE_SIZE/(sizeof (pd_entry_t))) -#define NBPDE (1 << PDESHIFT) +#define NBPDE (1ULL << PDESHIFT) #define PDEMASK (NBPDE - 1) #define PTE_PER_PAGE 512 /* number of PTE's per page on any level */ @@ -153,7 +152,7 @@ typedef uint64_t pdpt_entry_t; #define NPDPTPG (PAGE_SIZE/(sizeof (pdpt_entry_t))) #define PDPTSHIFT 30 #define PDPTPGSHIFT 9 -#define NBPDPT (1 << PDPTSHIFT) +#define NBPDPT (1ULL << PDPTSHIFT) #define PDPTMASK (NBPDPT-1) #define PDPT_ENTRY_NULL ((pdpt_entry_t *) 0) @@ -161,7 +160,7 @@ typedef uint64_t pd_entry_t; #define NPDPG (PAGE_SIZE/(sizeof (pd_entry_t))) #define PDSHIFT 21 #define PDPGSHIFT 9 -#define NBPD (1 << PDSHIFT) +#define NBPD (1ULL << PDSHIFT) #define PDMASK (NBPD-1) #define PD_ENTRY_NULL ((pd_entry_t *) 0) @@ -169,7 +168,7 @@ typedef uint64_t pt_entry_t; #define NPTPG (PAGE_SIZE/(sizeof (pt_entry_t))) #define 
PTSHIFT 12 #define PTPGSHIFT 9 -#define NBPT (1 << PTSHIFT) +#define NBPT (1ULL << PTSHIFT) #define PTMASK (NBPT-1) #define PT_ENTRY_NULL ((pt_entry_t *) 0) @@ -234,58 +233,6 @@ pmap_store_pte(pt_entry_t *entryp, pt_entry_t value) #endif } -/* - * Atomic 64-bit compare and exchange of a page table entry. - */ -static inline boolean_t -pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new) -{ - boolean_t ret; - -#ifdef __i386__ - /* - * Load the old value into %edx:%eax - * Load the new value into %ecx:%ebx - * Compare-exchange-8bytes at address entryp (loaded in %edi) - * If the compare succeeds, the new value is stored, return TRUE. - * Otherwise, no swap is made, return FALSE. - */ - asm volatile( - " lock; cmpxchg8b (%1) \n\t" - " setz %%al \n\t" - " movzbl %%al,%0" - : "=a" (ret) - : "D" (entryp), - "a" ((uint32_t)old), - "d" ((uint32_t)(old >> 32)), - "b" ((uint32_t)new), - "c" ((uint32_t)(new >> 32)) - : "memory"); -#else - /* - * Load the old value into %rax - * Load the new value into another register - * Compare-exchange-quad at address entryp - * If the compare succeeds, the new value is stored, return TRUE. - * Otherwise, no swap is made, return FALSE. - */ - asm volatile( - " lock; cmpxchgq %2,(%3) \n\t" - " setz %%al \n\t" - " movzbl %%al,%0" - : "=a" (ret) - : "a" (old), - "r" (new), - "r" (entryp) - : "memory"); -#endif - return ret; -} - -#define pmap_update_pte(entryp, old, new) \ - while (!pmap_cmpx_pte((entryp), (old), (new))) - - /* in 64 bit spaces, the number of each type of page in the page tables */ #define NPML4PGS (1ULL * (PAGE_SIZE/(sizeof (pml4_entry_t)))) #define NPDPTPGS (NPML4PGS * (PAGE_SIZE/(sizeof (pdpt_entry_t)))) @@ -302,14 +249,15 @@ pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new) #define KERNEL_UBER_BASE (0ULL - NBPML4) #define KERNEL_UBER_BASE_HI32 ((uint32_t)(KERNEL_UBER_BASE >> 32)) #else -#define KERNEL_PML4_INDEX 511 +#define KERNEL_PML4_INDEX 511 #define KERNEL_KEXTS_INDEX 510 /* Home of KEXTs - the basement */ -#define KERNEL_PHYSMAP_INDEX 509 /* virtual to physical map */ +#define KERNEL_PHYSMAP_PML4_INDEX 509 /* virtual to physical map */ #define KERNEL_BASE (0ULL - NBPML4) #define KERNEL_BASEMENT (KERNEL_BASE - NBPML4) #endif #define VM_WIMG_COPYBACK VM_MEM_COHERENT +#define VM_WIMG_COPYBACKLW VM_WIMG_COPYBACK #define VM_WIMG_DEFAULT VM_MEM_COHERENT /* ?? intel ?? */ #define VM_WIMG_IO (VM_MEM_COHERENT | \ @@ -317,7 +265,7 @@ pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new) #define VM_WIMG_WTHRU (VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED) /* write combining mode, aka store gather */ #define VM_WIMG_WCOMB (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT) - +#define VM_WIMG_INNERWBACK VM_MEM_COHERENT /* * Pte related macros */ @@ -426,19 +374,19 @@ enum high_fixed_addresses { * without using the bit fields). 
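The widening of NBPDE, NBPDPT, NBPD, and NBPT to 1ULL constants in the hunks above matters as soon as these constants participate in 64-bit address arithmetic: with an int-typed constant, a product such as index * NBPDE is computed in 32-bit math and wraps for any index >= 1024, long before the result is widened. A small demonstration of the failure mode (hypothetical code, not from the patch):

    #include <stdint.h>
    #include <stdio.h>

    #define NBPDE_BAD   (1 << 21)       /* int: 2 MB */
    #define NBPDE_GOOD  (1ULL << 21)    /* uint64_t: 2 MB */

    int main(void)
    {
            uint32_t pde_index = 4096;  /* an 8 GB offset, in 2 MB units */

            uint64_t bad  = pde_index * NBPDE_BAD;   /* 32-bit multiply: wraps to 0 */
            uint64_t good = pde_index * NBPDE_GOOD;  /* 64-bit multiply: 0x200000000 */

            printf("bad=0x%llx good=0x%llx\n",
                   (unsigned long long)bad, (unsigned long long)good);
            return 0;
    }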
*/ -#define INTEL_PTE_VALID 0x00000001 -#define INTEL_PTE_WRITE 0x00000002 -#define INTEL_PTE_RW 0x00000002 -#define INTEL_PTE_USER 0x00000004 -#define INTEL_PTE_WTHRU 0x00000008 -#define INTEL_PTE_NCACHE 0x00000010 -#define INTEL_PTE_REF 0x00000020 -#define INTEL_PTE_MOD 0x00000040 -#define INTEL_PTE_PS 0x00000080 -#define INTEL_PTE_PTA 0x00000080 -#define INTEL_PTE_GLOBAL 0x00000100 -#define INTEL_PTE_WIRED 0x00000200 -#define INTEL_PDPTE_NESTED 0x00000400 +#define INTEL_PTE_VALID 0x00000001ULL +#define INTEL_PTE_WRITE 0x00000002ULL +#define INTEL_PTE_RW 0x00000002ULL +#define INTEL_PTE_USER 0x00000004ULL +#define INTEL_PTE_WTHRU 0x00000008ULL +#define INTEL_PTE_NCACHE 0x00000010ULL +#define INTEL_PTE_REF 0x00000020ULL +#define INTEL_PTE_MOD 0x00000040ULL +#define INTEL_PTE_PS 0x00000080ULL +#define INTEL_PTE_PTA 0x00000080ULL +#define INTEL_PTE_GLOBAL 0x00000100ULL +#define INTEL_PTE_WIRED 0x00000200ULL +#define INTEL_PDPTE_NESTED 0x00000400ULL #define INTEL_PTE_PFN PG_FRAME #define INTEL_PTE_NX (1ULL << 63) @@ -477,14 +425,16 @@ extern pt_entry_t PTmap[], APTmap[], Upte; extern pd_entry_t PTD[], APTD[], PTDpde[], APTDpde[], Upde; extern pmap_paddr_t lo_kernel_cr3; extern pdpt_entry_t *IdlePDPT64; +extern pdpt_entry_t IdlePDPT[]; +extern pml4_entry_t IdlePML4[]; #else extern pt_entry_t *PTmap; +extern pdpt_entry_t *IdlePDPT; +extern pml4_entry_t *IdlePML4; #endif extern boolean_t no_shared_cr3; extern addr64_t kernel64_cr3; extern pd_entry_t *IdlePTD; /* physical addr of "Idle" state PTD */ -extern pdpt_entry_t IdlePDPT[]; -extern pml4_entry_t IdlePML4[]; extern uint64_t pmap_pv_hashlist_walks; extern uint64_t pmap_pv_hashlist_cnts; @@ -503,25 +453,46 @@ extern uint32_t pmap_kernel_text_ps; #define vtopte(va) (PTmap + i386_btop((vm_offset_t)va)) #endif + #ifdef __x86_64__ #define ID_MAP_VTOP(x) ((void *)(((uint64_t)(x)) & LOW_4GB_MASK)) -#define PHYSMAP_BASE KVADDR(KERNEL_PHYSMAP_INDEX,0,0,0) +extern uint64_t physmap_base, physmap_max; + #define NPHYSMAP (MAX(K64_MAXMEM/GB + 4, 4)) -#define PHYSMAP_PTOV(x) ((void *)(((uint64_t)(x)) + PHYSMAP_BASE)) static inline boolean_t physmap_enclosed(addr64_t a) { return (a < (NPHYSMAP * GB)); } -#endif + +static inline void * PHYSMAP_PTOV_check(void *paddr) { + uint64_t pvaddr = (uint64_t)paddr + physmap_base; + + if (__improbable(pvaddr >= physmap_max)) + panic("PHYSMAP_PTOV bounds exceeded, 0x%qx, 0x%qx, 0x%qx", + pvaddr, physmap_base, physmap_max); + + return (void *)pvaddr; +} + +#define PHYSMAP_PTOV(x) (PHYSMAP_PTOV_check((void*) (x))) + +/* + * For KASLR, we alias the master processor's IDT and GDT at fixed + * virtual addresses to defeat SIDT/SGDT address leakage. + */ +#define MASTER_IDT_ALIAS (VM_MIN_KERNEL_ADDRESS + 0x0000) +#define MASTER_GDT_ALIAS (VM_MIN_KERNEL_ADDRESS + 0x1000) + +/* + * The low global vector page is mapped at a fixed alias also. 
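MASTER_IDT_ALIAS and MASTER_GDT_ALIAS above exist because SIDT and SGDT are not privileged instructions on this era of x86: any user process can read the descriptor-table base registers, and if those registers point into the slid kernel image they reveal the KASLR offset. Mapping the tables at fixed aliases makes the readable value uninformative. A sketch of the leak being defended against (runs at CPL 3; assumes no UMIP, which did not exist on 2012-era parts):

    #include <stdint.h>
    #include <stdio.h>

    struct __attribute__((packed)) dtr {
            uint16_t limit;
            uint64_t base;
    };

    int main(void)
    {
            struct dtr idtr;
            /* SIDT is executable from user mode on classic x86. */
            __asm__ volatile("sidt %0" : "=m" (idtr));
            printf("IDT base: 0x%llx\n", (unsigned long long)idtr.base);
            return 0;
    }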
+ */ +#define LOWGLOBAL_ALIAS (VM_MIN_KERNEL_ADDRESS + 0x2000) + +#endif /*__x86_64__ */ typedef volatile long cpu_set; /* set of CPUs - must be <= 32 */ /* changed by other processors */ -struct md_page { - int pv_list_count; - TAILQ_HEAD(,pv_entry) pv_list; -}; - #include /* @@ -551,6 +522,7 @@ struct pmap { struct pmap_statistics stats; /* map statistics */ int ref_count; /* reference count */ int nx_enabled; + ledger_t ledger; /* ledger tracking phys mappings */ }; @@ -639,9 +611,10 @@ extern void pmap_update_interrupt(void); extern addr64_t (kvtophys)( vm_offset_t addr); -extern void pmap_expand( +extern kern_return_t pmap_expand( pmap_t pmap, - vm_map_offset_t addr); + vm_map_offset_t addr, + unsigned int options); #if !defined(__x86_64__) extern pt_entry_t *pmap_pte( struct pmap *pmap, @@ -932,6 +905,8 @@ extern boolean_t pmap_is_empty(pmap_t pmap, #define MACHINE_BOOTSTRAPPTD 1 /* Static bootstrap page-tables */ +kern_return_t +pmap_permissions_verify(pmap_t, vm_map_t, vm_offset_t, vm_offset_t); #endif /* ASSEMBLER */ diff --git a/osfmk/i386/pmap_common.c b/osfmk/i386/pmap_common.c index 576b9c089..abe1e24a3 100644 --- a/osfmk/i386/pmap_common.c +++ b/osfmk/i386/pmap_common.c @@ -26,8 +26,10 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ #include +#include #include + /* * Each entry in the pv_head_table is locked by a bit in the * pv_lock_table. The lock bits are accessed by the physical diff --git a/osfmk/i386/pmap_internal.h b/osfmk/i386/pmap_internal.h index 37757f191..22bf95cec 100644 --- a/osfmk/i386/pmap_internal.h +++ b/osfmk/i386/pmap_internal.h @@ -26,11 +26,15 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include -#include +#ifndef _I386_PMAP_INTERNAL_ +#define _I386_PMAP_INTERNAL_ #ifdef MACH_KERNEL_PRIVATE +#include +#include +#include + /* * pmap locking */ @@ -61,13 +65,15 @@ extern boolean_t pmap_trace; #define PMAP_TRACE_CONSTANT(x,a,b,c,d,e) \ KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e); \ -void pmap_expand_pml4( +kern_return_t pmap_expand_pml4( pmap_t map, - vm_map_offset_t v); + vm_map_offset_t v, + unsigned int options); -void pmap_expand_pdpt( +kern_return_t pmap_expand_pdpt( pmap_t map, - vm_map_offset_t v); + vm_map_offset_t v, + unsigned int options); void phys_attribute_set( ppnum_t phys, @@ -273,20 +279,19 @@ extern pv_rooted_entry_t pv_head_table; /* array of entries, one per page */ extern event_t mapping_replenish_event; static inline void PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) { - + pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL); simple_lock(&pv_hashed_free_list_lock); /* If the kernel reserved pool is low, let non-kernel mappings allocate * synchronously, possibly subject to a throttle. 
*/ - if ((pv_hashed_kern_free_count >= pv_hashed_kern_low_water_mark) && - (*pvh_ep = pv_hashed_free_list) != 0) { + if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) { pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next; pv_hashed_free_count--; } simple_unlock(&pv_hashed_free_list_lock); - if (pv_hashed_free_count < pv_hashed_low_water_mark) { + if (pv_hashed_free_count <= pv_hashed_low_water_mark) { if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) thread_wakeup(&mapping_replenish_event); } @@ -303,6 +308,7 @@ static inline void PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry extern unsigned pmap_kern_reserve_alloc_stat; static inline void PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) { + pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL); simple_lock(&pv_hashed_kern_free_list_lock); if ((*pvh_e = pv_hashed_kern_free_list) != 0) { @@ -373,6 +379,13 @@ static inline void pmap_pv_throttle(__unused pmap_t p) { #define PHYS_PTA INTEL_PTE_PTA #define PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE) +extern const boolean_t pmap_disable_kheap_nx; +extern const boolean_t pmap_disable_kstack_nx; + +#define PMAP_EXPAND_OPTIONS_NONE (0x0) +#define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT) +#define PMAP_EXPAND_OPTIONS_NOENTER (PMAP_OPTIONS_NOENTER) + /* * Amount of virtual memory mapped by one * page-directory entry. @@ -422,7 +435,7 @@ static inline void pmap_pv_throttle(__unused pmap_t p) { extern uint64_t pde_mapped_size; extern char *pmap_phys_attributes; -extern unsigned int last_managed_page; +extern ppnum_t last_managed_page; extern ppnum_t lowest_lo; extern ppnum_t lowest_hi; @@ -613,7 +626,7 @@ pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corru static inline pmap_pagetable_corruption_action_t pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) { - pmap_pv_assertion_t action = PMAP_ACTION_ASSERT; + pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT; pmap_pagetable_corruption_t suppress_reason = PTE_VALID; ppnum_t suppress_ppn = 0; pt_entry_t cpte = *ptep; @@ -650,7 +663,7 @@ pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t * action = PMAP_ACTION_RETRY; goto pmap_cpc_exit; } - } while((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink)) != pv_h); + } while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h)); /* Discover root entries with a Hamming * distance of 1 from the supplied @@ -732,12 +745,12 @@ pmap_pv_remove_retry: pvh_e = PV_HASHED_ENTRY_NULL; pv_h = pai_to_pvh(ppn_to_pai(ppn)); - if (pv_h->pmap == PMAP_NULL) { + if (__improbable(pv_h->pmap == PMAP_NULL)) { pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT); if (pac == PMAP_ACTION_IGNORE) goto pmap_pv_remove_exit; else if (pac == PMAP_ACTION_ASSERT) - panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): null pv_list!", pmap, vaddr, ppn, *pte); + panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list!", pmap, vaddr, ppn, *pte, ppnp, pte); else if (pac == PMAP_ACTION_RETRY_RELOCK) { LOCK_PVH(ppn_to_pai(*ppnp)); pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED); @@ -790,8 +803,8 @@ pmap_pv_remove_retry: LOCK_PV_HASH(pvhash_idx); pprevh = pvhash(pvhash_idx); if (PV_HASHED_ENTRY_NULL == *pprevh) { - panic("pmap_pv_remove(%p,0x%llx,0x%x): empty hash", - pmap, 
vaddr, ppn); + panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash", + pmap, vaddr, ppn, *pte, pte); } pvh_e = *pprevh; pmap_pv_hashlist_walks++; @@ -810,7 +823,7 @@ pmap_pv_remove_retry: pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT); if (pac == PMAP_ACTION_ASSERT) - panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, pv_h->pmap, pv_h->va); + panic("pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, pv_h->va); else { UNLOCK_PV_HASH(pvhash_idx); if (pac == PMAP_ACTION_RETRY_RELOCK) { @@ -841,31 +854,45 @@ pmap_pv_remove_exit: extern int pt_fake_zone_index; static inline void -PMAP_ZINFO_PALLOC(vm_size_t bytes) +PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes) { thread_t thr = current_thread(); task_t task; zinfo_usage_t zinfo; - thr->tkm_private.alloc += bytes; + pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes); + if (pt_fake_zone_index != -1 && (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc); } static inline void -PMAP_ZINFO_PFREE(vm_size_t bytes) +PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes) { thread_t thr = current_thread(); task_t task; zinfo_usage_t zinfo; - thr->tkm_private.free += bytes; + pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes); + if (pt_fake_zone_index != -1 && (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free); } +static inline void +PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes) +{ + pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes); +} + +static inline void +PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes) +{ + pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes); +} + extern boolean_t pmap_initialized;/* Has pmap_init completed? */ #define valid_page(x) (pmap_initialized && pmap_valid_page(x)) @@ -893,6 +920,70 @@ void phys_attribute_clear( #endif void pmap_pcid_configure(void); + +/* + * Atomic 64-bit compare and exchange of a page table entry. + */ +static inline boolean_t +pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new) +{ + boolean_t ret; + +#ifdef __i386__ + /* + * Load the old value into %edx:%eax + * Load the new value into %ecx:%ebx + * Compare-exchange-8bytes at address entryp (loaded in %edi) + * If the compare succeeds, the new value is stored, return TRUE. + * Otherwise, no swap is made, return FALSE. + */ + asm volatile( + " lock; cmpxchg8b (%1) \n\t" + " setz %%al \n\t" + " movzbl %%al,%0" + : "=a" (ret) + : "D" (entryp), + "a" ((uint32_t)old), + "d" ((uint32_t)(old >> 32)), + "b" ((uint32_t)new), + "c" ((uint32_t)(new >> 32)) + : "memory"); +#else + /* + * Load the old value into %rax + * Load the new value into another register + * Compare-exchange-quad at address entryp + * If the compare succeeds, the new value is stored, return TRUE. + * Otherwise, no swap is made, return FALSE. 
+ */ + asm volatile( + " lock; cmpxchgq %2,(%3) \n\t" + " setz %%al \n\t" + " movzbl %%al,%0" + : "=a" (ret) + : "a" (old), + "r" (new), + "r" (entryp) + : "memory"); +#endif + return ret; +} + +extern uint32_t pmap_update_clear_pte_count; + +static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) { + pt_entry_t npte, opte; + do { + opte = *mptep; + if (__improbable(opte == 0)) { + pmap_update_clear_pte_count++; + break; + } + npte = opte & ~(pclear_bits); + npte |= pset_bits; + } while (!pmap_cmpx_pte(mptep, opte, npte)); +} + #if defined(__x86_64__) /* * The single pml4 page per pmap is allocated at pmap create time and exists @@ -903,6 +994,11 @@ static inline pml4_entry_t * pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr) { + if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) && + (vaddr < 0xFFFF800000000000ULL))) { + return (NULL); + } + #if PMAP_ASSERT return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]); #else @@ -919,12 +1015,6 @@ pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr) pml4_entry_t newpf; pml4_entry_t *pml4; - assert(pmap); - if ((vaddr > 0x00007FFFFFFFFFFFULL) && - (vaddr < 0xFFFF800000000000ULL)) { - return (0); - } - pml4 = pmap64_pml4(pmap, vaddr); if (pml4 && ((*pml4 & INTEL_PTE_VALID))) { newpf = *pml4 & PG_FRAME; @@ -942,12 +1032,6 @@ pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr) pdpt_entry_t newpf; pdpt_entry_t *pdpt; - assert(pmap); - if ((vaddr > 0x00007FFFFFFFFFFFULL) && - (vaddr < 0xFFFF800000000000ULL)) { - return (0); - } - pdpt = pmap64_pdpt(pmap, vaddr); if (pdpt && ((*pdpt & INTEL_PTE_VALID))) { @@ -963,7 +1047,6 @@ pmap_pde(pmap_t m, vm_map_offset_t v) { pd_entry_t *pde; - assert(m); pde = pmap64_pde(m, v); return pde; @@ -983,7 +1066,7 @@ pmap_pte(pmap_t pmap, vm_map_offset_t vaddr) pd_entry_t newpf; assert(pmap); - pde = pmap_pde(pmap, vaddr); + pde = pmap64_pde(pmap, vaddr); if (pde && ((*pde & INTEL_PTE_VALID))) { if (*pde & INTEL_PTE_PS) @@ -995,4 +1078,11 @@ pmap_pte(pmap_t pmap, vm_map_offset_t vaddr) return (NULL); } #endif +#if DEBUG +#define DPRINTF(x...) kprintf(x) +#else +#define DPRINTF(x...) +#endif + #endif /* MACH_KERNEL_PRIVATE */ +#endif /* _I386_PMAP_INTERNAL_ */ diff --git a/osfmk/i386/pmap_x86_common.c b/osfmk/i386/pmap_x86_common.c index 9061d73cf..f400bc280 100644 --- a/osfmk/i386/pmap_x86_common.c +++ b/osfmk/i386/pmap_x86_common.c @@ -27,6 +27,7 @@ */ #include #include +#include #include void pmap_remove_range( @@ -35,6 +36,8 @@ void pmap_remove_range( pt_entry_t *spte, pt_entry_t *epte); +uint32_t pmap_update_clear_pte_count; + /* * The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time, * on a NBPDE boundary. 
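The pmap_update_pte() added above converts every PTE modification into a (clear-bits, set-bits) compare-and-exchange loop over pmap_cmpx_pte(), so referenced/modified bits set concurrently by the MMU are never lost; the older call sites visible earlier in this patch, which read *pte and wrote back a whole value, had exactly that race. A userspace analogue with C11 atomics (illustrative only; the kernel uses its own cmpxchg wrapper and counts the zero-entry early-out in pmap_update_clear_pte_count):

    #include <stdatomic.h>
    #include <stdint.h>

    static _Atomic uint64_t pte;

    /* Clear pclear_bits and set pset_bits in one atomic read-modify-write,
     * retrying if another agent (here: the MMU) changed the entry meanwhile. */
    static void
    update_pte(uint64_t pclear_bits, uint64_t pset_bits)
    {
            uint64_t opte = atomic_load(&pte);
            uint64_t npte;
            do {
                    if (opte == 0)
                            return;          /* entry vanished: nothing to update */
                    npte = (opte & ~pclear_bits) | pset_bits;
            } while (!atomic_compare_exchange_weak(&pte, &opte, npte));
    }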
@@ -105,7 +108,7 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) { PMAP_UNLOCK(subord); - pmap_expand_pdpt(subord, nvaddr); + pmap_expand_pdpt(subord, nvaddr, PMAP_EXPAND_OPTIONS_NONE); PMAP_LOCK(subord); npde = pmap64_pdpt(subord, nvaddr); } @@ -118,7 +121,7 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) { PMAP_UNLOCK(subord); - pmap_expand(subord, nvaddr); + pmap_expand(subord, nvaddr, PMAP_EXPAND_OPTIONS_NONE); PMAP_LOCK(subord); npde = pmap_pde(subord, nvaddr); } @@ -144,7 +147,7 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t pde = pmap64_pdpt(grand, vaddr); if (0 == pde) { PMAP_UNLOCK(grand); - pmap_expand_pml4(grand, vaddr); + pmap_expand_pml4(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE); PMAP_LOCK(grand); pde = pmap64_pdpt(grand, vaddr); } @@ -163,7 +166,7 @@ kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t pde = pmap_pde(grand, vaddr); if ((0 == pde) && cpu_64bit) { PMAP_UNLOCK(grand); - pmap_expand_pdpt(grand, vaddr); + pmap_expand_pdpt(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE); PMAP_LOCK(grand); pde = pmap_pde(grand, vaddr); } @@ -362,7 +365,7 @@ pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes) { panic("pmap_update_cache_attributes_locked: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx kernel_pmap: %p", pmap, pn, vaddr, kernel_pmap); nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink); - pmap_update_pte(ptep, *ptep, (*ptep & ~PHYS_CACHEABILITY_MASK) | attributes); + pmap_update_pte(ptep, PHYS_CACHEABILITY_MASK, attributes); PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); pvh_e = nexth; } while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h); @@ -395,18 +398,34 @@ void x86_filter_TLB_coherency_interrupts(boolean_t dofilter) { * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ + void pmap_enter( register pmap_t pmap, vm_map_offset_t vaddr, ppnum_t pn, vm_prot_t prot, + vm_prot_t fault_type, unsigned int flags, boolean_t wired) +{ + (void) pmap_enter_options(pmap, vaddr, pn, prot, fault_type, flags, wired, PMAP_EXPAND_OPTIONS_NONE); +} + +kern_return_t +pmap_enter_options( + register pmap_t pmap, + vm_map_offset_t vaddr, + ppnum_t pn, + vm_prot_t prot, + __unused vm_prot_t fault_type, + unsigned int flags, + boolean_t wired, + unsigned int options) { pt_entry_t *pte; pv_rooted_entry_t pv_h; - int pai; + ppnum_t pai; pv_hashed_entry_t pvh_e; pv_hashed_entry_t pvh_new; pt_entry_t template; @@ -421,25 +440,35 @@ pmap_enter( vm_object_t delpage_pm_obj = NULL; int delpage_pde_index = 0; pt_entry_t old_pte; + kern_return_t kr_expand; pmap_intr_assert(); - assert(pn != vm_page_fictitious_addr); if (pmap == PMAP_NULL) - return; + return KERN_INVALID_ARGUMENT; + + /* N.B. We can be supplied a zero page frame in the NOENTER case, it's an + * unused value for that scenario. 
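Note the API pattern in the pmap_enter hunk above: the historical pmap_enter() signature survives as a thin wrapper that delegates to the new pmap_enter_options(), which adds an options word and a kern_return_t result. A hedged sketch of that wrapper shape, with invented names (map_enter, MAP_OPTIONS_NOENTER) standing in for the pmap symbols.

    #include <stdint.h>
    #include <stdio.h>

    typedef int kern_return_t;
    #define KERN_SUCCESS 0

    /* Illustrative stand-ins for the pmap types, not the kernel's. */
    typedef uint64_t vm_map_offset_t;
    typedef uint32_t ppnum_t;

    #define MAP_OPTIONS_NONE    0x0
    #define MAP_OPTIONS_NOENTER 0x1  /* expand page tables, install no PTE */

    /* The extended entry point takes an options word and can fail... */
    static kern_return_t
    map_enter_options(vm_map_offset_t va, ppnum_t pn, unsigned int options)
    {
        printf("map 0x%llx -> pn %u (options 0x%x)\n",
            (unsigned long long)va, pn, options);
        return KERN_SUCCESS;
    }

    /* ...while the legacy signature becomes a wrapper that supplies the
     * default options and discards the result, the same shape as the
     * pmap_enter()/pmap_enter_options() split above. */
    static void
    map_enter(vm_map_offset_t va, ppnum_t pn)
    {
        (void)map_enter_options(va, pn, MAP_OPTIONS_NONE);
    }

    int
    main(void)
    {
        map_enter(0x1000, 42);
        return 0;
    }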
+ */ + assert(pn != vm_page_fictitious_addr); + if (pn == vm_page_guard_addr) - return; + return KERN_INVALID_ARGUMENT; PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START, - pmap, - (uint32_t) (vaddr >> 32), (uint32_t) vaddr, - pn, prot); + pmap, + (uint32_t) (vaddr >> 32), (uint32_t) vaddr, + pn, prot); if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled) set_NX = FALSE; else set_NX = TRUE; + if (__improbable(set_NX && (pmap == kernel_pmap) && ((pmap_disable_kstack_nx && (flags & VM_MEM_STACK)) || (pmap_disable_kheap_nx && !(flags & VM_MEM_STACK))))) { + set_NX = FALSE; + } + /* * Must allocate a new pvlist entry while we're unlocked; * zalloc may cause pageout (which will lock the pmap system). @@ -463,7 +492,9 @@ Retry: while ((pte = pmap64_pde(pmap, vaddr)) == PD_ENTRY_NULL) { /* need room for another pde entry */ PMAP_UNLOCK(pmap); - pmap_expand_pdpt(pmap, vaddr); + kr_expand = pmap_expand_pdpt(pmap, vaddr, options); + if (kr_expand != KERN_SUCCESS) + return kr_expand; PMAP_LOCK(pmap); } } else { @@ -473,10 +504,16 @@ Retry: * going to grow pde level page(s) */ PMAP_UNLOCK(pmap); - pmap_expand(pmap, vaddr); + kr_expand = pmap_expand(pmap, vaddr, options); + if (kr_expand != KERN_SUCCESS) + return kr_expand; PMAP_LOCK(pmap); } } + if (options & PMAP_EXPAND_OPTIONS_NOENTER) { + PMAP_UNLOCK(pmap); + return KERN_SUCCESS; + } if (superpage && *pte && !(*pte & INTEL_PTE_PS)) { /* @@ -540,14 +577,15 @@ Retry: if (wired) { template |= INTEL_PTE_WIRED; - if (!iswired(old_attributes)) - OSAddAtomic(+1, - &pmap->stats.wired_count); + if (!iswired(old_attributes)) { + OSAddAtomic(+1, &pmap->stats.wired_count); + pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE); + } } else { if (iswired(old_attributes)) { assert(pmap->stats.wired_count >= 1); - OSAddAtomic(-1, - &pmap->stats.wired_count); + OSAddAtomic(-1, &pmap->stats.wired_count); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE); } } if (superpage) /* this path can not be used */ @@ -557,8 +595,11 @@ Retry: ((old_attributes ^ template) != INTEL_PTE_WIRED); /* store modified PTE and preserve RC bits */ - pmap_update_pte(pte, *pte, - template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD))); + pt_entry_t npte, opte; + do { + opte = *pte; + npte = template | (opte & (INTEL_PTE_REF | INTEL_PTE_MOD)); + } while (!pmap_cmpx_pte(pte, opte, npte)); if (old_pa_locked) { UNLOCK_PVH(pai); old_pa_locked = FALSE; @@ -588,7 +629,7 @@ Retry: */ /* invalidate the PTE */ - pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID)); + pmap_update_pte(pte, INTEL_PTE_VALID, 0); /* propagate invalidate everywhere */ PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE); /* remember reference and change */ @@ -599,14 +640,14 @@ Retry: if (IS_MANAGED_PAGE(pai)) { pmap_assert(old_pa_locked == TRUE); + pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE); assert(pmap->stats.resident_count >= 1); - OSAddAtomic(-1, - &pmap->stats.resident_count); - + OSAddAtomic(-1, &pmap->stats.resident_count); if (iswired(*pte)) { assert(pmap->stats.wired_count >= 1); - OSAddAtomic(-1, - &pmap->stats.wired_count); + OSAddAtomic(-1, &pmap->stats.wired_count); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, + PAGE_SIZE); } pmap_phys_attributes[pai] |= oattr; @@ -627,8 +668,8 @@ Retry: if (iswired(*pte)) { assert(pmap->stats.wired_count >= 1); - OSAddAtomic(-1, - &pmap->stats.wired_count); + OSAddAtomic(-1, &pmap->stats.wired_count); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE); } } } @@ -708,6 +749,7 @@ Retry: * only count the mapping * for 
'managed memory' */ + pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE); OSAddAtomic(+1, &pmap->stats.resident_count); if (pmap->stats.resident_count > pmap->stats.resident_max) { pmap->stats.resident_max = pmap->stats.resident_count; @@ -716,6 +758,7 @@ Retry: /* Account for early mappings created before "managed pages" * are determined. Consider consulting the available DRAM map. */ + pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE); OSAddAtomic(+1, &pmap->stats.resident_count); } /* @@ -746,6 +789,7 @@ Retry: if (wired) { template |= INTEL_PTE_WIRED; OSAddAtomic(+1, & pmap->stats.wired_count); + pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE); } if (superpage) template |= INTEL_PTE_PS; @@ -781,10 +825,11 @@ Done: vm_object_unlock(delpage_pm_obj); VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); - PMAP_ZINFO_PFREE(PAGE_SIZE); + PMAP_ZINFO_PFREE(pmap, PAGE_SIZE); } PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0); + return KERN_SUCCESS; } /* @@ -812,7 +857,7 @@ pmap_remove_range( pv_hashed_entry_t pvh_e; int pvh_cnt = 0; int num_removed, num_unwired, num_found, num_invalid; - int pai; + ppnum_t pai; pmap_paddr_t pa; vm_map_offset_t vaddr; @@ -861,8 +906,8 @@ pmap_remove_range( if ((p & INTEL_PTE_VALID) == 0) num_invalid++; - /* invalidate the PTE */ - pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID)); + /* invalidate the PTE */ + pmap_update_pte(cpte, INTEL_PTE_VALID, 0); } if (num_found == 0) { @@ -933,6 +978,7 @@ update_counts: if (pmap->stats.resident_count < num_removed) panic("pmap_remove_range: resident_count"); #endif + pmap_ledger_debit(pmap, task_ledgers.phys_mem, machine_ptob(num_removed)); assert(pmap->stats.resident_count >= num_removed); OSAddAtomic(-num_removed, &pmap->stats.resident_count); @@ -942,6 +988,7 @@ update_counts: #endif assert(pmap->stats.wired_count >= num_unwired); OSAddAtomic(-num_unwired, &pmap->stats.wired_count); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, machine_ptob(num_unwired)); return; } @@ -1144,11 +1191,12 @@ pmap_page_protect( /* * Remove the mapping, collecting dirty bits. */ - pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_VALID); + pmap_update_pte(pte, INTEL_PTE_VALID, 0); /* Remove per-pmap wired count */ if (iswired(*pte)) { OSAddAtomic(-1, &pmap->stats.wired_count); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE); } PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); @@ -1160,9 +1208,9 @@ pmap_page_protect( if (pmap->stats.resident_count < 1) panic("pmap_page_protect: resident_count"); #endif + pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE); assert(pmap->stats.resident_count >= 1); OSAddAtomic(-1, &pmap->stats.resident_count); - /* * Deal with the pv_rooted_entry. */ @@ -1190,8 +1238,7 @@ pmap_page_protect( */ pmap_phys_attributes[pai] |= *pte & (PHYS_MODIFIED|PHYS_REFERENCED); - - pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_WRITE); + pmap_update_pte(pte, INTEL_PTE_WRITE, 0); PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE); } pvh_e = nexth; @@ -1265,7 +1312,7 @@ phys_attribute_clear( /* * Walk down PV list, clearing all modify or reference bits. * We do not have to lock the pv_list because we have - * the entire pmap system locked. 
+ * the per-pmap lock */ if (pv_h->pmap != PMAP_NULL) { /* @@ -1285,8 +1332,7 @@ phys_attribute_clear( */ pte = pmap_pte(pmap, va); attributes |= *pte & (PHYS_MODIFIED|PHYS_REFERENCED); - - pmap_update_pte(pte, *pte, (*pte & ~bits)); + pmap_update_pte(pte, bits, 0); /* Ensure all processors using this translation * invalidate this TLB entry. The invalidation *must* * follow the PTE update, to ensure that the TLB @@ -1415,8 +1461,9 @@ pmap_change_wiring( /* * wiring down mapping */ + pmap_ledger_credit(map, task_ledgers.wired_mem, PAGE_SIZE); OSAddAtomic(+1, &map->stats.wired_count); - pmap_update_pte(pte, *pte, (*pte | INTEL_PTE_WIRED)); + pmap_update_pte(pte, 0, INTEL_PTE_WIRED); } else if (!wired && iswired(*pte)) { /* @@ -1424,7 +1471,8 @@ pmap_change_wiring( */ assert(map->stats.wired_count >= 1); OSAddAtomic(-1, &map->stats.wired_count); - pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WIRED)); + pmap_ledger_debit(map, task_ledgers.wired_mem, PAGE_SIZE); + pmap_update_pte(pte, INTEL_PTE_WIRED, 0); } PMAP_UNLOCK(map); @@ -1459,6 +1507,12 @@ pmap_map_bd( if (!(flags & (VM_MEM_GUARDED))) template |= INTEL_PTE_PTA; } + +#if defined(__x86_64__) + if ((prot & VM_PROT_EXECUTE) == 0) + template |= INTEL_PTE_NX; +#endif + if (prot & VM_PROT_WRITE) template |= INTEL_PTE_WRITE; diff --git a/osfmk/i386/postcode.h b/osfmk/i386/postcode.h index 498a88143..9440fcbd0 100644 --- a/osfmk/i386/postcode.h +++ b/osfmk/i386/postcode.h @@ -114,37 +114,25 @@ * The following postcodes are defined for stages of early startup: */ -#define _PSTART_ENTRY 0xFF -#define _PSTART_RELOC 0xFE -#define PSTART_ENTRY 0xFD -#define PSTART_PAGE_TABLES 0xFC -#if defined(__x86_64__) -#define PSTART_BEFORE_ID_MAP 0xFB -#else -#define PSTART_BEFORE_PAGING 0xFB -#endif -#define VSTART_ENTRY 0xFA -#define VSTART_STACK_SWITCH 0xF9 -#define VSTART_BEFORE_PAGING 0xF8 -#define VSTART_EXIT 0xF7 -#define I386_INIT_ENTRY 0xF6 -#define CPU_INIT_D 0xF5 -#define PE_INIT_PLATFORM_D 0xF4 - -#define SLAVE_RSTART_ENTRY 0xEF -#define SLAVE_REAL_TO_PROT_ENTRY 0xEE -#define SLAVE_REAL_TO_PROT_EXIT 0xED -#define SLAVE_STARTPROG_ENTRY 0xEC -#define SLAVE_STARTPROG_EXIT 0xEB -#define SLAVE_PSTART_ENTRY 0xEA -#define SLAVE_PSTART_EXIT 0xE9 -#if defined(__i386__) -#define SLAVE_VSTART_ENTRY 0xE8 -#define SLAVE_VSTART_DESC_INIT 0xE7 -#define SLAVE_VSTART_STACK_SWITCH 0xE6 -#define SLAVE_VSTART_EXIT 0xE5 -#endif -#define I386_INIT_SLAVE 0xE4 +#define PSTART_ENTRY 0xFF +#define PSTART_REBASE 0xFE +#define PSTART_BEFORE_PAGING 0xFE +#define PSTART_VSTART 0xFD +#define VSTART_ENTRY 0xFC +#define VSTART_IDLE_PTS_INIT 0xFB +#define VSTART_PHYSMAP_INIT 0xFA +#define VSTART_DESC_ALIAS_INIT 0xF9 +#define VSTART_SET_CR3 0xF8 +#define VSTART_CPU_DESC_INIT 0xF7 +#define VSTART_CPU_MODE_INIT 0xF6 +#define VSTART_EXIT 0xF5 +#define I386_INIT_ENTRY 0xF4 +#define CPU_INIT_D 0xF3 +#define PE_INIT_PLATFORM_D 0xF2 + +#define SLAVE_STARTPROG_ENTRY 0xEF +#define SLAVE_PSTART 0xEE +#define I386_INIT_SLAVE 0xED #define PANIC_DOUBLE_FAULT 0xDF /* Double Fault exception */ #define PANIC_MACHINE_CHECK 0xDE /* Machine-Check */ diff --git a/osfmk/i386/proc_reg.h b/osfmk/i386/proc_reg.h index 05dd961f1..755be1c69 100644 --- a/osfmk/i386/proc_reg.h +++ b/osfmk/i386/proc_reg.h @@ -176,6 +176,9 @@ #define PMAP_PCID_PRESERVE (1ULL << 63) #define PMAP_PCID_MASK (0xFFF) + +#define RDRAND_RAX .byte 0x48, 0x0f, 0xc7, 0xf0 + #ifndef ASSEMBLER #include diff --git a/osfmk/i386/rtclock.c b/osfmk/i386/rtclock.c index d9de63185..8a5f8c667 100644 --- a/osfmk/i386/rtclock.c +++ 
b/osfmk/i386/rtclock.c @@ -40,7 +40,6 @@ */ #include -#include #include @@ -378,6 +377,7 @@ rtclock_init(void) rtc_timer_init(); clock_timebase_init(); ml_init_lock_timeout(); + ml_init_delay_spin_threshold(); } /* Set fixed configuration for lapic timers */ diff --git a/osfmk/i386/rtclock_native.c b/osfmk/i386/rtclock_native.c index 4d17039a2..cf24293be 100644 --- a/osfmk/i386/rtclock_native.c +++ b/osfmk/i386/rtclock_native.c @@ -30,7 +30,6 @@ */ #include -#include #include diff --git a/osfmk/i386/seg.h b/osfmk/i386/seg.h index df191c5d1..94af4521f 100644 --- a/osfmk/i386/seg.h +++ b/osfmk/i386/seg.h @@ -55,11 +55,6 @@ */ #ifndef _I386_SEG_H_ #define _I386_SEG_H_ -#ifdef MACH_KERNEL -#include -#else -#define MACH_KDB 0 -#endif /* MACH_KERNEL */ #ifndef __ASSEMBLER__ #include #include @@ -99,15 +94,7 @@ selector_to_sel(uint16_t selector) #define LDTSZ_MIN SEL_TO_INDEX(USER_SETTABLE) /* kernel ldt entries */ -#if MACH_KDB -#define GDTSZ 20 -#else #define GDTSZ 19 -#endif - -#ifdef __x86_64__ -#define PROT_MODE_GDT_SIZE 48 /* size of prot_mode_gdt in bytes */ -#endif /* * Interrupt table is always 256 entries long. @@ -219,12 +206,6 @@ extern char mc_task_stack_end[]; extern struct i386_tss master_mctss; extern void mc_task_start(void); -#if MACH_KDB -extern char db_task_stack_store[]; -extern struct i386_tss master_dbtss; -extern void db_task_start(void); -#endif /* MACH_KDB */ - __END_DECLS #endif /*__ASSEMBLER__*/ @@ -355,10 +336,6 @@ __END_DECLS #define SYSENTER_DS KERNEL64_SS /* sysenter kernel data segment */ #endif -#if MACH_KDB -#define DEBUG_TSS 0x90 /* 18: debug TSS (uniprocessor) */ -#endif - #ifdef __x86_64__ /* * 64-bit kernel LDT descriptors diff --git a/osfmk/i386/start.s b/osfmk/i386/start.s index 5472ffde3..c8a904038 100644 --- a/osfmk/i386/start.s +++ b/osfmk/i386/start.s @@ -124,18 +124,6 @@ EXT(mc_task_stack): .globl EXT(mc_task_stack_end) EXT(mc_task_stack_end): -#if MACH_KDB -/* - * Stack for last-ditch debugger task for each processor. - */ - .align 12 - .globl EXT(db_task_stack_store) -EXT(db_task_stack_store): - .space (INTSTACK_SIZE*MAX_CPUS) - -#endif /* MACH_KDB */ - - /* * BSP CPU start here. 
* eax points to kernbootstruct @@ -153,18 +141,18 @@ LEXT(_start) mov %eax, %ebp /* Move kernbootstruct to ebp */ mov %eax, %ebx /* get pointer to kernbootstruct */ - mov $EXT(low_eintstack),%esp /* switch to the bootup stack */ + mov $EXT(low_eintstack),%esp /* switch to the bootup stack */ POSTCODE(PSTART_ENTRY) - lgdt EXT(gdtptr) /* load GDT */ + lgdt EXT(gdtptr) /* load GDT */ - mov $(KERNEL_DS),%ax /* set kernel data segment */ + mov $(KERNEL_DS),%ax /* set kernel data segment */ mov %ax, %ds mov %ax, %es mov %ax, %ss - xor %ax, %ax /* fs must be zeroed; */ - mov %ax, %fs /* some bootstrappers don`t do this */ + xor %ax, %ax /* fs must be zeroed; */ + mov %ax, %fs /* some bootstrappers don`t do this */ mov %ax, %gs cld @@ -173,10 +161,10 @@ LEXT(_start) call .-1 paging: - andl $0xfffffff0, %esp /* align stack */ + andl $0xfffffff0, %esp /* align stack */ subl $0xc, %esp - pushl %ebp /* push boot args addr */ - xorl %ebp, %ebp /* zero frame pointer */ + pushl %ebp /* push boot args addr */ + xorl %ebp, %ebp /* zero frame pointer */ POSTCODE(PSTART_BEFORE_PAGING) @@ -185,14 +173,16 @@ paging: */ movl $EXT(IdlePDPT), %eax /* CR3 */ movl %eax, %cr3 - movl %cr4, %eax /* PAE */ + movl %cr4, %eax /* PAE */ orl $(CR4_PAE), %eax movl %eax, %cr4 - movl %cr0,%eax /* paging */ + movl %cr0,%eax /* paging */ orl $(CR0_PG|CR0_WP),%eax movl %eax,%cr0 + + POSTCODE(PSTART_VSTART) - call EXT(vstart) /* run C code */ + call EXT(vstart) /* run C code */ /*NOTREACHED*/ hlt @@ -292,7 +282,7 @@ LEXT(hibernate_machine_entrypoint) /* set up the page tables to use BootstrapPTD * as done in idle_pt.c, but this must be done programatically */ mov $EXT(IdlePDPT), %eax - mov $EXT(BootstrapPTD) + (INTEL_PTE_VALID), %ecx + mov $EXT(BootPTD) + (INTEL_PTE_VALID), %ecx mov $0x0, %edx mov %ecx, (0*8+0)(%eax) mov %edx, (0*8+4)(%eax) diff --git a/osfmk/i386/start64.s b/osfmk/i386/start64.s index bcabe2829..95a9dd664 100644 --- a/osfmk/i386/start64.s +++ b/osfmk/i386/start64.s @@ -27,12 +27,11 @@ */ #include -#include - #include #include #include #include +#include #include .data @@ -213,3 +212,45 @@ Entry(xrstor64o) .byte 0x29 ENTER_COMPAT_MODE() ret + +#if CONFIG_VMX + +/* + * __vmxon -- Enter VMX Operation + * int __vmxon(addr64_t v); + */ +Entry(__vmxon) + FRAME + + ENTER_64BIT_MODE() + mov $(VMX_FAIL_INVALID), %ecx + mov $(VMX_FAIL_VALID), %edx + mov $(VMX_SUCCEED), %eax + vmxon 8(%rbp) /* physical addr passed on stack */ + cmovcl %ecx, %eax /* CF = 1, ZF = 0 */ + cmovzl %edx, %eax /* CF = 0, ZF = 1 */ + ENTER_COMPAT_MODE() + + EMARF + ret + +/* + * __vmxoff -- Leave VMX Operation + * int __vmxoff(void); + */ +Entry(__vmxoff) + FRAME + + ENTER_64BIT_MODE() + mov $(VMX_FAIL_INVALID), %ecx + mov $(VMX_FAIL_VALID), %edx + mov $(VMX_SUCCEED), %eax + vmxoff + cmovcl %ecx, %eax /* CF = 1, ZF = 0 */ + cmovzl %edx, %eax /* CF = 0, ZF = 1 */ + ENTER_COMPAT_MODE() + + EMARF + ret + +#endif /* CONFIG_VMX */ diff --git a/osfmk/i386/startup64.c b/osfmk/i386/startup64.c index b445882cd..b4f69f741 100644 --- a/osfmk/i386/startup64.c +++ b/osfmk/i386/startup64.c @@ -188,6 +188,7 @@ cpu_IA32e_disable(cpu_data_t *cdp) #endif #if DEBUG +extern void dump_regs64(void); extern void dump_gdt(void *); extern void dump_ldt(void *); extern void dump_idt(void *); @@ -349,4 +350,49 @@ dump_tss(void *tssp) kprintf("%p: 0x%08x\n", ip+0, *(ip+0)); } } + +#if defined(__x86_64__) +void dump_regs64(void) +{ + +#define SNAP_REG(reg) \ + uint64_t reg; \ + __asm__ volatile("mov %%" #reg ", %0" : "=m" (reg)) + +#define KPRINT_REG(reg) \ + kprintf("%3s: 
%p\n", #reg, (void *) reg) + + SNAP_REG(rsp); + SNAP_REG(rbp); + SNAP_REG(rax); + SNAP_REG(rbx); + SNAP_REG(rcx); + SNAP_REG(rdx); + SNAP_REG(rsi); + SNAP_REG(rdi); + SNAP_REG(r8); + SNAP_REG(r9); + SNAP_REG(r10); + SNAP_REG(r11); + SNAP_REG(r12); + SNAP_REG(r13); + SNAP_REG(r14); + + KPRINT_REG(rsp); + KPRINT_REG(rbp); + KPRINT_REG(rax); + KPRINT_REG(rbx); + KPRINT_REG(rcx); + KPRINT_REG(rdx); + KPRINT_REG(rsi); + KPRINT_REG(rdi); + KPRINT_REG(r8); + KPRINT_REG(r9); + KPRINT_REG(r10); + KPRINT_REG(r11); + KPRINT_REG(r12); + KPRINT_REG(r13); + KPRINT_REG(r14); +} +#endif /* __x86_64__ */ #endif /* DEBUG */ diff --git a/osfmk/i386/trap.c b/osfmk/i386/trap.c index 2a77aedf3..a2a805a03 100644 --- a/osfmk/i386/trap.c +++ b/osfmk/i386/trap.c @@ -60,8 +60,6 @@ * Hardware trap/fault handler. */ -#include -#include #include #include @@ -94,18 +92,6 @@ #include -#if MACH_KGDB -#include -#endif /* MACH_KGDB */ - -#if MACH_KDB -#include -#include -#include -#include -#include -#endif /* MACH_KDB */ - #include #include @@ -167,7 +153,7 @@ thread_syscall_return( == (SYSCALL_CLASS_MACH << SYSCALL_CLASS_SHIFT); if (kdebug_enable && is_mach) { /* Mach trap */ - KERNEL_DEBUG_CONSTANT( + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_EXCP_SC,code)|DBG_FUNC_END, ret, 0, 0, 0, 0); } @@ -191,7 +177,7 @@ thread_syscall_return( is_mach = (code < 0); if (kdebug_enable && is_mach) { /* Mach trap */ - KERNEL_DEBUG_CONSTANT( + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_EXCP_SC,-code)|DBG_FUNC_END, ret, 0, 0, 0, 0); } @@ -214,44 +200,6 @@ thread_syscall_return( } -#if MACH_KDB -boolean_t debug_all_traps_with_kdb = FALSE; -extern struct db_watchpoint *db_watchpoint_list; -extern boolean_t db_watchpoints_inserted; -extern boolean_t db_breakpoints_inserted; - -void -thread_kdb_return(void) -{ - thread_t thr_act = current_thread(); - x86_saved_state_t *iss = USER_STATE(thr_act); - - pal_register_cache_state(thr_act, DIRTY); - - if (is_saved_state64(iss)) { - x86_saved_state64_t *regs; - - regs = saved_state64(iss); - - if (kdb_trap(regs->isf.trapno, (int)regs->isf.err, (void *)regs)) { - thread_exception_return(); - /*NOTREACHED*/ - } - - } else { - x86_saved_state32_t *regs; - - regs = saved_state32(iss); - - if (kdb_trap(regs->trapno, regs->err, (void *)regs)) { - thread_exception_return(); - /*NOTREACHED*/ - } - } -} - -#endif /* MACH_KDB */ - static inline void user_page_fault_continue( kern_return_t kr) @@ -259,61 +207,24 @@ user_page_fault_continue( thread_t thread = current_thread(); user_addr_t vaddr; -#if MACH_KDB - x86_saved_state_t *regs = USER_STATE(thread); - int err; - int trapno; - - assert((is_saved_state32(regs) && !thread_is_64bit(thread)) || - (is_saved_state64(regs) && thread_is_64bit(thread))); -#endif - - if (thread_is_64bit(thread)) { - x86_saved_state64_t *uregs; + if (thread_is_64bit(thread)) { + x86_saved_state64_t *uregs; uregs = USER_REGS64(thread); -#if MACH_KDB - trapno = uregs->isf.trapno; - err = (int)uregs->isf.err; -#endif vaddr = (user_addr_t)uregs->cr2; } else { x86_saved_state32_t *uregs; uregs = USER_REGS32(thread); -#if MACH_KDB - trapno = uregs->trapno; - err = uregs->err; -#endif vaddr = uregs->cr2; } if (__probable((kr == KERN_SUCCESS) || (kr == KERN_ABORTED))) { -#if MACH_KDB - if (!db_breakpoints_inserted) { - db_set_breakpoints(); - } - if (db_watchpoint_list && - db_watchpoints_inserted && - (err & T_PF_WRITE) && - db_find_watchpoint(thread->map, - (vm_offset_t)vaddr, - saved_state32(regs))) - kdb_trap(T_WATCHPOINT, 0, saved_state32(regs)); 
-#endif /* MACH_KDB */ - thread_exception_return(); - /*NOTREACHED*/ - } - -#if MACH_KDB - if (debug_all_traps_with_kdb && - kdb_trap(trapno, err, saved_state32(regs))) { thread_exception_return(); /*NOTREACHED*/ } -#endif /* MACH_KDB */ /* PAL debug hook */ pal_dbg_page_fault( thread, vaddr, kr ); @@ -442,7 +353,8 @@ interrupt(x86_saved_state_t *state) boolean_t user_mode = FALSE; int ipl; int cnum = cpu_number(); - + int itype = 0; + if (is_saved_state64(state) == TRUE) { x86_saved_state64_t *state64; @@ -465,14 +377,23 @@ interrupt(x86_saved_state_t *state) interrupt_num = state32->trapno; } - KERNEL_DEBUG_CONSTANT( + if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_INTERPROCESSOR_INTERRUPT)) + itype = 1; + else if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_TIMER_INTERRUPT)) + itype = 2; + else + itype = 3; + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_START, - interrupt_num, rip, user_mode, 0, 0); + interrupt_num, + (user_mode ? rip : VM_KERNEL_UNSLIDE(rip)), + user_mode, itype, 0); SCHED_STATS_INTERRUPT(current_processor()); ipl = get_preemption_level(); - + /* * Handle local APIC interrupts * else call platform expert for devices. @@ -484,7 +405,8 @@ interrupt(x86_saved_state_t *state) panic("Preemption level altered by interrupt vector 0x%x: initial 0x%x, final: 0x%x\n", interrupt_num, ipl, get_preemption_level()); } - KERNEL_DEBUG_CONSTANT( + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END, interrupt_num, 0, 0, 0, 0); @@ -514,7 +436,7 @@ interrupt(x86_saved_state_t *state) kernel_stack_depth_max = (vm_offset_t)depth; KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_DEPTH), - (long) depth, (long) rip, 0, 0, 0); + (long) depth, (long) VM_KERNEL_UNSLIDE(rip), 0, 0, 0); } } } @@ -562,9 +484,6 @@ kernel_trap( int fault_in_copy_window = -1; #endif int is_user = 0; -#if MACH_KDB - pt_entry_t *pte; -#endif /* MACH_KDB */ thread = current_thread(); @@ -639,8 +558,9 @@ kernel_trap( if (__improbable(T_PREEMPT == type)) { ast_taken(AST_PREEMPTION, FALSE); - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE, - 0, 0, 0, kern_ip, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE, + 0, 0, 0, VM_KERNEL_UNSLIDE(kern_ip), 0); return; } @@ -713,10 +633,11 @@ kernel_trap( #endif } } - - KERNEL_DEBUG_CONSTANT( + user_addr_t kd_vaddr = is_user ? vaddr : VM_KERNEL_UNSLIDE(vaddr); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE, - (unsigned)(vaddr >> 32), (unsigned)vaddr, is_user, kern_ip, 0); + (unsigned)(kd_vaddr >> 32), (unsigned)kd_vaddr, is_user, + VM_KERNEL_UNSLIDE(kern_ip), 0); (void) ml_set_interrupts_enabled(intr); @@ -760,24 +681,6 @@ kernel_trap( #endif case T_PAGE_FAULT: -#if MACH_KDB - /* - * Check for watchpoint on kernel static data. - * vm_fault would fail in this case - */ - if (map == kernel_map && db_watchpoint_list && db_watchpoints_inserted && - (code & T_PF_WRITE) && vaddr < vm_map_max(map) && - ((*(pte = pmap_pte(kernel_pmap, (vm_map_offset_t)vaddr))) & INTEL_PTE_WRITE) == 0) { - pmap_store_pte( - pte, - *pte | INTEL_PTE_VALID | INTEL_PTE_WRITE); - /* XXX need invltlb here? */ - - result = KERN_SUCCESS; - goto look_for_watchpoints; - } -#endif /* MACH_KDB */ - #if CONFIG_DTRACE if (thread != THREAD_NULL && thread->options & TH_OPT_DTRACE) { /* Executing under dtrace_probe? 
*/ if (dtrace_tally_fault(vaddr)) { /* Should a fault under dtrace be ignored? */ @@ -790,7 +693,6 @@ kernel_trap( } } #endif /* CONFIG_DTRACE */ - prot = VM_PROT_READ; @@ -807,18 +709,6 @@ kernel_trap( FALSE, THREAD_UNINT, NULL, 0); -#if MACH_KDB - if (result == KERN_SUCCESS) { - /* - * Look for watchpoints - */ -look_for_watchpoints: - if (map == kernel_map && db_watchpoint_list && db_watchpoints_inserted && (code & T_PF_WRITE) && - db_find_watchpoint(map, vaddr, saved_state)) - kdb_trap(T_WATCHPOINT, 0, saved_state); - } -#endif /* MACH_KDB */ - if (result == KERN_SUCCESS) { #if NCOPY_WINDOWS > 0 if (fault_in_copy_window != -1) { @@ -879,30 +769,14 @@ debugger_entry: * access through the debugger. */ sync_iss_to_iks(state); -#if MACH_KDB -restart_debugger: -#endif /* MACH_KDB */ #if MACH_KDP - if (current_debugger != KDB_CUR_DB) { + if (current_debugger != KDB_CUR_DB) { if (kdp_i386_trap(type, saved_state, result, (vm_offset_t)vaddr)) return; - } else { -#endif /* MACH_KDP */ -#if MACH_KDB - if (kdb_trap(type, code, saved_state)) { - if (switch_debugger) { - current_debugger = KDP_CUR_DB; - switch_debugger = 0; - goto restart_debugger; - } - return; - } -#endif /* MACH_KDB */ -#if MACH_KDP } #endif } - __asm__ volatile("cli":::"cc"); + pal_cli(); panic_trap(saved_state); /* * NO RETURN @@ -952,11 +826,12 @@ panic_trap(x86_saved_state32_t *regs) "EAX: 0x%08x, EBX: 0x%08x, ECX: 0x%08x, EDX: 0x%08x\n" "CR2: 0x%08x, EBP: 0x%08x, ESI: 0x%08x, EDI: 0x%08x\n" "EFL: 0x%08x, EIP: 0x%08x, CS: 0x%08x, DS: 0x%08x\n" - "Error code: 0x%08x\n", + "Error code: 0x%08x%s\n", regs->eip, regs->trapno, trapname, cr0, cr2, cr3, cr4, regs->eax,regs->ebx,regs->ecx,regs->edx, regs->cr2,regs->ebp,regs->esi,regs->edi, - regs->efl,regs->eip,regs->cs & 0xFFFF, regs->ds & 0xFFFF, regs->err); + regs->efl,regs->eip,regs->cs & 0xFFFF, regs->ds & 0xFFFF, regs->err, + virtualized ? 
" VMM" : ""); /* * This next statement is not executed, * but it's needed to stop the compiler using tail call optimization @@ -972,7 +847,7 @@ panic_trap(x86_saved_state64_t *regs) { const char *trapname = "Unknown"; pal_cr_t cr0, cr2, cr3, cr4; - boolean_t potential_smep_fault = FALSE; + boolean_t potential_smep_fault = FALSE, potential_kernel_NX_fault = FALSE; pal_get_control_registers( &cr0, &cr2, &cr3, &cr4 ); assert(ml_get_interrupts_enabled() == FALSE); @@ -991,8 +866,12 @@ panic_trap(x86_saved_state64_t *regs) if (regs->isf.trapno < TRAP_TYPES) trapname = trap_type[regs->isf.trapno]; - if ((regs->isf.trapno == T_PAGE_FAULT) && (regs->isf.err == (T_PF_PROT | T_PF_EXECUTE)) && (pmap_smep_enabled) && (regs->isf.rip == regs->cr2) && (regs->isf.rip < VM_MAX_USER_PAGE_ADDRESS)) { - potential_smep_fault = TRUE; + if ((regs->isf.trapno == T_PAGE_FAULT) && (regs->isf.err == (T_PF_PROT | T_PF_EXECUTE)) && (regs->isf.rip == regs->cr2)) { + if (pmap_smep_enabled && (regs->isf.rip < VM_MAX_USER_PAGE_ADDRESS)) { + potential_smep_fault = TRUE; + } else if (regs->isf.rip >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) { + potential_kernel_NX_fault = TRUE; + } } #undef panic @@ -1003,7 +882,7 @@ panic_trap(x86_saved_state64_t *regs) "R8: 0x%016llx, R9: 0x%016llx, R10: 0x%016llx, R11: 0x%016llx\n" "R12: 0x%016llx, R13: 0x%016llx, R14: 0x%016llx, R15: 0x%016llx\n" "RFL: 0x%016llx, RIP: 0x%016llx, CS: 0x%016llx, SS: 0x%016llx\n" - "CR2: 0x%016llx, Error code: 0x%016llx, Faulting CPU: 0x%x%s\n", + "Fault CR2: 0x%016llx, Error code: 0x%016llx, Fault CPU: 0x%x%s%s%s\n", regs->isf.rip, regs->isf.trapno, trapname, cr0, cr2, cr3, cr4, regs->rax, regs->rbx, regs->rcx, regs->rdx, @@ -1012,7 +891,9 @@ panic_trap(x86_saved_state64_t *regs) regs->r12, regs->r13, regs->r14, regs->r15, regs->isf.rflags, regs->isf.rip, regs->isf.cs & 0xFFFF, regs->isf.ss & 0xFFFF,regs->cr2, regs->isf.err, regs->isf.cpu, - potential_smep_fault ? " SMEP/NX fault" : ""); + virtualized ? " VMM" : "", + potential_kernel_NX_fault ? " Kernel NX fault" : "", + potential_smep_fault ? 
" SMEP/User NX fault" : ""); /* * This next statement is not executed, * but it's needed to stop the compiler using tail call optimization @@ -1086,7 +967,7 @@ user_trap( pal_sti(); - KERNEL_DEBUG_CONSTANT( + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_EXCP_UTRAP_x86, type)) | DBG_FUNC_NONE, (unsigned)(vaddr>>32), (unsigned)vaddr, (unsigned)(rip>>32), (unsigned)rip, 0); @@ -1269,14 +1150,6 @@ user_trap( break; default: -#if MACH_KGDB - Debugger("Unanticipated user trap"); - return; -#endif /* MACH_KGDB */ -#if MACH_KDB - if (kdb_trap(type, err, saved_state32(saved_state))) - return; -#endif /* MACH_KDB */ panic("Unexpected user trap, type %d", type); return; } @@ -1337,39 +1210,6 @@ i386_exception( } -#if MACH_KDB - -extern void db_i386_state(x86_saved_state32_t *regs); - -#include - -void -db_i386_state( - x86_saved_state32_t *regs) -{ - db_printf("eip %8x\n", regs->eip); - db_printf("trap %8x\n", regs->trapno); - db_printf("err %8x\n", regs->err); - db_printf("efl %8x\n", regs->efl); - db_printf("ebp %8x\n", regs->ebp); - db_printf("esp %8x\n", regs->cr2); - db_printf("uesp %8x\n", regs->uesp); - db_printf("cs %8x\n", regs->cs & 0xff); - db_printf("ds %8x\n", regs->ds & 0xff); - db_printf("es %8x\n", regs->es & 0xff); - db_printf("fs %8x\n", regs->fs & 0xff); - db_printf("gs %8x\n", regs->gs & 0xff); - db_printf("ss %8x\n", regs->ss & 0xff); - db_printf("eax %8x\n", regs->eax); - db_printf("ebx %8x\n", regs->ebx); - db_printf("ecx %8x\n", regs->ecx); - db_printf("edx %8x\n", regs->edx); - db_printf("esi %8x\n", regs->esi); - db_printf("edi %8x\n", regs->edi); -} - -#endif /* MACH_KDB */ - /* Synchronize a thread's i386_kernel_state (if any) with the given * i386_saved_state_t obtained from the trap/IPI handler; called in * kernel_trap() prior to entering the debugger, and when receiving diff --git a/osfmk/i386/trap_native.c b/osfmk/i386/trap_native.c index 26a9cbf07..8c5ea3350 100644 --- a/osfmk/i386/trap_native.c +++ b/osfmk/i386/trap_native.c @@ -104,6 +104,7 @@ void panic_64(x86_saved_state_t *, int, const char *, boolean_t); extern volatile int panic_double_fault_cpu; + #if defined(__x86_64__) && DEBUG /* * K64 debug - fatal handler for debug code in the trap vectors. @@ -161,13 +162,13 @@ panic_32(__unused int code, __unused int pc, __unused const char *msg, boolean_t "CR0: 0x%08x, CR2: 0x%08x, CR3: 0x%08x, CR4: 0x%08x\n" "EAX: 0x%08x, EBX: 0x%08x, ECX: 0x%08x, EDX: 0x%08x\n" "ESP: 0x%08x, EBP: 0x%08x, ESI: 0x%08x, EDI: 0x%08x\n" - "EFL: 0x%08x, EIP: 0x%08x\n", + "EFL: 0x%08x, EIP: 0x%08x%s\n", msg, my_ktss->eip, code, (uint32_t)get_cr0(), (uint32_t)get_cr2(), (uint32_t)get_cr3(), (uint32_t)get_cr4(), my_ktss->eax, my_ktss->ebx, my_ktss->ecx, my_ktss->edx, my_ktss->esp, my_ktss->ebp, my_ktss->esi, my_ktss->edi, - my_ktss->eflags, my_ktss->eip); + my_ktss->eflags, my_ktss->eip, virtualized ? 
" VMM" : ""); } /* @@ -233,7 +234,7 @@ panic_64(x86_saved_state_t *sp, __unused int pc, __unused const char *msg, boole "RSP: 0x%016qx, RBP: 0x%016qx, RSI: 0x%016qx, RDI: 0x%016qx\n" "R8: 0x%016qx, R9: 0x%016qx, R10: 0x%016qx, R11: 0x%016qx\n" "R12: 0x%016qx, R13: 0x%016qx, R14: 0x%016qx, R15: 0x%016qx\n" - "RFL: 0x%016qx, RIP: 0x%016qx, CR2: 0x%016qx\n", + "RFL: 0x%016qx, RIP: 0x%016qx, CR2: 0x%016qx%s\n", msg, ss64p->isf.trapno, ss64p->isf.err, (uint32_t)get_cr0(), (uint32_t)get_cr2(), (uint32_t)get_cr3(), (uint32_t)get_cr4(), @@ -241,7 +242,8 @@ panic_64(x86_saved_state_t *sp, __unused int pc, __unused const char *msg, boole ss64p->isf.rsp, ss64p->rbp, ss64p->rsi, ss64p->rdi, ss64p->r8, ss64p->r9, ss64p->r10, ss64p->r11, ss64p->r12, ss64p->r13, ss64p->r14, ss64p->r15, - ss64p->isf.rflags, ss64p->isf.rip, ss64p->cr2); + ss64p->isf.rflags, ss64p->isf.rip, ss64p->cr2, + virtualized ? " VMM" : ""); } else { x86_saved_state32_t *ss32p = saved_state32(sp); panic("%s at 0x%08x, trapno:0x%x, err:0x%x," @@ -249,13 +251,13 @@ panic_64(x86_saved_state_t *sp, __unused int pc, __unused const char *msg, boole "CR0: 0x%08x, CR2: 0x%08x, CR3: 0x%08x, CR4: 0x%08x\n" "EAX: 0x%08x, EBX: 0x%08x, ECX: 0x%08x, EDX: 0x%08x\n" "ESP: 0x%08x, EBP: 0x%08x, ESI: 0x%08x, EDI: 0x%08x\n" - "EFL: 0x%08x, EIP: 0x%08x\n", + "EFL: 0x%08x, EIP: 0x%08x%s\n", msg, ss32p->eip, ss32p->trapno, ss32p->err, (uint32_t)get_cr0(), (uint32_t)get_cr2(), (uint32_t)get_cr3(), (uint32_t)get_cr4(), ss32p->eax, ss32p->ebx, ss32p->ecx, ss32p->edx, ss32p->uesp, ss32p->ebp, ss32p->esi, ss32p->edi, - ss32p->efl, ss32p->eip); + ss32p->efl, ss32p->eip, virtualized ? " VMM" : ""); } #else x86_saved_state64_t *regs = saved_state64(sp); @@ -266,7 +268,7 @@ panic_64(x86_saved_state_t *sp, __unused int pc, __unused const char *msg, boole "R8: 0x%016llx, R9: 0x%016llx, R10: 0x%016llx, R11: 0x%016llx\n" "R12: 0x%016llx, R13: 0x%016llx, R14: 0x%016llx, R15: 0x%016llx\n" "RFL: 0x%016llx, RIP: 0x%016llx, CS: 0x%016llx, SS: 0x%016llx\n" - "Error code: 0x%016llx\n", + "Error code: 0x%016llx%s\n", msg, regs->isf.rip, get_cr0(), get_cr2(), get_cr3_raw(), get_cr4(), @@ -275,7 +277,7 @@ panic_64(x86_saved_state_t *sp, __unused int pc, __unused const char *msg, boole regs->r8, regs->r9, regs->r10, regs->r11, regs->r12, regs->r13, regs->r14, regs->r15, regs->isf.rflags, regs->isf.rip, regs->isf.cs & 0xFFFF, regs->isf.ss & 0xFFFF, - regs->isf.err); + regs->isf.err, virtualized ? 
" VMM" : ""); #endif } diff --git a/osfmk/i386/tsc.c b/osfmk/i386/tsc.c index b4bf3dfbd..02b41779c 100644 --- a/osfmk/i386/tsc.c +++ b/osfmk/i386/tsc.c @@ -36,7 +36,6 @@ */ #include -#include #include @@ -133,9 +132,33 @@ EFI_FSB_frequency(void) void tsc_init(void) { - uint64_t busFCvtInt = 0; boolean_t N_by_2_bus_ratio = FALSE; + if (cpuid_vmm_present()) { + kprintf("VMM vendor %u TSC frequency %u KHz bus frequency %u KHz\n", + cpuid_vmm_info()->cpuid_vmm_family, + cpuid_vmm_info()->cpuid_vmm_tsc_frequency, + cpuid_vmm_info()->cpuid_vmm_bus_frequency); + + if (cpuid_vmm_info()->cpuid_vmm_tsc_frequency && + cpuid_vmm_info()->cpuid_vmm_bus_frequency) { + + busFreq = (uint64_t)cpuid_vmm_info()->cpuid_vmm_bus_frequency * kilo; + busFCvtt2n = ((1 * Giga) << 32) / busFreq; + busFCvtn2t = 0xFFFFFFFFFFFFFFFFULL / busFCvtt2n; + + tscFreq = (uint64_t)cpuid_vmm_info()->cpuid_vmm_tsc_frequency * kilo; + tscFCvtt2n = ((1 * Giga) << 32) / tscFreq; + tscFCvtn2t = 0xFFFFFFFFFFFFFFFFULL / tscFCvtt2n; + + tscGranularity = tscFreq / busFreq; + + bus2tsc = tmrCvt(busFCvtt2n, tscFCvtn2t); + + return; + } + } + /* * Get the FSB frequency and conversion factors from EFI. */ @@ -146,7 +169,6 @@ tsc_init(void) case CPUFAMILY_INTEL_SANDYBRIDGE: case CPUFAMILY_INTEL_WESTMERE: case CPUFAMILY_INTEL_NEHALEM: { - uint64_t cpu_mhz; uint64_t msr_flex_ratio; uint64_t msr_platform_info; @@ -170,8 +192,6 @@ tsc_init(void) if (busFreq == 0) busFreq = BASE_NHM_CLOCK_SOURCE; - cpu_mhz = tscGranularity * BASE_NHM_CLOCK_SOURCE; - break; } default: { @@ -186,19 +206,16 @@ tsc_init(void) if (busFreq != 0) { busFCvtt2n = ((1 * Giga) << 32) / busFreq; busFCvtn2t = 0xFFFFFFFFFFFFFFFFULL / busFCvtt2n; - busFCvtInt = tmrCvt(1 * Peta, 0xFFFFFFFFFFFFFFFFULL / busFreq); } else { panic("tsc_init: EFI not supported!\n"); } - kprintf(" BUS: Frequency = %6d.%04dMHz, " - "cvtt2n = %08X.%08X, cvtn2t = %08X.%08X, " - "cvtInt = %08X.%08X\n", + kprintf(" BUS: Frequency = %6d.%06dMHz, " + "cvtt2n = %08Xx.%08Xx, cvtn2t = %08Xx.%08Xx\n", (uint32_t)(busFreq / Mega), (uint32_t)(busFreq % Mega), (uint32_t)(busFCvtt2n >> 32), (uint32_t)busFCvtt2n, - (uint32_t)(busFCvtn2t >> 32), (uint32_t)busFCvtn2t, - (uint32_t)(busFCvtInt >> 32), (uint32_t)busFCvtInt); + (uint32_t)(busFCvtn2t >> 32), (uint32_t)busFCvtn2t); /* * Get the TSC increment. The TSC is incremented by this @@ -206,8 +223,12 @@ tsc_init(void) * to and from nano-seconds. * The tsc granularity is also called the "bus ratio". If the N/2 bit * is set this indicates the bus ration is 0.5 more than this - i.e. - * that the true bus ratio is (2*tscGranularity + 1)/2. + * that the true bus ratio is (2*tscGranularity + 1)/2. If we cannot + * determine the TSC conversion, assume it ticks at the bus frequency. 
*/ + if (tscGranularity == 0) + tscGranularity = 1; + if (N_by_2_bus_ratio) tscFCvtt2n = busFCvtt2n * 2 / (1 + 2*tscGranularity); else @@ -216,8 +237,8 @@ tsc_init(void) tscFreq = ((1 * Giga) << 32) / tscFCvtt2n; tscFCvtn2t = 0xFFFFFFFFFFFFFFFFULL / tscFCvtt2n; - kprintf(" TSC: Frequency = %6d.%04dMHz, " - "cvtt2n = %08X.%08X, cvtn2t = %08X.%08X, gran = %lld%s\n", + kprintf(" TSC: Frequency = %6d.%06dMHz, " + "cvtt2n = %08Xx.%08Xx, cvtn2t = %08Xx.%08Xx, gran = %lld%s\n", (uint32_t)(tscFreq / Mega), (uint32_t)(tscFreq % Mega), (uint32_t)(tscFCvtt2n >> 32), (uint32_t)tscFCvtt2n, diff --git a/osfmk/i386/vmx/vmx_asm.h b/osfmk/i386/vmx/vmx_asm.h index c295f6b03..51a18b29d 100644 --- a/osfmk/i386/vmx/vmx_asm.h +++ b/osfmk/i386/vmx/vmx_asm.h @@ -28,111 +28,11 @@ #ifndef _I386_VMX_ASM_H_ #define _I386_VMX_ASM_H_ - -#include -#include -#include -#include -#include #define VMX_FAIL_INVALID -1 #define VMX_FAIL_VALID -2 #define VMX_SUCCEED 0 -__attribute__((always_inline)) static inline void enter_64bit_mode(void) { - __asm__ __volatile__ ( - ".byte 0xea /* far jump longmode */ \n\t" - ".long 1f \n\t" - ".word %P0 \n\t" - ".code64 \n\t" - "1:" - :: "i" (KERNEL64_CS) - ); -} -__attribute__((always_inline)) static inline void enter_compat_mode(void) { - asm( - "ljmp *4f \n\t" - "4: \n\t" - ".long 5f \n\t" - ".word %P0 \n\t" - ".code32 \n\t" - "5:" - :: "i" (KERNEL32_CS) - ); -} - -#define __VMXOFF(res) \ - __asm__ __volatile__ ( \ - "vmxoff \n\t" \ - "cmovcl %2, %0 \n\t" /* CF = 1, ZF = 0 */ \ - "cmovzl %3, %0" /* CF = 0, ZF = 1 */ \ - : "=&r" (res) \ - : "0" (VMX_SUCCEED), \ - "r" (VMX_FAIL_INVALID), \ - "r" (VMX_FAIL_VALID) \ - : "memory", "cc" \ - ) - -#define __VMXON(addr, res) \ - __asm__ __volatile__ ( \ - "vmxon %4 \n\t" \ - "cmovcl %2, %0 \n\t" /* CF = 1, ZF = 0 */ \ - "cmovzl %3, %0" /* CF = 0, ZF = 1 */ \ - : "=&r" (res) \ - : "0" (VMX_SUCCEED), \ - "r" (VMX_FAIL_INVALID), \ - "r" (VMX_FAIL_VALID), \ - "m" (*addr) \ - : "memory", "cc" \ - ); - - -/* - * __vmxoff -- Leave VMX Operation - * - */ -static inline int -__vmxoff(void) -{ - int result; -#if defined (__x86_64__) - __VMXOFF(result); -#else - if (ml_is64bit()) { - /* don't put anything between these lines! */ - enter_64bit_mode(); - __VMXOFF(result); - enter_compat_mode(); - } else { - __VMXOFF(result); - } -#endif - return result; -} - -/* - * __vmxon -- Enter VMX Operation - * - */ - static inline int -__vmxon(addr64_t *v) - { - int result; -#if defined (__x86_64__) - __VMXON(v, result); -#else - if (ml_is64bit()) { - /* don't put anything between these lines! 
*/ - enter_64bit_mode(); - __VMXON(v, result); - enter_compat_mode(); - } else { - __VMXON(v, result); - } -#endif - return result; -} - /* * VMX Capability Registers (VCR) * diff --git a/osfmk/i386/vmx/vmx_cpu.c b/osfmk/i386/vmx/vmx_cpu.c index 22cebe2d8..76b4c0ee3 100644 --- a/osfmk/i386/vmx/vmx_cpu.c +++ b/osfmk/i386/vmx/vmx_cpu.c @@ -211,7 +211,14 @@ vmx_on(void *arg __unused) assert(vmx_is_cr0_valid(&cpu->specs)); assert(vmx_is_cr4_valid(&cpu->specs)); - if ((result = __vmxon(&vmxon_region_paddr)) != VMX_SUCCEED) { +#if defined(__i386__) + if (!cpu_mode_is64bit()) + result = VMX_FAIL_INVALID; /* Not supported in legacy mode */ + else +#endif + result = __vmxon(vmxon_region_paddr); + + if (result != VMX_SUCCEED) { panic("vmx_on: unexpected return %d from __vmxon()", result); } } @@ -226,7 +233,14 @@ vmx_off(void *arg __unused) int result; /* Tell the CPU to release the VMXON region */ - if ((result = __vmxoff()) != VMX_SUCCEED) { +#if defined(__i386__) + if (!cpu_mode_is64bit()) + result = VMX_FAIL_INVALID; /* Not supported in legacy mode */ + else +#endif + result = __vmxoff(); + + if (result != VMX_SUCCEED) { panic("vmx_off: unexpected return %d from __vmxoff()", result); } } diff --git a/osfmk/i386/vmx/vmx_cpu.h b/osfmk/i386/vmx/vmx_cpu.h index 255ba421c..bb9f5ad51 100644 --- a/osfmk/i386/vmx/vmx_cpu.h +++ b/osfmk/i386/vmx/vmx_cpu.h @@ -93,4 +93,16 @@ void vmx_get_specs(void); void vmx_resume(void); void vmx_suspend(void); +/* + * __vmxoff -- Leave VMX Operation + * + */ +extern int __vmxoff(void); + +/* + * __vmxon -- Enter VMX Operation + * + */ +extern int __vmxon(addr64_t v); + #endif /* _I386_VMX_CPU_H_ */ diff --git a/osfmk/ipc/ipc_entry.c b/osfmk/ipc/ipc_entry.c index 595660239..e14c8d5e9 100644 --- a/osfmk/ipc/ipc_entry.c +++ b/osfmk/ipc/ipc_entry.c @@ -63,7 +63,6 @@ * Primitive functions to manipulate translation entries. */ -#include #include #include @@ -72,64 +71,15 @@ #include #include #include -#if MACH_KDB -#include -#endif #include #include #include -#include #include #include #include #include #include -zone_t ipc_tree_entry_zone; - - - -/* - * Forward declarations - */ -boolean_t ipc_entry_tree_collision( - ipc_space_t space, - mach_port_name_t name); - -/* - * Routine: ipc_entry_tree_collision - * Purpose: - * Checks if "name" collides with an allocated name - * in the space's tree. That is, returns TRUE - * if the splay tree contains a name with the same - * index as "name". - * Conditions: - * The space is locked (read or write) and active. - */ - -boolean_t -ipc_entry_tree_collision( - ipc_space_t space, - mach_port_name_t name) -{ - mach_port_index_t index; - mach_port_name_t lower, upper; - - assert(space->is_active); - - /* - * Check if we collide with the next smaller name - * or the next larger name. - */ - - ipc_splay_tree_bounds(&space->is_tree, name, &lower, &upper); - - index = MACH_PORT_INDEX(name); - return (((lower != (mach_port_name_t)~0) && - (MACH_PORT_INDEX(lower) == index)) || - ((upper != 0) && (MACH_PORT_INDEX(upper) == index))); -} - /* * Routine: ipc_entry_lookup * Purpose: @@ -147,44 +97,18 @@ ipc_entry_lookup( mach_port_index_t index; ipc_entry_t entry; - assert(space->is_active); + assert(is_active(space)); index = MACH_PORT_INDEX(name); - /* - * If space is fast, we assume no splay tree and name within table - * bounds, but still check generation numbers (if enabled) and - * look for null entries. 
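The simplified ipc_entry_lookup() above trusts the table alone: the port name's index selects the slot and its generation must match, so a recycled slot rejects stale names instead of aliasing a new right. An illustrative sketch of that index/generation scheme; the 24/8 bit split and helper names are assumptions that merely mirror the shape of the MACH_PORT_* encoding, not its actual definition.

    #include <stdint.h>
    #include <stdio.h>

    #define PORT_INDEX(name)      ((name) >> 8)
    #define PORT_GEN(name)        ((name) & 0xFF)
    #define PORT_MAKE(index, gen) (((index) << 8) | (gen))

    struct entry {
        uint8_t gen;   /* bumped every time the slot is reused */
        int     in_use;
    };

    /* Bounds check plus generation check: a stale name whose slot was
     * recycled fails the generation compare. */
    static struct entry *
    entry_lookup(struct entry *table, uint32_t size, uint32_t name)
    {
        uint32_t index = PORT_INDEX(name);

        if (index >= size)
            return NULL;
        if (!table[index].in_use || table[index].gen != PORT_GEN(name))
            return NULL;
        return &table[index];
    }

    int
    main(void)
    {
        struct entry table[4] = { [2] = { .gen = 3, .in_use = 1 } };

        printf("live name:  %p\n", (void *)entry_lookup(table, 4, PORT_MAKE(2, 3)));
        printf("stale name: %p\n", (void *)entry_lookup(table, 4, PORT_MAKE(2, 2)));
        return 0;
    }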
- */ - if (is_fast_space(space)) { - entry = &space->is_table[index]; + if (index < space->is_table_size) { + entry = &space->is_table[index]; if (IE_BITS_GEN(entry->ie_bits) != MACH_PORT_GEN(name) || IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE) - entry = IE_NULL; + entry = IE_NULL; } - else - if (index < space->is_table_size) { - entry = &space->is_table[index]; - if (IE_BITS_GEN(entry->ie_bits) != MACH_PORT_GEN(name)) - if (entry->ie_bits & IE_BITS_COLLISION) { - assert(space->is_tree_total > 0); - goto tree_lookup; - } else - entry = IE_NULL; - else if (IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE) - entry = IE_NULL; - } else if (space->is_tree_total == 0) - entry = IE_NULL; else { - tree_lookup: - entry = (ipc_entry_t) - ipc_splay_tree_lookup(&space->is_tree, name); - /* with sub-space introduction, an entry may appear in */ - /* the splay tree and yet not show rights for this subspace */ - if(entry != IE_NULL) { - if(!(IE_BITS_TYPE(entry->ie_bits))) - entry = IE_NULL; - } + entry = IE_NULL; } assert((entry == IE_NULL) || IE_BITS_TYPE(entry->ie_bits)); @@ -213,7 +137,7 @@ ipc_entry_get( mach_port_index_t first_free; ipc_entry_t free_entry; - assert(space->is_active); + assert(is_active(space)); { table = space->is_table; @@ -222,6 +146,7 @@ ipc_entry_get( if (first_free == 0) return KERN_NO_SPACE; + assert(first_free < space->is_table_size); free_entry = &table[first_free]; table->ie_next = free_entry->ie_next; } @@ -280,7 +205,7 @@ ipc_entry_alloc( is_write_lock(space); for (;;) { - if (!space->is_active) { + if (!is_active(space)) { is_write_unlock(space); return KERN_INVALID_TASK; } @@ -308,6 +233,7 @@ ipc_entry_alloc( * KERN_SUCCESS Allocated a new entry. * KERN_INVALID_TASK The space is dead. * KERN_RESOURCE_SHORTAGE Couldn't allocate memory. + * KERN_FAILURE Couldn't allocate requested name. */ kern_return_t @@ -318,7 +244,6 @@ ipc_entry_alloc_name( { mach_port_index_t index = MACH_PORT_INDEX(name); mach_port_gen_t gen = MACH_PORT_GEN(name); - ipc_tree_entry_t tentry = ITE_NULL; assert(MACH_PORT_VALID(name)); @@ -327,12 +252,9 @@ ipc_entry_alloc_name( for (;;) { ipc_entry_t entry; - ipc_tree_entry_t tentry2; - ipc_table_size_t its; - if (!space->is_active) { + if (!is_active(space)) { is_write_unlock(space); - if (tentry) ite_free(tentry); return KERN_INVALID_TASK; } @@ -352,18 +274,27 @@ ipc_entry_alloc_name( entry = &table[index]; if (index == 0) { + /* case #1 - the entry is reserved */ assert(!IE_BITS_TYPE(entry->ie_bits)); assert(!IE_BITS_GEN(entry->ie_bits)); + is_write_unlock(space); + return KERN_FAILURE; } else if (IE_BITS_TYPE(entry->ie_bits)) { if (IE_BITS_GEN(entry->ie_bits) == gen) { + /* case #2 -- the entry is inuse, for the same name */ *entryp = entry; - assert(!tentry); return KERN_SUCCESS; + } else { + /* case #3 -- the entry is inuse, for a different name. */ + /* Collisions are not allowed */ + is_write_unlock(space); + return KERN_FAILURE; } } else { mach_port_index_t free_index, next_index; /* + * case #4 -- the entry is free * Rip the entry out of the free list. 
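Case #4 above unlinks a specific slot from a free list threaded through the entries themselves. A small sketch of that rip-out, under the assumption of a slot-0 list head and an in-array next index, as in the kernel's table; field names are illustrative.

    #include <stdint.h>
    #include <stdio.h>

    struct entry {
        uint32_t next;  /* next free index; 0 terminates the chain */
        int      in_use;
    };

    /* Walk the chain from the slot-0 head until the wanted index is
     * found, then unlink it, as ipc_entry_alloc_name() does above. */
    static int
    alloc_named_index(struct entry *table, uint32_t want)
    {
        uint32_t prev = 0;

        for (uint32_t i = table[0].next; i != 0; prev = i, i = table[i].next) {
            if (i == want) {
                table[prev].next = table[i].next;  /* unlink */
                table[i].in_use = 1;
                return 0;
            }
        }
        return -1;  /* not on the free list */
    }

    int
    main(void)
    {
        /* Free chain: 0 -> 1 -> 2 -> 3 -> end. */
        struct entry table[4] = {
            { .next = 1 }, { .next = 2 }, { .next = 3 }, { .next = 0 },
        };

        printf("alloc 2: %d\n", alloc_named_index(table, 2));  /* 0: unlinked  */
        printf("alloc 2: %d\n", alloc_named_index(table, 2));  /* -1: now used */
        printf("1 now links to %u\n", table[1].next);          /* 3 */
        return 0;
    }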
*/ @@ -375,123 +306,36 @@ ipc_entry_alloc_name( table[free_index].ie_next = table[next_index].ie_next; + + /* mark the previous entry modified - reconstructing the name */ + ipc_entry_modified(space, + MACH_PORT_MAKE(free_index, + IE_BITS_GEN(table[free_index].ie_bits)), + &table[free_index]); entry->ie_bits = gen; entry->ie_request = IE_REQ_NONE; *entryp = entry; assert(entry->ie_object == IO_NULL); - if (is_fast_space(space)) - assert(!tentry); - else if (tentry) - ite_free(tentry); return KERN_SUCCESS; } } /* - * In a fast space, ipc_entry_alloc_name may be - * used only to add a right to a port name already - * known in this space. - */ - if (is_fast_space(space)) { - is_write_unlock(space); - assert(!tentry); - return KERN_FAILURE; - } - - /* - * Before trying to allocate any memory, - * check if the entry already exists in the tree. - * This avoids spurious resource errors. - * The splay tree makes a subsequent lookup/insert - * of the same name cheap, so this costs little. + * We grow the table so that the name + * index fits in the array space. + * Because the space will be unlocked, + * we must restart. */ - - if ((space->is_tree_total > 0) && - ((tentry2 = ipc_splay_tree_lookup(&space->is_tree, name)) - != ITE_NULL)) { - assert(tentry2->ite_space == space); - assert(IE_BITS_TYPE(tentry2->ite_bits)); - - *entryp = &tentry2->ite_entry; - if (tentry) ite_free(tentry); - return KERN_SUCCESS; - } - - its = space->is_table_next; - - /* - * Check if the table should be grown. - * - * Note that if space->is_table_size == its->its_size, - * then we won't ever try to grow the table. - * - * Note that we are optimistically assuming that name - * doesn't collide with any existing names. (So if - * it were entered into the tree, is_tree_small would - * be incremented.) This is OK, because even in that - * case, we don't lose memory by growing the table. - */ - if ((space->is_table_size <= index) && - (index < its->its_size) && - (((its->its_size - space->is_table_size) * - sizeof(struct ipc_entry)) < - ((space->is_tree_small + 1) * - sizeof(struct ipc_tree_entry)))) { - kern_return_t kr; - - /* - * Can save space by growing the table. - * Because the space will be unlocked, - * we must restart. - */ - - kr = ipc_entry_grow_table(space, ITS_SIZE_NONE); - assert(kr != KERN_NO_SPACE); - if (kr != KERN_SUCCESS) { - /* space is unlocked */ - if (tentry) ite_free(tentry); - return kr; - } - - continue; - } - - /* - * If a splay-tree entry was allocated previously, - * go ahead and insert it into the tree. - */ - - if (tentry != ITE_NULL) { - - space->is_tree_total++; - - if (index < space->is_table_size) { - entry = &space->is_table[index]; - entry->ie_bits |= IE_BITS_COLLISION; - } else if ((index < its->its_size) && - !ipc_entry_tree_collision(space, name)) - space->is_tree_small++; - - ipc_splay_tree_insert(&space->is_tree, name, tentry); - tentry->ite_bits = 0; - tentry->ite_request = 0; - tentry->ite_object = IO_NULL; - tentry->ite_space = space; - *entryp = &tentry->ite_entry; - return KERN_SUCCESS; + kern_return_t kr; + kr = ipc_entry_grow_table(space, index); + assert(kr != KERN_NO_SPACE); + if (kr != KERN_SUCCESS) { + /* space is unlocked */ + return kr; } - - /* - * Allocate a tree entry and try again. 
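When the requested index lies beyond the current table, the rewritten path above grows the table (dropping the space lock) and restarts the loop so every check is redone under the lock. A single-threaded sketch of that unlock-grow-retry control flow; grow_table simply doubles rather than consulting an ipc_table_size array, and all names are invented.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct space {
        int      *table;
        unsigned size;
    };

    static int
    grow_table(struct space *s, unsigned need)
    {
        unsigned nsize = s->size;

        while (nsize <= need)       /* pick the next size that fits */
            nsize *= 2;

        int *ntable = calloc(nsize, sizeof(*ntable));
        if (ntable == NULL)
            return -1;              /* roughly KERN_RESOURCE_SHORTAGE */
        memcpy(ntable, s->table, s->size * sizeof(*ntable));
        free(s->table);
        s->table = ntable;
        s->size = nsize;
        return 0;
    }

    static int
    alloc_at(struct space *s, unsigned index)
    {
        for (;;) {
            if (index < s->size) {
                s->table[index] = 1;    /* claim the slot */
                return 0;
            }
            /* The kernel drops the space lock here, grows, then restarts
             * the loop so the state is revalidated under the lock. */
            if (grow_table(s, index) != 0)
                return -1;
        }
    }

    int
    main(void)
    {
        struct space s = { .table = calloc(4, sizeof(int)), .size = 4 };

        printf("alloc at 9: %d (size now %u)\n", alloc_at(&s, 9), s.size);
        free(s.table);
        return 0;
    }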
- */ - - is_write_unlock(space); - tentry = ite_alloc(); - if (tentry == ITE_NULL) - return KERN_RESOURCE_SHORTAGE; - is_write_lock(space); + continue; } } @@ -514,7 +358,7 @@ ipc_entry_dealloc( ipc_entry_num_t size; mach_port_index_t index; - assert(space->is_active); + assert(is_active(space)); assert(entry->ie_object == IO_NULL); assert(entry->ie_request == IE_REQ_NONE); @@ -527,113 +371,78 @@ ipc_entry_dealloc( table = space->is_table; size = space->is_table_size; - if (is_fast_space(space)) { - assert(index < size); - assert(entry == &table[index]); + if ((index < size) && (entry == &table[index])) { assert(IE_BITS_GEN(entry->ie_bits) == MACH_PORT_GEN(name)); - assert(!(entry->ie_bits & IE_BITS_COLLISION)); entry->ie_bits &= IE_BITS_GEN_MASK; entry->ie_next = table->ie_next; table->ie_next = index; - return; - } - - - if ((index < size) && (entry == &table[index])) { - assert(IE_BITS_GEN(entry->ie_bits) == MACH_PORT_GEN(name)); - - if (entry->ie_bits & IE_BITS_COLLISION) { - struct ipc_splay_tree small, collisions; - ipc_tree_entry_t tentry; - mach_port_name_t tname; - boolean_t pick; - ipc_object_t obj; - - /* must move an entry from tree to table */ - - ipc_splay_tree_split(&space->is_tree, - MACH_PORT_MAKE(index+1, 0), - &collisions); - ipc_splay_tree_split(&collisions, - MACH_PORT_MAKE(index, 0), - &small); - - pick = ipc_splay_tree_pick(&collisions, - &tname, &tentry); - assert(pick); - assert(MACH_PORT_INDEX(tname) == index); - - entry->ie_object = obj = tentry->ite_object; - entry->ie_bits = tentry->ite_bits|MACH_PORT_GEN(tname); - entry->ie_request = tentry->ite_request; - - assert(tentry->ite_space == space); - - if (IE_BITS_TYPE(tentry->ite_bits)==MACH_PORT_TYPE_SEND) { - ipc_hash_global_delete(space, obj, - tname, tentry); - ipc_hash_local_insert(space, obj, - index, entry); - } - - ipc_splay_tree_delete(&collisions, tname, tentry); - - assert(space->is_tree_total > 0); - space->is_tree_total--; - - /* check if collision bit should still be on */ - - pick = ipc_splay_tree_pick(&collisions, - &tname, &tentry); - if (pick) { - entry->ie_bits |= IE_BITS_COLLISION; - ipc_splay_tree_join(&space->is_tree, - &collisions); - } - - ipc_splay_tree_join(&space->is_tree, &small); - - } else { - entry->ie_bits &= IE_BITS_GEN_MASK; - entry->ie_next = table->ie_next; - table->ie_next = index; - } - } else { - ipc_tree_entry_t tentry = (ipc_tree_entry_t) entry; - - assert(tentry->ite_space == space); - - ipc_splay_tree_delete(&space->is_tree, name, tentry); + /* + * Nothing to do. The entry does not match + * so there is nothing to deallocate. + */ + assert(index < size); + assert(entry == &table[index]); + assert(IE_BITS_GEN(entry->ie_bits) == MACH_PORT_GEN(name)); + } + ipc_entry_modified(space, name, entry); +} - assert(space->is_tree_total > 0); - space->is_tree_total--; +/* + * Routine: ipc_entry_modified + * Purpose: + * Note that an entry was modified in a space. + * Conditions: + * Assumes exclusive write access to the space, + * either through a write lock or being the cleaner + * on an inactive space. 
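ipc_entry_modified() above only widens a [is_low_mod, is_high_mod] window; the grow path later rescans just that window instead of the whole table. A stand-alone sketch of the same dirty-range bookkeeping, with illustrative names.

    #include <stdio.h>

    struct dirty_range {
        unsigned low;   /* starts beyond the end: "nothing dirty" */
        unsigned high;
    };

    static void
    range_reset(struct dirty_range *r, unsigned table_size)
    {
        r->low = table_size;
        r->high = 0;
    }

    /* Widen the window to cover a modified index, as the kernel does. */
    static void
    range_note(struct dirty_range *r, unsigned index)
    {
        if (index < r->low)
            r->low = index;
        if (index > r->high)
            r->high = index;
    }

    int
    main(void)
    {
        struct dirty_range r;

        range_reset(&r, 64);
        range_note(&r, 10);
        range_note(&r, 3);
        range_note(&r, 40);

        /* A rescan loop would now visit indices 3..40 only. */
        printf("rescan [%u, %u]\n", r.low, r.high);
        return 0;
    }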
+ */ - if (index < size) { - ipc_entry_t ientry = &table[index]; +void +ipc_entry_modified( + ipc_space_t space, + mach_port_name_t name, + __assert_only ipc_entry_t entry) +{ + ipc_entry_t table; + ipc_entry_num_t size; + mach_port_index_t index; - assert(ientry->ie_bits & IE_BITS_COLLISION); - - if (!ipc_entry_tree_collision(space, name)) - ientry->ie_bits &= ~IE_BITS_COLLISION; + index = MACH_PORT_INDEX(name); + table = space->is_table; + size = space->is_table_size; - } else if ((index < space->is_table_next->its_size) && - !ipc_entry_tree_collision(space, name)) { + assert(index < size); + assert(entry == &table[index]); - assert(space->is_tree_small > 0); + assert(space->is_low_mod <= size); + assert(space->is_high_mod < size); - space->is_tree_small--; - } - } + if (index < space->is_low_mod) + space->is_low_mod = index; + if (index > space->is_high_mod) + space->is_high_mod = index; } +#define IPC_ENTRY_GROW_STATS 1 +#if IPC_ENTRY_GROW_STATS +static uint64_t ipc_entry_grow_count = 0; +static uint64_t ipc_entry_grow_rescan = 0; +static uint64_t ipc_entry_grow_rescan_max = 0; +static uint64_t ipc_entry_grow_rescan_entries = 0; +static uint64_t ipc_entry_grow_rescan_entries_max = 0; +static uint64_t ipc_entry_grow_freelist_entries = 0; +static uint64_t ipc_entry_grow_freelist_entries_max = 0; +#endif + /* * Routine: ipc_entry_grow_table * Purpose: * Grows the table in a space. * Conditions: * The space must be write-locked and active before. - * If successful, it is also returned locked. + * If successful, the space is also returned locked. + * On failure, the space is returned unlocked. * Allocates memory. * Returns: * KERN_SUCCESS Grew the table. @@ -650,342 +459,233 @@ ipc_entry_grow_table( { ipc_entry_num_t osize, size, nsize, psize; - do { - boolean_t reallocated=FALSE; - - ipc_entry_t otable, table; - ipc_table_size_t oits, its, nits; - mach_port_index_t i, free_index; - - assert(space->is_active); + ipc_entry_t otable, table; + ipc_table_size_t oits, its, nits; + mach_port_index_t i, free_index; + mach_port_index_t low_mod, hi_mod; + ipc_table_index_t sanity; +#if IPC_ENTRY_GROW_STATS + uint64_t rescan_count = 0; +#endif + assert(is_active(space)); - if (space->is_growing) { - /* - * Somebody else is growing the table. - * We just wait for them to finish. - */ + if (is_growing(space)) { + /* + * Somebody else is growing the table. + * We just wait for them to finish. + */ - is_write_sleep(space); - return KERN_SUCCESS; - } + is_write_sleep(space); + return KERN_SUCCESS; + } - otable = space->is_table; + otable = space->is_table; - its = space->is_table_next; - size = its->its_size; + its = space->is_table_next; + size = its->its_size; - /* - * Since is_table_next points to the next natural size - * we can identify the current size entry. - */ - oits = its - 1; - osize = oits->its_size; + /* + * Since is_table_next points to the next natural size + * we can identify the current size entry. + */ + oits = its - 1; + osize = oits->its_size; - /* - * If there is no target size, then the new size is simply - * specified by is_table_next. If there is a target - * size, then search for the next entry. 
- */ - if (target_size != ITS_SIZE_NONE) { - if (target_size <= osize) { - is_write_unlock(space); - return KERN_SUCCESS; - } - - psize = osize; - while ((psize != size) && (target_size > size)) { - psize = size; - its++; - size = its->its_size; - } - if (psize == size) { - is_write_unlock(space); - return KERN_NO_SPACE; - } + /* + * If there is no target size, then the new size is simply + * specified by is_table_next. If there is a target + * size, then search for the next entry. + */ + if (target_size != ITS_SIZE_NONE) { + if (target_size <= osize) { + /* the space is locked */ + return KERN_SUCCESS; } - if (osize == size) { + psize = osize; + while ((psize != size) && (target_size > size)) { + psize = size; + its++; + size = its->its_size; + } + if (psize == size) { is_write_unlock(space); return KERN_NO_SPACE; } - - nits = its + 1; - nsize = nits->its_size; - - assert((osize < size) && (size <= nsize)); - - /* - * OK, we'll attempt to grow the table. - * The realloc requires that the old table - * remain in existence. - */ + } - space->is_growing = TRUE; + if (osize == size) { is_write_unlock(space); + return KERN_NO_SPACE; + } + + nits = its + 1; + nsize = nits->its_size; + assert((osize < size) && (size <= nsize)); - if (it_entries_reallocable(oits)) { - table = it_entries_realloc(oits, otable, its); - reallocated=TRUE; - } - else { - table = it_entries_alloc(its); - } + /* + * We'll attempt to grow the table. + * + * Because we will be copying without the space lock, reset + * the lowest_mod index to just beyond the end of the current + * table. Modification of entries (other than hashes) will + * bump this downward, and we only have to reprocess entries + * above that mark. Eventually, we'll get done. + */ + is_start_growing(space); + space->is_low_mod = osize; + space->is_high_mod = 0; +#if IPC_ENTRY_GROW_STATS + ipc_entry_grow_count++; +#endif + is_write_unlock(space); + table = it_entries_alloc(its); + if (table == IE_NULL) { is_write_lock(space); - space->is_growing = FALSE; - - /* - * We need to do a wakeup on the space, - * to rouse waiting threads. We defer - * this until the space is unlocked, - * because we don't want them to spin. - */ - - if (table == IE_NULL) { - is_write_unlock(space); - thread_wakeup((event_t) space); - return KERN_RESOURCE_SHORTAGE; - } - - if (!space->is_active) { - /* - * The space died while it was unlocked. - */ - - is_write_unlock(space); - thread_wakeup((event_t) space); - it_entries_free(its, table); - is_write_lock(space); - return KERN_SUCCESS; - } + is_done_growing(space); + is_write_unlock(space); + thread_wakeup((event_t) space); + return KERN_RESOURCE_SHORTAGE; + } - assert(space->is_table == otable); - assert((space->is_table_next == its) || - (target_size != ITS_SIZE_NONE)); - assert(space->is_table_size == osize); + /* initialize new entries (free chain in backwards order) */ + for (i = osize; i < size; i++) { + table[i].ie_object = IO_NULL; + table[i].ie_bits = IE_BITS_GEN_MASK; + table[i].ie_index = 0; + table[i].ie_next = i + 1; + } + table[size-1].ie_next = 0; - space->is_table = table; - space->is_table_size = size; - space->is_table_next = nits; + /* clear out old entries in new table */ + memset((void *)table, 0, osize * sizeof(*table)); - /* - * If we did a realloc, it remapped the data. - * Otherwise we copy by hand first. Then we have - * to zero the new part and the old local hash - * values. 
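The initialization loop in the new ipc_entry_grow_table() threads the freshly added tail of the table into an embedded free chain terminated at slot 0. The same idea in isolation, with illustrative field names; this is a sketch, not the kernel's struct ipc_entry.

    #include <stdint.h>
    #include <stdio.h>

    struct entry {
        uint32_t next;
    };

    /* Chain slots [start, size) through their own next fields,
     * terminating the list at 0, as the hunk above does. */
    static void
    init_free_chain(struct entry *table, uint32_t start, uint32_t size)
    {
        for (uint32_t i = start; i < size; i++)
            table[i].next = i + 1;
        table[size - 1].next = 0;
    }

    int
    main(void)
    {
        struct entry table[8] = { 0 };

        init_free_chain(table, 4, 8);   /* grow from 4 to 8 slots */
        for (uint32_t i = 4; i < 8; i++)
            printf("slot %u -> %u\n", i, table[i].next);
        return 0;
    }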
- */ - if (!reallocated) - (void) memcpy((void *) table, (const void *) otable, - osize * (sizeof(struct ipc_entry))); + low_mod = 0; + hi_mod = osize - 1; + rescan: + /* + * Within the range of the table that changed, determine what we + * have to take action on. For each entry, take a snapshot of the + * corresponding entry in the old table (so it won't change + * during this iteration). The snapshot may not be self-consistent + * (if we caught it in the middle of being changed), so be very + * cautious with the values. + */ + for (i = low_mod; i <= hi_mod; i++) { + ipc_entry_t entry = &table[i]; + struct ipc_entry osnap = otable[i]; - for (i = 0; i < osize; i++) - table[i].ie_index = 0; + if (entry->ie_object != osnap.ie_object || + IE_BITS_TYPE(entry->ie_bits) != IE_BITS_TYPE(osnap.ie_bits)) { + + if (entry->ie_object != IO_NULL && + IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_SEND) + ipc_hash_table_delete(table, size, entry->ie_object, i, entry); - (void) memset((void *) (table + osize) , 0, - ((size - osize) * (sizeof(struct ipc_entry)))); + entry->ie_object = osnap.ie_object; + entry->ie_bits = osnap.ie_bits; + entry->ie_request = osnap.ie_request; /* or ie_next */ - /* - * Put old entries into the reverse hash table. - */ - for (i = 0; i < osize; i++) { - ipc_entry_t entry = &table[i]; - - if (IE_BITS_TYPE(entry->ie_bits)==MACH_PORT_TYPE_SEND) { - ipc_hash_local_insert(space, entry->ie_object, - i, entry); - } + if (entry->ie_object != IO_NULL && + IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_SEND) + ipc_hash_table_insert(table, size, entry->ie_object, i, entry); + } else { + assert(entry->ie_object == osnap.ie_object); + entry->ie_bits = osnap.ie_bits; + entry->ie_request = osnap.ie_request; /* or ie_next */ } - /* - * If there are entries in the splay tree, - * then we have work to do: - * 1) transfer entries to the table - * 2) update is_tree_small - */ - assert(!is_fast_space(space) || space->is_tree_total == 0); - if (space->is_tree_total > 0) { - mach_port_index_t index; - boolean_t delete; - struct ipc_splay_tree ignore; - struct ipc_splay_tree move; - struct ipc_splay_tree small; - ipc_entry_num_t nosmall; - ipc_tree_entry_t tentry; - - /* - * The splay tree divides into four regions, - * based on the index of the entries: - * 1) 0 <= index < osize - * 2) osize <= index < size - * 3) size <= index < nsize - * 4) nsize <= index - * - * Entries in the first part are ignored. - * Entries in the second part, that don't - * collide, are moved into the table. - * Entries in the third part, that don't - * collide, are counted for is_tree_small. - * Entries in the fourth part are ignored. 
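Review note: the rescan loop copies each old entry into a local snapshot first, so every field it inspects comes from one read of a slot that may still be changing, and it only re-hashes entries whose object or type actually moved. A rough user-space rendering of that compare-and-fixup step; hash_delete and hash_insert are placeholders for ipc_hash_table_delete and ipc_hash_table_insert, and the bit layout is invented.

#include <stddef.h>

struct entry {
    void        *object;
    unsigned int bits;     /* some bits: type; other bits: generation */
    unsigned int request;
};

#define TYPE_MASK  0x001f0000u
#define TYPE_SEND  0x00010000u  /* invented value for this sketch */

/* Placeholders for the reverse-hash maintenance calls. */
static void hash_delete(struct entry *t, size_t n, void *o, size_t i)
{ (void)t; (void)n; (void)o; (void)i; }
static void hash_insert(struct entry *t, size_t n, void *o, size_t i)
{ (void)t; (void)n; (void)o; (void)i; }

/* Reconcile new_t[lo..hi] with whatever now sits in old_t[lo..hi]. */
static void reconcile(struct entry *new_t, const struct entry *old_t,
                      size_t n, size_t lo, size_t hi)
{
    for (size_t i = lo; i <= hi; i++) {
        struct entry snap = old_t[i];   /* one-shot snapshot */
        struct entry *e = &new_t[i];

        if (e->object != snap.object ||
            (e->bits & TYPE_MASK) != (snap.bits & TYPE_MASK)) {
            /* stale send right: unhook it before overwriting */
            if (e->object != NULL && (e->bits & TYPE_MASK) == TYPE_SEND)
                hash_delete(new_t, n, e->object, i);

            e->object = snap.object;
            e->bits = snap.bits;
            e->request = snap.request;

            if (e->object != NULL && (e->bits & TYPE_MASK) == TYPE_SEND)
                hash_insert(new_t, n, e->object, i);
        } else {
            /* same identity: refresh only the mutable fields */
            e->bits = snap.bits;
            e->request = snap.request;
        }
    }
}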
- */ - - ipc_splay_tree_split(&space->is_tree, - MACH_PORT_MAKE(nsize, 0), - &small); - ipc_splay_tree_split(&small, - MACH_PORT_MAKE(size, 0), - &move); - ipc_splay_tree_split(&move, - MACH_PORT_MAKE(osize, 0), - &ignore); - - /* move entries into the table */ - - for (tentry = ipc_splay_traverse_start(&move); - tentry != ITE_NULL; - tentry = ipc_splay_traverse_next(&move, delete)) { - - mach_port_name_t name; - mach_port_gen_t gen; - mach_port_type_t type; - ipc_entry_bits_t bits; - ipc_object_t obj; - ipc_entry_t entry; - - name = tentry->ite_name; - gen = MACH_PORT_GEN(name); - index = MACH_PORT_INDEX(name); - - assert(tentry->ite_space == space); - assert((osize <= index) && (index < size)); - - entry = &table[index]; - bits = entry->ie_bits; - if (IE_BITS_TYPE(bits)) { - assert(IE_BITS_GEN(bits) != gen); - entry->ie_bits |= IE_BITS_COLLISION; - delete = FALSE; - continue; - } - - bits = tentry->ite_bits; - type = IE_BITS_TYPE(bits); - assert(type != MACH_PORT_TYPE_NONE); - - entry->ie_bits = bits | gen; - entry->ie_request = tentry->ite_request; - entry->ie_object = obj = tentry->ite_object; - - if (type == MACH_PORT_TYPE_SEND) { - ipc_hash_global_delete(space, obj, - name, tentry); - ipc_hash_local_insert(space, obj, - index, entry); - } - space->is_tree_total--; - delete = TRUE; - } - ipc_splay_traverse_finish(&move); - - /* count entries for is_tree_small */ - - nosmall = 0; index = 0; - for (tentry = ipc_splay_traverse_start(&small); - tentry != ITE_NULL; - tentry = ipc_splay_traverse_next(&small, FALSE)) { - mach_port_index_t nindex; - - nindex = MACH_PORT_INDEX(tentry->ite_name); - - if (nindex != index) { - nosmall++; - index = nindex; - } - } - ipc_splay_traverse_finish(&small); - - assert(nosmall <= (nsize - size)); - assert(nosmall <= space->is_tree_total); - space->is_tree_small = nosmall; + } + table[0].ie_next = otable[0].ie_next; /* always rebase the freelist */ - /* put the splay tree back together */ + /* + * find the end of the freelist (should be short). But be careful, + * the list items can change so only follow through truly free entries + * (no problem stopping short in those cases, because we'll rescan). + */ + free_index = 0; + for (sanity = 0; sanity < osize; sanity++) { + if (table[free_index].ie_object != IPC_OBJECT_NULL) + break; + i = table[free_index].ie_next; + if (i == 0 || i >= osize) + break; + free_index = i; + } +#if IPC_ENTRY_GROW_STATS + ipc_entry_grow_freelist_entries += sanity; + if (sanity > ipc_entry_grow_freelist_entries_max) + ipc_entry_grow_freelist_entries_max = sanity; +#endif + + is_write_lock(space); - ipc_splay_tree_join(&space->is_tree, &small); - ipc_splay_tree_join(&space->is_tree, &move); - ipc_splay_tree_join(&space->is_tree, &ignore); - } + /* + * We need to do a wakeup on the space, + * to rouse waiting threads. We defer + * this until the space is unlocked, + * because we don't want them to spin. + */ + if (!is_active(space)) { /* - * Add entries in the new part which still aren't used - * to the free list. Add them in reverse order, - * and set the generation number to -1, so that - * early allocations produce "natural" names. + * The space died while it was unlocked. */ - free_index = table[0].ie_next; - for (i = size-1; i >= osize; --i) { - ipc_entry_t entry = &table[i]; - - if (entry->ie_bits == 0) { - entry->ie_bits = IE_BITS_GEN_MASK; - entry->ie_next = free_index; - free_index = i; - } - } - table[0].ie_next = free_index; - - /* - * Now we need to free the old table. 
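Review note: the freelist-tail walk above is defensive twice over. It follows links only through slots that still look free, and it caps the hop count at osize so a cycle created by a racing mutation cannot hang the walk; stopping short is harmless because a later rescan pass repairs the list. A condensed sketch of that bounded walk, with an invented slot type:

#include <stddef.h>

struct slot {
    void  *object;  /* NULL while free */
    size_t next;    /* next free index; 0 terminates */
};

/* Find the last index of the free chain rooted at slot 0, giving up
 * (harmlessly) after `limit` hops or at the first suspicious link. */
static size_t freelist_tail(const struct slot *t, size_t limit)
{
    size_t tail = 0;

    for (size_t hops = 0; hops < limit; hops++) {
        if (t[tail].object != NULL)
            break;               /* not actually free: stale link */
        size_t n = t[tail].next;
        if (n == 0 || n >= limit)
            break;               /* end of chain, or out of range */
        tail = n;
    }
    return tail;
}

The tail found this way is where the run of brand-new slots gets appended once the space lock is retaken.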
- * If the space dies or grows while unlocked, - * then we can quit here. - */ + is_done_growing(space); is_write_unlock(space); thread_wakeup((event_t) space); - - it_entries_free(oits, otable); + it_entries_free(its, table); is_write_lock(space); - if (!space->is_active || (space->is_table_next != nits)) - return KERN_SUCCESS; + return KERN_SUCCESS; + } - /* - * We might have moved enough entries from - * the splay tree into the table that - * the table can be profitably grown again. - * - * Note that if size == nsize, then - * space->is_tree_small == 0. - */ - } while ((space->is_tree_small > 0) && - (((nsize - size) * sizeof(struct ipc_entry)) < - (space->is_tree_small * sizeof(struct ipc_tree_entry)))); + /* If the space changed while unlocked, go back and process the changes */ + if (space->is_low_mod < osize) { + assert(space->is_high_mod > 0); + low_mod = space->is_low_mod; + space->is_low_mod = osize; + hi_mod = space->is_high_mod; + space->is_high_mod = 0; + is_write_unlock(space); +#if IPC_ENTRY_GROW_STATS + rescan_count++; + if (rescan_count > ipc_entry_grow_rescan_max) + ipc_entry_grow_rescan_max = rescan_count; + + ipc_entry_grow_rescan++; + ipc_entry_grow_rescan_entries += hi_mod - low_mod + 1; + if (hi_mod - low_mod + 1 > ipc_entry_grow_rescan_entries_max) + ipc_entry_grow_rescan_entries_max = hi_mod - low_mod + 1; +#endif + goto rescan; + } - return KERN_SUCCESS; -} + /* link new free entries onto the rest of the freelist */ + assert(table[free_index].ie_next == 0 && + table[free_index].ie_object == IO_NULL); + table[free_index].ie_next = osize; + assert(space->is_table == otable); + assert((space->is_table_next == its) || + (target_size != ITS_SIZE_NONE)); + assert(space->is_table_size == osize); -#if MACH_KDB -#include -#define printf kdbprintf + space->is_table = table; + space->is_table_size = size; + space->is_table_next = nits; -ipc_entry_t db_ipc_object_by_name( - task_t task, - mach_port_name_t name); + is_done_growing(space); + is_write_unlock(space); + thread_wakeup((event_t) space); -ipc_entry_t -db_ipc_object_by_name( - task_t task, - mach_port_name_t name) -{ - ipc_space_t space = task->itk_space; - ipc_entry_t entry; - - - entry = ipc_entry_lookup(space, name); - if(entry != IE_NULL) { - iprintf("(task 0x%x, name 0x%x) ==> object 0x%x\n", - task, name, entry->ie_object); - return (ipc_entry_t) entry->ie_object; - } - return entry; + /* + * Now we need to free the old table. + */ + it_entries_free(oits, otable); + is_write_lock(space); + + return KERN_SUCCESS; } -#endif /* MACH_KDB */ diff --git a/osfmk/ipc/ipc_entry.h b/osfmk/ipc/ipc_entry.h index 14d7d1846..592a31c73 100644 --- a/osfmk/ipc/ipc_entry.h +++ b/osfmk/ipc/ipc_entry.h @@ -80,9 +80,6 @@ * Spaces hold capabilities for ipc_object_t's. * Each ipc_entry_t records a capability. Most capabilities have * small names, and the entries are elements of a table. - * Capabilities can have large names, and a splay tree holds - * those entries. The cutoff point between the table and the tree - * is adjusted dynamically to minimize memory consumption. * * The ie_index field of entries in the table implements * a ordered hash table with open addressing and linear probing. @@ -100,19 +97,15 @@ struct ipc_entry { struct ipc_object *ie_object; ipc_entry_bits_t ie_bits; + mach_port_index_t ie_index; union { mach_port_index_t next; /* next in freelist, or... 
*/ ipc_table_index_t request; /* dead name request notify */ } index; - union { - mach_port_index_t table; - struct ipc_tree_entry *tree; - } hash; }; #define ie_request index.request #define ie_next index.next -#define ie_index hash.table #define IE_REQ_NONE 0 /* no request */ @@ -122,9 +115,6 @@ struct ipc_entry { #define IE_BITS_TYPE_MASK 0x001f0000 /* 5 bits of capability type */ #define IE_BITS_TYPE(bits) ((bits) & IE_BITS_TYPE_MASK) -#define IE_BITS_COLLISION 0x00800000 /* 1 bit for collisions */ - - #ifndef NO_PORT_GEN #define IE_BITS_GEN_MASK 0xff000000 /* 8 bits for generation */ #define IE_BITS_GEN(bits) ((bits) & IE_BITS_GEN_MASK) @@ -140,24 +130,6 @@ struct ipc_entry { #define IE_BITS_RIGHT_MASK 0x007fffff /* relevant to the right */ -struct ipc_tree_entry { - struct ipc_entry ite_entry; - mach_port_name_t ite_name; - struct ipc_space *ite_space; - struct ipc_tree_entry *ite_lchild; - struct ipc_tree_entry *ite_rchild; -}; - -#define ite_bits ite_entry.ie_bits -#define ite_object ite_entry.ie_object -#define ite_request ite_entry.ie_request -#define ite_next ite_entry.hash.tree - -extern zone_t ipc_tree_entry_zone; - -#define ite_alloc() ((ipc_tree_entry_t) zalloc(ipc_tree_entry_zone)) -#define ite_free(ite) zfree(ipc_tree_entry_zone, (ite)) - /* * Exported interfaces */ @@ -191,6 +163,12 @@ extern void ipc_entry_dealloc( mach_port_name_t name, ipc_entry_t entry); +/* Mark and entry modified in a space */ +extern void ipc_entry_modified( + ipc_space_t space, + mach_port_name_t name, + ipc_entry_t entry); + /* Grow the table in a space */ extern kern_return_t ipc_entry_grow_table( ipc_space_t space, diff --git a/osfmk/ipc/ipc_hash.c b/osfmk/ipc/ipc_hash.c index 2f43a63cf..87f4fc6c3 100644 --- a/osfmk/ipc/ipc_hash.c +++ b/osfmk/ipc/ipc_hash.c @@ -87,20 +87,6 @@ * Forward declarations */ -/* Lookup (space, obj) in global hash table */ -boolean_t ipc_hash_global_lookup( - ipc_space_t space, - ipc_object_t obj, - mach_port_name_t *namep, - ipc_tree_entry_t *entryp); - -/* Insert an entry into the global reverse hash table */ -void ipc_hash_global_insert( - ipc_space_t space, - ipc_object_t obj, - mach_port_name_t name, - ipc_tree_entry_t entry); - /* Delete an entry from the local reverse hash table */ void ipc_hash_local_delete( ipc_space_t space, @@ -124,16 +110,7 @@ ipc_hash_lookup( mach_port_name_t *namep, ipc_entry_t *entryp) { - boolean_t rv; - - rv = ipc_hash_local_lookup(space, obj, namep, entryp); - if (!rv) { - assert(!is_fast_space(space) || space->is_tree_hash == 0); - if (space->is_tree_hash > 0) - rv = ipc_hash_global_lookup(space, obj, namep, - (ipc_tree_entry_t *) entryp); - } - return (rv); + return ipc_hash_table_lookup(space->is_table, space->is_table_size, obj, namep, entryp); } /* @@ -155,14 +132,7 @@ ipc_hash_insert( mach_port_index_t index; index = MACH_PORT_INDEX(name); - if ((index < space->is_table_size) && - (entry == &space->is_table[index])) - ipc_hash_local_insert(space, obj, index, entry); - else { - assert(!is_fast_space(space)); - ipc_hash_global_insert(space, obj, name, - (ipc_tree_entry_t) entry); - } + ipc_hash_table_insert(space->is_table, space->is_table_size, obj, index, entry); } /* @@ -183,184 +153,7 @@ ipc_hash_delete( mach_port_index_t index; index = MACH_PORT_INDEX(name); - if ((index < space->is_table_size) && - (entry == &space->is_table[index])) - ipc_hash_local_delete(space, obj, index, entry); - else { - assert(!is_fast_space(space)); - ipc_hash_global_delete(space, obj, name, - (ipc_tree_entry_t) entry); - } -} - -/* - * The global 
reverse hash table holds splay tree entries. - * It is a simple open-chaining hash table with singly-linked buckets. - * Each bucket is locked separately, with an exclusive lock. - * Within each bucket, move-to-front is used. - */ - -typedef natural_t ipc_hash_index_t; - -ipc_hash_index_t ipc_hash_global_size; -ipc_hash_index_t ipc_hash_global_mask; - -#define IH_GLOBAL_HASH(space, obj) \ - (((((ipc_hash_index_t) ((vm_offset_t)space)) >> 4) + \ - (((ipc_hash_index_t) ((vm_offset_t)obj)) >> 6)) & \ - ipc_hash_global_mask) - -typedef struct ipc_hash_global_bucket { - decl_lck_mtx_data(, ihgb_lock_data) - ipc_tree_entry_t ihgb_head; -} *ipc_hash_global_bucket_t; - -#define IHGB_NULL ((ipc_hash_global_bucket_t) 0) - -#define ihgb_lock_init(ihgb) lck_mtx_init(&(ihgb)->ihgb_lock_data, &ipc_lck_grp, &ipc_lck_attr) -#define ihgb_lock(ihgb) lck_mtx_lock(&(ihgb)->ihgb_lock_data) -#define ihgb_unlock(ihgb) lck_mtx_unlock(&(ihgb)->ihgb_lock_data) - -ipc_hash_global_bucket_t ipc_hash_global_table; - -/* - * Routine: ipc_hash_global_lookup - * Purpose: - * Converts (space, obj) -> (name, entry). - * Looks in the global table, for splay tree entries. - * Returns TRUE if an entry was found. - * Conditions: - * The space must be locked (read or write) throughout. - */ - -boolean_t -ipc_hash_global_lookup( - ipc_space_t space, - ipc_object_t obj, - mach_port_name_t *namep, - ipc_tree_entry_t *entryp) -{ - ipc_hash_global_bucket_t bucket; - ipc_tree_entry_t this, *last; - - assert(space != IS_NULL); - assert(obj != IO_NULL); - - assert(!is_fast_space(space)); - bucket = &ipc_hash_global_table[IH_GLOBAL_HASH(space, obj)]; - ihgb_lock(bucket); - - if ((this = bucket->ihgb_head) != ITE_NULL) { - if ((this->ite_object == obj) && - (this->ite_space == space)) { - /* found it at front; no need to move */ - - *namep = this->ite_name; - *entryp = this; - } else for (last = &this->ite_next; - (this = *last) != ITE_NULL; - last = &this->ite_next) { - if ((this->ite_object == obj) && - (this->ite_space == space)) { - /* found it; move to front */ - - *last = this->ite_next; - this->ite_next = bucket->ihgb_head; - bucket->ihgb_head = this; - - *namep = this->ite_name; - *entryp = this; - break; - } - } - } - - ihgb_unlock(bucket); - return this != ITE_NULL; -} - -/* - * Routine: ipc_hash_global_insert - * Purpose: - * Inserts an entry into the global reverse hash table. - * Conditions: - * The space must be write-locked. - */ - -void -ipc_hash_global_insert( - ipc_space_t space, - ipc_object_t obj, - __assert_only mach_port_name_t name, - ipc_tree_entry_t entry) -{ - ipc_hash_global_bucket_t bucket; - - assert(!is_fast_space(space)); - assert(entry->ite_name == name); - assert(space != IS_NULL); - assert(entry->ite_space == space); - assert(obj != IO_NULL); - assert(entry->ite_object == obj); - - space->is_tree_hash++; - assert(space->is_tree_hash <= space->is_tree_total); - - bucket = &ipc_hash_global_table[IH_GLOBAL_HASH(space, obj)]; - ihgb_lock(bucket); - - /* insert at front of bucket */ - - entry->ite_next = bucket->ihgb_head; - bucket->ihgb_head = entry; - - ihgb_unlock(bucket); -} - -/* - * Routine: ipc_hash_global_delete - * Purpose: - * Deletes an entry from the global reverse hash table. - * Conditions: - * The space must be write-locked. 
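Review note: the global table being deleted here used singly-linked buckets with move-to-front on every hit, so hot (space, object) pairs migrate to the bucket head where the next probe finds them immediately. For reference, the idiom in isolation; struct node and the single pointer key are invented, whereas the kernel matched on the (space, object) pair.

#include <stddef.h>

struct node {
    const void  *key;
    struct node *next;
};

/* Search a bucket chain for `key`; on a hit anywhere past the head,
 * unlink the node and reinsert it at the front (move-to-front). */
static struct node *bucket_lookup_mtf(struct node **head, const void *key)
{
    struct node *n = *head;

    if (n != NULL && n->key == key)
        return n;   /* already at the front; nothing to move */

    for (struct node **last = (n != NULL) ? &n->next : head;
         (n = *last) != NULL;
         last = &n->next) {
        if (n->key == key) {
            *last = n->next;   /* unlink */
            n->next = *head;   /* push on front */
            *head = n;
            return n;
        }
    }
    return NULL;
}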
- */ - -void -ipc_hash_global_delete( - ipc_space_t space, - ipc_object_t obj, - __assert_only mach_port_name_t name, - ipc_tree_entry_t entry) -{ - ipc_hash_global_bucket_t bucket; - ipc_tree_entry_t this, *last; - - assert(!is_fast_space(space)); - assert(entry->ite_name == name); - assert(space != IS_NULL); - assert(entry->ite_space == space); - assert(obj != IO_NULL); - assert(entry->ite_object == obj); - - assert(space->is_tree_hash > 0); - space->is_tree_hash--; - - bucket = &ipc_hash_global_table[IH_GLOBAL_HASH(space, obj)]; - ihgb_lock(bucket); - - for (last = &bucket->ihgb_head; - (this = *last) != ITE_NULL; - last = &this->ite_next) { - if (this == entry) { - /* found it; remove from bucket */ - - *last = this->ite_next; - break; - } - } - assert(this != ITE_NULL); - - ihgb_unlock(bucket); + ipc_hash_table_delete(space->is_table, space->is_table_size, obj, index, entry); } /* @@ -393,36 +186,30 @@ ipc_hash_global_delete( * So possibly a small win; probably nothing significant. */ -#define IH_LOCAL_HASH(obj, size) \ +#define IH_TABLE_HASH(obj, size) \ ((mach_port_index_t)((((uintptr_t) (obj)) >> 6) % (size))) /* - * Routine: ipc_hash_local_lookup + * Routine: ipc_hash_table_lookup * Purpose: - * Converts (space, obj) -> (name, entry). - * Looks in the space's local table, for table entries. - * Returns TRUE if an entry was found. + * Converts (table, obj) -> (name, entry). * Conditions: - * The space must be locked (read or write) throughout. + * Must have read consistency on the table. */ boolean_t -ipc_hash_local_lookup( - ipc_space_t space, +ipc_hash_table_lookup( + ipc_entry_t table, + ipc_entry_num_t size, ipc_object_t obj, mach_port_name_t *namep, ipc_entry_t *entryp) { - ipc_entry_t table; - ipc_entry_num_t size; mach_port_index_t hindex, index; - assert(space != IS_NULL); assert(obj != IO_NULL); - table = space->is_table; - size = space->is_table_size; - hindex = IH_LOCAL_HASH(obj, size); + hindex = IH_TABLE_HASH(obj, size); /* * Ideally, table[hindex].ie_index is the name we want. @@ -432,8 +219,10 @@ ipc_hash_local_lookup( */ while ((index = table[hindex].ie_index) != 0) { - ipc_entry_t entry = &table[index]; + ipc_entry_t entry; + assert(index < size); + entry = &table[index]; if (entry->ie_object == obj) { *entryp = entry; *namep = MACH_PORT_MAKE(index, @@ -449,7 +238,7 @@ ipc_hash_local_lookup( } /* - * Routine: ipc_hash_local_insert + * Routine: ipc_hash_table_insert * Purpose: * Inserts an entry into the space's reverse hash table. * Conditions: @@ -457,23 +246,19 @@ ipc_hash_local_lookup( */ void -ipc_hash_local_insert( - ipc_space_t space, +ipc_hash_table_insert( + ipc_entry_t table, + ipc_entry_num_t size, ipc_object_t obj, mach_port_index_t index, __assert_only ipc_entry_t entry) { - ipc_entry_t table; - ipc_entry_num_t size; mach_port_index_t hindex; assert(index != 0); - assert(space != IS_NULL); assert(obj != IO_NULL); - table = space->is_table; - size = space->is_table_size; - hindex = IH_LOCAL_HASH(obj, size); + hindex = IH_TABLE_HASH(obj, size); assert(entry == &table[index]); assert(entry->ie_object == obj); @@ -493,31 +278,27 @@ ipc_hash_local_insert( } /* - * Routine: ipc_hash_local_delete + * Routine: ipc_hash_table_delete * Purpose: - * Deletes an entry from the space's reverse hash table. + * Deletes an entry from the table's reverse hash. * Conditions: - * The space must be write-locked. + * Exclusive access to the table. 
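Review note: the surviving reverse hash lives inside the entry table itself. ie_index at probe slot h names the entry whose object hashed to h, zero means empty, and collisions step h forward with wrap-around (open addressing, linear probing). A freestanding sketch of the lookup; hash_obj only imitates the spirit of IH_TABLE_HASH, and like the kernel the scheme relies on the table never being completely full (entry 0 is reserved), so an empty probe slot always terminates the scan.

#include <stddef.h>
#include <stdint.h>

struct entry {
    const void *object;
    size_t      index;  /* reverse-hash slot: entry index, 0 = empty */
};

static size_t hash_obj(const void *obj, size_t size)
{
    /* drop the low alignment bits, then reduce modulo the table size */
    return (size_t)(((uintptr_t)obj >> 6) % size);
}

/* (table, obj) -> entry index, or 0 if absent. */
static size_t table_lookup(const struct entry *table, size_t size,
                           const void *obj)
{
    size_t h = hash_obj(obj, size);
    size_t index;

    while ((index = table[h].index) != 0) {
        if (table[index].object == obj)
            return index;        /* found the owning entry */
        if (++h == size)
            h = 0;               /* linear probe with wrap-around */
    }
    return 0;                    /* empty probe slot: not present */
}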
*/ void -ipc_hash_local_delete( - ipc_space_t space, +ipc_hash_table_delete( + ipc_entry_t table, + ipc_entry_num_t size, ipc_object_t obj, mach_port_index_t index, __assert_only ipc_entry_t entry) { - ipc_entry_t table; - ipc_entry_num_t size; mach_port_index_t hindex, dindex; assert(index != MACH_PORT_NULL); - assert(space != IS_NULL); assert(obj != IO_NULL); - table = space->is_table; - size = space->is_table_size; - hindex = IH_LOCAL_HASH(obj, size); + hindex = IH_TABLE_HASH(obj, size); assert(entry == &table[index]); assert(entry->ie_object == obj); @@ -571,7 +352,7 @@ ipc_hash_local_delete( tobj = table[index].ie_object; assert(tobj != IO_NULL); - tindex = IH_LOCAL_HASH(tobj, size); + tindex = IH_TABLE_HASH(tobj, size); if ((dindex < hindex) ? ((dindex < tindex) && (tindex <= hindex)) : @@ -583,112 +364,3 @@ ipc_hash_local_delete( } } -/* - * Routine: ipc_hash_init - * Purpose: - * Initialize the reverse hash table implementation. - */ - -void -ipc_hash_init(void) -{ - ipc_hash_index_t i; - - /* if not configured, initialize ipc_hash_global_size */ - - if (ipc_hash_global_size == 0) { - ipc_hash_global_size = ipc_tree_entry_max >> 8; - if (ipc_hash_global_size < 32) - ipc_hash_global_size = 32; - } - - /* make sure it is a power of two */ - - ipc_hash_global_mask = ipc_hash_global_size - 1; - if ((ipc_hash_global_size & ipc_hash_global_mask) != 0) { - natural_t bit; - - /* round up to closest power of two */ - - for (bit = 1;; bit <<= 1) { - ipc_hash_global_mask |= bit; - ipc_hash_global_size = ipc_hash_global_mask + 1; - - if ((ipc_hash_global_size & ipc_hash_global_mask) == 0) - break; - } - } - - /* allocate ipc_hash_global_table */ - - ipc_hash_global_table = (ipc_hash_global_bucket_t) - kalloc((vm_size_t) (ipc_hash_global_size * - sizeof(struct ipc_hash_global_bucket))); - assert(ipc_hash_global_table != IHGB_NULL); - - /* and initialize it */ - - for (i = 0; i < ipc_hash_global_size; i++) { - ipc_hash_global_bucket_t bucket; - - bucket = &ipc_hash_global_table[i]; - ihgb_lock_init(bucket); - bucket->ihgb_head = ITE_NULL; - } -} - -#if MACH_IPC_DEBUG - -/* - * Routine: ipc_hash_size - * Purpose: - * Return the size of the global reverse hash table. - */ -natural_t -ipc_hash_size(void) -{ - return ipc_hash_global_size; -} - -/* - * Routine: ipc_hash_info - * Purpose: - * Return information about the global reverse hash table. - * Fills the buffer with as much information as possible - * and returns the desired size of the buffer. - * Conditions: - * Nothing locked. The caller should provide - * possibly-pageable memory. 
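Review note: deleting from an open-addressed table cannot simply clear the probe slot, or everything that collided past it becomes unreachable; the delete path above walks the chain beyond the hole and pulls back any element whose home slot does not lie cyclically in (hole, current], which is the classic backward-shift deletion and exactly what the dindex/tindex/hindex test encodes. A sketch of the predicate and the shift; probe[] and home() model ie_index and IH_TABLE_HASH, and a never-full table is assumed.

#include <stdbool.h>
#include <stddef.h>

/* True iff x lies cyclically in the half-open interval (lo, hi]
 * of a table scanned with wrap-around. */
static bool cyclically_between(size_t x, size_t lo, size_t hi)
{
    return (lo < hi) ? (lo < x && x <= hi)
                     : (lo < x || x <= hi);
}

/* Backward-shift deletion over a probe sequence. probe[] holds the
 * entry index parked at each probe slot (0 = empty); home(e) is the
 * slot that entry e originally hashed to. */
static void probe_delete(size_t *probe, size_t size, size_t hole,
                         size_t (*home)(size_t entry))
{
    size_t cur = hole;

    for (;;) {
        if (++cur == size)
            cur = 0;                    /* wrap like the kernel scan */
        size_t e = probe[cur];
        if (e == 0) {                   /* chain ends: leave the hole */
            probe[hole] = 0;
            return;
        }
        /* An element whose home lies in (hole, cur] must stay put;
         * anything else can legally be pulled back into the hole. */
        if (cyclically_between(home(e), hole, cur))
            continue;
        probe[hole] = e;                /* fill the hole... */
        hole = cur;                     /* ...which moves to cur */
    }
}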
- */ - - -ipc_hash_index_t -ipc_hash_info( - hash_info_bucket_t *info, - natural_t count) -{ - ipc_hash_index_t i; - - if (ipc_hash_global_size < count) - count = ipc_hash_global_size; - - for (i = 0; i < count; i++) { - ipc_hash_global_bucket_t bucket = &ipc_hash_global_table[i]; - unsigned int bucket_count = 0; - ipc_tree_entry_t entry; - - ihgb_lock(bucket); - for (entry = bucket->ihgb_head; - entry != ITE_NULL; - entry = entry->ite_next) - bucket_count++; - ihgb_unlock(bucket); - - /* don't touch pageable memory while holding locks */ - info[i].hib_count = bucket_count; - } - - return ipc_hash_global_size; -} - -#endif /* MACH_IPC_DEBUG */ diff --git a/osfmk/ipc/ipc_hash.h b/osfmk/ipc/ipc_hash.h index 987e54013..b0249f0fb 100644 --- a/osfmk/ipc/ipc_hash.h +++ b/osfmk/ipc/ipc_hash.h @@ -98,40 +98,39 @@ extern void ipc_hash_delete( /* * For use by functions that know what they're doing: - * the global primitives, for splay tree entries, - * and the local primitives, for table entries. + * local primitives are for table entries. */ -/* Delete an entry from the global reverse hash table */ -extern void ipc_hash_global_delete( - ipc_space_t space, - ipc_object_t obj, - mach_port_name_t name, - ipc_tree_entry_t entry); - /* Lookup (space, obj) in local hash table */ -extern boolean_t ipc_hash_local_lookup( - ipc_space_t space, +extern boolean_t ipc_hash_table_lookup( + ipc_entry_t table, + ipc_entry_num_t size, ipc_object_t obj, mach_port_name_t *namep, ipc_entry_t *entryp); /* Inserts an entry into the local reverse hash table */ -extern void ipc_hash_local_insert( - ipc_space_t space, +extern void ipc_hash_table_insert( + ipc_entry_t table, + ipc_entry_num_t size, ipc_object_t obj, mach_port_index_t index, ipc_entry_t entry); -/* Initialize the reverse hash table implementation */ -extern void ipc_hash_init(void) __attribute__((section("__TEXT, initcode"))); +/* Delete an entry from the appropriate reverse hash table */ +extern void ipc_hash_table_delete( + ipc_entry_t table, + ipc_entry_num_t size, + ipc_object_t obj, + mach_port_name_t name, + ipc_entry_t entry); #include #if MACH_IPC_DEBUG #include -extern natural_t ipc_hash_size(void); + extern natural_t ipc_hash_info( hash_info_bucket_t *info, natural_t count); diff --git a/osfmk/ipc/ipc_init.c b/osfmk/ipc/ipc_init.c index cc0970e32..11dd9dafc 100644 --- a/osfmk/ipc/ipc_init.c +++ b/osfmk/ipc/ipc_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -74,6 +74,7 @@ #include #include +#include #include #include @@ -106,13 +107,20 @@ vm_map_t ipc_kernel_map; vm_size_t ipc_kernel_map_size = 1024 * 1024; +/* values to limit physical copy out-of-line memory descriptors */ vm_map_t ipc_kernel_copy_map; #define IPC_KERNEL_COPY_MAP_SIZE (8 * 1024 * 1024) vm_size_t ipc_kernel_copy_map_size = IPC_KERNEL_COPY_MAP_SIZE; -vm_size_t ipc_kmsg_max_vm_space = (IPC_KERNEL_COPY_MAP_SIZE * 7)/8; +vm_size_t ipc_kmsg_max_vm_space = ((IPC_KERNEL_COPY_MAP_SIZE * 7) / 8); + +/* + * values to limit inline message body handling + * avoid copyin/out limits - even after accounting for maximum descriptor expansion. 
+ */ +#define IPC_KMSG_MAX_SPACE (64 * 1024 * 1024) /* keep in sync with COPYSIZELIMIT_PANIC */ +vm_size_t ipc_kmsg_max_body_space = ((IPC_KMSG_MAX_SPACE * 3)/4 - MAX_TRAILER_SIZE); int ipc_space_max; -int ipc_tree_entry_max; int ipc_port_max; int ipc_pset_max; @@ -142,7 +150,6 @@ ipc_bootstrap(void) ipc_port_multiple_lock_init(); - ipc_port_timestamp_lock_init(); ipc_port_timestamp_data = 0; /* all IPC zones should be exhaustible */ @@ -153,13 +160,6 @@ ipc_bootstrap(void) "ipc spaces"); zone_change(ipc_space_zone, Z_NOENCRYPT, TRUE); - ipc_tree_entry_zone = - zinit(sizeof(struct ipc_tree_entry), - ipc_tree_entry_max * sizeof(struct ipc_tree_entry), - sizeof(struct ipc_tree_entry), - "ipc tree entries"); - zone_change(ipc_tree_entry_zone, Z_NOENCRYPT, TRUE); - /* * populate all port(set) zones */ @@ -217,7 +217,7 @@ ipc_bootstrap(void) #endif mig_init(); ipc_table_init(); - ipc_hash_init(); + semaphore_init(); lock_set_init(); mk_timer_init(); diff --git a/osfmk/ipc/ipc_init.h b/osfmk/ipc/ipc_init.h index b590d8a3e..36fd8976b 100644 --- a/osfmk/ipc/ipc_init.h +++ b/osfmk/ipc/ipc_init.h @@ -113,7 +113,6 @@ #define _IPC_IPC_INIT_H_ extern int ipc_space_max; -extern int ipc_tree_entry_max; extern int ipc_port_max; extern int ipc_pset_max; diff --git a/osfmk/ipc/ipc_kmsg.c b/osfmk/ipc/ipc_kmsg.c index 167d42145..f45d1deab 100644 --- a/osfmk/ipc/ipc_kmsg.c +++ b/osfmk/ipc/ipc_kmsg.c @@ -484,7 +484,9 @@ ipc_msg_print_untyped64( #endif /* !DEBUG_MSGS_K64 */ extern vm_map_t ipc_kernel_copy_map; +extern vm_size_t ipc_kmsg_max_space; extern vm_size_t ipc_kmsg_max_vm_space; +extern vm_size_t ipc_kmsg_max_body_space; extern vm_size_t msg_ool_size_small; #define MSG_OOL_SIZE_SMALL msg_ool_size_small @@ -591,10 +593,17 @@ ipc_kmsg_alloc( * data backwards. */ mach_msg_size_t size = msg_and_trailer_size - MAX_TRAILER_SIZE; + + /* compare against implementation upper limit for the body */ + if (size > ipc_kmsg_max_body_space) + return IKM_NULL; + if (size > sizeof(mach_msg_base_t)) { mach_msg_size_t max_desc = (mach_msg_size_t)(((size - sizeof(mach_msg_base_t)) / sizeof(mach_msg_ool_descriptor32_t)) * DESC_SIZE_ADJUSTMENT); + + /* make sure expansion won't cause wrap */ if (msg_and_trailer_size > MACH_MSG_SIZE_MAX - max_desc) return IKM_NULL; @@ -602,9 +611,7 @@ ipc_kmsg_alloc( } else max_expanded_size = msg_and_trailer_size; - if (max_expanded_size > ikm_less_overhead(MACH_MSG_SIZE_MAX)) - return IKM_NULL; - else if (max_expanded_size < IKM_SAVED_MSG_SIZE) + if (max_expanded_size < IKM_SAVED_MSG_SIZE) max_expanded_size = IKM_SAVED_MSG_SIZE; /* round up for ikm_cache */ if (max_expanded_size == IKM_SAVED_MSG_SIZE) { @@ -674,9 +681,11 @@ ipc_kmsg_free( if (ip_active(port) && (port->ip_premsg == kmsg)) { assert(IP_PREALLOC(port)); ip_unlock(port); + ip_release(port); return; } - ip_check_unlock(port); /* May be last reference */ + ip_unlock(port); + ip_release(port); /* May be last reference */ } /* @@ -1104,6 +1113,7 @@ ipc_kmsg_prealloc(mach_msg_size_t size) * MACH_MSG_SUCCESS Acquired a message buffer. * MACH_SEND_MSG_TOO_SMALL Message smaller than a header. * MACH_SEND_MSG_TOO_SMALL Message size not long-word multiple. + * MACH_SEND_TOO_LARGE Message too large to ever be sent. * MACH_SEND_NO_BUFFER Couldn't allocate a message buffer. * MACH_SEND_INVALID_DATA Couldn't copy message data. 
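Review note: both new checks in ipc_kmsg_alloc above defend against arithmetic wrap. The body is capped against a hard ceiling before any expansion math runs, and the worst-case descriptor expansion is compared by subtracting it from the ceiling instead of adding it to the size. The pattern in isolation, with invented limits (the real MACH_MSG_SIZE_MAX, descriptor sizes, and DESC_SIZE_ADJUSTMENT differ):

#include <stdbool.h>
#include <stdint.h>

#define SIZE_CEILING ((uint32_t)64u * 1024u * 1024u)  /* invented cap */
#define DESC_GROWTH  8u   /* invented worst-case growth per descriptor */
#define DESC_STRIDE  12u  /* invented bytes per 32-bit descriptor */

/* Decide whether a message of `size` bytes can be accepted, allowing
 * for every possible descriptor to expand during translation. */
static bool msg_size_acceptable(uint32_t size, uint32_t header_size)
{
    if (size > SIZE_CEILING)
        return false;             /* cap before doing any math */

    if (size <= header_size)
        return true;              /* no body, nothing can expand */

    uint32_t max_desc = ((size - header_size) / DESC_STRIDE) * DESC_GROWTH;

    /* "size + max_desc > ceiling" rewritten so it cannot overflow
     * (safe because DESC_GROWTH < DESC_STRIDE keeps max_desc < size) */
    return size <= SIZE_CEILING - max_desc;
}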
*/ @@ -1124,7 +1134,7 @@ ipc_kmsg_get( if ((size < sizeof(mach_msg_legacy_header_t)) || (size & 3)) return MACH_SEND_MSG_TOO_SMALL; - if (size > MACH_MSG_SIZE_MAX - MAX_TRAILER_SIZE) + if (size > ipc_kmsg_max_body_space) return MACH_SEND_TOO_LARGE; if(size == sizeof(mach_msg_legacy_header_t)) @@ -1250,7 +1260,7 @@ ipc_kmsg_get_from_kernel( ipc_port_t dest_port; assert(size >= sizeof(mach_msg_header_t)); -// assert((size & 3) == 0); + assert((size & 3) == 0); dest_port = (ipc_port_t)msg->msgh_remote_port; @@ -1395,9 +1405,8 @@ ipc_kmsg_send( * in an infinite loop trying to deliver * a send-once notification. */ - + ip_unlock(port); ip_release(port); - ip_check_unlock(port); kmsg->ikm_header->msgh_remote_port = MACH_PORT_NULL; ipc_kmsg_destroy(kmsg); return MACH_MSG_SUCCESS; @@ -1575,8 +1584,15 @@ ipc_kmsg_copyin_header( mach_msg_type_name_t dest_type = MACH_MSGH_BITS_REMOTE(mbits); mach_msg_type_name_t reply_type = MACH_MSGH_BITS_LOCAL(mbits); ipc_object_t dest_port, reply_port; - ipc_port_t dest_soright, reply_soright; ipc_entry_t dest_entry, reply_entry; + ipc_port_t dest_soright, reply_soright; + ipc_port_t release_port = IP_NULL; + + queue_head_t links_data; + queue_t links = &links_data; + wait_queue_link_t wql; + + queue_init(links); if ((mbits != msg->msgh_bits) || (!MACH_MSG_TYPE_PORT_ANY_SEND(dest_type)) || @@ -1588,7 +1604,7 @@ ipc_kmsg_copyin_header( reply_soright = IP_NULL; /* in case we go to invalid dest early */ is_write_lock(space); - if (!space->is_active) + if (!is_active(space)) goto invalid_dest; if (!MACH_PORT_VALID(dest_name)) @@ -1665,7 +1681,9 @@ ipc_kmsg_copyin_header( (reply_type == MACH_MSG_TYPE_MAKE_SEND_ONCE)) { kr = ipc_right_copyin(space, name, dest_entry, dest_type, FALSE, - &dest_port, &dest_soright); + &dest_port, &dest_soright, + &release_port, + links); if (kr != KERN_SUCCESS) goto invalid_dest; @@ -1684,7 +1702,9 @@ ipc_kmsg_copyin_header( kr = ipc_right_copyin(space, name, reply_entry, reply_type, TRUE, - &reply_port, &reply_soright); + &reply_port, &reply_soright, + &release_port, + links); assert(kr == KERN_SUCCESS); assert(reply_port == dest_port); @@ -1699,7 +1719,9 @@ ipc_kmsg_copyin_header( kr = ipc_right_copyin(space, name, dest_entry, dest_type, FALSE, - &dest_port, &dest_soright); + &dest_port, &dest_soright, + &release_port, + links); if (kr != KERN_SUCCESS) goto invalid_dest; @@ -1724,7 +1746,8 @@ ipc_kmsg_copyin_header( */ kr = ipc_right_copyin_two(space, name, dest_entry, - &dest_port, &dest_soright); + &dest_port, &dest_soright, + &release_port); if (kr != KERN_SUCCESS) goto invalid_dest; @@ -1751,7 +1774,9 @@ ipc_kmsg_copyin_header( kr = ipc_right_copyin(space, name, dest_entry, MACH_MSG_TYPE_MOVE_SEND, FALSE, - &dest_port, &soright); + &dest_port, &soright, + &release_port, + links); if (kr != KERN_SUCCESS) goto invalid_dest; @@ -1791,7 +1816,9 @@ ipc_kmsg_copyin_header( kr = ipc_right_copyin(space, dest_name, dest_entry, dest_type, FALSE, - &dest_port, &dest_soright); + &dest_port, &dest_soright, + &release_port, + links); if (kr != KERN_SUCCESS) goto invalid_dest; @@ -1856,7 +1883,9 @@ ipc_kmsg_copyin_header( kr = ipc_right_copyin(space, dest_name, dest_entry, dest_type, FALSE, - &dest_port, &dest_soright); + &dest_port, &dest_soright, + &release_port, + links); if (kr != KERN_SUCCESS) goto invalid_dest; @@ -1864,8 +1893,9 @@ ipc_kmsg_copyin_header( kr = ipc_right_copyin(space, reply_name, reply_entry, reply_type, TRUE, - &reply_port, &reply_soright); - + &reply_port, &reply_soright, + &release_port, + links); assert(kr == 
KERN_SUCCESS); /* the entries might need to be deallocated */ @@ -1915,16 +1945,43 @@ ipc_kmsg_copyin_header( msg->msgh_remote_port = (ipc_port_t)dest_port; msg->msgh_local_port = (ipc_port_t)reply_port; + while(!queue_empty(links)) { + wql = (wait_queue_link_t) dequeue(links); + wait_queue_link_free(wql); + } + + if (release_port != IP_NULL) + ip_release(release_port); + return MACH_MSG_SUCCESS; invalid_reply: is_write_unlock(space); + + while(!queue_empty(links)) { + wql = (wait_queue_link_t) dequeue(links); + wait_queue_link_free(wql); + } + + if (release_port != IP_NULL) + ip_release(release_port); + return MACH_SEND_INVALID_REPLY; invalid_dest: is_write_unlock(space); + + while(!queue_empty(links)) { + wql = (wait_queue_link_t) dequeue(links); + wait_queue_link_free(wql); + } + + if (release_port != IP_NULL) + ip_release(release_port); + if (reply_soright != IP_NULL) ipc_notify_port_deleted(reply_soright, reply_name); + return MACH_SEND_INVALID_DEST; } @@ -2872,6 +2929,7 @@ ipc_kmsg_copyout_header( mach_msg_type_name_t dest_type = MACH_MSGH_BITS_REMOTE(mbits); mach_msg_type_name_t reply_type = MACH_MSGH_BITS_LOCAL(mbits); ipc_port_t reply = (ipc_port_t) msg->msgh_local_port; + ipc_port_t release_port = IP_NULL; mach_port_name_t dest_name, reply_name; if (IP_VALID(reply)) { @@ -2886,7 +2944,7 @@ ipc_kmsg_copyout_header( is_write_lock(space); for (;;) { - if (!space->is_active) { + if (!is_active(space)) { is_write_unlock(space); return (MACH_RCV_HEADER_ERROR| MACH_MSG_IPC_SPACE); @@ -2903,12 +2961,11 @@ ipc_kmsg_copyout_header( ip_lock(reply); if (!ip_active(reply)) { - ip_release(reply); - ip_check_unlock(reply); - + ip_unlock(reply); ip_lock(dest); is_write_unlock(space); + release_port = reply; reply = IP_DEAD; reply_name = MACH_PORT_DEAD; goto copyout_dest; @@ -2957,7 +3014,7 @@ ipc_kmsg_copyout_header( */ is_read_lock(space); - if (!space->is_active) { + if (!is_active(space)) { is_read_unlock(space); return MACH_RCV_HEADER_ERROR|MACH_MSG_IPC_SPACE; } @@ -3025,8 +3082,8 @@ ipc_kmsg_copyout_header( ipc_port_timestamp_t timestamp; timestamp = dest->ip_timestamp; + ip_unlock(dest); ip_release(dest); - ip_check_unlock(dest); if (IP_VALID(reply)) { ip_lock(reply); @@ -3042,7 +3099,10 @@ ipc_kmsg_copyout_header( } if (IP_VALID(reply)) - ipc_port_release(reply); + ip_release(reply); + + if (IP_VALID(release_port)) + ip_release(release_port); msg->msgh_bits = (MACH_MSGH_BITS_OTHER(mbits) | MACH_MSGH_BITS(reply_type, dest_type)); @@ -3153,7 +3213,7 @@ mach_msg_descriptor_t * ipc_kmsg_copyout_ool_descriptor(mach_msg_ool_descriptor_t *dsc, mach_msg_descriptor_t *user_dsc, int is_64bit, vm_map_t map, mach_msg_return_t *mr) { vm_map_copy_t copy; - mach_vm_offset_t rcv_addr; + vm_map_address_t rcv_addr; mach_msg_copy_options_t copy_options; mach_msg_size_t size; mach_msg_descriptor_type_t dsc_type; @@ -3693,8 +3753,8 @@ ipc_kmsg_copyout_dest( ipc_object_copyout_dest(space, dest, dest_type, &dest_name); /* dest is unlocked */ } else { + io_unlock(dest); io_release(dest); - io_check_unlock(dest); dest_name = MACH_PORT_DEAD; } @@ -3911,8 +3971,8 @@ ipc_kmsg_copyout_to_kernel( ipc_object_copyout_dest(space, dest, dest_type, &dest_name); /* dest is unlocked */ } else { + io_unlock(dest); io_release(dest); - io_check_unlock(dest); dest_name = MACH_PORT_DEAD; } @@ -3949,8 +4009,8 @@ ipc_kmsg_copyout_to_kernel_legacy( ipc_object_copyout_dest(space, dest, dest_type, &dest_name); /* dest is unlocked */ } else { + io_unlock(dest); io_release(dest); - io_check_unlock(dest); dest_name = MACH_PORT_DEAD; } @@ 
-4044,272 +4104,81 @@ ipc_kmsg_copyout_to_kernel_legacy( } #endif /* IKM_SUPPORT_LEGACY */ - -#include -#if MACH_KDB - -#include -#include -/* - * Forward declarations - */ -void ipc_msg_print_untyped( - mach_msg_body_t *body); - -const char * ipc_type_name( - int type_name, - boolean_t received); - -const char * -msgh_bit_decode( - mach_msg_bits_t bit); - -const char * -mm_copy_options_string( - mach_msg_copy_options_t option); - -void db_print_msg_uid(mach_msg_header_t *); - - -const char * -ipc_type_name( - int type_name, - boolean_t received) -{ - switch (type_name) { - case MACH_MSG_TYPE_PORT_NAME: - return "port_name"; - - case MACH_MSG_TYPE_MOVE_RECEIVE: - if (received) { - return "port_receive"; - } else { - return "move_receive"; - } - - case MACH_MSG_TYPE_MOVE_SEND: - if (received) { - return "port_send"; - } else { - return "move_send"; - } - - case MACH_MSG_TYPE_MOVE_SEND_ONCE: - if (received) { - return "port_send_once"; - } else { - return "move_send_once"; - } - - case MACH_MSG_TYPE_COPY_SEND: - return "copy_send"; - - case MACH_MSG_TYPE_MAKE_SEND: - return "make_send"; - - case MACH_MSG_TYPE_MAKE_SEND_ONCE: - return "make_send_once"; - - default: - return (char *) 0; - } -} - -void -ipc_print_type_name( - int type_name) -{ - const char *name = ipc_type_name(type_name, TRUE); - if (name) { - printf("%s", name); - } else { - printf("type%d", type_name); - } -} - -/* - * ipc_kmsg_print [ debug ] - */ -void -ipc_kmsg_print( - ipc_kmsg_t kmsg) -{ - iprintf("kmsg=0x%x\n", kmsg); - iprintf("ikm_next=0x%x, prev=0x%x, size=%d", - kmsg->ikm_next, - kmsg->ikm_prev, - kmsg->ikm_size); - printf("\n"); - ipc_msg_print(kmsg->ikm_header); -} - -const char * -msgh_bit_decode( - mach_msg_bits_t bit) -{ - switch (bit) { - case MACH_MSGH_BITS_COMPLEX: return "complex"; - case MACH_MSGH_BITS_CIRCULAR: return "circular"; - default: return (char *) 0; - } -} - -/* - * ipc_msg_print [ debug ] - */ -void -ipc_msg_print( - mach_msg_header_t *msgh) +mach_msg_trailer_size_t +ipc_kmsg_add_trailer(ipc_kmsg_t kmsg, ipc_space_t space, + mach_msg_option_t option, thread_t thread, + mach_port_seqno_t seqno, boolean_t minimal_trailer, + mach_vm_offset_t context) { - mach_msg_bits_t mbits; - unsigned int bit, i; - const char *bit_name; - int needs_comma; + mach_msg_max_trailer_t *trailer; - mbits = msgh->msgh_bits; - iprintf("msgh_bits=0x%x: l=0x%x,r=0x%x\n", - mbits, - MACH_MSGH_BITS_LOCAL(msgh->msgh_bits), - MACH_MSGH_BITS_REMOTE(msgh->msgh_bits)); + (void)thread; + trailer = (mach_msg_max_trailer_t *) + ((vm_offset_t)kmsg->ikm_header + + round_msg(kmsg->ikm_header->msgh_size)); - mbits = MACH_MSGH_BITS_OTHER(mbits) & MACH_MSGH_BITS_USED; - db_indent += 2; - if (mbits) - iprintf("decoded bits: "); - needs_comma = 0; - for (i = 0, bit = 1; i < sizeof(mbits) * 8; ++i, bit <<= 1) { - if ((mbits & bit) == 0) - continue; - bit_name = msgh_bit_decode((mach_msg_bits_t)bit); - if (bit_name) - printf("%s%s", needs_comma ? "," : "", bit_name); - else - printf("%sunknown(0x%x),", needs_comma ? "," : "", bit); - ++needs_comma; - } - if (msgh->msgh_bits & ~MACH_MSGH_BITS_USED) { - printf("%sunused=0x%x,", needs_comma ? 
"," : "", - msgh->msgh_bits & ~MACH_MSGH_BITS_USED); + if (!(option & MACH_RCV_TRAILER_MASK)) { + return trailer->msgh_trailer_size; } - printf("\n"); - db_indent -= 2; - needs_comma = 1; - if (msgh->msgh_remote_port) { - iprintf("remote=0x%x(", msgh->msgh_remote_port); - ipc_print_type_name(MACH_MSGH_BITS_REMOTE(msgh->msgh_bits)); - printf(")"); - } else { - iprintf("remote=null"); - } + trailer->msgh_seqno = seqno; + trailer->msgh_context = context; + trailer->msgh_trailer_size = REQUESTED_TRAILER_SIZE(thread_is_64bit(thread), option); - if (msgh->msgh_local_port) { - printf("%slocal=%p(", needs_comma ? "," : "", - msgh->msgh_local_port); - ipc_print_type_name(MACH_MSGH_BITS_LOCAL(msgh->msgh_bits)); - printf(")\n"); - } else { - printf("local=null\n"); + if (minimal_trailer) { + goto done; } - iprintf("msgh_id=%d, size=%d\n", - msgh->msgh_id, - msgh->msgh_size); - - if (mbits & MACH_MSGH_BITS_COMPLEX) { - ipc_msg_print_untyped((mach_msg_body_t *) (msgh + 1)); + if (MACH_RCV_TRAILER_ELEMENTS(option) >= + MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_AV)){ +#if CONFIG_MACF_MACH + if (kmsg->ikm_sender != NULL && + IP_VALID(kmsg->ikm_header->msgh_remote_port) && + mac_port_check_method(kmsg->ikm_sender, + &kmsg->ikm_sender->maclabel, + &kmsg->ikm_header->msgh_remote_port->ip_label, + kmsg->ikm_header->msgh_id) == 0) + trailer->msgh_ad = 1; + else +#endif + trailer->msgh_ad = 0; } -} + /* + * The ipc_kmsg_t holds a reference to the label of a label + * handle, not the port. We must get a reference to the port + * and a send right to copyout to the receiver. + */ -const char * -mm_copy_options_string( - mach_msg_copy_options_t option) -{ - const char *name; - - switch (option) { - case MACH_MSG_PHYSICAL_COPY: - name = "PHYSICAL"; - break; - case MACH_MSG_VIRTUAL_COPY: - name = "VIRTUAL"; - break; - case MACH_MSG_OVERWRITE: - name = "OVERWRITE"; - break; - case MACH_MSG_ALLOCATE: - name = "ALLOCATE"; - break; - case MACH_MSG_KALLOC_COPY_T: - name = "KALLOC_COPY_T"; - break; - default: - name = "unknown"; - break; + if (option & MACH_RCV_TRAILER_ELEMENTS (MACH_RCV_TRAILER_LABELS)) { +#if CONFIG_MACF_MACH + if (kmsg->ikm_sender != NULL) { + ipc_labelh_t lh = kmsg->ikm_sender->label; + kern_return_t kr; + + ip_lock(lh->lh_port); + lh->lh_port->ip_mscount++; + lh->lh_port->ip_srights++; + ip_reference(lh->lh_port); + ip_unlock(lh->lh_port); + + kr = ipc_object_copyout(space, (ipc_object_t)lh->lh_port, + MACH_MSG_TYPE_PORT_SEND, 0, + &trailer->msgh_labels.sender); + if (kr != KERN_SUCCESS) { + ip_release(lh->lh_port); + trailer->msgh_labels.sender = 0; + } + } else { + trailer->msgh_labels.sender = 0; + } +#else + (void)space; + trailer->msgh_labels.sender = 0; +#endif } - return name; -} -void -ipc_msg_print_untyped( - mach_msg_body_t *body) -{ - mach_msg_descriptor_t *saddr, *send; - mach_msg_descriptor_type_t type; - - iprintf("%d descriptors %d: \n", body->msgh_descriptor_count); - saddr = (mach_msg_descriptor_t *) (body + 1); - send = saddr + body->msgh_descriptor_count; - - for ( ; saddr < send; saddr++ ) { - - type = saddr->type.type; - - switch (type) { - - case MACH_MSG_PORT_DESCRIPTOR: { - mach_msg_port_descriptor_t *dsc; - - dsc = &saddr->port; - iprintf("-- PORT name = 0x%x disp = ", dsc->name); - ipc_print_type_name(dsc->disposition); - printf("\n"); - break; - } - case MACH_MSG_OOL_VOLATILE_DESCRIPTOR: - case MACH_MSG_OOL_DESCRIPTOR: { - mach_msg_ool_descriptor_t *dsc; - - dsc = &saddr->out_of_line; - iprintf("-- OOL%s addr = 0x%x size = 0x%x copy = %s %s\n", - type == 
MACH_MSG_OOL_DESCRIPTOR ? "" : " VOLATILE", - dsc->address, dsc->size, - mm_copy_options_string(dsc->copy), - dsc->deallocate ? "DEALLOC" : ""); - break; - } - case MACH_MSG_OOL_PORTS_DESCRIPTOR : { - mach_msg_ool_ports_descriptor_t *dsc; - - dsc = &saddr->ool_ports; - - iprintf("-- OOL_PORTS addr = 0x%x count = 0x%x ", - dsc->address, dsc->count); - printf("disp = "); - ipc_print_type_name(dsc->disposition); - printf(" copy = %s %s\n", - mm_copy_options_string(dsc->copy), - dsc->deallocate ? "DEALLOC" : ""); - break; - } - - default: { - iprintf("-- UNKNOWN DESCRIPTOR 0x%x\n", type); - break; - } - } - } +done: + return trailer->msgh_trailer_size; } -#endif /* MACH_KDB */ diff --git a/osfmk/ipc/ipc_kmsg.h b/osfmk/ipc/ipc_kmsg.h index 6fb07b6dd..cb3579737 100644 --- a/osfmk/ipc/ipc_kmsg.h +++ b/osfmk/ipc/ipc_kmsg.h @@ -144,7 +144,6 @@ MACRO_END #define ikm_prealloc_clear_inuse(kmsg, port) \ MACRO_BEGIN \ (kmsg)->ikm_prealloc = IP_NULL; \ - ip_release(port); \ MACRO_END #define ikm_init(kmsg, size) \ @@ -408,5 +407,12 @@ extern void ipc_kmsg_free_scatter( mach_msg_body_t *slist, mach_msg_size_t slist_size); + +extern mach_msg_trailer_size_t +ipc_kmsg_add_trailer(ipc_kmsg_t kmsg, ipc_space_t space, + mach_msg_option_t option, thread_t thread, + mach_port_seqno_t seqno, boolean_t minimal_trailer, + mach_vm_offset_t context); + #endif /* _IPC_IPC_KMSG_H_ */ diff --git a/osfmk/ipc/ipc_labelh.c b/osfmk/ipc/ipc_labelh.c index 934eaf7e4..9b14d07d5 100644 --- a/osfmk/ipc/ipc_labelh.c +++ b/osfmk/ipc/ipc_labelh.c @@ -234,6 +234,7 @@ labelh_destroy(ipc_port_t port) ipc_labelh_t lh = (ipc_labelh_t) port->ip_kobject; mac_task_label_destroy(&lh->lh_label); + lh_lock_destroy(lh); zfree(ipc_labelh_zone, (vm_offset_t)lh); } #else diff --git a/osfmk/ipc/ipc_labelh.h b/osfmk/ipc/ipc_labelh.h index 5eba16ca3..7126ff162 100644 --- a/osfmk/ipc/ipc_labelh.h +++ b/osfmk/ipc/ipc_labelh.h @@ -89,6 +89,7 @@ MACRO_END extern zone_t ipc_labelh_zone; #define lh_lock_init(lh) lck_mtx_init(&(lh)->lh_lock_data, &ipc_lck_grp, &ipc_lck_attr) +#define lh_lock_destroy(lh) lck_mtx_destroy(&(lh)->lh_lock_data, &ipc_lck_grp) #define lh_lock(lh) lck_mtx_lock(&(lh)->lh_lock_data) #define lh_unlock(lh) lck_mtx_unlock(&(lh)->lh_lock_data) diff --git a/osfmk/ipc/ipc_mqueue.c b/osfmk/ipc/ipc_mqueue.c index 406b5ae93..569c6fb0b 100644 --- a/osfmk/ipc/ipc_mqueue.c +++ b/osfmk/ipc/ipc_mqueue.c @@ -159,13 +159,14 @@ ipc_mqueue_member( kern_return_t ipc_mqueue_remove( - ipc_mqueue_t mqueue, - ipc_mqueue_t set_mqueue) + ipc_mqueue_t mqueue, + ipc_mqueue_t set_mqueue, + wait_queue_link_t *wqlp) { wait_queue_t mq_waitq = &mqueue->imq_wait_queue; wait_queue_set_t set_waitq = &set_mqueue->imq_set_queue; - return wait_queue_unlink(mq_waitq, set_waitq); + return wait_queue_unlink_nofree(mq_waitq, set_waitq, wqlp); } /* @@ -177,11 +178,12 @@ ipc_mqueue_remove( */ void ipc_mqueue_remove_from_all( - ipc_mqueue_t mqueue) + ipc_mqueue_t mqueue, + queue_t links) { wait_queue_t mq_waitq = &mqueue->imq_wait_queue; - wait_queue_unlink_all(mq_waitq); + wait_queue_unlink_all_nofree(mq_waitq, links); return; } @@ -194,11 +196,12 @@ ipc_mqueue_remove_from_all( */ void ipc_mqueue_remove_all( - ipc_mqueue_t mqueue) + ipc_mqueue_t mqueue, + queue_t links) { wait_queue_set_t mq_setq = &mqueue->imq_set_queue; - wait_queue_set_unlink_all(mq_setq); + wait_queue_set_unlink_all_nofree(mq_setq, links); return; } @@ -217,7 +220,8 @@ ipc_mqueue_remove_all( kern_return_t ipc_mqueue_add( ipc_mqueue_t port_mqueue, - ipc_mqueue_t set_mqueue) + ipc_mqueue_t set_mqueue, + 
wait_queue_link_t wql) { wait_queue_t port_waitq = &port_mqueue->imq_wait_queue; wait_queue_set_t set_waitq = &set_mqueue->imq_set_queue; @@ -226,7 +230,7 @@ ipc_mqueue_add( kern_return_t kr; spl_t s; - kr = wait_queue_link(port_waitq, set_waitq); + kr = wait_queue_link_noalloc(port_waitq, set_waitq, wql); if (kr != KERN_SUCCESS) return kr; @@ -278,7 +282,7 @@ ipc_mqueue_add( */ msize = ipc_kmsg_copyout_size(kmsg, th->map); if (th->ith_msize < - (msize + REQUESTED_TRAILER_SIZE(th->ith_option))) { + (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit(th), th->ith_option))) { th->ith_state = MACH_RCV_TOO_LARGE; th->ith_msize = msize; if (th->ith_option & MACH_RCV_LARGE) { @@ -539,7 +543,7 @@ ipc_mqueue_post( */ msize = ipc_kmsg_copyout_size(kmsg, receiver->map); if (receiver->ith_msize < - (msize + REQUESTED_TRAILER_SIZE(receiver->ith_option))) { + (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit(receiver), receiver->ith_option))) { receiver->ith_msize = msize; receiver->ith_state = MACH_RCV_TOO_LARGE; } else { @@ -917,7 +921,7 @@ ipc_mqueue_select_on_thread( * (and size needed). */ rcv_size = ipc_kmsg_copyout_size(kmsg, thread->map); - if (rcv_size + REQUESTED_TRAILER_SIZE(option) > max_size) { + if (rcv_size + REQUESTED_TRAILER_SIZE(thread_is_64bit(thread), option) > max_size) { mr = MACH_RCV_TOO_LARGE; if (option & MACH_RCV_LARGE) { thread->ith_receiver_name = mqueue->imq_receiver_name; @@ -1136,7 +1140,7 @@ ipc_mqueue_copyin( ipc_mqueue_t mqueue; is_read_lock(space); - if (!space->is_active) { + if (!is_active(space)) { is_read_unlock(space); return MACH_RCV_INVALID_NAME; } diff --git a/osfmk/ipc/ipc_mqueue.h b/osfmk/ipc/ipc_mqueue.h index c8a3f7a2e..f452f7079 100644 --- a/osfmk/ipc/ipc_mqueue.h +++ b/osfmk/ipc/ipc_mqueue.h @@ -150,7 +150,8 @@ extern void ipc_mqueue_changed( /* Add the specific mqueue as a member of the set */ extern kern_return_t ipc_mqueue_add( ipc_mqueue_t mqueue, - ipc_mqueue_t set_mqueue); + ipc_mqueue_t set_mqueue, + wait_queue_link_t wql); /* Check to see if mqueue is member of set_mqueue */ extern boolean_t ipc_mqueue_member( @@ -160,15 +161,18 @@ extern boolean_t ipc_mqueue_member( /* Remove an mqueue from a specific set */ extern kern_return_t ipc_mqueue_remove( ipc_mqueue_t mqueue, - ipc_mqueue_t set_mqueue); + ipc_mqueue_t set_mqueue, + wait_queue_link_t *wqlp); /* Remove an mqueue from all sets */ extern void ipc_mqueue_remove_from_all( - ipc_mqueue_t mqueue); + ipc_mqueue_t mqueue, + queue_t links); /* Remove all the members of the specifiied set */ extern void ipc_mqueue_remove_all( - ipc_mqueue_t mqueue); + ipc_mqueue_t mqueue, + queue_t links); /* Send a message to a port */ extern mach_msg_return_t ipc_mqueue_send( diff --git a/osfmk/ipc/ipc_object.c b/osfmk/ipc/ipc_object.c index 176e80ec8..35f9224f0 100644 --- a/osfmk/ipc/ipc_object.c +++ b/osfmk/ipc/ipc_object.c @@ -108,10 +108,7 @@ void ipc_object_reference( ipc_object_t object) { - io_lock(object); - assert(object->io_references > 0); io_reference(object); - io_unlock(object); } /* @@ -124,10 +121,7 @@ void ipc_object_release( ipc_object_t object) { - io_lock(object); - assert(object->io_references > 0); io_release(object); - io_check_unlock(object); } /* @@ -263,7 +257,7 @@ ipc_object_alloc_dead( assert(entry->ie_object == IO_NULL); entry->ie_bits |= MACH_PORT_TYPE_DEAD_NAME | 1; - + ipc_entry_modified(space, *namep, entry); is_write_unlock(space); return KERN_SUCCESS; } @@ -301,7 +295,7 @@ ipc_object_alloc_dead_name( assert(entry->ie_object == IO_NULL); entry->ie_bits |= MACH_PORT_TYPE_DEAD_NAME | 1; 
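Review note: the _nofree wait-queue variants threaded through these hunks share one discipline: no allocation or free while the space or a port is locked. Callers hand in pre-allocated link objects, and every unlink pushes the detached links onto a caller-supplied queue to be freed after the unlock. A stripped-down model of that discipline; struct link and struct linkq are invented stand-ins for wait_queue_link_t and the kernel's queue_head_t.

#include <stdlib.h>

struct link {
    struct link *next;
};

struct linkq { struct link *head; };

static void linkq_push(struct linkq *q, struct link *l)
{
    l->next = q->head;
    q->head = l;
}

/* Under the lock: detach whatever must go, but only collect it. */
static void unlink_all_nofree(struct linkq *members, struct linkq *freeq)
{
    struct link *l;
    while ((l = members->head) != NULL) {
        members->head = l->next;
        linkq_push(freeq, l);   /* no free() here: the "lock" is held */
    }
}

static void example(struct linkq *members)
{
    struct linkq freeq = { NULL };

    /* lock(); */
    unlink_all_nofree(members, &freeq);
    /* unlock(); */

    /* Only now, with no locks held, release the memory. */
    struct link *l;
    while ((l = freeq.head) != NULL) {
        freeq.head = l->next;
        free(l);
    }
}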
- + ipc_entry_modified(space, name, entry); is_write_unlock(space); return KERN_SUCCESS; } @@ -366,6 +360,7 @@ ipc_object_alloc( entry->ie_bits |= type | urefs; entry->ie_object = object; + ipc_entry_modified(space, *namep, entry); io_lock(object); is_write_unlock(space); @@ -441,6 +436,7 @@ ipc_object_alloc_name( entry->ie_bits |= type | urefs; entry->ie_object = object; + ipc_entry_modified(space, name, entry); io_lock(object); is_write_unlock(space); @@ -506,7 +502,13 @@ ipc_object_copyin( { ipc_entry_t entry; ipc_port_t soright; + ipc_port_t release_port; kern_return_t kr; + queue_head_t links_data; + queue_t links = &links_data; + wait_queue_link_t wql; + + queue_init(links); /* * Could first try a read lock when doing @@ -519,13 +521,24 @@ ipc_object_copyin( return kr; /* space is write-locked and active */ + release_port = IP_NULL; kr = ipc_right_copyin(space, name, entry, msgt_name, TRUE, - objectp, &soright); + objectp, &soright, + &release_port, + links); if (IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE) ipc_entry_dealloc(space, name, entry); is_write_unlock(space); + while(!queue_empty(links)) { + wql = (wait_queue_link_t) dequeue(links); + wait_queue_link_free(wql); + } + + if (release_port != IP_NULL) + ip_release(release_port); + if ((kr == KERN_SUCCESS) && (soright != IP_NULL)) ipc_notify_port_deleted(soright, name); @@ -752,7 +765,7 @@ ipc_object_copyout( is_write_lock(space); for (;;) { - if (!space->is_active) { + if (!is_active(space)) { is_write_unlock(space); return KERN_INVALID_TASK; } @@ -1075,82 +1088,3 @@ io_free( io_lock_destroy(object); zfree(ipc_object_zones[otype], object); } - -#include -#if MACH_KDB - -#include - -#define printf kdbprintf - -/* - * Routine: ipc_object_print - * Purpose: - * Pretty-print an object for kdb. - */ - -const char *ikot_print_array[IKOT_MAX_TYPE] = { - "(NONE) ", - "(THREAD) ", - "(TASK) ", - "(HOST) ", - "(HOST_PRIV) ", - "(PROCESSOR) ", - "(PSET) ", - "(PSET_NAME) ", - "(TIMER) ", - "(PAGER_REQUEST) ", - "(DEVICE) ", /* 10 */ - "(XMM_OBJECT) ", - "(XMM_PAGER) ", - "(XMM_KERNEL) ", - "(XMM_REPLY) ", - "(NOTDEF 15) ", - "(NOTDEF 16) ", - "(HOST_SECURITY) ", - "(LEDGER) ", - "(MASTER_DEVICE) ", - "(ACTIVATION) ", /* 20 */ - "(SUBSYSTEM) ", - "(IO_DONE_QUEUE) ", - "(SEMAPHORE) ", - "(LOCK_SET) ", - "(CLOCK) ", - "(CLOCK_CTRL) ", /* 26 */ - "(IOKIT_SPARE) ", /* 27 */ - "(NAMED_MEM_ENTRY) ", /* 28 */ - "(IOKIT_CONNECT) ", - "(IOKIT_OBJECT) ", /* 30 */ - "(UPL) ", - "(MEM_OBJ_CONTROL) ", - "(AU_SESSIONPORT) ", /* 33 */ - "(FILEPORT)", /* 34 */ -#if CONFIG_MACF_MACH - "(LABELH) ", -#endif -/* - * Add new entries here. - * Please keep in sync with kern/ipc_kobject.h - */ - "(UNKNOWN) " /* magic catchall */ -}; - -void -ipc_object_print( - ipc_object_t object) -{ - int kotype; - - iprintf("%s", io_active(object) ? 
"active" : "dead"); - printf(", refs=%d", object->io_references); - printf(", otype=%d", io_otype(object)); - kotype = io_kotype(object); - if (kotype >= 0 && kotype < IKOT_MAX_TYPE) - printf(", kotype=%d %s\n", io_kotype(object), - ikot_print_array[kotype]); - else - printf(", kotype=0x%x %s\n", io_kotype(object), - ikot_print_array[IKOT_UNKNOWN]); -} - -#endif /* MACH_KDB */ diff --git a/osfmk/ipc/ipc_object.h b/osfmk/ipc/ipc_object.h index a813b29bf..05822d0fa 100644 --- a/osfmk/ipc/ipc_object.h +++ b/osfmk/ipc/ipc_object.h @@ -73,7 +73,6 @@ #define _IPC_IPC_OBJECT_H_ #include -#include #include #include @@ -81,6 +80,7 @@ #include #include #include +#include typedef natural_t ipc_object_refs_t; /* for ipc/ipc_object.h */ typedef natural_t ipc_object_bits_t; @@ -100,7 +100,7 @@ typedef natural_t ipc_object_type_t; struct ipc_object { ipc_object_bits_t io_bits; ipc_object_refs_t io_references; - decl_lck_mtx_data(, io_lock_data) + lck_spin_t io_lock_data; }; /* @@ -165,27 +165,18 @@ extern void io_free( * (ipc_port and ipc_pset). */ #define io_lock_init(io) \ - lck_mtx_init(&(io)->io_lock_data, &ipc_lck_grp, &ipc_lck_attr) + lck_spin_init(&(io)->io_lock_data, &ipc_lck_grp, &ipc_lck_attr) #define io_lock_destroy(io) \ - lck_mtx_destroy(&(io)->io_lock_data, &ipc_lck_grp) + lck_spin_destroy(&(io)->io_lock_data, &ipc_lck_grp) #define io_lock(io) \ - lck_mtx_lock(&(io)->io_lock_data) + lck_spin_lock(&(io)->io_lock_data) #define io_lock_try(io) \ - lck_mtx_try_lock(&(io)->io_lock_data) + lck_spin_try_lock(&(io)->io_lock_data) #define io_unlock(io) \ - lck_mtx_unlock(&(io)->io_lock_data) + lck_spin_unlock(&(io)->io_lock_data) #define _VOLATILE_ volatile -#define io_check_unlock(io) \ -MACRO_BEGIN \ - _VOLATILE_ ipc_object_refs_t _refs = (io)->io_references; \ - \ - io_unlock(io); \ - if (_refs == 0) \ - io_free(io_otype(io), io); \ -MACRO_END - /* Sanity check the ref count. If it is 0, we may be doubly zfreeing. 
* If it is larger than max int, it has been corrupted, probably by being * modified into an address (this is architecture dependent, but it's @@ -198,18 +189,24 @@ MACRO_END #define IO_MAX_REFERENCES \ (unsigned)(~0 ^ (1 << (sizeof(int)*BYTE_SIZE - 1))) -#define io_reference(io) \ -MACRO_BEGIN \ - assert((io)->io_references < IO_MAX_REFERENCES); \ - (io)->io_references++; \ -MACRO_END - -#define io_release(io) \ -MACRO_BEGIN \ - assert((io)->io_references > 0 && \ - (io)->io_references <= IO_MAX_REFERENCES); \ - (io)->io_references--; \ -MACRO_END +static inline void +io_reference(ipc_object_t io) { + assert((io)->io_references > 0 && + (io)->io_references < IO_MAX_REFERENCES); + OSIncrementAtomic(&((io)->io_references)); +} + + +static inline void +io_release(ipc_object_t io) { + assert((io)->io_references > 0 && + (io)->io_references < IO_MAX_REFERENCES); + /* If we just removed the last reference count */ + if ( 1 == OSDecrementAtomic(&((io)->io_references))) { + /* Free the object */ + io_free(io_otype((io)), (io)); + } +} /* * Retrieve a label for use in a kernel call that takes a security @@ -334,12 +331,4 @@ extern kern_return_t ipc_object_rename( mach_port_name_t oname, mach_port_name_t nname); -#if MACH_KDB -/* Pretty-print an ipc object */ - -extern void ipc_object_print( - ipc_object_t object); - -#endif /* MACH_KDB */ - #endif /* _IPC_IPC_OBJECT_H_ */ diff --git a/osfmk/ipc/ipc_port.c b/osfmk/ipc/ipc_port.c index 0ece0705c..d76463f64 100644 --- a/osfmk/ipc/ipc_port.c +++ b/osfmk/ipc/ipc_port.c @@ -70,7 +70,6 @@ */ #include -#include #include #include @@ -89,36 +88,40 @@ #include #include #include -#include #include #include -#if MACH_KDB -#include -#include -#include -#endif /* MACH_KDB */ - #include decl_lck_mtx_data(, ipc_port_multiple_lock_data) -decl_lck_mtx_data(, ipc_port_timestamp_lock_data) lck_mtx_ext_t ipc_port_multiple_lock_data_ext; -lck_mtx_ext_t ipc_port_timestamp_lock_data_ext; ipc_port_timestamp_t ipc_port_timestamp_data; int ipc_portbt; #if MACH_ASSERT void ipc_port_init_debug( - ipc_port_t port); + ipc_port_t port, + natural_t *callstack, + unsigned int callstack_max); + +void ipc_port_callstack_init_debug( + natural_t *callstack, + unsigned int callstack_max); + #endif /* MACH_ASSERT */ -#if MACH_KDB && ZONE_DEBUG -/* Forwards */ -void print_type_ports(unsigned, unsigned); -void print_ports(void); -#endif /* MACH_KDB && ZONE_DEBUG */ +void +ipc_port_release(ipc_port_t port) +{ + ip_release(port); +} + +void +ipc_port_reference(ipc_port_t port) +{ + ip_reference(port); +} /* * Routine: ipc_port_timestamp @@ -129,13 +132,7 @@ void print_ports(void); ipc_port_timestamp_t ipc_port_timestamp(void) { - ipc_port_timestamp_t timestamp; - - ipc_port_timestamp_lock(); - timestamp = ipc_port_timestamp_data++; - ipc_port_timestamp_unlock(); - - return timestamp; + return OSIncrementAtomic(&ipc_port_timestamp_data); } /* @@ -249,12 +246,11 @@ ipc_port_request_grow( if ((its->its_size == 0) || ((ntable = it_requests_alloc(its)) == IPR_NULL)) { - ipc_port_release(port); + ip_release(port); return KERN_RESOURCE_SHORTAGE; } ip_lock(port); - ip_release(port); /* * Check that port is still active and that nobody else @@ -302,12 +298,14 @@ ipc_port_request_grow( ntable->ipr_size = its; port->ip_requests = ntable; ip_unlock(port); + ip_release(port); if (otable != IPR_NULL) { it_requests_free(oits, otable); } } else { - ip_check_unlock(port); + ip_unlock(port); + ip_release(port); it_requests_free(its, ntable); } @@ -378,8 +376,6 @@ ipc_port_request_type( if 
(!IPR_SOR_SPARMED(ipr->ipr_soright)) { type |= MACH_PORT_TYPE_SPREQUEST_DELAYED; - } else { - assert(port->ip_sprequests == TRUE); } } } @@ -501,7 +497,8 @@ ipc_port_nsrequest( void ipc_port_clear_receiver( - ipc_port_t port) + ipc_port_t port, + queue_t links) { spl_t s; @@ -511,7 +508,7 @@ ipc_port_clear_receiver( * pull ourselves from any sets. */ if (port->ip_pset_count != 0) { - ipc_pset_remove_from_all(port); + ipc_pset_remove_from_all(port, links); assert(port->ip_pset_count == 0); } @@ -558,10 +555,6 @@ ipc_port_init( port->ip_premsg = IKM_NULL; port->ip_context = 0; -#if MACH_ASSERT - ipc_port_init_debug(port); -#endif /* MACH_ASSERT */ - ipc_mqueue_init(&port->ip_messages, FALSE /* set */); } @@ -589,6 +582,11 @@ ipc_port_alloc( mach_port_name_t name; kern_return_t kr; +#if MACH_ASSERT + natural_t buf[IP_CALLSTACK_MAX]; + ipc_port_callstack_init_debug(&buf[0], IP_CALLSTACK_MAX); +#endif /* MACH_ASSERT */ + kr = ipc_object_alloc(space, IOT_PORT, MACH_PORT_TYPE_RECEIVE, 0, &name, (ipc_object_t *) &port); @@ -599,6 +597,10 @@ ipc_port_alloc( ipc_port_init(port, space, name); +#if MACH_ASSERT + ipc_port_init_debug(port, &buf[0], IP_CALLSTACK_MAX); +#endif /* MACH_ASSERT */ + #if CONFIG_MACF_MACH task_t issuer = current_task(); tasklabel_lock2 (issuer, space->is_task); @@ -636,6 +638,11 @@ ipc_port_alloc_name( ipc_port_t port; kern_return_t kr; +#if MACH_ASSERT + natural_t buf[IP_CALLSTACK_MAX]; + ipc_port_callstack_init_debug(&buf[0], IP_CALLSTACK_MAX); +#endif /* MACH_ASSERT */ + kr = ipc_object_alloc_name(space, IOT_PORT, MACH_PORT_TYPE_RECEIVE, 0, name, (ipc_object_t *) &port); @@ -646,6 +653,10 @@ ipc_port_alloc_name( ipc_port_init(port, space, name); +#if MACH_ASSERT + ipc_port_init_debug(port, &buf[0], IP_CALLSTACK_MAX); +#endif /* MACH_ASSERT */ + #if CONFIG_MACF_MACH task_t issuer = current_task(); tasklabel_lock2 (issuer, space->is_task); @@ -815,13 +826,21 @@ ipc_port_destroy( * like a normal buffer. */ if (IP_PREALLOC(port)) { + ipc_port_t inuse_port; + kmsg = port->ip_premsg; assert(kmsg != IKM_NULL); + inuse_port = ikm_prealloc_inuse_port(kmsg); IP_CLEAR_PREALLOC(port, kmsg); - if (!ikm_prealloc_inuse(kmsg)) + ip_unlock(port); + if (inuse_port != IP_NULL) { + assert(inuse_port == port); + } else { ipc_kmsg_free(kmsg); + } + } else { + ip_unlock(port); } - ip_unlock(port); /* throw away no-senders request */ nsrequest = port->ip_nsrequest; @@ -837,7 +856,7 @@ ipc_port_destroy( ipc_kobject_destroy(port); - ipc_port_release(port); /* consume caller's ref */ + ip_release(port); /* consume caller's ref */ } /* @@ -1001,7 +1020,7 @@ ipc_port_lookup_notify( ipc_port_t port; ipc_entry_t entry; - assert(space->is_active); + assert(is_active(space)); entry = ipc_entry_lookup(space, name); if (entry == IE_NULL) @@ -1160,10 +1179,10 @@ ipc_port_release_send( return; ip_lock(port); - ip_release(port); if (!ip_active(port)) { - ip_check_unlock(port); + ip_unlock(port); + ip_release(port); return; } @@ -1175,9 +1194,30 @@ ipc_port_release_send( port->ip_nsrequest = IP_NULL; mscount = port->ip_mscount; ip_unlock(port); + ip_release(port); ipc_notify_no_senders(nsrequest, mscount); - } else + } else { ip_unlock(port); + ip_release(port); + } +} + +/* + * Routine: ipc_port_make_sonce_locked + * Purpose: + * Make a naked send-once right from a receive right. + * Conditions: + * The port is locked and active. 
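One detail in the ipc_port.c hunks above is easy to miss: every ip_release() moves to after its matching ip_unlock(). With reference counting now atomic, io_release() frees the object on the final decrement, so the lock word itself may be gone the instant the reference is dropped. The discipline, reduced to a sketch:

    ip_lock(port);
    /* ... inspect or update the port ... */
    ip_unlock(port);
    ip_release(port);   /* may free port; nothing may touch it after this */

The old order (release while locked, then ip_check_unlock) relied on the free happening under the caller's control; with the atomic scheme the decrement itself decides, so the unlock must come first.
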
+ */ + +ipc_port_t +ipc_port_make_sonce_locked( + ipc_port_t port) +{ + assert(ip_active(port)); + port->ip_sorights++; + ip_reference(port); + return port; } /* @@ -1185,7 +1225,7 @@ ipc_port_release_send( * Purpose: * Make a naked send-once right from a receive right. * Conditions: - * The port is not locked but it is active. + * The port is not locked. */ ipc_port_t @@ -1196,12 +1236,14 @@ ipc_port_make_sonce( return port; ip_lock(port); - assert(ip_active(port)); - port->ip_sorights++; - ip_reference(port); + if (ip_active(port)) { + port->ip_sorights++; + ip_reference(port); + ip_unlock(port); + return port; + } ip_unlock(port); - - return port; + return IP_DEAD; } /* @@ -1231,14 +1273,8 @@ ipc_port_release_sonce( port->ip_sorights--; - ip_release(port); - - if (!ip_active(port)) { - ip_check_unlock(port); - return; - } - ip_unlock(port); + ip_release(port); } /* @@ -1267,7 +1303,7 @@ ipc_port_release_receive( ipc_port_destroy(port); /* consumes ref, unlocks */ if (dest != IP_NULL) - ipc_port_release(dest); + ip_release(dest); } /* @@ -1290,6 +1326,11 @@ ipc_port_alloc_special( if (port == IP_NULL) return IP_NULL; +#if MACH_ASSERT + natural_t buf[IP_CALLSTACK_MAX]; + ipc_port_callstack_init_debug(&buf[0], IP_CALLSTACK_MAX); +#endif /* MACH_ASSERT */ + bzero((char *)port, sizeof(*port)); io_lock_init(&port->ip_object); port->ip_references = 1; @@ -1297,6 +1338,10 @@ ipc_port_alloc_special( ipc_port_init(port, space, 1); +#if MACH_ASSERT + ipc_port_init_debug(port, &buf[0], IP_CALLSTACK_MAX); +#endif /* MACH_ASSERT */ + #if CONFIG_MACF_MACH /* Currently, ipc_port_alloc_special is used for two things: * - Reply ports for messages from the kernel @@ -1387,8 +1432,7 @@ ipc_port_finalize( * deallocation is intercepted via io_free. */ queue_head_t port_alloc_queue; -decl_lck_mtx_data(,port_alloc_queue_lock) -lck_mtx_ext_t port_alloc_queue_lock_ext; +lck_spin_t port_alloc_queue_lock; unsigned long port_count = 0; unsigned long port_count_warning = 20000; @@ -1412,7 +1456,8 @@ void ipc_port_debug_init(void) { queue_init(&port_alloc_queue); - lck_mtx_init_ext(&port_alloc_queue_lock, &port_alloc_queue_lock_ext, &ipc_lck_grp, &ipc_lck_attr); + + lck_spin_init(&port_alloc_queue_lock, &ipc_lck_grp, &ipc_lck_attr); if (!PE_parse_boot_argn("ipc_portbt", &ipc_portbt, sizeof (ipc_portbt))) ipc_portbt = 0; @@ -1428,16 +1473,18 @@ extern int proc_pid(struct proc*); */ void ipc_port_init_debug( - ipc_port_t port) + ipc_port_t port, + natural_t *callstack, + unsigned int callstack_max) { unsigned int i; port->ip_thread = current_thread(); port->ip_timetrack = port_timestamp++; - for (i = 0; i < IP_CALLSTACK_MAX; ++i) - port->ip_callstack[i] = 0; + for (i = 0; i < callstack_max; ++i) + port->ip_callstack[i] = callstack[i]; for (i = 0; i < IP_NSPARES; ++i) - port->ip_spares[i] = 0; + port->ip_spares[i] = 0; #ifdef MACH_BSD task_t task = current_task(); @@ -1448,24 +1495,39 @@ ipc_port_init_debug( } #endif /* MACH_BSD */ - /* - * Machine-dependent routine to fill in an - * array with up to IP_CALLSTACK_MAX levels - * of return pc information. 
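The backtrace capture being deleted here is not lost; it moves into the new ipc_port_callstack_init_debug(), which runs before any allocation or spin lock is taken, since machine_callstack() may block (the routine's header below notes "May block (via copyin)"). Every call site added earlier in this file follows the same two-step shape:

    #if MACH_ASSERT
        natural_t buf[IP_CALLSTACK_MAX];
        /* step 1: capture the (possibly blocking) backtrace up front */
        ipc_port_callstack_init_debug(&buf[0], IP_CALLSTACK_MAX);
    #endif /* MACH_ASSERT */

        /* ... allocate and initialize the port ... */

    #if MACH_ASSERT
        /* step 2: copy the saved frames into the port's debug fields */
        ipc_port_init_debug(port, &buf[0], IP_CALLSTACK_MAX);
    #endif /* MACH_ASSERT */
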
- */ - if (ipc_portbt) - machine_callstack(&port->ip_callstack[0], IP_CALLSTACK_MAX); - #if 0 - lck_mtx_lock(&port_alloc_queue_lock); + lck_spin_lock(&port_alloc_queue_lock); ++port_count; if (port_count_warning > 0 && port_count >= port_count_warning) assert(port_count < port_count_warning); queue_enter(&port_alloc_queue, port, ipc_port_t, ip_port_links); - lck_mtx_unlock(&port_alloc_queue_lock); + lck_spin_unlock(&port_alloc_queue_lock); #endif } +/* + * Routine: ipc_port_callstack_init_debug + * Purpose: + * Calls the machine-dependent routine to + * fill in an array with up to IP_CALLSTACK_MAX + * levels of return pc information + * Conditions: + * May block (via copyin) + */ +void +ipc_port_callstack_init_debug( + natural_t *callstack, + unsigned int callstack_max) +{ + unsigned int i; + + /* guarantee the callstack is initialized */ + for (i=0; i < callstack_max; i++) + callstack[i] = 0; + + if (ipc_portbt) + machine_callstack(callstack, callstack_max); +} /* * Remove a port from the queue of allocated ports. @@ -1483,672 +1545,13 @@ void ipc_port_track_dealloc( ipc_port_t port) { - lck_mtx_lock(&port_alloc_queue_lock); + lck_spin_lock(&port_alloc_queue_lock); assert(port_count > 0); --port_count; queue_remove(&port_alloc_queue, port, ipc_port_t, ip_port_links); - lck_mtx_unlock(&port_alloc_queue_lock); + lck_spin_unlock(&port_alloc_queue_lock); } #endif #endif /* MACH_ASSERT */ - - -#if MACH_KDB - -#include -#include - -#define printf kdbprintf - -int -db_port_queue_print( - ipc_port_t port); - -/* - * Routine: ipc_port_print - * Purpose: - * Pretty-print a port for kdb. - */ -int ipc_port_print_long = 0; /* set for more detail */ - -void -ipc_port_print( - ipc_port_t port, - __unused boolean_t have_addr, - __unused db_expr_t count, - char *modif) -{ - db_addr_t task; - int task_id; - int nmsgs; - int verbose = 0; -#if MACH_ASSERT - int i, needs_db_indent, items_printed; -#endif /* MACH_ASSERT */ - - if (db_option(modif, 'l') || db_option(modif, 'v')) - ++verbose; - - printf("port 0x%x\n", port); - - db_indent += 2; - - ipc_object_print(&port->ip_object); - - if (ipc_port_print_long) { - printf("\n"); - } - - if (!ip_active(port)) { - iprintf("timestamp=0x%x", port->ip_timestamp); - } else if (port->ip_receiver_name == MACH_PORT_NULL) { - iprintf("destination=0x%x (", port->ip_destination); - if (port->ip_destination != MACH_PORT_NULL && - (task = db_task_from_space(port->ip_destination-> - ip_receiver, &task_id))) - printf("task%d at 0x%x", task_id, task); - else - printf("unknown"); - printf(")"); - } else { - iprintf("receiver=0x%x (", port->ip_receiver); - if (port->ip_receiver == ipc_space_kernel) - printf("kernel"); - else if (port->ip_receiver == ipc_space_reply) - printf("reply"); - else if (port->ip_receiver == default_pager_space) - printf("default_pager"); - else if ((task = db_task_from_space(port->ip_receiver, &task_id)) != (db_addr_t)0) - printf("task%d at 0x%x", task_id, task); - else - printf("unknown"); - printf(")"); - } - printf(", receiver_name=0x%x\n", port->ip_receiver_name); - - iprintf("mscount=%d", port->ip_mscount); - printf(", srights=%d", port->ip_srights); - printf(", sorights=%d\n", port->ip_sorights); - - iprintf("nsrequest=0x%x", port->ip_nsrequest); - printf(", pdrequest=0x%x", port->ip_pdrequest); - printf(", requests=0x%x\n", port->ip_requests); - - iprintf("pset_count=0x%x", port->ip_pset_count); - printf(", seqno=%d", port->ip_messages.imq_seqno); - printf(", msgcount=%d", port->ip_messages.imq_msgcount); - printf(", qlimit=%d\n", 
port->ip_messages.imq_qlimit); - - iprintf("kmsgs=0x%x", port->ip_messages.imq_messages.ikmq_base); - printf(", rcvrs queue=0x%x", port->ip_messages.imq_wait_queue); - printf(", kobj=0x%x\n", port->ip_kobject); - - iprintf("premsg=0x%x", port->ip_premsg); - -#if MACH_ASSERT - /* don't bother printing callstack or queue links */ - iprintf("ip_thread=0x%x, ip_timetrack=0x%x\n", - port->ip_thread, port->ip_timetrack); - items_printed = 0; - needs_db_indent = 1; - for (i = 0; i < IP_NSPARES; ++i) { - if (port->ip_spares[i] != 0) { - if (needs_db_indent) { - iprintf(""); - needs_db_indent = 0; - } - printf("%sip_spares[%d] = %d", - items_printed ? ", " : "", i, - port->ip_spares[i]); - if (++items_printed >= 4) { - needs_db_indent = 1; - printf("\n"); - items_printed = 0; - } - } - } -#endif /* MACH_ASSERT */ - - if (verbose) { - iprintf("kmsg queue contents:\n"); - db_indent += 2; - nmsgs = db_port_queue_print(port); - db_indent -= 2; - iprintf("...total kmsgs: %d\n", nmsgs); - } - - db_indent -=2; -} - -ipc_port_t -ipc_name_to_data( - task_t task, - mach_port_name_t name) -{ - ipc_space_t space; - ipc_entry_t entry; - - if (task == TASK_NULL) { - db_printf("port_name_to_data: task is null\n"); - return (0); - } - if ((space = task->itk_space) == 0) { - db_printf("port_name_to_data: task->itk_space is null\n"); - return (0); - } - if (!space->is_active) { - db_printf("port_name_to_data: task->itk_space not active\n"); - return (0); - } - if ((entry = ipc_entry_lookup(space, name)) == 0) { - db_printf("port_name_to_data: lookup yields zero\n"); - return (0); - } - return ((ipc_port_t)entry->ie_object); -} - -#if ZONE_DEBUG -void -print_type_ports(type, dead) - unsigned type; - unsigned dead; -{ - ipc_port_t port; - int n; - - n = 0; - for (port = (ipc_port_t)first_element(ipc_object_zones[IOT_PORT]); - port; - port = (ipc_port_t)next_element(ipc_object_zones[IOT_PORT], - port)) - if (ip_kotype(port) == type && - (!dead || !ip_active(port))) { - if (++n % 5) - printf("0x%x\t", port); - else - printf("0x%x\n", port); - } - if (n % 5) - printf("\n"); -} - -void -print_ports(void) -{ - ipc_port_t port; - int total_port_count; - int space_null_count; - int space_kernel_count; - int space_reply_count; - int space_pager_count; - int space_other_count; - - struct { - int total_count; - int dead_count; - } port_types[IKOT_MAX_TYPE]; - - total_port_count = 0; - - bzero((char *)&port_types[0], sizeof(port_types)); - space_null_count = 0; - space_kernel_count = 0; - space_reply_count = 0; - space_pager_count = 0; - space_other_count = 0; - - for (port = (ipc_port_t)first_element(ipc_object_zones[IOT_PORT]); - port; - port = (ipc_port_t)next_element(ipc_object_zones[IOT_PORT], - port)) { - total_port_count++; - if (ip_kotype(port) >= IKOT_MAX_TYPE) { - port_types[IKOT_UNKNOWN].total_count++; - if (!io_active(&port->ip_object)) - port_types[IKOT_UNKNOWN].dead_count++; - } else { - port_types[ip_kotype(port)].total_count++; - if (!io_active(&port->ip_object)) - port_types[ip_kotype(port)].dead_count++; - } - - if (!port->ip_receiver) - space_null_count++; - else if (port->ip_receiver == ipc_space_kernel) - space_kernel_count++; - else if (port->ip_receiver == ipc_space_reply) - space_reply_count++; - else if (port->ip_receiver == default_pager_space) - space_pager_count++; - else - space_other_count++; - } - printf("\n%7d total ports\n\n", total_port_count); - -#define PRINT_ONE_PORT_TYPE(name) \ - printf("%7d %s", port_types[IKOT_##name].total_count, # name); \ - if (port_types[IKOT_##name].dead_count) \ - 
printf(" (%d dead ports)", port_types[IKOT_##name].dead_count);\ - printf("\n"); - - PRINT_ONE_PORT_TYPE(NONE); - PRINT_ONE_PORT_TYPE(THREAD); - PRINT_ONE_PORT_TYPE(TASK); - PRINT_ONE_PORT_TYPE(HOST); - PRINT_ONE_PORT_TYPE(HOST_PRIV); - PRINT_ONE_PORT_TYPE(PROCESSOR); - PRINT_ONE_PORT_TYPE(PSET); - PRINT_ONE_PORT_TYPE(PSET_NAME); - PRINT_ONE_PORT_TYPE(TIMER); - PRINT_ONE_PORT_TYPE(PAGING_REQUEST); - PRINT_ONE_PORT_TYPE(MIG); - PRINT_ONE_PORT_TYPE(MEMORY_OBJECT); - PRINT_ONE_PORT_TYPE(XMM_PAGER); - PRINT_ONE_PORT_TYPE(XMM_KERNEL); - PRINT_ONE_PORT_TYPE(XMM_REPLY); - PRINT_ONE_PORT_TYPE(UND_REPLY); - PRINT_ONE_PORT_TYPE(HOST_NOTIFY); - PRINT_ONE_PORT_TYPE(HOST_SECURITY); - PRINT_ONE_PORT_TYPE(LEDGER); - PRINT_ONE_PORT_TYPE(MASTER_DEVICE); - PRINT_ONE_PORT_TYPE(TASK_NAME); - PRINT_ONE_PORT_TYPE(SUBSYSTEM); - PRINT_ONE_PORT_TYPE(IO_DONE_QUEUE); - PRINT_ONE_PORT_TYPE(SEMAPHORE); - PRINT_ONE_PORT_TYPE(LOCK_SET); - PRINT_ONE_PORT_TYPE(CLOCK); - PRINT_ONE_PORT_TYPE(CLOCK_CTRL); - PRINT_ONE_PORT_TYPE(IOKIT_SPARE); - PRINT_ONE_PORT_TYPE(NAMED_ENTRY); - PRINT_ONE_PORT_TYPE(IOKIT_CONNECT); - PRINT_ONE_PORT_TYPE(IOKIT_OBJECT); - PRINT_ONE_PORT_TYPE(UPL); - PRINT_ONE_PORT_TYPE(MEM_OBJ_CONTROL); - - PRINT_ONE_PORT_TYPE(UNKNOWN); - printf("\nipc_space:\n\n"); - printf("NULL KERNEL REPLY PAGER OTHER\n"); - printf("%d %d %d %d %d\n", - space_null_count, - space_kernel_count, - space_reply_count, - space_pager_count, - space_other_count - ); -} - -#endif /* ZONE_DEBUG */ - - -/* - * Print out all the kmsgs in a queue. Aggregate kmsgs with - * identical message ids into a single entry. Count up the - * amount of inline and out-of-line data consumed by each - * and every kmsg. - * - */ - -#define KMSG_MATCH_FIELD(kmsg) (kmsg->ikm_header->msgh_id) -#define DKQP_LONG(kmsg) FALSE -const char *dkqp_long_format = "(%3d) <%10d> 0x%x %10d %10d\n"; -const char *dkqp_format = "(%3d) <%10d> 0x%x %10d %10d\n"; - -int -db_kmsg_queue_print( - ipc_kmsg_t kmsg); -int -db_kmsg_queue_print( - ipc_kmsg_t kmsg) -{ - ipc_kmsg_t ikmsg, first_kmsg; - register int icount; - mach_msg_id_t cur_id; - unsigned int inline_total, ool_total; - int nmsgs; - - iprintf("Count msgh_id kmsg addr inline bytes ool bytes\n"); - inline_total = ool_total = (vm_size_t) 0; - cur_id = KMSG_MATCH_FIELD(kmsg); - for (icount = 0, nmsgs = 0, first_kmsg = ikmsg = kmsg; - kmsg != IKM_NULL && (kmsg != first_kmsg || nmsgs == 0); - kmsg = kmsg->ikm_next) { - ++nmsgs; - if (!(KMSG_MATCH_FIELD(kmsg) == cur_id)) { - iprintf(DKQP_LONG(kmsg) ? dkqp_long_format:dkqp_format, - icount, cur_id, ikmsg, inline_total,ool_total); - cur_id = KMSG_MATCH_FIELD(kmsg); - icount = 1; - ikmsg = kmsg; - inline_total = ool_total = 0; - } else { - icount++; - } - if (DKQP_LONG(kmsg)) - inline_total += kmsg->ikm_size; - else - inline_total += kmsg->ikm_header->msgh_size; - } - iprintf(DKQP_LONG(kmsg) ? dkqp_long_format : dkqp_format, - icount, cur_id, ikmsg, inline_total, ool_total); - return nmsgs; -} - - -/* - * Process all of the messages on a port - prints out the - * number of occurences of each message type, and the first - * kmsg with a particular msgh_id. 
- */ -int -db_port_queue_print( - ipc_port_t port) -{ - ipc_kmsg_t kmsg; - - if (ipc_kmsg_queue_empty(&port->ip_messages.imq_messages)) - return 0; - kmsg = ipc_kmsg_queue_first(&port->ip_messages.imq_messages); - return db_kmsg_queue_print(kmsg); -} - - -#if MACH_ASSERT -#include -#include - -#define FUNC_NULL ((void (*)) 0) -#define MAX_REFS 5 /* bins for tracking ref counts */ - -/* - * Translate port's cache of call stack pointers - * into symbolic names. - */ -void -db_port_stack_trace( - ipc_port_t port) -{ - unsigned int i; - - for (i = 0; i < IP_CALLSTACK_MAX; ++i) { - iprintf("[%d] 0x%x\t", i, port->ip_callstack[i]); - if (port->ip_callstack[i] != 0 && - DB_VALID_KERN_ADDR(port->ip_callstack[i])) - db_printsym(port->ip_callstack[i], DB_STGY_PROC); - printf("\n"); - } -} - - -typedef struct port_item { - unsigned long item; - unsigned long count; -} port_item; - - -#define ITEM_MAX 400 -typedef struct port_track { - const char *name; - unsigned long max; - unsigned long warning; - port_item items[ITEM_MAX]; -} port_track; - -port_track port_callers; /* match against calling addresses */ -port_track port_threads; /* match against allocating threads */ -port_track port_spaces; /* match against ipc spaces */ - -void port_track_init( - port_track *trackp, - const char *name); -void port_item_add( - port_track *trackp, - unsigned long item); -void port_track_sort( - port_track *trackp); -void port_track_print( - port_track *trackp, - void (*func)(port_item *)); -void port_callers_print( - port_item *p); - -void -port_track_init( - port_track *trackp, - const char *name) -{ - port_item *i; - - trackp->max = trackp->warning = 0; - trackp->name = name; - for (i = trackp->items; i < trackp->items + ITEM_MAX; ++i) - i->item = i->count = 0; -} - - -void -port_item_add( - port_track *trackp, - unsigned long item) -{ - port_item *limit, *i; - - limit = trackp->items + trackp->max; - for (i = trackp->items; i < limit; ++i) - if (i->item == item) { - i->count++; - return; - } - if (trackp->max >= ITEM_MAX) { - if (trackp->warning++ == 0) - iprintf("%s: no room\n", trackp->name); - return; - } - i->item = item; - i->count = 1; - trackp->max++; -} - - -/* - * Simple (and slow) bubble sort. - */ -void -port_track_sort( - port_track *trackp) -{ - port_item *limit, *p; - port_item temp; - boolean_t unsorted; - - limit = trackp->items + trackp->max - 1; - do { - unsorted = FALSE; - for (p = trackp->items; p < limit - 1; ++p) { - if (p->count < (p+1)->count) { - temp = *p; - *p = *(p+1); - *(p+1) = temp; - unsorted = TRUE; - } - } - } while (unsorted == TRUE); -} - - -void -port_track_print( - port_track *trackp, - void (*func)(port_item *)) -{ - port_item *limit, *p; - - limit = trackp->items + trackp->max; - iprintf("%s:\n", trackp->name); - for (p = trackp->items; p < limit; ++p) { - if (func != FUNC_NULL) - (*func)(p); - else - iprintf("0x%x\t%8d\n", p->item, p->count); - } -} - - -void -port_callers_print( - port_item *p) -{ - iprintf("0x%x\t%8d\t", p->item, p->count); - db_printsym(p->item, DB_STGY_PROC); - printf("\n"); -} - - -/* - * Show all ports with a given reference count. - */ -void -db_ref( - int refs) -{ - db_port_walk(1, 1, 1, refs); -} - - -/* - * Examine all currently allocated ports. 
- * Options: - * verbose display suspicious ports - * display print out each port encountered - * ref_search restrict examination to ports with - * a specified reference count - * ref_target reference count for ref_search - */ -int -db_port_walk( - unsigned int verbose, - unsigned int display, - unsigned int ref_search, - unsigned int ref_target) -{ - ipc_port_t port; - unsigned int ref_overflow, refs, i, ref_inactive_overflow; - unsigned int no_receiver, no_match; - unsigned int ref_counts[MAX_REFS]; - unsigned int inactive[MAX_REFS]; - unsigned int ipc_ports = 0; - - iprintf("Allocated port count is %d\n", port_count); - no_receiver = no_match = ref_overflow = 0; - ref_inactive_overflow = 0; - for (i = 0; i < MAX_REFS; ++i) { - ref_counts[i] = 0; - inactive[i] = 0; - } - port_track_init(&port_callers, "port callers"); - port_track_init(&port_threads, "port threads"); - port_track_init(&port_spaces, "port spaces"); - if (ref_search) - iprintf("Walking ports of ref_count=%d.\n", ref_target); - else - iprintf("Walking all ports.\n"); - - queue_iterate(&port_alloc_queue, port, ipc_port_t, ip_port_links) { - const char *port_type; - - port_type = " IPC port"; - if (ip_active(port)) - ipc_ports++; - - refs = port->ip_references; - if (ref_search && refs != ref_target) - continue; - - if (refs >= MAX_REFS) { - if (ip_active(port)) - ++ref_overflow; - else - ++ref_inactive_overflow; - } else { - if (refs == 0 && verbose) - iprintf("%s 0x%x has ref count of zero!\n", - port_type, port); - if (ip_active(port)) - ref_counts[refs]++; - else - inactive[refs]++; - } - port_item_add(&port_threads, (unsigned long) port->ip_thread); - for (i = 0; i < IP_CALLSTACK_MAX; ++i) { - if (port->ip_callstack[i] != 0 && - DB_VALID_KERN_ADDR(port->ip_callstack[i])) - port_item_add(&port_callers, - port->ip_callstack[i]); - } - if (!ip_active(port)) { - if (verbose) - iprintf("%s 0x%x, inactive, refcnt %d\n", - port_type, port, refs); - continue; - } - - if (port->ip_receiver_name == MACH_PORT_NULL) { - iprintf("%s 0x%x, no receiver, refcnt %d\n", - port, refs); - ++no_receiver; - continue; - } - if (port->ip_receiver == ipc_space_kernel || - port->ip_receiver == ipc_space_reply || - ipc_entry_lookup(port->ip_receiver, - port->ip_receiver_name) - != IE_NULL) { - port_item_add(&port_spaces, - (unsigned long)port->ip_receiver); - if (display) { - iprintf( "%s 0x%x time 0x%x ref_cnt %d\n", - port_type, port, - port->ip_timetrack, refs); - } - continue; - } - iprintf("%s 0x%x, rcvr 0x%x, name 0x%x, ref %d, no match\n", - port_type, port, port->ip_receiver, - port->ip_receiver_name, refs); - ++no_match; - } - iprintf("Active port type summary:\n"); - iprintf("\tlocal IPC %6d\n", ipc_ports); - iprintf("summary:\tcallers %d threads %d spaces %d\n", - port_callers.max, port_threads.max, port_spaces.max); - - iprintf("\tref_counts:\n"); - for (i = 0; i < MAX_REFS; ++i) - iprintf("\t ref_counts[%d] = %d\n", i, ref_counts[i]); - - iprintf("\t%d ports w/o receivers, %d w/o matches\n", - no_receiver, no_match); - - iprintf("\tinactives:"); - if ( ref_inactive_overflow || inactive[0] || inactive[1] || - inactive[2] || inactive[3] || inactive[4] ) - printf(" [0]=%d [1]=%d [2]=%d [3]=%d [4]=%d [5+]=%d\n", - inactive[0], inactive[1], inactive[2], - inactive[3], inactive[4], ref_inactive_overflow); - else - printf(" No inactive ports.\n"); - - port_track_sort(&port_spaces); - port_track_print(&port_spaces, FUNC_NULL); - port_track_sort(&port_threads); - port_track_print(&port_threads, FUNC_NULL); - port_track_sort(&port_callers); - 
port_track_print(&port_callers, port_callers_print); - return 0; -} - - -#endif /* MACH_ASSERT */ - -#endif /* MACH_KDB */ diff --git a/osfmk/ipc/ipc_port.h b/osfmk/ipc/ipc_port.h index 34aab79d8..100195f7b 100644 --- a/osfmk/ipc/ipc_port.h +++ b/osfmk/ipc/ipc_port.h @@ -186,7 +186,6 @@ struct ipc_port { #define ip_lock(port) io_lock(&(port)->ip_object) #define ip_lock_try(port) io_lock_try(&(port)->ip_object) #define ip_unlock(port) io_unlock(&(port)->ip_object) -#define ip_check_unlock(port) io_check_unlock(&(port)->ip_object) #define ip_reference(port) io_reference(&(port)->ip_object) #define ip_release(port) io_release(&(port)->ip_object) @@ -259,6 +258,7 @@ extern lck_attr_t ipc_lck_attr; * when it is taken. */ +#if 1 decl_lck_mtx_data(extern,ipc_port_multiple_lock_data) extern lck_mtx_ext_t ipc_port_multiple_lock_data_ext; @@ -270,6 +270,18 @@ extern lck_mtx_ext_t ipc_port_multiple_lock_data_ext; #define ipc_port_multiple_unlock() \ lck_mtx_unlock(&ipc_port_multiple_lock_data) +#else +lck_spin_t ipc_port_multiple_lock_data; + +#define ipc_port_multiple_lock_init() \ + lck_spin_init(&ipc_port_multiple_lock_data, &ipc_lck_grp, &ipc_lck_attr) + +#define ipc_port_multiple_lock() \ + lck_spin_lock(&ipc_port_multiple_lock_data) + +#define ipc_port_multiple_unlock() \ + lck_spin_unlock(&ipc_port_multiple_lock_data) +#endif /* * The port timestamp facility provides timestamps @@ -277,20 +289,8 @@ extern lck_mtx_ext_t ipc_port_multiple_lock_data_ext; * mach_port_names with port death. */ -decl_lck_mtx_data(extern,ipc_port_timestamp_lock_data) -extern lck_mtx_ext_t ipc_port_timestamp_lock_data_ext; - extern ipc_port_timestamp_t ipc_port_timestamp_data; -#define ipc_port_timestamp_lock_init() \ - lck_mtx_init_ext(&ipc_port_timestamp_lock_data, &ipc_port_timestamp_lock_data_ext, &ipc_lck_grp, &ipc_lck_attr) - -#define ipc_port_timestamp_lock() \ - lck_mtx_lock(&ipc_port_timestamp_lock_data) - -#define ipc_port_timestamp_unlock() \ - lck_mtx_unlock(&ipc_port_timestamp_lock_data) - /* Retrieve a port timestamp value */ extern ipc_port_timestamp_t ipc_port_timestamp(void); @@ -385,7 +385,8 @@ MACRO_END /* Prepare a receive right for transmission/destruction */ extern void ipc_port_clear_receiver( - ipc_port_t port); + ipc_port_t port, + queue_t links); /* Initialize a newly-allocated port */ extern void ipc_port_init( @@ -454,10 +455,20 @@ extern mach_port_name_t ipc_port_copyout_send( extern void ipc_port_release_send( ipc_port_t port); +extern void ipc_port_reference( + ipc_port_t port); + +extern void ipc_port_release( + ipc_port_t port); + #endif /* KERNEL_PRIVATE */ #if MACH_KERNEL_PRIVATE +/* Make a naked send-once right from a locked and active receive right */ +extern ipc_port_t ipc_port_make_sonce_locked( + ipc_port_t port); + /* Make a naked send-once right from a receive right */ extern ipc_port_t ipc_port_make_sonce( ipc_port_t port); @@ -502,12 +513,6 @@ extern void ipc_port_debug_init(void); #define ipc_port_dealloc_reply(port) \ ipc_port_dealloc_special((port), ipc_space_reply) -#define ipc_port_reference(port) \ - ipc_object_reference(&(port)->ip_object) - -#define ipc_port_release(port) \ - ipc_object_release(&(port)->ip_object) - #endif /* MACH_KERNEL_PRIVATE */ #endif /* _IPC_IPC_PORT_H_ */ diff --git a/osfmk/ipc/ipc_pset.c b/osfmk/ipc/ipc_pset.c index 462527119..e2e6a6598 100644 --- a/osfmk/ipc/ipc_pset.c +++ b/osfmk/ipc/ipc_pset.c @@ -72,7 +72,6 @@ #include #include #include -#include #include #include @@ -186,15 +185,16 @@ ipc_pset_member( kern_return_t ipc_pset_add( - 
ipc_pset_t pset, - ipc_port_t port) + ipc_pset_t pset, + ipc_port_t port, + wait_queue_link_t wql) { kern_return_t kr; assert(ips_active(pset)); assert(ip_active(port)); - kr = ipc_mqueue_add(&port->ip_messages, &pset->ips_messages); + kr = ipc_mqueue_add(&port->ip_messages, &pset->ips_messages, wql); if (kr == KERN_SUCCESS) port->ip_pset_count++; @@ -216,8 +216,9 @@ ipc_pset_add( kern_return_t ipc_pset_remove( - ipc_pset_t pset, - ipc_port_t port) + ipc_pset_t pset, + ipc_port_t port, + wait_queue_link_t *wqlp) { kern_return_t kr; @@ -226,7 +227,7 @@ ipc_pset_remove( if (port->ip_pset_count == 0) return KERN_NOT_IN_SET; - kr = ipc_mqueue_remove(&port->ip_messages, &pset->ips_messages); + kr = ipc_mqueue_remove(&port->ip_messages, &pset->ips_messages, wqlp); if (kr == KERN_SUCCESS) port->ip_pset_count--; @@ -244,7 +245,8 @@ ipc_pset_remove( kern_return_t ipc_pset_remove_from_all( - ipc_port_t port) + ipc_port_t port, + queue_t links) { assert(ip_active(port)); @@ -254,7 +256,7 @@ ipc_pset_remove_from_all( /* * Remove the port's mqueue from all sets */ - ipc_mqueue_remove_from_all(&port->ip_messages); + ipc_mqueue_remove_from_all(&port->ip_messages, links); port->ip_pset_count = 0; return KERN_SUCCESS; } @@ -275,6 +277,11 @@ ipc_pset_destroy( ipc_pset_t pset) { spl_t s; + queue_head_t link_data; + queue_t links = &link_data; + wait_queue_link_t wql; + + queue_init(links); assert(ips_active(pset)); @@ -283,7 +290,7 @@ ipc_pset_destroy( /* * remove all the member message queues */ - ipc_mqueue_remove_all(&pset->ips_messages); + ipc_mqueue_remove_all(&pset->ips_messages, links); /* * Set all waiters on the portset running to @@ -295,8 +302,14 @@ ipc_pset_destroy( imq_unlock(&pset->ips_messages); splx(s); - ips_release(pset); /* consume the ref our caller gave us */ - ips_check_unlock(pset); + ips_unlock(pset); + ips_release(pset); /* consume the ref our caller gave us */ + + while(!queue_empty(links)) { + wql = (wait_queue_link_t) dequeue(links); + wait_queue_link_free(wql); + } + } /* Kqueue EVFILT_MACHPORT support */ @@ -321,6 +334,7 @@ filt_machportattach( struct knote *kn) { mach_port_name_t name = (mach_port_name_t)kn->kn_kevent.ident; + wait_queue_link_t wql = wait_queue_link_allocate(); ipc_pset_t pset = IPS_NULL; int result = ENOSYS; kern_return_t kr; @@ -329,24 +343,28 @@ filt_machportattach( MACH_PORT_RIGHT_PORT_SET, (ipc_object_t *)&pset); if (kr != KERN_SUCCESS) { - result = (kr == KERN_INVALID_NAME ? ENOENT : ENOTSUP); - goto done; + wait_queue_link_free(wql); + return (kr == KERN_INVALID_NAME ? ENOENT : ENOTSUP); } /* We've got a lock on pset */ - /* keep a reference for the knote */ - kn->kn_ptr.p_pset = pset; - ips_reference(pset); - /* * Bind the portset wait queue directly to knote/kqueue. * This allows us to just use wait_queue foo to effect a wakeup, * rather than having to call knote() from the Mach code on each * message. */ - result = knote_link_wait_queue(kn, &pset->ips_messages.imq_wait_queue); + result = knote_link_wait_queue(kn, &pset->ips_messages.imq_wait_queue, wql); + if (result == 0) { + /* keep a reference for the knote */ + kn->kn_ptr.p_pset = pset; + ips_reference(pset); + ips_unlock(pset); + return 0; + } + ips_unlock(pset); -done: + wait_queue_link_free(wql); return result; } @@ -355,16 +373,19 @@ filt_machportdetach( struct knote *kn) { ipc_pset_t pset = kn->kn_ptr.p_pset; + wait_queue_link_t wql = WAIT_QUEUE_LINK_NULL; /* * Unlink the portset wait queue from knote/kqueue, * and release our reference on the portset. 
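The thread running through all of these ipc_pset.c hunks: wait-queue links can no longer be allocated or freed while an object spin lock is held, so ipc_pset_add() now takes a pre-allocated link, and the removal paths hand links back on a caller-supplied queue (or through a wqlp out-parameter) to be freed after unlock. A hedged sketch of the add-side calling convention, modeled on filt_machportattach() above; the lock ordering is illustrative, and the link is assumed to be consumed only on success:

    wait_queue_link_t wql = wait_queue_link_allocate();  /* may block: allocate first */

    ips_lock(pset);
    ip_lock(port);
    kr = ipc_pset_add(pset, port, wql);
    ip_unlock(port);
    ips_unlock(pset);

    if (kr != KERN_SUCCESS)
        wait_queue_link_free(wql);   /* free only after every lock is dropped */
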
*/ ips_lock(pset); - knote_unlink_wait_queue(kn, &pset->ips_messages.imq_wait_queue); - ips_release(kn->kn_ptr.p_pset); - kn->kn_ptr.p_pset = IPS_NULL; - ips_check_unlock(pset); + (void)knote_unlink_wait_queue(kn, &pset->ips_messages.imq_wait_queue, &wql); + kn->kn_ptr.p_pset = IPS_NULL; + ips_unlock(pset); + ips_release(pset); + if (wql != WAIT_QUEUE_LINK_NULL) + wait_queue_link_free(wql); } static int @@ -393,8 +414,9 @@ filt_machport( if (kr != KERN_SUCCESS || pset != kn->kn_ptr.p_pset || !ips_active(pset)) { kn->kn_data = 0; kn->kn_flags |= (EV_EOF | EV_ONESHOT); - if (pset != IPS_NULL) - ips_check_unlock(pset); + if (pset != IPS_NULL) { + ips_unlock(pset); + } return(1); } @@ -448,7 +470,7 @@ filt_machport( * portset and return zero. */ if (self->ith_state == MACH_RCV_TIMED_OUT) { - ipc_pset_release(pset); + ips_release(pset); return 0; } @@ -461,7 +483,7 @@ filt_machport( assert(self->ith_state == MACH_RCV_TOO_LARGE); assert(self->ith_kmsg == IKM_NULL); kn->kn_data = self->ith_receiver_name; - ipc_pset_release(pset); + ips_release(pset); return 1; } @@ -523,55 +545,3 @@ filt_machportpeek(struct knote *kn) return (ipc_mqueue_peek(set_mq)); } - - -#include -#if MACH_KDB - -#include - -#define printf kdbprintf - -int -ipc_list_count( - struct ipc_kmsg *base) -{ - register int count = 0; - - if (base) { - struct ipc_kmsg *kmsg = base; - - ++count; - while (kmsg && kmsg->ikm_next != base - && kmsg->ikm_next != IKM_BOGUS){ - kmsg = kmsg->ikm_next; - ++count; - } - } - return(count); -} - -/* - * Routine: ipc_pset_print - * Purpose: - * Pretty-print a port set for kdb. - */ -void -ipc_pset_print( - ipc_pset_t pset) -{ - printf("pset 0x%x\n", pset); - - db_indent += 2; - - ipc_object_print(&pset->ips_object); - iprintf("local_name = 0x%x\n", pset->ips_local_name); - iprintf("%d kmsgs => 0x%x", - ipc_list_count(pset->ips_messages.imq_messages.ikmq_base), - pset->ips_messages.imq_messages.ikmq_base); - printf(",rcvrs queue= 0x%x\n", &pset->ips_messages.imq_wait_queue); - - db_indent -=2; -} - -#endif /* MACH_KDB */ diff --git a/osfmk/ipc/ipc_pset.h b/osfmk/ipc/ipc_pset.h index 26c1f26c0..5dd8af593 100644 --- a/osfmk/ipc/ipc_pset.h +++ b/osfmk/ipc/ipc_pset.h @@ -74,8 +74,6 @@ #include #include -#include - struct ipc_pset { /* @@ -93,7 +91,6 @@ struct ipc_pset { #define ips_lock(pset) io_lock(&(pset)->ips_object) #define ips_lock_try(pset) io_lock_try(&(pset)->ips_object) #define ips_unlock(pset) io_unlock(&(pset)->ips_object) -#define ips_check_unlock(pset) io_check_unlock(&(pset)->ips_object) #define ips_reference(pset) io_reference(&(pset)->ips_object) #define ips_release(pset) io_release(&(pset)->ips_object) @@ -112,7 +109,8 @@ extern kern_return_t ipc_pset_alloc_name( /* Add a port to a port set */ extern kern_return_t ipc_pset_add( ipc_pset_t pset, - ipc_port_t port); + ipc_port_t port, + wait_queue_link_t wql); /* determine if port is a member of set */ extern boolean_t ipc_pset_member( @@ -122,25 +120,16 @@ extern boolean_t ipc_pset_member( /* Remove a port from a port set */ extern kern_return_t ipc_pset_remove( ipc_pset_t pset, - ipc_port_t port); + ipc_port_t port, + wait_queue_link_t *wqlp); /* Remove a port from all its current port sets */ extern kern_return_t ipc_pset_remove_from_all( - ipc_port_t port); + ipc_port_t port, + queue_t links); /* Destroy a port_set */ extern void ipc_pset_destroy( ipc_pset_t pset); -#define ipc_pset_reference(pset) \ - ipc_object_reference(&(pset)->ips_object) - -#define ipc_pset_release(pset) \ - ipc_object_release(&(pset)->ips_object) - - -#if 
MACH_KDB -int ipc_list_count(struct ipc_kmsg *base); -#endif /* MACH_KDB */ - #endif /* _IPC_IPC_PSET_H_ */ diff --git a/osfmk/ipc/ipc_right.c b/osfmk/ipc/ipc_right.c index d3db278b8..46d7f1ec7 100644 --- a/osfmk/ipc/ipc_right.c +++ b/osfmk/ipc/ipc_right.c @@ -112,7 +112,7 @@ ipc_right_lookup_write( is_write_lock(space); - if (!space->is_active) { + if (!is_active(space)) { is_write_unlock(space); return KERN_INVALID_TASK; } @@ -154,7 +154,7 @@ ipc_right_lookup_two_write( is_write_lock(space); - if (!space->is_active) { + if (!is_active(space)) { is_write_unlock(space); return KERN_INVALID_TASK; } @@ -197,7 +197,7 @@ ipc_right_reverse( /* would switch on io_otype to handle multiple types of object */ - assert(space->is_active); + assert(is_active(space)); assert(io_otype(object) == IOT_PORT); port = (ipc_port_t) object; @@ -273,6 +273,8 @@ ipc_right_request_alloc( kern_return_t kr; for (;;) { + ipc_port_t port = IP_NULL; + kr = ipc_right_lookup_write(space, name, &entry); if (kr != KERN_SUCCESS) return kr; @@ -291,7 +293,6 @@ ipc_right_request_alloc( /* see if the entry is of proper type for requests */ if (entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) { ipc_port_request_index_t new_request; - ipc_port_t port; port = (ipc_port_t) entry->ie_object; assert(port != IP_NULL); @@ -305,6 +306,7 @@ ipc_right_request_alloc( previous = ipc_port_request_cancel(port, name, prev_request); ip_unlock(port); entry->ie_request = IE_REQ_NONE; + ipc_entry_modified(space, name, entry); is_write_unlock(space); break; } @@ -320,6 +322,7 @@ ipc_right_request_alloc( previous = ipc_port_request_cancel(port, name, prev_request); ip_unlock(port); entry->ie_request = IE_REQ_NONE; + ipc_entry_modified(space, name, entry); is_write_unlock(space); ipc_notify_send_possible(notify, name); @@ -352,6 +355,7 @@ ipc_right_request_alloc( assert(new_request != IE_REQ_NONE); ip_unlock(port); entry->ie_request = new_request; + ipc_entry_modified(space, name, entry); is_write_unlock(space); break; } @@ -368,18 +372,28 @@ ipc_right_request_alloc( if (MACH_PORT_UREFS_OVERFLOW(urefs, 1)) { is_write_unlock(space); + if (port != IP_NULL) + ip_release(port); return KERN_UREFS_OVERFLOW; } (entry->ie_bits)++; /* increment urefs */ + ipc_entry_modified(space, name, entry); is_write_unlock(space); + if (port != IP_NULL) + ip_release(port); + ipc_notify_dead_name(notify, name); previous = IP_NULL; break; } is_write_unlock(space); + + if (port != IP_NULL) + ip_release(port); + if (entry->ie_bits & MACH_PORT_TYPE_PORT_OR_DEAD) return KERN_INVALID_ARGUMENT; else @@ -417,6 +431,7 @@ ipc_right_request_cancel( previous = ipc_port_request_cancel(port, name, entry->ie_request); entry->ie_request = IE_REQ_NONE; + ipc_entry_modified(space, name, entry); return previous; } @@ -451,8 +466,10 @@ ipc_right_inuse( * Conditions: * The space is write-locked; the port is not locked. * If returns FALSE, the port is also locked and active. - * Otherwise, entry is converted to a dead name, freeing - * a reference to port. + * Otherwise, entry is converted to a dead name. + * + * Caller is responsible for a reference to port if it + * had died (returns TRUE). 
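This contract change ripples through the rest of ipc_right.c. ipc_right_check() used to call ipc_port_release() itself, an operation that can now free the port and therefore cannot safely happen with the space write-locked. Instead the caller inherits the reference, releasing it directly or passing it up through the new *releasep out-parameters, always after dropping the space lock. The caller-side shape:

    if (ipc_right_check(space, port, name, entry)) {
        /* entry became a dead name; we now own one reference on port */
        is_write_unlock(space);
        ip_release(port);           /* safe: no locks held */
        return KERN_INVALID_RIGHT;  /* illustrative error path */
    }
    /* otherwise the port is locked and active; proceed */
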
*/ boolean_t @@ -464,7 +481,7 @@ ipc_right_check( { ipc_entry_bits_t bits; - assert(space->is_active); + assert(is_active(space)); assert(port == (ipc_port_t) entry->ie_object); ip_lock(port); @@ -518,16 +535,14 @@ ipc_right_check( } entry->ie_bits = bits; entry->ie_object = IO_NULL; - - ipc_port_release(port); - + ipc_entry_modified(space, name, entry); return TRUE; } /* - * Routine: ipc_right_clean + * Routine: ipc_right_terminate * Purpose: - * Cleans up an entry in a dead space. + * Cleans up an entry in a terminated space. * The entry isn't deallocated or removed * from reverse hash tables. * Conditions: @@ -535,7 +550,7 @@ ipc_right_check( */ void -ipc_right_clean( +ipc_right_terminate( ipc_space_t space, mach_port_name_t name, ipc_entry_t entry) @@ -546,7 +561,7 @@ ipc_right_clean( bits = entry->ie_bits; type = IE_BITS_TYPE(bits); - assert(!space->is_active); + assert(!is_active(space)); /* * IE_BITS_COMPAT/ipc_right_dncancel doesn't have this @@ -571,7 +586,6 @@ ipc_right_clean( ips_lock(pset); assert(ips_active(pset)); - ipc_pset_destroy(pset); /* consumes ref, unlocks */ break; } @@ -589,8 +603,8 @@ ipc_right_clean( ip_lock(port); if (!ip_active(port)) { + ip_unlock(port); ip_release(port); - ip_check_unlock(port); break; } @@ -610,11 +624,21 @@ ipc_right_clean( } if (type & MACH_PORT_TYPE_RECEIVE) { + wait_queue_link_t wql; + queue_head_t links_data; + queue_t links = &links_data; + assert(port->ip_receiver_name == name); assert(port->ip_receiver == space); - ipc_port_clear_receiver(port); + queue_init(links); + ipc_port_clear_receiver(port, links); ipc_port_destroy(port); /* consumes our ref, unlocks */ + while(!queue_empty(links)) { + wql = (wait_queue_link_t) dequeue(links); + wait_queue_link_free(wql); + } + } else if (type & MACH_PORT_TYPE_SEND_ONCE) { assert(port->ip_sorights > 0); ip_unlock(port); @@ -623,8 +647,8 @@ ipc_right_clean( } else { assert(port->ip_receiver != space); - ip_release(port); - ip_unlock(port); /* port is active */ + ip_unlock(port); + ip_release(port); } if (nsrequest != IP_NULL) @@ -636,7 +660,7 @@ ipc_right_clean( } default: - panic("ipc_right_clean: strange type - 0x%x", type); + panic("ipc_right_terminate: strange type - 0x%x", type); } } @@ -645,7 +669,7 @@ ipc_right_clean( * Purpose: * Destroys an entry in a space. * Conditions: - * The space is write-locked. + * The space is write-locked (returns unlocked). * The space must be active. * Returns: * KERN_SUCCESS The entry was destroyed. 
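Nearly every hunk in this file also adds an ipc_entry_modified(space, name, entry) call wherever an entry's bits, object, or request slot changes. The routine's body is outside this excerpt; given the is_low_mod/is_high_mod fields added to struct ipc_space later in the patch, a plausible reading is that it widens a dirty window so a concurrent table grow knows which entries must be re-copied. A sketch under that assumption:

    /* Assumed shape only; the real implementation lives in ipc_entry.c. */
    void
    ipc_entry_modified(
        ipc_space_t          space,
        mach_port_name_t     name,
        __unused ipc_entry_t entry)
    {
        mach_port_index_t index = MACH_PORT_INDEX(name);

        if (index < space->is_low_mod)
            space->is_low_mod = index;
        if (index > space->is_high_mod)
            space->is_high_mod = index;
    }
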
@@ -664,7 +688,7 @@ ipc_right_destroy( entry->ie_bits &= ~IE_BITS_TYPE_MASK; type = IE_BITS_TYPE(bits); - assert(space->is_active); + assert(is_active(space)); switch (type) { case MACH_PORT_TYPE_DEAD_NAME: @@ -672,6 +696,7 @@ ipc_right_destroy( assert(entry->ie_object == IO_NULL); ipc_entry_dealloc(space, name, entry); + is_write_unlock(space); break; case MACH_PORT_TYPE_PORT_SET: { @@ -684,8 +709,9 @@ ipc_right_destroy( ipc_entry_dealloc(space, name, entry); ips_lock(pset); - assert(ips_active(pset)); + is_write_unlock(space); + assert(ips_active(pset)); ipc_pset_destroy(pset); /* consumes ref, unlocks */ break; } @@ -709,13 +735,12 @@ ipc_right_destroy( if (!ip_active(port)) { assert((type & MACH_PORT_TYPE_RECEIVE) == 0); - ip_release(port); - ip_check_unlock(port); - + ip_unlock(port); entry->ie_request = IE_REQ_NONE; entry->ie_object = IO_NULL; ipc_entry_dealloc(space, name, entry); - + is_write_unlock(space); + ip_release(port); break; } @@ -723,6 +748,7 @@ ipc_right_destroy( entry->ie_object = IO_NULL; ipc_entry_dealloc(space, name, entry); + is_write_unlock(space); if (type & MACH_PORT_TYPE_SEND) { assert(port->ip_srights > 0); @@ -736,11 +762,21 @@ ipc_right_destroy( } if (type & MACH_PORT_TYPE_RECEIVE) { + queue_head_t links_data; + queue_t links = &links_data; + wait_queue_link_t wql; + assert(ip_active(port)); assert(port->ip_receiver == space); - ipc_port_clear_receiver(port); + queue_init(links); + ipc_port_clear_receiver(port, links); ipc_port_destroy(port); /* consumes our ref, unlocks */ + while(!queue_empty(links)) { + wql = (wait_queue_link_t) dequeue(links); + wait_queue_link_free(wql); + } + } else if (type & MACH_PORT_TYPE_SEND_ONCE) { assert(port->ip_sorights > 0); ip_unlock(port); @@ -749,8 +785,8 @@ ipc_right_destroy( } else { assert(port->ip_receiver != space); - ip_release(port); ip_unlock(port); + ip_release(port); } if (nsrequest != IP_NULL) @@ -788,7 +824,7 @@ ipc_right_dealloc( mach_port_name_t name, ipc_entry_t entry) { - + ipc_port_t port = IP_NULL; ipc_entry_bits_t bits; mach_port_type_t type; @@ -796,7 +832,7 @@ ipc_right_dealloc( type = IE_BITS_TYPE(bits); - assert(space->is_active); + assert(is_active(space)); switch (type) { case MACH_PORT_TYPE_DEAD_NAME: { @@ -808,16 +844,20 @@ ipc_right_dealloc( if (IE_BITS_UREFS(bits) == 1) { ipc_entry_dealloc(space, name, entry); - } - else + } else { entry->ie_bits = bits-1; /* decrement urefs */ - + ipc_entry_modified(space, name, entry); + } is_write_unlock(space); + + /* release any port that got converted to dead name below */ + if (port != IP_NULL) + ip_release(port); break; } case MACH_PORT_TYPE_SEND_ONCE: { - ipc_port_t port, request; + ipc_port_t request; assert(IE_BITS_UREFS(bits) == 1); @@ -828,7 +868,7 @@ ipc_right_dealloc( bits = entry->ie_bits; assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME); - goto dead_name; + goto dead_name; /* it will release port */ } /* port is locked and active */ @@ -850,7 +890,6 @@ ipc_right_dealloc( } case MACH_PORT_TYPE_SEND: { - ipc_port_t port; ipc_port_t request = IP_NULL; ipc_port_t nsrequest = IP_NULL; mach_port_mscount_t mscount = 0; @@ -864,7 +903,7 @@ ipc_right_dealloc( if (ipc_right_check(space, port, name, entry)) { bits = entry->ie_bits; assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME); - goto dead_name; + goto dead_name; /* it will release port */ } /* port is locked and active */ @@ -884,16 +923,19 @@ ipc_right_dealloc( ipc_hash_delete(space, (ipc_object_t) port, name, entry); - ip_release(port); + ip_unlock(port); entry->ie_object = IO_NULL; 
ipc_entry_dealloc(space, name, entry); + is_write_unlock(space); + ip_release(port); - } else + } else { + ip_unlock(port); entry->ie_bits = bits-1; /* decrement urefs */ - - /* even if dropped a ref, port is active */ - ip_unlock(port); - is_write_unlock(space); + ipc_entry_modified(space, name, entry); + is_write_unlock(space); + } + if (nsrequest != IP_NULL) ipc_notify_no_senders(nsrequest, mscount); @@ -904,7 +946,6 @@ ipc_right_dealloc( } case MACH_PORT_TYPE_SEND_RECEIVE: { - ipc_port_t port; ipc_port_t nsrequest = IP_NULL; mach_port_mscount_t mscount = 0; @@ -934,6 +975,8 @@ ipc_right_dealloc( entry->ie_bits = bits-1; /* decrement urefs */ ip_unlock(port); + + ipc_entry_modified(space, name, entry); is_write_unlock(space); if (nsrequest != IP_NULL) @@ -972,6 +1015,7 @@ ipc_right_delta( mach_port_right_t right, mach_port_delta_t delta) { + ipc_port_t port = IP_NULL; ipc_entry_bits_t bits; bits = entry->ie_bits; @@ -985,7 +1029,7 @@ ipc_right_delta( * we postpone doing so when we are holding the space lock. */ - assert(space->is_active); + assert(is_active(space)); assert(right < MACH_PORT_RIGHT_NUMBER); /* Rights-specific restrictions and operations. */ @@ -1010,12 +1054,9 @@ ipc_right_delta( pset = (ipc_pset_t) entry->ie_object; assert(pset != IPS_NULL); - - entry->ie_object = IO_NULL; ipc_entry_dealloc(space, name, entry); - ips_lock(pset); assert(ips_active(pset)); is_write_unlock(space); @@ -1025,8 +1066,10 @@ ipc_right_delta( } case MACH_PORT_RIGHT_RECEIVE: { - ipc_port_t port; ipc_port_t request = IP_NULL; + queue_head_t links_data; + queue_t links = &links_data; + wait_queue_link_t wql; if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) goto invalid_right; @@ -1069,6 +1112,7 @@ ipc_right_delta( * right and enter the remaining send right * into the hash table. 
*/ + ipc_entry_modified(space, name, entry); entry->ie_bits &= ~MACH_PORT_TYPE_RECEIVE; ipc_hash_insert(space, (ipc_object_t) port, name, entry); @@ -1089,6 +1133,7 @@ ipc_right_delta( } entry->ie_bits = bits; entry->ie_object = IO_NULL; + ipc_entry_modified(space, name, entry); } } else { assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_RECEIVE); @@ -1101,8 +1146,13 @@ ipc_right_delta( } is_write_unlock(space); - ipc_port_clear_receiver(port); + queue_init(links); + ipc_port_clear_receiver(port, links); ipc_port_destroy(port); /* consumes ref, unlocks */ + while(!queue_empty(links)) { + wql = (wait_queue_link_t) dequeue(links); + wait_queue_link_free(wql); + } if (request != IP_NULL) ipc_notify_port_deleted(request, name); @@ -1110,7 +1160,7 @@ ipc_right_delta( } case MACH_PORT_RIGHT_SEND_ONCE: { - ipc_port_t port, request; + ipc_port_t request; if ((bits & MACH_PORT_TYPE_SEND_ONCE) == 0) goto invalid_right; @@ -1158,7 +1208,6 @@ ipc_right_delta( mach_port_urefs_t urefs; if (bits & MACH_PORT_TYPE_SEND_RIGHTS) { - ipc_port_t port; port = (ipc_port_t) entry->ie_object; assert(port != IP_NULL); @@ -1166,6 +1215,7 @@ ipc_right_delta( if (!ipc_right_check(space, port, name, entry)) { /* port is locked and active */ ip_unlock(port); + port = IP_NULL; goto invalid_right; } bits = entry->ie_bits; @@ -1185,10 +1235,10 @@ ipc_right_delta( if ((urefs + delta) == 0) { ipc_entry_dealloc(space, name, entry); - } - else + } else { entry->ie_bits = bits + delta; - + ipc_entry_modified(space, name, entry); + } is_write_unlock(space); break; @@ -1196,7 +1246,6 @@ ipc_right_delta( case MACH_PORT_RIGHT_SEND: { mach_port_urefs_t urefs; - ipc_port_t port; ipc_port_t request = IP_NULL; ipc_port_t nsrequest = IP_NULL; mach_port_mscount_t mscount = 0; @@ -1239,11 +1288,13 @@ ipc_right_delta( if (bits & MACH_PORT_TYPE_RECEIVE) { assert(port->ip_receiver_name == name); assert(port->ip_receiver == space); + ip_unlock(port); assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND_RECEIVE); entry->ie_bits = bits &~ (IE_BITS_UREFS_MASK| MACH_PORT_TYPE_SEND); + ipc_entry_modified(space, name, entry); } else { assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND); @@ -1253,16 +1304,18 @@ ipc_right_delta( ipc_hash_delete(space, (ipc_object_t) port, name, entry); + ip_unlock(port); ip_release(port); entry->ie_object = IO_NULL; ipc_entry_dealloc(space, name, entry); } - } else + } else { + ip_unlock(port); entry->ie_bits = bits + delta; + ipc_entry_modified(space, name, entry); + } - /* even if dropped a ref, port is active */ - ip_unlock(port); is_write_unlock(space); if (nsrequest != IP_NULL) @@ -1285,6 +1338,8 @@ ipc_right_delta( invalid_right: is_write_unlock(space); + if (port != IP_NULL) + ip_release(port); return KERN_INVALID_RIGHT; invalid_value: @@ -1301,10 +1356,10 @@ ipc_right_delta( * Purpose: * Retrieves information about the right. * Conditions: - * The space is write-locked, and is unlocked upon return - * if the call is unsuccessful. The space must be active. + * The space is active and write-locked. + * The space is unlocked upon return. * Returns: - * KERN_SUCCESS Retrieved info; space still locked. 
+ * KERN_SUCCESS Retrieved info */ kern_return_t @@ -1333,6 +1388,7 @@ ipc_right_info( type |= ipc_port_request_type(port, name, request); ip_unlock(port); } + is_write_unlock(space); } else if (bits & MACH_PORT_TYPE_SEND_RIGHTS) { /* @@ -1344,10 +1400,15 @@ ipc_right_info( if (request != IE_REQ_NONE) type |= ipc_port_request_type(port, name, request); ip_unlock(port); + is_write_unlock(space); } else { bits = entry->ie_bits; assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME); + is_write_unlock(space); + ip_release(port); } + } else { + is_write_unlock(space); } type |= IE_BITS_TYPE(bits); @@ -1380,7 +1441,7 @@ ipc_right_copyin_check( #endif bits= entry->ie_bits; - assert(space->is_active); + assert(is_active(space)); switch (msgt_name) { case MACH_MSG_TYPE_MAKE_SEND: @@ -1525,21 +1586,25 @@ ipc_right_copyin( mach_msg_type_name_t msgt_name, boolean_t deadok, ipc_object_t *objectp, - ipc_port_t *sorightp) + ipc_port_t *sorightp, + ipc_port_t *releasep, + queue_t links) { ipc_entry_bits_t bits; + ipc_port_t port; #if CONFIG_MACF_MACH task_t self = current_task(); int rc; #endif + *releasep = IP_NULL; + bits = entry->ie_bits; - assert(space->is_active); + assert(is_active(space)); switch (msgt_name) { case MACH_MSG_TYPE_MAKE_SEND: { - ipc_port_t port; if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) goto invalid_right; @@ -1573,7 +1638,6 @@ ipc_right_copyin( } case MACH_MSG_TYPE_MAKE_SEND_ONCE: { - ipc_port_t port; if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) goto invalid_right; @@ -1606,7 +1670,6 @@ ipc_right_copyin( } case MACH_MSG_TYPE_MOVE_RECEIVE: { - ipc_port_t port; ipc_port_t request = IP_NULL; if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) @@ -1649,9 +1712,9 @@ ipc_right_copyin( entry->ie_object = IO_NULL; } entry->ie_bits = bits &~ MACH_PORT_TYPE_RECEIVE; + ipc_entry_modified(space, name, entry); - ipc_port_clear_receiver(port); - + ipc_port_clear_receiver(port, links); port->ip_receiver_name = MACH_PORT_NULL; port->ip_destination = IP_NULL; ip_unlock(port); @@ -1662,7 +1725,6 @@ ipc_right_copyin( } case MACH_MSG_TYPE_COPY_SEND: { - ipc_port_t port; if (bits & MACH_PORT_TYPE_DEAD_NAME) goto copy_dead; @@ -1679,6 +1741,7 @@ ipc_right_copyin( if (ipc_right_check(space, port, name, entry)) { bits = entry->ie_bits; + *releasep = port; goto copy_dead; } /* port is locked and active */ @@ -1713,7 +1776,6 @@ ipc_right_copyin( } case MACH_MSG_TYPE_MOVE_SEND: { - ipc_port_t port; ipc_port_t request = IP_NULL; if (bits & MACH_PORT_TYPE_DEAD_NAME) @@ -1731,6 +1793,7 @@ ipc_right_copyin( if (ipc_right_check(space, port, name, entry)) { bits = entry->ie_bits; + *releasep = port; goto move_dead; } /* port is locked and active */ @@ -1781,7 +1844,7 @@ ipc_right_copyin( ip_reference(port); entry->ie_bits = bits-1; /* decrement urefs */ } - + ipc_entry_modified(space, name, entry); ip_unlock(port); *objectp = (ipc_object_t) port; @@ -1790,7 +1853,6 @@ ipc_right_copyin( } case MACH_MSG_TYPE_MOVE_SEND_ONCE: { - ipc_port_t port; ipc_port_t request; if (bits & MACH_PORT_TYPE_DEAD_NAME) @@ -1841,7 +1903,7 @@ ipc_right_copyin( entry->ie_object = IO_NULL; entry->ie_bits = bits &~ (IE_BITS_UREFS_MASK | MACH_PORT_TYPE_SEND_ONCE); - + ipc_entry_modified(space, name, entry); *objectp = (ipc_object_t) port; *sorightp = request; break; @@ -1880,7 +1942,7 @@ ipc_right_copyin( bits &= ~MACH_PORT_TYPE_DEAD_NAME; } entry->ie_bits = bits-1; /* decrement urefs */ - + ipc_entry_modified(space, name, entry); *objectp = IO_DEAD; *sorightp = IP_NULL; return KERN_SUCCESS; @@ -1910,7 +1972,7 @@ ipc_right_copyin_undo( bits = 
entry->ie_bits; - assert(space->is_active); + assert(is_active(space)); assert((msgt_name == MACH_MSG_TYPE_MOVE_SEND) || (msgt_name == MACH_MSG_TYPE_COPY_SEND) || @@ -1961,11 +2023,11 @@ ipc_right_copyin_undo( name, entry); /* object is dead so it is not locked */ } - + ipc_entry_modified(space, name, entry); /* release the reference acquired by copyin */ if (object != IO_DEAD) - ipc_object_release(object); + io_release(object); } /* @@ -1988,7 +2050,8 @@ ipc_right_copyin_two( mach_port_name_t name, ipc_entry_t entry, ipc_object_t *objectp, - ipc_port_t *sorightp) + ipc_port_t *sorightp, + ipc_port_t *releasep) { ipc_entry_bits_t bits; mach_port_urefs_t urefs; @@ -1999,7 +2062,9 @@ ipc_right_copyin_two( int rc; #endif - assert(space->is_active); + *releasep = IP_NULL; + + assert(is_active(space)); bits = entry->ie_bits; @@ -2014,6 +2079,7 @@ ipc_right_copyin_two( assert(port != IP_NULL); if (ipc_right_check(space, port, name, entry)) { + *releasep = port; goto invalid_right; } /* port is locked and active */ @@ -2059,6 +2125,8 @@ ipc_right_copyin_two( ip_reference(port); entry->ie_bits = bits-2; /* decrement urefs */ } + ipc_entry_modified(space, name, entry); + ip_unlock(port); *objectp = (ipc_object_t) port; @@ -2140,6 +2208,7 @@ ipc_right_copyout( ip_unlock(port); entry->ie_bits = bits | (MACH_PORT_TYPE_SEND_ONCE | 1); + ipc_entry_modified(space, name, entry); break; case MACH_MSG_TYPE_PORT_SEND: @@ -2171,25 +2240,26 @@ ipc_right_copyout( /* leave urefs pegged to maximum */ port->ip_srights--; - ip_release(port); ip_unlock(port); + ip_release(port); return KERN_SUCCESS; } ip_unlock(port); return KERN_UREFS_OVERFLOW; } - port->ip_srights--; - ip_release(port); ip_unlock(port); + ip_release(port); + } else if (bits & MACH_PORT_TYPE_RECEIVE) { assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_RECEIVE); assert(IE_BITS_UREFS(bits) == 0); /* transfer send right to entry */ - ip_release(port); ip_unlock(port); + ip_release(port); + } else { assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_NONE); assert(IE_BITS_UREFS(bits) == 0); @@ -2204,6 +2274,7 @@ ipc_right_copyout( } entry->ie_bits = (bits | MACH_PORT_TYPE_SEND) + 1; + ipc_entry_modified(space, name, entry); break; case MACH_MSG_TYPE_PORT_RECEIVE: { @@ -2237,8 +2308,8 @@ ipc_right_copyout( assert(IE_BITS_UREFS(bits) > 0); assert(port->ip_srights > 0); - ip_release(port); ip_unlock(port); + ip_release(port); /* entry is locked holding ref, so can use port */ @@ -2252,9 +2323,10 @@ ipc_right_copyout( ip_unlock(port); } entry->ie_bits = bits | MACH_PORT_TYPE_RECEIVE; + ipc_entry_modified(space, name, entry); if (dest != IP_NULL) - ipc_port_release(dest); + ip_release(dest); break; } @@ -2289,8 +2361,9 @@ ipc_right_rename( ipc_port_request_index_t request = oentry->ie_request; ipc_entry_bits_t bits = oentry->ie_bits; ipc_object_t object = oentry->ie_object; + ipc_port_t release_port = IP_NULL; - assert(space->is_active); + assert(is_active(space)); assert(oname != nname); /* @@ -2311,6 +2384,7 @@ ipc_right_rename( request = IE_REQ_NONE; object = IO_NULL; bits = oentry->ie_bits; + release_port = port; assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME); assert(oentry->ie_request == IE_REQ_NONE); } else { @@ -2387,7 +2461,11 @@ ipc_right_rename( assert(oentry->ie_request == IE_REQ_NONE); oentry->ie_object = IO_NULL; ipc_entry_dealloc(space, oname, oentry); + ipc_entry_modified(space, nname, nentry); is_write_unlock(space); + if (release_port != IP_NULL) + ip_release(release_port); + return KERN_SUCCESS; } diff --git a/osfmk/ipc/ipc_right.h 
b/osfmk/ipc/ipc_right.h index 8b12cd895..dfbfd232e 100644 --- a/osfmk/ipc/ipc_right.h +++ b/osfmk/ipc/ipc_right.h @@ -129,7 +129,7 @@ extern boolean_t ipc_right_check( ipc_entry_t entry); /* Clean up an entry in a dead space */ -extern void ipc_right_clean( +extern void ipc_right_terminate( ipc_space_t space, mach_port_name_t name, ipc_entry_t entry); @@ -177,7 +177,9 @@ extern kern_return_t ipc_right_copyin( mach_msg_type_name_t msgt_name, boolean_t deadok, ipc_object_t *objectp, - ipc_port_t *sorightp); + ipc_port_t *sorightp, + ipc_port_t *releasep, + queue_t links); /* Undo the effects of an ipc_right_copyin */ extern void ipc_right_copyin_undo( @@ -194,7 +196,8 @@ extern kern_return_t ipc_right_copyin_two( mach_port_name_t name, ipc_entry_t entry, ipc_object_t *objectp, - ipc_port_t *sorightp); + ipc_port_t *sorightp, + ipc_port_t *releasep); /* Copyout a capability to a space */ extern kern_return_t ipc_right_copyout( diff --git a/osfmk/ipc/ipc_space.c b/osfmk/ipc/ipc_space.c index 1aaecc594..803ab7321 100644 --- a/osfmk/ipc/ipc_space.c +++ b/osfmk/ipc/ipc_space.c @@ -69,8 +69,6 @@ * Functions to manipulate IPC capability spaces. */ -#include - #include #include #include @@ -79,7 +77,6 @@ #include #include #include -#include #include #include #include @@ -91,31 +88,26 @@ zone_t ipc_space_zone; ipc_space_t ipc_space_kernel; ipc_space_t ipc_space_reply; -#if MACH_KDB -ipc_space_t default_pager_space; -#endif /* MACH_KDB */ /* * Routine: ipc_space_reference * Routine: ipc_space_release * Purpose: - * Function versions of the IPC space macros. - * The "is_" cover macros can be defined to use the - * macros or the functions, as desired. + * Function versions of the IPC space inline reference. */ void ipc_space_reference( ipc_space_t space) { - ipc_space_reference_macro(space); + is_reference(space); } void ipc_space_release( ipc_space_t space) { - ipc_space_release_macro(space); + is_release(space); } /* @@ -169,21 +161,14 @@ ipc_space_create( } table[new_size-1].ie_next = 0; - is_ref_lock_init(space); - space->is_references = 2; - is_lock_init(space); - space->is_active = TRUE; - space->is_growing = FALSE; - space->is_table = table; + space->is_bits = 2; /* 2 refs, active, not growing */ space->is_table_size = new_size; + space->is_table = table; space->is_table_next = initial+1; - - ipc_splay_tree_init(&space->is_tree); - space->is_tree_total = 0; - space->is_tree_small = 0; - space->is_tree_hash = 0; space->is_task = NULL; + space->is_low_mod = new_size; + space->is_high_mod = 0; *spacep = space; return KERN_SUCCESS; @@ -214,12 +199,8 @@ ipc_space_create_special( if (space == IS_NULL) return KERN_RESOURCE_SHORTAGE; - is_ref_lock_init(space); - space->is_references = 1; - is_lock_init(space); - space->is_active = FALSE; - + space->is_bits = IS_INACTIVE | 1; /* 1 ref, not active, not growing */ *spacep = space; return KERN_SUCCESS; } @@ -235,7 +216,6 @@ void ipc_space_clean( ipc_space_t space) { - ipc_tree_entry_t tentry; ipc_entry_t table; ipc_entry_num_t size; mach_port_index_t index; @@ -245,11 +225,12 @@ ipc_space_clean( * we must wait until they finish and figure * out the space died. */ + retry: is_write_lock(space); - while (space->is_growing) + while (is_growing(space)) is_write_sleep(space); - if (!space->is_active) { + if (!is_active(space)) { is_write_unlock(space); return; } @@ -257,10 +238,6 @@ ipc_space_clean( /* * Now we can futz with it since we have the write lock. 
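ipc_space_create() above collapses a refcount field and two booleans into the single is_bits word: the low bits hold the reference count, the high bits hold the IS_INACTIVE and IS_GROWING flags defined in the ipc_space.h hunk below. That is why "2 refs, active, not growing" is literally the integer 2, and why the flag flips can use OSBitOrAtomic/OSBitAndAtomic without a lock. The reference-counting side is not shown in this excerpt; by analogy with io_reference()/io_release() earlier in this patch it presumably looks like the following, where is_free is assumed to be the zfree wrapper paired with is_alloc below:

    static inline void
    is_reference(ipc_space_t is)
    {
        assert((is->is_bits & IS_REFS_MAX) > 0);
        OSIncrementAtomic(&is->is_bits);    /* flags live above IS_REFS_MAX */
    }

    static inline void
    is_release(ipc_space_t is)
    {
        assert((is->is_bits & IS_REFS_MAX) > 0);
        /* last reference gone: free the space */
        if ((OSDecrementAtomic(&is->is_bits) & IS_REFS_MAX) == 1)
            is_free(is);
    }
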
*/ -#if MACH_KDB - if (space == default_pager_space) - default_pager_space = IS_NULL; -#endif /* MACH_KDB */ table = space->is_table; size = space->is_table_size; @@ -273,40 +250,23 @@ ipc_space_clean( if (type != MACH_PORT_TYPE_NONE) { mach_port_name_t name = MACH_PORT_MAKE(index, IE_BITS_GEN(entry->ie_bits)); - ipc_right_destroy(space, name, entry); + ipc_right_destroy(space, name, entry); /* unlocks space */ + goto retry; } } - /* + /* * JMM - Now the table is cleaned out. We don't bother shrinking the * size of the table at this point, but we probably should if it is - * really large. Lets just clean up the splay tree. + * really large. */ - start_splay: - for (tentry = ipc_splay_traverse_start(&space->is_tree); - tentry != ITE_NULL; - tentry = ipc_splay_traverse_next(&space->is_tree, TRUE)) { - mach_port_type_t type; - mach_port_name_t name = tentry->ite_name; - - type = IE_BITS_TYPE(tentry->ite_bits); - /* - * If it is a real right, then destroy it. This will have the - * side effect of removing it from the splay, so start over. - */ - if(type != MACH_PORT_TYPE_NONE) { - ipc_splay_traverse_finish(&space->is_tree); - ipc_right_destroy(space, name, &tentry->ite_entry); - goto start_splay; - } - } - ipc_splay_traverse_finish(&space->is_tree); + is_write_unlock(space); } /* - * Routine: ipc_space_destroy + * Routine: ipc_space_terminate * Purpose: * Marks the space as dead and cleans up the entries. * Does nothing if the space is already dead. @@ -315,11 +275,9 @@ ipc_space_clean( */ void -ipc_space_destroy( +ipc_space_terminate( ipc_space_t space) { - boolean_t active; - ipc_tree_entry_t tentry; ipc_entry_t table; ipc_entry_num_t size; mach_port_index_t index; @@ -327,31 +285,26 @@ ipc_space_destroy( assert(space != IS_NULL); is_write_lock(space); - active = space->is_active; - space->is_active = FALSE; - is_write_unlock(space); - - if (!active) + if (!is_active(space)) { + is_write_unlock(space); return; - + } + is_mark_inactive(space); /* * If somebody is trying to grow the table, * we must wait until they finish and figure * out the space died. */ - is_read_lock(space); - while (space->is_growing) - is_read_sleep(space); + while (is_growing(space)) + is_write_sleep(space); + + is_write_unlock(space); + - is_read_unlock(space); /* * Now we can futz with it unlocked. */ -#if MACH_KDB - if (space == default_pager_space) - default_pager_space = IS_NULL; -#endif /* MACH_KDB */ table = space->is_table; size = space->is_table_size; @@ -366,30 +319,13 @@ ipc_space_destroy( name = MACH_PORT_MAKE(index, IE_BITS_GEN(entry->ie_bits)); - ipc_right_clean(space, name, entry); + ipc_right_terminate(space, name, entry); } } it_entries_free(space->is_table_next-1, table); space->is_table_size = 0; - for (tentry = ipc_splay_traverse_start(&space->is_tree); - tentry != ITE_NULL; - tentry = ipc_splay_traverse_next(&space->is_tree, TRUE)) { - mach_port_type_t type; - mach_port_name_t name = tentry->ite_name; - - type = IE_BITS_TYPE(tentry->ite_bits); - assert(type != MACH_PORT_TYPE_NONE); - - ipc_right_clean(space, name, &tentry->ite_entry); - - if(type == MACH_PORT_TYPE_SEND) - ipc_hash_global_delete(space, tentry->ite_object, - name, tentry); - } - ipc_splay_traverse_finish(&space->is_tree); - /* * Because the space is now dead, * we must release the "active" reference for it. 
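A note on the ipc_space_clean() hunk above: because ipc_right_destroy() now returns with the space unlocked, the cleaning loop can no longer simply step to the next index after destroying a right (the table may have been grown or replaced while the lock was dropped), so it re-locks and rescans from the top. Below is a minimal sketch of that idiom, assuming the locking and entry macros used elsewhere in this patch; example_space_clean itself is a hypothetical stand-in for the real routine:

static void
example_space_clean(ipc_space_t space)
{
	ipc_entry_t table;
	ipc_entry_num_t size;
	mach_port_index_t index;

retry:
	is_write_lock(space);
	table = space->is_table;
	size = space->is_table_size;

	for (index = 0; index < size; index++) {
		ipc_entry_t entry = &table[index];

		if (IE_BITS_TYPE(entry->ie_bits) != MACH_PORT_TYPE_NONE) {
			mach_port_name_t name = MACH_PORT_MAKE(index,
			    IE_BITS_GEN(entry->ie_bits));

			(void) ipc_right_destroy(space, name, entry);
			/* space is now unlocked: table/size may be stale */
			goto retry;
		}
	}
	is_write_unlock(space);
}

The same ordering discipline runs through the ipc_right.c hunks earlier in this patch: ip_release() is consistently moved after ip_unlock(), and the copyin paths hand a port back through a new *releasep out-parameter so the caller can drop the reference only once all locks are gone.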
diff --git a/osfmk/ipc/ipc_space.h b/osfmk/ipc/ipc_space.h index 39c2e45a4..2f9edeb47 100644 --- a/osfmk/ipc/ipc_space.h +++ b/osfmk/ipc/ipc_space.h @@ -82,55 +82,72 @@ #ifdef __APPLE_API_PRIVATE #if MACH_KERNEL_PRIVATE -#include #include #include #include #include #include #include -#include #include +#include + /* * Every task has a space of IPC capabilities. * IPC operations like send and receive use this space. * IPC kernel calls manipulate the space of the target task. * * Every space has a non-NULL is_table with is_table_size entries. - * A space may have a NULL is_tree. is_tree_small records the - * number of entries in the tree that, if the table were to grow - * to the next larger size, would move from the tree to the table. - * - * is_growing marks when the table is in the process of growing. - * When the table is growing, it can't be freed or grown by another - * thread, because of krealloc/kmem_realloc's requirements. * + * Only one thread can be growing the space at a time. Others + * that need it grown wait for the first. We do almost all the + * work with the space unlocked, so lookups proceed pretty much + * unaffected while the grow operation is underway. */ typedef natural_t ipc_space_refs_t; +#define IS_REFS_MAX 0x0fffffff +#define IS_INACTIVE 0x40000000 /* space is inactive */ +#define IS_GROWING 0x20000000 /* space is growing */ struct ipc_space { - decl_lck_mtx_data(,is_ref_lock_data) - ipc_space_refs_t is_references; - - decl_lck_mtx_data(,is_lock_data) - boolean_t is_active; /* is the space alive? */ - boolean_t is_growing; /* is the space growing? */ - ipc_entry_t is_table; /* an array of entries */ + lck_spin_t is_lock_data; + ipc_space_refs_t is_bits; /* holds refs, active, growing */ ipc_entry_num_t is_table_size; /* current size of table */ - struct ipc_table_size *is_table_next; /* info for larger table */ - struct ipc_splay_tree is_tree; /* a splay tree of entries */ - ipc_entry_num_t is_tree_total; /* number of entries in the tree */ - ipc_entry_num_t is_tree_small; /* # of small entries in the tree */ - ipc_entry_num_t is_tree_hash; /* # of hashed entries in the tree */ - boolean_t is_fast; /* for is_fast_space() */ - + ipc_entry_t is_table; /* an array of entries */ task_t is_task; /* associated task */ + struct ipc_table_size *is_table_next; /* info for larger table */ + ipc_entry_num_t is_low_mod; /* lowest modified entry during growth */ + ipc_entry_num_t is_high_mod; /* highest modified entry during growth */ }; #define IS_NULL ((ipc_space_t) 0) +#define is_active(is) (((is)->is_bits & IS_INACTIVE) != IS_INACTIVE) + +static inline void +is_mark_inactive(ipc_space_t is) +{ + assert(is_active(is)); + OSBitOrAtomic(IS_INACTIVE, &is->is_bits); +} + +#define is_growing(is) (((is)->is_bits & IS_GROWING) == IS_GROWING) + +static inline void +is_start_growing(ipc_space_t is) +{ + assert(!is_growing(is)); + OSBitOrAtomic(IS_GROWING, &is->is_bits); +} + +static inline void +is_done_growing(ipc_space_t is) +{ + assert(is_growing(is)); + OSBitAndAtomic(~IS_GROWING, &is->is_bits); +} + extern zone_t ipc_space_zone; #define is_alloc() ((ipc_space_t) zalloc(ipc_space_zone)) @@ -141,62 +158,52 @@ extern ipc_space_t ipc_space_reply; #if DIPC extern ipc_space_t ipc_space_remote; #endif /* DIPC */ -#if DIPC || MACH_KDB +#if DIPC extern ipc_space_t default_pager_space; -#endif /* DIPC || MACH_KDB */ - -#define is_fast_space(is) ((is)->is_fast) - -#define is_ref_lock_init(is) lck_mtx_init(&(is)->is_ref_lock_data, &ipc_lck_grp, &ipc_lck_attr) -#define 
is_ref_lock_destroy(is) lck_mtx_destroy(&(is)->is_ref_lock_data, &ipc_lck_grp) - -#define ipc_space_reference_macro(is) \ -MACRO_BEGIN \ - lck_mtx_lock(&(is)->is_ref_lock_data); \ - assert((is)->is_references > 0); \ - (is)->is_references++; \ - lck_mtx_unlock(&(is)->is_ref_lock_data); \ -MACRO_END - -#define ipc_space_release_macro(is) \ -MACRO_BEGIN \ - ipc_space_refs_t _refs; \ - \ - lck_mtx_lock(&(is)->is_ref_lock_data); \ - assert((is)->is_references > 0); \ - _refs = --(is)->is_references; \ - lck_mtx_unlock(&(is)->is_ref_lock_data); \ - \ - if (_refs == 0) { \ - is_lock_destroy(is); \ - is_ref_lock_destroy(is); \ - is_free(is); \ - } \ -MACRO_END - -#define is_lock_init(is) lck_mtx_init(&(is)->is_lock_data, &ipc_lck_grp, &ipc_lck_attr) -#define is_lock_destroy(is) lck_mtx_destroy(&(is)->is_lock_data, &ipc_lck_grp) - -#define is_read_lock(is) lck_mtx_lock(&(is)->is_lock_data) -#define is_read_unlock(is) lck_mtx_unlock(&(is)->is_lock_data) -#define is_read_sleep(is) lck_mtx_sleep(&(is)->is_lock_data, \ +#endif /* DIPC */ + +extern lck_grp_t ipc_lck_grp; +extern lck_attr_t ipc_lck_attr; + +#define is_lock_init(is) lck_spin_init(&(is)->is_lock_data, &ipc_lck_grp, &ipc_lck_attr) +#define is_lock_destroy(is) lck_spin_destroy(&(is)->is_lock_data, &ipc_lck_grp) + +#define is_read_lock(is) lck_spin_lock(&(is)->is_lock_data) +#define is_read_unlock(is) lck_spin_unlock(&(is)->is_lock_data) +#define is_read_sleep(is) lck_spin_sleep(&(is)->is_lock_data, \ LCK_SLEEP_DEFAULT, \ (event_t)(is), \ THREAD_UNINT) -#define is_write_lock(is) lck_mtx_lock(&(is)->is_lock_data) -#define is_write_lock_try(is) lck_mtx_try_lock(&(is)->is_lock_data) -#define is_write_unlock(is) lck_mtx_unlock(&(is)->is_lock_data) -#define is_write_sleep(is) lck_mtx_sleep(&(is)->is_lock_data, \ +#define is_write_lock(is) lck_spin_lock(&(is)->is_lock_data) +#define is_write_lock_try(is) lck_spin_try_lock(&(is)->is_lock_data) +#define is_write_unlock(is) lck_spin_unlock(&(is)->is_lock_data) +#define is_write_sleep(is) lck_spin_sleep(&(is)->is_lock_data, \ LCK_SLEEP_DEFAULT, \ (event_t)(is), \ THREAD_UNINT) -#define is_reference(is) ipc_space_reference(is) -#define is_release(is) ipc_space_release(is) +#define is_refs(is) ((is)->is_bits & IS_REFS_MAX) + +static inline void +is_reference(ipc_space_t is) +{ + assert(is_refs(is) > 0 && is_refs(is) < IS_REFS_MAX); + OSIncrementAtomic(&(is->is_bits)); +} + -#define is_write_to_read_lock(is) +static inline void +is_release(ipc_space_t is) { + assert(is_refs(is) > 0); + /* If we just removed the last reference count */ + if ( 1 == (OSDecrementAtomic(&(is->is_bits)) & IS_REFS_MAX)) { + is_lock_destroy(is); + is_free(is); + } +} + #define current_space_fast() (current_task_fast()->itk_space) #define current_space() (current_space_fast()) @@ -210,7 +217,7 @@ extern kern_return_t ipc_space_create( ipc_space_t *spacep); /* Mark a space as dead and cleans up the entries*/ -extern void ipc_space_destroy( +extern void ipc_space_terminate( ipc_space_t space); /* Clean up the entries - but leave the space alive */ diff --git a/osfmk/ipc/ipc_splay.c b/osfmk/ipc/ipc_splay.c deleted file mode 100644 index e0fec7699..000000000 --- a/osfmk/ipc/ipc_splay.c +++ /dev/null @@ -1,950 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). 
You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * File: ipc/ipc_splay.c - * Author: Rich Draves - * Date: 1989 - * - * Primitive splay tree operations. - */ - -#include -#include -#include -#include -#include - -/* - * Splay trees are self-adjusting binary search trees. - * They have the following attractive properties: - * 1) Space efficient; only two pointers per entry. - * 2) Robust performance; amortized O(log n) per operation. - * 3) Recursion not needed. - * This makes them a good fall-back data structure for those - * entries that don't fit into the lookup table. - * - * The paper by Sleator and Tarjan, JACM v. 32, no. 3, pp. 652-686, - * describes the splaying operation. ipc_splay_prim_lookup - * and ipc_splay_prim_assemble implement the top-down splay - * described on p. 669. - * - * The tree is stored in an unassembled form. If ist_root is null, - * then the tree has no entries. Otherwise, ist_name records - * the value used for the last lookup. ist_root points to the - * middle tree obtained from the top-down splay. ist_ltree and - * ist_rtree point to left and right subtrees, whose entries - * are all smaller (larger) than those in the middle tree. - * ist_ltreep and ist_rtreep are pointers to fields in the - * left and right subtrees. ist_ltreep points to the rchild field - * of the largest entry in ltree, and ist_rtreep points to the - * lchild field of the smallest entry in rtree. 
The pointed-to - * fields aren't initialized. If the left (right) subtree is null, - * then ist_ltreep (ist_rtreep) points to the ist_ltree (ist_rtree) - * field in the splay structure itself. - * - * The primary advantage of the unassembled form is that repeated - * unsuccessful lookups are efficient. In particular, an unsuccessful - * lookup followed by an insert only requires one splaying operation. - * - * The traversal algorithm works via pointer inversion. - * When descending down the tree, child pointers are reversed - * to point back to the parent entry. When ascending, - * the pointers are restored to their original value. - * - * The biggest potential problem with the splay tree implementation - * is that the operations, even lookup, require an exclusive lock. - * If IPC spaces are protected with exclusive locks, then - * the splay tree doesn't require its own lock, and ist_lock/ist_unlock - * needn't do anything. If IPC spaces are protected with read/write - * locks then ist_lock/ist_unlock should provide exclusive access. - * - * If it becomes important to let lookups run in parallel, - * or if the restructuring makes lookups too expensive, then - * there is hope. Use a read/write lock on the splay tree. - * Keep track of the number of entries in the tree. When doing - * a lookup, first try a non-restructuring lookup with a read lock held, - * with a bound (based on log of size of the tree) on the number of - * entries to traverse. If the lookup runs up against the bound, - * then take a write lock and do a reorganizing lookup. - * This way, if lookups only access roughly balanced parts - * of the tree, then lookups run in parallel and do no restructuring. - * - * The traversal algorithm currently requires an exclusive lock. - * If that is a problem, the tree could be changed from an lchild/rchild - * representation to a leftmost child/right sibling representation. - * In conjunction with non-restructing lookups, this would let - * lookups and traversals all run in parallel. But this representation - * is more complicated and would slow down the operations. - */ - -/* - * Boundary values to hand to ipc_splay_prim_lookup: - */ - -#define MACH_PORT_SMALLEST ((mach_port_name_t) 0) -#define MACH_PORT_LARGEST ((mach_port_name_t) ~0) - -/* - * Routine: ipc_splay_prim_lookup - * Purpose: - * Searches for the node labeled name in the splay tree. - * Returns three nodes (treep, ltreep, rtreep) and - * two pointers to nodes (ltreepp, rtreepp). - * - * ipc_splay_prim_lookup splits the supplied tree into - * three subtrees, left, middle, and right, returned - * in ltreep, treep, and rtreep. - * - * If name is present in the tree, then it is at - * the root of the middle tree. Otherwise, the root - * of the middle tree is the last node traversed. - * - * ipc_splay_prim_lookup returns a pointer into - * the left subtree, to the rchild field of its - * largest node, in ltreepp. It returns a pointer - * into the right subtree, to the lchild field of its - * smallest node, in rtreepp. 
- */ - -static void -ipc_splay_prim_lookup( - mach_port_name_t name, - ipc_tree_entry_t tree, - ipc_tree_entry_t *treep, - ipc_tree_entry_t *ltreep, - ipc_tree_entry_t **ltreepp, - ipc_tree_entry_t *rtreep, - ipc_tree_entry_t **rtreepp) -{ - mach_port_name_t tname; /* temp name */ - ipc_tree_entry_t lchild, rchild; /* temp child pointers */ - - assert(tree != ITE_NULL); - -#define link_left \ -MACRO_BEGIN \ - *ltreep = tree; \ - ltreep = &tree->ite_rchild; \ - tree = *ltreep; \ -MACRO_END - -#define link_right \ -MACRO_BEGIN \ - *rtreep = tree; \ - rtreep = &tree->ite_lchild; \ - tree = *rtreep; \ -MACRO_END - -#define rotate_left \ -MACRO_BEGIN \ - ipc_tree_entry_t temp = tree; \ - \ - tree = temp->ite_rchild; \ - temp->ite_rchild = tree->ite_lchild; \ - tree->ite_lchild = temp; \ -MACRO_END - -#define rotate_right \ -MACRO_BEGIN \ - ipc_tree_entry_t temp = tree; \ - \ - tree = temp->ite_lchild; \ - temp->ite_lchild = tree->ite_rchild; \ - tree->ite_rchild = temp; \ -MACRO_END - - while (name != (tname = tree->ite_name)) { - if (name < tname) { - /* descend to left */ - - lchild = tree->ite_lchild; - if (lchild == ITE_NULL) - break; - tname = lchild->ite_name; - - if ((name < tname) && - (lchild->ite_lchild != ITE_NULL)) - rotate_right; - link_right; - if ((name > tname) && - (lchild->ite_rchild != ITE_NULL)) - link_left; - } else { - /* descend to right */ - - rchild = tree->ite_rchild; - if (rchild == ITE_NULL) - break; - tname = rchild->ite_name; - - if ((name > tname) && - (rchild->ite_rchild != ITE_NULL)) - rotate_left; - link_left; - if ((name < tname) && - (rchild->ite_lchild != ITE_NULL)) - link_right; - } - - assert(tree != ITE_NULL); - } - - *treep = tree; - *ltreepp = ltreep; - *rtreepp = rtreep; - -#undef link_left -#undef link_right -#undef rotate_left -#undef rotate_right -} - -/* - * Routine: ipc_splay_prim_assemble - * Purpose: - * Assembles the results of ipc_splay_prim_lookup - * into a splay tree with the found node at the root. - * - * ltree and rtree are by-reference so storing - * through ltreep and rtreep can change them. - */ - -static void -ipc_splay_prim_assemble( - ipc_tree_entry_t tree, - ipc_tree_entry_t *ltree, - ipc_tree_entry_t *ltreep, - ipc_tree_entry_t *rtree, - ipc_tree_entry_t *rtreep) -{ - assert(tree != ITE_NULL); - - *ltreep = tree->ite_lchild; - *rtreep = tree->ite_rchild; - - tree->ite_lchild = *ltree; - tree->ite_rchild = *rtree; -} - -/* - * Routine: ipc_splay_tree_init - * Purpose: - * Initialize a raw splay tree for use. - */ - -void -ipc_splay_tree_init( - ipc_splay_tree_t splay) -{ - splay->ist_root = ITE_NULL; -} - -/* - * Routine: ipc_splay_tree_pick - * Purpose: - * Picks and returns a random entry in a splay tree. - * Returns FALSE if the splay tree is empty. - */ - -boolean_t -ipc_splay_tree_pick( - ipc_splay_tree_t splay, - mach_port_name_t *namep, - ipc_tree_entry_t *entryp) -{ - ipc_tree_entry_t root; - - ist_lock(splay); - - root = splay->ist_root; - if (root != ITE_NULL) { - *namep = root->ite_name; - *entryp = root; - } - - ist_unlock(splay); - - return root != ITE_NULL; -} - -/* - * Routine: ipc_splay_tree_lookup - * Purpose: - * Finds an entry in a splay tree. - * Returns ITE_NULL if not found. 
- */ - -ipc_tree_entry_t -ipc_splay_tree_lookup( - ipc_splay_tree_t splay, - mach_port_name_t name) -{ - ipc_tree_entry_t root; - - ist_lock(splay); - - root = splay->ist_root; - if (root != ITE_NULL) { - if (splay->ist_name != name) { - ipc_splay_prim_assemble(root, - &splay->ist_ltree, splay->ist_ltreep, - &splay->ist_rtree, splay->ist_rtreep); - ipc_splay_prim_lookup(name, root, &root, - &splay->ist_ltree, &splay->ist_ltreep, - &splay->ist_rtree, &splay->ist_rtreep); - splay->ist_name = name; - splay->ist_root = root; - } - - if (name != root->ite_name) - root = ITE_NULL; - } - - ist_unlock(splay); - - return root; -} - -/* - * Routine: ipc_splay_tree_insert - * Purpose: - * Inserts a new entry into a splay tree. - * The caller supplies a new entry. - * The name can't already be present in the tree. - */ - -void -ipc_splay_tree_insert( - ipc_splay_tree_t splay, - mach_port_name_t name, - ipc_tree_entry_t entry) -{ - ipc_tree_entry_t root; - - assert(entry != ITE_NULL); - - ist_lock(splay); - - root = splay->ist_root; - if (root == ITE_NULL) { - entry->ite_lchild = ITE_NULL; - entry->ite_rchild = ITE_NULL; - } else { - if (splay->ist_name != name) { - ipc_splay_prim_assemble(root, - &splay->ist_ltree, splay->ist_ltreep, - &splay->ist_rtree, splay->ist_rtreep); - ipc_splay_prim_lookup(name, root, &root, - &splay->ist_ltree, &splay->ist_ltreep, - &splay->ist_rtree, &splay->ist_rtreep); - } - - assert(root->ite_name != name); - - if (name < root->ite_name) { - assert(root->ite_lchild == ITE_NULL); - - *splay->ist_ltreep = ITE_NULL; - *splay->ist_rtreep = root; - } else { - assert(root->ite_rchild == ITE_NULL); - - *splay->ist_ltreep = root; - *splay->ist_rtreep = ITE_NULL; - } - - entry->ite_lchild = splay->ist_ltree; - entry->ite_rchild = splay->ist_rtree; - } - - entry->ite_name = name; - splay->ist_root = entry; - splay->ist_name = name; - splay->ist_ltreep = &splay->ist_ltree; - splay->ist_rtreep = &splay->ist_rtree; - - ist_unlock(splay); -} - -/* - * Routine: ipc_splay_tree_delete - * Purpose: - * Deletes an entry from a splay tree. - * The name must be present in the tree. - * Frees the entry. - * - * The "entry" argument isn't currently used. - * Other implementations might want it, though. - */ - -void -ipc_splay_tree_delete( - ipc_splay_tree_t splay, - mach_port_name_t name, - __assert_only ipc_tree_entry_t entry) -{ - ipc_tree_entry_t root, saved; - - ist_lock(splay); - - root = splay->ist_root; - assert(root != ITE_NULL); - - if (splay->ist_name != name) { - ipc_splay_prim_assemble(root, - &splay->ist_ltree, splay->ist_ltreep, - &splay->ist_rtree, splay->ist_rtreep); - ipc_splay_prim_lookup(name, root, &root, - &splay->ist_ltree, &splay->ist_ltreep, - &splay->ist_rtree, &splay->ist_rtreep); - } - - assert(root->ite_name == name); - assert(root == entry); - - *splay->ist_ltreep = root->ite_lchild; - *splay->ist_rtreep = root->ite_rchild; - ite_free(root); - - root = splay->ist_ltree; - saved = splay->ist_rtree; - - if (root == ITE_NULL) - root = saved; - else if (saved != ITE_NULL) { - /* - * Find the largest node in the left subtree, and splay it - * to the root. Then add the saved right subtree. 
- */ - - ipc_splay_prim_lookup(MACH_PORT_LARGEST, root, &root, - &splay->ist_ltree, &splay->ist_ltreep, - &splay->ist_rtree, &splay->ist_rtreep); - ipc_splay_prim_assemble(root, - &splay->ist_ltree, splay->ist_ltreep, - &splay->ist_rtree, splay->ist_rtreep); - - assert(root->ite_rchild == ITE_NULL); - root->ite_rchild = saved; - } - - splay->ist_root = root; - if (root != ITE_NULL) { - splay->ist_name = root->ite_name; - splay->ist_ltreep = &splay->ist_ltree; - splay->ist_rtreep = &splay->ist_rtree; - } - - ist_unlock(splay); -} - -/* - * Routine: ipc_splay_tree_split - * Purpose: - * Split a splay tree. Puts all entries smaller than "name" - * into a new tree, "small". - * - * Doesn't do locking on "small", because nobody else - * should be fiddling with the uninitialized tree. - */ - -void -ipc_splay_tree_split( - ipc_splay_tree_t splay, - mach_port_name_t name, - ipc_splay_tree_t small) -{ - ipc_tree_entry_t root; - - ipc_splay_tree_init(small); - - ist_lock(splay); - - root = splay->ist_root; - if (root != ITE_NULL) { - /* lookup name, to get it (or last traversed) to the top */ - - if (splay->ist_name != name) { - ipc_splay_prim_assemble(root, - &splay->ist_ltree, splay->ist_ltreep, - &splay->ist_rtree, splay->ist_rtreep); - ipc_splay_prim_lookup(name, root, &root, - &splay->ist_ltree, &splay->ist_ltreep, - &splay->ist_rtree, &splay->ist_rtreep); - } - - if (root->ite_name < name) { - /* root goes into small */ - - *splay->ist_ltreep = root->ite_lchild; - *splay->ist_rtreep = ITE_NULL; - root->ite_lchild = splay->ist_ltree; - assert(root->ite_rchild == ITE_NULL); - - small->ist_root = root; - small->ist_name = root->ite_name; - small->ist_ltreep = &small->ist_ltree; - small->ist_rtreep = &small->ist_rtree; - - /* rtree goes into splay */ - - root = splay->ist_rtree; - splay->ist_root = root; - if (root != ITE_NULL) { - splay->ist_name = root->ite_name; - splay->ist_ltreep = &splay->ist_ltree; - splay->ist_rtreep = &splay->ist_rtree; - } - } else { - /* root stays in splay */ - - *splay->ist_ltreep = root->ite_lchild; - root->ite_lchild = ITE_NULL; - - splay->ist_root = root; - splay->ist_name = name; - splay->ist_ltreep = &splay->ist_ltree; - - /* ltree goes into small */ - - root = splay->ist_ltree; - small->ist_root = root; - if (root != ITE_NULL) { - small->ist_name = root->ite_name; - small->ist_ltreep = &small->ist_ltree; - small->ist_rtreep = &small->ist_rtree; - } - } - } - - ist_unlock(splay); -} - -/* - * Routine: ipc_splay_tree_join - * Purpose: - * Joins two splay trees. Merges the entries in "small", - * which must all be smaller than the entries in "splay", - * into "splay". 
- */ - -void -ipc_splay_tree_join( - ipc_splay_tree_t splay, - ipc_splay_tree_t small) -{ - ipc_tree_entry_t sroot; - - /* pull entries out of small */ - - ist_lock(small); - - sroot = small->ist_root; - if (sroot != ITE_NULL) { - ipc_splay_prim_assemble(sroot, - &small->ist_ltree, small->ist_ltreep, - &small->ist_rtree, small->ist_rtreep); - small->ist_root = ITE_NULL; - } - - ist_unlock(small); - - /* put entries, if any, into splay */ - - if (sroot != ITE_NULL) { - ipc_tree_entry_t root; - - ist_lock(splay); - - root = splay->ist_root; - if (root == ITE_NULL) { - root = sroot; - } else { - /* get smallest entry in splay tree to top */ - - if (splay->ist_name != MACH_PORT_SMALLEST) { - ipc_splay_prim_assemble(root, - &splay->ist_ltree, splay->ist_ltreep, - &splay->ist_rtree, splay->ist_rtreep); - ipc_splay_prim_lookup(MACH_PORT_SMALLEST, - root, &root, - &splay->ist_ltree, &splay->ist_ltreep, - &splay->ist_rtree, &splay->ist_rtreep); - } - - ipc_splay_prim_assemble(root, - &splay->ist_ltree, splay->ist_ltreep, - &splay->ist_rtree, splay->ist_rtreep); - - assert(root->ite_lchild == ITE_NULL); - assert(sroot->ite_name < root->ite_name); - root->ite_lchild = sroot; - } - - splay->ist_root = root; - splay->ist_name = root->ite_name; - splay->ist_ltreep = &splay->ist_ltree; - splay->ist_rtreep = &splay->ist_rtree; - - ist_unlock(splay); - } -} - -/* - * Routine: ipc_splay_tree_bounds - * Purpose: - * Given a name, returns the largest value present - * in the tree that is smaller than or equal to the name, - * or ~0 if no such value exists. Similarly, returns - * the smallest value present that is greater than or - * equal to the name, or 0 if no such value exists. - * - * Hence, if - * lower = upper, then lower = name = upper - * and name is present in the tree - * lower = ~0 and upper = 0, - * then the tree is empty - * lower = ~0 and upper > 0, then name < upper - * and upper is smallest value in tree - * lower < ~0 and upper = 0, then lower < name - * and lower is largest value in tree - * lower < ~0 and upper > 0, then lower < name < upper - * and they are tight bounds on name - * - * (Note MACH_PORT_SMALLEST = 0 and MACH_PORT_LARGEST = ~0.) - */ - -void -ipc_splay_tree_bounds( - ipc_splay_tree_t splay, - mach_port_name_t name, - mach_port_name_t *lowerp, - mach_port_name_t *upperp) -{ - ipc_tree_entry_t root; - - ist_lock(splay); - - root = splay->ist_root; - if (root == ITE_NULL) { - *lowerp = MACH_PORT_LARGEST; - *upperp = MACH_PORT_SMALLEST; - } else { - mach_port_name_t rname; - - if (splay->ist_name != name) { - ipc_splay_prim_assemble(root, - &splay->ist_ltree, splay->ist_ltreep, - &splay->ist_rtree, splay->ist_rtreep); - ipc_splay_prim_lookup(name, root, &root, - &splay->ist_ltree, &splay->ist_ltreep, - &splay->ist_rtree, &splay->ist_rtreep); - splay->ist_name = name; - splay->ist_root = root; - } - - rname = root->ite_name; - - /* - * OK, it's a hack. We convert the ltreep and rtreep - * pointers back into real entry pointers, - * so we can pick the names out of the entries. 
- */ - - if (rname <= name) - *lowerp = rname; - else if (splay->ist_ltreep == &splay->ist_ltree) - *lowerp = MACH_PORT_LARGEST; - else { - ipc_tree_entry_t entry; - - entry = (ipc_tree_entry_t) - ((char *)splay->ist_ltreep - - ((char *)&root->ite_rchild - - (char *)root)); - *lowerp = entry->ite_name; - } - - if (rname >= name) - *upperp = rname; - else if (splay->ist_rtreep == &splay->ist_rtree) - *upperp = MACH_PORT_SMALLEST; - else { - ipc_tree_entry_t entry; - - entry = (ipc_tree_entry_t) - ((char *)splay->ist_rtreep - - ((char *)&root->ite_lchild - - (char *)root)); - *upperp = entry->ite_name; - } - } - - ist_unlock(splay); -} - -/* - * Routine: ipc_splay_traverse_start - * Routine: ipc_splay_traverse_next - * Routine: ipc_splay_traverse_finish - * Purpose: - * Perform a symmetric order traversal of a splay tree. - * Usage: - * for (entry = ipc_splay_traverse_start(splay); - * entry != ITE_NULL; - * entry = ipc_splay_traverse_next(splay, delete)) { - * do something with entry - * } - * ipc_splay_traverse_finish(splay); - * - * If "delete" is TRUE, then the current entry - * is removed from the tree and deallocated. - * - * During the traversal, the splay tree is locked. - */ - -ipc_tree_entry_t -ipc_splay_traverse_start( - ipc_splay_tree_t splay) -{ - ipc_tree_entry_t current, parent; - - ist_lock(splay); - - current = splay->ist_root; - if (current != ITE_NULL) { - ipc_splay_prim_assemble(current, - &splay->ist_ltree, splay->ist_ltreep, - &splay->ist_rtree, splay->ist_rtreep); - - parent = ITE_NULL; - - while (current->ite_lchild != ITE_NULL) { - ipc_tree_entry_t next; - - next = current->ite_lchild; - current->ite_lchild = parent; - parent = current; - current = next; - } - - splay->ist_ltree = current; - splay->ist_rtree = parent; - } - - return current; -} - -ipc_tree_entry_t -ipc_splay_traverse_next( - ipc_splay_tree_t splay, - boolean_t delete) -{ - ipc_tree_entry_t current, parent; - - /* pick up where traverse_entry left off */ - - current = splay->ist_ltree; - parent = splay->ist_rtree; - assert(current != ITE_NULL); - - if (!delete) - goto traverse_right; - - /* we must delete current and patch the tree */ - - if (current->ite_lchild == ITE_NULL) { - if (current->ite_rchild == ITE_NULL) { - /* like traverse_back, but with deletion */ - - if (parent == ITE_NULL) { - ite_free(current); - - splay->ist_root = ITE_NULL; - return ITE_NULL; - } - - if (current->ite_name < parent->ite_name) { - ite_free(current); - - current = parent; - parent = current->ite_lchild; - current->ite_lchild = ITE_NULL; - goto traverse_entry; - } else { - ite_free(current); - - current = parent; - parent = current->ite_rchild; - current->ite_rchild = ITE_NULL; - goto traverse_back; - } - } else { - ipc_tree_entry_t prev; - - prev = current; - current = current->ite_rchild; - ite_free(prev); - goto traverse_left; - } - } else { - if (current->ite_rchild == ITE_NULL) { - ipc_tree_entry_t prev; - - prev = current; - current = current->ite_lchild; - ite_free(prev); - goto traverse_back; - } else { - ipc_tree_entry_t prev; - ipc_tree_entry_t ltree, rtree; - ipc_tree_entry_t *ltreep, *rtreep; - - /* replace current with largest of left children */ - - prev = current; - ipc_splay_prim_lookup(MACH_PORT_LARGEST, - current->ite_lchild, ¤t, - <ree, <reep, &rtree, &rtreep); - ipc_splay_prim_assemble(current, - <ree, ltreep, &rtree, rtreep); - - assert(current->ite_rchild == ITE_NULL); - current->ite_rchild = prev->ite_rchild; - ite_free(prev); - goto traverse_right; - } - } - /*NOTREACHED*/ - - /* - * A state 
machine: for each entry, we - * 1) traverse left subtree - * 2) traverse the entry - * 3) traverse right subtree - * 4) traverse back to parent - */ - - traverse_left: - if (current->ite_lchild != ITE_NULL) { - ipc_tree_entry_t next; - - next = current->ite_lchild; - current->ite_lchild = parent; - parent = current; - current = next; - goto traverse_left; - } - - traverse_entry: - splay->ist_ltree = current; - splay->ist_rtree = parent; - return current; - - traverse_right: - if (current->ite_rchild != ITE_NULL) { - ipc_tree_entry_t next; - - next = current->ite_rchild; - current->ite_rchild = parent; - parent = current; - current = next; - goto traverse_left; - } - - traverse_back: - if (parent == ITE_NULL) { - splay->ist_root = current; - return ITE_NULL; - } - - if (current->ite_name < parent->ite_name) { - ipc_tree_entry_t prev; - - prev = current; - current = parent; - parent = current->ite_lchild; - current->ite_lchild = prev; - goto traverse_entry; - } else { - ipc_tree_entry_t prev; - - prev = current; - current = parent; - parent = current->ite_rchild; - current->ite_rchild = prev; - goto traverse_back; - } -} - -void -ipc_splay_traverse_finish( - ipc_splay_tree_t splay) -{ - ipc_tree_entry_t root; - - root = splay->ist_root; - if (root != ITE_NULL) { - splay->ist_name = root->ite_name; - splay->ist_ltreep = &splay->ist_ltree; - splay->ist_rtreep = &splay->ist_rtree; - } - - ist_unlock(splay); -} - diff --git a/osfmk/ipc/ipc_splay.h b/osfmk/ipc/ipc_splay.h deleted file mode 100644 index 03cfe0d70..000000000 --- a/osfmk/ipc/ipc_splay.h +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * File: ipc/ipc_splay.h - * Author: Rich Draves - * Date: 1989 - * - * Declarations of primitive splay tree operations. - */ - -#ifndef _IPC_IPC_SPLAY_H_ -#define _IPC_IPC_SPLAY_H_ - -#include -#include -#include -#include - -typedef struct ipc_splay_tree { - mach_port_name_t ist_name; /* name used in last lookup */ - ipc_tree_entry_t ist_root; /* root of middle tree */ - ipc_tree_entry_t ist_ltree; /* root of left tree */ - ipc_tree_entry_t *ist_ltreep; /* pointer into left tree */ - ipc_tree_entry_t ist_rtree; /* root of right tree */ - ipc_tree_entry_t *ist_rtreep; /* pointer into right tree */ -} *ipc_splay_tree_t; - -#define ist_lock(splay) /* no locking */ -#define ist_unlock(splay) /* no locking */ - -/* Initialize a raw splay tree */ -extern void ipc_splay_tree_init( - ipc_splay_tree_t splay); - -/* Pick a random entry in a splay tree */ -extern boolean_t ipc_splay_tree_pick( - ipc_splay_tree_t splay, - mach_port_name_t *namep, - ipc_tree_entry_t *entryp); - -/* Find an entry in a splay tree */ -extern ipc_tree_entry_t ipc_splay_tree_lookup( - ipc_splay_tree_t splay, - mach_port_name_t name); - -/* Insert a new entry into a splay tree */ -extern void ipc_splay_tree_insert( - ipc_splay_tree_t splay, - mach_port_name_t name, - ipc_tree_entry_t entry); - -/* Delete an entry from a splay tree */ -extern void ipc_splay_tree_delete( - ipc_splay_tree_t splay, - mach_port_name_t name, - ipc_tree_entry_t entry); - -/* Split a splay tree */ -extern void ipc_splay_tree_split( - ipc_splay_tree_t splay, - mach_port_name_t name, - ipc_splay_tree_t entry); - -/* Join two splay trees */ -extern void ipc_splay_tree_join( - ipc_splay_tree_t splay, - ipc_splay_tree_t small); - -/* Do a bounded splay tree lookup */ -extern void ipc_splay_tree_bounds( - ipc_splay_tree_t splay, - mach_port_name_t name, - mach_port_name_t *lowerp, - mach_port_name_t *upperp); - -/* Initialize a symmetric order traversal of a splay tree */ -extern ipc_tree_entry_t ipc_splay_traverse_start( - ipc_splay_tree_t splay); - -/* Return the next entry in a symmetric order traversal of a splay tree */ -extern ipc_tree_entry_t ipc_splay_traverse_next( - ipc_splay_tree_t splay, - boolean_t delete); - -/* Terminate a symmetric order traversal of a splay tree */ -extern void ipc_splay_traverse_finish( - ipc_splay_tree_t splay); - -#endif /* _IPC_IPC_SPLAY_H_ */ diff --git a/osfmk/ipc/ipc_table.c b/osfmk/ipc/ipc_table.c index 4e19f8844..1add1f5b3 100644 --- a/osfmk/ipc/ipc_table.c +++ b/osfmk/ipc/ipc_table.c @@ -187,33 +187,6 @@ ipc_table_alloc( return (void *)table; } -/* - * Routine: ipc_table_realloc - * Purpose: - * Reallocate a big table. - * - * The new table remaps the old table, - * so copying is not necessary. - * Conditions: - * Only works for page-size or bigger tables. - * May block. 
- */ - -void * -ipc_table_realloc( - vm_size_t old_size, - void * old_table, - vm_size_t new_size) -{ - vm_offset_t new_table; - - if (kmem_realloc(kalloc_map, - (vm_offset_t) old_table, old_size, - &new_table, new_size) != KERN_SUCCESS) - new_table = 0; - - return (void *)new_table; -} /* * Routine: ipc_table_free diff --git a/osfmk/ipc/ipc_table.h b/osfmk/ipc/ipc_table.h index fee56f778..a310197e0 100644 --- a/osfmk/ipc/ipc_table.h +++ b/osfmk/ipc/ipc_table.h @@ -112,56 +112,30 @@ extern ipc_table_size_t ipc_table_requests; extern void ipc_table_init(void) __attribute__((section("__TEXT, initcode"))); /* - * Note that ipc_table_alloc, ipc_table_realloc, and ipc_table_free - * all potentially use the VM system. Hence simple locks can't + * Note that ipc_table_alloc and ipc_table_free + * potentially use the VM system. Hence simple locks can't * be held across them. - * - * We can't use a copying realloc, because the realloc happens - * with the data unlocked. ipc_table_realloc remaps the data, - * so it is OK. */ /* Allocate a table */ extern void * ipc_table_alloc( vm_size_t size); -/* Reallocate a big table */ -extern void * ipc_table_realloc( - vm_size_t old_size, - void * old_table, - vm_size_t new_size); - /* Free a table */ extern void ipc_table_free( vm_size_t size, void * table); -#define it_entries_reallocable(its) \ - ((its)->its_size * sizeof(struct ipc_entry) >= PAGE_SIZE) - #define it_entries_alloc(its) \ ((ipc_entry_t) \ - ipc_table_alloc(it_entries_reallocable(its) ? \ - round_page((its)->its_size * sizeof(struct ipc_entry)) : \ - (its)->its_size * sizeof(struct ipc_entry) \ - )) - -#define it_entries_realloc(its, table, nits) \ - ((ipc_entry_t) \ - ipc_table_realloc( \ - round_page((its)->its_size * sizeof(struct ipc_entry)), \ - (void *)(table), \ - round_page((nits)->its_size * sizeof(struct ipc_entry)) \ - )) + ipc_table_alloc((its)->its_size * sizeof(struct ipc_entry))) #define it_entries_free(its, table) \ - ipc_table_free(it_entries_reallocable(its) ? 
\ - round_page((its)->its_size * sizeof(struct ipc_entry)) : \ - (its)->its_size * sizeof(struct ipc_entry), \ - (void *)(table) \ - ) + ipc_table_free((its)->its_size * sizeof(struct ipc_entry), \ + (void *)(table)) + -#define it_requests_alloc(its) \ +#define it_requests_alloc(its) \ ((ipc_port_request_t) \ ipc_table_alloc((its)->its_size * \ sizeof(struct ipc_port_request))) diff --git a/osfmk/ipc/ipc_types.h b/osfmk/ipc/ipc_types.h index 5857e5ecf..b601a2e69 100644 --- a/osfmk/ipc/ipc_types.h +++ b/osfmk/ipc/ipc_types.h @@ -55,16 +55,16 @@ typedef mach_port_name_t mach_port_index_t; /* index values */ typedef mach_port_name_t mach_port_gen_t; /* generation numbers */ typedef struct ipc_entry *ipc_entry_t; -typedef struct ipc_tree_entry *ipc_tree_entry_t; + typedef struct ipc_table_size *ipc_table_size_t; typedef struct ipc_port_request *ipc_port_request_t; typedef struct ipc_pset *ipc_pset_t; typedef struct ipc_kmsg *ipc_kmsg_t; #define IE_NULL ((ipc_entry_t) 0) -#define ITE_NULL ((ipc_tree_entry_t) 0) + #define ITS_NULL ((ipc_table_size_t) 0) -#define ITS_SIZE_NONE ((ipc_table_elems_t) -1) +#define ITS_SIZE_NONE ((ipc_table_elems_t) -1) #define IPR_NULL ((ipc_port_request_t) 0) #define IPS_NULL ((ipc_pset_t) 0) #define IKM_NULL ((ipc_kmsg_t) 0) diff --git a/osfmk/ipc/mach_debug.c b/osfmk/ipc/mach_debug.c index f255df6f5..87b7329d1 100644 --- a/osfmk/ipc/mach_debug.c +++ b/osfmk/ipc/mach_debug.c @@ -137,67 +137,6 @@ mach_port_get_srights( } #endif /* MACH_IPC_DEBUG */ -/* - * Routine: host_ipc_hash_info - * Purpose: - * Return information about the global reverse hash table. - * Conditions: - * Nothing locked. Obeys CountInOut protocol. - * Returns: - * KERN_SUCCESS Returned information. - * KERN_INVALID_HOST The host is null. - * KERN_RESOURCE_SHORTAGE Couldn't allocate memory. 
- */ - -#if !MACH_IPC_DEBUG -kern_return_t -host_ipc_hash_info( - __unused host_t host, - __unused hash_info_bucket_array_t *infop, - __unused mach_msg_type_number_t *countp) -{ - return KERN_FAILURE; -} -#else -kern_return_t -host_ipc_hash_info( - host_t host, - hash_info_bucket_array_t *infop, - mach_msg_type_number_t *countp) -{ - vm_map_copy_t copy; - vm_offset_t addr; - vm_size_t size; - hash_info_bucket_t *info; - natural_t count; - kern_return_t kr; - - if (host == HOST_NULL) - return KERN_INVALID_HOST; - - /* start with in-line data */ - - count = ipc_hash_size(); - size = round_page(count * sizeof(hash_info_bucket_t)); - kr = kmem_alloc_pageable(ipc_kernel_map, &addr, size); - if (kr != KERN_SUCCESS) - return KERN_RESOURCE_SHORTAGE; - - info = (hash_info_bucket_t *) addr; - count = ipc_hash_info(info, count); - - if (size > count * sizeof(hash_info_bucket_t)) - bzero((char *)&info[count], size - count * sizeof(hash_info_bucket_t)); - - kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)addr, - (vm_map_size_t)size, TRUE, ©); - assert(kr == KERN_SUCCESS); - - *infop = (hash_info_bucket_t *) copy; - *countp = count; - return KERN_SUCCESS; -} -#endif /* MACH_IPC_DEBUG */ /* * Routine: mach_port_space_info @@ -231,16 +170,12 @@ mach_port_space_info( ipc_info_space_t *infop, ipc_info_name_array_t *tablep, mach_msg_type_number_t *tableCntp, - ipc_info_tree_name_array_t *treep, - mach_msg_type_number_t *treeCntp) + __unused ipc_info_tree_name_array_t *treep, + __unused mach_msg_type_number_t *treeCntp) { ipc_info_name_t *table_info; vm_offset_t table_addr; vm_size_t table_size, table_size_needed; - ipc_info_tree_name_t *tree_info; - vm_offset_t tree_addr; - vm_size_t tree_size, tree_size_needed; - ipc_tree_entry_t tentry; ipc_entry_t table; ipc_entry_num_t tsize; mach_port_index_t index; @@ -254,28 +189,21 @@ mach_port_space_info( /* start with in-line memory */ table_size = 0; - tree_size = 0; for (;;) { is_read_lock(space); - if (!space->is_active) { + if (!is_active(space)) { is_read_unlock(space); if (table_size != 0) kmem_free(ipc_kernel_map, table_addr, table_size); - if (tree_size != 0) - kmem_free(ipc_kernel_map, - tree_addr, tree_size); return KERN_INVALID_TASK; } table_size_needed = round_page(space->is_table_size * sizeof(ipc_info_name_t)); - tree_size_needed = round_page(space->is_tree_total - * sizeof(ipc_info_tree_name_t)); - if ((table_size_needed == table_size) && - (tree_size_needed == tree_size)) + if (table_size_needed == table_size) break; is_read_unlock(space); @@ -285,23 +213,11 @@ mach_port_space_info( kmem_free(ipc_kernel_map, table_addr, table_size); kr = kmem_alloc(ipc_kernel_map, &table_addr, table_size_needed); if (kr != KERN_SUCCESS) { - if (tree_size != 0) - kmem_free(ipc_kernel_map, tree_addr, tree_size); return KERN_RESOURCE_SHORTAGE; } table_size = table_size_needed; } - if (tree_size != tree_size_needed) { - if (tree_size != 0) - kmem_free(ipc_kernel_map, tree_addr, tree_size); - kr = kmem_alloc(ipc_kernel_map, &tree_addr, tree_size_needed); - if (kr != KERN_SUCCESS) { - if (table_size != 0) - kmem_free(ipc_kernel_map, table_addr, table_size); - return KERN_RESOURCE_SHORTAGE; - } - tree_size = tree_size_needed; - } + } /* space is read-locked and active; we have enough wired memory */ @@ -309,9 +225,6 @@ mach_port_space_info( infop->iis_genno_mask = MACH_PORT_NGEN(MACH_PORT_DEAD); infop->iis_table_size = space->is_table_size; infop->iis_table_next = space->is_table_next->its_size; - infop->iis_tree_size = space->is_tree_total; - infop->iis_tree_small = 
space->is_tree_small; - infop->iis_tree_hash = space->is_tree_hash; /* walk the table for this space */ table = space->is_table; @@ -324,7 +237,6 @@ mach_port_space_info( bits = entry->ie_bits; iin->iin_name = MACH_PORT_MAKE(index, IE_BITS_GEN(bits)); - iin->iin_collision = (bits & IE_BITS_COLLISION) ? TRUE : FALSE; iin->iin_type = IE_BITS_TYPE(bits); if ((entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) != MACH_PORT_TYPE_NONE && entry->ie_request != IE_REQ_NONE) { @@ -342,48 +254,6 @@ mach_port_space_info( iin->iin_hash = entry->ie_index; } - /* walk the splay tree for this space */ - tree_info = (ipc_info_tree_name_array_t)tree_addr; - for (tentry = ipc_splay_traverse_start(&space->is_tree), index = 0; - tentry != ITE_NULL; - tentry = ipc_splay_traverse_next(&space->is_tree, FALSE)) { - ipc_info_tree_name_t *iitn = &tree_info[index++]; - ipc_info_name_t *iin = &iitn->iitn_name; - ipc_entry_t entry = &tentry->ite_entry; - ipc_entry_bits_t bits = entry->ie_bits; - - assert(IE_BITS_TYPE(bits) != MACH_PORT_TYPE_NONE); - - iin->iin_name = tentry->ite_name; - iin->iin_collision = (bits & IE_BITS_COLLISION) ? TRUE : FALSE; - iin->iin_type = IE_BITS_TYPE(bits); - if ((entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) != MACH_PORT_TYPE_NONE && - entry->ie_request != IE_REQ_NONE) { - ipc_port_t port = (ipc_port_t) entry->ie_object; - - assert(IP_VALID(port)); - ip_lock(port); - iin->iin_type |= ipc_port_request_type(port, iin->iin_name, entry->ie_request); - ip_unlock(port); - } - - iin->iin_urefs = IE_BITS_UREFS(bits); - iin->iin_object = (natural_t)(uintptr_t)entry->ie_object; - iin->iin_next = entry->ie_next; - iin->iin_hash = entry->ie_index; - - if (tentry->ite_lchild == ITE_NULL) - iitn->iitn_lchild = MACH_PORT_NULL; - else - iitn->iitn_lchild = tentry->ite_lchild->ite_name; - - if (tentry->ite_rchild == ITE_NULL) - iitn->iitn_rchild = MACH_PORT_NULL; - else - iitn->iitn_rchild = tentry->ite_rchild->ite_name; - - } - ipc_splay_traverse_finish(&space->is_tree); is_read_unlock(space); /* prepare the table out-of-line data for return */ @@ -405,24 +275,9 @@ mach_port_space_info( *tableCntp = 0; } - /* prepare the tree out-of-line data for return */ - if (tree_size > 0) { - if (tree_size > infop->iis_tree_size * sizeof(ipc_info_tree_name_t)) - bzero((char *)&tree_info[infop->iis_tree_size], - tree_size - infop->iis_tree_size * sizeof(ipc_info_tree_name_t)); - - kr = vm_map_unwire(ipc_kernel_map, vm_map_trunc_page(tree_addr), - vm_map_round_page(tree_addr + tree_size), FALSE); - assert(kr == KERN_SUCCESS); - kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)tree_addr, - (vm_map_size_t)tree_size, TRUE, ©); - assert(kr == KERN_SUCCESS); - *treep = (ipc_info_tree_name_t *)copy; - *treeCntp = infop->iis_tree_size; - } else { - *treep = (ipc_info_tree_name_t *)0; - *treeCntp = 0; - } + /* splay tree is obsolete, no work to do... 
*/ + *treep = (ipc_info_tree_name_t *)0; + *treeCntp = 0; return KERN_SUCCESS; } #endif /* MACH_IPC_DEBUG */ @@ -537,6 +392,7 @@ mach_port_kobject( ipc_entry_t entry; ipc_port_t port; kern_return_t kr; + mach_vm_address_t kaddr; if (space == IS_NULL) return KERN_INVALID_TASK; @@ -563,10 +419,15 @@ mach_port_kobject( } *typep = (unsigned int) ip_kotype(port); - *addrp = (mach_vm_address_t)port->ip_kobject; + kaddr = (mach_vm_address_t)port->ip_kobject; ip_unlock(port); - return KERN_SUCCESS; + if (0 != kaddr && is_ipc_kobject(*typep)) + *addrp = VM_KERNEL_ADDRPERM(VM_KERNEL_UNSLIDE(kaddr)); + else + *addrp = 0; + + return KERN_SUCCESS; } #endif /* MACH_IPC_DEBUG */ /* diff --git a/osfmk/ipc/mach_kernelrpc.c b/osfmk/ipc/mach_kernelrpc.c new file mode 100644 index 000000000..75244966f --- /dev/null +++ b/osfmk/ipc/mach_kernelrpc.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int +_kernelrpc_mach_vm_allocate_trap(struct _kernelrpc_mach_vm_allocate_trap_args *args) +{ + mach_vm_offset_t addr; + task_t task = port_name_to_task(args->target); + int rv = MACH_SEND_INVALID_DEST; + + if (task != current_task()) + goto done; + + if (copyin(args->addr, (char *)&addr, sizeof (addr))) + goto done; + + rv = mach_vm_allocate(task->map, &addr, args->size, args->flags); + if (rv == KERN_SUCCESS) + rv = copyout(&addr, args->addr, sizeof (addr)); + +done: + if (task) + task_deallocate(task); + return (rv); +} + +int +_kernelrpc_mach_vm_deallocate_trap(struct _kernelrpc_mach_vm_deallocate_args *args) +{ + task_t task = port_name_to_task(args->target); + int rv = MACH_SEND_INVALID_DEST; + + if (task != current_task()) + goto done; + + rv = mach_vm_deallocate(task->map, args->address, args->size); + +done: + if (task) + task_deallocate(task); + return (rv); +} + +int +_kernelrpc_mach_vm_protect_trap(struct _kernelrpc_mach_vm_protect_args *args) +{ + task_t task = port_name_to_task(args->target); + int rv = MACH_SEND_INVALID_DEST; + + if (task != current_task()) + goto done; + + rv = mach_vm_protect(task->map, args->address, args->size, + args->set_maximum, args->new_protection); + +done: + if (task) + task_deallocate(task); + return (rv); +} + +int +_kernelrpc_mach_port_allocate_trap(struct _kernelrpc_mach_port_allocate_args *args) +{ + task_t task = port_name_to_task(args->target); + mach_port_name_t name; + int rv = MACH_SEND_INVALID_DEST; + + if (task != current_task()) + goto done; + + rv = mach_port_allocate(task->itk_space, args->right, &name); + if (rv == KERN_SUCCESS) + rv = copyout(&name, args->name, sizeof (name)); + + +done: + if (task) + task_deallocate(task); + return (rv); +} + +int +_kernelrpc_mach_port_destroy_trap(struct _kernelrpc_mach_port_destroy_args *args) +{ + task_t task = port_name_to_task(args->target); + int rv = MACH_SEND_INVALID_DEST; + + if (task != current_task()) + goto done; + + rv = mach_port_destroy(task->itk_space, args->name); + +done: + if (task) + task_deallocate(task); + return (rv); +} + +int +_kernelrpc_mach_port_deallocate_trap(struct _kernelrpc_mach_port_deallocate_args *args) +{ + task_t task = port_name_to_task(args->target); + int rv = MACH_SEND_INVALID_DEST; + + if (task != current_task()) + goto done; + + rv = mach_port_deallocate(task->itk_space, args->name); + +done: + if (task) + task_deallocate(task); + return (rv); +} + +int +_kernelrpc_mach_port_mod_refs_trap(struct _kernelrpc_mach_port_mod_refs_args *args) +{ + task_t task = port_name_to_task(args->target); + int rv = MACH_SEND_INVALID_DEST; + + if (task != current_task()) + goto done; + + rv = mach_port_mod_refs(task->itk_space, args->name, args->right, args->delta); + +done: + if (task) + task_deallocate(task); + return (rv); +} + + +int +_kernelrpc_mach_port_move_member_trap(struct _kernelrpc_mach_port_move_member_args *args) +{ + task_t task = port_name_to_task(args->target); + int rv = MACH_SEND_INVALID_DEST; + + if (task != current_task()) + goto done; + + rv = mach_port_move_member(task->itk_space, args->member, args->after); + +done: + if (task) + task_deallocate(task); + return (rv); +} + +int +_kernelrpc_mach_port_insert_right_trap(struct _kernelrpc_mach_port_insert_right_args *args) +{ + task_t task = port_name_to_task(args->target); + ipc_port_t port; + mach_msg_type_name_t disp; + int rv = MACH_SEND_INVALID_DEST; + + if (task 
!= current_task()) + goto done; + + rv = ipc_object_copyin(task->itk_space, args->poly, args->polyPoly, + (ipc_object_t *)&port); + if (rv != KERN_SUCCESS) + goto done; + disp = ipc_object_copyin_type(args->polyPoly); + + rv = mach_port_insert_right(task->itk_space, args->name, port, disp); + +done: + if (task) + task_deallocate(task); + return (rv); +} + +int +_kernelrpc_mach_port_insert_member_trap(struct _kernelrpc_mach_port_insert_member_args *args) +{ + task_t task = port_name_to_task(args->target); + int rv = MACH_SEND_INVALID_DEST; + + if (task != current_task()) + goto done; + + rv = mach_port_insert_member(task->itk_space, args->name, args->pset); + +done: + if (task) + task_deallocate(task); + return (rv); +} + + +int +_kernelrpc_mach_port_extract_member_trap(struct _kernelrpc_mach_port_extract_member_args *args) +{ + task_t task = port_name_to_task(args->target); + int rv = MACH_SEND_INVALID_DEST; + + if (task != current_task()) + goto done; + + rv = mach_port_extract_member(task->itk_space, args->name, args->pset); + +done: + if (task) + task_deallocate(task); + return (rv); +} diff --git a/osfmk/ipc/mach_msg.c b/osfmk/ipc/mach_msg.c index b83ef8191..2b4c67eb2 100644 --- a/osfmk/ipc/mach_msg.c +++ b/osfmk/ipc/mach_msg.c @@ -261,7 +261,7 @@ mach_msg_receive_results(void) { thread_t self = current_thread(); ipc_space_t space = current_space(); - vm_map_t map = current_map(); + vm_map_t map = current_map(); ipc_object_t object = self->ith_object; mach_msg_return_t mr = self->ith_state; @@ -269,10 +269,9 @@ mach_msg_receive_results(void) mach_msg_option_t option = self->ith_option; ipc_kmsg_t kmsg = self->ith_kmsg; mach_port_seqno_t seqno = self->ith_seqno; + mach_msg_trailer_size_t trailer_size; - mach_msg_max_trailer_t *trailer; - - ipc_object_release(object); + io_release(object); if (mr != MACH_MSG_SUCCESS) { @@ -298,67 +297,8 @@ mach_msg_receive_results(void) goto out; } - trailer = (mach_msg_max_trailer_t *) - ((vm_offset_t)kmsg->ikm_header + - round_msg(kmsg->ikm_header->msgh_size)); - if (option & MACH_RCV_TRAILER_MASK) { - trailer->msgh_seqno = seqno; - trailer->msgh_context = - kmsg->ikm_header->msgh_remote_port->ip_context; - trailer->msgh_trailer_size = REQUESTED_TRAILER_SIZE(option); - - if (MACH_RCV_TRAILER_ELEMENTS(option) >= - MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_AV)){ -#if CONFIG_MACF_MACH - if (kmsg->ikm_sender != NULL && - IP_VALID(kmsg->ikm_header->msgh_remote_port) && - mac_port_check_method(kmsg->ikm_sender, - &kmsg->ikm_sender->maclabel, - &kmsg->ikm_header->msgh_remote_port->ip_label, - kmsg->ikm_header->msgh_id) == 0) - trailer->msgh_ad = 1; - else -#endif - trailer->msgh_ad = 0; - } - - /* - * The ipc_kmsg_t holds a reference to the label of a label - * handle, not the port. We must get a reference to the port - * and a send right to copyout to the receiver. 
- */ - - if (option & MACH_RCV_TRAILER_ELEMENTS (MACH_RCV_TRAILER_LABELS)) { -#if CONFIG_MACF_MACH - if (kmsg->ikm_sender != NULL) { - ipc_labelh_t lh = kmsg->ikm_sender->label; - kern_return_t kr; - - ip_lock(lh->lh_port); - lh->lh_port->ip_mscount++; - lh->lh_port->ip_srights++; - ip_reference(lh->lh_port); - ip_unlock(lh->lh_port); - - kr = ipc_object_copyout(space, (ipc_object_t)lh->lh_port, - MACH_MSG_TYPE_PORT_SEND, 0, - &trailer->msgh_labels.sender); - if (kr != KERN_SUCCESS) { - ip_lock(lh->lh_port); - ip_release(lh->lh_port); - ip_check_unlock(lh->lh_port); - - trailer->msgh_labels.sender = 0; - } - } else { - trailer->msgh_labels.sender = 0; - } -#else - trailer->msgh_labels.sender = 0; -#endif - } - } - + trailer_size = ipc_kmsg_add_trailer(kmsg, space, option, self, seqno, FALSE, + kmsg->ikm_header->msgh_remote_port->ip_context); /* * If MACH_RCV_OVERWRITE was specified, try to get the scatter * list and verify it against the contents of the message. If @@ -379,7 +319,7 @@ mach_msg_receive_results(void) if (mr != MACH_MSG_SUCCESS) { if ((mr &~ MACH_MSG_MASK) == MACH_RCV_BODY_ERROR) { if (ipc_kmsg_put(msg_addr, kmsg, kmsg->ikm_header->msgh_size + - trailer->msgh_trailer_size) == MACH_RCV_INVALID_DATA) + trailer_size) == MACH_RCV_INVALID_DATA) mr = MACH_RCV_INVALID_DATA; } else { @@ -392,7 +332,7 @@ mach_msg_receive_results(void) mr = ipc_kmsg_put(msg_addr, kmsg, kmsg->ikm_header->msgh_size + - trailer->msgh_trailer_size); + trailer_size); out: return mr; } @@ -577,8 +517,9 @@ msg_receive_error( mach_port_seqno_t seqno, ipc_space_t space) { - mach_msg_max_trailer_t *trailer; mach_vm_address_t context; + mach_msg_trailer_size_t trailer_size; + mach_msg_max_trailer_t *trailer; context = kmsg->ikm_header->msgh_remote_port->ip_context; @@ -598,17 +539,16 @@ msg_receive_error( bcopy( (char *)&trailer_template, (char *)trailer, sizeof(trailer_template)); - if (option & MACH_RCV_TRAILER_MASK) { - trailer->msgh_context = context; - trailer->msgh_seqno = seqno; - trailer->msgh_trailer_size = REQUESTED_TRAILER_SIZE(option); - } + + trailer_size = ipc_kmsg_add_trailer(kmsg, space, + option, current_thread(), seqno, + TRUE, context); /* * Copy the message to user space */ if (ipc_kmsg_put(msg_addr, kmsg, kmsg->ikm_header->msgh_size + - trailer->msgh_trailer_size) == MACH_RCV_INVALID_DATA) + trailer_size) == MACH_RCV_INVALID_DATA) return(MACH_RCV_INVALID_DATA); else return(MACH_MSG_SUCCESS); diff --git a/osfmk/ipc/mach_port.c b/osfmk/ipc/mach_port.c index adfc70bcb..18c764892 100644 --- a/osfmk/ipc/mach_port.c +++ b/osfmk/ipc/mach_port.c @@ -221,7 +221,6 @@ mach_port_names( mach_port_type_t **typesp, mach_msg_type_number_t *typesCnt) { - ipc_tree_entry_t tentry; ipc_entry_t table; ipc_entry_num_t tsize; mach_port_index_t index; @@ -250,7 +249,7 @@ mach_port_names( vm_size_t size_needed; is_read_lock(space); - if (!space->is_active) { + if (!is_active(space)) { is_read_unlock(space); if (size != 0) { kmem_free(ipc_kernel_map, addr1, size); @@ -260,8 +259,7 @@ mach_port_names( } /* upper bound on number of names in the space */ - - bound = space->is_table_size + space->is_tree_total; + bound = space->is_table_size; size_needed = round_page(bound * sizeof(mach_port_name_t)); if (size_needed <= size) @@ -330,17 +328,6 @@ mach_port_names( } } - for (tentry = ipc_splay_traverse_start(&space->is_tree); - tentry != ITE_NULL; - tentry = ipc_splay_traverse_next(&space->is_tree, FALSE)) { - ipc_entry_t entry = &tentry->ite_entry; - mach_port_name_t name = tentry->ite_name; - - 
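
/*
 * Context for the deletions in this hunk: this release removes the IPC
 * splay tree that used to hold port names beyond the linear entry table,
 * so every right now lives in is_table and the name-count bound shrinks
 * from is_table_size + is_tree_total to is_table_size alone. A minimal
 * sketch of the resulting single-pass enumeration follows; visit() is a
 * hypothetical callback standing in for mach_port_names_helper(), while
 * the fields and macros are the real ones used in this hunk.
 */
extern void visit(mach_port_name_t name, ipc_entry_t entry);

static void
enumerate_space_sketch(ipc_space_t space)
{
	mach_port_index_t index;

	for (index = 0; index < space->is_table_size; index++) {
		ipc_entry_t entry = &space->is_table[index];

		if (IE_BITS_TYPE(entry->ie_bits) != MACH_PORT_TYPE_NONE) {
			/* rebuild the user-visible name from index + generation */
			mach_port_name_t name =
			    MACH_PORT_MAKE(index, IE_BITS_GEN(entry->ie_bits));
			visit(name, entry);
		}
	}
}
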
assert(IE_BITS_TYPE(tentry->ite_bits) != MACH_PORT_TYPE_NONE);
- mach_port_names_helper(timestamp, entry, name, names,
- types, &actual);
- }
- ipc_splay_traverse_finish(&space->is_tree);
 is_read_unlock(space);
 if (actual == 0) {
@@ -441,17 +428,16 @@ mach_port_type(
 kr = ipc_right_lookup_write(space, name, &entry);
 if (kr != KERN_SUCCESS)
 return kr;
- /* space is write-locked and active */
+ /* space is write-locked and active */
 kr = ipc_right_info(space, name, entry, typep, &urefs);
- if (kr == KERN_SUCCESS)
- is_write_unlock(space);
+ /* space is unlocked */
+
#if 1
 /* JMM - workaround rdar://problem/9121297 (CF being too picky on these bits). */
 *typep &= ~(MACH_PORT_TYPE_SPREQUEST | MACH_PORT_TYPE_SPREQUEST_DELAYED);
#endif
- /* space is unlocked */
 return kr;
}
@@ -470,26 +456,21 @@ mach_port_type(
 * KERN_INVALID_VALUE The nname isn't a legal name.
 * KERN_NAME_EXISTS The nname already denotes a right.
 * KERN_RESOURCE_SHORTAGE Couldn't allocate memory.
+ *
+ * This interface is obsolete and always returns
+ * KERN_NOT_SUPPORTED.
 */
kern_return_t
mach_port_rename(
- ipc_space_t space,
- mach_port_name_t oname,
- mach_port_name_t nname)
+ __unused ipc_space_t space,
+ __unused mach_port_name_t oname,
+ __unused mach_port_name_t nname)
{
- if (space == IS_NULL)
- return KERN_INVALID_TASK;
-
- if (!MACH_PORT_VALID(oname))
- return KERN_INVALID_NAME;
-
- if (!MACH_PORT_VALID(nname))
- return KERN_INVALID_VALUE;
-
- return ipc_object_rename(space, oname, nname);
+ return KERN_NOT_SUPPORTED;
}
+
/*
 * Routine: mach_port_allocate_name [kernel call]
 * Purpose:
@@ -650,8 +631,6 @@ mach_port_allocate_full(
 if (qosp->name) {
 if (!MACH_PORT_VALID (*namep))
 return (KERN_INVALID_VALUE);
- if (is_fast_space (space))
- return (KERN_FAILURE);
 }
 if (qosp->prealloc) {
@@ -750,8 +729,7 @@ mach_port_destroy(
 return kr;
 /* space is write-locked and active */
- kr = ipc_right_destroy(space, name, entry);
- is_write_unlock(space);
+ kr = ipc_right_destroy(space, name, entry); /* unlocks space */
 return kr;
}
@@ -843,12 +821,13 @@ mach_port_get_refs(
 kr = ipc_right_lookup_write(space, name, &entry);
 if (kr != KERN_SUCCESS)
 return kr;
+ /* space is write-locked and active */
+ kr = ipc_right_info(space, name, entry, &type, &urefs);
+ /* space is unlocked */
- kr = ipc_right_info(space, name, entry, &type, &urefs); /* unlocks */
 if (kr != KERN_SUCCESS)
- return kr; /* space is unlocked */
- is_write_unlock(space);
+ return kr;
 if (type & MACH_PORT_TYPE(right))
 switch (right) {
@@ -1027,7 +1006,7 @@ kern_return_t
mach_port_get_context(
 ipc_space_t space,
 mach_port_name_t name,
- mach_vm_address_t *context)
+ mach_vm_address_t *context)
{
 ipc_port_t port;
 kern_return_t kr;
@@ -1068,7 +1047,7 @@ kern_return_t
mach_port_set_context(
 ipc_space_t space,
 mach_port_name_t name,
- mach_vm_address_t context)
+ mach_vm_address_t context)
{
 ipc_port_t port;
 kern_return_t kr;
@@ -1093,6 +1072,9 @@ mach_port_set_context(
 /*
 * Routine: mach_port_gst_helper
+ * Conditions:
+ * portspace is locked for both the receive right and pset
+ * under observation.
 * Purpose:
 * A helper function for mach_port_get_set_status.
 */
@@ -1108,15 +1090,14 @@ mach_port_gst_helper(
 mach_port_name_t name;
 assert(port != IP_NULL);
-
- ip_lock(port);
+ /*
+ * The space lock is held by the calling function,
+ * hence it is OK to read name without the port lock.
+ */ assert(ip_active(port)); - name = port->ip_receiver_name; assert(name != MACH_PORT_NULL); - ip_unlock(port); - if (ipc_pset_member(pset, port)) { ipc_entry_num_t actual = *actualp; @@ -1167,7 +1148,6 @@ mach_port_get_set_status( size = PAGE_SIZE; /* initial guess */ for (;;) { - ipc_tree_entry_t tentry; ipc_entry_t entry, table; ipc_entry_num_t tsize; mach_port_index_t index; @@ -1220,21 +1200,6 @@ mach_port_get_set_status( } } - for (tentry = ipc_splay_traverse_start(&space->is_tree); - tentry != ITE_NULL; - tentry = ipc_splay_traverse_next(&space->is_tree,FALSE)) { - ipc_entry_bits_t bits = tentry->ite_bits; - - assert(IE_BITS_TYPE(bits) != MACH_PORT_TYPE_NONE); - - if (bits & MACH_PORT_TYPE_RECEIVE) { - ipc_port_t port = (ipc_port_t) tentry->ite_object; - - mach_port_gst_helper(pset, port, maxnames, - names, &actual); - } - } - ipc_splay_traverse_finish(&space->is_tree); is_read_unlock(space); if (actual <= maxnames) @@ -1311,6 +1276,9 @@ mach_port_move_member( ipc_port_t port; ipc_pset_t nset; kern_return_t kr; + wait_queue_link_t wql; + queue_head_t links_data; + queue_t links = &links_data; if (space == IS_NULL) return KERN_INVALID_TASK; @@ -1320,15 +1288,22 @@ mach_port_move_member( if (after == MACH_PORT_DEAD) return KERN_INVALID_RIGHT; + else if (after == MACH_PORT_NULL) + wql = WAIT_QUEUE_LINK_NULL; + else + wql = wait_queue_link_allocate(); + + queue_init(links); kr = ipc_right_lookup_read(space, member, &entry); if (kr != KERN_SUCCESS) - return kr; + goto done; /* space is read-locked and active */ if ((entry->ie_bits & MACH_PORT_TYPE_RECEIVE) == 0) { is_read_unlock(space); - return KERN_INVALID_RIGHT; + kr = KERN_INVALID_RIGHT; + goto done; } port = (ipc_port_t) entry->ie_object; @@ -1340,27 +1315,38 @@ mach_port_move_member( entry = ipc_entry_lookup(space, after); if (entry == IE_NULL) { is_read_unlock(space); - return KERN_INVALID_NAME; + kr = KERN_INVALID_NAME; + goto done; } if ((entry->ie_bits & MACH_PORT_TYPE_PORT_SET) == 0) { is_read_unlock(space); - return KERN_INVALID_RIGHT; + kr = KERN_INVALID_RIGHT; + goto done; } nset = (ipc_pset_t) entry->ie_object; assert(nset != IPS_NULL); } ip_lock(port); - ipc_pset_remove_from_all(port); + ipc_pset_remove_from_all(port, links); if (nset != IPS_NULL) { ips_lock(nset); - kr = ipc_pset_add(nset, port); + kr = ipc_pset_add(nset, port, wql); ips_unlock(nset); } ip_unlock(port); is_read_unlock(space); + + done: + if (kr != KERN_SUCCESS && wql != WAIT_QUEUE_LINK_NULL) + wait_queue_link_free(wql); + while(!queue_empty(links)) { + wql = (wait_queue_link_t) dequeue(links); + wait_queue_link_free(wql); + } + return kr; } @@ -1811,6 +1797,7 @@ mach_port_insert_member( ipc_object_t obj; ipc_object_t psobj; kern_return_t kr; + wait_queue_link_t wql; if (space == IS_NULL) return KERN_INVALID_TASK; @@ -1818,19 +1805,26 @@ mach_port_insert_member( if (!MACH_PORT_VALID(name) || !MACH_PORT_VALID(psname)) return KERN_INVALID_RIGHT; + wql = wait_queue_link_allocate(); + kr = ipc_object_translate_two(space, name, MACH_PORT_RIGHT_RECEIVE, &obj, psname, MACH_PORT_RIGHT_PORT_SET, &psobj); if (kr != KERN_SUCCESS) - return kr; + goto done; /* obj and psobj are locked (and were locked in that order) */ assert(psobj != IO_NULL); assert(obj != IO_NULL); - kr = ipc_pset_add((ipc_pset_t)psobj, (ipc_port_t)obj); + kr = ipc_pset_add((ipc_pset_t)psobj, (ipc_port_t)obj, wql); io_unlock(psobj); io_unlock(obj); + + done: + if (kr != KERN_SUCCESS) + wait_queue_link_free(wql); + return kr; } @@ -1861,6 +1855,7 @@ mach_port_extract_member( ipc_object_t psobj; 
ipc_object_t obj; kern_return_t kr; + wait_queue_link_t wql = WAIT_QUEUE_LINK_NULL; if (space == IS_NULL) return KERN_INVALID_TASK; @@ -1878,9 +1873,13 @@ mach_port_extract_member( assert(psobj != IO_NULL); assert(obj != IO_NULL); - kr = ipc_pset_remove((ipc_pset_t)psobj, (ipc_port_t)obj); + kr = ipc_pset_remove((ipc_pset_t)psobj, (ipc_port_t)obj, &wql); io_unlock(psobj); io_unlock(obj); + + if (wql != WAIT_QUEUE_LINK_NULL) + wait_queue_link_free(wql); + return kr; } @@ -1898,7 +1897,7 @@ task_set_port_space( is_write_lock(space); - if (!space->is_active) { + if (!is_active(space)) { is_write_unlock(space); return KERN_INVALID_TASK; } @@ -1937,6 +1936,7 @@ mach_get_label( dead = ipc_right_check(space, port, name, entry); if (dead) { is_write_unlock(space); + ip_release(port); return KERN_INVALID_RIGHT; } /* port is now locked */ @@ -1980,6 +1980,7 @@ mach_get_label_text( labelstr_t outlabel) { ipc_entry_t entry; + ipc_port_t port; kern_return_t kr; struct label *l; int dead; @@ -1994,10 +1995,11 @@ mach_get_label_text( if (kr != KERN_SUCCESS) return kr; - dead = ipc_right_check(space, (ipc_port_t) entry->ie_object, name, - entry); + port = (ipc_port_t)entry->ie_object; + dead = ipc_right_check(space, port, name, entry); if (dead) { is_write_unlock(space); + ip_release(port); return KERN_INVALID_RIGHT; } /* object (port) is now locked */ diff --git a/osfmk/kdp/Makefile b/osfmk/kdp/Makefile index 3382d9243..03e7bb526 100644 --- a/osfmk/kdp/Makefile +++ b/osfmk/kdp/Makefile @@ -15,10 +15,13 @@ DATAFILES = \ kdp_callout.h \ kdp_en_debugger.h -EXPORT_MI_LIST = ${DATAFILES} +EXPORT_MI_LIST = ${DATAFILES} kdp_dyld.h -EXPORT_MI_DIR = kdp +INSTALL_KF_MI_LIST = ${DATAFILES} + +INSTALL_KF_MI_LCL_LIST = ${DATAFILES} +EXPORT_MI_DIR = kdp include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/osfmk/kdp/kdp.c b/osfmk/kdp/kdp.c index 7eb3459ac..ec8e73b4e 100644 --- a/osfmk/kdp/kdp.c +++ b/osfmk/kdp/kdp.c @@ -27,6 +27,7 @@ */ #include +#include #include #include #include @@ -46,6 +47,9 @@ #include #include #include +#include + +extern int count_busy_buffers(void); /* must track with declaration in bsd/sys/buf_internal.h */ #define DO_ALIGN 1 /* align all packet data accesses */ @@ -1068,19 +1072,35 @@ kdp_copyin(pmap_t p, uint64_t uaddr, void *dest, size_t size) { static void -kdp_mem_snapshot(struct mem_snapshot *mem_snap) +kdp_mem_and_io_snapshot(struct mem_and_io_snapshot *memio_snap) { - mem_snap->snapshot_magic = STACKSHOT_MEM_SNAPSHOT_MAGIC; - mem_snap->free_pages = vm_page_free_count; - mem_snap->active_pages = vm_page_active_count; - mem_snap->inactive_pages = vm_page_inactive_count; - mem_snap->purgeable_pages = vm_page_purgeable_count; - mem_snap->wired_pages = vm_page_wire_count; - mem_snap->speculative_pages = vm_page_speculative_count; - mem_snap->throttled_pages = vm_page_throttled_count; + unsigned int pages_reclaimed; + unsigned int pages_wanted; + kern_return_t kErr; + + memio_snap->snapshot_magic = STACKSHOT_MEM_AND_IO_SNAPSHOT_MAGIC; + memio_snap->free_pages = vm_page_free_count; + memio_snap->active_pages = vm_page_active_count; + memio_snap->inactive_pages = vm_page_inactive_count; + memio_snap->purgeable_pages = vm_page_purgeable_count; + memio_snap->wired_pages = vm_page_wire_count; + memio_snap->speculative_pages = vm_page_speculative_count; + memio_snap->throttled_pages = vm_page_throttled_count; + memio_snap->busy_buffer_count = count_busy_buffers(); + kErr = mach_vm_pressure_monitor(FALSE, VM_PRESSURE_TIME_WINDOW, &pages_reclaimed, &pages_wanted); + if ( ! 
kErr ) { + memio_snap->pages_wanted = (uint32_t)pages_wanted; + memio_snap->pages_reclaimed = (uint32_t)pages_reclaimed; + memio_snap->pages_wanted_reclaimed_valid = 1; + } else { + memio_snap->pages_wanted = 0; + memio_snap->pages_reclaimed = 0; + memio_snap->pages_wanted_reclaimed_valid = 0; + } } + /* * Method for grabbing timer values safely, in the sense that no infinite loop will occur * Certain flavors of the timer_grab function, which would seem to be the thing to use, @@ -1126,12 +1146,12 @@ kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_fl boolean_t save_loadinfo_p = ((trace_flags & STACKSHOT_SAVE_LOADINFO) != 0); if(trace_flags & STACKSHOT_GET_GLOBAL_MEM_STATS) { - if(tracepos + sizeof(struct mem_snapshot) > tracebound) { + if(tracepos + sizeof(struct mem_and_io_snapshot) > tracebound) { error = -1; goto error_exit; } - kdp_mem_snapshot((struct mem_snapshot *)tracepos); - tracepos += sizeof(struct mem_snapshot); + kdp_mem_and_io_snapshot((struct mem_and_io_snapshot *)tracepos); + tracepos += sizeof(struct mem_and_io_snapshot); } walk_list: @@ -1165,14 +1185,14 @@ walk_list: if (have_pmap && task->active && save_loadinfo_p && task_pid > 0) { // Read the dyld_all_image_infos struct from the task memory to get UUID array count and location if (task64) { - struct dyld_all_image_infos64 task_image_infos; - if (kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct dyld_all_image_infos64))) { + struct user64_dyld_all_image_infos task_image_infos; + if (kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct user64_dyld_all_image_infos))) { uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount; uuid_info_addr = task_image_infos.uuidArray; } } else { - struct dyld_all_image_infos task_image_infos; - if (kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct dyld_all_image_infos))) { + struct user32_dyld_all_image_infos task_image_infos; + if (kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct user32_dyld_all_image_infos))) { uuid_info_count = task_image_infos.uuidArrayCount; uuid_info_addr = task_image_infos.uuidArray; } @@ -1205,6 +1225,8 @@ walk_list: task_snap->ss_flags |= kUser64_p; if (!task->active) task_snap->ss_flags |= kTerminatedSnapshot; + if(task->pidsuspended) task_snap->ss_flags |= kPidSuspended; + if(task->frozen) task_snap->ss_flags |= kFrozen; task_snap->suspend_count = task->suspend_count; task_snap->task_size = have_pmap ? pmap_resident_count(task->map->pmap) : 0; @@ -1217,7 +1239,7 @@ walk_list: tracepos += sizeof(struct task_snapshot); if (task_pid > 0 && uuid_info_count > 0) { - uint32_t uuid_info_size = (uint32_t)(task64 ? sizeof(struct dyld_uuid_info64) : sizeof(struct dyld_uuid_info)); + uint32_t uuid_info_size = (uint32_t)(task64 ? 
sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info));
 uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size;
 if (tracepos + uuid_info_array_size > tracebound) {
@@ -1234,6 +1256,8 @@ walk_list:
 }
 queue_iterate(&task->threads, thread, thread_t, task_threads){
+ uint64_t tval;
+
 if ((thread == NULL) || (ml_nofault_copy((vm_offset_t) thread, (vm_offset_t) &cthread, sizeof(struct thread)) != sizeof(struct thread)))
 goto error_exit;
@@ -1245,10 +1269,19 @@ walk_list:
 tsnap = (thread_snapshot_t) tracepos;
 tsnap->thread_id = thread_tid(thread);
 tsnap->state = thread->state;
- tsnap->wait_event = thread->wait_event;
- tsnap->continuation = (uint64_t) (uintptr_t) thread->continuation;
- tsnap->user_time = safe_grab_timer_value(&thread->user_timer);
- tsnap->system_time = safe_grab_timer_value(&thread->system_timer);
+ tsnap->sched_pri = thread->sched_pri;
+ tsnap->sched_flags = thread->sched_flags;
+ tsnap->wait_event = VM_KERNEL_UNSLIDE(thread->wait_event);
+ tsnap->continuation = VM_KERNEL_UNSLIDE(thread->continuation);
+ tval = safe_grab_timer_value(&thread->user_timer);
+ tsnap->user_time = tval;
+ tval = safe_grab_timer_value(&thread->system_timer);
+ if (thread->precise_user_kernel_time) {
+ tsnap->system_time = tval;
+ } else {
+ tsnap->user_time += tval;
+ tsnap->system_time = 0;
+ }
 tsnap->snapshot_magic = STACKSHOT_THREAD_SNAPSHOT_MAGIC;
 tracepos += sizeof(struct thread_snapshot);
 tsnap->ss_flags = 0;
diff --git a/osfmk/kdp/kdp_dyld.h b/osfmk/kdp/kdp_dyld.h
index 910565f2e..18a27c4fb 100644
--- a/osfmk/kdp/kdp_dyld.h
+++ b/osfmk/kdp/kdp_dyld.h
@@ -33,21 +33,33 @@
 /* From dyld/include/dyld_images.h */
-struct dyld_uuid_info {
+struct user32_dyld_uuid_info {
 user32_addr_t imageLoadAddress; /* base address image is mapped into */
 uuid_t imageUUID; /* UUID of image */
};
-struct dyld_uuid_info64 {
+struct user64_dyld_uuid_info {
 user64_addr_t imageLoadAddress; /* base address image is mapped into */
 uuid_t imageUUID; /* UUID of image */
};
+struct user32_dyld_image_info {
+ user32_addr_t imageLoadAddress; /* base address image is mapped into */
+ user32_addr_t imageFilePath; /* path dyld used to load the image */
+ user32_ulong_t imageFileModDate; /* time_t of image file */
+};
+
+struct user64_dyld_image_info {
+ user64_addr_t imageLoadAddress; /* base address image is mapped into */
+ user64_addr_t imageFilePath; /* path dyld used to load the image */
+ user64_ulong_t imageFileModDate; /* time_t of image file */
+};
+
// FIXME: dyld is in C++, and some of the fields in dyld_all_image_infos are C++
// native booleans. There must be a better way...
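
/*
 * These user32_/user64_ mirrors give the kernel fixed-width views of dyld's
 * bookkeeping, so a stackshot can parse a target task's data regardless of
 * the kernel's own pointer size. A minimal sketch of how the kdp.c hunk
 * above selects a mirror by task bitness; the helper name here is
 * hypothetical, while kdp_copyin() and both structs are defined in this
 * patch (kdp_copyin() is assumed to return nonzero on success, matching
 * its use above).
 */
static int
read_uuid_array_location(task_t task, boolean_t task64,
    uint64_t *uuid_addr, uint32_t *uuid_count)
{
	if (task64) {
		struct user64_dyld_all_image_infos infos;

		if (!kdp_copyin(task->map->pmap, task->all_image_info_addr,
		    &infos, sizeof (infos)))
			return 0;
		*uuid_addr = infos.uuidArray;
		*uuid_count = (uint32_t)infos.uuidArrayCount;
	} else {
		struct user32_dyld_all_image_infos infos;

		if (!kdp_copyin(task->map->pmap, task->all_image_info_addr,
		    &infos, sizeof (infos)))
			return 0;
		*uuid_addr = infos.uuidArray;
		*uuid_count = infos.uuidArrayCount;
	}
	return 1;
}
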
typedef uint8_t bool; -struct dyld_all_image_infos { +struct user32_dyld_all_image_infos { uint32_t version; uint32_t infoArrayCount; user32_addr_t infoArray; @@ -63,9 +75,10 @@ struct dyld_all_image_infos { user32_addr_t systemOrderFlag; user32_size_t uuidArrayCount; // dyld defines this as a uintptr_t despite it being a count user32_addr_t uuidArray; + user32_addr_t dyldAllImageInfosAddress; }; -struct dyld_all_image_infos64 { +struct user64_dyld_all_image_infos { uint32_t version; uint32_t infoArrayCount; user64_addr_t infoArray; @@ -81,4 +94,5 @@ struct dyld_all_image_infos64 { user64_addr_t systemOrderFlag; user64_size_t uuidArrayCount; // dyld defines this as a uintptr_t despite it being a count user64_addr_t uuidArray; -}; \ No newline at end of file + user64_addr_t dyldAllImageInfosAddress; +}; diff --git a/osfmk/kdp/kdp_udp.c b/osfmk/kdp/kdp_udp.c index caa07dfa5..4415e10f6 100644 --- a/osfmk/kdp/kdp_udp.c +++ b/osfmk/kdp/kdp_udp.c @@ -35,7 +35,6 @@ * Kernel Debugging Protocol UDP implementation. */ -#include #include #include #include @@ -302,7 +301,6 @@ inline static void kdp_receive_data(void *packet, unsigned int *len, } - void kdp_register_link(kdp_link_t link, kdp_mode_t mode) { kdp_en_linkstatus = link; @@ -322,12 +320,7 @@ kdp_register_send_receive( { unsigned int debug = 0; - debug_log_init(); - - kdp_timer_callout_init(); - PE_parse_boot_argn("debug", &debug, sizeof (debug)); - kdp_crashdump_feature_mask = htonl(kdp_crashdump_feature_mask); if (!debug) @@ -1144,6 +1137,7 @@ kdp_connection_wait(void) } printf("\nWaiting for remote debugger connection.\n"); + kprintf("\nWaiting for remote debugger connection.\n"); if (reattach_wait == 0) { @@ -1153,9 +1147,6 @@ kdp_connection_wait(void) printf("------------ ----\n"); printf("continue.... 'c'\n"); printf("reboot...... 'r'\n"); -#if MACH_KDB - printf("enter kdb... 'k'\n"); -#endif } } else reattach_wait = 0; @@ -1175,15 +1166,7 @@ kdp_connection_wait(void) printf("Rebooting...\n"); kdp_machine_reboot(); break; -#if MACH_KDB - case 'k': - printf("calling kdb...\n"); - if (kdp_call_kdb()) - return; - else - printf("not implemented...\n"); -#endif - default: + default: break; } } @@ -1218,6 +1201,7 @@ kdp_connection_wait(void) if (current_debugger == KDP_CUR_DB) active_debugger=1; printf("Connected to remote debugger.\n"); + kprintf("Connected to remote debugger.\n"); } static void @@ -1280,13 +1264,16 @@ kdp_raise_exception( ) { int index; + unsigned int initial_not_in_kdp = not_in_kdp; + not_in_kdp = 0; /* Was a system trace requested ? */ if (kdp_snapshot && (!panic_active()) && (panic_caller == 0)) { stack_snapshot_ret = kdp_stackshot(stack_snapshot_pid, stack_snapshot_buf, stack_snapshot_bufsize, stack_snapshot_flags, stack_snapshot_dispatch_offset, &stack_snapshot_bytes_traced); + not_in_kdp = initial_not_in_kdp; return; } @@ -1396,6 +1383,9 @@ kdp_raise_exception( exit_raise_exception: if (kdp_en_setmode) (*kdp_en_setmode)(FALSE); /* link cleanup */ + + not_in_kdp = initial_not_in_kdp; + enable_preemption(); } @@ -1419,13 +1409,13 @@ create_panic_header(unsigned int request, const char *corename, struct ether_header *eh; struct corehdr *coreh; const char *mode = "octet"; - char modelen = strlen(mode); + char modelen = strlen(mode) + 1; size_t fmask_size = sizeof(KDP_FEATURE_MASK_STRING) + sizeof(kdp_crashdump_feature_mask); pkt.off = sizeof (struct ether_header); pkt.len = (unsigned int)(length + ((request == KDP_WRQ) ? modelen + fmask_size : 0) + - (corename ? 
strlen(corename): 0) + sizeof(struct corehdr)); + (corename ? (strlen(corename) + 1 ): 0) + sizeof(struct corehdr)); #if DO_ALIGN bcopy((char *)&pkt.data[pkt.off], (char *)ui, sizeof(*ui)); @@ -1902,7 +1892,6 @@ kdp_panic_dump(void) } flag_panic_dump_in_progress = TRUE; - not_in_kdp = 0; if (pkt.input) kdp_panic("kdp_panic_dump: unexpected pending input packet"); @@ -2024,7 +2013,6 @@ abort_panic_transfer(void) { flag_panic_dump_in_progress = FALSE; flag_dont_abort_panic_dump = FALSE; - not_in_kdp = 1; panic_block = 0; } @@ -2120,11 +2108,26 @@ kdp_init(void) strlcat(kdp_kernelversion_string, kernel_uuid, sizeof(kdp_kernelversion_string)); } +#if defined(__x86_64__) || defined(__arm__) + debug_log_init(); + + if (vm_kernel_slide) { + char KASLR_stext[19]; + strlcat(kdp_kernelversion_string, "; stext=", sizeof(kdp_kernelversion_string)); + snprintf(KASLR_stext, sizeof(KASLR_stext), "%p", (void *) vm_kernel_stext); + strlcat(kdp_kernelversion_string, KASLR_stext, sizeof(kdp_kernelversion_string)); + } +#endif + if (debug_boot_arg & DB_REBOOT_POST_CORE) kdp_flag |= REBOOT_POST_CORE; #if defined(__x86_64__) kdp_machine_init(); #endif + + kdp_timer_callout_init(); + kdp_crashdump_feature_mask = htonl(kdp_crashdump_feature_mask); + #if CONFIG_SERIAL_KDP char kdpname[80]; struct in_addr ipaddr; diff --git a/osfmk/kdp/ml/i386/kdp_machdep.c b/osfmk/kdp/ml/i386/kdp_machdep.c index 8beb2959e..ca38128a4 100644 --- a/osfmk/kdp/ml/i386/kdp_machdep.c +++ b/osfmk/kdp/ml/i386/kdp_machdep.c @@ -290,6 +290,7 @@ void kdp_machine_reboot(void) { printf("Attempting system restart..."); + kprintf("Attempting system restart..."); /* Call the platform specific restart*/ if (PE_halt_restart) (*PE_halt_restart)(kPERestartCPU); diff --git a/osfmk/kdp/ml/i386/kdp_vm.c b/osfmk/kdp/ml/i386/kdp_vm.c index 5633c73b9..58f614d06 100644 --- a/osfmk/kdp/ml/i386/kdp_vm.c +++ b/osfmk/kdp/ml/i386/kdp_vm.c @@ -30,80 +30,79 @@ #include #include -#include -#include - #include #include +#include #include -#include -#include #include #include -#include -#include - -int kern_dump(void); int kdp_dump_trap(int type, x86_saved_state32_t *regs); -typedef struct { - int flavor; /* the number for this flavor */ - mach_msg_type_number_t count; /* count of ints in this flavor */ -} mythread_state_flavor_t; - -static mythread_state_flavor_t thread_flavor_array [] = { +static const x86_state_hdr_t thread_flavor_array [] = { {x86_THREAD_STATE32, x86_THREAD_STATE32_COUNT} }; -static int kdp_mynum_flavors = 1; -static int MAX_TSTATE_FLAVORS = 1; +size_t +kern_collectth_state_size(void) +{ + unsigned int i; + size_t tstate_size = 0; -typedef struct { - vm_offset_t header; - int hoffset; - mythread_state_flavor_t *flavors; - int tstate_size; -} tir_t; + for (i = 0; i < sizeof(thread_flavor_array)/sizeof(thread_flavor_array[0]); i++) + tstate_size += sizeof(x86_state_hdr_t) + + (thread_flavor_array[i].count * sizeof(int)); -char command_buffer[512]; + return tstate_size; +} -static void -kern_collectth_state(thread_t thread, tir_t *t) +void +kern_collectth_state(thread_t thread, void *buffer, size_t size) { - vm_offset_t header; - int hoffset, i ; - mythread_state_flavor_t *flavors; + size_t hoffset; + unsigned int i; struct thread_command *tc; + /* * Fill in thread command structure. 
*/ - header = t->header; - hoffset = t->hoffset; - flavors = t->flavors; + hoffset = 0; - tc = (struct thread_command *) (header + hoffset); + if (hoffset + sizeof(struct thread_command) > size) + return; + + tc = (struct thread_command *) ((uintptr_t)buffer + hoffset); tc->cmd = LC_THREAD; - tc->cmdsize = sizeof(struct thread_command) + t->tstate_size; + tc->cmdsize = sizeof(struct thread_command) + kern_collectth_state_size(); hoffset += sizeof(struct thread_command); /* * Follow with a struct thread_state_flavor and * the appropriate thread state struct for each * thread state flavor. */ - for (i = 0; i < kdp_mynum_flavors; i++) { - *(mythread_state_flavor_t *)(header+hoffset) = - flavors[i]; - hoffset += sizeof(mythread_state_flavor_t); + for (i = 0; i < sizeof(thread_flavor_array)/sizeof(thread_flavor_array[0]); i++) { + + if (hoffset + sizeof(x86_state_hdr_t) > size) + return; + + *(x86_state_hdr_t *)((uintptr_t)buffer + hoffset) = + thread_flavor_array[i]; + hoffset += sizeof(x86_state_hdr_t); + + + if (hoffset + thread_flavor_array[i].count*sizeof(int) > size) + return; + /* Locate and obtain the non-volatile register context * for this kernel thread. This should ideally be * encapsulated in machine_thread_get_kern_state() * but that routine appears to have been co-opted * by CHUD to obtain pre-interrupt state. */ - if (flavors[i].flavor == x86_THREAD_STATE32) { - x86_thread_state32_t *tstate = (x86_thread_state32_t *) (header + hoffset); + if (thread_flavor_array[i].flavor == x86_THREAD_STATE32) { + x86_thread_state32_t *tstate = (x86_thread_state32_t *) ((uintptr_t)buffer + hoffset); vm_offset_t kstack; + bzero(tstate, x86_THREAD_STATE32_COUNT * sizeof(int)); if ((kstack = thread->kernel_stack) != 0){ struct x86_kernel_state *iks = STACK_IKS(kstack); @@ -113,16 +112,15 @@ kern_collectth_state(thread_t thread, tir_t *t) tstate->edi = iks->k_edi; tstate->esi = iks->k_esi; tstate->eip = iks->k_eip; + } + } else { + void *tstate = (void *)((uintptr_t)buffer + hoffset); + + bzero(tstate, thread_flavor_array[i].count*sizeof(int)); } - } - else if (machine_thread_get_kern_state(thread, - flavors[i].flavor, (thread_state_t) (header+hoffset), - &flavors[i].count) != KERN_SUCCESS) - printf ("Failure in machine_thread_get_kern_state()\n"); - hoffset += flavors[i].count*sizeof(int); - } - t->hoffset = hoffset; + hoffset += thread_flavor_array[i].count*sizeof(int); + } } /* Intended to be called from the kernel trap handler if an unrecoverable fault @@ -146,224 +144,3 @@ kdp_dump_trap( kdp_raise_exception(EXC_BAD_ACCESS, 0, 0, kdp.saved_state); return( 0 ); } - -int -kern_dump(void) -{ - vm_map_t map; - unsigned int thread_count, segment_count; - unsigned int command_size = 0, header_size = 0, tstate_size = 0; - - uint64_t hoffset = 0, foffset = 0, nfoffset = 0, max_header_size; - vm_offset_t header, txstart; - vm_address_t vmoffset; - - struct mach_header *mh; - struct segment_command *sc; - vm_size_t size; - vm_prot_t prot = 0; - vm_prot_t maxprot = 0; - mythread_state_flavor_t flavors[MAX_TSTATE_FLAVORS]; - vm_size_t nflavors; - vm_size_t i; - uint32_t nesting_depth = 0; - kern_return_t kret = 0; - struct vm_region_submap_info_64 vbr; - mach_msg_type_number_t vbrcount = 0; - tir_t tir1; - - int error = 0; - int panic_error = 0; - - map = kernel_map; - - thread_count = 1; - segment_count = get_vmmap_entries(map); - - printf("Kernel map has %d entries\n", segment_count); - - nflavors = kdp_mynum_flavors; - bcopy((char *)thread_flavor_array,(char *) flavors,sizeof(thread_flavor_array)); - - 
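
/*
 * The tir_t scratch structure being deleted here is superseded by a
 * bounds-checked pair: kern_collectth_state_size() reports how many bytes
 * the per-thread payload needs, and kern_collectth_state() fills a
 * caller-supplied buffer, returning early instead of overrunning it. A
 * minimal usage sketch, mirroring what the rewritten kern_dump() in
 * kdp_x86_common.c does with a C99 variable-length array:
 */
static void
collect_current_thread_sketch(void)
{
	size_t tstate_size = sizeof(struct thread_command) +
	    kern_collectth_state_size();
	char tstate[tstate_size];

	/* serializes LC_THREAD + one x86_state_hdr_t + register words */
	kern_collectth_state(current_thread(), tstate, tstate_size);
}
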
for (i = 0; i < nflavors; i++) - tstate_size += sizeof(mythread_state_flavor_t) + - (flavors[i].count * sizeof(int)); - - command_size = (segment_count) * - sizeof(struct segment_command) + - thread_count * sizeof(struct thread_command) + - tstate_size * thread_count; - - header_size = command_size + sizeof(struct mach_header); - header = (vm_offset_t) command_buffer; - - /* - * Set up Mach-O header for currently executing 32 bit kernel. - */ - printf ("Generated Mach-O header size was %d\n", header_size); - - mh = (struct mach_header *) header; - mh->magic = MH_MAGIC; - mh->cputype = cpu_type(); - mh->cpusubtype = cpu_subtype(); - mh->filetype = MH_CORE; - mh->ncmds = segment_count + thread_count; - mh->sizeofcmds = command_size; - mh->flags = 0; - - hoffset = sizeof(struct mach_header); /* offset into header */ - foffset = round_page_32(header_size); /* offset into file */ - /* Padding */ - if ((foffset - header_size) < (4*sizeof(struct segment_command))) { - foffset += ((4*sizeof(struct segment_command)) - (foffset-header_size)); - } - - max_header_size = foffset; - - vmoffset = VM_MIN_KERNEL_ADDRESS; /* offset into VM */ - - /* Transmit the Mach-O MH_CORE header, and seek forward past the - * area reserved for the segment and thread commands - * to begin data transmission - */ - - if ((panic_error = kdp_send_crashdump_pkt (KDP_SEEK, NULL, sizeof(nfoffset) , &nfoffset)) < 0) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - - if ((panic_error = kdp_send_crashdump_data (KDP_DATA, NULL, sizeof(struct mach_header), (caddr_t) mh) < 0)) { - printf ("kdp_send_crashdump_data failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - - if ((panic_error = kdp_send_crashdump_pkt (KDP_SEEK, NULL, sizeof(foffset) , &foffset) < 0)) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - printf ("Transmitting kernel state:\n"); - - while ((segment_count > 0) || (kret == KERN_SUCCESS)) { - while (1) { - - /* - * Get region information for next region. - */ - - vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64; - if((kret = vm_region_recurse_64(map, - &vmoffset, &size, &nesting_depth, - (vm_region_recurse_info_t)&vbr, - &vbrcount)) != KERN_SUCCESS) { - break; - } - - if(vbr.is_submap) { - nesting_depth++; - continue; - } else { - break; - } - } - - if(kret != KERN_SUCCESS) - break; - - prot = vbr.protection; - maxprot = vbr.max_protection; - /* - * Fill in segment command structure. - */ - - if (hoffset > max_header_size) - break; - sc = (struct segment_command *) (header); - sc->cmd = LC_SEGMENT; - sc->cmdsize = sizeof(struct segment_command); - sc->segname[0] = 0; - sc->vmaddr = vmoffset; - sc->vmsize = size; - sc->fileoff = (uint32_t) foffset; - sc->filesize = size; - sc->maxprot = maxprot; - sc->initprot = prot; - sc->nsects = 0; - - if ((panic_error = kdp_send_crashdump_pkt (KDP_SEEK, NULL, sizeof(hoffset) , &hoffset)) < 0) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - - if ((panic_error = kdp_send_crashdump_data (KDP_DATA, NULL, sizeof(struct segment_command) , (caddr_t) sc)) < 0) { - printf ("kdp_send_crashdump_data failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - - /* Do not transmit memory tagged VM_MEMORY_IOKIT - instead, - * seek past that region on the server - this creates a - * hole in the file. 
- */ - - if ((vbr.user_tag != VM_MEMORY_IOKIT)) { - - if ((panic_error = kdp_send_crashdump_pkt (KDP_SEEK, NULL, sizeof(foffset) , &foffset)) < 0) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - - txstart = vmoffset; - - if ((panic_error = kdp_send_crashdump_data (KDP_DATA, NULL, size, (caddr_t) txstart)) < 0) { - printf ("kdp_send_crashdump_data failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - } - - hoffset += sizeof(struct segment_command); - foffset += size; - vmoffset += size; - segment_count--; - } - tir1.header = header; - tir1.hoffset = 0; - tir1.flavors = flavors; - tir1.tstate_size = tstate_size; - - /* Now send out the LC_THREAD load command, with the thread information - * for the current activation. - * Note that the corefile can contain LC_SEGMENT commands with file - * offsets that point past the edge of the corefile, in the event that - * the last N VM regions were all I/O mapped or otherwise - * non-transferable memory, not followed by a normal VM region; - * i.e. there will be no hole that reaches to the end of the core file. - */ - kern_collectth_state (current_thread(), &tir1); - - if ((panic_error = kdp_send_crashdump_pkt (KDP_SEEK, NULL, sizeof(hoffset) , &hoffset)) < 0) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - - if ((panic_error = kdp_send_crashdump_data (KDP_DATA, NULL, tir1.hoffset , (caddr_t) header)) < 0) { - printf ("kdp_send_crashdump_data failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - - /* last packet */ - if ((panic_error = kdp_send_crashdump_pkt (KDP_EOF, NULL, 0, ((void *) 0))) < 0) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - error = panic_error; - goto out; - } -out: - return (error); -} diff --git a/osfmk/kdp/ml/i386/kdp_x86_common.c b/osfmk/kdp/ml/i386/kdp_x86_common.c index 6016e4835..b1c4669ba 100644 --- a/osfmk/kdp/ml/i386/kdp_x86_common.c +++ b/osfmk/kdp/ml/i386/kdp_x86_common.c @@ -33,6 +33,7 @@ #include #include +#include /* pmap_pde */ #include #include #include @@ -41,12 +42,15 @@ #include #include +#include +#include #include #include #include #include +#include // #define KDP_VM_READ_DEBUG 1 // #define KDP_VM_WRITE_DEBUG 1 @@ -56,6 +60,13 @@ boolean_t kdp_trans_off; static addr64_t kdp_vtophys(pmap_t pmap, addr64_t va); +int kern_dump_pmap_traverse_preflight_callback(vm_map_offset_t start, + vm_map_offset_t end, + void *context); +int kern_dump_pmap_traverse_send_callback(vm_map_offset_t start, + vm_map_offset_t end, + void *context); + pmap_t kdp_pmap = 0; static addr64_t @@ -377,6 +388,290 @@ kdp_machine_msr64_write(kdp_writemsr64_req_t *rq, caddr_t data, uint16_t lcpu) return KDPERR_NO_ERROR; } +int +pmap_traverse_present_mappings(pmap_t pmap, + vm_map_offset_t start, + vm_map_offset_t end, + pmap_traverse_callback callback, + void *context) +{ + int ret = KERN_SUCCESS; + vm_map_offset_t vcurstart, vcur; + boolean_t lastvavalid = FALSE; + + /* Assumes pmap is locked, or being called from the kernel debugger */ + + if (start > end) { + return (KERN_INVALID_ARGUMENT); + } + + if (start & PAGE_MASK_64) { + return (KERN_INVALID_ARGUMENT); + } + + for (vcur = vcurstart = start; (ret == KERN_SUCCESS) && (vcur < end); ) { + ppnum_t ppn = pmap_find_phys(pmap, vcur); + + if (ppn != 0 && !pmap_valid_page(ppn)) { + /* not something we want */ + ppn = 0; + } + + if (ppn != 0) { + if (!lastvavalid) { + /* Start of a new 
virtual region */ + vcurstart = vcur; + lastvavalid = TRUE; + } + } else { + if (lastvavalid) { + /* end of a virtual region */ + + ret = callback(vcurstart, vcur, context); + + lastvavalid = FALSE; + } + + /* Try to skip by 2MB if possible */ + if (((vcur & PDMASK) == 0) && cpu_64bit) { + pd_entry_t *pde; + + pde = pmap_pde(pmap, vcur); + if (0 == pde || ((*pde & INTEL_PTE_VALID) == 0)) { + /* Make sure we wouldn't overflow */ + if (vcur < (end - NBPD)) { + vcur += NBPD; + continue; + } + } + } + } + + vcur += PAGE_SIZE_64; + } + + if ((ret == KERN_SUCCESS) + && lastvavalid) { + /* send previous run */ + + ret = callback(vcurstart, vcur, context); + } + return (ret); +} + +struct kern_dump_preflight_context { + uint32_t region_count; + uint64_t dumpable_bytes; +}; + +struct kern_dump_send_context { + uint64_t hoffset; + uint64_t foffset; + uint64_t header_size; +}; + +int +kern_dump_pmap_traverse_preflight_callback(vm_map_offset_t start, + vm_map_offset_t end, + void *context) +{ + struct kern_dump_preflight_context *kdc = (struct kern_dump_preflight_context *)context; + int ret = KERN_SUCCESS; + + kdc->region_count++; + kdc->dumpable_bytes += (end - start); + + return (ret); +} + +int +kern_dump_pmap_traverse_send_callback(vm_map_offset_t start, + vm_map_offset_t end, + void *context) +{ + struct kern_dump_send_context *kdc = (struct kern_dump_send_context *)context; + int ret = KERN_SUCCESS; + kernel_segment_command_t sc; + vm_size_t size = (vm_size_t)(end - start); + + if (kdc->hoffset + sizeof(sc) > kdc->header_size) { + return (KERN_NO_SPACE); + } + + /* + * Fill in segment command structure. + */ + + sc.cmd = LC_SEGMENT_KERNEL; + sc.cmdsize = sizeof(kernel_segment_command_t); + sc.segname[0] = 0; + sc.vmaddr = (vm_address_t)start; + sc.vmsize = size; + sc.fileoff = (vm_address_t)kdc->foffset; + sc.filesize = size; + sc.maxprot = VM_PROT_READ; + sc.initprot = VM_PROT_READ; + sc.nsects = 0; + sc.flags = 0; + + if ((ret = kdp_send_crashdump_pkt (KDP_SEEK, NULL, sizeof(kdc->hoffset) , &kdc->hoffset)) < 0) { + printf ("kdp_send_crashdump_pkt failed with error %d\n", ret); + goto out; + } + + if ((ret = kdp_send_crashdump_data (KDP_DATA, NULL, sizeof(kernel_segment_command_t) , (caddr_t) &sc)) < 0) { + printf ("kdp_send_crashdump_data failed with error %d\n", ret); + goto out; + } + + kdc->hoffset += sizeof(kernel_segment_command_t); + + if ((ret = kdp_send_crashdump_pkt (KDP_SEEK, NULL, sizeof(kdc->foffset) , &kdc->foffset)) < 0) { + printf ("kdp_send_crashdump_pkt failed with error %d\n", ret); + goto out; + } + + if ((ret = kdp_send_crashdump_data (KDP_DATA, NULL, (unsigned int)size, (caddr_t)(uintptr_t)start)) < 0) { + printf ("kdp_send_crashdump_data failed with error %d\n", ret); + goto out; + } + + kdc->foffset += size; + +out: + return (ret); +} + +int +kern_dump(void) +{ + int ret; + struct kern_dump_preflight_context kdc_preflight; + struct kern_dump_send_context kdc_send; + uint32_t segment_count; + size_t command_size = 0, header_size = 0, tstate_size = 0; + uint64_t hoffset = 0, foffset = 0; + kernel_mach_header_t mh; + + + kdc_preflight.region_count = 0; + kdc_preflight.dumpable_bytes = 0; + + ret = pmap_traverse_present_mappings(kernel_pmap, + VM_MIN_KERNEL_AND_KEXT_ADDRESS, + VM_MAX_KERNEL_ADDRESS, + kern_dump_pmap_traverse_preflight_callback, + &kdc_preflight); + if (ret) { + printf("pmap traversal failed: %d\n", ret); + return (ret); + } + + printf("Kernel dump region count: %u\n", kdc_preflight.region_count); + printf("Kernel dump byte count: %llu\n", 
kdc_preflight.dumpable_bytes);
+
+ segment_count = kdc_preflight.region_count;
+
+ tstate_size = sizeof(struct thread_command) + kern_collectth_state_size();
+
+ command_size = segment_count * sizeof(kernel_segment_command_t) +
+ tstate_size;
+
+ header_size = command_size + sizeof(kernel_mach_header_t);
+
+ /*
+ * Set up Mach-O header for currently executing kernel.
+ */
+ printf ("Generated Mach-O header size was %lu\n", header_size);
+
+ mh.magic = _mh_execute_header.magic;
+ mh.cputype = _mh_execute_header.cputype;
+ mh.cpusubtype = _mh_execute_header.cpusubtype;
+ mh.filetype = MH_CORE;
+ mh.ncmds = segment_count + 1 /* thread */;
+ mh.sizeofcmds = (uint32_t)command_size;
+ mh.flags = 0;
+#if defined(__LP64__)
+ mh.reserved = 0;
+#endif
+
+ hoffset = 0; /* offset into header */
+ foffset = (uint32_t)round_page(header_size); /* offset into file */
+
+ /* Transmit the Mach-O MH_CORE header, and seek forward past the
+ * area reserved for the segment and thread commands
+ * to begin data transmission
+ */
+ if ((ret = kdp_send_crashdump_pkt (KDP_SEEK, NULL, sizeof(hoffset) , &hoffset)) < 0) {
+ printf ("kdp_send_crashdump_pkt failed with error %d\n", ret);
+ goto out;
+ }
+ if ((ret = kdp_send_crashdump_data (KDP_DATA, NULL, sizeof(kernel_mach_header_t), (caddr_t) &mh) < 0)) {
+ printf ("kdp_send_crashdump_data failed with error %d\n", ret);
+ goto out;
+ }
+
+ hoffset += sizeof(kernel_mach_header_t);
+
+ if ((ret = kdp_send_crashdump_pkt (KDP_SEEK, NULL, sizeof(foffset) , &foffset) < 0)) {
+ printf ("kdp_send_crashdump_pkt failed with error %d\n", ret);
+ goto out;
+ }
+
+ printf ("Transmitting kernel state, please wait: ");
+
+ kdc_send.hoffset = hoffset;
+ kdc_send.foffset = foffset;
+ kdc_send.header_size = header_size;
+
+ ret = pmap_traverse_present_mappings(kernel_pmap,
+ VM_MIN_KERNEL_AND_KEXT_ADDRESS,
+ VM_MAX_KERNEL_ADDRESS,
+ kern_dump_pmap_traverse_send_callback,
+ &kdc_send);
+ if (ret) {
+ kprintf("pmap traversal failed: %d\n", ret);
+ return (ret);
+ }
+
+ /* Reload mutated offsets */
+ hoffset = kdc_send.hoffset;
+ foffset = kdc_send.foffset;
+
+ /*
+ * Now send out the LC_THREAD load command, with the thread information
+ * for the current activation.
+ */ + if (tstate_size > 0) { + char tstate[tstate_size]; + + kern_collectth_state (current_thread(), tstate, tstate_size); + + if ((ret = kdp_send_crashdump_pkt (KDP_SEEK, NULL, sizeof(hoffset), &hoffset)) < 0) { + printf ("kdp_send_crashdump_pkt failed with error %d\n", ret); + goto out; + } + + if ((ret = kdp_send_crashdump_data (KDP_DATA, NULL, tstate_size, tstate)) < 0) { + printf ("kdp_send_crashdump_data failed with error %d\n", ret); + goto out; + } + + hoffset += tstate_size; + } + + /* last packet */ + if ((ret = kdp_send_crashdump_pkt (KDP_EOF, NULL, 0, ((void *) 0))) < 0) + { + printf ("kdp_send_crashdump_pkt failed with error %d\n", ret); + goto out; + } + +out: + return (ret); +} + + pt_entry_t *debugger_ptep; vm_map_offset_t debugger_window_kva; @@ -405,7 +700,8 @@ kdp_machine_init(void) { debugger_ptep = pmap_pte(kernel_pmap, debugger_window_kva); if (debugger_ptep == NULL) { - pmap_expand(kernel_pmap, debugger_window_kva); + pmap_expand(kernel_pmap, debugger_window_kva, PMAP_EXPAND_OPTIONS_NONE); debugger_ptep = pmap_pte(kernel_pmap, debugger_window_kva); } } + diff --git a/osfmk/kern/kern_print.h b/osfmk/kdp/ml/i386/kdp_x86_common.h similarity index 57% rename from osfmk/kern/kern_print.h rename to osfmk/kdp/ml/i386/kdp_x86_common.h index 836211062..8c1a7cee3 100644 --- a/osfmk/kern/kern_print.h +++ b/osfmk/kdp/ml/i386/kdp_x86_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,39 +25,33 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:34 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:25:55 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.5.1 1995/01/06 19:47:13 devrcs - * mk6 CR668 - 1.3b26 merge - * new file for mk6 - * [1994/10/12 22:19:25 dwm] - * - * Revision 1.1.2.1 1993/11/22 20:14:46 jeffc - * Modularized declarations of ddb print functions. - * [1993/11/22 19:03:03 jeffc] - * - * $EndLog$ - */ -#ifndef KERN_PRINT_H_ -#define KERN_PRINT_H_ +#ifndef _KDP_X86_COMMON_H_ +#define _KDP_X86_COMMON_H_ -#include +#include +#include +#include + +/* + * Attempt to discover all virtually contiguous ranges in a pmap + * that have valid mappings to DRAM (not MMIO device memory for example). + * Results are returned via a callback. If the callback returns an error, + * traversal is aborted. 
+ */ +typedef int (*pmap_traverse_callback)(vm_map_offset_t start, + vm_map_offset_t end, + void *context); -extern void db_show_all_slocks(void); +extern int pmap_traverse_present_mappings(pmap_t pmap, + vm_map_offset_t start, + vm_map_offset_t end, + pmap_traverse_callback callback, + void *context); -extern void db_show_one_zone(db_expr_t, boolean_t, db_expr_t, char *); -extern void db_show_all_zones(db_expr_t, boolean_t, db_expr_t, char *); +extern int kern_dump(void); +extern size_t kern_collectth_state_size(void); +extern void kern_collectth_state(thread_t thread, void *buffer, size_t size); -#endif /* KERN_PRINT_H_ */ +#endif /* _KDP_X86_COMMON_H_ */ diff --git a/osfmk/kdp/ml/x86_64/kdp_machdep.c b/osfmk/kdp/ml/x86_64/kdp_machdep.c index d7e071569..d8587db3d 100644 --- a/osfmk/kdp/ml/x86_64/kdp_machdep.c +++ b/osfmk/kdp/ml/x86_64/kdp_machdep.c @@ -606,6 +606,7 @@ machine_trace_thread64(thread_t thread, char *tracepos, char *tracebound, int nf else { stackptr = STACK_IKS(thread->kernel_stack)->k_rbp; init_rip = STACK_IKS(thread->kernel_stack)->k_rip; + init_rip = VM_KERNEL_UNSLIDE(init_rip); kdp_pmap = 0; } @@ -635,6 +636,9 @@ machine_trace_thread64(thread_t thread, char *tracepos, char *tracebound, int nf if (machine_read64(stackptr + RETURN_OFFSET64, (caddr_t) tracebuf, sizeof(addr64_t)) != sizeof(addr64_t)) { break; } + if (!user_p) + *tracebuf = VM_KERNEL_UNSLIDE(*tracebuf); + tracebuf++; prevsp = stackptr; diff --git a/osfmk/kdp/ml/x86_64/kdp_vm.c b/osfmk/kdp/ml/x86_64/kdp_vm.c index a76167621..2cd1c5cba 100644 --- a/osfmk/kdp/ml/x86_64/kdp_vm.c +++ b/osfmk/kdp/ml/x86_64/kdp_vm.c @@ -30,82 +30,80 @@ #include #include -#include -#include - #include #include +#include #include -#include -#include -#include #include #include -#include -#include - -int kern_dump(void); int kdp_dump_trap(int type, x86_saved_state64_t *regs); -typedef struct { - int flavor; /* the number for this flavor */ - mach_msg_type_number_t count; /* count of ints in this flavor */ -} mythread_state_flavor_t; - -static mythread_state_flavor_t thread_flavor_array [] = { +static const x86_state_hdr_t thread_flavor_array [] = { {x86_THREAD_STATE64, x86_THREAD_STATE64_COUNT} }; -static int kdp_mynum_flavors = 1; -static int MAX_TSTATE_FLAVORS = 1; +size_t +kern_collectth_state_size(void) +{ + unsigned int i; + size_t tstate_size = 0; -typedef struct { - vm_offset_t header; - int hoffset; - mythread_state_flavor_t *flavors; - int tstate_size; -} tir_t; + for (i = 0; i < sizeof(thread_flavor_array)/sizeof(thread_flavor_array[0]); i++) + tstate_size += sizeof(x86_state_hdr_t) + + (thread_flavor_array[i].count * sizeof(int)); -char command_buffer[512]; + return tstate_size; +} -static void -kern_collectth_state(thread_t thread, tir_t *t) +void +kern_collectth_state(thread_t thread, void *buffer, size_t size) { - vm_offset_t header; - int hoffset, i ; - mythread_state_flavor_t *flavors; + size_t hoffset; + unsigned int i; struct thread_command *tc; + /* * Fill in thread command structure. 
*/ - header = t->header; - hoffset = t->hoffset; - flavors = t->flavors; + hoffset = 0; - tc = (struct thread_command *) (header + hoffset); + if (hoffset + sizeof(struct thread_command) > size) + return; + + tc = (struct thread_command *) ((uintptr_t)buffer + hoffset); tc->cmd = LC_THREAD; - tc->cmdsize = (uint32_t)sizeof(struct thread_command) + t->tstate_size; - hoffset += (uint32_t)sizeof(struct thread_command); + tc->cmdsize = (uint32_t)(sizeof(struct thread_command) + kern_collectth_state_size()); + hoffset += sizeof(struct thread_command); /* * Follow with a struct thread_state_flavor and * the appropriate thread state struct for each * thread state flavor. */ - for (i = 0; i < kdp_mynum_flavors; i++) { - *(mythread_state_flavor_t *)(header+hoffset) = - flavors[i]; - hoffset += (uint32_t)sizeof(mythread_state_flavor_t); + for (i = 0; i < sizeof(thread_flavor_array)/sizeof(thread_flavor_array[0]); i++) { + + if (hoffset + sizeof(x86_state_hdr_t) > size) + return; + + *(x86_state_hdr_t *)((uintptr_t)buffer + hoffset) = + thread_flavor_array[i]; + hoffset += sizeof(x86_state_hdr_t); + + + if (hoffset + thread_flavor_array[i].count*sizeof(int) > size) + return; + /* Locate and obtain the non-volatile register context * for this kernel thread. This should ideally be * encapsulated in machine_thread_get_kern_state() * but that routine appears to have been co-opted * by CHUD to obtain pre-interrupt state. */ - if (flavors[i].flavor == x86_THREAD_STATE64) { - x86_thread_state64_t *tstate = (x86_thread_state64_t *) (header + hoffset); + if (thread_flavor_array[i].flavor == x86_THREAD_STATE64) { + x86_thread_state64_t *tstate = (x86_thread_state64_t *) ((uintptr_t)buffer + hoffset); vm_offset_t kstack; x86_saved_state64_t *cpstate = current_cpu_datap()->cpu_fatal_trap_state; + bzero(tstate, x86_THREAD_STATE64_COUNT * sizeof(int)); if ((current_thread() == thread) && (cpstate != NULL)) { tstate->rax = cpstate->rax; @@ -140,15 +138,14 @@ kern_collectth_state(thread_t thread, tir_t *t) tstate->r15 = iks->k_r15; tstate->rip = iks->k_rip; } + } else { + void *tstate = (void *)((uintptr_t)buffer + hoffset); + + bzero(tstate, thread_flavor_array[i].count*sizeof(int)); } - else if (machine_thread_get_kern_state(thread, - flavors[i].flavor, (thread_state_t) (header+hoffset), - &flavors[i].count) != KERN_SUCCESS) - printf ("Failure in machine_thread_get_kern_state()\n"); - hoffset += (uint32_t)(flavors[i].count*sizeof(int)); - } - t->hoffset = hoffset; + hoffset += thread_flavor_array[i].count*sizeof(int); + } } /* Intended to be called from the kernel trap handler if an unrecoverable fault @@ -172,225 +169,3 @@ kdp_dump_trap( kdp_raise_exception(EXC_BAD_ACCESS, 0, 0, kdp.saved_state); return( 0 ); } - -int -kern_dump(void) -{ - vm_map_t map; - unsigned int thread_count, segment_count; - unsigned int command_size = 0, header_size = 0, tstate_size = 0; - uint64_t hoffset = 0, foffset = 0, nfoffset = 0; - unsigned int max_header_size = 0; - vm_offset_t header, txstart; - vm_map_offset_t vmoffset; - struct mach_header_64 *mh64; - struct segment_command_64 *sc64; - mach_vm_size_t size = 0; - vm_prot_t prot = 0; - vm_prot_t maxprot = 0; - mythread_state_flavor_t flavors[MAX_TSTATE_FLAVORS]; - vm_size_t nflavors; - vm_size_t i; - uint32_t nesting_depth = 0; - kern_return_t kret = 0; - struct vm_region_submap_info_64 vbr; - mach_msg_type_number_t vbrcount = 0; - tir_t tir1; - - int error = 0; - int panic_error = 0; - - map = kernel_map; - - thread_count = 1; - segment_count = get_vmmap_entries(map); - - 
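
/*
 * The vm_region_recurse()-driven walk being removed here is replaced (see
 * the kdp_x86_common.c hunk earlier in this patch) by two passes over
 * pmap_traverse_present_mappings(): a preflight pass that only counts
 * contiguous mapped runs and bytes, which lets kern_dump() size the Mach-O
 * header exactly, then a send pass that emits one LC_SEGMENT_KERNEL plus
 * its payload per run. A sketch of the preflight pass, using only names
 * defined in this patch:
 */
static void
preflight_sketch(void)
{
	struct kern_dump_preflight_context kdc_preflight;

	kdc_preflight.region_count = 0;
	kdc_preflight.dumpable_bytes = 0;

	if (pmap_traverse_present_mappings(kernel_pmap,
	    VM_MIN_KERNEL_AND_KEXT_ADDRESS, VM_MAX_KERNEL_ADDRESS,
	    kern_dump_pmap_traverse_preflight_callback,
	    &kdc_preflight) == KERN_SUCCESS) {
		/* region_count now determines ncmds and sizeofcmds */
	}
}
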
printf("Kernel map has %d entries\n", segment_count); - - nflavors = kdp_mynum_flavors; - bcopy((char *)thread_flavor_array,(char *) flavors,sizeof(thread_flavor_array)); - - for (i = 0; i < nflavors; i++) - tstate_size += (uint32_t)(sizeof(mythread_state_flavor_t) + - (flavors[i].count * sizeof(int))); - - command_size = (uint32_t)((segment_count) * - sizeof(struct segment_command_64) + - thread_count * sizeof(struct thread_command) + - tstate_size * thread_count); - - header_size = command_size + (uint32_t)sizeof(struct mach_header_64); - header = (vm_offset_t) command_buffer; - - /* - * Set up Mach-O header for currently executing 32 bit kernel. - */ - printf ("Generated Mach-O header size was %d\n", header_size); - - mh64 = (struct mach_header_64 *) header; - mh64->magic = MH_MAGIC_64; - mh64->cputype = cpu_type(); - mh64->cpusubtype = cpu_subtype(); - mh64->filetype = MH_CORE; - mh64->ncmds = segment_count + thread_count; - mh64->sizeofcmds = command_size; - mh64->flags = 0; - mh64->reserved = 0; - - hoffset = sizeof(struct mach_header_64); /* offset into header */ - foffset = (uint32_t)round_page(header_size); /* offset into file */ - /* Padding */ - if ((foffset - header_size) < (4*sizeof(struct segment_command_64))) { - foffset += (uint32_t)((4*sizeof(struct segment_command_64)) - (foffset-header_size)); - } - - max_header_size = (unsigned int)foffset; - - vmoffset = vm_map_min(map); - - /* Transmit the Mach-O MH_CORE header, and seek forward past the - * area reserved for the segment and thread commands - * to begin data transmission - */ - if ((panic_error = kdp_send_crashdump_pkt (KDP_SEEK, NULL, sizeof(nfoffset) , &nfoffset)) < 0) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - - if ((panic_error = kdp_send_crashdump_data (KDP_DATA, NULL, sizeof(struct mach_header_64), (caddr_t) mh64) < 0)) { - printf ("kdp_send_crashdump_data failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - if ((panic_error = kdp_send_crashdump_pkt (KDP_SEEK, NULL, sizeof(foffset) , &foffset) < 0)) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - printf ("Transmitting kernel state, please wait: "); - - while ((segment_count > 0) || (kret == KERN_SUCCESS)){ - - while (1) { - - /* - * Get region information for next region. - */ - - vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64; - if((kret = mach_vm_region_recurse(map, - &vmoffset, &size, &nesting_depth, - (vm_region_recurse_info_t)&vbr, - &vbrcount)) != KERN_SUCCESS) { - break; - } - - if(vbr.is_submap) { - nesting_depth++; - continue; - } else { - break; - } - } - - if(kret != KERN_SUCCESS) - break; - - prot = vbr.protection; - maxprot = vbr.max_protection; - - /* - * Fill in segment command structure. 
- */ - - if (hoffset > max_header_size) - break; - sc64 = (struct segment_command_64 *) (header); - sc64->cmd = LC_SEGMENT_64; - sc64->cmdsize = sizeof(struct segment_command_64); - sc64->segname[0] = 0; - sc64->vmaddr = vmoffset; - sc64->vmsize = size; - sc64->fileoff = foffset; - sc64->filesize = size; - sc64->maxprot = maxprot; - sc64->initprot = prot; - sc64->nsects = 0; - - if ((panic_error = kdp_send_crashdump_pkt (KDP_SEEK, NULL, sizeof(hoffset) , &hoffset)) < 0) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - - if ((panic_error = kdp_send_crashdump_data (KDP_DATA, NULL, sizeof(struct segment_command_64) , (caddr_t) sc64)) < 0) { - printf ("kdp_send_crashdump_data failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - - /* Do not transmit memory tagged VM_MEMORY_IOKIT - instead, - * seek past that region on the server - this creates a - * hole in the file. - */ - - if ((vbr.user_tag != VM_MEMORY_IOKIT)) { - - if ((panic_error = kdp_send_crashdump_pkt (KDP_SEEK, NULL, sizeof(foffset) , &foffset)) < 0) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - - txstart = vmoffset; - - if ((panic_error = kdp_send_crashdump_data (KDP_DATA, NULL, (unsigned int)size, (caddr_t) txstart)) < 0) { - printf ("kdp_send_crashdump_data failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - } - - hoffset += (unsigned int)sizeof(struct segment_command_64); - foffset += (unsigned int)size; - vmoffset += size; - segment_count--; - } - tir1.header = header; - tir1.hoffset = 0; - tir1.flavors = flavors; - tir1.tstate_size = tstate_size; - - /* Now send out the LC_THREAD load command, with the thread information - * for the current activation. - * Note that the corefile can contain LC_SEGMENT commands with file - * offsets that point past the edge of the corefile, in the event that - * the last N VM regions were all I/O mapped or otherwise - * non-transferable memory, not followed by a normal VM region; - * i.e. there will be no hole that reaches to the end of the core file. 
- */ - kern_collectth_state (current_thread(), &tir1); - - if ((panic_error = kdp_send_crashdump_pkt (KDP_SEEK, NULL, sizeof(hoffset) , &hoffset)) < 0) { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - - if ((panic_error = kdp_send_crashdump_data (KDP_DATA, NULL, tir1.hoffset , (caddr_t) header)) < 0) { - printf ("kdp_send_crashdump_data failed with error %d\n", panic_error); - error = panic_error; - goto out; - } - - /* last packet */ - if ((panic_error = kdp_send_crashdump_pkt (KDP_EOF, NULL, 0, ((void *) 0))) < 0) - { - printf ("kdp_send_crashdump_pkt failed with error %d\n", panic_error); - error = panic_error; - goto out; - } -out: - return (error); -} diff --git a/osfmk/kern/Makefile b/osfmk/kern/Makefile index cf8f5539e..846b95682 100644 --- a/osfmk/kern/Makefile +++ b/osfmk/kern/Makefile @@ -24,6 +24,7 @@ EXPORT_ONLY_FILES = \ kalloc.h \ kext_alloc.h \ kern_types.h \ + ledger.h \ lock.h \ locks.h \ host.h \ diff --git a/osfmk/kern/affinity.c b/osfmk/kern/affinity.c index eb5095459..002482dea 100644 --- a/osfmk/kern/affinity.c +++ b/osfmk/kern/affinity.c @@ -528,7 +528,10 @@ affinity_set_place(affinity_space_t aspc, affinity_set_t new_aset) */ queue_iterate(&aspc->aspc_affinities, aset, affinity_set_t, aset_affinities) { - set_occupancy[aset->aset_num]++; + if(aset->aset_num < num_cpu_asets) + set_occupancy[aset->aset_num]++; + else + panic("aset_num = %d in %s\n", aset->aset_num, __FUNCTION__); } /* diff --git a/osfmk/kern/ast.c b/osfmk/kern/ast.c index e7b895598..9f6757a6b 100644 --- a/osfmk/kern/ast.c +++ b/osfmk/kern/ast.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -78,10 +78,11 @@ #include #include #include +#include #include #include // for CHUD AST hook #include - +#include // for MACF AST hook volatile perfASTCallback perfASTHook; @@ -91,6 +92,8 @@ ast_init(void) { } +extern void chudxnu_thread_ast(thread_t); // XXX this should probably be in a header... + /* * Called at splsched. */ @@ -157,13 +160,35 @@ ast_taken( bsd_ast(thread); } #endif - +#if CONFIG_MACF + /* + * Handle MACF hook. + */ + if (reasons & AST_MACF) { + thread_ast_clear(thread, AST_MACF); + mac_thread_userret(thread); + } +#endif /* * Thread APC hook. */ if (reasons & AST_APC) act_execute_returnhandlers(); + if (reasons & AST_LEDGER) { + thread_ast_clear(thread, AST_LEDGER); + ledger_ast(thread); + } + + /* + * Kernel Profiling Hook + */ + if (reasons & AST_KPERF) + { + thread_ast_clear(thread, AST_KPERF); + chudxnu_thread_ast(thread); + } + ml_set_interrupts_enabled(FALSE); /* diff --git a/osfmk/kern/ast.h b/osfmk/kern/ast.h index b6f42e4ec..713c6158c 100644 --- a/osfmk/kern/ast.h +++ b/osfmk/kern/ast.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -89,12 +89,16 @@ typedef uint32_t ast_t; #define AST_HANDOFF 0x08 #define AST_YIELD 0x10 #define AST_APC 0x20 /* migration APC hook */ +#define AST_LEDGER 0x40 + /* * JMM - This is here temporarily. AST_BSD is used to simulate a * general purpose mechanism for setting asynchronous procedure calls * from the outside. 
*/ #define AST_BSD 0x80 +#define AST_KPERF 0x100 /* kernel profiling */ +#define AST_MACF 0x200 /* MACF user ret pending */ #define AST_NONE 0x00 #define AST_ALL (~AST_NONE) @@ -140,7 +144,7 @@ extern ast_t *ast_pending(void); #define MACHINE_AST_PER_THREAD 0 #endif -#define AST_PER_THREAD (AST_APC | AST_BSD | MACHINE_AST_PER_THREAD) +#define AST_PER_THREAD (AST_APC | AST_BSD | AST_MACF | MACHINE_AST_PER_THREAD | AST_LEDGER) /* * ast_pending(), ast_on(), ast_off(), ast_context(), and ast_propagate() * assume splsched. diff --git a/osfmk/kern/audit_sessionport.c b/osfmk/kern/audit_sessionport.c index 7e8ee9c30..9d3bd7ab9 100644 --- a/osfmk/kern/audit_sessionport.c +++ b/osfmk/kern/audit_sessionport.c @@ -86,11 +86,11 @@ audit_session_mksend(struct auditinfo_addr *aia_p, ipc_port_t *sessionport) audit_session_aiaref(aia_p); - /* Need a send-once right for the target of the notification */ - notifyport = ipc_port_make_sonce(port); - /* Request a no-senders notification (at the new make-send threshold) */ ip_lock(port); + /* Need a send-once right for the target of the notification */ + notifyport = ipc_port_make_sonce_locked(port); + /* Request a no-senders notification (at the new make-send threshold) */ ipc_port_nsrequest(port, port->ip_mscount, notifyport, ¬ifyport); /* port unlocked */ @@ -175,9 +175,7 @@ audit_session_nosenders(mach_msg_header_t *msg) * request, re-arm the notification with the new threshold. */ if (port->ip_mscount > notification->not_count) { - ip_unlock(port); - notifyport = ipc_port_make_sonce(port); - ip_lock(port); + notifyport = ipc_port_make_sonce_locked(port); ipc_port_nsrequest(port, port->ip_mscount, notifyport, ¬ifyport); /* port unlocked */ diff --git a/osfmk/kern/bsd_kern.c b/osfmk/kern/bsd_kern.c index 07de86ef0..ea5f9e139 100644 --- a/osfmk/kern/bsd_kern.c +++ b/osfmk/kern/bsd_kern.c @@ -43,7 +43,6 @@ #include /* last */ #undef thread_should_halt -#undef ipc_port_release /* BSD KERN COMPONENT INTERFACE */ @@ -54,10 +53,8 @@ extern unsigned int not_in_kdp; /* Skip acquiring locks if we're in kdp */ thread_t get_firstthread(task_t); int get_task_userstop(task_t); int get_thread_userstop(thread_t); -boolean_t thread_should_abort(thread_t); boolean_t current_thread_aborted(void); void task_act_iterate_wth_args(task_t, void(*)(thread_t, void *), void *); -void ipc_port_release(ipc_port_t); kern_return_t get_signalact(task_t , thread_t *, int); int get_vmsubmap_entries(vm_map_t, vm_object_offset_t, vm_object_offset_t); void syscall_exit_funnelcheck(void); @@ -218,6 +215,11 @@ check_actforsig( return (result); } +ledger_t get_task_ledger(task_t t) +{ + return(t->ledger); +} + /* * This is only safe to call from a thread executing in * in the task's context or if the task is locked Otherwise, @@ -302,8 +304,9 @@ swap_task_map(task_t task, thread_t thread, vm_map_t map, boolean_t doswitch) mp_disable_preemption(); old_map = task->map; thread->map = task->map = map; - if (doswitch) + if (doswitch) { pmap_switch(map->pmap); + } mp_enable_preemption(); task_unlock(task); @@ -458,6 +461,26 @@ get_thread_userstop( return(th->user_stop_count); } +/* + * + */ +boolean_t +get_task_pidsuspended( + task_t task) +{ + return (task->pidsuspended); +} + +/* + * + */ +boolean_t +get_task_frozen( + task_t task) +{ + return (task->frozen); +} + /* * */ @@ -519,12 +542,6 @@ task_act_iterate_wth_args( task_unlock(task); } -void -ipc_port_release( - ipc_port_t port) -{ - ipc_object_release(&(port)->ip_object); -} void astbsd_on(void) @@ -567,6 +584,10 @@ fill_taskprocinfo(task_t 
task, struct proc_taskinfo_internal * ptinfo) queue_iterate(&task->threads, thread, thread_t, task_threads) { uint64_t tval; + spl_t x; + + x = splsched(); + thread_lock(thread); if ((thread->state & TH_RUN) == TH_RUN) numrunning++; @@ -576,11 +597,21 @@ fill_taskprocinfo(task_t task, struct proc_taskinfo_internal * ptinfo) tinfo.total_user += tval; tval = timer_grab(&thread->system_timer); - tinfo.threads_system += tval; - tinfo.total_system += tval; + + if (thread->precise_user_kernel_time) { + tinfo.threads_system += tval; + tinfo.total_system += tval; + } else { + /* system_timer may represent either sys or user */ + tinfo.threads_user += tval; + tinfo.total_user += tval; + } syscalls_unix += thread->syscalls_unix; syscalls_mach += thread->syscalls_mach; + + thread_unlock(thread); + splx(x); } ptinfo->pti_total_system = tinfo.total_system; @@ -604,19 +635,21 @@ fill_taskprocinfo(task_t task, struct proc_taskinfo_internal * ptinfo) } int -fill_taskthreadinfo(task_t task, uint64_t thaddr, struct proc_threadinfo_internal * ptinfo, void * vpp, int *vidp) +fill_taskthreadinfo(task_t task, uint64_t thaddr, int thuniqueid, struct proc_threadinfo_internal * ptinfo, void * vpp, int *vidp) { thread_t thact; int err=0; mach_msg_type_number_t count; thread_basic_info_data_t basic_info; kern_return_t kret; + uint64_t addr = 0; task_lock(task); for (thact = (thread_t)queue_first(&task->threads); !queue_end(&task->threads, (queue_entry_t)thact); ) { - if (thact->machine.cthread_self == thaddr) + addr = (thuniqueid==0)?thact->machine.cthread_self: thact->thread_id; + if (addr == thaddr) { count = THREAD_BASIC_INFO_COUNT; @@ -624,14 +657,9 @@ fill_taskthreadinfo(task_t task, uint64_t thaddr, struct proc_threadinfo_interna err = 1; goto out; } -#if 0 - ptinfo->pth_user_time = timer_grab(&basic_info.user_time); - ptinfo->pth_system_time = timer_grab(&basic_info.system_time); -#else ptinfo->pth_user_time = ((basic_info.user_time.seconds * NSEC_PER_SEC) + (basic_info.user_time.microseconds * NSEC_PER_USEC)); ptinfo->pth_system_time = ((basic_info.system_time.seconds * NSEC_PER_SEC) + (basic_info.system_time.microseconds * NSEC_PER_USEC)); -#endif ptinfo->pth_cpu_usage = basic_info.cpu_usage; ptinfo->pth_policy = basic_info.policy; ptinfo->pth_run_state = basic_info.run_state; diff --git a/osfmk/kern/clock.c b/osfmk/kern/clock.c index e9c487ad6..c7aaa6faf 100644 --- a/osfmk/kern/clock.c +++ b/osfmk/kern/clock.c @@ -126,6 +126,9 @@ static thread_call_data_t calend_wakecall; extern void IOKitResetTime(void); +void _clock_delay_until_deadline(uint64_t interval, + uint64_t deadline); + static uint64_t clock_boottime; /* Seconds boottime epoch */ #define TIME_ADD(rsecs, secs, rfrac, frac, unit) \ @@ -773,6 +776,15 @@ mach_wait_until_continue( /*NOTREACHED*/ } +/* + * mach_wait_until_trap: Suspend execution of calling thread until the specified time has passed + * + * Parameters: args->deadline Amount of time to wait + * + * Returns: 0 Success + * !0 Not success + * + */ kern_return_t mach_wait_until_trap( struct mach_wait_until_trap_args *args) @@ -796,27 +808,44 @@ clock_delay_until( if (now >= deadline) return; - if ( (deadline - now) < (8 * sched_cswtime) || + _clock_delay_until_deadline(deadline - now, deadline); +} + +/* + * Preserve the original precise interval that the client + * requested for comparison to the spin threshold. 
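The helper defined just below, _clock_delay_until_deadline(), carries the caller's original interval through to ml_delay_should_spin(), replacing the old heuristic of comparing the remaining time against eight context-switch times; the spin-versus-block decision is therefore made on the precise requested duration, while the wait itself is still armed on the absolute deadline. From a caller's point of view nothing changes; a usage sketch:

/* Usage sketch: a 50-microsecond pause. With an interval this short,
 * ml_delay_should_spin() will typically elect to spin in
 * machine_delay_until() rather than block and reschedule. */
delay_for_interval(50, NSEC_PER_USEC);

/* A coarser 10 ms delay is far more likely to assert_wait and block: */
delay_for_interval(10, NSEC_PER_MSEC);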
+ */ +void +_clock_delay_until_deadline( + uint64_t interval, + uint64_t deadline) +{ + + if (interval == 0) + return; + + if ( ml_delay_should_spin(interval) || get_preemption_level() != 0 || - ml_get_interrupts_enabled() == FALSE ) + ml_get_interrupts_enabled() == FALSE ) { machine_delay_until(deadline); - else { - assert_wait_deadline((event_t)clock_delay_until, THREAD_UNINT, deadline - sched_cswtime); + } else { + assert_wait_deadline((event_t)clock_delay_until, THREAD_UNINT, deadline); thread_block(THREAD_CONTINUE_NULL); } } + void delay_for_interval( uint32_t interval, uint32_t scale_factor) { - uint64_t end; + uint64_t abstime; - clock_interval_to_deadline(interval, scale_factor, &end); + clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime); - clock_delay_until(end); + _clock_delay_until_deadline(abstime, mach_absolute_time() + abstime); } void diff --git a/osfmk/kern/clock.h b/osfmk/kern/clock.h index d456198db..ed8218d17 100644 --- a/osfmk/kern/clock.h +++ b/osfmk/kern/clock.h @@ -120,17 +120,6 @@ extern void clock_gettimeofday_set_commpage( extern void machine_delay_until( uint64_t deadline); -#include - -#if STAT_TIME || GPROF - -extern void hertz_tick( - natural_t ticks, - boolean_t usermode, /* executing user code */ - natural_t pc); - -#endif /* STAT_TIME */ - extern uint32_t hz_tick_interval; extern void absolutetime_to_nanotime( @@ -267,10 +256,12 @@ extern void nanoseconds_to_absolutetime( } \ } while (0) +#include -extern mach_timespec_t clock_get_system_value(void); +/* Use mach_absolute_time() */ +extern mach_timespec_t clock_get_system_value(void) __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_8, __IPHONE_2_0, __IPHONE_NA); -extern mach_timespec_t clock_get_calendar_value(void); +extern mach_timespec_t clock_get_calendar_value(void) __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0, __MAC_10_8, __IPHONE_2_0, __IPHONE_NA); #else /* __LP64__ */ diff --git a/osfmk/kern/debug.c b/osfmk/kern/debug.c index 1dd1aee28..b89774897 100644 --- a/osfmk/kern/debug.c +++ b/osfmk/kern/debug.c @@ -55,8 +55,6 @@ */ #include -#include -#include #include #include @@ -123,6 +121,9 @@ unsigned int debug_buf_size = sizeof(debug_buf); static char model_name[64]; /* uuid_string_t */ char kernel_uuid[37]; +static spl_t panic_prologue(const char *str); +static void panic_epilogue(spl_t s); + struct pasc { unsigned a: 7; unsigned b: 7; @@ -227,13 +228,10 @@ void _consume_panic_args(int a __unused, ...) panic("panic"); } -void -panic(const char *str, ...) +static spl_t +panic_prologue(const char *str) { - va_list listp; spl_t s; - thread_t thread; - wait_queue_t wq; if (kdebug_enable) { ml_set_interrupts_enabled(TRUE); @@ -255,21 +253,14 @@ panic(const char *str, ...) panic_safe(); - thread = current_thread(); /* Get failing thread */ - wq = thread->wait_queue; /* Save the old value */ - thread->wait_queue = NULL; /* Clear the wait so we do not get double panics when we try locks */ - if( logPanicDataToScreen ) disable_debug_output = FALSE; debug_mode = TRUE; - /* panic_caller is initialized to 0. If set, don't change it */ - if ( ! 
panic_caller ) - panic_caller = (unsigned long)(char *)__builtin_return_address(0); - restart: PANIC_LOCK(); + if (panicstr) { if (cpu_number() != paniccpu) { PANIC_UNLOCK(); @@ -294,26 +285,19 @@ restart: panicwait = 1; PANIC_UNLOCK(); - kdb_printf("panic(cpu %d caller 0x%lx): ", (unsigned) paniccpu, panic_caller); - if (str) { - va_start(listp, str); - _doprnt(str, &listp, consdebug_putc, 0); - va_end(listp); - } - kdb_printf("\n"); + return(s); +} - /* - * Release panicwait indicator so that other cpus may call Debugger(). - */ - panicwait = 0; - Debugger("panic"); + +static void +panic_epilogue(spl_t s) +{ /* * Release panicstr so that we can handle normally other panics. */ PANIC_LOCK(); panicstr = (char *)0; PANIC_UNLOCK(); - thread->wait_queue = wq; /* Restore the wait queue */ if (return_on_panic) { panic_normal(); @@ -321,12 +305,65 @@ restart: splx(s); return; } - kdb_printf("panic: We are hanging here...\n"); panic_stop(); /* NOTREACHED */ } +void +panic(const char *str, ...) +{ + va_list listp; + spl_t s; + + /* panic_caller is initialized to 0. If set, don't change it */ + if ( ! panic_caller ) + panic_caller = (unsigned long)(char *)__builtin_return_address(0); + + s = panic_prologue(str); + kdb_printf("panic(cpu %d caller 0x%lx): ", (unsigned) paniccpu, panic_caller); + if (str) { + va_start(listp, str); + _doprnt(str, &listp, consdebug_putc, 0); + va_end(listp); + } + kdb_printf("\n"); + + /* + * Release panicwait indicator so that other cpus may call Debugger(). + */ + panicwait = 0; + Debugger("panic"); + panic_epilogue(s); +} + +void +panic_context(unsigned int reason, void *ctx, const char *str, ...) +{ + va_list listp; + spl_t s; + + /* panic_caller is initialized to 0. If set, don't change it */ + if ( ! panic_caller ) + panic_caller = (unsigned long)(char *)__builtin_return_address(0); + + s = panic_prologue(str); + kdb_printf("panic(cpu %d caller 0x%lx): ", (unsigned) paniccpu, panic_caller); + if (str) { + va_start(listp, str); + _doprnt(str, &listp, consdebug_putc, 0); + va_end(listp); + } + kdb_printf("\n"); + + /* + * Release panicwait indicator so that other cpus may call Debugger(). + */ + panicwait = 0; + DebuggerWithContext(reason, ctx, "panic"); + panic_epilogue(s); +} + void log(__unused int level, char *fmt, ...) { @@ -446,6 +483,15 @@ static void panic_display_kernel_uuid(void) { kdb_printf("Kernel UUID: %s\n", tmp_kernel_uuid); } +static void panic_display_kernel_aslr(void) { +#if defined(__x86_64__) + if (vm_kernel_slide) { + kdb_printf("Kernel slide: 0x%016lx\n", vm_kernel_slide); + kdb_printf("Kernel text base: %p\n", (void *) vm_kernel_stext); + } +#endif +} + static void panic_display_uptime(void) { uint64_t uptime; absolutetime_to_nanoseconds(mach_absolute_time(), &uptime); @@ -469,6 +515,7 @@ __private_extern__ void panic_display_system_configuration(void) { (osversion[0] != 0) ? 
osversion : "Not yet set"); kdb_printf("\nKernel version:\n%s\n",version); panic_display_kernel_uuid(); + panic_display_kernel_aslr(); panic_display_pal_info(); panic_display_model_name(); panic_display_uptime(); diff --git a/osfmk/kern/debug.h b/osfmk/kern/debug.h index 66702bc16..c773ca623 100644 --- a/osfmk/kern/debug.h +++ b/osfmk/kern/debug.h @@ -45,6 +45,8 @@ struct thread_snapshot { uint64_t user_time; uint64_t system_time; int32_t state; + int32_t sched_pri; // scheduled (current) priority + int32_t sched_flags; // scheduler flags char ss_flags; } __attribute__ ((packed)); @@ -69,7 +71,7 @@ struct task_snapshot { } __attribute__ ((packed)); -struct mem_snapshot { +struct mem_and_io_snapshot { uint32_t snapshot_magic; uint32_t free_pages; uint32_t active_pages; @@ -78,24 +80,33 @@ struct mem_snapshot { uint32_t wired_pages; uint32_t speculative_pages; uint32_t throttled_pages; + int busy_buffer_count; + uint32_t pages_wanted; + uint32_t pages_reclaimed; + uint8_t pages_wanted_reclaimed_valid; // did mach_vm_pressure_monitor succeed? } __attribute__((packed)); + enum { kUser64_p = 0x1, kKernel64_p = 0x2, kHasDispatchSerial = 0x4, - kTerminatedSnapshot = 0x8 + kTerminatedSnapshot = 0x8, + kPidSuspended = 0x10, // true for suspended task + kFrozen = 0x20 // true for hibernated task (along with pidsuspended) }; +#define VM_PRESSURE_TIME_WINDOW 5 /* seconds */ + enum { STACKSHOT_GET_DQ = 0x1, STACKSHOT_SAVE_LOADINFO = 0x2, STACKSHOT_GET_GLOBAL_MEM_STATS = 0x4 }; -#define STACKSHOT_THREAD_SNAPSHOT_MAGIC 0xfeedface -#define STACKSHOT_TASK_SNAPSHOT_MAGIC 0xdecafbad -#define STACKSHOT_MEM_SNAPSHOT_MAGIC 0xabcddcba +#define STACKSHOT_THREAD_SNAPSHOT_MAGIC 0xfeedface +#define STACKSHOT_TASK_SNAPSHOT_MAGIC 0xdecafbad +#define STACKSHOT_MEM_AND_IO_SNAPSHOT_MAGIC 0xbfcabcde #endif /* __APPLE_API_UNSTABLE */ #endif /* __APPLE_API_PRIVATE */ @@ -204,7 +215,7 @@ extern int debug_kprint_current_process(const char **namep); } while (0) #else /* !DEBUG */ #define DEBUG_KPRINT_SYSCALL_PREDICATE_INTERNAL(mask, namep) (0) -#define DEBUG_KPRINT_SYSCALL_MASK(mask, fmt, args...) do { } while(0) +#define DEBUG_KPRINT_SYSCALL_MASK(mask, fmt, args...) do { } while (0) /* kprintf(fmt, args) */ #endif /* !DEBUG */ enum { @@ -233,6 +244,7 @@ extern void panic(const char *string, ...) __printflike(1,2); #if KERNEL_PRIVATE void _consume_panic_args(int, ...); +void panic_context(unsigned int reason, void *ctx, const char *string, ...); #endif #if CONFIG_NO_PANIC_STRINGS diff --git a/osfmk/kern/exception.c b/osfmk/kern/exception.c index 27082522f..8df6e268b 100644 --- a/osfmk/kern/exception.c +++ b/osfmk/kern/exception.c @@ -56,8 +56,6 @@ /* */ -#include - #include #include #include @@ -86,22 +84,7 @@ #include #include #include - -#if MACH_KDB -#include -#endif /* MACH_KDB */ - -#if MACH_KDB - -#include - -#if iPSC386 || iPSC860 -boolean_t debug_user_with_kdb = TRUE; -#else -boolean_t debug_user_with_kdb = FALSE; -#endif - -#endif /* MACH_KDB */ +#include unsigned long c_thr_exc_raise = 0; unsigned long c_thr_exc_raise_state = 0; @@ -328,13 +311,11 @@ exception_triage( assert(exception != EXC_RPC_ALERT); - if (exception == KERN_SUCCESS) - panic("exception"); + thread = current_thread(); /* * Try to raise the exception at the activation level. */ - thread = current_thread(); mutex = &thread->mutex; excp = &thread->exc_actions[exception]; kr = exception_deliver(thread, exception, code, codeCnt, excp, mutex); @@ -365,22 +346,10 @@ exception_triage( * Nobody handled it, terminate the task. 
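The escalation order in exception_triage() is unchanged (thread, then task, then host), but two things move in this area: the dead MACH_KDB fallback goes away, and, in the code that follows, abnormal_exit_notify() is generalized into task_exception_notify(), which takes the exception type so callers can raise EXC_RESOURCE as well as EXC_CRASH; exception_triage() correspondingly returns normally for both rather than calling thread_exception_return(). A caller-side sketch under the new signature (the zero code/subcode values are placeholders, not meaningful encodings):

/* Sketch: raise an exception on the current task via the generalized
 * notifier. Code/subcode layout is exception-specific; zeros here are
 * placeholders only. */
mach_exception_data_type_t code = 0, subcode = 0;

(void) task_exception_notify(EXC_RESOURCE, code, subcode);	/* new capability */
(void) task_exception_notify(EXC_CRASH, code, subcode);	/* old behaviour */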
*/ -#if MACH_KDB - if (debug_user_with_kdb) { - /* - * Debug the exception with kdb. - * If kdb handles the exception, - * then thread_kdb_return won't return. - */ - db_printf("No exception server, calling kdb...\n"); - thread_kdb_return(); - } -#endif /* MACH_KDB */ - (void) task_terminate(task); out: - if (exception != EXC_CRASH) + if ((exception != EXC_CRASH) && (exception != EXC_RESOURCE)) thread_exception_return(); return; } @@ -413,11 +382,11 @@ bsd_exception( /* - * Raise an EXC_CRASH exception on the dying task. + * Raise an exception on a task. * This should tell launchd to launch Crash Reporter for this task. */ -kern_return_t abnormal_exit_notify(mach_exception_data_type_t exccode, - mach_exception_data_type_t excsubcode) +kern_return_t task_exception_notify(exception_type_t exception, + mach_exception_data_type_t exccode, mach_exception_data_type_t excsubcode) { mach_exception_data_type_t code[EXCEPTION_CODE_MAX]; wait_interrupt_t wsave; @@ -426,7 +395,7 @@ kern_return_t abnormal_exit_notify(mach_exception_data_type_t exccode, code[1] = excsubcode; wsave = thread_interrupt_level(THREAD_UNINT); - exception_triage(EXC_CRASH, code, EXCEPTION_CODE_MAX); + exception_triage(exception, code, EXCEPTION_CODE_MAX); (void) thread_interrupt_level(wsave); return (KERN_SUCCESS); } diff --git a/osfmk/kern/exception.h b/osfmk/kern/exception.h index 78e659a97..a35895d8b 100644 --- a/osfmk/kern/exception.h +++ b/osfmk/kern/exception.h @@ -58,7 +58,7 @@ extern void exception_triage( extern kern_return_t sys_perf_notify(thread_t thread, int pid); /* Notify crash reporter */ -extern kern_return_t abnormal_exit_notify(mach_exception_data_type_t code, - mach_exception_data_type_t subcode); +extern kern_return_t task_exception_notify(exception_type_t exception, + mach_exception_data_type_t code, mach_exception_data_type_t subcode); #endif /* _KERN_EXCEPTION_H_ */ diff --git a/osfmk/kern/gzalloc.c b/osfmk/kern/gzalloc.c new file mode 100644 index 000000000..10b315d78 --- /dev/null +++ b/osfmk/kern/gzalloc.c @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * File: kern/gzalloc.c + * Author: Derek Kumar + * + * "Guard mode" zone allocator, used to trap use-after-free errors, + * overruns, underruns, mismatched allocations/frees, uninitialized + * zone element use, timing dependent races etc. + * + * The allocator is configured by these boot-args: + * gzalloc_size=: target all zones with elements of bytes + * gzalloc_min=: target zones with elements >= size + * gzalloc_max=: target zones with elements <= size + * gzalloc_min/max can be specified in conjunction to target a range of + * sizes + * gzalloc_fc_size=: number of zone elements (effectively page + * multiple sized) to retain in the free VA cache. This cache is evicted + * (backing pages and VA released) in a least-recently-freed fashion. + * Larger free VA caches allow for a longer window of opportunity to trap + * delayed use-after-free operations, but use more memory. + * -gzalloc_wp: Write protect, rather than unmap, freed allocations + * lingering in the free VA cache. Useful to disambiguate between + * read-after-frees/read overruns and writes. Also permits direct inspection + * of the freed element in the cache via the kernel debugger. As each + * element has a "header" (trailer in underflow detection mode), the zone + * of origin of the element can be easily determined in this mode. + * -gzalloc_uf_mode: Underflow detection mode, where the guard page + * adjoining each element is placed *before* the element page rather than + * after. The element is also located at the top of the page, rather than + * abutting the bottom as with the standard overflow detection mode. + * -gzalloc_noconsistency: disable consistency checks that flag mismatched + * frees, corruptions of the header/trailer signatures etc. + * -nogzalloc_mode: Disables the guard mode allocator. The DEBUG kernel + * enables the guard allocator for zones sized 8K-16K (if present) by + * default, this option can disable that behaviour. 
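Taken together, the boot-args compose naturally. For instance, to guard every zone with element sizes between 1 KiB and 4 KiB, keep 512 freed elements in the quarantine cache, and write-protect rather than unmap quarantined pages (so freed elements stay inspectable from the debugger), one might boot with something like the following, with illustrative values only:

	gzalloc_min=1024 gzalloc_max=4096 gzalloc_fc_size=512 -gzalloc_wp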
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include + +extern boolean_t vm_kernel_ready, kmem_ready; +boolean_t gzalloc_mode = FALSE; +uint32_t pdzalloc_count, pdzfree_count; + +#define GZALLOC_MIN_DEFAULT (1024) +#define GZDEADZONE ((zone_t) 0xDEAD201E) +#define GZALLOC_SIGNATURE (0xABADCAFE) +#define GZALLOC_RESERVE_SIZE_DEFAULT (2 * 1024 * 1024) +#define GZFC_DEFAULT_SIZE (1024) + +char gzalloc_fill_pattern = 0x67; /* 'g' */ + +uint32_t gzalloc_min = ~0U; +uint32_t gzalloc_max = 0; +uint32_t gzalloc_size = 0; +uint64_t gzalloc_allocated, gzalloc_freed, gzalloc_early_alloc, gzalloc_early_free, gzalloc_wasted; +boolean_t gzalloc_uf_mode = FALSE, gzalloc_consistency_checks = TRUE; +vm_prot_t gzalloc_prot = VM_PROT_NONE; +uint32_t gzalloc_guard = KMA_GUARD_LAST; +uint32_t gzfc_size = GZFC_DEFAULT_SIZE; + +vm_map_t gzalloc_map; +vm_offset_t gzalloc_map_min, gzalloc_map_max; +vm_offset_t gzalloc_reserve; +vm_size_t gzalloc_reserve_size; + +typedef struct gzalloc_header { + zone_t gzone; + uint32_t gzsize; + uint32_t gzsig; +} gzhdr_t; + +#define GZHEADER_SIZE (sizeof(gzhdr_t)) + +extern zone_t vm_page_zone; + +void gzalloc_reconfigure(__unused zone_t z) { + /* Nothing for now */ +} + +boolean_t gzalloc_enabled(void) { + return gzalloc_mode; +} + +void gzalloc_zone_init(zone_t z) { + if (gzalloc_mode) { + bzero(&z->gz, sizeof(z->gz)); + + if (gzfc_size && (z->elem_size >= gzalloc_min) && (z->elem_size <= gzalloc_max) && (z->gzalloc_exempt == FALSE)) { + vm_size_t gzfcsz = round_page(sizeof(*z->gz.gzfc) * gzfc_size); + + /* If the VM/kmem system aren't yet configured, carve + * out the free element cache structure directly from the + * gzalloc_reserve supplied by the pmap layer. 
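The !kmem_ready path mentioned above reduces to a classic early-boot bump allocator over the reserve that gzalloc_configure() steals from the pmap layer; a sketch of that pattern, which the real code below inlines:

/* Sketch: early-boot bump allocation from the pre-stolen reserve,
 * mirroring the !kmem_ready branch of gzalloc_zone_init(). */
static vm_offset_t
carve_from_reserve(vm_size_t size)
{
	vm_offset_t p;

	if (gzalloc_reserve_size < size)
		panic("gzalloc reserve exhausted");

	p = gzalloc_reserve;
	gzalloc_reserve      += size;
	gzalloc_reserve_size -= size;
	return (p);
}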
+ */ + if (!kmem_ready) { + if (gzalloc_reserve_size < gzfcsz) + panic("gzalloc reserve exhausted"); + + z->gz.gzfc = (vm_offset_t *)gzalloc_reserve; + gzalloc_reserve += gzfcsz; + gzalloc_reserve_size -= gzfcsz; + } else { + kern_return_t kr; + + if ((kr = kernel_memory_allocate(kernel_map, (vm_offset_t *)&z->gz.gzfc, gzfcsz, 0, KMA_KOBJECT)) != KERN_SUCCESS) { + panic("zinit/gzalloc: kernel_memory_allocate failed (%d) for 0x%lx bytes", kr, (unsigned long) gzfcsz); + } + } + bzero((void *)z->gz.gzfc, gzfcsz); + } + } +} + +void gzalloc_configure(void) { + char temp_buf[16]; + + if (PE_parse_boot_argn("-gzalloc_mode", temp_buf, sizeof (temp_buf))) { + gzalloc_mode = TRUE; + gzalloc_min = GZALLOC_MIN_DEFAULT; +#if ZONE_DEBUG + gzalloc_min += (typeof(gzalloc_min))ZONE_DEBUG_OFFSET; +#endif + gzalloc_max = ~0U; + } + + if (PE_parse_boot_argn("gzalloc_min", &gzalloc_min, sizeof(gzalloc_min))) { +#if ZONE_DEBUG + gzalloc_min += (typeof(gzalloc_min))ZONE_DEBUG_OFFSET; +#endif + gzalloc_mode = TRUE; + gzalloc_max = ~0U; + } + + if (PE_parse_boot_argn("gzalloc_max", &gzalloc_max, sizeof(gzalloc_max))) { +#if ZONE_DEBUG + gzalloc_max += (typeof(gzalloc_min))ZONE_DEBUG_OFFSET; +#endif + gzalloc_mode = TRUE; + if (gzalloc_min == ~0U) + gzalloc_min = 0; + } + + if (PE_parse_boot_argn("gzalloc_size", &gzalloc_size, sizeof(gzalloc_size))) { +#if ZONE_DEBUG + gzalloc_size += (typeof(gzalloc_min))ZONE_DEBUG_OFFSET; +#endif + gzalloc_min = gzalloc_max = gzalloc_size; + gzalloc_mode = TRUE; + } + + (void)PE_parse_boot_argn("gzalloc_fc_size", &gzfc_size, sizeof(gzfc_size)); + + if (PE_parse_boot_argn("-gzalloc_wp", temp_buf, sizeof (temp_buf))) { + gzalloc_prot = VM_PROT_READ; + } + + if (PE_parse_boot_argn("-gzalloc_uf_mode", temp_buf, sizeof (temp_buf))) { + gzalloc_uf_mode = TRUE; + gzalloc_guard = KMA_GUARD_FIRST; + } + + if (PE_parse_boot_argn("-gzalloc_noconsistency", temp_buf, sizeof (temp_buf))) { + gzalloc_consistency_checks = FALSE; + } +#if DEBUG + if (gzalloc_mode == FALSE) { + gzalloc_min = 8192; + gzalloc_max = 16384; + gzalloc_prot = VM_PROT_READ; + gzalloc_mode = TRUE; + } +#endif + if (PE_parse_boot_argn("-nogzalloc_mode", temp_buf, sizeof (temp_buf))) + gzalloc_mode = FALSE; + + if (gzalloc_mode) { + gzalloc_reserve_size = GZALLOC_RESERVE_SIZE_DEFAULT; + gzalloc_reserve = (vm_offset_t) pmap_steal_memory(gzalloc_reserve_size); + } +} + +void gzalloc_init(vm_size_t max_zonemap_size) { + kern_return_t retval; + + if (gzalloc_mode) { + retval = kmem_suballoc(kernel_map, &gzalloc_map_min, (max_zonemap_size << 2), + FALSE, VM_FLAGS_ANYWHERE | VM_FLAGS_PERMANENT, + &gzalloc_map); + + if (retval != KERN_SUCCESS) + panic("zone_init: kmem_suballoc(gzalloc) failed"); + gzalloc_map_max = gzalloc_map_min + (max_zonemap_size << 2); + } +} + +vm_offset_t +gzalloc_alloc(zone_t zone, boolean_t canblock) { + vm_offset_t addr = 0; + + if (__improbable(gzalloc_mode && + (((zone->elem_size >= gzalloc_min) && + (zone->elem_size <= gzalloc_max))) && + (zone->gzalloc_exempt == 0))) { + + if (get_preemption_level() != 0) { + if (canblock == TRUE) { + pdzalloc_count++; + } + else + return 0; + } + + vm_offset_t rounded_size = round_page(zone->elem_size + GZHEADER_SIZE); + vm_offset_t residue = rounded_size - zone->elem_size; + vm_offset_t gzaddr = 0; + gzhdr_t *gzh; + + if (!kmem_ready || (vm_page_zone == ZONE_NULL)) { + /* Early allocations are supplied directly from the + * reserve. 
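However the backing range is obtained, the placement arithmetic that follows is the heart of the scheme: each element gets a page-rounded mapping plus one guard page, and the element is pushed against the guarded edge so that an overrun (or, under -gzalloc_uf_mode, an underrun) faults on the very first stray byte. A sketch of the layout, assuming the gzhdr_t defined above; "base" stands for the start of the rounded mapping with the guard page excluded:

/* Sketch: compute element and header placement for one guarded
 * allocation, following gzalloc_alloc() below. */
static void
gz_layout(vm_offset_t base, vm_size_t elem_size, boolean_t uf_mode,
    vm_offset_t *elem, gzhdr_t **hdr)
{
	vm_size_t rounded = round_page(elem_size + GZHEADER_SIZE);
	vm_size_t residue = rounded - elem_size;

	if (uf_mode) {
		/* Element at the bottom of the mapping, guard page below,
		 * header stored as a trailer after the element. */
		*elem = base;
		*hdr  = (gzhdr_t *)(base + elem_size);
	} else {
		/* Element abuts the top of the mapping, guard page above,
		 * header immediately precedes the element. */
		*elem = base + residue;
		*hdr  = (gzhdr_t *)(base + residue - GZHEADER_SIZE);
	}
}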
+ */ + if (gzalloc_reserve_size < rounded_size) + panic("gzalloc reserve exhausted"); + gzaddr = gzalloc_reserve; + /* No guard page for these early allocations, just + * waste an additional page. + */ + gzalloc_reserve += rounded_size + PAGE_SIZE; + gzalloc_reserve_size -= rounded_size + PAGE_SIZE; + OSAddAtomic64((SInt32) (rounded_size), &gzalloc_early_alloc); + } + else { + kern_return_t kr = kernel_memory_allocate(gzalloc_map, + &gzaddr, rounded_size + (1*PAGE_SIZE), + 0, KMA_KOBJECT | gzalloc_guard); + if (kr != KERN_SUCCESS) + panic("gzalloc: kernel_memory_allocate for size 0x%llx failed with %d", (uint64_t)rounded_size, kr); + + } + + if (gzalloc_uf_mode) { + gzaddr += PAGE_SIZE; + /* The "header" becomes a "footer" in underflow + * mode. + */ + gzh = (gzhdr_t *) (gzaddr + zone->elem_size); + addr = gzaddr; + } else { + gzh = (gzhdr_t *) (gzaddr + residue - GZHEADER_SIZE); + addr = (gzaddr + residue); + } + + /* Fill with a pattern on allocation to trap uninitialized + * data use. Since the element size may be "rounded up" + * by higher layers such as the kalloc layer, this may + * also identify overruns between the originally requested + * size and the rounded size via visual inspection. + * TBD: plumb through the originally requested size, + * prior to rounding by kalloc/IOMalloc etc. + * We also add a signature and the zone of origin in a header + * prefixed to the allocation. + */ + memset((void *)gzaddr, gzalloc_fill_pattern, rounded_size); + + gzh->gzone = (kmem_ready && vm_page_zone) ? zone : GZDEADZONE; + gzh->gzsize = (uint32_t) zone->elem_size; + gzh->gzsig = GZALLOC_SIGNATURE; + + lock_zone(zone); + zone->count++; + zone->sum_count++; + zone->cur_size += rounded_size; + unlock_zone(zone); + + OSAddAtomic64((SInt32) rounded_size, &gzalloc_allocated); + OSAddAtomic64((SInt32) (rounded_size - zone->elem_size), &gzalloc_wasted); + } + return addr; +} + +boolean_t gzalloc_free(zone_t zone, void *addr) { + boolean_t gzfreed = FALSE; + kern_return_t kr; + + if (__improbable(gzalloc_mode && + (((zone->elem_size >= gzalloc_min) && + (zone->elem_size <= gzalloc_max))) && + (zone->gzalloc_exempt == 0))) { + gzhdr_t *gzh; + vm_offset_t rounded_size = round_page(zone->elem_size + GZHEADER_SIZE); + vm_offset_t residue = rounded_size - zone->elem_size; + vm_offset_t saddr; + vm_offset_t free_addr = 0; + + if (gzalloc_uf_mode) { + gzh = (gzhdr_t *)((vm_offset_t)addr + zone->elem_size); + saddr = (vm_offset_t) addr - PAGE_SIZE; + } else { + gzh = (gzhdr_t *)((vm_offset_t)addr - GZHEADER_SIZE); + saddr = ((vm_offset_t)addr) - residue; + } + + assert((saddr & PAGE_MASK) == 0); + + if (gzalloc_consistency_checks) { + if (gzh->gzsig != GZALLOC_SIGNATURE) { + panic("GZALLOC signature mismatch for element %p, expected 0x%x, found 0x%x", addr, GZALLOC_SIGNATURE, gzh->gzsig); + } + + if (gzh->gzone != zone && (gzh->gzone != GZDEADZONE)) + panic("%s: Mismatched zone or under/overflow, current zone: %p, recorded zone: %p, address: %p", __FUNCTION__, zone, gzh->gzone, (void *)addr); + /* Partially redundant given the zone check, but may flag header corruption */ + if (gzh->gzsize != zone->elem_size) { + panic("Mismatched zfree or under/overflow for zone %p, recorded size: 0x%x, element size: 0x%x, address: %p\n", zone, gzh->gzsize, (uint32_t) zone->elem_size, (void *)addr); + } + } + + if (!kmem_ready || gzh->gzone == GZDEADZONE) { + /* For now, just leak frees of early allocations + * performed before kmem is fully configured. 
+ * They don't seem to get freed currently; + * consider ml_static_mfree in the future. + */ + OSAddAtomic64((SInt32) (rounded_size), &gzalloc_early_free); + return TRUE; + } + + if (get_preemption_level() != 0) { + pdzfree_count++; + } + + if (gzfc_size) { + /* Either write protect or unmap the newly freed + * allocation + */ + kr = vm_map_protect( + gzalloc_map, + saddr, + saddr + rounded_size + (1 * PAGE_SIZE), + gzalloc_prot, + FALSE); + if (kr != KERN_SUCCESS) + panic("%s: vm_map_protect: %p, 0x%x", __FUNCTION__, (void *)saddr, kr); + } else { + free_addr = saddr; + } + + lock_zone(zone); + + /* Insert newly freed element into the protected free element + * cache, and rotate out the LRU element. + */ + if (gzfc_size) { + if (zone->gz.gzfc_index >= gzfc_size) { + zone->gz.gzfc_index = 0; + } + free_addr = zone->gz.gzfc[zone->gz.gzfc_index]; + zone->gz.gzfc[zone->gz.gzfc_index++] = saddr; + } + + if (free_addr) { + zone->count--; + zone->cur_size -= rounded_size; + } + + unlock_zone(zone); + + if (free_addr) { + kr = vm_map_remove( + gzalloc_map, + free_addr, + free_addr + rounded_size + (1 * PAGE_SIZE), + VM_MAP_REMOVE_KUNWIRE); + if (kr != KERN_SUCCESS) + panic("gzfree: vm_map_remove: %p, 0x%x", (void *)free_addr, kr); + + OSAddAtomic64((SInt32)rounded_size, &gzalloc_freed); + OSAddAtomic64(-((SInt32) (rounded_size - zone->elem_size)), &gzalloc_wasted); + } + + gzfreed = TRUE; + } + return gzfreed; +} diff --git a/osfmk/kern/host.c b/osfmk/kern/host.c index 90542a946..dc55ce790 100644 --- a/osfmk/kern/host.c +++ b/osfmk/kern/host.c @@ -411,7 +411,12 @@ MACRO_END timer_t idle_state; GET_TICKS_VALUE(processor, CPU_STATE_USER, user_state); - GET_TICKS_VALUE(processor, CPU_STATE_SYSTEM, system_state); + if (precise_user_kernel_time) { + GET_TICKS_VALUE(processor, CPU_STATE_SYSTEM, system_state); + } else { + /* system_state may represent either sys or user */ + GET_TICKS_VALUE(processor, CPU_STATE_USER, system_state); + } idle_state = &PROCESSOR_DATA(processor, idle_state); idle_temp = *idle_state; @@ -427,6 +432,7 @@ MACRO_END } } simple_unlock(&processor_list_lock); + *count = HOST_CPU_LOAD_INFO_COUNT; return (KERN_SUCCESS); diff --git a/osfmk/kern/ipc_kobject.c b/osfmk/kern/ipc_kobject.c index 8963abea6..89b8f9e68 100644 --- a/osfmk/kern/ipc_kobject.c +++ b/osfmk/kern/ipc_kobject.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -93,7 +93,6 @@ #include #include #include -#include #include #include #include @@ -189,7 +188,6 @@ const struct mig_subsystem *mig_e[] = { (const struct mig_subsystem *)&is_iokit_subsystem, (const struct mig_subsystem *)&memory_object_name_subsystem, (const struct mig_subsystem *)&lock_set_subsystem, - (const struct mig_subsystem *)&ledger_subsystem, (const struct mig_subsystem *)&task_subsystem, (const struct mig_subsystem *)&thread_act_subsystem, #if VM32_SUPPORT @@ -317,6 +315,13 @@ ipc_kobject_server( #define InP ((mach_msg_header_t *) request->ikm_header) #define OutP ((mig_reply_error_t *) reply->ikm_header) + /* + * MIG should really assure no data leakage - + * but until it does, pessimistically zero the + * whole reply buffer. 
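The bzero that follows closes a classic kernel-to-userspace information leak: MIG reply structures contain compiler-inserted padding and conditionally written fields, and any byte the routine never stores would otherwise be copied out to the sender still holding stale kmsg contents. A sketch of the failure mode, using a hypothetical reply layout:

/* Sketch: why the pessimistic scrub matters. This hypothetical reply
 * has a padding hole the MIG routine never writes; without zeroing,
 * those bytes travel back to the sender carrying old kernel data. */
struct example_reply {
	mach_msg_header_t	Head;
	NDR_record_t		NDR;
	kern_return_t		RetCode;
	char			flag;	/* 1 byte written by the routine... */
					/* ...then 3 bytes of padding, never written */
	int			value;
};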
+ */ + bzero((void *)OutP, reply_size); + OutP->NDR = NDR_record; OutP->Head.msgh_size = sizeof(mig_reply_error_t); @@ -324,6 +329,7 @@ ipc_kobject_server( MACH_MSGH_BITS(MACH_MSGH_BITS_LOCAL(InP->msgh_bits), 0); OutP->Head.msgh_remote_port = InP->msgh_local_port; OutP->Head.msgh_local_port = MACH_PORT_NULL; + OutP->Head.msgh_reserved = (mach_msg_size_t)InP->msgh_id; /* useful for debug */ OutP->Head.msgh_id = InP->msgh_id + 100; #undef InP @@ -590,68 +596,3 @@ ipc_kobject_notify( return FALSE; } } - - - -#include -#if MACH_COUNTERS && MACH_KDB - -#include -#include - -#define printf kdbprintf - -extern void kobjserver_stats(void); -extern void bucket_stats_print(mig_hash_t *bucket); - -extern void kobjserver_stats_clear(void); - - -void -kobjserver_stats_clear(void) -{ - int i; - for (i = 0; i < MAX_MIG_ENTRIES; i++) { - mig_buckets[i].callcount = 0; - } -} - -void -kobjserver_stats(void) -{ - register unsigned int i, n = sizeof(mig_e)/sizeof(struct mig_subsystem); - register unsigned int howmany; - register mach_msg_id_t j, pos, nentry, range; - - db_printf("Kobject server call counts:\n"); - for (i = 0; i < n; i++) { - db_printf(" "); - db_printsym((vm_offset_t)mig_e[i], DB_STGY_ANY); - db_printf(":\n"); - range = mig_e[i]->end - mig_e[i]->start; - if (!mig_e[i]->start || range < 0) continue; - - for (j = 0; j < range; j++) { - nentry = j + mig_e[i]->start; - for (pos = MIG_HASH(nentry) % MAX_MIG_ENTRIES, howmany = 1; - mig_buckets[pos].num; - pos++, pos = pos % MAX_MIG_ENTRIES, howmany++) { - if (mig_buckets[pos].num == nentry) - bucket_stats_print(&mig_buckets[pos]); - } - } - } -} - -void -bucket_stats_print(mig_hash_t *bucket) -{ - if (bucket->callcount) { - db_printf(" "); - db_printsym((vm_offset_t)bucket->routine, DB_STGY_ANY); - db_printf(" (%d):\t%d\n", bucket->num, bucket->callcount); - } -} - - -#endif /* MACH_COUNTERS && MACH_KDB */ diff --git a/osfmk/kern/ipc_mig.c b/osfmk/kern/ipc_mig.c index 1b2a9163d..512073675 100644 --- a/osfmk/kern/ipc_mig.c +++ b/osfmk/kern/ipc_mig.c @@ -277,7 +277,7 @@ mach_msg_rpc_from_kernel_body( kmsg->ikm_header->msgh_bits |= MACH_MSGH_BITS(0, MACH_MSG_TYPE_MAKE_SEND_ONCE); - ipc_port_reference(reply); + ip_reference(reply); #if IKM_SUPPORT_LEGACY if(legacy) @@ -303,12 +303,12 @@ mach_msg_rpc_from_kernel_body( ip_lock(reply); if ( !ip_active(reply)) { ip_unlock(reply); - ipc_port_release(reply); + ip_release(reply); return MACH_RCV_PORT_DIED; } if (!self->active) { ip_unlock(reply); - ipc_port_release(reply); + ip_release(reply); return MACH_RCV_INTERRUPTED; } @@ -336,11 +336,11 @@ mach_msg_rpc_from_kernel_body( assert(mr == MACH_RCV_INTERRUPTED); if (self->handlers) { - ipc_port_release(reply); + ip_release(reply); return(mr); } } - ipc_port_release(reply); + ip_release(reply); /* * Check to see how much of the message/trailer can be received. 
@@ -419,7 +419,7 @@ mach_msg_overwrite( ipc_kmsg_t kmsg; mach_port_seqno_t seqno; mach_msg_return_t mr; - mach_msg_max_trailer_t *trailer; + mach_msg_trailer_size_t trailer_size; if (option & MACH_SEND_MSG) { mach_msg_size_t msg_and_trailer_size; @@ -489,22 +489,17 @@ mach_msg_overwrite( kmsg = self->ith_kmsg; seqno = self->ith_seqno; - ipc_object_release(object); + io_release(object); } while (mr == MACH_RCV_INTERRUPTED); if (mr != MACH_MSG_SUCCESS) return mr; - trailer = (mach_msg_max_trailer_t *) - ((vm_offset_t)kmsg->ikm_header + kmsg->ikm_header->msgh_size); - if (option & MACH_RCV_TRAILER_MASK) { - trailer->msgh_seqno = seqno; - trailer->msgh_context = - kmsg->ikm_header->msgh_remote_port->ip_context; - trailer->msgh_trailer_size = REQUESTED_TRAILER_SIZE(option); - } - if (rcv_size < (kmsg->ikm_header->msgh_size + trailer->msgh_trailer_size)) { + trailer_size = ipc_kmsg_add_trailer(kmsg, space, option, current_thread(), seqno, TRUE, + kmsg->ikm_header->msgh_remote_port->ip_context); + + if (rcv_size < (kmsg->ikm_header->msgh_size + trailer_size)) { ipc_kmsg_copyout_dest(kmsg, space); (void) memcpy((void *) msg, (const void *) kmsg->ikm_header, sizeof *msg); ipc_kmsg_free(kmsg); @@ -515,7 +510,7 @@ mach_msg_overwrite( if (mr != MACH_MSG_SUCCESS) { if ((mr &~ MACH_MSG_MASK) == MACH_RCV_BODY_ERROR) { ipc_kmsg_put_to_kernel(msg, kmsg, - kmsg->ikm_header->msgh_size + trailer->msgh_trailer_size); + kmsg->ikm_header->msgh_size + trailer_size); } else { ipc_kmsg_copyout_dest(kmsg, space); (void) memcpy((void *) msg, (const void *) kmsg->ikm_header, sizeof *msg); @@ -526,7 +521,7 @@ mach_msg_overwrite( } (void) memcpy((void *) msg, (const void *) kmsg->ikm_header, - kmsg->ikm_header->msgh_size + trailer->msgh_trailer_size); + kmsg->ikm_header->msgh_size + trailer_size); ipc_kmsg_free(kmsg); } diff --git a/osfmk/kern/ipc_mig.h b/osfmk/kern/ipc_mig.h index 06d3ae97e..ff7326b4b 100644 --- a/osfmk/kern/ipc_mig.h +++ b/osfmk/kern/ipc_mig.h @@ -50,7 +50,8 @@ #ifdef _MIG_TRACE_PARAMETERS_ #define __BeforeRcvCallTrace(msgid,arg1,arg2,arg3,arg4) \ - KERNEL_DEBUG_CONSTANT(KDBG_MIGCODE(msgid) | DBG_FUNC_START, \ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, \ + KDBG_MIGCODE(msgid) | DBG_FUNC_START, \ (unsigned int)(arg1), \ (unsigned int)(arg2), \ (unsigned int)(arg3), \ @@ -58,7 +59,8 @@ (unsigned int)(0)); #define __AfterRcvCallTrace(msgid,arg1,arg2,arg3,arg4) \ - KERNEL_DEBUG_CONSTANT(KDBG_MIGCODE(msgid) | DBG_FUNC_END, \ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, \ + KDBG_MIGCODE(msgid) | DBG_FUNC_END, \ (unsigned int)(arg1), \ (unsigned int)(arg2), \ (unsigned int)(arg3), \ @@ -66,7 +68,8 @@ (unsigned int)(0)); #define __BeforeSimpleCallTrace(msgid,arg1,arg2,arg3,arg4) \ - KERNEL_DEBUG_CONSTANT(KDBG_MIGCODE(msgid) | DBG_FUNC_START, \ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, \ + KDBG_MIGCODE(msgid) | DBG_FUNC_START, \ (unsigned int)(arg1), \ (unsigned int)(arg2), \ (unsigned int)(arg3), \ @@ -74,7 +77,8 @@ (unsigned int)(0)); #define __AfterSimpleCallTrace(msgid,arg1,arg2,arg3,arg4) \ - KERNEL_DEBUG_CONSTANT(KDBG_MIGCODE(msgid) | DBG_FUNC_END, \ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, \ + KDBG_MIGCODE(msgid) | DBG_FUNC_END, \ (unsigned int)(arg1), \ (unsigned int)(arg2), \ (unsigned int)(arg3), \ @@ -84,7 +88,8 @@ #else /* !_MIG_TRACE_PARAMETERS_ */ #define __BeforeRcvRpc(msgid, _NAME_) \ - KERNEL_DEBUG_CONSTANT(KDBG_MIGCODE(msgid) | DBG_FUNC_START, \ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, \ + KDBG_MIGCODE(msgid) | DBG_FUNC_START, \ (unsigned int)(0), \ (unsigned int)(0), \ (unsigned int)(0), \ @@ 
-92,7 +97,8 @@ (unsigned int)(0)); #define __AfterRcvRpc(msgid, _NAME_) \ - KERNEL_DEBUG_CONSTANT(KDBG_MIGCODE(msgid) | DBG_FUNC_END, \ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, \ + KDBG_MIGCODE(msgid) | DBG_FUNC_END, \ (unsigned int)(0), \ (unsigned int)(0), \ (unsigned int)(0), \ @@ -101,7 +107,8 @@ #define __BeforeRcvSimple(msgid, _NAME_) \ - KERNEL_DEBUG_CONSTANT(KDBG_MIGCODE(msgid) | DBG_FUNC_START, \ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, \ + KDBG_MIGCODE(msgid) | DBG_FUNC_START, \ (unsigned int)(0), \ (unsigned int)(0), \ (unsigned int)(0), \ @@ -109,7 +116,8 @@ (unsigned int)(0)); #define __AfterRcvSimple(msgid, _NAME_) \ - KERNEL_DEBUG_CONSTANT(KDBG_MIGCODE(msgid) | DBG_FUNC_END, \ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, \ + KDBG_MIGCODE(msgid) | DBG_FUNC_END, \ (unsigned int)(0), \ (unsigned int)(0), \ (unsigned int)(0), \ @@ -119,7 +127,8 @@ #endif /* !_MIG_TRACE_PARAMETERS_ */ #define _MIG_MSGID_INVALID(msgid) \ - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_MSGID_INVALID, (msgid)), \ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, \ + MACHDBG_CODE(DBG_MACH_MSGID_INVALID, (msgid)), \ (unsigned int)(0), \ (unsigned int)(0), \ (unsigned int)(0), \ diff --git a/osfmk/kern/ipc_misc.c b/osfmk/kern/ipc_misc.c index 547abeaad..6690db410 100644 --- a/osfmk/kern/ipc_misc.c +++ b/osfmk/kern/ipc_misc.c @@ -62,8 +62,8 @@ fileport_alloc(struct fileglob *fg) } ipc_kobject_set(fileport, (ipc_kobject_t)fg, IKOT_FILEPORT); - notifyport = ipc_port_make_sonce(fileport); ip_lock(fileport); /* unlocked by ipc_port_nsrequest */ + notifyport = ipc_port_make_sonce_locked(fileport); ipc_port_nsrequest(fileport, 1, notifyport, ¬ifyport); sendport = ipc_port_make_send(fileport); diff --git a/osfmk/kern/ipc_tt.c b/osfmk/kern/ipc_tt.c index 019dacd6b..61013e399 100644 --- a/osfmk/kern/ipc_tt.c +++ b/osfmk/kern/ipc_tt.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -144,7 +144,6 @@ ipc_task_init( task->itk_nself = nport; task->itk_sself = ipc_port_make_send(kport); task->itk_space = space; - space->is_fast = FALSE; #if CONFIG_MACF_MACH if (parent) @@ -326,9 +325,6 @@ ipc_task_terminate( if (IP_VALID(task->itk_registered[i])) ipc_port_release_send(task->itk_registered[i]); - ipc_port_release_send(task->wired_ledger_port); - ipc_port_release_send(task->paged_ledger_port); - /* destroy the kernel ports */ ipc_port_dealloc_kernel(kport); ipc_port_dealloc_kernel(nport); @@ -858,22 +854,10 @@ task_get_special_port( port = ipc_port_copy_send(task->itk_bootstrap); break; - case TASK_WIRED_LEDGER_PORT: - port = ipc_port_copy_send(task->wired_ledger_port); - break; - - case TASK_PAGED_LEDGER_PORT: - port = ipc_port_copy_send(task->paged_ledger_port); - break; - case TASK_SEATBELT_PORT: port = ipc_port_copy_send(task->itk_seatbelt); break; - case TASK_GSSD_PORT: - port = ipc_port_copy_send(task->itk_gssd); - break; - case TASK_ACCESS_PORT: port = ipc_port_copy_send(task->itk_task_access); break; @@ -929,22 +913,10 @@ task_set_special_port( whichp = &task->itk_bootstrap; break; - case TASK_WIRED_LEDGER_PORT: - whichp = &task->wired_ledger_port; - break; - - case TASK_PAGED_LEDGER_PORT: - whichp = &task->paged_ledger_port; - break; - case TASK_SEATBELT_PORT: whichp = &task->itk_seatbelt; break; - case TASK_GSSD_PORT: - whichp = &task->itk_gssd; - break; - case TASK_ACCESS_PORT: whichp = &task->itk_task_access; break; diff --git a/osfmk/kern/ipc_tt.h b/osfmk/kern/ipc_tt.h index 4a3a9ac1c..7c1384c41 100644 --- a/osfmk/kern/ipc_tt.h +++ b/osfmk/kern/ipc_tt.h @@ -149,18 +149,6 @@ extern thread_t convert_port_to_thread( extern thread_t port_name_to_thread( mach_port_name_t port_name); -/* Convert from a task to a port */ -extern ipc_port_t convert_task_to_port( - task_t task); - -/* Convert from a task name to a port */ -extern ipc_port_t convert_task_name_to_port( - task_name_t task_name); - -/* Convert from a thread to a port */ -extern ipc_port_t convert_thread_to_port( - thread_t thread); - /* Deallocate a space ref produced by convert_port_to_space */ extern void space_deallocate( ipc_space_t space); diff --git a/osfmk/kern/kalloc.c b/osfmk/kern/kalloc.c index f84a19956..a5febf94c 100644 --- a/osfmk/kern/kalloc.c +++ b/osfmk/kern/kalloc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include @@ -114,7 +115,8 @@ KALLOC_ZINFO_SALLOC(vm_size_t bytes) task_t task; zinfo_usage_t zinfo; - thr->tkm_shared.alloc += bytes; + ledger_debit(thr->t_ledger, task_ledgers.tkm_shared, bytes); + if (kalloc_fake_zone_index != -1 && (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) zinfo[kalloc_fake_zone_index].alloc += bytes; @@ -127,7 +129,8 @@ KALLOC_ZINFO_SFREE(vm_size_t bytes) task_t task; zinfo_usage_t zinfo; - thr->tkm_shared.free += bytes; + ledger_credit(thr->t_ledger, task_ledgers.tkm_shared, bytes); + if (kalloc_fake_zone_index != -1 && (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) zinfo[kalloc_fake_zone_index].free += bytes; @@ -135,32 +138,147 @@ KALLOC_ZINFO_SFREE(vm_size_t bytes) /* * All allocations of size less than kalloc_max are rounded to the - * next highest power of 2. This allocator is built on top of + * next nearest sized zone. 
This allocator is built on top of * the zone allocator. A zone is created for each potential size * that we are willing to get in small blocks. * * We assume that kalloc_max is not greater than 64K; - * thus 16 is a safe array size for k_zone and k_zone_name. * * Note that kalloc_max is somewhat confusingly named. * It represents the first power of two for which no zone exists. * kalloc_max_prerounded is the smallest allocation size, before * rounding, for which no zone exists. - * Also if the allocation size is more than kalloc_kernmap_size - * then allocate from kernel map rather than kalloc_map. + * + * Also if the allocation size is more than kalloc_kernmap_size + * then allocate from kernel map rather than kalloc_map. + */ + +#if KALLOC_MINSIZE == 16 && KALLOC_LOG2_MINALIGN == 4 + +/* + * "Legacy" aka "power-of-2" backing zones with 16-byte minimum + * size and alignment. Users of this profile would probably + * benefit from some tuning. + */ + +#define K_ZONE_SIZES \ + 16, \ + 32, \ +/* 6 */ 64, \ + 128, \ + 256, \ +/* 9 */ 512, \ + 1024, \ + 2048, \ +/* C */ 4096 + + +#define K_ZONE_NAMES \ + "kalloc.16", \ + "kalloc.32", \ +/* 6 */ "kalloc.64", \ + "kalloc.128", \ + "kalloc.256", \ +/* 9 */ "kalloc.512", \ + "kalloc.1024", \ + "kalloc.2048", \ +/* C */ "kalloc.4096" + +#define K_ZONE_MAXIMA \ + 1024, \ + 4096, \ +/* 6 */ 4096, \ + 4096, \ + 4096, \ +/* 9 */ 1024, \ + 1024, \ + 1024, \ +/* C */ 1024 + +#elif KALLOC_MINSIZE == 8 && KALLOC_LOG2_MINALIGN == 3 + +/* + * Tweaked for ARM (and x64) in 04/2011 */ -int first_k_zone = -1; -struct zone *k_zone[16]; -static const char *k_zone_name[16] = { - "kalloc.1", "kalloc.2", - "kalloc.4", "kalloc.8", - "kalloc.16", "kalloc.32", - "kalloc.64", "kalloc.128", - "kalloc.256", "kalloc.512", - "kalloc.1024", "kalloc.2048", - "kalloc.4096", "kalloc.8192", - "kalloc.16384", "kalloc.32768" +#define K_ZONE_SIZES \ +/* 3 */ 8, \ + 16, 24, \ + 32, 40, 48, \ +/* 6 */ 64, 88, 112, \ + 128, 192, \ + 256, 384, \ +/* 9 */ 512, 768, \ + 1024, 1536, \ + 2048, 3072, \ + 4096, 6144 + +#define K_ZONE_NAMES \ +/* 3 */ "kalloc.8", \ + "kalloc.16", "kalloc.24", \ + "kalloc.32", "kalloc.40", "kalloc.48", \ +/* 6 */ "kalloc.64", "kalloc.88", "kalloc.112", \ + "kalloc.128", "kalloc.192", \ + "kalloc.256", "kalloc.384", \ +/* 9 */ "kalloc.512", "kalloc.768", \ + "kalloc.1024", "kalloc.1536", \ + "kalloc.2048", "kalloc.3072", \ + "kalloc.4096", "kalloc.6144" + +#define K_ZONE_MAXIMA \ +/* 3 */ 1024, \ + 1024, 1024, \ + 4096, 4096, 4096, \ +/* 6 */ 4096, 4096, 4096, \ + 4096, 4096, \ + 4096, 4096, \ +/* 9 */ 1024, 1024, \ + 1024, 1024, \ + 1024, 1024, \ +/* C */ 1024, 64 + +#else +#error missing zone size parameters for kalloc +#endif + +#define KALLOC_MINALIGN (1 << KALLOC_LOG2_MINALIGN) + +static const int k_zone_size[] = { + K_ZONE_SIZES, + 8192, + 16384, +/* F */ 32768 +}; + +#define N_K_ZONE (sizeof (k_zone_size) / sizeof (k_zone_size[0])) + +/* + * Many kalloc() allocations are for small structures containing a few + * pointers and longs - the k_zone_dlut[] direct lookup table, indexed by + * size normalized to the minimum alignment, finds the right zone index + * for them in one dereference. + */ + +#define INDEX_ZDLUT(size) \ + (((size) + KALLOC_MINALIGN - 1) / KALLOC_MINALIGN) +#define N_K_ZDLUT (2048 / KALLOC_MINALIGN) + /* covers sizes [0 .. 
2048 - KALLOC_MINALIGN] */ +#define MAX_SIZE_ZDLUT ((N_K_ZDLUT - 1) * KALLOC_MINALIGN) + +static int8_t k_zone_dlut[N_K_ZDLUT]; /* table of indices into k_zone[] */ + +/* + * If there's no hit in the DLUT, then start searching from k_zindex_start. + */ +static int k_zindex_start; + +static zone_t k_zone[N_K_ZONE]; + +static const char *k_zone_name[N_K_ZONE] = { + K_ZONE_NAMES, + "kalloc.8192", + "kalloc.16384", +/* F */ "kalloc.32768" }; /* @@ -169,25 +287,15 @@ static const char *k_zone_name[16] = { * based on need, rather than just guessing; it also * means its patchable in case you're wrong! */ -unsigned long k_zone_max[16] = { - 1024, /* 1 Byte */ - 1024, /* 2 Byte */ - 1024, /* 4 Byte */ - 1024, /* 8 Byte */ - 1024, /* 16 Byte */ - 4096, /* 32 Byte */ - 4096, /* 64 Byte */ - 4096, /* 128 Byte */ - 4096, /* 256 Byte */ - 1024, /* 512 Byte */ - 1024, /* 1024 Byte */ - 1024, /* 2048 Byte */ - 1024, /* 4096 Byte */ - 4096, /* 8192 Byte */ - 64, /* 16384 Byte */ - 64, /* 32768 Byte */ +unsigned int k_zone_max[N_K_ZONE] = { + K_ZONE_MAXIMA, + 4096, + 64, +/* F */ 64 }; +/* #define KALLOC_DEBUG 1 */ + /* forward declarations */ void * kalloc_canblock( vm_size_t size, @@ -277,25 +385,101 @@ kalloc_init( * for the allocation, as we aren't sure how the memory * will be handled. */ - for (i = 0, size = 1; size < kalloc_max; i++, size <<= 1) { - if (size < KALLOC_MINSIZE) { - k_zone[i] = NULL; - continue; - } - if (size == KALLOC_MINSIZE) { - first_k_zone = i; - } + for (i = 0; (size = k_zone_size[i]) < kalloc_max; i++) { k_zone[i] = zinit(size, k_zone_max[i] * size, size, k_zone_name[i]); zone_change(k_zone[i], Z_CALLERACCT, FALSE); } + + /* + * Build the Direct LookUp Table for small allocations + */ + for (i = 0, size = 0; i <= N_K_ZDLUT; i++, size += KALLOC_MINALIGN) { + int zindex = 0; + + while ((vm_size_t)k_zone_size[zindex] < size) + zindex++; + + if (i == N_K_ZDLUT) { + k_zindex_start = zindex; + break; + } + k_zone_dlut[i] = (int8_t)zindex; + } + +#ifdef KALLOC_DEBUG + printf("kalloc_init: k_zindex_start %d\n", k_zindex_start); + + /* + * Do a quick synthesis to see how well/badly we can + * find-a-zone for a given size. + * Useful when debugging/tweaking the array of zone sizes. + * Cache misses probably more critical than compare-branches! + */ + for (i = 0; i < (int)N_K_ZONE; i++) { + vm_size_t testsize = (vm_size_t)k_zone_size[i] - 1; + int compare = 0; + int zindex; + + if (testsize < MAX_SIZE_ZDLUT) { + compare += 1; /* 'if' (T) */ + + long dindex = INDEX_ZDLUT(testsize); + zindex = (int)k_zone_dlut[dindex]; + + } else if (testsize < kalloc_max_prerounded) { + + compare += 2; /* 'if' (F), 'if' (T) */ + + zindex = k_zindex_start; + while ((vm_size_t)k_zone_size[zindex] < testsize) { + zindex++; + compare++; /* 'while' (T) */ + } + compare++; /* 'while' (F) */ + } else + break; /* not zone-backed */ + + zone_t z = k_zone[zindex]; + printf("kalloc_init: req size %4lu: %11s took %d compare%s\n", + (unsigned long)testsize, z->zone_name, compare, + compare == 1 ? "" : "s"); + } +#endif kalloc_lck_grp = lck_grp_alloc_init("kalloc.large", LCK_GRP_ATTR_NULL); lck_mtx_init(&kalloc_lock, kalloc_lck_grp, LCK_ATTR_NULL); OSMalloc_init(); #ifdef MUTEX_ZONE lck_mtx_zone = zinit(sizeof(struct _lck_mtx_), 1024*256, 4096, "lck_mtx"); #endif +} +/* + * Given an allocation size, return the kalloc zone it belongs to. + * Direct LookUp Table variant. 
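A worked example makes the two lookup paths concrete. On the 8-byte-minimum profile above, INDEX_ZDLUT(33) is (33 + 7) / 8 = 5, and slot 5 of k_zone_dlut was built to hold the index of the 40-byte zone, so a 33-byte request resolves to kalloc.40 with a single dereference; larger requests fall back to the short linear scan that get_zone_search(), defined just below, starts at k_zindex_start:

/* Sketch: the dispatch performed by kalloc_canblock()/kfree() below. */
vm_size_t size = 33;
zone_t z;

if (size < MAX_SIZE_ZDLUT)
	z = get_zone_dlut(size);			/* one table lookup: kalloc.40 */
else if (size < kalloc_max_prerounded)
	z = get_zone_search(size, k_zindex_start);	/* short linear scan */
else
	z = ZONE_NULL;					/* too big: kmem_alloc territory */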
+ */ +static __inline zone_t +get_zone_dlut(vm_size_t size) +{ + long dindex = INDEX_ZDLUT(size); + int zindex = (int)k_zone_dlut[dindex]; + return (k_zone[zindex]); +} + +/* As above, but linear search k_zone_size[] for the next zone that fits. */ + +static __inline zone_t +get_zone_search(vm_size_t size, int zindex) +{ + assert(size < kalloc_max_prerounded); + + while ((vm_size_t)k_zone_size[zindex] < size) + zindex++; + + assert((unsigned)zindex < N_K_ZONE && + (vm_size_t)k_zone_size[zindex] < kalloc_max); + + return (k_zone[zindex]); } void * @@ -303,17 +487,19 @@ kalloc_canblock( vm_size_t size, boolean_t canblock) { - register int zindex; - register vm_size_t allocsize; - vm_map_t alloc_map = VM_MAP_NULL; - - /* - * If size is too large for a zone, then use kmem_alloc. - * (We use kmem_alloc instead of kmem_alloc_kobject so that - * krealloc can use kmem_realloc.) - */ - - if (size >= kalloc_max_prerounded) { + zone_t z; + + if (size < MAX_SIZE_ZDLUT) + z = get_zone_dlut(size); + else if (size < kalloc_max_prerounded) + z = get_zone_search(size, k_zindex_start); + else { + /* + * If size is too large for a zone, then use kmem_alloc. + * (We use kmem_alloc instead of kmem_alloc_kobject so that + * krealloc can use kmem_realloc.) + */ + vm_map_t alloc_map; void *addr; /* kmem_alloc could block so we return if noblock */ @@ -357,19 +543,13 @@ kalloc_canblock( } return(addr); } - - /* compute the size of the block that we will actually allocate */ - - allocsize = KALLOC_MINSIZE; - zindex = first_k_zone; - while (allocsize < size) { - allocsize <<= 1; - zindex++; - } - - /* allocate from the appropriate zone */ - assert(allocsize < kalloc_max); - return(zalloc_canblock(k_zone[zindex], canblock)); +#ifdef KALLOC_DEBUG + if (size > z->elem_size) + panic("%s: z %p (%s) but requested size %lu", __func__, + z, z->zone_name, (unsigned long)size); +#endif + assert(size <= z->elem_size); + return (zalloc_canblock(z, canblock)); } void * @@ -386,164 +566,6 @@ kalloc_noblock( return( kalloc_canblock(size, FALSE) ); } - -void -krealloc( - void **addrp, - vm_size_t old_size, - vm_size_t new_size, - simple_lock_t lock) -{ - register int zindex; - register vm_size_t allocsize; - void *naddr; - vm_map_t alloc_map = VM_MAP_NULL; - - /* can only be used for increasing allocation size */ - - assert(new_size > old_size); - - /* if old_size is zero, then we are simply allocating */ - - if (old_size == 0) { - simple_unlock(lock); - naddr = kalloc(new_size); - simple_lock(lock); - *addrp = naddr; - return; - } - - /* if old block was kmem_alloc'd, then use kmem_realloc if necessary */ - - if (old_size >= kalloc_max_prerounded) { - if (old_size >= kalloc_kernmap_size) - alloc_map = kernel_map; - else - alloc_map = kalloc_map; - - old_size = round_page(old_size); - new_size = round_page(new_size); - if (new_size > old_size) { - - if (KERN_SUCCESS != kmem_realloc(alloc_map, - (vm_offset_t)*addrp, old_size, - (vm_offset_t *)&naddr, new_size)) - panic("krealloc: kmem_realloc"); - - simple_lock(lock); - *addrp = (void *) naddr; - - /* kmem_realloc() doesn't free old page range. 
*/ - kmem_free(alloc_map, (vm_offset_t)*addrp, old_size); - - kalloc_large_total += (new_size - old_size); - kalloc_large_sum += (new_size - old_size); - - if (kalloc_large_total > kalloc_large_max) - kalloc_large_max = kalloc_large_total; - - } - return; - } - - /* compute the size of the block that we actually allocated */ - - allocsize = KALLOC_MINSIZE; - zindex = first_k_zone; - while (allocsize < old_size) { - allocsize <<= 1; - zindex++; - } - - /* if new size fits in old block, then return */ - - if (new_size <= allocsize) { - return; - } - - /* if new size does not fit in zone, kmem_alloc it, else zalloc it */ - - simple_unlock(lock); - if (new_size >= kalloc_max_prerounded) { - if (new_size >= kalloc_kernmap_size) - alloc_map = kernel_map; - else - alloc_map = kalloc_map; - if (KERN_SUCCESS != kmem_alloc(alloc_map, - (vm_offset_t *)&naddr, new_size)) { - panic("krealloc: kmem_alloc"); - simple_lock(lock); - *addrp = NULL; - return; - } - kalloc_spin_lock(); - - kalloc_large_inuse++; - kalloc_large_sum += new_size; - kalloc_large_total += new_size; - - if (kalloc_large_total > kalloc_large_max) - kalloc_large_max = kalloc_large_total; - - kalloc_unlock(); - - KALLOC_ZINFO_SALLOC(new_size); - } else { - register int new_zindex; - - allocsize <<= 1; - new_zindex = zindex + 1; - while (allocsize < new_size) { - allocsize <<= 1; - new_zindex++; - } - naddr = zalloc(k_zone[new_zindex]); - } - simple_lock(lock); - - /* copy existing data */ - - bcopy((const char *)*addrp, (char *)naddr, old_size); - - /* free old block, and return */ - - zfree(k_zone[zindex], *addrp); - - /* set up new address */ - - *addrp = (void *) naddr; -} - - -void * -kget( - vm_size_t size) -{ - register int zindex; - register vm_size_t allocsize; - - /* size must not be too large for a zone */ - - if (size >= kalloc_max_prerounded) { - /* This will never work, so we might as well panic */ - panic("kget"); - } - - /* compute the size of the block that we will actually allocate */ - - allocsize = KALLOC_MINSIZE; - zindex = first_k_zone; - while (allocsize < size) { - allocsize <<= 1; - zindex++; - } - - /* allocate from the appropriate zone */ - - assert(allocsize < kalloc_max); - return(zget(k_zone[zindex])); -} - volatile SInt32 kfree_nop_count = 0; void @@ -551,19 +573,23 @@ kfree( void *data, vm_size_t size) { - register int zindex; - register vm_size_t freesize; - vm_map_t alloc_map = kernel_map; + zone_t z; + + if (size < MAX_SIZE_ZDLUT) + z = get_zone_dlut(size); + else if (size < kalloc_max_prerounded) + z = get_zone_search(size, k_zindex_start); + else { + /* if size was too large for a zone, then use kmem_free */ - /* if size was too large for a zone, then use kmem_free */ + vm_map_t alloc_map = kernel_map; - if (size >= kalloc_max_prerounded) { if ((((vm_offset_t) data) >= kalloc_map_min) && (((vm_offset_t) data) <= kalloc_map_max)) alloc_map = kalloc_map; if (size > kalloc_largest_allocated) { /* * work around double FREEs of small MALLOCs - * this use to end up being a nop + * this used to end up being a nop * since the pointer being freed from an * alloc backed by the zalloc world could * never show up in the kalloc_map... however, @@ -574,7 +600,7 @@ kfree( * the underlying allocation... that pointer ends up * looking like a really big size on the 2nd FREE and * pushes the kfree into the kernel_map... we - * end up removing a ton of virutal space before we panic + * end up removing a ton of virtual space before we panic * this check causes us to ignore the kfree for a size * that must be 'bogus'... 
note that it might not be due * to the above scenario, but it would still be wrong and @@ -597,19 +623,14 @@ kfree( return; } - /* compute the size of the block that we actually allocated from */ - - freesize = KALLOC_MINSIZE; - zindex = first_k_zone; - while (freesize < size) { - freesize <<= 1; - zindex++; - } - /* free to the appropriate zone */ - - assert(freesize < kalloc_max); - zfree(k_zone[zindex], data); +#ifdef KALLOC_DEBUG + if (size > z->elem_size) + panic("%s: z %p (%s) but requested size %lu", __func__, + z, z->zone_name, (unsigned long)size); +#endif + assert(size <= z->elem_size); + zfree(z, data); } #ifdef MACH_BSD @@ -617,21 +638,10 @@ zone_t kalloc_zone( vm_size_t size) { - register int zindex = 0; - register vm_size_t allocsize; - - /* compute the size of the block that we will actually allocate */ - - allocsize = size; - if (size <= kalloc_max) { - allocsize = KALLOC_MINSIZE; - zindex = first_k_zone; - while (allocsize < size) { - allocsize <<= 1; - zindex++; - } - return (k_zone[zindex]); - } + if (size < MAX_SIZE_ZDLUT) + return (get_zone_dlut(size)); + if (size <= kalloc_max) + return (get_zone_search(size, k_zindex_start)); return (ZONE_NULL); } #endif @@ -705,7 +715,7 @@ OSMalloc_Tagref( OSMallocTag tag) { if (!((tag->OSMT_state & OSMT_VALID_MASK) == OSMT_VALID)) - panic("OSMalloc_Tagref(): bad state 0x%08X\n",tag->OSMT_state); + panic("OSMalloc_Tagref():'%s' has bad state 0x%08X\n", tag->OSMT_name, tag->OSMT_state); (void)hw_atomic_add(&tag->OSMT_refcnt, 1); } @@ -715,7 +725,7 @@ OSMalloc_Tagrele( OSMallocTag tag) { if (!((tag->OSMT_state & OSMT_VALID_MASK) == OSMT_VALID)) - panic("OSMalloc_Tagref(): bad state 0x%08X\n",tag->OSMT_state); + panic("OSMalloc_Tagref():'%s' has bad state 0x%08X\n", tag->OSMT_name, tag->OSMT_state); if (hw_atomic_sub(&tag->OSMT_refcnt, 1) == 0) { if (hw_compare_and_store(OSMT_VALID|OSMT_RELEASED, OSMT_VALID|OSMT_RELEASED, &tag->OSMT_state)) { @@ -724,7 +734,7 @@ OSMalloc_Tagrele( OSMalloc_tag_unlock(); kfree((void*)tag, sizeof(*tag)); } else - panic("OSMalloc_Tagrele(): refcnt 0\n"); + panic("OSMalloc_Tagrele():'%s' has refcnt 0\n", tag->OSMT_name); } } @@ -733,7 +743,7 @@ OSMalloc_Tagfree( OSMallocTag tag) { if (!hw_compare_and_store(OSMT_VALID, OSMT_VALID|OSMT_RELEASED, &tag->OSMT_state)) - panic("OSMalloc_Tagfree(): bad state 0x%08X\n", tag->OSMT_state); + panic("OSMalloc_Tagfree():'%s' has bad state 0x%08X \n", tag->OSMT_name, tag->OSMT_state); if (hw_atomic_sub(&tag->OSMT_refcnt, 1) == 0) { OSMalloc_tag_spin_lock(); @@ -813,7 +823,7 @@ OSFree( && (size & ~PAGE_MASK)) { kmem_free(kernel_map, (vm_offset_t)addr, size); } else - kfree((void*)addr, size); + kfree((void *)addr, size); OSMalloc_Tagrele(tag); } diff --git a/osfmk/kern/kalloc.h b/osfmk/kern/kalloc.h index 9fcb07edc..77b8cd3be 100644 --- a/osfmk/kern/kalloc.h +++ b/osfmk/kern/kalloc.h @@ -68,8 +68,6 @@ extern void *kalloc(vm_size_t size); extern void *kalloc_noblock(vm_size_t size); -extern void *kget(vm_size_t size); - extern void kfree(void *data, vm_size_t size); @@ -79,15 +77,8 @@ __END_DECLS #include -#define KALLOC_MINSIZE 16 - extern void kalloc_init(void) __attribute__((section("__TEXT, initcode"))); -extern void krealloc(void **addrp, - vm_size_t old_size, - vm_size_t new_size, - simple_lock_t lock); - extern void kalloc_fake_zone_init( int ); extern void kalloc_fake_zone_info( diff --git a/osfmk/kern/kext_alloc.c b/osfmk/kern/kext_alloc.c index 1d3aea127..c44446335 100644 --- a/osfmk/kern/kext_alloc.c +++ b/osfmk/kern/kext_alloc.c @@ -39,10 +39,20 @@ #include 
#include +#define KASLR_IOREG_DEBUG 0 + vm_map_t g_kext_map = 0; +#if KASLR_IOREG_DEBUG +mach_vm_offset_t kext_alloc_base = 0; +mach_vm_offset_t kext_alloc_max = 0; +#else static mach_vm_offset_t kext_alloc_base = 0; static mach_vm_offset_t kext_alloc_max = 0; +#if CONFIG_KEXT_BASEMENT +static mach_vm_offset_t kext_post_boot_base = 0; +#endif +#endif /* * On x86_64 systems, kernel extension text must remain within 2GB of the @@ -52,9 +62,10 @@ static mach_vm_offset_t kext_alloc_max = 0; void kext_alloc_init(void) { -#if __x86_64__ +#if CONFIG_KEXT_BASEMENT kern_return_t rval = 0; kernel_segment_command_t *text = NULL; + kernel_segment_command_t *prelinkTextSegment = NULL; mach_vm_offset_t text_end, text_start; mach_vm_size_t text_size; mach_vm_size_t kext_alloc_size; @@ -72,9 +83,21 @@ kext_alloc_init(void) kext_alloc_base = KEXT_ALLOC_BASE(text_end); kext_alloc_size = KEXT_ALLOC_SIZE(text_size); kext_alloc_max = kext_alloc_base + kext_alloc_size; + + /* Post boot kext allocation will start after the prelinked kexts */ + prelinkTextSegment = getsegbyname("__PRELINK_TEXT"); + if (prelinkTextSegment) { + /* use kext_post_boot_base to start allocations past all the prelinked + * kexts + */ + kext_post_boot_base = + vm_map_round_page(kext_alloc_base + prelinkTextSegment->vmsize); + } + else { + kext_post_boot_base = kext_alloc_base; + } - /* Allocate the subblock of the kernel map */ - + /* Allocate the sub block of the kernel map */ rval = kmem_suballoc(kernel_map, (vm_offset_t *) &kext_alloc_base, kext_alloc_size, /* pageable */ TRUE, VM_FLAGS_FIXED|VM_FLAGS_OVERWRITE, @@ -91,33 +114,65 @@ kext_alloc_init(void) kernel_map->min_offset = kext_alloc_base; } - printf("kext submap [0x%llx - 0x%llx], kernel text [0x%llx - 0x%llx]\n", - kext_alloc_base, kext_alloc_max, text->vmaddr, - text->vmaddr + text->vmsize); + printf("kext submap [0x%lx - 0x%lx], kernel text [0x%lx - 0x%lx]\n", + VM_KERNEL_UNSLIDE(kext_alloc_base), + VM_KERNEL_UNSLIDE(kext_alloc_max), + VM_KERNEL_UNSLIDE(text->vmaddr), + VM_KERNEL_UNSLIDE(text->vmaddr + text->vmsize)); + #else g_kext_map = kernel_map; kext_alloc_base = VM_MIN_KERNEL_ADDRESS; kext_alloc_max = VM_MAX_KERNEL_ADDRESS; -#endif /* __x86_64__ */ +#endif /* CONFIG_KEXT_BASEMENT */ } kern_return_t kext_alloc(vm_offset_t *_addr, vm_size_t size, boolean_t fixed) { kern_return_t rval = 0; +#if CONFIG_KEXT_BASEMENT + mach_vm_offset_t addr = (fixed) ? *_addr : kext_post_boot_base; +#else mach_vm_offset_t addr = (fixed) ? *_addr : kext_alloc_base; +#endif int flags = (fixed) ? VM_FLAGS_FIXED : VM_FLAGS_ANYWHERE; - /* Allocate the kext virtual memory */ +#if CONFIG_KEXT_BASEMENT + /* Allocate the kext virtual memory + * 10608884 - use mach_vm_map since we want VM_FLAGS_ANYWHERE allocated past + * kext_post_boot_base (when possible). mach_vm_allocate will always + * start at 0 into the map no matter what you pass in addr. We want non + * fixed (post boot) kext allocations to start looking for free space + * just past where prelinked kexts have loaded. 
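
The behavior that comment relies on can be observed from user space as well, since both calls exist there: with VM_FLAGS_ANYWHERE, mach_vm_map() begins its search at (or above) the address handed in, while mach_vm_allocate() always searches from the bottom of the map. A minimal sketch (the hint value is arbitrary, standing in for kext_post_boot_base):

    #include <mach/mach.h>
    #include <mach/mach_vm.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* arbitrary search hint, playing the role of kext_post_boot_base */
        mach_vm_address_t addr = 0x200000000ULL;
        kern_return_t kr;

        kr = mach_vm_map(mach_task_self(), &addr, 0x4000, 0,
            VM_FLAGS_ANYWHERE, MACH_PORT_NULL, 0, TRUE,
            VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
        if (kr == KERN_SUCCESS)
            printf("search started at the hint; got 0x%llx\n",
                (unsigned long long)addr);
        return 0;
    }
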
+ */ + rval = mach_vm_map(g_kext_map, + &addr, + size, + 0, + flags, + MACH_PORT_NULL, + 0, + TRUE, + VM_PROT_DEFAULT, + VM_PROT_ALL, + VM_INHERIT_DEFAULT); + if (rval != KERN_SUCCESS) { + printf("mach_vm_map failed - %d\n", rval); + goto finish; + } +#else rval = mach_vm_allocate(g_kext_map, &addr, size, flags); if (rval != KERN_SUCCESS) { printf("vm_allocate failed - %d\n", rval); goto finish; } +#endif /* Check that the memory is reachable by kernel text */ if ((addr + size) > kext_alloc_max) { kext_free((vm_offset_t)addr, size); + rval = KERN_INVALID_ADDRESS; goto finish; } diff --git a/osfmk/kern/ledger.c b/osfmk/kern/ledger.c index c97771d04..cf1a7aa02 100644 --- a/osfmk/kern/ledger.c +++ b/osfmk/kern/ledger.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -28,394 +28,1091 @@ /* * @OSF_COPYRIGHT@ */ -/* - * 8/13/93 - * - * This is a half-hearted attempt at providing the parts of the - * ledger facility to satisfy the ledger interfaces. - * - * This implementation basically leaves the (dysfunctional) ledgers - * unfunctional and are mearly here to satisfy the Mach spec interface - * reqirements. - */ - -#include -#include -#include -#include -#include -#include #include -#include -#include #include #include +#include -#include -#include +#include +#include +#include +#include -ledger_t root_wired_ledger; -ledger_t root_paged_ledger; +#include +#include +/* + * Ledger entry flags. Bits in second nibble (masked by 0xF0) are used for + * ledger actions (LEDGER_ACTION_BLOCK, etc). + */ +#define ENTRY_ACTIVE 0x0001 /* entry is active if set */ +#define WAKE_NEEDED 0x0100 /* one or more threads are asleep */ +#define WAKE_INPROGRESS 0x0200 /* the wait queue is being processed */ +#define REFILL_SCHEDULED 0x0400 /* a refill timer has been set */ +#define REFILL_INPROGRESS 0x0800 /* the ledger is being refilled */ +#define CALLED_BACK 0x1000 /* callback has already been called */ -/* Utility routine to handle entries to a ledger */ -kern_return_t -ledger_enter( - ledger_t ledger, - ledger_item_t amount) +/* Determine whether a ledger entry exists and has been initialized and active */ +#define ENTRY_VALID(l, e) \ + (((l) != NULL) && ((e) >= 0) && ((e) < (l)->l_size) && \ + (((l)->l_entries[e].le_flags & ENTRY_ACTIVE) == ENTRY_ACTIVE)) + +#ifdef LEDGER_DEBUG +int ledger_debug = 0; + +#define ASSERT(a) assert(a) +#define lprintf(a) if (ledger_debug) { \ + printf("%lld ", abstime_to_nsecs(mach_absolute_time() / 1000000)); \ + printf a ; \ +} +#else +#define lprintf(a) +#define ASSERT(a) +#endif + +struct ledger_callback { + ledger_callback_t lc_func; + const void *lc_param0; + const void *lc_param1; +}; + +struct entry_template { + char et_key[LEDGER_NAME_MAX]; + char et_group[LEDGER_NAME_MAX]; + char et_units[LEDGER_NAME_MAX]; + uint32_t et_flags; + struct ledger_callback *et_callback; +}; + +lck_grp_t ledger_lck_grp; + +/* + * Modifying the reference count, table size, or table contents requires + * holding the lt_lock. Modfying the table address requires both lt_lock + * and setting the inuse bit. This means that the lt_entries field can be + * safely dereferenced if you hold either the lock or the inuse bit. The + * inuse bit exists solely to allow us to swap in a new, larger entries + * table without requiring a full lock to be acquired on each lookup. 
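
A C11 model of that protocol; it captures the intent (readers win the bit for the two or three references it takes to fetch an entry, the grower additionally holds lt_lock while swapping the pointer) rather than the exact OSCompareAndSwap/splsched() idiom used below:

    #include <stdatomic.h>

    struct entry { int val; };

    static _Atomic int    lt_inuse;     /* the "inuse" bit */
    static struct entry  *lt_entries;   /* swapped only under lock + bit */

    static int
    table_read(int idx)
    {
        int expected = 0;
        while (!atomic_compare_exchange_weak(&lt_inuse, &expected, 1))
            expected = 0;               /* lost the race: reset and retry */

        int v = lt_entries[idx].val;    /* pointer is stable while bit is held */

        atomic_store(&lt_inuse, 0);
        return v;
    }

    static void
    table_swap(struct entry *bigger)    /* grower also holds lt_lock */
    {
        int expected = 0;
        while (!atomic_compare_exchange_weak(&lt_inuse, &expected, 1))
            expected = 0;

        lt_entries = bigger;            /* old table freed after release */
        atomic_store(&lt_inuse, 0);
    }
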
+ * Accordingly, the inuse bit should never be held for longer than it takes + * to extract a value from the table - i.e., 2 or 3 memory references. + */ +struct ledger_template { + const char *lt_name; + int lt_refs; + int lt_cnt; + int lt_table_size; + volatile uint32_t lt_inuse; + lck_mtx_t lt_lock; + struct entry_template *lt_entries; +}; + +#define template_lock(template) lck_mtx_lock(&(template)->lt_lock) +#define template_unlock(template) lck_mtx_unlock(&(template)->lt_lock) + +#define TEMPLATE_INUSE(s, t) { \ + s = splsched(); \ + while (OSCompareAndSwap(0, 1, &((t)->lt_inuse))) \ + ; \ +} + +#define TEMPLATE_IDLE(s, t) { \ + (t)->lt_inuse = 0; \ + splx(s); \ +} + +/* + * The explicit alignment is to ensure that atomic operations don't panic + * on ARM. + */ +struct ledger_entry { + volatile uint32_t le_flags; + ledger_amount_t le_limit; + volatile ledger_amount_t le_credit __attribute__((aligned(8))); + volatile ledger_amount_t le_debit __attribute__((aligned(8))); + /* + * XXX - the following two fields can go away if we move all of + * the refill logic into process policy + */ + uint64_t le_refill_period; + uint64_t le_last_refill; +} __attribute__((aligned(8))); + +struct ledger { + int l_id; + struct ledger_template *l_template; + int l_refs; + int l_size; + struct ledger_entry *l_entries; +}; + +static int ledger_cnt = 0; +/* ledger ast helper functions */ +static uint32_t ledger_check_needblock(ledger_t l, uint64_t now); +static kern_return_t ledger_perform_blocking(ledger_t l); +static uint32_t flag_set(volatile uint32_t *flags, uint32_t bit); +static uint32_t flag_clear(volatile uint32_t *flags, uint32_t bit); + +#if 0 +static void +debug_callback(const void *p0, __unused const void *p1) { - if (ledger == LEDGER_NULL) - return KERN_SUCCESS; + printf("ledger: resource exhausted [%s] for task %p\n", + (const char *)p0, p1); +} +#endif - /* Need to lock the ledger */ - ledger_lock(ledger); - - if (amount > 0) { - if (ledger->ledger_limit != LEDGER_ITEM_INFINITY && - ledger->ledger_balance + amount > ledger->ledger_limit) { - /* XXX this is where you do BAD things */ - printf("Ledger limit exceeded ! 
ledger=%p lim=%d balance=%d\n", - ledger, ledger->ledger_limit, - ledger->ledger_balance); - ledger_unlock(ledger); - return(KERN_RESOURCE_SHORTAGE); +/************************************/ + +static uint64_t +abstime_to_nsecs(uint64_t abstime) +{ + uint64_t nsecs; + + absolutetime_to_nanoseconds(abstime, &nsecs); + return (nsecs); +} + +static uint64_t +nsecs_to_abstime(uint64_t nsecs) +{ + uint64_t abstime; + + nanoseconds_to_absolutetime(nsecs, &abstime); + return (abstime); +} + +void +ledger_init(void) +{ + lck_grp_init(&ledger_lck_grp, "ledger", LCK_GRP_ATTR_NULL); +} + +ledger_template_t +ledger_template_create(const char *name) +{ + ledger_template_t template; + + template = (ledger_template_t)kalloc(sizeof (*template)); + if (template == NULL) + return (NULL); + + template->lt_name = name; + template->lt_refs = 1; + template->lt_cnt = 0; + template->lt_table_size = 1; + template->lt_inuse = 0; + lck_mtx_init(&template->lt_lock, &ledger_lck_grp, LCK_ATTR_NULL); + + template->lt_entries = (struct entry_template *) + kalloc(sizeof (struct entry_template) * template->lt_table_size); + if (template->lt_entries == NULL) { + kfree(template, sizeof (*template)); + template = NULL; + } + + return (template); +} + +void +ledger_template_dereference(ledger_template_t template) +{ + template_lock(template); + template->lt_refs--; + template_unlock(template); + + if (template->lt_refs == 0) + kfree(template, sizeof (*template)); +} + +/* + * Add a new entry to the list of entries in a ledger template. There is + * currently no mechanism to remove an entry. Implementing such a mechanism + * would require us to maintain per-entry reference counts, which we would + * prefer to avoid if possible. + */ +int +ledger_entry_add(ledger_template_t template, const char *key, + const char *group, const char *units) +{ + int idx; + struct entry_template *et; + + if ((key == NULL) || (strlen(key) >= LEDGER_NAME_MAX)) + return (-1); + + template_lock(template); + + /* If the table is full, attempt to double its size */ + if (template->lt_cnt == template->lt_table_size) { + struct entry_template *new_entries, *old_entries; + int old_cnt, old_sz; + spl_t s; + + old_cnt = template->lt_table_size; + old_sz = (int)(old_cnt * sizeof (struct entry_template)); + new_entries = kalloc(old_sz * 2); + if (new_entries == NULL) { + template_unlock(template); + return (-1); } - if ((ledger->ledger_balance + amount) - < LEDGER_ITEM_INFINITY) - ledger->ledger_balance += amount; - else - ledger->ledger_balance = LEDGER_ITEM_INFINITY; + memcpy(new_entries, template->lt_entries, old_sz); + memset(((char *)new_entries) + old_sz, 0, old_sz); + template->lt_table_size = old_cnt * 2; + + old_entries = template->lt_entries; + + TEMPLATE_INUSE(s, template); + template->lt_entries = new_entries; + TEMPLATE_IDLE(s, template); + + kfree(old_entries, old_sz); } - else if (amount) { - if (ledger->ledger_balance + amount > 0) - ledger->ledger_balance += amount; - else - ledger->ledger_balance = 0; + + et = &template->lt_entries[template->lt_cnt]; + strlcpy(et->et_key, key, LEDGER_NAME_MAX); + strlcpy(et->et_group, group, LEDGER_NAME_MAX); + strlcpy(et->et_units, units, LEDGER_NAME_MAX); + et->et_flags = ENTRY_ACTIVE; + et->et_callback = NULL; + + idx = template->lt_cnt++; + template_unlock(template); + + return (idx); +} + + +kern_return_t +ledger_entry_setactive(ledger_t ledger, int entry) +{ + struct ledger_entry *le; + + if ((ledger == NULL) || (entry < 0) || (entry >= ledger->l_size)) + return (KERN_INVALID_ARGUMENT); + + le = 
&ledger->l_entries[entry]; + if ((le->le_flags & ENTRY_ACTIVE) == 0) { + flag_set(&le->le_flags, ENTRY_ACTIVE); } - ledger_unlock(ledger); - return(KERN_SUCCESS); + return (KERN_SUCCESS); } -/* Utility routine to create a new ledger */ -static ledger_t -ledger_allocate( - ledger_item_t limit, - ledger_t ledger_ledger, - ledger_t ledger_parent) + +int +ledger_key_lookup(ledger_template_t template, const char *key) { - ledger_t ledger; + int idx; + + template_lock(template); + for (idx = 0; idx < template->lt_cnt; idx++) + if (template->lt_entries[idx].et_key && + (strcmp(key, template->lt_entries[idx].et_key) == 0)) + break; - ledger = (ledger_t)kalloc(sizeof(ledger_data_t)); - if (ledger == LEDGER_NULL) - return(LEDGER_NULL); + if (idx >= template->lt_cnt) + idx = -1; + template_unlock(template); + + return (idx); +} - ledger->ledger_self = ipc_port_alloc_kernel(); - if (ledger->ledger_self == IP_NULL) { - kfree(ledger, sizeof(ledger_data_t)); - return(LEDGER_NULL); +/* + * Create a new ledger based on the specified template. As part of the + * ledger creation we need to allocate space for a table of ledger entries. + * The size of the table is based on the size of the template at the time + * the ledger is created. If additional entries are added to the template + * after the ledger is created, they will not be tracked in this ledger. + */ +ledger_t +ledger_instantiate(ledger_template_t template, int entry_type) +{ + ledger_t ledger; + size_t sz; + int i; + + ledger = (ledger_t)kalloc(sizeof (struct ledger)); + if (ledger == NULL) + return (LEDGER_NULL); + + ledger->l_template = template; + ledger->l_id = ledger_cnt++; + ledger->l_refs = 1; + + template_lock(template); + template->lt_refs++; + ledger->l_size = template->lt_cnt; + template_unlock(template); + + sz = ledger->l_size * sizeof (struct ledger_entry); + ledger->l_entries = kalloc(sz); + if (sz && (ledger->l_entries == NULL)) { + ledger_template_dereference(template); + kfree(ledger, sizeof(struct ledger)); + return (LEDGER_NULL); } - ledger_lock_init(ledger); - ledger->ledger_limit = limit; - ledger->ledger_balance = 0; - ledger->ledger_service_port = MACH_PORT_NULL; - ledger->ledger_ledger = ledger_ledger; - ledger->ledger_parent = ledger_parent; - ipc_kobject_set(ledger->ledger_self, (ipc_kobject_t)ledger, - IKOT_LEDGER); + template_lock(template); + assert(ledger->l_size <= template->lt_cnt); + for (i = 0; i < ledger->l_size; i++) { + struct ledger_entry *le = &ledger->l_entries[i]; + struct entry_template *et = &template->lt_entries[i]; - return(ledger); + le->le_flags = et->et_flags; + /* make entry inactive by removing active bit */ + if (entry_type == LEDGER_CREATE_INACTIVE_ENTRIES) + flag_clear(&le->le_flags, ENTRY_ACTIVE); + /* + * If template has a callback, this entry is opted-in, + * by default. 
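
The flag_set()/flag_clear() helpers defined just below return the prior value of the flags word; several callers in this file (ledger_refill(), ledger_check_needblock()) depend on that to test-and-set in a single atomic operation. In portable C the pair looks like:

    #include <stdatomic.h>
    #include <stdint.h>

    static uint32_t
    flag_set(_Atomic uint32_t *flags, uint32_t bit)
    {
        return atomic_fetch_or(flags, bit);     /* value before the OR */
    }

    static uint32_t
    flag_clear(_Atomic uint32_t *flags, uint32_t bit)
    {
        return atomic_fetch_and(flags, ~bit);   /* value before the AND */
    }

    /* usage, mirroring ledger_refill(): first caller wins, others back off */
    #define REFILL_INPROGRESS 0x0800
    static int
    try_begin_refill(_Atomic uint32_t *le_flags)
    {
        return !(flag_set(le_flags, REFILL_INPROGRESS) & REFILL_INPROGRESS);
    }
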
+ */ + if (et->et_callback != NULL) + flag_set(&le->le_flags, LEDGER_ACTION_CALLBACK); + le->le_credit = 0; + le->le_debit = 0; + le->le_limit = LEDGER_LIMIT_INFINITY; + le->le_refill_period = 0; + } + template_unlock(template); + + return (ledger); } -/* Utility routine to destroy a ledger */ -static void -ledger_deallocate( - ledger_t ledger) +static uint32_t +flag_set(volatile uint32_t *flags, uint32_t bit) { - /* XXX can be many send rights (copies) of this */ - ipc_port_dealloc_kernel(ledger->ledger_self); + return (OSBitOrAtomic(bit, flags)); +} - /* XXX release send right on service port */ - kfree(ledger, sizeof(*ledger)); +static uint32_t +flag_clear(volatile uint32_t *flags, uint32_t bit) +{ + return (OSBitAndAtomic(~bit, flags)); +} + +/* + * Take a reference on a ledger + */ +kern_return_t +ledger_reference(ledger_t ledger) +{ + if (!LEDGER_VALID(ledger)) + return (KERN_INVALID_ARGUMENT); + OSIncrementAtomic(&ledger->l_refs); + return (KERN_SUCCESS); } +int +ledger_reference_count(ledger_t ledger) +{ + if (!LEDGER_VALID(ledger)) + return (-1); + + return (ledger->l_refs); +} /* - * Inititalize the ledger facility + * Remove a reference on a ledger. If this is the last reference, + * deallocate the unused ledger. */ -void ledger_init(void) +kern_return_t +ledger_dereference(ledger_t ledger) { - /* - * Allocate the root ledgers; wired and paged. - */ - root_wired_ledger = ledger_allocate(LEDGER_ITEM_INFINITY, - LEDGER_NULL, LEDGER_NULL); - if (root_wired_ledger == LEDGER_NULL) - panic("can't allocate root (wired) ledger"); - ipc_port_make_send(root_wired_ledger->ledger_self); + int v; + + if (!LEDGER_VALID(ledger)) + return (KERN_INVALID_ARGUMENT); + + v = OSDecrementAtomic(&ledger->l_refs); + ASSERT(v >= 1); - root_paged_ledger = ledger_allocate(LEDGER_ITEM_INFINITY, - LEDGER_NULL, LEDGER_NULL); - if (root_paged_ledger == LEDGER_NULL) - panic("can't allocate root (paged) ledger"); - ipc_port_make_send(root_paged_ledger->ledger_self); + /* Just released the last reference. Free it. */ + if (v == 1) { + kfree(ledger->l_entries, + ledger->l_size * sizeof (struct ledger_entry)); + kfree(ledger, sizeof (*ledger)); + } + + return (KERN_SUCCESS); +} + +/* + * Determine whether an entry has exceeded its limit. + */ +static inline int +limit_exceeded(struct ledger_entry *le) +{ + ledger_amount_t balance; + + balance = le->le_credit - le->le_debit; + if ((le->le_limit <= 0) && (balance < le->le_limit)) + return (1); + + if ((le->le_limit > 0) && (balance > le->le_limit)) + return (1); + return (0); +} + +static inline struct ledger_callback * +entry_get_callback(ledger_t ledger, int entry) +{ + struct ledger_callback *callback; + spl_t s; + + TEMPLATE_INUSE(s, ledger->l_template); + callback = ledger->l_template->lt_entries[entry].et_callback; + TEMPLATE_IDLE(s, ledger->l_template); + + return (callback); +} + +/* + * If the ledger value is positive, wake up anybody waiting on it. + */ +static inline void +ledger_limit_entry_wakeup(struct ledger_entry *le) +{ + uint32_t flags; + + if (!limit_exceeded(le)) { + flags = flag_clear(&le->le_flags, CALLED_BACK); + + while (le->le_flags & WAKE_NEEDED) { + flag_clear(&le->le_flags, WAKE_NEEDED); + thread_wakeup((event_t)le); + } + } } /* - * Create a subordinate ledger + * Refill the coffers. 
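
Stripped of its flags and atomics, the refill routine below reduces to integer arithmetic: forgive up to periods * limit of the accumulated balance, never more than the balance itself, then advance the refill clock. A compact model:

    #include <stdint.h>

    struct entry {
        int64_t  credit, debit, limit;
        uint64_t period, last_refill;
    };

    static void
    refill(struct entry *e, uint64_t now)
    {
        uint64_t elapsed = now - e->last_refill;

        if (e->period == 0 || elapsed < e->period)
            return;                              /* nothing owed yet */

        uint64_t periods = elapsed / e->period;  /* whole periods elapsed */
        int64_t  balance = e->credit - e->debit;
        int64_t  due     = (int64_t)periods * e->limit;

        if (due > balance)
            due = balance;                       /* never over-refill */
        e->debit += due;

        /* fully refilled: restart the clock at now; otherwise advance it
         * by exactly the periods that were credited */
        if (due == balance)
            e->last_refill = now;
        else
            e->last_refill += periods * e->period;
    }
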
*/ -kern_return_t ledger_create( - ledger_t parent_ledger, - ledger_t ledger_ledger, - ledger_t *new_ledger, - ledger_item_t transfer) +static void +ledger_refill(uint64_t now, ledger_t ledger, int entry) { - if (parent_ledger == LEDGER_NULL) - return(KERN_INVALID_ARGUMENT); + uint64_t elapsed, period, periods; + struct ledger_entry *le; + ledger_amount_t balance, due; + int cnt; - if (ledger_ledger == LEDGER_NULL) - return(KERN_INVALID_LEDGER); + le = &ledger->l_entries[entry]; /* - * Allocate a new ledger and change the ledger_ledger for - * its space. + * If another thread is handling the refill already, we're not + * needed. Just sit here for a few cycles while the other thread + * finishes updating the balance. If it takes too long, just return + * and we'll block again. */ - ledger_lock(ledger_ledger); - if ((ledger_ledger->ledger_limit != LEDGER_ITEM_INFINITY) && - (ledger_ledger->ledger_balance + sizeof(ledger_data_t) > - ledger_ledger->ledger_limit)) { - ledger_unlock(ledger_ledger); - return(KERN_RESOURCE_SHORTAGE); + if (flag_set(&le->le_flags, REFILL_INPROGRESS) & REFILL_INPROGRESS) { + cnt = 0; + while (cnt++ < 100 && (le->le_flags & REFILL_INPROGRESS)) + ; + return; } - *new_ledger = ledger_allocate(LEDGER_ITEM_INFINITY, ledger_ledger, parent_ledger); - if (*new_ledger == LEDGER_NULL) { - ledger_unlock(ledger_ledger); - return(KERN_RESOURCE_SHORTAGE); + /* + * See how many refill periods have passed since we last + * did a refill. + */ + period = le->le_refill_period; + elapsed = now - le->le_last_refill; + if ((period == 0) || (elapsed < period)) { + flag_clear(&le->le_flags, REFILL_INPROGRESS); + return; } - + + /* + * Optimize for the most common case of only one or two + * periods elapsing. + */ + periods = 0; + while ((periods < 2) && (elapsed > 0)) { + periods++; + elapsed -= period; + } + + /* + * OK, it's been a long time. Do a divide to figure out + * how long. + */ + if (elapsed > 0) + periods = (now - le->le_last_refill) / period; + + balance = le->le_credit - le->le_debit; + due = periods * le->le_limit; + if (balance - due < 0) + due = balance; + OSAddAtomic64(due, &le->le_debit); + /* - * Now transfer the limit for the new ledger from the parent + * If we've completely refilled the pool, set the refill time to now. + * Otherwise set it to the time at which it last should have been + * fully refilled. */ - ledger_lock(parent_ledger); - if (parent_ledger->ledger_limit != LEDGER_ITEM_INFINITY) { - /* Would the existing balance exceed the new limit ? */ - if (parent_ledger->ledger_limit - transfer < parent_ledger->ledger_balance) { - ledger_unlock(parent_ledger); - ledger_unlock(ledger_ledger); - return(KERN_RESOURCE_SHORTAGE); + if (balance == due) + le->le_last_refill = now; + else + le->le_last_refill += (le->le_refill_period * periods); + + flag_clear(&le->le_flags, REFILL_INPROGRESS); + + lprintf(("Refill %lld %lld->%lld\n", periods, balance, balance - due)); + if (!limit_exceeded(le)) + ledger_limit_entry_wakeup(le); +} + +static void +ledger_check_new_balance(ledger_t ledger, int entry) +{ + struct ledger_entry *le; + uint64_t now; + + le = &ledger->l_entries[entry]; + + /* Check to see whether we're due a refill */ + if (le->le_refill_period) { + now = mach_absolute_time(); + if ((now - le->le_last_refill) > le->le_refill_period) + ledger_refill(now, ledger, entry); + } + + if (limit_exceeded(le)) { + /* + * We've exceeded the limit for this entry. There + * are several possible ways to handle it. 
We can block, + * we can execute a callback, or we can ignore it. In + * either of the first two cases, we want to set the AST + * flag so we can take the appropriate action just before + * leaving the kernel. The one caveat is that if we have + * already called the callback, we don't want to do it + * again until it gets rearmed. + */ + if ((le->le_flags & LEDGER_ACTION_BLOCK) || + (!(le->le_flags & CALLED_BACK) && + entry_get_callback(ledger, entry))) { + set_astledger(current_thread()); } - if (parent_ledger->ledger_limit - transfer > 0) - parent_ledger->ledger_limit -= transfer; - else - parent_ledger->ledger_limit = 0; + } else { + /* + * The balance on the account is below the limit. If + * there are any threads blocked on this entry, now would + * be a good time to wake them up. + */ + if (le->le_flags & WAKE_NEEDED) + ledger_limit_entry_wakeup(le); } - (*new_ledger)->ledger_limit = transfer; +} - /* Charge the ledger against the ledger_ledger */ - ledger_ledger->ledger_balance += (ledger_item_t)sizeof(ledger_data_t); - ledger_unlock(parent_ledger); +/* + * Add value to an entry in a ledger. + */ +kern_return_t +ledger_credit(ledger_t ledger, int entry, ledger_amount_t amount) +{ + ledger_amount_t old, new; + struct ledger_entry *le; - ledger_unlock(ledger_ledger); - - return(KERN_SUCCESS); + if (!ENTRY_VALID(ledger, entry) || (amount < 0)) + return (KERN_INVALID_VALUE); + + if (amount == 0) + return (KERN_SUCCESS); + + le = &ledger->l_entries[entry]; + + old = OSAddAtomic64(amount, &le->le_credit); + new = old + amount; + lprintf(("%p Credit %lld->%lld\n", current_thread(), old, new)); + ledger_check_new_balance(ledger, entry); + + return (KERN_SUCCESS); } + /* - * Destroy a ledger + * Adjust the limit of a limited resource. This does not affect the + * current balance, so the change doesn't affect the thread until the + * next refill. 
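
One subtlety worth noting before ledger_set_limit(): limits are signed, and limit_exceeded() above treats the two signs asymmetrically, so the same entry type covers both consumption ceilings (a balance growing toward a positive limit) and floors (debits driving the balance below a non-positive limit):

    #include <stdint.h>

    static int
    limit_exceeded(int64_t credit, int64_t debit, int64_t limit)
    {
        int64_t balance = credit - debit;

        if (limit <= 0)
            return balance < limit;   /* floor: too far into deficit */
        return balance > limit;       /* ceiling: consumed too much */
    }
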
*/ -kern_return_t ledger_terminate( - ledger_t ledger) +kern_return_t +ledger_set_limit(ledger_t ledger, int entry, ledger_amount_t limit) { - if (ledger == LEDGER_NULL) - return(KERN_INVALID_ARGUMENT); - - /* You can't deallocate kernel ledgers */ - if (ledger == root_wired_ledger || - ledger == root_paged_ledger) - return(KERN_INVALID_LEDGER); + struct ledger_entry *le; - /* Lock the ledger */ - ledger_lock(ledger); - - /* the parent ledger gets back the limit */ - ledger_lock(ledger->ledger_parent); - if (ledger->ledger_parent->ledger_limit != LEDGER_ITEM_INFINITY) { - assert((natural_t)(ledger->ledger_parent->ledger_limit + - ledger->ledger_limit) < - LEDGER_ITEM_INFINITY); - ledger->ledger_parent->ledger_limit += ledger->ledger_limit; + if (!ENTRY_VALID(ledger, entry)) + return (KERN_INVALID_VALUE); + + lprintf(("ledger_set_limit: %x\n", (uint32_t)limit)); + le = &ledger->l_entries[entry]; + le->le_limit = limit; + le->le_last_refill = 0; + flag_clear(&le->le_flags, CALLED_BACK); + ledger_limit_entry_wakeup(le); + + return (KERN_SUCCESS); +} + +/* + * Add a callback to be executed when the resource goes into deficit + */ +kern_return_t +ledger_set_callback(ledger_template_t template, int entry, + ledger_callback_t func, const void *param0, const void *param1) +{ + struct entry_template *et; + struct ledger_callback *old_cb, *new_cb; + + if ((entry < 0) || (entry >= template->lt_cnt)) + return (KERN_INVALID_VALUE); + + if (func) { + new_cb = (struct ledger_callback *)kalloc(sizeof (*new_cb)); + new_cb->lc_func = func; + new_cb->lc_param0 = param0; + new_cb->lc_param1 = param1; + } else { + new_cb = NULL; } - ledger_unlock(ledger->ledger_parent); - /* - * XXX The spec says that you have to destroy all objects that - * have been created with this ledger. Nice work eh? For now - * Transfer the balance to the parent and let it worry about - * it. - */ - /* XXX the parent ledger inherits the debt ?? */ - (void) ledger_enter(ledger->ledger_parent, ledger->ledger_balance); - - /* adjust the balance of the creation ledger */ - (void) ledger_enter(ledger->ledger_ledger, (ledger_item_t)-sizeof(*ledger)); + template_lock(template); + et = &template->lt_entries[entry]; + old_cb = et->et_callback; + et->et_callback = new_cb; + template_unlock(template); + if (old_cb) + kfree(old_cb, sizeof (*old_cb)); - /* delete the ledger */ - ledger_deallocate(ledger); + return (KERN_SUCCESS); +} - return(KERN_SUCCESS); +/* + * Disable callback notification for a specific ledger entry. + * + * Otherwise, if using a ledger template which specified a + * callback function (ledger_set_callback()), it will be invoked when + * the resource goes into deficit. + */ +kern_return_t +ledger_disable_callback(ledger_t ledger, int entry) +{ + if (!ENTRY_VALID(ledger, entry)) + return (KERN_INVALID_VALUE); + + flag_clear(&ledger->l_entries[entry].le_flags, LEDGER_ACTION_CALLBACK); + return (KERN_SUCCESS); } /* - * Return the ledger limit and balance + * Clear the called_back flag, indicating that we want to be notified + * again when the limit is next exceeded. 
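
CALLED_BACK makes the callback a one-shot: the AST path may find the entry in deficit on every pass, but only the transition into the latched state fires the handler, and ledger_reset_callback() below is the explicit rearm. The latch, modeled standalone:

    #include <stdatomic.h>
    #include <stdint.h>

    #define CALLED_BACK 0x1000

    /* called on every deficit pass; true only for the thread that latches */
    static int
    should_fire(_Atomic uint32_t *le_flags)
    {
        return !(atomic_fetch_or(le_flags, CALLED_BACK) & CALLED_BACK);
    }

    /* ledger_reset_callback(): clear the latch so the next deficit fires */
    static void
    rearm(_Atomic uint32_t *le_flags)
    {
        atomic_fetch_and(le_flags, ~(uint32_t)CALLED_BACK);
    }
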
*/ -kern_return_t ledger_read( - ledger_t ledger, - ledger_item_t *balance, - ledger_item_t *limit) +kern_return_t +ledger_reset_callback(ledger_t ledger, int entry) { - if (ledger == LEDGER_NULL) - return(KERN_INVALID_ARGUMENT); - - ledger_lock(ledger); - *balance = ledger->ledger_balance; - *limit = ledger->ledger_limit; - ledger_unlock(ledger); + if (!ENTRY_VALID(ledger, entry)) + return (KERN_INVALID_VALUE); - return(KERN_SUCCESS); + flag_clear(&ledger->l_entries[entry].le_flags, CALLED_BACK); + return (KERN_SUCCESS); } /* - * Transfer resources from a parent ledger to a child + * Adjust the automatic refill period. */ -kern_return_t ledger_transfer( - ledger_t parent_ledger, - ledger_t child_ledger, - ledger_item_t transfer) +kern_return_t +ledger_set_period(ledger_t ledger, int entry, uint64_t period) { -#define abs(v) ((v) > 0)?(v):-(v) - - ledger_t src, dest; - ledger_item_t amount = abs(transfer); - - if (parent_ledger == LEDGER_NULL) - return(KERN_INVALID_ARGUMENT); + struct ledger_entry *le; - if (child_ledger == LEDGER_NULL) - return(KERN_INVALID_ARGUMENT); + lprintf(("ledger_set_period: %llx\n", period)); + if (!ENTRY_VALID(ledger, entry)) + return (KERN_INVALID_VALUE); - /* Must be different ledgers */ - if (parent_ledger == child_ledger) - return(KERN_INVALID_ARGUMENT); + le = &ledger->l_entries[entry]; + le->le_refill_period = nsecs_to_abstime(period); - if (transfer == 0) - return(KERN_SUCCESS); - - ledger_lock(child_ledger); - ledger_lock(parent_ledger); - - /* XXX Should be the parent you created it from ?? */ - if (parent_ledger != child_ledger->ledger_parent) { - ledger_unlock(parent_ledger); - ledger_unlock(child_ledger); - return(KERN_INVALID_LEDGER); + return (KERN_SUCCESS); +} + +kern_return_t +ledger_set_action(ledger_t ledger, int entry, int action) +{ + lprintf(("ledger_set_action: %d\n", action)); + if (!ENTRY_VALID(ledger, entry)) + return (KERN_INVALID_VALUE); + + flag_set(&ledger->l_entries[entry].le_flags, action); + return (KERN_SUCCESS); +} + +void +set_astledger(thread_t thread) +{ + spl_t s = splsched(); + + if (thread == current_thread()) { + thread_ast_set(thread, AST_LEDGER); + ast_propagate(thread->ast); + } else { + processor_t p; + + thread_lock(thread); + thread_ast_set(thread, AST_LEDGER); + p = thread->last_processor; + if ((p != PROCESSOR_NULL) && (p->state == PROCESSOR_RUNNING) && + (p->active_thread == thread)) + cause_ast_check(p); + thread_unlock(thread); } + + splx(s); +} + +kern_return_t +ledger_debit(ledger_t ledger, int entry, ledger_amount_t amount) +{ + struct ledger_entry *le; + ledger_amount_t old, new; + + if (!ENTRY_VALID(ledger, entry) || (amount < 0)) + return (KERN_INVALID_ARGUMENT); + + if (amount == 0) + return (KERN_SUCCESS); + + le = &ledger->l_entries[entry]; + + old = OSAddAtomic64(amount, &le->le_debit); + new = old + amount; + + lprintf(("%p Debit %lld->%lld\n", thread, old, new)); + ledger_check_new_balance(ledger, entry); + return (KERN_SUCCESS); - if (transfer > 0) { - dest = child_ledger; - src = parent_ledger; +} + +void +ledger_ast(thread_t thread) +{ + struct ledger *l = thread->t_ledger; + struct ledger *thl = thread->t_threadledger; + uint32_t block; + uint64_t now; + kern_return_t ret; + task_t task = thread->task; + + lprintf(("Ledger AST for %p\n", thread)); + + ASSERT(task != NULL); + ASSERT(thread == current_thread()); + +top: + /* + * Make sure this thread is up to date with regards to any task-wide per-thread + * CPU limit. 
+ */ + if ((task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PERTHR_LIMIT) && + ((thread->options & TH_OPT_PROC_CPULIMIT) == 0) ) { + /* + * Task has a per-thread CPU limit on it, and this thread + * needs it applied. + */ + thread_set_cpulimit(THREAD_CPULIMIT_EXCEPTION, task->rusage_cpu_perthr_percentage, + task->rusage_cpu_perthr_interval); + assert((thread->options & TH_OPT_PROC_CPULIMIT) != 0); + } else if (((task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PERTHR_LIMIT) == 0) && + (thread->options & TH_OPT_PROC_CPULIMIT)) { + /* + * Task no longer has a per-thread CPU limit; remove this thread's + * corresponding CPU limit. + */ + thread_set_cpulimit(THREAD_CPULIMIT_EXCEPTION, 0, 0); + assert((thread->options & TH_OPT_PROC_CPULIMIT) == 0); } - else { - src = child_ledger; - dest = parent_ledger; + + /* + * If the task or thread is being terminated, let's just get on with it + */ + if ((l == NULL) || !task->active || task->halting || !thread->active) + return; + + /* + * Examine all entries in deficit to see which might be eligble for + * an automatic refill, which require callbacks to be issued, and + * which require blocking. + */ + block = 0; + now = mach_absolute_time(); + + if (LEDGER_VALID(thl)) { + block |= ledger_check_needblock(thl, now); } + block |= ledger_check_needblock(l, now); - if (src->ledger_limit != LEDGER_ITEM_INFINITY) { - /* Would the existing balance exceed the new limit ? */ - if (src->ledger_limit - amount < src->ledger_balance) { - ledger_unlock(parent_ledger); - ledger_unlock(child_ledger); - return(KERN_RESOURCE_SHORTAGE); + /* + * If we are supposed to block on the availability of one or more + * resources, find the first entry in deficit for which we should wait. + * Schedule a refill if necessary and then sleep until the resource + * becomes available. 
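
From the code that follows, WAKE_NEEDED reads as a hint rather than a lock: by the time the sleeper sets it, assert_wait_deadline() has already queued the thread on the entry's address, so a refill racing in between still finds it via thread_wakeup(); the flag merely spares the credit path a wakeup call when nobody is sleeping, and the deadline bounds the sleep to one refill period regardless. The pairing, condensed from the two routines in this file:

    /* sleeper, from ledger_perform_blocking() below */
    ret = assert_wait_deadline(le, TRUE,
        le->le_last_refill + le->le_refill_period);
    if (ret == THREAD_WAITING) {
        flag_set(&le->le_flags, WAKE_NEEDED);   /* hint, set after queueing */
        (void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, AST_LEDGER);
    }

    /* waker, from ledger_limit_entry_wakeup() above */
    while (le->le_flags & WAKE_NEEDED) {
        flag_clear(&le->le_flags, WAKE_NEEDED);
        thread_wakeup((event_t)le);
    }
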
+ */ + if (block) { + if (LEDGER_VALID(thl)) { + ret = ledger_perform_blocking(thl); + if (ret != KERN_SUCCESS) + goto top; } - if (src->ledger_limit - amount > 0) - src->ledger_limit -= amount; - else - src->ledger_limit = 0; - } + ret = ledger_perform_blocking(l); + if (ret != KERN_SUCCESS) + goto top; + } /* block */ +} - if (dest->ledger_limit != LEDGER_ITEM_INFINITY) { - if ((natural_t)(dest->ledger_limit + amount) - < LEDGER_ITEM_INFINITY) - dest->ledger_limit += amount; - else - dest->ledger_limit = (LEDGER_ITEM_INFINITY - 1); +static uint32_t +ledger_check_needblock(ledger_t l, uint64_t now) +{ + int i; + uint32_t flags, block = 0; + struct ledger_entry *le; + struct ledger_callback *lc; + + + for (i = 0; i < l->l_size; i++) { + le = &l->l_entries[i]; + if (limit_exceeded(le) == FALSE) + continue; + + /* Check for refill eligibility */ + if (le->le_refill_period) { + if ((le->le_last_refill + le->le_refill_period) > now) { + ledger_refill(now, l, i); + if (limit_exceeded(le) == FALSE) + continue; + } + } + + if (le->le_flags & LEDGER_ACTION_BLOCK) + block = 1; + if ((le->le_flags & LEDGER_ACTION_CALLBACK) == 0) + continue; + lc = entry_get_callback(l, i); + assert(lc != NULL); + flags = flag_set(&le->le_flags, CALLED_BACK); + /* Callback has already been called */ + if (flags & CALLED_BACK) + continue; + lc->lc_func(lc->lc_param0, lc->lc_param1); } + return(block); +} - ledger_unlock(parent_ledger); - ledger_unlock(child_ledger); - + +/* return KERN_SUCCESS to continue, KERN_FAILURE to restart */ +static kern_return_t +ledger_perform_blocking(ledger_t l) +{ + int i; + kern_return_t ret; + struct ledger_entry *le; + + for (i = 0; i < l->l_size; i++) { + le = &l->l_entries[i]; + if ((!limit_exceeded(le)) || + ((le->le_flags & LEDGER_ACTION_BLOCK) == 0)) + continue; + + /* Prepare to sleep until the resource is refilled */ + ret = assert_wait_deadline(le, TRUE, + le->le_last_refill + le->le_refill_period); + if (ret != THREAD_WAITING) + return(KERN_SUCCESS); + + /* Mark that somebody is waiting on this entry */ + flag_set(&le->le_flags, WAKE_NEEDED); + + ret = thread_block_reason(THREAD_CONTINUE_NULL, NULL, + AST_LEDGER); + if (ret != THREAD_AWAKENED) + return(KERN_SUCCESS); + + /* + * The world may have changed while we were asleep. + * Some other resource we need may have gone into + * deficit. Or maybe we're supposed to die now. + * Go back to the top and reevaluate. + */ + return(KERN_FAILURE); + } return(KERN_SUCCESS); -#undef abs } -/* - * Routine: convert_port_to_ledger - * Purpose: - * Convert from a port to a ledger. - * Doesn't consume the port ref; the ledger produced may be null. - * Conditions: - * Nothing locked. - */ -ledger_t -convert_port_to_ledger( - ipc_port_t port) +kern_return_t +ledger_get_entries(ledger_t ledger, int entry, ledger_amount_t *credit, + ledger_amount_t *debit) +{ + struct ledger_entry *le; + + if (!ENTRY_VALID(ledger, entry)) + return (KERN_INVALID_ARGUMENT); + + le = &ledger->l_entries[entry]; + + *credit = le->le_credit; + *debit = le->le_debit; + + return (KERN_SUCCESS); +} + +int +ledger_template_info(void **buf, int *len) { - ledger_t ledger = LEDGER_NULL; + struct ledger_template_info *lti; + struct entry_template *et; + int i; + ledger_t l; - if (IP_VALID(port)) { - ip_lock(port); - if (ip_active(port) && - (ip_kotype(port) == IKOT_LEDGER)) - ledger = (ledger_t) port->ip_kobject; - ip_unlock(port); + /* + * Since all tasks share a ledger template, we'll just use the + * caller's as the source. 
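
The contract is worth spelling out, since *len is in/out: on entry it is the caller's capacity (clamped to l_size), on return the number of entries filled, and the buffer is kalloc()'d for the caller to free. A hypothetical in-kernel consumer; the real one is the ledger() syscall glue, which is not part of this hunk:

    void *buf;
    int   cnt = 32;   /* in: capacity; out: entries actually filled */

    if (ledger_template_info(&buf, &cnt) == 0) {
        /* ... copyout(buf, uaddr, cnt * sizeof (struct ledger_template_info)) ... */
        kfree(buf, cnt * sizeof (struct ledger_template_info));
    }
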
+ */ + l = current_task()->ledger; + if ((*len < 0) || (l == NULL)) + return (EINVAL); + + if (*len > l->l_size) + *len = l->l_size; + lti = kalloc((*len) * sizeof (struct ledger_template_info)); + if (lti == NULL) + return (ENOMEM); + *buf = lti; + + template_lock(l->l_template); + et = l->l_template->lt_entries; + + for (i = 0; i < *len; i++) { + memset(lti, 0, sizeof (*lti)); + strlcpy(lti->lti_name, et->et_key, LEDGER_NAME_MAX); + strlcpy(lti->lti_group, et->et_group, LEDGER_NAME_MAX); + strlcpy(lti->lti_units, et->et_units, LEDGER_NAME_MAX); + et++; + lti++; } + template_unlock(l->l_template); - return ledger; + return (0); } -/* - * Routine: convert_ledger_to_port - * Purpose: - * Convert from a ledger to a port. - * Produces a naked send right which may be invalid. - * Conditions: - * Nothing locked. - */ +int +ledger_entry_info(task_t task, void **buf, int *len) +{ + struct ledger_entry_info *lei; + struct ledger_entry *le; + uint64_t now = mach_absolute_time(); + int i; + ledger_t l; + + if ((*len < 0) || ((l = task->ledger) == NULL)) + return (EINVAL); -ipc_port_t -convert_ledger_to_port( - ledger_t ledger) + if (*len > l->l_size) + *len = l->l_size; + lei = kalloc((*len) * sizeof (struct ledger_entry_info)); + if (lei == NULL) + return (ENOMEM); + *buf = lei; + + le = l->l_entries; + + for (i = 0; i < *len; i++) { + memset(lei, 0, sizeof (*lei)); + lei->lei_limit = le->le_limit; + lei->lei_credit = le->le_credit; + lei->lei_debit = le->le_debit; + lei->lei_balance = lei->lei_credit - lei->lei_debit; + lei->lei_refill_period = + abstime_to_nsecs(le->le_refill_period); + lei->lei_last_refill = + abstime_to_nsecs(now - le->le_last_refill); + le++; + lei++; + } + + return (0); +} + +int +ledger_info(task_t task, struct ledger_info *info) { - ipc_port_t port; + ledger_t l; + + if ((l = task->ledger) == NULL) + return (ENOENT); - if (ledger == LEDGER_NULL) - return IP_NULL; + memset(info, 0, sizeof (*info)); - port = ipc_port_make_send(ledger->ledger_self); - return port; + strlcpy(info->li_name, l->l_template->lt_name, LEDGER_NAME_MAX); + info->li_id = l->l_id; + info->li_entries = l->l_size; + return (0); } -/* - * Copy a ledger - */ -ipc_port_t -ledger_copy( - ledger_t ledger) +#ifdef LEDGER_DEBUG +int +ledger_limit(task_t task, struct ledger_limit_args *args) { - if (ledger == LEDGER_NULL) - return IP_NULL; + ledger_t l; + int64_t limit; + int idx; + + if ((l = task->ledger) == NULL) + return (EINVAL); + + idx = ledger_key_lookup(l->l_template, args->lla_name); + if ((idx < 0) || (idx >= l->l_size)) + return (EINVAL); + + /* + * XXX - this doesn't really seem like the right place to have + * a context-sensitive conversion of userspace units into kernel + * units. For now I'll handwave and say that the ledger() system + * call isn't meant for civilians to use - they should be using + * the process policy interfaces. + */ + if (idx == task_ledgers.cpu_time) { + int64_t nsecs; + + if (args->lla_refill_period) { + /* + * If a refill is scheduled, then the limit is + * specified as a percentage of one CPU. The + * syscall specifies the refill period in terms of + * milliseconds, so we need to convert to nsecs. + */ + args->lla_refill_period *= 1000000; + nsecs = args->lla_limit * + (args->lla_refill_period / 100); + lprintf(("CPU limited to %lld nsecs per second\n", + nsecs)); + } else { + /* + * If no refill is scheduled, then this is a + * fixed amount of CPU time (in nsecs) that can + * be consumed. 
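
Concretely, the refill branch above works out as follows:

    /* worked example:
     *   lla_limit         = 25    (percent of one CPU)
     *   lla_refill_period = 1000  (ms)
     *
     *   lla_refill_period *= 1000000       ->  1,000,000,000 ns
     *   nsecs = 25 * (1000000000 / 100)    ->    250,000,000 ns
     *
     * i.e. up to 250ms of CPU time per 1-second window, which
     * nsecs_to_abstime() then converts to mach absolute time units.
     */
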
+ */ + nsecs = args->lla_limit; + lprintf(("CPU limited to %lld nsecs\n", nsecs)); + } + limit = nsecs_to_abstime(nsecs); + } else { + limit = args->lla_limit; + lprintf(("%s limited to %lld\n", args->lla_name, limit)); + } + + if (args->lla_refill_period > 0) + ledger_set_period(l, idx, args->lla_refill_period); - return(ipc_port_copy_send(ledger->ledger_self)); + ledger_set_limit(l, idx, limit); + flag_set(&l->l_entries[idx].le_flags, LEDGER_ACTION_BLOCK); + return (0); } +#endif diff --git a/osfmk/kern/ledger.h b/osfmk/kern/ledger.h index fe0fa2c04..982781686 100644 --- a/osfmk/kern/ledger.h +++ b/osfmk/kern/ledger.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -29,50 +29,106 @@ * @OSF_COPYRIGHT@ */ -#ifdef MACH_KERNEL_PRIVATE - #ifndef _KERN_LEDGER_H_ #define _KERN_LEDGER_H_ -#include +#define LEDGER_INFO 0 +#define LEDGER_ENTRY_INFO 1 +#define LEDGER_TEMPLATE_INFO 2 +#define LEDGER_LIMIT 3 -#include -#include -#include +#define LEDGER_NAME_MAX 32 -struct ledger { - ipc_port_t ledger_self; - ipc_port_t ledger_service_port; - ledger_item_t ledger_balance; - ledger_item_t ledger_limit; - struct ledger *ledger_ledger; - struct ledger *ledger_parent; - decl_simple_lock_data(,lock) +struct ledger_info { + char li_name[LEDGER_NAME_MAX]; + int64_t li_id; + int64_t li_entries; }; -typedef struct ledger ledger_data_t; - -#define ledger_lock(ledger) simple_lock(&(ledger)->lock) -#define ledger_unlock(ledger) simple_unlock(&(ledger)->lock) -#define ledger_lock_init(ledger) \ - simple_lock_init(&(ledger)->lock, 0) - -extern ledger_t root_wired_ledger; -extern ledger_t root_paged_ledger; - -#define root_wired_ledger_port root_wired_ledger->ledger_self -#define root_paged_ledger_port root_paged_ledger->ledger_self - -extern void ledger_init(void) __attribute__((section("__TEXT, initcode"))); - -extern ipc_port_t ledger_copy(ledger_t); +struct ledger_template_info { + char lti_name[LEDGER_NAME_MAX]; + char lti_group[LEDGER_NAME_MAX]; + char lti_units[LEDGER_NAME_MAX]; +}; -extern kern_return_t ledger_enter(ledger_t, ledger_item_t); +struct ledger_entry_info { + int64_t lei_balance; + int64_t lei_credit; + int64_t lei_debit; + uint64_t lei_limit; + uint64_t lei_refill_period; /* In milliseconds */ + uint64_t lei_last_refill; /* Time since last refill */ +}; -extern ledger_t convert_port_to_ledger(ipc_port_t); +struct ledger_limit_args { + char lla_name[LEDGER_NAME_MAX]; + uint64_t lla_limit; + uint64_t lla_refill_period; +}; -extern ipc_port_t convert_ledger_to_port(ledger_t); +#ifdef KERNEL_PRIVATE + +typedef struct ledger_template *ledger_template_t; + +#define LEDGER_VALID(ledger) (ledger != LEDGER_NULL) + +/* Action to take when a ledger goes into deficit */ +#define LEDGER_ACTION_IGNORE 0x0000 +#define LEDGER_ACTION_BLOCK 0x0010 +#define LEDGER_ACTION_EXCEPTION 0x0020 +#define LEDGER_ACTION_CALLBACK 0x0040 +#define LEDGER_ACTION_MASK 0x00f0 + +typedef void (*ledger_callback_t)(const void * param0, const void *param1); + +extern void ledger_init(void); + +extern ledger_template_t ledger_template_create(const char *name); +extern void ledger_template_dereference(ledger_template_t template); +extern int ledger_entry_add(ledger_template_t template, const char *key, + const char *group, const char *units); +extern kern_return_t ledger_set_callback(ledger_template_t template, int entry, + ledger_callback_t callback, const void *param0, const void 
*param1); +extern int ledger_key_lookup(ledger_template_t template, const char *key); + +/* value of entry type */ +#define LEDGER_CREATE_ACTIVE_ENTRIES 0 +#define LEDGER_CREATE_INACTIVE_ENTRIES 1 +extern ledger_t ledger_instantiate(ledger_template_t template, int entry_type); +extern kern_return_t ledger_disable_callback(ledger_t ledger, int entry); +extern kern_return_t ledger_reset_callback(ledger_t ledger, int entry); +extern kern_return_t ledger_set_limit(ledger_t ledger, int entry, + ledger_amount_t limit); +extern kern_return_t ledger_set_action(ledger_t ledger, int entry, int action); +extern kern_return_t ledger_set_period(ledger_t ledger, int entry, + uint64_t period); +extern kern_return_t ledger_entry_setactive(ledger_t ledger, int entry); +extern kern_return_t ledger_credit(ledger_t ledger, int entry, + ledger_amount_t amount); +extern kern_return_t ledger_debit(ledger_t ledger, int entry, + ledger_amount_t amount); +extern kern_return_t ledger_get_entries(ledger_t ledger, int entry, + ledger_amount_t *credit, ledger_amount_t *debit); + +extern void ledger_ast(thread_t thread); +extern void set_astledger(thread_t thread); + +extern int ledger_reference_count(ledger_t ledger); +extern kern_return_t ledger_reference(ledger_t ledger); +extern kern_return_t ledger_dereference(ledger_t ledger); + +/* Per-pmap ledger operations */ +#define pmap_ledger_debit(p, e, a) ledger_debit((p)->ledger, e, a) +#define pmap_ledger_credit(p, e, a) ledger_credit((p)->ledger, e, a) + +/* Support for ledger() syscall */ +#ifdef LEDGER_DEBUG +extern int ledger_limit(task_t task, struct ledger_limit_args *args); +#endif +extern int ledger_info(task_t task, struct ledger_info *info); +extern int ledger_entry_info(task_t task, void **buf, int *len); +extern int ledger_template_info(void **buf, int *len); + +#endif /* KERNEL_PRIVATE */ #endif /* _KERN_LEDGER_H_ */ - -#endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/kern/locks.c b/osfmk/kern/locks.c index 07b9924a1..7ee5a2f4e 100644 --- a/osfmk/kern/locks.c +++ b/osfmk/kern/locks.c @@ -53,7 +53,6 @@ * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ -#include #include #include @@ -366,6 +365,7 @@ void lck_attr_setdefault( lck_attr_t *attr) { +#if __i386__ || __x86_64__ #if !DEBUG if (LcksOpts & enaLkDeb) attr->lck_attr_val = LCK_ATTR_DEBUG; @@ -374,6 +374,9 @@ lck_attr_setdefault( #else attr->lck_attr_val = LCK_ATTR_DEBUG; #endif /* !DEBUG */ +#else +#error Unknown architecture. +#endif /* __arm__ */ } diff --git a/osfmk/kern/mach_clock.c b/osfmk/kern/mach_clock.c deleted file mode 100644 index 779855296..000000000 --- a/osfmk/kern/mach_clock.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * File: clock_prim.c - * Author: Avadis Tevanian, Jr. - * Date: 1986 - * - * Clock primitives. - */ -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#if GPROF -static void prof_tick(boolean_t usermode, natural_t pc); -#endif - -#if STAT_TIME || GPROF -/* - * Hertz rate clock interrupt servicing. Used to update processor - * statistics and perform kernel profiling. 
- */ -void -hertz_tick( -#if GPROF - __unused natural_t ticks, -#else - natural_t ticks, -#endif - boolean_t usermode, -#if GPROF - natural_t pc) -#else - __unused natural_t pc) -#endif -{ - processor_t processor = current_processor(); - thread_t thread = current_thread(); - timer_t state; - - if (usermode) { - TIMER_BUMP(&thread->user_timer, ticks); - - state = &PROCESSOR_DATA(processor, user_state); - } - else { - /* If this thread is idling, do not charge that time as system time */ - if ((thread->state & TH_IDLE) == 0) { - TIMER_BUMP(&thread->system_timer, ticks); - } - - if (processor->state == PROCESSOR_IDLE) - state = &PROCESSOR_DATA(processor, idle_state); - else - state = &PROCESSOR_DATA(processor, system_state); - } - - TIMER_BUMP(state, ticks); - -#if GPROF - prof_tick(usermode, pc); -#endif /* GPROF */ -} - -#endif /* STAT_TIME */ - -#if GPROF - -static void -prof_tick( - boolean_t usermode, - natural_t pc) -{ - struct profile_vars *pv; - prof_uptrint_t s; - - pv = PROFILE_VARS(cpu_number()); - - if (usermode) { - if (pv->active) - PROF_CNT_INC(pv->stats.user_ticks); - } - else { - if (pv->active) { - if (current_processor()->state == CPU_STATE_IDLE) - PROF_CNT_INC(pv->stats.idle_ticks); - else - PROF_CNT_INC(pv->stats.kernel_ticks); - - if ((prof_uptrint_t)pc < _profile_vars.profil_info.lowpc) - PROF_CNT_INC(pv->stats.too_low); - else { - s = (prof_uptrint_t)pc - _profile_vars.profil_info.lowpc; - if (s < pv->profil_info.text_len) { - LHISTCOUNTER *ptr = (LHISTCOUNTER *) pv->profil_buf; - LPROF_CNT_INC(ptr[s / HISTFRACTION]); - } - else - PROF_CNT_INC(pv->stats.too_high); - } - } - } -} - -#endif /* GPROF */ diff --git a/osfmk/kern/mach_param.h b/osfmk/kern/mach_param.h index 44b21a9da..a89e08fc3 100644 --- a/osfmk/kern/mach_param.h +++ b/osfmk/kern/mach_param.h @@ -83,8 +83,6 @@ extern int thread_max, task_threadmax, task_max; #define SET_MAX (task_max + (thread_max * 2) + 200) /* Max number of port sets */ -#define ITE_MAX (1 << 16) /* Max number of splay tree entries */ - #define SPACE_MAX (task_max + 5) /* Max number of IPC spaces */ #define SEMAPHORE_MAX (PORT_MAX >> 1) /* Maximum number of semaphores */ diff --git a/osfmk/kern/misc_protos.h b/osfmk/kern/misc_protos.h index f7fb46b3c..fa9a57251 100644 --- a/osfmk/kern/misc_protos.h +++ b/osfmk/kern/misc_protos.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -112,6 +112,9 @@ extern int copyoutmsg( extern void inval_copy_windows(thread_t); extern void copy_window_fault(thread_t, vm_map_t, int); +extern int copyin_validate(const user_addr_t, uintptr_t, vm_size_t); +extern int copyout_validate(uintptr_t, const user_addr_t, vm_size_t); + extern int sscanf(const char *input, const char *fmt, ...) __scanflike(2,3); /* sprintf() is being deprecated. Please use snprintf() instead. */ @@ -131,8 +134,6 @@ int _consume_printf_args(int, ...); #endif #endif -extern void dbugprintf(const char *format, ...) __printflike(1,2); - extern int kdb_printf(const char *format, ...) __printflike(1,2); extern int kdb_log(const char *format, ...) 
__printflike(1,2); @@ -195,6 +196,11 @@ extern void bootstrap_create(void); extern void Debugger( const char * message); +extern void DebuggerWithContext( + unsigned int reason, + void *ctx, + const char *message); + extern void delay( int n); @@ -215,4 +221,6 @@ user_addr_t get_useraddr(void); /* symbol lookup */ struct kmod_info_t; +extern uint64_t early_random(void); + #endif /* _MISC_PROTOS_H_ */ diff --git a/osfmk/kern/mk_sp.c b/osfmk/kern/mk_sp.c index d8e86124b..83ec87bb1 100644 --- a/osfmk/kern/mk_sp.c +++ b/osfmk/kern/mk_sp.c @@ -131,12 +131,20 @@ thread_policy_common( if (priority < MINPRI) priority = MINPRI; - thread->importance = priority - thread->task_priority; - #if CONFIG_EMBEDDED + if ((thread->task->ext_appliedstate.apptype == PROC_POLICY_IOS_APPLE_DAEMON) && + (thread->appliedstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL)) { + thread->saved_importance = priority - thread->task_priority; + priority = MAXPRI_THROTTLE; + } else { + thread->importance = priority - thread->task_priority; + } /* No one can have a base priority less than MAXPRI_THROTTLE */ if (priority < MAXPRI_THROTTLE) priority = MAXPRI_THROTTLE; +#else /* CONFIG_EMBEDDED */ + thread->importance = priority - thread->task_priority; + #endif /* CONFIG_EMBEDDED */ set_priority(thread, priority); diff --git a/osfmk/kern/mk_timer.c b/osfmk/kern/mk_timer.c index 6bbb1b194..f0dd81cc9 100644 --- a/osfmk/kern/mk_timer.c +++ b/osfmk/kern/mk_timer.c @@ -81,7 +81,7 @@ mk_timer_create_trap( } simple_lock_init(&timer->lock, 0); - call_entry_setup(&timer->call_entry, mk_timer_expire, timer); + thread_call_setup(&timer->call_entry, mk_timer_expire, timer); timer->is_armed = timer->is_dead = FALSE; timer->active = 0; @@ -190,6 +190,18 @@ mk_timer_expire( simple_unlock(&timer->lock); } +/* + * mk_timer_destroy_trap: Destroy the Mach port associated with a timer + * + * Parameters: args User argument descriptor (see below) + * + * Indirect: args->name Mach port name + * + * + * Returns: 0 Success + * !0 Not success + * + */ kern_return_t mk_timer_destroy_trap( struct mk_timer_destroy_trap_args *args) @@ -215,6 +227,19 @@ mk_timer_destroy_trap( return (result); } +/* + * mk_timer_arm_trap: Start (arm) a timer + * + * Parameters: args User argument descriptor (see below) + * + * Indirect: args->name Mach port name + * args->expire_time Time when timer expires + * + * + * Returns: 0 Success + * !0 Not success + * + */ kern_return_t mk_timer_arm_trap( struct mk_timer_arm_trap_args *args) @@ -254,6 +279,19 @@ mk_timer_arm_trap( return (result); } +/* + * mk_timer_cancel_trap: Cancel a timer + * + * Parameters: args User argument descriptor (see below) + * + * Indirect: args->name Mach port name + * args->result_time The armed time of the cancelled timer (return value) + * + * + * Returns: 0 Success + * !0 Not success + * + */ kern_return_t mk_timer_cancel_trap( struct mk_timer_cancel_trap_args *args) @@ -278,7 +316,7 @@ mk_timer_cancel_trap( ip_unlock(port); if (timer->is_armed) { - armed_time = timer->call_entry.deadline; + armed_time = timer->call_entry.tc_call.deadline; if (thread_call_cancel(&timer->call_entry)) timer->active--; timer->is_armed = FALSE; diff --git a/osfmk/kern/mk_timer.h b/osfmk/kern/mk_timer.h index c67d74ba9..adcad2133 100644 --- a/osfmk/kern/mk_timer.h +++ b/osfmk/kern/mk_timer.h @@ -40,11 +40,11 @@ #ifdef MACH_KERNEL_PRIVATE #include -#include +#include struct mk_timer { - decl_simple_lock_data(,lock) - call_entry_data_t call_entry; + decl_simple_lock_data(,lock); + thread_call_data_t call_entry; 
uint32_t is_dead:1, is_armed:1; int active; diff --git a/osfmk/kern/printf.c b/osfmk/kern/printf.c index 730be5c81..88813d844 100644 --- a/osfmk/kern/printf.c +++ b/osfmk/kern/printf.c @@ -155,7 +155,6 @@ */ #include -#include #include #include #include @@ -765,24 +764,6 @@ cons_putc_locked( cnputc(c); } -#if MACH_KDB -extern void db_putchar(char c); -#endif - -void -dbugprintf(__unused const char *fmt, ...) -{ - -#if MACH_KDB - va_list listp; - - va_start(listp, fmt); - _doprnt(fmt, &listp, db_putchar, 16); - va_end(listp); -#endif - return; -} - int printf(const char *fmt, ...) { diff --git a/osfmk/kern/priority.c b/osfmk/kern/priority.c index 3273a4f6c..a7fa6ea44 100644 --- a/osfmk/kern/priority.c +++ b/osfmk/kern/priority.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,6 +73,7 @@ #include #include #include +#include #include /* @@ -94,6 +95,16 @@ thread_quantum_expire( SCHED_STATS_QUANTUM_TIMER_EXPIRATION(processor); + /* + * We bill CPU time to both the individual thread and its task. + * + * Because this balance adjustment could potentially attempt to wake this very + * thread, we must credit the ledger before taking the thread lock. The ledger + * pointers are only manipulated by the thread itself at the ast boundary. + */ + ledger_credit(thread->t_ledger, task_ledgers.cpu_time, thread->current_quantum); + ledger_credit(thread->t_threadledger, thread_ledgers.cpu_time, thread->current_quantum); + thread_lock(thread); /* @@ -101,19 +112,18 @@ thread_quantum_expire( * continue without re-entering the scheduler, so update this now. */ thread->last_run_time = processor->quantum_end; - + /* * Check for fail-safe trip. */ - if ((thread->sched_mode == TH_MODE_REALTIME || thread->sched_mode == TH_MODE_FIXED) && - !(thread->sched_flags & TH_SFLAG_PROMOTED) && - !(thread->options & TH_OPT_SYSTEM_CRITICAL)) { - uint64_t new_computation; - - new_computation = processor->quantum_end - thread->computation_epoch; - new_computation += thread->computation_metered; - if (new_computation > max_unsafe_computation) { - + if ((thread->sched_mode == TH_MODE_REALTIME || thread->sched_mode == TH_MODE_FIXED) && + !(thread->sched_flags & TH_SFLAG_PROMOTED) && + !(thread->options & TH_OPT_SYSTEM_CRITICAL)) { + uint64_t new_computation; + + new_computation = processor->quantum_end - thread->computation_epoch; + new_computation += thread->computation_metered; + if (new_computation > max_unsafe_computation) { KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_FAILSAFE)|DBG_FUNC_NONE, (uintptr_t)thread->sched_pri, (uintptr_t)thread->sched_mode, 0, 0, 0); @@ -158,7 +168,23 @@ thread_quantum_expire( thread_quantum_init(thread); thread->last_quantum_refill_time = processor->quantum_end; - processor->quantum_end += thread->current_quantum; + /* Reload precise timing global policy to thread-local policy */ + thread->precise_user_kernel_time = use_precise_user_kernel_time(thread); + + /* + * Since non-precise user/kernel time doesn't update the state/thread timer + * during privilege transitions, synthesize an event now. 
+ */ + if (!thread->precise_user_kernel_time) { + timer_switch(PROCESSOR_DATA(processor, current_state), + processor->quantum_end, + PROCESSOR_DATA(processor, current_state)); + timer_switch(PROCESSOR_DATA(processor, thread_timer), + processor->quantum_end, + PROCESSOR_DATA(processor, thread_timer)); + } + + processor->quantum_end = mach_absolute_time() + thread->current_quantum; timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end, TIMER_CALL_CRITICAL); @@ -449,6 +475,50 @@ update_priority( thread->sched_flags &= ~TH_SFLAG_FAILSAFE; } +#if CONFIG_EMBEDDED + /* Check for pending throttle transitions, and safely switch queues */ + if (thread->sched_flags & TH_SFLAG_PENDING_THROTTLE_MASK) { + boolean_t removed = thread_run_queue_remove(thread); + + if (thread->sched_flags & TH_SFLAG_PENDING_THROTTLE_DEMOTION) { + if (thread->sched_mode == TH_MODE_REALTIME) { + thread->saved_mode = thread->sched_mode; + thread->sched_mode = TH_MODE_TIMESHARE; + + if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) + sched_share_incr(); + } else { + /* + * It's possible that this is a realtime thread that has + * already tripped the failsafe, in which case saved_mode + * is already set correctly. + */ + if (!(thread->sched_flags & TH_SFLAG_FAILSAFE)) { + thread->saved_mode = thread->sched_mode; + } + thread->sched_flags &= ~TH_SFLAG_FAILSAFE; + } + thread->sched_flags |= TH_SFLAG_THROTTLED; + + } else { + if ((thread->sched_mode == TH_MODE_TIMESHARE) + && (thread->saved_mode == TH_MODE_REALTIME)) { + if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) + sched_share_decr(); + } + + thread->sched_mode = thread->saved_mode; + thread->saved_mode = TH_MODE_NONE; + thread->sched_flags &= ~TH_SFLAG_THROTTLED; + } + + thread->sched_flags &= ~(TH_SFLAG_PENDING_THROTTLE_MASK); + + if (removed) + thread_setrun(thread, SCHED_TAILQ); + } +#endif + /* * Recompute scheduled priority if appropriate. 
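*/

The CONFIG_EMBEDDED hunk above performs a two-phase mode switch: a demotion saves the thread's scheduling mode before forcing it down to timeshare, and the matching undemotion restores exactly what was saved. A minimal user-space model of that round trip is sketched below; thread_model, throttle_demote and throttle_undemote are illustrative names rather than xnu API, and the run-queue remove/re-enqueue and sched_share_incr/decr bookkeeping are deliberately elided.

	#include <assert.h>
	#include <stdio.h>

	enum sched_mode { MODE_NONE, MODE_REALTIME, MODE_FIXED, MODE_TIMESHARE };

	struct thread_model {
		enum sched_mode	sched_mode;
		enum sched_mode	saved_mode;
		int		throttled;
	};

	/* Demotion: remember the current mode, fall back to timeshare. */
	static void
	throttle_demote(struct thread_model *t)
	{
		t->saved_mode = t->sched_mode;
		if (t->sched_mode == MODE_REALTIME)
			t->sched_mode = MODE_TIMESHARE;
		t->throttled = 1;
	}

	/* Undemotion: restore the saved mode exactly once. */
	static void
	throttle_undemote(struct thread_model *t)
	{
		t->sched_mode = t->saved_mode;
		t->saved_mode = MODE_NONE;
		t->throttled = 0;
	}

	int
	main(void)
	{
		struct thread_model t = { MODE_REALTIME, MODE_NONE, 0 };

		throttle_demote(&t);
		assert(t.sched_mode == MODE_TIMESHARE && t.saved_mode == MODE_REALTIME);
		throttle_undemote(&t);
		assert(t.sched_mode == MODE_REALTIME && t.saved_mode == MODE_NONE);
		puts("mode preserved across demotion round trip");
		return 0;
	}

The TH_SFLAG_FAILSAFE special case in the real code exists because a realtime thread may already have been demoted by the fail-safe, in which case saved_mode is already correct and must not be overwritten.

/*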
*/ diff --git a/osfmk/kern/processor.c b/osfmk/kern/processor.c index b0771351f..23a549611 100644 --- a/osfmk/kern/processor.c +++ b/osfmk/kern/processor.c @@ -332,34 +332,41 @@ processor_info( case PROCESSOR_CPU_LOAD_INFO: { - register processor_cpu_load_info_t cpu_load_info; + processor_cpu_load_info_t cpu_load_info; + timer_data_t idle_temp; + timer_t idle_state; if (*count < PROCESSOR_CPU_LOAD_INFO_COUNT) return (KERN_FAILURE); cpu_load_info = (processor_cpu_load_info_t) info; - cpu_load_info->cpu_ticks[CPU_STATE_USER] = + if (precise_user_kernel_time) { + cpu_load_info->cpu_ticks[CPU_STATE_USER] = (uint32_t)(timer_grab(&PROCESSOR_DATA(processor, user_state)) / hz_tick_interval); - cpu_load_info->cpu_ticks[CPU_STATE_SYSTEM] = + cpu_load_info->cpu_ticks[CPU_STATE_SYSTEM] = (uint32_t)(timer_grab(&PROCESSOR_DATA(processor, system_state)) / hz_tick_interval); - { - timer_data_t idle_temp; - timer_t idle_state; + } else { + uint64_t tval = timer_grab(&PROCESSOR_DATA(processor, user_state)) + + timer_grab(&PROCESSOR_DATA(processor, system_state)); + + cpu_load_info->cpu_ticks[CPU_STATE_USER] = (uint32_t)(tval / hz_tick_interval); + cpu_load_info->cpu_ticks[CPU_STATE_SYSTEM] = 0; + } idle_state = &PROCESSOR_DATA(processor, idle_state); idle_temp = *idle_state; if (PROCESSOR_DATA(processor, current_state) != idle_state || - timer_grab(&idle_temp) != timer_grab(idle_state)) + timer_grab(&idle_temp) != timer_grab(idle_state)) { cpu_load_info->cpu_ticks[CPU_STATE_IDLE] = (uint32_t)(timer_grab(&PROCESSOR_DATA(processor, idle_state)) / hz_tick_interval); - else { + } else { timer_advance(&idle_temp, mach_absolute_time() - idle_temp.tstamp); cpu_load_info->cpu_ticks[CPU_STATE_IDLE] = (uint32_t)(timer_grab(&idle_temp) / hz_tick_interval); } - } + cpu_load_info->cpu_ticks[CPU_STATE_NICE] = 0; *count = PROCESSOR_CPU_LOAD_INFO_COUNT; @@ -525,6 +532,9 @@ processor_get_assignment( { int state; + if (processor == PROCESSOR_NULL) + return(KERN_INVALID_ARGUMENT); + state = processor->state; if (state == PROCESSOR_SHUTDOWN || state == PROCESSOR_OFF_LINE) return(KERN_FAILURE); diff --git a/osfmk/kern/queue.h b/osfmk/kern/queue.h index 836b55293..2202f812d 100644 --- a/osfmk/kern/queue.h +++ b/osfmk/kern/queue.h @@ -614,8 +614,12 @@ MACRO_END */ struct mpqueue_head { struct queue_entry head; /* header for queue */ +#if defined(__i386__) || defined(__x86_64__) lck_mtx_t lock_data; lck_mtx_ext_t lock_data_ext; +#else + lck_spin_t lock_data; +#endif }; typedef struct mpqueue_head mpqueue_head_t; diff --git a/osfmk/kern/sched.h b/osfmk/kern/sched.h index 9532f4095..8f87afad2 100644 --- a/osfmk/kern/sched.h +++ b/osfmk/kern/sched.h @@ -67,8 +67,6 @@ #ifndef _KERN_SCHED_H_ #define _KERN_SCHED_H_ -#include - #include #include #include @@ -288,7 +286,8 @@ extern uint32_t default_timeshare_constraint; extern uint32_t max_rt_quantum, min_rt_quantum; -extern uint32_t sched_cswtime; +extern int default_preemption_rate; +extern int default_bg_preemption_rate; #if defined(CONFIG_SCHED_TRADITIONAL) @@ -319,6 +318,9 @@ extern void compute_memory_pressure( extern void compute_zone_gc_throttle( void *arg); +extern void compute_pageout_gc_throttle( + void *arg); + extern void compute_pmap_gc_throttle( void *arg); diff --git a/osfmk/kern/sched_average.c b/osfmk/kern/sched_average.c index 5db621937..d2a2ce6cb 100644 --- a/osfmk/kern/sched_average.c +++ b/osfmk/kern/sched_average.c @@ -104,7 +104,8 @@ static struct sched_average { { compute_averunnable, &sched_nrun, 5, 0 }, { compute_stack_target, NULL, 5, 1 }, { 
compute_memory_pressure, NULL, 1, 0 }, - { compute_zone_gc_throttle, NULL, 1, 0 }, + { compute_zone_gc_throttle, NULL, 60, 0 }, + { compute_pageout_gc_throttle, NULL, 1, 0 }, { compute_pmap_gc_throttle, NULL, 60, 0 }, { NULL, NULL, 0, 0 } }; diff --git a/osfmk/kern/sched_fixedpriority.c b/osfmk/kern/sched_fixedpriority.c index 1eca4aaac..ccde4a094 100644 --- a/osfmk/kern/sched_fixedpriority.c +++ b/osfmk/kern/sched_fixedpriority.c @@ -551,9 +551,15 @@ static ast_t sched_fixedpriority_processor_csw_check(processor_t processor) { run_queue_t runq; - + boolean_t has_higher; + runq = runq_for_processor(processor); - if (runq->highq > processor->current_pri) { + if (first_timeslice(processor)) { + has_higher = (runq->highq > processor->current_pri); + } else { + has_higher = (runq->highq >= processor->current_pri); + } + if (has_higher) { if (runq->urgency > 0) return (AST_PREEMPT | AST_URGENT); @@ -647,6 +653,61 @@ sched_fixedpriority_update_priority(thread_t thread) } +#if CONFIG_EMBEDDED + /* Check for pending throttle transitions, and safely switch queues */ + if ((thread->sched_flags & TH_SFLAG_PENDING_THROTTLE_MASK) && (thread->bound_processor == PROCESSOR_NULL)) { + boolean_t removed = thread_run_queue_remove(thread); + + if (thread->sched_flags & TH_SFLAG_PENDING_THROTTLE_DEMOTION) { + if (thread->sched_mode == TH_MODE_REALTIME) { + thread->saved_mode = thread->sched_mode; + thread->sched_mode = TH_MODE_TIMESHARE; + + if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) + sched_share_incr(); + } else { + /* + * It's possible that this is a realtime thread that has + * already tripped the failsafe, in which case it should not + * degrade further. + */ + if (!(thread->sched_flags & TH_SFLAG_FAILSAFE)) { + + thread->saved_mode = thread->sched_mode; + + if (thread->sched_mode == TH_MODE_TIMESHARE) { + thread->sched_mode = TH_MODE_FAIRSHARE; + } + } + } + thread->sched_flags |= TH_SFLAG_THROTTLED; + + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_FAIRSHARE_ENTER) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), 0xFFFFFFFF, 0, 0, 0); + + } else { + if ((thread->sched_mode == TH_MODE_TIMESHARE) + && (thread->saved_mode == TH_MODE_REALTIME)) { + if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) + sched_share_decr(); + } + + thread->sched_mode = thread->saved_mode; + thread->saved_mode = TH_MODE_NONE; + thread->sched_flags &= ~TH_SFLAG_THROTTLED; + + KERNEL_DEBUG_CONSTANT1( + MACHDBG_CODE(DBG_MACH_SCHED,MACH_FAIRSHARE_EXIT) | DBG_FUNC_NONE, 0, 0, 0, 0, thread_tid(thread)); + + } + + thread->sched_flags &= ~(TH_SFLAG_PENDING_THROTTLE_MASK); + + if (removed) + thread_setrun(thread, SCHED_TAILQ); + } +#endif + /* * Check for fail-safe release. */ diff --git a/osfmk/kern/sched_grrr.c b/osfmk/kern/sched_grrr.c index d27b29e87..0c4d1a3d0 100644 --- a/osfmk/kern/sched_grrr.c +++ b/osfmk/kern/sched_grrr.c @@ -231,7 +231,6 @@ const struct sched_dispatch_table sched_grrr_dispatch = { TRUE /* direct_dispatch_to_idle_processors */ }; -extern int default_preemption_rate; extern int max_unsafe_quanta; static uint32_t grrr_quantum_us; diff --git a/osfmk/kern/sched_prim.c b/osfmk/kern/sched_prim.c index d7b959249..5f4803119 100644 --- a/osfmk/kern/sched_prim.c +++ b/osfmk/kern/sched_prim.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
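*/

Both sched_fixedpriority_processor_csw_check() above and processor_csw_check() later in this patch move from a strict ">" comparison to a quantum-sensitive one: while the running thread is still in its first timeslice, only a strictly higher-priority waiter may preempt, but once the quantum has expired an equal-priority waiter gets a turn, giving round-robin behaviour among peers. A self-contained model of just that predicate (should_preempt is an illustrative name, not xnu's):

	#include <stdbool.h>
	#include <stdio.h>

	static bool
	should_preempt(bool first_timeslice, int runq_highq, int current_pri)
	{
		return first_timeslice ? (runq_highq > current_pri)
				       : (runq_highq >= current_pri);
	}

	int
	main(void)
	{
		/* Two priority-31 threads: no preemption mid-quantum... */
		printf("%d\n", should_preempt(true, 31, 31));	/* prints 0 */
		/* ...but the waiter runs once the quantum is used up. */
		printf("%d\n", should_preempt(false, 31, 31));	/* prints 1 */
		return 0;
	}

/*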
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -65,9 +65,6 @@ */ #include -#include - -#include #include #include @@ -98,6 +95,7 @@ #include #include #include +#include #include #include @@ -122,6 +120,9 @@ decl_simple_lock_data(static,fs_lock); #define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */ int default_preemption_rate = DEFAULT_PREEMPTION_RATE; +#define DEFAULT_BG_PREEMPTION_RATE 400 /* (1/s) */ +int default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE; + #define MAX_UNSAFE_QUANTA 800 int max_unsafe_quanta = MAX_UNSAFE_QUANTA; @@ -140,8 +141,10 @@ uint64_t sched_safe_duration; uint32_t std_quantum; uint32_t min_std_quantum; +uint32_t bg_quantum; uint32_t std_quantum_us; +uint32_t bg_quantum_us; #endif /* CONFIG_SCHED_TRADITIONAL */ @@ -152,8 +155,6 @@ uint32_t default_timeshare_constraint; uint32_t max_rt_quantum; uint32_t min_rt_quantum; -uint32_t sched_cswtime; - #if defined(CONFIG_SCHED_TRADITIONAL) unsigned sched_tick; @@ -594,6 +595,12 @@ sched_traditional_init(void) printf("standard timeslicing quantum is %d us\n", std_quantum_us); + if (default_bg_preemption_rate < 1) + default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE; + bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate; + + printf("standard background quantum is %d us\n", bg_quantum_us); + load_shift_init(); preempt_pri_init(); sched_tick = 0; @@ -616,6 +623,12 @@ sched_traditional_timebase_init(void) assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); min_std_quantum = (uint32_t)abstime; + /* quantum for background tasks */ + clock_interval_to_absolutetime_interval( + bg_quantum_us, NSEC_PER_USEC, &abstime); + assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); + bg_quantum = (uint32_t)abstime; + /* scheduler tick interval */ clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT, NSEC_PER_USEC, &abstime); @@ -911,9 +924,12 @@ thread_unblock( thread->computation_metered = 0; thread->reason = AST_NONE; - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), thread->sched_pri, 0, 0, 0); + /* Event should only be triggered if thread is not already running */ + if (result == FALSE) { + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE, + (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result, 0, 0); + } DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, thread->task->bsd_info); @@ -985,7 +1001,8 @@ thread_mark_wait_locked( (!at_safe_point && (thread->sched_flags & TH_SFLAG_ABORTSAFELY))) { - DTRACE_SCHED(sleep); + if ( !(thread->state & TH_TERMINATE)) + DTRACE_SCHED(sleep); thread->state |= (interruptible) ? 
TH_WAIT : (TH_WAIT | TH_UNINT); thread->at_safe_point = at_safe_point; @@ -1062,6 +1079,10 @@ assert_wait( assert(event != NO_EVENT); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE(event), 0, 0, 0, 0); + index = wait_hash(event); wq = &wait_queues[index]; return wait_queue_assert_wait(wq, event, interruptible, 0); @@ -1088,6 +1109,11 @@ assert_wait_timeout( thread_lock(thread); clock_interval_to_deadline(interval, scale_factor, &deadline); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0); + wresult = wait_queue_assert_wait64_locked(wqueue, CAST_DOWN(event64_t, event), interruptible, deadline, thread); @@ -1116,6 +1142,10 @@ assert_wait_deadline( wait_queue_lock(wqueue); thread_lock(thread); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0); + wresult = wait_queue_assert_wait64_locked(wqueue, CAST_DOWN(event64_t,event), interruptible, deadline, thread); @@ -1325,6 +1355,16 @@ thread_unstop( splx(s); } +/* + * Thread locked, returns the same way + */ +static inline boolean_t +thread_isoncpu(thread_t thread) +{ + processor_t processor = thread->last_processor; + + return ((processor != PROCESSOR_NULL) && (processor->active_thread == thread)); +} /* * thread_wait: * @@ -1333,19 +1373,32 @@ thread_unstop( */ void thread_wait( - thread_t thread) + thread_t thread, + boolean_t until_not_runnable) { wait_result_t wresult; - spl_t s = splsched(); + boolean_t oncpu; + processor_t processor; + spl_t s = splsched(); wake_lock(thread); thread_lock(thread); - while (thread->state & TH_RUN) { - processor_t processor = thread->last_processor; + /* + * Wait until not running on a CPU. If stronger requirement + * desired, wait until not runnable. Assumption: if thread is + * on CPU, then TH_RUN is set, so we're not waiting in any case + * where the original, pure "TH_RUN" check would have let us + * finish. + */ + while ((oncpu = thread_isoncpu(thread)) || + (until_not_runnable && (thread->state & TH_RUN))) { - if (processor != PROCESSOR_NULL && processor->active_thread == thread) + if (oncpu) { + assert(thread->state & TH_RUN); + processor = thread->last_processor; cause_ast_check(processor); + } thread->wake_active = TRUE; thread_unlock(thread); @@ -1481,7 +1534,7 @@ thread_wakeup_prim_internal( if (one_thread) return (wait_queue_wakeup_one(wq, event, result, priority)); else - return (wait_queue_wakeup_all(wq, event, result)); + return (wait_queue_wakeup_all(wq, event, result)); } /* @@ -1766,6 +1819,9 @@ thread_select_idle( processor->current_pri = IDLEPRI; processor->current_thmode = TH_MODE_NONE; + /* Reload precise timing global policy to thread-local policy */ + thread->precise_user_kernel_time = use_precise_user_kernel_time(thread); + thread_unlock(thread); /* @@ -1982,6 +2038,9 @@ thread_invoke( assert(thread_runnable(thread)); #endif + /* Reload precise timing global policy to thread-local policy */ + thread->precise_user_kernel_time = use_precise_user_kernel_time(thread); + /* * Allow time constraint threads to hang onto * a stack. 
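The thread_wait()/thread_isoncpu() change earlier in this hunk splits "waiting for a thread" into two strengths: every caller waits at least until the target is off its CPU, and callers that pass until_not_runnable additionally wait out TH_RUN. The loop condition reduces to the predicate modelled below; the struct names and fields are stand-ins for illustration, not the xnu structures.

	#include <assert.h>
	#include <stdbool.h>
	#include <stddef.h>

	struct thr;
	struct cpu { const struct thr *active; };
	struct thr { const struct cpu *last_processor; bool runnable; };

	/* Mirrors thread_isoncpu(): on a CPU iff it is that CPU's active thread. */
	static bool
	isoncpu(const struct thr *t)
	{
		return t->last_processor != NULL && t->last_processor->active == t;
	}

	static bool
	must_keep_waiting(const struct thr *t, bool until_not_runnable)
	{
		return isoncpu(t) || (until_not_runnable && t->runnable);
	}

	int
	main(void)
	{
		struct cpu c = { NULL };
		struct thr t = { &c, true };	/* runnable but preempted, off CPU */

		assert(!must_keep_waiting(&t, false));	/* weak wait is satisfied */
		assert(must_keep_waiting(&t, true));	/* strong wait keeps going */
		return 0;
	}

As the comment in thread_wait() notes, a thread that is on a CPU necessarily has TH_RUN set, so every state the new default waits out would also have been waited out by the old pure "TH_RUN" check.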
@@ -2025,9 +2084,20 @@ thread_invoke( self->last_run_time = processor->last_dispatch; thread_timer_event(processor->last_dispatch, &thread->system_timer); PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer; + + /* + * Since non-precise user/kernel time doesn't update the state timer + * during privilege transitions, synthesize an event now. + */ + if (!thread->precise_user_kernel_time) { + timer_switch(PROCESSOR_DATA(processor, current_state), + processor->last_dispatch, + PROCESSOR_DATA(processor, current_state)); + } - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE, - self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE, + self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) { KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE, @@ -2062,8 +2132,9 @@ thread_invoke( counter(++c_thread_invoke_same); thread_unlock(self); - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE, - self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE, + self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); self->continuation = self->parameter = NULL; @@ -2092,8 +2163,9 @@ need_stack: counter(++c_thread_invoke_same); thread_unlock(self); - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE, - self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE, + self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); return (TRUE); } @@ -2126,8 +2198,20 @@ need_stack: thread_timer_event(processor->last_dispatch, &thread->system_timer); PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer; - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE, - self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); + /* + * Since non-precise user/kernel time doesn't update the state timer + * during privilege transitions, synthesize an event now. + */ + if (!thread->precise_user_kernel_time) { + timer_switch(PROCESSOR_DATA(processor, current_state), + processor->last_dispatch, + PROCESSOR_DATA(processor, current_state)); + } + + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE, + self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) { KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE, @@ -2143,7 +2227,9 @@ need_stack: * and address space if required. We will next run * as a result of a subsequent context switch. 
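*/

When precise user/kernel accounting is disabled, the state timer is not switched at every privilege transition, so the context-switch paths above synthesize one event by "switching" the current state timer to itself: the elapsed span is folded into the accumulator and the timestamp is restarted. A toy model of that accumulate-and-restamp semantic (timer_model and timer_switch_model are illustrative, assuming timer_switch(old, now, new) closes old out at now and starts new there):

	#include <stdint.h>
	#include <stdio.h>

	struct timer_model {
		uint64_t	total;	/* accumulated time */
		uint64_t	tstamp;	/* when accumulation (re)started */
	};

	static void
	timer_switch_model(struct timer_model *oldt, uint64_t now,
	    struct timer_model *newt)
	{
		oldt->total += now - oldt->tstamp;
		newt->tstamp = now;
	}

	int
	main(void)
	{
		struct timer_model state = { 0, 100 };

		/* Self-switch at t=250: the synthesized event. */
		timer_switch_model(&state, 250, &state);
		printf("total %llu, restarted at %llu\n",
		    (unsigned long long)state.total,
		    (unsigned long long)state.tstamp);	/* 150, 250 */
		return 0;
	}

/*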
*/ + assert(continuation == self->continuation); thread = machine_switch_context(self, continuation, thread); + assert(self == current_thread()); TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread); DTRACE_SCHED(on__cpu); @@ -2192,15 +2278,34 @@ thread_dispatch( stack_free(thread); if (!(thread->state & TH_IDLE)) { + int64_t consumed; + int64_t remainder = 0; + + if (processor->quantum_end > processor->last_dispatch) + remainder = processor->quantum_end - + processor->last_dispatch; + + consumed = thread->current_quantum - remainder; + + if ((thread->reason & AST_LEDGER) == 0) + /* + * Bill CPU time to both the individual thread + * and the task. + */ + ledger_credit(thread->t_ledger, + task_ledgers.cpu_time, consumed); + ledger_credit(thread->t_threadledger, + thread_ledgers.cpu_time, consumed); + wake_lock(thread); thread_lock(thread); /* * Compute remainder of current quantum. */ - if ( first_timeslice(processor) && - processor->quantum_end > processor->last_dispatch ) - thread->current_quantum = (uint32_t)(processor->quantum_end - processor->last_dispatch); + if (first_timeslice(processor) && + processor->quantum_end > processor->last_dispatch) + thread->current_quantum = (uint32_t)remainder; else thread->current_quantum = 0; @@ -2222,7 +2327,7 @@ thread_dispatch( */ if (thread->current_quantum < min_std_quantum) { thread->reason |= AST_QUANTUM; - thread->current_quantum += std_quantum; + thread->current_quantum += SCHED(initial_quantum_size)(thread); } #endif } @@ -2253,7 +2358,15 @@ thread_dispatch( thread->reason = AST_NONE; - thread_unlock(thread); + if (thread->wake_active) { + thread->wake_active = FALSE; + thread_unlock(thread); + + thread_wakeup(&thread->wake_active); + } + else + thread_unlock(thread); + wake_unlock(thread); } else { @@ -2382,13 +2495,10 @@ thread_block_reason( self->continuation = continuation; self->parameter = parameter; - if (__improbable(kdebug_thread_block && kdebug_enable && self->state != TH_RUN)) { - uint32_t bt[8]; - - OSBacktrace((void **)&bt[0], 8); - - KERNEL_DEBUG_CONSTANT(0x140004c | DBG_FUNC_START, bt[0], bt[1], bt[2], bt[3], 0); - KERNEL_DEBUG_CONSTANT(0x140004c | DBG_FUNC_END, bt[4], bt[5], bt[6], bt[7], 0); + if (__improbable(kdebug_thread_block && kdebug_enable && self->state != TH_RUN)) { + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK), + reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0); } do { @@ -2471,9 +2581,9 @@ void thread_continue( register thread_t thread) { - register thread_t self = current_thread(); + register thread_t self = current_thread(); register thread_continue_t continuation; - register void *parameter; + register void *parameter; DTRACE_SCHED(on__cpu); @@ -2506,9 +2616,12 @@ thread_quantum_init(thread_t thread) #if defined(CONFIG_SCHED_TRADITIONAL) static uint32_t -sched_traditional_initial_quantum_size(thread_t thread __unused) +sched_traditional_initial_quantum_size(thread_t thread) { - return std_quantum; + if ((thread == THREAD_NULL) || thread->priority > MAXPRI_THROTTLE) + return std_quantum; + else + return bg_quantum; } static sched_mode_t @@ -2851,7 +2964,7 @@ realtime_setrun( int prstate = processor->state; if (processor == current_processor()) ast_on(AST_PREEMPT | AST_URGENT); - else if ((prstate == PROCESSOR_DISPATCHING) || (prstate == PROCESSOR_IDLE)) + else if ((prstate == PROCESSOR_IDLE) || (prstate == PROCESSOR_DISPATCHING)) machine_signal_idle(processor); else cause_ast_check(processor); @@ -3021,11 
+3134,17 @@ static ast_t processor_csw_check(processor_t processor) { run_queue_t runq; + boolean_t has_higher; assert(processor->active_thread != NULL); runq = runq_for_processor(processor); - if (runq->highq > processor->current_pri) { + if (first_timeslice(processor)) { + has_higher = (runq->highq > processor->current_pri); + } else { + has_higher = (runq->highq >= processor->current_pri); + } + if (has_higher) { if (runq->urgency > 0) return (AST_PREEMPT | AST_URGENT); @@ -3529,24 +3648,18 @@ csw_check( processor_t processor) { ast_t result = AST_NONE; + thread_t thread = processor->active_thread; if (first_timeslice(processor)) { if (rt_runq.count > 0) return (AST_PREEMPT | AST_URGENT); - - result |= SCHED(processor_csw_check)(processor); - if (result & AST_URGENT) - return result; } else { if (rt_runq.count > 0 && BASEPRI_RTQUEUES >= processor->current_pri) return (AST_PREEMPT | AST_URGENT); - - result |= SCHED(processor_csw_check)(processor); - if (result & AST_URGENT) - return result; } + result = SCHED(processor_csw_check)(processor); if (result != AST_NONE) return (result); @@ -3556,7 +3669,7 @@ csw_check( if (machine_processor_is_inactive(processor)) return (AST_PREEMPT); - if (processor->active_thread->state & TH_SUSP) + if (thread->state & TH_SUSP) return (AST_PREEMPT); return (AST_NONE); @@ -3911,8 +4024,9 @@ processor_idle( int state; (void)splsched(); - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_START, (uintptr_t)thread_tid(thread), 0, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_START, + (uintptr_t)thread_tid(thread), 0, 0, 0, 0); SCHED_STATS_CPU_IDLE_START(processor); @@ -3962,16 +4076,18 @@ processor_idle( thread_setrun(new_thread, SCHED_HEADQ); thread_unlock(new_thread); - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (uintptr_t)thread_tid(thread), state, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, + (uintptr_t)thread_tid(thread), state, 0, 0, 0); return (THREAD_NULL); } pset_unlock(pset); - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, + (uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0); return (new_thread); } @@ -4003,8 +4119,9 @@ processor_idle( thread_setrun(new_thread, SCHED_HEADQ); thread_unlock(new_thread); - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (uintptr_t)thread_tid(thread), state, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, + (uintptr_t)thread_tid(thread), state, 0, 0, 0); return (THREAD_NULL); } @@ -4012,8 +4129,9 @@ processor_idle( pset_unlock(pset); - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, (uintptr_t)thread_tid(thread), state, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END, + (uintptr_t)thread_tid(thread), state, 0, 0, 0); return (THREAD_NULL); } @@ -4087,15 +4205,14 @@ sched_startup(void) thread_deallocate(thread); /* - * Yield to the sched_init_thread while it times - * a series of context switches back. It stores - * the baseline value in sched_cswtime. 
+ * Yield to the sched_init_thread once, to + * initialize our own thread after being switched + * back to. * * The current thread is the only other thread * active at this point. */ - while (sched_cswtime == 0) - thread_block(THREAD_CONTINUE_NULL); + thread_block(THREAD_CONTINUE_NULL); } #if defined(CONFIG_SCHED_TRADITIONAL) @@ -4139,67 +4256,10 @@ sched_traditional_tick_continue(void) #endif /* CONFIG_SCHED_TRADITIONAL */ -static uint32_t -time_individual_cswitch(void) -{ - uint32_t switches = 0; - uint64_t newtime, starttime; - - /* Wait for absolute time to increase. */ - starttime = mach_absolute_time(); - do { - newtime = mach_absolute_time(); - } while (newtime == starttime); - - /* Measure one or more context switches until time increases again. - * This ensures we get non-zero timings even if absolute time - * increases very infrequently compared to CPU clock. */ - starttime = newtime; - do { - thread_block(THREAD_CONTINUE_NULL); - newtime = mach_absolute_time(); - ++switches; - } while (newtime == starttime); - /* Round up. */ - return (uint32_t) ((newtime - starttime + switches - 1) / switches); -} - -/* - * Time a series of context switches to determine - * a baseline. Toss the high and low and return - * the one-way value. - */ -static uint32_t -time_cswitch(void) -{ - uint32_t new, hi, low, accum; - int i, tries = 7, denom; - - accum = hi = low = 0; - for (i = 0; i < tries; ++i) { - new = time_individual_cswitch(); - - if (i == 0) - accum = hi = low = new; - else { - if (new < low) - low = new; - else - if (new > hi) - hi = new; - accum += new; - } - } - /* Round up. */ - denom = 2 * (tries - 2); - return (accum - hi - low + denom - 1) / denom; -} - void sched_init_thread(void (*continuation)(void)) { - sched_cswtime = time_cswitch(); - assert(sched_cswtime > 0); + thread_block(THREAD_CONTINUE_NULL); continuation(); @@ -4446,35 +4506,3 @@ thread_runnable( return ((thread->state & (TH_RUN|TH_WAIT)) == TH_RUN); } #endif /* DEBUG */ - -#if MACH_KDB -#include -#define printf kdbprintf -void db_sched(void); - -void -db_sched(void) -{ - iprintf("Scheduling Statistics:\n"); - db_indent += 2; - iprintf("Thread invocations: csw %d same %d\n", - c_thread_invoke_csw, c_thread_invoke_same); -#if MACH_COUNTERS - iprintf("Thread block: calls %d\n", - c_thread_block_calls); - iprintf("Idle thread:\n\thandoff %d block %d\n", - c_idle_thread_handoff, - c_idle_thread_block); - iprintf("Sched thread blocks: %d\n", c_sched_thread_block); -#endif /* MACH_COUNTERS */ - db_indent -= 2; -} - -#include -void db_show_thread_log(void); - -void -db_show_thread_log(void) -{ -} -#endif /* MACH_KDB */ diff --git a/osfmk/kern/sched_prim.h b/osfmk/kern/sched_prim.h index 0f89239ae..c22ba7efd 100644 --- a/osfmk/kern/sched_prim.h +++ b/osfmk/kern/sched_prim.h @@ -93,7 +93,8 @@ extern void thread_unstop( /* Wait for a thread to stop running */ extern void thread_wait( - thread_t thread); + thread_t thread, + boolean_t until_not_runnable); /* Unblock thread on wake up */ extern boolean_t thread_unblock( @@ -385,23 +386,22 @@ extern kern_return_t thread_wakeup_prim( boolean_t one_thread, wait_result_t result); -#ifdef MACH_KERNEL_PRIVATE -extern kern_return_t thread_wakeup_prim_internal( - event_t event, +extern kern_return_t thread_wakeup_prim_internal( + event_t event, boolean_t one_thread, wait_result_t result, int priority); -#endif + #define thread_wakeup(x) \ - thread_wakeup_prim((x), FALSE, THREAD_AWAKENED) + thread_wakeup_prim((x), FALSE, THREAD_AWAKENED) #define thread_wakeup_with_result(x, z) \ - 
thread_wakeup_prim((x), FALSE, (z)) + thread_wakeup_prim((x), FALSE, (z)) #define thread_wakeup_one(x) \ - thread_wakeup_prim((x), TRUE, THREAD_AWAKENED) + thread_wakeup_prim((x), TRUE, THREAD_AWAKENED) #ifdef MACH_KERNEL_PRIVATE -#define thread_wakeup_one_with_pri(x, pri) \ +#define thread_wakeup_one_with_pri(x, pri) \ thread_wakeup_prim_internal((x), TRUE, THREAD_AWAKENED, pri) #endif diff --git a/osfmk/kern/security.c b/osfmk/kern/security.c index b638479e9..948803887 100644 --- a/osfmk/kern/security.c +++ b/osfmk/kern/security.c @@ -82,9 +82,7 @@ mach_get_task_label( kr = ipc_object_copyout(space, (ipc_object_t) lh->lh_port, MACH_MSG_TYPE_PORT_SEND, 0, outlabel); if (kr != KERN_SUCCESS) { - ip_lock(lh->lh_port); ip_release(lh->lh_port); - ip_check_unlock(lh->lh_port); *outlabel = MACH_PORT_NULL; } @@ -236,14 +234,16 @@ mac_port_check_service_obj( return kr; } - dead = ipc_right_check(space, (ipc_port_t) entry->ie_object, obj, entry); + objp = entry->ie_object; + port = (ipc_port_t)objp; + dead = ipc_right_check(space, port, obj, entry); if (dead) { is_write_unlock(space); + ip_release(port); mac_task_label_destroy(&subjl); return KERN_INVALID_RIGHT; } - objp = entry->ie_object; io_lock (objp); is_write_unlock (space); diff --git a/osfmk/kern/stack.c b/osfmk/kern/stack.c index 6b5ea8302..9906b8b3a 100644 --- a/osfmk/kern/stack.c +++ b/osfmk/kern/stack.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -85,7 +86,8 @@ STACK_ZINFO_PALLOC(thread_t thread) task_t task; zinfo_usage_t zinfo; - thread->tkm_private.alloc += kernel_stack_size; + ledger_credit(thread->t_ledger, task_ledgers.tkm_private, kernel_stack_size); + if (stack_fake_zone_index != -1 && (task = thread->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) OSAddAtomic64(kernel_stack_size, @@ -98,7 +100,8 @@ STACK_ZINFO_PFREE(thread_t thread) task_t task; zinfo_usage_t zinfo; - thread->tkm_private.free += kernel_stack_size; + ledger_debit(thread->t_ledger, task_ledgers.tkm_private, kernel_stack_size); + if (stack_fake_zone_index != -1 && (task = thread->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) OSAddAtomic64(kernel_stack_size, @@ -108,8 +111,9 @@ STACK_ZINFO_PFREE(thread_t thread) static inline void STACK_ZINFO_HANDOFF(thread_t from, thread_t to) { - from->tkm_private.free += kernel_stack_size; - to->tkm_private.alloc += kernel_stack_size; + ledger_debit(from->t_ledger, task_ledgers.tkm_private, kernel_stack_size); + ledger_credit(to->t_ledger, task_ledgers.tkm_private, kernel_stack_size); + if (stack_fake_zone_index != -1) { task_t task; zinfo_usage_t zinfo; @@ -213,7 +217,7 @@ stack_alloc_internal(void) if (kernel_memory_allocate(kernel_map, &stack, kernel_stack_size + (2*PAGE_SIZE), stack_addr_mask, - KMA_KOBJECT | guard_flags) + KMA_KSTACK | KMA_KOBJECT | guard_flags) != KERN_SUCCESS) panic("stack_alloc: kernel_memory_allocate"); diff --git a/osfmk/kern/startup.c b/osfmk/kern/startup.c index cb31f783c..b20629ffa 100644 --- a/osfmk/kern/startup.c +++ b/osfmk/kern/startup.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -225,13 +225,11 @@ kernel_bootstrap(void) kernel_bootstrap_kprintf("calling clock_init\n"); clock_init(); + ledger_init(); /* * Initialize the IPC, task, and thread subsystems. 
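*/

The stack.c hunks above replace the per-thread tkm_private.alloc/free counters with ledger_credit()/ledger_debit() against task_ledgers.tkm_private, so a kernel-stack handoff debits the old owner and credits the new one by the same amount. A freestanding model of why that pairing keeps per-task balances consistent (ledger_model and KERNEL_STACK_SIZE here are illustrative, not the kernel's definitions):

	#include <assert.h>
	#include <stdint.h>

	struct ledger_model {
		int64_t	credit;
		int64_t	debit;
	};

	static void lcredit(struct ledger_model *l, int64_t a) { l->credit += a; }
	static void ldebit (struct ledger_model *l, int64_t a) { l->debit  += a; }

	int
	main(void)
	{
		enum { KERNEL_STACK_SIZE = 16 * 1024 };	/* illustrative size */
		struct ledger_model from = { 0, 0 }, to = { 0, 0 };

		/* Mirrors STACK_ZINFO_HANDOFF(from, to). */
		ldebit(&from, KERNEL_STACK_SIZE);
		lcredit(&to, KERNEL_STACK_SIZE);

		/* The stack is now accounted to exactly one owner. */
		assert((from.credit - from.debit) + (to.credit - to.debit) == 0);
		assert(to.credit - to.debit == KERNEL_STACK_SIZE);
		return 0;
	}

/*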
*/ - kernel_bootstrap_kprintf("calling ledger_init\n"); - ledger_init(); - kernel_bootstrap_kprintf("calling task_init\n"); task_init(); @@ -249,13 +247,6 @@ kernel_bootstrap(void) thread->state = TH_RUN; thread_deallocate(thread); - /* transfer statistics from init thread to kernel */ - thread_t init_thread = current_thread(); - kernel_task->tkm_private.alloc = init_thread->tkm_private.alloc; - kernel_task->tkm_private.free = init_thread->tkm_private.free; - kernel_task->tkm_shared.alloc = init_thread->tkm_shared.alloc; - kernel_task->tkm_shared.free = init_thread->tkm_shared.free; - kernel_bootstrap_kprintf("calling load_context - done\n"); load_context(thread); /*NOTREACHED*/ @@ -263,6 +254,8 @@ kernel_bootstrap(void) int kth_started = 0; +vm_offset_t vm_kernel_addrperm; + /* * Now running in a thread. Kick off other services, * invoke user bootstrap, enter pageout loop. @@ -383,11 +376,21 @@ kernel_bootstrap_thread(void) */ vm_shared_region_init(); vm_commpage_init(); + vm_commpage_text_init(); #if CONFIG_MACF mac_policy_initmach(); #endif + /* + * Initialize the global used for permuting kernel + * addresses that may be exported to userland as tokens + * using VM_KERNEL_ADDRPERM(). Force the random number + * to be odd to avoid mapping a non-zero + * word-aligned address to zero via addition. + */ + vm_kernel_addrperm = (vm_offset_t)early_random() | 1; + /* * Start the user bootstrap. */ @@ -496,7 +499,7 @@ load_context( * should never occur since the thread is expected * to have reserved stack. */ - load_context_kprintf("stack %x, stackptr %x\n", + load_context_kprintf("thread %p, stack %x, stackptr %x\n", thread, thread->kernel_stack, thread->machine.kstackptr); if (!thread->kernel_stack) { load_context_kprintf("calling stack_alloc_try\n"); @@ -560,7 +563,6 @@ scale_setup() bsd_scale_setup(scale); ipc_space_max = SPACE_MAX; - ipc_tree_entry_max = ITE_MAX; ipc_port_max = PORT_MAX; ipc_pset_max = SET_MAX; semaphore_max = SEMAPHORE_MAX; diff --git a/osfmk/kern/sync_lock.c b/osfmk/kern/sync_lock.c index b69958ad7..5a06b28f8 100644 --- a/osfmk/kern/sync_lock.c +++ b/osfmk/kern/sync_lock.c @@ -51,6 +51,7 @@ #include #include +#include /* * Ulock ownership MACROS @@ -838,9 +839,7 @@ lock_handoff_accept (lock_set_t lock_set, int lock_id) void lock_set_reference(lock_set_t lock_set) { - lock_set_lock(lock_set); - lock_set->ref_count++; - lock_set_unlock(lock_set); + OSIncrementAtomic(&((lock_set)->ref_count)); } /* @@ -852,14 +851,9 @@ lock_set_reference(lock_set_t lock_set) void lock_set_dereference(lock_set_t lock_set) { - int ref_count; int size; - lock_set_lock(lock_set); - ref_count = --(lock_set->ref_count); - lock_set_unlock(lock_set); - - if (ref_count == 0) { + if (1 == OSDecrementAtomic(&((lock_set)->ref_count))) { ipc_port_dealloc_kernel(lock_set->port); size = (int)(sizeof(struct lock_set) + (sizeof(struct ulock) * (lock_set->n_ulocks - 1))); diff --git a/osfmk/kern/sync_sema.c b/osfmk/kern/sync_sema.c index 80ffb8199..687387b3c 100644 --- a/osfmk/kern/sync_sema.c +++ b/osfmk/kern/sync_sema.c @@ -58,6 +58,8 @@ #include #include +#include + static unsigned int semaphore_event; #define SEMAPHORE_EVENT CAST_EVENT64_T(&semaphore_event) @@ -179,7 +181,12 @@ semaphore_create( } s->count = value; - s->ref_count = (task == kernel_task) ? 1 : 2; + + /* + * One reference for caller, one for port, and one for owner + * task (if not the kernel itself). + */ + s->ref_count = (task == kernel_task) ? 
2 : 3; /* * Create and initialize the semaphore port @@ -1060,9 +1067,21 @@ semaphore_dereference( if (semaphore != NULL) { ref_count = hw_atomic_sub(&semaphore->ref_count, 1); + if (ref_count == 1) { + ipc_port_t port = semaphore->port; + + if (IP_VALID(port) && + OSCompareAndSwapPtr(port, IP_NULL, &semaphore->port)) { + /* + * We get to disassociate the port from the sema and + * drop the port's reference on the sema. + */ + ipc_port_dealloc_kernel(port); + ref_count = hw_atomic_sub(&semaphore->ref_count, 1); + } + } if (ref_count == 0) { assert(wait_queue_empty(&semaphore->wait_queue)); - ipc_port_dealloc_kernel(semaphore->port); zfree(semaphore_zone, semaphore); } } diff --git a/osfmk/kern/syscall_subr.c b/osfmk/kern/syscall_subr.c index 3b1f0edaf..89fc63b1b 100644 --- a/osfmk/kern/syscall_subr.c +++ b/osfmk/kern/syscall_subr.c @@ -254,7 +254,7 @@ thread_switch( ip_unlock(port); thread = convert_port_to_thread(port); - ipc_port_release(port); + ip_release(port); if (thread == self) { (void)thread_deallocate_internal(thread); diff --git a/osfmk/kern/syscall_sw.c b/osfmk/kern/syscall_sw.c index 7dc2d61fd..9abedea98 100644 --- a/osfmk/kern/syscall_sw.c +++ b/osfmk/kern/syscall_sw.c @@ -60,6 +60,7 @@ #include #include +#include /* Forwards */ @@ -78,6 +79,12 @@ * * WARNING: Don't use numbers 0 through -9. They (along with * the positive numbers) are reserved for Unix. + * + * WARNING: The 'arg_count' parameter in the list below is poorly named. + * It doesn't refer to the number of arguments the trap takes - + * it actually refers to the number of 32-bit words that need + * to be copied in from userspace. The munging of words to trap + * arguments is done in mach_call_munger(). */ int kern_invalid_debug = 0; @@ -91,7 +98,7 @@ int kern_invalid_debug = 0; #include #include -mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT] = { +const mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT] = { /* 0 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), /* 1 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), /* 2 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), @@ -102,20 +109,20 @@ mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT] = { /* 7 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), /* 8 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), /* 9 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), -/* 10 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), +/* 10 */ MACH_TRAP(_kernelrpc_mach_vm_allocate_trap, 5, munge_wwlw, munge_dddd), /* 11 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), -/* 12 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), +/* 12 */ MACH_TRAP(_kernelrpc_mach_vm_deallocate_trap, 5, munge_wll, munge_ddd), /* 13 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), -/* 14 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), +/* 14 */ MACH_TRAP(_kernelrpc_mach_vm_protect_trap, 7, munge_wllww, munge_ddddd), /* 15 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), -/* 16 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), -/* 17 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), -/* 18 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), -/* 19 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), -/* 20 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), -/* 21 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), -/* 22 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), -/* 23 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), +/* 16 */ MACH_TRAP(_kernelrpc_mach_port_allocate_trap, 3, munge_www, munge_ddd), +/* 17 */ MACH_TRAP(_kernelrpc_mach_port_destroy_trap, 2, munge_ww, munge_dd), +/* 18 */ MACH_TRAP(_kernelrpc_mach_port_deallocate_trap, 2, munge_ww, munge_dd), +/* 19 */ MACH_TRAP(_kernelrpc_mach_port_mod_refs_trap, 4, munge_wwww, 
munge_dddd), +/* 20 */ MACH_TRAP(_kernelrpc_mach_port_move_member_trap, 3, munge_www, munge_ddd), +/* 21 */ MACH_TRAP(_kernelrpc_mach_port_insert_right_trap, 4, munge_wwww, munge_dddd), +/* 22 */ MACH_TRAP(_kernelrpc_mach_port_insert_member_trap, 3, munge_www, munge_ddd), +/* 23 */ MACH_TRAP(_kernelrpc_mach_port_extract_member_trap, 3, munge_www, munge_ddd), /* 24 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), /* 25 */ MACH_TRAP(kern_invalid, 0, NULL, NULL), /* 26 */ MACH_TRAP(mach_reply_port, 0, NULL, NULL), @@ -241,20 +248,20 @@ const char * mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = { /* 7 */ "kern_invalid", /* 8 */ "kern_invalid", /* 9 */ "kern_invalid", -/* 10 */ "kern_invalid", +/* 10 */ "_kernelrpc_mach_vm_allocate_trap", /* 11 */ "kern_invalid", -/* 12 */ "kern_invalid", +/* 12 */ "_kernelrpc_mach_vm_deallocate_trap", /* 13 */ "kern_invalid", -/* 14 */ "kern_invalid", +/* 14 */ "_kernelrpc_mach_vm_protect_trap", /* 15 */ "kern_invalid", -/* 16 */ "kern_invalid", -/* 17 */ "kern_invalid", -/* 18 */ "kern_invalid", -/* 19 */ "kern_invalid", -/* 20 */ "kern_invalid", -/* 21 */ "kern_invalid", -/* 22 */ "kern_invalid", -/* 23 */ "kern_invalid", +/* 16 */ "_kernelrpc_mach_port_allocate_trap", +/* 17 */ "_kernelrpc_mach_port_destroy_trap", +/* 18 */ "_kernelrpc_mach_port_deallocate_trap", +/* 19 */ "_kernelrpc_mach_port_mod_refs_trap", +/* 20 */ "_kernelrpc_mach_port_move_member_trap", +/* 21 */ "_kernelrpc_mach_port_insert_right_trap", +/* 22 */ "_kernelrpc_mach_port_insert_member_trap", +/* 23 */ "_kernelrpc_mach_port_extract_member_trap", /* 24 */ "kern_invalid", /* 25 */ "kern_invalid", /* 26 */ "mach_reply_port", diff --git a/osfmk/kern/syscall_sw.h b/osfmk/kern/syscall_sw.h index d186546d5..879b9cd5c 100644 --- a/osfmk/kern/syscall_sw.h +++ b/osfmk/kern/syscall_sw.h @@ -70,11 +70,7 @@ typedef void mach_munge_t(const void *, void *); typedef struct { int mach_trap_arg_count; - int (*mach_trap_function)(void); -#if 0 /* no active architectures use mungers for mach traps */ - mach_munge_t *mach_trap_arg_munge32; /* system call arguments for 32-bit */ - mach_munge_t *mach_trap_arg_munge64; /* system call arguments for 64-bit */ -#endif + kern_return_t (*mach_trap_function)(void *); #if MACH_ASSERT const char* mach_trap_name; #endif /* MACH_ASSERT */ @@ -83,16 +79,16 @@ typedef struct { #define MACH_TRAP_TABLE_COUNT 128 -extern mach_trap_t mach_trap_table[]; +extern const mach_trap_t mach_trap_table[]; extern int mach_trap_count; #if defined(__i386__) || defined(__x86_64__) #if !MACH_ASSERT #define MACH_TRAP(name, arg_count, munge32, munge64) \ - { (arg_count), (int (*)(void)) (name) } + { (arg_count), (kern_return_t (*)(void *)) (name) } #else #define MACH_TRAP(name, arg_count, munge32, munge64) \ - { (arg_count), (int (*)(void)) (name), #name } + { (arg_count), (kern_return_t (*)(void *)) (name), #name } #endif /* !MACH_ASSERT */ #else /* !defined(__i386__) && !defined(__x86_64__) && !defined(__arm__) */ #error Unsupported architecture diff --git a/osfmk/kern/task.c b/osfmk/kern/task.c index 985f3c144..96a00ff5a 100644 --- a/osfmk/kern/task.c +++ b/osfmk/kern/task.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -86,7 +86,6 @@ * Copyright (c) 2005 SPARTA, Inc. 
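*/

Traps 10-23 above give the most common mach_vm_* and mach_port_* operations a direct trap path, so userland can reach them without building a full MIG message. Calling code does not change; a routine like the one below behaves identically whether libsystem_kernel routes it through _kernelrpc_mach_vm_allocate_trap or through the MIG RPC (which path is taken on a given build is an implementation detail, assumed here):

	#include <mach/mach.h>
	#include <mach/mach_error.h>
	#include <mach/mach_vm.h>
	#include <stdio.h>

	int
	main(void)
	{
		mach_vm_address_t addr = 0;
		kern_return_t kr;

		kr = mach_vm_allocate(mach_task_self(), &addr, vm_page_size,
		    VM_FLAGS_ANYWHERE);
		if (kr != KERN_SUCCESS) {
			fprintf(stderr, "mach_vm_allocate: %s\n",
			    mach_error_string(kr));
			return 1;
		}

		kr = mach_vm_deallocate(mach_task_self(), addr, vm_page_size);
		return kr == KERN_SUCCESS ? 0 : 1;
	}

Note the arg_count warning above: trap 10 is listed with an arg_count of 5 because the 64-bit mach_vm_size_t argument occupies two 32-bit words under munge_wwlw, not because the call takes five parameters.

/*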
*/ -#include #include #include @@ -113,7 +112,6 @@ #include #include /* for thread_wakeup */ #include -#include #include #include #include @@ -127,10 +125,6 @@ #include #include -#if MACH_KDB -#include -#endif /* MACH_KDB */ - /* * Exported interfaces */ @@ -156,10 +150,18 @@ zone_t task_zone; lck_attr_t task_lck_attr; lck_grp_t task_lck_grp; lck_grp_attr_t task_lck_grp_attr; +#if CONFIG_EMBEDDED +lck_mtx_t task_watch_mtx; +#endif /* CONFIG_EMBEDDED */ zinfo_usage_store_t tasks_tkm_private; zinfo_usage_store_t tasks_tkm_shared; +static ledger_template_t task_ledger_template = NULL; +struct _task_ledger_indices task_ledgers = {-1, -1, -1, -1, -1}; +void init_task_ledgers(void); + + int task_max = CONFIG_TASK_MAX; /* Max number of tasks */ /* externs for BSD kernel */ @@ -170,7 +172,8 @@ extern void proc_getexecutableuuid(void *, unsigned char *, unsigned long); void task_hold_locked( task_t task); void task_wait_locked( - task_t task); + task_t task, + boolean_t until_not_runnable); void task_release_locked( task_t task); void task_free( @@ -178,11 +181,6 @@ void task_free( void task_synchronizer_destroy_all( task_t task); -kern_return_t task_set_ledger( - task_t task, - ledger_t wired, - ledger_t paged); - int check_for_tasksuspend( task_t task); @@ -268,6 +266,9 @@ task_init(void) lck_grp_init(&task_lck_grp, "task", &task_lck_grp_attr); lck_attr_setdefault(&task_lck_attr); lck_mtx_init(&tasks_threads_lock, &task_lck_grp, &task_lck_attr); +#if CONFIG_EMBEDDED + lck_mtx_init(&task_watch_mtx, &task_lck_grp, &task_lck_attr); +#endif /* CONFIG_EMBEDDED */ task_zone = zinit( sizeof(struct task), @@ -277,6 +278,8 @@ task_init(void) zone_change(task_zone, Z_NOENCRYPT, TRUE); + init_task_ledgers(); + /* * Create the kernel task as the first task. */ @@ -289,6 +292,7 @@ task_init(void) vm_map_deallocate(kernel_task->map); kernel_task->map = kernel_map; + } /* @@ -347,6 +351,36 @@ host_security_create_task_token( return(KERN_FAILURE); } +void +init_task_ledgers(void) +{ + ledger_template_t t; + + assert(task_ledger_template == NULL); + assert(kernel_task == TASK_NULL); + + if ((t = ledger_template_create("Per-task ledger")) == NULL) + panic("couldn't create task ledger template"); + + task_ledgers.cpu_time = ledger_entry_add(t, "cpu_time", "sched", "ns"); + task_ledgers.tkm_private = ledger_entry_add(t, "tkm_private", + "physmem", "bytes"); + task_ledgers.tkm_shared = ledger_entry_add(t, "tkm_shared", "physmem", + "bytes"); + task_ledgers.phys_mem = ledger_entry_add(t, "phys_mem", "physmem", + "bytes"); + task_ledgers.wired_mem = ledger_entry_add(t, "wired_mem", "physmem", + "bytes"); + + if ((task_ledgers.cpu_time < 0) || (task_ledgers.tkm_private < 0) || + (task_ledgers.tkm_shared < 0) || (task_ledgers.phys_mem < 0) || + (task_ledgers.wired_mem < 0)) { + panic("couldn't create entries for task ledger template"); + } + + task_ledger_template = t; +} + kern_return_t task_create_internal( task_t parent_task, @@ -356,6 +390,7 @@ task_create_internal( { task_t new_task; vm_shared_region_t shared_region; + ledger_t ledger = NULL; new_task = (task_t) zalloc(task_zone); @@ -365,13 +400,22 @@ task_create_internal( /* one ref for just being alive; one for our caller */ new_task->ref_count = 2; + /* allocate with active entries */ + assert(task_ledger_template != NULL); + if ((ledger = ledger_instantiate(task_ledger_template, + LEDGER_CREATE_ACTIVE_ENTRIES)) == NULL) { + zfree(task_zone, new_task); + return(KERN_RESOURCE_SHORTAGE); + } + new_task->ledger = ledger; + /* if inherit_memory is true, parent_task 
MUST not be NULL */ if (inherit_memory) - new_task->map = vm_map_fork(parent_task->map); + new_task->map = vm_map_fork(ledger, parent_task->map); else - new_task->map = vm_map_create(pmap_create(0, is_64bit), - (vm_map_offset_t)(VM_MIN_ADDRESS), - (vm_map_offset_t)(VM_MAX_ADDRESS), TRUE); + new_task->map = vm_map_create(pmap_create(ledger, 0, is_64bit), + (vm_map_offset_t)(VM_MIN_ADDRESS), + (vm_map_offset_t)(VM_MAX_ADDRESS), TRUE); /* Inherit memlock limit from parent */ if (parent_task) @@ -399,11 +443,6 @@ task_create_internal( new_task->taskFeatures[0] = 0; /* Init task features */ new_task->taskFeatures[1] = 0; /* Init task features */ - new_task->tkm_private.alloc = 0; - new_task->tkm_private.free = 0; - new_task->tkm_shared.alloc = 0; - new_task->tkm_shared.free = 0; - zinfo_task_init(new_task); #ifdef MACH_BSD @@ -441,6 +480,21 @@ task_create_internal( new_task->t_chud = 0U; #endif + new_task->pidsuspended = FALSE; + new_task->frozen = FALSE; + new_task->rusage_cpu_flags = 0; + new_task->rusage_cpu_percentage = 0; + new_task->rusage_cpu_interval = 0; + new_task->rusage_cpu_deadline = 0; + new_task->rusage_cpu_callt = NULL; + new_task->proc_terminate = 0; +#if CONFIG_EMBEDDED + queue_init(&new_task->task_watchers); + new_task->appstate = TASK_APPSTATE_ACTIVE; + new_task->num_taskwatchers = 0; + new_task->watchapplying = 0; +#endif /* CONFIG_EMBEDDED */ + if (parent_task != TASK_NULL) { new_task->sec_token = parent_task->sec_token; new_task->audit_token = parent_task->audit_token; @@ -449,10 +503,6 @@ task_create_internal( shared_region = vm_shared_region_get(parent_task); vm_shared_region_set(new_task, shared_region); - new_task->wired_ledger_port = ledger_copy( - convert_port_to_ledger(parent_task->wired_ledger_port)); - new_task->paged_ledger_port = ledger_copy( - convert_port_to_ledger(parent_task->paged_ledger_port)); if(task_has_64BitAddr(parent_task)) task_set_64BitAddr(new_task); new_task->all_image_info_addr = parent_task->all_image_info_addr; @@ -468,20 +518,18 @@ task_create_internal( new_task->pset_hint = parent_task->pset_hint = task_choose_pset(parent_task); new_task->policystate = parent_task->policystate; /* inherit the self action state */ - new_task->actionstate = parent_task->actionstate; + new_task->appliedstate = parent_task->appliedstate; new_task->ext_policystate = parent_task->ext_policystate; #if NOTYET /* till the child lifecycle is cleared do not inherit external action */ - new_task->ext_actionstate = parent_task->ext_actionstate; + new_task->ext_appliedstate = parent_task->ext_appliedstate; #else - new_task->ext_actionstate = default_task_null_policy; + new_task->ext_appliedstate = default_task_null_policy; #endif } else { new_task->sec_token = KERNEL_SECURITY_TOKEN; new_task->audit_token = KERNEL_AUDIT_TOKEN; - new_task->wired_ledger_port = ledger_copy(root_wired_ledger); - new_task->paged_ledger_port = ledger_copy(root_paged_ledger); #ifdef __LP64__ if(is_64bit) task_set_64BitAddr(new_task); @@ -492,8 +540,8 @@ task_create_internal( new_task->pset_hint = PROCESSOR_SET_NULL; new_task->policystate = default_task_proc_policy; new_task->ext_policystate = default_task_proc_policy; - new_task->actionstate = default_task_null_policy; - new_task->ext_actionstate = default_task_null_policy; + new_task->appliedstate = default_task_null_policy; + new_task->ext_appliedstate = default_task_null_policy; } if (kernel_task == TASK_NULL) { @@ -530,6 +578,8 @@ void task_deallocate( task_t task) { + ledger_amount_t credit, debit; + if (task == TASK_NULL) return; @@ -540,6 
+590,13 @@ task_deallocate( queue_remove(&terminated_tasks, task, task_t, tasks); lck_mtx_unlock(&tasks_threads_lock); + /* + * Give the machine dependent code a chance + * to perform cleanup before ripping apart + * the task. + */ + machine_task_terminate(task); + ipc_task_terminate(task); if (task->affinity_space) @@ -553,10 +610,18 @@ task_deallocate( #if CONFIG_MACF_MACH labelh_release(task->label); #endif - OSAddAtomic64(task->tkm_private.alloc, (int64_t *)&tasks_tkm_private.alloc); - OSAddAtomic64(task->tkm_private.free, (int64_t *)&tasks_tkm_private.free); - OSAddAtomic64(task->tkm_shared.alloc, (int64_t *)&tasks_tkm_shared.alloc); - OSAddAtomic64(task->tkm_shared.free, (int64_t *)&tasks_tkm_shared.free); + + if (!ledger_get_entries(task->ledger, task_ledgers.tkm_private, &credit, + &debit)) { + OSAddAtomic64(credit, (int64_t *)&tasks_tkm_private.alloc); + OSAddAtomic64(debit, (int64_t *)&tasks_tkm_private.free); + } + if (!ledger_get_entries(task->ledger, task_ledgers.tkm_shared, &credit, + &debit)) { + OSAddAtomic64(credit, (int64_t *)&tasks_tkm_shared.alloc); + OSAddAtomic64(debit, (int64_t *)&tasks_tkm_shared.free); + } + ledger_dereference(task->ledger); zinfo_task_free(task); zfree(task_zone, task); } @@ -665,15 +730,14 @@ task_terminate_internal( thread_terminate_internal(thread); } + task_unlock(task); + +#if CONFIG_EMBEDDED /* - * Give the machine dependent code a chance - * to perform cleanup before ripping apart - * the task. + * remove all task watchers */ - if (self_task == task) - machine_thread_terminate_self(); - - task_unlock(task); + task_removewatchers(task); +#endif /* CONFIG_EMBEDDED */ /* * Destroy all synchronizers owned by the task. @@ -683,7 +747,7 @@ task_terminate_internal( /* * Destroy the IPC space, leaving just a reference for it. */ - ipc_space_destroy(task->itk_space); + ipc_space_terminate(task->itk_space); if (vm_map_has_4GB_pagezero(task->map)) vm_map_clear_4GB_pagezero(task->map); @@ -800,20 +864,10 @@ task_complete_halt(task_t task) assert(task->halting); assert(task == current_task()); - /* - * Give the machine dependent code a chance - * to perform cleanup of task-level resources - * associated with the current thread before - * ripping apart the task. - * - * This must be done with the task locked. - */ - machine_thread_terminate_self(); - /* * Wait for the other threads to get shut down. * When the last other thread is reaped, we'll be - * worken up. + * woken up. */ if (task->thread_count > 1) { assert_wait((event_t)&task->halting, THREAD_UNINT); @@ -823,6 +877,14 @@ task_complete_halt(task_t task) task_unlock(task); } + /* + * Give the machine dependent code a chance + * to perform cleanup of task-level resources + * associated with the current thread before + * ripping apart the task. + */ + machine_task_terminate(task); + /* * Destroy all synchronizers owned by the task. 
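*/

init_task_ledgers() above is the template pattern for the new ledger KPI: build one ledger_template_t at boot, add named entries to it, then stamp a ledger out of the template for every task in task_create_internal() and bill against it (as thread_quantum_expire() and thread_dispatch() do for cpu_time). A sketch of a subsystem following the same pattern, using only the calls this patch itself introduces; it compiles in-kernel only, and "io_bytes" plus the example_* names are hypothetical:

	#include <kern/debug.h>		/* panic() */
	#include <kern/ledger.h>	/* in-kernel ledger KPI, as used in this patch */

	static ledger_template_t example_template;
	static int example_io = -1;	/* entry index, -1 until initialized */

	static void
	example_ledgers_init(void)
	{
		/* One template at init time, mirroring init_task_ledgers(). */
		if ((example_template = ledger_template_create("Example ledger")) == NULL)
			panic("couldn't create example ledger template");

		example_io = ledger_entry_add(example_template, "io_bytes",
		    "io", "bytes");
		if (example_io < 0)
			panic("couldn't create entries for example ledger template");
	}

	static ledger_t
	example_ledger_new(void)
	{
		/* Entries active from birth, as task_create_internal() requests. */
		return ledger_instantiate(example_template,
		    LEDGER_CREATE_ACTIVE_ENTRIES);
	}

	static void
	example_bill(ledger_t l, int64_t bytes)
	{
		ledger_credit(l, example_io, bytes);
	}

/*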
*/ @@ -906,6 +968,28 @@ task_hold( return (KERN_SUCCESS); } +kern_return_t +task_wait( + task_t task, + boolean_t until_not_runnable) +{ + if (task == TASK_NULL) + return (KERN_INVALID_ARGUMENT); + + task_lock(task); + + if (!task->active) { + task_unlock(task); + + return (KERN_FAILURE); + } + + task_wait_locked(task, until_not_runnable); + task_unlock(task); + + return (KERN_SUCCESS); +} + /* * task_wait_locked: * @@ -916,7 +1000,8 @@ task_hold( */ void task_wait_locked( - register task_t task) + register task_t task, + boolean_t until_not_runnable) { register thread_t thread, self; @@ -932,7 +1017,7 @@ task_wait_locked( */ queue_iterate(&task->threads, thread, thread_t, task_threads) { if (thread != self) - thread_wait(thread); + thread_wait(thread, until_not_runnable); } } @@ -1100,26 +1185,11 @@ task_threads( return (KERN_SUCCESS); } -/* - * task_suspend: - * - * Implement a user-level suspension on a task. - * - * Conditions: - * The caller holds a reference to the task - */ -kern_return_t -task_suspend( +static kern_return_t +place_task_hold ( register task_t task) -{ - if (task == TASK_NULL || task == kernel_task) - return (KERN_INVALID_ARGUMENT); - - task_lock(task); - +{ if (!task->active) { - task_unlock(task); - return (KERN_FAILURE); } @@ -1128,8 +1198,6 @@ task_suspend( * If the stop count was positive, the task is * already stopped and we can exit. */ - task_unlock(task); - return (KERN_SUCCESS); } @@ -1140,11 +1208,71 @@ task_suspend( * to stop executing user code. */ task_hold_locked(task); - task_wait_locked(task); + task_wait_locked(task, TRUE); + + return (KERN_SUCCESS); +} + +static kern_return_t +release_task_hold ( + register task_t task, + boolean_t pidresume) +{ + register boolean_t release = FALSE; + + if (!task->active) { + return (KERN_FAILURE); + } + + if (pidresume) { + if (task->pidsuspended == FALSE) { + return (KERN_FAILURE); + } + task->pidsuspended = FALSE; + } + + if (task->user_stop_count > (task->pidsuspended ? 1 : 0)) { + if (--task->user_stop_count == 0) { + release = TRUE; + } + } + else { + return (KERN_FAILURE); + } + + /* + * Release the task if necessary. + */ + if (release) + task_release_locked(task); + + return (KERN_SUCCESS); +} + +/* + * task_suspend: + * + * Implement a user-level suspension on a task. + * + * Conditions: + * The caller holds a reference to the task + */ +kern_return_t +task_suspend( + register task_t task) +{ + kern_return_t kr; + + if (task == TASK_NULL || task == kernel_task) + return (KERN_INVALID_ARGUMENT); + + task_lock(task); + + kr = place_task_hold(task); task_unlock(task); - return (KERN_SUCCESS); + return (kr); } /* @@ -1158,39 +1286,107 @@ kern_return_t task_resume( register task_t task) { - register boolean_t release = FALSE; + kern_return_t kr; if (task == TASK_NULL || task == kernel_task) return (KERN_INVALID_ARGUMENT); task_lock(task); - if (!task->active) { - task_unlock(task); + kr = release_task_hold(task, FALSE); - return (KERN_FAILURE); - } + task_unlock(task); - if (task->user_stop_count > 0) { - if (--task->user_stop_count == 0) { - release = TRUE; - } + return (kr); +} + +kern_return_t +task_pidsuspend_locked(task_t task) +{ + kern_return_t kr; + + if (task->pidsuspended) { + kr = KERN_FAILURE; + goto out; } - else { - task_unlock(task); - return (KERN_FAILURE); + task->pidsuspended = TRUE; + + kr = place_task_hold(task); + if (kr != KERN_SUCCESS) { + task->pidsuspended = FALSE; } +out: + return(kr); +} - /* - * Release the task if necessary. 
- */ - if (release) - task_release_locked(task); + +/* + * task_pidsuspend: + * + * Suspends a task by placing a hold on its threads. + * + * Conditions: + * The caller holds a reference to the task + */ +kern_return_t +task_pidsuspend( + register task_t task) +{ + kern_return_t kr; + + if (task == TASK_NULL || task == kernel_task) + return (KERN_INVALID_ARGUMENT); + + task_lock(task); + + kr = task_pidsuspend_locked(task); task_unlock(task); - return (KERN_SUCCESS); + return (kr); +} + +/* If enabled, we bring all the frozen pages back in prior to resumption; otherwise, they're faulted back in on demand */ +#define THAW_ON_RESUME 1 + +/* + * task_pidresume: + * Resumes a previously suspended task. + * + * Conditions: + * The caller holds a reference to the task + */ +kern_return_t +task_pidresume( + register task_t task) +{ + kern_return_t kr; +#if (CONFIG_FREEZE && THAW_ON_RESUME) + boolean_t frozen; +#endif + + if (task == TASK_NULL || task == kernel_task) + return (KERN_INVALID_ARGUMENT); + + task_lock(task); + +#if (CONFIG_FREEZE && THAW_ON_RESUME) + frozen = task->frozen; + task->frozen = FALSE; +#endif + + kr = release_task_hold(task, TRUE); + + task_unlock(task); + +#if (CONFIG_FREEZE && THAW_ON_RESUME) + if ((kr == KERN_SUCCESS) && (frozen == TRUE)) { + kr = vm_map_thaw(task->map); + } +#endif + + return (kr); } #if CONFIG_FREEZE @@ -1198,7 +1394,7 @@ task_resume( /* * task_freeze: * - * Freeze a currently suspended task. + * Freeze a task. * * Conditions: * The caller holds a reference to the task @@ -1210,19 +1406,35 @@ task_freeze( uint32_t *wired_count, uint32_t *clean_count, uint32_t *dirty_count, + uint32_t dirty_budget, boolean_t *shared, boolean_t walk_only) { + kern_return_t kr; + if (task == TASK_NULL || task == kernel_task) return (KERN_INVALID_ARGUMENT); + task_lock(task); + + if (task->frozen) { + task_unlock(task); + return (KERN_FAILURE); + } + + if (walk_only == FALSE) { + task->frozen = TRUE; + } + + task_unlock(task); + if (walk_only) { - vm_map_freeze_walk(task->map, purgeable_count, wired_count, clean_count, dirty_count, shared); + kr = vm_map_freeze_walk(task->map, purgeable_count, wired_count, clean_count, dirty_count, dirty_budget, shared); } else { - vm_map_freeze(task->map, purgeable_count, wired_count, clean_count, dirty_count, shared); + kr = vm_map_freeze(task->map, purgeable_count, wired_count, clean_count, dirty_count, dirty_budget, shared); } - return (KERN_SUCCESS); + return (kr); } /* @@ -1237,12 +1449,25 @@ kern_return_t task_thaw( register task_t task) { + kern_return_t kr; + if (task == TASK_NULL || task == kernel_task) return (KERN_INVALID_ARGUMENT); - vm_map_thaw(task->map); + task_lock(task); + + if (!task->frozen) { + task_unlock(task); + return (KERN_FAILURE); + } + + task->frozen = FALSE; - return (KERN_SUCCESS); + task_unlock(task); + + kr = vm_map_thaw(task->map); + + return (kr); } #endif /* CONFIG_FREEZE */ @@ -1279,32 +1504,6 @@ host_security_set_task_token( return(kr); } -/* - * Utility routine to set a ledger - */ -kern_return_t -task_set_ledger( - task_t task, - ledger_t wired, - ledger_t paged) -{ - if (task == TASK_NULL) - return(KERN_INVALID_ARGUMENT); - - task_lock(task); - if (wired) { - ipc_port_release_send(task->wired_ledger_port); - task->wired_ledger_port = ledger_copy(wired); - } - if (paged) { - ipc_port_release_send(task->paged_ledger_port); - task->paged_ledger_port = ledger_copy(paged); - } - task_unlock(task); - - return(KERN_SUCCESS); -} - /* * This routine was added, pretty much exclusively, for registering the * 
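task_freeze()/task_thaw() now guard the task->frozen flag under the task lock, so a double freeze fails fast instead of walking the map twice. A kernel-side sketch against the signatures above (the dirty-budget value is an arbitrary placeholder):

static kern_return_t
freeze_then_thaw(task_t t)
{
	uint32_t purgeable, wired, clean, dirty;
	boolean_t shared;
	kern_return_t kr;

	kr = task_freeze(t, &purgeable, &wired, &clean, &dirty,
	    1000 /* hypothetical dirty_budget */, &shared,
	    FALSE /* walk_only == FALSE: actually freeze */);
	if (kr != KERN_SUCCESS)
		return (kr);		/* KERN_FAILURE if t->frozen was already set */

	return (task_thaw(t));		/* clears t->frozen, then vm_map_thaw() */
}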
RPC glue vector for in-kernel short circuited tasks. Rather than @@ -1435,6 +1634,51 @@ task_info( break; } + case MACH_TASK_BASIC_INFO: + { + mach_task_basic_info_t basic_info; + vm_map_t map; + clock_sec_t secs; + clock_usec_t usecs; + + if (*task_info_count < MACH_TASK_BASIC_INFO_COUNT) { + error = KERN_INVALID_ARGUMENT; + break; + } + + basic_info = (mach_task_basic_info_t)task_info_out; + + map = (task == kernel_task) ? kernel_map : task->map; + + basic_info->virtual_size = map->size; + + basic_info->resident_size = + (mach_vm_size_t)(pmap_resident_count(map->pmap)); + basic_info->resident_size *= PAGE_SIZE_64; + + basic_info->resident_size_max = + (mach_vm_size_t)(pmap_resident_max(map->pmap)); + basic_info->resident_size_max *= PAGE_SIZE_64; + + basic_info->policy = ((task != kernel_task) ? + POLICY_TIMESHARE : POLICY_RR); + + basic_info->suspend_count = task->user_stop_count; + + absolutetime_to_microtime(task->total_user_time, &secs, &usecs); + basic_info->user_time.seconds = + (typeof(basic_info->user_time.seconds))secs; + basic_info->user_time.microseconds = usecs; + + absolutetime_to_microtime(task->total_system_time, &secs, &usecs); + basic_info->system_time.seconds = + (typeof(basic_info->system_time.seconds))secs; + basic_info->system_time.microseconds = usecs; + + *task_info_count = MACH_TASK_BASIC_INFO_COUNT; + break; + } + case TASK_THREAD_TIMES_INFO: { register task_thread_times_info_t times_info; @@ -1485,14 +1729,27 @@ task_info( queue_iterate(&task->threads, thread, thread_t, task_threads) { uint64_t tval; + spl_t x; + + x = splsched(); + thread_lock(thread); tval = timer_grab(&thread->user_timer); info->threads_user += tval; info->total_user += tval; tval = timer_grab(&thread->system_timer); - info->threads_system += tval; - info->total_system += tval; + if (thread->precise_user_kernel_time) { + info->threads_system += tval; + info->total_system += tval; + } else { + /* system_timer may represent either sys or user */ + info->threads_user += tval; + info->total_user += tval; + } + + thread_unlock(thread); + splx(x); } @@ -1561,7 +1818,7 @@ task_info( case TASK_KERNELMEMORY_INFO: { task_kernelmemory_info_t tkm_info; - thread_t thread; + ledger_amount_t credit, debit; if (*task_info_count < TASK_KERNELMEMORY_INFO_COUNT) { error = KERN_INVALID_ARGUMENT; @@ -1569,6 +1826,10 @@ task_info( } tkm_info = (task_kernelmemory_info_t) task_info_out; + tkm_info->total_palloc = 0; + tkm_info->total_pfree = 0; + tkm_info->total_salloc = 0; + tkm_info->total_sfree = 0; if (task == kernel_task) { /* @@ -1581,41 +1842,37 @@ task_info( /* start by accounting for all the terminated tasks against the kernel */ tkm_info->total_palloc = tasks_tkm_private.alloc + tasks_tkm_shared.alloc; tkm_info->total_pfree = tasks_tkm_private.free + tasks_tkm_shared.free; - tkm_info->total_salloc = 0; - tkm_info->total_sfree = 0; /* count all other task/thread shared alloc/free against the kernel */ lck_mtx_lock(&tasks_threads_lock); + + /* XXX this really shouldn't be using the function parameter 'task' as a local var! 
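The MACH_TASK_BASIC_INFO flavor added above is reachable from user space through the stock task_info() call; a small self-inspection sketch:

#include <mach/mach.h>
#include <stdio.h>

int
main(void)
{
	struct mach_task_basic_info info;
	mach_msg_type_number_t count = MACH_TASK_BASIC_INFO_COUNT;

	if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO,
	    (task_info_t)&info, &count) != KERN_SUCCESS)
		return (1);

	/* resident_size_max is the field this flavor adds */
	printf("virtual %llu resident %llu max %llu suspend_count %d\n",
	    info.virtual_size, info.resident_size,
	    info.resident_size_max, info.suspend_count);
	return (0);
}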
*/ queue_iterate(&tasks, task, task_t, tasks) { if (task == kernel_task) { - tkm_info->total_palloc += task->tkm_private.alloc; - tkm_info->total_pfree += task->tkm_private.free; + if (ledger_get_entries(task->ledger, + task_ledgers.tkm_private, &credit, + &debit) == KERN_SUCCESS) { + tkm_info->total_palloc += credit; + tkm_info->total_pfree += debit; + } } - tkm_info->total_palloc += task->tkm_shared.alloc; - tkm_info->total_pfree += task->tkm_shared.free; - } - queue_iterate(&threads, thread, thread_t, threads) { - if (thread->task == kernel_task) { - tkm_info->total_palloc += thread->tkm_private.alloc; - tkm_info->total_pfree += thread->tkm_private.free; + if (!ledger_get_entries(task->ledger, + task_ledgers.tkm_shared, &credit, &debit)) { + tkm_info->total_palloc += credit; + tkm_info->total_pfree += debit; } - tkm_info->total_palloc += thread->tkm_shared.alloc; - tkm_info->total_pfree += thread->tkm_shared.free; } lck_mtx_unlock(&tasks_threads_lock); } else { - /* account for all the terminated threads in the process */ - tkm_info->total_palloc = task->tkm_private.alloc; - tkm_info->total_pfree = task->tkm_private.free; - tkm_info->total_salloc = task->tkm_shared.alloc; - tkm_info->total_sfree = task->tkm_shared.free; - - /* then add in all the running threads */ - queue_iterate(&task->threads, thread, thread_t, task_threads) { - tkm_info->total_palloc += thread->tkm_private.alloc; - tkm_info->total_pfree += thread->tkm_private.free; - tkm_info->total_salloc += thread->tkm_shared.alloc; - tkm_info->total_sfree += thread->tkm_shared.free; + if (!ledger_get_entries(task->ledger, + task_ledgers.tkm_private, &credit, &debit)) { + tkm_info->total_palloc = credit; + tkm_info->total_pfree = debit; + } + if (!ledger_get_entries(task->ledger, + task_ledgers.tkm_shared, &credit, &debit)) { + tkm_info->total_salloc = credit; + tkm_info->total_sfree = debit; } task_unlock(task); } @@ -1785,6 +2042,7 @@ task_vtimer_set( integer_t which) { thread_t thread; + spl_t x; /* assert(task == current_task()); */ /* bogus assert 4803227 4807483 */ @@ -1796,21 +2054,36 @@ task_vtimer_set( case TASK_VTIMER_USER: queue_iterate(&task->threads, thread, thread_t, task_threads) { - thread->vtimer_user_save = timer_grab(&thread->user_timer); + x = splsched(); + thread_lock(thread); + if (thread->precise_user_kernel_time) + thread->vtimer_user_save = timer_grab(&thread->user_timer); + else + thread->vtimer_user_save = timer_grab(&thread->system_timer); + thread_unlock(thread); + splx(x); } break; case TASK_VTIMER_PROF: queue_iterate(&task->threads, thread, thread_t, task_threads) { + x = splsched(); + thread_lock(thread); thread->vtimer_prof_save = timer_grab(&thread->user_timer); thread->vtimer_prof_save += timer_grab(&thread->system_timer); + thread_unlock(thread); + splx(x); } break; case TASK_VTIMER_RLIM: queue_iterate(&task->threads, thread, thread_t, task_threads) { + x = splsched(); + thread_lock(thread); thread->vtimer_rlim_save = timer_grab(&thread->user_timer); thread->vtimer_rlim_save += timer_grab(&thread->system_timer); + thread_unlock(thread); + splx(x); } break; } @@ -1853,8 +2126,13 @@ __unused switch (which) { case TASK_VTIMER_USER: - tdelt = (uint32_t)timer_delta(&thread->user_timer, + if (thread->precise_user_kernel_time) { + tdelt = (uint32_t)timer_delta(&thread->user_timer, + &thread->vtimer_user_save); + } else { + tdelt = (uint32_t)timer_delta(&thread->system_timer, &thread->vtimer_user_save); + } absolutetime_to_microtime(tdelt, &secs, microsecs); break; @@ -2137,9 +2415,9 @@ 
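All three TASK_VTIMER_* cases above use the same interrupt-disable/thread-lock bracket around timer_grab(), and the same precise_user_kernel_time fallback appears in the task_info() paths. Factored out, the sampling pattern reads as below (hypothetical helper):

static void
sample_thread_times(thread_t thread, uint64_t *user, uint64_t *system)
{
	spl_t x;

	x = splsched();
	thread_lock(thread);
	*user = timer_grab(&thread->user_timer);
	*system = timer_grab(&thread->system_timer);
	if (!thread->precise_user_kernel_time) {
		/* system_timer may represent either sys or user */
		*user += *system;
		*system = 0;
	}
	thread_unlock(thread);
	splx(x);
}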
task_findtid(task_t task, uint64_t tid) queue_iterate(&task->threads, thread, thread_t, task_threads) { if (thread->thread_id == tid) - break; + return(thread); } - return(thread); + return(THREAD_NULL); } diff --git a/osfmk/kern/task.h b/osfmk/kern/task.h index af0482aca..7d2c981ce 100644 --- a/osfmk/kern/task.h +++ b/osfmk/kern/task.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -155,39 +155,40 @@ The bit defns of the policy states /* Hardware disk access attributes, bit different as it should reflect IOPOL_XXX */ #define TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NONE 0x00 +#define TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS 0x01 #define TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL 0x01 #define TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_PASSIVE 0x02 #define TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE 0x03 -#define TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_DEFAULT TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL +#define TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_UTILITY 0x04 +#define TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_DEFAULT TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS -/* Hardware disk access attributes */ +/* Hardware GPU access attributes */ #define TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NONE 0x00 -#define TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NORMAL 0x00 #define TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_FULLACCESS 0x00 #define TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS 0x01 -#define TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_DEFAULT 0x00 +#define TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_DEFAULT TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_FULLACCESS /* Hardware Network access attributes */ #define TASK_POLICY_HWACCESS_NET_ATTRIBUTE_NONE 0x00 -#define TASK_POLICY_HWACCESS_NET_ATTRIBUTE_NORMAL 0x00 +#define TASK_POLICY_HWACCESS_NET_ATTRIBUTE_FULLACCESS 0x00 #define TASK_POLICY_HWACCESS_NET_ATTRIBUTE_THROTTLE 0x01 -#define TASK_POLICY_HWACCESS_NET_ATTRIBUTE_DEFAULT 0x00 +#define TASK_POLICY_HWACCESS_NET_ATTRIBUTE_DEFAULT TASK_POLICY_HWACCESS_NET_ATTRIBUTE_FULLACCESS /* Hardware CPU access attributes */ #define TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_NONE 0x00 -#define TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_NORMAL 0x00 -#define TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_ALL 0x00 +#define TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_FULLACCESS 0x00 #define TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_ONE 0x01 #define TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_LLCACHE 0x02 -#define TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_DEFAULT 0x00 +#define TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_DEFAULT TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_FULLACCESS /* Resource usage/low resource attributes */ #define TASK_POLICY_RESOURCE_ATTRIBUTE_NONE 0x00 #define TASK_POLICY_RESOURCE_ATTRIBUTE_THROTTLE 0x01 #define TASK_POLICY_RESOURCE_ATTRIBUTE_SUSPEND 0x02 #define TASK_POLICY_RESOURCE_ATTRIBUTE_TERMINATE 0x03 -#define TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY 0x04 -#define TASK_POLICY_RESOURCE_ATTRIBUTE_DEFAULT 0x00 +#define TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY_KQ 0x04 +#define TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY_EXC 0x05 +#define TASK_POLICY_RESOURCE_ATTRIBUTE_DEFAULT TASK_POLICY_RESOURCE_ATTRIBUTE_NONE #endif /* XNU_KERNEL_PRIVATE */ @@ -211,10 +212,24 @@ typedef struct process_policy { hw_bg:8; /* Darwin Background Policy */ } process_policy_t; +#if CONFIG_EMBEDDED + +typedef struct task_watcher { + queue_chain_t tw_links; /* queueing of threads */ + task_t tw_task; /* task that is being watched */ + thread_t tw_thread; /* thread that is watching the watch_task */ + int tw_state; /* the 
current app state of the thread */ + int tw_importance; /* importance prior to backgrounding */ +} task_watch_t; + +extern lck_mtx_t task_watch_mtx; + +#endif /* CONFIG_EMBEDDED */ + #include extern process_policy_t default_task_proc_policy; /* init value for the process policy attributes */ -extern process_policy_t default_task_null_policy; /* none as the value for the process policy attributes */ +extern process_policy_t default_task_null_policy; /* none as the value for the process policy attributes */ struct task { /* Synchronization/destruction information */ @@ -280,9 +295,8 @@ struct task { int semaphores_owned; /* number of semaphores owned */ int lock_sets_owned; /* number of lock sets owned */ - /* Ledgers */ - struct ipc_port *wired_ledger_port; - struct ipc_port *paged_ledger_port; + ledger_t ledger; + unsigned int priv_flags; /* privilege resource flags */ #define VM_BACKING_STORE_PRIV 0x1 @@ -295,12 +309,10 @@ struct task { integer_t messages_received; /* messages received counter */ integer_t syscalls_mach; /* mach system call counter */ integer_t syscalls_unix; /* unix system call counter */ - uint32_t c_switch; /* total context switches */ - uint32_t p_switch; /* total processor switches */ - uint32_t ps_switch; /* total pset switches */ + uint32_t c_switch; /* total context switches */ + uint32_t p_switch; /* total processor switches */ + uint32_t ps_switch; /* total pset switches */ - zinfo_usage_store_t tkm_private;/* private kmem alloc/free stats (reaped threads) */ - zinfo_usage_store_t tkm_shared; /* shared kmem alloc/free stats (reaped threads) */ zinfo_usage_t tkm_zinfo; /* per-task, per-zone usage statistics */ #ifdef MACH_BSD @@ -328,14 +340,28 @@ struct task { uint32_t t_chud; /* CHUD flags, used for Shark */ #endif - process_policy_t ext_actionstate; /* externally applied actions */ + boolean_t pidsuspended; /* pid_suspend called; no threads can execute */ + boolean_t frozen; /* frozen; private resident pages committed to swap */ + process_policy_t ext_appliedstate; /* externally applied actions */ process_policy_t ext_policystate; /* externally defined process policy states*/ - process_policy_t actionstate; /* self applied acions */ + process_policy_t appliedstate; /* self applied actions */ process_policy_t policystate; /* process wide policy states */ - - uint64_t rsu_controldata[TASK_POLICY_RESOURCE_USAGE_COUNT]; + uint8_t rusage_cpu_flags; + uint8_t rusage_cpu_percentage; /* Task-wide CPU limit percentage */ + uint64_t rusage_cpu_interval; /* Task-wide CPU limit interval */ + uint8_t rusage_cpu_perthr_percentage; /* Per-thread CPU limit percentage */ + uint64_t rusage_cpu_perthr_interval; /* Per-thread CPU limit interval */ + uint64_t rusage_cpu_deadline; + thread_call_t rusage_cpu_callt; +#if CONFIG_EMBEDDED + uint32_t appstate; /* the current appstate */ + queue_head_t task_watchers; /* app state watcher threads */ + int num_taskwatchers; + int watchapplying; +#endif /* CONFIG_EMBEDDED */ vm_extmod_statistics_data_t extmod_statistics; + natural_t proc_terminate; /* the process is marked for proc_terminate */ }; #define task_lock(task) lck_mtx_lock(&(task)->lock) @@ -404,10 +430,25 @@ __BEGIN_DECLS extern kern_return_t task_hold( task_t task); +/* Wait for task to stop running, either just to get off CPU or to cease being runnable */ +extern kern_return_t task_wait( + task_t task, + boolean_t until_not_runnable); + /* Release hold on all threads in a task */ extern kern_return_t task_release( task_t task);
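The suspend-related externs that follow pair up with the earlier place_task_hold()/release_task_hold() changes. A sketch of the intended call discipline (caller name is hypothetical; the return-value comments are derived from the logic earlier in this patch):

static kern_return_t
pid_suspend_resume_cycle(task_t t)
{
	kern_return_t kr;

	kr = task_pidsuspend(t);	/* pidsuspended = TRUE, hold placed */
	if (kr != KERN_SUCCESS)
		return (kr);

	/* task_resume() calls release_task_hold() with pidresume == FALSE;
	 * with user_stop_count == 1 and pidsuspended set, the guard
	 * user_stop_count > (pidsuspended ? 1 : 0) fails, so a plain
	 * resume cannot undo a pid_suspend. */
	(void) task_resume(t);		/* KERN_FAILURE */

	return (task_pidresume(t));	/* clears pidsuspended, drops the hold */
}

+/* Suspends a task by placing a hold on its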
threads */ +extern kern_return_t task_pidsuspend( + task_t task); +extern kern_return_t task_pidsuspend_locked( + task_t task); + +/* Resumes a previously paused task */ +extern kern_return_t task_pidresume( + task_t task); + #if CONFIG_FREEZE /* Freeze a task's resident pages */ @@ -417,6 +458,7 @@ extern kern_return_t task_freeze( uint32_t *wired_count, uint32_t *clean_count, uint32_t *dirty_count, + uint32_t dirty_budget, boolean_t *shared, boolean_t walk_only); @@ -505,6 +547,16 @@ extern kern_return_t machine_task_set_state( thread_state_t state, mach_msg_type_number_t state_count); +extern void machine_task_terminate(task_t task); + +struct _task_ledger_indices { + int cpu_time; + int tkm_private; + int tkm_shared; + int phys_mem; + int wired_mem; +}; +extern struct _task_ledger_indices task_ledgers; int proc_get_task_bg_policy(task_t task); int proc_get_thread_bg_policy(task_t task, uint64_t tid); @@ -513,9 +565,9 @@ int proc_get_selfthread_isbackground(void); int proc_get_darwinbgstate(task_t, uint32_t *); int proc_set_bgtaskpolicy(task_t task, int intval); -int proc_set1_bgtaskpolicy(task_t task, int intval); +int proc_set_and_apply_bgtaskpolicy(task_t task, int intval); int proc_set_bgthreadpolicy(task_t task, uint64_t tid, int val); -int proc_set1_bgthreadpolicy(task_t task, uint64_t tid, int val); +int proc_set_and_apply_bgthreadpolicy(task_t task, uint64_t tid, int val); int proc_add_bgtaskpolicy(task_t task, int val); int proc_add_bgthreadpolicy(task_t task, uint64_t tid, int val); @@ -524,7 +576,6 @@ int proc_remove_bgthreadpolicy(task_t task, uint64_t tid, int val); int proc_apply_bgtaskpolicy(task_t task); int proc_apply_bgtaskpolicy_external(task_t task); -int proc_apply_bgtaskpolicy_internal(task_t task); int proc_apply_bgthreadpolicy(task_t task, uint64_t tid); int proc_apply_bgtask_selfpolicy(void); int proc_apply_bgthread_selfpolicy(void); @@ -534,6 +585,7 @@ int proc_restore_bgtaskpolicy(task_t task); int proc_restore_bgthreadpolicy(task_t task, uint64_t tid); int proc_restore_bgthread_selfpolicy(void); int proc_restore_workq_bgthreadpolicy(thread_t); +void proc_task_remove_throttle(task_t task); /* hw access routines */ int proc_apply_task_diskacc(task_t task, int policy); @@ -541,6 +593,7 @@ int proc_apply_thread_diskacc(task_t task, uint64_t tid, int policy); int proc_apply_thread_selfdiskacc(int policy); int proc_get_task_disacc(task_t task); int proc_get_task_selfdiskacc(void); +int proc_get_diskacc(thread_t thread); int proc_get_thread_selfdiskacc(void); int proc_denyinherit_policy(task_t task); int proc_denyselfset_policy(task_t task); @@ -550,21 +603,32 @@ int proc_apply_task_gpuacc(task_t task, int prio); int proc_get_task_ruse_cpu(task_t task, uint32_t * policyp, uint32_t * percentagep, uint64_t * intervalp, uint64_t * deadlinep); int proc_set_task_ruse_cpu(task_t task, uint32_t policy, uint32_t percentage, uint64_t interval, uint64_t deadline); +int proc_clear_task_ruse_cpu(task_t task); thread_t task_findtid(task_t, uint64_t); +#define TASK_RUSECPU_FLAGS_PROC_LIMIT 0x1 +#define TASK_RUSECPU_FLAGS_PERTHR_LIMIT 0x2 +#define TASK_RUSECPU_FLAGS_DEADLINE 0x4 + #define PROC_POLICY_OSX_APPTYPE_NONE 0 +#if CONFIG_EMBEDDED +#define PROC_POLICY_IOS_RESV1_APPTYPE 1 +#define PROC_POLICY_IOS_APPLE_DAEMON 2 +#define PROC_POLICY_IOS_APPTYPE 3 +#define PROC_POLICY_IOS_NONUITYPE 4 +#else #define PROC_POLICY_OSX_APPTYPE_TAL 1 #define PROC_POLICY_OSX_APPTYPE_WIDGET 2 #define PROC_POLICY_OSX_APPTYPE_DBCLIENT 2 /* Not a bug, just rename of widget */ -#define 
PROC_POLICY_IOS_APPTYPE 3 -#define PROC_POLICY_IOS_NONUITYPE 4 +#endif -void proc_set_task_apptype(task_t, int); +void proc_set_task_apptype(task_t task, int type, thread_t thread); int proc_disable_task_apptype(task_t task, int policy_subtype); int proc_enable_task_apptype(task_t task, int policy_subtype); -/* resource handle callback */ -int task_action_cpuusage(task_t); +#if CONFIG_EMBEDDED +extern int proc_setthread_saved_importance(thread_t thread, int importance); +#endif /* BSD call back functions */ extern int proc_apply_resource_actions(void * p, int type, int action); @@ -574,6 +638,22 @@ extern int task_restore_resource_actions(task_t task, int type); extern void proc_apply_task_networkbg(void * bsd_info); extern void proc_restore_task_networkbg(void * bsd_info); extern void proc_set_task_networkbg(void * bsd_info, int setbg); +extern int task_clear_cpuusage(task_t task); + +#if CONFIG_EMBEDDED +#define TASK_APPSTATE_NONE 0 +#define TASK_APPSTATE_ACTIVE 1 +#define TASK_APPSTATE_BACKGROUND 2 +#define TASK_APPSTATE_NONUI 3 +#define TASK_APPSTATE_INACTIVE 4 + +extern int proc_lf_getappstate(task_t task); +extern int proc_lf_setappstate(task_t task, int state); +extern int proc_lf_pidbind(task_t curtask, uint64_t tid, task_t target_task, int bind); +extern void thead_remove_taskwatch(thread_t thread); +extern void task_removewatchers(task_t task); +#endif /* CONFIG_EMBEDDED */ + #endif /* XNU_KERNEL_PRIVATE */ #ifdef KERNEL_PRIVATE @@ -581,6 +661,16 @@ extern void proc_set_task_networkbg(void * bsd_info, int setbg); extern void *get_bsdtask_info(task_t); extern void *get_bsdthreadtask_info(thread_t); extern vm_map_t get_task_map(task_t); +extern ledger_t get_task_ledger(task_t); + +extern boolean_t get_task_pidsuspended(task_t); +extern boolean_t get_task_frozen(task_t); + +/* Convert from a task to a port */ +extern ipc_port_t convert_task_to_port(task_t); + +/* Convert from a task name to a port */ +extern ipc_port_t convert_task_name_to_port(task_name_t); #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/kern/task_policy.c b/osfmk/kern/task_policy.c index e8f9bc628..2cbb2daa2 100644 --- a/osfmk/kern/task_policy.c +++ b/osfmk/kern/task_policy.c @@ -35,14 +35,52 @@ #include #include #include +#include +#include +#if CONFIG_EMBEDDED +#include +#include +#endif /* CONFIG_EMBEDDED */ +#include + +#if CONFIG_MEMORYSTATUS +extern void memorystatus_on_suspend(int pid); +extern void memorystatus_on_resume(int pid); +#endif -static int proc_apply_bgtaskpolicy_locked(task_t task, int, int); -static int proc_restore_bgtaskpolicy_locked(task_t, int, int, int); +static int proc_apply_bgtaskpolicy_internal(task_t, int, int); +static int proc_restore_bgtaskpolicy_internal(task_t, int, int, int); static int task_get_cpuusage(task_t task, uint32_t * percentagep, uint64_t * intervalp, uint64_t * deadlinep); -static int task_set_cpuusage(task_t task, uint32_t percentage, uint64_t interval, uint64_t deadline); +int task_set_cpuusage(task_t task, uint64_t percentage, uint64_t interval, uint64_t deadline, int scope); +static int task_clear_cpuusage_locked(task_t task); static int task_apply_resource_actions(task_t task, int type); +static void task_priority(task_t task, integer_t priority, integer_t max_priority); +static kern_return_t task_role_default_handler(task_t task, task_role_t role); +void task_action_cpuusage(thread_call_param_t param0, thread_call_param_t param1); static int proc_apply_bgthreadpolicy_locked(thread_t thread, int selfset); -static void restore_bgthreadpolicy_locked(thread_t 
thread, int selfset); +static void restore_bgthreadpolicy_locked(thread_t thread, int selfset, int importance); +static int proc_get_task_selfdiskacc_internal(task_t task, thread_t thread); +extern void unthrottle_thread(void * uthread); + +#if CONFIG_EMBEDDED +static void set_thread_appbg(thread_t thread, int setbg, int importance); +static void apply_bgthreadpolicy_external(thread_t thread); +static void add_taskwatch_locked(task_t task, task_watch_t * twp); +static void remove_taskwatch_locked(task_t task, task_watch_t * twp); +static void task_watch_lock(void); +static void task_watch_unlock(void); +static void apply_appstate_watchers(task_t task, int setbg); +void proc_apply_task_networkbg_internal(void *, thread_t); +void proc_restore_task_networkbg_internal(void *, thread_t); +int proc_pid(void * proc); + +typedef struct thread_watchlist { + thread_t thread; /* thread being worked on for taskwatch action */ + int importance; /* importance to be restored if thread is being made active */ +} thread_watchlist_t; + +#endif /* CONFIG_EMBEDDED */ + process_policy_t default_task_proc_policy = {0, 0, @@ -54,8 +92,8 @@ process_policy_t default_task_proc_policy = {0, TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, TASK_POLICY_RESOURCE_ATTRIBUTE_NONE, 0, - TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_ALL, - TASK_POLICY_HWACCESS_NET_ATTRIBUTE_NORMAL, + TASK_POLICY_HWACCESS_CPU_ATTRIBUTE_FULLACCESS, + TASK_POLICY_HWACCESS_NET_ATTRIBUTE_FULLACCESS, TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_FULLACCESS, TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL, TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL @@ -79,11 +117,44 @@ process_policy_t default_task_null_policy = {0, }; -static void -task_priority( - task_t task, - integer_t priority, - integer_t max_priority); + +/* + * This routine should always be called with the task lock held. + * This routine handles default operations for TASK_FOREGROUND_APPLICATION + * and TASK_BACKGROUND_APPLICATION of a task with no special app type. + */ +static kern_return_t +task_role_default_handler(task_t task, task_role_t role) +{ + kern_return_t result = KERN_SUCCESS; + + switch (task->role) { + case TASK_FOREGROUND_APPLICATION: + case TASK_BACKGROUND_APPLICATION: + case TASK_UNSPECIFIED: + /* if there is no process-wide backgrounding ... */ + if ((task->ext_appliedstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) && + (task->appliedstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE)) {
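task_role_default_handler() is where a plain role change now lands; from user space the trigger is still an ordinary TASK_CATEGORY_POLICY call, e.g. (sketch; the caller needs the task's control port):

#include <mach/mach.h>
#include <mach/task_policy.h>

kern_return_t
make_foreground(task_t t)
{
	task_category_policy_data_t info;

	info.role = TASK_FOREGROUND_APPLICATION;
	return (task_policy_set(t, TASK_CATEGORY_POLICY,
	    (task_policy_t)&info, TASK_CATEGORY_POLICY_COUNT));
}

+ task_priority(task, + ((role == TASK_FOREGROUND_APPLICATION)?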
+ BASEPRI_FOREGROUND: BASEPRI_BACKGROUND), + task->max_priority); + } + task->role = role; + break; + + case TASK_CONTROL_APPLICATION: + case TASK_RENICED: + /* else fail silently */ + break; + + default: + result = KERN_INVALID_ARGUMENT; + break; + } + return(result); +} + kern_return_t task_policy_set( @@ -115,99 +186,66 @@ task_policy_set( #endif task_lock(task); - if ( info->role == TASK_FOREGROUND_APPLICATION || - info->role == TASK_BACKGROUND_APPLICATION) { + switch(info->role) { + case TASK_FOREGROUND_APPLICATION : { + if (task->ext_appliedstate.apptype == PROC_POLICY_OSX_APPTYPE_NONE) { + result = task_role_default_handler(task, info->role); + } else { + switch (task->ext_appliedstate.apptype) { #if !CONFIG_EMBEDDED - if (task->ext_actionstate.apptype != PROC_POLICY_OSX_APPTYPE_NONE) { - switch (info->role) { - case TASK_FOREGROUND_APPLICATION: - switch (task->ext_actionstate.apptype) { - case PROC_POLICY_OSX_APPTYPE_TAL: - /* Move the app to foreground with no DarwinBG */ - proc_restore_bgtaskpolicy_locked(task, 1, 1, BASEPRI_FOREGROUND); - bsdinfo = task->bsd_info; - setbg = 0; - break; - - case PROC_POLICY_OSX_APPTYPE_DBCLIENT: - /* reset the apptype so enforcement on background/foregound */ - task->ext_actionstate.apptype = PROC_POLICY_OSX_APPTYPE_NONE; - /* Internal application and make it foreground pri */ - proc_restore_bgtaskpolicy_locked(task, 1, 0, BASEPRI_FOREGROUND); - bsdinfo = task->bsd_info; - setbg = 0; - break; - - default: - /* the app types cannot be in CONTROL, GRAPHICS STATE, so it will de default state here */ - task_priority(task, - ((info->role == TASK_FOREGROUND_APPLICATION)? - BASEPRI_FOREGROUND: BASEPRI_BACKGROUND), - task->max_priority); - break; - } - task->role = TASK_FOREGROUND_APPLICATION; - break; - - case TASK_BACKGROUND_APPLICATION: - switch (task->ext_actionstate.apptype) { - case PROC_POLICY_OSX_APPTYPE_TAL: - /* TAL apps will get Darwin backgrounded if not already set */ - if (task->ext_actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) { - /* external application of Darwin BG */ - proc_apply_bgtaskpolicy_locked(task, 1, 1); - bsdinfo = task->bsd_info; - setbg = 1; - } - break; - - default: - task_priority(task, - ((info->role == TASK_FOREGROUND_APPLICATION)? - BASEPRI_FOREGROUND: BASEPRI_BACKGROUND), - task->max_priority); - break; - } - task->role = TASK_BACKGROUND_APPLICATION; - break; - - default: - /* do nothing */ - break; - - } /* switch info->role */ - } else { /* apptype != PROC_POLICY_OSX_APPTYPE_NONE */ + case PROC_POLICY_OSX_APPTYPE_TAL: + /* Move the app to foreground with no DarwinBG */ + proc_restore_bgtaskpolicy_internal(task, 1, 1, BASEPRI_FOREGROUND); + bsdinfo = task->bsd_info; + setbg = 0; + break; + + case PROC_POLICY_OSX_APPTYPE_DBCLIENT: + /* reset the apptype so enforcement on background/foreground */ + task->ext_appliedstate.apptype = PROC_POLICY_OSX_APPTYPE_NONE; + /* Internal application and make it foreground priority */ + proc_restore_bgtaskpolicy_internal(task, 1, 0, BASEPRI_FOREGROUND); + bsdinfo = task->bsd_info; + setbg = 0; + break; #endif /* !CONFIG_EMBEDDED */ - switch (task->role) { - - case TASK_FOREGROUND_APPLICATION: - case TASK_BACKGROUND_APPLICATION: - case TASK_UNSPECIFIED: - /* if there are no process wide backgrounding ... */ - if ((task->ext_actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) && - (task->actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE)) { - task_priority(task, - ((info->role == TASK_FOREGROUND_APPLICATION)?
- BASEPRI_FOREGROUND: BASEPRI_BACKGROUND), - task->max_priority); - } - task->role = info->role; - break; - case TASK_CONTROL_APPLICATION: - case TASK_RENICED: - /* else fail silently */ - break; + default: + /* the app types cannot be in CONTROL, GRAPHICS STATE, so it will be in the default state here */ + task_priority(task, BASEPRI_FOREGROUND, task->max_priority); + break; - default: - result = KERN_INVALID_ARGUMENT; - break; + } /* switch (task->ext_appliedstate.apptype) */ + task->role = TASK_FOREGROUND_APPLICATION; + } } + break; + + case TASK_BACKGROUND_APPLICATION : { + if (task->ext_appliedstate.apptype == PROC_POLICY_OSX_APPTYPE_NONE) { + result = task_role_default_handler(task, info->role); + } else { /* apptype != PROC_POLICY_OSX_APPTYPE_NONE */ + switch (task->ext_appliedstate.apptype) { #if !CONFIG_EMBEDDED - } /* apptype != PROC_POLICY_OSX_APPTYPE_NONE */ + case PROC_POLICY_OSX_APPTYPE_TAL: + /* TAL apps will get Darwin backgrounded if not already set */ + if (task->ext_appliedstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) { + proc_apply_bgtaskpolicy_internal(task, 1, 1); + bsdinfo = task->bsd_info; + setbg = 1; + } + break; #endif /* !CONFIG_EMBEDDED */ + default: + task_priority(task, BASEPRI_BACKGROUND, task->max_priority); + break; + } /* switch (task->ext_appliedstate.apptype) */ + task->role = TASK_BACKGROUND_APPLICATION; + } + } + break; - } else if (info->role == TASK_CONTROL_APPLICATION) { + case TASK_CONTROL_APPLICATION: if (task != current_task()|| task->sec_token.val[0] != 0) result = KERN_INVALID_ARGUMENT; @@ -215,7 +253,9 @@ task_policy_set( task_priority(task, BASEPRI_CONTROL, task->max_priority); task->role = info->role; } - } else if (info->role == TASK_GRAPHICS_SERVER) { + break; + + case TASK_GRAPHICS_SERVER: if (task != current_task() || task->sec_token.val[0] != 0) result = KERN_INVALID_ARGUMENT; @@ -223,24 +263,16 @@ task_policy_set( task_priority(task, MAXPRI_RESERVED - 3, MAXPRI_RESERVED); task->role = info->role; } - } else -#if CONFIG_EMBEDDED - if (info->role == TASK_THROTTLE_APPLICATION) { - task_priority(task, MAXPRI_THROTTLE, MAXPRI_THROTTLE); - task->role = info->role; - } else if (info->role == TASK_DEFAULT_APPLICATION || info->role == TASK_NONUI_APPLICATION) - { - task_priority(task, BASEPRI_DEFAULT, MAXPRI_USER); - task->role = info->role; - } else -#else /* CONFIG_EMBEDDED */ - if (info->role == TASK_DEFAULT_APPLICATION) - { + break; + case TASK_DEFAULT_APPLICATION: task_priority(task, BASEPRI_DEFAULT, MAXPRI_USER); task->role = info->role; - } else -#endif /* CONFIG_EMBEDDED */ + break; + + default: result = KERN_INVALID_ARGUMENT; + break; + } /* switch (info->role) */ task_unlock(task); @@ -378,18 +410,22 @@ proc_get_task_bg_policy(task_t task) int proc_get_thread_bg_policy(task_t task, uint64_t tid) { + int selfset = 0; thread_t self = current_thread(); thread_t thread = THREAD_NULL; int val = 0; - if (tid == self->thread_id) { - val = self->policystate.hw_bg; - } else { + if (tid == self->thread_id) + selfset = 1; + + if (selfset == 0) { task_lock(task); thread = task_findtid(task, tid); if (thread != NULL) val = thread->ext_policystate.hw_bg; task_unlock(task); + } else { + val = self->policystate.hw_bg; } return(val); @@ -401,10 +437,10 @@ proc_get_self_isbackground(void) task_t task = current_task();; thread_t thread = current_thread(); - if ((task->ext_actionstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) || - (task->actionstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) || - (thread->ext_actionstate.hw_bg !=
TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) || + (task->appliedstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) || + (thread->ext_appliedstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) || + (thread->appliedstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE)) return(1); else return(0); @@ -415,8 +451,8 @@ int proc_get_selfthread_isbackground(void) { thread_t thread = current_thread(); - if ((thread->ext_actionstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) || - (thread->actionstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) + if ((thread->ext_appliedstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) || + (thread->appliedstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE)) return(1); else return(0); @@ -447,17 +483,32 @@ proc_set_bgtaskpolicy(task_t task, int intval) return(0); } -/* set and apply as well */ -int proc_set1_bgtaskpolicy(task_t task, int prio) +/* set and apply as well; handles reset of NONUI due to the setprio() task app state implementation side effect */ +int +proc_set_and_apply_bgtaskpolicy(task_t task, int prio) { int error = 0; if (prio == PRIO_DARWIN_BG) { error = proc_set_bgtaskpolicy(task, TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL); - if (error == 0) + if (error == 0) { error = proc_apply_bgtaskpolicy(task); +#if CONFIG_EMBEDDED + /* XXX: till SB uses newer SPIs */ + apply_appstate_watchers(task, 1); +#endif /* CONFIG_EMBEDDED */ + } } else { error = proc_restore_bgtaskpolicy(task); + if (error == 0) { + /* since the prior implementation of non-UI was overloaded with bg state, we need to reset */ + error = proc_apply_task_gpuacc(task, TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_DEFAULT); +#if CONFIG_EMBEDDED + /* XXX: till SB uses newer SPIs */ + apply_appstate_watchers(task, 0); +#endif /* CONFIG_EMBEDDED */ + } + } return(error); @@ -467,19 +518,23 @@ int proc_set1_bgtaskpolicy(task_t task, int prio) int proc_set_bgthreadpolicy(task_t task, uint64_t tid, int prio) { + int selfset = 0; thread_t self = current_thread(); thread_t thread = THREAD_NULL; int reset; if (prio == 0) reset = 1; + if (tid == self->thread_id) + selfset = 1; + task_lock(task); - if (tid == self->thread_id) { - self->policystate.hw_bg = prio; - } else { + if (selfset == 0) { thread = task_findtid(task, tid); if (thread != NULL) thread->ext_policystate.hw_bg = prio; + } else { + self->policystate.hw_bg = prio; } task_unlock(task); @@ -488,7 +543,7 @@ proc_set_bgthreadpolicy(task_t task, uint64_t tid, int prio) } int -proc_set1_bgthreadpolicy(task_t task, uint64_t tid, int prio) +proc_set_and_apply_bgthreadpolicy(task_t task, uint64_t tid, int prio) { int error = 0; @@ -526,19 +581,23 @@ proc_add_bgtaskpolicy(task_t task, int val) int proc_add_bgthreadpolicy(task_t task, uint64_t tid, int val) { + int selfset = 0; thread_t self = current_thread(); thread_t thread = THREAD_NULL; int reset; if (val == 0) reset = 1; + if (tid == self->thread_id) + selfset = 1; + task_lock(task); - if (tid == self->thread_id) { - self->policystate.hw_bg |= val; - } else { + if (selfset == 0) { thread = task_findtid(task, tid); if (thread != NULL) thread->ext_policystate.hw_bg |= val; + } else { + self->policystate.hw_bg |= val; } task_unlock(task); @@ -569,19 +628,23 @@ proc_remove_bgtaskpolicy(task_t task, int intval) int proc_remove_bgthreadpolicy(task_t task, uint64_t tid, int val) { + int selfset = 0; thread_t self = current_thread(); thread_t thread = THREAD_NULL; int reset;
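The set/add/remove thread-policy paths above now share one tid-resolution shape; condensed, with the OR-assignment standing in for whichever mutation a given path applies (hypothetical helper):

static int
modify_thread_bgpolicy(task_t task, uint64_t tid, int val)
{
	thread_t self = current_thread();
	thread_t thread = THREAD_NULL;
	int selfset = (tid == self->thread_id);

	task_lock(task);
	if (selfset == 0) {
		thread = task_findtid(task, tid);
		if (thread != NULL)
			thread->ext_policystate.hw_bg |= val;	/* external */
	} else {
		self->policystate.hw_bg |= val;			/* internal */
	}
	task_unlock(task);

	return(0);
}

if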
(val == 0) reset = 1; + if (tid == self->thread_id) + selfset = 1; + task_lock(task); - if (tid == self->thread_id) { - self->policystate.hw_bg &= ~val; - } else { + if (selfset == 0) { thread = task_findtid(task, tid); if (thread != NULL) thread->ext_policystate.hw_bg &= ~val; + } else { + self->policystate.hw_bg &= ~val; } task_unlock(task); @@ -602,50 +665,47 @@ proc_apply_bgtaskpolicy(task_t task) if (task == current_task()) external = 0; - - return(proc_apply_bgtaskpolicy_locked(task, 0, external)); + return(proc_apply_bgtaskpolicy_internal(task, 0, external)); } -int +int proc_apply_bgtaskpolicy_external(task_t task) { - return(proc_apply_bgtaskpolicy_locked(task, 0, 1)); - -} - -int -proc_apply_bgtaskpolicy_internal(task_t task) -{ - return(proc_apply_bgtaskpolicy_locked(task, 0, 0)); + return(proc_apply_bgtaskpolicy_internal(task, 0, 1)); } - static int -proc_apply_bgtaskpolicy_locked(task_t task, int locked, int external) +proc_apply_bgtaskpolicy_internal(task_t task, int locked, int external) { + if (locked == 0) task_lock(task); + /* if the process is exiting, no action to be done */ + if (task->proc_terminate != 0) + goto out; + if (external != 0) { /* already set? */ - if (task->ext_actionstate.hw_bg != task->ext_policystate.hw_bg) { - task->ext_actionstate.hw_bg = task->ext_policystate.hw_bg; + if (task->ext_appliedstate.hw_bg != task->ext_policystate.hw_bg) { + task->ext_appliedstate.hw_bg = task->ext_policystate.hw_bg; task_priority(task, MAXPRI_THROTTLE, MAXPRI_THROTTLE); /* background state applied */ } } else { - if (task->actionstate.hw_bg != task->policystate.hw_bg) { - task->actionstate.hw_bg = task->policystate.hw_bg; + if (task->appliedstate.hw_bg != task->policystate.hw_bg) { + task->appliedstate.hw_bg = task->policystate.hw_bg; task_priority(task, MAXPRI_THROTTLE, MAXPRI_THROTTLE); } } +out: if (locked == 0) task_unlock(task); return(0); } -/* apply the self backgrounding even if the thread is not current thread/task(timer threads) */ -int +/* apply the self backgrounding even if the thread is not current thread */ +int proc_apply_workq_bgthreadpolicy(thread_t thread) { int error; @@ -657,7 +717,7 @@ proc_apply_workq_bgthreadpolicy(thread_t thread) /* apply the background as selfset internal one */ error = proc_apply_bgthreadpolicy_locked(thread, 1); task_unlock(wqtask); - } else + } else error = ESRCH; return(error); @@ -666,56 +726,66 @@ int proc_apply_bgthreadpolicy(task_t task, uint64_t tid) { + int selfset = 0, error = 0; thread_t self = current_thread(); thread_t thread = THREAD_NULL; - int selfset = 0, error = 0; task_t localtask = TASK_NULL; if (tid == self->thread_id) { selfset = 1; localtask = current_task(); - } else { + } else localtask = task; - } task_lock(localtask); - if (selfset != 0) { + if (selfset != 0) { thread = self; } else { - thread = task_findtid(task, tid); + thread = task_findtid(localtask, tid); } error = proc_apply_bgthreadpolicy_locked(thread, selfset); - task_unlock(localtask); + task_unlock(localtask); + return(error); } -static int +static int proc_apply_bgthreadpolicy_locked(thread_t thread, int selfset) { int set = 0; thread_precedence_policy_data_t policy; + if (thread != NULL) { - if (selfset != 0) { + /* if the process is exiting, no action to be done */ + if (thread->task->proc_terminate != 0) + goto out; + + if (selfset != 0) { /* internal application */ - if (thread->actionstate.hw_bg != thread->policystate.hw_bg) { - thread->actionstate.hw_bg = thread->policystate.hw_bg; -
if (thread->ext_actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) + if (thread->appliedstate.hw_bg != thread->policystate.hw_bg) { + thread->appliedstate.hw_bg = thread->policystate.hw_bg; + if (thread->ext_appliedstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) set = 1; } } else { /* external application */ - if (thread->ext_actionstate.hw_bg != thread->ext_policystate.hw_bg) { - thread->ext_actionstate.hw_bg = thread->ext_policystate.hw_bg; - if (thread->actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) + if (thread->ext_appliedstate.hw_bg != thread->ext_policystate.hw_bg) { + thread->ext_appliedstate.hw_bg = thread->ext_policystate.hw_bg; + if (thread->appliedstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) set = 1; } } if (set != 0) { +#if CONFIG_EMBEDDED + if (thread->task->ext_appliedstate.apptype == PROC_POLICY_IOS_APPLE_DAEMON) { + thread->saved_importance = thread->importance; + } +#endif /* CONFIG_EMBEDDED */ /* set thread priority (we did not save previous value) */ policy.importance = INT_MIN; @@ -724,12 +794,45 @@ proc_apply_bgthreadpolicy_locked(thread_t thread, int selfset) THREAD_PRECEDENCE_POLICY_COUNT ); } - } else + } else return(ESRCH); - + +out: return(0); } +#if CONFIG_EMBEDDED +/* set external application of background */ +static void +apply_bgthreadpolicy_external(thread_t thread) +{ +int set = 0; +thread_precedence_policy_data_t policy; + + /* if the process is exiting, no action to be done */ + if (thread->task->proc_terminate != 0) + return; + + thread->ext_policystate.hw_bg = TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL; + + if (thread->ext_appliedstate.hw_bg != thread->ext_policystate.hw_bg) { + thread->ext_appliedstate.hw_bg = thread->ext_policystate.hw_bg; + if (thread->appliedstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) + set = 1; + } + + if (set != 0) { + /* set thread priority (we did not save previous value) */ + policy.importance = INT_MIN; + + thread_policy_set_internal(thread, THREAD_PRECEDENCE_POLICY, + (thread_policy_t)&policy, + THREAD_PRECEDENCE_POLICY_COUNT ); + } + +} +#endif /* CONFIG_EMBEDDED */ + int proc_apply_bgthread_selfpolicy(void) { @@ -742,39 +845,41 @@ proc_restore_bgtaskpolicy(task_t task) { int external = 1; - if (current_task() == task) + if (current_task() == task) external = 0; - return(proc_restore_bgtaskpolicy_locked(task, 0, external, BASEPRI_DEFAULT)); + return(proc_restore_bgtaskpolicy_internal(task, 0, external, BASEPRI_DEFAULT)); } static int -proc_restore_bgtaskpolicy_locked(task_t task, int locked, int external, int pri) +proc_restore_bgtaskpolicy_internal(task_t task, int locked, int external, int pri) { if (locked == 0) task_lock(task); + /* if the process is exiting, no action to be done */ + if (task->proc_terminate != 0) + goto out; + if (external != 0) { - task->ext_actionstate.hw_bg = TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE; + task->ext_appliedstate.hw_bg = TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE; /* self BG in flight? */ - if (task->actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) { + if (task->appliedstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) { task_priority(task, pri, MAXPRI_USER); #if CONFIG_EMBEDDED - /* non embedded users need role for policy reapplication */ task->role = TASK_DEFAULT_APPLICATION; #endif /* CONFIG_EMBEDDED */ } } else { - task->actionstate.hw_bg = TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE; + task->appliedstate.hw_bg = TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE; /* external BG in flight? 
*/ - if (task->ext_actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) { + if (task->ext_appliedstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) { task_priority(task, pri, MAXPRI_USER); #if CONFIG_EMBEDDED - /* non embedded users need role for policy reapplication */ task->role = TASK_DEFAULT_APPLICATION; #endif /* CONFIG_EMBEDDED */ } } - +out: if (locked == 0) task_unlock(task); @@ -782,17 +887,25 @@ proc_restore_bgtaskpolicy_locked(task_t task, int locked, int external, int pri) } /* restore the self backgrounding even if the thread is not current thread */ -int +int proc_restore_workq_bgthreadpolicy(thread_t thread) { int error = 0; task_t wqtask = TASK_NULL; + int importance = 0; if (thread != THREAD_NULL) { wqtask = thread->task; task_lock(wqtask); /* remove the background and restore default importance as self(internal) removal */ - restore_bgthreadpolicy_locked(thread, 1); +#if CONFIG_EMBEDDED + if (thread->task->ext_appliedstate.apptype == PROC_POLICY_IOS_APPLE_DAEMON) { + /* restore previously set importance */ + importance = thread->saved_importance; + thread->saved_importance = 0; + } +#endif /* CONFIG_EMBEDDED */ + restore_bgthreadpolicy_locked(thread, 1, importance); task_unlock(wqtask); } else error = ESRCH; @@ -800,31 +913,41 @@ proc_restore_workq_bgthreadpolicy(thread_t thread) return(error); } -int proc_restore_bgthread_selfpolicy(void) +int +proc_restore_bgthread_selfpolicy(void) { return(proc_restore_bgthreadpolicy(current_task(), thread_tid(current_thread()))); - } - int proc_restore_bgthreadpolicy(task_t task, uint64_t tid) { + int selfset = 0; thread_t self = current_thread(); thread_t thread = THREAD_NULL; + int importance = 0; - task_lock(task); - if (tid == self->thread_id) { - thread = self; + if (tid == self->thread_id) selfset = 1; - } else { + + task_lock(task); + if (selfset == 0) { thread = task_findtid(task, tid); + } else { + thread = self; } - if (thread != NULL) - restore_bgthreadpolicy_locked(thread, selfset); - + if (thread != NULL) { +#if CONFIG_EMBEDDED + if (thread->task->ext_appliedstate.apptype == PROC_POLICY_IOS_APPLE_DAEMON) { + /* restore previously set importance */ + importance = thread->saved_importance; + thread->saved_importance = 0; + } +#endif /* CONFIG_EMBEDDED */ + restore_bgthreadpolicy_locked(thread, selfset, importance); + } task_unlock(task);
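For PROC_POLICY_IOS_APPLE_DAEMON threads, saved_importance round-trips through the backgrounding cycle: stashed when the background policy is applied, handed back (instead of a flat 0) when it is restored. A condensed sketch of the pairing, assuming the task lock is held as at the call sites above:

static void
daemon_importance_roundtrip(thread_t thread)
{
	int importance;

	/* on apply (see proc_apply_bgthreadpolicy_locked): stash it */
	thread->saved_importance = thread->importance;

	/* on restore: recover it and reset the stash */
	importance = thread->saved_importance;
	thread->saved_importance = 0;
	restore_bgthreadpolicy_locked(thread, 1, importance);
}

if (thread != NULL)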
*/ - if (thread->actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) + if (thread->appliedstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_NONE) reset = 1; } if (reset != 0) { /* reset thread priority (we did not save previous value) */ - policy.importance = 0; + policy.importance = importance; thread_policy_set_internal(thread, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&policy, THREAD_PRECEDENCE_POLICY_COUNT ); @@ -864,23 +991,41 @@ restore_bgthreadpolicy_locked(thread_t thread, int selfset) } void -proc_set_task_apptype(task_t task, int type) +#if CONFIG_EMBEDDED +proc_set_task_apptype(task_t task, int type, thread_t thread) +#else +proc_set_task_apptype(task_t task, int type, __unused thread_t thread) +#endif { +#if CONFIG_EMBEDDED + thread_t th = THREAD_NULL; +#endif /* CONFIG_EMBEDDED */ + switch (type) { - case PROC_POLICY_OSX_APPTYPE_TAL: +#if CONFIG_EMBEDDED + case PROC_POLICY_IOS_RESV1_APPTYPE: task->ext_policystate.apptype = type; task->policystate.apptype = type; proc_apply_bgtaskpolicy_external(task); /* indicate that BG is set and next foreground needs to reset */ - task->ext_actionstate.apptype = type; + task->ext_appliedstate.apptype = type; break; - case PROC_POLICY_OSX_APPTYPE_DBCLIENT: + case PROC_POLICY_IOS_APPLE_DAEMON: task->ext_policystate.apptype = type; task->policystate.apptype = type; - proc_apply_bgtaskpolicy_internal(task); - /* indicate that BG is set and next foreground needs to reset */ - task->ext_actionstate.apptype = type; + task->ext_appliedstate.apptype = type; + /* posix spawn will already have thread created, so backround it */ + if (thread == NULL) + th = current_thread(); + else + th = thread; + if (th->appliedstate.hw_bg != TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL) { + /* apply self backgrounding if not already set */ + task_lock(th->task); + proc_apply_bgthreadpolicy_locked(th, 1); + task_unlock(th->task); + } break; case PROC_POLICY_IOS_APPTYPE: @@ -891,9 +1036,25 @@ proc_set_task_apptype(task_t task, int type) task->ext_policystate.apptype = type; task->policystate.apptype = type; /* set to deny access to gpu */ - task->ext_actionstate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS; + task->ext_appliedstate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS; task->ext_policystate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS; break; +#else /* CONFIG_EMBEDDED */ + case PROC_POLICY_OSX_APPTYPE_TAL: + task->ext_policystate.apptype = type; + task->policystate.apptype = type; + proc_apply_bgtaskpolicy_external(task); + /* indicate that BG is set and next foreground needs to reset */ + task->ext_appliedstate.apptype = type; + break; + + case PROC_POLICY_OSX_APPTYPE_DBCLIENT: + task->ext_policystate.apptype = type; + task->policystate.apptype = type; + proc_apply_bgtaskpolicy_internal(task, 0, 0); + break; + +#endif /* CONFIG_EMBEDDED */ default: break; @@ -903,16 +1064,22 @@ proc_set_task_apptype(task_t task, int type) /* update the darwin backdground action state in the flags field for libproc */ #define PROC_FLAG_DARWINBG 0x8000 /* process in darwin background */ #define PROC_FLAG_EXT_DARWINBG 0x10000 /* process in darwin background - external enforcement */ +#define PROC_FLAG_IOS_APPLEDAEMON 0x20000 /* process is apple ios daemon */ int proc_get_darwinbgstate(task_t task, uint32_t * flagsp) { - if (task->ext_actionstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL){ + if (task->ext_appliedstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL){ *flagsp |= PROC_FLAG_EXT_DARWINBG; } - if (task->actionstate.hw_bg == 
TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL){ + if (task->appliedstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL){ *flagsp |= PROC_FLAG_DARWINBG; } +#if CONFIG_EMBEDDED + if (task->ext_appliedstate.apptype == PROC_POLICY_IOS_APPLE_DAEMON) { + *flagsp |= PROC_FLAG_IOS_APPLEDAEMON; + } +#endif /* CONFIG_EMBEDDED */ return(0); } @@ -925,81 +1092,132 @@ proc_get_darwinbgstate(task_t task, uint32_t * flagsp) int proc_get_task_disacc(task_t task) { - if ((task->ext_actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) +#if CONFIG_EMBEDDED + if ((task->ext_appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); - if (task->ext_actionstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) - return(task->ext_actionstate.hw_disk); - if ((task->actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) +#else /* CONFIG_EMBEDDED */ + if ((task->ext_appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) { + /* if it is a TAL or DBClient and not self throttled, return Utility */ + if ((task->ext_appliedstate.apptype == PROC_POLICY_OSX_APPTYPE_TAL) || (task->ext_appliedstate.apptype == PROC_POLICY_OSX_APPTYPE_DBCLIENT)) { + /* any setting for DBG, we need to honor that */ + if ((task->ext_appliedstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE) && + ((task->appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE)!= 0) && + (task->appliedstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE)) { + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_UTILITY); + } else + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); + } else + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); + } +#endif /* CONFIG_EMBEDDED */ + if (task->ext_appliedstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS) + return(task->ext_appliedstate.hw_disk); + if ((task->appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); - if (task->actionstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) - return(task->actionstate.hw_disk); - return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL); + if (task->appliedstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS) + return(task->appliedstate.hw_disk); + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS); } int -proc_get_task_selfdiskacc(void) +proc_get_task_selfdiskacc_internal(task_t task, thread_t thread) { - task_t task = current_task(); - thread_t thread= current_thread(); - + /* if the task is marked for proc_terminate, no throttling for it */ + if (task->proc_terminate != 0) + goto out; /* * As per defined iopolicysys behavior, thread trumps task. * Do we need to follow that for external enforcements of BG or hw access? * Status quo for now.. 
*/ - if((thread->ext_actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) + + if((thread->ext_appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); - if (thread->ext_actionstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) - return(thread->ext_actionstate.hw_disk); - if((thread->actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) + if (thread->ext_appliedstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS) + return(thread->ext_appliedstate.hw_disk); + if((thread->appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); - if (thread->actionstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) - return(thread->actionstate.hw_disk); + if (thread->appliedstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS) + return(thread->appliedstate.hw_disk); - if ((task->ext_actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) +#if CONFIG_EMBEDDED + if ((task->ext_appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); - if (task->ext_actionstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) - return(task->ext_actionstate.hw_disk); - if ((task->actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) +#else /* CONFIG_EMBEDDED */ + if ((task->ext_appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) { + /* if it is a TAL or DBClient and not self throttled, return Utility */ + if ((task->ext_appliedstate.apptype == PROC_POLICY_OSX_APPTYPE_TAL) || (task->ext_appliedstate.apptype == PROC_POLICY_OSX_APPTYPE_DBCLIENT)) { + /* any setting for DBG, we need to honor that */ + if ((task->ext_appliedstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE) && + ((task->appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE)!= 0) && + (task->appliedstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE)) { + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_UTILITY); + } else + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); + } else + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); + } +#endif /* CONFIG_EMBEDDED */ + if (task->ext_appliedstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS) + return(task->ext_appliedstate.hw_disk); + if ((task->appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); - if (task->actionstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) - return(task->actionstate.hw_disk); - return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL); + if (task->appliedstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS) + return(task->appliedstate.hw_disk); +out: + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS); +} + + +int +proc_get_task_selfdiskacc(void) +{ + return(proc_get_task_selfdiskacc_internal(current_task(), current_thread())); +} + + +int +proc_get_diskacc(thread_t thread) +{ + return(proc_get_task_selfdiskacc_internal(thread->task, thread)); } + int proc_get_thread_selfdiskacc(void) { thread_t thread = current_thread(); - if((thread->ext_actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) + if((thread->ext_appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); - if (thread->ext_actionstate.hw_disk != 
TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) - return(thread->ext_actionstate.hw_disk); - if((thread->actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) + if (thread->ext_appliedstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS) + return(thread->ext_appliedstate.hw_disk); + if((thread->appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_DISKTHROTTLE) != 0) return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); - if (thread->actionstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL) - return(thread->actionstate.hw_disk); - return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL); + if (thread->appliedstate.hw_disk != TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS) + return(thread->appliedstate.hw_disk); + return(TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS); } -int proc_apply_task_diskacc(task_t task, int policy) +int +proc_apply_task_diskacc(task_t task, int policy) { task_t self = current_task(); task_lock(task); if (task == self) { - task->actionstate.hw_disk = policy; + task->appliedstate.hw_disk = policy; task->policystate.hw_disk = policy; } else { - task->ext_actionstate.hw_disk = policy; + task->ext_appliedstate.hw_disk = policy; task->ext_policystate.hw_disk = policy; } task_unlock(task); return(0); } -int proc_apply_thread_diskacc(task_t task, uint64_t tid, int policy) +int +proc_apply_thread_diskacc(task_t task, uint64_t tid, int policy) { thread_t thread; @@ -1010,7 +1228,7 @@ int proc_apply_thread_diskacc(task_t task, uint64_t tid, int policy) task_lock(task); thread = task_findtid(task, tid); if (thread != NULL) { - thread->ext_actionstate.hw_disk = policy; + thread->ext_appliedstate.hw_disk = policy; thread->ext_policystate.hw_disk = policy; } task_unlock(task); @@ -1021,22 +1239,66 @@ int proc_apply_thread_diskacc(task_t task, uint64_t tid, int policy) return(0); } -int -proc_apply_thread_selfdiskacc(int policy) +void +proc_task_remove_throttle(task_t task) { - task_t task = current_task(); - thread_t thread = current_thread(); + thread_t thread; + int importance = 0; task_lock(task); - thread->actionstate.hw_disk = policy; - thread->policystate.hw_disk = policy; - task_unlock(task); - return(0); -} -int -proc_denyinherit_policy(__unused task_t task) -{ + + /* remove processwide internal DBG application */ + proc_restore_bgtaskpolicy_internal(task, 1, 0, BASEPRI_DEFAULT); + /* remove processwide external DBG application */ + proc_restore_bgtaskpolicy_internal(task, 1, 1, BASEPRI_DEFAULT); + + for (thread = (thread_t)queue_first(&task->threads); + !queue_end(&task->threads, (queue_entry_t)thread); ) { +#if CONFIG_EMBEDDED + if (thread->task->ext_appliedstate.apptype == PROC_POLICY_IOS_APPLE_DAEMON) { + /* restore previously set importance */ + importance = thread->saved_importance; + thread->saved_importance = 0; + } +#endif /* CONFIG_EMBEDDED */ + /* remove thread level internal DBG application */ + restore_bgthreadpolicy_locked(thread, 1, importance); + /* remove thread level external DBG application */ + restore_bgthreadpolicy_locked(thread, 0, importance); + /* reset thread io policy */ + thread->ext_appliedstate.hw_disk = TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS; + thread->appliedstate.hw_disk = TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS; + unthrottle_thread(thread->uthread); + thread = (thread_t)queue_next(&thread->task_threads); + } + + /* reset task iopolicy */ + task->ext_appliedstate.hw_disk = TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS; + task->appliedstate.hw_disk = TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_FULLACCESS;
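+ /* + * Marking the task for termination below makes + * proc_get_task_selfdiskacc_internal() short-circuit to + * FULLACCESS, so no further I/O throttling is applied while + * the process is being torn down. + */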
+ task->proc_terminate = 1; + + task_unlock(task); +} + + + +int +proc_apply_thread_selfdiskacc(int policy) +{ + task_t task = current_task(); + thread_t thread = current_thread(); + + task_lock(task); + thread->appliedstate.hw_disk = policy; + thread->policystate.hw_disk = policy; + task_unlock(task); + return(0); +} + +int +proc_denyinherit_policy(__unused task_t task) +{ return(0); } @@ -1051,18 +1313,28 @@ int proc_get_task_selfgpuacc_deny(void) { task_t task = current_task(); +#ifdef NOTYET thread_t thread = current_thread(); +#endif /* NOTYET */ - if (((task->ext_actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_NOGPU) != 0) || (task->ext_actionstate.hw_gpu == TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS)) + if (((task->ext_appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_NOGPU) != 0) || (task->ext_appliedstate.hw_gpu == TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS)) return(TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS); - if (((task->actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_NOGPU) != 0) || (task->actionstate.hw_gpu == TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS)) + if (((task->appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_NOGPU) != 0) || (task->appliedstate.hw_gpu == TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS)) return(TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS); - if (((thread->ext_actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_NOGPU) != 0) || (thread->ext_actionstate.hw_gpu == TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS)) +#ifdef NOTYET + /* + * Since background dispatch items running in a thread can also be + * denied access, we need to make sure there are no unintended + * consequences of background dispatch usage. So till this is + * hashed out, disable thread level checking. + */ + if (((thread->ext_appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_NOGPU) != 0) || (thread->ext_appliedstate.hw_gpu == TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS)) return(TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS); - if (((thread->actionstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_NOGPU) != 0) || (thread->actionstate.hw_gpu == TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS)) + if (((thread->appliedstate.hw_bg & TASK_POLICY_BACKGROUND_ATTRIBUTE_NOGPU) != 0) || (thread->appliedstate.hw_gpu == TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS)) return(TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS); - return(TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NORMAL); +#endif /* NOTYET */ + return(TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_FULLACCESS); } int @@ -1073,10 +1345,10 @@ proc_apply_task_gpuacc(task_t task, int policy) task_lock(task); if (task == self) { - task->actionstate.hw_gpu = policy; + task->appliedstate.hw_gpu = policy; task->policystate.hw_gpu = policy; } else { - task->ext_actionstate.hw_gpu = policy; + task->ext_appliedstate.hw_gpu = policy; task->ext_policystate.hw_gpu = policy; } task_unlock(task); @@ -1103,10 +1375,64 @@ proc_get_task_ruse_cpu(task_t task, uint32_t * policyp, uint32_t * percentagep, return(error); } +/* + * Currently supported configurations for CPU limits.
+ * + * Deadline-based CPU limit Percentage-based CPU limit + * PROC_POLICY_RSRCACT_THROTTLE ENOTSUP Task-wide scope only + * PROC_POLICY_RSRCACT_SUSPEND Task-wide scope only ENOTSUP + * PROC_POLICY_RSRCACT_TERMINATE Task-wide scope only ENOTSUP + * PROC_POLICY_RSRCACT_NOTIFY_KQ Task-wide scope only ENOTSUP + * PROC_POLICY_RSRCACT_NOTIFY_EXC ENOTSUP Per-thread scope only + * + * A deadline-based CPU limit is actually a simple wallclock timer - the requested action is performed + * after the specified amount of wallclock time has elapsed. + * + * A percentage-based CPU limit performs the requested action after the specified amount of actual CPU time + * has been consumed -- regardless of how much wallclock time has elapsed -- by either the task as an + * aggregate entity (so-called "Task-wide" or "Proc-wide" scope, whereby the CPU time consumed by all threads + * in the task are added together), or by any one thread in the task (so-called "per-thread" scope). + * + * We support either deadline != 0 OR percentage != 0, but not both. The original intention in having them + * share an API was to use actual CPU time as the basis of the deadline-based limit (as in: perform an action + * after I have used some amount of CPU time; this is different than the recurring percentage/interval model) + * but the potential consumer of the API at the time was insisting on wallclock time instead. + * + * Currently, requesting notification via an exception is the only way to get per-thread scope for a + * CPU limit. All other types of notifications force task-wide scope for the limit. + */ int proc_set_task_ruse_cpu(task_t task, uint32_t policy, uint32_t percentage, uint64_t interval, uint64_t deadline) { int error = 0; + int scope; + + /* + * Enforce the matrix of supported configurations for policy, percentage, and deadline. + */ + switch (policy) { + // If no policy is explicitly given, the default is to throttle. 
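+ /* + * A minimal usage sketch (hypothetical kernel callers) mapping onto + * the matrix above; interval and deadline are in nanoseconds: + * + * // task-wide throttle at 50% CPU, refilled every second + * proc_set_task_ruse_cpu(task, TASK_POLICY_RESOURCE_ATTRIBUTE_THROTTLE, 50, NSEC_PER_SEC, 0); + * + * // terminate the task after 10 seconds of wallclock time + * proc_set_task_ruse_cpu(task, TASK_POLICY_RESOURCE_ATTRIBUTE_TERMINATE, 0, 0, 10 * NSEC_PER_SEC); + * + * // per-thread exception when any one thread exceeds 90% per second + * proc_set_task_ruse_cpu(task, TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY_EXC, 90, NSEC_PER_SEC, 0); + */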
+ case TASK_POLICY_RESOURCE_ATTRIBUTE_NONE: + case TASK_POLICY_RESOURCE_ATTRIBUTE_THROTTLE: + if (deadline != 0) + return (ENOTSUP); + scope = TASK_RUSECPU_FLAGS_PROC_LIMIT; + break; + case TASK_POLICY_RESOURCE_ATTRIBUTE_SUSPEND: + case TASK_POLICY_RESOURCE_ATTRIBUTE_TERMINATE: + case TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY_KQ: + if (percentage != 0) + return (ENOTSUP); + scope = TASK_RUSECPU_FLAGS_DEADLINE; + break; + case TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY_EXC: + if (deadline != 0) + return (ENOTSUP); + scope = TASK_RUSECPU_FLAGS_PERTHR_LIMIT; + break; + default: + return (EINVAL); + } task_lock(task); if (task != current_task()) { @@ -1114,11 +1440,47 @@ proc_set_task_ruse_cpu(task_t task, uint32_t policy, uint32_t percentage, uint64 } else { task->policystate.ru_cpu = policy; } - error = task_set_cpuusage(task, percentage, interval, deadline); + error = task_set_cpuusage(task, percentage, interval, deadline, scope); task_unlock(task); return(error); } +int +proc_clear_task_ruse_cpu(task_t task) +{ + int error = 0; + int action; + void * bsdinfo = NULL; + + task_lock(task); + if (task != current_task()) { + task->ext_policystate.ru_cpu = TASK_POLICY_RESOURCE_ATTRIBUTE_DEFAULT; + } else { + task->policystate.ru_cpu = TASK_POLICY_RESOURCE_ATTRIBUTE_DEFAULT; + } + + error = task_clear_cpuusage_locked(task); + if (error != 0) + goto out; + + action = task->ext_appliedstate.ru_cpu; + if (task->ext_appliedstate.ru_cpu != TASK_POLICY_RESOURCE_ATTRIBUTE_NONE) { + /* reset action */ + task->ext_appliedstate.ru_cpu = TASK_POLICY_RESOURCE_ATTRIBUTE_NONE; + } + if (action != TASK_POLICY_RESOURCE_ATTRIBUTE_NONE) { + bsdinfo = task->bsd_info; + task_unlock(task); + proc_restore_resource_actions(bsdinfo, TASK_POLICY_CPU_RESOURCE_USAGE, action); + goto out1; + } + +out: + task_unlock(task); +out1: + return(error); + +} /* used to apply resource limit related actions */ static int @@ -1144,11 +1506,14 @@ task_apply_resource_actions(task_t task, int type) /* only cpu actions for now */ task_lock(task); - if (task->ext_actionstate.ru_cpu == TASK_POLICY_RESOURCE_ATTRIBUTE_NONE) { + if (task->ext_appliedstate.ru_cpu == TASK_POLICY_RESOURCE_ATTRIBUTE_NONE) { /* apply action */ - task->ext_actionstate.ru_cpu = task->ext_policystate.ru_cpu; - action = task->ext_actionstate.ru_cpu; + task->ext_appliedstate.ru_cpu = task->ext_policystate.ru_cpu; + action = task->ext_appliedstate.ru_cpu; + } else { + action = task->ext_appliedstate.ru_cpu; } + if (action != TASK_POLICY_RESOURCE_ATTRIBUTE_NONE) { bsdinfo = task->bsd_info; task_unlock(task); @@ -1159,76 +1524,660 @@ task_apply_resource_actions(task_t task, int type) return(0); } +/* For ledger hookups */ +static int +task_get_cpuusage(task_t task, uint32_t * percentagep, uint64_t * intervalp, uint64_t * deadlinep) +{ + *percentagep = task->rusage_cpu_percentage; + *intervalp = task->rusage_cpu_interval; + *deadlinep = task->rusage_cpu_deadline; + + return(0); +} + int -task_restore_resource_actions(task_t task, int type) +task_set_cpuusage(task_t task, uint64_t percentage, uint64_t interval, uint64_t deadline, int scope) { - int action; - void * bsdinfo = NULL; - - switch (type) { - case TASK_POLICY_CPU_RESOURCE_USAGE: - break; - case TASK_POLICY_WIREDMEM_RESOURCE_USAGE: - case TASK_POLICY_VIRTUALMEM_RESOURCE_USAGE: - case TASK_POLICY_DISK_RESOURCE_USAGE: - case TASK_POLICY_NETWORK_RESOURCE_USAGE: - case TASK_POLICY_POWER_RESOURCE_USAGE: - return(0); + uint64_t abstime = 0; + uint64_t save_abstime = 0; + uint64_t limittime = 0; + thread_t thread; - default: - 
return(1); - }; + lck_mtx_assert(&task->lock, LCK_MTX_ASSERT_OWNED); + + /* By default, refill once per second */ + if (interval == 0) + interval = NSEC_PER_SEC; + + if (percentage != 0) { + if (percentage > 100) + percentage = 100; + limittime = (interval * percentage)/ 100; + nanoseconds_to_absolutetime(limittime, &abstime); + if (scope == TASK_RUSECPU_FLAGS_PERTHR_LIMIT) { + /* + * A per-thread CPU limit on a task generates an exception + * (LEDGER_ACTION_EXCEPTION) if any one thread in the task + * exceeds the limit. + */ + task->rusage_cpu_flags |= TASK_RUSECPU_FLAGS_PERTHR_LIMIT; + task->rusage_cpu_perthr_percentage = percentage; + task->rusage_cpu_perthr_interval = interval; + queue_iterate(&task->threads, thread, thread_t, task_threads) { + set_astledger(thread); + } + } else if (scope == TASK_RUSECPU_FLAGS_PROC_LIMIT) { + /* + * Currently, a proc-wide CPU limit always blocks if the limit is + * exceeded (LEDGER_ACTION_BLOCK). + */ + task->rusage_cpu_flags |= TASK_RUSECPU_FLAGS_PROC_LIMIT; + task->rusage_cpu_percentage = percentage; + task->rusage_cpu_interval = interval; + + ledger_set_limit(task->ledger, task_ledgers.cpu_time, abstime); + ledger_set_period(task->ledger, task_ledgers.cpu_time, interval); + ledger_set_action(task->ledger, task_ledgers.cpu_time, LEDGER_ACTION_BLOCK); + } + } - /* only cpu actions for now */ - task_lock(task); - - action = task->ext_actionstate.ru_cpu; - if (task->ext_actionstate.ru_cpu != TASK_POLICY_RESOURCE_ATTRIBUTE_NONE) { - /* reset action */ - task->ext_actionstate.ru_cpu = TASK_POLICY_RESOURCE_ATTRIBUTE_NONE; + if (deadline != 0) { + assert(scope == TASK_RUSECPU_FLAGS_DEADLINE); + + /* if already in use, cancel and wait for it to cleanout */ + if (task->rusage_cpu_callt != NULL) { + task_unlock(task); + thread_call_cancel_wait(task->rusage_cpu_callt); + task_lock(task); + } + if (task->rusage_cpu_callt == NULL) { + task->rusage_cpu_callt = thread_call_allocate_with_priority(task_action_cpuusage, (thread_call_param_t)task, THREAD_CALL_PRIORITY_KERNEL); + } + /* setup callout */ + if (task->rusage_cpu_callt != 0) { + task->rusage_cpu_flags |= TASK_RUSECPU_FLAGS_DEADLINE; + task->rusage_cpu_deadline = deadline; + + nanoseconds_to_absolutetime(deadline, &abstime); + save_abstime = abstime; + clock_absolutetime_interval_to_deadline(save_abstime, &abstime); + thread_call_enter_delayed(task->rusage_cpu_callt, abstime); + } } - if (action != TASK_POLICY_RESOURCE_ATTRIBUTE_NONE) { - bsdinfo = task->bsd_info; - task_unlock(task); - proc_restore_resource_actions(bsdinfo, TASK_POLICY_CPU_RESOURCE_USAGE, action); - } else - task_unlock(task); return(0); - } -/* For ledger hookups */ -static int -task_get_cpuusage(__unused task_t task, uint32_t * percentagep, uint64_t * intervalp, uint64_t * deadlinep) +int +task_clear_cpuusage(task_t task) { - *percentagep = 0; - *intervalp = 0; - *deadlinep = 0; + int retval = 0; - return(0); + task_lock(task); + retval = task_clear_cpuusage_locked(task); + task_unlock(task); + + return(retval); } -static int -task_set_cpuusage(__unused task_t task, __unused uint32_t percentage, __unused uint64_t interval, __unused uint64_t deadline) +int +task_clear_cpuusage_locked(task_t task) { + thread_call_t savecallt; + thread_t thread; + + /* cancel percentage handling if set */ + if (task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PROC_LIMIT) { + task->rusage_cpu_flags &= ~TASK_RUSECPU_FLAGS_PROC_LIMIT; + ledger_set_limit(task->ledger, task_ledgers.cpu_time, LEDGER_LIMIT_INFINITY); + task->rusage_cpu_percentage = 0; + 
task->rusage_cpu_interval = 0; + } + + if (task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PERTHR_LIMIT) { + task->rusage_cpu_flags &= ~TASK_RUSECPU_FLAGS_PERTHR_LIMIT; + queue_iterate(&task->threads, thread, thread_t, task_threads) { + set_astledger(thread); + } + task->rusage_cpu_perthr_percentage = 0; + task->rusage_cpu_perthr_interval = 0; + + } + + /* cancel deadline handling if set */ + if (task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_DEADLINE) { + task->rusage_cpu_flags &= ~TASK_RUSECPU_FLAGS_DEADLINE; + if (task->rusage_cpu_callt != 0) { + savecallt = task->rusage_cpu_callt; + task->rusage_cpu_callt = NULL; + task->rusage_cpu_deadline = 0; + task_unlock(task); + thread_call_cancel_wait(savecallt); + thread_call_free(savecallt); + task_lock(task); + } + } return(0); } /* called by ledger unit to enforce action due to resource usage criteria being met */ +void +task_action_cpuusage(thread_call_param_t param0, __unused thread_call_param_t param1) +{ + task_t task = (task_t)param0; + (void)task_apply_resource_actions(task, TASK_POLICY_CPU_RESOURCE_USAGE); + return; +} + +#if CONFIG_EMBEDDED +/* return the appstate of a task */ int -task_action_cpuusage(task_t task) +proc_lf_getappstate(task_t task) { - return(task_apply_resource_actions(task, TASK_POLICY_CPU_RESOURCE_USAGE)); + return(task->appstate); +} + +/* set appstate of a task and apply appropriate actions */ +int +proc_lf_setappstate(task_t task, int state) +{ + int ret = 0, oldstate; + kern_return_t kret = KERN_SUCCESS; + int applywatch = 0, setbg = 0, setnetbg = 0; + int sethib_suspend = 0, sethib_resume = 0; + + if (state == TASK_APPSTATE_NONE) + goto out; + + /* valid states? */ + switch (state) { + case TASK_APPSTATE_ACTIVE: + case TASK_APPSTATE_BACKGROUND: + case TASK_APPSTATE_NONUI: + case TASK_APPSTATE_INACTIVE: + break; + default: + ret = EINVAL; + goto out; + + } + + task_lock(task); + oldstate = task->appstate; + if (oldstate == state) { + /* no changes */ + goto out1; + } + + switch(oldstate) { + case TASK_APPSTATE_ACTIVE: + switch(state) { + case TASK_APPSTATE_BACKGROUND: + /* moving from active to app background */ + task->ext_policystate.hw_bg = TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL; + proc_apply_bgtaskpolicy_internal(task, 1, 1); + /* watchers need update */ + applywatch = 1; + setbg = 1; + /* set network part */ + setnetbg = 1; + break; + + case TASK_APPSTATE_NONUI: + /* set no graphics */ + task->ext_policystate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS; + task->ext_appliedstate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS; + break; + + case TASK_APPSTATE_INACTIVE: + /* suspend the process */ + kret = task_pidsuspend_locked(task); + if (kret != KERN_SUCCESS) + ret = EINVAL; + else + sethib_suspend = 1; + + break; + } + break; + + case TASK_APPSTATE_BACKGROUND: + switch(state) { + case TASK_APPSTATE_ACTIVE: + /* remove app background; going from BG to active */ + ret = proc_restore_bgtaskpolicy_internal(task, 1, 1, BASEPRI_DEFAULT); + /* watchers need update */ + applywatch = 1; + setbg = 0; + /* set network part */ + setnetbg = 1; + break; + + case TASK_APPSTATE_NONUI: + /* remove app background + no graphics */ + task->ext_policystate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS; + task->ext_appliedstate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS; + ret = proc_restore_bgtaskpolicy_internal(task, 1, 1, BASEPRI_DEFAULT); + break; + + case TASK_APPSTATE_INACTIVE: + /* suspend and then remove app background */ + kret = task_pidsuspend_locked(task); + if (kret != KERN_SUCCESS) {
ret = EINVAL; + } else { + ret = proc_restore_bgtaskpolicy_internal(task, 1, 1, BASEPRI_DEFAULT); + sethib_suspend = 1; + } + + break; + + } + break; + + case TASK_APPSTATE_NONUI: + switch(state) { + case TASK_APPSTATE_ACTIVE: + /* restore graphics access */ + task->ext_policystate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_DEFAULT; + task->ext_appliedstate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_DEFAULT; + break; + + case TASK_APPSTATE_BACKGROUND: + /* set app background */ + task->ext_policystate.hw_bg = TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL; + + ret = proc_apply_bgtaskpolicy_internal(task, 1, 1); + if (ret == 0) { + task->ext_policystate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_DEFAULT; + task->ext_appliedstate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_DEFAULT; + } + /* watchers need update */ + applywatch = 1; + setbg = 1; + /* set network part */ + setnetbg = 1; + break; + + case TASK_APPSTATE_INACTIVE: + /* suspend & restore graphics access */ + kret = task_pidsuspend_locked(task); + if (kret != KERN_SUCCESS) { + ret = EINVAL; + } else { + ret = proc_restore_bgtaskpolicy_internal(task, 1, 1, BASEPRI_DEFAULT); + task->ext_policystate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_DEFAULT; + task->ext_appliedstate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_DEFAULT; + sethib_suspend = 1; + } + break; + } + break; + + case TASK_APPSTATE_INACTIVE: + switch(state) { + case TASK_APPSTATE_ACTIVE: + /* resume process */ + /* going from inactive to active */ + break; + + case TASK_APPSTATE_BACKGROUND: + task->ext_policystate.hw_bg = TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL; + ret = proc_apply_bgtaskpolicy_internal(task, 1, 1); + /* put in app background & resume process */ + /* watchers need update */ + applywatch = 1; + setbg = 1; + /* set network part */ + setnetbg = 1; + break; + + case TASK_APPSTATE_NONUI: + /* remove graphics access and resume */ + task->ext_policystate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS; + task->ext_appliedstate.hw_gpu = TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS; + break; + } + /* pidresume does drop task lock, so no need to have locked version */ + task_unlock(task); + kret = task_pidresume(task); + task_lock(task); + sethib_resume = 1; + break; + } + /* set the new app state on the task */ + task->appstate = state; +out1: + task_unlock(task); + if (setnetbg != 0) { + /* apply network background */ + if (setbg != 0) + proc_apply_task_networkbg_internal(task->bsd_info, NULL); + else + proc_restore_task_networkbg_internal(task->bsd_info, NULL); + } +#if CONFIG_MEMORYSTATUS + if (sethib_suspend != 0) + memorystatus_on_suspend(proc_pid(task->bsd_info)); + if (sethib_resume != 0) + memorystatus_on_resume(proc_pid(task->bsd_info)); +#endif /* CONFIG_MEMORYSTATUS */ + /* if watchers need update, safe point to do that */ + if (applywatch != 0) + apply_appstate_watchers(task, setbg); + +out: + return(ret); +} + +static void +task_watch_lock(void) +{ + lck_mtx_lock(&task_watch_mtx); +} + +static void +task_watch_unlock(void) +{ + lck_mtx_unlock(&task_watch_mtx); +} + +static void +add_taskwatch_locked(task_t task, task_watch_t * twp) +{ + queue_enter(&task->task_watchers, twp, task_watch_t *, tw_links); + task->num_taskwatchers++; + +} + +static void +remove_taskwatch_locked(task_t task, task_watch_t * twp) +{ + queue_remove(&task->task_watchers, twp, task_watch_t *, tw_links); + task->num_taskwatchers--; +} + + +int +proc_lf_pidbind(task_t curtask, uint64_t tid, task_t target_task, int bind) +{ + thread_t self = current_thread(); + thread_t target_thread =
NULL; + int selfset = 0, ret = 0, setbg = 0; + task_watch_t *twp = NULL; + task_t task = TASK_NULL; + + + if ((tid == 0) || (tid == self->thread_id)) { + selfset = 1; + target_thread = self; + thread_reference(target_thread); + } else { + task_lock(curtask); + target_thread = task_findtid(curtask, tid); + if (target_thread != NULL) + thread_reference(target_thread); + else { + ret = ESRCH; + task_unlock(curtask); + goto out; + } + + task_unlock(curtask); + } + + if (bind != 0) { + /* task is still active ? */ + task_lock(target_task); + if (target_task->active == 0) { + task_unlock(target_task); + ret = ESRCH; + goto out; + } + task_unlock(target_task); + + twp = (task_watch_t *)kalloc(sizeof(task_watch_t)); + if (twp == NULL) { + ret = ENOMEM; + goto out; + } + + bzero(twp, sizeof(task_watch_t)); + + task_watch_lock(); + + if (target_thread->taskwatch != NULL) { + /* already bound to another task */ + task_watch_unlock(); + + kfree(twp, sizeof(task_watch_t)); + ret = EBUSY; + goto out; + } + + task_reference(target_task); + + twp->tw_task = target_task; /* holds the task reference */ + twp->tw_thread = target_thread; /* holds the thread reference */ + twp->tw_state = target_task->appstate; + twp->tw_importance = target_thread->importance; + + add_taskwatch_locked(target_task, twp); + + target_thread->taskwatch = twp; + + if (target_task->appstate == TASK_APPSTATE_BACKGROUND) + setbg = 1; + + task_watch_unlock(); + + if (setbg != 0) { + set_thread_appbg(target_thread, setbg, INT_MIN); + } + + /* retain the thread reference as it is in twp */ + target_thread = NULL; + } else { + /* unbind */ + task_watch_lock(); + if ((twp = target_thread->taskwatch) != NULL) { + task = twp->tw_task; + target_thread->taskwatch = NULL; + remove_taskwatch_locked(task, twp); + + task_watch_unlock(); + + task_deallocate(task); /* drop task ref in twp */ + set_thread_appbg(target_thread, 0, twp->tw_importance); + thread_deallocate(target_thread); /* drop thread ref in twp */ + kfree(twp, sizeof(task_watch_t)); + } else { + task_watch_unlock(); + ret = 0; /* return success if it is not already bound */ + goto out; + } + } +out: + if (target_thread != NULL) + thread_deallocate(target_thread); /* drop thread ref acquired in this routine */ + return(ret); +} + +static void +set_thread_appbg(thread_t thread, int setbg, int importance) +{ + /* TBD: ensure the proc for network is fine */ + if (setbg == 0) { + restore_bgthreadpolicy_locked(thread, 0, importance); + proc_restore_task_networkbg_internal(thread->task->bsd_info, thread); + } else { + apply_bgthreadpolicy_external(thread); + proc_apply_task_networkbg_internal(thread->task->bsd_info, thread); + } +} + +static void +apply_appstate_watchers(task_t task, int setbg) +{ + int numwatchers = 0, i, j; + thread_watchlist_t * threadlist; + task_watch_t * twp; + +retry: + /* if no watchers on the list return */ + if ((numwatchers = task->num_taskwatchers) == 0) + return; + + threadlist = (thread_watchlist_t *)kalloc(numwatchers*sizeof(thread_watchlist_t)); + if (threadlist == NULL) + return; + + bzero(threadlist, numwatchers*sizeof(thread_watchlist_t)); + + task_watch_lock(); + /* serialize application of app state changes */ + if (task->watchapplying != 0) { + lck_mtx_sleep(&task_watch_mtx, LCK_SLEEP_DEFAULT, &task->watchapplying, THREAD_UNINT); + task_watch_unlock(); + kfree(threadlist, numwatchers*sizeof(thread_watchlist_t)); + goto retry; + } + + if (numwatchers != task->num_taskwatchers) { + task_watch_unlock(); + kfree(threadlist, numwatchers*sizeof(thread_watchlist_t));
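+ /* + * The watcher count changed while the list was unlocked for + * allocation; this snapshot is stale, so free it and take it again. + */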
+ goto retry; + } + + task->watchapplying = 1; + i = 0; + queue_iterate(&task->task_watchers, twp, task_watch_t *, tw_links) { + + threadlist[i].thread = twp->tw_thread; + thread_reference(threadlist[i].thread); + if (setbg != 0) { + twp->tw_importance = twp->tw_thread->importance; + threadlist[i].importance = INT_MIN; + } else + threadlist[i].importance = twp->tw_importance; + i++; + if (i > numwatchers) + break; + } + task_watch_unlock(); + + for (j = 0; j < i; j++) { + set_thread_appbg(threadlist[j].thread, setbg, threadlist[j].importance); + thread_deallocate(threadlist[j].thread); + } + kfree(threadlist, numwatchers*sizeof(thread_watchlist_t)); + + + task_watch_lock(); + task->watchapplying = 0; + thread_wakeup_one(&task->watchapplying); + task_watch_unlock(); +} + +void +thead_remove_taskwatch(thread_t thread) +{ + task_watch_t * twp; + int importance = 0; + + task_watch_lock(); + if ((twp = thread->taskwatch) != NULL) { + thread->taskwatch = NULL; + remove_taskwatch_locked(twp->tw_task, twp); + } + task_watch_unlock(); + if (twp != NULL) { + thread_deallocate(twp->tw_thread); + task_deallocate(twp->tw_task); + importance = twp->tw_importance; + kfree(twp, sizeof(task_watch_t)); + /* remove the thread and networkbg */ + set_thread_appbg(thread, 0, importance); + } +} + +void +task_removewatchers(task_t task) +{ + int numwatchers = 0, i, j; + task_watch_t ** twplist = NULL; + task_watch_t * twp = NULL; + +retry: + if ((numwatchers = task->num_taskwatchers) == 0) + return; + + twplist = (task_watch_t **)kalloc(numwatchers*sizeof(task_watch_t *)); + if (twplist == NULL) + return; + + bzero(twplist, numwatchers*sizeof(task_watch_t *)); + + task_watch_lock(); + if (task->num_taskwatchers == 0) { + task_watch_unlock(); + goto out; + } + + if (numwatchers != task->num_taskwatchers) { + task_watch_unlock(); + kfree(twplist, numwatchers*sizeof(task_watch_t *)); + numwatchers = 0; + goto retry; + } + + i = 0; + while((twp = (task_watch_t *)dequeue_head(&task->task_watchers)) != NULL) + { + twplist[i] = twp; + task->num_taskwatchers--; + + /* + * Since the linkage is removed and thread state cleanup is already set up, + * remove the reference from the thread.
+ */ + twp->tw_thread->taskwatch = NULL; /* removed linkage, clear thread holding ref */ + i++; + if ((task->num_taskwatchers == 0) || (i > numwatchers)) + break; + } + + task_watch_unlock(); + + for (j = 0; j < i; j++) { + + twp = twplist[j]; + /* remove thread and network bg */ + set_thread_appbg(twp->tw_thread, 0, twp->tw_importance); + thread_deallocate(twp->tw_thread); + task_deallocate(twp->tw_task); + kfree(twp, sizeof(task_watch_t)); + } + +out: + kfree(twplist, numwatchers*sizeof(task_watch_t *)); + +} +#endif /* CONFIG_EMBEDDED */ + + int proc_disable_task_apptype(task_t task, int policy_subtype) { void * bsdinfo = NULL; - int setbg = 0; int ret = 0; + int setbg = 0; +#if !CONFIG_EMBEDDED int maxpri = BASEPRI_DEFAULT; +#endif /* !CONFIG_EMBEDDED */ task_lock(task); @@ -1248,34 +2197,41 @@ proc_disable_task_apptype(task_t task, int policy_subtype) default: maxpri = BASEPRI_DEFAULT; } -#endif - - if (task->ext_actionstate.apptype != PROC_POLICY_OSX_APPTYPE_NONE) { - switch (task->ext_actionstate.apptype) { + + +#endif /* !CONFIG_EMBEDDED */ + + /* TAL apps are cleared with BG handling on first foreground application */ + if (task->ext_appliedstate.apptype != PROC_POLICY_OSX_APPTYPE_NONE) { + switch (task->ext_appliedstate.apptype) { +#if !CONFIG_EMBEDDED case PROC_POLICY_OSX_APPTYPE_TAL: /* disable foreground/background handling */ - task->ext_actionstate.apptype = PROC_POLICY_OSX_APPTYPE_NONE; + task->ext_appliedstate.apptype = PROC_POLICY_OSX_APPTYPE_NONE; /* external BG application removal */ - proc_restore_bgtaskpolicy_locked(task, 1, 1, maxpri); + proc_restore_bgtaskpolicy_internal(task, 1, 1, maxpri); bsdinfo = task->bsd_info; setbg = 0; break; case PROC_POLICY_OSX_APPTYPE_DBCLIENT: /* disable foreground/background handling */ - task->ext_actionstate.apptype = PROC_POLICY_OSX_APPTYPE_NONE; + task->ext_appliedstate.apptype = PROC_POLICY_OSX_APPTYPE_NONE; /* internal BG application removal */ - proc_restore_bgtaskpolicy_locked(task, 1, 0, maxpri); + proc_restore_bgtaskpolicy_internal(task, 1, 0, maxpri); bsdinfo = task->bsd_info; setbg = 0; break; +#endif /* !CONFIG_EMBEDDED */ default: ret = EINVAL; break; } - } else + + } else { ret = EINVAL; + } out: task_unlock(task); @@ -1300,20 +2256,22 @@ proc_enable_task_apptype(task_t task, int policy_subtype) goto out; } - if (task->ext_actionstate.apptype == PROC_POLICY_OSX_APPTYPE_NONE) { + if (task->ext_appliedstate.apptype == PROC_POLICY_OSX_APPTYPE_NONE) { switch (task->ext_policystate.apptype) { +#if !CONFIG_EMBEDDED case PROC_POLICY_OSX_APPTYPE_TAL: /* TAL policy is activated again */ - task->ext_actionstate.apptype = task->ext_policystate.apptype; + task->ext_appliedstate.apptype = task->ext_policystate.apptype; if (task->role == TASK_BACKGROUND_APPLICATION) { - proc_apply_bgtaskpolicy_locked(task, 1, 1); + proc_apply_bgtaskpolicy_internal(task, 1, 1); bsdinfo = task->bsd_info; setbg = 1; } ret = 0; break; +#endif /* !CONFIG_EMBEDDED */ default: ret = EINVAL; } @@ -1329,3 +2287,18 @@ out: return(ret); } +#if CONFIG_EMBEDDED +int +proc_setthread_saved_importance(thread_t thread, int importance) +{ + if ((thread->task->ext_appliedstate.apptype == PROC_POLICY_IOS_APPLE_DAEMON) && + (thread->appliedstate.hw_bg == TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL)) + { + /* the thread is still backgrounded, save the importance for restore time */ + thread->saved_importance = importance; + + return(1); + } else + return(0); +} +#endif /* CONFIG_EMBEDDED */ diff --git a/osfmk/kern/thread.c
b/osfmk/kern/thread.c index 3c3e5ce07..2738d3850 100644 --- a/osfmk/kern/thread.c +++ b/osfmk/kern/thread.c @@ -92,6 +92,7 @@ #include #include +#include #include #include @@ -161,6 +162,10 @@ int task_threadmax = CONFIG_THREAD_MAX; static uint64_t thread_unique_id = 0; +struct _thread_ledger_indices thread_ledgers = { -1 }; +static ledger_template_t thread_ledger_template = NULL; +void init_thread_ledgers(void); + void thread_bootstrap(void) { @@ -196,7 +201,7 @@ thread_bootstrap(void) thread_template.promotions = 0; thread_template.pending_promoter_index = 0; thread_template.pending_promoter[0] = - thread_template.pending_promoter[1] = NULL; + thread_template.pending_promoter[1] = NULL; thread_template.realtime.deadline = UINT64_MAX; @@ -257,14 +262,17 @@ thread_bootstrap(void) thread_template.syscalls_unix = 0; thread_template.syscalls_mach = 0; - thread_template.tkm_private.alloc = 0; - thread_template.tkm_private.free = 0; - thread_template.tkm_shared.alloc = 0; - thread_template.tkm_shared.free = 0; - thread_template.actionstate = default_task_null_policy; - thread_template.ext_actionstate = default_task_null_policy; + thread_template.t_ledger = LEDGER_NULL; + thread_template.t_threadledger = LEDGER_NULL; + + thread_template.appliedstate = default_task_null_policy; + thread_template.ext_appliedstate = default_task_null_policy; thread_template.policystate = default_task_proc_policy; thread_template.ext_policystate = default_task_proc_policy; +#if CONFIG_EMBEDDED + thread_template.taskwatch = NULL; + thread_template.saved_importance = 0; +#endif /* CONFIG_EMBEDDED */ init_thread = thread_template; machine_set_current_thread(&init_thread); @@ -290,6 +298,8 @@ thread_init(void) * per-thread structures necessary. */ machine_thread_init(); + + init_thread_ledgers(); } static void @@ -354,6 +364,10 @@ thread_terminate_self(void) thread_policy_reset(thread); +#if CONFIG_EMBEDDED + thead_remove_taskwatch(thread); +#endif /* CONFIG_EMBEDDED */ + task = thread->task; uthread_cleanup(task, thread->uthread, task->bsd_info); threadcnt = hw_atomic_sub(&task->active_thread_count, 1); @@ -438,6 +452,11 @@ thread_deallocate( } #endif /* MACH_BSD */ + if (thread->t_ledger) + ledger_dereference(thread->t_ledger); + if (thread->t_threadledger) + ledger_dereference(thread->t_threadledger); + if (thread->kernel_stack != 0) stack_free(thread); @@ -474,7 +493,11 @@ thread_terminate_daemon(void) task_lock(task); task->total_user_time += timer_grab(&thread->user_timer); - task->total_system_time += timer_grab(&thread->system_timer); + if (thread->precise_user_kernel_time) { + task->total_system_time += timer_grab(&thread->system_timer); + } else { + task->total_user_time += timer_grab(&thread->system_timer); + } task->c_switch += thread->c_switch; task->p_switch += thread->p_switch; @@ -483,11 +506,6 @@ thread_terminate_daemon(void) task->syscalls_unix += thread->syscalls_unix; task->syscalls_mach += thread->syscalls_mach; - task->tkm_private.alloc += thread->tkm_private.alloc; - task->tkm_private.free += thread->tkm_private.free; - task->tkm_shared.alloc += thread->tkm_shared.alloc; - task->tkm_shared.free += thread->tkm_shared.free; - queue_remove(&task->threads, thread, thread_t, task_threads); task->thread_count--; @@ -669,7 +687,7 @@ thread_create_internal( return (KERN_FAILURE); } - new_thread->task = parent_task; + new_thread->task = parent_task; thread_lock_init(new_thread); wake_lock_init(new_thread); @@ -716,6 +734,18 @@ thread_create_internal( task_reference_internal(parent_task); + if 
(new_thread->task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PERTHR_LIMIT) { + /* + * This task has a per-thread CPU limit; make sure this new thread + * gets its limit set too, before it gets out of the kernel. + */ + set_astledger(new_thread); + } + new_thread->t_threadledger = LEDGER_NULL; /* per thread ledger is not inherited */ + new_thread->t_ledger = new_thread->task->ledger; + if (new_thread->t_ledger) + ledger_reference(new_thread->t_ledger); + /* Cache the task's map */ new_thread->map = parent_task->map; @@ -759,6 +789,21 @@ thread_create_internal( #endif /* CONFIG_EMBEDDED */ new_thread->importance = new_thread->priority - new_thread->task_priority; +#if CONFIG_EMBEDDED + new_thread->saved_importance = new_thread->importance; + /* apple ios daemon starts all threads in darwin background */ + if (parent_task->ext_appliedstate.apptype == PROC_POLICY_IOS_APPLE_DAEMON) { + /* Cannot use generic routines here so apply darwin background directly */ + new_thread->policystate.hw_bg = TASK_POLICY_BACKGROUND_ATTRIBUTE_ALL; + /* set thread self backgrounding */ + new_thread->appliedstate.hw_bg = new_thread->policystate.hw_bg; + /* priority will get recomputed suitably a bit later */ + new_thread->importance = INT_MIN; + /* to avoid changes to many pri compute routines, set the effect of those here */ + new_thread->priority = MAXPRI_THROTTLE; + } +#endif /* CONFIG_EMBEDDED */ + #if defined(CONFIG_SCHED_TRADITIONAL) new_thread->sched_stamp = sched_tick; new_thread->pri_shift = sched_pri_shift; @@ -774,16 +819,16 @@ thread_create_internal( kdbg_trace_data(parent_task->bsd_info, &dbg_arg2); - KERNEL_DEBUG_CONSTANT( - TRACEDBG_CODE(DBG_TRACE_DATA, 1) | DBG_FUNC_NONE, - (vm_address_t)(uintptr_t)thread_tid(new_thread), dbg_arg2, 0, 0, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + TRACEDBG_CODE(DBG_TRACE_DATA, 1) | DBG_FUNC_NONE, + (vm_address_t)(uintptr_t)thread_tid(new_thread), dbg_arg2, 0, 0, 0); kdbg_trace_string(parent_task->bsd_info, &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4); - KERNEL_DEBUG_CONSTANT( - TRACEDBG_CODE(DBG_TRACE_STRING, 1) | DBG_FUNC_NONE, - dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + TRACEDBG_CODE(DBG_TRACE_STRING, 1) | DBG_FUNC_NONE, + dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0); } DTRACE_PROC1(lwp__create, thread_t, *out_thread); @@ -1026,7 +1071,7 @@ kernel_thread_start( return kernel_thread_start_priority(continuation, parameter, -1, new_thread); } -#ifndef __LP64__ +#if defined(__i386__) thread_t kernel_thread( @@ -1048,7 +1093,7 @@ kernel_thread( return (thread); } -#endif /* __LP64__ */ +#endif /* defined(__i386__) */ kern_return_t thread_info_internal( @@ -1270,14 +1315,29 @@ thread_read_times( { clock_sec_t secs; clock_usec_t usecs; + uint64_t tval_user, tval_system; - absolutetime_to_microtime(timer_grab(&thread->user_timer), &secs, &usecs); - user_time->seconds = (typeof(user_time->seconds))secs; - user_time->microseconds = usecs; + tval_user = timer_grab(&thread->user_timer); + tval_system = timer_grab(&thread->system_timer); - absolutetime_to_microtime(timer_grab(&thread->system_timer), &secs, &usecs); - system_time->seconds = (typeof(system_time->seconds))secs; - system_time->microseconds = usecs; + if (thread->precise_user_kernel_time) { + absolutetime_to_microtime(tval_user, &secs, &usecs); + user_time->seconds = (typeof(user_time->seconds))secs; + user_time->microseconds = usecs; + + absolutetime_to_microtime(tval_system, &secs, &usecs); + system_time->seconds = (typeof(system_time->seconds))secs; +
system_time->microseconds = usecs; + } else { + /* system_timer may represent either sys or user */ + tval_user += tval_system; + absolutetime_to_microtime(tval_user, &secs, &usecs); + user_time->seconds = (typeof(user_time->seconds))secs; + user_time->microseconds = usecs; + + system_time->seconds = 0; + system_time->microseconds = 0; + } } kern_return_t @@ -1369,6 +1429,128 @@ thread_wire( return (thread_wire_internal(host_priv, thread, wired, NULL)); } +static void +thread_resource_exception(const void *arg0, __unused const void *arg1) +{ + thread_t thread = current_thread(); + int code = (int)((uintptr_t)arg0 & ((int)-1)); + + assert(thread->t_threadledger != LEDGER_NULL); + + /* + * Disable the exception notification so we don't overwhelm + * the listener with an endless stream of redundant exceptions. + */ + ledger_set_action(thread->t_threadledger, thread_ledgers.cpu_time, + LEDGER_ACTION_IGNORE); + ledger_disable_callback(thread->t_threadledger, thread_ledgers.cpu_time); + + /* XXX code should eventually be a user-exported namespace of resources */ + (void) task_exception_notify(EXC_RESOURCE, code, 0); +} + +void +init_thread_ledgers(void) { + ledger_template_t t; + int idx; + + assert(thread_ledger_template == NULL); + + if ((t = ledger_template_create("Per-thread ledger")) == NULL) + panic("couldn't create thread ledger template"); + + if ((idx = ledger_entry_add(t, "cpu_time", "sched", "ns")) < 0) { + panic("couldn't create cpu_time entry for thread ledger template"); + } + + if (ledger_set_callback(t, idx, thread_resource_exception, + (void *)(uintptr_t)idx, NULL) < 0) { + panic("couldn't set thread ledger callback for cpu_time entry"); + } + + thread_ledgers.cpu_time = idx; + thread_ledger_template = t; +} + +/* + * Set CPU usage limit on a thread. + * + * Calling with percentage of 0 will unset the limit for this thread. + */ + +int +thread_set_cpulimit(int action, uint8_t percentage, uint64_t interval_ns) +{ + thread_t thread = current_thread(); + ledger_t l; + uint64_t limittime = 0; + uint64_t abstime = 0; + + assert(percentage <= 100); + + if (percentage == 0) { + /* + * Remove CPU limit, if any exists. + */ + if (thread->t_threadledger != LEDGER_NULL) { + /* + * The only way to get a per-thread ledger is via CPU limits. + */ + assert(thread->options & (TH_OPT_PROC_CPULIMIT | TH_OPT_PRVT_CPULIMIT)); + ledger_dereference(thread->t_threadledger); + thread->t_threadledger = LEDGER_NULL; + thread->options &= ~(TH_OPT_PROC_CPULIMIT | TH_OPT_PRVT_CPULIMIT); + } + + return (0); + } + + l = thread->t_threadledger; + if (l == LEDGER_NULL) { + /* + * This thread doesn't yet have a per-thread ledger; so create one with the CPU time entry active. + */ + if ((l = ledger_instantiate(thread_ledger_template, LEDGER_CREATE_INACTIVE_ENTRIES)) == LEDGER_NULL) + return (KERN_RESOURCE_SHORTAGE); + + /* + * We are the first to create this thread's ledger, so only activate our entry. + */ + ledger_entry_setactive(l, thread_ledgers.cpu_time); + thread->t_threadledger = l; + } + + /* + * The limit is specified as a percentage of CPU over an interval in nanoseconds. + * Calculate the amount of CPU time that the thread needs to consume in order to hit the limit. + */ + limittime = (interval_ns * percentage) / 100; + nanoseconds_to_absolutetime(limittime, &abstime); + ledger_set_limit(l, thread_ledgers.cpu_time, abstime); + /* + * Refill the thread's allotted CPU time every interval_ns nanoseconds. 
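+ * For example, percentage == 50 with interval_ns == NSEC_PER_SEC + * allots the thread 500ms of CPU time per one-second period.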
+ */ + ledger_set_period(l, thread_ledgers.cpu_time, interval_ns); + + /* + * Ledgers support multiple actions for one ledger entry, so we do too. + */ + if (action == THREAD_CPULIMIT_EXCEPTION) { + thread->options |= TH_OPT_PROC_CPULIMIT; + ledger_set_action(l, thread_ledgers.cpu_time, LEDGER_ACTION_EXCEPTION); + } + + if (action == THREAD_CPULIMIT_BLOCK) { + thread->options |= TH_OPT_PRVT_CPULIMIT; + /* The per-thread ledger template by default has a callback for CPU time */ + ledger_disable_callback(l, thread_ledgers.cpu_time); + ledger_set_action(l, thread_ledgers.cpu_time, LEDGER_ACTION_BLOCK); + } + + thread->t_threadledger = l; + return (0); +} + int split_funnel_off = 0; lck_grp_t *funnel_lck_grp = LCK_GRP_NULL; lck_grp_attr_t *funnel_lck_grp_attr; @@ -1603,12 +1785,6 @@ vm_offset_t dtrace_get_kernel_stack(thread_t thread) int64_t dtrace_calc_thread_recent_vtime(thread_t thread) { -#if STAT_TIME - if (thread != THREAD_NULL) { - return timer_grab(&(thread->system_timer)) + timer_grab(&(thread->user_timer)); - } else - return 0; -#else if (thread != THREAD_NULL) { processor_t processor = current_processor(); uint64_t abstime = mach_absolute_time(); @@ -1620,7 +1796,6 @@ int64_t dtrace_calc_thread_recent_vtime(thread_t thread) (abstime - timer->tstamp); /* XXX need interrupts off to prevent missed time? */ } else return 0; -#endif } void dtrace_set_thread_predcache(thread_t thread, uint32_t predcache) diff --git a/osfmk/kern/thread.h b/osfmk/kern/thread.h index 916391593..b497fa3fa 100644 --- a/osfmk/kern/thread.h +++ b/osfmk/kern/thread.h @@ -147,6 +147,8 @@ struct thread { #define TH_OPT_VMPRIV 0x04 /* may allocate reserved memory */ #define TH_OPT_DTRACE 0x08 /* executing under dtrace_probe */ #define TH_OPT_SYSTEM_CRITICAL 0x10 /* Thread must always be allowed to run - even under heavy load */ +#define TH_OPT_PROC_CPULIMIT 0x20 /* Thread has a task-wide CPU limit applied to it */ +#define TH_OPT_PRVT_CPULIMIT 0x40 /* Thread has a thread-private CPU limit applied to it */ /* Data updated during assert_wait/thread_wakeup */ decl_simple_lock_data(,sched_lock) /* scheduling lock (thread_lock()) */ @@ -203,6 +205,13 @@ struct thread { #define TH_SFLAG_PRI_UPDATE 0x0100 /* Updating priority */ #define TH_SFLAG_EAGERPREEMPT 0x0200 /* Any preemption of this thread should be treated as if AST_URGENT applied */ +/* + * A thread can either be completely unthrottled, about to be throttled, + * throttled (TH_SFLAG_THROTTLED), or about to be unthrottled + */ +#define TH_SFLAG_PENDING_THROTTLE_DEMOTION 0x1000 /* Pending sched_mode demotion */ +#define TH_SFLAG_PENDING_THROTTLE_PROMOTION 0x2000 /* Pending sched_mode promotion */ +#define TH_SFLAG_PENDING_THROTTLE_MASK (TH_SFLAG_PENDING_THROTTLE_DEMOTION | TH_SFLAG_PENDING_THROTTLE_PROMOTION) integer_t sched_pri; /* scheduled (current) priority */ integer_t priority; /* base priority */ @@ -268,6 +277,7 @@ struct thread { uint32_t ps_switch; /* total pset switches */ /* Timing data structures */ + int precise_user_kernel_time; /* precise user/kernel enabled for this thread */ timer_data_t user_timer; /* user mode timer */ uint64_t user_timer_save; /* saved user timer value */ uint64_t system_timer_save; /* saved system timer value */ @@ -382,13 +392,23 @@ struct thread { int64_t t_dtrace_vtime; #endif -#define T_CHUD_MARKED 0x1 /* this thread is marked by CHUD */ -#define T_IN_CHUD 0x2 /* this thread is already in a CHUD handler */ -#define THREAD_PMC_FLAG 0x4 /* Bit in "t_chud" signifying PMC interest */ uint32_t t_page_creation_count;
clock_sec_t t_page_creation_time; +#define T_CHUD_MARKED 0x01 /* this thread is marked by CHUD */ +#define T_IN_CHUD 0x02 /* this thread is already in a CHUD handler */ +#define THREAD_PMC_FLAG 0x04 /* Bit in "t_chud" signifying PMC interest */ +#define T_AST_CALLSTACK 0x08 /* Thread scheduled to dump a + * callstack on its next + * AST */ +#define T_AST_NAME 0x10 /* Thread scheduled to dump + * its name on its next + * AST */ +#define T_NAME_DONE 0x20 /* Thread has previously + * recorded its name */ + uint32_t t_chud; /* CHUD flags, used for Shark */ + uint32_t chud_c_switch; /* last dispatch detection */ integer_t mutex_count; /* total count of locks held */ @@ -397,12 +417,16 @@ struct thread { /* Statistics accumulated per-thread and aggregated per-task */ uint32_t syscalls_unix; uint32_t syscalls_mach; - zinfo_usage_store_t tkm_private; /* private kernel memory allocs/frees */ - zinfo_usage_store_t tkm_shared; /* shared kernel memory allocs/frees */ - struct process_policy ext_actionstate; /* externally applied actions */ + ledger_t t_ledger; + ledger_t t_threadledger; /* per thread ledger */ + struct process_policy ext_appliedstate; /* externally applied actions */ struct process_policy ext_policystate; /* externally defined process policy states */ - struct process_policy actionstate; /* self applied actions */ + struct process_policy appliedstate; /* self applied actions */ struct process_policy policystate; /* process wide policy states */ +#if CONFIG_EMBEDDED + task_watch_t * taskwatch; /* task watch */ + integer_t saved_importance; /* saved task-relative importance */ +#endif /* CONFIG_EMBEDDED */ }; #define ith_state saved.receive.state @@ -495,21 +519,6 @@ extern void stack_collect(void); extern void stack_init(void) __attribute__((section("__TEXT, initcode"))); -extern kern_return_t thread_state_initialize( - thread_t thread); - -extern kern_return_t thread_setstatus( - thread_t thread, - int flavor, - thread_state_t tstate, - mach_msg_type_number_t count); - -extern kern_return_t thread_getstatus( - thread_t thread, - int flavor, - thread_state_t tstate, - mach_msg_type_number_t *count); - extern kern_return_t thread_info_internal( thread_t thread, thread_flavor_t flavor, @@ -588,8 +597,6 @@ extern void machine_thread_destroy( extern void machine_set_current_thread( thread_t thread); -extern void machine_thread_terminate_self(void); - extern kern_return_t machine_thread_get_kern_state( thread_t thread, thread_flavor_t flavor, @@ -658,13 +665,13 @@ __END_DECLS __BEGIN_DECLS -#ifndef __LP64__ +#if defined(__i386__) extern thread_t kernel_thread( task_t task, void (*start)(void)); -#endif /* __LP64__ */ +#endif /* defined(__i386__) */ extern uint64_t thread_tid( thread_t thread); @@ -680,6 +687,21 @@ __BEGIN_DECLS #ifdef XNU_KERNEL_PRIVATE +extern kern_return_t thread_state_initialize( + thread_t thread); + +extern kern_return_t thread_setstatus( + thread_t thread, + int flavor, + thread_state_t tstate, + mach_msg_type_number_t count); + +extern kern_return_t thread_getstatus( + thread_t thread, + int flavor, + thread_state_t tstate, + mach_msg_type_number_t *count); + extern kern_return_t thread_create_workq( task_t task, thread_continue_t thread_return, @@ -688,6 +710,23 @@ extern kern_return_t thread_create_workq( extern void thread_yield_internal( mach_msg_timeout_t interval); +/* + * Thread-private CPU limits: apply a private CPU limit to this thread only. Available actions are: + * + * 1) Block. Prevent CPU consumption of the thread from exceeding the limit.
+ * 2) Exception. Generate a resource consumption exception when the limit is exceeded. + */ +#define THREAD_CPULIMIT_BLOCK 0x1 +#define THREAD_CPULIMIT_EXCEPTION 0x2 + +struct _thread_ledger_indices { + int cpu_time; +}; + +extern struct _thread_ledger_indices thread_ledgers; + +extern int thread_set_cpulimit(int action, uint8_t percentage, uint64_t interval_ns); + typedef struct funnel_lock funnel_t; #define THR_FUNNEL_NULL (funnel_t *)0 @@ -733,13 +772,16 @@ extern kern_return_t thread_userstack( mach_vm_offset_t *, int *); -kern_return_t thread_entrypoint( - thread_t, - int, - thread_state_t, - unsigned int, - mach_vm_offset_t *); +extern kern_return_t thread_entrypoint( + thread_t, + int, + thread_state_t, + unsigned int, + mach_vm_offset_t *); +extern kern_return_t thread_userstackdefault( + thread_t, + mach_vm_offset_t *); extern kern_return_t thread_wire_internal( host_priv_t host_priv, @@ -786,9 +828,13 @@ extern void uthread_cred_free(void *); extern boolean_t thread_should_halt( thread_t thread); +extern boolean_t thread_should_abort( + thread_t); + extern int is_64signalregset(void); void act_set_apc(thread_t); +void act_set_kperf(thread_t); extern uint32_t dtrace_get_thread_predcache(thread_t); extern int64_t dtrace_get_thread_vtime(thread_t); @@ -835,6 +881,7 @@ extern kern_return_t kernel_thread_start( #ifdef KERNEL_PRIVATE void thread_set_eager_preempt(thread_t thread); void thread_clear_eager_preempt(thread_t thread); +extern ipc_port_t convert_thread_to_port(thread_t); #endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/osfmk/kern/thread_act.c b/osfmk/kern/thread_act.c index 455a0fb01..d99ee186c 100644 --- a/osfmk/kern/thread_act.c +++ b/osfmk/kern/thread_act.c @@ -78,6 +78,8 @@ #include +#include + void act_abort(thread_t); void install_special_handler_locked(thread_t); void special_handler_continue(void); @@ -134,7 +136,7 @@ thread_terminate_internal( thread_mtx_unlock(thread); if (thread != current_thread() && result == KERN_SUCCESS) - thread_wait(thread); + thread_wait(thread, FALSE); return (result); } @@ -236,7 +238,7 @@ thread_suspend( thread_mtx_unlock(thread); if (thread != self && result == KERN_SUCCESS) - thread_wait(thread); + thread_wait(thread, TRUE); return (result); } @@ -575,7 +577,7 @@ thread_state_initialize( thread_release(thread); } else - result = machine_thread_state_initialize( thread ); + result = machine_thread_state_initialize( thread ); } else result = KERN_TERMINATED; @@ -897,25 +899,26 @@ act_get_state( return (thread_get_state(thread, flavor, state, count)); } -void -act_set_astbsd( - thread_t thread) +static void +act_set_ast( + thread_t thread, + ast_t ast) { spl_t s = splsched(); if (thread == current_thread()) { - thread_ast_set(thread, AST_BSD); + thread_ast_set(thread, ast); ast_propagate(thread->ast); } else { processor_t processor; thread_lock(thread); - thread_ast_set(thread, AST_BSD); + thread_ast_set(thread, ast); processor = thread->last_processor; - if ( processor != PROCESSOR_NULL && - processor->state == PROCESSOR_RUNNING && - processor->active_thread == thread ) + if ( processor != PROCESSOR_NULL && + processor->state == PROCESSOR_RUNNING && + processor->active_thread == thread ) cause_ast_check(processor); thread_unlock(thread); } @@ -923,28 +926,37 @@ act_set_astbsd( splx(s); } +void +act_set_astbsd( + thread_t thread) +{ + act_set_ast( thread, AST_BSD ); +} + void act_set_apc( thread_t thread) { - spl_t s = splsched(); - - if (thread == current_thread()) { - thread_ast_set(thread, AST_APC); - ast_propagate(thread->ast); - 
} - else { - processor_t processor; + act_set_ast( thread, AST_APC ); +} - thread_lock(thread); - thread_ast_set(thread, AST_APC); - processor = thread->last_processor; - if ( processor != PROCESSOR_NULL && - processor->state == PROCESSOR_RUNNING && - processor->active_thread == thread ) - cause_ast_check(processor); - thread_unlock(thread); - } - - splx(s); +void +act_set_kperf( + thread_t thread) +{ + /* safety check */ + if (thread != current_thread()) + if( !ml_get_interrupts_enabled() ) + panic("unsafe act_set_kperf operation"); + + act_set_ast( thread, AST_KPERF ); +} + +#if CONFIG_MACF +void +act_set_astmacf( + thread_t thread) +{ + act_set_ast( thread, AST_MACF); } +#endif diff --git a/osfmk/kern/thread_call.c b/osfmk/kern/thread_call.c index 93edbc489..7d43919ae 100644 --- a/osfmk/kern/thread_call.c +++ b/osfmk/kern/thread_call.c @@ -41,80 +41,72 @@ #include #include - #include +#include + #include -static zone_t thread_call_zone; +static zone_t thread_call_zone; +static struct wait_queue daemon_wqueue; struct thread_call_group { queue_head_t pending_queue; uint32_t pending_count; queue_head_t delayed_queue; + uint32_t delayed_count; timer_call_data_t delayed_timer; + timer_call_data_t dealloc_timer; struct wait_queue idle_wqueue; - struct wait_queue daemon_wqueue; uint32_t idle_count, active_count; -}; - -typedef struct thread_call_group *thread_call_group_t; - -static struct thread_call_group thread_call_group0; - -static boolean_t thread_call_daemon_awake; - -#define thread_call_thread_min 4 - -#define internal_call_count 768 - -static thread_call_data_t internal_call_storage[internal_call_count]; -static queue_head_t thread_call_internal_queue; - -static __inline__ thread_call_t _internal_call_allocate(void); - -static __inline__ void _internal_call_release( - thread_call_t call); -static __inline__ boolean_t _pending_call_enqueue( - thread_call_t call, - thread_call_group_t group), - _delayed_call_enqueue( - thread_call_t call, - thread_call_group_t group, - uint64_t deadline), - _call_dequeue( - thread_call_t call, - thread_call_group_t group); + integer_t pri; + uint32_t target_thread_count; + uint64_t idle_timestamp; -static __inline__ void thread_call_wake( - thread_call_group_t group); - -static __inline__ void _set_delayed_call_timer( - thread_call_t call, - thread_call_group_t group); - -static boolean_t _remove_from_pending_queue( - thread_call_func_t func, - thread_call_param_t param0, - boolean_t remove_all), - _remove_from_delayed_queue( - thread_call_func_t func, - thread_call_param_t param0, - boolean_t remove_all); + uint32_t flags; + sched_call_t sched_call; +}; -static void thread_call_daemon( - thread_call_group_t group), - thread_call_thread( - thread_call_group_t group); +typedef struct thread_call_group *thread_call_group_t; -extern void thread_call_delayed_timer( - timer_call_param_t p0, - timer_call_param_t p1); +#define TCG_PARALLEL 0x01 +#define TCG_DEALLOC_ACTIVE 0x02 + +#define THREAD_CALL_GROUP_COUNT 4 +#define THREAD_CALL_THREAD_MIN 4 +#define INTERNAL_CALL_COUNT 768 +#define THREAD_CALL_DEALLOC_INTERVAL_NS (5 * 1000 * 1000) /* 5 ms */ +#define THREAD_CALL_ADD_RATIO 4 +#define THREAD_CALL_MACH_FACTOR_CAP 3 + +static struct thread_call_group thread_call_groups[THREAD_CALL_GROUP_COUNT]; +static boolean_t thread_call_daemon_awake; +static thread_call_data_t internal_call_storage[INTERNAL_CALL_COUNT]; +static queue_head_t thread_call_internal_queue; +static uint64_t thread_call_dealloc_interval_abs; + +static __inline__ thread_call_t 
_internal_call_allocate(void); +static __inline__ void _internal_call_release(thread_call_t call); +static __inline__ boolean_t _pending_call_enqueue(thread_call_t call, thread_call_group_t group); +static __inline__ boolean_t _delayed_call_enqueue(thread_call_t call, thread_call_group_t group, uint64_t deadline); +static __inline__ boolean_t _call_dequeue(thread_call_t call, thread_call_group_t group); +static __inline__ void thread_call_wake(thread_call_group_t group); +static __inline__ void _set_delayed_call_timer(thread_call_t call, thread_call_group_t group); +static boolean_t _remove_from_pending_queue(thread_call_func_t func, thread_call_param_t param0, boolean_t remove_all); +static boolean_t _remove_from_delayed_queue(thread_call_func_t func, thread_call_param_t param0, boolean_t remove_all); +static void thread_call_daemon(void *arg); +static void thread_call_thread(thread_call_group_t group, wait_result_t wres); +extern void thread_call_delayed_timer(timer_call_param_t p0, timer_call_param_t p1); +static void thread_call_dealloc_timer(timer_call_param_t p0, timer_call_param_t p1); +static void thread_call_group_setup(thread_call_group_t group, thread_call_priority_t pri, uint32_t target_thread_count, boolean_t parallel); +static void sched_call_thread(int type, thread_t thread); +static void thread_call_start_deallocate_timer(thread_call_group_t group); +static void thread_call_wait_locked(thread_call_t call); #define qe(x) ((queue_entry_t)(x)) #define TC(x) ((thread_call_t)(x)) @@ -131,6 +123,7 @@ lck_mtx_t thread_call_lock_data; lck_spin_t thread_call_lock_data; #endif + #define thread_call_lock_spin() \ lck_mtx_lock_spin_always(&thread_call_lock_data) @@ -138,6 +131,158 @@ lck_spin_t thread_call_lock_data; lck_mtx_unlock_always(&thread_call_lock_data) +static inline spl_t +disable_ints_and_lock(void) +{ + spl_t s; + + s = splsched(); + thread_call_lock_spin(); + + return s; +} + +static inline void +enable_ints_and_unlock(void) +{ + thread_call_unlock(); + (void)spllo(); +} + + +static inline boolean_t +group_isparallel(thread_call_group_t group) +{ + return ((group->flags & TCG_PARALLEL) != 0); +} + +static boolean_t +thread_call_group_should_add_thread(thread_call_group_t group) +{ + uint32_t thread_count; + + if (!group_isparallel(group)) { + if (group->pending_count > 0 && group->active_count == 0) { + return TRUE; + } + + return FALSE; + } + + if (group->pending_count > 0) { + if (group->idle_count > 0) { + panic("Pending work, but threads are idle?"); + } + + thread_count = group->active_count; + + /* + * Add a thread if either there are no threads, + * the group has fewer than its target number of + * threads, or the amount of work is large relative + * to the number of threads. In the last case, pay attention + * to the total load on the system, and back off if + * it's high. 
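+ * Concretely: add a thread when there are none, when the group is + * below target_thread_count, or when pending calls exceed + * THREAD_CALL_ADD_RATIO (4) per active thread while sched_mach_factor + * is under THREAD_CALL_MACH_FACTOR_CAP (3).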
+ */ + if ((thread_count == 0) || + (thread_count < group->target_thread_count) || + ((group->pending_count > THREAD_CALL_ADD_RATIO * thread_count) && + (sched_mach_factor < THREAD_CALL_MACH_FACTOR_CAP))) { + return TRUE; + } + } + + return FALSE; +} + +static inline integer_t +thread_call_priority_to_sched_pri(thread_call_priority_t pri) +{ + switch (pri) { + case THREAD_CALL_PRIORITY_HIGH: + return BASEPRI_PREEMPT; + case THREAD_CALL_PRIORITY_KERNEL: + return BASEPRI_KERNEL; + case THREAD_CALL_PRIORITY_USER: + return BASEPRI_DEFAULT; + case THREAD_CALL_PRIORITY_LOW: + return DEPRESSPRI; + default: + panic("Invalid priority."); + } + + return 0; +} + +/* Lock held */ +static inline thread_call_group_t +thread_call_get_group( + thread_call_t call) +{ + thread_call_priority_t pri = call->tc_pri; + + assert(pri == THREAD_CALL_PRIORITY_LOW || + pri == THREAD_CALL_PRIORITY_USER || + pri == THREAD_CALL_PRIORITY_KERNEL || + pri == THREAD_CALL_PRIORITY_HIGH); + + return &thread_call_groups[pri]; +} + +static void +thread_call_group_setup( + thread_call_group_t group, + thread_call_priority_t pri, + uint32_t target_thread_count, + boolean_t parallel) +{ + queue_init(&group->pending_queue); + queue_init(&group->delayed_queue); + + timer_call_setup(&group->delayed_timer, thread_call_delayed_timer, group); + timer_call_setup(&group->dealloc_timer, thread_call_dealloc_timer, group); + + wait_queue_init(&group->idle_wqueue, SYNC_POLICY_FIFO); + + group->target_thread_count = target_thread_count; + group->pri = thread_call_priority_to_sched_pri(pri); + + group->sched_call = sched_call_thread; + if (parallel) { + group->flags |= TCG_PARALLEL; + group->sched_call = NULL; + } +} + +/* + * Simple wrapper for creating threads bound to + * thread call groups. + */ +static kern_return_t +thread_call_thread_create( + thread_call_group_t group) +{ + thread_t thread; + kern_return_t result; + + result = kernel_thread_start_priority((thread_continue_t)thread_call_thread, group, group->pri, &thread); + if (result != KERN_SUCCESS) { + return result; + } + + if (group->pri < BASEPRI_PREEMPT) { + /* + * New style doesn't get to run to completion in + * kernel if there are higher priority threads + * available. 
+ */ + thread_set_eager_preempt(thread); + } + + thread_deallocate(thread); + return KERN_SUCCESS; +} + /* * thread_call_initialize: * @@ -148,11 +293,9 @@ void thread_call_initialize(void) { thread_call_t call; - thread_call_group_t group = &thread_call_group0; kern_return_t result; - thread_t thread; - int i; - spl_t s; + thread_t thread; + int i; i = sizeof (thread_call_data_t); thread_call_zone = zinit(i, 4096 * i, 16 * i, "thread_call"); @@ -169,21 +312,21 @@ thread_call_initialize(void) #else lck_spin_init(&thread_call_lock_data, &thread_call_lck_grp, &thread_call_lck_attr); #endif - queue_init(&group->pending_queue); - queue_init(&group->delayed_queue); - s = splsched(); - thread_call_lock_spin(); + nanotime_to_absolutetime(0, THREAD_CALL_DEALLOC_INTERVAL_NS, &thread_call_dealloc_interval_abs); + wait_queue_init(&daemon_wqueue, SYNC_POLICY_FIFO); - timer_call_setup(&group->delayed_timer, thread_call_delayed_timer, group); + thread_call_group_setup(&thread_call_groups[THREAD_CALL_PRIORITY_LOW], THREAD_CALL_PRIORITY_LOW, 0, TRUE); + thread_call_group_setup(&thread_call_groups[THREAD_CALL_PRIORITY_USER], THREAD_CALL_PRIORITY_USER, 0, TRUE); + thread_call_group_setup(&thread_call_groups[THREAD_CALL_PRIORITY_KERNEL], THREAD_CALL_PRIORITY_KERNEL, 1, TRUE); + thread_call_group_setup(&thread_call_groups[THREAD_CALL_PRIORITY_HIGH], THREAD_CALL_PRIORITY_HIGH, THREAD_CALL_THREAD_MIN, FALSE); - wait_queue_init(&group->idle_wqueue, SYNC_POLICY_FIFO); - wait_queue_init(&group->daemon_wqueue, SYNC_POLICY_FIFO); + disable_ints_and_lock(); queue_init(&thread_call_internal_queue); for ( - call = internal_call_storage; - call < &internal_call_storage[internal_call_count]; + call = internal_call_storage; + call < &internal_call_storage[INTERNAL_CALL_COUNT]; call++) { enqueue_tail(&thread_call_internal_queue, qe(call)); @@ -191,10 +334,9 @@ thread_call_initialize(void) thread_call_daemon_awake = TRUE; - thread_call_unlock(); - splx(s); + enable_ints_and_unlock(); - result = kernel_thread_start_priority((thread_continue_t)thread_call_daemon, group, BASEPRI_PREEMPT + 1, &thread); + result = kernel_thread_start_priority((thread_continue_t)thread_call_daemon, NULL, BASEPRI_PREEMPT + 1, &thread); if (result != KERN_SUCCESS) panic("thread_call_initialize"); @@ -207,7 +349,9 @@ thread_call_setup( thread_call_func_t func, thread_call_param_t param0) { - call_entry_setup(call, func, param0); + bzero(call, sizeof(*call)); + call_entry_setup((call_entry_t)call, func, param0); + call->tc_pri = THREAD_CALL_PRIORITY_HIGH; /* Default priority */ } /* @@ -243,7 +387,7 @@ _internal_call_release( thread_call_t call) { if ( call >= internal_call_storage && - call < &internal_call_storage[internal_call_count] ) + call < &internal_call_storage[INTERNAL_CALL_COUNT] ) enqueue_head(&thread_call_internal_queue, qe(call)); } @@ -265,10 +409,16 @@ _pending_call_enqueue( { queue_head_t *old_queue; - old_queue = call_entry_enqueue_tail(call, &group->pending_queue); + old_queue = call_entry_enqueue_tail(CE(call), &group->pending_queue); + + if (old_queue == NULL) { + call->tc_submit_count++; + } group->pending_count++; + thread_call_wake(group); + return (old_queue != NULL); } @@ -286,16 +436,18 @@ _pending_call_enqueue( */ static __inline__ boolean_t _delayed_call_enqueue( - thread_call_t call, + thread_call_t call, thread_call_group_t group, uint64_t deadline) { queue_head_t *old_queue; - old_queue = call_entry_enqueue_deadline(call, &group->delayed_queue, deadline); + old_queue = call_entry_enqueue_deadline(CE(call), 
&group->delayed_queue, deadline); if (old_queue == &group->pending_queue) group->pending_count--; + else if (old_queue == NULL) + call->tc_submit_count++; return (old_queue != NULL); } @@ -316,10 +468,13 @@ _call_dequeue( { queue_head_t *old_queue; - old_queue = call_entry_dequeue(call); + old_queue = call_entry_dequeue(CE(call)); - if (old_queue == &group->pending_queue) - group->pending_count--; + if (old_queue != NULL) { + call->tc_finish_count++; + if (old_queue == &group->pending_queue) + group->pending_count--; + } return (old_queue != NULL); } @@ -337,7 +492,7 @@ _set_delayed_call_timer( thread_call_t call, thread_call_group_t group) { - timer_call_enter(&group->delayed_timer, call->deadline, 0); + timer_call_enter(&group->delayed_timer, call->tc_call.deadline, 0); } /* @@ -357,32 +512,32 @@ _remove_from_pending_queue( thread_call_param_t param0, boolean_t remove_all) { - boolean_t call_removed = FALSE; + boolean_t call_removed = FALSE; thread_call_t call; - thread_call_group_t group = &thread_call_group0; - - call = TC(queue_first(&group->pending_queue)); - - while (!queue_end(&group->pending_queue, qe(call))) { - if ( call->func == func && - call->param0 == param0 ) { + thread_call_group_t group = &thread_call_groups[THREAD_CALL_PRIORITY_HIGH]; + + call = TC(queue_first(&group->pending_queue)); + + while (!queue_end(&group->pending_queue, qe(call))) { + if (call->tc_call.func == func && + call->tc_call.param0 == param0) { thread_call_t next = TC(queue_next(qe(call))); - + _call_dequeue(call, group); _internal_call_release(call); - + call_removed = TRUE; if (!remove_all) break; - + call = next; } else call = TC(queue_next(qe(call))); - } - - return (call_removed); + } + + return (call_removed); } /* @@ -402,32 +557,32 @@ _remove_from_delayed_queue( thread_call_param_t param0, boolean_t remove_all) { - boolean_t call_removed = FALSE; - thread_call_t call; - thread_call_group_t group = &thread_call_group0; - - call = TC(queue_first(&group->delayed_queue)); - - while (!queue_end(&group->delayed_queue, qe(call))) { - if ( call->func == func && - call->param0 == param0 ) { + boolean_t call_removed = FALSE; + thread_call_t call; + thread_call_group_t group = &thread_call_groups[THREAD_CALL_PRIORITY_HIGH]; + + call = TC(queue_first(&group->delayed_queue)); + + while (!queue_end(&group->delayed_queue, qe(call))) { + if (call->tc_call.func == func && + call->tc_call.param0 == param0) { thread_call_t next = TC(queue_next(qe(call))); - + _call_dequeue(call, group); - + _internal_call_release(call); - + call_removed = TRUE; if (!remove_all) break; - + call = next; } else call = TC(queue_next(qe(call))); - } - - return (call_removed); + } + + return (call_removed); } #ifndef __LP64__ @@ -446,38 +601,34 @@ thread_call_func( thread_call_param_t param, boolean_t unique_call) { - thread_call_t call; - thread_call_group_t group = &thread_call_group0; - spl_t s; - - s = splsched(); - thread_call_lock_spin(); - - call = TC(queue_first(&group->pending_queue)); - + thread_call_t call; + thread_call_group_t group = &thread_call_groups[THREAD_CALL_PRIORITY_HIGH]; + spl_t s; + + s = splsched(); + thread_call_lock_spin(); + + call = TC(queue_first(&group->pending_queue)); + while (unique_call && !queue_end(&group->pending_queue, qe(call))) { - if ( call->func == func && - call->param0 == param ) { + if (call->tc_call.func == func && call->tc_call.param0 == param) { break; } - + call = TC(queue_next(qe(call))); - } - - if (!unique_call || queue_end(&group->pending_queue, qe(call))) { + } + + if (!unique_call 
|| queue_end(&group->pending_queue, qe(call))) { call = _internal_call_allocate(); - call->func = func; - call->param0 = param; - call->param1 = NULL; - + call->tc_call.func = func; + call->tc_call.param0 = param; + call->tc_call.param1 = NULL; + _pending_call_enqueue(call, group); - - if (group->active_count == 0) - thread_call_wake(group); - } + } - thread_call_unlock(); - splx(s); + thread_call_unlock(); + splx(s); } #endif /* __LP64__ */ @@ -490,29 +641,29 @@ thread_call_func( */ void thread_call_func_delayed( - thread_call_func_t func, - thread_call_param_t param, - uint64_t deadline) + thread_call_func_t func, + thread_call_param_t param, + uint64_t deadline) { - thread_call_t call; - thread_call_group_t group = &thread_call_group0; - spl_t s; - - s = splsched(); - thread_call_lock_spin(); - - call = _internal_call_allocate(); - call->func = func; - call->param0 = param; - call->param1 = 0; - - _delayed_call_enqueue(call, group, deadline); - - if (queue_first(&group->delayed_queue) == qe(call)) - _set_delayed_call_timer(call, group); - - thread_call_unlock(); - splx(s); + thread_call_t call; + thread_call_group_t group = &thread_call_groups[THREAD_CALL_PRIORITY_HIGH]; + spl_t s; + + s = splsched(); + thread_call_lock_spin(); + + call = _internal_call_allocate(); + call->tc_call.func = func; + call->tc_call.param0 = param; + call->tc_call.param1 = 0; + + _delayed_call_enqueue(call, group, deadline); + + if (queue_first(&group->delayed_queue) == qe(call)) + _set_delayed_call_timer(call, group); + + thread_call_unlock(); + splx(s); } /* @@ -529,29 +680,53 @@ thread_call_func_delayed( */ boolean_t thread_call_func_cancel( - thread_call_func_t func, - thread_call_param_t param, - boolean_t cancel_all) + thread_call_func_t func, + thread_call_param_t param, + boolean_t cancel_all) { - boolean_t result; - spl_t s; - - s = splsched(); - thread_call_lock_spin(); + boolean_t result; + spl_t s; - if (cancel_all) + s = splsched(); + thread_call_lock_spin(); + + if (cancel_all) result = _remove_from_pending_queue(func, param, cancel_all) | - _remove_from_delayed_queue(func, param, cancel_all); + _remove_from_delayed_queue(func, param, cancel_all); else result = _remove_from_pending_queue(func, param, cancel_all) || - _remove_from_delayed_queue(func, param, cancel_all); - - thread_call_unlock(); - splx(s); + _remove_from_delayed_queue(func, param, cancel_all); + + thread_call_unlock(); + splx(s); return (result); } +/* + * Allocate a thread call with a given priority. Importances + * other than THREAD_CALL_PRIORITY_HIGH will be run in threads + * with eager preemption enabled (i.e. may be aggressively preempted + * by higher-priority threads which are not in the normal "urgent" bands). 
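+ *
+ * Illustrative usage (editor's sketch; my_func and my_ctx are
+ * hypothetical names, not part of this change):
+ *
+ *	thread_call_t tc = thread_call_allocate_with_priority(
+ *		my_func, my_ctx, THREAD_CALL_PRIORITY_KERNEL);
+ *	thread_call_enter(tc);
+ *	...
+ *	thread_call_cancel_wait(tc);	(waits out any in-flight call)
+ *	thread_call_free(tc);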
+ */ +thread_call_t +thread_call_allocate_with_priority( + thread_call_func_t func, + thread_call_param_t param0, + thread_call_priority_t pri) +{ + thread_call_t call; + + if (pri > THREAD_CALL_PRIORITY_LOW) { + panic("Invalid pri: %d\n", pri); + } + + call = thread_call_allocate(func, param0); + call->tc_pri = pri; + + return call; +} + /* * thread_call_allocate: * @@ -559,41 +734,53 @@ thread_call_func_cancel( */ thread_call_t thread_call_allocate( - thread_call_func_t func, - thread_call_param_t param0) + thread_call_func_t func, + thread_call_param_t param0) { - thread_call_t call = zalloc(thread_call_zone); + thread_call_t call = zalloc(thread_call_zone); - call_entry_setup(call, func, param0); + thread_call_setup(call, func, param0); + call->tc_refs = 1; + call->tc_flags = THREAD_CALL_ALLOC; - return (call); + return (call); } /* * thread_call_free: * - * Free a callout entry. + * Release a callout. If the callout is currently + * executing, it will be freed when all invocations + * finish. */ boolean_t thread_call_free( - thread_call_t call) + thread_call_t call) { - spl_t s; - - s = splsched(); - thread_call_lock_spin(); - - if (call->queue != NULL) { - thread_call_unlock(); - splx(s); + spl_t s; + int32_t refs; - return (FALSE); - } - - thread_call_unlock(); - splx(s); - - zfree(thread_call_zone, call); + s = splsched(); + thread_call_lock_spin(); + + if (call->tc_call.queue != NULL) { + thread_call_unlock(); + splx(s); + + return (FALSE); + } + + refs = --call->tc_refs; + if (refs < 0) { + panic("Refcount negative: %d\n", refs); + } + + thread_call_unlock(); + splx(s); + + if (refs == 0) { + zfree(thread_call_zone, call); + } return (TRUE); } @@ -608,23 +795,22 @@ thread_call_free( */ boolean_t thread_call_enter( - thread_call_t call) + thread_call_t call) { - boolean_t result = TRUE; - thread_call_group_t group = &thread_call_group0; - spl_t s; - + boolean_t result = TRUE; + thread_call_group_t group; + spl_t s; + + group = thread_call_get_group(call); + s = splsched(); thread_call_lock_spin(); - - if (call->queue != &group->pending_queue) { - result = _pending_call_enqueue(call, group); - - if (group->active_count == 0) - thread_call_wake(group); + + if (call->tc_call.queue != &group->pending_queue) { + result = _pending_call_enqueue(call, group); } - call->param1 = 0; + call->tc_call.param1 = 0; thread_call_unlock(); splx(s); @@ -634,24 +820,23 @@ thread_call_enter( boolean_t thread_call_enter1( - thread_call_t call, - thread_call_param_t param1) + thread_call_t call, + thread_call_param_t param1) { - boolean_t result = TRUE; - thread_call_group_t group = &thread_call_group0; - spl_t s; - + boolean_t result = TRUE; + thread_call_group_t group; + spl_t s; + + group = thread_call_get_group(call); + s = splsched(); thread_call_lock_spin(); - - if (call->queue != &group->pending_queue) { - result = _pending_call_enqueue(call, group); - - if (group->active_count == 0) - thread_call_wake(group); + + if (call->tc_call.queue != &group->pending_queue) { + result = _pending_call_enqueue(call, group); } - call->param1 = param1; + call->tc_call.param1 = param1; thread_call_unlock(); splx(s); @@ -670,12 +855,14 @@ thread_call_enter1( */ boolean_t thread_call_enter_delayed( - thread_call_t call, - uint64_t deadline) + thread_call_t call, + uint64_t deadline) { - boolean_t result = TRUE; - thread_call_group_t group = &thread_call_group0; - spl_t s; + boolean_t result = TRUE; + thread_call_group_t group; + spl_t s; + + group = thread_call_get_group(call); s = splsched(); 
thread_call_lock_spin(); @@ -685,7 +872,7 @@ thread_call_enter_delayed( if (queue_first(&group->delayed_queue) == qe(call)) _set_delayed_call_timer(call, group); - call->param1 = 0; + call->tc_call.param1 = 0; thread_call_unlock(); splx(s); @@ -695,13 +882,15 @@ thread_call_enter_delayed( boolean_t thread_call_enter1_delayed( - thread_call_t call, - thread_call_param_t param1, - uint64_t deadline) + thread_call_t call, + thread_call_param_t param1, + uint64_t deadline) { - boolean_t result = TRUE; - thread_call_group_t group = &thread_call_group0; - spl_t s; + boolean_t result = TRUE; + thread_call_group_t group; + spl_t s; + + group = thread_call_get_group(call); s = splsched(); thread_call_lock_spin(); @@ -711,7 +900,7 @@ thread_call_enter1_delayed( if (queue_first(&group->delayed_queue) == qe(call)) _set_delayed_call_timer(call, group); - call->param1 = param1; + call->tc_call.param1 = param1; thread_call_unlock(); splx(s); @@ -729,23 +918,61 @@ thread_call_enter1_delayed( */ boolean_t thread_call_cancel( - thread_call_t call) + thread_call_t call) { - boolean_t result; - thread_call_group_t group = &thread_call_group0; - spl_t s; - + boolean_t result; + thread_call_group_t group; + spl_t s; + + group = thread_call_get_group(call); + s = splsched(); thread_call_lock_spin(); result = _call_dequeue(call, group); - + thread_call_unlock(); splx(s); return (result); } +/* + * Cancel a thread call. If it cannot be cancelled (i.e. + * is already in flight), waits for the most recent invocation + * to finish. Note that if clients re-submit this thread call, + * it may still be pending or in flight when thread_call_cancel_wait + * returns, but all requests to execute this work item prior + * to the call to thread_call_cancel_wait will have finished. + */ +boolean_t +thread_call_cancel_wait( + thread_call_t call) +{ + boolean_t result; + thread_call_group_t group; + + if ((call->tc_flags & THREAD_CALL_ALLOC) == 0) { + panic("%s: Can't wait on thread call whose storage I don't own.", __FUNCTION__); + } + + group = thread_call_get_group(call); + + (void) splsched(); + thread_call_lock_spin(); + + result = _call_dequeue(call, group); + if (result == FALSE) { + thread_call_wait_locked(call); + } + + thread_call_unlock(); + (void) spllo(); + + return result; +} + + #ifndef __LP64__ /* @@ -761,16 +988,18 @@ thread_call_is_delayed( thread_call_t call, uint64_t *deadline) { - boolean_t result = FALSE; - thread_call_group_t group = &thread_call_group0; - spl_t s; + boolean_t result = FALSE; + thread_call_group_t group; + spl_t s; + + group = thread_call_get_group(call); s = splsched(); thread_call_lock_spin(); - if (call->queue == &group->delayed_queue) { + if (call->tc_call.queue == &group->delayed_queue) { if (deadline != NULL) - *deadline = call->deadline; + *deadline = call->tc_call.deadline; result = TRUE; } @@ -791,65 +1020,137 @@ thread_call_is_delayed( * create additional call threads. * * Called with thread_call_lock held. + * + * For high-priority group, only does wakeup/creation if there are no threads + * running. */ static __inline__ void thread_call_wake( thread_call_group_t group) { - if (group->idle_count > 0 && wait_queue_wakeup_one(&group->idle_wqueue, NO_EVENT, THREAD_AWAKENED, -1) == KERN_SUCCESS) { - group->idle_count--; group->active_count++; - } - else - if (!thread_call_daemon_awake) { - thread_call_daemon_awake = TRUE; - wait_queue_wakeup_one(&group->daemon_wqueue, NO_EVENT, THREAD_AWAKENED, -1); + /* + * New behavior: use threads if you've got 'em. 
+	 * Traditional behavior: wake only if no threads running.
+	 */
+	if (group_isparallel(group) || group->active_count == 0) {
+		if (wait_queue_wakeup_one(&group->idle_wqueue, NO_EVENT, THREAD_AWAKENED, -1) == KERN_SUCCESS) {
+			group->idle_count--; group->active_count++;
+
+			if (group->idle_count == 0) {
+				timer_call_cancel(&group->dealloc_timer);
+				group->flags &= ~TCG_DEALLOC_ACTIVE;
+			}
+		} else {
+			if (!thread_call_daemon_awake && thread_call_group_should_add_thread(group)) {
+				thread_call_daemon_awake = TRUE;
+				wait_queue_wakeup_one(&daemon_wqueue, NO_EVENT, THREAD_AWAKENED, -1);
+			}
+		}
+	}
 }
 
 /*
  *	sched_call_thread:
  *
- *	Call out invoked by the scheduler.
+ *	Call out invoked by the scheduler.  Used only for high-priority
+ *	thread call group.
  */
 
 static void
 sched_call_thread(
-	int		type,
-__unused	thread_t	thread)
+		int				type,
+		__unused	thread_t	thread)
 {
-	thread_call_group_t	group = &thread_call_group0;
+	thread_call_group_t		group;
+
+	group = &thread_call_groups[THREAD_CALL_PRIORITY_HIGH]; /* XXX */
 
 	thread_call_lock_spin();
 
 	switch (type) {
 
-	case SCHED_CALL_BLOCK:
-		if (--group->active_count == 0 && group->pending_count > 0)
-			thread_call_wake(group);
-		break;
+		case SCHED_CALL_BLOCK:
+			--group->active_count;
+			if (group->pending_count > 0)
+				thread_call_wake(group);
+			break;
 
-	case SCHED_CALL_UNBLOCK:
-		group->active_count++;
-		break;
+		case SCHED_CALL_UNBLOCK:
+			group->active_count++;
+			break;
 	}
 
 	thread_call_unlock();
 }
 
+/*
+ * Interrupts disabled, lock held; returns the same way.
+ * Only called on thread calls whose storage we own.  Wakes up
+ * anyone who might be waiting on this work item and frees it
+ * if the client has so requested.
+ */
+static void
+thread_call_finish(thread_call_t call)
+{
+	boolean_t dowake = FALSE;
+
+	call->tc_finish_count++;
+	call->tc_refs--;
+
+	if ((call->tc_flags & THREAD_CALL_WAIT) != 0) {
+		dowake = TRUE;
+		call->tc_flags &= ~THREAD_CALL_WAIT;
+
+		/*
+		 * Dropping lock here because the sched call for the
+		 * high-pri group can take the big lock from under
+		 * a thread lock.
+		 */
+		thread_call_unlock();
+		thread_wakeup((event_t)call);
+		thread_call_lock_spin();
+	}
+
+	if (call->tc_refs == 0) {
+		if (dowake) {
+			panic("Someone waiting on a thread call that is scheduled for free: %p\n", call->tc_call.func);
+		}
+
+		enable_ints_and_unlock();
+
+		zfree(thread_call_zone, call);
+
+		(void)disable_ints_and_lock();
+	}
+
+}
+
 /*
  *	thread_call_thread:
  */
 static void
 thread_call_thread(
-	thread_call_group_t	group)
+		thread_call_group_t		group,
+		wait_result_t			wres)
 {
-	thread_t	self = current_thread();
+	thread_t	self = current_thread();
+	boolean_t	canwait;
 
-	(void) splsched();
-	thread_call_lock_spin();
+	/*
+	 * A wakeup with THREAD_INTERRUPTED indicates that
+	 * we should terminate.
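+	 * (Editor's cross-reference: that wakeup is posted by
+	 * thread_call_dealloc_timer() when a surplus worker has been
+	 * idle past the deallocation interval.)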
+ */ + if (wres == THREAD_INTERRUPTED) { + thread_terminate(self); + + /* NOTREACHED */ + panic("thread_terminate() returned?"); + } + + (void)disable_ints_and_lock(); - thread_sched_call(self, sched_call_thread); + thread_sched_call(self, group->sched_call); - while (group->pending_count > 0) { + while (group->pending_count > 0) { thread_call_t call; thread_call_func_t func; thread_call_param_t param0, param1; @@ -857,142 +1158,315 @@ thread_call_thread( call = TC(dequeue_head(&group->pending_queue)); group->pending_count--; - func = call->func; - param0 = call->param0; - param1 = call->param1; - - call->queue = NULL; + func = call->tc_call.func; + param0 = call->tc_call.param0; + param1 = call->tc_call.param1; + + call->tc_call.queue = NULL; _internal_call_release(call); - thread_call_unlock(); - (void) spllo(); + /* + * Can only do wakeups for thread calls whose storage + * we control. + */ + if ((call->tc_flags & THREAD_CALL_ALLOC) != 0) { + canwait = TRUE; + call->tc_refs++; /* Delay free until we're done */ + } else + canwait = FALSE; + + enable_ints_and_unlock(); KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_CALLOUT) | DBG_FUNC_NONE, - func, param0, param1, 0, 0); + MACHDBG_CODE(DBG_MACH_SCHED,MACH_CALLOUT) | DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE(func), param0, param1, 0, 0); (*func)(param0, param1); if (get_preemption_level() != 0) { int pl = get_preemption_level(); panic("thread_call_thread: preemption_level %d, last callout %p(%p, %p)", - pl, func, param0, param1); + pl, (void *)VM_KERNEL_UNSLIDE(func), param0, param1); } - + (void)thread_funnel_set(self->funnel_lock, FALSE); /* XXX */ - (void) splsched(); - thread_call_lock_spin(); - } + (void) disable_ints_and_lock(); + + if (canwait) { + /* Frees if so desired */ + thread_call_finish(call); + } + } thread_sched_call(self, NULL); group->active_count--; - if (group->idle_count < thread_call_thread_min) { + if (group_isparallel(group)) { + /* + * For new style of thread group, thread always blocks. + * If we have more than the target number of threads, + * and this is the first to block, and it isn't active + * already, set a timer for deallocating a thread if we + * continue to have a surplus. 
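+		 * (Editor's note: with THREAD_CALL_DEALLOC_INTERVAL_NS at
+		 * 5 ms, a persistent surplus is trimmed at roughly one
+		 * idle worker per interval.)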
+ */ group->idle_count++; - wait_queue_assert_wait(&group->idle_wqueue, NO_EVENT, THREAD_UNINT, 0); - - thread_call_unlock(); - (void) spllo(); + if (group->idle_count == 1) { + group->idle_timestamp = mach_absolute_time(); + } + + if (((group->flags & TCG_DEALLOC_ACTIVE) == 0) && + ((group->active_count + group->idle_count) > group->target_thread_count)) { + group->flags |= TCG_DEALLOC_ACTIVE; + thread_call_start_deallocate_timer(group); + } + + /* Wait for more work (or termination) */ + wres = wait_queue_assert_wait(&group->idle_wqueue, NO_EVENT, THREAD_INTERRUPTIBLE, 0); + if (wres != THREAD_WAITING) { + panic("kcall worker unable to assert wait?"); + } + + enable_ints_and_unlock(); thread_block_parameter((thread_continue_t)thread_call_thread, group); - /* NOTREACHED */ - } + } else { + if (group->idle_count < group->target_thread_count) { + group->idle_count++; - thread_call_unlock(); - (void) spllo(); - - thread_terminate(self); + wait_queue_assert_wait(&group->idle_wqueue, NO_EVENT, THREAD_UNINT, 0); /* Interrupted means to exit */ + + enable_ints_and_unlock(); + + thread_block_parameter((thread_continue_t)thread_call_thread, group); + /* NOTREACHED */ + } + } + + enable_ints_and_unlock(); + + thread_terminate(self); /* NOTREACHED */ } /* - * thread_call_daemon: + * thread_call_daemon: walk list of groups, allocating + * threads if appropriate (as determined by + * thread_call_group_should_add_thread()). */ static void -thread_call_daemon_continue( - thread_call_group_t group) +thread_call_daemon_continue(__unused void *arg) { - kern_return_t result; - thread_t thread; - - (void) splsched(); - thread_call_lock_spin(); - - while (group->active_count == 0 && group->pending_count > 0) { - group->active_count++; - - thread_call_unlock(); - (void) spllo(); - - result = kernel_thread_start_priority((thread_continue_t)thread_call_thread, group, BASEPRI_PREEMPT, &thread); - if (result != KERN_SUCCESS) - panic("thread_call_daemon"); + int i; + kern_return_t kr; + thread_call_group_t group; + + (void)disable_ints_and_lock(); + + /* Starting at zero happens to be high-priority first. */ + for (i = 0; i < THREAD_CALL_GROUP_COUNT; i++) { + group = &thread_call_groups[i]; + while (thread_call_group_should_add_thread(group)) { + group->active_count++; + + enable_ints_and_unlock(); + + kr = thread_call_thread_create(group); + if (kr != KERN_SUCCESS) { + /* + * On failure, just pause for a moment and give up. + * We can try again later. 
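+				 * (Editor's cross-reference: a later
+				 * thread_call_wake() re-wakes the daemon,
+				 * since thread_call_daemon_awake is cleared
+				 * at "out:" below.)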
+ */ + delay(10000); /* 10 ms */ + (void)disable_ints_and_lock(); + goto out; + } + + (void)disable_ints_and_lock(); + } + } - thread_deallocate(thread); +out: + thread_call_daemon_awake = FALSE; + wait_queue_assert_wait(&daemon_wqueue, NO_EVENT, THREAD_UNINT, 0); - (void) splsched(); - thread_call_lock_spin(); - } + enable_ints_and_unlock(); - thread_call_daemon_awake = FALSE; - wait_queue_assert_wait(&group->daemon_wqueue, NO_EVENT, THREAD_UNINT, 0); - - thread_call_unlock(); - (void) spllo(); - - thread_block_parameter((thread_continue_t)thread_call_daemon_continue, group); + thread_block_parameter((thread_continue_t)thread_call_daemon_continue, NULL); /* NOTREACHED */ } static void thread_call_daemon( - thread_call_group_t group) + __unused void *arg) { thread_t self = current_thread(); self->options |= TH_OPT_VMPRIV; vm_page_free_reserve(2); /* XXX */ - - thread_call_daemon_continue(group); - /* NOTREACHED */ + + thread_call_daemon_continue(NULL); + /* NOTREACHED */ +} + +/* + * Schedule timer to deallocate a worker thread if we have a surplus + * of threads (in excess of the group's target) and at least one thread + * is idle the whole time. + */ +static void +thread_call_start_deallocate_timer( + thread_call_group_t group) +{ + uint64_t deadline; + boolean_t onqueue; + + assert(group->idle_count > 0); + + group->flags |= TCG_DEALLOC_ACTIVE; + deadline = group->idle_timestamp + thread_call_dealloc_interval_abs; + onqueue = timer_call_enter(&group->dealloc_timer, deadline, 0); + + if (onqueue) { + panic("Deallocate timer already active?"); + } } void thread_call_delayed_timer( - timer_call_param_t p0, - __unused timer_call_param_t p1 + timer_call_param_t p0, + __unused timer_call_param_t p1 ) { - thread_call_t call; + thread_call_t call; thread_call_group_t group = p0; - boolean_t new_pending = FALSE; uint64_t timestamp; thread_call_lock_spin(); timestamp = mach_absolute_time(); - - call = TC(queue_first(&group->delayed_queue)); - - while (!queue_end(&group->delayed_queue, qe(call))) { - if (call->deadline <= timestamp) { + + call = TC(queue_first(&group->delayed_queue)); + + while (!queue_end(&group->delayed_queue, qe(call))) { + if (call->tc_call.deadline <= timestamp) { _pending_call_enqueue(call, group); - new_pending = TRUE; } else break; - + call = TC(queue_first(&group->delayed_queue)); - } + } if (!queue_end(&group->delayed_queue, qe(call))) _set_delayed_call_timer(call, group); - if (new_pending && group->active_count == 0) - thread_call_wake(group); + thread_call_unlock(); +} + +/* + * Timer callback to tell a thread to terminate if + * we have an excess of threads and at least one has been + * idle for a long time. + */ +static void +thread_call_dealloc_timer( + timer_call_param_t p0, + __unused timer_call_param_t p1) +{ + thread_call_group_t group = (thread_call_group_t)p0; + uint64_t now; + kern_return_t res; + boolean_t terminated = FALSE; + + thread_call_lock_spin(); + + now = mach_absolute_time(); + if (group->idle_count > 0) { + if (now > group->idle_timestamp + thread_call_dealloc_interval_abs) { + terminated = TRUE; + group->idle_count--; + res = wait_queue_wakeup_one(&group->idle_wqueue, NO_EVENT, THREAD_INTERRUPTED, -1); + if (res != KERN_SUCCESS) { + panic("Unable to wake up idle thread for termination?"); + } + } + + } + + /* + * If we still have an excess of threads, schedule another + * invocation of this function. 
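+	 * (Editor's worked example: with target_thread_count 1, active 1
+	 * and idle 2, one worker was just reaped; the timer re-arms and
+	 * the next firing reaps another.)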
+ */ + if (group->idle_count > 0 && (group->idle_count + group->active_count > group->target_thread_count)) { + /* + * If we killed someone just now, push out the + * next deadline. + */ + if (terminated) { + group->idle_timestamp = now; + } - thread_call_unlock(); + thread_call_start_deallocate_timer(group); + } else { + group->flags &= ~TCG_DEALLOC_ACTIVE; + } + + thread_call_unlock(); } + +/* + * Wait for all requested invocations of a thread call prior to now + * to finish. Can only be invoked on thread calls whose storage we manage. + * Just waits for the finish count to catch up to the submit count we find + * at the beginning of our wait. + */ +static void +thread_call_wait_locked(thread_call_t call) +{ + uint64_t submit_count; + wait_result_t res; + + assert(call->tc_flags & THREAD_CALL_ALLOC); + + submit_count = call->tc_submit_count; + + while (call->tc_finish_count < submit_count) { + call->tc_flags |= THREAD_CALL_WAIT; + + res = assert_wait(call, THREAD_UNINT); + if (res != THREAD_WAITING) { + panic("Unable to assert wait?"); + } + + thread_call_unlock(); + (void) spllo(); + + res = thread_block(NULL); + if (res != THREAD_AWAKENED) { + panic("Awoken with %d?", res); + } + + (void) splsched(); + thread_call_lock_spin(); + } +} + +/* + * Determine whether a thread call is either on a queue or + * currently being executed. + */ +boolean_t +thread_call_isactive(thread_call_t call) +{ + boolean_t active; + + disable_ints_and_lock(); + active = (call->tc_submit_count > call->tc_finish_count); + enable_ints_and_unlock(); + + return active; +} + diff --git a/osfmk/kern/thread_call.h b/osfmk/kern/thread_call.h index aa38f0dda..e2836e293 100644 --- a/osfmk/kern/thread_call.h +++ b/osfmk/kern/thread_call.h @@ -25,8 +25,10 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * Declarations for thread-based callouts. + +/*! + @header thread_call.h + @discussion Facilities for executing work asynchronously. */ #ifndef _KERN_THREAD_CALL_H_ @@ -38,46 +40,200 @@ #include -typedef struct call_entry *thread_call_t; -typedef void *thread_call_param_t; -typedef void (*thread_call_func_t)( - thread_call_param_t param0, - thread_call_param_t param1); +struct thread_call; +typedef struct thread_call *thread_call_t; + +typedef void *thread_call_param_t; +typedef void (*thread_call_func_t)( + thread_call_param_t param0, + thread_call_param_t param1); +/*! + @enum thread_call_priority_t + @discussion Thread call priorities should not be assumed to have any specific + numerical value; they should be interpreted as importances or roles for work + items, priorities for which will be reasonably managed by the subsystem. + @constant THREAD_CALL_PRIORITY_HIGH Importance above everything but realtime. + Thread calls allocated with this priority execute at extremely high priority, + above everything but realtime threads. They are generally executed in serial. + Though they may execute concurrently under some circumstances, no fan-out is implied. + These work items should do very small amounts of work or risk disrupting system + responsiveness. + @constant THREAD_CALL_PRIORITY_KERNEL Importance similar to that of normal kernel + threads. + @constant THREAD_CALL_PRIORITY_USER Importance similar to that of normal user threads. + @constant THREAD_CALL_PRIORITY_LOW Very low importance. + */ +typedef enum { + THREAD_CALL_PRIORITY_HIGH = 0, + THREAD_CALL_PRIORITY_KERNEL = 1, + THREAD_CALL_PRIORITY_USER = 2, + THREAD_CALL_PRIORITY_LOW = 3 +} thread_call_priority_t; + __BEGIN_DECLS +/*! 
+	@function thread_call_enter
+	@abstract Submit a thread call work item for immediate execution.
+	@discussion If the work item is already scheduled for delayed execution, and it has
+	not yet begun to run, that delayed invocation will be cancelled. Note that if a
+	thread call is rescheduled from its own callback, then multiple invocations of the
+	callback may be in flight at the same time.
+	@result TRUE if the call was already pending for either delayed or immediate
+	execution, FALSE otherwise.
+	@param call The thread call to execute.
+ */
 extern boolean_t	thread_call_enter(
 						thread_call_t	call);
-
+/*!
+	@function thread_call_enter1
+	@abstract Submit a thread call work item for immediate execution, with an extra parameter.
+	@discussion This routine is identical to thread_call_enter(), except that
+	the second parameter to the callback is specified.
+	@result TRUE if the call was already pending for either delayed or immediate
+	execution, FALSE otherwise.
+	@param call The thread call to execute.
+	@param param1 Parameter to pass to the callback.
+ */
 extern boolean_t	thread_call_enter1(
-					thread_call_t		call,
-					thread_call_param_t	param1);
-
+						thread_call_t		call,
+						thread_call_param_t	param1);
+
+/*!
+	@function thread_call_enter_delayed
+	@abstract Submit a thread call to be executed at some point in the future.
+	@discussion If the work item is already scheduled for delayed or immediate execution,
+	and it has not yet begun to run, that invocation will be cancelled in favor of execution
+	at the newly specified time. Note that if a thread call is rescheduled from its own callback,
+	then multiple invocations of the callback may be in flight at the same time.
+	@result TRUE if the call was already pending for either delayed or immediate
+	execution, FALSE otherwise.
+	@param call The thread call to execute.
+	@param deadline Time, in absolute time units, at which to execute callback.
+ */
 extern boolean_t	thread_call_enter_delayed(
 						thread_call_t		call,
-					uint64_t		deadline);
-
+						uint64_t		deadline);
+/*!
+	@function thread_call_enter1_delayed
+	@abstract Submit a thread call to be executed at some point in the future, with an extra parameter.
+	@discussion This routine is identical to thread_call_enter_delayed(),
+	except that a second parameter to the callback is specified.
+	@result TRUE if the call was already pending for either delayed or immediate
+	execution, FALSE otherwise.
+	@param call The thread call to execute.
+	@param param1 Second parameter to callback.
+	@param deadline Time, in absolute time units, at which to execute callback.
+ */
 extern boolean_t	thread_call_enter1_delayed(
-					thread_call_t		call,
-					thread_call_param_t	param1,
-					uint64_t		deadline);
-
+						thread_call_t		call,
+						thread_call_param_t	param1,
+						uint64_t		deadline);
+
+/*!
+	@function thread_call_cancel
+	@abstract Attempt to cancel a pending invocation of a thread call.
+	@discussion Attempt to cancel a thread call which has been scheduled
+	for execution with a thread_call_enter* variant. If the call has not
+	yet begun executing, the pending invocation will be cancelled and TRUE
+	will be returned. If the work item has already begun executing,
+	thread_call_cancel will return FALSE immediately; the callback may be
+	about to run, currently running, or already done executing.
+	@result TRUE if the call was successfully cancelled, FALSE otherwise.
+ */
 extern boolean_t	thread_call_cancel(
 						thread_call_t	call);
+/*!
+	@function thread_call_cancel_wait
+	@abstract Attempt to cancel a pending invocation of a thread call.
+	If unable to cancel, wait for the current invocation to finish.
+	@discussion Attempt to cancel a thread call which has been scheduled
+	for execution with a thread_call_enter* variant. If the call has not
+	yet begun executing, the pending invocation will be cancelled and TRUE
+	will be returned. If the work item has already begun executing,
+	thread_call_cancel_wait waits for the most recent invocation to finish. When
+	called on a work item which has already finished, it will return FALSE immediately.
+	Note that this routine can only be used on thread calls set up with either
+	thread_call_allocate or thread_call_allocate_with_priority, and that invocations
+	of the thread call after the current invocation may be in flight when
+	thread_call_cancel_wait returns.
+	@result TRUE if the call was successfully cancelled, FALSE otherwise.
+ */
+extern boolean_t	thread_call_cancel_wait(
+						thread_call_t call);
+
 /*!
+ @function thread_call_allocate
+ @abstract Allocate a thread call to execute with default (high) priority.
+ @discussion Allocates a thread call that will run with properties of
+ THREAD_CALL_PRIORITY_HIGH, binding the first parameter to the callback.
+ @param func Callback to invoke when thread call is scheduled.
+ @param param0 First argument to pass to callback.
+ @result Thread call which can be passed to thread_call_enter variants.
+ */
 extern thread_call_t	thread_call_allocate(
-					thread_call_func_t	func,
-					thread_call_param_t	param0);
-
-extern boolean_t	thread_call_free(
-					thread_call_t		call);
+						thread_call_func_t	func,
+						thread_call_param_t	param0);
+
+ /*!
+ @function thread_call_allocate_with_priority
+ @abstract Allocate a thread call to execute with a specified priority.
+ @discussion Identical to thread_call_allocate, except that priority
+ is specified by caller.
+ @param func Callback to invoke when thread call is scheduled.
+ @param param0 First argument to pass to callback.
+ @param pri Priority of item.
+ @result Thread call which can be passed to thread_call_enter variants.
+ */
+extern thread_call_t	thread_call_allocate_with_priority(
+						thread_call_func_t	func,
+						thread_call_param_t	param0,
+						thread_call_priority_t	pri);
+
+/*!
+ @function thread_call_free
+ @abstract Release a thread call.
+ @discussion Should only be used on thread calls allocated with thread_call_allocate
+ or thread_call_allocate_with_priority. Once thread_call_free has been called,
+ no other operations may be performed on a thread call. If the thread call is
+ currently pending, thread_call_free will return FALSE and will have no effect.
+ Calling thread_call_free from a thread call's own callback is safe; the work
+ item is not considered "pending" at that point.
+ @result TRUE if the thread call has been successfully released, else FALSE.
+ @param call The thread call to release.
+ */
+extern boolean_t	thread_call_free(
+					thread_call_t		call);
+/*!
+ @function thread_call_isactive
+ @abstract Determine whether a thread call is pending or currently executing.
+ @param call Thread call to examine.
+ @result TRUE if the thread call is either scheduled for execution (immediately
+ or at some point in the future) or is currently executing.
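+ (Illustrative use, editor's sketch rather than part of the original
+ documentation: after thread_call_cancel() returns FALSE, a client may
+ poll thread_call_isactive() to learn when the in-flight invocation
+ has drained.)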
+ */ +boolean_t thread_call_isactive( + thread_call_t call); __END_DECLS #ifdef MACH_KERNEL_PRIVATE #include -typedef struct call_entry thread_call_data_t; +struct thread_call { + struct call_entry tc_call; /* Must be first */ + uint64_t tc_submit_count; + uint64_t tc_finish_count; + thread_call_priority_t tc_pri; + + uint32_t tc_flags; + int32_t tc_refs; +}; + +#define THREAD_CALL_ALLOC 0x01 +#define THREAD_CALL_WAIT 0x02 + +typedef struct thread_call thread_call_data_t; extern void thread_call_initialize(void); @@ -100,22 +256,22 @@ __BEGIN_DECLS extern boolean_t thread_call_is_delayed( thread_call_t call, - uint64_t *deadline); + uint64_t *deadline); extern void thread_call_func( thread_call_func_t func, thread_call_param_t param, - boolean_t unique_call); + boolean_t unique_call); extern void thread_call_func_delayed( thread_call_func_t func, thread_call_param_t param, - uint64_t deadline); + uint64_t deadline); extern boolean_t thread_call_func_cancel( - thread_call_func_t func, - thread_call_param_t param, - boolean_t cancel_all); + thread_call_func_t func, + thread_call_param_t param, + boolean_t cancel_all); #else /* __LP64__ */ @@ -124,12 +280,12 @@ extern boolean_t thread_call_func_cancel( extern void thread_call_func_delayed( thread_call_func_t func, thread_call_param_t param, - uint64_t deadline); + uint64_t deadline); extern boolean_t thread_call_func_cancel( - thread_call_func_t func, - thread_call_param_t param, - boolean_t cancel_all); + thread_call_func_t func, + thread_call_param_t param, + boolean_t cancel_all); #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/kern/thread_policy.c b/osfmk/kern/thread_policy.c index 7ed70a151..8108514f5 100644 --- a/osfmk/kern/thread_policy.c +++ b/osfmk/kern/thread_policy.c @@ -164,12 +164,6 @@ thread_policy_set_internal( if (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) { thread->saved_mode = TH_MODE_REALTIME; } -#if CONFIG_EMBEDDED - else if (thread->task_priority <= MAXPRI_THROTTLE) { - thread->saved_mode = TH_MODE_REALTIME; - thread->sched_flags |= TH_SFLAG_THROTTLED; - } -#endif else { if (thread->sched_mode == TH_MODE_TIMESHARE) { if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) @@ -293,55 +287,31 @@ thread_throttle( thread_t thread, integer_t task_priority) { - if (!(thread->sched_flags & TH_SFLAG_THROTTLED) && - (task_priority <= MAXPRI_THROTTLE)) { - - if (!((thread->sched_mode == TH_MODE_REALTIME) || - (thread->saved_mode == TH_MODE_REALTIME))) { - return; - } - - /* Demote to timeshare if throttling */ - if (thread->sched_mode == TH_MODE_REALTIME) - { - thread->saved_mode = TH_MODE_REALTIME; - - if (thread->sched_mode == TH_MODE_TIMESHARE) { - if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) - sched_share_incr(); - } + if ((!(thread->sched_flags & TH_SFLAG_THROTTLED) + || (thread->sched_flags & TH_SFLAG_PENDING_THROTTLE_PROMOTION)) + && (task_priority <= MAXPRI_THROTTLE)) { + + /* Kill a promotion if it was in flight */ + thread->sched_flags &= ~TH_SFLAG_PENDING_THROTTLE_PROMOTION; + + if (!(thread->sched_flags & TH_SFLAG_THROTTLED)) { + /* + * Set the pending bit so that we can switch runqueues + * (potentially) at a later time safely + */ + thread->sched_flags |= TH_SFLAG_PENDING_THROTTLE_DEMOTION; } - - /* TH_SFLAG_FAILSAFE and TH_SFLAG_THROTTLED are mutually exclusive, - * since a throttled thread is not realtime during the throttle - * and doesn't need the failsafe repromotion. We therefore clear - * the former and set the latter flags here. 
-		 */
-		thread->sched_flags &= ~TH_SFLAG_FAILSAFE;
-		thread->sched_flags |= TH_SFLAG_THROTTLED;
-
-		if (SCHED(supports_timeshare_mode)())
-			thread->sched_mode = TH_MODE_TIMESHARE;
-		else
-			thread->sched_mode = TH_MODE_FIXED;
 	}
-	else if ((thread->sched_flags & TH_SFLAG_THROTTLED) &&
-			 (task_priority > MAXPRI_THROTTLE)) {
-
-		/* Promote back to real time if unthrottling */
-		if (!(thread->saved_mode == TH_MODE_TIMESHARE)) {
+	else if (((thread->sched_flags & TH_SFLAG_THROTTLED)
+		|| (thread->sched_flags & TH_SFLAG_PENDING_THROTTLE_DEMOTION))
+		&& (task_priority > MAXPRI_THROTTLE)) {
 
-			thread->sched_mode = thread->saved_mode;
+		/* Kill a demotion if it was in flight */
+		thread->sched_flags &= ~TH_SFLAG_PENDING_THROTTLE_DEMOTION;
 
-			if (thread->sched_mode == TH_MODE_TIMESHARE) {
-				if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN)
-					sched_share_decr();
-			}
-
-			thread->saved_mode = TH_MODE_NONE;
+		if (thread->sched_flags & TH_SFLAG_THROTTLED) {
+			thread->sched_flags |= TH_SFLAG_PENDING_THROTTLE_PROMOTION;
 		}
-
-		thread->sched_flags &= ~TH_SFLAG_THROTTLED;
 	}
 }
 #endif
@@ -393,6 +363,7 @@ thread_policy_reset(
 		}
 	}
 	else {
+		thread->sched_mode = thread->saved_mode;
 		thread->saved_mode = TH_MODE_NONE;
 		thread->sched_flags &= ~TH_SFLAG_DEMOTED_MASK;
 	}
diff --git a/osfmk/kern/timer.c b/osfmk/kern/timer.c
index 7cce6afe3..02a088597 100644
--- a/osfmk/kern/timer.c
+++ b/osfmk/kern/timer.c
@@ -56,7 +56,6 @@
 /*
  */
 
-#include
 #include
 #include
@@ -67,6 +66,12 @@
 #include
 #include
 
+#if	CONFIG_EMBEDDED
+int	precise_user_kernel_time = 0;
+#else
+int	precise_user_kernel_time = 1;
+#endif
+
 /*
  *	timer_init initializes a timer.
  */
@@ -74,9 +79,7 @@ void
 timer_init(
 	timer_t		timer)
 {
-#if	!STAT_TIME
 	timer->tstamp = 0;
-#endif	/* STAT_TIME */
 #if	defined(__LP64__)
 	timer->all_bits = 0;
 #else
@@ -120,8 +123,6 @@ timer_advance(
 #endif	/* defined(__LP64__) */
 }
 
-#if	!STAT_TIME
-
 void
 timer_start(
 	timer_t		timer,
@@ -188,5 +189,3 @@ thread_timer_event(
 }
 
 #endif	/* MACHINE_TIMER_ROUTINES */
-
-#endif	/* STAT_TIME */
diff --git a/osfmk/kern/timer.h b/osfmk/kern/timer.h
index abbfcb5e3..a353c6c29 100644
--- a/osfmk/kern/timer.h
+++ b/osfmk/kern/timer.h
@@ -59,19 +59,36 @@
 #ifndef	_KERN_TIMER_H_
 #define	_KERN_TIMER_H_
 
-#include
-
 #include
 
+/*
+ * Some platforms have very expensive timebase routines. An optimization
+ * is to avoid switching timers on kernel exit/entry, which results in all
+ * time billed to the system timer. However, when exposed to userspace,
+ * we report as user time to indicate that work was done on behalf of
+ * userspace.
+ *
+ * Although this policy is implemented as a global variable, we snapshot it
+ * at key points in the thread structure (when the thread is locked and
+ * executing in the kernel) to avoid imbalances.
+ */
+extern int precise_user_kernel_time;
+
+/*
+ * The thread must be locked, or be the current executing thread, so that
+ * it doesn't transition from user to kernel while updating the
+ * thread-local value (or in kernel debugger context). In the future,
+ * we may take into account task-level or thread-level policy.
+ */
+#define use_precise_user_kernel_time(thread) ( precise_user_kernel_time )
+
 /*
  *	Definitions for high resolution timers.  A check
  *	word on the high portion allows atomic updates.
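 *	(Editor's note: timer_update() maintains that check word, letting
 *	a reader detect and retry a torn 64-bit value on 32-bit
 *	configurations.)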
 */
 struct timer {
-#if	!STAT_TIME
 	uint64_t	tstamp;
-#endif	/* STAT_TIME */
 #if	defined(__LP64__)
 	uint64_t	all_bits;
 #else
@@ -87,32 +104,6 @@ typedef struct timer	timer_data_t, *timer_t;
 *	Exported kernel interface to timers
 */
 
-#if	STAT_TIME
-
-#include
-
-/* Advance a timer by a 32 bit value */
-#define TIMER_BUMP(timer, ticks)					\
-MACRO_BEGIN								\
-	uint32_t	old_low, low;					\
-									\
-	old_low = (timer)->low_bits;					\
-	low = old_low + (ticks);					\
-	if (low < old_low)						\
-		timer_update((timer), (timer)->high_bits + 1, low);	\
-	else								\
-		(timer)->low_bits = low;				\
-MACRO_END
-
-#define timer_start(timer, tstamp)
-#define timer_stop(timer, tstamp)
-#define timer_switch(timer, tstamp, new_timer)
-#define thread_timer_event(tstamp, new_timer)
-
-#else	/* STAT_TIME */
-
-#define TIMER_BUMP(timer, ticks)
-
 /* Start a timer by setting the timestamp */
 extern void	timer_start(
 			timer_t		timer,
@@ -134,8 +125,6 @@ extern void	thread_timer_event(
 			uint64_t	tstamp,
 			timer_t		new_timer);
 
-#endif	/* STAT_TIME */
-
 /* Initialize a timer */
 extern void	timer_init(
 			timer_t		timer);
diff --git a/osfmk/kern/timer_call.c b/osfmk/kern/timer_call.c
index 83eb0e43a..0d737dbbb 100644
--- a/osfmk/kern/timer_call.c
+++ b/osfmk/kern/timer_call.c
@@ -74,6 +74,15 @@ lck_grp_attr_t          timer_call_lck_grp_attr;
 #define MPQUEUE(x)	((mpqueue_head_t *)(x))
 #define TIMER_CALL(x)	((timer_call_t)(x))
 
+
+uint64_t past_deadline_timers;
+uint64_t past_deadline_deltas;
+uint64_t past_deadline_longest;
+uint64_t past_deadline_shortest = ~0ULL;
+enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000};
+
+uint64_t past_deadline_timer_adjustment;
+
 static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint32_t flags);
 boolean_t	mach_timer_coalescing_enabled = TRUE;
 
@@ -92,6 +101,7 @@ timer_call_initialize(void)
 	lck_attr_setdefault(&timer_call_lck_attr);
 	lck_grp_attr_setdefault(&timer_call_lck_grp_attr);
 	lck_grp_init(&timer_call_lck_grp, "timer_call", &timer_call_lck_grp_attr);
+	nanotime_to_absolutetime(0, PAST_DEADLINE_TIMER_ADJUSTMENT_NS, &past_deadline_timer_adjustment);
 }
 
 
@@ -332,6 +342,22 @@ timer_call_enter_internal(
 		deadline += slop;
 	}
 
+#if defined(__i386__) || defined(__x86_64__)
+	uint64_t ctime = mach_absolute_time();
+	if (__improbable(deadline < ctime)) {
+		uint64_t delta = (ctime - deadline);
+
+		past_deadline_timers++;
+		past_deadline_deltas += delta;
+		if (delta > past_deadline_longest)
+			past_deadline_longest = delta;
+		if (delta < past_deadline_shortest)
+			past_deadline_shortest = delta;
+
+		deadline = ctime + past_deadline_timer_adjustment;
+		call->soft_deadline = deadline;
+	}
+#endif
 	queue = timer_queue_assign(deadline);
 
 	old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline);
@@ -469,10 +495,9 @@ timer_queue_expire(
 
 			simple_unlock(&call->lock);
 			timer_call_unlock(queue);
-			KERNEL_DEBUG_CONSTANT(DECR_TIMER_CALLOUT | DBG_FUNC_START,
-				func,
-				param0,
-				param1, 0, 0);
+			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+				DECR_TIMER_CALLOUT | DBG_FUNC_START,
+				VM_KERNEL_UNSLIDE(func), param0, param1, 0, 0);
 
 #if CONFIG_DTRACE && (DEVELOPMENT || DEBUG )
 			DTRACE_TMR3(callout__start, timer_call_func_t, func,
@@ -488,10 +513,9 @@ timer_queue_expire(
 				timer_call_param_t, param1);
 #endif
 
-			KERNEL_DEBUG_CONSTANT(DECR_TIMER_CALLOUT | DBG_FUNC_END,
-				func,
-				param0,
-				param1, 0, 0);
+			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+				DECR_TIMER_CALLOUT | DBG_FUNC_END,
+				VM_KERNEL_UNSLIDE(func), param0, param1, 0, 0);
 
 			timer_call_lock_spin(queue);
 		}
diff --git a/osfmk/kern/wait_queue.c
b/osfmk/kern/wait_queue.c index 14cd08724..6bcabf8c1 100644 --- a/osfmk/kern/wait_queue.c +++ b/osfmk/kern/wait_queue.c @@ -84,7 +84,6 @@ static boolean_t wait_queue_member_locked( static void wait_queues_init(void) __attribute__((section("__TEXT, initcode"))); - #define WAIT_QUEUE_MAX thread_max #define WAIT_QUEUE_SET_MAX task_max * 3 #define WAIT_QUEUE_LINK_MAX PORT_MAX / 2 + (WAIT_QUEUE_MAX * WAIT_QUEUE_SET_MAX) / 64 @@ -128,16 +127,21 @@ volatile WaitQueueLink *unused_except_for_debugging; struct wait_queue boot_wait_queue[1]; __private_extern__ struct wait_queue *wait_queues = &boot_wait_queue[0]; - __private_extern__ uint32_t num_wait_queues = 1; +#define P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align))) +#define ROUNDDOWN(x,y) (((x)/(y))*(y)) + static uint32_t -compute_wait_hash_size(__unused unsigned cpu_count, __unused uint64_t memsize) { - uint32_t hsize = (uint32_t)round_page_64((thread_max / 11) * sizeof(struct wait_queue)); - uint32_t bhsize; +compute_wait_hash_size(void) +{ + uint32_t hsize, queues; - if (PE_parse_boot_argn("wqsize", &bhsize, sizeof(bhsize))) - hsize = bhsize; + if (PE_parse_boot_argn("wqsize", &hsize, sizeof(hsize))) + return (hsize); + + queues = thread_max / 11; + hsize = P2ROUNDUP(queues * sizeof(struct wait_queue), PAGE_SIZE); return hsize; } @@ -145,13 +149,37 @@ compute_wait_hash_size(__unused unsigned cpu_count, __unused uint64_t memsize) { static void wait_queues_init(void) { - uint32_t i, whsize; + uint32_t i, whsize, qsz; kern_return_t kret; - whsize = compute_wait_hash_size(processor_avail_count, machine_info.max_mem); - num_wait_queues = (whsize / ((uint32_t)sizeof(struct wait_queue))) - 1; + /* + * Determine the amount of memory we're willing to reserve for + * the waitqueue hash table + */ + whsize = compute_wait_hash_size(); + + /* Determine the number of waitqueues we can fit. */ + qsz = sizeof (struct wait_queue); + whsize = ROUNDDOWN(whsize, qsz); + num_wait_queues = whsize / qsz; + + /* + * The hash algorithm requires that this be a power of 2, so we + * just mask off all the low-order bits. + */ + for (i = 0; i < 31; i++) { + uint32_t bit = (1 << i); + if ((num_wait_queues & bit) == num_wait_queues) + break; + num_wait_queues &= ~bit; + } + assert(num_wait_queues > 0); + + /* Now determine how much memory we really need. */ + whsize = P2ROUNDUP(num_wait_queues * qsz, PAGE_SIZE); - kret = kernel_memory_allocate(kernel_map, (vm_offset_t *) &wait_queues, whsize, 0, KMA_KOBJECT|KMA_NOPAGEWAIT); + kret = kernel_memory_allocate(kernel_map, (vm_offset_t *) &wait_queues, + whsize, 0, KMA_KOBJECT|KMA_NOPAGEWAIT); if (kret != KERN_SUCCESS || wait_queues == NULL) panic("kernel_memory_allocate() failed to allocate wait queues, error: %d, whsize: 0x%x", kret, whsize); @@ -676,6 +704,60 @@ wait_queue_unlink_locked( WAIT_QUEUE_SET_CHECK(wq_set); } +/* + * Routine: wait_queue_unlink_nofree + * Purpose: + * Remove the linkage between a wait queue and a set, + * returning the linkage structure to the caller to + * free later. 
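+ *		(Typical pattern, editor's note: the caller collects the
+ *		returned link and frees it after dropping its own locks,
+ *		e.g. with wait_queue_link_free().)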
+ *	Conditions:
+ *		The wait queue must be a member of the set queue
+ */
+kern_return_t
+wait_queue_unlink_nofree(
+	wait_queue_t wq,
+	wait_queue_set_t wq_set,
+	wait_queue_link_t *wqlp)
+{
+	wait_queue_element_t wq_element;
+	wait_queue_link_t wql;
+	queue_t q;
+	spl_t s;
+
+	if (!wait_queue_is_valid(wq) || !wait_queue_is_set(wq_set)) {
+		return KERN_INVALID_ARGUMENT;
+	}
+	s = splsched();
+	wait_queue_lock(wq);
+
+	q = &wq->wq_queue;
+	wq_element = (wait_queue_element_t) queue_first(q);
+	while (!queue_end(q, (queue_entry_t)wq_element)) {
+		WAIT_QUEUE_ELEMENT_CHECK(wq, wq_element);
+		if (wq_element->wqe_type == WAIT_QUEUE_LINK ||
+		    wq_element->wqe_type == WAIT_QUEUE_LINK_NOALLOC) {
+
+			wql = (wait_queue_link_t)wq_element;
+
+			if (wql->wql_setqueue == wq_set) {
+
+				wqs_lock(wq_set);
+				wait_queue_unlink_locked(wq, wq_set, wql);
+				wqs_unlock(wq_set);
+				wait_queue_unlock(wq);
+				splx(s);
+				*wqlp = wql;
+				return KERN_SUCCESS;
+			}
+		}
+		wq_element = (wait_queue_element_t)
+			queue_next((queue_t) wq_element);
+	}
+	wait_queue_unlock(wq);
+	splx(s);
+	return KERN_NOT_IN_SET;
+}
+
 /*
  *	Routine:	wait_queue_unlink
  *	Purpose:
@@ -732,36 +814,97 @@ wait_queue_unlink(
 }
 
 /*
- *	Routine:	wait_queue_unlink_all
+ *	Routine:	wait_queue_unlink_all_nofree_locked
  *	Purpose:
 *		Remove the linkage between a wait queue and all its sets.
- *		All the linkage structures that were allocated internally
- *		are freed.  The others are the caller's responsibility.
+ *		All the linkage structures are returned to the caller for
+ *		later freeing.
 *	Conditions:
- *		Nothing of interest locked.
+ *		Wait queue locked.
 */
 
-kern_return_t
-wait_queue_unlink_all(
-	wait_queue_t wq)
+static void
+wait_queue_unlink_all_nofree_locked(
+	wait_queue_t wq,
+	queue_t links)
 {
 	wait_queue_element_t wq_element;
 	wait_queue_element_t wq_next_element;
 	wait_queue_set_t wq_set;
 	wait_queue_link_t wql;
-	queue_head_t links_queue_head;
-	queue_t links = &links_queue_head;
 	queue_t q;
+
+	q = &wq->wq_queue;
+
+	wq_element = (wait_queue_element_t) queue_first(q);
+	while (!queue_end(q, (queue_entry_t)wq_element)) {
+
+		WAIT_QUEUE_ELEMENT_CHECK(wq, wq_element);
+		wq_next_element = (wait_queue_element_t)
+			queue_next((queue_t) wq_element);
+
+		if (wq_element->wqe_type == WAIT_QUEUE_LINK ||
+		    wq_element->wqe_type == WAIT_QUEUE_LINK_NOALLOC) {
+			wql = (wait_queue_link_t)wq_element;
+			wq_set = wql->wql_setqueue;
+			wqs_lock(wq_set);
+			wait_queue_unlink_locked(wq, wq_set, wql);
+			wqs_unlock(wq_set);
+			enqueue(links, &wql->wql_links);
+		}
+		wq_element = wq_next_element;
+	}
+}
+
+/*
+ *	Routine:	wait_queue_unlink_all_nofree
+ *	Purpose:
+ *		Remove the linkage between a wait queue and all its sets.
+ *		All the linkage structures are returned to the caller for
+ *		later freeing.
+ *	Conditions:
+ *		Nothing of interest locked.
+ */
+
+kern_return_t
+wait_queue_unlink_all_nofree(
+	wait_queue_t wq,
+	queue_t links)
+{
 	spl_t s;
 
 	if (!wait_queue_is_valid(wq)) {
 		return KERN_INVALID_ARGUMENT;
 	}
 
-	queue_init(links);
-
 	s = splsched();
 	wait_queue_lock(wq);
+	wait_queue_unlink_all_nofree_locked(wq, links);
+	wait_queue_unlock(wq);
+	splx(s);
+
+	return(KERN_SUCCESS);
+}
+
+/*
+ *	Routine:	wait_queue_unlink_all_locked
+ *	Purpose:
+ *		Remove the linkage between a locked wait queue and all its
+ *		sets and enqueue the allocated ones onto the links queue
+ *		provided.
+ *	Conditions:
+ *		Wait queue locked.
+ */ +static void +wait_queue_unlink_all_locked( + wait_queue_t wq, + queue_t links) +{ + wait_queue_element_t wq_element; + wait_queue_element_t wq_next_element; + wait_queue_set_t wq_set; + wait_queue_link_t wql; + queue_t q; q = &wq->wq_queue; @@ -785,6 +928,38 @@ wait_queue_unlink_all( } wq_element = wq_next_element; } + +} + + +/* + * Routine: wait_queue_unlink_all + * Purpose: + * Remove the linkage between a wait queue and all its sets. + * All the linkage structures that were allocated internally + * are freed. The others are the caller's responsibility. + * Conditions: + * Nothing of interest locked. + */ + +kern_return_t +wait_queue_unlink_all( + wait_queue_t wq) +{ + wait_queue_link_t wql; + queue_head_t links_queue_head; + queue_t links = &links_queue_head; + spl_t s; + + if (!wait_queue_is_valid(wq)) { + return KERN_INVALID_ARGUMENT; + } + + queue_init(links); + + s = splsched(); + wait_queue_lock(wq); + wait_queue_unlink_all_locked(wq, links); wait_queue_unlock(wq); splx(s); @@ -805,12 +980,70 @@ wait_subqueue_unlink_all( } +/* + * Routine: wait_queue_set_unlink_all_nofree + * Purpose: + * Remove the linkage between a set wait queue and all its + * member wait queues and all the sets it may be a member of. + * The links structures are returned for later freeing by the + * caller. + * Conditions: + * The wait queue must be a set + */ +kern_return_t +wait_queue_set_unlink_all_nofree( + wait_queue_set_t wq_set, + queue_t links) +{ + wait_queue_link_t wql; + wait_queue_t wq; + queue_t q; + spl_t s; + + if (!wait_queue_is_set(wq_set)) { + return KERN_INVALID_ARGUMENT; + } + +retry: + s = splsched(); + wqs_lock(wq_set); + + /* remove the wait queues that are members of our set */ + q = &wq_set->wqs_setlinks; + + wql = (wait_queue_link_t)queue_first(q); + while (!queue_end(q, (queue_entry_t)wql)) { + WAIT_QUEUE_SET_LINK_CHECK(wq_set, wql); + wq = wql->wql_queue; + if (wait_queue_lock_try(wq)) { + wait_queue_unlink_locked(wq, wq_set, wql); + wait_queue_unlock(wq); + enqueue(links, &wql->wql_links); + wql = (wait_queue_link_t)queue_first(q); + } else { + wqs_unlock(wq_set); + splx(s); + delay(1); + goto retry; + } + } + + /* remove this set from sets it belongs to */ + wait_queue_unlink_all_nofree_locked(&wq_set->wqs_wait_queue, links); + + wqs_unlock(wq_set); + splx(s); + + return(KERN_SUCCESS); +} + /* * Routine: wait_queue_set_unlink_all * Purpose: * Remove the linkage between a set wait queue and all its - * member wait queues. The link structures are freed for those - * links which were dynamically allocated. + * member wait queues and all the sets it may be members of. + * The link structures are freed for those links which were + * dynamically allocated. 
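[Editor's note] The set-unlink paths take the set lock first and then each member queue's lock, the reverse of the usual ordering, so they must use wait_queue_lock_try() and back off completely on failure. The same discipline in portable form, with pthread mutexes and usleep() standing in for the kernel's lock_try/delay(1):

#include <pthread.h>
#include <unistd.h>

/* Take 'outer' then 'inner' without lock-order deadlock: never block
 * on 'inner' while 'outer' is held. On contention, release everything,
 * pause briefly, and restart from the top. */
static void
lock_pair(pthread_mutex_t *outer, pthread_mutex_t *inner)
{
retry:
    pthread_mutex_lock(outer);
    if (pthread_mutex_trylock(inner) != 0) {
        pthread_mutex_unlock(outer);
        usleep(1);              /* analogous to delay(1) */
        goto retry;
    }
    /* both locks held here; caller releases inner, then outer */
}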
* Conditions: * The wait queue must be a set */ @@ -835,6 +1068,7 @@ retry: s = splsched(); wqs_lock(wq_set); + /* remove the wait queues that are members of our set */ q = &wq_set->wqs_setlinks; wql = (wait_queue_link_t)queue_first(q); @@ -857,6 +1091,11 @@ retry: goto retry; } } + + + /* remove this set from sets it belongs to */ + wait_queue_unlink_all_locked(&wq_set->wqs_wait_queue, links); + wqs_unlock(wq_set); splx(s); diff --git a/osfmk/kern/wait_queue.h b/osfmk/kern/wait_queue.h index 42675a30b..fc91a60af 100644 --- a/osfmk/kern/wait_queue.h +++ b/osfmk/kern/wait_queue.h @@ -36,13 +36,13 @@ #include /* for kern_return_t */ #include /* for wait_queue_t */ +#include #include #ifdef MACH_KERNEL_PRIVATE #include -#include #include #include @@ -271,12 +271,11 @@ static inline uint32_t wq_hash(char *key) hash ^= (hash >> 11); hash += (hash << 15); + hash &= (num_wait_queues - 1); return hash; } -/* TBD: It should be possible to eliminate the divide here */ -#define wait_hash(event) \ - (wq_hash((char *)&event) % (num_wait_queues)) +#define wait_hash(event) wq_hash((char *)&event) #endif /* MACH_KERNEL_PRIVATE */ @@ -335,6 +334,19 @@ extern kern_return_t wait_queue_set_unlink_one( wait_queue_set_t set_queue, wait_queue_link_t link); +extern kern_return_t wait_queue_unlink_nofree( + wait_queue_t wait_queue, + wait_queue_set_t set_queue, + wait_queue_link_t *wqlp); + +extern kern_return_t wait_queue_unlink_all_nofree( + wait_queue_t wait_queue, + queue_t links); + +extern kern_return_t wait_queue_set_unlink_all_nofree( + wait_queue_set_t set_queue, + queue_t links); + extern wait_queue_link_t wait_queue_link_allocate(void); #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/kern/xpr.c b/osfmk/kern/xpr.c index 1b4d16707..a10ec384d 100644 --- a/osfmk/kern/xpr.c +++ b/osfmk/kern/xpr.c @@ -53,7 +53,6 @@ * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ -#include /* * xpr silent tracing circular buffer. */ @@ -125,174 +124,3 @@ xpr( mp_enable_preemption(); } -void -xprbootstrap(void) -{ - vm_offset_t addr; - vm_size_t size; - kern_return_t kr; - - simple_lock_init(&xprlock, 0); - if (nxprbufs == 0) - return; /* assume XPR support not desired */ - - /* leave room at the end for a saved copy of xprptr */ - size = nxprbufs * sizeof(struct xprbuf) + sizeof xprptr; - - kr = kmem_alloc_kobject(kernel_map, &addr, size); - if (kr != KERN_SUCCESS) - panic("xprbootstrap"); - - if (xprenable) { - /* - * If xprenable is set (the default) then we zero - * the buffer so xpr_dump doesn't encounter bad pointers. - * If xprenable isn't set, then we preserve - * the original contents of the buffer. This is useful - * if memory survives reboots, so xpr_dump can show - * the previous buffer contents. - */ - - (void) memset((void *) addr, 0, size); - } - - xprbase = (struct xprbuf *) addr; - xprlast = &xprbase[nxprbufs]; - xprptr = xprbase; /* setting xprptr enables tracing */ -} - -int xprinitial = 0; - -void -xprinit(void) -{ - xprflags |= xprinitial; -} - -#if MACH_KDB -#include - -/* - * Prototypes for functions called from the debugger - */ -void -xpr_dump( - struct xprbuf *base, - int nbufs); - -void -xpr_search( - int arg_index, - int value); - -extern jmp_buf_t *db_recover; - -/* - * Print current content of xpr buffers (KDB's sake) - * Use stack order to make it understandable. - * - * Called as "!xpr_dump" this dumps the kernel's xpr buffer. 
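[Editor's note] The wait_queue.h hunk above resolves the old "TBD: It should be possible to eliminate the divide" comment: because num_wait_queues is now forced to a power of two at init, wq_hash() can mask with num_wait_queues - 1 instead of taking a modulo. A self-checking sketch of why the two are equivalent:

#include <assert.h>
#include <stdint.h>

/* When nbuckets is a power of two, (h % nbuckets) == (h & (nbuckets - 1)):
 * the low bits are exactly the remainder, and the AND avoids a divide
 * on the hot lookup path. */
static uint32_t
bucket_of(uint32_t h, uint32_t nbuckets)
{
    assert(nbuckets != 0 && (nbuckets & (nbuckets - 1)) == 0);
    return h & (nbuckets - 1);
}

int
main(void)
{
    uint32_t h;

    for (h = 0; h < 100000; h++)
        assert(bucket_of(h, 1024) == h % 1024);
    return 0;
}

With a non-power-of-two count the mask would silently skip buckets, which is why the init code truncates the count first.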
- * Called with arguments, it can dump xpr buffers in user tasks, - * assuming they use the same format as the kernel. - */ -static spl_t xpr_dump_spl; -static struct xprbuf *base; -static int nbufs; -void -xpr_dump( - struct xprbuf *_base, - int _nbufs) -{ - jmp_buf_t db_jmpbuf; - jmp_buf_t *prev; - struct xprbuf *last, *ptr; - register struct xprbuf *x; - int i; - - base = _base; - nbufs = _nbufs; - - if (base == 0) { - base = xprbase; - nbufs = nxprbufs; - } - - if (nbufs == 0) - return; - - if (base == xprbase) { - xpr_dump_spl = splhigh(); - simple_lock(&xprlock); - } - - last = base + nbufs; - ptr = * (struct xprbuf **) last; - - prev = db_recover; - if (_setjmp(db_recover = &db_jmpbuf) == 0) - for (x = ptr, i = 0; i < nbufs; i++) { - if (--x < base) - x = last - 1; - - if (x->msg == 0) - break; - - db_printf("<%d:%x:%x> ", x - base, x->cpuinfo, x->timestamp); - db_printf(x->msg, x->arg1,x->arg2,x->arg3,x->arg4,x->arg5); - } - db_recover = prev; - - if (base == xprbase) { - simple_unlock(&xprlock); - splx(xpr_dump_spl); - } -} - -/* - * dump xpr table with a selection criteria. - * argument number "arg_index" must equal "value" - */ - -void -xpr_search( - int arg_index, - int value) -{ - jmp_buf_t db_jmpbuf; - jmp_buf_t *prev; - register struct xprbuf *x; - spl_t s; - int n; - - if (!nxprbufs) - return; - - s = splhigh(); - simple_lock(&xprlock); - - prev = db_recover; - if (_setjmp(db_recover = &db_jmpbuf) == 0) { - n = nxprbufs; - - for (x = *(struct xprbuf **)xprlast ; n--; ) { - if (--x < xprbase) - x = xprlast - 1; - - if (x->msg == 0) { - break; - } - - if (*((&x->arg1)+arg_index) != value) - continue; - - db_printf("<%d:%d:%x> ", x - xprbase, - x->cpuinfo, x->timestamp); - db_printf(x->msg, x->arg1,x->arg2,x->arg3,x->arg4,x->arg5); - } - } - db_recover = prev; - - simple_unlock(&xprlock); - splx(s); -} -#endif /* MACH_KDB */ diff --git a/osfmk/kern/zalloc.c b/osfmk/kern/zalloc.c index 9d1afa5d6..dc9ea000c 100644 --- a/osfmk/kern/zalloc.c +++ b/osfmk/kern/zalloc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -64,8 +64,6 @@ */ #include #include -#include -#include #include #include @@ -74,6 +72,7 @@ #include #include #include +#include #include #include @@ -92,6 +91,8 @@ #include #include +#include + #include #include @@ -101,24 +102,130 @@ /* * Zone Corruption Debugging * - * We provide three methods to detect use of a zone element after it's been freed. These - * checks are enabled by specifying "-zc" and/or "-zp" in the boot-args: + * We perform three methods to detect use of a zone element after it's been freed. These + * checks are enabled for every N'th element (counted per-zone) by specifying + * "zp-factor=N" as a boot-arg. To turn this feature off, set "zp-factor=0" or "-no-zp". + * + * (1) Range-check the free-list "next" pointer for sanity. + * (2) Store the pointer in two different words, one at the beginning of the freed element + * and one at the end, and compare them against each other when re-using the element, + * to detect modifications. + * (3) Poison the freed memory by overwriting it with 0xdeadbeef, and check it when the + * memory is being reused to make sure it is still poisoned. 
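[Editor's note] free_to_zone(), shown below this comment block, implements the free side of those three checks: stamp a ZP_POISONED marker in the word after the next pointer, fill the body with 0xdeadbeef, and mirror the free-list pointer in the element's last word. A compact user-space model, assuming (as the kernel does) that the element is large enough for two pointers plus one uint32_t:

#include <stddef.h>
#include <stdint.h>

#define ZP_POISON       0xdeadbeef
#define ZP_POISONED     0xfeedface
#define ZP_NOT_POISONED 0xbaddecaf

/* Poison one freed element and link it onto the free list: word 0 is
 * the next pointer, the following uint32_t is the marker, the middle
 * is poison fill, and the last pointer-sized word duplicates the next
 * pointer for cross-checking at allocation time. */
static void
poison_free(void *elem, size_t elem_size, uintptr_t *free_head)
{
    uintptr_t *words = elem;
    uint32_t *u32 = elem;
    size_t i = sizeof(uintptr_t) / sizeof(uint32_t);

    u32[i] = ZP_POISONED;                 /* marker read back at alloc */
    for (i++; i < elem_size / sizeof(uint32_t); i++)
        u32[i] = ZP_POISON;               /* 0xdeadbeef body fill */
    words[elem_size / sizeof(uintptr_t) - 1] = *free_head;

    words[0] = *free_head;                /* ordinary free-list link */
    *free_head = (uintptr_t)elem;
}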
+ * + * As a result, each element (that is large enough to hold this data inside) must be marked + * as either "ZP_POISONED" or "ZP_NOT_POISONED" in the first integer within the would-be + * poisoned segment after the first free-list pointer. * - * (1) Range-check the free-list "next" ptr for sanity. - * (2) Store the ptr in two different words, and compare them against - * each other when re-using the zone element, to detect modifications. - * (3) poison the freed memory by overwriting it with 0xdeadbeef. + * Performance slowdown is inversely proportional to the frequency with which you check + * (as would be expected), with a 4-5% hit around N=1, down to ~0.3% at N=16 and just + * "noise" at N=32 and higher. You can expect to find a 100% reproducible + * bug in an average of N tries, with a standard deviation of about N, but you will probably + * want to set "zp-factor=1" or "-zp" if you are attempting to reproduce a known bug. * - * The first two checks are fairly light weight and are enabled by specifying "-zc" - * in the boot-args. If you want more aggressive checking for use-after-free bugs - * and you don't mind the additional overhead, then turn on poisoning by adding - * "-zp" to the boot-args in addition to "-zc". If you specify -zp without -zc, - * it still poisons the memory when it's freed, but doesn't check if the memory - * has been altered later when it's reallocated. + * + * Zone corruption logging + * + * You can also track where corruptions come from by using the boot-arguments: + * "zlog= -zc". Search for "Zone corruption logging" later in this + * document for more implementation and usage information. + */ + +#define ZP_POISON 0xdeadbeef +#define ZP_POISONED 0xfeedface +#define ZP_NOT_POISONED 0xbaddecaf + +#if CONFIG_EMBEDDED + #define ZP_DEFAULT_SAMPLING_FACTOR 0 +#else /* CONFIG_EMBEDDED */ + #define ZP_DEFAULT_SAMPLING_FACTOR 16 +#endif /* CONFIG_EMBEDDED */ + +uint32_t free_check_sample_factor = 0; /* set by zp-factor=N boot arg */ +boolean_t corruption_debug_flag = FALSE; /* enabled by "-zc" boot-arg */ + +/* + * Zone checking helper macro. + */ +#define is_kernel_data_addr(a) (!(a) || ((a) >= vm_min_kernel_address && !((a) & 0x3))) + +/* + * Frees the specified element, which is within the specified zone. If this + * element should be poisoned and its free list checker should be set, both are + * done here. These checks will only be enabled if the element size is at least + * large enough to hold two vm_offset_t's and one uint32_t (to enable both types + * of checks). + */ +static inline void +free_to_zone(zone_t zone, void *elem) { + /* get the index of the first uint32_t beyond the 'next' pointer */ + unsigned int i = sizeof(vm_offset_t) / sizeof(uint32_t); + + /* should we run checks on this piece of memory? 
*/ + if (free_check_sample_factor != 0 && + zone->free_check_count++ % free_check_sample_factor == 0 && + zone->elem_size >= (2 * sizeof(vm_offset_t) + sizeof(uint32_t))) { + zone->free_check_count = 1; + ((uint32_t *) elem)[i] = ZP_POISONED; + for (i++; i < zone->elem_size / sizeof(uint32_t); i++) { + ((uint32_t *) elem)[i] = ZP_POISON; + } + ((vm_offset_t *) elem)[((zone->elem_size)/sizeof(vm_offset_t))-1] = zone->free_elements; + } else { + ((uint32_t *) elem)[i] = ZP_NOT_POISONED; + } + + /* maintain free list and decrement number of active objects in zone */ + ((vm_offset_t *) elem)[0] = zone->free_elements; + zone->free_elements = (vm_offset_t) elem; + zone->count--; +} + +/* + * Allocates an element from the specifed zone, storing its address in the + * return arg. This function will look for corruptions revealed through zone + * poisoning and free list checks. */ +static inline void +alloc_from_zone(zone_t zone, void **ret) { + void *elem = (void *) zone->free_elements; + if (elem != NULL) { + /* get the index of the first uint32_t beyond the 'next' pointer */ + unsigned int i = sizeof(vm_offset_t) / sizeof(uint32_t); + + /* first int in data section must be ZP_POISONED or ZP_NOT_POISONED */ + if (((uint32_t *) elem)[i] == ZP_POISONED && + zone->elem_size >= (2 * sizeof(vm_offset_t) + sizeof(uint32_t))) { + /* check the free list pointers */ + if (!is_kernel_data_addr(((vm_offset_t *) elem)[0]) || + ((vm_offset_t *) elem)[0] != + ((vm_offset_t *) elem)[(zone->elem_size/sizeof(vm_offset_t))-1]) { + panic("a freed zone element has been modified in zone: %s", + zone->zone_name); + } + + /* check for poisoning in free space */ + for (i++; + i < zone->elem_size / sizeof(uint32_t) - + sizeof(vm_offset_t) / sizeof(uint32_t); + i++) { + if (((uint32_t *) elem)[i] != ZP_POISON) { + panic("a freed zone element has been modified in zone: %s", + zone->zone_name); + } + } + } else if (((uint32_t *) elem)[i] != ZP_NOT_POISONED) { + panic("a freed zone element has been modified in zone: %s", + zone->zone_name); + } + + zone->count++; + zone->sum_count++; + zone->free_elements = ((vm_offset_t *) elem)[0]; + } + *ret = elem; +} -boolean_t check_freed_element = FALSE; /* enabled by -zc in boot-args */ -boolean_t zfree_clear = FALSE; /* enabled by -zp in boot-args */ /* * Fake zones for things that want to report via zprint but are not actually zones. 
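[Editor's note] alloc_from_zone() above is the consuming side: if the marker reads ZP_POISONED it cross-checks the pointer copies at both ends of the element and verifies every intermediate word still holds the poison pattern, panicking on the first mismatch. A user-space counterpart to the free-side model, with abort() standing in for panic() (the kernel additionally range-checks the pointer via is_kernel_data_addr()):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define ZP_POISON       0xdeadbeef
#define ZP_POISONED     0xfeedface
#define ZP_NOT_POISONED 0xbaddecaf

/* Validate a free element before handing it out again; any write to
 * freed memory is treated as fatal. */
static void
check_on_alloc(void *elem, size_t elem_size)
{
    uintptr_t *words = elem;
    uint32_t *u32 = elem;
    size_t i = sizeof(uintptr_t) / sizeof(uint32_t);
    size_t last = elem_size / sizeof(uintptr_t) - 1;

    if (u32[i] == ZP_POISONED) {
        /* the two copies of the next pointer must still agree */
        if (words[0] != words[last]) {
            fprintf(stderr, "freed element modified\n");
            abort();
        }
        /* everything between marker and trailing copy must be poison */
        for (i++; i < elem_size / sizeof(uint32_t) -
            sizeof(uintptr_t) / sizeof(uint32_t); i++) {
            if (u32[i] != ZP_POISON) {
                fprintf(stderr, "freed element modified\n");
                abort();
            }
        }
    } else if (u32[i] != ZP_NOT_POISONED) {
        fprintf(stderr, "freed element modified\n");
        abort();
    }
}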
@@ -131,26 +238,25 @@ struct fake_zone_info { uint64_t *, int *, int *, int *); }; -static struct fake_zone_info fake_zones[] = { +static const struct fake_zone_info fake_zones[] = { { .name = "kernel_stacks", .init = stack_fake_zone_init, .query = stack_fake_zone_info, }, -#if defined(__i386__) || defined (__x86_64__) { .name = "page_tables", .init = pt_fake_zone_init, .query = pt_fake_zone_info, }, -#endif /* i386 */ { .name = "kalloc.large", .init = kalloc_fake_zone_init, .query = kalloc_fake_zone_info, }, }; -unsigned int num_fake_zones = sizeof(fake_zones)/sizeof(fake_zones[0]); +static const unsigned int num_fake_zones = + sizeof (fake_zones) / sizeof (fake_zones[0]); /* * Zone info options @@ -159,61 +265,6 @@ boolean_t zinfo_per_task = FALSE; /* enabled by -zinfop in boot-args */ #define ZINFO_SLOTS 200 /* for now */ #define ZONES_MAX (ZINFO_SLOTS - num_fake_zones - 1) -/* - * Allocation helper macros - */ -#define is_kernel_data_addr(a) (!(a) || ((a) >= vm_min_kernel_address && !((a) & 0x3))) - -#define ADD_TO_ZONE(zone, element) \ -MACRO_BEGIN \ - if (zfree_clear) \ - { unsigned int i; \ - for (i=0; \ - i < zone->elem_size/sizeof(uint32_t); \ - i++) \ - ((uint32_t *)(element))[i] = 0xdeadbeef; \ - } \ - *((vm_offset_t *)(element)) = (zone)->free_elements; \ - if (check_freed_element) { \ - if ((zone)->elem_size >= (2 * sizeof(vm_offset_t))) \ - ((vm_offset_t *)(element))[((zone)->elem_size/sizeof(vm_offset_t))-1] = \ - (zone)->free_elements; \ - } \ - (zone)->free_elements = (vm_offset_t) (element); \ - (zone)->count--; \ -MACRO_END - -#define REMOVE_FROM_ZONE(zone, ret, type) \ -MACRO_BEGIN \ - (ret) = (type) (zone)->free_elements; \ - if ((ret) != (type) 0) { \ - if (check_freed_element) { \ - if (!is_kernel_data_addr(((vm_offset_t *)(ret))[0]) || \ - ((zone)->elem_size >= (2 * sizeof(vm_offset_t)) && \ - ((vm_offset_t *)(ret))[((zone)->elem_size/sizeof(vm_offset_t))-1] != \ - ((vm_offset_t *)(ret))[0])) \ - panic("a freed zone element has been modified");\ - if (zfree_clear) { \ - unsigned int ii; \ - for (ii = sizeof(vm_offset_t) / sizeof(uint32_t); \ - ii < (zone)->elem_size/sizeof(uint32_t) - sizeof(vm_offset_t) / sizeof(uint32_t); \ - ii++) \ - if (((uint32_t *)(ret))[ii] != (uint32_t)0xdeadbeef) \ - panic("a freed zone element has been modified");\ - } \ - } \ - (zone)->count++; \ - (zone)->sum_count++; \ - (zone)->free_elements = *((vm_offset_t *)(ret)); \ - } \ -MACRO_END - -#if ZONE_DEBUG -#define zone_debug_enabled(z) z->active_zones.next -#define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y)) -#define ZONE_DEBUG_OFFSET ROUNDUP(sizeof(queue_chain_t),16) -#endif /* ZONE_DEBUG */ - /* * Support for garbage collection of unused zone pages * @@ -255,7 +306,8 @@ void zone_page_alloc( vm_size_t size); void zone_page_free_element( - zone_page_index_t *free_page_list, + zone_page_index_t *free_page_head, + zone_page_index_t *free_page_tail, vm_offset_t addr, vm_size_t size); @@ -277,12 +329,6 @@ void zalloc_async( void zone_display_zprint( void ); -#if ZONE_DEBUG && MACH_KDB -int zone_count( - zone_t z, - int tail); -#endif /* ZONE_DEBUG && MACH_KDB */ - vm_map_t zone_map = VM_MAP_NULL; zone_t zone_zone = ZONE_NULL; /* the zone containing other zones */ @@ -298,16 +344,6 @@ zone_t zinfo_zone = ZONE_NULL; /* zone of per-task zone info */ vm_offset_t zdata; vm_size_t zdata_size; -#define lock_zone(zone) \ -MACRO_BEGIN \ - lck_mtx_lock_spin(&(zone)->lock); \ -MACRO_END - -#define unlock_zone(zone) \ -MACRO_BEGIN \ - lck_mtx_unlock(&(zone)->lock); \ -MACRO_END - #define zone_wakeup(zone) 
thread_wakeup((event_t)(zone)) #define zone_sleep(zone) \ (void) lck_mtx_sleep(&(zone)->lock, LCK_SLEEP_SPIN, (event_t)(zone), THREAD_UNINT); @@ -354,15 +390,14 @@ lck_grp_t zone_lck_grp; lck_grp_attr_t zone_lck_grp_attr; lck_mtx_ext_t zone_lck_ext; - #if !ZONE_ALIAS_ADDR #define from_zone_map(addr, size) \ ((vm_offset_t)(addr) >= zone_map_min_address && \ ((vm_offset_t)(addr) + size -1) < zone_map_max_address) #else #define from_zone_map(addr, size) \ - ((vm_offset_t)(zone_virtual_addr((vm_map_address_t)addr)) >= zone_map_min_address && \ - ((vm_offset_t)(zone_virtual_addr((vm_map_address_t)addr)) + size -1) < zone_map_max_address) + ((vm_offset_t)(zone_virtual_addr((vm_map_address_t)(uintptr_t)addr)) >= zone_map_min_address && \ + ((vm_offset_t)(zone_virtual_addr((vm_map_address_t)(uintptr_t)addr)) + size -1) < zone_map_max_address) #endif /* @@ -423,8 +458,9 @@ static char zone_name_to_log[MAX_ZONE_NAME] = ""; /* the zone name we're logging * records since going much larger than this tends to make the system unresponsive and unbootable on small * memory configurations. The default value is 4000 records. */ + #if defined(__LP64__) -#define ZRECORDS_MAX 16000 /* Max records allowed in the log */ +#define ZRECORDS_MAX 128000 /* Max records allowed in the log */ #else #define ZRECORDS_MAX 8000 /* Max records allowed in the log */ #endif @@ -518,7 +554,7 @@ extern boolean_t zlog_ready; /* * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding - * allocations made by the zone allocator. Every z_sample_factor allocations in each zone, we capture a + * allocations made by the zone allocator. Every zleak_sample_factor allocations in each zone, we capture a * backtrace. Every free, we examine the table and determine if the allocation was being tracked, * and stop tracking it if it was being tracked. * @@ -539,7 +575,7 @@ uint32_t zleak_state = 0; /* State of collection, as above */ boolean_t panic_include_ztrace = FALSE; /* Enable zleak logging on panic */ vm_size_t zleak_global_tracking_threshold; /* Size of zone map at which to start collecting data */ vm_size_t zleak_per_zone_tracking_threshold; /* Size a zone will have before we will collect data on it */ -unsigned int z_sample_factor = 1000; /* Allocations per sample attempt */ +unsigned int zleak_sample_factor = 1000; /* Allocations per sample attempt */ /* * Counters for allocation statistics. @@ -575,11 +611,8 @@ struct zallocation { }; /* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */ -#define ZLEAK_ALLOCATION_MAP_NUM 16384 -#define ZLEAK_TRACE_MAP_NUM 8192 - -uint32_t zleak_alloc_buckets = ZLEAK_ALLOCATION_MAP_NUM; -uint32_t zleak_trace_buckets = ZLEAK_TRACE_MAP_NUM; +uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM; +uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM; vm_size_t zleak_max_zonemap_size; @@ -591,7 +624,7 @@ static struct ztrace* ztraces; struct ztrace* top_ztrace; /* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. 
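[Editor's note] zleak_log(), further down, guards these tables with a try-lock and simply drops the sample on contention; the patch also converts zleak_lock from a spin-held mutex to a plain lck_spin_t, which matches that never-block usage. A portable sketch of the drop-on-contention policy:

#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t samples_taken, samples_dropped;

/* The allocation hot path must never block on instrumentation: take
 * the table lock only if it is immediately free, otherwise count a
 * conflict and drop this sample. */
static int
try_record_sample(void)
{
    if (pthread_mutex_trylock(&table_lock) != 0) {
        __sync_fetch_and_add(&samples_dropped, 1);
        return 0;               /* dropped; caller may retry later */
    }
    samples_taken++;            /* update the shared tables here */
    pthread_mutex_unlock(&table_lock);
    return 1;
}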
*/ -static lck_mtx_t zleak_lock; +static lck_spin_t zleak_lock; static lck_attr_t zleak_lock_attr; static lck_grp_t zleak_lock_grp; static lck_grp_attr_t zleak_lock_grp_attr; @@ -609,6 +642,15 @@ zleak_init(vm_size_t max_zonemap_size) zleak_global_tracking_threshold = max_zonemap_size / 2; zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8; +#if CONFIG_EMBEDDED + if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) { + zleak_enable_flag = TRUE; + printf("zone leak detection enabled\n"); + } else { + zleak_enable_flag = FALSE; + printf("zone leak detection disabled\n"); + } +#else /* CONFIG_EMBEDDED */ /* -zleakoff (flag to disable zone leak monitor) */ if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) { zleak_enable_flag = FALSE; @@ -617,12 +659,13 @@ zleak_init(vm_size_t max_zonemap_size) zleak_enable_flag = TRUE; printf("zone leak detection enabled\n"); } +#endif /* CONFIG_EMBEDDED */ /* zfactor=XXXX (override how often to sample the zone allocator) */ - if (PE_parse_boot_argn("zfactor", &z_sample_factor, sizeof(z_sample_factor))) { - printf("Zone leak factor override:%u\n", z_sample_factor); + if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) { + printf("Zone leak factor override:%u\n", zleak_sample_factor); } - + /* zleak-allocs=XXXX (override number of buckets in zallocations) */ if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) { printf("Zone leak alloc buckets override:%u\n", zleak_alloc_buckets); @@ -645,7 +688,7 @@ zleak_init(vm_size_t max_zonemap_size) lck_grp_attr_setdefault(&zleak_lock_grp_attr); lck_grp_init(&zleak_lock_grp, "zleak_lock", &zleak_lock_grp_attr); lck_attr_setdefault(&zleak_lock_attr); - lck_mtx_init(&zleak_lock, &zleak_lock_grp, &zleak_lock_attr); + lck_spin_init(&zleak_lock, &zleak_lock_grp, &zleak_lock_attr); if (zleak_enable_flag) { zleak_state = ZLEAK_STATE_ENABLED; @@ -656,7 +699,7 @@ zleak_init(vm_size_t max_zonemap_size) /* * Support for kern.zleak.active sysctl - a simplified - * simplified version of the zleak_state variable. + * version of the zleak_state variable. */ int get_zleak_state(void) @@ -686,14 +729,14 @@ zleak_activate(void) } /* Indicate that we're doing the setup */ - lck_mtx_lock_spin(&zleak_lock); + lck_spin_lock(&zleak_lock); if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) { - lck_mtx_unlock(&zleak_lock); + lck_spin_unlock(&zleak_lock); return KERN_SUCCESS; } zleak_state |= ZLEAK_STATE_ACTIVATING; - lck_mtx_unlock(&zleak_lock); + lck_spin_unlock(&zleak_lock); /* Allocate and zero tables */ retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size); @@ -724,10 +767,10 @@ zleak_activate(void) * the tables and setting the active flag, because the zfree() * path accesses the table without a lock if we're active. */ - lck_mtx_lock_spin(&zleak_lock); + lck_spin_lock(&zleak_lock); zleak_state |= ZLEAK_STATE_ACTIVE; zleak_state &= ~ZLEAK_STATE_ACTIVATING; - lck_mtx_unlock(&zleak_lock); + lck_spin_unlock(&zleak_lock); return 0; @@ -736,10 +779,10 @@ fail: * If we fail to allocate memory, don't further tax * the system by trying again. 
*/ - lck_mtx_lock_spin(&zleak_lock); + lck_spin_lock(&zleak_lock); zleak_state |= ZLEAK_STATE_FAILED; zleak_state &= ~ZLEAK_STATE_ACTIVATING; - lck_mtx_unlock(&zleak_lock); + lck_spin_unlock(&zleak_lock); if (allocations_ptr != NULL) { kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size); @@ -776,7 +819,7 @@ zleak_log(uintptr_t* bt, vm_size_t allocation_size) { /* Quit if there's someone else modifying the hash tables */ - if (!lck_mtx_try_lock_spin(&zleak_lock)) { + if (!lck_spin_try_lock(&zleak_lock)) { z_total_conflicts++; return FALSE; } @@ -796,7 +839,7 @@ zleak_log(uintptr_t* bt, if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) { z_alloc_collisions++; - lck_mtx_unlock(&zleak_lock); + lck_spin_unlock(&zleak_lock); return TRUE; } @@ -812,7 +855,7 @@ zleak_log(uintptr_t* bt, trace->zt_collisions++; z_trace_collisions++; - lck_mtx_unlock(&zleak_lock); + lck_spin_unlock(&zleak_lock); return TRUE; } else if (trace->zt_size > 0) { /* Same trace, already added, so increment refcount */ @@ -858,7 +901,7 @@ zleak_log(uintptr_t* bt, if (top_ztrace->zt_size < trace->zt_size) top_ztrace = trace; - lck_mtx_unlock(&zleak_lock); + lck_spin_unlock(&zleak_lock); return TRUE; } @@ -881,7 +924,7 @@ zleak_free(uintptr_t addr, if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) { /* if the allocation was the one, grab the lock, check again, then delete it */ - lck_mtx_lock_spin(&zleak_lock); + lck_spin_lock(&zleak_lock); if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) { struct ztrace *trace; @@ -902,7 +945,7 @@ zleak_free(uintptr_t addr, /* A NULL element means the allocation bucket is unused */ allocation->za_element = 0; } - lck_mtx_unlock(&zleak_lock); + lck_spin_unlock(&zleak_lock); } } @@ -918,21 +961,23 @@ zleak_free(uintptr_t addr, * It's fast because it does no checking to make sure there isn't bad data. * Since it's only called from threads that we're going to keep executing, * if there's bad data we were going to die eventually. - * This seems to work for x86 and X86_64. - * ARMTODO: Test it on ARM, I think it will work but I can't test it. If it works, remove the ifdef. * If this function is inlined, it doesn't record the frame of the function it's inside. * (because there's no stack frame!) */ + uint32_t fastbacktrace(uintptr_t* bt, uint32_t max_frames) { -#if defined(__x86_64__) || defined(__i386__) uintptr_t* frameptr = NULL, *frameptr_next = NULL; uintptr_t retaddr = 0; uint32_t frame_index = 0, frames = 0; uintptr_t kstackb, kstackt; + thread_t cthread = current_thread(); - kstackb = current_thread()->kernel_stack; + if (__improbable(cthread == NULL)) + return 0; + + kstackb = cthread->kernel_stack; kstackt = kstackb + kernel_stack_size; /* Load stack frame pointer (EBP on x86) into frameptr */ frameptr = __builtin_frame_address(0); @@ -965,9 +1010,6 @@ fastbacktrace(uintptr_t* bt, uint32_t max_frames) bt[frame_index++] = 0; return frames; -#else - return OSBacktrace((void*)bt, max_frames); -#endif } /* "Thomas Wang's 32/64 bit mix functions." 
http://www.concentric.net/~Ttwang/tech/inthash.htm */ @@ -1001,8 +1043,8 @@ hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size) uintptr_t hash = 0; uintptr_t mask = max_size - 1; - while (--depth) { - hash += bt[depth]; + while (depth) { + hash += bt[--depth]; } hash = hash_mix(hash) & mask; @@ -1053,6 +1095,7 @@ zinit( zdata_size -= sizeof(*z); } else z = (zone_t) zalloc(zone_zone); + if (z == ZONE_NULL) return(ZONE_NULL); @@ -1128,6 +1171,8 @@ use_this_allocation: z->noencrypt = FALSE; z->no_callout = FALSE; z->async_prio_refill = FALSE; + z->gzalloc_exempt = FALSE; + z->alignment_required = FALSE; z->prio_refill_watermark = 0; z->zone_replenish_thread = NULL; #if CONFIG_ZLEAKS @@ -1165,8 +1210,7 @@ use_this_allocation: /* * Check if we should be logging this zone. If so, remember the zone pointer. */ - - if (log_this_zone(z->zone_name, zone_name_to_log)) { + if (log_this_zone(z->zone_name, zone_name_to_log)) { zone_of_interest = z; } @@ -1178,7 +1222,6 @@ use_this_allocation: * later on some other zone. So note we may be allocating a buffer to log a zone other than the one being initialized * right now. */ - if (zone_of_interest != NULL && zrecords == NULL && zlog_ready) { if (kmem_alloc(kernel_map, (vm_offset_t *)&zrecords, log_records * sizeof(struct zrecord)) == KERN_SUCCESS) { @@ -1195,7 +1238,9 @@ use_this_allocation: zone_of_interest = NULL; } } - +#if CONFIG_GZALLOC + gzalloc_zone_init(z); +#endif return(z); } unsigned zone_replenish_loops, zone_replenish_wakeups, zone_replenish_wakeups_initiated; @@ -1306,10 +1351,10 @@ zcram( lock_zone(zone); while (size >= elem_size) { - ADD_TO_ZONE(zone, newmem); + free_to_zone(zone, (void *) newmem); if (from_zm) zone_page_alloc(newmem, elem_size); - zone->count++; /* compensate for ADD_TO_ZONE */ + zone->count++; /* compensate for free_to_zone */ size -= elem_size; newmem += elem_size; zone->cur_size += elem_size; @@ -1325,6 +1370,9 @@ zcram( void zone_steal_memory(void) { +#if CONFIG_GZALLOC + gzalloc_configure(); +#endif /* Request enough early memory to get to the pmap zone */ zdata_size = 12 * sizeof(struct zone); zdata = (vm_offset_t)pmap_steal_memory(round_page(zdata_size)); @@ -1375,35 +1423,30 @@ zone_bootstrap(void) { char temp_buf[16]; -#if 6094439 - /* enable zone checks by default, to try and catch offenders... */ -#if 0 - /* 7968354: turn "-zc" back off */ - check_freed_element = TRUE; - /* 7995202: turn "-zp" back off */ - zfree_clear = TRUE; -#endif - - /* ... but allow them to be turned off explicitely */ - if (PE_parse_boot_argn("-no_zc", temp_buf, sizeof (temp_buf))) { - check_freed_element = FALSE; - } - if (PE_parse_boot_argn("-no_zp", temp_buf, sizeof (temp_buf))) { - zfree_clear = FALSE; + if (PE_parse_boot_argn("-zinfop", temp_buf, sizeof(temp_buf))) { + zinfo_per_task = TRUE; } -#endif - /* see if we want freed zone element checking and/or poisoning */ - if (PE_parse_boot_argn("-zc", temp_buf, sizeof (temp_buf))) { - check_freed_element = TRUE; + /* do we want corruption-style debugging with zlog? 
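[Editor's note] Two details above are worth pulling out: fastbacktrace() now guards against a NULL current_thread(), and hashbacktrace() fixes a loop bug in which `while (--depth)` never summed bt[0], the innermost frame, so traces differing only in that frame hashed identically. A sketch of the corrected hash path; the mix constants below are Thomas Wang's published 32-bit sequence, which the diff cites but whose kernel body is not shown here:

#include <stdint.h>

/* Wang-style 32-bit integer mix (illustrative constants). */
static uint32_t
hash_mix32(uint32_t key)
{
    key += ~(key << 15);
    key ^=  (key >> 10);
    key +=  (key << 3);
    key ^=  (key >> 6);
    key += ~(key << 11);
    key ^=  (key >> 16);
    return key;
}

/* Sum every captured frame, mix, then mask into a power-of-two table.
 * The patched loop, `while (depth) hash += bt[--depth];`, walks from
 * bt[depth-1] all the way down to bt[0]. */
static uint32_t
hash_backtrace(const uintptr_t *bt, uint32_t depth, uint32_t table_size)
{
    uint32_t hash = 0;

    while (depth)
        hash += (uint32_t)bt[--depth];
    return hash_mix32(hash) & (table_size - 1);
}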
*/ + if (PE_parse_boot_argn("-zc", temp_buf, sizeof(temp_buf))) { + corruption_debug_flag = TRUE; } + + /* Set up zone poisoning */ - if (PE_parse_boot_argn("-zp", temp_buf, sizeof (temp_buf))) { - zfree_clear = TRUE; + free_check_sample_factor = ZP_DEFAULT_SAMPLING_FACTOR; + + /* support for old zone poisoning boot-args */ + if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) { + free_check_sample_factor = 1; + } + if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) { + free_check_sample_factor = 0; } - if (PE_parse_boot_argn("-zinfop", temp_buf, sizeof (temp_buf))) { - zinfo_per_task = TRUE; + /* zp-factor=XXXX (override how often to poison freed zone elements) */ + if (PE_parse_boot_argn("zp-factor", &free_check_sample_factor, sizeof(free_check_sample_factor))) { + printf("Zone poisoning factor override:%u\n", free_check_sample_factor); } /* @@ -1498,6 +1541,9 @@ zone_init( if (retval != KERN_SUCCESS) panic("zone_init: kmem_suballoc failed"); zone_max = zone_min + round_page(max_zonemap_size); +#if CONFIG_GZALLOC + gzalloc_init(max_zonemap_size); +#endif /* * Setup garbage collection information: */ @@ -1608,19 +1654,26 @@ zalloc_canblock( register zone_t zone, boolean_t canblock) { - vm_offset_t addr; - kern_return_t retval; + vm_offset_t addr = 0; + kern_return_t retval; uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */ int numsaved = 0; - int i; + int i; boolean_t zone_replenish_wakeup = FALSE; + boolean_t did_gzalloc; + did_gzalloc = FALSE; #if CONFIG_ZLEAKS uint32_t zleak_tracedepth = 0; /* log this allocation if nonzero */ #endif /* CONFIG_ZLEAKS */ assert(zone != ZONE_NULL); - + +#if CONFIG_GZALLOC + addr = gzalloc_alloc(zone, canblock); + did_gzalloc = (addr != 0); +#endif + lock_zone(zone); /* @@ -1632,10 +1685,10 @@ zalloc_canblock( #if CONFIG_ZLEAKS /* - * Zone leak detection: capture a backtrace every z_sample_factor + * Zone leak detection: capture a backtrace every zleak_sample_factor * allocations in this zone. 
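[Editor's note] The sampling machinery is a per-zone counter tested against zleak_sample_factor: a hit resets the counter to 1 to keep the cadence stable, and a failed zleak_log() rolls it forward so the very next allocation retries. The same logic isolated in a sketch (struct and names hypothetical):

#include <stdint.h>

struct zone_sampler {
    uint32_t capture;   /* per-zone counter, like zleak_capture */
    uint32_t factor;    /* like zleak_sample_factor */
};

/* Returns nonzero on the events that should take a backtrace. */
static int
should_sample(struct zone_sampler *z)
{
    if (z->factor != 0 && z->capture++ % z->factor == 0) {
        z->capture = 1;         /* restart the cadence */
        return 1;
    }
    return 0;
}

/* If recording the sample failed (e.g. the log lost a lock race),
 * roll the counter so the very next event samples again. */
static void
sample_retry_next(struct zone_sampler *z)
{
    z->capture = z->factor;
}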
*/ - if (zone->zleak_on && (zone->zleak_capture++ % z_sample_factor == 0)) { + if (zone->zleak_on && (zone->zleak_capture++ % zleak_sample_factor == 0)) { zone->zleak_capture = 1; /* Avoid backtracing twice if zone logging is on */ @@ -1646,10 +1699,12 @@ zalloc_canblock( } #endif /* CONFIG_ZLEAKS */ - REMOVE_FROM_ZONE(zone, addr, vm_offset_t); + if (__probable(addr == 0)) + alloc_from_zone(zone, (void **) &addr); if (zone->async_prio_refill && - ((zone->cur_size - (zone->count * zone->elem_size)) < (zone->prio_refill_watermark * zone->elem_size))) { + ((zone->cur_size - (zone->count * zone->elem_size)) < + (zone->prio_refill_watermark * zone->elem_size))) { zone_replenish_wakeup = TRUE; zone_replenish_wakeups_initiated++; } @@ -1698,6 +1753,11 @@ zalloc_canblock( } else { unlock_zone(zone); + panic_include_zprint = TRUE; +#if CONFIG_ZLEAKS + if (zleak_state & ZLEAK_STATE_ACTIVE) + panic_include_ztrace = TRUE; +#endif /* CONFIG_ZLEAKS */ panic("zalloc: zone \"%s\" empty.", zone->zone_name); } } @@ -1749,7 +1809,7 @@ zalloc_canblock( retry++; if (retry == 2) { - zone_gc(); + zone_gc(TRUE); printf("zalloc did gc\n"); zone_display_zprint(); } @@ -1775,7 +1835,7 @@ zalloc_canblock( zone->waiting = FALSE; zone_wakeup(zone); } - REMOVE_FROM_ZONE(zone, addr, vm_offset_t); + alloc_from_zone(zone, (void **) &addr); if (addr == 0 && retval == KERN_RESOURCE_SHORTAGE) { unlock_zone(zone); @@ -1785,7 +1845,7 @@ zalloc_canblock( } } if (addr == 0) - REMOVE_FROM_ZONE(zone, addr, vm_offset_t); + alloc_from_zone(zone, (void **) &addr); } #if CONFIG_ZLEAKS @@ -1796,7 +1856,7 @@ zalloc_canblock( /* Sampling can fail if another sample is happening at the same time in a different zone. */ if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) { /* If it failed, roll back the counter so we sample the next allocation instead. */ - zone->zleak_capture = z_sample_factor; + zone->zleak_capture = zleak_sample_factor; } } #endif /* CONFIG_ZLEAKS */ @@ -1815,16 +1875,16 @@ zalloc_canblock( * depending on whether we're looking for the source of a zone leak or a zone corruption. When looking * for a leak, we want to log as many allocations as possible in order to clearly identify the leaker * among all the records. So we look for an unused slot in the log and fill that in before overwriting - * an old entry. When looking for a corrution however, it's better to have a chronological log of all + * an old entry. When looking for a corruption however, it's better to have a chronological log of all * the allocations and frees done in the zone so that the history of operations for a specific zone * element can be inspected. So in this case, we treat the log as a circular buffer and overwrite the * oldest entry whenever a new one needs to be added. * - * The check_freed_element flag tells us what style of logging to do. It's set if we're supposed to be + * The corruption_debug_flag flag tells us what style of logging to do. It's set if we're supposed to be * doing corruption style logging (indicated via -zc in the boot-args). */ - if (!check_freed_element && zrecords[zcurrent].z_element && zrecorded < log_records) { + if (!corruption_debug_flag && zrecords[zcurrent].z_element && zrecorded < log_records) { /* * If we get here, we're doing leak style logging and there's still some unused entries in @@ -1832,8 +1892,8 @@ zalloc_canblock( * starting at zcurrent and wrap-around if we reach the end of the buffer. If the buffer * is already full, we just fall through and overwrite the element indexed by zcurrent. 
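[Editor's note] The log therefore has two retention policies selected by corruption_debug_flag: leak hunting prefers an unused slot, keeping as many distinct allocations visible as possible, while corruption hunting overwrites circularly to preserve chronology. A simplified sketch of the slot-selection decision (field names hypothetical):

#include <stddef.h>

struct zrecord_model {
    void *z_element;            /* NULL marks an unused slot */
    /* ... opcode, backtrace, etc. ... */
};

/* Choose the slot for the next record. In leak mode, while unused
 * slots remain, scan forward from the cursor for one; otherwise, or
 * in corruption mode, overwrite at the cursor, treating the log as a
 * circular buffer. */
static size_t
pick_slot(struct zrecord_model *log, size_t nrecords, size_t cursor,
    int corruption_mode, size_t recorded)
{
    if (!corruption_mode && recorded < nrecords) {
        size_t i;

        for (i = cursor; i < nrecords; i++) {
            if (log[i].z_element == NULL)
                return i;
        }
    }
    return cursor % nrecords;   /* chronological overwrite */
}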
*/ - - for (i = zcurrent; i < log_records; i++) { + + for (i = zcurrent; i < log_records; i++) { if (zrecords[i].z_element == NULL) { zcurrent = i; goto empty_slot; @@ -1877,11 +1937,11 @@ empty_slot: unlock_zone(zone); thread_call_enter(&zone->call_async_alloc); lock_zone(zone); - REMOVE_FROM_ZONE(zone, addr, vm_offset_t); + alloc_from_zone(zone, (void **) &addr); } #if ZONE_DEBUG - if (addr && zone_debug_enabled(zone)) { + if (!did_gzalloc && addr && zone_debug_enabled(zone)) { enqueue_tail(&zone->active_zones, (queue_entry_t)addr); addr += ZONE_DEBUG_OFFSET; } @@ -1904,14 +1964,15 @@ empty_slot: thread_t thr = current_thread(); task_t task; zinfo_usage_t zinfo; + vm_size_t sz = zone->elem_size; if (zone->caller_acct) - thr->tkm_private.alloc += zone->elem_size; + ledger_credit(thr->t_ledger, task_ledgers.tkm_private, sz); else - thr->tkm_shared.alloc += zone->elem_size; + ledger_credit(thr->t_ledger, task_ledgers.tkm_shared, sz); if ((task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) - OSAddAtomic64(zone->elem_size, (int64_t *)&zinfo[zone->index].alloc); + OSAddAtomic64(sz, (int64_t *)&zinfo[zone->index].alloc); } return((void *)addr); } @@ -1945,7 +2006,6 @@ zalloc_async( unlock_zone(((zone_t)p0)); } - /* * zget returns an element from the specified zone * and immediately returns nothing if there is nothing there. @@ -1961,7 +2021,7 @@ void * zget( register zone_t zone) { - register vm_offset_t addr; + vm_offset_t addr; #if CONFIG_ZLEAKS uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used for zone leak detection */ @@ -1977,13 +2037,13 @@ zget( /* * Zone leak detection: capture a backtrace */ - if (zone->zleak_on && (zone->zleak_capture++ % z_sample_factor == 0)) { + if (zone->zleak_on && (zone->zleak_capture++ % zleak_sample_factor == 0)) { zone->zleak_capture = 1; zleak_tracedepth = fastbacktrace(zbt, MAX_ZTRACE_DEPTH); } #endif /* CONFIG_ZLEAKS */ - REMOVE_FROM_ZONE(zone, addr, vm_offset_t); + alloc_from_zone(zone, (void **) &addr); #if ZONE_DEBUG if (addr && zone_debug_enabled(zone)) { enqueue_tail(&zone->active_zones, (queue_entry_t)addr); @@ -1999,7 +2059,7 @@ zget( /* Sampling can fail if another sample is happening at the same time in a different zone. */ if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) { /* If it failed, roll back the counter so we sample the next allocation instead. */ - zone->zleak_capture = z_sample_factor; + zone->zleak_capture = zleak_sample_factor; } } @@ -2026,8 +2086,9 @@ zfree( void *addr) { vm_offset_t elem = (vm_offset_t) addr; - void *zbt[MAX_ZTRACE_DEPTH]; /* only used if zone logging is enabled via boot-args */ + void *zbt[MAX_ZTRACE_DEPTH]; /* only used if zone logging is enabled via boot-args */ int numsaved = 0; + boolean_t gzfreed = FALSE; assert(zone != ZONE_NULL); @@ -2047,10 +2108,14 @@ zfree( panic("zfree: freeing to zone_zone breaks zone_gc!"); #endif +#if CONFIG_GZALLOC + gzfreed = gzalloc_free(zone, addr); +#endif + TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, zone->elem_size, (uintptr_t)addr); - if (zone->collectable && !zone->allows_foreign && - !from_zone_map(elem, zone->elem_size)) { + if (__improbable(!gzfreed && zone->collectable && !zone->allows_foreign && + !from_zone_map(elem, zone->elem_size))) { #if MACH_ASSERT panic("zfree: non-allocated memory in collectable zone!"); #endif @@ -2069,7 +2134,7 @@ zfree( if (DO_LOGGING(zone)) { int i; - if (check_freed_element) { + if (corruption_debug_flag) { /* * We're logging to catch a corruption. 
Add a record of this zfree operation @@ -2116,7 +2181,7 @@ zfree( #if ZONE_DEBUG - if (zone_debug_enabled(zone)) { + if (!gzfreed && zone_debug_enabled(zone)) { queue_t tmp_elem; elem -= ZONE_DEBUG_OFFSET; @@ -2145,7 +2210,10 @@ zfree( if (!pmap_kernel_va(this) || this == elem) panic("zfree"); } - ADD_TO_ZONE(zone, elem); + + if (__probable(!gzfreed)) + free_to_zone(zone, (void *) elem); + #if MACH_ASSERT if (zone->count < 0) panic("zfree: count < 0!"); @@ -2178,14 +2246,15 @@ zfree( thread_t thr = current_thread(); task_t task; zinfo_usage_t zinfo; + vm_size_t sz = zone->elem_size; if (zone->caller_acct) - thr->tkm_private.free += zone->elem_size; + ledger_debit(thr->t_ledger, task_ledgers.tkm_private, sz); else - thr->tkm_shared.free += zone->elem_size; + ledger_debit(thr->t_ledger, task_ledgers.tkm_shared, sz); + if ((task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) - OSAddAtomic64(zone->elem_size, - (int64_t *)&zinfo[zone->index].free); + OSAddAtomic64(sz, (int64_t *)&zinfo[zone->index].free); } } @@ -2224,11 +2293,24 @@ zone_change( case Z_NOCALLOUT: zone->no_callout = value; break; -#if MACH_ASSERT + case Z_GZALLOC_EXEMPT: + zone->gzalloc_exempt = value; +#if CONFIG_GZALLOC + gzalloc_reconfigure(zone); +#endif + break; + case Z_ALIGNMENT_REQUIRED: + zone->alignment_required = value; +#if ZONE_DEBUG + zone_debug_disable(zone); +#endif +#if CONFIG_GZALLOC + gzalloc_reconfigure(zone); +#endif + break; default: panic("Zone_change: Wrong Item Type!"); /* break; */ -#endif } } @@ -2253,24 +2335,6 @@ zone_free_count(zone_t zone) return(free_count); } -/* - * zprealloc preallocates wired memory, exanding the specified - * zone to the specified size - */ -void -zprealloc( - zone_t zone, - vm_size_t size) -{ - vm_offset_t addr; - - if (size != 0) { - if (kmem_alloc_kobject(zone_map, &addr, size) != KERN_SUCCESS) - panic("zprealloc"); - zcram(zone, addr, size); - } -} - /* * Zone garbage collection subroutines */ @@ -2419,7 +2483,8 @@ zone_page_alloc( void zone_page_free_element( - zone_page_index_t *free_page_list, + zone_page_index_t *free_page_head, + zone_page_index_t *free_page_tail, vm_offset_t addr, vm_size_t size) { @@ -2444,6 +2509,7 @@ zone_page_free_element( --zp->collect_count; if (--zp->alloc_count == 0) { vm_address_t free_page_address; + vm_address_t prev_free_page_address; zp->alloc_count = ZONE_PAGE_UNUSED; zp->collect_count = 0; @@ -2454,8 +2520,16 @@ zone_page_free_element( * storage for a page freelist */ free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)i); - *(zone_page_index_t *)free_page_address = *free_page_list; - *free_page_list = i; + *(zone_page_index_t *)free_page_address = ZONE_PAGE_INDEX_INVALID; + + if (*free_page_head == ZONE_PAGE_INDEX_INVALID) { + *free_page_head = i; + *free_page_tail = i; + } else { + prev_free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)(*free_page_tail)); + *(zone_page_index_t *)prev_free_page_address = i; + *free_page_tail = i; + } } } } @@ -2471,14 +2545,12 @@ struct zone_free_element { * Add a linked list of pages starting at base back into the zone * free list. Tail points to the last element on the list. 
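[Editor's note] zone_page_free_element() now threads freed pages onto a head/tail pair instead of pushing onto a head-only stack, making each append O(1) while preserving insertion order for the reclaim pass. A model of the append, with an array standing in for the 'next' index the kernel stores in the first word of each free page:

#include <stdint.h>

#define IDX_INVALID ((uint32_t)~0u)   /* like ZONE_PAGE_INDEX_INVALID */

/* Append page index i to an index-linked list with O(1) tail insert. */
static void
page_list_append(uint32_t *head, uint32_t *tail, uint32_t *next, uint32_t i)
{
    next[i] = IDX_INVALID;            /* new tail terminates the list */
    if (*head == IDX_INVALID) {
        *head = i;
        *tail = i;
    } else {
        next[*tail] = i;              /* link after the old tail */
        *tail = i;
    }
}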
*/ - #define ADD_LIST_TO_ZONE(zone, base, tail) \ MACRO_BEGIN \ (tail)->next = (void *)((zone)->free_elements); \ - if (check_freed_element) { \ - if ((zone)->elem_size >= (2 * sizeof(vm_offset_t))) \ - ((vm_offset_t *)(tail))[((zone)->elem_size/sizeof(vm_offset_t))-1] = \ - (zone)->free_elements; \ + if ((zone)->elem_size >= (2 * sizeof(vm_offset_t) + sizeof(uint32_t))) { \ + ((vm_offset_t *)(tail))[((zone)->elem_size/sizeof(vm_offset_t))-1] = \ + (zone)->free_elements; \ } \ (zone)->free_elements = (unsigned long)(base); \ MACRO_END @@ -2486,15 +2558,13 @@ MACRO_END /* * Add an element to the chain pointed to by prev. */ - -#define ADD_ELEMENT(zone, prev, elem) \ +#define ADD_ELEMENT(zone, prev, elem) \ MACRO_BEGIN \ (prev)->next = (elem); \ - if (check_freed_element) { \ - if ((zone)->elem_size >= (2 * sizeof(vm_offset_t))) \ - ((vm_offset_t *)(prev))[((zone)->elem_size/sizeof(vm_offset_t))-1] = \ - (vm_offset_t)(elem); \ - } \ + if ((zone)->elem_size >= (2 * sizeof(vm_offset_t) + sizeof(uint32_t))) { \ + ((vm_offset_t *)(prev))[((zone)->elem_size/sizeof(vm_offset_t))-1] = \ + (vm_offset_t)(elem); \ + } \ MACRO_END struct { @@ -2513,12 +2583,14 @@ struct { * begins to run out of memory. */ void -zone_gc(void) +zone_gc(boolean_t all_zones) { unsigned int max_zones; zone_t z; unsigned int i; zone_page_index_t zone_free_page_head; + zone_page_index_t zone_free_page_tail; + thread_t mythread = current_thread(); lck_mtx_lock(&zone_gc_lock); @@ -2527,6 +2599,14 @@ zone_gc(void) z = first_zone; simple_unlock(&all_zones_lock); + + /* + * it's ok to allow eager kernel preemption while + * while holding a zone lock since it's taken + * as a spin lock (which prevents preemption) + */ + thread_set_eager_preempt(mythread); + #if MACH_ASSERT for (i = 0; i < zone_pages; i++) { struct zone_page_table_entry *zp; @@ -2536,24 +2616,26 @@ zone_gc(void) } #endif /* MACH_ASSERT */ - zone_free_page_head = ZONE_PAGE_INDEX_INVALID; - for (i = 0; i < max_zones; i++, z = z->next_zone) { - unsigned int n, m; - vm_size_t elt_size, size_freed; + unsigned int n, m; + vm_size_t elt_size, size_freed; struct zone_free_element *elt, *base_elt, *base_prev, *prev, *scan, *keep, *tail; + int kmem_frees = 0; assert(z != ZONE_NULL); if (!z->collectable) continue; + if (all_zones == FALSE && z->elem_size < PAGE_SIZE) + continue; + lock_zone(z); elt_size = z->elem_size; /* - * Do a quick feasability check before we scan the zone: + * Do a quick feasibility check before we scan the zone: * skip unless there is likelihood of getting pages back * (i.e we need a whole allocation block's worth of free * elements before we can garbage collect) and @@ -2589,6 +2671,11 @@ zone_gc(void) prev = (void *)&scan; elt = scan; n = 0; tail = keep = NULL; + + zone_free_page_head = ZONE_PAGE_INDEX_INVALID; + zone_free_page_tail = ZONE_PAGE_INDEX_INVALID; + + while (elt != NULL) { if (from_zone_map(elt, elt_size)) { zone_page_collect((vm_offset_t)elt, elt_size); @@ -2676,6 +2763,7 @@ zone_gc(void) size_freed = 0; elt = scan; n = 0; tail = keep = NULL; + while (elt != NULL) { if (zone_page_collectable((vm_offset_t)elt, elt_size)) { struct zone_free_element *next_elt = elt->next; @@ -2688,8 +2776,7 @@ zone_gc(void) * list of free-able pages. So store elt->next because * "elt" may be scribbled over. 
*/ - zone_page_free_element(&zone_free_page_head, - (vm_offset_t)elt, elt_size); + zone_page_free_element(&zone_free_page_head, &zone_free_page_tail, (vm_offset_t)elt, elt_size); elt = next_elt; @@ -2760,29 +2847,67 @@ zone_gc(void) zone_wakeup(z); } unlock_zone(z); - } - /* - * Reclaim the pages we are freeing. - */ - while (zone_free_page_head != ZONE_PAGE_INDEX_INVALID) { - zone_page_index_t zind = zone_free_page_head; - vm_address_t free_page_address; -#if ZONE_ALIAS_ADDR - z = (zone_t)zone_virtual_addr((vm_map_address_t)z); -#endif - /* Use the first word of the page about to be freed to find the next free page */ - free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)zind); - zone_free_page_head = *(zone_page_index_t *)free_page_address; + if (zone_free_page_head == ZONE_PAGE_INDEX_INVALID) + continue; + + /* + * we don't want to allow eager kernel preemption while holding the + * various locks taken in the kmem_free path of execution + */ + thread_clear_eager_preempt(mythread); + + /* + * Reclaim the pages we are freeing. + */ + while (zone_free_page_head != ZONE_PAGE_INDEX_INVALID) { + zone_page_index_t zind = zone_free_page_head; + vm_address_t free_page_address; + int page_count; + + /* + * Use the first word of the page about to be freed to find the next free page + */ + free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)zind); + zone_free_page_head = *(zone_page_index_t *)free_page_address; + + page_count = 1; + + while (zone_free_page_head != ZONE_PAGE_INDEX_INVALID) { + zone_page_index_t next_zind = zone_free_page_head; + vm_address_t next_free_page_address; + + next_free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)next_zind); + + if (next_free_page_address == (free_page_address - PAGE_SIZE)) { + free_page_address = next_free_page_address; + } else if (next_free_page_address != (free_page_address + (PAGE_SIZE * page_count))) + break; + + zone_free_page_head = *(zone_page_index_t *)next_free_page_address; + page_count++; + } + kmem_free(zone_map, free_page_address, page_count * PAGE_SIZE); + + zgc_stats.pgs_freed += page_count; - kmem_free(zone_map, free_page_address, PAGE_SIZE); - ++zgc_stats.pgs_freed; + if (++kmem_frees == 32) { + thread_yield_internal(1); + kmem_frees = 0; + } + } + thread_set_eager_preempt(mythread); } + thread_clear_eager_preempt(mythread); lck_mtx_unlock(&zone_gc_lock); + } +extern vm_offset_t kmapoff_kaddr; +extern unsigned int kmapoff_pgcnt; + /* * consider_zone_gc: * @@ -2792,14 +2917,29 @@ zone_gc(void) void consider_zone_gc(boolean_t force) { + boolean_t all_zones = FALSE; + + if (kmapoff_kaddr != 0) { + /* + * One-time reclaim of kernel_map resources we allocated in + * early boot. 
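[Editor's note] The reclaim loop above walks that ordered page list and merges adjacent page indices into a single multi-page kmem_free(), yielding every 32 frees to bound preemption latency. A simplified sketch that coalesces ascending runs only (the kernel's loop also extends runs downward):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ 4096u

/* Emit one (base, bytes) pair per contiguous run of free page indices,
 * modelling how the reworked loop turns many single-page kmem_free()
 * calls into a few large ones. */
static void
free_runs(const uint32_t *idx, size_t n, uintptr_t map_min)
{
    size_t i = 0;

    while (i < n) {
        size_t run = 1;

        while (i + run < n && idx[i + run] == idx[i + run - 1] + 1)
            run++;              /* absorb the adjacent page */
        printf("kmem_free(%#lx, %zu bytes)\n",
            (unsigned long)(map_min + (uintptr_t)idx[i] * PAGE_SZ),
            run * PAGE_SZ);
        i += run;
    }
}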
+ */ + (void) vm_deallocate(kernel_map, + kmapoff_kaddr, kmapoff_pgcnt * PAGE_SIZE_64); + kmapoff_kaddr = 0; + } if (zone_gc_allowed && (zone_gc_allowed_by_time_throttle || zone_gc_forced || force)) { + if (zone_gc_allowed_by_time_throttle == TRUE) { + zone_gc_allowed_by_time_throttle = FALSE; + all_zones = TRUE; + } zone_gc_forced = FALSE; - zone_gc_allowed_by_time_throttle = FALSE; /* reset periodically */ - zone_gc(); + + zone_gc(all_zones); } } @@ -2814,6 +2954,8 @@ compute_zone_gc_throttle(void *arg __unused) } +#if CONFIG_TASK_ZONE_INFO + kern_return_t task_zone_info( task_t task, @@ -2972,9 +3114,24 @@ task_zone_info( return KERN_SUCCESS; } +#else /* CONFIG_TASK_ZONE_INFO */ + +kern_return_t +task_zone_info( + __unused task_t task, + __unused mach_zone_name_array_t *namesp, + __unused mach_msg_type_number_t *namesCntp, + __unused task_zone_info_array_t *infop, + __unused mach_msg_type_number_t *infoCntp) +{ + return KERN_FAILURE; +} + +#endif /* CONFIG_TASK_ZONE_INFO */ + kern_return_t mach_zone_info( - host_t host, + host_priv_t host, mach_zone_name_array_t *namesp, mach_msg_type_number_t *namesCntp, mach_zone_info_array_t *infop, @@ -2998,8 +3155,10 @@ mach_zone_info( if (host == HOST_NULL) return KERN_INVALID_HOST; - - num_fake_zones = sizeof fake_zones / sizeof fake_zones[0]; +#if CONFIG_DEBUGGER_FOR_ZONE_INFO + if (!PE_i_can_has_debugger(NULL)) + return KERN_INVALID_HOST; +#endif /* * We assume that zones aren't freed once allocated. @@ -3122,7 +3281,7 @@ mach_zone_info( */ kern_return_t host_zone_info( - host_t host, + host_priv_t host, zone_name_array_t *namesp, mach_msg_type_number_t *namesCntp, zone_info_array_t *infop, @@ -3146,6 +3305,10 @@ host_zone_info( if (host == HOST_NULL) return KERN_INVALID_HOST; +#if CONFIG_DEBUGGER_FOR_ZONE_INFO + if (!PE_i_can_has_debugger(NULL)) + return KERN_INVALID_HOST; +#endif #if defined(__LP64__) if (!thread_is_64bit(current_thread())) @@ -3155,8 +3318,6 @@ host_zone_info( return KERN_NOT_SUPPORTED; #endif - num_fake_zones = sizeof fake_zones / sizeof fake_zones[0]; - /* * We assume that zones aren't freed once allocated. * We won't pick up any zones that are allocated later. 
@@ -3260,6 +3421,19 @@ host_zone_info( return KERN_SUCCESS; } +kern_return_t +mach_zone_force_gc( + host_t host) +{ + + if (host == HOST_NULL) + return KERN_INVALID_HOST; + + consider_zone_gc(TRUE); + + return (KERN_SUCCESS); +} + extern unsigned int stack_total; extern unsigned long long stack_allocs; @@ -3297,279 +3471,10 @@ void zone_display_zprint() printf("Kalloc.Large:\t%lu\n",(uintptr_t)kalloc_large_total); } - - -#if MACH_KDB -#include -#include -#include - -const char *zone_labels = -"ENTRY COUNT TOT_SZ MAX_SZ ELT_SZ ALLOC_SZ NAME"; - -/* Forwards */ -void db_print_zone( - zone_t addr); - -#if ZONE_DEBUG -void db_zone_check_active( - zone_t zone); -void db_zone_print_active( - zone_t zone); -#endif /* ZONE_DEBUG */ -void db_zone_print_free( - zone_t zone); -void -db_print_zone( - zone_t addr) -{ - struct zone zcopy; - - zcopy = *addr; - - db_printf("%8x %8x %8x %8x %6x %8x %s ", - addr, zcopy.count, zcopy.cur_size, - zcopy.max_size, zcopy.elem_size, - zcopy.alloc_size, zcopy.zone_name); - if (zcopy.exhaustible) - db_printf("H"); - if (zcopy.collectable) - db_printf("C"); - if (zcopy.expandable) - db_printf("X"); - if (zcopy.caller_acct) - db_printf("A"); - db_printf("\n"); -} - -/*ARGSUSED*/ -void -db_show_one_zone(db_expr_t addr, boolean_t have_addr, - __unused db_expr_t count, __unused char *modif) -{ - struct zone *z = (zone_t)((char *)0 + addr); - - if (z == ZONE_NULL || !have_addr){ - db_error("No Zone\n"); - /*NOTREACHED*/ - } - - db_printf("%s\n", zone_labels); - db_print_zone(z); -} - -/*ARGSUSED*/ -void -db_show_all_zones(__unused db_expr_t addr, boolean_t have_addr, db_expr_t count, - __unused char *modif) -{ - zone_t z; - unsigned total = 0; - - /* - * Don't risk hanging by unconditionally locking, - * risk of incoherent data is small (zones aren't freed). 
- */ - have_addr = simple_lock_try(&all_zones_lock); - count = num_zones; - z = first_zone; - if (have_addr) { - simple_unlock(&all_zones_lock); - } - - db_printf("%s\n", zone_labels); - for ( ; count > 0; count--) { - if (!z) { - db_error("Mangled Zone List\n"); - /*NOTREACHED*/ - } - db_print_zone(z); - total += z->cur_size, - - have_addr = simple_lock_try(&all_zones_lock); - z = z->next_zone; - if (have_addr) { - simple_unlock(&all_zones_lock); - } - } - db_printf("\nTotal %8x", total); - db_printf("\n\nzone_gc() has reclaimed %d pages\n", zgc_stats.pgs_freed); -} - -#if ZONE_DEBUG -void -db_zone_check_active( - zone_t zone) -{ - int count = 0; - queue_t tmp_elem; - - if (!zone_debug_enabled(zone) || !zone_check) - return; - tmp_elem = queue_first(&zone->active_zones); - while (count < zone->count) { - count++; - if (tmp_elem == 0) { - printf("unexpected zero element, zone=%p, count=%d\n", - zone, count); - assert(FALSE); - break; - } - if (queue_end(tmp_elem, &zone->active_zones)) { - printf("unexpected queue_end, zone=%p, count=%d\n", - zone, count); - assert(FALSE); - break; - } - tmp_elem = queue_next(tmp_elem); - } - if (!queue_end(tmp_elem, &zone->active_zones)) { - printf("not at queue_end, zone=%p, tmp_elem=%p\n", - zone, tmp_elem); - assert(FALSE); - } -} - -void -db_zone_print_active( - zone_t zone) -{ - int count = 0; - queue_t tmp_elem; - - if (!zone_debug_enabled(zone)) { - printf("zone %p debug not enabled\n", zone); - return; - } - if (!zone_check) { - printf("zone_check FALSE\n"); - return; - } - - printf("zone %p, active elements %d\n", zone, zone->count); - printf("active list:\n"); - tmp_elem = queue_first(&zone->active_zones); - while (count < zone->count) { - printf(" %p", tmp_elem); - count++; - if ((count % 6) == 0) - printf("\n"); - if (tmp_elem == 0) { - printf("\nunexpected zero element, count=%d\n", count); - break; - } - if (queue_end(tmp_elem, &zone->active_zones)) { - printf("\nunexpected queue_end, count=%d\n", count); - break; - } - tmp_elem = queue_next(tmp_elem); - } - if (!queue_end(tmp_elem, &zone->active_zones)) - printf("\nnot at queue_end, tmp_elem=%p\n", tmp_elem); - else - printf("\n"); -} -#endif /* ZONE_DEBUG */ - -void -db_zone_print_free( - zone_t zone) -{ - int count = 0; - int freecount; - vm_offset_t elem; - - freecount = zone_free_count(zone); - printf("zone %p, free elements %d\n", zone, freecount); - printf("free list:\n"); - elem = zone->free_elements; - while (count < freecount) { - printf(" 0x%x", elem); - count++; - if ((count % 6) == 0) - printf("\n"); - if (elem == 0) { - printf("\nunexpected zero element, count=%d\n", count); - break; - } - elem = *((vm_offset_t *)elem); - } - if (elem != 0) - printf("\nnot at end of free list, elem=0x%x\n", elem); - else - printf("\n"); -} - -#endif /* MACH_KDB */ - - #if ZONE_DEBUG /* should we care about locks here ? 
*/ -#if MACH_KDB -void * -next_element( - zone_t z, - void *prev) -{ - char *elt = (char *)prev; - - if (!zone_debug_enabled(z)) - return(NULL); - elt -= ZONE_DEBUG_OFFSET; - elt = (char *) queue_next((queue_t) elt); - if ((queue_t) elt == &z->active_zones) - return(NULL); - elt += ZONE_DEBUG_OFFSET; - return(elt); -} - -void * -first_element( - zone_t z) -{ - char *elt; - - if (!zone_debug_enabled(z)) - return(NULL); - if (queue_empty(&z->active_zones)) - return(NULL); - elt = (char *)queue_first(&z->active_zones); - elt += ZONE_DEBUG_OFFSET; - return(elt); -} - -/* - * Second arg controls how many zone elements are printed: - * 0 => none - * n, n < 0 => all - * n, n > 0 => last n on active list - */ -int -zone_count( - zone_t z, - int tail) -{ - void *elt; - int count = 0; - boolean_t print = (tail != 0); - - if (tail < 0) - tail = z->count; - if (z->count < tail) - tail = 0; - tail = z->count - tail; - for (elt = first_element(z); elt; elt = next_element(z, elt)) { - if (print && tail <= count) - db_printf("%8x\n", elt); - count++; - } - assert(count == z->count); - return(count); -} -#endif /* MACH_KDB */ - #define zone_in_use(z) ( z->count || z->free_elements ) void diff --git a/osfmk/kern/zalloc.h b/osfmk/kern/zalloc.h index 81322fd9f..630a4fff0 100644 --- a/osfmk/kern/zalloc.h +++ b/osfmk/kern/zalloc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -74,11 +74,17 @@ #ifdef MACH_KERNEL_PRIVATE #include -#include #include #include #include -#include +#include + +#if CONFIG_GZALLOC +typedef struct gzalloc_data { + uint32_t gzfc_index; + vm_offset_t *gzfc; +} gzalloc_data_t; +#endif /* * A zone is a collection of fixed size blocks for which there @@ -111,27 +117,33 @@ struct zone { /* boolean_t */ async_pending :1, /* asynchronous allocation pending? */ #if CONFIG_ZLEAKS /* boolean_t */ zleak_on :1, /* Are we collecting allocation information? */ -#endif /* ZONE_DEBUG */ +#endif /* CONFIG_ZLEAKS */ /* boolean_t */ caller_acct: 1, /* do we account allocation/free to the caller? */ /* boolean_t */ doing_gc :1, /* garbage collect in progress? 
*/ /* boolean_t */ noencrypt :1, /* boolean_t */ no_callout:1, - /* boolean_t */ async_prio_refill:1; + /* boolean_t */ async_prio_refill:1, + /* boolean_t */ gzalloc_exempt:1, + /* boolean_t */ alignment_required:1; int index; /* index into zone_info arrays for this zone */ struct zone * next_zone; /* Link for all-zones list */ - call_entry_data_t call_async_alloc; /* callout for asynchronous alloc */ + thread_call_data_t call_async_alloc; /* callout for asynchronous alloc */ const char *zone_name; /* a name for the zone */ #if ZONE_DEBUG queue_head_t active_zones; /* active elements */ #endif /* ZONE_DEBUG */ #if CONFIG_ZLEAKS - uint32_t num_allocs; /* alloc stats for zleak benchmarks */ + uint32_t num_allocs; /* alloc stats for zleak benchmarks */ uint32_t num_frees; /* free stats for zleak benchmarks */ - uint32_t zleak_capture; /* per-zone counter for capturing every N allocations */ + uint32_t zleak_capture; /* per-zone counter for capturing every N allocations */ #endif /* CONFIG_ZLEAKS */ + uint32_t free_check_count; /* counter for poisoning/checking every N frees */ vm_size_t prio_refill_watermark; thread_t zone_replenish_thread; +#if CONFIG_GZALLOC + gzalloc_data_t gz; +#endif /* CONFIG_GZALLOC */ }; /* @@ -145,7 +157,7 @@ typedef struct zinfo_usage_store_t { } zinfo_usage_store_t; typedef zinfo_usage_store_t *zinfo_usage_t; -extern void zone_gc(void); +extern void zone_gc(boolean_t); extern void consider_zone_gc(boolean_t); /* Steal memory for zone module */ @@ -178,23 +190,15 @@ extern void stack_fake_zone_info( #if ZONE_DEBUG -#if MACH_KDB - -extern void * next_element( - zone_t z, - void *elt); - -extern void * first_element( - zone_t z); - -#endif /* MACH_KDB */ - extern void zone_debug_enable( zone_t z); extern void zone_debug_disable( zone_t z); +#define zone_debug_enabled(z) z->active_zones.next +#define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y)) +#define ZONE_DEBUG_OFFSET ROUNDUP(sizeof(queue_chain_t),16) #endif /* ZONE_DEBUG */ #endif /* MACH_KERNEL_PRIVATE */ @@ -260,6 +264,8 @@ extern void zone_prio_refill_configure(zone_t, vm_size_t); #define Z_NOCALLOUT 7 /* Don't asynchronously replenish the zone via * callouts */ +#define Z_ALIGNMENT_REQUIRED 8 +#define Z_GZALLOC_EXEMPT 9 /* Not tracked in guard allocation mode */ /* Preallocate space for zone from zone map */ extern void zprealloc( zone_t zone, @@ -305,10 +311,31 @@ extern int get_zleak_state(void); #endif /* CONFIG_ZLEAKS */ /* These functions used for leak detection both in zalloc.c and mbuf.c */ -extern uint32_t fastbacktrace(uintptr_t* bt, uint32_t max_frames); -extern uintptr_t hash_mix(uintptr_t x); -extern uint32_t hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size); -extern uint32_t hashaddr(uintptr_t pt, uint32_t max_size); +extern uint32_t fastbacktrace(uintptr_t* bt, uint32_t max_frames) __attribute__((noinline)); +extern uintptr_t hash_mix(uintptr_t); +extern uint32_t hashbacktrace(uintptr_t *, uint32_t, uint32_t); +extern uint32_t hashaddr(uintptr_t, uint32_t); + +#define lock_zone(zone) \ +MACRO_BEGIN \ + lck_mtx_lock_spin(&(zone)->lock); \ +MACRO_END + +#define unlock_zone(zone) \ +MACRO_BEGIN \ + lck_mtx_unlock(&(zone)->lock); \ +MACRO_END + +#if CONFIG_GZALLOC +void gzalloc_init(vm_size_t); +void gzalloc_zone_init(zone_t); +void gzalloc_configure(void); +void gzalloc_reconfigure(zone_t); +boolean_t gzalloc_enabled(void); + +vm_offset_t gzalloc_alloc(zone_t, boolean_t); +boolean_t gzalloc_free(zone_t, void *); +#endif /* CONFIG_GZALLOC */ #endif /* XNU_KERNEL_PRIVATE */ diff --git 
a/osfmk/kperf/Makefile b/osfmk/kperf/Makefile new file mode 100644 index 000000000..9ede0b0f9 --- /dev/null +++ b/osfmk/kperf/Makefile @@ -0,0 +1,33 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +EXPORT_ONLY_FILES = \ + context.h \ + timetrigger.h \ + pet.h \ + filter.h \ + kperfbsd.h \ + action.h \ + kperf.h + + +INSTALL_MI_LIST = + +# Export our headers +EXPORT_MI_LIST = ${EXPORT_ONLY_FILES} + +# Don't install in non-local, though +INSTALL_KF_MI_LIST = "" + +EXPORT_MI_DIR = kperf + +include $(MakeInc_rule) +include $(MakeInc_dir) + + diff --git a/osfmk/kperf/action.c b/osfmk/kperf/action.c new file mode 100644 index 000000000..3f1b5e2b0 --- /dev/null +++ b/osfmk/kperf/action.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Called from a trigger. Actually takes the data from the different + * modules and puts them in a buffer + */ + +#include +#include +// #include +#include +#include /* panic */ +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ACTION_MAX 32 + +/* XXX: callback handler from chudxnu */ +/* FIXME: hook this up to something */ +//void (*kperf_thread_ast_handler)(thread_t); + +/* the list of different actions to take */ +struct action +{ + unsigned sample; +}; + +/* the list of actions */ +static unsigned actionc = 0; +static struct action *actionv = NULL; + + +/* Do the real work! */ +/* this can be called in any context ... right? */ +static kern_return_t +kperf_sample_internal( struct kperf_sample *sbuf, + struct kperf_context *context, + unsigned sample_what, boolean_t pend_user ) +{ + boolean_t enabled; + int did_ucallstack = 0, did_tinfo_extra = 0; + + /* not much point continuing here, but what to do ? return + * Shutdown? cut a tracepoint and continue? + */ + if( sample_what == 0 ) + return SAMPLE_CONTINUE; + + int is_kernel = (context->cur_pid == 0); + + /* an event occurred. Sample everything and dump it in a + * buffer. 
+ */ + + /* collect data from samplers */ + if( sample_what & SAMPLER_TINFO ) { + kperf_threadinfo_sample( &sbuf->threadinfo, context ); + + /* XXX FIXME This drops events when the thread is idle. + * This should be configurable. */ + if (sbuf->threadinfo.runmode & 0x40) + return SAMPLE_CONTINUE; + } + + if( sample_what & SAMPLER_KSTACK ) + kperf_kcallstack_sample( &sbuf->kcallstack, context ); + + /* sensitive ones */ + if ( !is_kernel ) { + if( pend_user ) + { + if( sample_what & SAMPLER_USTACK ) + did_ucallstack = kperf_ucallstack_pend( context ); + + if( sample_what & SAMPLER_TINFOEX ) + did_tinfo_extra = kperf_threadinfo_extra_pend( context ); + } + else + { + if( sample_what & SAMPLER_USTACK ) + kperf_ucallstack_sample( &sbuf->ucallstack, context ); + + if( sample_what & SAMPLER_TINFOEX ) + kperf_threadinfo_extra_sample( &sbuf->tinfo_ex, + context ); + } + } + + /* stash the data into the buffer + * interrupts off to ensure we don't get split + */ + enabled = ml_set_interrupts_enabled(FALSE); + + if ( pend_user ) + BUF_DATA1( PERF_GEN_EVENT | DBG_FUNC_START, sample_what ); + + /* dump threadinfo */ + if( sample_what & SAMPLER_TINFO ) + kperf_threadinfo_log( &sbuf->threadinfo ); + + /* dump kcallstack */ + if( sample_what & SAMPLER_KSTACK ) + kperf_kcallstack_log( &sbuf->kcallstack ); + + + /* dump user stuff */ + if ( !is_kernel ) { + if ( pend_user ) + { + if ( did_ucallstack ) + BUF_INFO1( PERF_CS_UPEND, 0 ); + + if ( did_tinfo_extra ) + BUF_INFO1( PERF_TI_XPEND, 0 ); + } + else + { + if( sample_what & SAMPLER_USTACK ) + kperf_ucallstack_log( &sbuf->ucallstack ); + + if( sample_what & SAMPLER_TINFOEX ) + kperf_threadinfo_extra_log( &sbuf->tinfo_ex ); + } + } + + if ( pend_user ) + BUF_DATA1( PERF_GEN_EVENT | DBG_FUNC_END, sample_what ); + + /* intrs back on */ + ml_set_interrupts_enabled(enabled); + + return SAMPLE_CONTINUE; +} + +/* Translate actionid into sample bits and take a sample */ +kern_return_t +kperf_sample( struct kperf_sample *sbuf, + struct kperf_context *context, + unsigned actionid, boolean_t pend_user ) +{ + unsigned sample_what = 0; + + /* check sampling is on, or panic */ + if( kperf_sampling_status() == KPERF_SAMPLING_OFF ) + panic("trigger fired while sampling off"); + else if( kperf_sampling_status() == KPERF_SAMPLING_SHUTDOWN ) + return SAMPLE_SHUTDOWN; + + /* work out what to sample, if anything */ + if( actionid >= actionc ) + return SAMPLE_SHUTDOWN; + + sample_what = actionv[actionid].sample; + + return kperf_sample_internal( sbuf, context, sample_what, pend_user ); +} + +/* ast callback on a thread */ +void +kperf_thread_ast_handler( thread_t thread ) +{ + int r; + uint32_t t_chud; + unsigned sample_what = 0; + /* we know we're on a thread, so let's do stuff */ + task_t task = NULL; + + /* Don't sample if we are shutting down or off */ + if( kperf_sampling_status() != KPERF_SAMPLING_ON ) + return; + + BUF_INFO1(PERF_AST_HNDLR | DBG_FUNC_START, thread); + + /* FIXME: probably want a faster allocator here... 
:P */ + struct kperf_sample *sbuf = kalloc( sizeof(*sbuf) ); + if( sbuf == NULL ) + { + /* FIXME: error code */ + BUF_INFO1( PERF_AST_ERROR, 0 ); + goto error; + } + + /* make a context, take a sample */ + struct kperf_context ctx; + ctx.cur_thread = thread; + ctx.cur_pid = -1; + + task = chudxnu_task_for_thread(thread); + if(task) + ctx.cur_pid = chudxnu_pid_for_task(task); + + /* decode the chud bits so we know what to sample */ + t_chud = kperf_get_thread_bits(thread); + + if (t_chud & T_AST_NAME) + sample_what |= SAMPLER_TINFOEX; + + if (t_chud & T_AST_CALLSTACK) + sample_what |= SAMPLER_USTACK; + + /* do the sample, just of the user stuff */ + r = kperf_sample_internal( sbuf, &ctx, sample_what, FALSE ); + + /* free it again */ + kfree( sbuf, sizeof(*sbuf) ); + +error: + BUF_INFO1(PERF_AST_HNDLR | DBG_FUNC_END, r); + +} + +/* register AST bits */ +int +kperf_ast_pend( thread_t cur_thread, uint32_t check_bits, + uint32_t set_bits ) +{ + /* pend on the thread */ + uint32_t t_chud, set_done = 0; + + /* can only pend on the current thread */ + if( cur_thread != chudxnu_current_thread() ) + panic("pending to non-current thread"); + + /* get our current bits */ + t_chud = kperf_get_thread_bits(cur_thread); + + /* see if it's already been done or pended */ + if( !(t_chud & check_bits ) ) + { + /* set the bit on the thread */ + t_chud |= set_bits; + kperf_set_thread_bits(cur_thread, t_chud); + + /* set the actual AST */ + kperf_set_thread_ast( cur_thread ); + + set_done = 1; + } + + return set_done; + +// BUF_INFO3( dbg_code, (uintptr_t)cur_thread, t_chud, set_done ); +} + +unsigned +kperf_action_get_count(void) +{ + return actionc; +} + +int +kperf_action_set_samplers( unsigned actionid, uint32_t samplers ) +{ + if( actionid >= actionc ) + return EINVAL; + + actionv[actionid].sample = samplers; + + return 0; +} + +int +kperf_action_get_samplers( unsigned actionid, uint32_t *samplers_out ) +{ + if( actionid >= actionc ) + return EINVAL; + + *samplers_out = actionv[actionid].sample; + + return 0; +} + +int +kperf_action_set_count(unsigned count) +{ + struct action *new_actionv = NULL, *old_actionv = NULL; + unsigned old_count; + + /* easy no-op */ + if( count == actionc ) + return 0; + + /* TODO: allow shrinking? */ + if( count < actionc ) + return EINVAL; + + /* cap it for good measure */ + if( count > ACTION_MAX ) + return EINVAL; + + /* creating the action array for the first time. create a few + * more things, too. + */ + if( actionc == 0 ) + { + int r; + r = kperf_init(); + + if( r != 0 ) + return r; + } + + /* create a new array */ + new_actionv = kalloc( count * sizeof(*new_actionv) ); + if( new_actionv == NULL ) + return ENOMEM; + + old_actionv = actionv; + old_count = actionc; + + if( old_actionv != NULL ) + bcopy( actionv, new_actionv, actionc * sizeof(*actionv) ); + + bzero( &new_actionv[actionc], (count - old_count) * sizeof(*actionv) ); + + actionv = new_actionv; + actionc = count; + + if( old_actionv != NULL ) + kfree( old_actionv, old_count * sizeof(*actionv) ); + + printf( "kperf: done the alloc\n" ); + + return 0; +} diff --git a/osfmk/kperf/action.h b/osfmk/kperf/action.h new file mode 100644 index 000000000..71e91df5f --- /dev/null +++ b/osfmk/kperf/action.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
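The resize in kperf_action_set_count above is a grow-only copy-and-swap: allocate the larger array, copy the old entries, zero only the newly added tail, then publish and free the old storage. A minimal user-space sketch of the same pattern, with malloc/free standing in for kalloc/kfree (names here are illustrative, not from the kernel source):

/* User-space model of the grow-only resize in kperf_action_set_count. */
#include <stdlib.h>
#include <string.h>
#include <errno.h>

struct action { unsigned sample; };

static struct action *actionv = NULL;
static unsigned actionc = 0;

static int
action_set_count(unsigned count)
{
    if (count == actionc)
        return 0;
    if (count < actionc)      /* shrinking not supported, as in the original */
        return EINVAL;

    struct action *new_actionv = malloc(count * sizeof(*new_actionv));
    if (new_actionv == NULL)
        return ENOMEM;

    /* copy the old entries, zero only the newly added tail */
    if (actionv != NULL)
        memcpy(new_actionv, actionv, actionc * sizeof(*actionv));
    memset(&new_actionv[actionc], 0, (count - actionc) * sizeof(*actionv));

    free(actionv);            /* old_actionv/old_count bookkeeping collapsed here */
    actionv = new_actionv;
    actionc = count;
    return 0;
}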
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* fwd decl */ +struct kperf_sample; +struct kperf_context; + + +/* bits for defining what to do on an action */ +#define SAMPLER_TINFO (1<<0) +#define SAMPLER_TINFOEX (1<<1) +#define SAMPLER_KSTACK (1<<2) +#define SAMPLER_USTACK (1<<3) + +/* Take a sample into "sbuf" using current thread "cur_thread" */ +extern kern_return_t kperf_sample( struct kperf_sample *sbuf, + struct kperf_context*, + unsigned actionid, + boolean_t pend_user ); + +/* return codes from taking a sample + * either keep trigger, or something went wrong (or we're shutting down) + * so turn off. + */ +#define SAMPLE_CONTINUE (0) +#define SAMPLE_SHUTDOWN (1) + +/* Get the sample buffer to use from interrupt handler context. Only + * valid in interrupt contexts. + */ +extern struct kperf_sample* kperf_intr_sample_buffer(void); + +/* Interface functions */ +extern unsigned kperf_action_get_count(void); +extern int kperf_action_set_count(unsigned count); + +extern int kperf_action_set_samplers( unsigned actionid, + uint32_t samplers ); +extern int kperf_action_get_samplers( unsigned actionid, + uint32_t *samplers_out ); + +extern void +kperf_thread_ast_handler( thread_t thread ); diff --git a/osfmk/kperf/ast.h b/osfmk/kperf/ast.h new file mode 100644 index 000000000..897d549c4 --- /dev/null +++ b/osfmk/kperf/ast.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
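An action, per the header above, is nothing more than a bitmask of SAMPLER_* values attached to an actionid. A hypothetical configuration sketch using only the declarations above (in this version actionid 0 is simply the first slot of the action array):

/* Hypothetical setup: make action 0 capture thread info plus both stacks.
 * Uses only the declarations above; returns 0 on success, errno otherwise. */
static int
configure_action_zero(void)
{
    uint32_t samplers = SAMPLER_TINFO | SAMPLER_KSTACK | SAMPLER_USTACK;
    int err;

    if ((err = kperf_action_set_count(1)) != 0)
        return err;

    /* any trigger that fires with actionid 0 now takes this sample set */
    return kperf_action_set_samplers(0, samplers);
}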
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* pend ast bits on a thread */ +extern int kperf_ast_pend( thread_t, uint32_t, uint32_t ); diff --git a/osfmk/kperf/buffer.h b/osfmk/kperf/buffer.h new file mode 100644 index 000000000..0bb0f0970 --- /dev/null +++ b/osfmk/kperf/buffer.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* wrapper around kdebug */ + +#include <sys/kdebug.h> + +/* KDEBUG codes */ +#define PERF_CODE(SubClass, code) KDBG_CODE(DBG_PERF, SubClass, code) + +/* broad sub-classes */ +#define PERF_GENERIC (0) +#define PERF_THREADINFO (1) +#define PERF_CALLSTACK (2) +#define PERF_TIMER (3) +#define PERF_PET (4) +#define PERF_AST (5) /* not confusing at all */ + +/* sub-class codes */ +#define PERF_GEN_CODE(code) PERF_CODE(PERF_GENERIC, code) +#define PERF_GEN_EVENT PERF_GEN_CODE(0) + +#define PERF_TI_CODE(code) PERF_CODE(PERF_THREADINFO, code) +#define PERF_TI_SAMPLE PERF_TI_CODE(0) +#define PERF_TI_DATA PERF_TI_CODE(1) +#define PERF_TI_XSAMPLE PERF_TI_CODE(2) +#define PERF_TI_XPEND PERF_TI_CODE(3) +#define PERF_TI_XDATA PERF_TI_CODE(4) + +#define PERF_CS_CODE(code) PERF_CODE(PERF_CALLSTACK, code) +#define PERF_CS_KSAMPLE PERF_CS_CODE(0) +#define PERF_CS_UPEND PERF_CS_CODE(1) +#define PERF_CS_USAMPLE PERF_CS_CODE(2) +#define PERF_CS_KDATA PERF_CS_CODE(3) +#define PERF_CS_UDATA PERF_CS_CODE(4) + +#define PERF_TM_CODE(code) PERF_CODE(PERF_TIMER, code) +#define PERF_TM_ASCHED PERF_TM_CODE(0) +#define PERF_TM_SCHED PERF_TM_CODE(1) +#define PERF_TM_HNDLR PERF_TM_CODE(2) + +#define PERF_PET_CODE(code) PERF_CODE(PERF_PET, code) +#define PERF_PET_THREAD PERF_PET_CODE(0) +#define PERF_PET_ERROR PERF_PET_CODE(1) +#define PERF_PET_RUN PERF_PET_CODE(2) +#define PERF_PET_PAUSE PERF_PET_CODE(3) +#define PERF_PET_IDLE PERF_PET_CODE(4) +#define PERF_PET_SAMPLE PERF_PET_CODE(5) + +#define PERF_AST_CODE(code) PERF_CODE(PERF_AST, code) +#define PERF_AST_HNDLR PERF_AST_CODE(0) +#define PERF_AST_ERROR PERF_AST_CODE(1) + +/* error sub-codes for trace data */ +enum +{ + ERR_TASK, + ERR_THREAD, + ERR_PID, + ERR_FRAMES, + ERR_GETSTACK, + ERR_NOMEM, +}; + +/* for logging information / debugging -- optional */ +#define BUF_INFO( id, a0, a1, a2, a3) KERNEL_DEBUG_CONSTANT(id,a0,a1,a2,a3,0) + +#define BUF_INFO1( id, a0 ) BUF_INFO(id, a0, 0, 0, 0 ) +#define BUF_INFO2( id, a0, a1 ) BUF_INFO(id, a0, a1, 0, 0 ) +#define BUF_INFO3( id, a0, a1, a2 ) BUF_INFO(id, a0, a1, a2, 0 ) + +/* for logging actual data -- never compiled out */ +#define BUF_DATA( id, a0, a1, a2, a3) KERNEL_DEBUG_CONSTANT(id,a0,a1,a2,a3,0) + +/* code neatness */ +#define BUF_DATA1( id, a0 ) BUF_DATA(id, a0, 0, 0, 0 ) +#define BUF_DATA2( id, a0, a1 ) BUF_DATA(id, a0, a1, 0, 0 ) +#define BUF_DATA3( id, a0, a1, a2 ) BUF_DATA(id, a0, a1, a2, 0 ) diff --git a/osfmk/kperf/callstack.c b/osfmk/kperf/callstack.c new file mode 100644 index 000000000..d0c1e3947 --- /dev/null +++ b/osfmk/kperf/callstack.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
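For reference, PERF_CODE above ultimately packs into a single 32-bit kdebug debugid. A minimal sketch of that packing, assuming the usual xnu encoding of KDBG_CODE (class in bits 24-31, subclass in bits 16-23, code shifted left by 2) and assuming DBG_PERF is 37; both the macro layout and the class value live in sys/kdebug.h and should be checked there:

/* Model of the debugid packing assumed above (see sys/kdebug.h).
 * MY_KDBG_CODE is a local stand-in, not the real macro. */
#include <stdio.h>
#include <stdint.h>

#define MY_KDBG_CODE(cls, sub, code) \
    ((((uint32_t)(cls) & 0xff) << 24) | (((uint32_t)(sub) & 0xff) << 16) | \
     (((uint32_t)(code) & 0x3fff) << 2))

int main(void)
{
    uint32_t dbg_perf = 37;   /* assumed value of DBG_PERF */
    uint32_t ti_sample = MY_KDBG_CODE(dbg_perf, 1 /* PERF_THREADINFO */, 0);
    printf("PERF_TI_SAMPLE debugid = 0x%08x\n", ti_sample);   /* 0x25010000 */
    return 0;
}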
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* Collect kernel callstacks */ + +#include +#include /* XXX: remove me */ +#include + +#include + +#include +#include +#include +#include + +static void +callstack_sample( struct callstack *cs, + struct kperf_context *context, + uint32_t is_user ) +{ + kern_return_t kr; + mach_msg_type_number_t nframes; /* WTF with the type? */ + uint32_t code; + + if( is_user ) + code = PERF_CS_USAMPLE; + else + code = PERF_CS_KSAMPLE; + + BUF_INFO1( code, (uintptr_t)context->cur_thread ); + + /* fill out known flags */ + cs->flags = 0; + if( !is_user ) + { + cs->flags |= CALLSTACK_KERNEL; +#ifdef __LP64__ + cs->flags |= CALLSTACK_64BIT; +#endif + } + else + { + /* FIXME: detect 32 vs 64-bit? */ + } + + /* collect the callstack */ + nframes = MAX_CALLSTACK_FRAMES; + kr = chudxnu_thread_get_callstack64( context->cur_thread, + cs->frames, + &nframes, + is_user ); + + /* check for overflow */ + if( kr == KERN_SUCCESS ) + { + cs->flags |= CALLSTACK_VALID; + cs->nframes = nframes; + } + else if( kr == KERN_RESOURCE_SHORTAGE ) + { + /* FIXME: more here */ + cs->flags |= CALLSTACK_TRUNCATED; + cs->flags |= CALLSTACK_VALID; + cs->nframes = nframes; + } + else + { + BUF_INFO2(PERF_PET_ERROR, ERR_GETSTACK, kr); + cs->nframes = 0; + } + + if( cs->nframes >= MAX_CALLSTACK_FRAMES ) + { + /* necessary? */ + BUF_INFO1(PERF_PET_ERROR, ERR_FRAMES); + cs->nframes = 0; + } + +} + +void +kperf_kcallstack_sample( struct callstack *cs, struct kperf_context *context ) +{ + callstack_sample( cs, context, 0 ); +} + +void +kperf_ucallstack_sample( struct callstack *cs, struct kperf_context *context ) +{ + callstack_sample( cs, context, 1 ); +} + +static void +callstack_log( struct callstack *cs, uint32_t code ) +{ + unsigned int i, j, n, of = 4; + + /* Header on the stack */ + BUF_DATA2( code, cs->flags, cs->nframes ); + + /* look for how many batches of 4 */ + n = cs->nframes / 4; + of = cs->nframes % 4; + if( of != 0 ) + n++; + + /* print all the stack data, and zero the overflow */ + for( i = 0; i < n; i++ ) + { +#define SCRUB_FRAME(x) (((x)<cs->nframes)?cs->frames[x]:0) + j = i * 4; + BUF_DATA ( code, + SCRUB_FRAME(j+0), + SCRUB_FRAME(j+1), + SCRUB_FRAME(j+2), + SCRUB_FRAME(j+3) ); +#undef SCRUB_FRAME + } +} + +void +kperf_kcallstack_log( struct callstack *cs ) +{ + callstack_log( cs, PERF_CS_KDATA ); +} + +void +kperf_ucallstack_log( struct callstack *cs ) +{ + callstack_log( cs, PERF_CS_UDATA ); +} + +int +kperf_ucallstack_pend( struct kperf_context * context ) +{ + return kperf_ast_pend( context->cur_thread, T_AST_CALLSTACK, + T_AST_CALLSTACK ); +} + +// kr = chudxnu_thread_get_callstack(context->generic->threadID, +// (uint32_t*)frames, &frameCount, !collectingSupervisorStack); diff --git a/osfmk/kperf/callstack.h b/osfmk/kperf/callstack.h new file mode 100644 index 000000000..3bfd96422 --- /dev/null +++ b/osfmk/kperf/callstack.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. 
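callstack_log above emits frames four per tracepoint, rounding the record count up and padding the final record with zeros. The same batching logic, extracted into a standalone user-space sketch with printf standing in for the BUF_DATA tracepoint:

/* Standalone model of the four-frames-per-record batching in callstack_log. */
#include <stdio.h>
#include <stdint.h>

static void
log_frames(const uint64_t *frames, unsigned nframes)
{
    unsigned n = nframes / 4;
    if (nframes % 4 != 0)
        n++;                               /* round up to whole records */

    for (unsigned i = 0; i < n; i++) {
        unsigned j = i * 4;
        /* indices past nframes are scrubbed to 0, as SCRUB_FRAME does */
        #define FRAME(x) ((x) < nframes ? frames[x] : 0)
        printf("record: %llx %llx %llx %llx\n",
               (unsigned long long)FRAME(j + 0), (unsigned long long)FRAME(j + 1),
               (unsigned long long)FRAME(j + 2), (unsigned long long)FRAME(j + 3));
        #undef FRAME
    }
}

int main(void)
{
    uint64_t frames[] = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6 };
    log_frames(frames, 6);                 /* two records, last padded with 0s */
    return 0;
}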
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef __AP_CALLSTACK_H__ +#define __AP_CALLSTACK_H__ + +#define MAX_CALLSTACK_FRAMES (128) + +#define CALLSTACK_VALID (1<<0) +#define CALLSTACK_DEFERRED (1<<1) +#define CALLSTACK_64BIT (1<<2) +#define CALLSTACK_KERNEL (1<<3) +#define CALLSTACK_TRUNCATED (1<<4) + +struct callstack +{ + uint32_t flags; + uint32_t nframes; + uint64_t frames[MAX_CALLSTACK_FRAMES]; +}; + +struct kperf_context; + +extern void kperf_kcallstack_sample( struct callstack *cs, struct kperf_context * ); +extern void kperf_kcallstack_log( struct callstack *cs ); + +extern void kperf_ucallstack_sample( struct callstack *cs, struct kperf_context * ); +extern int kperf_ucallstack_pend( struct kperf_context * ); +extern void kperf_ucallstack_log( struct callstack *cs ); + + +#endif /* __AP_CALLSTACK_H__ */ diff --git a/osfmk/kperf/context.h b/osfmk/kperf/context.h new file mode 100644 index 000000000..e06b9f978 --- /dev/null +++ b/osfmk/kperf/context.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
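One consequence of the fixed-size frames array in callstack.h above: every struct callstack is a bit over 1 KB (128 eight-byte frames plus two words), which is why the samplers keep static or per-CPU buffers rather than putting one on the stack. A sketch of consuming a filled-in callstack; the struct and flag definitions are mirrored here only to keep the sketch self-contained:

/* Sketch of interpreting a sampled callstack, mirroring the header above. */
#include <stdint.h>

#define CALLSTACK_VALID      (1<<0)
#define CALLSTACK_TRUNCATED  (1<<4)
#define MAX_CALLSTACK_FRAMES 128

struct callstack {
    uint32_t flags;
    uint32_t nframes;
    uint64_t frames[MAX_CALLSTACK_FRAMES];
};

static void
report_callstack(const struct callstack *cs)
{
    if (!(cs->flags & CALLSTACK_VALID))
        return;                /* sample failed; frames are meaningless */

    /* TRUNCATED means the thread had more frames than fit in the buffer */
    int partial = (cs->flags & CALLSTACK_TRUNCATED) != 0;

    for (uint32_t i = 0; i < cs->nframes; i++) {
        /* cs->frames[i] is a return address, stored as 64-bit either way */
    }
    (void)partial;
}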
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* context of what we're looking at */ +struct kperf_context +{ + /* who was running during the event */ + int cur_pid; + thread_t cur_thread; + + /* who caused the event */ + unsigned trigger_type; + unsigned trigger_id; +}; diff --git a/osfmk/kperf/filter.c b/osfmk/kperf/filter.c new file mode 100644 index 000000000..1485d7472 --- /dev/null +++ b/osfmk/kperf/filter.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* Toy filtering. Allow system-wide or filtering on 4 PIDs */ + +#include +#include /* NULL */ +// #include + +#include +#include + +// Filter params... dodge for now +#define NPIDS (4) +int pid_list[NPIDS]; + +// function to determine whether we should take a sample +int +kperf_filter_should_sample(struct kperf_context *context) +{ + int i, restricted = 0; + + /* see if the pids are restricted */ + for( i = 0; i < NPIDS; i++ ) + { + if( context->cur_pid == pid_list[i] ) + return 1; + + if( pid_list[i] != -1 ) + restricted = 1; + } + + /* wasn't in the pid list, but something was */ + if( restricted ) + return 0; + + /* not filtered, sample it */ + return 1; +} + +/* check whether pid filtering is enabled */ +int +kperf_filter_on_pid(void) +{ + int i; + + for( i = 0; i < NPIDS; i++ ) + if( pid_list[i] != -1 ) + return 1; + + return 0; +} + +/* create a list of pids to filter */ +void +kperf_filter_pid_list( int *outcount, int **outv ) +{ + int i, found = 0; + + for( i = 0; i < NPIDS; i++ ) + if( pid_list[i] != -1 ) + found = 1; + + if( !found ) + { + *outcount = 0; + *outv = NULL; + return; + } + + /* just return our list */ + *outcount = NPIDS; + *outv = pid_list; +} + +/* free a list we created */ +void +kperf_filter_free_pid_list( int *incount, int **inv ) +{ + // no op + (void) incount; + (void) inv; +} + +/* init the filters to nothing */ +void +kperf_filter_init(void) +{ + int i; + for( i = 0; i < NPIDS; i++ ) + pid_list[i] = -1; +} diff --git a/osfmk/kperf/filter.h b/osfmk/kperf/filter.h new file mode 100644 index 000000000..655c4fd30 --- /dev/null +++ b/osfmk/kperf/filter.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. 
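The filter above has simple semantics: with all four slots at -1 nothing is restricted, and once any slot holds a pid, only listed pids are sampled. A standalone model of kperf_filter_should_sample's logic:

/* User-space model of the pid-filter decision in kperf_filter_should_sample. */
#include <stdio.h>

#define NPIDS 4
static int pid_list[NPIDS] = { -1, -1, -1, -1 };

static int
should_sample(int pid)
{
    int restricted = 0;
    for (int i = 0; i < NPIDS; i++) {
        if (pid == pid_list[i])
            return 1;           /* explicitly listed: sample */
        if (pid_list[i] != -1)
            restricted = 1;     /* some filter is active */
    }
    return restricted ? 0 : 1;  /* active filter excludes us; empty filter allows all */
}

int main(void)
{
    printf("%d\n", should_sample(42));   /* 1: empty filter samples everything */
    pid_list[0] = 100;
    printf("%d %d\n", should_sample(100), should_sample(42));   /* 1 0 */
    return 0;
}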
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* so we can pull this in without the context header... */ +struct kperf_context; + +extern void kperf_filter_init(void); +extern int kperf_filter_should_sample(struct kperf_context *context); + +extern int kperf_filter_on_pid(void); +extern void kperf_filter_pid_list( int *outcount, int **outv ); +extern void kperf_filter_free_pid_list( int *incount, int **inv ); + +extern int pid_list[]; diff --git a/osfmk/kperf/kperf.c b/osfmk/kperf/kperf.c new file mode 100644 index 000000000..2d6fc400e --- /dev/null +++ b/osfmk/kperf/kperf.c @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
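Note the asymmetric contract in the list functions declared above: kperf_filter_pid_list hands back a pointer to the filter's own static storage (all NPIDS slots, -1 holes included), so nothing is copied and the matching free call is deliberately a no-op. A caller sketch using only the names from the header:

/* Usage sketch for the pid-list API above; the returned vector is the
 * filter's internal storage, so it is walked in place and never freed. */
static void
walk_filter_pids(void)
{
    int pidc = 0;
    int *pidv = NULL;

    kperf_filter_pid_list(&pidc, &pidv);
    for (int i = 0; i < pidc; i++) {
        if (pidv[i] != -1) {
            /* pidv[i] is an active filter entry */
        }
    }
    kperf_filter_free_pid_list(&pidc, &pidv);   /* currently a no-op, kept for symmetry */
}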
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/** misc functions **/ +#include /* XXX: should bust this out */ + +static struct kperf_sample *intr_samplev = NULL; +static unsigned intr_samplec = 0; +static unsigned sampling_status = KPERF_SAMPLING_OFF; +static unsigned kperf_initted = 0; + + +extern void (*chudxnu_thread_ast_handler)(thread_t); + +struct kperf_sample* +kperf_intr_sample_buffer(void) +{ + unsigned ncpu = chudxnu_cpu_number(); + + // XXX: assert? + if( ncpu >= intr_samplec ) + return NULL; + + return &intr_samplev[ncpu]; +} + +/* setup interrupt sample buffers */ +int +kperf_init(void) +{ + unsigned ncpus = 0; + + if( kperf_initted ) + return 0; + + /* get number of cpus */ + ncpus = machine_info.logical_cpu_max; + + /* make the CPU array + * FIXME: cache alignment + */ + intr_samplev = kalloc( ncpus * sizeof(*intr_samplev)); + + if( intr_samplev == NULL ) + return ENOMEM; + + /* clear it */ + bzero( intr_samplev, ncpus * sizeof(*intr_samplev) ); + + chudxnu_thread_ast_handler = kperf_thread_ast_handler; + + /* we're done */ + intr_samplec = ncpus; + kperf_initted = 1; + + return 0; +} + + +/** kext start/stop functions **/ +kern_return_t kperf_start (kmod_info_t * ki, void * d); + +kern_return_t +kperf_start (kmod_info_t * ki, void * d) +{ + (void) ki; + (void) d; + + /* say hello */ + printf( "aprof: kext starting\n" ); + + /* register modules */ + // kperf_action_init(); + kperf_filter_init(); + kperf_pet_init(); + + /* register the sysctls */ + //kperf_register_profiling(); + + return KERN_SUCCESS; +} + + +/* random misc-ish functions */ +uint32_t +kperf_get_thread_bits( thread_t thread ) +{ + return thread->t_chud; +} + +void +kperf_set_thread_bits( thread_t thread, uint32_t bits ) +{ + thread->t_chud = bits; +} + +/* mark an AST to fire on a thread */ +void +kperf_set_thread_ast( thread_t thread ) +{ + /* FIXME: only call this on current thread from an interrupt + * handler for now... + */ + if( thread != current_thread() ) + panic( "unsafe AST set" ); + + act_set_kperf(thread); +} + +unsigned +kperf_sampling_status(void) +{ + return sampling_status; +} + +int +kperf_sampling_enable(void) +{ + /* already running! */ + if( sampling_status == KPERF_SAMPLING_ON ) + return 0; + + if ( sampling_status != KPERF_SAMPLING_OFF ) + panic( "kperf: sampling wasn't off" ); + + /* make sure interrupt tables and actions are initted */ + if( !kperf_initted + || (kperf_action_get_count() == 0) ) + return ECANCELED; + + /* mark as running */ + sampling_status = KPERF_SAMPLING_ON; + + /* tell timers to enable */ + kperf_timer_go(); + + return 0; +} + +int +kperf_sampling_disable(void) +{ + if( sampling_status != KPERF_SAMPLING_ON ) + return 0; + + /* mark as shutting down */ + sampling_status = KPERF_SAMPLING_SHUTDOWN; + + /* tell timers to disable */ + kperf_timer_stop(); + + /* mark as off */ + sampling_status = KPERF_SAMPLING_OFF; + + return 0; +} diff --git a/osfmk/kperf/kperf.h b/osfmk/kperf/kperf.h new file mode 100644 index 000000000..1e1ab32cd --- /dev/null +++ b/osfmk/kperf/kperf.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
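kperf_init above sizes one interrupt sample buffer per logical CPU, and kperf_intr_sample_buffer indexes it by the current CPU number so interrupt-context samplers never contend for a buffer. A user-space model of that pattern, with calloc standing in for kalloc plus bzero and a stand-in payload type:

/* Model of the per-CPU buffer scheme in kperf_init/kperf_intr_sample_buffer. */
#include <stdlib.h>

struct sample { char payload[64]; };      /* stand-in for struct kperf_sample */

static struct sample *samplev = NULL;
static unsigned samplec = 0;

static int
buffers_init(unsigned ncpus)
{
    samplev = calloc(ncpus, sizeof(*samplev));   /* kalloc + bzero in the kernel */
    if (samplev == NULL)
        return -1;
    samplec = ncpus;
    return 0;
}

static struct sample *
buffer_for_cpu(unsigned cpu)
{
    if (cpu >= samplec)       /* same bounds check as the kernel code */
        return NULL;
    return &samplev[cpu];     /* one slot per CPU: no locking needed */
}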
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* XXX: still needed? just access directly? */ + +#define TRIGGER_TYPE_TIMER (0) +#define TRIGGER_TYPE_PMI (1) +#define TRIGGER_TYPE_TRACE (2) + +extern uint32_t kperf_get_thread_bits( thread_t thread ); +extern void kperf_set_thread_bits( thread_t thread, uint32_t bits ); +extern void kperf_set_thread_ast( thread_t thread ); + +#define KPERF_SAMPLING_OFF 0 +#define KPERF_SAMPLING_ON 1 +#define KPERF_SAMPLING_SHUTDOWN 2 + +extern int kperf_init(void); +extern unsigned kperf_sampling_status(void); +extern int kperf_sampling_enable(void); +extern int kperf_sampling_disable(void); diff --git a/osfmk/kperf/kperf_arch.h b/osfmk/kperf/kperf_arch.h new file mode 100644 index 000000000..dd6e319c3 --- /dev/null +++ b/osfmk/kperf/kperf_arch.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
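The three KPERF_SAMPLING_* values above form a small state machine: enable moves OFF to ON after checking that init ran and at least one action exists, while disable passes through SHUTDOWN so in-flight triggers see kperf_sample return SAMPLE_SHUTDOWN before the state settles back at OFF. A minimal model of those transitions:

/* Minimal model of the sampling state machine in kperf.c/kperf.h above. */
enum { OFF, ON, SHUTDOWN };
static int state = OFF;

static int
enable(int initted, unsigned action_count)
{
    if (state == ON)
        return 0;                      /* already running */
    if (!initted || action_count == 0)
        return -1;                     /* ECANCELED in the original */
    state = ON;                        /* then the timers are programmed */
    return 0;
}

static void
disable(void)
{
    if (state != ON)
        return;
    state = SHUTDOWN;                  /* in-flight triggers observe shutdown */
    /* ...timers are stopped and drain here... */
    state = OFF;                       /* sampling is now fully off */
}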
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _KPERF_ARCH_H +#define _KPERF_ARCH_H + +/* per-arch header */ +#if defined(__x86_64__) +#include "kperf/x86_64/kperf_arch.h" +#else +#error architecture not supported +#endif + +/* common definitions */ +extern int kperf_mp_broadcast( void (*func)(void*), void *arg ); + +#endif /* _KPERF_ARCH_H */ diff --git a/osfmk/kperf/kperfbsd.c b/osfmk/kperf/kperfbsd.c new file mode 100644 index 000000000..6e626e453 --- /dev/null +++ b/osfmk/kperf/kperfbsd.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* sysctl interface for parameters from user-land */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define REQ_SAMPLING (1) +#define REQ_ACTION_COUNT (2) +#define REQ_ACTION_SAMPLERS (3) +#define REQ_TIMER_COUNT (4) +#define REQ_TIMER_PERIOD (5) +#define REQ_TIMER_PET (6) + + +static int +sysctl_timer_period( __unused struct sysctl_oid *oidp, struct sysctl_req *req ) +{ + int error = 0; + uint64_t inputs[2], retval; + unsigned timer, set = 0; + + /* get 2x 64-bit words */ + error = SYSCTL_IN( req, inputs, 2*sizeof(inputs[0]) ); + if(error) + { + printf( "error in\n" ); + return (error); + } + + /* setup inputs */ + timer = (unsigned) inputs[0]; + if( inputs[1] != ~0ULL ) + set = 1; + + printf( "%s timer: %u, inp[0] %llu\n", set ? 
"set" : "get", + timer, inputs[0] ); + + if( set ) + { + printf( "timer set period\n" ); + error = kperf_timer_set_period( timer, inputs[1] ); + if( error ) + return error; + } + + error = kperf_timer_get_period(timer, &retval); + if(error) + { + printf( "error get period\n" ); + return (error); + } + + inputs[1] = retval; + + if( error == 0 ) + { + error = SYSCTL_OUT( req, inputs, 2*sizeof(inputs[0]) ); + if( error ) + printf( "error out\n" ); + } + + return error; +} + +static int +sysctl_action_samplers( __unused struct sysctl_oid *oidp, + struct sysctl_req *req ) +{ + int error = 0; + uint64_t inputs[3]; + uint32_t retval; + unsigned actionid, set = 0; + + /* get 3x 64-bit words */ + error = SYSCTL_IN( req, inputs, 3*sizeof(inputs[0]) ); + if(error) + { + printf( "error in\n" ); + return (error); + } + + /* setup inputs */ + set = (unsigned) inputs[0]; + actionid = (unsigned) inputs[1]; + + if( set ) + { + error = kperf_action_set_samplers( actionid, inputs[2] ); + if( error ) + return error; + } + + printf("set %d actionid %u samplers val %u\n", + set, actionid, (unsigned) inputs[2] ); + + error = kperf_action_get_samplers(actionid, &retval); + if(error) + { + printf( "error get samplers\n" ); + return (error); + } + + inputs[2] = retval; + + if( error == 0 ) + { + error = SYSCTL_OUT( req, inputs, 3*sizeof(inputs[0]) ); + if( error ) + printf( "error out\n" ); + } + + return error; +} + +static int +sysctl_sampling( struct sysctl_oid *oidp, struct sysctl_req *req ) +{ + int error = 0; + uint32_t value = 0; + + /* get the old value and process it */ + value = kperf_sampling_status(); + + /* copy out the old value, get the new value */ + error = sysctl_handle_int(oidp, &value, 0, req); + if (error || !req->newptr) + return (error); + + printf( "setting sampling to %d\n", value ); + + /* if that worked, and we're writing... */ + if( value ) + error = kperf_sampling_enable(); + else + error = kperf_sampling_disable(); + + return error; +} + +static int +sysctl_action_count( struct sysctl_oid *oidp, struct sysctl_req *req ) +{ + int error = 0; + uint32_t value = 0; + + /* get the old value and process it */ + value = kperf_action_get_count(); + + /* copy out the old value, get the new value */ + error = sysctl_handle_int(oidp, &value, 0, req); + if (error || !req->newptr) + return (error); + + printf( "setting action count to %d\n", value ); + + /* if that worked, and we're writing... */ + return kperf_action_set_count(value); +} + +static int +sysctl_timer_count( struct sysctl_oid *oidp, struct sysctl_req *req ) +{ + int error = 0; + uint32_t value = 0; + + /* get the old value and process it */ + value = kperf_timer_get_count(); + + /* copy out the old value, get the new value */ + error = sysctl_handle_int(oidp, &value, 0, req); + if (error || !req->newptr) + return (error); + + printf( "setting timer count to %d\n", value ); + + /* if that worked, and we're writing... */ + return kperf_timer_set_count(value); +} + +static int +sysctl_timer_pet( struct sysctl_oid *oidp, struct sysctl_req *req ) +{ + int error = 0; + uint32_t value = 0; + + /* get the old value and process it */ + value = kperf_timer_get_petid(); + + /* copy out the old value, get the new value */ + error = sysctl_handle_int(oidp, &value, 0, req); + if (error || !req->newptr) + return (error); + + printf( "setting timer petid to %d\n", value ); + + /* if that worked, and we're writing... 
*/ + return kperf_timer_set_petid(value); +} + +/* + * #define SYSCTL_HANDLER_ARGS (struct sysctl_oid *oidp, \ + * void *arg1, int arg2, \ + * struct sysctl_req *req ) + */ +static int +kperf_sysctl SYSCTL_HANDLER_ARGS +{ + // __unused struct sysctl_oid *unused_oidp = oidp; + (void)arg2; + + /* which request */ + switch( (uintptr_t) arg1 ) + { + case REQ_ACTION_COUNT: + return sysctl_action_count( oidp, req ); + case REQ_ACTION_SAMPLERS: + return sysctl_action_samplers( oidp, req ); + case REQ_TIMER_COUNT: + return sysctl_timer_count( oidp, req ); + case REQ_TIMER_PERIOD: + return sysctl_timer_period( oidp, req ); + case REQ_TIMER_PET: + return sysctl_timer_pet( oidp, req ); + case REQ_SAMPLING: + return sysctl_sampling( oidp, req ); + +#if 0 + case REQ_TIMER: + return sysctl_timer_period( req ); + case REQ_PET: + return sysctl_pet_period( req ); +#endif + default: + return ENOENT; + } +} + +/* root kperf node */ +SYSCTL_NODE(, OID_AUTO, kperf, CTLFLAG_RW|CTLFLAG_LOCKED, 0, + "kperf"); + +/* action sub-section */ +SYSCTL_NODE(_kperf, OID_AUTO, action, CTLFLAG_RW|CTLFLAG_LOCKED, 0, + "action"); + +SYSCTL_PROC(_kperf_action, OID_AUTO, count, + CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, + (void*)REQ_ACTION_COUNT, + sizeof(int), kperf_sysctl, "I", "Number of actions"); + +SYSCTL_PROC(_kperf_action, OID_AUTO, samplers, + CTLFLAG_RW|CTLFLAG_ANYBODY, + (void*)REQ_ACTION_SAMPLERS, + 3*sizeof(uint64_t), kperf_sysctl, "UQ", + "What to sample what a trigger fires an action"); + +/* timer sub-section */ +SYSCTL_NODE(_kperf, OID_AUTO, timer, CTLFLAG_RW|CTLFLAG_LOCKED, 0, + "timer"); + +SYSCTL_PROC(_kperf_timer, OID_AUTO, count, + CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, + (void*)REQ_TIMER_COUNT, + sizeof(int), kperf_sysctl, "I", "Number of time triggers"); + +SYSCTL_PROC(_kperf_timer, OID_AUTO, period, + CTLFLAG_RW|CTLFLAG_ANYBODY, + (void*)REQ_TIMER_PERIOD, + 2*sizeof(uint64_t), kperf_sysctl, "UQ", "Timer number and period"); + +SYSCTL_PROC(_kperf_timer, OID_AUTO, pet_timer, + CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, + (void*)REQ_TIMER_PET, + sizeof(int), kperf_sysctl, "I", "Which timer ID does PET"); + +/* misc */ +SYSCTL_PROC(_kperf, OID_AUTO, sampling, + CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, + (void*)REQ_SAMPLING, + sizeof(int), kperf_sysctl, "I", "Sampling running"); + +int legacy_mode = 1; +SYSCTL_INT(_kperf, OID_AUTO, legacy_mode, CTLFLAG_RW, &legacy_mode, 0, "legacy_mode"); + +#if 0 +SYSCTL_PROC(_kperf, OID_AUTO, timer_period, + CTLFLAG_RW, (void*)REQ_TIMER, + sizeof(uint64_t), kperf_sysctl, "QU", "nanoseconds"); + +SYSCTL_PROC(_kperf, OID_AUTO, pet_period, + CTLFLAG_RW, (void*)REQ_PET, + sizeof(uint64_t), kperf_sysctl, "QU", "nanoseconds"); + +/* FIXME: do real stuff */ +SYSCTL_INT(_kperf, OID_AUTO, filter_pid0, + CTLFLAG_RW, &pid_list[0], 0, ""); +SYSCTL_INT(_kperf, OID_AUTO, filter_pid1, + CTLFLAG_RW, &pid_list[1], 0, ""); +SYSCTL_INT(_kperf, OID_AUTO, filter_pid2, + CTLFLAG_RW, &pid_list[2], 0, ""); +SYSCTL_INT(_kperf, OID_AUTO, filter_pid3, + CTLFLAG_RW, &pid_list[3], 0, ""); + +#endif diff --git a/osfmk/kperf/kperfbsd.h b/osfmk/kperf/kperfbsd.h new file mode 100644 index 000000000..8c5864096 --- /dev/null +++ b/osfmk/kperf/kperfbsd.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + diff --git a/osfmk/kperf/pet.c b/osfmk/kperf/pet.c new file mode 100644 index 000000000..3b039c059 --- /dev/null +++ b/osfmk/kperf/pet.c @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* all thread states code */ +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +/* timer id to call back on */ +static unsigned pet_timerid = 0; + +/* action ID to call + * We also use this as the sync point for waiting, for no good reason + */ +static unsigned pet_actionid = 0; + +/* the actual thread pointer */ +static thread_t pet_thread = NULL; + +/* Lock on which to synchronise */ +static IOLock *pet_lock = NULL; + +/* where to sample data to */ +static struct kperf_sample pet_sample_buf; + +/* sample an actual, honest to god thread! 
*/ +static void +pet_sample_thread( thread_t thread ) +{ + struct kperf_context ctx; + task_t task; + + /* work out the context */ + ctx.cur_thread = thread; + ctx.cur_pid = -1; + + task = chudxnu_task_for_thread(thread); + if(task) + ctx.cur_pid = chudxnu_pid_for_task(task); + + /* do the actual sample */ + kperf_sample( &pet_sample_buf, &ctx, pet_actionid, false ); +} + +/* given a list of threads, preferably stopped, sample 'em! */ +static void +pet_sample_thread_list( mach_msg_type_number_t threadc, thread_array_t threadv ) +{ + unsigned int i; + + for( i = 0; i < threadc; i++ ) + { + thread_t thread = threadv[i]; + + if( !thread ) + /* XXX? */ + continue; + + pet_sample_thread( thread ); + } +} + +/* given a task (preferably stopped), sample all the threads in it */ +static void +pet_sample_task( task_t task ) +{ + mach_msg_type_number_t threadc; + thread_array_t threadv; + kern_return_t kr; + + kr = chudxnu_task_threads(task, &threadv, &threadc); + if( kr != KERN_SUCCESS ) + { + BUF_INFO2(PERF_PET_ERROR, ERR_THREAD, kr); + return; + } + + pet_sample_thread_list( threadc, threadv ); + + chudxnu_free_thread_list(&threadv, &threadc); +} + +/* given a list of tasks, sample all the threads in 'em */ +static void +pet_sample_task_list( int taskc, task_array_t taskv ) +{ + int i; + + for( i = 0; i < taskc; i++ ) + { + kern_return_t kr; + task_t task = taskv[i]; + + /* FIXME: necessary? old code did this, our hacky + * filtering code does, too + */ + if(!task) { + continue; + } + + /* try and stop any task other than the kernel task */ + if( task != kernel_task ) + { + kr = task_suspend( task ); + + /* try the next task */ + if( kr != KERN_SUCCESS ) + continue; + } + + /* sample it */ + pet_sample_task( task ); + + /* if it wasn't the kernel, resume it */ + if( task != kernel_task ) + task_resume(task); + } +} + +static void +pet_sample_all_tasks(void) +{ + task_array_t taskv = NULL; + mach_msg_type_number_t taskc = 0; + kern_return_t kr; + + kr = chudxnu_all_tasks(&taskv, &taskc); + + if( kr != KERN_SUCCESS ) + { + BUF_INFO2(PERF_PET_ERROR, ERR_TASK, kr); + return; + } + + pet_sample_task_list( taskc, taskv ); + chudxnu_free_task_list(&taskv, &taskc); +} + +static void +pet_sample_pid_filter(void) +{ + task_t *taskv = NULL; + int *pidv, pidc, i; + vm_size_t asize; + + kperf_filter_pid_list( &pidc, &pidv ); + if( pidc == 0 ) + { + BUF_INFO2(PERF_PET_ERROR, ERR_PID, 0); + return; + } + + asize = pidc * sizeof(task_t); + taskv = kalloc( asize ); + + if( taskv == NULL ) + goto out; + + /* convert the pid list into a task list */ + for( i = 0; i < pidc; i++ ) + { + int pid = pidv[i]; + if( pid == -1 ) + taskv[i] = NULL; + else + taskv[i] = chudxnu_task_for_pid(pid); + } + + /* now sample the task list */ + pet_sample_task_list( pidc, taskv ); + + kfree(taskv, asize); + +out: + kperf_filter_free_pid_list( &pidc, &pidv ); +} + +/* do the pet sample */ +static void +pet_work_unit(void) +{ + int pid_filter; + + /* check if we're filtering on pid */ + pid_filter = kperf_filter_on_pid(); + + if( pid_filter ) + { + BUF_INFO1(PERF_PET_SAMPLE | DBG_FUNC_START, 1); + pet_sample_pid_filter(); + } + else + { + /* otherwise filter everything */ + BUF_INFO1(PERF_PET_SAMPLE | DBG_FUNC_START, 0); + pet_sample_all_tasks(); + } + + BUF_INFO1(PERF_PET_SAMPLE | DBG_FUNC_END, 0); + +} + +/* sleep indefinitely */ +static void +pet_idle(void) +{ + IOLockLock(pet_lock); + IOLockSleep(pet_lock, &pet_actionid, THREAD_UNINT); + IOLockUnlock(pet_lock); +} + +/* loop between sampling and waiting */ +static void 
+/* loop between sampling and waiting */
+static void
+pet_thread_loop( __unused void *param, __unused wait_result_t wr )
+{
+	BUF_INFO1(PERF_PET_THREAD, 1);
+
+	while(1)
+	{
+		BUF_INFO1(PERF_PET_IDLE, 0);
+		pet_idle();
+
+		BUF_INFO1(PERF_PET_RUN, 0);
+		pet_work_unit();
+
+		/* re-program the timer */
+		kperf_timer_pet_set( pet_timerid );
+
+		/* FIXME: break here on a condition? */
+	}
+}
+
+/* make sure the thread takes a new period value */
+void
+kperf_pet_timer_config( unsigned timerid, unsigned actionid )
+{
+	/* hold the lock so pet thread doesn't run while we do this */
+	IOLockLock(pet_lock);
+
+	BUF_INFO1(PERF_PET_THREAD, 3);
+
+	/* set values */
+	pet_timerid = timerid;
+	pet_actionid = actionid;
+
+	/* done */
+	IOLockUnlock(pet_lock);
+}
+
+/* make the thread run! */
+void
+kperf_pet_thread_go(void)
+{
+	/* Make the thread go */
+	IOLockWakeup(pet_lock, &pet_actionid, FALSE);
+}
+
+
+/* wait for the pet thread to finish a run */
+void
+kperf_pet_thread_wait(void)
+{
+	/* acquire the lock to ensure the thread is parked. */
+	IOLockLock(pet_lock);
+	IOLockUnlock(pet_lock);
+}
+
+/* keep the pet thread around while we run */
+int
+kperf_pet_init(void)
+{
+	kern_return_t rc;
+	thread_t t;
+
+	if( pet_thread != NULL )
+		return 0;
+
+	/* make the sync point */
+	pet_lock = IOLockAlloc();
+	if( pet_lock == NULL )
+		return ENOMEM;
+
+	/* create the thread */
+	BUF_INFO1(PERF_PET_THREAD, 0);
+	rc = kernel_thread_start( pet_thread_loop, NULL, &t );
+	if( rc != KERN_SUCCESS )
+	{
+		IOLockFree( pet_lock );
+		pet_lock = NULL;
+		return ENOMEM;
+	}
+
+	/* OK! */
+	return 0;
+}
diff --git a/osfmk/kperf/pet.h b/osfmk/kperf/pet.h
new file mode 100644
index 000000000..9ffa736b5
--- /dev/null
+++ b/osfmk/kperf/pet.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2011 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +/* create the pet thread */ +extern int kperf_pet_init(void); + +/* Kick the pet thread so it runs a sample of all threads */ +extern void kperf_pet_thread_go(void); + +/* ensure the pet thread has stopped sampling */ +extern void kperf_pet_thread_wait(void); + +/* tell pet the timer parameters */ +extern void kperf_pet_timer_config( unsigned timerid, unsigned actionid ); diff --git a/libsyscall/custom/__psynch_cvbroad.s b/osfmk/kperf/sample.h similarity index 79% rename from libsyscall/custom/__psynch_cvbroad.s rename to osfmk/kperf/sample.h index 037fcfc07..5a871214e 100644 --- a/libsyscall/custom/__psynch_cvbroad.s +++ b/osfmk/kperf/sample.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2007 Apple Inc. All rights reserved. + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,16 +25,17 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* Copyright (c) 1992 NeXT Computer, Inc. All rights reserved. */ -#include "SYS.h" +// what goes in a sample -#define __SYSCALL_32BIT_ARG_BYTES 36 +#include "threadinfo.h" +#include "callstack.h" -#if defined(__i386__) || defined(__x86_64__) +struct kperf_sample +{ + struct threadinfo threadinfo; + struct tinfo_ex tinfo_ex; + struct callstack kcallstack; + struct callstack ucallstack; +}; -__SYSCALL(___psynch_cvbroad, psynch_cvbroad, 8) - -#else -#error Unsupported architecture -#endif diff --git a/osfmk/kperf/threadinfo.c b/osfmk/kperf/threadinfo.c new file mode 100644 index 000000000..88388c3c7 --- /dev/null +++ b/osfmk/kperf/threadinfo.c @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +/* Sample thread data */ + +#include +#include /* thread_* */ +#include /* panic */ +// #include + +#include +#include + +#include +#include +#include +#include + +// kAppleProfileTriggerClientThreadModeIdle = 0x40, // TH_IDLE +// #define TH_IDLE 0x40 + +//kAppleProfileTriggerClientThreadModeNotIdle = kAppleProfileTriggerClientThreadModeIdle << 16, // !TH_IDLE +#define TH_IDLE_N (TH_IDLE << 16) + +static uint64_t +make_runmode(thread_t thread) +{ + /* CEG: This is a translation of + * AppleProfileGetRunModeOfThread below... 
kinda magic :/
+	 */
+	const int mode = chudxnu_thread_get_scheduler_state(thread);
+
+#if !TARGET_OS_EMBEDDED
+	if( 0 == mode)
+	{
+		return (chudxnu_thread_get_idle(thread) ? TH_IDLE : TH_IDLE_N);
+	}
+	else
+#endif
+	// Today we happen to know there's a one-to-one mapping.
+	return ((mode & 0xffff) | ((~mode & 0xffff) << 16));
+}
+
+
+/* code to collect current thread info */
+void
+kperf_threadinfo_sample(struct threadinfo *ti, struct kperf_context *context)
+{
+	thread_t cur_thread = context->cur_thread;
+	BUF_INFO1( PERF_TI_SAMPLE, (uintptr_t)cur_thread );
+
+	// fill out the fields
+	ti->pid = context->cur_pid;
+	ti->tid = thread_tid(cur_thread);
+	ti->dq_addr = thread_dispatchqaddr(cur_thread);
+	ti->runmode = make_runmode(cur_thread);
+}
+
+/* log an existing sample into the buffer */
+void
+kperf_threadinfo_log(struct threadinfo *ti)
+{
+	/* XXX: K64 only? */
+	BUF_DATA( PERF_TI_DATA, ti->pid, ti->tid, ti->dq_addr, ti->runmode );
+}
+
+/* 'extra' thread-info functions that are deferred 'til thread-context
+ * time
+ */
+void
+kperf_threadinfo_extra_sample(struct tinfo_ex *tex, struct kperf_context *context)
+{
+	thread_t cur_thread = context->cur_thread;
+	uint32_t t_chud;
+
+	/* can only pend on the current thread */
+	/* this is valid from PET mode... */
+	/*
+	if( cur_thread != chudxnu_current_thread() )
+		panic("pending to non-current thread");
+	*/
+
+	/* get our current bits */
+	t_chud = kperf_get_thread_bits(cur_thread);
+
+	/* check if there's anything for us to do */
+	if( t_chud & T_AST_NAME )
+	{
+		BUF_INFO1( PERF_TI_XSAMPLE, (uintptr_t)cur_thread );
+
+		/* get the name out */
+#ifdef FIXME
+		/* need kperfbsd.c? */
+		proc_name( context->cur_pid,
+		           &tex->p_comm[0], CHUD_MAXPCOMM );
+#endif
+
+		/* mark that it's done */
+		t_chud &= ~T_AST_NAME;
+		t_chud |= T_NAME_DONE;
+
+		kperf_set_thread_bits(cur_thread, t_chud);
+	}
+	else
+		/* empty string */
+		tex->p_comm[0] = '\0';
+
+}
+
+/* log it if there's anything useful there */
+void
+kperf_threadinfo_extra_log(struct tinfo_ex *tex)
+{
+	/* no data */
+	if( tex->p_comm[0] == '\0' )
+		return;
+
+	/* FIXME: log more */
+	BUF_DATA1( PERF_TI_XDATA, (uintptr_t)*(uintptr_t*)&tex->p_comm[0] );
+}
+
+/* pend a flag on a thread */
+int
+kperf_threadinfo_extra_pend(struct kperf_context *context)
+{
+	return kperf_ast_pend( context->cur_thread, T_NAME_DONE | T_AST_NAME,
+	                       T_AST_NAME );
+}
+
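make_runmode() above packs the scheduler state into one word: the low 16 bits carry the state bits that are set, the high 16 bits carry the ones that are clear, so a single mask test answers both "is X" and "is not X" filters. A standalone worked example of the same arithmetic (the bit values here are hypothetical stand-ins, not the kernel's TH_* masks):

```c
#include <stdint.h>
#include <stdio.h>

#define X_WAIT 0x01   /* hypothetical "blocked" bit */
#define X_RUN  0x04   /* hypothetical "runnable" bit */

/* Same encoding as make_runmode(): set bits low, cleared bits high. */
static uint32_t encode_runmode(uint32_t mode)
{
	return (mode & 0xffff) | ((~mode & 0xffff) << 16);
}

int main(void)
{
	uint32_t m = encode_runmode(X_RUN);     /* runnable, not blocked */

	printf("encoded: 0x%08x\n", m);                              /* 0xfffb0004 */
	printf("matches RUNNABLE:    %d\n", !!(m & X_RUN));          /* 1 */
	printf("matches NOT BLOCKED: %d\n", !!(m & (X_WAIT << 16))); /* 1 */
	printf("matches BLOCKED:     %d\n", !!(m & X_WAIT));         /* 0 */
	return 0;
}
```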
+
+#if 0
+
+/* translated from the APF */
+
+APTIAKernelEntry_t *threadInfo = (APTIAKernelEntry_t*)(threadInfos + account->offset);
+
+context->timeStamp = mach_absolute_time();
+context->cpuNum = chudxnu_cpu_number();
+
+// record the process info from the callback context
+context->pid = chudxnu_current_pid();
+threadInfo->pid = context->generic->pid;
+
+// thread_tid is a thread_t to ID function in the kernel
+context->threadID = chudxnu_current_thread();
+threadInfo->tid = thread_tid(context->generic->threadID);
+
+// also a kernel function
+threadInfo->dispatch_queue_addr = thread_dispatchqaddr(context->generic->threadID);
+
+// see below
+threadInfo->runMode = AppleProfileGetRunModeOfThread(context->generic->threadID);
+
+
+/****** WTF is this?! *******/
+
+/*!enum AppleProfileTriggerClientThreadRunMode
+ *
+ * Specifies the thread mode in which to record samples.
+ */
+typedef enum { // Target Thread State - can be OR'd
+	// Basic Building Blocks:
+	// for Time Profile, use kAppleProfileTriggerClientThreadModeRunning (optionally with kAppleProfileTriggerClientThreadModeNotIdle).
+	// for Time Profile (All Thread States), use kAppleProfileTriggerClientThreadModeAny (or just don't specify any thread mode filters).
+	// for Time Profile (Blocked Threads), use kAppleProfileTriggerClientThreadModeBlocked.
+	// etc...
+
+	kAppleProfileTriggerClientThreadModeNone = 0x0,
+
+	kAppleProfileTriggerClientThreadModeRunning = 0x1, // On a core
+	kAppleProfileTriggerClientThreadModeRunnable = 0x2, // TH_RUN
+	kAppleProfileTriggerClientThreadModeBlocked = 0x4, // TH_WAIT
+	kAppleProfileTriggerClientThreadModeUninterruptible = 0x8, // TH_UNINT
+	kAppleProfileTriggerClientThreadModeSuspended = 0x10, // TH_SUSP
+	kAppleProfileTriggerClientThreadModeTerminating = 0x20, // TH_TERMINATE
+	kAppleProfileTriggerClientThreadModeIdle = 0x40, // TH_IDLE
+
+	kAppleProfileTriggerClientThreadModeNotRunning = kAppleProfileTriggerClientThreadModeRunning << 16, // Not on a core
+	kAppleProfileTriggerClientThreadModeNotRunnable = kAppleProfileTriggerClientThreadModeRunnable << 16, // !TH_RUN
+	kAppleProfileTriggerClientThreadModeNotBlocked = kAppleProfileTriggerClientThreadModeBlocked << 16, // !TH_WAIT
+	kAppleProfileTriggerClientThreadModeNotUninterruptible = kAppleProfileTriggerClientThreadModeUninterruptible << 16, // !TH_UNINT
+	kAppleProfileTriggerClientThreadModeNotSuspended = kAppleProfileTriggerClientThreadModeSuspended << 16, // !TH_SUSP
+	kAppleProfileTriggerClientThreadModeNotTerminating = kAppleProfileTriggerClientThreadModeTerminating << 16, // !TH_TERMINATE
+	kAppleProfileTriggerClientThreadModeNotIdle = kAppleProfileTriggerClientThreadModeIdle << 16, // !TH_IDLE
+
+	kAppleProfileTriggerClientThreadModeAny = ( kAppleProfileTriggerClientThreadModeRunning
+	                                          | kAppleProfileTriggerClientThreadModeNotRunning),
+} AppleProfileTriggerClientThreadRunMode;
+
+extern "C" AppleProfileTriggerClientThreadRunMode AppleProfileGetRunModeOfThread(thread_t thread) {
+	const int mode = chudxnu_thread_get_scheduler_state(thread);
+
+#if !TARGET_OS_EMBEDDED
+	if (0 == mode) {
+		return (chudxnu_thread_get_idle(thread) ? kAppleProfileTriggerClientThreadModeIdle : kAppleProfileTriggerClientThreadModeNotIdle);
+	} else
+#endif
+	return (AppleProfileTriggerClientThreadRunMode)((mode & 0xffff) | ((~mode & 0xffff) << 16)); // Today we happen to know there's a one-to-one mapping.
+}
+
+#endif
diff --git a/osfmk/kperf/threadinfo.h b/osfmk/kperf/threadinfo.h
new file mode 100644
index 000000000..e7bcaafb2
--- /dev/null
+++ b/osfmk/kperf/threadinfo.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2011 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef __AP_THREADINFO_H__ +#define __AP_THREADINFO_H__ + +/* 'live' threadinfo */ +struct threadinfo +{ + uint64_t pid; + uint64_t tid; + uint64_t dq_addr; + uint64_t runmode; +}; + +/* extra info we sample out of bounds */ +#define CHUD_MAXPCOMM 16 /* copy from kernel somewhere :P */ +struct tinfo_ex +{ + char p_comm[CHUD_MAXPCOMM+1]; /* XXX: 16 + 1 */ +}; + +struct kperf_context; +extern void kperf_threadinfo_sample(struct threadinfo *ti, struct kperf_context *); +extern void kperf_threadinfo_log(struct threadinfo *ti); + +extern void kperf_threadinfo_extra_sample(struct tinfo_ex *, struct kperf_context *); +extern int kperf_threadinfo_extra_pend(struct kperf_context *); +extern void kperf_threadinfo_extra_log(struct tinfo_ex *); + +#endif /* __AP_THREADINFO_H__ */ diff --git a/osfmk/kperf/timetrigger.c b/osfmk/kperf/timetrigger.c new file mode 100644 index 000000000..643d63cd9 --- /dev/null +++ b/osfmk/kperf/timetrigger.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/* Manage time triggers */
+
+#include
+#include /* current_thread() */
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* represents a periodic timer */
+struct time_trigger
+{
+	struct timer_call tcall;
+	uint64_t period;
+	unsigned actionid;
+	volatile unsigned active;
+};
+
+/* the list of timers */
+static unsigned timerc = 0;
+static struct time_trigger *timerv;
+static unsigned pet_timer = 999;
+
+/* maximum number of timers we can construct */
+#define TIMER_MAX 16
+
+/* minimal interval for a timer (100usec in nsec) */
+#define MIN_TIMER (100000)
+
+static void
+kperf_timer_schedule( struct time_trigger *trigger, uint64_t now )
+{
+	uint64_t deadline;
+
+	BUF_INFO1(PERF_TM_SCHED, trigger->period);
+
+	/* calculate deadline */
+	deadline = now + trigger->period;
+
+	/* re-schedule the timer, making sure we don't apply slop */
+	timer_call_enter( &trigger->tcall, deadline, TIMER_CALL_CRITICAL);
+}
+
+static void
+kperf_ipi_handler( void *param )
+{
+	int r;
+	struct kperf_sample *intbuf = NULL;
+	struct kperf_context ctx;
+	struct time_trigger *trigger = param;
+	task_t task = NULL;
+
+	BUF_INFO1(PERF_TM_HNDLR | DBG_FUNC_START, 0);
+
+	/* In an interrupt, get the interrupt buffer for this CPU */
+	intbuf = kperf_intr_sample_buffer();
+
+	/* On a timer, we can see the "real" current thread */
+	ctx.cur_pid = 0; /* remove this? */
+	ctx.cur_thread = current_thread();
+
+	task = chudxnu_task_for_thread(ctx.cur_thread);
+	if (task)
+		ctx.cur_pid = chudxnu_pid_for_task(task);
+
+	/* who fired */
+	ctx.trigger_type = TRIGGER_TYPE_TIMER;
+	ctx.trigger_id = (unsigned)(trigger-timerv); /* compute timer number */
+
+	/* call the action -- kernel-only from interrupt, pend user */
+	r = kperf_sample( intbuf, &ctx, trigger->actionid, TRUE );
+
+	BUF_INFO1(PERF_TM_HNDLR | DBG_FUNC_END, r);
+}
+
+static void
+kperf_timer_handler( void *param0, __unused void *param1 )
+{
+	struct time_trigger *trigger = param0;
+	unsigned ntimer = (unsigned)(trigger - timerv);
+
+	trigger->active = 1;
+
+	/* along the lines of do not ipi if we are all shutting down */
+	if( kperf_sampling_status() == KPERF_SAMPLING_SHUTDOWN )
+		goto deactivate;
+
+	/* ping all CPUs */
+	kperf_mp_broadcast( kperf_ipi_handler, trigger );
+
+	/* release the pet thread? */
+	if( ntimer == pet_timer )
+	{
+		/* timer re-enabled when thread done */
+		kperf_pet_thread_go();
+	}
+	else
+	{
+		/* re-enable the timer
+		 * FIXME: get the current time from elsewhere
+		 */
+		uint64_t now = mach_absolute_time();
+		kperf_timer_schedule( trigger, now );
+	}
+
+deactivate:
+	trigger->active = 0;
+}
+
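kperf_timer_schedule() computes deadline = now + period directly in mach absolute-time units, so whoever programs a period (see kperf_timer_set_period() below) must already have scaled from wall-clock units. A hedged sketch of that conversion using the public timebase API; the helper names are illustrative, not part of kperf:

```c
#include <mach/mach_time.h>
#include <stdint.h>

/* Convert a nanosecond period into mach absolute-time units: the timebase
 * reports abs-to-ns as numer/denom, so the inverse scale is denom/numer. */
static uint64_t period_ns_to_abs(uint64_t ns)
{
	mach_timebase_info_data_t tb;

	mach_timebase_info(&tb);
	return ns * tb.denom / tb.numer;
}

/* The deadline arithmetic kperf_timer_schedule() performs. */
static uint64_t next_deadline(uint64_t period_abs)
{
	return mach_absolute_time() + period_abs;
}
```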
+/* program the timer from the pet thread */
+int
+kperf_timer_pet_set( unsigned timer )
+{
+	uint64_t now;
+	struct time_trigger *trigger = NULL;
+
+	if( timer != pet_timer )
+		panic( "PET setting with bogus ID\n" );
+
+	if( timer >= timerc )
+		return EINVAL;
+
+	/* CHECKME: we probably took so damn long in the PET thread,
+	 * it makes sense to take the time again.
+	 */
+	now = mach_absolute_time();
+	trigger = &timerv[timer];
+
+	/* reprogram */
+	kperf_timer_schedule( trigger, now );
+
+	return 0;
+}
+
+
+/* turn on all the timers */
+extern int
+kperf_timer_go(void)
+{
+	unsigned i;
+	uint64_t now = mach_absolute_time();
+
+	for( i = 0; i < timerc; i++ )
+	{
+		if( timerv[i].period == 0 )
+			continue;
+
+		kperf_timer_schedule( &timerv[i], now );
+	}
+
+	return 0;
+}
+
+
+extern int
+kperf_timer_stop(void)
+{
+	unsigned i;
+
+	for( i = 0; i < timerc; i++ )
+	{
+		if( timerv[i].period == 0 )
+			continue;
+
+		while (timerv[i].active)
+			;
+
+		timer_call_cancel( &timerv[i].tcall );
+	}
+
+	/* wait for PET to stop, too */
+	kperf_pet_thread_wait();
+
+	return 0;
+}
+
+unsigned
+kperf_timer_get_petid(void)
+{
+	return pet_timer;
+}
+
+int
+kperf_timer_set_petid(unsigned timerid)
+{
+	struct time_trigger *trigger = NULL;
+
+	/* they can program whatever... */
+	pet_timer = timerid;
+
+	/* clear them if it's a bogus ID */
+	if( pet_timer >= timerc )
+	{
+		kperf_pet_timer_config( 0, 0 );
+
+		return 0;
+	}
+
+	/* update the values */
+	trigger = &timerv[pet_timer];
+	kperf_pet_timer_config( pet_timer, trigger->actionid );
+
+	return 0;
+}
+
+int
+kperf_timer_get_period( unsigned timer, uint64_t *period )
+{
+	printf( "get timer %u / %u\n", timer, timerc );
+
+	if( timer >= timerc )
+		return EINVAL;
+
+	*period = timerv[timer].period;
+
+	return 0;
+}
+
+int
+kperf_timer_set_period( unsigned timer, uint64_t period )
+{
+	printf( "set timer %u\n", timer );
+
+	if( timer >= timerc )
+		return EINVAL;
+
+	if( period < MIN_TIMER )
+		period = MIN_TIMER;
+
+	timerv[timer].period = period;
+
+	/* FIXME: re-program running timers? */
+
+	return 0;
+}
+
+unsigned
+kperf_timer_get_count(void)
+{
+	return timerc;
+}
+
+static void
+setup_timer_call( struct time_trigger *trigger )
+{
+	timer_call_setup( &trigger->tcall, kperf_timer_handler, trigger );
+}
+
+extern int
+kperf_timer_set_count(unsigned count)
+{
+	struct time_trigger *new_timerv = NULL, *old_timerv = NULL;
+	unsigned old_count, i;
+
+	/* easy no-op */
+	if( count == timerc )
+	{
+		printf( "already got %d timers\n", timerc );
+		return 0;
+	}
+
+	/* TODO: allow shrinking? */
+	if( count < timerc )
+		return EINVAL;
+
+	/* cap it for good measure */
+	if( count > TIMER_MAX )
+		return EINVAL;
+
+	/* creating the action array for the first time. create a few
+	 * more things, too.
+	 */
+	if( timerc == 0 )
+	{
+		int r;
+
+		/* main kperf */
+		r = kperf_init();
+		if( r )
+			return r;
+
+		/* get the PET thread going */
+		r = kperf_pet_init();
+		if( r )
+			return r;
+	}
+
+	/* create a new array */
+	new_timerv = kalloc( count * sizeof(*new_timerv) );
+	if( new_timerv == NULL )
+		return ENOMEM;
+
+	old_timerv = timerv;
+	old_count = timerc;
+
+	if( old_timerv != NULL )
+		bcopy( timerv, new_timerv, timerc * sizeof(*timerv) );
+
+	/* zero the new entries */
+	bzero( &new_timerv[timerc], (count - old_count) * sizeof(*new_timerv) );
+
+	/* setup the timer call info */
+	for( i = old_count; i < count; i++ )
+		setup_timer_call( &new_timerv[i] );
+
+	timerv = new_timerv;
+	timerc = count;
+
+	if( old_timerv != NULL )
+		kfree( old_timerv, old_count * sizeof(*timerv) );
+
+	printf( "kperf: done timer alloc, timerc %d\n", timerc );
+
+	return 0;
+}
diff --git a/osfmk/kperf/timetrigger.h b/osfmk/kperf/timetrigger.h
new file mode 100644
index 000000000..b0d67b00e
--- /dev/null
+++ b/osfmk/kperf/timetrigger.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2011 Apple Computer, Inc. All rights reserved.
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +// extern uint64_t timer_period; +extern void kperf_timer_reprogram(void); +extern void kperf_timer_reprogram_all(void); + + +// return values from the action +#define TIMER_REPROGRAM (0) +#define TIMER_STOP (1) + +/* blah */ +extern unsigned kperf_timer_get_count(void); +extern int kperf_timer_set_count(unsigned count); + +extern int kperf_timer_get_period( unsigned timer, uint64_t *period ); +extern int kperf_timer_set_period( unsigned timer, uint64_t period ); + +extern int kperf_timer_go(void); +extern int kperf_timer_stop(void); + +extern unsigned kperf_timer_get_petid(void); +extern int kperf_timer_set_petid(unsigned count); + +/* so PET thread can re-arm the timer */ +extern int kperf_timer_pet_set( unsigned timer ); diff --git a/osfmk/kperf/x86_64/kperf_arch.h b/osfmk/kperf/x86_64/kperf_arch.h new file mode 100644 index 000000000..7d361c768 --- /dev/null +++ b/osfmk/kperf/x86_64/kperf_arch.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* arch-dependent wrapper for kperf */ + diff --git a/osfmk/kperf/x86_64/kperf_mp.c b/osfmk/kperf/x86_64/kperf_mp.c new file mode 100644 index 000000000..d4a1e8b99 --- /dev/null +++ b/osfmk/kperf/x86_64/kperf_mp.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +#include + +int +kperf_mp_broadcast( void (*func)(void*), void *arg ) +{ + mp_cpus_call( CPUMASK_ALL, ASYNC, func, arg ); + + return 0; +} diff --git a/osfmk/mach/Makefile b/osfmk/mach/Makefile index 770208eaa..b435d9794 100644 --- a/osfmk/mach/Makefile +++ b/osfmk/mach/Makefile @@ -30,7 +30,6 @@ MIG_DEFS = \ host_notify_reply.defs \ host_priv.defs \ host_security.defs \ - ledger.defs \ lock_set.defs \ mach_exc.defs \ mach_host.defs \ @@ -74,7 +73,6 @@ MIG_UUHDRS = \ clock_priv.h \ host_priv.h \ host_security.h \ - ledger.h \ lock_set.h \ mach_host.h \ mach_port.h \ @@ -128,7 +126,6 @@ DATAFILES = \ sync_policy.h \ syscall_sw.h \ task_info.h \ - task_ledger.h \ task_policy.h \ task_special_ports.h \ thread_info.h \ @@ -251,7 +248,6 @@ MIG_KSHDRS = \ exc_server.h \ host_priv_server.h \ host_security_server.h \ - ledger_server.h \ lock_set_server.h \ mach_exc_server.h \ mach_host_server.h \ @@ -277,7 +273,6 @@ MIG_KSSRC = \ exc_server.c \ host_priv_server.c \ host_security_server.c \ - ledger_server.c \ lock_set_server.c \ mach_exc_server.c \ mach_host_server.c \ diff --git a/osfmk/mach/Makefile.template b/osfmk/mach/Makefile.template index 8de510721..fa15c23a1 100644 --- a/osfmk/mach/Makefile.template +++ b/osfmk/mach/Makefile.template @@ -32,8 +32,6 @@ CLOCK_REPLY_FILES = clock_reply.h clock_reply_user.c BOOTSTRAP_FILES = bootstrap_server.h bootstrap_server.c -LEDGER_FILES = ledger_user.c ledger_server.h ledger_server.c - SYNC_FILES = sync_server.h sync_server.c MACH_USER_FILES = mach_user.h mach_user.c @@ -41,7 +39,7 @@ MACH_USER_FILES = mach_user.h mach_user.c OTHERS = ${MACH_FILES} ${MACH_PORT_FILES} \ ${EXC_FILES} ${MACH_EXC_FILES} \ ${MEMORY_OBJECT_FILES} ${MEMORY_OBJECT_DEFAULT_FILES} \ - ${PROF_FILES} ${MACH_HOST_FILES} ${LEDGER_FILES} \ + ${PROF_FILES} ${MACH_HOST_FILES} \ ${CLOCK_FILES} ${CLOCK_REPLY_FILES} ${BOOTSTRAP_FILES} \ 
${BOOTSTRAP_FILES} ${SYNC_FILES} \ ${MACH_USER_FILES} @@ -150,16 +148,6 @@ ${BOOTSTRAP_FILES}: mach/bootstrap.defs -server bootstrap_server.c \ $< -.ORDER: ${LEDGER_FILES} - -${LEDGER_FILES}: mach/ledger.defs ${MACH_TYPES_DEFS} - ${_MIG_} ${_MIGFLAGS_} ${MIGKSFLAGS} ${MIGKUFLAGS} \ - -header /dev/null \ - -user ledger_user.c \ - -sheader ledger_server.h \ - -server ledger_server.c \ - $< - .ORDER: ${SYNC_FILES} ${SYNC_FILES}: mach/sync.defs diff --git a/osfmk/mach/exception_types.h b/osfmk/mach/exception_types.h index b0631f16b..96de421d2 100644 --- a/osfmk/mach/exception_types.h +++ b/osfmk/mach/exception_types.h @@ -95,6 +95,9 @@ #define EXC_CRASH 10 /* Abnormal process exit */ +#define EXC_RESOURCE 11 /* Hit resource consumption limit */ + /* Exact resource is in code field. */ + /* * Machine-independent exception behaviors */ @@ -131,6 +134,7 @@ #define EXC_MASK_MACH_SYSCALL (1 << EXC_MACH_SYSCALL) #define EXC_MASK_RPC_ALERT (1 << EXC_RPC_ALERT) #define EXC_MASK_CRASH (1 << EXC_CRASH) +#define EXC_MASK_RESOURCE (1 << EXC_RESOURCE) #define EXC_MASK_ALL (EXC_MASK_BAD_ACCESS | \ EXC_MASK_BAD_INSTRUCTION | \ @@ -141,6 +145,7 @@ EXC_MASK_SYSCALL | \ EXC_MASK_MACH_SYSCALL | \ EXC_MASK_RPC_ALERT | \ + EXC_MASK_RESOURCE | \ EXC_MASK_MACHINE) #ifdef KERNEL_PRIVATE @@ -153,9 +158,13 @@ * Machine independent codes for EXC_SOFTWARE * Codes 0x10000 - 0x1FFFF reserved for OS emulation (Unix) * 0x10000 - 0x10002 in use for unix signals + * 0x20000 - 0x2FFFF reserved for MACF */ #define EXC_SOFT_SIGNAL 0x10003 /* Unix signal exceptions */ +#define EXC_MACF_MIN 0x20000 /* MACF exceptions */ +#define EXC_MACF_MAX 0x2FFFF + #ifndef ASSEMBLER #include diff --git a/osfmk/mach/host_priv.defs b/osfmk/mach/host_priv.defs index 3be39868b..ac4997b22 100644 --- a/osfmk/mach/host_priv.defs +++ b/osfmk/mach/host_priv.defs @@ -265,18 +265,7 @@ routine host_swap_exception_ports( out old_behaviors : exception_behavior_array_t, SameCount; out old_flavors : exception_flavor_array_t, SameCount); -/* - * Loads a symbol table for an external file into the kernel debugger. - * The symbol table data is an array of characters. It is assumed that - * the caller and the kernel debugger agree on its format. - * This call is only supported in MACH_DEBUG and MACH_KDB kernels, - * otherwise KERN_FAILURE is returned. 
- */ -routine host_load_symbol_table( - host : host_priv_t; - task : task_t; - name : symtab_name_t; - symtab : pointer_t); +skip; /* old host_load_symbol_table */ /* * Specify that the range of the virtual address space diff --git a/osfmk/mach/host_special_ports.h b/osfmk/mach/host_special_ports.h index 755327c5c..4b2037256 100644 --- a/osfmk/mach/host_special_ports.h +++ b/osfmk/mach/host_special_ports.h @@ -90,7 +90,8 @@ #define HOST_CHUD_PORT (9 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_UNFREED_PORT (10 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_AMFID_PORT (11 + HOST_MAX_SPECIAL_KERNEL_PORT) -#define HOST_MAX_SPECIAL_PORT (12 + HOST_MAX_SPECIAL_KERNEL_PORT) +#define HOST_GSSD_PORT (12 + HOST_MAX_SPECIAL_KERNEL_PORT) +#define HOST_MAX_SPECIAL_PORT (13 + HOST_MAX_SPECIAL_KERNEL_PORT) /* room to grow here as well */ /* @@ -177,4 +178,11 @@ #define host_set_amfid_port(host, port) \ (host_set_special_port((host), HOST_AMFID_PORT, (port))) +#define host_get_gssd_port(host, port) \ + (host_get_special_port((host), \ + HOST_LOCAL_NODE, HOST_GSSD_PORT, (port))) + +#define host_set_gssd_port(host, port) \ + (host_set_special_port((host), HOST_GSSD_PORT, (port))) + #endif /* _MACH_HOST_SPECIAL_PORTS_H_ */ diff --git a/osfmk/mach/i386/exception.h b/osfmk/mach/i386/exception.h index 77fe4e404..a9b4fbf04 100644 --- a/osfmk/mach/i386/exception.h +++ b/osfmk/mach/i386/exception.h @@ -63,7 +63,7 @@ * No machine dependent types for the 80386 */ -#define EXC_TYPES_COUNT 11 /* incl. illegal exception 0 */ +#define EXC_TYPES_COUNT 12 /* incl. illegal exception 0 */ /* * Codes and subcodes for 80386 exceptions. diff --git a/osfmk/mach/i386/machine_types.defs b/osfmk/mach/i386/machine_types.defs index 6a356154e..9830993b2 100644 --- a/osfmk/mach/i386/machine_types.defs +++ b/osfmk/mach/i386/machine_types.defs @@ -95,6 +95,8 @@ type vm_offset_t = natural_t; type vm_size_t = natural_t; #endif +type mach_port_context_t = uint64_t; + /* * The mach_vm_xxx_t types are sized to hold the * maximum pointer, offset, etc... supported on the diff --git a/osfmk/mach/i386/sdt_isa.h b/osfmk/mach/i386/sdt_isa.h index 503f5ce63..14549e810 100644 --- a/osfmk/mach/i386/sdt_isa.h +++ b/osfmk/mach/i386/sdt_isa.h @@ -41,7 +41,7 @@ */ #ifdef __x86_64__ #define DTRACE_LAB(p, n) \ - "__dtrace_probeDOLLAR" DTRACE_TOSTRING(%=__LINE__) DTRACE_STRINGIFY(_##p##___##n) + "__dtrace_probe$" DTRACE_TOSTRING(%=__LINE__) DTRACE_STRINGIFY(_##p##___##n) #define DTRACE_LABEL(p, n) \ ".section __DATA, __data\n\t" \ diff --git a/osfmk/mach/i386/vm_param.h b/osfmk/mach/i386/vm_param.h index fb2ca164f..12eb226e2 100644 --- a/osfmk/mach/i386/vm_param.h +++ b/osfmk/mach/i386/vm_param.h @@ -111,6 +111,7 @@ #define i386_btop(x) ((ppnum_t)((x) >> I386_PGSHIFT)) #define machine_btop(x) i386_btop(x) #define i386_ptob(x) (((pmap_paddr_t)(x)) << I386_PGSHIFT) +#define machine_ptob(x) i386_ptob(x) /* * Round off or truncate to the nearest page. 
These will work @@ -149,7 +150,7 @@ /* process-relative values (all 32-bit legacy only for now) */ #define VM_MIN_ADDRESS ((vm_offset_t) 0) -#define VM_USRSTACK32 ((vm_offset_t) 0xC0000000) +#define VM_USRSTACK32 ((vm_offset_t) 0xC0000000) /* ASLR slides stack down by up to 1 MB */ #define VM_MAX_ADDRESS ((vm_offset_t) 0xFFE00000) @@ -199,6 +200,10 @@ #define KEXT_ALLOC_BASE(x) ((x) - KEXT_ALLOC_MAX_OFFSET) #define KEXT_ALLOC_SIZE(x) (KEXT_ALLOC_MAX_OFFSET - (x)) +#define VM_KERNEL_IS_KEXT(_o) \ + (((vm_offset_t)(_o) >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) && \ + ((vm_offset_t)(_o) < VM_MIN_KERNEL_ADDRESS)) + #else #error unsupported architecture #endif @@ -218,8 +223,24 @@ #define VM32_MIN_ADDRESS ((vm32_offset_t) 0) #define VM32_MAX_ADDRESS ((vm32_offset_t) (VM_MAX_PAGE_ADDRESS & 0xFFFFFFFF)) +/* + * kalloc() parameters: + * + * Historically kalloc's underlying zones were power-of-2 sizes, with a + * KALLOC_MINSIZE of 16 bytes. The allocator ensured that + * (sizeof == alignof) >= 16 for all kalloc allocations. + * + * Today kalloc may use zones with intermediate sizes, constrained by + * KALLOC_MINSIZE and a minimum alignment, expressed by KALLOC_LOG2_MINALIGN. + * + * The common alignment for LP64 is for longs and pointers i.e. 8 bytes. + */ + #if defined(__i386__) +#define KALLOC_MINSIZE 16 /* minimum allocation size */ +#define KALLOC_LOG2_MINALIGN 4 /* log2 minimum alignment */ + #define LINEAR_KERNEL_ADDRESS ((vm_offset_t) 0x00000000) #define VM_MIN_KERNEL_LOADED_ADDRESS ((vm_offset_t) 0x00000000U) @@ -229,6 +250,9 @@ #elif defined(__x86_64__) +#define KALLOC_MINSIZE 16 /* minimum allocation size */ +#define KALLOC_LOG2_MINALIGN 4 /* log2 minimum alignment */ + #define LINEAR_KERNEL_ADDRESS ((vm_offset_t) 0x00000000) #define VM_MIN_KERNEL_LOADED_ADDRESS ((vm_offset_t) 0xFFFFFF8000000000UL) @@ -236,6 +260,7 @@ #define NCOPY_WINDOWS 0 + #else #error unsupported architecture #endif @@ -248,34 +273,19 @@ #define round_i386_to_vm(p) (atop(round_page(i386_ptob(p)))) #define vm_to_i386(p) (i386_btop(ptoa(p))) -#define PMAP_ENTER(pmap, virtual_address, page, protection, flags, wired) \ - MACRO_BEGIN \ - pmap_t __pmap = (pmap); \ - vm_page_t __page = (page); \ - vm_prot_t __prot__ = (protection); \ - \ - if (__pmap == kernel_pmap) { \ - __prot__ |= VM_PROT_WRITE; \ - } else { \ - assert(!__page->encrypted); \ - } \ - \ - pmap_enter( \ - __pmap, \ - (virtual_address), \ - __page->phys_page, \ - __prot__, \ - flags, \ - (wired) \ - ); \ - MACRO_END -#define PMAP_ENTER_OPTIONS(pmap, virtual_address, page, protection, \ - flags, wired, options, result) \ +#define PMAP_SET_CACHE_ATTR(mem, object, cache_attr, batch_pmap_op) \ + MACRO_BEGIN \ + pmap_set_cache_attributes((mem)->phys_page, (cache_attr)); \ + (object)->set_cache_attr = TRUE; \ + (void) batch_pmap_op; \ + MACRO_END + +#define PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op)\ MACRO_BEGIN \ - result=KERN_SUCCESS; \ - PMAP_ENTER(pmap, virtual_address, page, protection, \ - flags, wired); \ + (void) user_page_list; \ + (void) num_pages; \ + (void) batch_pmap_op; \ MACRO_END #define IS_USERADDR64_CANONICAL(addr) \ diff --git a/osfmk/mach/i386/vm_types.h b/osfmk/mach/i386/vm_types.h index ef737ac4d..ecdc42070 100644 --- a/osfmk/mach/i386/vm_types.h +++ b/osfmk/mach/i386/vm_types.h @@ -129,6 +129,8 @@ typedef uint64_t vm_map_offset_t; typedef uint64_t vm_map_address_t; typedef uint64_t vm_map_size_t; +typedef mach_vm_address_t mach_port_context_t; + #ifdef MACH_KERNEL_PRIVATE #if VM32_SUPPORT diff --git 
a/osfmk/mach/kmod.h b/osfmk/mach/kmod.h
index a5c5587ff..99449b7bf 100644
--- a/osfmk/mach/kmod.h
+++ b/osfmk/mach/kmod.h
@@ -168,6 +168,15 @@ typedef struct kmod_info_64_v1 {
 /* Implementation now in libkern/OSKextLib.cpp. */
 extern void kmod_panic_dump(vm_offset_t * addr, unsigned int dump_cnt);
 
+#if CONFIG_DTRACE
+/*
+ * DTrace can take a flag indicating whether it should instrument
+ * probes immediately based on kernel symbols. This per kext
+ * flag overrides system mode in dtrace_modload().
+ */
+#define KMOD_DTRACE_FORCE_INIT 0x01
+#endif /* CONFIG_DTRACE */
+
 #endif /* KERNEL_PRIVATE */
diff --git a/osfmk/mach/ledger.defs b/osfmk/mach/ledger.defs
index 76367a990..97aa09dbf 100644
--- a/osfmk/mach/ledger.defs
+++ b/osfmk/mach/ledger.defs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2010 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -45,35 +45,4 @@ userprefix r_;
 #include
 #include
 
-/*
- * Create a subordinate ledger
- */
-routine ledger_create(
-	parent_ledger : ledger_t;
-	ledger_ledger : ledger_t;
-	out new_ledger : ledger_t;
-	transfer : ledger_item_t);
-
-/*
- * Destroy a ledger
- */
-routine ledger_terminate(
-	ledger : ledger_t);
-
-/*
- * Transfer resources from a parent ledger to a child
- */
-routine ledger_transfer(
-	parent_ledger : ledger_t;
-	child_ledger : ledger_t;
-	transfer : ledger_item_t);
-
-/*
- * Return the ledger limit and balance
- */
-routine ledger_read(
-	ledger : ledger_t;
-	out balance : ledger_item_t;
-	out limit : ledger_item_t);
-
 /* vim: set ft=c : */
diff --git a/osfmk/mach/mach_host.defs b/osfmk/mach/mach_host.defs
index 536cdce83..184d1349b 100644
--- a/osfmk/mach/mach_host.defs
+++ b/osfmk/mach/mach_host.defs
@@ -172,7 +172,7 @@ routine kmod_get_info(
 * DEPRECATED! Use mach_zone_info() instead.
 */
 routine host_zone_info(
-	host : host_t;
+	host : host_priv_t;
	out names : zone_name_array_t,
	Dealloc;
	out info : zone_info_array_t,
@@ -188,16 +188,8 @@ routine host_virtual_physical_table_info(
	out info : hash_info_bucket_array_t,
	Dealloc);
 
-/*
- * Returns information about the global reverse hash table.
- * This call is only valid on MACH_IPC_DEBUG kernels.
- * Otherwise, KERN_FAILURE is returned.
- */
-routine host_ipc_hash_info(
-	host : host_t;
-	out info : hash_info_bucket_array_t,
-	Dealloc);
+skip; /* was host_ipc_hash_info */
 
 skip; /* was enable_bluebox */
 skip; /* was disable_bluebox */
@@ -265,11 +257,22 @@ routine host_statistics64(
 * address space sizes (unlike host_zone_info()).
 */
 routine mach_zone_info(
-	host : host_t;
+	host : host_priv_t;
	out names : mach_zone_name_array_t,
	Dealloc;
	out info : mach_zone_info_array_t,
	Dealloc);
 
+#ifdef PRIVATE
+/*
+ * Forces a zone allocator garbage collection pass.
+ * Pages with no in-use allocations are returned to
+ * the VM system for re-use.
+ */
+routine mach_zone_force_gc(
+	host : host_t);
+#else
+skip;
+#endif
 
/* vim: set ft=c : */
diff --git a/osfmk/mach/mach_interface.h b/osfmk/mach/mach_interface.h
index 576f4119d..12218bd01 100644
--- a/osfmk/mach/mach_interface.h
+++ b/osfmk/mach/mach_interface.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved.
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include diff --git a/osfmk/mach/mach_port.defs b/osfmk/mach/mach_port.defs index 6c612758d..2799790b6 100644 --- a/osfmk/mach/mach_port.defs +++ b/osfmk/mach/mach_port.defs @@ -68,6 +68,10 @@ subsystem #endif /* KERNEL_SERVER */ mach_port 3200; +#if !KERNEL && !LIBSYSCALL_INTERFACE + UserPrefix _kernelrpc_; +#endif + #include #include #include @@ -98,6 +102,9 @@ routine mach_port_type( * Changes the name by which a port (or port set) is known to * the target task. The new name can't be in use. The * old name becomes available for recycling. + * + * This interface is OBSOLETE and will always + * return KERN_NOT_SUPPORTED. */ routine mach_port_rename( @@ -475,7 +482,12 @@ routine mach_port_extract_member( routine mach_port_get_context( task : ipc_space_t; name : mach_port_name_t; - out context : mach_vm_address_t); +#ifdef LIBSYSCALL_INTERFACE + out context : mach_port_context_t +#else + out context : mach_vm_address_t +#endif + ); /* * Only valid for receive rights. @@ -485,7 +497,12 @@ routine mach_port_get_context( routine mach_port_set_context( task : ipc_space_t; name : mach_port_name_t; - context : mach_vm_address_t); +#ifdef LIBSYSCALL_INTERFACE + context : mach_port_context_t +#else + context : mach_vm_address_t +#endif + ); /* * Return the type and address of the kernel object diff --git a/osfmk/mach/mach_traps.h b/osfmk/mach/mach_traps.h index 37ab3277f..915bf9381 100644 --- a/osfmk/mach/mach_traps.h +++ b/osfmk/mach/mach_traps.h @@ -139,6 +139,8 @@ extern kern_return_t semaphore_timedwait_signal_trap( unsigned int sec, clock_res_t nsec); +#endif /* PRIVATE */ + extern kern_return_t clock_sleep_trap( mach_port_name_t clock_name, sleep_type_t sleep_type, @@ -146,7 +148,74 @@ extern kern_return_t clock_sleep_trap( int sleep_nsec, mach_timespec_t *wakeup_time); -#endif /* PRIVATE */ +extern kern_return_t _kernelrpc_mach_vm_allocate_trap( + mach_port_name_t target, + mach_vm_offset_t *addr, + mach_vm_size_t size, + int flags); + +extern kern_return_t _kernelrpc_mach_vm_deallocate_trap( + mach_port_name_t target, + mach_vm_address_t address, + mach_vm_size_t size +); + +extern kern_return_t _kernelrpc_mach_vm_protect_trap( + mach_port_name_t target, + mach_vm_address_t address, + mach_vm_size_t size, + boolean_t set_maximum, + vm_prot_t new_protection +); + +extern kern_return_t _kernelrpc_mach_port_allocate_trap( + mach_port_name_t target, + mach_port_right_t right, + mach_port_name_t *name +); + + +extern kern_return_t _kernelrpc_mach_port_destroy_trap( + mach_port_name_t target, + mach_port_name_t name +); + +extern kern_return_t _kernelrpc_mach_port_deallocate_trap( + mach_port_name_t target, + mach_port_name_t name +); + +extern kern_return_t _kernelrpc_mach_port_mod_refs_trap( + mach_port_name_t target, + mach_port_name_t name, + mach_port_right_t right, + mach_port_delta_t delta +); + +extern kern_return_t _kernelrpc_mach_port_move_member_trap( + mach_port_name_t target, + mach_port_name_t member, + mach_port_name_t after +); + +extern kern_return_t _kernelrpc_mach_port_insert_right_trap( + mach_port_name_t target, + mach_port_name_t name, + mach_port_name_t poly, + mach_msg_type_name_t polyPoly +); + +extern kern_return_t _kernelrpc_mach_port_insert_member_trap( + mach_port_name_t target, + mach_port_name_t name, + mach_port_name_t pset +); + +extern kern_return_t _kernelrpc_mach_port_extract_member_trap( + mach_port_name_t target, + mach_port_name_t name, + 
mach_port_name_t pset +); extern kern_return_t macx_swapon( uint64_t filename, @@ -300,7 +369,7 @@ extern mach_port_name_t host_self_trap( struct host_self_trap_args *args); struct mach_msg_overwrite_trap_args { - PAD_ARG_(mach_vm_address_t, msg); + PAD_ARG_(user_addr_t, msg); PAD_ARG_(mach_msg_option_t, option); PAD_ARG_(mach_msg_size_t, send_size); PAD_ARG_(mach_msg_size_t, rcv_size); @@ -308,7 +377,7 @@ struct mach_msg_overwrite_trap_args { PAD_ARG_(mach_msg_timeout_t, timeout); PAD_ARG_(mach_port_name_t, notify); PAD_ARG_8 - PAD_ARG_(mach_vm_address_t, rcv_msg); /* Unused on mach_msg_trap */ + PAD_ARG_(user_addr_t, rcv_msg); /* Unused on mach_msg_trap */ }; extern mach_msg_return_t mach_msg_trap( struct mach_msg_overwrite_trap_args *args); @@ -459,7 +528,7 @@ struct clock_sleep_trap_args{ PAD_ARG_(sleep_type_t, sleep_type); PAD_ARG_(int, sleep_sec); PAD_ARG_(int, sleep_nsec); - PAD_ARG_(mach_vm_address_t, wakeup_time); + PAD_ARG_(user_addr_t, wakeup_time); }; extern kern_return_t clock_sleep_trap( struct clock_sleep_trap_args *args); @@ -473,7 +542,7 @@ extern kern_return_t thread_switch( struct thread_switch_args *args); struct mach_timebase_info_trap_args { - PAD_ARG_(mach_vm_address_t, info); + PAD_ARG_(user_addr_t, info); }; extern kern_return_t mach_timebase_info_trap( struct mach_timebase_info_trap_args *args); @@ -505,11 +574,104 @@ extern kern_return_t mk_timer_arm_trap( struct mk_timer_cancel_trap_args { PAD_ARG_(mach_port_name_t, name); - PAD_ARG_(mach_vm_address_t, result_time); + PAD_ARG_(user_addr_t, result_time); }; extern kern_return_t mk_timer_cancel_trap( struct mk_timer_cancel_trap_args *args); +struct _kernelrpc_mach_vm_allocate_trap_args { + PAD_ARG_(mach_port_name_t, target); /* 1 word */ + PAD_ARG_(user_addr_t, addr); /* 1 word */ + PAD_ARG_(mach_vm_size_t, size); /* 2 words */ + PAD_ARG_(int, flags); /* 1 word */ +}; /* Total: 5 */ + +extern kern_return_t _kernelrpc_mach_vm_allocate_trap( + struct _kernelrpc_mach_vm_allocate_trap_args *args); + +struct _kernelrpc_mach_vm_deallocate_args { + PAD_ARG_(mach_port_name_t, target); /* 1 word */ + PAD_ARG_(mach_vm_address_t, address); /* 2 words */ + PAD_ARG_(mach_vm_size_t, size); /* 2 words */ +}; /* Total: 5 */ +extern kern_return_t _kernelrpc_mach_vm_deallocate_trap( + struct _kernelrpc_mach_vm_deallocate_args *args); + +struct _kernelrpc_mach_vm_protect_args { + PAD_ARG_(mach_port_name_t, target); /* 1 word */ + PAD_ARG_(mach_vm_address_t, address); /* 2 words */ + PAD_ARG_(mach_vm_size_t, size); /* 2 words */ + PAD_ARG_(boolean_t, set_maximum); /* 1 word */ + PAD_ARG_(vm_prot_t, new_protection); /* 1 word */ +}; /* Total: 7 */ +extern kern_return_t _kernelrpc_mach_vm_protect_trap( + struct _kernelrpc_mach_vm_protect_args *args); + +struct _kernelrpc_mach_port_allocate_args { + PAD_ARG_(mach_port_name_t, target); + PAD_ARG_(mach_port_right_t, right); + PAD_ARG_(user_addr_t, name); +}; +extern kern_return_t _kernelrpc_mach_port_allocate_trap( + struct _kernelrpc_mach_port_allocate_args *args); + + +struct _kernelrpc_mach_port_destroy_args { + PAD_ARG_(mach_port_name_t, target); + PAD_ARG_(mach_port_name_t, name); +}; +extern kern_return_t _kernelrpc_mach_port_destroy_trap( + struct _kernelrpc_mach_port_destroy_args *args); + +struct _kernelrpc_mach_port_deallocate_args { + PAD_ARG_(mach_port_name_t, target); + PAD_ARG_(mach_port_name_t, name); +}; +extern kern_return_t _kernelrpc_mach_port_deallocate_trap( + struct _kernelrpc_mach_port_deallocate_args *args); + +struct _kernelrpc_mach_port_mod_refs_args { + 
PAD_ARG_(mach_port_name_t, target); + PAD_ARG_(mach_port_name_t, name); + PAD_ARG_(mach_port_right_t, right); + PAD_ARG_(mach_port_delta_t, delta); +}; +extern kern_return_t _kernelrpc_mach_port_mod_refs_trap( + struct _kernelrpc_mach_port_mod_refs_args *args); + +struct _kernelrpc_mach_port_move_member_args { + PAD_ARG_(mach_port_name_t, target); + PAD_ARG_(mach_port_name_t, member); + PAD_ARG_(mach_port_name_t, after); +}; +extern kern_return_t _kernelrpc_mach_port_move_member_trap( + struct _kernelrpc_mach_port_move_member_args *args); + +struct _kernelrpc_mach_port_insert_right_args { + PAD_ARG_(mach_port_name_t, target); + PAD_ARG_(mach_port_name_t, name); + PAD_ARG_(mach_port_name_t, poly); + PAD_ARG_(mach_msg_type_name_t, polyPoly); +}; +extern kern_return_t _kernelrpc_mach_port_insert_right_trap( + struct _kernelrpc_mach_port_insert_right_args *args); + +struct _kernelrpc_mach_port_insert_member_args { + PAD_ARG_(mach_port_name_t, target); + PAD_ARG_(mach_port_name_t, name); + PAD_ARG_(mach_port_name_t, pset); +}; +extern kern_return_t _kernelrpc_mach_port_insert_member_trap( + struct _kernelrpc_mach_port_insert_member_args *args); + +struct _kernelrpc_mach_port_extract_member_args { + PAD_ARG_(mach_port_name_t, target); + PAD_ARG_(mach_port_name_t, name); + PAD_ARG_(mach_port_name_t, pset); +}; +extern kern_return_t _kernelrpc_mach_port_extract_member_trap( + struct _kernelrpc_mach_port_extract_member_args *args); + /* not published to LP64 clients yet */ struct iokit_user_client_trap_args { PAD_ARG_(void *, userClientRef); diff --git a/osfmk/mach/mach_types.defs b/osfmk/mach/mach_types.defs index 0f36eeec5..a013ff1ae 100644 --- a/osfmk/mach/mach_types.defs +++ b/osfmk/mach/mach_types.defs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -215,6 +215,8 @@ type thread_policy_t = array[*:16] of integer_t; * task audit token (8 ints) * dyld info (2 64-bit ints and 1 int) * task_extmod_info_t (8 64-bit ints) + * task_basic_info_64_2_t + * mach_task_basic_info_t (12 ints) * If other task_info flavors are added, this * definition may need to be changed. (See * mach/task_info.h and mach/policy.h) */ @@ -404,6 +406,9 @@ type ledger_t = mach_port_t type ledger_array_t = ^array[] of ledger_t; type ledger_item_t = integer_t; + /* DEPRECATED */ + +type ledger_amount_t = int64_t; type security_token_t = struct[2] of uint32_t; type audit_token_t = struct[8] of uint32_t; diff --git a/osfmk/mach/mach_types.h b/osfmk/mach/mach_types.h index e4e47f63a..9c9a1afbb 100644 --- a/osfmk/mach/mach_types.h +++ b/osfmk/mach/mach_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -260,8 +260,12 @@ typedef exception_handler_array_t exception_port_arrary_t; #define CLOCK_NULL ((clock_t) 0) #define UND_SERVER_NULL ((UNDServerRef) 0) -typedef natural_t ledger_item_t; -#define LEDGER_ITEM_INFINITY ((ledger_item_t) (~0)) +/* DEPRECATED */ +typedef natural_t ledger_item_t; +#define LEDGER_ITEM_INFINITY ((ledger_item_t) (~0)) + +typedef int64_t ledger_amount_t; +#define LEDGER_LIMIT_INFINITY ((ledger_amount_t)(((uint64_t)1 << 63) - 1)) typedef mach_vm_offset_t *emulation_vector_t; typedef char *user_subsystem_t; diff --git a/osfmk/mach/mach_vm.defs b/osfmk/mach/mach_vm.defs index ade3eaa61..f4793f8e0 100644 --- a/osfmk/mach/mach_vm.defs +++ b/osfmk/mach/mach_vm.defs @@ -76,6 +76,12 @@ subsystem #include #include +#if !KERNEL && !LIBSYSCALL_INTERFACE +#define PREFIX(NAME) _kernelrpc_ ## NAME +#else +#define PREFIX(NAME) NAME +#endif + /* * Allocate zero-filled memory in the address space * of the target task, either at the specified address, @@ -84,27 +90,50 @@ subsystem * allocation actually took place is returned. */ #if !defined(_MACH_VM_PUBLISH_AS_LOCAL_) -routine mach_vm_allocate( +routine PREFIX(mach_vm_allocate) ( + target : vm_task_entry_t; + inout address : mach_vm_address_t; + size : mach_vm_size_t; + flags : int); + #else -routine vm_allocate( -#endif + +#if !KERNEL && !LIBSYSCALL_INTERFACE +skip; +#else +routine PREFIX(vm_allocate) ( target : vm_task_entry_t; inout address : mach_vm_address_t; size : mach_vm_size_t; flags : int); +#endif + +#endif + + /* * Deallocate the specified range from the virtual * address space of the target virtual memory map. */ #if !defined(_MACH_VM_PUBLISH_AS_LOCAL_) -routine mach_vm_deallocate( +routine PREFIX(mach_vm_deallocate) ( + target : vm_task_entry_t; + address : mach_vm_address_t; + size : mach_vm_size_t); + #else -routine vm_deallocate( -#endif + +#if !KERNEL && !LIBSYSCALL_INTERFACE +skip; +#else +routine PREFIX(vm_deallocate) ( target : vm_task_entry_t; address : mach_vm_address_t; size : mach_vm_size_t); +#endif + +#endif /* * Set the current or maximum protection attribute @@ -117,16 +146,31 @@ routine vm_deallocate( * *permissions*. */ #if !defined(_MACH_VM_PUBLISH_AS_LOCAL_) -routine mach_vm_protect( +routine PREFIX(mach_vm_protect) ( + target_task : vm_task_entry_t; + address : mach_vm_address_t; + size : mach_vm_size_t; + set_maximum : boolean_t; + new_protection : vm_prot_t); + + #else -routine vm_protect( -#endif + +#if !KERNEL && !LIBSYSCALL_INTERFACE +skip; +#else + +routine PREFIX(vm_protect) ( target_task : vm_task_entry_t; address : mach_vm_address_t; size : mach_vm_size_t; set_maximum : boolean_t; new_protection : vm_prot_t); +#endif + +#endif + /* * Set the inheritance attribute for the specified range * of the virtual address space of the target address space. 
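The PREFIX() dance in the mach_vm.defs hunk above renames the user-side MIG stubs to _kernelrpc_mach_vm_allocate and friends, so Libsyscall can expose wrappers under the public names while the fast traps declared in mach_traps.h carry the common case. The usual shape of such a wrapper is to try the trap first and fall back to the full MIG routine when the trap cannot service the call (for example, a task port other than the caller's own). This is a sketch of that pattern, not the verbatim Libsyscall source; the _kernelrpc_ MIG stub declaration is internal to Libsyscall, and the wrapper name here is hypothetical:

```c
#include <mach/mach.h>

/* Sketch: public entry point prefers the trap, falls back to the MIG stub.
 * MACH_SEND_INVALID_DEST serves as the "trap can't handle this" bailout. */
kern_return_t
my_mach_vm_allocate(mach_port_name_t task, mach_vm_address_t *address,
                    mach_vm_size_t size, int flags)
{
	kern_return_t kr;

	kr = _kernelrpc_mach_vm_allocate_trap(task, address, size, flags);
	if (kr == MACH_SEND_INVALID_DEST)
		kr = _kernelrpc_mach_vm_allocate(task, address, size, flags);

	return kr;
}
```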
diff --git a/osfmk/mach/machine.h b/osfmk/mach/machine.h index fbbe30610..57a0c28e6 100644 --- a/osfmk/mach/machine.h +++ b/osfmk/mach/machine.h @@ -60,6 +60,8 @@ #ifndef _MACH_MACHINE_H_ #define _MACH_MACHINE_H_ +#ifndef __ASSEMBLER__ + #include #include #include @@ -348,6 +350,10 @@ __END_DECLS #define CPU_SUBTYPE_ARM_V5TEJ ((cpu_subtype_t) 7) #define CPU_SUBTYPE_ARM_XSCALE ((cpu_subtype_t) 8) #define CPU_SUBTYPE_ARM_V7 ((cpu_subtype_t) 9) +#define CPU_SUBTYPE_ARM_V7F ((cpu_subtype_t) 10) /* Cortex A9 */ +#define CPU_SUBTYPE_ARM_V7K ((cpu_subtype_t) 12) /* Kirkwood40 */ + +#endif /* !__ASSEMBLER__ */ /* * CPU families (sysctl hw.cpufamily) diff --git a/osfmk/mach/memory_object_types.h b/osfmk/mach/memory_object_types.h index 846987cfd..281ff1c9f 100644 --- a/osfmk/mach/memory_object_types.h +++ b/osfmk/mach/memory_object_types.h @@ -370,6 +370,7 @@ typedef struct memory_object_attr_info memory_object_attr_info_data_t; #define MAP_MEM_WTHRU 3 #define MAP_MEM_WCOMB 4 /* Write combining mode */ /* aka store gather */ +#define MAP_MEM_INNERWBACK 5 #define GET_MAP_MEM(flags) \ ((((unsigned int)(flags)) >> 24) & 0xFF) @@ -410,6 +411,7 @@ struct upl_page_info { speculative:1, /* page is valid, but not yet accessed */ cs_validated:1, /* CODE SIGNING: page was validated */ cs_tainted:1, /* CODE SIGNING: page is tainted */ + needed:1, /* page should be left in cache on abort */ :0; /* force to long boundary */ #else opaque; /* use upl_page_xxx() accessor funcs */ @@ -685,6 +687,7 @@ extern boolean_t upl_device_page(upl_page_info_t *upl); extern boolean_t upl_speculative_page(upl_page_info_t *upl, int index); extern void upl_clear_dirty(upl_t upl, boolean_t value); extern void upl_set_referenced(upl_t upl, boolean_t value); +extern void upl_range_needed(upl_t upl, int index, int count); __END_DECLS diff --git a/osfmk/mach/message.h b/osfmk/mach/message.h index 195607585..eba414fc1 100644 --- a/osfmk/mach/message.h +++ b/osfmk/mach/message.h @@ -78,6 +78,7 @@ #include #include +#include /* * The timeout mechanism uses mach_msg_timeout_t values, @@ -427,10 +428,9 @@ typedef struct mach_port_seqno_t msgh_seqno; security_token_t msgh_sender; audit_token_t msgh_audit; - mach_vm_address_t msgh_context; + mach_port_context_t msgh_context; } mach_msg_context_trailer_t; - typedef struct { mach_port_name_t sender; @@ -448,7 +448,7 @@ typedef struct mach_port_seqno_t msgh_seqno; security_token_t msgh_sender; audit_token_t msgh_audit; - mach_vm_address_t msgh_context; + mach_port_context_t msgh_context; int msgh_ad; msg_labels_t msgh_labels; } mach_msg_mac_trailer_t; @@ -515,9 +515,19 @@ typedef union /* * There is no fixed upper bound to the size of Mach messages. */ - #define MACH_MSG_SIZE_MAX ((mach_msg_size_t) ~0) +#if defined(__APPLE_API_PRIVATE) +/* + * But architectural limits of a given implementation, or + * temporal conditions may cause unpredictable send failures + * for messages larger than MACH_MSG_SIZE_RELIABLE. + * + * In either case, waiting for memory is [currently] outside + * the scope of send timeout values provided to IPC. + */ +#define MACH_MSG_SIZE_RELIABLE ((mach_msg_size_t) 256 * 1024) +#endif /* * Compatibility definitions, for code written * when there was a msgh_kind instead of msgh_seqno. @@ -621,7 +631,8 @@ typedef integer_t mach_msg_option_t; * It also makes things work properly if MACH_RCV_TRAILER_LABELS is ORed * with one of the other options. 
*/ -#define REQUESTED_TRAILER_SIZE(y) \ + +#define REQUESTED_TRAILER_SIZE_NATIVE(y) \ ((mach_msg_trailer_size_t) \ ((GET_RCV_ELEMENTS(y) == MACH_RCV_TRAILER_NULL) ? \ sizeof(mach_msg_trailer_t) : \ @@ -637,6 +648,15 @@ typedef integer_t mach_msg_option_t; sizeof(mach_msg_mac_trailer_t) : \ sizeof(mach_msg_max_trailer_t)))))))) + +#ifdef XNU_KERNEL_PRIVATE + +#define REQUESTED_TRAILER_SIZE(is64, y) REQUESTED_TRAILER_SIZE_NATIVE(y) + +#else /* XNU_KERNEL_PRIVATE */ +#define REQUESTED_TRAILER_SIZE(y) REQUESTED_TRAILER_SIZE_NATIVE(y) +#endif /* XNU_KERNEL_PRIVATE */ + /* * Much code assumes that mach_msg_return_t == kern_return_t. * This definition is useful for descriptive purposes. diff --git a/osfmk/mach/mig.h b/osfmk/mach/mig.h index 31a454a60..3d7655076 100644 --- a/osfmk/mach/mig.h +++ b/osfmk/mach/mig.h @@ -45,31 +45,25 @@ #if defined(MACH_KERNEL) -#if defined(BSMALL_LATER) -/* Really small configurations don't need type checking */ -#define __MigTypeCheck 0 -#else +#if !defined(__MigTypeCheck) /* Turn MIG type checking on by default for kernel */ #define __MigTypeCheck 1 #endif + #define __MigKernelSpecificCode 1 #define _MIG_KERNEL_SPECIFIC_CODE_ 1 -/* Otherwise check legacy setting (temporary) */ -#elif defined(TypeCheck) - -#define __MigTypeCheck TypeCheck - #elif !defined(__MigTypeCheck) -/* otherwise, default MIG type checking on - except in small configurations */ -#if defined(BSMALL) -#define __MigTypeCheck 0 +#if defined(TypeCheck) +/* use legacy setting (temporary) */ +#define __MigTypeCheck TypeCheck #else +/* default MIG type checking on */ #define __MigTypeCheck 1 #endif -#endif /* !defined(__MigTypeCheck) */ +#endif /* !defined(MACH_KERNEL) && !defined(__MigTypeCheck) */ /* * Pack MIG message structs. diff --git a/osfmk/mach/ndr.h b/osfmk/mach/ndr.h index cb64d3fc3..9baa731fe 100644 --- a/osfmk/mach/ndr.h +++ b/osfmk/mach/ndr.h @@ -34,6 +34,8 @@ #include #include +#include + typedef struct { unsigned char mig_vers; @@ -65,20 +67,16 @@ typedef struct { extern NDR_record_t NDR_record; -#if defined(BSMALL) +/* NDR conversion off by default */ + +#if !defined(__NDR_convert__) #define __NDR_convert__ 0 -#define __NDR_convert__int_rep__ 0 -#else -#ifndef __NDR_convert__ -#define __NDR_convert__ 1 -#endif /* __NDR_convert__ */ +#endif /* !defined(__NDR_convert__) */ #ifndef __NDR_convert__int_rep__ -#define __NDR_convert__int_rep__ 1 +#define __NDR_convert__int_rep__ __NDR_convert__ #endif /* __NDR_convert__int_rep__ */ -#endif /* defined(BSMALL) */ - #ifndef __NDR_convert__char_rep__ #define __NDR_convert__char_rep__ 0 #endif /* __NDR_convert__char_rep__ */ @@ -103,8 +101,6 @@ extern NDR_record_t NDR_record; #if __NDR_convert__int_rep__ -#include - #define __NDR_READSWAP_assign(a, rs) do { *(a) = rs(a); } while (0) #define __NDR_READSWAP__uint16_t(a) OSReadSwapInt16((void *)a, 0) diff --git a/osfmk/mach/shared_region.h b/osfmk/mach/shared_region.h index 29ced2a40..2297ba7b7 100644 --- a/osfmk/mach/shared_region.h +++ b/osfmk/mach/shared_region.h @@ -70,6 +70,7 @@ #define SHARED_REGION_NESTING_MIN_ARM ? #define SHARED_REGION_NESTING_MAX_ARM ? + #if defined(__i386__) #define SHARED_REGION_BASE SHARED_REGION_BASE_I386 #define SHARED_REGION_SIZE SHARED_REGION_SIZE_I386 diff --git a/osfmk/mach/syscall_sw.h b/osfmk/mach/syscall_sw.h index bac3552d3..a3ef52610 100644 --- a/osfmk/mach/syscall_sw.h +++ b/osfmk/mach/syscall_sw.h @@ -76,6 +76,25 @@ * table in . 
*/ +/* + * i386 and x86_64 just load of the stack or use + * registers in order; no munging is required, + * and number of args is ignored. ARM loads args + * into registers beyond r3, unlike the normal + * procedure call standard; we pad for 64-bit args. + */ +kernel_trap(_kernelrpc_mach_vm_allocate_trap,-10,5) /* 4 args, +1 for mach_vm_size_t */ +kernel_trap(_kernelrpc_mach_vm_deallocate_trap,-12,5) /* 3 args, +2 for mach_vm_size_t and mach_vm_address_t */ +kernel_trap(_kernelrpc_mach_vm_protect_trap,-14,7) /* 5 args, +2 for mach_vm_address_t and mach_vm_size_t */ +kernel_trap(_kernelrpc_mach_port_allocate_trap,-16,3) +kernel_trap(_kernelrpc_mach_port_destroy_trap,-17,2) +kernel_trap(_kernelrpc_mach_port_deallocate_trap,-18,2) +kernel_trap(_kernelrpc_mach_port_mod_refs_trap,-19,4) +kernel_trap(_kernelrpc_mach_port_move_member_trap,-20,3) +kernel_trap(_kernelrpc_mach_port_insert_right_trap,-21,4) +kernel_trap(_kernelrpc_mach_port_insert_member_trap,-22,3) +kernel_trap(_kernelrpc_mach_port_extract_member_trap,-23,3) + kernel_trap(mach_reply_port,-26,0) kernel_trap(thread_self_trap,-27,0) kernel_trap(task_self_trap,-28,0) diff --git a/osfmk/mach/task_info.h b/osfmk/mach/task_info.h index a43dc6cb9..d62982d0c 100644 --- a/osfmk/mach/task_info.h +++ b/osfmk/mach/task_info.h @@ -69,6 +69,8 @@ #include #include #include +#include /* for vm_extmod_statistics_data_t */ +#include #include @@ -78,6 +80,7 @@ typedef natural_t task_flavor_t; typedef integer_t *task_info_t; /* varying array of int */ +/* Deprecated, use per structure _data_t's instead */ #define TASK_INFO_MAX (1024) /* maximum array size */ typedef integer_t task_info_data_t[TASK_INFO_MAX]; @@ -87,6 +90,7 @@ typedef integer_t task_info_data_t[TASK_INFO_MAX]; #pragma pack(4) +/* Don't use this, use MACH_TASK_BASIC_INFO instead */ #define TASK_BASIC_INFO_32 4 /* basic information */ #define TASK_BASIC2_INFO_32 6 @@ -105,9 +109,7 @@ typedef struct task_basic_info_32 *task_basic_info_32_t; #define TASK_BASIC_INFO_32_COUNT \ (sizeof(task_basic_info_32_data_t) / sizeof(natural_t)) - -#define TASK_BASIC_INFO_64 5 /* 64-bit capable basic info */ - +/* Don't use this, use MACH_TASK_BASIC_INFO instead */ struct task_basic_info_64 { integer_t suspend_count; /* suspend count for task */ mach_vm_size_t virtual_size; /* virtual memory size (bytes) */ @@ -120,12 +122,14 @@ struct task_basic_info_64 { }; typedef struct task_basic_info_64 task_basic_info_64_data_t; typedef struct task_basic_info_64 *task_basic_info_64_t; + +#define TASK_BASIC_INFO_64 5 /* 64-bit capable basic info */ #define TASK_BASIC_INFO_64_COUNT \ (sizeof(task_basic_info_64_data_t) / sizeof(natural_t)) /* localized structure - cannot be safely passed between tasks of differing sizes */ - +/* Don't use this, use MACH_TASK_BASIC_INFO instead */ struct task_basic_info { integer_t suspend_count; /* suspend count for task */ vm_size_t virtual_size; /* virtual memory size (bytes) */ @@ -245,7 +249,8 @@ typedef struct task_dyld_info *task_dyld_info_t; #define TASK_DYLD_ALL_IMAGE_INFO_32 0 /* format value */ #define TASK_DYLD_ALL_IMAGE_INFO_64 1 /* format value */ -#define TASK_EXTMOD_INFO 18 + +#define TASK_EXTMOD_INFO 19 struct task_extmod_info { unsigned char task_uuid[16]; @@ -256,8 +261,24 @@ typedef struct task_extmod_info *task_extmod_info_t; #define TASK_EXTMOD_INFO_COUNT \ (sizeof(task_extmod_info_data_t) / sizeof(natural_t)) -#pragma pack() - +/* Always 64-bit in user and kernel */ +#define MACH_TASK_BASIC_INFO 20 /* always 64-bit basic info */ + +struct mach_task_basic_info { + 
mach_vm_size_t virtual_size; /* virtual memory size (bytes) */ + mach_vm_size_t resident_size; /* resident memory size (bytes) */ + mach_vm_size_t resident_size_max; /* maximum resident memory size (bytes) */ + time_value_t user_time; /* total user run time for + terminated threads */ + time_value_t system_time; /* total system run time for + terminated threads */ + policy_t policy; /* default policy for new threads */ + integer_t suspend_count; /* suspend count for task */ +}; +typedef struct mach_task_basic_info mach_task_basic_info_data_t; +typedef struct mach_task_basic_info *mach_task_basic_info_t; +#define MACH_TASK_BASIC_INFO_COUNT \ + (sizeof(mach_task_basic_info_data_t) / sizeof(natural_t)) /* * Obsolete interfaces. @@ -269,4 +290,6 @@ typedef struct task_extmod_info *task_extmod_info_t; #define TASK_SCHED_INFO 14 +#pragma pack() + #endif /* _MACH_TASK_INFO_H_ */ diff --git a/osfmk/mach/task_special_ports.h b/osfmk/mach/task_special_ports.h index ec980cfe2..fb08e1b7d 100644 --- a/osfmk/mach/task_special_ports.h +++ b/osfmk/mach/task_special_ports.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -82,27 +82,12 @@ typedef int task_special_port_t; * Evolving and likely to change. */ -#define TASK_WIRED_LEDGER_PORT 5 /* Wired resource ledger for task. */ - -#define TASK_PAGED_LEDGER_PORT 6 /* Paged resource ledger for task. */ - #define TASK_SEATBELT_PORT 7 /* Seatbelt compiler/DEM port for task. */ -#define TASK_GSSD_PORT 8 /* GSSD port for security context */ +/* PORT 8 was the GSSD TASK PORT which transformed to a host port */ #define TASK_ACCESS_PORT 9 /* Permission check for task_for_pid. */ -#define task_get_wired_ledger_port(task, port) \ - (task_get_special_port((task), TASK_WIRED_LEDGER_PORT, (port))) - -#define task_set_wired_ledger_port(task, port) \ - (task_set_special_port((task), TASK_WIRED_LEDGER_PORT, (port))) - -#define task_get_paged_ledger_port(task, port) \ - (task_get_special_port((task), TASK_PAGED_LEDGER_PORT, (port))) - -#define task_set_paged_ledger_port(task, port) \ - (task_set_special_port((task), TASK_PAGED_LEDGER_PORT, (port))) /* * Definitions for ease of use @@ -126,12 +111,6 @@ typedef int task_special_port_t; #define task_set_bootstrap_port(task, port) \ (task_set_special_port((task), TASK_BOOTSTRAP_PORT, (port))) -#define task_get_gssd_port(task, port) \ - (task_get_special_port((task), TASK_GSSD_PORT, (port))) - -#define task_set_gssd_port(task, port) \ - (task_set_special_port((task), TASK_GSSD_PORT, (port))) - #define task_get_task_access_port(task, port) \ (task_get_special_port((task), TASK_ACCESS_PORT, (port))) diff --git a/osfmk/mach/vm_map.defs b/osfmk/mach/vm_map.defs index b59e795ef..521d5886e 100644 --- a/osfmk/mach/vm_map.defs +++ b/osfmk/mach/vm_map.defs @@ -71,6 +71,12 @@ subsystem #include #include +#if !KERNEL && !LIBSYSCALL_INTERFACE +#define PREFIX(NAME) _kernelrpc_ ## NAME +#else +#define PREFIX(NAME) NAME +#endif + /* * Returns information about the contents of the virtual * address space of the target task at the specified @@ -99,21 +105,33 @@ routine vm_region( * of the specified size. The address at which the * allocation actually took place is returned. 
*/ -routine vm_allocate( + +#if !KERNEL && !LIBSYSCALL_INTERFACE +skip; +#else +routine PREFIX(vm_allocate)( target_task : vm_task_entry_t; inout address : vm_address_t; size : vm_size_t; flags : int); +#endif + /* * Deallocate the specified range from the virtual * address space of the target task. */ -routine vm_deallocate( + +#if !KERNEL && !LIBSYSCALL_INTERFACE +skip; +#else +routine PREFIX(vm_deallocate)( target_task : vm_task_entry_t; address : vm_address_t; size : vm_size_t); +#endif + /* * Set the current or maximum protection attribute * for the specified range of the virtual address @@ -124,12 +142,17 @@ routine vm_deallocate( * Protections are specified as a set of {read, write, execute} * *permissions*. */ -routine vm_protect( + +#if !KERNEL && !LIBSYSCALL_INTERFACE +skip; +#else +routine PREFIX(vm_protect)( target_task : vm_task_entry_t; address : vm_address_t; size : vm_size_t; set_maximum : boolean_t; new_protection : vm_prot_t); +#endif /* * Set the inheritance attribute for the specified range diff --git a/osfmk/mach/vm_param.h b/osfmk/mach/vm_param.h index 468920caa..60e65de24 100644 --- a/osfmk/mach/vm_param.h +++ b/osfmk/mach/vm_param.h @@ -106,8 +106,8 @@ */ #if 1 -#define atop(x) ((uint32_t)(x) >> PAGE_SHIFT) -#define ptoa(x) ((uint32_t)(x) << PAGE_SHIFT) +#define atop(x) ((vm_address_t)(x) >> PAGE_SHIFT) +#define ptoa(x) ((vm_address_t)(x) << PAGE_SHIFT) #else #define atop(x) (0UL = 0) #define ptoa(x) (0UL = 0) @@ -240,11 +240,41 @@ extern addr64_t vm_last_addr; /* Highest kernel virtual address known to the VM extern const vm_offset_t vm_min_kernel_address; extern const vm_offset_t vm_max_kernel_address; +extern vm_offset_t vm_kernel_stext; +extern vm_offset_t vm_kernel_etext; +extern vm_offset_t vm_kernel_base; +extern vm_offset_t vm_kernel_top; +extern vm_offset_t vm_kernel_slide; +extern vm_offset_t vm_kernel_addrperm; + +#define VM_KERNEL_IS_SLID(_o) \ + (((vm_offset_t)(_o) >= vm_kernel_base) && \ + ((vm_offset_t)(_o) < vm_kernel_top)) +/* + * VM_KERNEL_IS_KEXT is platform-specific, defined in . + * Set default if undefined. + */ +#ifndef VM_KERNEL_IS_KEXT +#define VM_KERNEL_IS_KEXT(_o) (FALSE) +#endif +#define VM_KERNEL_UNSLIDE(_v) \ + ((VM_KERNEL_IS_SLID(_v) || \ + VM_KERNEL_IS_KEXT(_v)) ? \ + (vm_offset_t)(_v) - vm_kernel_slide : \ + (vm_offset_t)(_v)) +#define VM_KERNEL_SLIDE(_u) \ + ((vm_offset_t)(_u) + vm_kernel_slide) + +#define VM_KERNEL_ADDRPERM(_v) \ + (((vm_offset_t)(_v) == 0) ? \ + (vm_offset_t)(0) : \ + (vm_offset_t)(_v) + vm_kernel_addrperm) + #endif /* XNU_KERNEL_PRIVATE */ extern vm_size_t page_size; extern vm_size_t page_mask; -extern int page_shift; +extern int page_shift; /* We need a way to get rid of compiler warnings when we cast from */ /* a 64 bit value to an address (which may be 32 bits or 64-bits). */ diff --git a/osfmk/mach/vm_statistics.h b/osfmk/mach/vm_statistics.h index 4d1b13a56..487549894 100644 --- a/osfmk/mach/vm_statistics.h +++ b/osfmk/mach/vm_statistics.h @@ -328,12 +328,17 @@ typedef struct pmap_statistics *pmap_statistics_t; #define VM_MEMORY_DYLIB 33 #define VM_MEMORY_OBJC_DISPATCHERS 34 +/* Was a nested pmap (VM_MEMORY_SHARED_PMAP) which has now been unnested */ +#define VM_MEMORY_UNSHARED_PMAP 35 + + // Placeholders for now -- as we analyze the libraries and find how they // use memory, we can make these labels more specific. 
#define VM_MEMORY_APPKIT 40 #define VM_MEMORY_FOUNDATION 41 #define VM_MEMORY_COREGRAPHICS 42 -#define VM_MEMORY_CARBON 43 +#define VM_MEMORY_CORESERVICES 43 +#define VM_MEMORY_CARBON VM_MEMORY_CORESERVICES #define VM_MEMORY_JAVA 44 #define VM_MEMORY_ATS 50 #define VM_MEMORY_LAYERKIT 51 @@ -388,7 +393,7 @@ typedef struct pmap_statistics *pmap_statistics_t; #define VM_MEMORY_COREPROFILE 71 /* assetsd / MobileSlideShow memory */ -#define VM_MEMORY_ASSETSD 72 +#define VM_MEMORY_ASSETSD 72 /* Reserve 240-255 for application */ #define VM_MEMORY_APPLICATION_SPECIFIC_1 240 diff --git a/osfmk/machine/Makefile b/osfmk/machine/Makefile index 2170671dd..ca45cc170 100644 --- a/osfmk/machine/Makefile +++ b/osfmk/machine/Makefile @@ -9,11 +9,12 @@ include $(MakeInc_def) DATAFILES = \ - cpu_number.h \ cpu_capabilities.h \ + cpu_number.h \ io_map_entries.h \ lock.h \ locks.h \ + machine_cpuid.h \ machine_routines.h \ pal_routines.h \ pal_hibernate.h \ diff --git a/osfmk/machine/commpage.h b/osfmk/machine/commpage.h index d11521702..2a525b0f2 100644 --- a/osfmk/machine/commpage.h +++ b/osfmk/machine/commpage.h @@ -38,7 +38,7 @@ #ifndef __ASSEMBLER__ extern void commpage_populate( void ); /* called once during startup */ - +extern void commpage_text_populate( void ); #endif /* __ASSEMBLER__ */ #endif /* _MACHINE_COMMPAGE_H */ diff --git a/osfmk/machine/db_machdep.h b/osfmk/machine/machine_cpuid.h similarity index 84% rename from osfmk/machine/db_machdep.h rename to osfmk/machine/machine_cpuid.h index 76ce9b313..ebe9396f6 100644 --- a/osfmk/machine/db_machdep.h +++ b/osfmk/machine/machine_cpuid.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,13 +25,13 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef _MACHINE_DB_MACHDEP_H -#define _MACHINE_DB_MACHDEP_H +#ifdef KERNEL_PRIVATE + +#ifndef _MACHINE_CPUID_H +#define _MACHINE_CPUID_H -#if defined (__i386__) || defined (__x86_64__) -#include "i386/db_machdep.h" -#else #error architecture not supported -#endif -#endif /* _MACHINE_DB_MACHDEP_H */ +#endif /* _MACHINE_CPUID_H */ + +#endif /* KERNEL_PRIVATE */ diff --git a/osfmk/pmc/pmc.c b/osfmk/pmc/pmc.c index f5a894823..a637a1b93 100644 --- a/osfmk/pmc/pmc.c +++ b/osfmk/pmc/pmc.c @@ -73,8 +73,8 @@ uint64_t pmc_spin_timeout_count = 0; /* Number of times where a PMC spin loop ca do { \ kprintf("perfmon: %p (obj: %p refCt: %u switchable: %u)\n", \ x, x->object, x->useCount, \ - x->methods.supports_context_switching ? \ - x->methods.supports_context_switching(x->object) : 0); \ + (x->methods.flags & PERFMON_FLAG_SUPPORTS_CONTEXT_SWITCHING) ? 
\ + 1 : 0); \ } while(0) static const char const * pmc_state_state_name(pmc_state_t state) { @@ -190,8 +190,10 @@ static lck_grp_attr_t *pmc_lock_grp_attr; static lck_attr_t *pmc_lock_attr; /* PMC tracking queue locks */ -static lck_spin_t perf_monitor_queue_spin; /* protects adding and removing from queue */ -static lck_spin_t perf_counters_queue_spin; /* protects adding and removing from queue */ + +static lck_mtx_t cpu_monitor_queue_mutex; /* protects per-cpu queues at initialisation time */ +static lck_spin_t perf_monitor_queue_spin; /* protects adding and removing from queue */ +static lck_spin_t perf_counters_queue_spin; /* protects adding and removing from queue */ /* Reservation tracking queues lock */ static lck_spin_t reservations_spin; @@ -201,10 +203,13 @@ static lck_spin_t reservations_spin; * * Keeps track of registered perf monitors and perf counters */ -static queue_t perf_monitors_queue = NULL; + +static queue_head_t **cpu_monitor_queues = NULL; + +static queue_head_t *perf_monitors_queue = NULL; static volatile uint32_t perf_monitors_count = 0U; -static queue_t perf_counters_queue = NULL; +static queue_head_t *perf_counters_queue = NULL; static volatile uint32_t perf_counters_count = 0U; /* @@ -218,16 +223,16 @@ static volatile uint32_t perf_counters_count = 0U; * every task and thread) to determine if/when a new reservation would * constitute a conflict. */ -static queue_t system_reservations = NULL; + +static queue_head_t *system_reservations = NULL; static volatile uint32_t system_reservation_count = 0U; -static queue_t task_reservations = NULL; +static queue_head_t *task_reservations = NULL; static volatile uint32_t task_reservation_count = 0U; -static queue_t thread_reservations = NULL; +static queue_head_t *thread_reservations = NULL; static volatile uint32_t thread_reservation_count = 0U; - #if XNU_KERNEL_PRIVATE /* @@ -248,6 +253,8 @@ static void init_pmc_locks(void) { lck_spin_init(&perf_counters_queue_spin, pmc_lock_grp, pmc_lock_attr); lck_spin_init(&reservations_spin, pmc_lock_grp, pmc_lock_attr); + + lck_mtx_init(&cpu_monitor_queue_mutex, pmc_lock_grp, pmc_lock_attr); } /* @@ -272,27 +279,28 @@ static void init_pmc_zones(void) { * registering and reserving individual pmcs and perf monitors. 
*/ static void init_pmc_queues(void) { - perf_monitors_queue = (queue_t)kalloc(sizeof(queue_t)); + + perf_monitors_queue = (queue_head_t*)kalloc(sizeof(queue_head_t)); assert(perf_monitors_queue); queue_init(perf_monitors_queue); - perf_counters_queue = (queue_t)kalloc(sizeof(queue_t)); + perf_counters_queue = (queue_head_t*)kalloc(sizeof(queue_head_t)); assert(perf_counters_queue); queue_init(perf_counters_queue); - system_reservations = (queue_t)kalloc(sizeof(queue_t)); + system_reservations = (queue_head_t*)kalloc(sizeof(queue_t)); assert(system_reservations); queue_init(system_reservations); - task_reservations = (queue_t)kalloc(sizeof(queue_t)); + task_reservations = (queue_head_t*)kalloc(sizeof(queue_head_t)); assert(task_reservations); queue_init(task_reservations); - thread_reservations = (queue_t)kalloc(sizeof(queue_t)); + thread_reservations = (queue_head_t*)kalloc(sizeof(queue_head_t)); assert(thread_reservations); queue_init(thread_reservations); @@ -329,7 +337,7 @@ static void perf_monitor_free(void *pm) { zfree(perf_small_zone, pm); } -static void perf_monitor_init(perf_monitor_t pm) { +static void perf_monitor_init(perf_monitor_t pm, int cpu) { assert(pm); pm->object = NULL; @@ -337,8 +345,13 @@ static void perf_monitor_init(perf_monitor_t pm) { bzero(&(pm->methods), sizeof(perf_monitor_methods_t)); pm->useCount = 1; /* initial retain count of 1, for caller */ + + pm->reservedCounters = 0; + + pm->cpu = cpu; pm->link.next = pm->link.prev = (queue_entry_t)NULL; + pm->cpu_link.next = pm->cpu_link.prev = (queue_entry_t)NULL; } /* @@ -348,6 +361,13 @@ static void perf_monitor_init(perf_monitor_t pm) { static void perf_monitor_dequeue(perf_monitor_t pm) { lck_spin_lock(&perf_monitor_queue_spin); + if (pm->methods.flags & PERFMON_FLAG_REQUIRES_IDLE_NOTIFICATIONS) { + /* If this flag is set, the monitor is already validated to be + * accessible from a single cpu only. + */ + queue_remove(cpu_monitor_queues[pm->cpu], pm, perf_monitor_t, cpu_link); + } + /* * remove the @pm object from the @perf_monitor_queue queue (it is of type * and has a field called @link that is the queue_link_t @@ -364,13 +384,45 @@ static void perf_monitor_dequeue(perf_monitor_t pm) { * thereby registering it for use with the system. */ static void perf_monitor_enqueue(perf_monitor_t pm) { + + lck_mtx_lock(&cpu_monitor_queue_mutex); lck_spin_lock(&perf_monitor_queue_spin); + if (pm->cpu >= 0) { + /* Deferred initialisation; saves memory and permits ml_get_max_cpus() + * to block until cpu initialisation is complete. 
+ */ + if (!cpu_monitor_queues) { + uint32_t max_cpus; + queue_head_t **queues; + uint32_t i; + + lck_spin_unlock(&perf_monitor_queue_spin); + + max_cpus = ml_get_max_cpus(); + + queues = (queue_head_t**)kalloc(sizeof(queue_head_t*) * max_cpus); + assert(queues); + for (i = 0; i < max_cpus; i++) { + queue_head_t *queue = (queue_head_t*)kalloc(sizeof(queue_head_t)); + assert(queue); + queue_init(queue); + queues[i] = queue; + } + + lck_spin_lock(&perf_monitor_queue_spin); + + cpu_monitor_queues = queues; + } + + queue_enter(cpu_monitor_queues[pm->cpu], pm, perf_monitor_t, cpu_link); + } + queue_enter(perf_monitors_queue, pm, perf_monitor_t, link); - perf_monitors_count++; - + lck_spin_unlock(&perf_monitor_queue_spin); + lck_mtx_unlock(&cpu_monitor_queue_mutex); } /* @@ -417,8 +469,7 @@ static perf_monitor_t perf_monitor_find(perf_monitor_object_t monitor) { lck_spin_lock(&perf_monitor_queue_spin); queue_iterate(perf_monitors_queue, element, perf_monitor_t, link) { - if(element && element->object == monitor) { - /* We found it - reference the object. */ + if(element->object == monitor) { perf_monitor_reference(element); found = element; break; @@ -432,8 +483,9 @@ static perf_monitor_t perf_monitor_find(perf_monitor_object_t monitor) { /* * perf_monitor_add_pmc adds a newly registered PMC to the perf monitor it is - * aassociated with. + * associated with. */ + static void perf_monitor_add_pmc(perf_monitor_t pm, pmc_t pmc __unused) { assert(pm); assert(pmc); @@ -546,9 +598,8 @@ static pmc_t pmc_find(pmc_object_t object) { pmc_t found = NULL; queue_iterate(perf_counters_queue, element, pmc_t, link) { - if(element && element->object == object) { + if(element->object == object) { pmc_reference(element); - found = element; break; } @@ -750,8 +801,7 @@ static uint32_t pmc_accessible_core_count(pmc_t pmc) { * matches the new incoming one (for thread/task reservations only). Will only * return TRUE if the task/thread matches. */ -static boolean_t pmc_internal_reservation_queue_contains_pmc(queue_t queue, pmc_reservation_t -resv) { +static boolean_t pmc_internal_reservation_queue_contains_pmc(queue_t queue, pmc_reservation_t resv) { assert(queue); assert(resv); @@ -759,62 +809,60 @@ resv) { pmc_reservation_t tmp = NULL; queue_iterate(queue, tmp, pmc_reservation_t, link) { - if(tmp) { - if(tmp->pmc == resv->pmc) { - /* PMC matches - make sure scope matches first */ - switch(PMC_FLAG_SCOPE(tmp->flags)) { - case PMC_FLAG_SCOPE_SYSTEM: - /* - * Found a reservation in system queue with same pmc - always a - * conflict. - */ - ret = TRUE; - break; - case PMC_FLAG_SCOPE_THREAD: + if(tmp->pmc == resv->pmc) { + /* PMC matches - make sure scope matches first */ + switch(PMC_FLAG_SCOPE(tmp->flags)) { + case PMC_FLAG_SCOPE_SYSTEM: + /* + * Found a reservation in system queue with same pmc - always a + * conflict. + */ + ret = TRUE; + break; + case PMC_FLAG_SCOPE_THREAD: + /* + * Found one in thread queue with the same PMC as the + * argument. Only a conflict if argument scope isn't + * thread or system, or the threads match. + */ + ret = (PMC_FLAG_SCOPE(resv->flags) != PMC_FLAG_SCOPE_THREAD) || + (tmp->thread == resv->thread); + + if(!ret) { /* - * Found one in thread queue with the same PMC as the - * argument. Only a conflict if argument scope isn't - * thread or system, or the threads match. + * so far, no conflict - check that the pmc that is + * being reserved isn't accessible from more than + * one core, if it is, we need to say it's already + * taken. 
*/ - ret = (PMC_FLAG_SCOPE(resv->flags) != PMC_FLAG_SCOPE_THREAD) || - (tmp->thread == resv->thread); - - if(!ret) { - /* - * so far, no conflict - check that the pmc that is - * being reserved isn't accessible from more than - * one core, if it is, we need to say it's already - * taken. - */ - if(1 != pmc_accessible_core_count(tmp->pmc)) { - ret = TRUE; - } + if(1 != pmc_accessible_core_count(tmp->pmc)) { + ret = TRUE; } - break; - case PMC_FLAG_SCOPE_TASK: - /* - * Follow similar semantics for task scope. + } + break; + case PMC_FLAG_SCOPE_TASK: + /* + * Follow similar semantics for task scope. + */ + + ret = (PMC_FLAG_SCOPE(resv->flags) != PMC_FLAG_SCOPE_TASK) || + (tmp->task == resv->task); + if(!ret) { + /* + * so far, no conflict - check that the pmc that is + * being reserved isn't accessible from more than + * one core, if it is, we need to say it's already + * taken. */ - - ret = (PMC_FLAG_SCOPE(resv->flags) != PMC_FLAG_SCOPE_TASK) || - (tmp->task == resv->task); - if(!ret) { - /* - * so far, no conflict - check that the pmc that is - * being reserved isn't accessible from more than - * one core, if it is, we need to say it's already - * taken. - */ - if(1 != pmc_accessible_core_count(tmp->pmc)) { - ret = TRUE; - } + if(1 != pmc_accessible_core_count(tmp->pmc)) { + ret = TRUE; } + } - break; - } - - if(ret) break; + break; } + + if(ret) break; } } @@ -823,7 +871,7 @@ resv) { /* * pmc_internal_reservation_validate_for_pmc returns TRUE if the given reservation can be - * added to its target queue without createing conflicts (target queue is + * added to its target queue without creating conflicts (target queue is * determined by the reservation's scope flags). Further, this method returns * FALSE if any level contains a reservation for a PMC that can be accessed from * more than just 1 core, and the given reservation also wants the same PMC. 
@@ -912,54 +960,50 @@ static boolean_t pmc_internal_reservation_add(pmc_reservation_t resv) { /* Check if the reservation can be added without conflicts */ if(pmc_internal_reservation_validate_for_pmc(resv)) { - ret = TRUE; - } - - if(ret) { + /* add reservation to appropriate scope */ switch(PMC_FLAG_SCOPE(resv->flags)) { + case PMC_FLAG_SCOPE_SYSTEM: + /* Simply add it to the system queue */ + pmc_internal_reservation_enqueue(system_reservations, resv); + system_reservation_count++; + + lck_spin_unlock(&reservations_spin); - /* System-wide counter */ - case PMC_FLAG_SCOPE_SYSTEM: - /* Simply add it to the system queue */ - pmc_internal_reservation_enqueue(system_reservations, resv); - system_reservation_count++; - - lck_spin_unlock(&reservations_spin); - - break; + break; - /* Task-switched counter */ - case PMC_FLAG_SCOPE_TASK: - assert(resv->task); + case PMC_FLAG_SCOPE_TASK: + assert(resv->task); - /* Not only do we enqueue it in our local queue for tracking */ - pmc_internal_reservation_enqueue(task_reservations, resv); - task_reservation_count++; + /* Not only do we enqueue it in our local queue for tracking */ + pmc_internal_reservation_enqueue(task_reservations, resv); + task_reservation_count++; - lck_spin_unlock(&reservations_spin); + lck_spin_unlock(&reservations_spin); - /* update the task mask, and propagate it to existing threads */ - pmc_internal_update_task_flag(resv->task, TRUE); - break; + /* update the task mask, and propagate it to existing threads */ + pmc_internal_update_task_flag(resv->task, TRUE); + break; - /* Thread-switched counter */ - case PMC_FLAG_SCOPE_THREAD: - assert(resv->thread); + /* Thread-switched counter */ + case PMC_FLAG_SCOPE_THREAD: + assert(resv->thread); - /* - * Works the same as a task-switched counter, only at - * thread-scope - */ + /* + * Works the same as a task-switched counter, only at + * thread-scope + */ - pmc_internal_reservation_enqueue(thread_reservations, resv); - thread_reservation_count++; + pmc_internal_reservation_enqueue(thread_reservations, resv); + thread_reservation_count++; - lck_spin_unlock(&reservations_spin); - - pmc_internal_update_thread_flag(resv->thread, TRUE); - break; - } + lck_spin_unlock(&reservations_spin); + + pmc_internal_update_thread_flag(resv->thread, TRUE); + break; + } + + ret = TRUE; } else { lck_spin_unlock(&reservations_spin); } @@ -993,8 +1037,6 @@ static void pmc_internal_reservation_broadcast(pmc_reservation_t reservation, vo /* core_cnt = 0 really means all cpus */ mask = CPUMASK_ALL; } - - /* Have each core run pmc_internal_reservation_stop_cpu asynchronously. */ mp_cpus_call(mask, ASYNC, action_func, reservation); #else #error pmc_reservation_interrupt needs an inter-processor method invocation mechanism for this architecture @@ -1021,20 +1063,20 @@ static void pmc_internal_reservation_remove(pmc_reservation_t resv) { * using the reservation's scope flags. 
*/ + /* Lock the global spin lock */ + lck_spin_lock(&reservations_spin); + switch(PMC_FLAG_SCOPE(resv->flags)) { case PMC_FLAG_SCOPE_SYSTEM: - lck_spin_lock(&reservations_spin); pmc_internal_reservation_dequeue(system_reservations, resv); system_reservation_count--; + lck_spin_unlock(&reservations_spin); + break; case PMC_FLAG_SCOPE_TASK: - - /* Lock the global spin lock */ - lck_spin_lock(&reservations_spin); - /* remove from the global queue */ pmc_internal_reservation_dequeue(task_reservations, resv); task_reservation_count--; @@ -1044,11 +1086,10 @@ static void pmc_internal_reservation_remove(pmc_reservation_t resv) { /* Recalculate task's counter mask */ pmc_internal_update_task_flag(resv->task, FALSE); + break; case PMC_FLAG_SCOPE_THREAD: - lck_spin_lock(&reservations_spin); - pmc_internal_reservation_dequeue(thread_reservations, resv); thread_reservation_count--; @@ -1489,11 +1530,6 @@ static void pmc_internal_reservation_store(pmc_reservation_t reservation) { COUNTER_DEBUG(" [error] disable: 0x%x\n", ret); } - /* - * At this point, we're off the hardware, so we don't have to - * set_on_hardare(TRUE) if anything fails from here on. - */ - /* store the counter value into the reservation's stored count */ ret = store_pmc->methods.get_count(store_pmc_obj, &reservation->value); if(KERN_SUCCESS != ret) { @@ -1576,11 +1612,28 @@ static void pmc_internal_reservation_load(pmc_reservation_t reservation) { } +/* + * pmc_accessible_from_core will return TRUE if the given @pmc is directly + * (e.g., hardware) readable from the given logical core. + * + * NOTE: This method is interrupt safe. + */ +static inline boolean_t pmc_accessible_from_core(pmc_t pmc, uint32_t logicalCore) { + boolean_t ret = FALSE; + + assert(pmc); + + ret = pmc->methods.accessible_from_core(pmc->object, logicalCore); + + return ret; +} + static void pmc_internal_reservation_start_cpu(void * arg) { pmc_reservation_t reservation = (pmc_reservation_t)arg; assert(reservation); + if (pmc_internal_reservation_matches_context(reservation)) { /* We are in context, but the reservation may have already had the context_in method run. Attempt * to set this cpu's bit in the active_last_context_in mask. If we set it, call context_in. @@ -1600,6 +1653,7 @@ static void pmc_internal_reservation_stop_cpu(void * arg) { assert(reservation); + if (pmc_internal_reservation_matches_context(reservation)) { COUNTER_DEBUG("Stopping in-context reservation %p for cpu %d\n", reservation, cpu_number()); @@ -1703,6 +1757,7 @@ static void pmc_reservation_interrupt(void *target, void *refCon) { */ kern_return_t perf_monitor_register(perf_monitor_object_t monitor, perf_monitor_methods_t *methods) { + int cpu = -1; COUNTER_DEBUG("registering perf monitor %p\n", monitor); @@ -1715,9 +1770,30 @@ kern_return_t perf_monitor_register(perf_monitor_object_t monitor, return KERN_INVALID_ARGUMENT; } + /* If the monitor requires idle notifications, ensure that it is + * accessible from a single core only. + */ + if (methods->flags & PERFMON_FLAG_REQUIRES_IDLE_NOTIFICATIONS) { + uint32_t *cores; + size_t core_cnt; + + if (KERN_SUCCESS == methods->accessible_cores(monitor, &cores, &core_cnt)) { + /* + * Guard against disabled cores - monitors will always match and + * attempt registration, irrespective of 'cpus=x' boot-arg. 
+ */ + if ((core_cnt == 1) && (cores[0] < (uint32_t)ml_get_max_cpus())) { + cpu = cores[0]; + } else { + return KERN_INVALID_ARGUMENT; + } + } + } + /* All methods are required */ - if(!methods->supports_context_switching || !methods->enable_counters || - !methods->disable_counters) { + if(!methods->accessible_cores | + !methods->enable_counters || !methods->disable_counters || + !methods->on_idle || !methods->on_idle_exit) { return KERN_INVALID_ARGUMENT; } @@ -1735,13 +1811,13 @@ kern_return_t perf_monitor_register(perf_monitor_object_t monitor, } /* initialize the object */ - perf_monitor_init(pm); + perf_monitor_init(pm, cpu); /* copy in the registration info */ pm->object = monitor; memcpy(&(pm->methods), methods, sizeof(perf_monitor_methods_t)); - /* place it in the tracking queue */ + /* place it in the tracking queues */ perf_monitor_enqueue(pm); /* debug it */ @@ -1766,7 +1842,7 @@ kern_return_t perf_monitor_unregister(perf_monitor_object_t monitor) { perf_monitor_t pm = perf_monitor_find(monitor); if(pm) { - /* Remove it from the queue. */ + /* Remove it from the queues. */ perf_monitor_dequeue(pm); /* drop extra retain from find */ @@ -1903,6 +1979,16 @@ kern_return_t pmc_unregister(perf_monitor_object_t monitor, pmc_object_t pmc_obj return KERN_SUCCESS; } +static void perf_monitor_reservation_add(perf_monitor_t monitor) { + assert(monitor); + OSIncrementAtomic(&(monitor->reservedCounters)); +} + +static void perf_monitor_reservation_remove(perf_monitor_t monitor) { + assert(monitor); + OSDecrementAtomic(&(monitor->reservedCounters)); +} + #if 0 #pragma mark - #pragma mark KPI @@ -2089,10 +2175,8 @@ kern_return_t pmc_get_pmc_list(pmc_t **pmcs, size_t *pmcCount) { /* copy the bits out */ queue_iterate(perf_counters_queue, pmc, pmc_t, link) { - if(pmc) { - /* copy out the pointer */ - array[count++] = pmc; - } + /* copy out the pointer */ + array[count++] = pmc; } lck_spin_unlock(&perf_counters_queue_spin); @@ -2227,22 +2311,6 @@ kern_return_t pmc_get_accessible_core_list(pmc_t pmc, uint32_t **logicalCores, return ret; } -/* - * pmc_accessible_from_core will return TRUE if the given @pmc is directly - * (e.g., hardware) readable from the given logical core. - * - * NOTE: This method is interrupt safe. 
- */ -boolean_t pmc_accessible_from_core(pmc_t pmc, uint32_t logicalCore) { - boolean_t ret = FALSE; - - assert(pmc); - - ret = pmc->methods.accessible_from_core(pmc->object, logicalCore); - - return ret; -} - static boolean_t pmc_reservation_setup_pmi(pmc_reservation_t resv, pmc_config_t config) { assert(resv); assert(resv->pmc); @@ -2318,7 +2386,7 @@ kern_return_t pmc_reserve(pmc_t pmc, pmc_config_t config, return KERN_FAILURE; } - /* Here's where we setup the PMI method (if needed) */ + perf_monitor_reservation_add(pmc->monitor); *reservation = resv; @@ -2346,7 +2414,7 @@ kern_return_t pmc_reserve_task(pmc_t pmc, pmc_config_t config, return KERN_INVALID_ARGUMENT; } - if(!pmc->monitor->methods.supports_context_switching(pmc->monitor->object)) { + if (!(pmc->monitor->methods.flags & PERFMON_FLAG_SUPPORTS_CONTEXT_SWITCHING)) { COUNTER_DEBUG("pmc %p cannot be context switched!\n", pmc); return KERN_INVALID_ARGUMENT; } @@ -2377,6 +2445,8 @@ kern_return_t pmc_reserve_task(pmc_t pmc, pmc_config_t config, return KERN_FAILURE; } + perf_monitor_reservation_add(pmc->monitor); + *reservation = resv; return KERN_SUCCESS; @@ -2402,7 +2472,7 @@ kern_return_t pmc_reserve_thread(pmc_t pmc, pmc_config_t config, return KERN_INVALID_ARGUMENT; } - if(!pmc->monitor->methods.supports_context_switching(pmc->monitor->object)) { + if (!(pmc->monitor->methods.flags & PERFMON_FLAG_SUPPORTS_CONTEXT_SWITCHING)) { COUNTER_DEBUG("pmc %p cannot be context switched!\n", pmc); return KERN_INVALID_ARGUMENT; } @@ -2433,6 +2503,8 @@ kern_return_t pmc_reserve_thread(pmc_t pmc, pmc_config_t config, return KERN_FAILURE; } + perf_monitor_reservation_add(pmc->monitor); + *reservation = resv; return KERN_SUCCESS; @@ -2632,6 +2704,8 @@ kern_return_t pmc_reservation_free(pmc_reservation_t reservation) { return KERN_INVALID_ARGUMENT; } + perf_monitor_reservation_remove(reservation->pmc->monitor); + /* Move the state machine */ if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_FREE, NULL))) { return KERN_FAILURE; @@ -2661,6 +2735,56 @@ kern_return_t pmc_reservation_free(pmc_reservation_t reservation) { return KERN_SUCCESS; } +/* + * pmc_idle notifies eligible monitors of impending per-CPU idle, and can be used to save state. + */ +boolean_t pmc_idle(void) { + perf_monitor_t monitor = NULL; + queue_head_t *cpu_queue; + + lck_spin_lock(&perf_monitor_queue_spin); + + if (cpu_monitor_queues) { + cpu_queue = cpu_monitor_queues[cpu_number()]; + + queue_iterate(cpu_queue, monitor, perf_monitor_t, cpu_link) { + perf_monitor_methods_t *methods = &(monitor->methods); + if ((methods->flags & PERFMON_FLAG_ALWAYS_ACTIVE) || (monitor->reservedCounters)) { + methods->on_idle(monitor->object); + } + } + } + + lck_spin_unlock(&perf_monitor_queue_spin); + + return TRUE; +} + +/* + * pmc_idle_exit notifies eligible monitors of wake from idle; it can be used to restore state. 
+ */ +boolean_t pmc_idle_exit(void) { + perf_monitor_t monitor = NULL; + queue_head_t *cpu_queue; + + lck_spin_lock(&perf_monitor_queue_spin); + + if (cpu_monitor_queues) { + cpu_queue = cpu_monitor_queues[cpu_number()]; + + queue_iterate(cpu_queue, monitor, perf_monitor_t, cpu_link) { + perf_monitor_methods_t *methods = &(monitor->methods); + if ((methods->flags & PERFMON_FLAG_ALWAYS_ACTIVE) || (monitor->reservedCounters)) { + methods->on_idle_exit(monitor->object); + } + } + } + + lck_spin_unlock(&perf_monitor_queue_spin); + + return TRUE; +} + /* * pmc_context_switch performs all context switching necessary to save all pmc * state associated with @oldThread (and the task to which @oldThread belongs), @@ -2673,43 +2797,37 @@ boolean_t pmc_context_switch(thread_t oldThread, thread_t newThread) { pmc_reservation_t resv = NULL; uint32_t cpuNum = cpu_number(); - /* Out going thread: save pmc state */ lck_spin_lock(&reservations_spin); - /* interate over any reservations */ - queue_iterate(thread_reservations, resv, pmc_reservation_t, link) { - if(resv && oldThread == resv->thread) { - - /* check if we can read the associated pmc from this core. */ - if(pmc_accessible_from_core(resv->pmc, cpuNum)) { - /* save the state At this point, if it fails, it fails. */ + /* Save pmc states */ + if (thread_reservation_count) { + queue_iterate(thread_reservations, resv, pmc_reservation_t, link) { + if ((oldThread == resv->thread) && pmc_accessible_from_core(resv->pmc, cpuNum)) { (void)pmc_internal_reservation_context_out(resv); } } } - queue_iterate(task_reservations, resv, pmc_reservation_t, link) { - if(resv && resv->task == oldThread->task) { - if(pmc_accessible_from_core(resv->pmc, cpuNum)) { - (void)pmc_internal_reservation_context_out(resv); + if (task_reservation_count) { + queue_iterate(task_reservations, resv, pmc_reservation_t, link) { + if ((resv->task == oldThread->task) && pmc_accessible_from_core(resv->pmc, cpuNum)) { + (void)pmc_internal_reservation_context_out(resv); } } } - /* Incoming task: restore */ - - queue_iterate(thread_reservations, resv, pmc_reservation_t, link) { - if(resv && resv->thread == newThread) { - if(pmc_accessible_from_core(resv->pmc, cpuNum)) { + /* Restore */ + if (thread_reservation_count) { + queue_iterate(thread_reservations, resv, pmc_reservation_t, link) { + if ((resv->thread == newThread) && pmc_accessible_from_core(resv->pmc, cpuNum)) { (void)pmc_internal_reservation_context_in(resv); } } } - - queue_iterate(task_reservations, resv, pmc_reservation_t, link) { - if(resv && resv->task == newThread->task) { - if(pmc_accessible_from_core(resv->pmc, cpuNum)) { + if (task_reservation_count) { + queue_iterate(task_reservations, resv, pmc_reservation_t, link) { + if ((resv->task == newThread->task) && pmc_accessible_from_core(resv->pmc, cpuNum)) { (void)pmc_internal_reservation_context_in(resv); } } @@ -2792,11 +2910,6 @@ kern_return_t pmc_get_accessible_core_list(pmc_t pmc __unused, return KERN_FAILURE; } -boolean_t pmc_accessible_from_core(pmc_t pmc __unused, - uint32_t logicalCore __unused) { - return FALSE; -} - kern_return_t pmc_reserve(pmc_t pmc __unused, pmc_config_t config __unused, pmc_reservation_t *reservation __unused) { return KERN_FAILURE; diff --git a/osfmk/pmc/pmc.h b/osfmk/pmc/pmc.h index 72692fa54..789a5b223 100644 --- a/osfmk/pmc/pmc.h +++ b/osfmk/pmc/pmc.h @@ -34,8 +34,6 @@ extern "C" { #include #include -#include - /**************************************************************************** * The four main object types * @@ -85,12 +83,7 @@ 
typedef struct pmc_config *pmc_config_t; * to the IORegistry (this way only usable PMCs and Perf Monitors will be shown.) ****************************************************************************/ -/*!typedef - * @abstract A pointer to a method that returns whether or not the given performance monitor driver supports context switched counters - * @param pm A registered performance monitor driver object (see perf_monitor_register). - * @result TRUE if the driver supports context switching, FALSE otherwise. - */ -typedef boolean_t (*perfmon_supports_context_switch_method_t)(perf_monitor_object_t pm); +typedef kern_return_t (*perfmon_get_accessible_cores_method_t)(pmc_object_t pmc, uint32_t **cores, size_t *coreCt); /*!typedef * @abstract A pointer to a method that enables a set of counters. @@ -109,7 +102,14 @@ typedef kern_return_t (*perfmon_enable_counters_method_t)(perf_monitor_object_t */ typedef kern_return_t (*perfmon_disable_counters_method_t)(perf_monitor_object_t pm, pmc_object_t *pmcs, uint32_t pmcCount); -#define MACH_PERFMON_METHODS_VERSION 0 +typedef void (*perfmon_on_idle_method_t)(perf_monitor_object_t pm); +typedef void (*perfmon_on_idle_exit_method_t)(perf_monitor_object_t pm); + +#define MACH_PERFMON_METHODS_VERSION 1 + +#define PERFMON_FLAG_SUPPORTS_CONTEXT_SWITCHING 0x1 +#define PERFMON_FLAG_REQUIRES_IDLE_NOTIFICATIONS 0x2 +#define PERFMON_FLAG_ALWAYS_ACTIVE 0x4 /*!struct perf_monitor_methods * @abstract A set of method pointers to be used when interacting with a performance monitor object @@ -119,12 +119,16 @@ typedef kern_return_t (*perfmon_disable_counters_method_t)(perf_monitor_object_t typedef struct perf_monitor_methods { uint32_t perf_monitor_methods_version; // Always set to MACH_PERFMON_METHODS_VERSION when writing driver kexts - // All methods are required. - perfmon_supports_context_switch_method_t supports_context_switching; + uint32_t flags; + + perfmon_get_accessible_cores_method_t accessible_cores; + perfmon_enable_counters_method_t enable_counters; perfmon_disable_counters_method_t disable_counters; -}perf_monitor_methods_t; + perfmon_on_idle_method_t on_idle; + perfmon_on_idle_exit_method_t on_idle_exit; +} perf_monitor_methods_t; /**************************************************************************** * Method types for performance counter registration @@ -233,7 +237,8 @@ typedef boolean_t (*pmc_is_accessible_from_logical_core_method_t)(pmc_object_t p /*!typedef * @abstract A pointer to a method that returns an array of the logical cores from which a PMC can be accessed. - * @discussion A pointer to a method that returns an array of the logical cores from which a PMC can be accessed. Resulting array of cores should not be released by xnu. + * @discussion A pointer to a method that returns an array of the logical cores from which a PMC can be accessed. + * Resulting array of cores should not be released by xnu. * Implementations of this method type must be safe to call at interrupt context. * @param pmc A valid pmc object * @param cores A value-returned array of logical cores that can access the given PMC. 
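Version 1 of the methods table above replaces the supports_context_switching callback with a flags word and adds the idle hooks; capability checks become plain bit tests, which is what the reworked PMC_DEBUG_OBJ macro and the pmc_reserve_task/pmc_reserve_thread guards in pmc.c rely on. A hypothetical kext-side table, with every mydrv_* name invented for illustration and the signatures taken from the typedefs above:

#include <pmc/pmc.h>

static uint32_t mydrv_core = 0;

/* Monitors that set PERFMON_FLAG_REQUIRES_IDLE_NOTIFICATIONS must report
 * exactly one (enabled) core here, or perf_monitor_register() returns
 * KERN_INVALID_ARGUMENT. */
static kern_return_t mydrv_accessible_cores(pmc_object_t obj,
                                            uint32_t **cores, size_t *core_cnt)
{
    (void)obj;
    *cores = &mydrv_core;
    *core_cnt = 1;
    return KERN_SUCCESS;
}

static kern_return_t mydrv_enable(perf_monitor_object_t pm,
                                  pmc_object_t *pmcs, uint32_t pmc_cnt)
{ (void)pm; (void)pmcs; (void)pmc_cnt; return KERN_SUCCESS; }

static kern_return_t mydrv_disable(perf_monitor_object_t pm,
                                   pmc_object_t *pmcs, uint32_t pmc_cnt)
{ (void)pm; (void)pmcs; (void)pmc_cnt; return KERN_SUCCESS; }

static void mydrv_on_idle(perf_monitor_object_t pm)      { (void)pm; /* save counter state */ }
static void mydrv_on_idle_exit(perf_monitor_object_t pm) { (void)pm; /* restore counter state */ }

static perf_monitor_methods_t mydrv_methods = {
    .perf_monitor_methods_version = MACH_PERFMON_METHODS_VERSION,   /* now 1 */
    .flags            = PERFMON_FLAG_SUPPORTS_CONTEXT_SWITCHING |
                        PERFMON_FLAG_REQUIRES_IDLE_NOTIFICATIONS,
    .accessible_cores = mydrv_accessible_cores,
    .enable_counters  = mydrv_enable,
    .disable_counters = mydrv_disable,
    .on_idle          = mydrv_on_idle,        /* invoked from pmc_idle() */
    .on_idle_exit     = mydrv_on_idle_exit,   /* invoked from pmc_idle_exit() */
};

/* perf_monitor_register(monitor, &mydrv_methods) rejects the table if any
 * method pointer is NULL, and pins idle-notification monitors to the single
 * reported cpu's monitor queue. */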
@@ -311,7 +316,7 @@ typedef struct pmc_methods { pmc_enable_method_t enable; pmc_open_method_t open; pmc_close_method_t close; -}pmc_methods_t; +} pmc_methods_t; /* * Kext interface Methods @@ -338,14 +343,6 @@ typedef struct pmc_methods { * KERN_RESOURCE_SHORTAGE if the kernel lacks the resources to register another performance monitor * driver, KERN_INVALID_ARGUMENT if one or both of the arguments is null */ - -/* Prevent older AppleProfileFamily kexts from loading on newer kernels. - * Alas, C doesn't necessarily have a cleaner way to do the version number concatenation - */ -#define PERF_REG_NAME1(a, b) a ## b -#define PERF_REG_NAME(a, b) PERF_REG_NAME1(a, b) -#define perf_monitor_register PERF_REG_NAME(perf_monitor_register_, VERSION_MAJOR) - kern_return_t perf_monitor_register(perf_monitor_object_t monitor, perf_monitor_methods_t *methods); /*!fn @@ -414,8 +411,14 @@ typedef struct perf_monitor { // reference counted uint32_t useCount; - // link to other perf monitors + uint32_t reservedCounters; + + // A value of -1 here indicates independence from a particular core + int cpu; + + // links to other perf monitors queue_chain_t link; + queue_chain_t cpu_link; }*perf_monitor_t; /*!struct pmc @@ -554,14 +557,17 @@ void pmc_free_config(pmc_t pmc, pmc_config_t config); /*!fn * @abstract Setup the configuration - * @discussion Configurations for counter are architecture-neutral key-value pairs (8bit key, 64bit value). Meanings of the keys and values are defined by the driver-writer and are listed in XML form available for interrogation via the CoreProfile framework. This method is not interrupt safe. + * @discussion Configurations for counter are architecture-neutral key-value pairs (8bit key, 64bit value). Meanings of the keys and values are defined + * by the driver-writer and are listed in XML form available for interrogation via the CoreProfile framework. This method is not interrupt safe. * @result KERN_SUCCESS on success. */ kern_return_t pmc_config_set_value(pmc_t pmc, pmc_config_t config, uint8_t id, uint64_t value); /*!fn * @abstract Interrupt Threshold Setup - * @discussion In order to configure a PMC to use PMI (cause an interrupt after so-many events occur), use this method, and provide a function to be called after the interrupt occurs, along with a reference context. PMC Threshold handler methods will have the pmc that generated the interrupt as the first argument when the interrupt handler is invoked, and the given @refCon (which may be NULL) as the second. This method is not interrupt safe. + * @discussion In order to configure a PMC to use PMI (cause an interrupt after so-many events occur), use this method, and provide a function to be + * called after the interrupt occurs, along with a reference context. PMC Threshold handler methods will have the pmc that generated the interrupt as + * the first argument when the interrupt handler is invoked, and the given @refCon (which may be NULL) as the second. This method is not interrupt safe. */ kern_return_t pmc_config_set_interrupt_threshold(pmc_t pmc, pmc_config_t config, uint64_t threshold, pmc_interrupt_method_t method, void *refCon); @@ -583,7 +589,8 @@ void pmc_free_pmc_list(pmc_t *pmcs, size_t pmcCount); /*!fn * @abstract Finds pmcs by partial string matching. - * @discussion This method returns a list of pmcs (similar to pmc_get_pmc_list) whose names match the given string up to it's length. For example, searching for "ia32" would return pmcs "ia32gp0" and "ia32gp1". 
Results should be released by the caller using pmc_free_pmc_list + * @discussion This method returns a list of pmcs (similar to pmc_get_pmc_list) whose names match the given string up to it's length. + * For example, searching for "ia32" would return pmcs "ia32gp0" and "ia32gp1". Results should be released by the caller using pmc_free_pmc_list * @param name Partial string to search for. * @param pmcs Storage for the resultant pmc_t array pointer. * @param pmcCount Storage for the resultant count of pmc_t's. @@ -599,21 +606,14 @@ const char *pmc_get_name(pmc_t pmc); /*!fn * @abstract Returns a list of logical cores from which the given pmc can be read from or written to. - * @discussion This method can return a NULL list with count of 0 -- this indicates any core can read the given pmc. This method does not allocate the list, therefore callers should take care not to mutate or free the resultant list. This method is interrupt safe. + * @discussion This method can return a NULL list with count of 0 -- this indicates any core can read the given pmc. This method does not allocate the list, + * therefore callers should take care not to mutate or free the resultant list. This method is interrupt safe. * @param pmc The PMC for which to return the cores that can read/write it. * @param logicalCores Storage for the pointer to the list. * @param logicalCoreCt Value-return number of elements in the returned list. 0 indicates all cores can read/write the given pmc. */ kern_return_t pmc_get_accessible_core_list(pmc_t pmc, uint32_t **logicalCores, size_t *logicalCoreCt); -/*!fn - * @abstract Returns TRUE if the given logical core can read/write the given PMC. - * @discussion This method is interrupt safe. - * @param pmc The PMC to test - * @param logicalCore The core from which to test. - */ -boolean_t pmc_accessible_from_core(pmc_t pmc, uint32_t logicalCore); - /* * BEGIN PMC Reservations * @@ -623,7 +623,9 @@ boolean_t pmc_accessible_from_core(pmc_t pmc, uint32_t logicalCore); /*!fn * @abstract Reserve a PMC for System-wide counting. - * @discussion This method will attempt to reserve the given pmc at system-scope. It will configure the given pmc to count the event indicated by the given configuration object. This method consumes the given configuration object if the return value is KERN_SUCCESS - any other return value indicates the caller should free the configuration object via pmc_free_config. This method is not interrupt safe. + * @discussion This method will attempt to reserve the given pmc at system-scope. It will configure the given pmc to count the event indicated by the given + * configuration object. This method consumes the given configuration object if the return value is KERN_SUCCESS - any other return value indicates the caller + * should free the configuration object via pmc_free_config. This method is not interrupt safe. * @param pmc The PMC to reserve. * @param config The configuration object to use with the given pmc. * @param reservation A value-return reservation object to be used in pmc_reservation_* methods. @@ -638,7 +640,8 @@ kern_return_t pmc_reserve(pmc_t pmc, pmc_config_t config, pmc_reservation_t *res /*!fn * @abstract Reserve a PMC for task-wide counting. - * @discussion This method will attempt to reserve the given pmc for task-wide counting. The resulting reservation will only count when the task is running on one of the logical cores that can read the given pmc. The semantics of this method are the same as pmc_reserve in all other respects. 
+ * @discussion This method will attempt to reserve the given pmc for task-wide counting. The resulting reservation will only count when the task is running + * on one of the logical cores that can read the given pmc. The semantics of this method are the same as pmc_reserve in all other respects. * @param pmc The PMC to reserve * @param config The configuration object to use. * @param task The task for which to enable the counter. @@ -649,7 +652,8 @@ kern_return_t pmc_reserve_task(pmc_t pmc, pmc_config_t config, task_t task, pmc_ /*!fn * @abstract Reserve a PMC for thread-wide counting. - * @discussion This method will attempt to reserve the given pmc for thread-wide counting. The resulting reservation will only count when the thread is running on one of the logical cores that can read the given pmc. The semantics of this method are the same as pmc_reserve_task in all other respects. + * @discussion This method will attempt to reserve the given pmc for thread-wide counting. The resulting reservation will only count when the thread is + * running on one of the logical cores that can read the given pmc. The semantics of this method are the same as pmc_reserve_task in all other respects. * @param pmc The PMC to reserve * @param config The configuration object to use. * @param thread The thread for which to enable the counter. @@ -660,21 +664,28 @@ kern_return_t pmc_reserve_thread(pmc_t pmc, pmc_config_t config, thread_t thread /*!fn * @abstract Start counting - * @discussion This method instructs the given reservation to start counting as soon as possible. If the reservation is for a thread (or task) other than the current thread, or for a pmc that is not accessible from the current logical core, the reservation will start counting the next time the thread (or task) runs on a logical core than can access the pmc. This method is interrupt safe. If this method is called from outside of interrupt context, it may block. + * @discussion This method instructs the given reservation to start counting as soon as possible. If the reservation is for a thread (or task) other than the + * current thread, or for a pmc that is not accessible from the current logical core, the reservation will start counting the next time the thread (or task) + * runs on a logical core than can access the pmc. This method is interrupt safe. If this method is called from outside of interrupt context, it may block. * @param reservation The reservation to start counting */ kern_return_t pmc_reservation_start(pmc_reservation_t reservation); /*!fn * @abstract Stop counting - * @discussion This method instructs the given reservation to stop counting as soon as possible. If the reservation is for a thread (or task) other than the current thread, or for a pmc that is not accessible from the current logical core, the reservation will stop counting the next time the thread (or task) ceases to run on a logical core than can access the pmc. This method is interrupt safe. If called form outside of interrupt context, this method may block. + * @discussion This method instructs the given reservation to stop counting as soon as possible. If the reservation is for a thread (or task) other than the + * current thread, or for a pmc that is not accessible from the current logical core, the reservation will stop counting the next time the thread (or task) c + * eases to run on a logical core than can access the pmc. This method is interrupt safe. If called form outside of interrupt context, this method may block. 
* @param reservation The reservation to stop counting */ kern_return_t pmc_reservation_stop(pmc_reservation_t reservation); /*!fn * @abstract Read the counter value - * @discussion This method will read the event count associated with the given reservation. If the pmc is currently on hardware, and the caller is currently executing in a context that both a) matches the reservation's context, and b) can access the reservation's pmc directly, the value will be read directly from the hardware. Otherwise, the value stored in the reservation is returned. This method is interrupt safe. If the caller is calling from outside of interrupt context, this method may block. + * @discussion This method will read the event count associated with the given reservation. If the pmc is currently on hardware, and the caller is currently ] + * executing in a context that both a) matches the reservation's context, and b) can access the reservation's pmc directly, the value will be read directly + * from the hardware. Otherwise, the value stored in the reservation is returned. This method is interrupt safe. If the caller is calling from outside of + * interrupt context, this method may block. * @param reservation The reservation whose value to read. * @param value Value-return event count */ @@ -682,7 +693,10 @@ kern_return_t pmc_reservation_read(pmc_reservation_t reservation, uint64_t *valu /*!fn * @abstract Write the counter value - * @discussion This method will write the event count associated with the given reservation. If the pmc is currently on hardware, and the caller is currently executing in a context that both a) matches the reservation's context, and b) can access the reservation's pmc directly, the value will be written directly to the hardware. Otherwise, the value stored in the reservation is overwritten. This method is interrupt safe. If the caller is calling from outside of interrupt context, this method may block. + * @discussion This method will write the event count associated with the given reservation. If the pmc is currently on hardware, and the caller is currently + * executing in a context that both a) matches the reservation's context, and b) can access the reservation's pmc directly, the value will be written directly + * to the hardware. Otherwise, the value stored in the reservation is overwritten. This method is interrupt safe. If the caller is calling from outside of + * interrupt context, this method may block. * @param reservation The reservation to write. * @param value The event count to write */ @@ -690,7 +704,8 @@ kern_return_t pmc_reservation_write(pmc_reservation_t reservation, uint64_t valu /*!fn * @abstract Free a reservation and all associated resources. - * @discussion This method will free the resources associated with the given reservation and release the associated PMC back to general availability. If the reservation is currently counting, it will be stopped prior to release. This method is not interrupt safe. + * @discussion This method will free the resources associated with the given reservation and release the associated PMC back to general availability. + * If the reservation is currently counting, it will be stopped prior to release. This method is not interrupt safe. * @param reservation The reservation to free */ kern_return_t pmc_reservation_free(pmc_reservation_t reservation); @@ -705,10 +720,38 @@ void pmc_bootstrap(void); /*!fn * @abstract Performs a pmc context switch. 
- * @discussion This method will save all PMCs reserved for oldThread (and the task associated with oldThread), as well as restore all PMCs reserved for newThread (and the task associated with newThread). This method is for xnu-internal context switching routines only. + * @discussion This method will save all PMCs reserved for oldThread (and the task associated with oldThread), as well as restore all PMCs reserved + * for newThread (and the task associated with newThread). This method is for xnu-internal context switching routines only. */ boolean_t pmc_context_switch(thread_t oldThread, thread_t newThread); +/*!fn + * @abstract Called on per-core idle. + * @discussion This method notifies registered performance monitors of impending cpu idle, and can be used to save counter state. + */ +boolean_t pmc_idle(void); + +/*!fn + * @abstract Called on per-core wake from idle. + * @discussion This method notifies registered performance monitors of wake-up from the prior idle, and can be used to restore + * previously saved counter configuration. + */ +boolean_t pmc_idle_exit(void); + +#if defined(THREAD_PMC_FLAG) +/* Allow inclusion from outside of MACH_KERNEL_PRIVATE scope. */ + +/*!fn + * @abstract Returns true if thread has been marked for counting. + * @discussion Task-level reservations are propagated to child threads via thread_create_internal. Any mutation of task reservations forces a recalculation + * of t_chud (for the pmc flag) for all threads in that task. Consequently, we can simply check the current thread's flag against THREAD_PMC_FLAG. + */ +static inline boolean_t pmc_thread_eligible(thread_t t) { + return (t != NULL) ? ((t->t_chud & THREAD_PMC_FLAG) ? TRUE : FALSE) : FALSE; +} + +#endif /* THREAD_PMC_FLAG */ + #endif // XNU_KERNEL_PRIVATE #ifdef __cplusplus diff --git a/osfmk/profiling/Makefile b/osfmk/profiling/Makefile index 3b2b64363..3fda07c40 100644 --- a/osfmk/profiling/Makefile +++ b/osfmk/profiling/Makefile @@ -16,8 +16,6 @@ INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ x86_64 -INSTINC_SUBDIRS_ARM = \ - arm EXPINC_SUBDIRS = \ machine @@ -25,8 +23,6 @@ EXPINC_SUBDIRS = \ EXPINC_SUBDIRS_I386 = \ i386 -EXPINC_SUBDIRS_ARM = \ - arm EXPINC_SUBDIRS_X86_64 = \ x86_64 diff --git a/osfmk/vm/bsd_vm.c b/osfmk/vm/bsd_vm.c index cd8dc8317..97d76cdf0 100644 --- a/osfmk/vm/bsd_vm.c +++ b/osfmk/vm/bsd_vm.c @@ -407,63 +407,8 @@ memory_object_control_uiomove( if ((dst_page = vm_page_lookup(object, offset)) == VM_PAGE_NULL) break; - /* - * if we're in this routine, we are inside a filesystem's - * locking model, so we don't ever want to wait for pages that have - * list_req_pending == TRUE since it means that the - * page is a candidate for some type of I/O operation, - * but that it has not yet been gathered into a UPL... - * this implies that it is still outside the domain - * of the filesystem and that whoever is responsible for - * grabbing it into a UPL may be stuck behind the filesystem - * lock this thread owns, or trying to take a lock exclusively - * and waiting for the readers to drain from a rw lock... - * if we block in those cases, we will deadlock - */ - if (dst_page->list_req_pending) { - - if (dst_page->absent) { - /* - * this is the list_req_pending | absent | busy case - * which originates from vm_fault_page...
we want - * to fall out of the fast path and go back - * to the caller which will gather this page - * into a UPL and issue the I/O if no one - * else beats us to it - */ - break; - } - if (dst_page->pageout || dst_page->cleaning) { - /* - * this is the list_req_pending | pageout | busy case - * or the list_req_pending | cleaning case... - * which originate from the pageout_scan and - * msync worlds for the pageout case and the hibernate - * pre-cleaning world for the cleaning case... - * we need to reset the state of this page to indicate - * it should stay in the cache marked dirty... nothing else we - * can do at this point... we can't block on it, we can't busy - * it and we can't clean it from this routine. - */ - vm_page_lockspin_queues(); - - vm_pageout_queue_steal(dst_page, TRUE); - vm_page_deactivate(dst_page); - - vm_page_unlock_queues(); - } - /* - * this is the list_req_pending | cleaning case... - * we can go ahead and deal with this page since - * its ok for us to mark this page busy... if a UPL - * tries to gather this page, it will block until the - * busy is cleared, thus allowing us safe use of the page - * when we're done with it, we will clear busy and wake - * up anyone waiting on it, thus allowing the UPL creation - * to finish - */ - } else if (dst_page->busy || dst_page->cleaning) { + if (dst_page->busy || dst_page->cleaning) { /* * someone else is playing with the page... if we've * already collected pages into this run, go ahead @@ -476,7 +421,11 @@ memory_object_control_uiomove( PAGE_SLEEP(object, dst_page, THREAD_UNINT); continue; } - + if (dst_page->laundry) { + dst_page->pageout = FALSE; + + vm_pageout_steal_laundry(dst_page, FALSE); + } /* * this routine is only called when copying * to/from real files... no need to consider @@ -485,7 +434,7 @@ memory_object_control_uiomove( assert(!dst_page->encrypted); if (mark_dirty) { - dst_page->dirty = TRUE; + SET_PAGE_DIRTY(dst_page, FALSE); if (dst_page->cs_validated && !dst_page->cs_tainted) { /* diff --git a/osfmk/vm/cpm.h b/osfmk/vm/cpm.h index 9233e644f..4d4b968d2 100644 --- a/osfmk/vm/cpm.h +++ b/osfmk/vm/cpm.h @@ -41,7 +41,6 @@ * Contiguous physical memory allocator. */ -#include #include #include @@ -56,14 +55,4 @@ extern kern_return_t cpm_allocate(vm_size_t size, vm_page_t *list, ppnum_t max_pnum, ppnum_t pnum_mask, boolean_t wire, int flags); -/* - * CPM-specific event counters. - */ -#define VM_CPM_COUNTERS (MACH_KDB && MACH_COUNTERS && VM_CPM) -#if VM_CPM_COUNTERS -#define cpm_counter(foo) foo -#else /* VM_CPM_COUNTERS */ -#define cpm_counter(foo) -#endif /* VM_CPM_COUNTERS */ - #endif /* _VM_CPM_H_ */ diff --git a/osfmk/vm/default_freezer.c b/osfmk/vm/default_freezer.c index dd8197d7d..c6e9c8ef6 100644 --- a/osfmk/vm/default_freezer.c +++ b/osfmk/vm/default_freezer.c @@ -28,13 +28,30 @@ #if CONFIG_FREEZE -#include "default_freezer.h" +#ifndef CONFIG_MEMORYSTATUS +#error "CONFIG_FREEZE defined without matching CONFIG_MEMORYSTATUS" +#endif + +#include /* * Indicates that a page has been faulted back in. */ #define FREEZER_OFFSET_ABSENT ((vm_object_offset_t)(-1)) +lck_grp_attr_t default_freezer_handle_lck_grp_attr; +lck_grp_t default_freezer_handle_lck_grp; + +void +default_freezer_init(void) +{ + lck_grp_attr_setdefault(&default_freezer_handle_lck_grp_attr); + lck_grp_init(&default_freezer_handle_lck_grp, "default_freezer_handle", + &default_freezer_handle_lck_grp_attr); + +} + + /* * Create the mapping table that will * tell us the object/offset pair that @@ -42,7 +59,7 @@ * out or being brought back in. 
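The hunks above replace direct "dst_page->dirty = TRUE" stores with SET_PAGE_DIRTY(page, FALSE). The macro itself is defined in vm_page.h, which this patch does not show; the shape below is only an inference from the call sites here, and the handling of the boolean argument is likewise not visible in this excerpt.

#if 0 /* inferred shape only; the real macro lives in vm_page.h */
#define SET_PAGE_DIRTY(m, set_pmap_modified)	\
	MACRO_BEGIN				\
	vm_page_t __page__ = (m);		\
	__page__->dirty = TRUE;			\
	MACRO_END
#endif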
*/ -void* +default_freezer_mapping_table_t default_freezer_mapping_create(vm_object_t object, vm_offset_t offset) { default_freezer_mapping_table_t table; @@ -57,13 +74,18 @@ default_freezer_mapping_create(vm_object_t object, vm_offset_t offset) table->object = object; table->offset = offset; - return (void*)table; + return table; } +/* + * Table modifications/lookup are done behind + * the compact_object lock. + */ + void -default_freezer_mapping_free(void **table, boolean_t all) +default_freezer_mapping_free(default_freezer_mapping_table_t *table_p, boolean_t all) { - default_freezer_mapping_table_t freezer_table = *((default_freezer_mapping_table_t *)table); + default_freezer_mapping_table_t freezer_table = *table_p; assert(freezer_table); if (all) { @@ -79,31 +101,33 @@ default_freezer_mapping_free(void **table, boolean_t all) kern_return_t default_freezer_mapping_store( - default_freezer_mapping_table_t *table, + default_freezer_mapping_table_t table, memory_object_offset_t table_offset, memory_object_t memory_object, memory_object_offset_t offset) { default_freezer_mapping_table_entry_t entry; uint32_t index; - - assert(*table); - - if ((*table)->index >= MAX_FREEZE_TABLE_ENTRIES) { - vm_object_t compact_object = (*table)->object; + + assert(table); + + while (table->next) { + table = table->next; + } + + if (table->index >= MAX_FREEZE_TABLE_ENTRIES) { + vm_object_t compact_object = table->object; default_freezer_mapping_table_t next; next = default_freezer_mapping_create(compact_object, table_offset); if (!next) { return KERN_FAILURE; } - - (*table)->next = next; - *table = next; + table->next = next; } - index = (*table)->index++; - entry = &(*table)->entry[index]; + index = (table)->index++; + entry = &(table)->entry[index]; entry->memory_object = memory_object; entry->offset = offset; @@ -165,15 +189,17 @@ default_freezer_mapping_update( return kr; } + + /* * Create a freezer memory object for this - * vm object. + * vm object. This will be one of the vm + * objects that will pack the compact object. */ void default_freezer_memory_object_create( - vm_object_t object, - vm_object_t compact_object, - default_freezer_mapping_table_t table) + vm_object_t object, + default_freezer_handle_t df_handle) { default_freezer_memory_object_t fo = NULL; @@ -189,9 +215,10 @@ default_freezer_memory_object_create( assert (control != MEMORY_OBJECT_CONTROL_NULL); df_memory_object_init((memory_object_t)fo, control, 0); - fo->fo_compact_object = compact_object; - fo->fo_table = table; - + fo->fo_df_handle = df_handle; + + default_freezer_handle_reference_locked(fo->fo_df_handle); + object->pager = (memory_object_t)fo; object->pager_created = TRUE; object->pager_initialized = TRUE; @@ -203,53 +230,110 @@ default_freezer_memory_object_create( } } +kern_return_t +default_freezer_pack( + unsigned int *purgeable_count, + unsigned int *wired_count, + unsigned int *clean_count, + unsigned int *dirty_count, + unsigned int dirty_budget, + boolean_t *shared, + vm_object_t src_object, + default_freezer_handle_t df_handle) +{ + kern_return_t kr = KERN_SUCCESS; + + if (df_handle) { + default_freezer_handle_lock(df_handle); + } + + kr = vm_object_pack(purgeable_count, wired_count, clean_count, dirty_count, dirty_budget, shared, src_object, df_handle); + + if (df_handle) { + default_freezer_handle_unlock(df_handle); + } + + return kr; +} + +/* + * Called with freezer_handle locked. + * default_freezer_pack locks the handle, calls + * vm_object_pack which, in turn, will call + * default_freezer_pack_page(). 
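As default_freezer_mapping_store() above shows, a mapping "table" is really a chain of fixed-size tables linked through ->next and grown whenever index reaches MAX_FREEZE_TABLE_ENTRIES. A hypothetical visitor over every stored (memory_object, offset) pair, illustrative only and not part of the patch:

#if 0 /* illustrative sketch only */
static void
sketch_mapping_walk(default_freezer_mapping_table_t table)
{
	uint32_t i;

	for (; table != NULL; table = table->next) {
		for (i = 0; i < table->index; i++) {
			default_freezer_mapping_table_entry_t entry = &table->entry[i];

			if (entry->memory_object == MEMORY_OBJECT_NULL)
				continue;	/* cleared when its pager was deallocated */
			/* ... consume entry->memory_object / entry->offset ... */
		}
	}
}
#endif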
+ */ void default_freezer_pack_page( vm_page_t p, - vm_object_t compact_object, - vm_object_offset_t offset, - void **table) + default_freezer_handle_t df_handle) { - default_freezer_mapping_table_t *freeze_table = (default_freezer_mapping_table_t *)table; - memory_object_t memory_object = p->object->pager; - + default_freezer_mapping_table_t freeze_table = NULL; + memory_object_t memory_object = NULL; + vm_object_t compact_object = VM_OBJECT_NULL; + + assert(df_handle); + + compact_object = df_handle->dfh_compact_object; + + assert(compact_object); + + freeze_table = df_handle->dfh_table; + memory_object = p->object->pager; + if (memory_object == NULL) { - default_freezer_memory_object_create(p->object, compact_object, *freeze_table); + default_freezer_memory_object_create(p->object, df_handle); memory_object = p->object->pager; } else { - default_freezer_memory_object_t fo = (default_freezer_memory_object_t)memory_object; - if (fo->fo_compact_object == VM_OBJECT_NULL) { - fo->fo_compact_object = compact_object; - fo->fo_table = *freeze_table; - } + assert(df_handle == ((default_freezer_memory_object_t)memory_object)->fo_df_handle); } - - default_freezer_mapping_store(freeze_table, offset, memory_object, p->offset + p->object->paging_offset); - /* Remove from the original and insert into the compact destination object */ - vm_page_rename(p, compact_object, offset, FALSE); + vm_object_lock(compact_object); + default_freezer_mapping_store(freeze_table, df_handle->dfh_compact_offset, memory_object, p->offset + p->object->paging_offset); + vm_page_rename(p, compact_object, df_handle->dfh_compact_offset, FALSE); + vm_object_unlock(compact_object); + + df_handle->dfh_compact_offset += PAGE_SIZE; } void default_freezer_unpack( - vm_object_t object, - void **table) + default_freezer_handle_t df_handle) { - vm_page_t p = VM_PAGE_NULL; - uint32_t index = 0; - vm_object_t src_object = VM_OBJECT_NULL; - memory_object_t src_mem_object = MEMORY_OBJECT_NULL; - memory_object_offset_t src_offset = 0; - vm_object_offset_t compact_offset = 0; - default_freezer_memory_object_t fo = NULL; - default_freezer_memory_object_t last_memory_object_thawed = NULL; - default_freezer_mapping_table_t freeze_table = *(default_freezer_mapping_table_t *)table; - - assert(freeze_table); + vm_page_t compact_page = VM_PAGE_NULL, src_page = VM_PAGE_NULL; + uint32_t index = 0; + vm_object_t src_object = VM_OBJECT_NULL; + vm_object_t compact_object = VM_OBJECT_NULL; + memory_object_t src_mem_object = MEMORY_OBJECT_NULL; + memory_object_offset_t src_offset = 0; + vm_object_offset_t compact_offset = 0; + default_freezer_memory_object_t fo = NULL; + default_freezer_mapping_table_t freeze_table = NULL; + boolean_t should_unlock_handle = FALSE; + + assert(df_handle); + + default_freezer_handle_lock(df_handle); + should_unlock_handle = TRUE; + + freeze_table = df_handle->dfh_table; + compact_object = df_handle->dfh_compact_object; + + assert(compact_object); + assert(compact_object->alive); + assert(!compact_object->terminating); + assert(compact_object->pager_ready); - vm_object_lock(object); + /* Bring the pages back in */ + if (vm_object_pagein(compact_object) != KERN_SUCCESS) { + if (should_unlock_handle) { + default_freezer_handle_unlock(df_handle); + } + return; + } + + vm_object_lock(compact_object); for (index = 0, compact_offset = 0; ; index++, compact_offset += PAGE_SIZE){ if (index >= freeze_table->index) { @@ -258,8 +342,8 @@ default_freezer_unpack( table_next = freeze_table->next; /* Free the tables as we go along */ - 
default_freezer_mapping_free((void**)&freeze_table, FALSE); - + default_freezer_mapping_free(&freeze_table, FALSE); + if (table_next == NULL){ break; } @@ -281,8 +365,8 @@ default_freezer_unpack( src_offset = freeze_table->entry[index].offset; if (src_offset != FREEZER_OFFSET_ABSENT) { - p = vm_page_lookup(object, compact_offset); - assert(p); + compact_page = vm_page_lookup(compact_object, compact_offset); + assert(compact_page); fo = (default_freezer_memory_object_t)src_mem_object; @@ -290,39 +374,37 @@ default_freezer_unpack( /* Move back over from the freeze object to the original */ vm_object_lock(src_object); - vm_page_rename(p, src_object, src_offset - src_object->paging_offset, FALSE); + src_page = vm_page_lookup(src_object, src_offset - src_object->paging_offset); + if (src_page != VM_PAGE_NULL){ + /* + * We might be racing with a VM fault. + * So handle that gracefully. + */ + assert(src_page->absent == TRUE); + VM_PAGE_FREE(src_page); + } + vm_page_rename(compact_page, src_object, src_offset - src_object->paging_offset, FALSE); vm_object_unlock(src_object); } - if (src_mem_object != ((memory_object_t)last_memory_object_thawed)){ - if (last_memory_object_thawed != NULL){ - last_memory_object_thawed->fo_compact_object = VM_OBJECT_NULL; - last_memory_object_thawed->fo_table = NULL; - } - last_memory_object_thawed = (default_freezer_memory_object_t)src_mem_object; - } } - if (last_memory_object_thawed != NULL){ - last_memory_object_thawed->fo_compact_object = VM_OBJECT_NULL; - last_memory_object_thawed->fo_table = NULL; - } + vm_object_unlock(compact_object); - vm_object_unlock(object); -} - -vm_object_t -default_freezer_get_compact_vm_object(void** table) -{ - default_freezer_mapping_table_t freeze_table = *((default_freezer_mapping_table_t *)table); - assert(freeze_table); - return ((vm_object_t)(freeze_table->object)); + vm_object_deallocate(compact_object); + + if (should_unlock_handle) { + df_handle->dfh_table = NULL; + df_handle->dfh_compact_object = VM_OBJECT_NULL; + df_handle->dfh_compact_offset = 0; + default_freezer_handle_unlock(df_handle); + } } void df_memory_object_reference(__unused memory_object_t mem_obj) { - + /* No-op */ } @@ -331,52 +413,65 @@ df_memory_object_deallocate(memory_object_t mem_obj) { default_freezer_memory_object_t fo = (default_freezer_memory_object_t)mem_obj; - vm_object_t compact_object = fo->fo_compact_object; - + assert(fo); - if (compact_object != VM_OBJECT_NULL) { + if (fo->fo_df_handle != NULL) { - default_freezer_mapping_table_t fo_table = fo->fo_table; + default_freezer_mapping_table_t table = NULL; default_freezer_mapping_table_entry_t entry; boolean_t found = FALSE; uint32_t index = 0; + vm_object_t compact_object = VM_OBJECT_NULL; - vm_object_lock(compact_object); - - /* Remove from table */ - while (1) { - if (index >= fo_table->index) { - if (fo_table->next) { - fo_table = fo_table->next; - index = 0; - } else { - /* End of tables */ - break; + default_freezer_handle_lock(fo->fo_df_handle); + + compact_object = fo->fo_df_handle->dfh_compact_object; + table = fo->fo_df_handle->dfh_table; + + if (compact_object == VM_OBJECT_NULL || table == NULL) { + /*Nothing to do. 
A thaw must have cleared it all out.*/ + } else { + vm_object_lock(compact_object); + + /* Remove from table */ + while (1) { + if (index >= table->index) { + if (table->next) { + table = table->next; + index = 0; + } else { + /* End of tables */ + break; + } } - } - entry = &fo_table->entry[index]; - if (mem_obj == entry->memory_object) { - /* It matches, so clear the entry */ - if (!found) { - found = TRUE; - } - entry->memory_object = MEMORY_OBJECT_NULL; - entry->offset = 0; - } else if (MEMORY_OBJECT_NULL != entry->memory_object) { - /* We have a different valid object; we're done */ - if (found) { - break; + entry = &table->entry[index]; + if (mem_obj == entry->memory_object) { + /* It matches, so clear the entry */ + if (!found) { + found = TRUE; + } + entry->memory_object = MEMORY_OBJECT_NULL; + entry->offset = 0; + } else if (MEMORY_OBJECT_NULL != entry->memory_object) { + /* We have a different valid object; we're done */ + if (found) { + break; + } } + + index++; } - index++; + vm_object_unlock(compact_object); } - - vm_object_unlock(compact_object); + + if (default_freezer_handle_deallocate_locked(fo->fo_df_handle)) { + default_freezer_handle_unlock(fo->fo_df_handle); + } } - + kfree(fo, sizeof(*fo)); } @@ -407,6 +502,7 @@ df_memory_object_terminate(memory_object_t mem_obj) return KERN_SUCCESS; } + kern_return_t df_memory_object_data_request( memory_object_t mem_obj, @@ -420,29 +516,44 @@ df_memory_object_data_request( memory_object_offset_t compact_offset = 0; memory_object_t pager = NULL; kern_return_t kr = KERN_SUCCESS; + boolean_t drop_object_ref = FALSE; default_freezer_memory_object_t fo = (default_freezer_memory_object_t)mem_obj; + default_freezer_handle_t df_handle = NULL; - src_object = memory_object_control_to_vm_object(fo->fo_pager_control); - compact_object = fo->fo_compact_object; - - if (compact_object != VM_OBJECT_NULL) { + df_handle = fo->fo_df_handle; + + if (df_handle == NULL) { + kr = KERN_FAILURE; + } else { + default_freezer_handle_lock(df_handle); - vm_object_lock(compact_object); + src_object = memory_object_control_to_vm_object(fo->fo_pager_control); + compact_object = fo->fo_df_handle->dfh_compact_object; - kr = default_freezer_mapping_update(fo->fo_table, - mem_obj, - offset, - &compact_offset, - FALSE); - - vm_object_unlock(compact_object); - } else { - kr = KERN_FAILURE; + if (compact_object == NULL) { + kr = KERN_FAILURE; + } else { + vm_object_lock(compact_object); + vm_object_reference_locked(compact_object); + drop_object_ref = TRUE; + + kr = default_freezer_mapping_update(fo->fo_df_handle->dfh_table, + mem_obj, + offset, + &compact_offset, + FALSE); + vm_object_unlock(compact_object); + } + default_freezer_handle_unlock(df_handle); } + if (length == 0){ /*Caller is just querying to see if we have the page*/ + if (drop_object_ref) { + vm_object_deallocate(compact_object); + } return kr; } @@ -466,30 +577,38 @@ df_memory_object_data_request( PAGE_SIZE, PAGE_SIZE, &upl, NULL, &page_list_count, request_flags); + upl_range_needed(upl, 0, 1); upl_abort(upl, UPL_ABORT_UNAVAILABLE); upl_deallocate(upl); + if (drop_object_ref) { + vm_object_deallocate(compact_object); + } + return KERN_SUCCESS; } - vm_object_lock(compact_object); + assert(compact_object->alive); + assert(!compact_object->terminating); + assert(compact_object->pager_ready); - pager = (memory_object_t)compact_object->pager; + vm_object_lock(compact_object); - if (!compact_object->pager_ready || pager == MEMORY_OBJECT_NULL){ - vm_object_unlock(compact_object); - return KERN_FAILURE; - } - 
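Note the zero-length convention visible above: a memory_object_data_request() of length 0 is an existence probe, and no data actually moves. The copy-object push added to vm_fault.c later in this patch issues the same probe; a self-contained sketch of that pattern (the function name is hypothetical):

#if 0 /* illustrative sketch only */
static kern_return_t
sketch_pager_has_page(memory_object_t pager, vm_object_t object,
		      memory_object_offset_t offset)
{
	/* KERN_SUCCESS means the pager already holds this page. */
	return memory_object_data_request(pager,
					  offset + object->paging_offset,
					  0,		/* query only */
					  VM_PROT_READ,
					  NULL);
}
#endif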
vm_object_paging_wait(compact_object, THREAD_UNINT); vm_object_paging_begin(compact_object); compact_object->blocked_access = TRUE; + pager = (memory_object_t)compact_object->pager; + vm_object_unlock(compact_object); ((vm_object_fault_info_t) fault_info)->io_sync = TRUE; + /* + * We have a reference on both the default_freezer + * memory object handle and the compact object. + */ kr = dp_memory_object_data_request(pager, compact_offset, length, @@ -497,7 +616,7 @@ df_memory_object_data_request( fault_info); if (kr == KERN_SUCCESS){ - vm_page_t src_page = VM_PAGE_NULL, dst_page = VM_PAGE_NULL; + vm_page_t compact_page = VM_PAGE_NULL, dst_page = VM_PAGE_NULL; vm_object_lock(compact_object); @@ -506,31 +625,42 @@ df_memory_object_data_request( vm_object_lock(src_object); - if ((src_page = vm_page_lookup(compact_object, compact_offset)) != VM_PAGE_NULL){ + if ((compact_page = vm_page_lookup(compact_object, compact_offset)) != VM_PAGE_NULL){ dst_page = vm_page_lookup(src_object, offset - src_object->paging_offset); - VM_PAGE_FREE(dst_page); - vm_page_rename(src_page, src_object, offset - src_object->paging_offset, FALSE); - - if (default_freezer_mapping_update(fo->fo_table, - mem_obj, - offset, - NULL, - TRUE) != KERN_SUCCESS) { - printf("Page for object: 0x%lx at offset: 0x%lx not found in table\n", (uintptr_t)src_object, (uintptr_t)offset); + if (!dst_page->absent){ + /* + * Someone raced us here and unpacked + * the object behind us. + * So cleanup before we return. + */ + VM_PAGE_FREE(compact_page); + } else { + VM_PAGE_FREE(dst_page); + vm_page_rename(compact_page, src_object, offset - src_object->paging_offset, FALSE); + + if (default_freezer_mapping_update(fo->fo_df_handle->dfh_table, + mem_obj, + offset, + NULL, + TRUE) != KERN_SUCCESS) { + printf("Page for object: 0x%lx at offset: 0x%lx not found in table\n", (uintptr_t)src_object, (uintptr_t)offset); + } + + PAGE_WAKEUP_DONE(compact_page); } - - PAGE_WAKEUP_DONE(src_page); } else { printf("%d: default_freezer: compact_object doesn't have the page for object 0x%lx at offset 0x%lx \n", kr, (uintptr_t)compact_object, (uintptr_t)compact_offset); - kr = KERN_FAILURE; + kr = KERN_SUCCESS; } vm_object_unlock(src_object); vm_object_unlock(compact_object); + vm_object_deallocate(compact_object); } else { panic("%d: default_freezer TOC pointed us to default_pager incorrectly\n", kr); } + return kr; } @@ -613,4 +743,111 @@ df_memory_object_data_reclaim( panic("df_memory_object_data_reclaim\n"); return KERN_SUCCESS; } + + +/* + * The freezer handle is used to make sure that + * we don't race against the lookup and termination + * of the compact object. + */ + +void +default_freezer_handle_lock(default_freezer_handle_t df_handle) { + lck_rw_lock_exclusive(&df_handle->dfh_lck); +} + +void +default_freezer_handle_unlock(default_freezer_handle_t df_handle) { + lck_rw_done(&df_handle->dfh_lck); +} + +default_freezer_handle_t +default_freezer_handle_allocate(void) +{ + + default_freezer_handle_t df_handle = NULL; + df_handle = kalloc(sizeof(struct default_freezer_handle)); + + if (df_handle) { + memset(df_handle, 0, sizeof(struct default_freezer_handle)); + lck_rw_init(&df_handle->dfh_lck, &default_freezer_handle_lck_grp, NULL); + /* No one knows of this handle yet so no need to lock it. 
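Taken together, the handle routines in this hunk and the ones that immediately follow give the freezer a single reference-counted rendezvous object. A hypothetical end-to-end caller is sketched below; the real drivers of this API live in the task-freeze path, outside this excerpt, and the dirty_budget value here is an assumption.

#if 0 /* illustrative sketch only */
static void
sketch_freeze_and_thaw(vm_object_t src_object)
{
	unsigned int purgeable, wired, clean, dirty;
	boolean_t shared;
	default_freezer_handle_t dfh;

	dfh = default_freezer_handle_allocate();	/* returns holding one reference */
	if (default_freezer_handle_init(dfh) != KERN_SUCCESS) {
		default_freezer_handle_deallocate(dfh);
		return;
	}

	/* Move src_object's resident pages into the handle's compact object... */
	(void)default_freezer_pack(&purgeable, &wired, &clean, &dirty,
				   ~0u /* dirty_budget: assumed unlimited */,
				   &shared, src_object, dfh);
	/* ...then push the compact object out to the default pager. */
	default_freezer_pageout(dfh);

	/* On thaw: bring everything back, then drop the reference. */
	default_freezer_unpack(dfh);
	default_freezer_handle_deallocate(dfh);
}
#endif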
*/ + default_freezer_handle_reference_locked(df_handle); + } else { + panic("Failed to allocate default_freezer_handle structure\n"); + } + return df_handle; +} + +kern_return_t +default_freezer_handle_init( + default_freezer_handle_t df_handle) +{ + kern_return_t kr = KERN_SUCCESS; + vm_object_t compact_object = VM_OBJECT_NULL; + + if (df_handle == NULL || df_handle->dfh_table != NULL) { + kr = KERN_FAILURE; + } else { + /* Create our compact object */ + compact_object = vm_object_allocate((vm_map_offset_t)(VM_MAX_ADDRESS) - (vm_map_offset_t)(VM_MIN_ADDRESS)); + if (!compact_object) { + kr = KERN_FAILURE; + } else { + df_handle->dfh_compact_object = compact_object; + df_handle->dfh_compact_offset = 0; + df_handle->dfh_table = default_freezer_mapping_create(df_handle->dfh_compact_object, df_handle->dfh_compact_offset); + if (!df_handle->dfh_table) { + kr = KERN_FAILURE; + } + } + } + + return kr; +} + +void +default_freezer_handle_reference_locked( + default_freezer_handle_t df_handle) +{ + assert(df_handle); + df_handle->dfh_ref_count++; +} + +void +default_freezer_handle_deallocate( + default_freezer_handle_t df_handle) +{ + assert(df_handle); + default_freezer_handle_lock(df_handle); + if (default_freezer_handle_deallocate_locked(df_handle)) { + default_freezer_handle_unlock(df_handle); + } +} + +boolean_t +default_freezer_handle_deallocate_locked( + default_freezer_handle_t df_handle) +{ + boolean_t should_unlock = TRUE; + + assert(df_handle); + df_handle->dfh_ref_count--; + if (df_handle->dfh_ref_count == 0) { + lck_rw_destroy(&df_handle->dfh_lck, &default_freezer_handle_lck_grp); + kfree(df_handle, sizeof(struct default_freezer_handle)); + should_unlock = FALSE; + } + return should_unlock; +} + +void +default_freezer_pageout( + default_freezer_handle_t df_handle) +{ + assert(df_handle); + + vm_object_pageout(df_handle->dfh_compact_object); +} + #endif /* CONFIG_FREEZE */ diff --git a/osfmk/vm/default_freezer.h b/osfmk/vm/default_freezer.h index 46730fd71..f08de63a5 100644 --- a/osfmk/vm/default_freezer.h +++ b/osfmk/vm/default_freezer.h @@ -26,11 +26,11 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#if CONFIG_FREEZE - #ifndef _DEFAULT_FREEZER_H_ #define _DEFAULT_FREEZER_H_ +#if CONFIG_FREEZE + #ifdef MACH_KERNEL #include @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -122,21 +123,35 @@ }; typedef struct default_freezer_mapping_table_entry *default_freezer_mapping_table_entry_t; +struct default_freezer_handle { + lck_rw_t dfh_lck; + uint32_t dfh_ref_count; + default_freezer_mapping_table_t dfh_table; + vm_object_t dfh_compact_object; + vm_object_offset_t dfh_compact_offset; +}; +typedef struct default_freezer_handle *default_freezer_handle_t; + struct default_freezer_memory_object{ struct ipc_object_header fo_pager_header; /* fake ip_kotype() */ memory_object_pager_ops_t fo_pager_ops; /* == &default_freezer_ops */ memory_object_control_t fo_pager_control; - vm_object_t fo_compact_object; - default_freezer_mapping_table_t fo_table; + default_freezer_handle_t fo_df_handle; }; typedef struct default_freezer_memory_object *default_freezer_memory_object_t; -__private_extern__ void* default_freezer_mapping_create(vm_object_t, vm_offset_t); +__private_extern__ void default_freezer_handle_lock(default_freezer_handle_t); +__private_extern__ void default_freezer_handle_unlock(default_freezer_handle_t); -__private_extern__ void default_freezer_mapping_free(void**, boolean_t all); +extern lck_grp_attr_t
default_freezer_handle_lck_grp_attr; +extern lck_grp_t default_freezer_handle_lck_grp; -__private_extern__ kern_return_t default_freezer_mapping_store( default_freezer_mapping_table_t *, +__private_extern__ default_freezer_mapping_table_t default_freezer_mapping_create(vm_object_t, vm_offset_t); + +__private_extern__ void default_freezer_mapping_free(default_freezer_mapping_table_t *table_p, boolean_t all); + +__private_extern__ kern_return_t default_freezer_mapping_store( default_freezer_mapping_table_t , memory_object_offset_t, memory_object_t, memory_object_offset_t ); @@ -147,14 +162,12 @@ __private_extern__ kern_return_t default_freezer_mapping_update( default_freezer memory_object_offset_t *, boolean_t ); -__private_extern__ void default_freezer_memory_object_create(vm_object_t, vm_object_t, default_freezer_mapping_table_t); - -__private_extern__ void default_freezer_pack_page(vm_page_t, vm_object_t, vm_object_offset_t, void**); +__private_extern__ void default_freezer_handle_reference_locked(default_freezer_handle_t); -__private_extern__ void default_freezer_unpack(vm_object_t, void**); +__private_extern__ boolean_t default_freezer_handle_deallocate_locked(default_freezer_handle_t); -__private_extern__ vm_object_t default_freezer_get_compact_vm_object(void**); +__private_extern__ void default_freezer_memory_object_create(vm_object_t, default_freezer_handle_t); #endif /* MACH_KERNEL */ -#endif /* DEFAULT_FREEZER_H */ #endif /* CONFIG_FREEZE */ +#endif /* DEFAULT_FREEZER_H */ diff --git a/osfmk/vm/memory_object.c b/osfmk/vm/memory_object.c index de7baff29..67b69df41 100644 --- a/osfmk/vm/memory_object.c +++ b/osfmk/vm/memory_object.c @@ -176,57 +176,12 @@ memory_object_lock_page( m, should_return, should_flush, prot, 0); - if (m->busy || m->cleaning) { - if (m->list_req_pending && - should_return == MEMORY_OBJECT_RETURN_NONE && - should_flush == TRUE) { + if (m->busy || m->cleaning) + return (MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK); - if (m->absent) { - /* - * this is the list_req_pending | absent | busy case - * which originates from vm_fault_page. - * Combine that with should_flush == TRUE and we - * have a case where we need to toss the page from - * the object. - */ - if (!VM_PAGE_WIRED(m)) { - return (MEMORY_OBJECT_LOCK_RESULT_MUST_FREE); - } else { - return (MEMORY_OBJECT_LOCK_RESULT_DONE); - } - } - if (m->pageout || m->cleaning) { - /* - * if pageout is set, page was earmarked by vm_pageout_scan - * to be cleaned and stolen... if cleaning is set, we're - * pre-cleaning pages for a hibernate... - * in either case, we're going - * to take it back since we are being asked to - * flush the page w/o cleaning it (i.e. we don't - * care that it's dirty, we want it gone from - * the cache) and we don't want to stall - * waiting for it to be cleaned for 2 reasons... 
- * 1 - no use paging it out since we're probably - * shrinking the file at this point or we no - * longer care about the data in the page - * 2 - if we stall, we may casue a deadlock in - * the FS trying to acquire its locks - * on the VNOP_PAGEOUT path presuming that - * those locks are already held on the truncate - * path before calling through to this function - * - * so undo all of the state that vm_pageout_scan - * hung on this page - */ + if (m->laundry) + vm_pageout_steal_laundry(m, FALSE); - vm_pageout_queue_steal(m, FALSE); - PAGE_WAKEUP_DONE(m); - } else { - panic("list_req_pending on page %p without absent/pageout/cleaning set\n", m); - } - } else - return (MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK); - } /* * Don't worry about pages for which the kernel * does not have any data. @@ -262,8 +217,9 @@ memory_object_lock_page( * for the page to go from the clean to the dirty state * after we've made our decision */ - if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED) - m->dirty = TRUE; + if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(m, FALSE); + } } else { /* * If we are decreasing permission, do it now; @@ -650,12 +606,6 @@ vm_object_update_extent( data_cnt += PAGE_SIZE; next_offset = offset + PAGE_SIZE_64; - /* - * Clean - */ - m->list_req_pending = TRUE; - m->cleaning = TRUE; - /* * wired pages shouldn't be flushed and * since they aren't on any queue, @@ -667,10 +617,7 @@ vm_object_update_extent( /* * add additional state for the flush */ - m->busy = TRUE; m->pageout = TRUE; - - dwp->dw_mask |= DW_vm_page_wire; } /* * we use to remove the page from the queues at this @@ -858,6 +805,7 @@ vm_object_update( fault_info.io_sync = FALSE; fault_info.cs_bypass = FALSE; fault_info.mark_zf_absent = FALSE; + fault_info.batch_pmap_op = FALSE; vm_object_paging_begin(copy_object); @@ -1793,7 +1741,6 @@ host_default_memory_manager( thread_wakeup((event_t) &memory_manager_default); -#ifndef CONFIG_FREEZE /* * Now that we have a default pager for anonymous memory, * reactivate all the throttled pages (i.e. dirty pages with @@ -1803,7 +1750,6 @@ host_default_memory_manager( { vm_page_reactivate_all_throttled(); } -#endif } out: lck_mtx_unlock(&memory_manager_default_lock); diff --git a/osfmk/vm/pmap.h b/osfmk/vm/pmap.h index 76d7cb305..26c26d6c0 100644 --- a/osfmk/vm/pmap.h +++ b/osfmk/vm/pmap.h @@ -187,6 +187,7 @@ extern void pmap_virtual_space( * Routines to manage the physical map data structure. */ extern pmap_t pmap_create( /* Create a pmap_t. 
*/ + ledger_t ledger, vm_map_size_t size, #ifdef __i386__ boolean_t is_64bit); @@ -204,6 +205,7 @@ extern void pmap_enter( /* Enter a mapping */ vm_map_offset_t v, ppnum_t pn, vm_prot_t prot, + vm_prot_t fault_type, unsigned int flags, boolean_t wired); @@ -212,6 +214,7 @@ extern kern_return_t pmap_enter_options( vm_map_offset_t v, ppnum_t pn, vm_prot_t prot, + vm_prot_t fault_type, unsigned int flags, boolean_t wired, unsigned int options); @@ -374,24 +377,25 @@ extern kern_return_t (pmap_attribute)( /* Get/Set special memory /* * Macro to be used in place of pmap_enter() */ -#define PMAP_ENTER(pmap, virtual_address, page, protection, flags, wired) \ +#define PMAP_ENTER(pmap, virtual_address, page, protection, fault_type, flags, wired) \ MACRO_BEGIN \ pmap_t __pmap = (pmap); \ vm_page_t __page = (page); \ \ PMAP_ENTER_CHECK(__pmap, __page) \ - pmap_enter(__pmap, \ + pmap_enter(__pmap, \ (virtual_address), \ __page->phys_page, \ - (protection), \ + (protection), \ + (fault_type), \ (flags), \ (wired)); \ MACRO_END #endif /* !PMAP_ENTER */ #ifndef PMAP_ENTER_OPTIONS -#define PMAP_ENTER_OPTIONS(pmap, virtual_address, page, protection, \ - flags, wired, options, result) \ +#define PMAP_ENTER_OPTIONS(pmap, virtual_address, page, protection, fault_type, \ + flags, wired, options, result) \ MACRO_BEGIN \ pmap_t __pmap = (pmap); \ vm_page_t __page = (page); \ @@ -400,13 +404,41 @@ extern kern_return_t (pmap_attribute)( /* Get/Set special memory result = pmap_enter_options(__pmap, \ (virtual_address), \ __page->phys_page, \ - (protection), \ + (protection), \ + (fault_type), \ (flags), \ (wired), \ - options); \ + options); \ MACRO_END #endif /* !PMAP_ENTER_OPTIONS */ +#ifndef PMAP_SET_CACHE_ATTR +#define PMAP_SET_CACHE_ATTR(mem, object, cache_attr, batch_pmap_op) \ + MACRO_BEGIN \ + if (!batch_pmap_op) { \ + pmap_set_cache_attributes(mem->phys_page, cache_attr); \ + object->set_cache_attr = TRUE; \ + } \ + MACRO_END +#endif /* PMAP_SET_CACHE_ATTR */ + +#ifndef PMAP_BATCH_SET_CACHE_ATTR +#define PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, \ + cache_attr, num_pages, batch_pmap_op) \ + MACRO_BEGIN \ + if ((batch_pmap_op)) { \ + unsigned int __page_idx=0; \ + while (__page_idx < (num_pages)) { \ + pmap_set_cache_attributes( \ + user_page_list[__page_idx].phys_addr, \ + (cache_attr)); \ + __page_idx++; \ + } \ + (object)->set_cache_attr = TRUE; \ + } \ + MACRO_END +#endif /* PMAP_BATCH_SET_CACHE_ATTR */ + #define PMAP_ENTER_CHECK(pmap, page) \ { \ if ((pmap) != kernel_pmap) { \ @@ -494,10 +526,14 @@ extern pmap_t kernel_pmap; /* The kernel's map */ #define VM_WIMG_MASK 0xFF #define VM_MEM_SUPERPAGE 0x100 /* map a superpage instead of a base page */ +#define VM_MEM_STACK 0x200 #define PMAP_OPTIONS_NOWAIT 0x1 /* don't block, return * KERN_RESOURCE_SHORTAGE * instead */ +#define PMAP_OPTIONS_NOENTER 0x2 /* expand pmap if needed + * but don't enter mapping + */ #if !defined(__LP64__) extern vm_offset_t pmap_extract(pmap_t pmap, diff --git a/osfmk/vm/vm_apple_protect.c b/osfmk/vm/vm_apple_protect.c index ef46cfeca..a98fdbb3a 100644 --- a/osfmk/vm/vm_apple_protect.c +++ b/osfmk/vm/vm_apple_protect.c @@ -357,6 +357,7 @@ apple_protect_pager_data_request( fault_info.stealth = TRUE; fault_info.io_sync = FALSE; fault_info.mark_zf_absent = FALSE; + fault_info.batch_pmap_op = FALSE; interruptible = fault_info.interruptible; pager = apple_protect_pager_lookup(mem_obj); @@ -512,6 +513,7 @@ apple_protect_pager_data_request( kernel_mapping, src_page->phys_page, VM_PROT_READ, + VM_PROT_NONE, 0, TRUE); /* @@ 
-527,6 +529,7 @@ apple_protect_pager_data_request( kernel_mapping + PAGE_SIZE_64, dst_pnum, VM_PROT_READ | VM_PROT_WRITE, + VM_PROT_NONE, 0, TRUE); diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index ab281c928..6dc7767f5 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -64,7 +64,6 @@ #include #include -#include #include #include @@ -141,10 +140,6 @@ extern unsigned int dp_pages_free, dp_pages_reserve; extern int cs_debug; -#if MACH_KDB -extern struct db_watchpoint *db_watchpoint_list; -#endif /* MACH_KDB */ - boolean_t current_thread_aborted(void); /* Forward declarations of internal routines. */ @@ -173,6 +168,7 @@ extern void vm_fault_classify_init(void); #endif unsigned long vm_pmap_enter_blocked = 0; +unsigned long vm_pmap_enter_retried = 0; unsigned long vm_cs_validates = 0; unsigned long vm_cs_revalidates = 0; @@ -233,7 +229,7 @@ vm_fault_cleanup( register vm_page_t top_page) { vm_object_paging_end(object); - vm_object_unlock(object); + vm_object_unlock(object); if (top_page != VM_PAGE_NULL) { object = top_page->object; @@ -493,7 +489,7 @@ vm_fault_deactivate_behind( for (n = 0; n < max_pages_in_run; n++) { m = vm_page_lookup(object, offset + run_offset + (n * pg_offset)); - if (m && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) { + if (m && !m->laundry && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) { page_run[pages_in_run++] = m; pmap_clear_reference(m->phys_page); } @@ -698,6 +694,12 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) assert(!VM_PAGE_WIRED(m)); + /* + * can't be on the pageout queue since we don't + * have a pager to try and clean to + */ + assert(!m->pageout_queue); + VM_PAGE_QUEUES_REMOVE(m); queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); @@ -705,11 +707,6 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) vm_page_throttled_count++; vm_page_unlock_queues(); - } else { - if (current_thread()->t_page_creation_count > vm_page_creation_throttle) { - m->zero_fill = TRUE; - VM_ZF_COUNT_INCR(); - } } return (my_fault); } @@ -764,6 +761,7 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) * paging_in_progress reference. */ unsigned int vm_fault_page_blocked_access = 0; +unsigned int vm_fault_page_forced_retry = 0; vm_fault_return_t vm_fault_page( @@ -799,12 +797,16 @@ vm_fault_page( vm_object_t next_object; vm_object_t copy_object; boolean_t look_for_page; + boolean_t force_fault_retry = FALSE; vm_prot_t access_required = fault_type; vm_prot_t wants_copy_flag; CLUSTER_STAT(int pages_at_higher_offsets;) CLUSTER_STAT(int pages_at_lower_offsets;) kern_return_t wait_result; boolean_t interruptible_state; + boolean_t data_already_requested = FALSE; + vm_behavior_t orig_behavior; + vm_size_t orig_cluster_size; vm_fault_return_t error; int my_fault; uint32_t try_failed_count; @@ -866,25 +868,6 @@ vm_fault_page( dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */ #endif - -#if MACH_KDB - /* - * If there are watchpoints set, then - * we don't want to give away write permission - * on a read fault. Make the task write fault, - * so that the watchpoint code notices the access. - */ - if (db_watchpoint_list) { - /* - * If we aren't asking for write permission, - * then don't give it away. We're using write - * faults to set the dirty bit. 
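The PMAP_SET_CACHE_ATTR / PMAP_BATCH_SET_CACHE_ATTR macros added in the pmap.h hunks above let a whole UPL's pages be retargeted in one walk instead of per-page calls, with the object's set_cache_attr flag updated once. A hypothetical call site, illustrative only (the function name is made up, and VM_WIMG_WCOMB stands in for whichever WIMG attribute the caller needs):

#if 0 /* illustrative sketch only */
static void
sketch_set_cache_attrs(vm_object_t object, vm_page_t mem,
		       upl_page_info_t *user_page_list, unsigned int num_pages)
{
	/* Path 1: a single page, pmap updated immediately. */
	PMAP_SET_CACHE_ATTR(mem, object, VM_WIMG_WCOMB, FALSE);

	/* Path 2: an entire page list, batched into one loop. */
	PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list,
				  VM_WIMG_WCOMB, num_pages, TRUE);
}
#endif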
- */ - if (!(fault_type & VM_PROT_WRITE)) - *protection &= ~VM_PROT_WRITE; - } -#endif /* MACH_KDB */ - interruptible = fault_info->interruptible; interruptible_state = thread_interrupt_level(interruptible); @@ -986,116 +969,35 @@ vm_fault_page( /* * The page is being brought in, * wait for it and then retry. - * - * A possible optimization: if the page - * is known to be resident, we can ignore - * pages that are absent (regardless of - * whether they're busy). */ #if TRACEFAULTPAGE dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ #endif - if (m->list_req_pending) { - /* - * "list_req_pending" means that the - * page has been marked for a page-in - * or page-out operation but hasn't been - * grabbed yet. - * Since whoever marked it - * "list_req_pending" might now be - * making its way through other layers - * of code and possibly blocked on locks - * that we might be holding, we can't - * just block on a "busy" and - * "list_req_pending" page or we might - * deadlock with that other thread. - * - * [ For pages backed by a file on an - * HFS volume, we might deadlock with - * the HFS truncate lock, for example: - * A: starts a pageout or pagein - * operation and marks a page "busy", - * "list_req_pending" and either - * "pageout", "cleaning" or "absent". - * A: makes its way through the - * memory object (vnode) code. - * B: starts from the memory object - * side, via a write() on a file, for - * example. - * B: grabs some filesystem locks. - * B: attempts to grab the same page for - * its I/O. - * B: blocks here because the page is - * "busy". - * A: attempts to grab the filesystem - * lock we're holding. - * And we have a deadlock... ] - * - * Since the page hasn't been claimed - * by the other thread yet, it's fair - * for us to grab here. - */ - if (m->absent) { - /* - * The page needs to be paged - * in. We can do it here but we - * need to get rid of "m", the - * place holder page inserted by - * another thread who is also - * trying to page it in. When - * that thread resumes, it will - * either wait for our page to - * arrive or it will find it - * already there. - */ - VM_PAGE_FREE(m); + wait_result = PAGE_SLEEP(object, m, interruptible); - /* - * Retry the fault. We'll find - * that the page is not resident - * and initiate a page-in again. - */ - continue; - } - if (m->pageout || m->cleaning) { - /* - * This page has been selected - * for a page-out but we want - * to bring it in. Let's just - * cancel the page-out... - */ - vm_pageout_queue_steal(m, FALSE); - /* - * ... and clear "busy" and - * wake up any waiters... - */ - PAGE_WAKEUP_DONE(m); - /* - * ... and continue with the - * "fault" handling. 
- */ - } - } else { - wait_result = PAGE_SLEEP(object, m, interruptible); - XPR(XPR_VM_FAULT, - "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n", - object, offset, - m, 0, 0); - counter(c_vm_fault_page_block_busy_kernel++); + XPR(XPR_VM_FAULT, + "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n", + object, offset, + m, 0, 0); + counter(c_vm_fault_page_block_busy_kernel++); - if (wait_result != THREAD_AWAKENED) { - vm_fault_cleanup(object, first_m); - thread_interrupt_level(interruptible_state); + if (wait_result != THREAD_AWAKENED) { + vm_fault_cleanup(object, first_m); + thread_interrupt_level(interruptible_state); - if (wait_result == THREAD_RESTART) - return (VM_FAULT_RETRY); - else - return (VM_FAULT_INTERRUPTED); - } - continue; + if (wait_result == THREAD_RESTART) + return (VM_FAULT_RETRY); + else + return (VM_FAULT_INTERRUPTED); } + continue; } + if (m->laundry) { + m->pageout = FALSE; + if (!m->cleaning) + vm_pageout_steal_laundry(m, FALSE); + } if (m->phys_page == vm_page_guard_addr) { /* * Guard page: off limits ! @@ -1253,7 +1155,10 @@ vm_fault_page( m->busy = TRUE; vm_page_lockspin_queues(); + + assert(!m->pageout_queue); VM_PAGE_QUEUES_REMOVE(m); + vm_page_unlock_queues(); } XPR(XPR_VM_FAULT, @@ -1348,7 +1253,8 @@ vm_fault_page( * the page in the speculative queue. */ vm_page_lockspin_queues(); - VM_PAGE_QUEUES_REMOVE(m); + if (m->speculative) + VM_PAGE_QUEUES_REMOVE(m); vm_page_unlock_queues(); } @@ -1416,14 +1322,17 @@ vm_fault_page( * this object can provide the data or we're the top object... * object is locked; m == NULL */ + if (must_be_resident) + goto dont_look_for_page; + look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply); #if TRACEFAULTPAGE dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */ #endif - if ((look_for_page || (object == first_object)) && !must_be_resident && !object->phys_contiguous) { + if (!look_for_page && object == first_object && !object->phys_contiguous) { /* - * Allocate a new page for this object/offset pair + * Allocate a new page for this object/offset pair as a placeholder */ m = vm_page_grab(); #if TRACEFAULTPAGE @@ -1436,9 +1345,14 @@ vm_fault_page( return (VM_FAULT_MEMORY_SHORTAGE); } - vm_page_insert(m, object, offset); + + if (fault_info && fault_info->batch_pmap_op == TRUE) { + vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE); + } else { + vm_page_insert(m, object, offset); + } } - if (look_for_page && !must_be_resident) { + if (look_for_page) { kern_return_t rc; /* @@ -1523,12 +1437,8 @@ vm_fault_page( } } if (m != VM_PAGE_NULL) { - /* - * Indicate that the page is waiting for data - * from the memory manager. - */ - m->list_req_pending = TRUE; - m->absent = TRUE; + VM_PAGE_FREE(m); + m = VM_PAGE_NULL; } #if TRACEFAULTPAGE @@ -1577,6 +1487,45 @@ vm_fault_page( object, offset, m, access_required | wants_copy_flag, 0); + if (object->copy == first_object) { + /* + * if we issue the memory_object_data_request in + * this state, we are subject to a deadlock with + * the underlying filesystem if it is trying to + * shrink the file resulting in a push of pages + * into the copy object... that push will stall + * on the placeholder page, and if the pushing thread + * is holding a lock that is required on the pagein + * path (such as a truncate lock), we'll deadlock... 
+ * to avoid this potential deadlock, we throw away + * our placeholder page before calling memory_object_data_request + * and force this thread to retry the vm_fault_page after + * we have issued the I/O. the second time through this path + * we will find the page already in the cache (presumably still + * busy waiting for the I/O to complete) and then complete + * the fault w/o having to go through memory_object_data_request again + */ + assert(first_m != VM_PAGE_NULL); + assert(first_m->object == first_object); + + vm_object_lock(first_object); + VM_PAGE_FREE(first_m); + vm_object_paging_end(first_object); + vm_object_unlock(first_object); + + first_m = VM_PAGE_NULL; + force_fault_retry = TRUE; + + vm_fault_page_forced_retry++; + } + + if (data_already_requested == TRUE) { + orig_behavior = fault_info->behavior; + orig_cluster_size = fault_info->cluster_size; + + fault_info->behavior = VM_BEHAVIOR_RANDOM; + fault_info->cluster_size = PAGE_SIZE; + } /* * Call the memory manager to retrieve the data. */ @@ -1587,6 +1536,12 @@ vm_fault_page( access_required | wants_copy_flag, (memory_object_fault_info_t)fault_info); + if (data_already_requested == TRUE) { + fault_info->behavior = orig_behavior; + fault_info->cluster_size = orig_cluster_size; + } else + data_already_requested = TRUE; + #if TRACEFAULTPAGE dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */ #endif @@ -1615,6 +1570,13 @@ vm_fault_page( return (VM_FAULT_INTERRUPTED); } + if (force_fault_retry == TRUE) { + + vm_fault_cleanup(object, first_m); + thread_interrupt_level(interruptible_state); + + return (VM_FAULT_RETRY); + } if (m == VM_PAGE_NULL && object->phys_contiguous) { /* * No page here means that the object we @@ -1646,7 +1608,7 @@ vm_fault_page( */ continue; } - +dont_look_for_page: /* * We get here if the object has no pager, or an existence map * exists and indicates the page isn't present on the pager @@ -1899,7 +1861,7 @@ vm_fault_page( */ assert(copy_m->busy); vm_page_insert(copy_m, object, offset); - copy_m->dirty = TRUE; + SET_PAGE_DIRTY(copy_m, TRUE); m = copy_m; /* @@ -2111,17 +2073,80 @@ vm_fault_page( vm_page_activate(copy_m); vm_page_unlock_queues(); - copy_m->dirty = TRUE; + SET_PAGE_DIRTY(copy_m, TRUE); PAGE_WAKEUP_DONE(copy_m); - } - else { + + } else if (copy_object->internal) { + /* + * For internal objects check with the pager to see + * if the page already exists in the backing store. + * If yes, then we can drop the copy page. If not, + * then we'll activate it, mark it dirty and keep it + * around. + */ + + kern_return_t kr = KERN_SUCCESS; + + memory_object_t copy_pager = copy_object->pager; + assert(copy_pager != MEMORY_OBJECT_NULL); + vm_object_paging_begin(copy_object); + + vm_object_unlock(copy_object); + + kr = memory_object_data_request( + copy_pager, + copy_offset + copy_object->paging_offset, + 0, /* Only query the pager. */ + VM_PROT_READ, + NULL); + + vm_object_lock(copy_object); + + vm_object_paging_end(copy_object); + + /* + * Since we dropped the copy_object's lock, + * check whether we'll have to deallocate + * the hard way. + */ + if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) { + vm_object_unlock(copy_object); + vm_object_deallocate(copy_object); + vm_object_lock(object); + + continue; + } + if (kr == KERN_SUCCESS) { + /* + * The pager has the page. We don't want to overwrite + * that page by sending this one out to the backing store. + * So we drop the copy page. 
+ */ + VM_PAGE_FREE(copy_m); + + } else { + /* + * The pager doesn't have the page. We'll keep this one + * around in the copy object. It might get sent out to + * the backing store under memory pressure. + */ + vm_page_lockspin_queues(); + assert(!m->cleaning); + vm_page_activate(copy_m); + vm_page_unlock_queues(); + + SET_PAGE_DIRTY(copy_m, TRUE); + PAGE_WAKEUP_DONE(copy_m); + } + } else { + assert(copy_m->busy == TRUE); assert(!m->cleaning); /* * dirty is protected by the object lock */ - copy_m->dirty = TRUE; + SET_PAGE_DIRTY(copy_m, TRUE); /* * The page is already ready for pageout: @@ -2159,6 +2184,7 @@ vm_fault_page( */ vm_object_lock(object); } + /* * Because we're pushing a page upward * in the object tree, we must restart @@ -2287,6 +2313,7 @@ vm_fault_enter(vm_page_t m, boolean_t change_wiring, boolean_t no_cache, boolean_t cs_bypass, + boolean_t *need_retry, int *type_of_fault) { kern_return_t kr, pe_result; @@ -2532,19 +2559,38 @@ vm_fault_enter(vm_page_t m, /* Prevent a deadlock by not * holding the object lock if we need to wait for a page in * pmap_enter() - */ - PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, 0, + PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0, wired, PMAP_OPTIONS_NOWAIT, pe_result); if(pe_result == KERN_RESOURCE_SHORTAGE) { + + if (need_retry) { + /* + * this will be non-null in the case where we hold the lock + * on the top-object in this chain... we can't just drop + * the lock on the object we're inserting the page into + * and recall the PMAP_ENTER since we can still cause + * a deadlock if one of the critical paths tries to + * acquire the lock on the top-object and we're blocked + * in PMAP_ENTER waiting for memory... our only recourse + * is to deal with it at a higher level where we can + * drop both locks. + */ + *need_retry = TRUE; + vm_pmap_enter_retried++; + goto after_the_pmap_enter; + } /* The nonblocking version of pmap_enter did not succeed. - * Use the blocking version instead. Requires marking + * and we don't need to drop other locks and retry + * at the level above us, so + * use the blocking version instead. Requires marking * the page busy and unlocking the object */ boolean_t was_busy = m->busy; m->busy = TRUE; vm_object_unlock(m->object); - PMAP_ENTER(pmap, vaddr, m, prot, 0, wired); - + PMAP_ENTER(pmap, vaddr, m, prot, fault_type, 0, wired); + /* Take the object lock again. */ vm_object_lock(m->object); @@ -2582,7 +2628,7 @@ after_the_pmap_enter: vm_page_deactivate(m); vm_page_unlock_queues(); } else { - if (((!m->active && !m->inactive) || no_cache) && !VM_PAGE_WIRED(m) && !m->throttled) { + if (((!m->active && !m->inactive) || m->clean_queue || no_cache) && !VM_PAGE_WIRED(m) && !m->throttled) { if ( vm_page_local_q && !no_cache && (*type_of_fault == DBG_COW_FAULT || *type_of_fault == DBG_ZERO_FILL_FAULT) ) { struct vpl *lq; @@ -2632,27 +2678,35 @@ after_the_pmap_enter: /* * test again now that we hold the page queue lock */ - if (((!m->active && !m->inactive) || no_cache) && !VM_PAGE_WIRED(m)) { + if (!VM_PAGE_WIRED(m)) { + if (m->clean_queue) { + VM_PAGE_QUEUES_REMOVE(m); - /* - * If this is a no_cache mapping and the page has never been - * mapped before or was previously a no_cache page, then we - * want to leave pages in the speculative state so that they - * can be readily recycled if free memory runs low. Otherwise - * the page is activated as normal. 
- */ + vm_pageout_cleaned_reactivated++; + vm_pageout_cleaned_fault_reactivated++; + } - if (no_cache && (!previously_pmapped || m->no_cache)) { - m->no_cache = TRUE; + if ((!m->active && !m->inactive) || no_cache) { + /* + * If this is a no_cache mapping and the page has never been + * mapped before or was previously a no_cache page, then we + * want to leave pages in the speculative state so that they + * can be readily recycled if free memory runs low. Otherwise + * the page is activated as normal. + */ - if (!m->speculative) - vm_page_speculate(m, FALSE); + if (no_cache && (!previously_pmapped || m->no_cache)) { + m->no_cache = TRUE; - } else if (!m->active && !m->inactive) - vm_page_activate(m); + if (!m->speculative) + vm_page_speculate(m, FALSE); - } + } else if (!m->active && !m->inactive) { + vm_page_activate(m); + } + } + } vm_page_unlock_queues(); } } @@ -2714,13 +2768,15 @@ vm_fault( vm_prot_t original_fault_type; struct vm_object_fault_info fault_info; boolean_t need_collapse = FALSE; + boolean_t need_retry = FALSE; int object_lock_type = 0; int cur_object_lock_type; vm_object_t top_object = VM_OBJECT_NULL; int throttle_delay; - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START, + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START, (int)((uint64_t)vaddr >> 32), (int)vaddr, (map == kernel_map), @@ -2728,7 +2784,8 @@ vm_fault( 0); if (get_preemption_level() != 0) { - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, (int)((uint64_t)vaddr >> 32), (int)vaddr, KERN_FAILURE, @@ -2782,6 +2839,7 @@ RetryFault: fault_info.stealth = FALSE; fault_info.io_sync = FALSE; fault_info.mark_zf_absent = FALSE; + fault_info.batch_pmap_op = FALSE; /* * If the page is wired, we must fault for the current protection @@ -2941,6 +2999,45 @@ RetryFault: kr = KERN_ABORTED; goto done; } + if (m->laundry) { + if (object != cur_object) { + if (cur_object_lock_type == OBJECT_LOCK_SHARED) { + cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; + + vm_object_unlock(object); + vm_object_unlock(cur_object); + + vm_map_unlock_read(map); + if (real_map != map) + vm_map_unlock(real_map); + + goto RetryFault; + } + + } else if (object_lock_type == OBJECT_LOCK_SHARED) { + + object_lock_type = OBJECT_LOCK_EXCLUSIVE; + + if (vm_object_lock_upgrade(object) == FALSE) { + /* + * couldn't upgrade, so explicitly take the lock + * exclusively and go relookup the page since we + * will have dropped the object lock and + * a different thread could have inserted + * a page at this offset + * no need for a full retry since we're + * at the top level of the object chain + */ + vm_object_lock(object); + + continue; + } + } + m->pageout = FALSE; + + vm_pageout_steal_laundry(m, FALSE); + } + if (m->phys_page == vm_page_guard_addr) { /* * Guard page: let the slow path deal with it @@ -3166,6 +3263,7 @@ FastPmapEnter: change_wiring, fault_info.no_cache, fault_info.cs_bypass, + (top_object != VM_OBJECT_NULL ? &need_retry : NULL), &type_of_fault); } else { kr = vm_fault_enter(m, @@ -3177,6 +3275,7 @@ FastPmapEnter: change_wiring, fault_info.no_cache, fault_info.cs_bypass, + (top_object != VM_OBJECT_NULL ?
&need_retry : NULL), &type_of_fault); } @@ -3197,7 +3296,8 @@ FastPmapEnter: if (need_collapse == TRUE) vm_object_collapse(object, offset, TRUE); - if (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT) { + if (need_retry == FALSE && + (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) { /* * evaluate access pattern and update state * vm_fault_deactivate_behind depends on the @@ -3219,6 +3319,20 @@ FastPmapEnter: if (real_map != map) vm_map_unlock(real_map); + if (need_retry == TRUE) { + /* + * vm_fault_enter couldn't complete the PMAP_ENTER... + * at this point we don't hold any locks so it's safe + * to ask the pmap layer to expand the page table to + * accommodate this mapping... once expanded, we'll + * re-drive the fault which should result in vm_fault_enter + * being able to successfully enter the mapping this time around + */ + (void)pmap_enter_options(pmap, vaddr, 0, 0, 0, 0, 0, PMAP_OPTIONS_NOENTER); + + need_retry = FALSE; + goto RetryFault; + } goto done; } /* @@ -3307,7 +3421,7 @@ FastPmapEnter: */ vm_page_copy(cur_m, m); vm_page_insert(m, object, offset); - m->dirty = TRUE; + SET_PAGE_DIRTY(m, FALSE); /* * Now cope with the source page and object @@ -3779,6 +3893,7 @@ handle_copy_delay: change_wiring, fault_info.no_cache, fault_info.cs_bypass, + NULL, &type_of_fault); } else { kr = vm_fault_enter(m, @@ -3790,6 +3905,7 @@ handle_copy_delay: change_wiring, fault_info.no_cache, fault_info.cs_bypass, + NULL, &type_of_fault); } if (kr != KERN_SUCCESS) { @@ -3926,7 +4042,8 @@ handle_copy_delay: done: thread_interrupt_level(interruptible_state); - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, (int)((uint64_t)vaddr >> 32), (int)vaddr, kr, @@ -4041,6 +4158,7 @@ vm_fault_unwire( fault_info.io_sync = FALSE; fault_info.cs_bypass = FALSE; fault_info.mark_zf_absent = FALSE; + fault_info.batch_pmap_op = FALSE; /* * Since the pages are wired down, we must be able to @@ -4318,6 +4436,7 @@ vm_fault_wire_fast( FALSE, FALSE, FALSE, + NULL, &type_of_fault); done: @@ -4453,6 +4572,7 @@ vm_fault_copy( fault_info_src.io_sync = FALSE; fault_info_src.cs_bypass = FALSE; fault_info_src.mark_zf_absent = FALSE; + fault_info_src.batch_pmap_op = FALSE; fault_info_dst.interruptible = interruptible; fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL; @@ -4464,6 +4584,7 @@ vm_fault_copy( fault_info_dst.io_sync = FALSE; fault_info_dst.cs_bypass = FALSE; fault_info_dst.mark_zf_absent = FALSE; + fault_info_dst.batch_pmap_op = FALSE; do { /* while (amount_left > 0) */ /* @@ -4689,7 +4810,7 @@ vm_fault_copy( (vm_size_t)part_size); if(!dst_page->dirty){ vm_object_lock(dst_object); - dst_page->dirty = TRUE; + SET_PAGE_DIRTY(dst_page, TRUE); vm_object_unlock(dst_page->object); } @@ -4700,10 +4821,13 @@ vm_fault_copy( if (result_page == VM_PAGE_NULL) vm_page_zero_fill(dst_page); else{ + vm_object_lock(result_page->object); vm_page_copy(result_page, dst_page); + vm_object_unlock(result_page->object); + if(!dst_page->dirty){ vm_object_lock(dst_object); - dst_page->dirty = TRUE; + SET_PAGE_DIRTY(dst_page, TRUE); vm_object_unlock(dst_page->object); } } @@ -4892,6 +5016,7 @@ vm_page_validate_cs_mapped( /* verify the SHA1 hash for this page */ validated = cs_validate_page(blobs, + pager, offset + object->paging_offset, (const void *)kaddr, &tainted); diff --git a/osfmk/vm/vm_fault.h 
b/osfmk/vm/vm_fault.h index 6d90a84b0..878d140f1 100644 --- a/osfmk/vm/vm_fault.h +++ b/osfmk/vm/vm_fault.h @@ -164,6 +164,7 @@ extern kern_return_t vm_fault_enter( boolean_t change_wiring, boolean_t no_cache, boolean_t cs_bypass, + boolean_t *need_retry, int *type_of_fault); #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/vm/vm_init.c b/osfmk/vm/vm_init.c index cf29c82f6..59af43c26 100644 --- a/osfmk/vm/vm_init.c +++ b/osfmk/vm/vm_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -64,6 +64,7 @@ */ #include +#include #include #include #include @@ -88,8 +89,12 @@ const vm_offset_t vm_min_kernel_address = VM_MIN_KERNEL_AND_KEXT_ADDRESS; const vm_offset_t vm_max_kernel_address = VM_MAX_KERNEL_ADDRESS; boolean_t vm_kernel_ready = FALSE; +boolean_t kmem_ready = FALSE; boolean_t zlog_ready = FALSE; +vm_offset_t kmapoff_kaddr; +unsigned int kmapoff_pgcnt; + /* * vm_mem_bootstrap initializes the virtual memory system. * This is done only by the first cpu up. */ @@ -107,7 +112,7 @@ vm_mem_bootstrap(void) * From here on, all physical memory is accounted for, * and we use only virtual addresses. */ -#define vm_mem_bootstrap_kprintf(x) +#define vm_mem_bootstrap_kprintf(x) /* kprintf(x) */ vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling vm_page_bootstrap\n")); vm_page_bootstrap(&start, &end); @@ -124,11 +129,25 @@ vm_mem_bootstrap(void) vm_kernel_ready = TRUE; - vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling vm_map_int\n")); + vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling vm_map_init\n")); vm_map_init(); vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling kmem_init\n")); kmem_init(start, end); + kmem_ready = TRUE; + /* + * Eat a random amount of kernel_map to fuzz subsequent heap, zone and + * stack addresses. (With a 4K page and 9 bits of randomness, this + * eats at most 2M of VA from the map.) + */ + if (!PE_parse_boot_argn("kmapoff", &kmapoff_pgcnt, + sizeof (kmapoff_pgcnt))) + kmapoff_pgcnt = early_random() & 0x1ff; /* 9 bits */ + + if (kmapoff_pgcnt > 0 && + vm_allocate(kernel_map, &kmapoff_kaddr, + kmapoff_pgcnt * PAGE_SIZE_64, VM_FLAGS_ANYWHERE) != KERN_SUCCESS) + panic("cannot vm_allocate %u kernel_map pages", kmapoff_pgcnt); vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling pmap_init\n")); pmap_init(); @@ -158,21 +177,27 @@ vm_mem_bootstrap(void) vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling zone_init\n")); assert((vm_size_t) zsize == zsize); - zone_init((vm_size_t) zsize); /* Allocate address space for zones */ - + zone_init((vm_size_t) zsize); /* Allocate address space for zones */ + + /* The vm_page_zone must be created prior to kalloc_init; that + * routine can trigger zalloc()s (e.g. for mutex statistic structure + * initialization). The vm_page_zone must exist to satisfy fictitious + * page allocations (which are used for guard pages by the guard + * mode zone allocator). 
+ */ + vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling vm_page_module_init\n")); + vm_page_module_init(); + vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling kalloc_init\n")); kalloc_init(); vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling vm_fault_init\n")); vm_fault_init(); - vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling vm_page_module_init\n")); - vm_page_module_init(); - vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling memory_manager_default_init\n")); memory_manager_default_init(); - vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling meory_object_control_bootstrap\n")); + vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling memory_object_control_bootstrap\n")); memory_object_control_bootstrap(); vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling device_pager_bootstrap\n")); diff --git a/osfmk/vm/vm_kern.c b/osfmk/vm/vm_kern.c index acd7d2a82..e35f70daf 100644 --- a/osfmk/vm/vm_kern.c +++ b/osfmk/vm/vm_kern.c @@ -126,15 +126,16 @@ kmem_alloc_contig( if (map == VM_MAP_NULL || (flags & ~(KMA_KOBJECT | KMA_LOMEM | KMA_NOPAGEWAIT))) return KERN_INVALID_ARGUMENT; + + map_size = vm_map_round_page(size); + map_mask = (vm_map_offset_t)mask; - if (size == 0) { + /* Check for zero allocation size (either directly or via overflow) */ + if (map_size == 0) { *addrp = 0; return KERN_INVALID_ARGUMENT; } - map_size = vm_map_round_page(size); - map_mask = (vm_map_offset_t)mask; - /* * Allocate a new object (if necessary) and the reference we * will be donating to the map entry. We must do this before @@ -244,19 +245,21 @@ kernel_memory_allocate( int wired_page_count = 0; int i; int vm_alloc_flags; + vm_prot_t kma_prot; if (! vm_kernel_ready) { panic("kernel_memory_allocate: VM is not ready"); } - if (size == 0) { - *addrp = 0; - return KERN_INVALID_ARGUMENT; - } map_size = vm_map_round_page(size); map_mask = (vm_map_offset_t) mask; vm_alloc_flags = 0; + /* Check for zero allocation size (either directly or via overflow) */ + if (map_size == 0) { + *addrp = 0; + return KERN_INVALID_ARGUMENT; + } /* * limit the size of a single extent of wired memory @@ -406,6 +409,9 @@ kernel_memory_allocate( mem->busy = FALSE; pg_offset += PAGE_SIZE_64; } + + kma_prot = VM_PROT_READ | VM_PROT_WRITE; + for (pg_offset = fill_start; pg_offset < fill_start + fill_size; pg_offset += PAGE_SIZE_64) { if (wired_page_list == NULL) panic("kernel_memory_allocate: wired_page_list == NULL"); @@ -422,7 +428,7 @@ kernel_memory_allocate( mem->wpmapped = TRUE; PMAP_ENTER(kernel_pmap, map_addr + pg_offset, mem, - VM_PROT_READ | VM_PROT_WRITE, 0, TRUE); + kma_prot, VM_PROT_NONE, ((flags & KMA_KSTACK) ? 
VM_MEM_STACK : 0), TRUE); if (flags & KMA_NOENCRYPT) { bzero(CAST_DOWN(void *, (map_addr + pg_offset)), PAGE_SIZE); @@ -812,7 +818,7 @@ kmem_remap_pages( mem->pmapped = TRUE; mem->wpmapped = TRUE; - PMAP_ENTER(kernel_pmap, map_start, mem, protection, 0, TRUE); + PMAP_ENTER(kernel_pmap, map_start, mem, protection, VM_PROT_NONE, 0, TRUE); map_start += PAGE_SIZE; offset += PAGE_SIZE; diff --git a/osfmk/vm/vm_kern.h b/osfmk/vm/vm_kern.h index 1c03bac0c..c4c8696d2 100644 --- a/osfmk/vm/vm_kern.h +++ b/osfmk/vm/vm_kern.h @@ -88,6 +88,7 @@ extern kern_return_t kernel_memory_allocate( #define KMA_GUARD_LAST 0x20 #define KMA_PERMANENT 0x40 #define KMA_NOENCRYPT 0x80 +#define KMA_KSTACK 0x100 extern kern_return_t kmem_alloc_contig( vm_map_t map, diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index 0ce07a4d9..178a1cae0 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -94,7 +94,6 @@ #include #include #include -#include #include #include @@ -106,6 +105,7 @@ #include #include +extern u_int32_t random(void); /* from */ /* Internal prototypes */ @@ -288,11 +288,6 @@ static kern_return_t vm_map_can_reuse( vm_map_offset_t start, vm_map_offset_t end); -#if CONFIG_FREEZE -struct default_freezer_table; -__private_extern__ void* default_freezer_mapping_create(vm_object_t, vm_offset_t); -__private_extern__ void default_freezer_mapping_free(void**, boolean_t all); -#endif /* * Macros to copy a vm_map_entry. We must be careful to correctly @@ -303,6 +298,7 @@ __private_extern__ void default_freezer_mapping_free(void**, boolean_t all); * wire count; it's used for map splitting and zone changing in * vm_map_copyout. */ + #define vm_map_entry_copy(NEW,OLD) \ MACRO_BEGIN \ boolean_t _vmec_reserved = (NEW)->from_reserved_zone; \ @@ -313,6 +309,7 @@ boolean_t _vmec_reserved = (NEW)->from_reserved_zone; \ (NEW)->wired_count = 0; \ (NEW)->user_wired_count = 0; \ (NEW)->permanent = FALSE; \ + (NEW)->used_for_jit = FALSE; \ (NEW)->from_reserved_zone = _vmec_reserved; \ MACRO_END @@ -623,6 +620,8 @@ vm_map_init( void) { vm_size_t entry_zone_alloc_size; + const char *mez_name = "VM map entries"; + vm_map_zone = zinit((vm_map_size_t) sizeof(struct _vm_map), 40*1024, PAGE_SIZE, "maps"); zone_change(vm_map_zone, Z_NOENCRYPT, TRUE); @@ -631,12 +630,12 @@ vm_map_init( #else entry_zone_alloc_size = PAGE_SIZE * 6; #endif - vm_map_entry_zone = zinit((vm_map_size_t) sizeof(struct vm_map_entry), 1024*1024, entry_zone_alloc_size, - "VM map entries"); + mez_name); zone_change(vm_map_entry_zone, Z_NOENCRYPT, TRUE); zone_change(vm_map_entry_zone, Z_NOCALLOUT, TRUE); + zone_change(vm_map_entry_zone, Z_GZALLOC_EXEMPT, TRUE); vm_map_entry_reserved_zone = zinit((vm_map_size_t) sizeof(struct vm_map_entry), kentry_data_size * 64, kentry_data_size, @@ -659,6 +658,7 @@ vm_map_init( zone_change(vm_map_entry_reserved_zone, Z_NOCALLOUT, TRUE); zone_change(vm_map_entry_reserved_zone, Z_CALLERACCT, FALSE); /* don't charge caller */ zone_change(vm_map_copy_zone, Z_CALLERACCT, FALSE); /* don't charge caller */ + zone_change(vm_map_entry_reserved_zone, Z_GZALLOC_EXEMPT, TRUE); zcram(vm_map_zone, (vm_offset_t)map_data, map_data_size); zcram(vm_map_entry_reserved_zone, (vm_offset_t)kentry_data, kentry_data_size); @@ -666,6 +666,10 @@ vm_map_init( lck_grp_attr_setdefault(&vm_map_lck_grp_attr); lck_grp_init(&vm_map_lck_grp, "vm_map", &vm_map_lck_grp_attr); lck_attr_setdefault(&vm_map_lck_attr); + +#if CONFIG_FREEZE + default_freezer_init(); +#endif /* CONFIG_FREEZE */ } void @@ -688,6 +692,15 @@ vm_map_steal_memory( #else kentry_initial_pages = 
6; #endif + +#if CONFIG_GZALLOC + /* If using the guard allocator, reserve more memory for the kernel + * reserved map entry pool. + */ + if (gzalloc_enabled()) + kentry_initial_pages *= 1024; +#endif + kentry_data_size = kentry_initial_pages * PAGE_SIZE; kentry_data = pmap_steal_memory(kentry_data_size); } @@ -737,7 +750,7 @@ vm_map_create( result->max_offset = max; result->wiring_required = FALSE; result->no_zero_fill = FALSE; - result->mapped = FALSE; + result->mapped_in_other_pmaps = FALSE; result->wait_for_space = FALSE; result->switch_protect = FALSE; result->disable_vmentry_reuse = FALSE; @@ -748,7 +761,7 @@ vm_map_create( result->color_rr = (color_seed++) & vm_color_mask; result->jit_entry_exists = FALSE; #if CONFIG_FREEZE - result->default_freezer_toc = NULL; + result->default_freezer_handle = NULL; #endif vm_map_lock_init(result); lck_mtx_init_ext(&result->s_lock, &result->s_lock_ext, &vm_map_lck_grp, &vm_map_lck_attr); @@ -798,7 +811,9 @@ _vm_map_entry_create( entry->from_reserved_zone = (zone == vm_map_entry_reserved_zone); vm_map_store_update( (vm_map_t) NULL, entry, VM_MAP_ENTRY_CREATE); - +#if MAP_ENTRY_CREATION_DEBUG + fastbacktrace(&entry->vme_bt[0], (sizeof(entry->vme_bt)/sizeof(uintptr_t))); +#endif return(entry); } @@ -812,7 +827,6 @@ _vm_map_entry_create( * of the stores */ #define vm_map_entry_dispose(map, entry) \ - vm_map_store_update( map, entry, VM_MAP_ENTRY_DELETE); \ _vm_map_entry_dispose(&(map)->hdr, (entry)) #define vm_map_copy_entry_dispose(map, entry) \ @@ -949,8 +963,9 @@ vm_map_destroy( flags, VM_MAP_NULL); #if CONFIG_FREEZE - if (map->default_freezer_toc){ - default_freezer_mapping_free( &(map->default_freezer_toc), TRUE); + if (map->default_freezer_handle) { + default_freezer_handle_deallocate(map->default_freezer_handle); + map->default_freezer_handle = NULL; } #endif vm_map_unlock(map); @@ -1321,6 +1336,8 @@ vm_map_find_space( new_entry->permanent = FALSE; new_entry->superpage_size = 0; + new_entry->used_for_jit = 0; + new_entry->alias = 0; new_entry->zero_wired_pages = FALSE; @@ -1400,7 +1417,7 @@ vm_map_pmap_enter( } type_of_fault = DBG_CACHE_HIT_FAULT; kr = vm_fault_enter(m, map->pmap, addr, protection, protection, - VM_PAGE_WIRED(m), FALSE, FALSE, FALSE, + VM_PAGE_WIRED(m), FALSE, FALSE, FALSE, NULL, &type_of_fault); vm_object_unlock(object); @@ -1445,6 +1462,58 @@ boolean_t vm_map_pmap_is_empty( #endif /* MACHINE_PMAP_IS_EMPTY */ } +#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000 +kern_return_t +vm_map_random_address_for_size( + vm_map_t map, + vm_map_offset_t *address, + vm_map_size_t size) +{ + kern_return_t kr = KERN_SUCCESS; + int tries = 0; + vm_map_offset_t random_addr = 0; + vm_map_offset_t hole_end; + + vm_map_entry_t next_entry = VM_MAP_ENTRY_NULL; + vm_map_entry_t prev_entry = VM_MAP_ENTRY_NULL; + vm_map_size_t vm_hole_size = 0; + vm_map_size_t addr_space_size; + + addr_space_size = vm_map_max(map) - vm_map_min(map); + + assert(page_aligned(size)); + + while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) { + random_addr = ((vm_map_offset_t)random()) << PAGE_SHIFT; + random_addr = trunc_page(vm_map_min(map) + + (random_addr % addr_space_size)); + + if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) { + if (prev_entry == vm_map_to_entry(map)) { + next_entry = vm_map_first_entry(map); + } else { + next_entry = prev_entry->vme_next; + } + if (next_entry == vm_map_to_entry(map)) { + hole_end = vm_map_max(map); + } else { + hole_end = next_entry->vme_start; + } + vm_hole_size = hole_end - random_addr; + if (vm_hole_size >= size) { + 
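/* illustrative sketch, assuming 4K pages: if random_addr == 0x2000 and the next entry begins at 0x7000, then vm_hole_size == 0x5000, so any page-aligned request of up to 0x5000 bytes fits at random_addr */ +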
*address = random_addr; + break; + } + } + tries++; + } + + if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) { + kr = KERN_NO_SPACE; + } + return kr; +} + /* * Routine: vm_map_enter * @@ -1489,6 +1558,7 @@ vm_map_enter( boolean_t no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0); boolean_t is_submap = ((flags & VM_FLAGS_SUBMAP) != 0); boolean_t permanent = ((flags & VM_FLAGS_PERMANENT) != 0); + boolean_t entry_for_jit = ((flags & VM_FLAGS_MAP_JIT) != 0); unsigned int superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT); char alias; vm_map_offset_t effective_min_offset, effective_max_offset; @@ -1522,7 +1592,7 @@ vm_map_enter( #if CONFIG_EMBEDDED if (cur_protection & VM_PROT_WRITE){ - if ((cur_protection & VM_PROT_EXECUTE) && !(flags & VM_FLAGS_MAP_JIT)){ + if ((cur_protection & VM_PROT_EXECUTE) && !entry_for_jit){ printf("EMBEDDED: %s curprot cannot be write+execute. turning off execute\n", __PRETTY_FUNCTION__); cur_protection &= ~VM_PROT_EXECUTE; } @@ -1616,11 +1686,22 @@ StartAgain: ; vm_map_lock(map); map_locked = TRUE; - if ((flags & VM_FLAGS_MAP_JIT) && (map->jit_entry_exists)){ - result = KERN_INVALID_ARGUMENT; - goto BailOut; + if (entry_for_jit) { + if (map->jit_entry_exists) { + result = KERN_INVALID_ARGUMENT; + goto BailOut; + } + /* + * Get a random start address. + */ + result = vm_map_random_address_for_size(map, address, size); + if (result != KERN_SUCCESS) { + goto BailOut; + } + start = *address; } + /* * Calculate the first possible address. */ @@ -1665,8 +1746,10 @@ StartAgain: ; if (entry == NULL) { vm_map_entry_t tmp_entry; - if (vm_map_lookup_entry(map, start, &tmp_entry)) + if (vm_map_lookup_entry(map, start, &tmp_entry)) { + assert(!entry_for_jit); start = tmp_entry->vme_end; + } entry = tmp_entry; } } @@ -1872,11 +1955,13 @@ StartAgain: ; * semantics. */ - if (purgable) { + if (purgable || entry_for_jit) { if (object == VM_OBJECT_NULL) { object = vm_object_allocate(size); object->copy_strategy = MEMORY_OBJECT_COPY_NONE; - object->purgable = VM_PURGABLE_NONVOLATILE; + if (purgable) { + object->purgable = VM_PURGABLE_NONVOLATILE; + } offset = (vm_object_offset_t)0; } } else if ((is_submap == FALSE) && @@ -1951,11 +2036,11 @@ StartAgain: ; FALSE, FALSE, cur_protection, max_protection, VM_BEHAVIOR_DEFAULT, - (flags & VM_FLAGS_MAP_JIT)? VM_INHERIT_NONE: inheritance, + (entry_for_jit)? VM_INHERIT_NONE: inheritance, 0, no_cache, permanent, superpage_size); new_entry->alias = alias; - if (flags & VM_FLAGS_MAP_JIT){ + if (entry_for_jit){ if (!(map->jit_entry_exists)){ new_entry->used_for_jit = TRUE; map->jit_entry_exists = TRUE; @@ -1973,8 +2058,10 @@ StartAgain: ; use_pmap = (alias == VM_MEMORY_SHARED_PMAP); #ifndef NO_NESTED_PMAP if (use_pmap && submap->pmap == NULL) { + ledger_t ledger = map->pmap->ledger; /* we need a sub pmap to nest... */ - submap->pmap = pmap_create(0, submap_is_64bit); + submap->pmap = pmap_create(ledger, 0, + submap_is_64bit); if (submap->pmap == NULL) { /* let's proceed without nesting... */ } @@ -2048,9 +2135,9 @@ StartAgain: ; */ if ((map->wiring_required)||(superpage_size)) { pmap_empty = FALSE; /* pmap won't be empty */ - result = vm_map_wire(map, start, end, + kr = vm_map_wire(map, start, end, new_entry->protection, TRUE); - RETURN(result); + RETURN(kr); } if ((object != VM_OBJECT_NULL) && @@ -2321,16 +2408,20 @@ vm_map_enter_mem_object( * once it's been set and if we race, we'll * just end up setting it twice, which is OK. 
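* (For instance, a submap nested into several task maps, each map with its own pmap: the submap's mappings must then be removed from every one of those pmaps on teardown, not just from the submap's own pmap.)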
*/ - if (submap->mapped == FALSE) { + if (submap->mapped_in_other_pmaps == FALSE && + vm_map_pmap(submap) != PMAP_NULL && + vm_map_pmap(submap) != + vm_map_pmap(target_map)) { /* - * This submap has never been mapped. - * Set its "mapped" flag now that it - * has been mapped. - * This happens only for the first ever - * mapping of a "submap". + * This submap is being mapped in a map + * that uses a different pmap. + * Set its "mapped_in_other_pmaps" flag + * to indicate that we now need to + * remove mappings from all pmaps rather + * than just the submap's pmap. */ vm_map_lock(submap); - submap->mapped = TRUE; + submap->mapped_in_other_pmaps = TRUE; vm_map_unlock(submap); } *address = map_addr; @@ -2370,6 +2461,8 @@ vm_map_enter_mem_object( wimg_mode = VM_WIMG_IO; } else if (access == MAP_MEM_COPYBACK) { wimg_mode = VM_WIMG_USE_DEFAULT; + } else if (access == MAP_MEM_INNERWBACK) { + wimg_mode = VM_WIMG_INNERWBACK; } else if (access == MAP_MEM_WTHRU) { wimg_mode = VM_WIMG_WTHRU; } else if (access == MAP_MEM_WCOMB) { @@ -2731,14 +2824,11 @@ vm_map_enter_cpm( kern_return_t kr; vm_map_offset_t va, start, end, offset; #if MACH_ASSERT - vm_map_offset_t prev_addr; + vm_map_offset_t prev_addr = 0; #endif /* MACH_ASSERT */ boolean_t anywhere = ((VM_FLAGS_ANYWHERE & flags) != 0); - if (!vm_allocate_cpm_enabled) - return KERN_FAILURE; - if (size == 0) { *addr = 0; return KERN_SUCCESS; @@ -2763,7 +2853,7 @@ vm_map_enter_cpm( cpm_obj = vm_object_allocate((vm_object_size_t)size); assert(cpm_obj != VM_OBJECT_NULL); assert(cpm_obj->internal); - assert(cpm_obj->size == (vm_object_size_t)size); + assert(cpm_obj->vo_size == (vm_object_size_t)size); assert(cpm_obj->can_persist == FALSE); assert(cpm_obj->pager_created == FALSE); assert(cpm_obj->pageout == FALSE); @@ -2873,7 +2963,7 @@ vm_map_enter_cpm( type_of_fault = DBG_ZERO_FILL_FAULT; vm_fault_enter(m, pmap, va, VM_PROT_ALL, VM_PROT_WRITE, - VM_PAGE_WIRED(m), FALSE, FALSE, FALSE, + VM_PAGE_WIRED(m), FALSE, FALSE, FALSE, NULL, &type_of_fault); vm_object_unlock(cpm_obj); @@ -2888,8 +2978,8 @@ vm_map_enter_cpm( m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset); vm_object_unlock(cpm_obj); if (m == VM_PAGE_NULL) - panic("vm_allocate_cpm: obj 0x%x off 0x%x no page", - cpm_obj, offset); + panic("vm_allocate_cpm: obj %p off 0x%llx no page", + cpm_obj, (uint64_t)offset); assert(m->tabled); assert(!m->busy); assert(!m->wanted); @@ -2898,15 +2988,15 @@ vm_map_enter_cpm( assert(!m->absent); assert(!m->error); assert(!m->cleaning); + assert(!m->laundry); assert(!m->precious); assert(!m->clustered); if (offset != 0) { if (m->phys_page != prev_addr + 1) { - printf("start 0x%x end 0x%x va 0x%x\n", - start, end, va); - printf("obj 0x%x off 0x%x\n", cpm_obj, offset); - printf("m 0x%x prev_address 0x%x\n", m, - prev_addr); + printf("start 0x%llx end 0x%llx va 0x%llx\n", + (uint64_t)start, (uint64_t)end, (uint64_t)va); + printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset); + printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr); panic("vm_allocate_cpm: pages not contig!"); } } @@ -2995,7 +3085,7 @@ vm_map_clip_unnest( pmap_unnest(map->pmap, entry->vme_start, entry->vme_end - entry->vme_start); - if ((map->mapped) && (map->ref_count)) { + if ((map->mapped_in_other_pmaps) && (map->ref_count)) { /* clean up parent map/maps */ vm_map_submap_pmap_clean( map, entry->vme_start, @@ -3004,6 +3094,9 @@ vm_map_clip_unnest( entry->offset); } entry->use_pmap = FALSE; + if (entry->alias == VM_MEMORY_SHARED_PMAP) { + entry->alias = VM_MEMORY_UNSHARED_PMAP; + } } #endif /* 
NO_NESTED_PMAP */ @@ -3327,13 +3420,27 @@ vm_map_submap( entry->is_sub_map = TRUE; entry->object.sub_map = submap; vm_map_reference(submap); - submap->mapped = TRUE; + if (submap->mapped_in_other_pmaps == FALSE && + vm_map_pmap(submap) != PMAP_NULL && + vm_map_pmap(submap) != vm_map_pmap(map)) { + /* + * This submap is being mapped in a map + * that uses a different pmap. + * Set its "mapped_in_other_pmaps" flag + * to indicate that we now need to + * remove mappings from all pmaps rather + * than just the submap's pmap. + */ + submap->mapped_in_other_pmaps = TRUE; + } #ifndef NO_NESTED_PMAP if (use_pmap) { /* nest if platform code will allow */ if(submap->pmap == NULL) { - submap->pmap = pmap_create((vm_map_size_t) 0, FALSE); + ledger_t ledger = map->pmap->ledger; + submap->pmap = pmap_create(ledger, + (vm_map_size_t) 0, FALSE); if(submap->pmap == PMAP_NULL) { vm_map_unlock(map); return(KERN_NO_SPACE); @@ -3948,11 +4055,11 @@ vm_map_wire_nested( s, user_wire); return(KERN_FAILURE); } + vm_object_unlock(object); if(real_map != lookup_map) vm_map_unlock(real_map); vm_map_unlock_read(lookup_map); vm_map_lock(map); - vm_object_unlock(object); /* we unlocked, so must re-lookup */ if (!vm_map_lookup_entry(map, @@ -4667,7 +4774,7 @@ vm_map_submap_pmap_clean( entry->offset); } else { - if((map->mapped) && (map->ref_count) + if((map->mapped_in_other_pmaps) && (map->ref_count) && (entry->object.vm_object != NULL)) { vm_object_pmap_protect( entry->object.vm_object, @@ -4700,7 +4807,7 @@ vm_map_submap_pmap_clean( entry->object.sub_map, entry->offset); } else { - if((map->mapped) && (map->ref_count) + if((map->mapped_in_other_pmaps) && (map->ref_count) && (entry->object.vm_object != NULL)) { vm_object_pmap_protect( entry->object.vm_object, @@ -5077,7 +5184,7 @@ vm_map_delete( (addr64_t)entry->vme_start, entry->vme_end - entry->vme_start); #endif /* NO_NESTED_PMAP */ - if ((map->mapped) && (map->ref_count)) { + if ((map->mapped_in_other_pmaps) && (map->ref_count)) { /* clean up parent map/maps */ vm_map_submap_pmap_clean( map, entry->vme_start, @@ -5093,7 +5200,7 @@ vm_map_delete( } } else if (entry->object.vm_object != kernel_object) { object = entry->object.vm_object; - if((map->mapped) && (map->ref_count)) { + if((map->mapped_in_other_pmaps) && (map->ref_count)) { vm_object_pmap_protect( object, entry->offset, entry->vme_end - entry->vme_start, @@ -6774,7 +6881,7 @@ vm_map_copy_overwrite_aligned( (addr64_t)entry->vme_start, entry->vme_end - entry->vme_start); #endif /* NO_NESTED_PMAP */ - if(dst_map->mapped) { + if(dst_map->mapped_in_other_pmaps) { /* clean up parent */ /* map/maps */ vm_map_submap_pmap_clean( @@ -6793,7 +6900,7 @@ vm_map_copy_overwrite_aligned( vm_map_deallocate( entry->object.sub_map); } else { - if(dst_map->mapped) { + if(dst_map->mapped_in_other_pmaps) { vm_object_pmap_protect( entry->object.vm_object, entry->offset, @@ -7377,7 +7484,7 @@ StartAgain: ; type_of_fault = DBG_CACHE_HIT_FAULT; vm_fault_enter(m, dst_map->pmap, va, prot, prot, - VM_PAGE_WIRED(m), FALSE, FALSE, FALSE, + VM_PAGE_WIRED(m), FALSE, FALSE, FALSE, NULL, &type_of_fault); vm_object_unlock(object); @@ -8212,7 +8319,7 @@ vm_map_fork_share( if (override_nx(old_map, old_entry->alias) && prot) prot |= VM_PROT_EXECUTE; - if (old_map->mapped) { + if (old_map->mapped_in_other_pmaps) { vm_object_pmap_protect( old_entry->object.vm_object, old_entry->offset, @@ -8374,6 +8481,7 @@ vm_map_fork_copy( */ vm_map_t vm_map_fork( + ledger_t ledger, vm_map_t old_map) { pmap_t new_pmap; @@ -8384,11 +8492,11 @@ vm_map_fork( 
boolean_t src_needs_copy; boolean_t new_entry_needs_copy; - new_pmap = pmap_create((vm_map_size_t) 0, + new_pmap = pmap_create(ledger, (vm_map_size_t) 0, #if defined(__i386__) || defined(__x86_64__) old_map->pmap->pm_task_map != TASK_MAP_32BIT #else - 0 +#error Unknown architecture. #endif ); #if defined(__i386__) @@ -8469,7 +8577,7 @@ vm_map_fork( (old_entry->vme_end - old_entry->vme_start), ((old_entry->is_shared - || old_map->mapped) + || old_map->mapped_in_other_pmaps) ? PMAP_NULL : old_map->pmap), old_entry->vme_start, @@ -8791,7 +8899,7 @@ submap_recurse: prot = submap_entry->protection & ~VM_PROT_WRITE; - if (override_nx(map, submap_entry->alias) && prot) + if (override_nx(old_map, submap_entry->alias) && prot) prot |= VM_PROT_EXECUTE; vm_object_pmap_protect( @@ -8800,7 +8908,7 @@ submap_recurse: submap_entry->vme_end - submap_entry->vme_start, (submap_entry->is_shared - || map->mapped) ? + || map->mapped_in_other_pmaps) ? PMAP_NULL : map->pmap, submap_entry->vme_start, prot); @@ -8912,7 +9020,7 @@ submap_recurse: prot = entry->protection; - if (override_nx(map, entry->alias) && prot) { + if (override_nx(old_map, entry->alias) && prot) { /* * HACK -- if not a stack, then allow execution */ @@ -9031,6 +9139,7 @@ submap_recurse: fault_info->io_sync = FALSE; fault_info->cs_bypass = (entry->used_for_jit)? TRUE : FALSE; fault_info->mark_zf_absent = FALSE; + fault_info->batch_pmap_op = FALSE; } /* @@ -10105,7 +10214,7 @@ vm_map_simplify_entry( (prev_entry->is_shared == FALSE) && (this_entry->is_shared == FALSE) ) { - _vm_map_store_entry_unlink(&map->hdr, prev_entry); + vm_map_store_entry_unlink(map, prev_entry); assert(prev_entry->vme_start < this_entry->vme_end); this_entry->vme_start = prev_entry->vme_start; this_entry->offset = prev_entry->offset; @@ -10451,6 +10560,7 @@ vm_map_willneed( fault_info.io_sync = FALSE; fault_info.cs_bypass = FALSE; fault_info.mark_zf_absent = FALSE; + fault_info.batch_pmap_op = FALSE; /* * The MADV_WILLNEED operation doesn't require any changes to the @@ -10616,7 +10726,22 @@ vm_map_entry_is_reusable( if (object == VM_OBJECT_NULL) { return TRUE; } - if (object->ref_count == 1 && + if ( +#if 0 + /* + * Let's proceed even if the VM object is potentially + * shared. + * We check for this later when processing the actual + * VM pages, so the contents will be safe if shared. + * + * But we can still mark this memory region as "reusable" to + * acknowledge that the caller did let us know that the memory + * could be re-used and should not be penalized for holding + * on to it. This allows its "resident size" to not include + * the reusable range. + */ + object->ref_count == 1 && +#endif object->wired_page_count == 0 && object->copy == VM_OBJECT_NULL && object->shadow == VM_OBJECT_NULL && @@ -10864,300 +10989,6 @@ vm_map_can_reuse( } - -#include -#if MACH_KDB -#include -#include - -#define printf db_printf - -/* - * Forward declarations for internal functions. 
- */ -extern void vm_map_links_print( - struct vm_map_links *links); - -extern void vm_map_header_print( - struct vm_map_header *header); - -extern void vm_map_entry_print( - vm_map_entry_t entry); - -extern void vm_follow_entry( - vm_map_entry_t entry); - -extern void vm_follow_map( - vm_map_t map); - -/* - * vm_map_links_print: [ debug ] - */ -void -vm_map_links_print( - struct vm_map_links *links) -{ - iprintf("prev = %08X next = %08X start = %016llX end = %016llX\n", - links->prev, - links->next, - (unsigned long long)links->start, - (unsigned long long)links->end); -} - -/* - * vm_map_header_print: [ debug ] - */ -void -vm_map_header_print( - struct vm_map_header *header) -{ - vm_map_links_print(&header->links); - iprintf("nentries = %08X, %sentries_pageable\n", - header->nentries, - (header->entries_pageable ? "" : "!")); -} - -/* - * vm_follow_entry: [ debug ] - */ -void -vm_follow_entry( - vm_map_entry_t entry) -{ - int shadows; - - iprintf("map entry %08X\n", entry); - - db_indent += 2; - - shadows = vm_follow_object(entry->object.vm_object); - iprintf("Total objects : %d\n",shadows); - - db_indent -= 2; -} - -/* - * vm_map_entry_print: [ debug ] - */ -void -vm_map_entry_print( - register vm_map_entry_t entry) -{ - static const char *inheritance_name[4] = - { "share", "copy", "none", "?"}; - static const char *behavior_name[4] = - { "dflt", "rand", "seqtl", "rseqntl" }; - - iprintf("map entry %08X - prev = %08X next = %08X\n", entry, entry->vme_prev, entry->vme_next); - - db_indent += 2; - - vm_map_links_print(&entry->links); - - iprintf("start = %016llX end = %016llX - prot=%x/%x/%s\n", - (unsigned long long)entry->vme_start, - (unsigned long long)entry->vme_end, - entry->protection, - entry->max_protection, - inheritance_name[(entry->inheritance & 0x3)]); - - iprintf("behavior = %s, wired_count = %d, user_wired_count = %d\n", - behavior_name[(entry->behavior & 0x3)], - entry->wired_count, - entry->user_wired_count); - iprintf("%sin_transition, %sneeds_wakeup\n", - (entry->in_transition ? "" : "!"), - (entry->needs_wakeup ? "" : "!")); - - if (entry->is_sub_map) { - iprintf("submap = %08X - offset = %016llX\n", - entry->object.sub_map, - (unsigned long long)entry->offset); - } else { - iprintf("object = %08X offset = %016llX - ", - entry->object.vm_object, - (unsigned long long)entry->offset); - printf("%sis_shared, %sneeds_copy\n", - (entry->is_shared ? "" : "!"), - (entry->needs_copy ? "" : "!")); - } - - db_indent -= 2; -} - -/* - * vm_follow_map: [ debug ] - */ -void -vm_follow_map( - vm_map_t map) -{ - register vm_map_entry_t entry; - - iprintf("task map %08X\n", map); - - db_indent += 2; - - for (entry = vm_map_first_entry(map); - entry && entry != vm_map_to_entry(map); - entry = entry->vme_next) { - vm_follow_entry(entry); - } - - db_indent -= 2; -} - -/* - * vm_map_print: [ debug ] - */ -void -vm_map_print( - db_addr_t inmap) -{ - register vm_map_entry_t entry; - vm_map_t map; -#if TASK_SWAPPER - char *swstate; -#endif /* TASK_SWAPPER */ - - map = (vm_map_t)(long) - inmap; /* Make sure we have the right type */ - - iprintf("task map %08X\n", map); - - db_indent += 2; - - vm_map_header_print(&map->hdr); - - iprintf("pmap = %08X size = %08X ref = %d hint = %08X first_free = %08X\n", - map->pmap, - map->size, - map->ref_count, - map->hint, - map->first_free); - - iprintf("%swait_for_space, %swiring_required, timestamp = %d\n", - (map->wait_for_space ? "" : "!"), - (map->wiring_required ? 
"" : "!"), - map->timestamp); - -#if TASK_SWAPPER - switch (map->sw_state) { - case MAP_SW_IN: - swstate = "SW_IN"; - break; - case MAP_SW_OUT: - swstate = "SW_OUT"; - break; - default: - swstate = "????"; - break; - } - iprintf("res = %d, sw_state = %s\n", map->res_count, swstate); -#endif /* TASK_SWAPPER */ - - for (entry = vm_map_first_entry(map); - entry && entry != vm_map_to_entry(map); - entry = entry->vme_next) { - vm_map_entry_print(entry); - } - - db_indent -= 2; -} - -/* - * Routine: vm_map_copy_print - * Purpose: - * Pretty-print a copy object for ddb. - */ - -void -vm_map_copy_print( - db_addr_t incopy) -{ - vm_map_copy_t copy; - vm_map_entry_t entry; - - copy = (vm_map_copy_t)(long) - incopy; /* Make sure we have the right type */ - - printf("copy object 0x%x\n", copy); - - db_indent += 2; - - iprintf("type=%d", copy->type); - switch (copy->type) { - case VM_MAP_COPY_ENTRY_LIST: - printf("[entry_list]"); - break; - - case VM_MAP_COPY_OBJECT: - printf("[object]"); - break; - - case VM_MAP_COPY_KERNEL_BUFFER: - printf("[kernel_buffer]"); - break; - - default: - printf("[bad type]"); - break; - } - printf(", offset=0x%llx", (unsigned long long)copy->offset); - printf(", size=0x%x\n", copy->size); - - switch (copy->type) { - case VM_MAP_COPY_ENTRY_LIST: - vm_map_header_print(©->cpy_hdr); - for (entry = vm_map_copy_first_entry(copy); - entry && entry != vm_map_copy_to_entry(copy); - entry = entry->vme_next) { - vm_map_entry_print(entry); - } - break; - - case VM_MAP_COPY_OBJECT: - iprintf("object=0x%x\n", copy->cpy_object); - break; - - case VM_MAP_COPY_KERNEL_BUFFER: - iprintf("kernel buffer=0x%x", copy->cpy_kdata); - printf(", kalloc_size=0x%x\n", copy->cpy_kalloc_size); - break; - - } - - db_indent -=2; -} - -/* - * db_vm_map_total_size(map) [ debug ] - * - * return the total virtual size (in bytes) of the map - */ -vm_map_size_t -db_vm_map_total_size( - db_addr_t inmap) -{ - vm_map_entry_t entry; - vm_map_size_t total; - vm_map_t map; - - map = (vm_map_t)(long) - inmap; /* Make sure we have the right type */ - - total = 0; - for (entry = vm_map_first_entry(map); - entry != vm_map_to_entry(map); - entry = entry->vme_next) { - total += entry->vme_end - entry->vme_start; - } - - return total; -} - -#endif /* MACH_KDB */ - /* * Routine: vm_map_entry_insert * @@ -11357,7 +11188,7 @@ vm_map_remap_extract( if (override_nx(map, src_entry->alias) && prot) prot |= VM_PROT_EXECUTE; - if(map->mapped) { + if(map->mapped_in_other_pmaps) { vm_object_pmap_protect( src_entry->object.vm_object, src_entry->offset, @@ -11405,6 +11236,14 @@ vm_map_remap_extract( */ RestartCopy: if (!copy) { + /* + * Cannot allow an entry describing a JIT + * region to be shared across address spaces. + */ + if (src_entry->used_for_jit == TRUE) { + result = KERN_INVALID_ARGUMENT; + break; + } src_entry->is_shared = TRUE; new_entry->is_shared = TRUE; if (!(new_entry->is_sub_map)) @@ -11440,7 +11279,7 @@ vm_map_remap_extract( offset, entry_size, ((src_entry->is_shared - || map->mapped) ? + || map->mapped_in_other_pmaps) ? 
PMAP_NULL : map->pmap), src_entry->vme_start, prot); @@ -11827,7 +11666,7 @@ StartAgain: ; */ zap_map = vm_map_create(PMAP_NULL, start, - end - start, + end, map->hdr.entries_pageable); if (zap_map == VM_MAP_NULL) { return KERN_RESOURCE_SHORTAGE; } @@ -12937,8 +12776,9 @@ vm_map_is_64bit( } boolean_t -vm_map_has_4GB_pagezero( - vm_map_t map) +vm_map_has_hard_pagezero( + vm_map_t map, + vm_map_offset_t pagezero_size) { /* * XXX FBDP @@ -12950,7 +12790,7 @@ vm_map_has_4GB_pagezero( * VM map is being torn down, and when a new map is created via * load_machfile()/execve(). */ - return (map->min_offset >= 0x100000000ULL); + return (map->min_offset >= pagezero_size); } void @@ -12974,6 +12814,38 @@ vm_map_clear_4GB_pagezero(vm_map_t map) #endif } +/* + * Raise a VM map's maximum offset. + */ +kern_return_t +vm_map_raise_max_offset( + vm_map_t map, + vm_map_offset_t new_max_offset) +{ + kern_return_t ret; + + vm_map_lock(map); + ret = KERN_INVALID_ADDRESS; + + if (new_max_offset >= map->max_offset) { + if (!vm_map_is_64bit(map)) { + if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) { + map->max_offset = new_max_offset; + ret = KERN_SUCCESS; + } + } else { + if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) { + map->max_offset = new_max_offset; + ret = KERN_SUCCESS; + } + } + } + + vm_map_unlock(map); + return ret; +} + + /* * Raise a VM map's minimum offset. * To strictly enforce "page zero" reservation. */ @@ -13120,7 +12992,7 @@ kern_return_t vm_map_sign(vm_map_t map, /* Pull the dirty status from the pmap, since we cleared the * wpmapped bit */ if ((refmod & VM_MEM_MODIFIED) && !m->dirty) { - m->dirty = TRUE; + SET_PAGE_DIRTY(m, FALSE); } /* On to the next page */ @@ -13140,6 +13012,7 @@ kern_return_t vm_map_freeze_walk( unsigned int *wired_count, unsigned int *clean_count, unsigned int *dirty_count, + unsigned int dirty_budget, boolean_t *has_shared) { vm_map_entry_t entry; @@ -13161,7 +13034,7 @@ kern_return_t vm_map_freeze_walk( continue; } - vm_object_pack(&purgeable, &wired, &clean, &dirty, &shared, entry->object.vm_object, VM_OBJECT_NULL, NULL, NULL); + default_freezer_pack(&purgeable, &wired, &clean, &dirty, dirty_budget, &shared, entry->object.vm_object, NULL); *purgeable_count += purgeable; *wired_count += wired; @@ -13171,6 +13044,14 @@ kern_return_t vm_map_freeze_walk( if (shared) { *has_shared = TRUE; } + + /* Adjust pageout budget and finish up if reached */ + if (dirty_budget) { + dirty_budget -= dirty; + if (dirty_budget == 0) { + break; + } + } } vm_map_unlock_read(map); @@ -13184,31 +13065,15 @@ kern_return_t vm_map_freeze( unsigned int *wired_count, unsigned int *clean_count, unsigned int *dirty_count, + unsigned int dirty_budget, boolean_t *has_shared) { vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL; - vm_object_t compact_object = VM_OBJECT_NULL; - vm_object_offset_t offset = 0x0; kern_return_t kr = KERN_SUCCESS; - void *default_freezer_toc = NULL; - boolean_t cleanup = FALSE; *purgeable_count = *wired_count = *clean_count = *dirty_count = 0; *has_shared = FALSE; - /* Create our compact object */ - compact_object = vm_object_allocate((vm_map_offset_t)(VM_MAX_ADDRESS) - (vm_map_offset_t)(VM_MIN_ADDRESS)); - if (!compact_object) { - kr = KERN_FAILURE; - goto done; - } - - default_freezer_toc = default_freezer_mapping_create(compact_object, offset); - if (!default_freezer_toc) { - kr = KERN_FAILURE; - goto done; - } - /* * We need the exclusive lock here so that we can * block any page faults or lookups while we are 
*/ vm_map_lock(map); - if (map->default_freezer_toc != NULL){ + if (map->default_freezer_handle == NULL) { + map->default_freezer_handle = default_freezer_handle_allocate(); + } + + if ((kr = default_freezer_handle_init(map->default_freezer_handle)) != KERN_SUCCESS) { /* - * This map has already been frozen. + * Can happen if the default_freezer_handle passed in is NULL, + * or if a table has already been allocated and associated + * with this handle, i.e. the map is already frozen. */ - cleanup = TRUE; - kr = KERN_SUCCESS; goto done; } - - /* Get a mapping in place for the freezing about to commence */ - map->default_freezer_toc = default_freezer_toc; - - vm_object_lock(compact_object); - + for (entry2 = vm_map_first_entry(map); entry2 != vm_map_to_entry(map); entry2 = entry2->vme_next) { @@ -13241,13 +13105,21 @@ kern_return_t vm_map_freeze( unsigned int purgeable, clean, dirty, wired; boolean_t shared; - vm_object_pack(&purgeable, &wired, &clean, &dirty, &shared, - src_object, compact_object, &default_freezer_toc, &offset); + default_freezer_pack(&purgeable, &wired, &clean, &dirty, dirty_budget, &shared, + src_object, map->default_freezer_handle); *purgeable_count += purgeable; *wired_count += wired; *clean_count += clean; *dirty_count += dirty; + + /* Adjust pageout budget and finish up if reached */ + if (dirty_budget) { + dirty_budget -= dirty; + if (dirty_budget == 0) { + break; + } + } if (shared) { *has_shared = TRUE; @@ -13255,61 +13127,36 @@ kern_return_t vm_map_freeze( } } - vm_object_unlock(compact_object); - /* Finally, throw out the pages to swap */ - vm_object_pageout(compact_object); + default_freezer_pageout(map->default_freezer_handle); done: vm_map_unlock(map); - - /* Unwind if there was a failure */ - if ((cleanup) || (KERN_SUCCESS != kr)) { - if (default_freezer_toc){ - default_freezer_mapping_free(&map->default_freezer_toc, TRUE); - } - if (compact_object){ - vm_object_deallocate(compact_object); - } - } return kr; } -__private_extern__ vm_object_t default_freezer_get_compact_vm_object( void** ); - -void +kern_return_t vm_map_thaw( vm_map_t map) { - void **default_freezer_toc; - vm_object_t compact_object; + kern_return_t kr = KERN_SUCCESS; vm_map_lock(map); - if (map->default_freezer_toc == NULL){ + if (map->default_freezer_handle == NULL) { /* * This map is not in a frozen state. 
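* (vm_map_freeze() never installed a default_freezer_handle for this map, so there is nothing to unpack; KERN_FAILURE is reported below.)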
*/ + kr = KERN_FAILURE; goto out; } - - default_freezer_toc = &(map->default_freezer_toc); - - compact_object = default_freezer_get_compact_vm_object(default_freezer_toc); - - /* Bring the pages back in */ - vm_object_pagein(compact_object); - - /* Shift pages back to their original objects */ - vm_object_unpack(compact_object, default_freezer_toc); - vm_object_deallocate(compact_object); - - map->default_freezer_toc = NULL; - + default_freezer_unpack(map->default_freezer_handle); out: vm_map_unlock(map); + + return kr; } #endif diff --git a/osfmk/vm/vm_map.h b/osfmk/vm/vm_map.h index f88bd545d..a89ad4d50 100644 --- a/osfmk/vm/vm_map.h +++ b/osfmk/vm/vm_map.h @@ -143,6 +143,7 @@ typedef union vm_map_object { } vm_map_object_t; #define named_entry_lock_init(object) lck_mtx_init(&(object)->Lock, &vm_object_lck_grp, &vm_object_lck_attr) +#define named_entry_lock_destroy(object) lck_mtx_destroy(&(object)->Lock, &vm_object_lck_grp) #define named_entry_lock(object) lck_mtx_lock(&(object)->Lock) #define named_entry_unlock(object) lck_mtx_unlock(&(object)->Lock) @@ -238,9 +239,15 @@ struct vm_map_entry { /* boolean_t */ zero_wired_pages:1, /* zero out the wired pages of this entry it is being deleted without unwiring them */ /* boolean_t */ used_for_jit:1, /* boolean_t */ from_reserved_zone:1; /* Allocated from - * kernel reserved zone */ + * kernel reserved zone */ unsigned short wired_count; /* can be paged if = 0 */ unsigned short user_wired_count; /* for vm_wire */ +#if DEBUG +#define MAP_ENTRY_CREATION_DEBUG (1) +#endif +#if MAP_ENTRY_CREATION_DEBUG + uintptr_t vme_bt[16]; +#endif }; /* @@ -317,7 +324,7 @@ struct _vm_map { /* boolean_t */ wait_for_space:1, /* Should callers wait for space? */ /* boolean_t */ wiring_required:1, /* All memory wired? */ /* boolean_t */ no_zero_fill:1, /*No zero fill absent pages */ - /* boolean_t */ mapped:1, /*has this map been mapped */ + /* boolean_t */ mapped_in_other_pmaps:1, /*has this submap been mapped in maps that use a different pmap */ /* boolean_t */ switch_protect:1, /* Protect map from write faults while switched */ /* boolean_t */ disable_vmentry_reuse:1, /* All vm entries should keep using newer and higher addresses in the map */ /* boolean_t */ map_disallow_data_exec:1, /* Disallow execution from data pages on exec-permissive architectures */ @@ -325,7 +332,7 @@ struct _vm_map { unsigned int timestamp; /* Version number */ unsigned int color_rr; /* next color (not protected by a lock) */ #if CONFIG_FREEZE - void *default_freezer_toc; + void *default_freezer_handle; #endif boolean_t jit_entry_exists; } ; @@ -701,6 +708,11 @@ extern kern_return_t vm_map_copyin_object( vm_object_size_t size, vm_map_copy_t *copy_result); /* OUT */ +extern kern_return_t vm_map_random_address_for_size( + vm_map_t map, + vm_map_offset_t *address, + vm_map_size_t size); + /* Enter a mapping */ extern kern_return_t vm_map_enter( vm_map_t map, @@ -753,6 +765,7 @@ extern kern_return_t vm_map_read_user( /* Create a new task map using an existing task map as a template. 
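* (The new ledger argument is passed through to pmap_create() so that the child map's pmap is created against the caller-supplied ledger.)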
*/ extern vm_map_t vm_map_fork( + ledger_t ledger, vm_map_t old_map); /* Change inheritance */ @@ -982,11 +995,14 @@ extern void vm_map_set_64bit( extern void vm_map_set_32bit( vm_map_t map); +extern boolean_t vm_map_has_hard_pagezero( + vm_map_t map, + vm_map_offset_t pagezero_size); + extern boolean_t vm_map_is_64bit( vm_map_t map); +#define vm_map_has_4GB_pagezero(map) vm_map_has_hard_pagezero(map, (vm_map_offset_t)0x100000000ULL) -extern boolean_t vm_map_has_4GB_pagezero( - vm_map_t map); extern void vm_map_set_4GB_pagezero( vm_map_t map); @@ -994,6 +1010,10 @@ extern void vm_map_set_4GB_pagezero( extern void vm_map_clear_4GB_pagezero( vm_map_t map); +extern kern_return_t vm_map_raise_max_offset( + vm_map_t map, + vm_map_offset_t new_max_offset); + extern kern_return_t vm_map_raise_min_offset( vm_map_t map, vm_map_offset_t new_min_offset); @@ -1078,12 +1098,17 @@ extern kern_return_t vm_map_sign(vm_map_t map, #endif #if CONFIG_FREEZE +void vm_map_freeze_thaw_init(void); +void vm_map_freeze_thaw(void); +void vm_map_demand_fault(void); + extern kern_return_t vm_map_freeze_walk( vm_map_t map, unsigned int *purgeable_count, unsigned int *wired_count, unsigned int *clean_count, unsigned int *dirty_count, + unsigned int dirty_budget, boolean_t *has_shared); extern kern_return_t vm_map_freeze( @@ -1092,9 +1117,10 @@ extern kern_return_t vm_map_freeze( unsigned int *wired_count, unsigned int *clean_count, unsigned int *dirty_count, + unsigned int dirty_budget, boolean_t *has_shared); -extern void vm_map_thaw( +extern kern_return_t vm_map_thaw( vm_map_t map); #endif diff --git a/osfmk/vm/vm_map_store.c b/osfmk/vm/vm_map_store.c index ccfcd062f..b875fd651 100644 --- a/osfmk/vm/vm_map_store.c +++ b/osfmk/vm/vm_map_store.c @@ -151,6 +151,7 @@ vm_map_store_entry_unlink( vm_map_t map, vm_map_entry_t entry) } _vm_map_store_entry_unlink(&VMEU_map->hdr, VMEU_entry); + vm_map_store_update( map, entry, VM_MAP_ENTRY_DELETE); update_first_free_ll(VMEU_map, VMEU_first_free); #ifdef VM_MAP_STORE_USE_RB update_first_free_rb(VMEU_map, VMEU_first_free); diff --git a/osfmk/vm/vm_object.c b/osfmk/vm/vm_object.c index 2f7d54e3c..1cbe6926d 100644 --- a/osfmk/vm/vm_object.c +++ b/osfmk/vm/vm_object.c @@ -72,6 +72,8 @@ #include #include +#include + #include #include @@ -96,10 +98,6 @@ #include #include -#if CONFIG_EMBEDDED -#include -#endif - /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. 
A given @@ -1104,7 +1102,7 @@ vm_object_page_grab( p = next_p; next_p = (vm_page_t)queue_next(&next_p->listq); - if (VM_PAGE_WIRED(p) || p->busy || p->cleaning || p->fictitious) + if (VM_PAGE_WIRED(p) || p->busy || p->cleaning || p->laundry || p->fictitious) goto move_page_in_obj; if (p->pmapped || p->dirty || p->precious) { @@ -1121,8 +1119,9 @@ vm_object_page_grab( if (refmod_state & VM_MEM_REFERENCED) p->reference = TRUE; - if (refmod_state & VM_MEM_MODIFIED) - p->dirty = TRUE; + if (refmod_state & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(p, FALSE); + } } if (p->dirty == FALSE && p->precious == FALSE) { @@ -1130,8 +1129,9 @@ vm_object_page_grab( if (refmod_state & VM_MEM_REFERENCED) p->reference = TRUE; - if (refmod_state & VM_MEM_MODIFIED) - p->dirty = TRUE; + if (refmod_state & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(p, FALSE); + } if (p->dirty == FALSE) goto take_page; @@ -1346,7 +1346,7 @@ vm_object_cache_evict( object->vo_cache_pages_to_scan--; - if (VM_PAGE_WIRED(p) || p->busy || p->cleaning) { + if (VM_PAGE_WIRED(p) || p->busy || p->cleaning || p->laundry) { queue_remove(&object->memq, p, vm_page_t, listq); queue_enter(&object->memq, p, vm_page_t, listq); @@ -1373,6 +1373,12 @@ vm_object_cache_evict( p->reference = FALSE; p->no_cache = FALSE; + /* + * we've already filtered out pages that are in the laundry + * so if we get here, this page can't be on the pageout queue + */ + assert(!p->pageout_queue); + VM_PAGE_QUEUES_REMOVE(p); VM_PAGE_ENQUEUE_INACTIVE(p, TRUE); @@ -1833,7 +1839,7 @@ vm_object_reap_pages( restart_after_sleep: if (queue_empty(&object->memq)) return; - loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH) + 1; + loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH); vm_page_lockspin_queues(); @@ -1859,38 +1865,13 @@ restart_after_sleep: } else mutex_pause(0); - loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH) + 1; + loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH); vm_page_lockspin_queues(); } if (reap_type == REAP_DATA_FLUSH || reap_type == REAP_TERMINATE) { - if (reap_type == REAP_DATA_FLUSH && - ((p->pageout == TRUE || p->cleaning == TRUE) && p->list_req_pending == TRUE)) { - p->list_req_pending = FALSE; - p->cleaning = FALSE; - /* - * need to drop the laundry count... - * we may also need to remove it - * from the I/O paging queue... - * vm_pageout_throttle_up handles both cases - * - * the laundry and pageout_queue flags are cleared... - */ - vm_pageout_throttle_up(p); - - if (p->pageout == TRUE) { - /* - * toss the wire count we picked up - * when we initially set this page up - * to be cleaned and stolen... 
- */ - vm_page_unwire(p, TRUE); - p->pageout = FALSE; - } - PAGE_WAKEUP(p); - - } else if (p->busy || p->cleaning) { + if (p->busy || p->cleaning) { vm_page_unlock_queues(); /* @@ -1903,6 +1884,11 @@ restart_after_sleep: goto restart_after_sleep; } + if (p->laundry) { + p->pageout = FALSE; + + vm_pageout_steal_laundry(p, TRUE); + } } switch (reap_type) { @@ -1920,15 +1906,29 @@ restart_after_sleep: case REAP_PURGEABLE: if (VM_PAGE_WIRED(p)) { - /* can't purge a wired page */ + /* + * can't purge a wired page + */ vm_page_purged_wired++; continue; } + if (p->laundry && !p->busy && !p->cleaning) { + p->pageout = FALSE; + vm_pageout_steal_laundry(p, TRUE); + } + if (p->cleaning || p->laundry) { + /* + * page is being acted upon, + * so don't mess with it + */ + vm_page_purged_others++; + continue; + } if (p->busy) { /* * We can't reclaim a busy page but we can - * make it pageable (it's not wired) to make + * make it more likely to be paged (it's not wired) to make * sure that it gets considered by * vm_pageout_scan() later. */ @@ -1937,14 +1937,6 @@ restart_after_sleep: continue; } - if (p->cleaning || p->laundry || p->list_req_pending) { - /* - * page is being acted upon, - * so don't mess with it - */ - vm_page_purged_others++; - continue; - } assert(p->object != kernel_object); /* @@ -1957,7 +1949,7 @@ restart_after_sleep: */ refmod_state = pmap_disconnect(p->phys_page); if (refmod_state & VM_MEM_MODIFIED) { - p->dirty = TRUE; + SET_PAGE_DIRTY(p, FALSE); } } if (p->dirty || p->precious) { @@ -1989,15 +1981,14 @@ restart_after_sleep: if ((p->dirty || p->precious) && !p->error && object->alive) { - p->busy = TRUE; - - VM_PAGE_QUEUES_REMOVE(p); - /* - * flush page... page will be freed - * upon completion of I/O - */ - vm_pageout_cluster(p); - + if (!p->laundry) { + VM_PAGE_QUEUES_REMOVE(p); + /* + * flush page... page will be freed + * upon completion of I/O + */ + vm_pageout_cluster(p, TRUE); + } vm_page_unlock_queues(); /* * free the pages reclaimed so far @@ -2521,11 +2512,9 @@ deactivate_pages_in_object( MARK_PAGE_HANDLED(*chunk_state, p); - if (( !VM_PAGE_WIRED(m)) && (!m->private) && (!m->gobbled) && (!m->busy)) { + if (( !VM_PAGE_WIRED(m)) && (!m->private) && (!m->gobbled) && (!m->busy) && (!m->laundry)) { int clear_refmod; - assert(!m->laundry); - clear_refmod = VM_MEM_REFERENCED; dwp->dw_mask = DW_clear_reference; @@ -3092,6 +3081,7 @@ vm_object_copy_slowly( fault_info.io_sync = FALSE; fault_info.cs_bypass = FALSE; fault_info.mark_zf_absent = FALSE; + fault_info.batch_pmap_op = FALSE; for ( ; size != 0 ; @@ -3149,11 +3139,6 @@ vm_object_copy_slowly( result_page = _result_page; /* - * We don't need to hold the object - * lock -- the busy page will be enough. - * [We don't care about picking up any - * new modifications.] - * * Copy the page to the new object. * * POLICY DECISION: @@ -3162,15 +3147,15 @@ vm_object_copy_slowly( * of copying. */ - vm_object_unlock(result_page->object); vm_page_copy(result_page, new_page); + vm_object_unlock(result_page->object); /* * Let go of both pages (make them * not busy, perform wakeup, activate). */ vm_object_lock(new_object); - new_page->dirty = TRUE; + SET_PAGE_DIRTY(new_page, FALSE); PAGE_WAKEUP_DONE(new_page); vm_object_unlock(new_object); @@ -4758,7 +4743,19 @@ vm_object_do_bypass( vm_object_res_reference(backing_object); } #endif /* TASK_SWAPPER */ + /* + * vm_object_collapse (the caller of this function) is + * now called from contexts that may not guarantee that a + * valid reference is held on the object... 
w/o a valid + * reference, it is unsafe and unwise (you will definitely + * regret it) to unlock the object and then retake the lock + * since the object may be terminated and recycled in between. + * The "activity_in_progress" reference will keep the object + * 'stable'. + */ + vm_object_activity_begin(object); vm_object_unlock(object); + vm_object_unlock(backing_object); vm_object_deallocate(backing_object); @@ -4770,6 +4767,7 @@ vm_object_do_bypass( */ vm_object_lock(object); + vm_object_activity_end(object); } object_bypasses++; @@ -5269,7 +5267,7 @@ vm_object_page_remove( for (; start < end; start += PAGE_SIZE_64) { p = vm_page_lookup(object, start); if (p != VM_PAGE_NULL) { - assert(!p->cleaning && !p->pageout); + assert(!p->cleaning && !p->pageout && !p->laundry); if (!p->fictitious && p->pmapped) pmap_disconnect(p->phys_page); VM_PAGE_FREE(p); @@ -5282,7 +5280,7 @@ vm_object_page_remove( while (!queue_end(&object->memq, (queue_entry_t) p)) { next = (vm_page_t) queue_next(&p->listq); if ((start <= p->offset) && (p->offset < end)) { - assert(!p->cleaning && !p->pageout); + assert(!p->cleaning && !p->pageout && !p->laundry); if (!p->fictitious && p->pmapped) pmap_disconnect(p->phys_page); VM_PAGE_FREE(p); @@ -5462,332 +5460,6 @@ vm_object_page_map( } } -#include - -#if MACH_KDB -#include -#include - -#define printf kdbprintf - -extern boolean_t vm_object_cached( - vm_object_t object); - -extern void print_bitstring( - char byte); - -boolean_t vm_object_print_pages = FALSE; - -void -print_bitstring( - char byte) -{ - printf("%c%c%c%c%c%c%c%c", - ((byte & (1 << 0)) ? '1' : '0'), - ((byte & (1 << 1)) ? '1' : '0'), - ((byte & (1 << 2)) ? '1' : '0'), - ((byte & (1 << 3)) ? '1' : '0'), - ((byte & (1 << 4)) ? '1' : '0'), - ((byte & (1 << 5)) ? '1' : '0'), - ((byte & (1 << 6)) ? '1' : '0'), - ((byte & (1 << 7)) ? 
'1' : '0')); -} - -boolean_t -vm_object_cached( - __unused register vm_object_t object) -{ -#if VM_OBJECT_CACHE - register vm_object_t o; - - queue_iterate(&vm_object_cached_list, o, vm_object_t, cached_list) { - if (object == o) { - return TRUE; - } - } -#endif - return FALSE; -} - -#if MACH_PAGEMAP -/* - * vm_external_print: [ debug ] - */ -void -vm_external_print( - vm_external_map_t emap, - vm_object_size_t size) -{ - if (emap == VM_EXTERNAL_NULL) { - printf("0 "); - } else { - vm_object_size_t existence_size = stob(size); - printf("{ size=%lld, map=[", (uint64_t) existence_size); - if (existence_size > 0) { - print_bitstring(emap[0]); - } - if (existence_size > 1) { - print_bitstring(emap[1]); - } - if (existence_size > 2) { - printf("..."); - print_bitstring(emap[existence_size-1]); - } - printf("] }\n"); - } - return; -} -#endif /* MACH_PAGEMAP */ - -int -vm_follow_object( - vm_object_t object) -{ - int count = 0; - int orig_db_indent = db_indent; - - while (TRUE) { - if (object == VM_OBJECT_NULL) { - db_indent = orig_db_indent; - return count; - } - - count += 1; - - iprintf("object 0x%x", object); - printf(", shadow=0x%x", object->shadow); - printf(", copy=0x%x", object->copy); - printf(", pager=0x%x", object->pager); - printf(", ref=%d\n", object->ref_count); - - db_indent += 2; - object = object->shadow; - } - -} - -/* - * vm_object_print: [ debug ] - */ -void -vm_object_print(db_expr_t db_addr, __unused boolean_t have_addr, - __unused db_expr_t arg_count, __unused char *modif) -{ - vm_object_t object; - register vm_page_t p; - const char *s; - - register int count; - - object = (vm_object_t) (long) db_addr; - if (object == VM_OBJECT_NULL) - return; - - iprintf("object 0x%x\n", object); - - db_indent += 2; - - iprintf("size=0x%x", object->vo_size); - printf(", memq_hint=%p", object->memq_hint); - printf(", ref_count=%d\n", object->ref_count); - iprintf(""); -#if TASK_SWAPPER - printf("res_count=%d, ", object->res_count); -#endif /* TASK_SWAPPER */ - printf("resident_page_count=%d\n", object->resident_page_count); - - iprintf("shadow=0x%x", object->shadow); - if (object->shadow) { - register int i = 0; - vm_object_t shadow = object; - while((shadow = shadow->shadow)) - i++; - printf(" (depth %d)", i); - } - printf(", copy=0x%x", object->copy); - printf(", shadow_offset=0x%x", object->vo_shadow_offset); - printf(", last_alloc=0x%x\n", object->last_alloc); - - iprintf("pager=0x%x", object->pager); - printf(", paging_offset=0x%x", object->paging_offset); - printf(", pager_control=0x%x\n", object->pager_control); - - iprintf("copy_strategy=%d[", object->copy_strategy); - switch (object->copy_strategy) { - case MEMORY_OBJECT_COPY_NONE: - printf("copy_none"); - break; - - case MEMORY_OBJECT_COPY_CALL: - printf("copy_call"); - break; - - case MEMORY_OBJECT_COPY_DELAY: - printf("copy_delay"); - break; - - case MEMORY_OBJECT_COPY_SYMMETRIC: - printf("copy_symmetric"); - break; - - case MEMORY_OBJECT_COPY_INVALID: - printf("copy_invalid"); - break; - - default: - printf("?"); - } - printf("]"); - - iprintf("all_wanted=0x%x<", object->all_wanted); - s = ""; - if (vm_object_wanted(object, VM_OBJECT_EVENT_INITIALIZED)) { - printf("%sinit", s); - s = ","; - } - if (vm_object_wanted(object, VM_OBJECT_EVENT_PAGER_READY)) { - printf("%sready", s); - s = ","; - } - if (vm_object_wanted(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS)) { - printf("%spaging", s); - s = ","; - } - if (vm_object_wanted(object, VM_OBJECT_EVENT_LOCK_IN_PROGRESS)) { - printf("%slock", s); - s = ","; - } - if 
(vm_object_wanted(object, VM_OBJECT_EVENT_UNCACHING)) { - printf("%suncaching", s); - s = ","; - } - if (vm_object_wanted(object, VM_OBJECT_EVENT_COPY_CALL)) { - printf("%scopy_call", s); - s = ","; - } - if (vm_object_wanted(object, VM_OBJECT_EVENT_CACHING)) { - printf("%scaching", s); - s = ","; - } - printf(">"); - printf(", paging_in_progress=%d\n", object->paging_in_progress); - printf(", activity_in_progress=%d\n", object->activity_in_progress); - - iprintf("%screated, %sinit, %sready, %spersist, %strusted, %spageout, %s, %s\n", - (object->pager_created ? "" : "!"), - (object->pager_initialized ? "" : "!"), - (object->pager_ready ? "" : "!"), - (object->can_persist ? "" : "!"), - (object->pager_trusted ? "" : "!"), - (object->pageout ? "" : "!"), - (object->internal ? "internal" : "external"), - (object->temporary ? "temporary" : "permanent")); - iprintf("%salive, %spurgeable, %spurgeable_volatile, %spurgeable_empty, %sshadowed, %scached, %sprivate\n", - (object->alive ? "" : "!"), - ((object->purgable != VM_PURGABLE_DENY) ? "" : "!"), - ((object->purgable == VM_PURGABLE_VOLATILE) ? "" : "!"), - ((object->purgable == VM_PURGABLE_EMPTY) ? "" : "!"), - (object->shadowed ? "" : "!"), - (vm_object_cached(object) ? "" : "!"), - (object->private ? "" : "!")); - iprintf("%sadvisory_pageout, %ssilent_overwrite\n", - (object->advisory_pageout ? "" : "!"), - (object->silent_overwrite ? "" : "!")); - -#if MACH_PAGEMAP - iprintf("existence_map="); - vm_external_print(object->existence_map, object->vo_size); -#endif /* MACH_PAGEMAP */ -#if MACH_ASSERT - iprintf("paging_object=0x%x\n", object->paging_object); -#endif /* MACH_ASSERT */ - - if (vm_object_print_pages) { - count = 0; - p = (vm_page_t) queue_first(&object->memq); - while (!queue_end(&object->memq, (queue_entry_t) p)) { - if (count == 0) { - iprintf("memory:="); - } else if (count == 2) { - printf("\n"); - iprintf(" ..."); - count = 0; - } else { - printf(","); - } - count++; - - printf("(off=0x%llX,page=%p)", p->offset, p); - p = (vm_page_t) queue_next(&p->listq); - } - if (count != 0) { - printf("\n"); - } - } - db_indent -= 2; -} - - -/* - * vm_object_find [ debug ] - * - * Find all tasks which reference the given vm_object. - */ - -boolean_t vm_object_find(vm_object_t object); -boolean_t vm_object_print_verbose = FALSE; - -boolean_t -vm_object_find( - vm_object_t object) -{ - task_t task; - vm_map_t map; - vm_map_entry_t entry; - boolean_t found = FALSE; - - queue_iterate(&tasks, task, task_t, tasks) { - map = task->map; - for (entry = vm_map_first_entry(map); - entry && entry != vm_map_to_entry(map); - entry = entry->vme_next) { - - vm_object_t obj; - - /* - * For the time being skip submaps, - * only the kernel can have submaps, - * and unless we are interested in - * kernel objects, we can simply skip - * submaps. See sb/dejan/nmk18b7/src/mach_kernel/vm - * for a full solution. 
- */ - if (entry->is_sub_map) - continue; - if (entry) - obj = entry->object.vm_object; - else - continue; - - while (obj != VM_OBJECT_NULL) { - if (obj == object) { - if (!found) { - printf("TASK\t\tMAP\t\tENTRY\n"); - found = TRUE; - } - printf("0x%x\t0x%x\t0x%x\n", - task, map, entry); - } - obj = obj->shadow; - } - } - } - - return(found); -} - -#endif /* MACH_KDB */ - kern_return_t vm_object_populate_with_private( vm_object_t object, @@ -5799,23 +5471,27 @@ vm_object_populate_with_private( vm_object_offset_t base_offset; - if(!object->private) + if (!object->private) return KERN_FAILURE; base_page = phys_page; vm_object_lock(object); - if(!object->phys_contiguous) { + + if (!object->phys_contiguous) { vm_page_t m; - if((base_offset = trunc_page_64(offset)) != offset) { + + if ((base_offset = trunc_page_64(offset)) != offset) { vm_object_unlock(object); return KERN_FAILURE; } base_offset += object->paging_offset; - while(size) { + + while (size) { m = vm_page_lookup(object, base_offset); - if(m != VM_PAGE_NULL) { - if(m->fictitious) { + + if (m != VM_PAGE_NULL) { + if (m->fictitious) { if (m->phys_page != vm_page_guard_addr) { vm_page_lockspin_queues(); @@ -5824,16 +5500,16 @@ vm_object_populate_with_private( m->fictitious = FALSE; m->phys_page = base_page; - if(!m->busy) { - m->busy = TRUE; - } - if(!m->absent) { - m->absent = TRUE; - } - m->list_req_pending = TRUE; } } else if (m->phys_page != base_page) { - if (m->pmapped) { + + if ( !m->private) { + /* + * we'd leak a real page... that can't be right + */ + panic("vm_object_populate_with_private - %p not private", m); + } + if (m->pmapped) { /* * pmap call to clear old mapping */ @@ -5841,17 +5517,12 @@ vm_object_populate_with_private( } m->phys_page = base_page; } - - /* - * ENCRYPTED SWAP: - * We're not pointing to the same - * physical page any longer and the - * contents of the new one are not - * supposed to be encrypted. - * XXX What happens to the original - * physical page. Is it lost ? - */ - m->encrypted = FALSE; + if (m->encrypted) { + /* + * we should never see this on a fictitious or private page + */ + panic("vm_object_populate_with_private - %p encrypted", m); + } } else { while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL) @@ -5864,9 +5535,8 @@ vm_object_populate_with_private( m->private = TRUE; m->fictitious = FALSE; m->phys_page = base_page; - m->list_req_pending = TRUE; - m->absent = TRUE; m->unusual = TRUE; + m->busy = FALSE; vm_page_insert(m, object, base_offset); } @@ -5887,6 +5557,7 @@ vm_object_populate_with_private( object->vo_size = size; } vm_object_unlock(object); + return KERN_SUCCESS; } @@ -6444,7 +6115,7 @@ vm_object_purgable_control( purgeable_q_t queue = vm_purgeable_object_remove(object); assert(queue); - vm_purgeable_token_delete_first(queue); + vm_purgeable_token_delete_last(queue); assert(queue->debug_count_objects>=0); vm_page_unlock_queues(); @@ -6465,7 +6136,7 @@ vm_object_purgable_control( refmod = pmap_disconnect(p->phys_page); if ((refmod & VM_MEM_MODIFIED) && !p->dirty) { - p->dirty = TRUE; + SET_PAGE_DIRTY(p, FALSE); } } }
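The hunks above replace direct p->dirty = TRUE stores with SET_PAGE_DIRTY(p, FALSE); the macro itself appears in the vm_page.h hunk further down this patch. As a rough user-space sketch of the discipline it centralizes — every fake_* name below is an invented stand-in, not kernel API — the pmap modify state is touched only on the clean-to-dirty edge, and only when the caller asks for it:

#include <stdbool.h>
#include <stdio.h>

struct fake_page {
	bool         dirty;
	unsigned int phys_page;
};

/* stand-in for pmap_set_modify(): records the modify bit at the pmap layer */
static void
fake_pmap_set_modify(unsigned int phys_page)
{
	printf("modify bit set for page 0x%x\n", phys_page);
}

/*
 * mirrors the shape of the CONFIG_EMBEDDED SET_PAGE_DIRTY() below:
 * pmap state is only touched on the clean->dirty transition, and only
 * when the caller passes set_pmap_modified as true
 */
#define FAKE_SET_PAGE_DIRTY(m, set_pmap_modified)			\
	do {								\
		struct fake_page *__page__ = (m);			\
		if (!__page__->dirty && (set_pmap_modified))		\
			fake_pmap_set_modify(__page__->phys_page);	\
		__page__->dirty = true;					\
	} while (0)

int
main(void)
{
	struct fake_page p = { false, 0x1234 };

	FAKE_SET_PAGE_DIRTY(&p, true);	/* clean -> dirty: pmap informed */
	FAKE_SET_PAGE_DIRTY(&p, true);	/* already dirty: no pmap call */
	return 0;
}

@@ -6538,7 +6209,7 @@ vm_object_purgable_control( /* Changing queue. Have to move token.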
*/ vm_page_lock_queues(); - vm_purgeable_token_delete_first(old_queue); + vm_purgeable_token_delete_last(old_queue); result = vm_purgeable_token_add(queue); vm_page_unlock_queues(); @@ -6566,7 +6237,7 @@ vm_object_purgable_control( refmod = pmap_disconnect(p->phys_page); if ((refmod & VM_MEM_MODIFIED) && !p->dirty) { - p->dirty = TRUE; + SET_PAGE_DIRTY(p, FALSE); } } } @@ -6583,7 +6254,7 @@ vm_object_purgable_control( old_queue = vm_purgeable_object_remove(object); assert(old_queue); vm_page_lock_queues(); - vm_purgeable_token_delete_first(old_queue); + vm_purgeable_token_delete_last(old_queue); vm_page_unlock_queues(); } (void) vm_object_purge(object); @@ -7072,7 +6743,7 @@ extern int ignore_is_ssd; #if CONFIG_EMBEDDED unsigned int preheat_pages_max = MAX_UPL_TRANSFER; -unsigned int preheat_pages_min = 8; +unsigned int preheat_pages_min = 10; #else unsigned int preheat_pages_max = MAX_UPL_TRANSFER; unsigned int preheat_pages_min = 8; @@ -7269,11 +6940,15 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start, pre_heat_size = max_length; if (behavior == VM_BEHAVIOR_DEFAULT && (pre_heat_size > min_ph_size_in_bytes)) { - if (vm_page_free_count < vm_page_throttle_limit) + + unsigned int consider_free = vm_page_free_count + vm_page_cleaned_count; + + if (consider_free < vm_page_throttle_limit) { pre_heat_size = trunc_page(pre_heat_size / 16); - else if (vm_page_free_count < vm_page_free_target) + } else if (consider_free < vm_page_free_target) { pre_heat_size = trunc_page(pre_heat_size / 4); - + } + if (pre_heat_size < min_ph_size_in_bytes) pre_heat_size = min_ph_size_in_bytes; } @@ -7406,6 +7081,8 @@ out: pre_heat_cluster[*length / PAGE_SIZE]++; vm_object_unlock(object); + + DTRACE_VM1(clustersize, vm_size_t, *length); } @@ -7492,7 +7169,9 @@ vm_object_page_op( /* if such violations occur we will assert sooner */ /* or later. */ assert(dst_page->busy || (ops & UPL_POP_BUSY)); - if (ops & UPL_POP_DIRTY) dst_page->dirty = TRUE; + if (ops & UPL_POP_DIRTY) { + SET_PAGE_DIRTY(dst_page, FALSE); + } if (ops & UPL_POP_PAGEOUT) dst_page->pageout = TRUE; if (ops & UPL_POP_PRECIOUS) dst_page->precious = TRUE; if (ops & UPL_POP_ABSENT) dst_page->absent = TRUE; @@ -7611,12 +7290,7 @@ vm_object_range_op( dst_page = vm_page_lookup(object, offset); if (dst_page != VM_PAGE_NULL) { if (ops & UPL_ROP_DUMP) { - if (dst_page->list_req_pending) { - /* - * This page isn't on a UPL yet. - * So it's safe to steal it here and dump it. 
- */ - } else if (dst_page->busy || dst_page->cleaning) { + if (dst_page->busy || dst_page->cleaning) { /* * someone else is playing with the * page, we will have to wait @@ -7630,6 +7304,11 @@ vm_object_range_op( */ continue; } + if (dst_page->laundry) { + dst_page->pageout = FALSE; + + vm_pageout_steal_laundry(dst_page, FALSE); + } if (dst_page->pmapped == TRUE) pmap_disconnect(dst_page->phys_page); @@ -7748,19 +7427,15 @@ vm_object_change_wimg_mode(vm_object_t object, unsigned int wimg_mode) #if CONFIG_FREEZE -__private_extern__ void default_freezer_pack_page(vm_page_t , vm_object_t , vm_object_offset_t, void**); -__private_extern__ void default_freezer_unpack(vm_object_t , void**); - kern_return_t vm_object_pack( - unsigned int *purgeable_count, - unsigned int *wired_count, - unsigned int *clean_count, - unsigned int *dirty_count, - boolean_t *shared, - vm_object_t src_object, - vm_object_t compact_object, - void **table, - vm_object_offset_t *offset) + unsigned int *purgeable_count, + unsigned int *wired_count, + unsigned int *clean_count, + unsigned int *dirty_count, + unsigned int dirty_budget, + boolean_t *shared, + vm_object_t src_object, + struct default_freezer_handle *df_handle) { kern_return_t kr = KERN_SUCCESS; @@ -7777,8 +7452,8 @@ kern_return_t vm_object_pack( if (src_object->purgable == VM_PURGABLE_VOLATILE) { *purgeable_count = src_object->resident_page_count; - /* If the destination object is null, we're just walking the pages to discover how many can be hibernated */ - if (VM_OBJECT_NULL != compact_object) { + /* If the default freezer handle is null, we're just walking the pages to discover how many can be hibernated */ + if (df_handle != NULL) { purgeable_q_t queue; /* object should be on a queue */ assert(src_object->objq.next != NULL && @@ -7794,7 +7469,7 @@ kern_return_t vm_object_pack( } if (src_object->ref_count == 1) { - vm_object_pack_pages(wired_count, clean_count, dirty_count, src_object, compact_object, table, offset); + vm_object_pack_pages(wired_count, clean_count, dirty_count, dirty_budget, src_object, df_handle); } else { if (src_object->internal) { *shared = TRUE; @@ -7809,34 +7484,27 @@ done: void vm_object_pack_pages( - unsigned int *wired_count, - unsigned int *clean_count, - unsigned int *dirty_count, - vm_object_t src_object, - vm_object_t compact_object, - void **table, - vm_object_offset_t *offset) + unsigned int *wired_count, + unsigned int *clean_count, + unsigned int *dirty_count, + unsigned int dirty_budget, + vm_object_t src_object, + struct default_freezer_handle *df_handle) { vm_page_t p, next; next = (vm_page_t)queue_first(&src_object->memq); - /* Since this function is dual purpose in order that we can count - * the freezable pages as well as prepare them, assert that our - * arguments are sane. Gnarly, but avoids code duplication. 
- */ - if (VM_OBJECT_NULL == compact_object){ - assert(!table); - assert(!offset); - } else { - assert(table); - assert(offset); - } - while (!queue_end(&src_object->memq, (queue_entry_t)next)) { p = next; next = (vm_page_t)queue_next(&next->listq); + /* Finish up if we've hit our pageout limit */ + if (dirty_budget && (dirty_budget == *dirty_count)) { + break; + } + assert(!p->laundry); + if (p->fictitious || p->busy ) continue; @@ -7848,7 +7516,7 @@ vm_object_pack_pages( continue; } - if (VM_OBJECT_NULL == compact_object) { + if (df_handle == NULL) { if (p->dirty || pmap_is_modified(p->phys_page)) { (*dirty_count)++; } else { @@ -7858,14 +7526,7 @@ vm_object_pack_pages( } if (p->cleaning) { - p->busy = TRUE; p->pageout = TRUE; - p->dump_cleaning = TRUE; - - vm_page_lockspin_queues(); - vm_page_wire(p); - vm_page_unlock_queues(); - continue; } @@ -7873,16 +7534,12 @@ vm_object_pack_pages( int refmod_state; refmod_state = pmap_disconnect(p->phys_page); if (refmod_state & VM_MEM_MODIFIED) { - p->dirty = TRUE; + SET_PAGE_DIRTY(p, FALSE); } } if (p->dirty) { - p->busy = TRUE; - - default_freezer_pack_page(p, compact_object, *offset, table); - *offset += PAGE_SIZE; - + default_freezer_pack_page(p, df_handle); (*dirty_count)++; } else { @@ -7911,9 +7568,14 @@ vm_object_pageout( /* Throw to the pageout queue */ vm_page_lockspin_queues(); - VM_PAGE_QUEUES_REMOVE(p); - vm_pageout_cluster(p); - + /* + * see if page is already in the process of + * being cleaned... if so, leave it alone + */ + if (!p->laundry) { + VM_PAGE_QUEUES_REMOVE(p); + vm_pageout_cluster(p, TRUE); + } vm_page_unlock_queues(); } @@ -7953,26 +7615,4 @@ vm_object_pagein( return kr; } - -void -vm_object_unpack( - vm_object_t compact_object, - void **table) -{ - /* - * Future Work: - * Right now we treat the default freezer much like - * the default pager with respect to when it is - * created and terminated. - * But, in the future, we may want to terminate the - * default freezer at the very instant that an object - * has been completely re-filled with all it's previously - * paged-out pages. - * At that time we'll need to reset the object fields like - * "pager" and the associated "pager_{created,initialized,trusted}" - * fields right here. - */ - default_freezer_unpack(compact_object, table); -} - #endif /* CONFIG_FREEZE */ diff --git a/osfmk/vm/vm_object.h b/osfmk/vm/vm_object.h index 0d21734af..21bc4ddcb 100644 --- a/osfmk/vm/vm_object.h +++ b/osfmk/vm/vm_object.h @@ -115,7 +115,8 @@ struct vm_object_fault_info { /* boolean_t */ io_sync:1, /* boolean_t */ cs_bypass:1, /* boolean_t */ mark_zf_absent:1, - __vm_object_fault_info_unused_bits:27; + /* boolean_t */ batch_pmap_op:1, + __vm_object_fault_info_unused_bits:26; }; @@ -477,7 +478,7 @@ __private_extern__ void vm_object_res_deallocate( vm_object_lock_assert_shared(object); \ assert((RLObject)->ref_count > 0); \ OSAddAtomic(1, &(RLObject)->ref_count); \ - assert((RLObject)->ref_count > 1); \ + assert((RLObject)->ref_count > 0); \ /* XXX we would need an atomic version of the following ... 
*/ \ vm_object_res_reference(RLObject); \ MACRO_END @@ -699,39 +700,35 @@ __private_extern__ void vm_object_reap_pages( #define REAP_DATA_FLUSH 3 #if CONFIG_FREEZE +struct default_freezer_handle; __private_extern__ kern_return_t vm_object_pack( - unsigned int *purgeable_count, - unsigned int *wired_count, - unsigned int *clean_count, - unsigned int *dirty_count, - boolean_t *shared, - vm_object_t src_object, - vm_object_t dst_object, - void **table, - vm_object_offset_t *offset); + unsigned int *purgeable_count, + unsigned int *wired_count, + unsigned int *clean_count, + unsigned int *dirty_count, + unsigned int dirty_budget, + boolean_t *shared, + vm_object_t src_object, + struct default_freezer_handle *df_handle); __private_extern__ void vm_object_pack_pages( - unsigned int *wired_count, - unsigned int *clean_count, - unsigned int *dirty_count, - vm_object_t src_object, - vm_object_t dst_object, - void **table, - vm_object_offset_t *offset); - -__private_extern__ void vm_object_pageout( - vm_object_t object); - -__private_extern__ kern_return_t vm_object_pagein( - vm_object_t object); + unsigned int *wired_count, + unsigned int *clean_count, + unsigned int *dirty_count, + unsigned int dirty_budget, + vm_object_t src_object, + struct default_freezer_handle *df_handle); -__private_extern__ void vm_object_unpack( - vm_object_t object, - void **table); +__private_extern__ void +vm_object_pageout( + vm_object_t object); +__private_extern__ kern_return_t +vm_object_pagein( + vm_object_t object); #endif /* CONFIG_FREEZE */ /* diff --git a/osfmk/vm/vm_page.h b/osfmk/vm/vm_page.h index 543a0c6f5..b9888628d 100644 --- a/osfmk/vm/vm_page.h +++ b/osfmk/vm/vm_page.h @@ -179,19 +179,19 @@ struct vm_page { */ #define local_id wire_count unsigned int wire_count:16, /* how many wired down maps use me? (O&P) */ - /* boolean_t */ inactive:1, /* page is in inactive list (P) */ - zero_fill:1, - active:1, /* page is in active list (P) */ + /* boolean_t */ active:1, /* page is in active list (P) */ + inactive:1, /* page is in inactive list (P) */ + clean_queue:1, /* page is in pre-cleaned list (P) */ + local:1, /* page is in one of the local queues (P) */ + speculative:1, /* page is in speculative list (P) */ + throttled:1, /* pager is not responding (P) */ + free:1, /* page is on free list (P) */ pageout_queue:1,/* page is on queue for pageout (P) */ - speculative:1, /* page is on speculative list (P) */ laundry:1, /* page is being cleaned now (P)*/ - free:1, /* page is on free list (P) */ reference:1, /* page has been used (P) */ gobbled:1, /* page used internally (P) */ private:1, /* Page should not be returned to * the free list (P) */ - throttled:1, /* pager is not responding (P) */ - local:1, no_cache:1, /* page is not to be cached and should * be reused ahead of other pages (P) */ __unused_pageq_bits:3; /* 3 bits available here */ @@ -238,20 +238,13 @@ struct vm_page { page locked */ encrypted:1, /* encrypted for secure swap (O) */ encrypted_cleaning:1, /* encrypting page */ - list_req_pending:1, /* pagein/pageout alt mechanism */ - /* allows creation of list */ - /* requests on pages that are */ - /* actively being paged. 
*/ - dump_cleaning:1, /* set by the pageout daemon when */ - /* a page being cleaned is */ - /* encountered and targeted as */ - /* a pageout candidate */ cs_validated:1, /* code-signing: page was checked */ cs_tainted:1, /* code-signing: page is tainted */ reusable:1, lopage:1, slid:1, - __unused_object_bits:7; /* 7 bits available here */ + was_dirty:1, /* was this page previously dirty? */ + __unused_object_bits:8; /* 8 bits available here */ #if __LP64__ unsigned int __unused_padding; /* Pad structure explicitly @@ -405,7 +398,9 @@ queue_head_t vm_page_queue_active; /* active memory queue */ extern queue_head_t vm_page_queue_inactive; /* inactive memory queue for normal pages */ extern -queue_head_t vm_page_queue_zf; /* inactive memory queue for zero fill */ +queue_head_t vm_page_queue_cleaned; /* clean-queue inactive memory */ +extern +queue_head_t vm_page_queue_anonymous; /* inactive memory queue for anonymous pages */ extern queue_head_t vm_page_queue_throttled; /* memory queue for throttled pageout pages */ @@ -423,6 +418,8 @@ unsigned int vm_page_active_count; /* How many pages are active? */ extern unsigned int vm_page_inactive_count; /* How many pages are inactive? */ extern +unsigned int vm_page_cleaned_count; /* How many pages are in the clean queue? */ +extern unsigned int vm_page_throttled_count;/* How many inactives are throttled */ extern unsigned int vm_page_speculative_count; /* How many speculative pages are unclaimed? */ @@ -439,6 +436,8 @@ uint32_t vm_page_creation_throttle; /* When to throttle new page creation */ extern unsigned int vm_page_inactive_target;/* How many do we want inactive? */ extern +unsigned int vm_page_anonymous_min; /* When it's ok to pre-clean */ +extern unsigned int vm_page_inactive_min; /* When do wakeup pageout */ extern unsigned int vm_page_free_reserved; /* How many pages reserved to do pageout */ @@ -563,6 +562,8 @@ extern void vm_page_deactivate_internal( vm_page_t page, boolean_t clear_hw_reference); +extern void vm_page_enqueue_cleaned(vm_page_t page); + extern void vm_page_lru( vm_page_t page); @@ -593,7 +594,8 @@ extern void vm_page_insert_internal( vm_object_t object, vm_object_offset_t offset, boolean_t queues_lock_held, - boolean_t insert_in_hash); + boolean_t insert_in_hash, + boolean_t batch_pmap_op); extern void vm_page_replace( vm_page_t mem, @@ -647,14 +649,44 @@ extern void vm_page_free_prepare_object( vm_page_t page, boolean_t remove_from_hash); -extern void vm_check_memorystatus(void); - +#if CONFIG_JETSAM +extern void memorystatus_update(unsigned int pages_avail); + +#define VM_CHECK_MEMORYSTATUS do { \ + memorystatus_update( \ + vm_page_active_count + \ + vm_page_inactive_count + \ + vm_page_speculative_count + \ + vm_page_free_count + \ + (VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) ? 0 : vm_page_purgeable_count) \ + ); \ + } while(0) +#else +#define VM_CHECK_MEMORYSTATUS do {} while(0) +#endif /* * Functions implemented as macros. m->wanted and m->busy are * protected by the object lock. 
*/ +#if CONFIG_EMBEDDED +#define SET_PAGE_DIRTY(m, set_pmap_modified) \ + MACRO_BEGIN \ + vm_page_t __page__ = (m); \ + if (__page__->dirty == FALSE && (set_pmap_modified)) { \ + pmap_set_modify(__page__->phys_page); \ + } \ + __page__->dirty = TRUE; \ + MACRO_END +#else /* CONFIG_EMBEDDED */ +#define SET_PAGE_DIRTY(m, set_pmap_modified) \ + MACRO_BEGIN \ + vm_page_t __page__ = (m); \ + __page__->dirty = TRUE; \ + MACRO_END +#endif /* CONFIG_EMBEDDED */ + #define PAGE_ASSERT_WAIT(m, interruptible) \ (((m)->wanted = TRUE), \ assert_wait((event_t) (m), (interruptible))) @@ -736,16 +768,24 @@ extern void vm_page_queues_assert(vm_page_t mem, int val); * this is why it's safe to utilize the wire_count field in the vm_page_t as the local_id... * 'wired' and local are ALWAYS mutually exclusive conditions. */ + #define VM_PAGE_QUEUES_REMOVE(mem) \ MACRO_BEGIN \ VM_PAGE_QUEUES_ASSERT(mem, 1); \ assert(!mem->laundry); \ - assert(!mem->pageout_queue); \ +/* \ + * if (mem->pageout_queue) \ + * NOTE: VM_PAGE_QUEUES_REMOVE does not deal with removing pages from the pageout queue... \ + * the caller is responsible for determining if the page is on that queue, and if so, must \ + * either first remove it (it needs both the page queues lock and the object lock to do \ + * this via vm_pageout_steal_laundry), or avoid the call to VM_PAGE_QUEUES_REMOVE \ + */ \ if (mem->local) { \ struct vpl *lq; \ assert(mem->object != kernel_object); \ assert(!mem->inactive && !mem->speculative); \ assert(!mem->active && !mem->throttled); \ + assert(!mem->clean_queue); \ assert(!mem->fictitious); \ lq = &vm_page_local_q[mem->local_id].vpl_un.vpl; \ VPL_LOCK(&lq->vpl_lock); \ @@ -760,6 +800,7 @@ extern void vm_page_queues_assert(vm_page_t mem, int val); else if (mem->active) { \ assert(mem->object != kernel_object); \ assert(!mem->inactive && !mem->speculative); \ + assert(!mem->clean_queue); \ assert(!mem->throttled); \ assert(!mem->fictitious); \ queue_remove(&vm_page_queue_active, \ @@ -773,17 +814,24 @@ extern void vm_page_queues_assert(vm_page_t mem, int val); assert(!mem->active && !mem->speculative); \ assert(!mem->throttled); \ assert(!mem->fictitious); \ - if (mem->zero_fill) { \ - queue_remove(&vm_page_queue_zf, \ - mem, vm_page_t, pageq); \ - vm_zf_queue_count--; \ + vm_page_inactive_count--; \ + if (mem->clean_queue) { \ + queue_remove(&vm_page_queue_cleaned, \ + mem, vm_page_t, pageq); \ + mem->clean_queue = FALSE; \ + vm_page_cleaned_count--; \ } else { \ - queue_remove(&vm_page_queue_inactive, \ - mem, vm_page_t, pageq); \ + if (mem->object->internal) { \ + queue_remove(&vm_page_queue_anonymous, \ + mem, vm_page_t, pageq); \ + vm_page_anonymous_count--; \ + } else { \ + queue_remove(&vm_page_queue_inactive, \ + mem, vm_page_t, pageq); \ + } \ + vm_purgeable_q_advance_all(); \ } \ mem->inactive = FALSE; \ - vm_page_inactive_count--; \ - vm_purgeable_q_advance_all(); \ } \ \ else if (mem->throttled) { \ @@ -819,12 +867,12 @@ extern void vm_page_queues_assert(vm_page_t mem, int val); assert(!mem->fictitious); \ assert(!mem->laundry); \ assert(!mem->pageout_queue); \ - if (mem->zero_fill) { \ + if (mem->object->internal) { \ if (first == TRUE) \ - queue_enter_first(&vm_page_queue_zf, mem, vm_page_t, pageq); \ + queue_enter_first(&vm_page_queue_anonymous, mem, vm_page_t, pageq); \ else \ - queue_enter(&vm_page_queue_zf, mem, vm_page_t, pageq); \ - vm_zf_queue_count++; \ + queue_enter(&vm_page_queue_anonymous, mem, vm_page_t, pageq); \ + vm_page_anonymous_count++; \ } else { \ if (first == TRUE) \ 
queue_enter_first(&vm_page_queue_inactive, mem, vm_page_t, pageq); \ @@ -873,7 +921,7 @@ extern void vm_page_queues_assert(vm_page_t mem, int val); #define DW_set_reference 0x800 #define DW_move_page 0x1000 #define DW_VM_PAGE_QUEUES_REMOVE 0x2000 -#define DW_set_list_req_pending 0x4000 +#define DW_enqueue_cleaned 0x4000 struct vm_page_delayed_work { vm_page_t dw_m; @@ -895,14 +943,6 @@ extern unsigned int vm_max_delayed_work_limit; * set, we need to set it and ask vm_page_do_delayed_work * to clear it and wakeup anyone that might have blocked on * it once we're done processing the page. - * - * additionally, we can't call vm_page_do_delayed_work with - * list_req_pending == TRUE since it may need to - * drop the object lock before dealing - * with this page and because list_req_pending == TRUE, - * busy == TRUE will NOT protect this page from being stolen - * so clear list_req_pending and ask vm_page_do_delayed_work - * to re-set it once it holds both the pageq and object locks */ #define VM_PAGE_ADD_DELAYED_WORK(dwp, mem, dw_cnt) \ @@ -912,13 +952,9 @@ extern unsigned int vm_max_delayed_work_limit; if ( !(dwp->dw_mask & DW_vm_page_free)) \ dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); \ } \ - if (mem->list_req_pending) { \ - mem->list_req_pending = FALSE; \ - dwp->dw_mask |= DW_set_list_req_pending; \ - } \ dwp->dw_m = mem; \ dwp++; \ - dw_count++; \ + dw_cnt++; \ MACRO_END extern vm_page_t vm_object_page_grab(vm_object_t); diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c index 28b3cb172..2bfd6e7a5 100644 --- a/osfmk/vm/vm_pageout.c +++ b/osfmk/vm/vm_pageout.c @@ -68,7 +68,6 @@ #include #include #include -#include #include #include @@ -95,8 +94,6 @@ #include #include -#include - #include #include #include @@ -110,14 +107,18 @@ /* * ENCRYPTED SWAP: */ -#include <../bsd/crypto/aes/aes.h> +#include extern u_int32_t random(void); /* from */ +extern int cs_debug; + #if UPL_DEBUG #include #endif -extern void consider_pressure_events(void); +#if VM_PRESSURE_EVENTS +extern void consider_vm_pressure_events(void); +#endif #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE /* maximum iterations of the active queue to move pages to inactive */ #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 100 @@ -159,6 +160,14 @@ extern void consider_pressure_events(void); #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */ #endif /* VM_PAGEOUT_IDLE_WAIT */ +#ifndef VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED +#define VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED 1000 /* maximum pages considered before we issue a pressure event */ +#endif /* VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED */ + +#ifndef VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS +#define VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS 5 /* seconds */ +#endif /* VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS */ + unsigned int vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS; unsigned int vm_page_speculative_percentage = 5; @@ -185,7 +194,7 @@ unsigned int vm_page_speculative_percentage = 5; */ #ifndef VM_PAGE_INACTIVE_TARGET -#define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3) +#define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2) #endif /* VM_PAGE_INACTIVE_TARGET */ /* @@ -251,6 +260,8 @@ unsigned int vm_page_speculative_percentage = 5; #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 100 +extern boolean_t hibernate_cleaning_in_progress; + /* * Exported variable used to broadcast the activation of the pageout scan * Working Set uses this to throttle its use of pmap removes. 
In this @@ -264,10 +275,12 @@ unsigned int vm_pageout_scan_event_counter = 0; * Forward declarations for internal routines. */ +static void vm_pressure_thread(void); static void vm_pageout_garbage_collect(int); static void vm_pageout_iothread_continue(struct vm_pageout_queue *); static void vm_pageout_iothread_external(void); static void vm_pageout_iothread_internal(void); +static void vm_pageout_adjust_io_throttles(struct vm_pageout_queue *, struct vm_pageout_queue *, boolean_t); extern void vm_pageout_continue(void); extern void vm_pageout_scan(void); @@ -289,15 +302,6 @@ unsigned int vm_pageout_burst_inactive_throttle = 0; int vm_upl_wait_for_pages = 0; -/* - * Protection against zero fill flushing live working sets derived - * from existing backing store and files - */ -unsigned int vm_accellerate_zf_pageout_trigger = 400; -unsigned int zf_queue_min_count = 100; -unsigned int vm_zf_queue_count = 0; - -uint64_t vm_zf_count __attribute__((aligned(8))) = 0; /* * These variables record the pageout daemon's actions: @@ -320,10 +324,27 @@ unsigned int vm_pageout_inactive_used = 0; /* debugging */ unsigned int vm_pageout_cache_evicted = 0; /* debugging */ unsigned int vm_pageout_inactive_clean = 0; /* debugging */ unsigned int vm_pageout_speculative_clean = 0; /* debugging */ + +unsigned int vm_pageout_freed_from_cleaned = 0; +unsigned int vm_pageout_freed_from_speculative = 0; +unsigned int vm_pageout_freed_from_inactive_clean = 0; + +unsigned int vm_pageout_enqueued_cleaned_from_inactive_clean = 0; +unsigned int vm_pageout_enqueued_cleaned_from_inactive_dirty = 0; + +unsigned int vm_pageout_cleaned_reclaimed = 0; /* debugging; how many cleaned pages are reclaimed by the pageout scan */ +unsigned int vm_pageout_cleaned_reactivated = 0; /* debugging; how many cleaned pages are found to be referenced on pageout (and are therefore reactivated) */ +unsigned int vm_pageout_cleaned_reference_reactivated = 0; +unsigned int vm_pageout_cleaned_volatile_reactivated = 0; +unsigned int vm_pageout_cleaned_fault_reactivated = 0; +unsigned int vm_pageout_cleaned_commit_reactivated = 0; /* debugging; how many cleaned pages are found to be referenced on commit (and are therefore reactivated) */ +unsigned int vm_pageout_cleaned_busy = 0; +unsigned int vm_pageout_cleaned_nolock = 0; + unsigned int vm_pageout_inactive_dirty_internal = 0; /* debugging */ unsigned int vm_pageout_inactive_dirty_external = 0; /* debugging */ unsigned int vm_pageout_inactive_deactivated = 0; /* debugging */ -unsigned int vm_pageout_inactive_zf = 0; /* debugging */ +unsigned int vm_pageout_inactive_anonymous = 0; /* debugging */ unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */ unsigned int vm_pageout_purged_objects = 0; /* debugging */ unsigned int vm_stat_discard = 0; /* debugging */ @@ -339,16 +360,22 @@ unsigned int vm_pageout_scan_active_throttled = 0; unsigned int vm_pageout_scan_inactive_throttled_internal = 0; unsigned int vm_pageout_scan_inactive_throttled_external = 0; unsigned int vm_pageout_scan_throttle = 0; /* debugging */ -unsigned int vm_pageout_scan_throttle_aborted = 0; /* debugging */ unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */ unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */ unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */ unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */ unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */ -unsigned int vm_pageout_inactive_external_forced_reactivate_count = 0; /* 
debugging */ +unsigned int vm_pageout_inactive_external_forced_reactivate_count = 0; /* debugging */ +unsigned int vm_pageout_inactive_external_forced_jetsam_count = 0; /* debugging */ unsigned int vm_page_speculative_count_drifts = 0; unsigned int vm_page_speculative_count_drift_max = 0; + +unsigned int vm_precleaning_aborted = 0; + +static boolean_t vm_pageout_need_to_refill_clean_queue = FALSE; +static boolean_t vm_pageout_precleaning_delayed = FALSE; + /* * Backing store throttle when BS is exhausted */ @@ -385,6 +412,12 @@ unsigned long vm_cs_validated_resets = 0; int vm_debug_events = 0; +#if CONFIG_MEMORYSTATUS +extern int memorystatus_wakeup; +#endif +#if CONFIG_JETSAM +extern int memorystatus_kill_top_proc_from_VM(void); +#endif /* * Routine: vm_backing_store_disable @@ -457,6 +490,7 @@ vm_pageout_object_terminate( assert(p->pageout); p->pageout = FALSE; assert(!p->cleaning); + assert(!p->laundry); offset = p->offset; VM_PAGE_FREE(p); @@ -467,13 +501,6 @@ vm_pageout_object_terminate( if(m == VM_PAGE_NULL) continue; - assert(m->cleaning); - /* used as a trigger on upl_commit etc to recognize the */ - /* pageout daemon's subseqent desire to pageout a cleaning */ - /* page. When the bit is on the upl commit code will */ - /* respect the pageout bit in the target page over the */ - /* caller's page list indication */ - m->dump_cleaning = FALSE; assert((m->dirty) || (m->precious) || (m->busy && m->cleaning)); @@ -483,9 +510,8 @@ vm_pageout_object_terminate( * Also decrement the burst throttle (if external). */ vm_page_lock_queues(); - if (m->laundry) { + if (m->laundry) vm_pageout_throttle_up(m); - } /* * Handle the "target" page(s). These pages are to be freed if @@ -514,10 +540,11 @@ vm_pageout_object_terminate( * can detect whether the page was redirtied during * pageout by checking the modify state. */ - if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED) - m->dirty = TRUE; - else - m->dirty = FALSE; + if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(m, FALSE); + } else { + m->dirty = FALSE; + } if (m->dirty) { CLUSTER_STAT(vm_pageout_target_page_dirtied++;) @@ -588,7 +615,7 @@ vm_pageout_object_terminate( else vm_pageout_cluster_cleaned++; if (m->wanted) vm_pageout_cluster_collisions++; #else - m->dirty = 0; + m->dirty = FALSE; #endif } if (m->encrypted_cleaning == TRUE) { @@ -650,7 +677,7 @@ vm_pageclean_setup( * Mark original page as cleaning in place. 
*/ m->cleaning = TRUE; - m->dirty = TRUE; + SET_PAGE_DIRTY(m, FALSE); m->precious = FALSE; /* @@ -697,7 +724,6 @@ vm_pageout_initialize_page( { vm_object_t object; vm_object_offset_t paging_offset; - vm_page_t holding_page; memory_object_t pager; XPR(XPR_VM_PAGEOUT, @@ -740,21 +766,17 @@ vm_pageout_initialize_page( return; } - /* set the page for future call to vm_fault_list_request */ - vm_object_paging_begin(object); - holding_page = NULL; - + /* + * set the page for future call to vm_fault_list_request + */ pmap_clear_modify(m->phys_page); - m->dirty = TRUE; - m->busy = TRUE; - m->list_req_pending = TRUE; - m->cleaning = TRUE; + SET_PAGE_DIRTY(m, FALSE); m->pageout = TRUE; - vm_page_lockspin_queues(); - vm_page_wire(m); - vm_page_unlock_queues(); - + /* + * keep the object from collapsing or terminating + */ + vm_object_paging_begin(object); vm_object_unlock(object); /* @@ -797,7 +819,7 @@ struct { */ void -vm_pageout_cluster(vm_page_t m) +vm_pageout_cluster(vm_page_t m, boolean_t pageout) { vm_object_t object = m->object; struct vm_pageout_queue *q; @@ -816,27 +838,19 @@ vm_pageout_cluster(vm_page_t m) /* * Only a certain kind of page is appreciated here. */ - assert(m->busy && (m->dirty || m->precious) && (!VM_PAGE_WIRED(m))); - assert(!m->cleaning && !m->pageout); + assert((m->dirty || m->precious) && (!VM_PAGE_WIRED(m))); + assert(!m->cleaning && !m->pageout && !m->laundry); #ifndef CONFIG_FREEZE assert(!m->inactive && !m->active); assert(!m->throttled); #endif /* - * protect the object from collapse - - * locking in the object's paging_offset. + * protect the object from collapse or termination */ - vm_object_paging_begin(object); + vm_object_activity_begin(object); - /* - * set the page for future call to vm_fault_list_request - * page should already be marked busy - */ - vm_page_wire(m); - m->list_req_pending = TRUE; - m->cleaning = TRUE; - m->pageout = TRUE; + m->pageout = pageout; if (object->internal == TRUE) q = &vm_pageout_queue_internal; @@ -879,6 +893,11 @@ vm_pageout_throttle_up( assert(m->object != VM_OBJECT_NULL); assert(m->object != kernel_object); +#if DEBUG + lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + vm_object_lock_assert_exclusive(m->object); +#endif + vm_pageout_throttle_up_count++; if (m->object->internal == TRUE) @@ -894,10 +913,9 @@ vm_pageout_throttle_up( m->pageq.next = NULL; m->pageq.prev = NULL; - vm_object_paging_end(m->object); + vm_object_activity_end(m->object); } - - if ( m->laundry == TRUE ) { + if (m->laundry == TRUE) { m->laundry = FALSE; q->pgo_laundry--; @@ -910,6 +928,16 @@ vm_pageout_throttle_up( q->pgo_draining = FALSE; thread_wakeup((event_t) (&q->pgo_laundry+1)); } + if (vm_pageout_precleaning_delayed == TRUE) { + /* + * since the pageout scan can return on laundry congestion, wake it up this way + * don't depend on pgo_throttled == TRUE to indicate that the pageout scan thread + * is blocked on &q->pgo_laundry since the hibernation mechanism utilizes both + * pgo_throttled and pgo_draining + */ + vm_pageout_precleaning_delayed = FALSE; + thread_wakeup((event_t)(&vm_page_free_wanted)); + } } } @@ -965,6 +993,14 @@ compute_memory_pressure( vm_pageout_stat_now = vm_pageout_next; } + +/* + * IMPORTANT + * mach_vm_ctl_page_free_wanted() is called indirectly, via + * mach_vm_pressure_monitor(), when taking a stackshot. Therefore, + * it must be safe in the restricted stackshot context. Locks and/or + * blocking are not allowable. 
+ */ unsigned int mach_vm_ctl_page_free_wanted(void) { unsigned int page_free_target, page_free_count, page_free_wanted; @@ -981,6 +1017,15 @@ mach_vm_ctl_page_free_wanted(void) return page_free_wanted; } + +/* + * IMPORTANT: + * mach_vm_pressure_monitor() is called when taking a stackshot, with + * wait_for_pressure FALSE, so that code path must remain safe in the + * restricted stackshot context. No blocking or locks are allowable + * on that code path. + */ + kern_return_t mach_vm_pressure_monitor( boolean_t wait_for_pressure, @@ -1049,15 +1094,24 @@ mach_vm_pressure_monitor( return KERN_SUCCESS; } -/* Page States: Used below to maintain the page state - before it's removed from it's Q. This saved state - helps us do the right accounting in certain cases -*/ + +/* + * function in BSD to apply I/O throttle to the pageout thread + */ +extern void vm_pageout_io_throttle(void); + + +/* + * Page States: Used below to maintain the page state + * before it's removed from its Q. This saved state + * helps us do the right accounting in certain cases + */ #define PAGE_STATE_SPECULATIVE 1 -#define PAGE_STATE_ZEROFILL 2 +#define PAGE_STATE_ANONYMOUS 2 #define PAGE_STATE_INACTIVE 3 #define PAGE_STATE_INACTIVE_FIRST 4 +#define PAGE_STATE_CLEAN 5 #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m) \ MACRO_BEGIN \ @@ -1091,11 +1145,13 @@ struct flow_control { mach_timespec_t ts; }; +uint32_t vm_pageout_considered_page = 0; + /* * vm_pageout_scan does the dirty work for the pageout daemon. - * It returns with vm_page_queue_free_lock held and - * vm_page_free_wanted == 0. + * It returns with both vm_page_queue_free_lock and vm_page_queue_lock + * held and vm_page_free_wanted == 0. */ void vm_pageout_scan(void) @@ -1121,11 +1177,11 @@ vm_pageout_scan(void) unsigned int msecs = 0; vm_object_t object; vm_object_t last_object_tried; - uint64_t zf_ratio; - uint64_t zf_run_count; uint32_t catch_up_count = 0; uint32_t inactive_reclaim_run; boolean_t forced_reclaim; + boolean_t exceeded_burst_throttle; + boolean_t grab_anonymous = FALSE; int page_prev_state = 0; int cache_evict_throttle = 0; uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0; @@ -1155,6 +1211,7 @@ vm_pageout_scan(void) vm_page_inactive_count); inactive_reclaim_run = 0; + vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; /* * We want to gradually dribble pages from the active queue @@ -1176,39 +1233,16 @@ vm_pageout_scan(void) Restart: assert(delayed_unlock!=0); - - /* - * A page is "zero-filled" if it was not paged in from somewhere, - * and it belongs to an object at least VM_ZF_OBJECT_SIZE_THRESHOLD big. - * Recalculate the zero-filled page ratio. We use this to apportion - * victimized pages between the normal and zero-filled inactive - * queues according to their relative abundance in memory. Thus if a task - * is flooding memory with zf pages, we begin to hunt them down. - * It would be better to throttle greedy tasks at a higher level, - * but at the moment mach vm cannot do this. - */ - { - uint64_t total = vm_page_active_count + vm_page_inactive_count; - uint64_t normal = total - vm_zf_count; - /* zf_ratio is the number of zf pages we victimize per normal page */ - - if (vm_zf_count < vm_accellerate_zf_pageout_trigger) - zf_ratio = 0; - else if ((vm_zf_count <= normal) || (normal == 0)) - zf_ratio = 1; - else - zf_ratio = vm_zf_count / normal; - - zf_run_count = 0; - } -
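The IMPORTANT comments above impose a no-locks, no-blocking rule on routines reachable from a stackshot. A minimal user-space illustration of that sampling style — all names below are invented stand-ins, and the arithmetic only approximates what the kernel routine computes — reads each counter once into a local and accepts a slightly stale answer instead of taking a lock:

#include <stdio.h>

static volatile unsigned int fake_free_count;
static volatile unsigned int fake_free_target;
static volatile unsigned int fake_free_wanted;

/* safe in a no-blocking context: single racy reads, no locks taken */
static unsigned int
fake_page_free_wanted(void)
{
	unsigned int free_count  = fake_free_count;
	unsigned int free_target = fake_free_target;
	unsigned int wanted      = fake_free_wanted;

	if (free_count < free_target)
		wanted += free_target - free_count;
	return wanted;
}

int
main(void)
{
	fake_free_count = 100;
	fake_free_target = 400;
	fake_free_wanted = 25;
	printf("pages wanted: %u\n", fake_page_free_wanted());
	return 0;
}

/* * Recalculate vm_page_inactivate_target.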
*/ vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count + vm_page_inactive_count + vm_page_speculative_count); + + vm_page_anonymous_min = vm_page_inactive_target / 3; + /* * don't want to wake the pageout_scan thread up everytime we fall below * the targets... set a low water mark at 0.25% below the target @@ -1223,8 +1257,6 @@ Restart: vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count + vm_page_inactive_count); - vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; - object = NULL; last_object_tried = NULL; try_failed = FALSE; @@ -1233,7 +1265,7 @@ Restart: catch_up_count = vm_page_inactive_count + vm_page_speculative_count; else catch_up_count = 0; - + for (;;) { vm_page_t m; @@ -1254,8 +1286,9 @@ Restart: /* * Move pages from active to inactive if we're below the target */ + /* if we are trying to make clean, we need to make sure we actually have inactive - mj */ if ((vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target) - goto done_moving_active_pages; + goto done_moving_active_pages; if (object != NULL) { vm_object_unlock(object); @@ -1336,6 +1369,9 @@ Restart: done_moving_active_pages: + if (vm_page_cleaned_count < VM_PAGE_CLEANED_MIN && vm_page_anonymous_count > vm_page_anonymous_min) + vm_pageout_need_to_refill_clean_queue = TRUE; + if (vm_page_free_count + local_freed >= vm_page_free_target) { if (object != NULL) { vm_object_unlock(object); @@ -1349,7 +1385,7 @@ done_moving_active_pages: VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START, vm_page_free_count, local_freed, delayed_unlock_limit, 2); - vm_page_free_list(local_freeq, TRUE); + vm_page_free_list(local_freeq, TRUE); VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END, vm_page_free_count, local_freed, 0, 2); @@ -1358,6 +1394,16 @@ done_moving_active_pages: local_freed = 0; vm_page_lock_queues(); } + /* + * make sure the pageout I/O threads are running + * throttled in case there are still requests + * in the laundry... since we have met our targets + * we don't need the laundry to be cleaned in a timely + * fashion... so let's avoid interfering with foreground + * activity + */ + vm_pageout_adjust_io_throttles(iq, eq, TRUE); + /* * recalculate vm_page_inactivate_target */ @@ -1377,19 +1423,18 @@ done_moving_active_pages: lck_mtx_lock(&vm_page_queue_free_lock); if ((vm_page_free_count >= vm_page_free_target) && + (vm_page_cleaned_count >= VM_PAGE_CLEANED_TARGET || vm_pageout_need_to_refill_clean_queue == FALSE) && (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) { /* * done - we have met our target *and* * there is no one waiting for a page. */ - vm_page_unlock_queues(); - - thread_wakeup((event_t) &vm_pageout_garbage_collect); - + vm_pageout_need_to_refill_clean_queue = FALSE; +return_from_scan: assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); VM_DEBUG_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE, - vm_pageout_inactive, vm_pageout_inactive_used, 0, 0); + vm_pageout_inactive, vm_pageout_inactive_used, vm_pageout_need_to_refill_clean_queue, 0); VM_DEBUG_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END, vm_pageout_speculative_clean, vm_pageout_inactive_clean, vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external); @@ -1524,14 +1569,15 @@ done_moving_active_pages: cache_evict_throttle--; + exceeded_burst_throttle = FALSE; /* * Sometimes we have to pause: * 1) No inactive pages - nothing to do. 
- * 2) Flow control - default pageout queue is full - * 3) Loop control - no acceptable pages found on the inactive queue + * 2) Loop control - no acceptable pages found on the inactive queue * within the last vm_pageout_burst_inactive_throttle iterations + * 3) Flow control - default pageout queue is full */ - if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf) && queue_empty(&sq->age_q)) { + if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_anonymous) && queue_empty(&sq->age_q)) { vm_pageout_scan_empty_throttle++; msecs = vm_pageout_empty_wait; goto vm_pageout_scan_delay; @@ -1542,6 +1588,8 @@ done_moving_active_pages: vm_page_speculative_count))) { vm_pageout_scan_burst_throttle++; msecs = vm_pageout_burst_wait; + + exceeded_burst_throttle = TRUE; goto vm_pageout_scan_delay; } else if (VM_PAGE_Q_THROTTLED(iq) && @@ -1552,6 +1600,14 @@ done_moving_active_pages: switch (flow_control.state) { case FCS_IDLE: + if ((vm_page_free_count + local_freed) < vm_page_free_target) { + if (vm_page_inactive_count - vm_page_anonymous_count > 0) { + grab_anonymous = FALSE; + goto consider_inactive; + } + if ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) + continue; + } reset_deadlock_timer: ts.tv_sec = vm_pageout_deadlock_wait / 1000; ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC; @@ -1590,7 +1646,6 @@ reset_deadlock_timer: vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged; vm_pageout_scan_deadlock_detected++; flow_control.state = FCS_DEADLOCK_DETECTED; - thread_wakeup((event_t) &vm_pageout_garbage_collect); goto consider_inactive; } @@ -1609,8 +1664,6 @@ reset_deadlock_timer: goto reset_deadlock_timer; } - vm_pageout_scan_throttle++; - iq->pgo_throttled = TRUE; vm_pageout_scan_delay: if (object != NULL) { vm_object_unlock(object); @@ -1624,7 +1677,7 @@ vm_pageout_scan_delay: VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START, vm_page_free_count, local_freed, delayed_unlock_limit, 3); - vm_page_free_list(local_freeq, TRUE); + vm_page_free_list(local_freeq, TRUE); VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END, vm_page_free_count, local_freed, 0, 3); @@ -1636,13 +1689,86 @@ vm_pageout_scan_delay: if (flow_control.state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) { flow_control.state = FCS_IDLE; - vm_pageout_scan_throttle_aborted++; goto consider_inactive; } } + + if (vm_page_free_count >= vm_page_free_target) { + /* + * we're here because either + * 1) someone else freed up some pages while we had + * the queues unlocked above or + * 2) we're precleaning and we haven't yet met + * our cleaned target + * and we've hit one of the 3 conditions that + * cause us to pause the pageout scan thread + * + * since we already have enough free pages, + * let's avoid stalling and return normally + * + * before we return, make sure the pageout I/O threads + * are running throttled in case there are still requests + * in the laundry... since we have enough free pages + * we don't need the laundry to be cleaned in a timely + * fashion... so let's avoid interfering with foreground + * activity + * + * we don't want to hold vm_page_queue_free_lock when + * calling vm_pageout_adjust_io_throttles (since it + * may cause other locks to be taken), we do the initial + * check outside of the lock. Once we take the lock, + * we recheck the condition since it may have changed. 
+ * if it has, no problem, we will make the threads + * non-throttled before actually blocking */ + vm_pageout_adjust_io_throttles(iq, eq, TRUE); + } + lck_mtx_lock(&vm_page_queue_free_lock); + if (vm_page_free_count >= vm_page_free_target) { + if (vm_page_cleaned_count < VM_PAGE_CLEANED_TARGET) { + vm_precleaning_aborted++; + vm_pageout_precleaning_delayed = TRUE; + } + goto return_from_scan; + } + lck_mtx_unlock(&vm_page_queue_free_lock); + + if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) { + /* + * we're most likely about to block due to one of + * the 3 conditions that cause vm_pageout_scan to + * not be able to make forward progress with respect + * to providing new pages to the free queue, + * so unthrottle the I/O threads in case we + * have laundry to be cleaned... it needs + * to be completed ASAP. + * + * even if we don't block, we want the io threads + * running unthrottled since the sum of free + + * clean pages is still under our free target + */ + vm_pageout_adjust_io_throttles(iq, eq, FALSE); + } + if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) { + /* + * if we get here we're below our free target and + * we're stalling due to a full laundry queue or + * we don't have any inactive pages other than + * those in the clean queue... + * however, we have pages on the clean queue that + * can be moved to the free queue, so let's not + * stall the pageout scan + */ + flow_control.state = FCS_IDLE; + goto consider_inactive; + } VM_CHECK_MEMORYSTATUS; + if (flow_control.state != FCS_IDLE) + vm_pageout_scan_throttle++; + iq->pgo_throttled = TRUE; + assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC); counter(c_vm_pageout_scan_block++); @@ -1679,9 +1805,12 @@ consider_inactive: loop_count++; inactive_burst_count++; vm_pageout_inactive++; - - /* Choose a victim. */ + boolean_t pageout_making_free = ((vm_page_free_count + local_freed) < vm_page_free_target); /* TRUE if making free, FALSE if making clean */ + + /* + * Choose a victim. + */ while (1) { m = NULL; @@ -1689,43 +1818,109 @@ consider_inactive: assert(vm_page_throttled_count == 0); assert(queue_empty(&vm_page_queue_throttled)); } - + /* - * The most eligible pages are ones we paged in speculatively, - * but which have not yet been touched. + * If we are still below the free target, try speculative + * and clean queue pages. */ - if ( !queue_empty(&sq->age_q) ) { - m = (vm_page_t) queue_first(&sq->age_q); + if (pageout_making_free) { + /* + * The most eligible pages are ones we paged in speculatively, + * but which have not yet been touched. + */ + if ( !queue_empty(&sq->age_q) ) { + m = (vm_page_t) queue_first(&sq->age_q); - page_prev_state = PAGE_STATE_SPECULATIVE; - break; + page_prev_state = PAGE_STATE_SPECULATIVE; + + break; + } + + /* + * Try a clean-queue inactive page, if we are still trying to fill the free list. + */ + if ( !queue_empty(&vm_page_queue_cleaned) ) { + m = (vm_page_t) queue_first(&vm_page_queue_cleaned); + + page_prev_state = PAGE_STATE_CLEAN; + + break; + } + + if (grab_anonymous == FALSE || queue_empty(&vm_page_queue_anonymous)) { + + if ( !queue_empty(&vm_page_queue_inactive) ) { + m = (vm_page_t) queue_first(&vm_page_queue_inactive); + + page_prev_state = PAGE_STATE_INACTIVE; + if (vm_pageout_need_to_refill_clean_queue == TRUE) + grab_anonymous = TRUE; + break; + } + } }
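The cascade above fixes the order in which the victim queues are consulted. A compact, purely illustrative rendering of that priority — names invented, with the grab_anonymous handling simplified to a flag:

enum victim_queue {
	VQ_NONE,
	VQ_SPECULATIVE,
	VQ_CLEANED,
	VQ_INACTIVE,
	VQ_ANONYMOUS
};

/* which queue the scan would try first, given what is non-empty */
static enum victim_queue
pick_victim_queue(int making_free, int refill_clean,
    int have_spec, int have_clean, int have_inactive, int have_anon)
{
	if (making_free) {
		if (have_spec)
			return VQ_SPECULATIVE;	/* speculatively paged in, never touched */
		if (have_clean)
			return VQ_CLEANED;	/* already cleaned: cheapest to free */
		if (have_inactive)
			return VQ_INACTIVE;
	}
	if (refill_clean && have_anon)
		return VQ_ANONYMOUS;	/* anonymous pages feed the clean queue */
	return VQ_NONE;
}

- /* - * Time for a zero-filled inactive page?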
- */ - if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) || - queue_empty(&vm_page_queue_inactive)) { - if ( !queue_empty(&vm_page_queue_zf) ) { - m = (vm_page_t) queue_first(&vm_page_queue_zf); + if (vm_pageout_need_to_refill_clean_queue == TRUE) { + if ( !queue_empty(&vm_page_queue_anonymous) ) { + m = (vm_page_t) queue_first(&vm_page_queue_anonymous); - page_prev_state = PAGE_STATE_ZEROFILL; - zf_run_count++; + page_prev_state = PAGE_STATE_ANONYMOUS; + grab_anonymous = FALSE; break; } } + /* - * It's either a normal inactive page or nothing. + * if we've gotten here, we have no victim page. + * if making clean, free the local freed list and return. + * if making free, check to see if we've finished balancing the queues + * yet; if we haven't, just continue, else panic */ - if ( !queue_empty(&vm_page_queue_inactive) ) { - m = (vm_page_t) queue_first(&vm_page_queue_inactive); + vm_page_unlock_queues(); - page_prev_state = PAGE_STATE_INACTIVE; - zf_run_count = 0; - break; - } + if (object != NULL) { + vm_object_unlock(object); + object = NULL; + } + vm_pageout_scan_wants_object = VM_OBJECT_NULL; + + if (local_freeq) { + VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START, + vm_page_free_count, local_freed, delayed_unlock_limit, 5); + + vm_page_free_list(local_freeq, TRUE); + + VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END, + vm_page_free_count, local_freed, 0, 5); + + local_freeq = NULL; + local_freed = 0; + } + vm_page_lock_queues(); + delayed_unlock = 1; + + if (pageout_making_free == FALSE) { + if (vm_pageout_need_to_refill_clean_queue == TRUE) + DTRACE_VM(novictimforclean); - panic("vm_pageout: no victim"); + lck_mtx_lock(&vm_page_queue_free_lock); + goto return_from_scan; + + } + if ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) + goto Restart; + + panic("vm_pageout: no victim"); + + /* NOTREACHED */ }
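When the victim loop above falls through empty-handed, the exit taken depends on why the scan was running; the sketch below (enum and names invented, illustrative only) mirrors that branch structure:

enum scan_exit {
	EXIT_RETURN,	/* making clean: no victim is a normal way out */
	EXIT_REBALANCE,	/* making free: go refill inactive from active */
	EXIT_PANIC	/* making free with balanced queues: impossible */
};

static enum scan_exit
no_victim_exit(int making_free, unsigned int inactive_plus_speculative,
    unsigned int inactive_target)
{
	if (!making_free)
		return EXIT_RETURN;
	if (inactive_plus_speculative < inactive_target)
		return EXIT_REBALANCE;
	return EXIT_PANIC;
}

+ + /* + * we just found this page on one of our queues...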
+ * it can't also be on the pageout queue, so safe + * to call VM_PAGE_QUEUES_REMOVE + */ + assert(!m->pageout_queue); + VM_PAGE_QUEUES_REMOVE(m); assert(!m->laundry); @@ -1771,6 +1966,9 @@ consider_inactive: vm_pageout_inactive_nolock++; + if (page_prev_state == PAGE_STATE_CLEAN) + vm_pageout_cleaned_nolock++; + if (page_prev_state == PAGE_STATE_SPECULATIVE) page_prev_state = PAGE_STATE_INACTIVE_FIRST; @@ -1788,14 +1986,17 @@ consider_inactive: */ m->object->scan_collisions++; - if ( !queue_empty(&sq->age_q) ) - m_want = (vm_page_t) queue_first(&sq->age_q); - else if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) || - queue_empty(&vm_page_queue_inactive)) { - if ( !queue_empty(&vm_page_queue_zf) ) - m_want = (vm_page_t) queue_first(&vm_page_queue_zf); - } else if ( !queue_empty(&vm_page_queue_inactive) ) { - m_want = (vm_page_t) queue_first(&vm_page_queue_inactive); + if (pageout_making_free) { + if ( !queue_empty(&sq->age_q) ) + m_want = (vm_page_t) queue_first(&sq->age_q); + else if (!queue_empty(&vm_page_queue_cleaned)) + m_want = (vm_page_t) queue_first(&vm_page_queue_cleaned); + else if (grab_anonymous == FALSE || queue_empty(&vm_page_queue_anonymous)) + m_want = (vm_page_t) queue_first(&vm_page_queue_inactive); + } + if (m_want == NULL && vm_pageout_need_to_refill_clean_queue == TRUE) { + if ( !queue_empty(&vm_page_queue_anonymous) ) + m_want = (vm_page_t) queue_first(&vm_page_queue_anonymous); } /* * this is the next object we're going to be interested in @@ -1850,6 +2051,10 @@ consider_inactive: * */ vm_pageout_inactive_busy++; + + if (page_prev_state == PAGE_STATE_CLEAN) + vm_pageout_cleaned_busy++; + requeue_page: switch (page_prev_state) { @@ -1857,12 +2062,8 @@ requeue_page: vm_page_speculate(m, FALSE); break; - case PAGE_STATE_ZEROFILL: - m->zero_fill = TRUE; - /* - * fall through to add in the - * inactive state - */ + case PAGE_STATE_ANONYMOUS: + case PAGE_STATE_CLEAN: case PAGE_STATE_INACTIVE: VM_PAGE_ENQUEUE_INACTIVE(m, FALSE); break; @@ -1890,7 +2091,7 @@ requeue_page: vm_pageout_inactive_notalive++; else vm_pageout_inactive_error++; -reclaim_page: +reclaim_page: if (vm_pageout_deadlock_target) { vm_pageout_scan_inactive_throttle_success++; vm_pageout_deadlock_target--; @@ -1903,7 +2104,10 @@ reclaim_page: } else { DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL); } - vm_page_free_prepare_queues(m); + assert(!m->cleaning); + assert(!m->laundry); + + m->busy = TRUE; /* * remove page from object here since we're already @@ -1919,6 +2123,13 @@ reclaim_page: m->pageq.next = (queue_entry_t)local_freeq; local_freeq = m; local_freed++; + + if (page_prev_state == PAGE_STATE_SPECULATIVE) + vm_pageout_freed_from_speculative++; + else if (page_prev_state == PAGE_STATE_CLEAN) + vm_pageout_freed_from_cleaned++; + else + vm_pageout_freed_from_inactive_clean++; inactive_burst_count = 0; @@ -1935,12 +2146,11 @@ reclaim_page: */ if (object->copy == VM_OBJECT_NULL) { if (object->purgable == VM_PURGABLE_EMPTY) { - m->busy = TRUE; if (m->pmapped == TRUE) { /* unmap the page */ refmod_state = pmap_disconnect(m->phys_page); if (refmod_state & VM_MEM_MODIFIED) { - m->dirty = TRUE; + SET_PAGE_DIRTY(m, FALSE); } } if (m->dirty || m->precious) { @@ -1955,11 +2165,15 @@ reclaim_page: /* just stick it back on! 
*/ reactivated_this_call++; + + if (page_prev_state == PAGE_STATE_CLEAN) + vm_pageout_cleaned_volatile_reactivated++; + goto reactivate_page; } } - consider_inactive_page: +consider_inactive_page: if (m->busy) { /* * CAUTION CAUTION: @@ -1994,37 +2208,54 @@ reclaim_page: if (refmod_state & VM_MEM_REFERENCED) m->reference = TRUE; - if (refmod_state & VM_MEM_MODIFIED) - m->dirty = TRUE; + if (refmod_state & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(m, FALSE); + } } - + /* + * if (m->cleaning) * If already cleaning this page in place and it hasn't - * been recently referenced, convert from - * "adjacent" to "target". We can leave the page mapped, - * and upl_commit_range will determine whether - * to free or reactivate. + * been recently referenced, just pull off the queue. + * We can leave the page mapped, and upl_commit_range + * will put it on the clean queue. * * note: if m->encrypted_cleaning == TRUE, then * m->cleaning == TRUE * and we'll handle it here + * + * if (m->pageout && !m->cleaning) + * an msync INVALIDATE is in progress... + * this page has been marked for destruction + * after it has been cleaned, + * but not yet gathered into a UPL + * where 'cleaning' will be set... + * just leave it off the paging queues + * + * if (m->pageout && m->cleaning) + * an msync INVALIDATE is in progress + * and the UPL has already gathered this page... + * just leave it off the paging queues */ + + /* + * page with m->pageout and still on the queues means that an + * MS_INVALIDATE is in progress on this page... leave it alone + */ + if (m->pageout) { + inactive_burst_count = 0; + goto done_with_inactivepage; + } + + /* if cleaning, reactivate if referenced. otherwise, just pull off queue */ if (m->cleaning) { - if (m->reference == TRUE) { reactivated_this_call++; goto reactivate_page; + } else { + inactive_burst_count = 0; + goto done_with_inactivepage; } - m->busy = TRUE; - m->pageout = TRUE; - m->dump_cleaning = TRUE; - vm_page_wire(m); - - CLUSTER_STAT(vm_pageout_cluster_conversions++); - - inactive_burst_count = 0; - - goto done_with_inactivepage; } if (m->reference || m->dirty) { /* deal with a rogue "reusable" page */ VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m); } @@ -2049,6 +2280,10 @@ reclaim_page: vm_pageout_inactive_force_reclaim++; } else { uint32_t isinuse; + + if (page_prev_state == PAGE_STATE_CLEAN) + vm_pageout_cleaned_reference_reactivated++; + reactivate_page: if ( !object->internal && object->pager != MEMORY_OBJECT_NULL && vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) { @@ -2065,6 +2300,10 @@ reactivate_page: vm_page_activate(m); VM_STAT_INCR(reactivations); } + + if (page_prev_state == PAGE_STATE_CLEAN) + vm_pageout_cleaned_reactivated++; + vm_pageout_inactive_used++; inactive_burst_count = 0; @@ -2077,8 +2316,9 @@ reactivate_page: */ if ((refmod_state == -1) && !m->dirty && m->pmapped) { refmod_state = pmap_get_refmod(m->phys_page); - if (refmod_state & VM_MEM_MODIFIED) - m->dirty = TRUE; + if (refmod_state & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(m, FALSE); + } } forced_reclaim = TRUE; } else {
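The (m->pageout, m->cleaning, m->reference) cases enumerated in the comment above collapse into a small decision table; the following sketch is illustrative only, with an invented enum and names:

enum scan_action {
	SA_LEAVE_OFF_QUEUES,	/* msync INVALIDATE owns the page */
	SA_REACTIVATE,		/* recently referenced: keep it around */
	SA_KEEP_SCANNING	/* fall through to normal processing */
};

static enum scan_action
classify_inactive_page(int pageout, int cleaning, int referenced)
{
	if (pageout)
		return SA_LEAVE_OFF_QUEUES;
	if (cleaning)
		return referenced ? SA_REACTIVATE : SA_LEAVE_OFF_QUEUES;
	return SA_KEEP_SCANNING;
}

@@ -2156,14 +2396,18 @@ throttle_inactive: * b) The thread doing the writing is waiting for pages while holding the truncate lock * c) Most of the pages in the inactive queue belong to this file.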
*/ - - vm_page_activate(m); + queue_enter(&vm_page_queue_active, m, vm_page_t, pageq); + m->active = TRUE; + vm_page_active_count++; + + vm_pageout_adjust_io_throttles(iq, eq, FALSE); + vm_pageout_inactive_external_forced_reactivate_count++; vm_pageout_inactive_external_forced_reactivate_limit--; if (vm_pageout_inactive_external_forced_reactivate_limit <= 0){ vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; -#if CONFIG_EMBEDDED +#if CONFIG_JETSAM /* * Possible deadlock scenario so request jetsam action */ @@ -2172,10 +2416,11 @@ throttle_inactive: object = VM_OBJECT_NULL; vm_page_unlock_queues(); - if (jetsam_kill_top_proc(TRUE, kJetsamFlagsKilledVM) < 0){ + if (memorystatus_kill_top_proc_from_VM() < 0){ panic("vm_pageout_scan: Jetsam request failed\n"); } + vm_pageout_inactive_external_forced_jetsam_count++; vm_page_lock_queues(); delayed_unlock = 1; #endif @@ -2191,12 +2436,7 @@ throttle_inactive: * we've got a page that we can steal... * eliminate all mappings and make sure * we have the up-to-date modified state - * first take the page BUSY, so that no new - * mappings can be made - */ - m->busy = TRUE; - - /* + * * if we need to do a pmap_disconnect then we * need to re-evaluate m->dirty since the pmap_disconnect * provides the true state atomically... the @@ -2204,9 +2444,7 @@ throttle_inactive: * and may have been dirtied at the last microsecond * * we also check for the page being referenced 'late' - * if it was, we first need to do a WAKEUP_DONE on it - * since we already set m->busy = TRUE, before - * going off to reactivate it + * and reactivate it for that case * * Note that if 'pmapped' is FALSE then the page is not * and has not been in any map, so there is no point calling @@ -2216,8 +2454,9 @@ throttle_inactive: if (m->pmapped == TRUE) { refmod_state = pmap_disconnect(m->phys_page); - if (refmod_state & VM_MEM_MODIFIED) - m->dirty = TRUE; + if (refmod_state & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(m, FALSE); + } if (refmod_state & VM_MEM_REFERENCED) { /* If m->reference is already set, this page must have @@ -2230,7 +2469,8 @@ throttle_inactive: ++reactivated_this_call >= reactivate_limit) vm_pageout_reactivation_limit_exceeded++; else { - PAGE_WAKEUP_DONE(m); + if (page_prev_state == PAGE_STATE_CLEAN) + vm_pageout_cleaned_reference_reactivated++; goto reactivate_page; } } @@ -2250,10 +2490,71 @@ throttle_inactive: if (page_prev_state == PAGE_STATE_SPECULATIVE) vm_pageout_speculative_clean++; else { - if (page_prev_state == PAGE_STATE_ZEROFILL) - vm_pageout_inactive_zf++; + if (page_prev_state == PAGE_STATE_ANONYMOUS) + vm_pageout_inactive_anonymous++; + else if (page_prev_state == PAGE_STATE_CLEAN) + vm_pageout_cleaned_reclaimed++; + + if (m->was_dirty) { + /* page on clean queue used to be dirty; we should increment the vm_stat pageout count here */ + VM_STAT_INCR(pageouts); + DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL); + } vm_pageout_inactive_clean++; } + /* FYI: (!pageout_making_free) == (!m->clean_queue && !m->speculative) */ + if (((vm_page_free_count + local_freed) >= vm_page_free_target) && !pageout_making_free) { + + assert(!m->clean_queue); + assert(!m->speculative); + + /* + * we have met our free page target and this page wasn't just pulled + * from the clean or speculative queues, so put it on the clean queue + */ + if (m->reference == TRUE) { + /* + * must have come through the forced reclaim path. 
+ * we need to clear out the reference state in this case + * so that we don't just reactivate the page when we + * find it in the clean queue based on an old reference. + * if it gets re-referenced while on the queue, then + * the reactivation is justified + */ + m->reference = FALSE; + pmap_clear_reference(m->phys_page); + } + + vm_pageout_enqueued_cleaned_from_inactive_clean++; + vm_page_enqueue_cleaned(m); + + inactive_burst_count = 0; /* we found a usable page on the inactive queue, hooray */ + + goto done_with_inactivepage; + + } + /* + * OK, at this point we have found a page we are going to free. + */ + +#ifndef CONFIG_EMBEDDED + +#define VM_PRESSURE_INTERVAL_NS 250000000 /* nanoseconds; == .25 seconds */ + if (vm_pageout_need_to_refill_clean_queue == TRUE || page_prev_state == PAGE_STATE_CLEAN) { + static uint64_t vm_pressure_last_time_ns = 0; + uint64_t cur_time_ns = 0; + absolutetime_to_nanoseconds(mach_absolute_time(), &cur_time_ns); + if (cur_time_ns >= vm_pressure_last_time_ns + VM_PRESSURE_INTERVAL_NS) { + vm_pressure_last_time_ns = cur_time_ns; + thread_wakeup(&vm_pressure_thread); +#if CONFIG_MEMORYSTATUS + /* Wake up idle-exit thread */ + thread_wakeup((event_t)&memorystatus_wakeup); +#endif + } + } +#endif /* !CONFIG_EMBEDDED */ + goto reclaim_page; } @@ -2270,22 +2571,19 @@ throttle_inactive: inactive_throttled = TRUE; } - if (inactive_throttled == TRUE) { - /* - * we set busy before issuing the pmap_disconnect, - * so clear it and wakeup anyone that happened upon - * it in that state - */ - PAGE_WAKEUP_DONE(m); + if (inactive_throttled == TRUE) goto throttle_inactive; - } - - vm_pageout_stats[vm_pageout_stat_now].reclaimed++; - - vm_pageout_cluster(m); + + /* + * do NOT set the pageout bit! + * sure, we might need free pages, but this page is going to take time to become free + * anyway, so we may as well put it on the clean queue first and take it from there later + * if necessary. that way, we'll ensure we don't free up too much. 
-mj + */ + vm_pageout_cluster(m, FALSE); - if (page_prev_state == PAGE_STATE_ZEROFILL) - vm_pageout_inactive_zf++; + if (page_prev_state == PAGE_STATE_ANONYMOUS) + vm_pageout_inactive_anonymous++; if (object->internal) vm_pageout_inactive_dirty_internal++; else @@ -2306,8 +2604,8 @@ done_with_inactivepage: VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START, vm_page_free_count, local_freed, delayed_unlock_limit, 4); - - vm_page_free_list(local_freeq, TRUE); + + vm_page_free_list(local_freeq, TRUE); VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END, vm_page_free_count, local_freed, 0, 4); @@ -2320,6 +2618,8 @@ done_with_inactivepage: delayed_unlock = 1; } + vm_pageout_considered_page++; + /* * back to top of pageout scan loop */ @@ -2358,7 +2658,7 @@ vm_page_free_reserve( vm_page_free_target = vm_page_free_min + 5; vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 3); - vm_page_creation_throttle = vm_page_free_target / 2; + vm_page_creation_throttle = vm_page_free_target * 3; } /* @@ -2370,12 +2670,18 @@ vm_pageout_continue(void) { DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL); vm_pageout_scan_event_counter++; + vm_pageout_scan(); - /* we hold vm_page_queue_free_lock now */ + /* + * we hold both the vm_page_queue_free_lock + * and the vm_page_queues_lock at this point + */ assert(vm_page_free_wanted == 0); assert(vm_page_free_wanted_privileged == 0); assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT); + lck_mtx_unlock(&vm_page_queue_free_lock); + vm_page_unlock_queues(); counter(c_vm_pageout_block++); thread_block((thread_continue_t)vm_pageout_continue); @@ -2397,6 +2703,7 @@ vm_pageout_iothread_continue(struct vm_pageout_queue *q) { vm_page_t m = NULL; vm_object_t object; + vm_object_offset_t offset; memory_object_t pager; thread_t self = current_thread(); @@ -2418,6 +2725,19 @@ vm_pageout_iothread_continue(struct vm_pageout_queue *q) m->pageout_queue = FALSE; m->pageq.next = NULL; m->pageq.prev = NULL; + + /* + * grab a snapshot of the object and offset this + * page is tabled in so that we can relookup this + * page after we've taken the object lock - these + * fields are stable while we hold the page queues lock + * but as soon as we drop it, there is nothing to keep + * this page in this object... we hold an activity_in_progress + * on this object which will keep it from terminating + */ + object = m->object; + offset = m->offset; + vm_page_unlock_queues(); #ifdef FAKE_DEADLOCK @@ -2439,10 +2759,27 @@ vm_pageout_iothread_continue(struct vm_pageout_queue *q) } } #endif - object = m->object; - vm_object_lock(object); + m = vm_page_lookup(object, offset); + + if (m == NULL || + m->busy || m->cleaning || m->pageout_queue || !m->laundry) { + /* + * it's either the same page that someone else has + * started cleaning (or it's finished cleaning or + * been put back on the pageout queue), or + * the page has been freed or we have found a + * new page at this offset... in all of these cases + * we merely need to release the activity_in_progress + * we took when we put the page on the pageout queue + */ + vm_object_activity_end(object); + vm_object_unlock(object); + + vm_page_lockspin_queues(); + continue; + } if (!object->pager_initialized) { /* @@ -2464,9 +2801,11 @@ vm_pageout_iothread_continue(struct vm_pageout_queue *q) * Should only happen if there is no * default pager. 
*/ + m->pageout = FALSE; + vm_page_lockspin_queues(); - vm_pageout_queue_steal(m, TRUE); + vm_pageout_throttle_up(m); vm_page_activate(m); vm_pageout_dirty_no_pager++; @@ -2475,9 +2814,7 @@ vm_pageout_iothread_continue(struct vm_pageout_queue *q) /* * And we are done with it. */ - PAGE_WAKEUP_DONE(m); - - vm_object_paging_end(object); + vm_object_activity_end(object); vm_object_unlock(object); vm_page_lockspin_queues(); @@ -2485,6 +2822,7 @@ vm_pageout_iothread_continue(struct vm_pageout_queue *q) } } pager = object->pager; + if (pager == MEMORY_OBJECT_NULL) { /* * This pager has been destroyed by either @@ -2501,7 +2839,7 @@ vm_pageout_iothread_continue(struct vm_pageout_queue *q) } else { vm_page_lockspin_queues(); - vm_pageout_queue_steal(m, TRUE); + vm_pageout_throttle_up(m); vm_page_activate(m); vm_page_unlock_queues(); @@ -2509,25 +2847,32 @@ vm_pageout_iothread_continue(struct vm_pageout_queue *q) /* * And we are done with it. */ - PAGE_WAKEUP_DONE(m); } - vm_object_paging_end(object); + vm_object_activity_end(object); vm_object_unlock(object); vm_page_lockspin_queues(); continue; } +#if 0 + /* + * we don't hold the page queue lock + * so this check isn't safe to make + */ VM_PAGE_CHECK(m); - vm_object_unlock(object); +#endif /* - * we expect the paging_in_progress reference to have - * already been taken on the object before it was added - * to the appropriate pageout I/O queue... this will - * keep the object from being terminated and/or the - * paging_offset from changing until the I/O has - * completed... therefore no need to lock the object to - * pull the paging_offset from it. - * + * give back the activity_in_progress reference we + * took when we queued up this page and replace + * it with a paging_in_progress reference that will + * also keep the paging offset from changing and + * prevent the object from terminating + */ + vm_object_activity_end(object); + vm_object_paging_begin(object); + vm_object_unlock(object); + + /* * Send the data to the pager.
* any pageout clustering happens there */ @@ -2544,20 +2889,14 @@ vm_pageout_iothread_continue(struct vm_pageout_queue *q) vm_object_paging_end(object); vm_object_unlock(object); - vm_page_lockspin_queues(); - } - assert_wait((event_t) q, THREAD_UNINT); + vm_pageout_io_throttle(); - if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) { - q->pgo_throttled = FALSE; - thread_wakeup((event_t) &q->pgo_laundry); - } - if (q->pgo_draining == TRUE && q->pgo_laundry == 0) { - q->pgo_draining = FALSE; - thread_wakeup((event_t) (&q->pgo_laundry+1)); + vm_page_lockspin_queues(); } q->pgo_busy = FALSE; q->pgo_idle = TRUE; + + assert_wait((event_t) q, THREAD_UNINT); vm_page_unlock_queues(); thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending); @@ -2565,6 +2904,47 @@ vm_pageout_iothread_continue(struct vm_pageout_queue *q) } + +static void +vm_pageout_adjust_io_throttles(struct vm_pageout_queue *iq, struct vm_pageout_queue *eq, boolean_t req_lowpriority) +{ + uint32_t policy; + boolean_t set_iq = FALSE; + boolean_t set_eq = FALSE; + + if (hibernate_cleaning_in_progress == TRUE) + req_lowpriority = FALSE; + + if (iq->pgo_inited == TRUE && iq->pgo_lowpriority != req_lowpriority) + set_iq = TRUE; + + if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority) + set_eq = TRUE; + + if (set_iq == TRUE || set_eq == TRUE) { + + vm_page_unlock_queues(); + + if (req_lowpriority == TRUE) { + policy = TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE; + DTRACE_VM(laundrythrottle); + } else { + policy = TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_NORMAL; + DTRACE_VM(laundryunthrottle); + } + if (set_iq == TRUE) { + proc_apply_thread_diskacc(kernel_task, iq->pgo_tid, policy); + iq->pgo_lowpriority = req_lowpriority; + } + if (set_eq == TRUE) { + proc_apply_thread_diskacc(kernel_task, eq->pgo_tid, policy); + eq->pgo_lowpriority = req_lowpriority; + } + vm_page_lock_queues(); + } +} + + static void vm_pageout_iothread_external(void) { @@ -2572,11 +2952,22 @@ vm_pageout_iothread_external(void) self->options |= TH_OPT_VMPRIV; + DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL); + proc_apply_thread_diskacc(kernel_task, self->thread_id, TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); + + vm_page_lock_queues(); + + vm_pageout_queue_external.pgo_tid = self->thread_id; + vm_pageout_queue_external.pgo_lowpriority = TRUE; + vm_pageout_queue_external.pgo_inited = TRUE; + + vm_page_unlock_queues(); + vm_pageout_iothread_continue(&vm_pageout_queue_external); + /*NOTREACHED*/ } - static void vm_pageout_iothread_internal(void) { @@ -2584,7 +2975,19 @@ vm_pageout_iothread_internal(void) self->options |= TH_OPT_VMPRIV; + DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL); + proc_apply_thread_diskacc(kernel_task, self->thread_id, TASK_POLICY_HWACCESS_DISK_ATTRIBUTE_THROTTLE); + + vm_page_lock_queues(); + + vm_pageout_queue_internal.pgo_tid = self->thread_id; + vm_pageout_queue_internal.pgo_lowpriority = TRUE; + vm_pageout_queue_internal.pgo_inited = TRUE; + + vm_page_unlock_queues(); + vm_pageout_iothread_continue(&vm_pageout_queue_internal); + /*NOTREACHED*/ } @@ -2598,28 +3001,67 @@ vm_set_buffer_cleanup_callout(boolean_t (*func)(int)) } } +static void +vm_pressure_thread(void) { + static boolean_t set_up_thread = FALSE; + + if (set_up_thread) { +#if VM_PRESSURE_EVENTS + consider_vm_pressure_events(); +#endif /* VM_PRESSURE_EVENTS */ + } + + set_up_thread = TRUE; + assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT); + thread_block((thread_continue_t)vm_pressure_thread); +} 
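vm_pressure_thread() above is written continuation-style: each wakeup re-enters the function from the top, which is why the static set_up_thread flag is needed to make the very first entry (thread creation) skip the work and park immediately. A rough user-space analog follows, written as a conventional loop so no such flag is needed; the pthread mutex/condvar pair stands in for assert_wait()/thread_block() and thread_wakeup(), and do_pressure_work() is a placeholder for consider_vm_pressure_events().

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static bool pending = false;

static void do_pressure_work(void) { /* placeholder for the real work */ }

static void *
pressure_thread(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    for (;;) {
        while (!pending)                 /* analog of assert_wait() + thread_block() */
            pthread_cond_wait(&wake, &lock);
        pending = false;

        pthread_mutex_unlock(&lock);
        do_pressure_work();              /* runs once per wakeup, after the first park */
        pthread_mutex_lock(&lock);
    }
    return NULL;                         /* not reached */
}

/* producer side: the analog of thread_wakeup(&vm_pressure_thread) */
static void
pressure_wakeup(void)
{
    pthread_mutex_lock(&lock);
    pending = true;
    pthread_cond_signal(&wake);
    pthread_mutex_unlock(&lock);
}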
+ +uint32_t vm_pageout_considered_page_last = 0; + +/* + * called once per-second via "compute_averages" + */ +void +compute_pageout_gc_throttle() +{ + if (vm_pageout_considered_page != vm_pageout_considered_page_last) { + + vm_pageout_considered_page_last = vm_pageout_considered_page; + + thread_wakeup((event_t) &vm_pageout_garbage_collect); + } +} + + static void vm_pageout_garbage_collect(int collect) { + if (collect) { boolean_t buf_large_zfree = FALSE; + boolean_t first_try = TRUE; + stack_collect(); - /* - * consider_zone_gc should be last, because the other operations - * might return memory to zones. - */ consider_machine_collect(); - if (consider_buffer_cache_collect != NULL) { - buf_large_zfree = (*consider_buffer_cache_collect)(0); - } - consider_zone_gc(buf_large_zfree); + + do { + if (consider_buffer_cache_collect != NULL) { + buf_large_zfree = (*consider_buffer_cache_collect)(0); + } + if (first_try == TRUE || buf_large_zfree == TRUE) { + /* + * consider_zone_gc should be last, because the other operations + * might return memory to zones. + */ + consider_zone_gc(buf_large_zfree); + } + first_try = FALSE; + + } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target); consider_machine_adjust(); - consider_pressure_events(); - } - assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT); thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1); @@ -2708,6 +3150,10 @@ vm_pageout(void) vm_pageout_queue_external.pgo_busy = FALSE; vm_pageout_queue_external.pgo_throttled = FALSE; vm_pageout_queue_external.pgo_draining = FALSE; + vm_pageout_queue_external.pgo_lowpriority = FALSE; + vm_pageout_queue_external.pgo_tid = -1; + vm_pageout_queue_external.pgo_inited = FALSE; + queue_init(&vm_pageout_queue_internal.pgo_pending); vm_pageout_queue_internal.pgo_maxlaundry = 0; @@ -2716,7 +3162,9 @@ vm_pageout(void) vm_pageout_queue_internal.pgo_busy = FALSE; vm_pageout_queue_internal.pgo_throttled = FALSE; vm_pageout_queue_internal.pgo_draining = FALSE; - + vm_pageout_queue_internal.pgo_lowpriority = FALSE; + vm_pageout_queue_internal.pgo_tid = -1; + vm_pageout_queue_internal.pgo_inited = FALSE; /* internal pageout thread started when default pager registered first time */ /* external pageout and garbage collection threads started here */ @@ -2730,13 +3178,22 @@ vm_pageout(void) thread_deallocate(vm_pageout_external_iothread); result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, - MINPRI_KERNEL, + BASEPRI_DEFAULT, &thread); if (result != KERN_SUCCESS) panic("vm_pageout_garbage_collect: create failed"); thread_deallocate(thread); + result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL, + BASEPRI_DEFAULT, + &thread); + + if (result != KERN_SUCCESS) + panic("vm_pressure_thread: create failed"); + + thread_deallocate(thread); + vm_object_reaper_init(); @@ -2824,6 +3281,9 @@ upl_create(int type, int flags, upl_size_t size) upl->upl_commit_index = 0; bzero(&upl->upl_commit_records[0], sizeof(upl->upl_commit_records)); + upl->uplq.next = 0; + upl->uplq.prev = 0; + (void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES); #endif /* UPL_DEBUG */ @@ -2841,7 +3301,7 @@ upl_destroy(upl_t upl) } #if UPL_DEBUG - { + if ( !(upl->flags & UPL_VECTOR)) { vm_object_t object; if (upl->flags & UPL_SHADOWED) { @@ -2851,6 +3311,8 @@ upl_destroy(upl_t upl) } vm_object_lock(object); queue_remove(&object->uplq, upl, upl_t, uplq); + vm_object_activity_end(object); + 
vm_object_collapse(object, 0, TRUE); vm_object_unlock(object); } #endif /* UPL_DEBUG */ @@ -2873,6 +3335,7 @@ upl_destroy(upl_t upl) } upl_lock_destroy(upl); upl->vector_upl = (vector_upl_t) 0xfeedbeef; + if (upl->flags & UPL_INTERNAL) { kfree(upl, sizeof(struct upl) + @@ -3080,6 +3543,7 @@ vm_object_upl_request( upl->offset = offset + object->paging_offset; #if UPL_DEBUG + vm_object_activity_begin(object); queue_enter(&object->uplq, upl, upl_t, uplq); #endif /* UPL_DEBUG */ @@ -3140,8 +3604,9 @@ vm_object_upl_request( dst_page->fictitious || dst_page->absent || dst_page->error || - (VM_PAGE_WIRED(dst_page) && !dst_page->pageout && !dst_page->list_req_pending)) { - + dst_page->cleaning || + (VM_PAGE_WIRED(dst_page))) { + if (user_page_list) user_page_list[entry].phys_addr = 0; @@ -3172,7 +3637,7 @@ vm_object_upl_request( /* * we're only asking for DIRTY pages to be returned */ - if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) { + if (dst_page->pageout || !(cntrl_flags & UPL_FOR_PAGEOUT)) { /* * if we were the page stolen by vm_pageout_scan to be * cleaned (as opposed to a buddy being clustered in @@ -3188,13 +3653,11 @@ vm_object_upl_request( * this is a request for a PAGEOUT cluster and this page * is merely along for the ride as a 'buddy'... not only * does it have to be dirty to be returned, but it also - * can't have been referenced recently... note that we've - * already filtered above based on whether this page is - * currently on the inactive queue or it meets the page - * ticket (generation count) check + * can't have been referenced recently... */ - if ( (cntrl_flags & UPL_CLEAN_IN_PLACE || !(refmod_state & VM_MEM_REFERENCED) || dst_page->throttled) && - ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) { + if ( (hibernate_cleaning_in_progress == TRUE || + (!((refmod_state & VM_MEM_REFERENCED) || dst_page->reference) || dst_page->throttled)) && + ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) { goto check_busy; } dont_return: @@ -3202,15 +3665,29 @@ dont_return: * if we reach here, we're not to return * the page... go on to the next one */ + if (dst_page->laundry == TRUE) { + /* + * if we get here, the page is not 'cleaning' (filtered out above). + * since it has been referenced, remove it from the laundry + * so we don't pay the cost of an I/O to clean a page + * we're just going to take back + */ + vm_page_lockspin_queues(); + + vm_pageout_steal_laundry(dst_page, TRUE); + vm_page_activate(dst_page); + + vm_page_unlock_queues(); + } if (user_page_list) user_page_list[entry].phys_addr = 0; goto try_next_page; } check_busy: - if (dst_page->busy && (!(dst_page->list_req_pending && (dst_page->pageout || dst_page->cleaning)))) { - if (cntrl_flags & UPL_NOBLOCK) { - if (user_page_list) + if (dst_page->busy) { + if (cntrl_flags & UPL_NOBLOCK) { + if (user_page_list) user_page_list[entry].phys_addr = 0; goto try_next_page; @@ -3221,16 +3698,7 @@ check_busy: */ PAGE_SLEEP(object, dst_page, THREAD_UNINT); - continue; - } - /* - * Someone else already cleaning the page? 
- */ - if ((dst_page->cleaning || dst_page->absent || VM_PAGE_WIRED(dst_page)) && !dst_page->list_req_pending) { - if (user_page_list) - user_page_list[entry].phys_addr = 0; - - goto try_next_page; + continue; } /* * ENCRYPTED SWAP: @@ -3281,23 +3749,15 @@ check_busy: * were not counted in the initial * vm_pageout_scan work */ - if (dst_page->list_req_pending) + if (dst_page->pageout) encountered_lrp = TRUE; - if ((dst_page->dirty || (dst_page->object->internal && dst_page->precious)) && !dst_page->list_req_pending) { + if ((dst_page->dirty || (dst_page->object->internal && dst_page->precious))) { if (encountered_lrp) CLUSTER_STAT(pages_at_higher_offsets++;) else CLUSTER_STAT(pages_at_lower_offsets++;) } #endif - /* - * Turn off busy indication on pending - * pageout. Note: we can only get here - * in the request pending case. - */ - dst_page->list_req_pending = FALSE; - dst_page->busy = FALSE; - hw_dirty = refmod_state & VM_MEM_MODIFIED; dirty = hw_dirty ? TRUE : dst_page->dirty; @@ -3340,14 +3800,15 @@ check_busy: */ vm_external_state_set(object->existence_map, dst_page->offset); #endif /*MACH_PAGEMAP*/ - dst_page->dirty = dirty; + if (dirty) { + SET_PAGE_DIRTY(dst_page, FALSE); + } else { + dst_page->dirty = FALSE; + } if (!dirty) dst_page->precious = TRUE; - if (dst_page->pageout) - dst_page->busy = TRUE; - if ( (cntrl_flags & UPL_ENCRYPT) ) { /* * ENCRYPTED SWAP: @@ -3367,16 +3828,8 @@ check_busy: dst_page->encrypted_cleaning = TRUE; } if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) { - /* - * deny access to the target page - * while it is being worked on - */ - if ((!dst_page->pageout) && ( !VM_PAGE_WIRED(dst_page))) { - dst_page->busy = TRUE; + if ( !VM_PAGE_WIRED(dst_page)) dst_page->pageout = TRUE; - - dwp->dw_mask |= DW_vm_page_wire; - } } } else { if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) { @@ -3427,70 +3880,33 @@ check_busy: if (dst_page != VM_PAGE_NULL) { if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) { + /* + * skip over pages already present in the cache + */ + if (user_page_list) + user_page_list[entry].phys_addr = 0; - if ( !(dst_page->absent && dst_page->list_req_pending) ) { - /* - * skip over pages already present in the cache - */ - if (user_page_list) - user_page_list[entry].phys_addr = 0; - - goto try_next_page; - } + goto try_next_page; + } + if (dst_page->fictitious) { + panic("need corner case for fictitious page"); } - if ( !(dst_page->list_req_pending) ) { - - if (dst_page->cleaning) { - /* - * someone else is writing to the page... wait... - */ - PAGE_SLEEP(object, dst_page, THREAD_UNINT); - - continue; - } - } else { - if (dst_page->fictitious && - dst_page->phys_page == vm_page_fictitious_addr) { - assert( !dst_page->speculative); - /* - * dump the fictitious page - */ - dst_page->list_req_pending = FALSE; - - VM_PAGE_FREE(dst_page); - dst_page = NULL; + if (dst_page->busy || dst_page->cleaning) { + /* + * someone else is playing with the + * page. We will have to wait. + */ + PAGE_SLEEP(object, dst_page, THREAD_UNINT); - } else if (dst_page->absent) { - /* - * the default_pager case - */ - dst_page->list_req_pending = FALSE; - PAGE_WAKEUP_DONE(dst_page); + continue; + } + if (dst_page->laundry) { + dst_page->pageout = FALSE; - } else if (dst_page->pageout || dst_page->cleaning) { - /* - * page was earmarked by vm_pageout_scan - * to be cleaned and stolen... we're going - * to take it back since we are not attempting - * to read that page and we don't want to stall - * waiting for it to be cleaned for 2 reasons... 
- * 1 - no use paging it out and back in - * 2 - if we stall, we may casue a deadlock in - * the FS trying to acquire the its locks - * on the VNOP_PAGEOUT path presuming that - * those locks are already held on the read - * path before trying to create this UPL - * - * so undo all of the state that vm_pageout_scan - * hung on this page - */ - vm_pageout_queue_steal(dst_page, FALSE); - PAGE_WAKEUP_DONE(dst_page); - } + vm_pageout_steal_laundry(dst_page, FALSE); } - } - if (dst_page == VM_PAGE_NULL) { + } else { if (object->private) { /* * This is a nasty wrinkle for users @@ -3580,18 +3996,6 @@ check_busy: dst_page->clustered = TRUE; } } - if (dst_page->fictitious) { - panic("need corner case for fictitious page"); - } - if (dst_page->busy) { - /* - * someone else is playing with the - * page. We will have to wait. - */ - PAGE_SLEEP(object, dst_page, THREAD_UNINT); - - continue; - } /* * ENCRYPTED SWAP: */ @@ -3698,7 +4102,7 @@ check_busy: } if (cntrl_flags & UPL_PRECIOUS) { if (dst_page->object->internal) { - dst_page->dirty = TRUE; + SET_PAGE_DIRTY(dst_page, FALSE); dst_page->precious = FALSE; } else { dst_page->precious = TRUE; @@ -3719,6 +4123,7 @@ check_busy: user_page_list[entry].dirty = dst_page->dirty; user_page_list[entry].precious = dst_page->precious; user_page_list[entry].device = FALSE; + user_page_list[entry].needed = FALSE; if (dst_page->clustered == TRUE) user_page_list[entry].speculative = dst_page->speculative; else @@ -3979,7 +4384,6 @@ REDISCOVER_ENTRY: if ((*upl_size/PAGE_SIZE) > MAX_UPL_SIZE) *upl_size = MAX_UPL_SIZE * PAGE_SIZE; } - /* * Create an object if necessary. */ @@ -4023,7 +4427,7 @@ REDISCOVER_ENTRY: vm_object_pmap_protect(local_object, entry->offset, entry->vme_end - entry->vme_start, - ((entry->is_shared || map->mapped) + ((entry->is_shared || map->mapped_in_other_pmaps) ? PMAP_NULL : map->pmap), entry->vme_start, @@ -4371,7 +4775,7 @@ process_upl_to_enter: /* m->wpmapped = TRUE; */ assert(map==kernel_map); - PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, 0, TRUE); + PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, VM_PROT_NONE, 0, TRUE); } offset += PAGE_SIZE_64; } @@ -4685,7 +5089,7 @@ process_upl_to_commit: page_list[entry].phys_addr = 0; if (flags & UPL_COMMIT_SET_DIRTY) { - m->dirty = TRUE; + SET_PAGE_DIRTY(m, FALSE); } else if (flags & UPL_COMMIT_CLEAR_DIRTY) { m->dirty = FALSE; @@ -4733,6 +5137,9 @@ process_upl_to_commit: goto commit_next_page; } + if (page_list) + page_list[entry].phys_addr = 0; + /* * make sure to clear the hardware * modify or reference bits before @@ -4743,87 +5150,79 @@ process_upl_to_commit: if (flags & UPL_COMMIT_CLEAR_DIRTY) { m->dirty = FALSE; - if (! (flags & UPL_COMMIT_CS_VALIDATED) && - m->cs_validated && !m->cs_tainted) { - /* - * CODE SIGNING: - * This page is no longer dirty - * but could have been modified, - * so it will need to be - * re-validated. - */ - m->cs_validated = FALSE; -#if DEVELOPMENT || DEBUG - vm_cs_validated_resets++; -#endif - pmap_disconnect(m->phys_page); - } clear_refmod |= VM_MEM_MODIFIED; } - if (page_list) { - upl_page_info_t *p; - - p = &(page_list[entry]); - - if (p->phys_addr && p->pageout && !m->pageout) { - m->busy = TRUE; - m->pageout = TRUE; - - dwp->dw_mask |= DW_vm_page_wire; + if (m->laundry) + dwp->dw_mask |= DW_vm_pageout_throttle_up; - } else if (p->phys_addr && - !p->pageout && m->pageout && - !m->dump_cleaning) { - m->pageout = FALSE; + if (VM_PAGE_WIRED(m)) + m->pageout = FALSE; + + if (! 
(flags & UPL_COMMIT_CS_VALIDATED) && + m->cs_validated && !m->cs_tainted) { + /* + * CODE SIGNING: + * This page is no longer dirty + * but could have been modified, + * so it will need to be + * re-validated. + */ + m->cs_validated = FALSE; +#if DEVELOPMENT || DEBUG + vm_cs_validated_resets++; +#endif + pmap_disconnect(m->phys_page); + } + if (m->overwriting) { + /* + * the (COPY_OUT_FROM == FALSE) request_page_list case + */ + if (m->busy) { m->absent = FALSE; - m->overwriting = FALSE; - dwp->dw_mask |= (DW_vm_page_unwire | DW_clear_busy | DW_PAGE_WAKEUP); + dwp->dw_mask |= DW_clear_busy; + } else { + /* + * alternate (COPY_OUT_FROM == FALSE) page_list case + * Occurs when the original page was wired + * at the time of the list request + */ + assert(VM_PAGE_WIRED(m)); + + dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */ } - page_list[entry].phys_addr = 0; + m->overwriting = FALSE; } - m->dump_cleaning = FALSE; + if (m->encrypted_cleaning == TRUE) { + m->encrypted_cleaning = FALSE; - if (m->laundry) - dwp->dw_mask |= DW_vm_pageout_throttle_up; + dwp->dw_mask |= DW_clear_busy | DW_PAGE_WAKEUP; + } + m->cleaning = FALSE; if (m->pageout) { - m->cleaning = FALSE; - m->encrypted_cleaning = FALSE; + /* + * With the clean queue enabled, UPL_PAGEOUT should + * no longer set the pageout bit. Its pages now go + * to the clean queue. + */ + assert(!(flags & UPL_PAGEOUT)); + m->pageout = FALSE; #if MACH_CLUSTER_STATS if (m->wanted) vm_pageout_target_collisions++; #endif - m->dirty = FALSE; - - if (! (flags & UPL_COMMIT_CS_VALIDATED) && - m->cs_validated && !m->cs_tainted) { - /* - * CODE SIGNING: - * This page is no longer dirty - * but could have been modified, - * so it will need to be - * re-validated. - */ - m->cs_validated = FALSE; -#if DEVELOPMENT || DEBUG - vm_cs_validated_resets++; -#endif - pmap_disconnect(m->phys_page); - } - if ((flags & UPL_COMMIT_SET_DIRTY) || - (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED))) - m->dirty = TRUE; - - if (m->dirty) { + (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED))) { /* * page was re-dirtied after we started * the pageout... reactivate it since * we don't know whether the on-disk * copy matches what is now in memory */ - dwp->dw_mask |= (DW_vm_page_unwire | DW_clear_busy | DW_PAGE_WAKEUP); + SET_PAGE_DIRTY(m, FALSE); + + dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP; if (upl->flags & UPL_PAGEOUT) { CLUSTER_STAT(vm_pageout_target_page_dirtied++;) @@ -4835,23 +5234,15 @@ process_upl_to_commit: * page has been successfully cleaned * go ahead and free it for other use */ - if (m->object->internal) { DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL); } else { DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL); } - dwp->dw_mask |= DW_vm_page_free; - - if (upl->flags & UPL_PAGEOUT) { - CLUSTER_STAT(vm_pageout_target_page_freed++;) + m->dirty = FALSE; + m->busy = TRUE; - if (page_list[entry].dirty) { - VM_STAT_INCR(pageouts); - DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL); - pgpgout_count++; - } - } + dwp->dw_mask |= DW_vm_page_free; } goto commit_next_page; } @@ -4863,51 +5254,6 @@ process_upl_to_commit: else vm_pageout_cluster_cleaned++; if (m->wanted) vm_pageout_cluster_collisions++; #endif - m->dirty = FALSE; - - if (! (flags & UPL_COMMIT_CS_VALIDATED) && - m->cs_validated && !m->cs_tainted) { - /* - * CODE SIGNING: - * This page is no longer dirty - * but could have been modified, - * so it will need to be - * re-validated. 
- */ - m->cs_validated = FALSE; -#if DEVELOPMENT || DEBUG - vm_cs_validated_resets++; -#endif - pmap_disconnect(m->phys_page); - } - - if (m->overwriting) { - /* - * the (COPY_OUT_FROM == FALSE) request_page_list case - */ - if (m->busy) { - m->absent = FALSE; - - dwp->dw_mask |= DW_clear_busy; - } else { - /* - * alternate (COPY_OUT_FROM == FALSE) page_list case - * Occurs when the original page was wired - * at the time of the list request - */ - assert(VM_PAGE_WIRED(m)); - - dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */ - } - m->overwriting = FALSE; - } - if (m->encrypted_cleaning == TRUE) { - m->encrypted_cleaning = FALSE; - - dwp->dw_mask |= DW_clear_busy; - } - m->cleaning = FALSE; - /* * It is a part of the semantic of COPYOUT_FROM * UPLs that a commit implies cache sync @@ -4918,17 +5264,29 @@ process_upl_to_commit: if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) m->precious = FALSE; - if (flags & UPL_COMMIT_SET_DIRTY) - m->dirty = TRUE; + if (flags & UPL_COMMIT_SET_DIRTY) { + SET_PAGE_DIRTY(m, FALSE); + } else { + m->dirty = FALSE; + } + + /* with the clean queue on, move *all* cleaned pages to the clean queue */ + if (hibernate_cleaning_in_progress == FALSE && !m->dirty && (upl->flags & UPL_PAGEOUT)) { + pgpgout_count++; + + /* this page used to be dirty; now it's on the clean queue. */ + m->was_dirty = TRUE; - if (should_be_throttled == TRUE && !m->active && !m->inactive && !m->speculative && !m->throttled) { + dwp->dw_mask |= DW_enqueue_cleaned; + vm_pageout_enqueued_cleaned_from_inactive_dirty++; + } else if (should_be_throttled == TRUE && !m->active && !m->inactive && !m->speculative && !m->throttled) { /* * page coming back in from being 'frozen'... * it was dirty before it was frozen, so keep it so * the vm_page_activate will notice that it really belongs * on the throttle queue and put it there */ - m->dirty = TRUE; + SET_PAGE_DIRTY(m, FALSE); dwp->dw_mask |= DW_vm_page_activate; } else { @@ -5032,6 +5390,7 @@ commit_next_page: * against this object */ vm_object_activity_end(shadow_object); + vm_object_collapse(shadow_object, 0, TRUE); } else { /* * we dontated the paging reference to @@ -5078,6 +5437,7 @@ upl_abort_range( int error, boolean_t *empty) { + upl_page_info_t *user_page_list = NULL; upl_size_t xfer_size, subupl_size = size; vm_object_t shadow_object; vm_object_t object; @@ -5155,6 +5515,8 @@ process_upl_to_abort: lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl)) + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t))); + + user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl)); } else { lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl)); @@ -5190,17 +5552,21 @@ process_upl_to_abort: while (xfer_size) { vm_page_t t, m; + unsigned int pg_num; + boolean_t needed; - dwp->dw_mask = 0; + pg_num = (unsigned int) (target_offset/PAGE_SIZE); + assert(pg_num == target_offset/PAGE_SIZE); + + needed = FALSE; + if (user_page_list) + needed = user_page_list[pg_num].needed; + + dwp->dw_mask = 0; m = VM_PAGE_NULL; if (upl->flags & UPL_LITE) { - unsigned int pg_num; - - pg_num = (unsigned int) (target_offset/PAGE_SIZE); - assert(pg_num == target_offset/PAGE_SIZE); - if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) { lite_list[pg_num>>5] &= ~(1 << (pg_num & 31)); @@ -5249,7 +5615,7 @@ process_upl_to_abort: m->unusual = TRUE; must_free = FALSE; } - if (m->clustered) { + if (m->clustered && needed == FALSE) { /* * This page was a part of a speculative * read-ahead initiated by the kernel @@ 
-5311,13 +5677,6 @@ process_upl_to_abort: */ dwp->dw_mask |= DW_clear_busy; } - if (m->pageout) { - assert(m->busy); - assert(m->wire_count == 1); - m->pageout = FALSE; - - dwp->dw_mask |= (DW_vm_page_unwire | DW_clear_busy); - } if (m->overwriting) { if (m->busy) dwp->dw_mask |= DW_clear_busy; @@ -5340,7 +5699,7 @@ process_upl_to_abort: dwp->dw_mask |= DW_clear_busy; } - m->dump_cleaning = FALSE; + m->pageout = FALSE; m->cleaning = FALSE; #if MACH_PAGEMAP vm_external_state_clr(m->object->existence_map, m->offset); @@ -5350,14 +5709,18 @@ process_upl_to_abort: dwp->dw_mask |= DW_vm_page_free; } else { - if (error & UPL_ABORT_REFERENCE) { - /* - * we've been told to explictly - * reference this page... for - * file I/O, this is done by - * implementing an LRU on the inactive q - */ - dwp->dw_mask |= DW_vm_page_lru; + if (!(dwp->dw_mask & DW_vm_page_unwire)) { + if (error & UPL_ABORT_REFERENCE) { + /* + * we've been told to explicitly + * reference this page... for + * file I/O, this is done by + * implementing an LRU on the inactive q + */ + dwp->dw_mask |= DW_vm_page_lru; + + } else if (!m->active && !m->inactive && !m->speculative) + dwp->dw_mask |= DW_vm_page_deactivate_internal; } dwp->dw_mask |= DW_PAGE_WAKEUP; } @@ -5432,6 +5795,7 @@ abort_next_page: * against this object */ vm_object_activity_end(shadow_object); + vm_object_collapse(shadow_object, 0, TRUE); } else { /* * we dontated the paging reference to @@ -5490,6 +5854,21 @@ upl_commit( return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty); } +void +vm_object_set_pmap_cache_attr( + vm_object_t object, + upl_page_info_array_t user_page_list, + unsigned int num_pages, + boolean_t batch_pmap_op) +{ + unsigned int cache_attr = 0; + + cache_attr = object->wimg_bits & VM_WIMG_MASK; + assert(user_page_list); + if (cache_attr != VM_WIMG_USE_DEFAULT) { + PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op); + } +} unsigned int vm_object_iopl_request_sleep_for_cleaning = 0; @@ -5627,7 +6006,7 @@ vm_object_iopl_request( if (cntrl_flags & UPL_BLOCK_ACCESS) { /* - * The user requested that access to the pages in this URL + * The user requested that access to the pages in this UPL * be blocked until the UPL is commited or aborted. */ upl->flags |= UPL_ACCESS_BLOCKED; @@ -5635,6 +6014,7 @@ vm_object_iopl_request( if (object->phys_contiguous) { #if UPL_DEBUG + vm_object_activity_begin(object); queue_enter(&object->uplq, upl, upl_t, uplq); #endif /* UPL_DEBUG */ @@ -5676,6 +6056,7 @@ vm_object_iopl_request( } #if UPL_DEBUG + vm_object_activity_begin(object); queue_enter(&object->uplq, upl, upl_t, uplq); #endif /* UPL_DEBUG */ @@ -5768,6 +6149,7 @@ vm_object_iopl_request( fault_info.interruptible = interruptible; fault_info.cluster_size = xfer_size; + fault_info.batch_pmap_op = TRUE; vm_object_paging_begin(object); @@ -5870,7 +6252,7 @@ vm_object_iopl_request( if (dst_page->cleaning) { /* - * Someone else is cleaning this page in place.as + * Someone else is cleaning this page in place. 
* In theory, we should be able to proceed and use this * page but they'll probably end up clearing the "busy" * bit on it in upl_commit_range() but they didn't set @@ -5883,6 +6265,11 @@ vm_object_iopl_request( PAGE_SLEEP(object, dst_page, THREAD_UNINT); continue; } + if (dst_page->laundry) { + dst_page->pageout = FALSE; + + vm_pageout_steal_laundry(dst_page, FALSE); + } if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) && dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) { vm_page_t low_page; @@ -5927,8 +6314,9 @@ vm_object_iopl_request( if (refmod & VM_MEM_REFERENCED) low_page->reference = TRUE; - if (refmod & VM_MEM_MODIFIED) - low_page->dirty = TRUE; + if (refmod & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(low_page, FALSE); + } vm_page_replace(low_page, object, dst_offset); @@ -5960,8 +6348,9 @@ vm_object_iopl_request( */ dwp->dw_mask |= DW_set_reference; - if (!(cntrl_flags & UPL_COPYOUT_FROM)) - dst_page->dirty = TRUE; + if (!(cntrl_flags & UPL_COPYOUT_FROM)) { + SET_PAGE_DIRTY(dst_page, TRUE); + } record_phys_addr: if (dst_page->busy) upl->flags |= UPL_HAS_BUSY; @@ -5980,6 +6369,7 @@ record_phys_addr: user_page_list[entry].dirty = dst_page->dirty; user_page_list[entry].precious = dst_page->precious; user_page_list[entry].device = FALSE; + user_page_list[entry].needed = FALSE; if (dst_page->clustered == TRUE) user_page_list[entry].speculative = dst_page->speculative; else @@ -6013,6 +6403,8 @@ record_phys_addr: if (dw_count) vm_page_do_delayed_work(object, &dw_array[0], dw_count); + vm_object_set_pmap_cache_attr(object, user_page_list, entry, TRUE); + if (page_list_count != NULL) { if (upl->flags & UPL_INTERNAL) *page_list_count = 0; @@ -6093,6 +6485,7 @@ return_err: #endif if (! (upl->flags & UPL_KERNEL_OBJECT)) { vm_object_activity_end(object); + vm_object_collapse(object, 0, TRUE); } vm_object_unlock(object); upl_destroy(upl); @@ -6179,6 +6572,27 @@ done: return retval; } +void +upl_range_needed( + upl_t upl, + int index, + int count) +{ + upl_page_info_t *user_page_list; + int size_in_pages; + + if ( !(upl->flags & UPL_INTERNAL) || count <= 0) + return; + + size_in_pages = upl->size / PAGE_SIZE; + + user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl)); + + while (count-- && index < size_in_pages) + user_page_list[index++].needed = TRUE; +} + + /* * ENCRYPTED SWAP: * @@ -6217,7 +6631,7 @@ done: * can call the encryption/decryption routines with a kernel * virtual address. We keep this pool of pre-allocated kernel * virtual addresses so that we don't have to scan the kernel's - * virtaul address space each time we need to encrypt or decrypt + * virtual address space each time we need to encrypt or decrypt * a physical page. * It would be nice to be able to encrypt and decrypt in physical * mode but that might not always be more efficient... @@ -6373,6 +6787,7 @@ vm_paging_map_object( page_map_offset, page, protection, + VM_PROT_NONE, 0, TRUE); vm_paging_objects_mapped++; @@ -6462,6 +6877,7 @@ vm_paging_map_object( *address + page_map_offset, page, protection, + VM_PROT_NONE, 0, TRUE); } @@ -6538,7 +6954,7 @@ vm_paging_unmap_object( */ #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! 
*/ boolean_t swap_crypt_ctx_initialized = FALSE; -aes_32t swap_crypt_key[8]; /* big enough for a 256 key */ +uint32_t swap_crypt_key[8]; /* big enough for a 256 key */ aes_ctx swap_crypt_ctx; const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, }; @@ -6667,7 +7083,6 @@ vm_page_encrypt( } assert(page->busy); - assert(page->dirty || page->precious); if (page->encrypted) { /* @@ -6676,6 +7091,8 @@ vm_page_encrypt( vm_page_encrypt_already_encrypted_counter++; return; } + assert(page->dirty || page->precious); + ASSERT_PAGE_DECRYPTED(page); /* @@ -7072,17 +7489,12 @@ vm_page_decrypt( #endif /* CRYPTO */ +/* + * page->object must be locked + */ void -vm_pageout_queue_steal(vm_page_t page, boolean_t queues_locked) +vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked) { - boolean_t pageout; - - pageout = page->pageout; - - page->list_req_pending = FALSE; - page->cleaning = FALSE; - page->pageout = FALSE; - if (!queues_locked) { vm_page_lockspin_queues(); } @@ -7097,14 +7509,6 @@ vm_pageout_queue_steal(vm_page_t page, boolean_t queues_locked) */ vm_pageout_throttle_up(page); - if (pageout == TRUE) { - /* - * toss the wire count we picked up - * when we intially set this page up - * to be cleaned... - */ - vm_page_unwire(page, TRUE); - } vm_page_steal_pageout_page++; if (!queues_locked) { @@ -7152,7 +7556,7 @@ vector_upl_deallocate(upl_t upl) vector_upl->size = 0; vector_upl->offset = 0; kfree(vector_upl, sizeof(struct _vector_upl)); - vector_upl = (vector_upl_t)0xdeadbeef; + vector_upl = (vector_upl_t)0xfeedfeed; } else panic("vector_upl_deallocate was passed a non-vectored upl\n"); @@ -7166,7 +7570,7 @@ vector_upl_is_valid(upl_t upl) { if(upl && ((upl->flags & UPL_VECTOR)==UPL_VECTOR)) { vector_upl_t vector_upl = upl->vector_upl; - if(vector_upl == NULL || vector_upl == (vector_upl_t)0xdeadbeef || vector_upl == (vector_upl_t)0xfeedbeef) + if(vector_upl == NULL || vector_upl == (vector_upl_t)0xfeedfeed || vector_upl == (vector_upl_t)0xfeedbeef) return FALSE; else return TRUE; @@ -7495,6 +7899,9 @@ vm_page_slide( uint32_t pageIndex = 0; assert(!page->slid); + + if (page->error) + return KERN_FAILURE; /* * Take a paging-in-progress reference to keep the object @@ -7548,6 +7955,15 @@ vm_page_slide( page->dirty = FALSE; pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED); + + if (kr != KERN_SUCCESS || cs_debug > 1) { + printf("vm_page_slide(%p): " + "obj %p off 0x%llx mobj %p moff 0x%llx\n", + page, + page->object, page->offset, + page->object->pager, + page->offset + page->object->paging_offset); + } if (kr == KERN_SUCCESS) { page->slid = TRUE; @@ -7635,7 +8051,7 @@ vm_countdirtypages(void) vm_page_unlock_queues(); vm_page_lock_queues(); - m = (vm_page_t) queue_first(&vm_page_queue_zf); + m = (vm_page_t) queue_first(&vm_page_queue_anonymous); do { if (m ==(vm_page_t )0) break; @@ -7647,7 +8063,7 @@ vm_countdirtypages(void) m = (vm_page_t) queue_next(&m->pageq); if (m ==(vm_page_t )0) break; - } while (!queue_end(&vm_page_queue_zf,(queue_entry_t) m)); + } while (!queue_end(&vm_page_queue_anonymous,(queue_entry_t) m)); vm_page_unlock_queues(); printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages); @@ -7705,80 +8121,3 @@ int upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2) return KERN_SUCCESS; } #endif /* UPL_DEBUG */ - - - -#if MACH_KDB -#include -#include -#include - -#define printf kdbprintf -void db_pageout(void); - -void -db_vm(void) -{ - - iprintf("VM Statistics:\n"); - db_indent += 2; - iprintf("pages:\n"); - db_indent += 2; - 
iprintf("activ %5d inact %5d free %5d", - vm_page_active_count, vm_page_inactive_count, - vm_page_free_count); - printf(" wire %5d gobbl %5d\n", - vm_page_wire_count, vm_page_gobble_count); - db_indent -= 2; - iprintf("target:\n"); - db_indent += 2; - iprintf("min %5d inact %5d free %5d", - vm_page_free_min, vm_page_inactive_target, - vm_page_free_target); - printf(" resrv %5d\n", vm_page_free_reserved); - db_indent -= 2; - iprintf("pause:\n"); - db_pageout(); - db_indent -= 2; -} - -#if MACH_COUNTERS -extern int c_laundry_pages_freed; -#endif /* MACH_COUNTERS */ - -void -db_pageout(void) -{ - iprintf("Pageout Statistics:\n"); - db_indent += 2; - iprintf("active %5d inactv %5d\n", - vm_pageout_active, vm_pageout_inactive); - iprintf("nolock %5d avoid %5d busy %5d absent %5d\n", - vm_pageout_inactive_nolock, vm_pageout_inactive_avoid, - vm_pageout_inactive_busy, vm_pageout_inactive_absent); - iprintf("used %5d clean %5d dirty(internal) %5d dirty(external) %5d\n", - vm_pageout_inactive_used, vm_pageout_inactive_clean, - vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external); -#if MACH_COUNTERS - iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed); -#endif /* MACH_COUNTERS */ -#if MACH_CLUSTER_STATS - iprintf("Cluster Statistics:\n"); - db_indent += 2; - iprintf("dirtied %5d cleaned %5d collisions %5d\n", - vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned, - vm_pageout_cluster_collisions); - iprintf("clusters %5d conversions %5d\n", - vm_pageout_cluster_clusters, vm_pageout_cluster_conversions); - db_indent -= 2; - iprintf("Target Statistics:\n"); - db_indent += 2; - iprintf("collisions %5d page_dirtied %5d page_freed %5d\n", - vm_pageout_target_collisions, vm_pageout_target_page_dirtied, - vm_pageout_target_page_freed); - db_indent -= 2; -#endif /* MACH_CLUSTER_STATS */ - db_indent -= 2; -} - -#endif /* MACH_KDB */ diff --git a/osfmk/vm/vm_pageout.h b/osfmk/vm/vm_pageout.h index d8ddac6a7..d04bbe8cf 100644 --- a/osfmk/vm/vm_pageout.h +++ b/osfmk/vm/vm_pageout.h @@ -87,9 +87,17 @@ #include +#define VM_PAGE_CLEANED_TARGET 30000 /* 25600 pages = 100 MB */ +#define VM_PAGE_CLEANED_MIN ((VM_PAGE_CLEANED_TARGET * 80) / 100) + +#define VM_PAGE_AVAILABLE_COUNT() ((unsigned int)(vm_page_cleaned_count)) + +/* externally manipulated counters */ +extern unsigned int vm_pageout_cleaned_reactivated, vm_pageout_cleaned_fault_reactivated, vm_pageout_cleaned_commit_reactivated; + #if CONFIG_FREEZE -extern boolean_t vm_freeze_enabled; -#define VM_DYNAMIC_PAGING_ENABLED(port) ((vm_freeze_enabled == FALSE) && IP_VALID(port)) +extern boolean_t memorystatus_freeze_enabled; +#define VM_DYNAMIC_PAGING_ENABLED(port) ((memorystatus_freeze_enabled == FALSE) && IP_VALID(port)) #else #define VM_DYNAMIC_PAGING_ENABLED(port) IP_VALID(port) #endif @@ -111,6 +119,8 @@ extern int vm_debug_events; #define VM_UPL_PAGE_WAIT 0x120 #define VM_IOPL_PAGE_WAIT 0x121 +#define VM_PRESSURE_EVENT 0x130 + #define VM_DEBUG_EVENT(name, event, control, arg1, arg2, arg3, arg4) \ MACRO_BEGIN \ if (vm_debug_events) { \ @@ -159,20 +169,8 @@ extern vm_page_t vm_page_get_next(vm_page_t page); #include extern unsigned int vm_pageout_scan_event_counter; -extern unsigned int vm_zf_queue_count; - +extern unsigned int vm_page_anonymous_count; -extern uint64_t vm_zf_count; - -#define VM_ZF_COUNT_INCR() \ - MACRO_BEGIN \ - OSAddAtomic64(1, (SInt64 *) &vm_zf_count); \ - MACRO_END \ - -#define VM_ZF_COUNT_DECR() \ - MACRO_BEGIN \ - OSAddAtomic64(-1, (SInt64 *) &vm_zf_count); \ - MACRO_END \ /* * must hold the page queues lock 
to @@ -182,11 +180,14 @@ struct vm_pageout_queue { queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */ unsigned int pgo_laundry; /* current count of laundry pages on queue or in flight */ unsigned int pgo_maxlaundry; + uint64_t pgo_tid; /* thread ID of I/O thread that services this queue */ + uint8_t pgo_lowpriority; /* iothread is set to use low priority I/O */ unsigned int pgo_idle:1, /* iothread is blocked waiting for work to do */ pgo_busy:1, /* iothread is currently processing request from pgo_pending */ pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */ pgo_draining:1, + pgo_inited:1, :0; }; @@ -208,7 +209,8 @@ extern void vm_pageout_object_terminate( vm_object_t object); extern void vm_pageout_cluster( - vm_page_t m); + vm_page_t m, + boolean_t pageout); extern void vm_pageout_initialize_page( vm_page_t m); @@ -328,6 +330,12 @@ extern void vector_upl_get_iostate_byindex(upl_t, uint32_t, upl_offset_t*, upl_s extern upl_t vector_upl_subupl_byindex(upl_t , uint32_t); extern upl_t vector_upl_subupl_byoffset(upl_t , upl_offset_t*, upl_size_t*); +extern void vm_object_set_pmap_cache_attr( + vm_object_t object, + upl_page_info_array_t user_page_list, + unsigned int num_pages, + boolean_t batch_pmap_op); + extern kern_return_t vm_object_iopl_request( vm_object_t object, vm_object_offset_t offset, @@ -399,7 +407,7 @@ decl_simple_lock_data(extern, vm_paging_lock) */ extern unsigned int vm_backing_store_low; -extern void vm_pageout_queue_steal( +extern void vm_pageout_steal_laundry( vm_page_t page, boolean_t queues_locked); diff --git a/osfmk/vm/vm_protos.h b/osfmk/vm/vm_protos.h index 53cf9e12a..fa8c27872 100644 --- a/osfmk/vm/vm_protos.h +++ b/osfmk/vm/vm_protos.h @@ -68,15 +68,6 @@ extern int default_pager_init_flag; /* * osfmk */ -#ifndef _KERN_IPC_TT_H_ /* XXX FBDP */ -/* these should be exported cleanly from OSFMK since BSD needs them */ -extern ipc_port_t convert_task_to_port( - task_t task); -extern ipc_port_t convert_thread_to_port( - thread_t thread); -extern ipc_port_t convert_task_name_to_port( - task_name_t task_name); -#endif /* _KERN_IPC_TT_H_ */ #ifndef _IPC_IPC_PORT_H_ extern mach_port_name_t ipc_port_copyout_send( ipc_port_t sright, @@ -343,7 +334,38 @@ extern kern_return_t default_pager_memory_object_create( #if CONFIG_FREEZE extern unsigned int default_pager_swap_pages_free(void); -#endif +struct default_freezer_handle; +struct vm_page; +__private_extern__ void default_freezer_init(void); +__private_extern__ struct default_freezer_handle* default_freezer_handle_allocate(void); +__private_extern__ kern_return_t +default_freezer_handle_init( + struct default_freezer_handle *df_handle); +__private_extern__ void +default_freezer_handle_deallocate( + struct default_freezer_handle *df_handle); +__private_extern__ void +default_freezer_pageout( + struct default_freezer_handle *df_handle); +__private_extern__ kern_return_t +default_freezer_pack( + unsigned int *purgeable_count, + unsigned int *wired_count, + unsigned int *clean_count, + unsigned int *dirty_count, + unsigned int dirty_budget, + boolean_t *shared, + vm_object_t src_object, + struct default_freezer_handle *df_handle); +__private_extern__ void +default_freezer_unpack( + struct default_freezer_handle *df_handle); +__private_extern__ void +default_freezer_pack_page( + struct vm_page* p, + struct default_freezer_handle *df_handle); + +#endif /* CONFIG_FREEZE */ extern void device_pager_reference(memory_object_t); extern void 
device_pager_deallocate(memory_object_t); @@ -410,6 +432,7 @@ extern void log_unnest_badness(vm_map_t, vm_map_offset_t, vm_map_offset_t); extern int cs_allow_invalid(struct proc *p); extern int cs_invalid_page(addr64_t vaddr); extern boolean_t cs_validate_page(void *blobs, + memory_object_t pager, memory_object_offset_t offset, const void *data, boolean_t *tainted); diff --git a/osfmk/vm/vm_purgeable.c b/osfmk/vm/vm_purgeable.c index f9f18a161..61830edeb 100644 --- a/osfmk/vm/vm_purgeable.c +++ b/osfmk/vm/vm_purgeable.c @@ -99,7 +99,7 @@ vm_purgeable_token_check_queue(purgeable_q_t queue) { our_inactive_count = page_cnt + queue->new_pages + token_new_pagecount; assert(our_inactive_count >= 0); - assert((uint32_t) our_inactive_count == vm_page_inactive_count); + assert((uint32_t) our_inactive_count == vm_page_inactive_count - vm_page_cleaned_count); } } #endif @@ -321,6 +321,68 @@ vm_purgeable_token_remove_first(purgeable_q_t queue) return token; } +static token_idx_t +vm_purgeable_token_remove_last(purgeable_q_t queue) +{ +#if MACH_ASSERT + lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); +#endif + + token_idx_t token; + token = queue->token_q_tail; + + assert(token); + + if (token) { + assert(queue->token_q_head); + + if (queue->token_q_tail == queue->token_q_head) + assert(tokens[token].next == 0); + + if (queue->token_q_unripe == 0) { + /* we're removing a ripe token. decrease count */ + available_for_purge--; + assert(available_for_purge >= 0); + } else if (queue->token_q_unripe == token) { + /* we're removing the only unripe token */ + queue->token_q_unripe = 0; + } + + if (token == queue->token_q_head) { + /* token is the last one in the queue */ + queue->token_q_head = 0; + queue->token_q_tail = 0; + } else { + token_idx_t new_tail; + + for (new_tail = queue->token_q_head; + tokens[new_tail].next != token && new_tail != 0; + new_tail = tokens[new_tail].next) { + } + assert(tokens[new_tail].next == token); + queue->token_q_tail = new_tail; + tokens[new_tail].next = 0; + } + + queue->new_pages += tokens[token].count; + +#if MACH_ASSERT + queue->debug_count_tokens--; + vm_purgeable_token_check_queue(queue); + + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, TOKEN_DELETE)), + queue->type, + tokens[queue->token_q_head].count, /* num pages on new + * first token */ + token_new_pagecount, /* num pages waiting for + * next token */ + available_for_purge, + 0); +#endif + } + return token; +} + /* * Delete first token from queue. Return token to token queue. * Call with page queue locked. @@ -340,6 +402,21 @@ vm_purgeable_token_delete_first(purgeable_q_t queue) } } +void +vm_purgeable_token_delete_last(purgeable_q_t queue) +{ +#if MACH_ASSERT + lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); +#endif + token_idx_t token = vm_purgeable_token_remove_last(queue); + + if (token) { + /* stick removed token on free queue */ + tokens[token].next = token_free_idx; + token_free_idx = token; + } +} + /* Call with page queue locked. */ void diff --git a/osfmk/vm/vm_purgeable_internal.h b/osfmk/vm/vm_purgeable_internal.h index 4f720eb39..169aa660d 100644 --- a/osfmk/vm/vm_purgeable_internal.h +++ b/osfmk/vm/vm_purgeable_internal.h @@ -88,6 +88,7 @@ kern_return_t vm_purgeable_token_add(purgeable_q_t queue); /* enter with page queue locked */ void vm_purgeable_token_delete_first(purgeable_q_t queue); +void vm_purgeable_token_delete_last(purgeable_q_t queue); /* * decrement token counters. 
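vm_purgeable_token_remove_last() above has to walk from token_q_head to find the predecessor of the tail because the token queue is singly linked, so tail removal costs O(n); that is presumably acceptable because delete-last is rare and token queues stay short. A miniature standalone analog of that removal follows, using the same table-of-indices representation where index 0 means "none"; the names (tok, struct tq, remove_last) are illustrative stand-ins, not the kernel's.

#define NTOKENS 16

static struct {
    int next;    /* index of the next token, 0 if none */
    int count;
} tok[NTOKENS];

struct tq {
    int head;
    int tail;
};

static int
remove_last(struct tq *q)
{
    int t = q->tail;

    if (t == 0)                  /* empty queue */
        return 0;

    if (q->head == t) {          /* t is the only token in the queue */
        q->head = 0;
        q->tail = 0;
    } else {
        int new_tail = q->head;  /* O(n) walk to the tail's predecessor */

        while (tok[new_tail].next != t)
            new_tail = tok[new_tail].next;

        tok[new_tail].next = 0;
        q->tail = new_tail;
    }
    return t;                    /* caller puts t back on the free list */
}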
diff --git a/osfmk/vm/vm_resident.c b/osfmk/vm/vm_resident.c index 0c0a34e04..a548c0600 100644 --- a/osfmk/vm/vm_resident.c +++ b/osfmk/vm/vm_resident.c @@ -93,11 +93,9 @@ #include - -#include - #include +boolean_t hibernate_cleaning_in_progress = FALSE; boolean_t vm_page_free_verify = TRUE; uint32_t vm_lopage_free_count = 0; @@ -259,6 +257,8 @@ unsigned int vm_page_free_count_minimum; /* debugging */ zone_t vm_page_zone; vm_locks_array_t vm_page_locks; decl_lck_mtx_data(,vm_page_alloc_lock) +lck_mtx_ext_t vm_page_alloc_lock_ext; + unsigned int io_throttle_zero_fill; unsigned int vm_page_local_q_count = 0; @@ -266,6 +266,9 @@ unsigned int vm_page_local_q_soft_limit = 250; unsigned int vm_page_local_q_hard_limit = 500; struct vplq *vm_page_local_q = NULL; +/* N.B. Guard and fictitious pages must not + * be assigned a zero phys_page value. + */ /* * Fictitious pages don't have a physical address, * but we must initialize phys_page to something. @@ -296,11 +299,12 @@ ppnum_t vm_page_guard_addr = (ppnum_t) -2; */ queue_head_t vm_page_queue_active; queue_head_t vm_page_queue_inactive; -queue_head_t vm_page_queue_zf; /* inactive memory queue for zero fill */ +queue_head_t vm_page_queue_anonymous; /* inactive memory queue for anonymous pages */ queue_head_t vm_page_queue_throttled; unsigned int vm_page_active_count; unsigned int vm_page_inactive_count; +unsigned int vm_page_anonymous_count; unsigned int vm_page_throttled_count; unsigned int vm_page_speculative_count; unsigned int vm_page_wire_count; @@ -319,6 +323,11 @@ unsigned int vm_page_speculative_created = 0; unsigned int vm_page_speculative_used = 0; #endif +queue_head_t vm_page_queue_cleaned; + +unsigned int vm_page_cleaned_count = 0; +unsigned int vm_pageout_enqueued_cleaned = 0; + uint64_t max_valid_dma_address = 0xffffffffffffffffULL; ppnum_t max_valid_low_ppnum = 0xffffffff; @@ -334,10 +343,12 @@ unsigned int vm_page_free_min = 0; unsigned int vm_page_throttle_limit = 0; uint32_t vm_page_creation_throttle = 0; unsigned int vm_page_inactive_target = 0; +unsigned int vm_page_anonymous_min = 0; unsigned int vm_page_inactive_min = 0; unsigned int vm_page_free_reserved = 0; unsigned int vm_page_throttle_count = 0; + /* * The VM system has a couple of heuristics for deciding * that pages are "uninteresting" and should be placed @@ -424,6 +435,7 @@ vm_page_init_lck_grp(void) lck_grp_init(&vm_page_lck_grp_alloc, "vm_page_alloc", &vm_page_lck_grp_attr); lck_grp_init(&vm_page_lck_grp_bucket, "vm_page_bucket", &vm_page_lck_grp_attr); lck_attr_setdefault(&vm_page_lck_attr); + lck_mtx_init_ext(&vm_page_alloc_lock, &vm_page_alloc_lock_ext, &vm_page_lck_grp_alloc, &vm_page_lck_attr); } void @@ -528,14 +540,12 @@ vm_page_bootstrap( m->unusual = FALSE; m->encrypted = FALSE; m->encrypted_cleaning = FALSE; - m->list_req_pending = FALSE; - m->dump_cleaning = FALSE; m->cs_validated = FALSE; m->cs_tainted = FALSE; m->no_cache = FALSE; - m->zero_fill = FALSE; m->reusable = FALSE; m->slid = FALSE; + m->was_dirty = FALSE; m->__unused_object_bits = 0; @@ -570,8 +580,9 @@ vm_page_bootstrap( queue_init(&vm_lopage_queue_free); queue_init(&vm_page_queue_active); queue_init(&vm_page_queue_inactive); + queue_init(&vm_page_queue_cleaned); queue_init(&vm_page_queue_throttled); - queue_init(&vm_page_queue_zf); + queue_init(&vm_page_queue_anonymous); for ( i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++ ) { queue_init(&vm_page_queue_speculative[i].age_q); @@ -588,9 +599,8 @@ vm_page_bootstrap( /* * Steal memory for the map and zone subsystems. 
*/ - - vm_map_steal_memory(); zone_steal_memory(); + vm_map_steal_memory(); /* * Allocate (and initialize) the virtual-to-physical @@ -754,7 +764,7 @@ pmap_steal_memory( #endif pmap_enter(kernel_pmap, vaddr, phys_page, - VM_PROT_READ|VM_PROT_WRITE, + VM_PROT_READ|VM_PROT_WRITE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE); /* * Account for newly stolen memory @@ -806,7 +816,18 @@ pmap_startup( */ fill = 0; /* Assume no fill */ if (PE_parse_boot_argn("fill", &fillval, sizeof (fillval))) fill = 1; /* Set fill */ - +#if DEBUG + /* This slows down booting the DEBUG kernel, particularly on + * large memory systems, but is worthwhile in deterministically + * trapping uninitialized memory usage. + */ + if (fill == 0) { + fill = 1; + fillval = 0xDEB8F177; + } +#endif + if (fill) + kprintf("Filling vm_pages with pattern: 0x%x\n", fillval); // -debug code remove if (2 == vm_himemory_mode) { // free low -> high so high is preferred @@ -903,7 +924,7 @@ vm_page_module_init(void) zone_change(vm_page_zone, Z_EXPAND, FALSE); zone_change(vm_page_zone, Z_EXHAUST, TRUE); zone_change(vm_page_zone, Z_FOREIGN, TRUE); - + zone_change(vm_page_zone, Z_GZALLOC_EXEMPT, TRUE); /* * Adjust zone statistics to account for the real pages allocated * in vm_page_create(). [Q: is this really what we want?] @@ -911,8 +932,6 @@ vm_page_module_init(void) vm_page_zone->count += vm_page_pages; vm_page_zone->sum_count += vm_page_pages; vm_page_zone->cur_size += vm_page_pages * vm_page_zone->elem_size; - - lck_mtx_init(&vm_page_alloc_lock, &vm_page_lck_grp_alloc, &vm_page_lck_attr); } /* @@ -973,7 +992,7 @@ vm_page_insert( vm_object_t object, vm_object_offset_t offset) { - vm_page_insert_internal(mem, object, offset, FALSE, TRUE); + vm_page_insert_internal(mem, object, offset, FALSE, TRUE, FALSE); } void @@ -982,7 +1001,8 @@ vm_page_insert_internal( vm_object_t object, vm_object_offset_t offset, boolean_t queues_lock_held, - boolean_t insert_in_hash) + boolean_t insert_in_hash, + boolean_t batch_pmap_op) { vm_page_bucket_t *bucket; lck_spin_t *bucket_lock; @@ -991,8 +1011,13 @@ vm_page_insert_internal( XPR(XPR_VM_PAGE, "vm_page_insert, object 0x%X offset 0x%X page 0x%X\n", object, offset, mem, 0,0); - +#if 0 + /* + * we may not hold the page queue lock + * so this check isn't safe to make + */ VM_PAGE_CHECK(mem); +#endif if (object == vm_submap_object) { /* the vm_submap_object is only a placeholder for submaps */ @@ -1047,13 +1072,13 @@ vm_page_insert_internal( lck_spin_unlock(bucket_lock); } - { unsigned int cache_attr; + { + unsigned int cache_attr; cache_attr = object->wimg_bits & VM_WIMG_MASK; if (cache_attr != VM_WIMG_USE_DEFAULT) { - pmap_set_cache_attributes(mem->phys_page, cache_attr); - object->set_cache_attr = TRUE; + PMAP_SET_CACHE_ATTR(mem, object, cache_attr, batch_pmap_op); } } /* @@ -1118,7 +1143,13 @@ vm_page_replace( lck_spin_t *bucket_lock; int hash_id; +#if 0 + /* + * we don't hold the page queue lock + * so this check isn't safe to make + */ VM_PAGE_CHECK(mem); +#endif vm_object_lock_assert_exclusive(object); #if DEBUG if (mem->tabled || mem->object != VM_OBJECT_NULL) @@ -1181,7 +1212,7 @@ vm_page_replace( */ vm_page_free_unlocked(found_m, FALSE); } - vm_page_insert_internal(mem, object, offset, FALSE, FALSE); + vm_page_insert_internal(mem, object, offset, FALSE, FALSE, FALSE); } /* @@ -1211,8 +1242,14 @@ vm_page_remove( vm_object_lock_assert_exclusive(mem->object); assert(mem->tabled); assert(!mem->cleaning); + assert(!mem->laundry); +#if 0 + /* + * we don't hold the page queue lock + * so this check isn't safe 
to make + */ VM_PAGE_CHECK(mem); - +#endif if (remove_from_hash == TRUE) { /* * Remove from the object_object/offset hash table @@ -1389,7 +1426,13 @@ vm_page_lookup( lck_spin_lock(bucket_lock); for (mem = bucket->pages; mem != VM_PAGE_NULL; mem = mem->next) { +#if 0 + /* + * we don't hold the page queue lock + * so this check isn't safe to make + */ VM_PAGE_CHECK(mem); +#endif if ((mem->object == object) && (mem->offset == offset)) break; } @@ -1454,7 +1497,7 @@ vm_page_rename( vm_page_lockspin_queues(); vm_page_remove(mem, TRUE); - vm_page_insert_internal(mem, new_object, new_offset, TRUE, TRUE); + vm_page_insert_internal(mem, new_object, new_offset, TRUE, TRUE, FALSE); vm_page_unlock_queues(); } @@ -1932,9 +1975,9 @@ return_page_from_cpu_list: * it doesn't really matter. */ if ((vm_page_free_count < vm_page_free_min) || - ((vm_page_free_count < vm_page_free_target) && - ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min))) - thread_wakeup((event_t) &vm_page_free_wanted); + ((vm_page_free_count < vm_page_free_target) && + ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min))) + thread_wakeup((event_t) &vm_page_free_wanted); VM_CHECK_MEMORYSTATUS; @@ -1964,7 +2007,6 @@ vm_page_release( } // dbgLog(mem->phys_page, vm_page_free_count, vm_page_wire_count, 5); /* (TEST/DEBUG) */ - pmap_clear_noencrypt(mem->phys_page); lck_mtx_lock_spin(&vm_page_queue_free_lock); @@ -2204,7 +2246,6 @@ vm_page_free_prepare_queues( VM_PAGE_CHECK(mem); assert(!mem->free); assert(!mem->cleaning); - assert(!mem->pageout); #if DEBUG lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); if (mem->free) @@ -2213,18 +2254,18 @@ vm_page_free_prepare_queues( if (mem->object) { vm_object_lock_assert_exclusive(mem->object); } - if (mem->laundry) { /* * We may have to free a page while it's being laundered * if we lost its pager (due to a forced unmount, for example). - * We need to call vm_pageout_throttle_up() before removing - * the page from its VM object, so that we can find out on - * which pageout queue the page is on. + * We need to call vm_pageout_steal_laundry() before removing + * the page from its VM object, so that we can remove it + * from its pageout queue and adjust the laundry accounting */ - vm_pageout_throttle_up(mem); + vm_pageout_steal_laundry(mem, TRUE); counter(++c_laundry_pages_freed); } + VM_PAGE_QUEUES_REMOVE(mem); /* clears local/active/inactive/throttled/speculative */ if (VM_PAGE_WIRED(mem)) { @@ -2268,8 +2309,6 @@ vm_page_free_prepare_object( mem->phys_page = vm_page_fictitious_addr; } if ( !mem->fictitious) { - if (mem->zero_fill == TRUE) - VM_ZF_COUNT_DECR(); vm_page_init(mem, mem->phys_page, mem->lopage); } } @@ -2315,196 +2354,163 @@ vm_page_free_unlocked( } } + /* * Free a list of pages. The list can be up to several hundred pages, * as blocked up by vm_pageout_scan(). * The big win is not having to take the free list lock once - * per page. We sort the incoming pages into n lists, one for - * each color. + * per page. 
*/ void vm_page_free_list( - vm_page_t mem, + vm_page_t freeq, boolean_t prepare_object) { + vm_page_t mem; vm_page_t nxt; - int pg_count = 0; - int color; - int inuse_list_head = -1; + vm_page_t local_freeq; + int pg_count; - queue_head_t free_list[MAX_COLORS]; - int inuse[MAX_COLORS]; + while (freeq) { - for (color = 0; color < (signed) vm_colors; color++) { - queue_init(&free_list[color]); - } - - while (mem) { - assert(!mem->inactive); - assert(!mem->active); - assert(!mem->throttled); - assert(!mem->free); - assert(!mem->speculative); - assert(!VM_PAGE_WIRED(mem)); - assert(mem->pageq.prev == NULL); + pg_count = 0; + local_freeq = VM_PAGE_NULL; + mem = freeq; - nxt = (vm_page_t)(mem->pageq.next); + /* + * break up the processing into smaller chunks so + * that we can 'pipeline' the pages onto the + * free list w/o introducing too much + * contention on the global free queue lock + */ + while (mem && pg_count < 64) { + + assert(!mem->inactive); + assert(!mem->active); + assert(!mem->throttled); + assert(!mem->free); + assert(!mem->speculative); + assert(!VM_PAGE_WIRED(mem)); + assert(mem->pageq.prev == NULL); + + nxt = (vm_page_t)(mem->pageq.next); - if (prepare_object == TRUE) - vm_page_free_prepare_object(mem, TRUE); + if (vm_page_free_verify && !mem->fictitious && !mem->private) { + assert(pmap_verify_free(mem->phys_page)); + } + if (prepare_object == TRUE) + vm_page_free_prepare_object(mem, TRUE); - if (vm_page_free_verify && !mem->fictitious && !mem->private) { - assert(pmap_verify_free(mem->phys_page)); - } + if (!mem->fictitious) { + assert(mem->busy); - if (!mem->fictitious) { - assert(mem->busy); - if ((mem->lopage == TRUE || vm_lopage_refill == TRUE) && - vm_lopage_free_count < vm_lopage_free_limit && - mem->phys_page < max_valid_low_ppnum) { - mem->pageq.next = NULL; - vm_page_release(mem); - } else { + if ((mem->lopage == TRUE || vm_lopage_refill == TRUE) && + vm_lopage_free_count < vm_lopage_free_limit && + mem->phys_page < max_valid_low_ppnum) { + mem->pageq.next = NULL; + vm_page_release(mem); + } else { + /* + * IMPORTANT: we can't set the page "free" here + * because that would make the page eligible for + * a physically-contiguous allocation (see + * vm_page_find_contiguous()) right away (we don't + * hold the vm_page_queue_free lock). That would + * cause trouble because the page is not actually + * in the free queue yet... + */ + mem->pageq.next = (queue_entry_t)local_freeq; + local_freeq = mem; + pg_count++; - /* - * IMPORTANT: we can't set the page "free" here - * because that would make the page eligible for - * a physically-contiguous allocation (see - * vm_page_find_contiguous()) right away (we don't - * hold the vm_page_queue_free lock). That would - * cause trouble because the page is not actually - * in the free queue yet... 
- */ - color = mem->phys_page & vm_color_mask; - if (queue_empty(&free_list[color])) { - inuse[color] = inuse_list_head; - inuse_list_head = color; + pmap_clear_noencrypt(mem->phys_page); } - queue_enter_first(&free_list[color], - mem, - vm_page_t, - pageq); - pg_count++; - - pmap_clear_noencrypt(mem->phys_page); + } else { + assert(mem->phys_page == vm_page_fictitious_addr || + mem->phys_page == vm_page_guard_addr); + vm_page_release_fictitious(mem); } - } else { - assert(mem->phys_page == vm_page_fictitious_addr || - mem->phys_page == vm_page_guard_addr); - vm_page_release_fictitious(mem); + mem = nxt; } - mem = nxt; - } - if (pg_count) { - unsigned int avail_free_count; - unsigned int need_wakeup = 0; - unsigned int need_priv_wakeup = 0; + freeq = mem; + + if ( (mem = local_freeq) ) { + unsigned int avail_free_count; + unsigned int need_wakeup = 0; + unsigned int need_priv_wakeup = 0; - lck_mtx_lock_spin(&vm_page_queue_free_lock); + lck_mtx_lock_spin(&vm_page_queue_free_lock); - color = inuse_list_head; - - while( color != -1 ) { - vm_page_t first, last; - vm_page_t first_free; + while (mem) { + int color; + + nxt = (vm_page_t)(mem->pageq.next); - /* - * Now that we hold the vm_page_queue_free lock, - * it's safe to mark all pages in our local queue - * as "free"... - */ - queue_iterate(&free_list[color], - mem, - vm_page_t, - pageq) { assert(!mem->free); assert(mem->busy); mem->free = TRUE; - } - /* - * ... and insert our local queue at the head of - * the global free queue. - */ - first = (vm_page_t) queue_first(&free_list[color]); - last = (vm_page_t) queue_last(&free_list[color]); - first_free = (vm_page_t) queue_first(&vm_page_queue_free[color]); - if (queue_empty(&vm_page_queue_free[color])) { - queue_last(&vm_page_queue_free[color]) = - (queue_entry_t) last; - } else { - queue_prev(&first_free->pageq) = - (queue_entry_t) last; + color = mem->phys_page & vm_color_mask; + queue_enter_first(&vm_page_queue_free[color], + mem, + vm_page_t, + pageq); + mem = nxt; } - queue_first(&vm_page_queue_free[color]) = - (queue_entry_t) first; - queue_prev(&first->pageq) = - (queue_entry_t) &vm_page_queue_free[color]; - queue_next(&last->pageq) = - (queue_entry_t) first_free; - - /* next color */ - color = inuse[color]; - } - - vm_page_free_count += pg_count; - avail_free_count = vm_page_free_count; - - if (vm_page_free_wanted_privileged > 0 && - avail_free_count > 0) { - if (avail_free_count < vm_page_free_wanted_privileged) { - need_priv_wakeup = avail_free_count; - vm_page_free_wanted_privileged -= - avail_free_count; - avail_free_count = 0; - } else { - need_priv_wakeup = vm_page_free_wanted_privileged; - vm_page_free_wanted_privileged = 0; - avail_free_count -= - vm_page_free_wanted_privileged; + vm_page_free_count += pg_count; + avail_free_count = vm_page_free_count; + + if (vm_page_free_wanted_privileged > 0 && avail_free_count > 0) { + + if (avail_free_count < vm_page_free_wanted_privileged) { + need_priv_wakeup = avail_free_count; + vm_page_free_wanted_privileged -= avail_free_count; + avail_free_count = 0; + } else { + need_priv_wakeup = vm_page_free_wanted_privileged; + vm_page_free_wanted_privileged = 0; + avail_free_count -= vm_page_free_wanted_privileged; + } } - } + if (vm_page_free_wanted > 0 && avail_free_count > vm_page_free_reserved) { + unsigned int available_pages; - if (vm_page_free_wanted > 0 && - avail_free_count > vm_page_free_reserved) { - unsigned int available_pages; + available_pages = avail_free_count - vm_page_free_reserved; - available_pages = (avail_free_count - - 
vm_page_free_reserved); + if (available_pages >= vm_page_free_wanted) { + need_wakeup = vm_page_free_wanted; + vm_page_free_wanted = 0; + } else { + need_wakeup = available_pages; + vm_page_free_wanted -= available_pages; + } + } + lck_mtx_unlock(&vm_page_queue_free_lock); - if (available_pages >= vm_page_free_wanted) { - need_wakeup = vm_page_free_wanted; - vm_page_free_wanted = 0; - } else { - need_wakeup = available_pages; - vm_page_free_wanted -= available_pages; + if (need_priv_wakeup != 0) { + /* + * There shouldn't be that many VM-privileged threads, + * so let's wake them all up, even if we don't quite + * have enough pages to satisfy them all. + */ + thread_wakeup((event_t)&vm_page_free_wanted_privileged); + } + if (need_wakeup != 0 && vm_page_free_wanted == 0) { + /* + * We don't expect to have any more waiters + * after this, so let's wake them all up at + * once. + */ + thread_wakeup((event_t) &vm_page_free_count); + } else for (; need_wakeup != 0; need_wakeup--) { + /* + * Wake up one waiter per page we just released. + */ + thread_wakeup_one((event_t) &vm_page_free_count); } - } - lck_mtx_unlock(&vm_page_queue_free_lock); - if (need_priv_wakeup != 0) { - /* - * There shouldn't be that many VM-privileged threads, - * so let's wake them all up, even if we don't quite - * have enough pages to satisfy them all. - */ - thread_wakeup((event_t)&vm_page_free_wanted_privileged); + VM_CHECK_MEMORYSTATUS; } - if (need_wakeup != 0 && vm_page_free_wanted == 0) { - /* - * We don't expect to have any more waiters - * after this, so let's wake them all up at - * once. - */ - thread_wakeup((event_t) &vm_page_free_count); - } else for (; need_wakeup != 0; need_wakeup--) { - /* - * Wake up one waiter per page we just released. - */ - thread_wakeup_one((event_t) &vm_page_free_count); - } - - VM_CHECK_MEMORYSTATUS; } } @@ -2543,6 +2549,11 @@ vm_page_wire( lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); #endif if ( !VM_PAGE_WIRED(mem)) { + + if (mem->pageout_queue) { + mem->pageout = FALSE; + vm_pageout_throttle_up(mem); + } VM_PAGE_QUEUES_REMOVE(mem); if (mem->object) { @@ -2579,10 +2590,6 @@ vm_page_wire( if (mem->gobbled) vm_page_gobble_count--; mem->gobbled = FALSE; - if (mem->zero_fill == TRUE) { - mem->zero_fill = FALSE; - VM_ZF_COUNT_DECR(); - } VM_CHECK_MEMORYSTATUS; @@ -2728,7 +2735,15 @@ vm_page_deactivate_internal( vm_page_gobble_count--; m->gobbled = FALSE; } - if (m->private || m->fictitious || (VM_PAGE_WIRED(m))) + /* + * if this page is currently on the pageout queue, we can't do the + * VM_PAGE_QUEUES_REMOVE (which doesn't handle the pageout queue case) + * and we can't remove it manually since we would need the object lock + * (which is not required here) to decrement the activity_in_progress + * reference which is held on the object while the page is in the pageout queue... + * just let the normal laundry processing proceed + */ + if (m->pageout_queue || m->private || m->fictitious || (VM_PAGE_WIRED(m))) return; if (!m->absent && clear_hw_reference == TRUE) @@ -2740,9 +2755,6 @@ vm_page_deactivate_internal( if (!m->inactive) { VM_PAGE_QUEUES_REMOVE(m); - assert(!m->laundry); - assert(m->pageq.next == NULL && m->pageq.prev == NULL); - if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && m->dirty && m->object->internal && (m->object->purgable == VM_PURGABLE_DENY || @@ -2764,6 +2776,54 @@ vm_page_deactivate_internal( } } +/* + * vm_page_enqueue_cleaned + * + * Put the page on the cleaned queue, mark it cleaned, etc. 
+ * Being on the cleaned queue (and having m->clean_queue set) + * does ** NOT ** guarantee that the page is clean! + * + * Call with the queues lock held. + */ + +void vm_page_enqueue_cleaned(vm_page_t m) +{ + assert(m->phys_page != vm_page_guard_addr); +#if DEBUG + lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); +#endif + assert( !(m->absent && !m->unusual)); + + if (m->gobbled) { + assert( !VM_PAGE_WIRED(m)); + if (!m->private && !m->fictitious) + vm_page_wire_count--; + vm_page_gobble_count--; + m->gobbled = FALSE; + } + /* + * if this page is currently on the pageout queue, we can't do the + * VM_PAGE_QUEUES_REMOVE (which doesn't handle the pageout queue case) + * and we can't remove it manually since we would need the object lock + * (which is not required here) to decrement the activity_in_progress + * reference which is held on the object while the page is in the pageout queue... + * just let the normal laundry processing proceed + */ + if (m->clean_queue || m->pageout_queue || m->private || m->fictitious) + return; + + VM_PAGE_QUEUES_REMOVE(m); + + queue_enter(&vm_page_queue_cleaned, m, vm_page_t, pageq); + m->clean_queue = TRUE; + vm_page_cleaned_count++; + + m->inactive = TRUE; + vm_page_inactive_count++; + + vm_pageout_enqueued_cleaned++; +} + /* * vm_page_activate: * @@ -2793,7 +2853,15 @@ vm_page_activate( vm_page_gobble_count--; m->gobbled = FALSE; } - if (m->private || m->fictitious) + /* + * if this page is currently on the pageout queue, we can't do the + * VM_PAGE_QUEUES_REMOVE (which doesn't handle the pageout queue case) + * and we can't remove it manually since we would need the object lock + * (which is not required here) to decrement the activity_in_progress + * reference which is held on the object while the page is in the pageout queue... + * just let the normal laundry processing proceed + */ + if (m->pageout_queue || m->private || m->fictitious) return; #if DEBUG @@ -2805,12 +2873,11 @@ vm_page_activate( DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL); DTRACE_VM2(pgfrec, int, 1, (uint64_t *), NULL); } - + VM_PAGE_QUEUES_REMOVE(m); if ( !VM_PAGE_WIRED(m)) { - assert(!m->laundry); - assert(m->pageq.next == NULL && m->pageq.prev == NULL); + if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) && m->dirty && m->object->internal && (m->object->purgable == VM_PURGABLE_DENY || @@ -2853,7 +2920,15 @@ vm_page_speculate( #endif assert( !(m->absent && !m->unusual)); - if (m->private || m->fictitious) + /* + * if this page is currently on the pageout queue, we can't do the + * VM_PAGE_QUEUES_REMOVE (which doesn't handle the pageout queue case) + * and we can't remove it manually since we would need the object lock + * (which is not required here) to decrement the activity_in_progress + * reference which is held on the object while the page is in the pageout queue... 
+ * just let the normal laundry processing proceed + */ + if (m->pageout_queue || m->private || m->fictitious) return; VM_PAGE_QUEUES_REMOVE(m); @@ -2974,19 +3049,21 @@ vm_page_lru( #if DEBUG lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); #endif - if (m->active || m->reference) - return; - - if (m->private || (VM_PAGE_WIRED(m))) + /* + * if this page is currently on the pageout queue, we can't do the + * VM_PAGE_QUEUES_REMOVE (which doesn't handle the pageout queue case) + * and we can't remove it manually since we would need the object lock + * (which is not required here) to decrement the activity_in_progress + * reference which is held on the object while the page is in the pageout queue... + * just let the normal laundry processing proceed + */ + if (m->pageout_queue || m->private || (VM_PAGE_WIRED(m))) return; m->no_cache = FALSE; VM_PAGE_QUEUES_REMOVE(m); - assert(!m->laundry); - assert(m->pageq.next == NULL && m->pageq.prev == NULL); - VM_PAGE_ENQUEUE_INACTIVE(m, FALSE); } @@ -3160,7 +3237,14 @@ vm_page_part_zero_fill( { vm_page_t tmp; +#if 0 + /* + * we don't hold the page queue lock + * so this check isn't safe to make + */ VM_PAGE_CHECK(m); +#endif + #ifdef PMAP_ZERO_PART_PAGE_IMPLEMENTED pmap_zero_part_page(m->phys_page, m_pa, len); #else @@ -3198,8 +3282,13 @@ vm_page_zero_fill( XPR(XPR_VM_PAGE, "vm_page_zero_fill, object 0x%X offset 0x%X page 0x%X\n", m->object, m->offset, m, 0,0); - +#if 0 + /* + * we don't hold the page queue lock + * so this check isn't safe to make + */ VM_PAGE_CHECK(m); +#endif // dbgTrace(0xAEAEAEAE, m->phys_page, 0); /* (BRINGUP) */ pmap_zero_page(m->phys_page); @@ -3219,9 +3308,14 @@ vm_page_part_copy( vm_offset_t dst_pa, vm_size_t len) { +#if 0 + /* + * we don't hold the page queue lock + * so this check isn't safe to make + */ VM_PAGE_CHECK(src_m); VM_PAGE_CHECK(dst_m); - +#endif pmap_copy_part_page(src_m->phys_page, src_pa, dst_m->phys_page, dst_pa, len); } @@ -3249,9 +3343,15 @@ vm_page_copy( src_m->object, src_m->offset, dest_m->object, dest_m->offset, 0); - +#if 0 + /* + * we don't hold the page queue lock + * so this check isn't safe to make + */ VM_PAGE_CHECK(src_m); VM_PAGE_CHECK(dest_m); +#endif + vm_object_lock_assert_held(src_m->object); /* * ENCRYPTED SWAP: @@ -3281,7 +3381,7 @@ vm_page_copy( src_m->busy = TRUE; (void) vm_page_slide(src_m, 0); assert(src_m->busy); - if(!was_busy) { + if (!was_busy) { PAGE_WAKEUP_DONE(src_m); } } @@ -3345,14 +3445,10 @@ _vm_page_print( (p->unusual ? "" : "!"), (p->encrypted ? "" : "!"), (p->encrypted_cleaning ? "" : "!")); - printf(" %slist_req_pending, %sdump_cleaning, %scs_validated, %scs_tainted, %sno_cache\n", - (p->list_req_pending ? "" : "!"), - (p->dump_cleaning ? "" : "!"), + printf(" %scs_validated, %scs_tainted, %sno_cache\n", (p->cs_validated ? "" : "!"), (p->cs_tainted ? "" : "!"), (p->no_cache ? "" : "!")); - printf(" %szero_fill\n", - (p->zero_fill ? 
"" : "!")); printf("phys_page=0x%x\n", p->phys_page); } @@ -3496,6 +3592,9 @@ vm_page_queues_assert( vm_page_t mem, int val) { +#if DEBUG + lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); +#endif if (mem->free + mem->active + mem->inactive + mem->speculative + mem->throttled + mem->pageout_queue > (val)) { _vm_page_print(mem); @@ -3506,6 +3605,7 @@ vm_page_queues_assert( assert(!mem->inactive); assert(!mem->speculative); assert(!mem->throttled); + assert(!mem->pageout_queue); } } #endif /* MACH_ASSERT */ @@ -3665,8 +3765,7 @@ retry: } else if (VM_PAGE_WIRED(m) || m->gobbled || m->encrypted || m->encrypted_cleaning || m->cs_validated || m->cs_tainted || m->error || m->absent || m->pageout_queue || m->laundry || m->wanted || m->precious || - m->cleaning || m->overwriting || m->restart || m->unusual || m->list_req_pending || - m->pageout) { + m->cleaning || m->overwriting || m->restart || m->unusual || m->pageout) { /* * page is in a transient state * or a state we don't want to deal @@ -3922,7 +4021,7 @@ did_consider: (VM_PAGE_WIRED(m1) || m1->gobbled || m1->encrypted || m1->encrypted_cleaning || m1->cs_validated || m1->cs_tainted || m1->error || m1->absent || m1->pageout_queue || m1->laundry || m1->wanted || m1->precious || - m1->cleaning || m1->overwriting || m1->restart || m1->unusual || m1->list_req_pending || m1->busy)) { + m1->cleaning || m1->overwriting || m1->restart || m1->unusual || m1->busy)) { if (locked_object) { vm_object_unlock(locked_object); @@ -3958,8 +4057,9 @@ did_consider: if (refmod & VM_MEM_REFERENCED) m2->reference = TRUE; - if (refmod & VM_MEM_MODIFIED) - m2->dirty = TRUE; + if (refmod & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(m2, TRUE); + } offset = m1->offset; /* @@ -3981,7 +4081,7 @@ did_consider: /* * now put the substitute page on the object */ - vm_page_insert_internal(m2, locked_object, offset, TRUE, TRUE); + vm_page_insert_internal(m2, locked_object, offset, TRUE, TRUE, FALSE); if (m2->reference) vm_page_activate(m2); @@ -4143,9 +4243,9 @@ cpm_allocate( * determine need for wakeups */ if ((vm_page_free_count < vm_page_free_min) || - ((vm_page_free_count < vm_page_free_target) && - ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min))) - thread_wakeup((event_t) &vm_page_free_wanted); + ((vm_page_free_count < vm_page_free_target) && + ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min))) + thread_wakeup((event_t) &vm_page_free_wanted); VM_CHECK_MEMORYSTATUS; @@ -4190,7 +4290,6 @@ vm_page_do_delayed_work( int j; vm_page_t m; vm_page_t local_free_q = VM_PAGE_NULL; - boolean_t dropped_obj_lock = FALSE; /* * pageout_scan takes the vm_page_lock_queues first @@ -4218,28 +4317,11 @@ vm_page_do_delayed_work( mutex_pause(j); vm_page_lockspin_queues(); } - dropped_obj_lock = TRUE; } for (j = 0; j < dw_count; j++, dwp++) { m = dwp->dw_m; - if (dwp->dw_mask & DW_set_list_req_pending) { - m->list_req_pending = TRUE; - - if (dropped_obj_lock == TRUE) { - /* - * need to make sure anyone that might have - * blocked on busy == TRUE when we dropped - * the object lock gets a chance to re-evaluate - * its state since we have several places - * where we avoid potential deadlocks with - * the fileysystem by stealing pages with - * list_req_pending == TRUE and busy == TRUE - */ - dwp->dw_mask |= DW_PAGE_WAKEUP; - } - } if (dwp->dw_mask & DW_vm_pageout_throttle_up) vm_pageout_throttle_up(m); @@ -4272,25 +4354,51 @@ vm_page_do_delayed_work( } else if (dwp->dw_mask & DW_vm_page_speculate) vm_page_speculate(m, TRUE); + else if 
(dwp->dw_mask & DW_enqueue_cleaned) { + /* + * if we didn't hold the object lock and did this, + * we might disconnect the page, then someone might + * soft fault it back in, then we would put it on the + * cleaned queue, and so we would have a referenced (maybe even dirty) + * page on that queue, which we don't want + */ + int refmod_state = pmap_disconnect(m->phys_page); + + if ((refmod_state & VM_MEM_REFERENCED)) { + /* + * this page has been touched since it got cleaned; let's activate it + * if it hasn't already been + */ + vm_pageout_enqueued_cleaned++; + vm_pageout_cleaned_reactivated++; + vm_pageout_cleaned_commit_reactivated++; + + if (m->active == FALSE) + vm_page_activate(m); + } else { + m->reference = FALSE; + vm_page_enqueue_cleaned(m); + } + } else if (dwp->dw_mask & DW_vm_page_lru) vm_page_lru(m); - else if (dwp->dw_mask & DW_VM_PAGE_QUEUES_REMOVE) - VM_PAGE_QUEUES_REMOVE(m); - + else if (dwp->dw_mask & DW_VM_PAGE_QUEUES_REMOVE) { + if ( !m->pageout_queue) + VM_PAGE_QUEUES_REMOVE(m); + } if (dwp->dw_mask & DW_set_reference) m->reference = TRUE; else if (dwp->dw_mask & DW_clear_reference) m->reference = FALSE; if (dwp->dw_mask & DW_move_page) { - VM_PAGE_QUEUES_REMOVE(m); + if ( !m->pageout_queue) { + VM_PAGE_QUEUES_REMOVE(m); - assert(!m->laundry); - assert(m->object != kernel_object); - assert(m->pageq.next == NULL && - m->pageq.prev == NULL); + assert(m->object != kernel_object); - VM_PAGE_ENQUEUE_INACTIVE(m, FALSE); + VM_PAGE_ENQUEUE_INACTIVE(m, FALSE); + } } if (dwp->dw_mask & DW_clear_busy) m->busy = FALSE; @@ -4308,38 +4416,6 @@ vm_page_do_delayed_work( } - - - -void vm_check_memorystatus() -{ -#if CONFIG_EMBEDDED - static boolean_t in_critical = FALSE; - static unsigned int last_memorystatus = 0; - unsigned int pages_avail; - - if (!kern_memorystatus_delta) { - return; - } - - pages_avail = (vm_page_active_count + - vm_page_inactive_count + - vm_page_speculative_count + - vm_page_free_count + - (VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) ? 0 : vm_page_purgeable_count)); - if ( (!in_critical && (pages_avail < kern_memorystatus_delta)) || - (pages_avail >= (last_memorystatus + kern_memorystatus_delta)) || - (last_memorystatus >= (pages_avail + kern_memorystatus_delta)) ) { - kern_memorystatus_level = pages_avail * 100 / atop_64(max_mem); - last_memorystatus = pages_avail; - - thread_wakeup((event_t)&kern_memorystatus_wakeup); - - in_critical = (pages_avail < kern_memorystatus_delta) ? TRUE : FALSE; - } -#endif -} - kern_return_t vm_page_alloc_list( int page_count, @@ -4409,7 +4485,6 @@ extern boolean_t (* volatile consider_buffer_cache_collect)(int); static int hibernate_drain_pageout_queue(struct vm_pageout_queue *); static int hibernate_flush_dirty_pages(void); static int hibernate_flush_queue(queue_head_t *, int); -static void hibernate_dirty_page(vm_page_t); void hibernate_flush_wait(void); void hibernate_mark_in_progress(void); @@ -4477,46 +4552,6 @@ hibernate_drain_pageout_queue(struct vm_pageout_queue *q) return (0); } -static void -hibernate_dirty_page(vm_page_t m) -{ - vm_object_t object = m->object; - struct vm_pageout_queue *q; - -#if DEBUG - lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); -#endif - vm_object_lock_assert_exclusive(object); - - /* - * protect the object from collapse - - * locking in the object's paging_offset. 
- */ - vm_object_paging_begin(object); - - m->list_req_pending = TRUE; - m->cleaning = TRUE; - m->busy = TRUE; - - if (object->internal == TRUE) - q = &vm_pageout_queue_internal; - else - q = &vm_pageout_queue_external; - - /* - * pgo_laundry count is tied to the laundry bit - */ - m->laundry = TRUE; - q->pgo_laundry++; - - m->pageout_queue = TRUE; - queue_enter(&q->pgo_pending, m, vm_page_t, pageq); - - if (q->pgo_idle == TRUE) { - q->pgo_idle = FALSE; - thread_wakeup((event_t) &q->pgo_pending); - } -} static int hibernate_flush_queue(queue_head_t *q, int qcount) @@ -4532,6 +4567,7 @@ hibernate_flush_queue(queue_head_t *q, int qcount) struct vm_pageout_queue *eq; struct vm_pageout_queue *tq; + hibernate_cleaning_in_progress = TRUE; KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 4) | DBG_FUNC_START, q, qcount, 0, 0, 0); @@ -4595,7 +4631,7 @@ hibernate_flush_queue(queue_head_t *q, int qcount) vm_pageout_scan_wants_object = VM_OBJECT_NULL; } } - if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->busy || m->absent || m->error) { + if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->laundry || m->busy || m->absent || m->error) { /* * page is not to be cleaned * put it back on the head of its queue @@ -4622,8 +4658,9 @@ hibernate_flush_queue(queue_head_t *q, int qcount) if ( !m->dirty && m->pmapped) { refmod_state = pmap_get_refmod(m->phys_page); - if ((refmod_state & VM_MEM_MODIFIED)) - m->dirty = TRUE; + if ((refmod_state & VM_MEM_MODIFIED)) { + SET_PAGE_DIRTY(m, FALSE); + } } else refmod_state = 0; @@ -4661,9 +4698,9 @@ hibernate_flush_queue(queue_head_t *q, int qcount) assert_wait_timeout((event_t) &tq->pgo_laundry, THREAD_INTERRUPTIBLE, 1000, 1000*NSEC_PER_USEC); - vm_page_unlock_queues(); + vm_page_unlock_queues(); - wait_result = thread_block(THREAD_CONTINUE_NULL); + wait_result = thread_block(THREAD_CONTINUE_NULL); vm_page_lock_queues(); @@ -4674,9 +4711,9 @@ hibernate_flush_queue(queue_head_t *q, int qcount) break; if (--wait_count == 0) { - hibernate_stats.hibernate_throttle_timeout++; - retval = 1; - } + hibernate_stats.hibernate_throttle_timeout++; + retval = 1; + } } if (retval) break; @@ -4685,9 +4722,16 @@ hibernate_flush_queue(queue_head_t *q, int qcount) continue; } + /* + * we've already factored out pages in the laundry which + * means this page can't be on the pageout queue so it's + * safe to do the VM_PAGE_QUEUES_REMOVE + */ + assert(!m->pageout_queue); + VM_PAGE_QUEUES_REMOVE(m); - hibernate_dirty_page(m); + vm_pageout_cluster(m, FALSE); hibernate_stats.hibernate_found_dirty++; @@ -4708,12 +4752,14 @@ next_pg: vm_object_unlock(l_object); l_object = NULL; } - vm_pageout_scan_wants_object = VM_OBJECT_NULL; + vm_pageout_scan_wants_object = VM_OBJECT_NULL; vm_page_unlock_queues(); KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 4) | DBG_FUNC_END, hibernate_stats.hibernate_found_dirty, retval, 0, 0, 0); + hibernate_cleaning_in_progress = FALSE; + return (retval); } @@ -4759,9 +4805,11 @@ hibernate_flush_dirty_pages() } if (hibernate_flush_queue(&vm_page_queue_active, vm_page_active_count)) return (1); - if (hibernate_flush_queue(&vm_page_queue_inactive, vm_page_inactive_count - vm_zf_queue_count)) + if (hibernate_flush_queue(&vm_page_queue_inactive, vm_page_inactive_count - vm_page_anonymous_count - vm_page_cleaned_count)) return (1); - if (hibernate_flush_queue(&vm_page_queue_zf, vm_zf_queue_count)) + if (hibernate_flush_queue(&vm_page_queue_anonymous, vm_page_anonymous_count)) + return (1); + if 
(hibernate_flush_queue(&vm_page_queue_cleaned, vm_page_cleaned_count)) return (1); if (hibernate_drain_pageout_queue(&vm_pageout_queue_internal)) @@ -4935,7 +4983,7 @@ hibernate_consider_discard(vm_page_t m) hibernate_stats.cd_found_cleaning++; break; } - if (m->laundry || m->list_req_pending) { + if (m->laundry) { hibernate_stats.cd_found_laundry++; break; } @@ -4945,8 +4993,9 @@ hibernate_consider_discard(vm_page_t m) if (refmod_state & VM_MEM_REFERENCED) m->reference = TRUE; - if (refmod_state & VM_MEM_MODIFIED) - m->dirty = TRUE; + if (refmod_state & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(m, FALSE); + } } /* @@ -4977,6 +5026,15 @@ hibernate_discard_page(vm_page_t m) */ return; +#if DEBUG + vm_object_t object = m->object; + if (!vm_object_lock_try(m->object)) + panic("hibernate_discard_page(%p) !vm_object_lock_try", m); +#else + /* No need to lock page queue for token delete, hibernate_vm_unlock() + makes sure these locks are uncontended before sleep */ +#endif /* !DEBUG */ + if (m->pmapped == TRUE) { __unused int refmod_state = pmap_disconnect(m->phys_page); @@ -4995,13 +5053,15 @@ hibernate_discard_page(vm_page_t m) assert((m->object->objq.next != NULL) && (m->object->objq.prev != NULL)); purgeable_q_t old_queue = vm_purgeable_object_remove(m->object); assert(old_queue); - /* No need to lock page queue for token delete, hibernate_vm_unlock() - makes sure these locks are uncontended before sleep */ vm_purgeable_token_delete_first(old_queue); m->object->purgable = VM_PURGABLE_EMPTY; } vm_page_free(m); + +#if DEBUG + vm_object_unlock(object); +#endif /* DEBUG */ } /* @@ -5020,10 +5080,11 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, vm_page_t m; uint32_t pages = page_list->page_count; uint32_t count_zf = 0, count_throttled = 0; - uint32_t count_inactive = 0, count_active = 0, count_speculative = 0; + uint32_t count_inactive = 0, count_active = 0, count_speculative = 0, count_cleaned = 0; uint32_t count_wire = pages; uint32_t count_discard_active = 0; uint32_t count_discard_inactive = 0; + uint32_t count_discard_cleaned = 0; uint32_t count_discard_purgeable = 0; uint32_t count_discard_speculative = 0; uint32_t i; @@ -5034,6 +5095,18 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, HIBLOG("hibernate_page_list_setall start %p, %p\n", page_list, page_list_wired); +#if DEBUG + vm_page_lock_queues(); + if (vm_page_local_q) { + for (i = 0; i < vm_page_local_q_count; i++) { + struct vpl *lq; + lq = &vm_page_local_q[i].vpl_un.vpl; + VPL_LOCK(&lq->vpl_lock); + } + } +#endif /* DEBUG */ + + KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 8) | DBG_FUNC_START, count_wire, 0, 0, 0, 0); clock_get_uptime(&start); @@ -5123,7 +5196,7 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); } - queue_iterate( &vm_page_queue_zf, + queue_iterate( &vm_page_queue_anonymous, m, vm_page_t, pageq ) @@ -5163,6 +5236,26 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); } + queue_iterate( &vm_page_queue_cleaned, + m, + vm_page_t, + pageq ) + { + if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) + && hibernate_consider_discard(m)) + { + hibernate_page_bitset(page_list, TRUE, m->phys_page); + if (m->dirty) + count_discard_purgeable++; + else + count_discard_cleaned++; + } + else + count_cleaned++; + count_wire--; + hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); + } + for( i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++ ) 
{ queue_iterate(&vm_page_queue_speculative[i].age_q, @@ -5219,17 +5312,28 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, hibernate_page_list_setall_machine(page_list, page_list_wired, &pages); hibernate_stats.cd_count_wire = count_wire; - hibernate_stats.cd_discarded = count_discard_active + count_discard_inactive + count_discard_purgeable + count_discard_speculative; + hibernate_stats.cd_discarded = count_discard_active + count_discard_inactive + count_discard_purgeable + count_discard_speculative + count_discard_cleaned; clock_get_uptime(&end); absolutetime_to_nanoseconds(end - start, &nsec); HIBLOG("hibernate_page_list_setall time: %qd ms\n", nsec / 1000000ULL); - HIBLOG("pages %d, wire %d, act %d, inact %d, spec %d, zf %d, throt %d, could discard act %d inact %d purgeable %d spec %d\n", - pages, count_wire, count_active, count_inactive, count_speculative, count_zf, count_throttled, - count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative); + HIBLOG("pages %d, wire %d, act %d, inact %d, cleaned %d spec %d, zf %d, throt %d, could discard act %d inact %d purgeable %d spec %d cleaned %d\n", + pages, count_wire, count_active, count_inactive, count_cleaned, count_speculative, count_zf, count_throttled, + count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative, count_discard_cleaned); - *pagesOut = pages - count_discard_active - count_discard_inactive - count_discard_purgeable - count_discard_speculative; + *pagesOut = pages - count_discard_active - count_discard_inactive - count_discard_purgeable - count_discard_speculative - count_discard_cleaned; + +#if DEBUG + if (vm_page_local_q) { + for (i = 0; i < vm_page_local_q_count; i++) { + struct vpl *lq; + lq = &vm_page_local_q[i].vpl_un.vpl; + VPL_UNLOCK(&lq->vpl_lock); + } + } + vm_page_unlock_queues(); +#endif /* DEBUG */ KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 8) | DBG_FUNC_END, count_wire, *pagesOut, 0, 0, 0); } @@ -5244,12 +5348,24 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list) uint32_t count_discard_active = 0; uint32_t count_discard_inactive = 0; uint32_t count_discard_purgeable = 0; + uint32_t count_discard_cleaned = 0; uint32_t count_discard_speculative = 0; +#if DEBUG + vm_page_lock_queues(); + if (vm_page_local_q) { + for (i = 0; i < vm_page_local_q_count; i++) { + struct vpl *lq; + lq = &vm_page_local_q[i].vpl_un.vpl; + VPL_LOCK(&lq->vpl_lock); + } + } +#endif /* DEBUG */ + clock_get_uptime(&start); - m = (vm_page_t) queue_first(&vm_page_queue_zf); - while (m && !queue_end(&vm_page_queue_zf, (queue_entry_t)m)) + m = (vm_page_t) queue_first(&vm_page_queue_anonymous); + while (m && !queue_end(&vm_page_queue_anonymous, (queue_entry_t)m)) { next = (vm_page_t) m->pageq.next; if (hibernate_page_bittst(page_list, m->phys_page)) @@ -5308,11 +5424,37 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list) m = next; } + m = (vm_page_t) queue_first(&vm_page_queue_cleaned); + while (m && !queue_end(&vm_page_queue_cleaned, (queue_entry_t)m)) + { + next = (vm_page_t) m->pageq.next; + if (hibernate_page_bittst(page_list, m->phys_page)) + { + if (m->dirty) + count_discard_purgeable++; + else + count_discard_cleaned++; + hibernate_discard_page(m); + } + m = next; + } + +#if DEBUG + if (vm_page_local_q) { + for (i = 0; i < vm_page_local_q_count; i++) { + struct vpl *lq; + lq = &vm_page_local_q[i].vpl_un.vpl; + VPL_UNLOCK(&lq->vpl_lock); + } + } + vm_page_unlock_queues(); +#endif /* DEBUG */ + 
clock_get_uptime(&end); absolutetime_to_nanoseconds(end - start, &nsec); - HIBLOG("hibernate_page_list_discard time: %qd ms, discarded act %d inact %d purgeable %d spec %d\n", + HIBLOG("hibernate_page_list_discard time: %qd ms, discarded act %d inact %d purgeable %d spec %d cleaned %d\n", nsec / 1000000ULL, - count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative); + count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative, count_discard_cleaned); } #endif /* HIBERNATION */ @@ -5367,64 +5509,3 @@ vm_page_info( return vm_page_bucket_count; } #endif /* MACH_VM_DEBUG */ - -#include -#if MACH_KDB - -#include -#include -#define printf kdbprintf - -/* - * Routine: vm_page_print [exported] - */ -void -vm_page_print( - db_addr_t db_addr) -{ - vm_page_t p; - - p = (vm_page_t) (long) db_addr; - - iprintf("page 0x%x\n", p); - - db_indent += 2; - - iprintf("object=0x%x", p->object); - printf(", offset=0x%x", p->offset); - printf(", wire_count=%d", p->wire_count); - - iprintf("%slocal, %sinactive, %sactive, %sthrottled, %sgobbled, %slaundry, %sfree, %sref, %sencrypted\n", - (p->local ? "" : "!"), - (p->inactive ? "" : "!"), - (p->active ? "" : "!"), - (p->throttled ? "" : "!"), - (p->gobbled ? "" : "!"), - (p->laundry ? "" : "!"), - (p->free ? "" : "!"), - (p->reference ? "" : "!"), - (p->encrypted ? "" : "!")); - iprintf("%sbusy, %swanted, %stabled, %sfictitious, %sprivate, %sprecious\n", - (p->busy ? "" : "!"), - (p->wanted ? "" : "!"), - (p->tabled ? "" : "!"), - (p->fictitious ? "" : "!"), - (p->private ? "" : "!"), - (p->precious ? "" : "!")); - iprintf("%sabsent, %serror, %sdirty, %scleaning, %spageout, %sclustered\n", - (p->absent ? "" : "!"), - (p->error ? "" : "!"), - (p->dirty ? "" : "!"), - (p->cleaning ? "" : "!"), - (p->pageout ? "" : "!"), - (p->clustered ? "" : "!")); - iprintf("%soverwriting, %srestart, %sunusual\n", - (p->overwriting ? "" : "!"), - (p->restart ? "" : "!"), - (p->unusual ? 
"" : "!")); - - iprintf("phys_page=0x%x", p->phys_page); - - db_indent -= 2; -} -#endif /* MACH_KDB */ diff --git a/osfmk/vm/vm_shared_region.c b/osfmk/vm/vm_shared_region.c index a5931c998..80ded84be 100644 --- a/osfmk/vm/vm_shared_region.c +++ b/osfmk/vm/vm_shared_region.c @@ -647,7 +647,7 @@ vm_shared_region_create( } /* create a VM sub map and its pmap */ - sub_map = vm_map_create(pmap_create(0, is_64bit), + sub_map = vm_map_create(pmap_create(NULL, 0, is_64bit), 0, size, TRUE); if (sub_map == VM_MAP_NULL) { @@ -851,13 +851,13 @@ vm_shared_region_undo_mappings( unsigned int j = 0; vm_shared_region_t shared_region = NULL; boolean_t reset_shared_region_state = FALSE; - + shared_region = vm_shared_region_get(current_task()); if (shared_region == NULL) { - SHARED_REGION_TRACE_DEBUG(("Failed to undo mappings because of NULL shared region.\n")); + printf("Failed to undo mappings because of NULL shared region.\n"); return; } - + if (sr_map == NULL) { ipc_port_t sr_handle; @@ -968,7 +968,7 @@ vm_shared_region_map_file( mach_vm_offset_t sr_base_address; unsigned int i; mach_port_t map_port; - mach_vm_offset_t target_address; + vm_map_offset_t target_address; vm_object_t object; vm_object_size_t obj_size; boolean_t found_mapping_to_slide = FALSE; @@ -1384,7 +1384,7 @@ vm_shared_region_sliding_valid(uint32_t slide) { if ((shared_region_completed_slide == TRUE) && slide) { if (slide != slide_info.slide) { - SHARED_REGION_TRACE_DEBUG(("Only one shared region can be slid\n")); + printf("Only one shared region can be slid\n"); kr = KERN_FAILURE; } else if (slide == slide_info.slide) { /* @@ -1429,7 +1429,7 @@ vm_shared_region_slide_init( } if (slide_info_size > SANE_SLIDE_INFO_SIZE) { - SHARED_REGION_TRACE_DEBUG(("Slide_info_size too large: %lx\n", (uintptr_t)slide_info_size)); + printf("Slide_info_size too large: %lx\n", (uintptr_t)slide_info_size); kr = KERN_FAILURE; return kr; } @@ -1619,7 +1619,8 @@ vm_shared_region_slide(vm_offset_t vaddr, uint32_t pageIndex) * to the upper 32 bits. * The sliding failed... */ - printf("vm_shared_region_slide() carry over\n"); + printf("vm_shared_region_slide() carry over: i=%d j=%d b=0x%x slide=0x%x old=0x%x new=0x%x\n", + i, j, b, slide, old_value, *ptr_to_slide); return KERN_FAILURE; } } @@ -1643,6 +1644,17 @@ vm_named_entry_t commpage64_entry = NULL; vm_map_t commpage32_map = VM_MAP_NULL; vm_map_t commpage64_map = VM_MAP_NULL; +ipc_port_t commpage_text32_handle = IPC_PORT_NULL; +ipc_port_t commpage_text64_handle = IPC_PORT_NULL; +vm_named_entry_t commpage_text32_entry = NULL; +vm_named_entry_t commpage_text64_entry = NULL; +vm_map_t commpage_text32_map = VM_MAP_NULL; +vm_map_t commpage_text64_map = VM_MAP_NULL; + +user32_addr_t commpage_text32_location = (user32_addr_t) _COMM_PAGE32_TEXT_START; +user64_addr_t commpage_text64_location = (user64_addr_t) _COMM_PAGE64_TEXT_START; + +#if defined(__i386__) || defined(__x86_64__) /* * Create a memory entry, VM submap and pmap for one commpage. 
*/ @@ -1664,7 +1676,7 @@ _vm_commpage_init( if (kr != KERN_SUCCESS) { panic("_vm_commpage_init: could not allocate mem_entry"); } - new_map = vm_map_create(pmap_create(0, FALSE), 0, size, TRUE); + new_map = vm_map_create(pmap_create(NULL, 0, FALSE), 0, size, TRUE); if (new_map == VM_MAP_NULL) { panic("_vm_commpage_init: could not allocate VM map"); } @@ -1679,6 +1691,42 @@ _vm_commpage_init( ("commpage: _init(0x%llx) <- %p\n", (long long)size, *handlep)); } +#endif + + +/* + * Initialize the comm text pages at boot time + */ + extern u_int32_t random(void); + void +vm_commpage_text_init(void) +{ + SHARED_REGION_TRACE_DEBUG( + ("commpage text: ->init()\n")); +#if defined(__i386__) || defined(__x86_64__) + /* create the 32-bit comm text page */ + unsigned int offset = (random() % _PFZ32_SLIDE_RANGE) << PAGE_SHIFT; /* restricting to 32bMAX-2PAGE */ + _vm_commpage_init(&commpage_text32_handle, _COMM_PAGE_TEXT_AREA_LENGTH); + commpage_text32_entry = (vm_named_entry_t) commpage_text32_handle->ip_kobject; + commpage_text32_map = commpage_text32_entry->backing.map; + commpage_text32_location = (user32_addr_t) (_COMM_PAGE32_TEXT_START + offset); + /* XXX if (cpu_is_64bit_capable()) ? */ + /* create the 64-bit comm text page */ + offset = (random() % _PFZ64_SLIDE_RANGE) << PAGE_SHIFT; /* restricting sliding up to a 2MB range */ + _vm_commpage_init(&commpage_text64_handle, _COMM_PAGE_TEXT_AREA_LENGTH); + commpage_text64_entry = (vm_named_entry_t) commpage_text64_handle->ip_kobject; + commpage_text64_map = commpage_text64_entry->backing.map; + commpage_text64_location = (user64_addr_t) (_COMM_PAGE64_TEXT_START + offset); + + commpage_text_populate(); +#else +#error Unknown architecture. +#endif /* __i386__ || __x86_64__ */ + /* populate the routines in here */ + SHARED_REGION_TRACE_DEBUG( + ("commpage text: init() <-\n")); + +} /* * Initialize the comm pages at boot time. 
@@ -1689,6 +1737,7 @@ vm_commpage_init(void) SHARED_REGION_TRACE_DEBUG( ("commpage: -> init()\n")); +#if defined(__i386__) || defined(__x86_64__) /* create the 32-bit comm page */ _vm_commpage_init(&commpage32_handle, _COMM_PAGE32_AREA_LENGTH); commpage32_entry = (vm_named_entry_t) commpage32_handle->ip_kobject; @@ -1700,6 +1749,8 @@ vm_commpage_init(void) commpage64_entry = (vm_named_entry_t) commpage64_handle->ip_kobject; commpage64_map = commpage64_entry->backing.map; +#endif /* __i386__ || __x86_64__ */ + /* populate them according to this specific platform */ commpage_populate(); __commpage_setup = 1; @@ -1722,9 +1773,9 @@ vm_commpage_enter( vm_map_t map, task_t task) { - ipc_port_t commpage_handle; - vm_map_offset_t commpage_address, objc_address; - vm_map_size_t commpage_size, objc_size; + ipc_port_t commpage_handle, commpage_text_handle; + vm_map_offset_t commpage_address, objc_address, commpage_text_address; + vm_map_size_t commpage_size, objc_size, commpage_text_size; int vm_flags; kern_return_t kr; @@ -1732,6 +1783,7 @@ vm_commpage_enter( ("commpage: -> enter(%p,%p)\n", map, task)); + commpage_text_size = _COMM_PAGE_TEXT_AREA_LENGTH; /* the comm page is likely to be beyond the actual end of the VM map */ vm_flags = VM_FLAGS_FIXED | VM_FLAGS_BEYOND_MAX; @@ -1743,6 +1795,8 @@ vm_commpage_enter( commpage_size = _COMM_PAGE64_AREA_LENGTH; objc_size = _COMM_PAGE64_OBJC_SIZE; objc_address = _COMM_PAGE64_OBJC_BASE; + commpage_text_handle = commpage_text64_handle; + commpage_text_address = (vm_map_offset_t) commpage_text64_location; } else { commpage_handle = commpage32_handle; commpage_address = @@ -1750,6 +1804,8 @@ vm_commpage_enter( commpage_size = _COMM_PAGE32_AREA_LENGTH; objc_size = _COMM_PAGE32_OBJC_SIZE; objc_address = _COMM_PAGE32_OBJC_BASE; + commpage_text_handle = commpage_text32_handle; + commpage_text_address = (vm_map_offset_t) commpage_text32_location; } if ((commpage_address & (pmap_nesting_size_min - 1)) == 0 && @@ -1757,7 +1813,6 @@ vm_commpage_enter( /* the commpage is properly aligned or sized for pmap-nesting */ vm_flags |= VM_MAKE_TAG(VM_MEMORY_SHARED_PMAP); } - /* map the comm page in the task's address space */ assert(commpage_handle != IPC_PORT_NULL); kr = vm_map_enter_mem_object( @@ -1769,8 +1824,8 @@ vm_commpage_enter( commpage_handle, 0, FALSE, - VM_PROT_READ|VM_PROT_EXECUTE, - VM_PROT_READ|VM_PROT_EXECUTE, + VM_PROT_READ, + VM_PROT_READ, VM_INHERIT_SHARE); if (kr != KERN_SUCCESS) { SHARED_REGION_TRACE_ERROR( @@ -1780,6 +1835,28 @@ vm_commpage_enter( (long long)commpage_size, commpage_handle, kr)); } + /* map the comm text page in the task's address space */ + assert(commpage_text_handle != IPC_PORT_NULL); + kr = vm_map_enter_mem_object( + map, + &commpage_text_address, + commpage_text_size, + 0, + vm_flags, + commpage_text_handle, + 0, + FALSE, + VM_PROT_READ|VM_PROT_EXECUTE, + VM_PROT_READ|VM_PROT_EXECUTE, + VM_INHERIT_SHARE); + if (kr != KERN_SUCCESS) { + SHARED_REGION_TRACE_ERROR( + ("commpage text: enter(%p,0x%llx,0x%llx) " + "commpage text %p mapping failed 0x%x\n", + map, (long long)commpage_text_address, + (long long)commpage_text_size, commpage_text_handle, kr)); + } + /* * Since we're here, we also pre-allocate some virtual space for the * Objective-C run-time, if needed... 
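The commpage-text randomization added in the hunks above reduces to one computation: draw a uniformly random page index from a small slide range, shift it into a byte offset, and add it to the fixed base before the read/execute text page is mapped. The following minimal user-space sketch illustrates just that computation; the constants and the use of rand()/srand() are illustrative stand-ins for the kernel's _PFZ32_SLIDE_RANGE/_PFZ64_SLIDE_RANGE, _COMM_PAGE32_TEXT_START/_COMM_PAGE64_TEXT_START, and random().

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define DEMO_PAGE_SHIFT        12                  /* assumed 4 KiB pages */
#define DEMO_TEXT_BASE         0x7fffffe00000ULL   /* hypothetical text-page base */
#define DEMO_SLIDE_RANGE_PAGES 512                 /* hypothetical range: 2 MiB / 4 KiB */

/* page-aligned slide, mirroring the arithmetic in vm_commpage_text_init() above */
static uint64_t
slide_text_base(uint64_t base, unsigned range_pages)
{
	/* random page index within the range, shifted into a byte offset */
	uint64_t offset = ((uint64_t)(rand() % range_pages)) << DEMO_PAGE_SHIFT;
	return base + offset;
}

int
main(void)
{
	srand((unsigned)time(NULL));	/* stand-in entropy source */
	uint64_t slid = slide_text_base(DEMO_TEXT_BASE, DEMO_SLIDE_RANGE_PAGES);
	printf("text page at 0x%llx (slide 0x%llx)\n",
	    (unsigned long long)slid,
	    (unsigned long long)(slid - DEMO_TEXT_BASE));
	return 0;
}

Because the offset is always a whole number of pages, the slid address stays page-aligned, which is what lets vm_commpage_enter() above hand it directly to vm_map_enter_mem_object() as a VM_FLAGS_FIXED mapping address.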
diff --git a/osfmk/vm/vm_shared_region.h b/osfmk/vm/vm_shared_region.h index 51742f0b0..cec44658f 100644 --- a/osfmk/vm/vm_shared_region.h +++ b/osfmk/vm/vm_shared_region.h @@ -204,6 +204,7 @@ extern kern_return_t vm_shared_region_slide( vm_offset_t vaddr, uint32_t pageIndex); extern void vm_commpage_init(void); +extern void vm_commpage_text_init(void); extern kern_return_t vm_commpage_enter( struct _vm_map *map, struct task *task); diff --git a/osfmk/vm/vm_swapfile_pager.c b/osfmk/vm/vm_swapfile_pager.c index 4739455df..bb985060f 100644 --- a/osfmk/vm/vm_swapfile_pager.c +++ b/osfmk/vm/vm_swapfile_pager.c @@ -412,6 +412,7 @@ swapfile_pager_data_request( kernel_mapping, dst_pnum, VM_PROT_READ | VM_PROT_WRITE, + VM_PROT_NONE, 0, TRUE); diff --git a/osfmk/vm/vm_user.c b/osfmk/vm/vm_user.c index 8271d71b2..05b51b4b2 100644 --- a/osfmk/vm/vm_user.c +++ b/osfmk/vm/vm_user.c @@ -881,12 +881,17 @@ mach_vm_map( vm_prot_t max_protection, vm_inherit_t inheritance) { + kern_return_t kr; + vm_map_offset_t vmmaddr; + + vmmaddr = (vm_map_offset_t) *address; + /* filter out any kernel-only flags */ if (flags & ~VM_FLAGS_USER_MAP) return KERN_INVALID_ARGUMENT; - return vm_map_enter_mem_object(target_map, - address, + kr = vm_map_enter_mem_object(target_map, + &vmmaddr, initial_size, mask, flags, @@ -896,6 +901,9 @@ mach_vm_map( cur_protection, max_protection, inheritance); + + *address = vmmaddr; + return kr; } @@ -1887,6 +1895,9 @@ mach_make_memory_entry_64( } else if (access == MAP_MEM_COPYBACK) { SET_MAP_MEM(access, parent_entry->protection); wimg_mode = VM_WIMG_USE_DEFAULT; + } else if (access == MAP_MEM_INNERWBACK) { + SET_MAP_MEM(access, parent_entry->protection); + wimg_mode = VM_WIMG_INNERWBACK; } else if (access == MAP_MEM_WTHRU) { SET_MAP_MEM(access, parent_entry->protection); wimg_mode = VM_WIMG_WTHRU; @@ -1951,6 +1962,8 @@ mach_make_memory_entry_64( wimg_mode = VM_WIMG_IO; } else if (access == MAP_MEM_COPYBACK) { wimg_mode = VM_WIMG_USE_DEFAULT; + } else if (access == MAP_MEM_INNERWBACK) { + wimg_mode = VM_WIMG_INNERWBACK; } else if (access == MAP_MEM_WTHRU) { wimg_mode = VM_WIMG_WTHRU; } else if (access == MAP_MEM_WCOMB) { @@ -2156,6 +2169,10 @@ redo_lookup: */ protections &= next_entry->max_protection; } + if ((next_entry->wired_count) && + (map_entry->wired_count == 0)) { + break; + } if(((next_entry->max_protection) & protections) != protections) { break; @@ -2264,7 +2281,7 @@ redo_lookup: object, map_entry->offset, total_size, ((map_entry->is_shared - || target_map->mapped) + || target_map->mapped_in_other_pmaps) ? 
PMAP_NULL : target_map->pmap), map_entry->vme_start, @@ -2276,6 +2293,9 @@ redo_lookup: vm_object_lock(shadow_object); while (total_size) { + assert((next_entry->wired_count == 0) || + (map_entry->wired_count)); + if(next_entry->object.vm_object == object) { vm_object_reference_locked(shadow_object); next_entry->object.vm_object @@ -2327,6 +2347,8 @@ redo_lookup: wimg_mode = VM_WIMG_IO; } else if (access == MAP_MEM_COPYBACK) { wimg_mode = VM_WIMG_USE_DEFAULT; + } else if (access == MAP_MEM_INNERWBACK) { + wimg_mode = VM_WIMG_INNERWBACK; } else if (access == MAP_MEM_WTHRU) { wimg_mode = VM_WIMG_WTHRU; } else if (access == MAP_MEM_WCOMB) { @@ -2768,8 +2790,10 @@ mach_destroy_memory_entry( assert(ip_kotype(port) == IKOT_NAMED_ENTRY); #endif /* MACH_ASSERT */ named_entry = (vm_named_entry_t)port->ip_kobject; - lck_mtx_lock(&(named_entry)->Lock); + + named_entry_lock(named_entry); named_entry->ref_count -= 1; + if(named_entry->ref_count == 0) { if (named_entry->is_sub_map) { vm_map_deallocate(named_entry->backing.map); @@ -2778,12 +2802,13 @@ mach_destroy_memory_entry( vm_object_deallocate(named_entry->backing.object); } /* else JMM - need to drop reference on pager in that case */ - lck_mtx_unlock(&(named_entry)->Lock); + named_entry_unlock(named_entry); + named_entry_lock_destroy(named_entry); kfree((void *) port->ip_kobject, sizeof (struct vm_named_entry)); } else - lck_mtx_unlock(&(named_entry)->Lock); + named_entry_unlock(named_entry); } /* Allow manipulation of individual page state. This is actually part of */ diff --git a/osfmk/x86_64/boot_pt.c b/osfmk/x86_64/boot_pt.c new file mode 100644 index 000000000..392c3c1f7 --- /dev/null +++ b/osfmk/x86_64/boot_pt.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include + +/* + * These pagetables are used during early processor startup during + * the transition from protected mode to 64-bit mode and the jump + * to high kernel address space. + * + * They are required to be at the base of the kernel and specifically + * the base of the special __HIB section. + * + * These tables are statically-defined as physical-zero-based. + * Startup code in start.s rebases these according to the actual physical + * base address. 
+ */ + +/* + * NB: This must be located at the kernel's base address! + */ +#define PML4_PROT (INTEL_PTE_VALID | INTEL_PTE_WRITE) +pml4_entry_t BootPML4[PTE_PER_PAGE] + __attribute__((section("__HIB, __bootPT"))) = { + [0] = ((uint64_t)(PAGE_SIZE) | PML4_PROT), + [KERNEL_PML4_INDEX] = ((uint64_t)(PAGE_SIZE) | PML4_PROT), +}; + +#define PDPT_PROT (INTEL_PTE_VALID | INTEL_PTE_WRITE) +pdpt_entry_t BootPDPT[PTE_PER_PAGE] + __attribute__((section("__HIB, __bootPT"))) = { + [0] = ((uint64_t)(2*PAGE_SIZE) | PDPT_PROT), + [1] = ((uint64_t)(3*PAGE_SIZE) | PDPT_PROT), + [2] = ((uint64_t)(4*PAGE_SIZE) | PDPT_PROT), + [3] = ((uint64_t)(5*PAGE_SIZE) | PDPT_PROT), +}; + +#if NPGPTD != 4 +#error Please update boot_pt.c to reflect the new value of NPGPTD +#endif + +#if MACHINE_BOOTSTRAPPTD + +#define PDT_PROT (INTEL_PTE_PS | INTEL_PTE_VALID | INTEL_PTE_WRITE) +#define ID_MAP_2MEG(x) [(x)] = ((((uint64_t)(x)) << 21) | (PDT_PROT)), + +#define L0(x,n) x(n) +#define L1(x,n) L0(x,n-1) L0(x,n) +#define L2(x,n) L1(x,n-2) L1(x,n) +#define L3(x,n) L2(x,n-4) L2(x,n) +#define L4(x,n) L3(x,n-8) L3(x,n) +#define L5(x,n) L4(x,n-16) L4(x,n) +#define L6(x,n) L5(x,n-32) L5(x,n) +#define L7(x,n) L6(x,n-64) L6(x,n) +#define L8(x,n) L7(x,n-128) L7(x,n) +#define L9(x,n) L8(x,n-256) L8(x,n) +#define L10(x,n) L9(x,n-512) L9(x,n) +#define L11(x,n) L10(x,n-1024) L10(x,n) + +#define FOR_0_TO_2047(x) L11(x,2047) + +pd_entry_t BootPTD[2048] + __attribute__((section("__HIB, __bootPT"))) = { + FOR_0_TO_2047(ID_MAP_2MEG) +}; +#endif /* MACHINE_BOOTSTRAPPTD */ diff --git a/osfmk/x86_64/idt64.s b/osfmk/x86_64/idt64.s index 50bc8b991..a4a62feaa 100644 --- a/osfmk/x86_64/idt64.s +++ b/osfmk/x86_64/idt64.s @@ -27,7 +27,6 @@ */ #include #include -#include #include #include #include @@ -77,7 +76,6 @@ #define HNDL_UNIX_SCALL EXT(hndl_unix_scall) #define HNDL_MACH_SCALL EXT(hndl_mach_scall) #define HNDL_MDEP_SCALL EXT(hndl_mdep_scall) -#define HNDL_DIAG_SCALL EXT(hndl_diag_scall) #define HNDL_DOUBLE_FAULT EXT(hndl_double_fault) #define HNDL_MACHINE_CHECK EXT(hndl_machine_check) @@ -158,7 +156,7 @@ L_dispatch: push %rcx mov EXT(pal_efi_saved_cr3)(%rip), %rcx mov %rcx, %cr3 - leaq 0(%rip), %rcx + leaq (%rip), %rcx shr $32, %rcx /* splice the upper 32-bits of rip */ shl $32, %rsp /* .. and the lower 32-bits of rsp */ shrd $32, %rcx, %rsp /* to recover the full 64-bits of rsp */ @@ -181,8 +179,8 @@ L_64bit_dispatch: /* * Save segment regs - for completeness since theyre not used. */ - mov %fs, R64_FS(%rsp) - mov %gs, R64_GS(%rsp) + movl %fs, R64_FS(%rsp) + movl %gs, R64_GS(%rsp) /* Save general-purpose registers */ mov %rax, R64_RAX(%rsp) @@ -240,10 +238,10 @@ L_32bit_dispatch: /* 32-bit user task */ /* * Save segment regs */ - mov %ds, R32_DS(%rsp) - mov %es, R32_ES(%rsp) - mov %fs, R32_FS(%rsp) - mov %gs, R32_GS(%rsp) + movl %ds, R32_DS(%rsp) + movl %es, R32_ES(%rsp) + movl %fs, R32_FS(%rsp) + movl %gs, R32_GS(%rsp) /* * Save general 32-bit registers @@ -322,7 +320,7 @@ L_common_dispatch: mov %gs:CPU_ACTIVE_THREAD, %rcx /* Get the active thread */ cmpq $0, TH_PCB_IDS(%rcx) /* Is there a debug register state? */ je 3f - mov $0, %rcx /* If so, reset DR7 (the control) */ + xor %ecx, %ecx /* If so, reset DR7 (the control) */ mov %rcx, %dr7 3: incl %gs:hwIntCnt(,%ebx,4) // Bump the trap/intr count @@ -340,7 +338,7 @@ Entry(ret_to_user) mov %gs:CPU_ACTIVE_THREAD, %rdx movq TH_PCB_IDS(%rdx),%rax /* Obtain this thread's debug state */ - cmpq $0,%rax /* Is there a debug register context? */ + test %rax, %rax /* Is there a debug register context? 
*/ je 2f /* branch if not */ cmpl $(TASK_MAP_32BIT), %gs:CPU_TASK_MAP /* Are we a 32-bit task? */ jne 1f @@ -431,21 +429,21 @@ L_32bit_return: */ swapgs EXT(ret32_set_ds): - movw R32_DS(%rsp), %ds + movl R32_DS(%rsp), %ds EXT(ret32_set_es): - movw R32_ES(%rsp), %es + movl R32_ES(%rsp), %es EXT(ret32_set_fs): - movw R32_FS(%rsp), %fs + movl R32_FS(%rsp), %fs EXT(ret32_set_gs): - movw R32_GS(%rsp), %gs + movl R32_GS(%rsp), %gs /* pop compat frame + trapno, trapfn and error */ add $(ISC32_OFFSET)+8+8+8, %rsp - cmp $(SYSENTER_CS),ISF64_CS-8-8-8(%rsp) + cmpl $(SYSENTER_CS),ISF64_CS-8-8-8(%rsp) /* test for fast entry/exit */ - je L_fast_exit + je L_fast_exit EXT(ret32_iret): - iretq /* return from interrupt */ + iretq /* return from interrupt */ L_fast_exit: pop %rdx /* user return eip */ @@ -454,7 +452,7 @@ L_fast_exit: popf /* flags - carry denotes failure */ pop %rcx /* user return esp */ sti /* interrupts enabled after sysexit */ - sysexit /* 32-bit sysexit */ + .byte 0x0f,0x35 /* 32-bit sysexit */ ret_to_kernel: #if DEBUG_IDT64 @@ -553,14 +551,6 @@ Entry(idt64_mdep_scall) pushq $(MACHDEP_INT) jmp L_32bit_entry_check - -Entry(idt64_diag_scall) - swapgs /* switch to kernel gs (cpu_data) */ - push %rax /* save system call number */ - PUSH_FUNCTION(HNDL_DIAG_SCALL) - pushq $(DIAG_INT) - jmp L_32bit_entry_check - Entry(hi64_syscall) Entry(idt64_syscall) L_syscall_continue: @@ -582,6 +572,7 @@ L_syscall_continue: movq $(T_SYSCALL), ISF64_TRAPNO(%rsp) /* trapno */ leaq HNDL_SYSCALL(%rip), %r11; movq %r11, ISF64_TRAPFN(%rsp) + mov ISF64_RFLAGS(%rsp), %r11 /* Avoid info leak,restore R11 */ jmp L_64bit_dispatch /* this can only be a 64-bit task */ /* @@ -807,9 +798,9 @@ L_kernel_trap: * 24 ISF64_RIP: rip * 32 ISF64_CS: cs * 40 ISF64_RFLAGS: rflags - * 48 ISF64_RSP: rsp --> new trapno - * 56 ISF64_SS: ss --> new trapfn - * 64 pad --> new errcode + * 48 ISF64_RSP: rsp <-- new trapno + * 56 ISF64_SS: ss <-- new trapfn + * 64 pad8 <-- new errcode * 72 user rip * 80 user cs * 88 user rflags @@ -820,7 +811,7 @@ L_fault_iret: pop %rax /* recover saved %rax */ mov %rax, ISF64_RIP(%rsp) /* save rax (we don`t need saved rip) */ mov ISF64_TRAPNO(%rsp), %rax - mov %rax, ISF64_TRAPNO(%rsp)/* put in user trap number */ + mov %rax, ISF64_RSP(%rsp) /* put in user trap number */ mov ISF64_TRAPFN(%rsp), %rax mov %rax, ISF64_SS(%rsp) /* put in user trap function */ mov ISF64_ERR(%rsp), %rax /* get error code */ @@ -1024,7 +1015,7 @@ Entry(hndl_allintrs) incl %gs:CPU_INTERRUPT_LEVEL movq %gs:CPU_INT_STATE, %rdi - + CCALL(interrupt) /* call generic interrupt routine */ cli /* just in case we returned with intrs enabled */ @@ -1240,34 +1231,6 @@ Entry(hndl_mdep_scall) * always returns through thread_exception_return */ - -Entry(hndl_diag_scall) - TIME_TRAP_UENTRY - - movq %gs:CPU_KERNEL_STACK,%rdi - xchgq %rdi,%rsp /* switch to kernel stack */ - - /* Check for active vtimers in the current task */ - movq %gs:CPU_ACTIVE_THREAD,%rcx /* get current thread */ - movq TH_TASK(%rcx),%rbx /* point to current task */ - TASK_VTIMER_CHECK(%rbx,%rcx) - - pushq %rdi /* push pcb stack */ - - CCALL(diagCall) // Call diagnostics - - cli // Disable interruptions just in case - cmpl $0,%eax // What kind of return is this? - je 1f // - branch if bad (zero) - popq %rsp // Get back the pcb stack - jmp EXT(return_to_user) // Normal return, do not check asts... 
-1: - CCALL3(i386_exception, $EXC_SYSCALL, $0x6000, $1) - // pass what would be the diag syscall - // error return - cause an exception - /* no return */ - - /* * 64bit Tasks * System call entries via syscall only: @@ -1305,6 +1268,7 @@ Entry(hndl_syscall) je EXT(hndl_diag_scall64) /* Syscall class unknown */ + sti CCALL3(i386_exception, $(EXC_SYSCALL), %rax, $1) /* no return */ @@ -1338,16 +1302,16 @@ Entry(hndl_mdep_scall64) * always returns through thread_exception_return */ - Entry(hndl_diag_scall64) pushq %rdi // Push the previous stack CCALL(diagCall64) // Call diagnostics cli // Disable interruptions just in case - cmpl $0,%eax // What kind of return is this? + test %eax, %eax // What kind of return is this? je 1f // - branch if bad (zero) popq %rsp // Get back the pcb stack jmp EXT(return_to_user) // Normal return, do not check asts... 1: + sti CCALL3(i386_exception, $EXC_SYSCALL, $0x6000, $1) /* no return */ diff --git a/osfmk/x86_64/idt_table.h b/osfmk/x86_64/idt_table.h index f2f26ce13..2c1d33497 100644 --- a/osfmk/x86_64/idt_table.h +++ b/osfmk/x86_64/idt_table.h @@ -34,19 +34,11 @@ USER_TRAP(0x04,idt64_into) USER_TRAP(0x05,idt64_bounds) TRAP(0x06,idt64_invop) TRAP(0x07,idt64_nofpu) -#if MACH_KDB -TRAP_IST(0x08,idt64_db_task_dbl_fault) -#else TRAP_IST(0x08,idt64_double_fault) -#endif TRAP(0x09,idt64_fpu_over) TRAP(0x0a,idt64_inv_tss) TRAP_SPC(0x0b,idt64_segnp) -#if MACH_KDB -TRAP_IST(0x0c,idt64_db_task_stk_fault) -#else TRAP_SPC(0x0c,idt64_stack_fault) -#endif TRAP_SPC(0x0d,idt64_gen_prot) TRAP_SPC(0x0e,idt64_page_fault) TRAP(0x0f,idt64_trap_0f) @@ -172,8 +164,8 @@ USER_TRAP(0x7f, idt64_dtrace_ret) /* Required by dtrace "fasttrap" */ USER_TRAP_SPC(0x80,idt64_unix_scall) USER_TRAP_SPC(0x81,idt64_mach_scall) USER_TRAP_SPC(0x82,idt64_mdep_scall) -USER_TRAP_SPC(0x83,idt64_diag_scall) +INTERRUPT(0x83) INTERRUPT(0x84) INTERRUPT(0x85) INTERRUPT(0x86) diff --git a/osfmk/x86_64/locore.s b/osfmk/x86_64/locore.s index af3bac12a..f13db5aab 100644 --- a/osfmk/x86_64/locore.s +++ b/osfmk/x86_64/locore.s @@ -56,10 +56,7 @@ #include #include -#include -#include #include -#include #include #include diff --git a/osfmk/x86_64/loose_ends.c b/osfmk/x86_64/loose_ends.c index 10a086542..b912c6d9b 100644 --- a/osfmk/x86_64/loose_ends.c +++ b/osfmk/x86_64/loose_ends.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -582,7 +582,7 @@ flush_dcache64(addr64_t addr, unsigned count, int phys) dcache_incoherent_io_flush64(addr, count); } else { - uint32_t linesize = cpuid_info()->cache_linesize; + uint64_t linesize = cpuid_info()->cache_linesize; addr64_t bound = (addr + count + linesize -1) & ~(linesize - 1); __mfence(); while (addr < bound) { @@ -646,6 +646,20 @@ kdp_register_callout(void) } #endif +/* + * Return a uniformly distributed 64-bit random number. + * + * This interface should have minimal dependencies on kernel + * services, and thus be available very early in the life + * of the kernel. But as a result, it may not be very random + * on all platforms. 
+ */ +uint64_t +early_random(void) +{ + return (ml_early_random()); +} + #if !CONFIG_VMX int host_vmxon(boolean_t exclusive __unused) { diff --git a/osfmk/x86_64/lowglobals.h b/osfmk/x86_64/lowglobals.h index e4f404b80..cde29d955 100644 --- a/osfmk/x86_64/lowglobals.h +++ b/osfmk/x86_64/lowglobals.h @@ -46,33 +46,27 @@ * which is in lowmem_vectors.s */ -/* - * This is where we put constants, pointers, and data areas that must be accessed - * quickly through assembler. They are designed to be accessed directly with - * absolute addresses, not via a base register. This is a global area, and not - * per processor. - */ - #pragma pack(8) /* Make sure the structure stays as we defined it */ typedef struct lowglo { unsigned char lgVerCode[8]; /* 0xffffff8000002000 System verification code */ - uint64_t lgZero[2]; /* 0xffffff8000002008 Double constant 0 */ - uint64_t lgRsv010; /* 0xffffff8000002018 Reserved */ - uint64_t lgCHUDXNUfnStart; /* 0xffffff8000002020 CHUD XNU function glue table */ - uint64_t lgRsv018; /* 0xffffff8000002028 Reserved */ - uint64_t lgVersion; /* 0xffffff8000002030 Pointer to kernel version string */ - uint64_t lgRsv020[280]; /* 0xffffff8000002038 Reserved */ - uint64_t lgKmodptr; /* 0xffffff80000028f8 Pointer to kmod, debugging aid */ - uint64_t lgTransOff; /* 0xffffff8000002900 Pointer to kdp_trans_off, debugging aid */ - uint64_t lgReadIO; /* 0xffffff8000002908 Pointer to kdp_read_io, debugging aid */ - uint64_t lgDevSlot1; /* 0xffffff8000002910 For developer use */ - uint64_t lgDevSlot2; /* 0xffffff8000002918 For developer use */ - uint64_t lgOSVersion; /* 0xffffff8000002920 Pointer to OS version string */ - uint64_t lgRebootFlag; /* 0xffffff8000002928 Pointer to debugger reboot trigger */ - uint64_t lgManualPktAddr; /* 0xffffff8000002930 Pointer to manual packet structure */ + uint64_t lgZero; /* 0xffffff8000002008 Double constant 0 */ + uint64_t lgStext; /* 0xffffff8000002010 Start of kernel text */ + uint64_t lgRsv018; /* 0xffffff8000002018 Reserved */ + uint64_t lgCHUDXNUfnStart; /* 0xffffff8000002020 CHUD XNU function glue table */ + uint64_t lgRsv028; /* 0xffffff8000002028 Reserved */ + uint64_t lgVersion; /* 0xffffff8000002030 Pointer to kernel version string */ + uint64_t lgRsv038[280]; /* 0xffffff8000002038 Reserved */ + uint64_t lgKmodptr; /* 0xffffff80000028f8 Pointer to kmod, debugging aid */ + uint64_t lgTransOff; /* 0xffffff8000002900 Pointer to kdp_trans_off, debugging aid */ + uint64_t lgReadIO; /* 0xffffff8000002908 Pointer to kdp_read_io, debugging aid */ + uint64_t lgDevSlot1; /* 0xffffff8000002910 For developer use */ + uint64_t lgDevSlot2; /* 0xffffff8000002918 For developer use */ + uint64_t lgOSVersion; /* 0xffffff8000002920 Pointer to OS version string */ + uint64_t lgRebootFlag; /* 0xffffff8000002928 Pointer to debugger reboot trigger */ + uint64_t lgManualPktAddr; /* 0xffffff8000002930 Pointer to manual packet structure */ - uint64_t lgRsv49C[217]; /* 0xffffff8000002938 Reserved - push to 1 page */ + uint64_t lgRsv938[217]; /* 0xffffff8000002938 Reserved - push to 1 page */ } lowglo; #pragma pack() extern lowglo lowGlo; diff --git a/osfmk/ddb/db_output.h b/osfmk/x86_64/lowmem_vectors.c similarity index 68% rename from osfmk/ddb/db_output.h rename to osfmk/x86_64/lowmem_vectors.c index 39052640c..d1d1e7f24 100644 --- a/osfmk/ddb/db_output.h +++ b/osfmk/x86_64/lowmem_vectors.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,6 +25,7 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ + /* * @OSF_COPYRIGHT@ */ @@ -53,35 +54,53 @@ * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ -/* - */ -/* - * Author: David B. Golub, Carnegie Mellon University - * Date: 8/90 - */ -/* - * Printing routines for kernel debugger. +#include +#include +#include +#include + +/* + * on x86_64 the low mem vectors live here and get mapped to 0xffffff8000002000 at + * system startup time */ -#ifndef _DDB_DB_OUTPUT_H_ -#define _DDB_DB_OUTPUT_H_ +extern void *version; +extern void *kmod; +extern void *kdp_trans_off; +extern void *kdp_read_io; +extern void *osversion; +extern void *flag_kdp_trigger_reboot; +extern void *manual_pkt; -#include +lowglo lowGlo __attribute__ ((aligned(PAGE_SIZE))) = { -extern int db_indent; + .lgVerCode = { 'C','a','t','f','i','s','h',' ' }, -/* - * Prototypes for functions exported by this module. - */ -void db_force_whitespace(void); -void db_putchar(char c); -int db_print_position(void); -void db_end_line(void); -void db_printf(const char *fmt, ...); -void kdbprintf(const char *fmt, ...); -void iprintf(const char *fmt, ...); -boolean_t db_reserve_output_position(int len); -void db_reset_more(void); -void db_output_prompt(void); -#endif /* !_DDB_DB_OUTPUT_H_ */ + .lgCHUDXNUfnStart = 0, + + .lgVersion = (uint64_t) &version, + + .lgKmodptr = (uint64_t) &kmod, + +#if MACH_KDP + .lgTransOff = (uint64_t) &kdp_trans_off, + .lgReadIO = (uint64_t) &kdp_read_io, +#else + .lgTransOff = 0, + .lgReadIO = 0, +#endif + + .lgDevSlot1 = 0, + .lgDevSlot2 = 0, + + .lgOSVersion = (uint64_t) &osversion, + +#if MACH_KDP + .lgRebootFlag = (uint64_t) &flag_kdp_trigger_reboot, + .lgManualPktAddr = (uint64_t) &manual_pkt, +#else + .lgRebootFlag = 0, + .lgManualPktAddr = 0, +#endif +}; diff --git a/osfmk/x86_64/lowmem_vectors.s b/osfmk/x86_64/lowmem_vectors.s deleted file mode 100644 index 40133c4a5..000000000 --- a/osfmk/x86_64/lowmem_vectors.s +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ - -#include -#include -#include - -#include -#include -#include -#include - - -/* - * on x86_64 the low mem vectors live here and get mapped to 0xffffff8000200000 at - * system startup time - */ - - .text - .align 12 - .globl EXT(lowGlo) -EXT(lowGlo): - - .ascii "Catfish " /* +0x000 System verification code */ - .quad 0 /* +0x008 Double constant 0 */ - .quad 0 - .quad 0 /* +0x018 Reserved */ - .quad 0 /* +0x020 Reserved */ - .quad 0 /* +0x028 Reserved */ - .quad EXT(version) /* +0x030 Pointer to kernel version string */ - .fill 560, 4, 0 /* +0x038 Reserved - rdar://problem/5783217 */ - .quad EXT(kmod) /* +0x8f8 Pointer to kmod, debugging aid */ -#if MACH_KDP - .quad EXT(kdp_trans_off) /* +0x900 Pointer to kdp_trans_off, debugging aid */ - .quad EXT(kdp_read_io) /* +0x908 Pointer to kdp_read_io, debugging aid */ -#else - .quad 0 /* +0x900 Reserved */ - .quad 0 /* +0x908 Reserved */ -#endif - .quad 0 /* +0x910 Reserved for developer use */ - .quad 0 /* +0x918 Reserved for developer use */ - .quad EXT(osversion) /* +0x920 Pointer to osversion string */ -#if MACH_KDP - .quad EXT(flag_kdp_trigger_reboot) /* +0x928 Pointer to debugger reboot trigger */ - .quad EXT(manual_pkt) /* +0x930 Pointer to manual packet structure */ -#else - .quad 0 /* +0x928 Reserved */ - .quad 0 /* +0x930 Reserved */ -#endif - .fill 434, 4, 0 /* pad to 0x1000 (page size) - rdar://problem/5783217 */ diff --git a/osfmk/x86_64/machine_routines_asm.s b/osfmk/x86_64/machine_routines_asm.s index 1c74f9fc8..362887586 100644 --- a/osfmk/x86_64/machine_routines_asm.s +++ b/osfmk/x86_64/machine_routines_asm.s @@ -33,6 +33,7 @@ #include #include +#include #include /* @@ -161,7 +162,7 @@ Lslow: hlt .data 1: String "_rtc_nanotime_read() - slow algorithm not supported" - + .text Entry(call_continuation) movq %rdi,%rcx /* get continuation */ @@ -173,3 +174,86 @@ Entry(call_continuation) movq %gs:CPU_ACTIVE_THREAD,%rdi call EXT(thread_terminate) +Entry(x86_init_wrapper) + xor %rbp, %rbp + movq %rsi, %rsp + callq *%rdi + + /* + * Generate a 64-bit quantity with possibly random characteristics, intended for use + * before the kernel entropy pool is available. The processor's RNG is used if + * available, and a value derived from the Time Stamp Counter is returned if not. + * Multiple invocations may result in well-correlated values if sourced from the TSC. 
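
The Entry(ml_early_random) that follows implements this in assembly: CPUID leaf 1 is probed for RDRAND (ECX bit 30), and if the DRBG is absent or reports failure, the TSC is folded together with a few instruction-pointer bits. A rough userspace C rendering using compiler intrinsics; illustrative only, and the TSC mixing here is a simplification, not a faithful transcription of the rotate/shift sequence in the assembly:

    #include <stdint.h>
    #include <cpuid.h>        /* __get_cpuid() */
    #include <immintrin.h>    /* _rdrand64_step(); build with -mrdrnd */
    #include <x86intrin.h>    /* __rdtsc() */

    /* Illustrative only: mirrors the probe-then-fallback shape of
     * ml_early_random(). */
    static uint64_t early_random_sketch(void)
    {
        unsigned int a, b, c, d;
        unsigned long long r;

        if (__get_cpuid(1, &a, &b, &c, &d) && (c & (1u << 30))) {
            if (_rdrand64_step(&r))     /* CF=1 means the DRBG delivered */
                return r;
        }
        /* Fallback: a weakly random value derived from the TSC; consecutive
         * calls are well-correlated, exactly as the comment above warns. */
        uint64_t tsc = __rdtsc();
        return tsc ^ (tsc << 16) ^ (tsc >> 27);
    }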
+ */ +Entry(ml_early_random) + mov %rbx, %rsi + mov $1, %eax + cpuid + mov %rsi, %rbx + test $(1 << 30), %ecx + jz Lnon_rdrand + RDRAND_RAX /* RAX := 64 bits of DRBG entropy */ + jnc Lnon_rdrand + ret +Lnon_rdrand: + rdtsc /* EDX:EAX := TSC */ + /* Distribute low order bits */ + mov %eax, %ecx + xor %al, %ah + shl $16, %rcx + xor %rcx, %rax + xor %eax, %edx + + /* Incorporate ASLR entropy, if any */ + lea (%rip), %rcx + shr $21, %rcx + movzbl %cl, %ecx + shl $16, %ecx + xor %ecx, %edx + + mov %ah, %cl + ror %cl, %edx /* Right rotate EDX (TSC&0xFF ^ (TSC>>8 & 0xFF))&1F */ + shl $32, %rdx + xor %rdx, %rax + mov %cl, %al + ret + +#if CONFIG_VMX + +/* + * __vmxon -- Enter VMX Operation + * int __vmxon(addr64_t v); + */ +Entry(__vmxon) + FRAME + push %rdi + + mov $(VMX_FAIL_INVALID), %ecx + mov $(VMX_FAIL_VALID), %edx + mov $(VMX_SUCCEED), %eax + vmxon (%rsp) + cmovcl %ecx, %eax /* CF = 1, ZF = 0 */ + cmovzl %edx, %eax /* CF = 0, ZF = 1 */ + + pop %rdi + EMARF + ret + +/* + * __vmxoff -- Leave VMX Operation + * int __vmxoff(void); + */ +Entry(__vmxoff) + FRAME + + mov $(VMX_FAIL_INVALID), %ecx + mov $(VMX_FAIL_VALID), %edx + mov $(VMX_SUCCEED), %eax + vmxoff + cmovcl %ecx, %eax /* CF = 1, ZF = 0 */ + cmovzl %edx, %eax /* CF = 0, ZF = 1 */ + + EMARF + ret + +#endif /* CONFIG_VMX */ diff --git a/osfmk/x86_64/pmap.c b/osfmk/x86_64/pmap.c index 147372eff..2bc5bfab7 100644 --- a/osfmk/x86_64/pmap.c +++ b/osfmk/x86_64/pmap.c @@ -89,7 +89,6 @@ */ #include -#include #include #include @@ -100,6 +99,7 @@ #include #include #include +#include #include #include @@ -135,17 +135,13 @@ #include #include -#if MACH_KDB -#include -#include -#include -#include -#endif /* MACH_KDB */ - #include #include #include +#include + +#include #ifdef IWANTTODEBUG @@ -204,7 +200,7 @@ static struct vm_object kpdptobj_object_store; * One byte per physical page. */ char *pmap_phys_attributes; -unsigned int last_managed_page = 0; +ppnum_t last_managed_page = 0; /* * Amount of virtual memory mapped by one @@ -226,9 +222,6 @@ pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE]; struct pmap kernel_pmap_store; pmap_t kernel_pmap; -pd_entry_t high_shared_pde; -pd_entry_t commpage64_pde; - struct zone *pmap_zone; /* zone of pmap structures */ struct zone *pmap_anchor_zone; @@ -252,12 +245,11 @@ pt_entry_t *DMAP1, *DMAP2; caddr_t DADDR1; caddr_t DADDR2; -/* - * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain. - * properly deals with the anchor. - * must be called with the hash locked, does not unlock it - */ +const boolean_t pmap_disable_kheap_nx = FALSE; +const boolean_t pmap_disable_kstack_nx = FALSE; +extern boolean_t doconstro_override; +extern long __stack_chk_guard[]; /* * Map memory at initialization. 
The physical addresses being @@ -279,7 +271,7 @@ pmap_map( ps = PAGE_SIZE; while (start_addr < end_addr) { pmap_enter(kernel_pmap, (vm_map_offset_t)virt, - (ppnum_t) i386_btop(start_addr), prot, flags, FALSE); + (ppnum_t) i386_btop(start_addr), prot, VM_PROT_NONE, flags, TRUE); virt += ps; start_addr += ps; } @@ -293,7 +285,8 @@ extern vm_offset_t sHIB; extern vm_offset_t eHIB; extern vm_offset_t stext; extern vm_offset_t etext; -extern vm_offset_t sdata; +extern vm_offset_t sdata, edata; +extern vm_offset_t sconstdata, econstdata; extern void *KPTphys; @@ -355,7 +348,7 @@ pmap_bootstrap( kernel_pmap = &kernel_pmap_store; kernel_pmap->ref_count = 1; - kernel_pmap->nx_enabled = FALSE; + kernel_pmap->nx_enabled = TRUE; kernel_pmap->pm_task_map = TASK_MAP_64BIT; kernel_pmap->pm_obj = (vm_object_t) NULL; kernel_pmap->dirbase = (pd_entry_t *)((uintptr_t)IdlePTD); @@ -435,6 +428,27 @@ pmap_bootstrap( if (pmap_smep_enabled) printf("PMAP: Supervisor Mode Execute Protection enabled\n"); +#if DEBUG + printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]); + printf("ml_early_random(): 0x%qx\n", ml_early_random()); +#endif + boolean_t ptmp; + /* Check if the user has requested disabling stack or heap no-execute + * enforcement. These are "const" variables; that qualifier is cast away + * when altering them. The TEXT/DATA const sections are marked + * write protected later in the kernel startup sequence, so altering + * them is possible at this point, in pmap_bootstrap(). + */ + if (PE_parse_boot_argn("-pmap_disable_kheap_nx", &ptmp, sizeof(ptmp))) { + boolean_t *pdknxp = (boolean_t *) &pmap_disable_kheap_nx; + *pdknxp = TRUE; + } + + if (PE_parse_boot_argn("-pmap_disable_kstack_nx", &ptmp, sizeof(ptmp))) { + boolean_t *pdknhp = (boolean_t *) &pmap_disable_kstack_nx; + *pdknhp = TRUE; + } + boot_args *args = (boot_args *)PE_state.bootArgs; if (args->efiMode == kBootArgsEfiMode32) { printf("EFI32: kernel virtual space limited to 4GB\n"); @@ -552,7 +566,7 @@ pmap_init(void) for (i = 0; i < pmap_memory_region_count; i++, pmptr++) { if (pmptr->type != kEfiConventionalMemory) continue; - unsigned int pn; + ppnum_t pn; for (pn = pmptr->base; pn <= pmptr->end; pn++) { if (pn < last_pn) { pmap_phys_attributes[pn] |= PHYS_MANAGED; @@ -584,13 +598,12 @@ pmap_init(void) pmap_anchor_zone = zinit(PAGE_SIZE, task_max, PAGE_SIZE, "pagetable anchors"); zone_change(pmap_anchor_zone, Z_NOENCRYPT, TRUE); -#if ZONE_DEBUG /* The anchor is required to be page aligned. Zone debugging adds - * padding which may violate that requirement. Disable it - * to avoid assumptions. + * padding which may violate that requirement. Tell the zone + * subsystem that alignment is required. */ - zone_debug_disable(pmap_anchor_zone); -#endif + + zone_change(pmap_anchor_zone, Z_ALIGNMENT_REQUIRED, TRUE); s = (vm_size_t) sizeof(struct pv_hashed_entry); pv_hashed_list_zone = zinit(s, 10000*s /* Expandable zone */, @@ -619,7 +632,41 @@ pmap_init(void) * Ensure the kernel's PML4 entry exists for the basement * before this is shared with any user. 
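
The -pmap_disable_kheap_nx / -pmap_disable_kstack_nx checks earlier in this hunk follow the usual PE_parse_boot_argn() pattern: probe the boot line for a bare flag, and only on a hit cast away the const. A self-contained sketch of just the flag probe, with parse_boot_flag() as a hypothetical stand-in for the kernel routine:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* Hypothetical stand-in for PE_parse_boot_argn(): true if the named
     * flag appears as a whitespace-delimited token in the boot line. */
    static bool parse_boot_flag(const char *bootargs, const char *name)
    {
        size_t n = strlen(name);
        for (const char *p = bootargs; (p = strstr(p, name)) != NULL; p += n) {
            bool starts = (p == bootargs) || (p[-1] == ' ');
            bool ends   = (p[n] == '\0') || (p[n] == ' ');
            if (starts && ends)
                return true;
        }
        return false;
    }

    int main(void)
    {
        const char *bootargs = "debug=0x144 -pmap_disable_kheap_nx";
        /* Mirrors the shape of the pmap_bootstrap() checks above. */
        bool disable_kheap_nx = parse_boot_flag(bootargs, "-pmap_disable_kheap_nx");
        printf("kernel heap NX %s\n", disable_kheap_nx ? "disabled" : "enabled");
        return 0;
    }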
 */
-	pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT);
+	pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT, PMAP_EXPAND_OPTIONS_NONE);
+}
+
+static
+void pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro) {
+	uint64_t ev = sv + nxrosz, cv = sv;
+	pd_entry_t *pdep;
+	pt_entry_t *ptep = NULL;
+
+	assert(((sv & 0xFFFULL) | (nxrosz & 0xFFFULL)) == 0);
+
+	for (pdep = pmap_pde(npmap, cv); pdep != NULL && (cv < ev);) {
+		uint64_t pdev = (cv & ~((uint64_t)PDEMASK));
+
+		if (*pdep & INTEL_PTE_PS) {
+			if (NX)
+				*pdep |= INTEL_PTE_NX;
+			if (ro)
+				*pdep &= ~INTEL_PTE_WRITE;
+			cv += NBPD;
+			cv &= ~((uint64_t) PDEMASK);
+			pdep = pmap_pde(npmap, cv);
+			continue;
+		}
+
+		for (ptep = pmap_pte(npmap, cv); ptep != NULL && (cv < (pdev + NBPD)) && (cv < ev);) {
+			if (NX)
+				*ptep |= INTEL_PTE_NX;
+			if (ro)
+				*ptep &= ~INTEL_PTE_WRITE;
+			cv += NBPT;
+			ptep = pmap_pte(npmap, cv);
+		}
+	}
+	DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0);
 }

 /*
@@ -667,50 +714,72 @@ pmap_init(void)
  * 4K pages covering [stext,etext] are coalesced as 2M large pages.
  * The now unused level-1 PTE pages are also freed.
  */
-extern uint32_t pmap_reserved_ranges;
+extern ppnum_t vm_kernel_base_page;
 void
 pmap_lowmem_finalize(void)
 {
 	spl_t           spl;
 	int		i;

-	/* Check the kernel is linked at the expected base address */
-	if (i386_btop(kvtophys((vm_offset_t) &IdlePML4)) !=
-	    I386_KERNEL_IMAGE_BASE_PAGE)
-		panic("pmap_lowmem_finalize() unexpected kernel base address");
-
 	/*
 	 * Update wired memory statistics for early boot pages
 	 */
-	PMAP_ZINFO_PALLOC(bootstrap_wired_pages * PAGE_SIZE);
+	PMAP_ZINFO_PALLOC(kernel_pmap, bootstrap_wired_pages * PAGE_SIZE);

 	/*
-	 * Free all pages in pmap regions below the base:
+	 * Free pages in pmap regions below the base:
 	 * rdar://6332712
 	 *	We can't free all the pages to VM that EFI reports available.
 	 *	Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake.
 	 *	There's also a size miscalculation here: pend is one page less
 	 *	than it should be but this is not fixed to be backwards
 	 *	compatible.
-	 * Due to this current EFI limitation, we take only the first
-	 * entry in the memory region table. However, the loop is retained
-	 * (with the intended termination criteria commented out) in the
-	 * hope that some day we can free all low-memory ranges.
+	 * This is important for KASLR because up to 256*2MB = 512MB of space
+	 * has to be released to VM.
 	 */
 	for (i = 0;
-//	     pmap_memory_regions[i].end <= I386_KERNEL_IMAGE_BASE_PAGE;
-	     i < 1 && (pmap_reserved_ranges == 0);
+	     pmap_memory_regions[i].end < vm_kernel_base_page;
 	     i++) {
-		vm_offset_t	pbase = (vm_offset_t)i386_ptob(pmap_memory_regions[i].base);
-		vm_offset_t	pend  = (vm_offset_t)i386_ptob(pmap_memory_regions[i].end);
-//		vm_offset_t	pend  = i386_ptob(pmap_memory_regions[i].end+1);
+		vm_offset_t pbase = i386_ptob(pmap_memory_regions[i].base);
+		vm_offset_t pend  = i386_ptob(pmap_memory_regions[i].end+1);

-		DBG("ml_static_mfree(%p,%p) for pmap region %d\n",
+		DBG("pmap region %d [%p..[%p\n",
+		    i, (void *) pbase, (void *) pend);
+
+		if (pmap_memory_regions[i].attribute & EFI_MEMORY_KERN_RESERVED)
+			continue;
+		/*
+		 * rdar://6332712
+		 * Adjust limits not to free pages in range 0xc0000-0xff000.
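
The clamping the next lines perform reads most easily as a standalone helper: free [base, end) while carving out the sleep/wake-unsafe window. A minimal sketch, with free_fn standing in for ml_static_mfree() (the helper and its callback are hypothetical names):

    #include <stdint.h>

    #define RSV_LO 0xc0000u    /* pages unsafe across sleep/wake, see above */
    #define RSV_HI 0x100000u

    /* Sketch of the clamping in pmap_lowmem_finalize(): free [base, end)
     * except for the reserved window, freeing the part below it first
     * when the range spans the whole window. */
    static void free_outside_reserved(uint32_t base, uint32_t end,
                                      void (*free_fn)(uint32_t, uint32_t))
    {
        if (base >= RSV_LO && end <= RSV_HI)
            return;                          /* entirely reserved */
        if (base < RSV_LO && end > RSV_HI) {
            free_fn(base, RSV_LO - base);    /* free the low part ... */
            base = RSV_HI;                   /* ... then fall through */
        }
        if (base < RSV_LO && end > RSV_LO)
            end = RSV_LO;
        if (end > RSV_HI && base < RSV_HI)
            base = RSV_HI;
        if (base < end)
            free_fn(base, end - base);
    }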
+ */ + if (pbase >= 0xc0000 && pend <= 0x100000) + continue; + if (pbase < 0xc0000 && pend > 0x100000) { + /* page range entirely within region, free lower part */ + DBG("- ml_static_mfree(%p,%p)\n", + (void *) ml_static_ptovirt(pbase), + (void *) (0xc0000-pbase)); + ml_static_mfree(ml_static_ptovirt(pbase),0xc0000-pbase); + pbase = 0x100000; + } + if (pbase < 0xc0000) + pend = MIN(pend, 0xc0000); + if (pend > 0x100000) + pbase = MAX(pbase, 0x100000); + DBG("- ml_static_mfree(%p,%p)\n", (void *) ml_static_ptovirt(pbase), - (void *) (pend - pbase), i); + (void *) (pend - pbase)); ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase); } + /* A final pass to get rid of all initial identity mappings to + * low pages. + */ + DPRINTF("%s: Removing mappings from 0->0x%lx\n", __FUNCTION__, vm_kernel_base); + + /* Remove all mappings past the descriptor aliases and low globals */ + pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base); + /* * If text and data are both 2MB-aligned, * we can map text with large-pages, @@ -746,7 +815,7 @@ pmap_lowmem_finalize(void) ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva); if (ptep) - pmap_store_pte(ptep, *ptep & ~INTEL_PTE_RW); + pmap_store_pte(ptep, *ptep & ~INTEL_PTE_WRITE); } } @@ -784,7 +853,7 @@ pmap_lowmem_finalize(void) pde |= pte_phys; /* take page frame from pte */ if (wpkernel) - pde &= ~INTEL_PTE_RW; + pde &= ~INTEL_PTE_WRITE; DBG("pmap_store_pte(%p,0x%llx)\n", (void *)pdep, pde); pmap_store_pte(pdep, pde); @@ -807,19 +876,98 @@ pmap_lowmem_finalize(void) pmap_kernel_text_ps = I386_LPGBYTES; } - /* map lowmem global page into fixed addr */ - pt_entry_t *pte = NULL; - if (0 == (pte = pmap_pte(kernel_pmap, - VM_MIN_KERNEL_LOADED_ADDRESS + 0x2000))) - panic("lowmem pte"); - /* make sure it is defined on page boundary */ - assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK)); - pmap_store_pte(pte, kvtophys((vm_offset_t)&lowGlo) - | INTEL_PTE_REF - | INTEL_PTE_MOD - | INTEL_PTE_WIRED - | INTEL_PTE_VALID - | INTEL_PTE_RW); + boolean_t doconstro = TRUE; + + (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro)); + + if ((sconstdata | econstdata) & PAGE_MASK) { + kprintf("Const DATA misaligned 0x%lx 0x%lx\n", sconstdata, econstdata); + if ((sconstdata & PAGE_MASK) || (doconstro_override == FALSE)) + doconstro = FALSE; + } + + if ((sconstdata > edata) || (sconstdata < sdata) || ((econstdata - sconstdata) >= (edata - sdata))) { + kprintf("Const DATA incorrect size 0x%lx 0x%lx 0x%lx 0x%lx\n", sconstdata, econstdata, sdata, edata); + doconstro = FALSE; + } + + if (doconstro) + kprintf("Marking const DATA read-only\n"); + + vm_offset_t dva; + + for (dva = sdata; dva < edata; dva += I386_PGBYTES) { + assert(((sdata | edata) & PAGE_MASK) == 0); + if ( (sdata | edata) & PAGE_MASK) { + kprintf("DATA misaligned, 0x%lx, 0x%lx\n", sdata, edata); + break; + } + + pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva); + + dpte = *dptep; + + assert((dpte & INTEL_PTE_VALID)); + if ((dpte & INTEL_PTE_VALID) == 0) { + kprintf("Missing data mapping 0x%lx 0x%lx 0x%lx\n", dva, sdata, edata); + continue; + } + + dpte |= INTEL_PTE_NX; + if (doconstro && (dva >= sconstdata) && (dva < econstdata)) { + dpte &= ~INTEL_PTE_WRITE; + } + pmap_store_pte(dptep, dpte); + } + kernel_segment_command_t * seg; + kernel_section_t * sec; + + for (seg = firstseg(); seg != NULL; seg = nextsegfromheader(&_mh_execute_header, seg)) { + if (!strcmp(seg->segname, "__TEXT") || + !strcmp(seg->segname, "__DATA")) { + continue; + } + //XXX + if (!strcmp(seg->segname, 
"__KLD")) { + continue; + } + if (!strcmp(seg->segname, "__HIB")) { + for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) { + if (sec->addr & PAGE_MASK) + panic("__HIB segment's sections misaligned"); + if (!strcmp(sec->sectname, "__text")) { + pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), FALSE, TRUE); + } else { + pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), TRUE, FALSE); + } + } + } else { + pmap_mark_range(kernel_pmap, seg->vmaddr, round_page_64(seg->vmsize), TRUE, FALSE); + } + } + + /* + * If we're debugging, map the low global vector page at the fixed + * virtual address. Otherwise, remove the mapping for this. + */ + if (debug_boot_arg) { + pt_entry_t *pte = NULL; + if (0 == (pte = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS))) + panic("lowmem pte"); + /* make sure it is defined on page boundary */ + assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK)); + pmap_store_pte(pte, kvtophys((vm_offset_t)&lowGlo) + | INTEL_PTE_REF + | INTEL_PTE_MOD + | INTEL_PTE_WIRED + | INTEL_PTE_VALID + | INTEL_PTE_WRITE + | INTEL_PTE_NX); + } else { + pmap_remove(kernel_pmap, + LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE); + } + splx(spl); if (pmap_pcid_ncpus) tlb_flush_global(); @@ -908,6 +1056,7 @@ pmap_is_empty( */ pmap_t pmap_create( + ledger_t ledger, vm_map_size_t sz, boolean_t is_64bit) { @@ -942,10 +1091,13 @@ pmap_create( p->ref_count = 1; p->nx_enabled = 1; p->pm_shared = FALSE; + ledger_reference(ledger); + p->ledger = ledger; p->pm_task_map = is_64bit ? TASK_MAP_64BIT : TASK_MAP_32BIT;; if (pmap_pcid_ncpus) pmap_pcid_initialize(p); + p->pm_pml4 = zalloc(pmap_anchor_zone); pmap_assert((((uintptr_t)p->pm_pml4) & PAGE_MASK) == 0); @@ -973,7 +1125,7 @@ pmap_create( kpml4 = kernel_pmap->pm_pml4; pml4[KERNEL_PML4_INDEX] = kpml4[KERNEL_PML4_INDEX]; pml4[KERNEL_KEXTS_INDEX] = kpml4[KERNEL_KEXTS_INDEX]; - pml4[KERNEL_PHYSMAP_INDEX] = kpml4[KERNEL_PHYSMAP_INDEX]; + pml4[KERNEL_PHYSMAP_PML4_INDEX] = kpml4[KERNEL_PHYSMAP_PML4_INDEX]; PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, p, is_64bit, 0, 0, 0); @@ -1043,8 +1195,8 @@ pmap_destroy(pmap_t p) vm_object_deallocate(p->pm_obj); OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count); - PMAP_ZINFO_PFREE(inuse_ptepages * PAGE_SIZE); - + PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE); + ledger_dereference(p->ledger); zfree(pmap_zone, p); PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END, @@ -1079,7 +1231,6 @@ pmap_remove_some_phys( } - /* * Set the physical protection on the * specified range of this map as requested. 
@@ -1142,19 +1293,14 @@ pmap_protect( continue; if (prot & VM_PROT_WRITE) - pmap_update_pte(spte, *spte, - *spte | INTEL_PTE_WRITE); + pmap_update_pte(spte, 0, INTEL_PTE_WRITE); else - pmap_update_pte(spte, *spte, - *spte & ~INTEL_PTE_WRITE); + pmap_update_pte(spte, INTEL_PTE_WRITE, 0); if (set_NX) - pmap_update_pte(spte, *spte, - *spte | INTEL_PTE_NX); + pmap_update_pte(spte, 0, INTEL_PTE_NX); else - pmap_update_pte(spte, *spte, - *spte & ~INTEL_PTE_NX); - + pmap_update_pte(spte, INTEL_PTE_NX, 0); num_found++; } } @@ -1190,17 +1336,17 @@ pmap_map_block( cur_page_size = PAGE_SIZE; for (page = 0; page < size; page+=cur_page_size/PAGE_SIZE) { - pmap_enter(pmap, va, pa, prot, attr, TRUE); + pmap_enter(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE); va += cur_page_size; pa+=cur_page_size/PAGE_SIZE; } } - -void +kern_return_t pmap_expand_pml4( pmap_t map, - vm_map_offset_t vaddr) + vm_map_offset_t vaddr, + unsigned int options) { vm_page_t m; pmap_paddr_t pa; @@ -1213,9 +1359,11 @@ pmap_expand_pml4( /* * Allocate a VM page for the pml4 page */ - while ((m = vm_page_grab()) == VM_PAGE_NULL) + while ((m = vm_page_grab()) == VM_PAGE_NULL) { + if (options & PMAP_EXPAND_OPTIONS_NOWAIT) + return KERN_RESOURCE_SHORTAGE; VM_PAGE_WAIT(); - + } /* * put the page into the pmap's obj list so it * can be found later. @@ -1235,7 +1383,7 @@ pmap_expand_pml4( OSAddAtomic(1, &inuse_ptepages_count); OSAddAtomic64(1, &alloc_ptepages_count); - PMAP_ZINFO_PALLOC(PAGE_SIZE); + PMAP_ZINFO_PALLOC(map, PAGE_SIZE); /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */ vm_object_lock(map->pm_obj_pml4); @@ -1251,8 +1399,8 @@ pmap_expand_pml4( VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); - PMAP_ZINFO_PFREE(PAGE_SIZE); - return; + PMAP_ZINFO_PFREE(map, PAGE_SIZE); + return KERN_SUCCESS; } #if 0 /* DEBUG */ @@ -1276,13 +1424,11 @@ pmap_expand_pml4( PMAP_UNLOCK(map); - return; + return KERN_SUCCESS; } -void -pmap_expand_pdpt( - pmap_t map, - vm_map_offset_t vaddr) +kern_return_t +pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, unsigned int options) { vm_page_t m; pmap_paddr_t pa; @@ -1293,14 +1439,19 @@ pmap_expand_pdpt( DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr); while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) { - pmap_expand_pml4(map, vaddr); + kern_return_t pep4kr = pmap_expand_pml4(map, vaddr, options); + if (pep4kr != KERN_SUCCESS) + return pep4kr; } /* * Allocate a VM page for the pdpt page */ - while ((m = vm_page_grab()) == VM_PAGE_NULL) + while ((m = vm_page_grab()) == VM_PAGE_NULL) { + if (options & PMAP_EXPAND_OPTIONS_NOWAIT) + return KERN_RESOURCE_SHORTAGE; VM_PAGE_WAIT(); + } /* * put the page into the pmap's obj list so it @@ -1321,7 +1472,7 @@ pmap_expand_pdpt( OSAddAtomic(1, &inuse_ptepages_count); OSAddAtomic64(1, &alloc_ptepages_count); - PMAP_ZINFO_PALLOC(PAGE_SIZE); + PMAP_ZINFO_PALLOC(map, PAGE_SIZE); /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */ vm_object_lock(map->pm_obj_pdpt); @@ -1337,8 +1488,8 @@ pmap_expand_pdpt( VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); - PMAP_ZINFO_PFREE(PAGE_SIZE); - return; + PMAP_ZINFO_PFREE(map, PAGE_SIZE); + return KERN_SUCCESS; } #if 0 /* DEBUG */ @@ -1362,7 +1513,7 @@ pmap_expand_pdpt( PMAP_UNLOCK(map); - return; + return KERN_SUCCESS; } @@ -1383,10 +1534,11 @@ pmap_expand_pdpt( * has been expanded enough. * (We won't loop forever, since page tables aren't shrunk.) 
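
pmap_expand() below gets the same treatment as pmap_expand_pml4() and pmap_expand_pdpt() above: an options word that turns the blocking grab/wait loop into a fail-fast path returning KERN_RESOURCE_SHORTAGE. The idiom in isolation; the PMAP_EXPAND_OPTIONS_NOWAIT value is assumed, and the VM calls are extern stubs standing in for the real ones:

    #include <stdint.h>
    #include <stddef.h>

    #define KERN_SUCCESS               0
    #define KERN_RESOURCE_SHORTAGE     6    /* value from mach/kern_return.h */
    #define PMAP_EXPAND_OPTIONS_NOWAIT 0x1  /* assumed encoding */

    typedef int kern_return_t;
    typedef struct vm_page *vm_page_t;

    /* Stubs standing in for the VM layer. */
    extern vm_page_t vm_page_grab(void);
    extern void VM_PAGE_WAIT(void);

    /* Sketch of the allocation idiom the hunks above add: spin on the
     * grab/wait pair unless the caller asked not to block. */
    static kern_return_t grab_page(unsigned int options, vm_page_t *out)
    {
        vm_page_t m;
        while ((m = vm_page_grab()) == NULL) {
            if (options & PMAP_EXPAND_OPTIONS_NOWAIT)
                return KERN_RESOURCE_SHORTAGE;
            VM_PAGE_WAIT();   /* block until pages are available, retry */
        }
        *out = m;
        return KERN_SUCCESS;
    }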
*/ -void +kern_return_t pmap_expand( pmap_t map, - vm_map_offset_t vaddr) + vm_map_offset_t vaddr, + unsigned int options) { pt_entry_t *pdp; register vm_page_t m; @@ -1406,15 +1558,19 @@ pmap_expand( while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) { - /* need room for another pde entry */ - pmap_expand_pdpt(map, vaddr); + kern_return_t pepkr = pmap_expand_pdpt(map, vaddr, options); + if (pepkr != KERN_SUCCESS) + return pepkr; } /* * Allocate a VM page for the pde entries. */ - while ((m = vm_page_grab()) == VM_PAGE_NULL) + while ((m = vm_page_grab()) == VM_PAGE_NULL) { + if (options & PMAP_EXPAND_OPTIONS_NOWAIT) + return KERN_RESOURCE_SHORTAGE; VM_PAGE_WAIT(); + } /* * put the page into the pmap's obj list so it @@ -1435,7 +1591,7 @@ pmap_expand( OSAddAtomic(1, &inuse_ptepages_count); OSAddAtomic64(1, &alloc_ptepages_count); - PMAP_ZINFO_PALLOC(PAGE_SIZE); + PMAP_ZINFO_PALLOC(map, PAGE_SIZE); /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */ vm_object_lock(map->pm_obj); @@ -1452,8 +1608,8 @@ pmap_expand( VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); - PMAP_ZINFO_PFREE(PAGE_SIZE); - return; + PMAP_ZINFO_PFREE(map, PAGE_SIZE); + return KERN_SUCCESS; } #if 0 /* DEBUG */ @@ -1476,7 +1632,7 @@ pmap_expand( PMAP_UNLOCK(map); - return; + return KERN_SUCCESS; } /* On K64 machines with more than 32GB of memory, pmap_steal_memory @@ -1658,7 +1814,7 @@ pmap_collect( VM_PAGE_FREE(m); OSAddAtomic(-1, &inuse_ptepages_count); - PMAP_ZINFO_PFREE(PAGE_SIZE); + PMAP_ZINFO_PFREE(p, PAGE_SIZE); } PMAP_LOCK(p); @@ -1710,7 +1866,6 @@ pmap_pageable( #endif /* lint */ } - void invalidate_icache(__unused vm_offset_t addr, __unused unsigned cnt, @@ -2015,6 +2170,10 @@ pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv) } } + if (__improbable((pmap == kernel_pmap) && (flush_self != TRUE))) { + panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map); + } + PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END, pmap, cpus_to_signal, startv, endv, 0); } @@ -2054,3 +2213,114 @@ pmap_update_interrupt(void) PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END, 0, 0, 0, 0, 0); } + +#include /* mach_vm_region_recurse() */ +/* Scan kernel pmap for W+X PTEs, scan kernel VM map for W+X map entries + * and identify ranges with mismatched VM permissions and PTE permissions + */ +kern_return_t +pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset_t ev) { + vm_offset_t cv = sv; + kern_return_t rv = KERN_SUCCESS; + uint64_t skip4 = 0, skip2 = 0; + + sv &= ~PAGE_MASK_64; + ev &= ~PAGE_MASK_64; + while (cv < ev) { + if (__improbable((cv > 0x00007FFFFFFFFFFFULL) && + (cv < 0xFFFF800000000000ULL))) { + cv = 0xFFFF800000000000ULL; + } + /* Potential inconsistencies from not holding pmap lock + * but harmless for the moment. 
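
Stripped of the page-table walking, the scan's core test is small: a mapping is flagged when it is present, writable, and executable (NX clear). Bit positions below follow the Intel SDM and match the INTEL_PTE_* usage in this function:

    #include <stdint.h>
    #include <stdbool.h>

    /* x86-64 PTE bits used by the scan. */
    #define PTE_VALID  (1ULL << 0)
    #define PTE_WRITE  (1ULL << 1)
    #define PTE_NX     (1ULL << 63)

    /* Core predicate of the W+X scan: a violation is a valid, writable
     * mapping without NX. */
    static bool pte_is_wx(uint64_t pte)
    {
        return (pte & PTE_VALID) && (pte & PTE_WRITE) && !(pte & PTE_NX);
    }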
+ */ + if (((cv & PML4MASK) == 0) && (pmap64_pml4(ipmap, cv) == 0)) { + if ((cv + NBPML4) > cv) + cv += NBPML4; + else + break; + skip4++; + continue; + } + if (((cv & PDMASK) == 0) && (pmap_pde(ipmap, cv) == 0)) { + if ((cv + NBPD) > cv) + cv += NBPD; + else + break; + skip2++; + continue; + } + + pt_entry_t *ptep = pmap_pte(ipmap, cv); + if (ptep && (*ptep & INTEL_PTE_VALID)) { + if (*ptep & INTEL_PTE_WRITE) { + if (!(*ptep & INTEL_PTE_NX)) { + kprintf("W+X PTE at 0x%lx, P4: 0x%llx, P3: 0x%llx, P2: 0x%llx, PT: 0x%llx, VP: %u\n", cv, *pmap64_pml4(ipmap, cv), *pmap64_pdpt(ipmap, cv), *pmap64_pde(ipmap, cv), *ptep, pmap_valid_page((ppnum_t)(i386_btop(pte_to_pa(*ptep))))); + rv = KERN_FAILURE; + } + } + } + cv += PAGE_SIZE; + } + kprintf("Completed pmap scan\n"); + cv = sv; + + struct vm_region_submap_info_64 vbr; + mach_msg_type_number_t vbrcount = 0; + mach_vm_size_t vmsize; + vm_prot_t prot; + uint32_t nesting_depth = 0; + kern_return_t kret; + + while (cv < ev) { + + for (;;) { + vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64; + if((kret = mach_vm_region_recurse(ivmmap, + (mach_vm_address_t *) &cv, &vmsize, &nesting_depth, + (vm_region_recurse_info_t)&vbr, + &vbrcount)) != KERN_SUCCESS) { + break; + } + + if(vbr.is_submap) { + nesting_depth++; + continue; + } else { + break; + } + } + + if(kret != KERN_SUCCESS) + break; + + prot = vbr.protection; + + if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) { + kprintf("W+X map entry at address 0x%lx\n", cv); + rv = KERN_FAILURE; + } + + if (prot) { + vm_offset_t pcv; + for (pcv = cv; pcv < cv + vmsize; pcv += PAGE_SIZE) { + pt_entry_t *ptep = pmap_pte(ipmap, pcv); + vm_prot_t tprot; + + if ((ptep == NULL) || !(*ptep & INTEL_PTE_VALID)) + continue; + tprot = VM_PROT_READ; + if (*ptep & INTEL_PTE_WRITE) + tprot |= VM_PROT_WRITE; + if ((*ptep & INTEL_PTE_NX) == 0) + tprot |= VM_PROT_EXECUTE; + if (tprot != prot) { + kprintf("PTE/map entry permissions mismatch at address 0x%lx, pte: 0x%llx, protection: 0x%x\n", pcv, *ptep, prot); + rv = KERN_FAILURE; + } + } + } + cv += vmsize; + } + return rv; +} diff --git a/osfmk/x86_64/start.s b/osfmk/x86_64/start.s index 8ca246de3..22045a2d5 100644 --- a/osfmk/x86_64/start.s +++ b/osfmk/x86_64/start.s @@ -57,7 +57,7 @@ */ #include -#include +#include #include #include @@ -73,9 +73,10 @@ /* * Interrupt and bootup stack for initial processor. + * Note: we switch to a dynamically allocated interrupt stack once VM is up. */ - /* in the __HIB section since the hibernate restore code uses this stack. */ +/* in the __HIB section since the hibernate restore code uses this stack. */ .section __HIB, __data .align 12 @@ -95,17 +96,6 @@ EXT(gIOHibernateRestoreStackEnd): .section __DATA, __data -/* - * Stack for last-gasp double-fault handler. - */ - .align 12 - .globl EXT(df_task_stack) -EXT(df_task_stack): - .space INTSTACK_SIZE - .globl EXT(df_task_stack_end) -EXT(df_task_stack_end): - - /* * Stack for machine-check handler. 
*/ @@ -116,6 +106,25 @@ EXT(mc_task_stack): .globl EXT(mc_task_stack_end) EXT(mc_task_stack_end): + /* Must not clobber EDI */ +#define SWITCH_TO_64BIT_MODE \ + movl $(CR4_PAE),%eax /* enable PAE */ ;\ + movl %eax,%cr4 ;\ + movl $MSR_IA32_EFER,%ecx ;\ + rdmsr ;\ + /* enable long mode, NX */ ;\ + orl $(MSR_IA32_EFER_LME | MSR_IA32_EFER_NXE),%eax ;\ + wrmsr ;\ + movl $EXT(BootPML4),%eax ;\ + movl %eax,%cr3 ;\ + movl %cr0,%eax ;\ + orl $(CR0_PG|CR0_WP),%eax /* enable paging */ ;\ + movl %eax,%cr0 ;\ + /* "The Aussie Maneuver" ("Myria" variant) */ ;\ + pushl $(0xcb<<24)|KERNEL64_CS /* reload CS with 0x08 */ ;\ + call .-1 ;\ + .code64 + /* * BSP CPU start here. * eax points to kernbootstruct @@ -123,51 +132,17 @@ EXT(mc_task_stack_end): * Environment: * protected mode, no paging, flat 32-bit address space. * (Code/data/stack segments have base == 0, limit == 4G) - */ - -#define SWITCH_TO_64BIT_MODE \ - movl $(CR4_PAE),%eax /* enable PAE */ ;\ - movl %eax,%cr4 ;\ - movl $MSR_IA32_EFER,%ecx ;\ - rdmsr ;\ - orl $MSR_IA32_EFER_LME,%eax /* enable long mode */ ;\ - wrmsr ;\ - movl $INITPT_SEG_BASE,%eax ;\ - movl %eax,%cr3 ;\ - movl %cr0,%eax ;\ - orl $(CR0_PG|CR0_WP),%eax /* enable paging */ ;\ - movl %eax,%cr0 ;\ - /* "The Aussie Maneuver" ("Myria" variant) */ ;\ - pushl $(0xcb<<24)|KERNEL64_CS /* reload CS with 0x08 */ ;\ - call .-1 ;\ - .code64 - -/* - * [ We used to have a reason for the following statement; ] - * [ but the issue has been fixed. The line is true ] - * [ nevertheless, therefore it should remain there. ] - * This proves that Little Endian is superior to Big Endian. */ +.code32 .text + .section __HIB, __text .align ALIGN .globl EXT(_start) - .globl EXT(_pstart) + .globl EXT(pstart) LEXT(_start) -LEXT(_pstart) - - .code32 +LEXT(pstart) -#if 0 - mov $0x3f8, %dx - mov $0x4D, %al; out %al, %dx - mov $0x49, %al; out %al, %dx - mov $0x53, %al; out %al, %dx - mov $0x54, %al; out %al, %dx - mov $0x0D, %al; out %al, %dx - mov $0x0A, %al; out %al, %dx -#endif - /* * Here we do the minimal setup to switch from 32 bit mode to 64 bit long mode. * @@ -177,8 +152,13 @@ LEXT(_pstart) * | | * | Kernel text/data | * | | - * ------------------------- Kernel start addr + * |-----------------------| Kernel text base addr - 2MB-aligned + * | padding | + * |-----------------------| + * | __HIB section | + * |-----------------------| Page-aligned * | | + * | padding | * | | * ------------------------- 0 * @@ -186,14 +166,31 @@ LEXT(_pstart) mov %eax, %edi /* save kernbootstruct */ /* Use low 32-bits of address as 32-bit stack */ - movl $EXT(low_eintstack), %esp + movl $EXT(low_eintstack), %esp + POSTCODE(PSTART_ENTRY) + /* * Set up segmentation */ movl $EXT(protected_mode_gdtr), %eax lgdtl (%eax) + /* + * Rebase Boot page tables to kernel base address. 
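
The adds that follow are the entire KASLR fix-up for the boot page tables: the Boot* entries in boot_pt.c were statically initialized with offsets relative to the table block itself (PAGE_SIZE, 2*PAGE_SIZE, ...), so adding BootPML4's runtime address turns them into real physical pointers. The same operation in C, as a sketch; note the assembly adjusts only the low 32 bits of each entry:

    #include <stdint.h>

    /* Sketch of the pstart rebase: slide the physical-address portion of
     * the boot page-table entries by the table block's actual address. */
    static void rebase_boot_pt(uint64_t *pml4, uint64_t *pdpt,
                               unsigned int kernel_pml4_index, uint64_t base)
    {
        pml4[0]                 += base;    /* identity (1:1) entry */
        pml4[kernel_pml4_index] += base;    /* kernel-space entry */
        for (int i = 0; i < 4; i++)         /* four PDPTEs -> BootPTD pages */
            pdpt[i] += base;
    }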
+ */ + movl $EXT(BootPML4), %eax // Level 4: + add %eax, 0*8+0(%eax) // - 1:1 + add %eax, KERNEL_PML4_INDEX*8+0(%eax) // - kernel space + + movl $EXT(BootPDPT), %edx // Level 3: + add %eax, 0*8+0(%edx) + add %eax, 1*8+0(%edx) + add %eax, 2*8+0(%edx) + add %eax, 3*8+0(%edx) + + POSTCODE(PSTART_REBASE) + /* the following code is shared by the master CPU and all slave CPUs */ L_pstart_common: /* @@ -209,16 +206,56 @@ L_pstart_common: mov %ax, %fs mov %ax, %gs + test %edi, %edi /* Populate stack canary on BSP */ + jz Lvstartshim + + mov $1, %eax + cpuid + test $(1 << 30), %ecx + jz Lnon_rdrand + RDRAND_RAX /* RAX := 64 bits of DRBG entropy */ + jnc Lnon_rdrand /* TODO: complain if DRBG fails at this stage */ + +Lstore_random_guard: + xor %ah, %ah /* Security: zero second byte of stack canary */ + movq %rax, ___stack_chk_guard(%rip) + /* %edi = boot_args_start if BSP */ +Lvstartshim: + + POSTCODE(PSTART_VSTART) + /* %edi = boot_args_start */ - leaq _vstart(%rip), %rcx - movq $0xffffff8000000000, %rax /* adjust the pointer to be up high */ - or %rax, %rsp /* and stack pointer up there too */ - or %rcx, %rax - andq $0xfffffffffffffff0, %rsp /* align stack */ - xorq %rbp, %rbp /* zero frame pointer */ - callq *%rax - + leaq _vstart(%rip), %rcx + movq $0xffffff8000000000, %rax /* adjust pointer up high */ + or %rax, %rsp /* and stack pointer up there */ + or %rcx, %rax + andq $0xfffffffffffffff0, %rsp /* align stack */ + xorq %rbp, %rbp /* zero frame pointer */ + callq *%rax + +Lnon_rdrand: + rdtsc /* EDX:EAX := TSC */ + /* Distribute low order bits */ + mov %eax, %ecx + xor %al, %ah + shl $16, %rcx + xor %rcx, %rax + xor %eax, %edx + + /* Incorporate ASLR entropy, if any */ + lea (%rip), %rcx + shr $21, %rcx + movzbl %cl, %ecx + shl $16, %ecx + xor %ecx, %edx + + mov %ah, %cl + ror %cl, %edx /* Right rotate EDX (TSC&0xFF ^ (TSC>>8 & 0xFF))&1F */ + shl $32, %rdx + xor %rdx, %rax + mov %cl, %al + jmp Lstore_random_guard /* * AP (slave) CPUs enter here. * @@ -232,18 +269,11 @@ LEXT(slave_pstart) .code32 cli /* disable interrupts, so we don`t */ /* need IDT for a while */ - POSTCODE(SLAVE_PSTART_ENTRY) + POSTCODE(SLAVE_PSTART) movl $EXT(mp_slave_stack) + PAGE_SIZE, %esp - /* set up identity mapping of page tables */ - movl $INITPT_SEG_BASE,%eax - movl (KERNEL_PML4_INDEX*8)(%eax), %esi - movl %esi, (0)(%eax) - movl (KERNEL_PML4_INDEX*8+4)(%eax), %esi - movl %esi, (0+4)(%eax) - - movl $0, %edi /* "no kernbootstruct" */ + xor %edi, %edi /* AP, no "kernbootstruct" */ jmp L_pstart_common /* hop a ride to vstart() */ @@ -252,13 +282,13 @@ LEXT(slave_pstart) .section __HIB, __text /* -This code is linked into the kernel but part of the "__HIB" section, which means -its used by code running in the special context of restoring the kernel text and data -from the hibernation image read by the booter. hibernate_kernel_entrypoint() and everything -it calls or references (ie. hibernate_restore_phys_page()) -needs to be careful to only touch memory also in the "__HIB" section. -*/ - + * This code is linked into the kernel but part of the "__HIB" section, + * which means it's used by code running in the special context of restoring + * the kernel text and data from the hibernation image read by the booter. + * hibernate_kernel_entrypoint() and everything it calls or references + * (ie. hibernate_restore_phys_page()) needs to be careful to only touch + * memory also in the "__HIB" section. 
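
Everything touched during hibernation restore must live in __HIB, which is why lowGlo moved from lowmem_vectors.s into a C structure and why boot_pt.c pins its tables with __attribute__((section("__HIB, __bootPT"))). A sketch of that section-placement idiom, for Mach-O targets only; the struct is truncated, and the __HIB,__data segment/section pair here is illustrative (the lowGlo definition above specifies only page alignment):

    #include <stdint.h>

    #define PAGE_SIZE_4K 4096   /* stand-in for the kernel's PAGE_SIZE */

    /* Truncated version of the lowglo layout, pinned to a chosen Mach-O
     * segment,section and page-aligned. */
    typedef struct {
        unsigned char lgVerCode[8];
        uint64_t      lgZero;
        /* ... remaining fields as in lowglobals.h ... */
    } lowglo_sketch;

    lowglo_sketch lowGlo_sketch
        __attribute__((section("__HIB,__data"), aligned(PAGE_SIZE_4K))) = {
        .lgVerCode = { 'C','a','t','f','i','s','h',' ' },
    };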
+ */ .align ALIGN .globl EXT(hibernate_machine_entrypoint) @@ -266,54 +296,35 @@ needs to be careful to only touch memory also in the "__HIB" section. LEXT(hibernate_machine_entrypoint) movl %eax, %edi /* regparm(1) calling convention */ - /* restore gdt */ - mov $(SLEEP_SEG_BASE)+20, %eax // load saved_gdt, this may break + /* Use low 32-bits of address as 32-bit stack */ + movl $EXT(low_eintstack), %esp + + /* + * Set up GDT + */ + movl $EXT(master_gdtr), %eax lgdtl (%eax) - /* setup the protected mode segment registers */ - mov $KERNEL_DS, %eax - movw %ax, %ds - movw %ax, %es - movw %ax, %ss - xor %eax,%eax - movw %ax, %fs - movw %ax, %gs + /* Switch to 64-bit on the Boot PTs */ + SWITCH_TO_64BIT_MODE - /* set up the page tables to use BootstrapPTD - * as done in idle_pt.c, but this must be done programatically */ - mov $(INITPT_SEG_BASE + PAGE_SIZE), %eax - mov $(INITPT_SEG_BASE + 2*PAGE_SIZE | INTEL_PTE_WRITE | INTEL_PTE_VALID), %ecx - mov $0x0, %edx - mov %ecx, (0*8+0)(%eax) - mov %edx, (0*8+4)(%eax) - add $(PAGE_SIZE), %ecx - mov %ecx, (1*8+0)(%eax) - mov %edx, (1*8+4)(%eax) - add $(PAGE_SIZE), %ecx - mov %ecx, (2*8+0)(%eax) - mov %edx, (2*8+4)(%eax) - add $(PAGE_SIZE), %ecx - mov %ecx, (3*8+0)(%eax) - mov %edx, (3*8+4)(%eax) - - /* Temporary stack */ - mov $(REAL_MODE_BOOTSTRAP_OFFSET + PROT_MODE_START), %esp + leaq EXT(hibernate_kernel_entrypoint)(%rip),%rcx - SWITCH_TO_64BIT_MODE + /* adjust the pointers to be up high */ + movq $0xffffff8000000000, %rax + orq %rax, %rsp + orq %rcx, %rax - leaq EXT(hibernate_kernel_entrypoint)(%rip),%rcx - leaq EXT(gIOHibernateRestoreStackEnd)(%rip),%rsp /* switch to the bootup stack */ - movq $0xffffff8000000000, %rax /* adjust the pointer to be up high */ - orq %rax, %rsp /* and stack pointer up there too :D */ - orq %rcx, %rax /* put entrypoint in %rax */ /* %edi is already filled with header pointer */ - xorl %esi, %esi /* zero 2nd arg */ - xorl %edx, %edx /* zero 3rd arg */ - xorl %ecx, %ecx /* zero 4th arg */ - andq $0xfffffffffffffff0, %rsp /* align stack */ - /* (future-proofing, stack should already be aligned) */ - xorq %rbp, %rbp /* zero frame pointer */ - call *%rax /* call instead of jmp to keep the required stack alignment */ + xorl %esi, %esi /* zero 2nd arg */ + xorl %edx, %edx /* zero 3rd arg */ + xorl %ecx, %ecx /* zero 4th arg */ + andq $0xfffffffffffffff0, %rsp /* align stack */ + + /* call instead of jmp to keep the required stack alignment */ + xorq %rbp, %rbp /* zero frame pointer */ + call *%rax + /* NOTREACHED */ hlt @@ -325,41 +336,11 @@ LEXT(hibernate_machine_entrypoint) #include - - -#define PA(addr) (addr) - /* * acpi_wake_start - * - * The code from acpi_wake_start to acpi_wake_end is copied to - * memory below 1MB. The firmware waking vector is updated to - * point at acpi_wake_start in low memory before sleeping. 
*/ .section __TEXT,__text -.text -.align 12 /* Page align for single bcopy_phys() */ -.code32 -.globl EXT(acpi_wake_prot) -EXT(acpi_wake_prot): - /* protected mode, paging disabled */ - - /* jump to acpi_temp_alloc (stored in saved_tmp) */ - mov $(SLEEP_SEG_BASE)+16, %eax - mov (%eax), %ecx // Load acpi_temp_reloc from saved_eip - jmp *%ecx -acpi_temp_reloc: - mov $(SLEEP_SEG_BASE)+16, %esp /* setup stack for 64bit */ - - SWITCH_TO_64BIT_MODE - - lea Lwake_64(%rip), %rax - movq $0xffffff8000000000, %rdx - orq %rdx, %rax - jmp *%rax -.code32 - .code64 /* @@ -404,6 +385,8 @@ ENTRY(acpi_sleep_cpu) mov %rax, saved_cr0(%rip) mov %cr2, %rax mov %rax, saved_cr2(%rip) + mov %cr3, %rax + mov %rax, saved_cr3(%rip) mov %cr4, %rax mov %rax, saved_cr4(%rip) @@ -431,13 +414,6 @@ ENTRY(acpi_sleep_cpu) sidt saved_idt(%rip) str saved_tr(%rip) - /* - * When system wakes up, the real mode wake handler will revert to - * protected mode, then jump to the address stored at saved_eip. - */ - leaq acpi_temp_reloc(%rip), %rax - mov %eax, saved_eip(%rip) - /* * Call ACPI function provided by the caller to sleep the platform. * This call will not return on success. @@ -449,48 +425,47 @@ ENTRY(acpi_sleep_cpu) /* sleep failed, no cpu context lost */ jmp wake_restore +.section __HIB, __text +.code32 +.globl EXT(acpi_wake_prot) +EXT(acpi_wake_prot): + /* protected mode, paging disabled */ + movl $EXT(low_eintstack), %esp + + SWITCH_TO_64BIT_MODE + + jmp Lwake_64 + +.section __TEXT,__text +.code64 + .globl EXT(acpi_wake_prot_entry) EXT(acpi_wake_prot_entry): POSTCODE(ACPI_WAKE_PROT_ENTRY) - /* Entry from the hibernate code in iokit/Kernel/IOHibernateRestoreKernel.c - * - * Reset the first 4 PDE's to point to entries in IdlePTD, as done in - * Idle_PTs_init() during startup */ - leaq _IdlePDPT(%rip), %rax - movq _IdlePTD(%rip), %rcx - mov %ecx, %ecx /* zero top 32bits of %rcx */ - orq $(INTEL_PTE_WRITE|INTEL_PTE_VALID), %rcx - movq %rcx, 0x0(%rax) - add $0x1000, %rcx - movq %rcx, 0x8(%rax) - add $0x1000, %rcx - movq %rcx, 0x10(%rax) - add $0x1000, %rcx - movq %rcx, 0x18(%rax) - mov %cr3, %rax - mov %rax, %cr3 - + /* Return from hibernate code in iokit/Kernel/IOHibernateRestoreKernel.c + */ Lwake_64: /* * restore cr4, PAE and NXE states in an orderly fashion */ - mov saved_cr4(%rip), %rcx - mov %rcx, %cr4 - - mov $(MSR_IA32_EFER), %ecx /* MSR number in ecx */ - rdmsr /* MSR value return in edx: eax */ - or $(MSR_IA32_EFER_NXE), %eax /* Set NXE bit in low 32-bits */ - wrmsr /* Update Extended Feature Enable reg */ + mov saved_cr4(%rip), %rcx + mov %rcx, %cr4 - /* restore kernel GDT */ - lgdt EXT(protected_mode_gdtr)(%rip) + mov $(MSR_IA32_EFER), %ecx /* MSR number in ecx */ + rdmsr /* MSR value in edx:eax */ + or $(MSR_IA32_EFER_NXE), %eax /* Set NXE bit in low 32-bits */ + wrmsr /* Update */ movq saved_cr2(%rip), %rax - mov %rax, %cr2 + mov %rax, %cr2 /* restore CR0, paging enabled */ - mov saved_cr0(%rip), %rax - mov %rax, %cr0 + mov saved_cr0(%rip), %rax + mov %rax, %cr0 + + /* restore the page tables */ + mov saved_cr3(%rip), %rax + mov %rax, %cr3 /* protected mode, paging enabled */ POSTCODE(ACPI_WAKE_PAGED_ENTRY) @@ -500,7 +475,8 @@ Lwake_64: movw %ax, %ss movw %ax, %ds - /* restore local and interrupt descriptor tables */ + /* restore descriptor tables */ + lgdt saved_gdt(%rip) lldt saved_ldt(%rip) lidt saved_idt(%rip) @@ -580,7 +556,7 @@ wake_restore: .byte 0x15 ;\ .long address-EXT(real_mode_bootstrap_base) -.section __TEXT,__text +.section __HIB, __text .align 12 /* Page align for single bcopy_phys() */ .code32 
Entry(real_mode_bootstrap_base) @@ -603,7 +579,7 @@ Entry(real_mode_bootstrap_base) movw %ax, %ds movw %ax, %es movw %ax, %ss - xor %eax,%eax + xor %eax,%eax movw %ax, %fs movw %ax, %gs @@ -613,20 +589,22 @@ Entry(real_mode_bootstrap_base) jmp *%ecx Entry(protected_mode_gdtr) - .short 160 /* limit (8*6 segs) */ + .short 160 /* limit (8*20 segs) */ .quad EXT(master_gdt) Entry(real_mode_bootstrap_end) /* Save area used across sleep/wake */ -.section __SLEEP, __data +.section __HIB, __data .align 2 -temp_stack: .quad 0 - .quad 0 -saved_eip: .long 0 +/* gdtr for real address of master_gdt in HIB (not the aliased address) */ +Entry(master_gdtr) + .word 160 /* limit (8*20 segs) */ + .quad EXT(master_gdt) + saved_gdt: .word 0 - .quad 0 + .quad 0 saved_rsp: .quad 0 saved_es: .word 0 saved_fs: .word 0 @@ -634,6 +612,7 @@ saved_gs: .word 0 saved_ss: .word 0 saved_cr0: .quad 0 saved_cr2: .quad 0 +saved_cr3: .quad 0 saved_cr4: .quad 0 saved_idt: .word 0 .quad 0 diff --git a/pexpert/conf/MASTER b/pexpert/conf/MASTER index 7731f8388..833878544 100644 --- a/pexpert/conf/MASTER +++ b/pexpert/conf/MASTER @@ -85,6 +85,7 @@ ident PEXPERT options MACH_PE # Objective-C support # options MACH_KERNEL options DEBUG # general debugging code # +options MACH_ASSERT # # options CONFIG_DTRACE # dtrace support # options PANIC_INFO # want kernel panic info # diff --git a/pexpert/conf/MASTER.x86_64 b/pexpert/conf/MASTER.x86_64 index 9283af226..9f0004250 100644 --- a/pexpert/conf/MASTER.x86_64 +++ b/pexpert/conf/MASTER.x86_64 @@ -5,7 +5,7 @@ # # RELEASE = [ intel mach mach_pe panic_info config_dtrace ] # PROFILE = [ RELEASE profile ] -# DEBUG = [ RELEASE debug ] +# DEBUG = [ RELEASE debug mach_assert ] # # EMBEDDED = [ intel mach mach_pe panic_info ] # DEVELOPMENT = [ EMBEDDED ] diff --git a/pexpert/conf/Makefile b/pexpert/conf/Makefile index 06a9defdf..482f105be 100644 --- a/pexpert/conf/Makefile +++ b/pexpert/conf/Makefile @@ -42,9 +42,11 @@ $(COMPOBJROOT)/$(PEXPERT_KERNEL_CONFIG)/Makefile : $(SOURCE)/MASTER \ do_all: $(COMPOBJROOT)/$(PEXPERT_KERNEL_CONFIG)/Makefile $(_v)next_source=$(subst conf/,,$(SOURCE)); \ + next_relsource=$(subst conf/,,$(RELATIVE_SOURCE_PATH)); \ ${MAKE} -C $(COMPOBJROOT)/$(PEXPERT_KERNEL_CONFIG) \ MAKEFILES=$(TARGET)/$(PEXPERT_KERNEL_CONFIG)/Makefile \ SOURCE=$${next_source} \ + RELATIVE_SOURCE_PATH=$${next_relsource} \ TARGET=$(TARGET) \ INCL_MAKEDEP=FALSE \ KERNEL_CONFIG=$(PEXPERT_KERNEL_CONFIG) \ diff --git a/pexpert/conf/files b/pexpert/conf/files index 3ad1855bc..8c11d3933 100644 --- a/pexpert/conf/files +++ b/pexpert/conf/files @@ -1,5 +1,4 @@ # -OPTIONS/mach_kdb optional mach_kdb OPTIONS/panic_info optional panic_info OPTIONS/config_dtrace optional config_dtrace diff --git a/pexpert/gen/bootargs.c b/pexpert/gen/bootargs.c index 6ca4fa102..6bc636010 100644 --- a/pexpert/gen/bootargs.c +++ b/pexpert/gen/bootargs.c @@ -31,13 +31,14 @@ static boolean_t isargsep( char c); #if !CONFIG_EMBEDDED static int argstrcpy(char *from, char *to); -#endif +#endif static int argstrcpy2(char *from,char *to, unsigned maxlen); static int argnumcpy(int val, void *to, unsigned maxlen); static int getval(char *s, int *val); extern int IODTGetDefault(const char *key, void *infoAddr, unsigned int infoSize); + struct i24 { int32_t i24 : 24; int32_t _pad : 8; @@ -71,7 +72,7 @@ PE_parse_boot_argn( { char *args; char *cp, c; - unsigned int i; + uintptr_t i; int val; boolean_t arg_boolean; boolean_t arg_found; diff --git a/pexpert/gen/device_tree.c b/pexpert/gen/device_tree.c index dc3ea9ddc..d78bed0bf 100644 --- 
a/pexpert/gen/device_tree.c +++ b/pexpert/gen/device_tree.c @@ -127,11 +127,18 @@ GetNextChild(RealDTEntry sibling) static const char * GetNextComponent(const char *cp, char *bp) { + size_t length = 0; + char *origbp = bp; + while (*cp != 0) { if (*cp == kDTPathNameSeparator) { cp++; break; } + if (++length > kDTMaxEntryNameLength) { + *origbp = '\0'; + return cp; + } *bp++ = *cp++; } *bp = 0; diff --git a/pexpert/i386/pe_init.c b/pexpert/i386/pe_init.c index 21978a2a7..1ef6c7c7b 100644 --- a/pexpert/i386/pe_init.c +++ b/pexpert/i386/pe_init.c @@ -201,6 +201,15 @@ void PE_init_platform(boolean_t vm_initialized, void * _args) PE_state.video.v_scale = (kBootArgsFlagHiDPI & args->flags) ? 2 : 1; strlcpy(PE_state.video.v_pixelFormat, "PPPPPPPP", sizeof(PE_state.video.v_pixelFormat)); + +#ifdef kBootArgsFlagHiDPI + if (args->flags & kBootArgsFlagHiDPI) + PE_state.video.v_scale = kPEScaleFactor2x; + else + PE_state.video.v_scale = kPEScaleFactor1x; +#else + PE_state.video.v_scale = kPEScaleFactor1x; +#endif } if (!vm_initialized) { diff --git a/pexpert/pexpert/device_tree.h b/pexpert/pexpert/device_tree.h index e68a85588..d6fd3d9f8 100644 --- a/pexpert/pexpert/device_tree.h +++ b/pexpert/pexpert/device_tree.h @@ -57,11 +57,11 @@ typedef char DTPropertyNameBuf[32]; /* Entry Name Definitions (Entry Names are C-Strings)*/ enum { - kDTMaxEntryNameLength = 31 /* Max length of a C-String Entry Name (terminator not included) */ + kDTMaxEntryNameLength = 63 /* Max length of a C-String Entry Name (terminator not included) */ }; /* length of DTEntryNameBuf = kDTMaxEntryNameLength +1*/ -typedef char DTEntryNameBuf[32]; +typedef char DTEntryNameBuf[kDTMaxEntryNameLength+1]; /* Entry*/ diff --git a/pexpert/pexpert/i386/boot.h b/pexpert/pexpert/i386/boot.h index b347de0e3..369c2c477 100644 --- a/pexpert/pexpert/i386/boot.h +++ b/pexpert/pexpert/i386/boot.h @@ -121,8 +121,9 @@ typedef struct boot_icon_element boot_icon_element; #define kBootArgsEfiMode32 32 #define kBootArgsEfiMode64 64 -#define kBootArgsFlagRebootOnPanic 1 -#define kBootArgsFlagHiDPI 2 +/* Bitfields for boot_args->flags */ +#define kBootArgsFlagRebootOnPanic (1 << 0) +#define kBootArgsFlagHiDPI (1 << 1) typedef struct boot_args { uint16_t Revision; /* Revision of boot_args structure */ @@ -152,7 +153,7 @@ typedef struct boot_args { uint64_t efiRuntimeServicesVirtualPageStart; /* virtual address of defragmented runtime pages */ uint32_t efiSystemTable; /* physical address of system table in runtime area */ - uint32_t __reserved2; + uint32_t kslide; uint32_t performanceDataStart; /* physical address of log */ uint32_t performanceDataSize; diff --git a/pexpert/pexpert/pexpert.h b/pexpert/pexpert/pexpert.h index b57d59edb..31209ff78 100644 --- a/pexpert/pexpert/pexpert.h +++ b/pexpert/pexpert/pexpert.h @@ -58,8 +58,6 @@ void PE_init_platform( void *args); - - void PE_init_kprintf( boolean_t vm_initialized); @@ -172,6 +170,13 @@ enum { kDebugTypeSerial = 2 }; +/* Scale factor values for PE_Video.v_scale */ +enum { + kPEScaleFactorUnknown = 0, + kPEScaleFactor1x = 1, + kPEScaleFactor2x = 2 +}; + struct PE_Video { unsigned long v_baseAddr; /* Base address of video memory */ unsigned long v_rowBytes; /* Number of bytes per pixel row */ diff --git a/security/conf/MASTER b/security/conf/MASTER index 8d1598990..330092535 100644 --- a/security/conf/MASTER +++ b/security/conf/MASTER @@ -60,8 +60,15 @@ options CONFIG_LCTX # Login Context options CONFIG_DTRACE # dtrace support # +options VM_PRESSURE_EVENTS # + options CONFIG_NO_PANIC_STRINGS # options 
CONFIG_NO_PRINTF_STRINGS # options CONFIG_NO_KPRINTF_STRINGS # options CONFIG_FSE # file system events # options CONFIG_TRIGGERS # trigger vnodes # +options CONFIG_EXT_RESOLVER # e.g. memberd # + +options SECURE_KERNEL # +options DEBUG # # +options MACH_ASSERT # # diff --git a/security/conf/MASTER.i386 b/security/conf/MASTER.i386 index dd4fb5f69..60bcfbe5e 100644 --- a/security/conf/MASTER.i386 +++ b/security/conf/MASTER.i386 @@ -1,6 +1,6 @@ ###################################################################### # -# RELEASE = [ intel mach libkerncpp config_dtrace audit ] +# RELEASE = [ intel mach libkerncpp config_dtrace audit vm_pressure_events ] # PROFILE = [ RELEASE profile ] # DEBUG = [ RELEASE debug ] # @@ -20,6 +20,8 @@ options CONFIG_MACF # Mandatory Access Control Framework options CONFIG_MACF_SOCKET_SUBSET # MACF subset of socket support options CONFIG_FSE options CONFIG_TRIGGERS +options CONFIG_VFS_FUNNEL +options CONFIG_EXT_RESOLVER #options CONFIG_MACF_SOCKET #options CONFIG_MACF_NET #options CONFIG_MACF_ALWAYS_LABEL_MBUF diff --git a/security/conf/MASTER.x86_64 b/security/conf/MASTER.x86_64 index d362cf049..4483af782 100644 --- a/security/conf/MASTER.x86_64 +++ b/security/conf/MASTER.x86_64 @@ -1,8 +1,8 @@ ###################################################################### # -# RELEASE = [ intel mach libkerncpp config_dtrace audit ] +# RELEASE = [ intel mach libkerncpp config_dtrace audit vm_pressure_events ] # PROFILE = [ RELEASE profile ] -# DEBUG = [ RELEASE debug ] +# DEBUG = [ RELEASE debug mach_assert ] # # EMBEDDED = [ intel mach libkerncpp audit ] # DEVELOPMENT = [ EMBEDDED ] @@ -17,6 +17,7 @@ options CONFIG_MACF # Mandatory Access Control Framework options CONFIG_MACF_SOCKET_SUBSET # MACF subset of socket support options CONFIG_FSE options CONFIG_TRIGGERS +options CONFIG_EXT_RESOLVER #options CONFIG_MACF_SOCKET #options CONFIG_MACF_NET #options CONFIG_MACF_ALWAYS_LABEL_MBUF diff --git a/security/conf/Makefile b/security/conf/Makefile index bdb8f33f8..3bab0d1ce 100644 --- a/security/conf/Makefile +++ b/security/conf/Makefile @@ -42,9 +42,11 @@ $(COMPOBJROOT)/$(SECURITY_KERNEL_CONFIG)/Makefile : $(SOURCE)/MASTER \ do_all: $(COMPOBJROOT)/$(SECURITY_KERNEL_CONFIG)/Makefile $(_v)next_source=$(subst conf/,,$(SOURCE)); \ + next_relsource=$(subst conf/,,$(RELATIVE_SOURCE_PATH)); \ ${MAKE} -C $(COMPOBJROOT)/$(SECURITY_KERNEL_CONFIG) \ MAKEFILES=$(TARGET)/$(SECURITY_KERNEL_CONFIG)/Makefile \ SOURCE=$${next_source} \ + RELATIVE_SOURCE_PATH=$${next_relsource} \ TARGET=$(TARGET) \ INCL_MAKEDEP=FALSE \ KERNEL_CONFIG=$(SECURITY_KERNEL_CONFIG) \ diff --git a/security/mac.h b/security/mac.h index 3e0cf7a89..480d1a30b 100644 --- a/security/mac.h +++ b/security/mac.h @@ -147,7 +147,8 @@ struct user64_mac { #define MAC_PROC_CHECK_SUSPEND 0 #define MAC_PROC_CHECK_RESUME 1 #define MAC_PROC_CHECK_HIBERNATE 2 -#define MAC_PROC_CHECK_SHUTDOWN_SOCKETS 3 +#define MAC_PROC_CHECK_SHUTDOWN_SOCKETS 3 +#define MAC_PROC_CHECK_PIDBIND 4 #ifndef KERNEL /* diff --git a/security/mac_base.c b/security/mac_base.c index 33dd04457..ae808e277 100644 --- a/security/mac_base.c +++ b/security/mac_base.c @@ -88,6 +88,7 @@ #include #include +#include #include #include @@ -123,7 +124,11 @@ SYSCTL_NODE(, OID_AUTO, security, CTLFLAG_RW|CTLFLAG_LOCKED, 0, SYSCTL_NODE(_security, OID_AUTO, mac, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "TrustedBSD MAC policy controls"); - +#if DEBUG +#define SECURITY_MAC_CTLFLAGS CTLFLAG_RW | CTLFLAG_LOCKED +#else +#define SECURITY_MAC_CTLFLAGS CTLFLAG_RD | CTLFLAG_LOCKED +#endif /* * Declare 
that the kernel provides MAC support, version 1. This permits @@ -163,7 +168,7 @@ int mac_late = 0; */ #if CONFIG_MACF_NET unsigned int mac_label_mbufs = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, label_mbufs, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_UINT(_security_mac, OID_AUTO, label_mbufs, SECURITY_MAC_CTLFLAGS, &mac_label_mbufs, 0, "Label all MBUFs"); #endif @@ -180,87 +185,68 @@ SYSCTL_UINT(_security_mac, OID_AUTO, label_mbufs, CTLFLAG_RW | CTLFLAG_LOCKED, * be a problem. */ unsigned int mac_label_vnodes = 0; -SYSCTL_UINT(_security_mac, OID_AUTO, labelvnodes, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_UINT(_security_mac, OID_AUTO, labelvnodes, SECURITY_MAC_CTLFLAGS, &mac_label_vnodes, 0, "Label all vnodes"); unsigned int mac_mmap_revocation = 0; -SYSCTL_UINT(_security_mac, OID_AUTO, mmap_revocation, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_UINT(_security_mac, OID_AUTO, mmap_revocation, SECURITY_MAC_CTLFLAGS, &mac_mmap_revocation, 0, "Revoke mmap access to files on subject " "relabel"); unsigned int mac_mmap_revocation_via_cow = 0; -SYSCTL_UINT(_security_mac, OID_AUTO, mmap_revocation_via_cow, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_UINT(_security_mac, OID_AUTO, mmap_revocation_via_cow, SECURITY_MAC_CTLFLAGS, &mac_mmap_revocation_via_cow, 0, "Revoke mmap access to files via " "copy-on-write semantics, or by removing all write access"); unsigned int mac_device_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, device_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_UINT(_security_mac, OID_AUTO, device_enforce, SECURITY_MAC_CTLFLAGS, &mac_device_enforce, 0, "Enforce MAC policy on device operations"); -unsigned int mac_file_enforce = 0; -SYSCTL_UINT(_security_mac, OID_AUTO, file_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, - &mac_file_enforce, 0, "Enforce MAC policy on file operations"); - -unsigned int mac_iokit_enforce = 0; -SYSCTL_UINT(_security_mac, OID_AUTO, iokit_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, - &mac_file_enforce, 0, "Enforce MAC policy on IOKit operations"); - unsigned int mac_pipe_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, pipe_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_UINT(_security_mac, OID_AUTO, pipe_enforce, SECURITY_MAC_CTLFLAGS, &mac_pipe_enforce, 0, "Enforce MAC policy on pipe operations"); unsigned int mac_posixsem_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, posixsem_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_UINT(_security_mac, OID_AUTO, posixsem_enforce, SECURITY_MAC_CTLFLAGS, &mac_posixsem_enforce, 0, "Enforce MAC policy on POSIX semaphores"); unsigned int mac_posixshm_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, posixshm_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_UINT(_security_mac, OID_AUTO, posixshm_enforce, SECURITY_MAC_CTLFLAGS, &mac_posixshm_enforce, 0, "Enforce MAC policy on Posix Shared Memory"); unsigned int mac_proc_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, proc_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_UINT(_security_mac, OID_AUTO, proc_enforce, SECURITY_MAC_CTLFLAGS, &mac_proc_enforce, 0, "Enforce MAC policy on process operations"); unsigned int mac_socket_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, socket_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_UINT(_security_mac, OID_AUTO, socket_enforce, SECURITY_MAC_CTLFLAGS, &mac_socket_enforce, 0, "Enforce MAC policy on socket operations"); unsigned int mac_system_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, system_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_UINT(_security_mac, OID_AUTO, system_enforce, SECURITY_MAC_CTLFLAGS, &mac_system_enforce, 0, "Enforce MAC policy on 
system-wide interfaces"); unsigned int mac_sysvmsg_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, sysvmsg_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_UINT(_security_mac, OID_AUTO, sysvmsg_enforce, SECURITY_MAC_CTLFLAGS, &mac_sysvmsg_enforce, 0, "Enforce MAC policy on System V IPC message queues"); unsigned int mac_sysvsem_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, sysvsem_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_UINT(_security_mac, OID_AUTO, sysvsem_enforce, SECURITY_MAC_CTLFLAGS, &mac_sysvsem_enforce, 0, "Enforce MAC policy on System V IPC semaphores"); unsigned int mac_sysvshm_enforce = 1; -SYSCTL_INT(_security_mac, OID_AUTO, sysvshm_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_INT(_security_mac, OID_AUTO, sysvshm_enforce, SECURITY_MAC_CTLFLAGS, &mac_sysvshm_enforce, 0, "Enforce MAC policy on System V Shared Memory"); unsigned int mac_vm_enforce = 1; -SYSCTL_INT(_security_mac, OID_AUTO, vm_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_INT(_security_mac, OID_AUTO, vm_enforce, SECURITY_MAC_CTLFLAGS, &mac_vm_enforce, 0, "Enforce MAC policy on VM operations"); unsigned int mac_vnode_enforce = 1; -SYSCTL_UINT(_security_mac, OID_AUTO, vnode_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_UINT(_security_mac, OID_AUTO, vnode_enforce, SECURITY_MAC_CTLFLAGS, &mac_vnode_enforce, 0, "Enforce MAC policy on vnode operations"); - -#if CONFIG_MACF_MACH -unsigned int mac_port_enforce = 0; -SYSCTL_UINT(_security_mac, OID_AUTO, port_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, - &mac_port_enforce, 0, "Enforce MAC policy on Mach port operations"); - -unsigned int mac_task_enforce = 0; -SYSCTL_UINT(_security_mac, OID_AUTO, task_enforce, CTLFLAG_RW | CTLFLAG_LOCKED, - &mac_task_enforce, 0, "Enforce MAC policy on Mach task operations"); -#endif - #if CONFIG_AUDIT /* * mac_audit_data_zone is the zone used for data pushed into the audit @@ -2254,6 +2240,61 @@ __mac_get_mount(proc_t p __unused, struct __mac_get_mount_args *uap, return mac_mount_label_get(mp, uap->mac_p); } +/* + * mac_schedule_userret() + * + * Schedule a callback to the mpo_thread_userret hook. The mpo_thread_userret + * hook is called just before the thread exits from the kernel in ast_taken(). + * + * Returns: 0 Success + * !0 Not successful + */ +int +mac_schedule_userret(void) +{ + + act_set_astmacf(current_thread()); + return (0); +} + +/* + * mac_do_machexc() + * + * Do a Mach exception. This should only be done in the mpo_thread_userret + * callback. + * + * params: code exception code + * subcode exception subcode + * flags flags: + * MAC_DOEXCF_TRACED Only do exception if being + * ptrace()'ed. + * + * + * Returns: 0 Success + * !0 Not successful + */ +int +mac_do_machexc(int64_t code, int64_t subcode, uint32_t flags) +{ + mach_exception_data_type_t codes[EXCEPTION_CODE_MAX]; + proc_t p = current_proc(); + + /* Only allow exception codes in MACF's reserved range. 
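(A hedged illustration of the intended calling pattern follows.) */

/*
 * Editor's sketch, not part of the patch: how a policy module might pair
 * mac_schedule_userret() with its mpo_thread_userret hook to deliver a
 * deferred Mach exception. The *_example names are hypothetical;
 * EXC_MACF_MIN and MAC_DOEXCF_TRACED are the constants this patch adds.
 */
static void
thread_userret_example(struct thread *thread __unused)
{
	/* Runs from ast_taken() just before the thread returns to user space. */
	(void) mac_do_machexc(EXC_MACF_MIN, 0, MAC_DOEXCF_TRACED);
}

static void
defer_to_userret_example(void)
{
	/* From any other hook: request the callback on the next userret;
	 * this just sets an AST via act_set_astmacf(current_thread()). */
	(void) mac_schedule_userret();
}

/* The original range check resumes here: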
*/ + if ((code < EXC_MACF_MIN) || (code > EXC_MACF_MAX)) + return (1); + + if (flags & MAC_DOEXCF_TRACED && + !(p->p_lflag & P_LTRACED && (p->p_lflag & P_LPPWAIT) == 0)) + return (0); + + + /* Send the Mach exception */ + codes[0] = (mach_exception_data_type_t)code; + codes[1] = (mach_exception_data_type_t)subcode; + + return (bsd_exception(EXC_SOFTWARE, codes, 2) != KERN_SUCCESS); +} + #else /* MAC */ int @@ -2404,4 +2445,18 @@ __mac_get_mount(proc_t p __unused, return (ENOSYS); } + +int +mac_schedule_userret(void) +{ + + return (1); +} + +int +mac_do_machexc(int64_t code __unused, int64_t subcode __unused, uint32_t flags __unused) +{ + + return (1); +} #endif /* !MAC */ diff --git a/security/mac_framework.h b/security/mac_framework.h index 20780b249..7d0f15a10 100644 --- a/security/mac_framework.h +++ b/security/mac_framework.h @@ -116,6 +116,7 @@ struct thread; struct timespec; struct ucred; struct uio; +struct uthread; struct vfs_attr; struct vfs_context; struct vnode; @@ -168,6 +169,7 @@ void mac_cred_label_destroy(kauth_cred_t cred); int mac_cred_label_externalize_audit(proc_t p, struct mac *mac); void mac_cred_label_free(struct label *label); void mac_cred_label_init(kauth_cred_t cred); +int mac_cred_label_compare(struct label *a, struct label *b); void mac_cred_label_update(kauth_cred_t cred, struct label *newlabel); int mac_cred_label_update_execve(vfs_context_t ctx, kauth_cred_t newcred, struct vnode *vp, struct label *scriptvnodelabel, @@ -310,7 +312,8 @@ void mac_posixsem_label_init(struct pseminfo *psem); int mac_posixshm_check_create(kauth_cred_t cred, const char *name); int mac_posixshm_check_mmap(kauth_cred_t cred, struct pshminfo *pshm, int prot, int flags); -int mac_posixshm_check_open(kauth_cred_t cred, struct pshminfo *pshm); +int mac_posixshm_check_open(kauth_cred_t cred, struct pshminfo *pshm, + int fflags); int mac_posixshm_check_stat(kauth_cred_t cred, struct pshminfo *pshm); int mac_posixshm_check_truncate(kauth_cred_t cred, struct pshminfo *pshm, off_t s); @@ -334,6 +337,7 @@ int mac_proc_check_getaudit(proc_t proc); int mac_proc_check_getauid(proc_t proc); int mac_proc_check_getlcid(proc_t proc1, proc_t proc2, pid_t pid); +int mac_proc_check_ledger(proc_t curp, proc_t target, int op); int mac_proc_check_map_anon(proc_t proc, user_addr_t u_addr, user_size_t u_size, int prot, int flags, int *maxprot); int mac_proc_check_mprotect(proc_t proc, @@ -401,6 +405,7 @@ int mac_system_check_swapon(kauth_cred_t cred, struct vnode *vp); int mac_system_check_sysctl(kauth_cred_t cred, int *name, u_int namelen, user_addr_t oldctl, user_addr_t oldlenp, int inkernel, user_addr_t newctl, size_t newlen); +int mac_system_check_kas_info(kauth_cred_t cred, int selector); void mac_sysvmsg_label_associate(kauth_cred_t cred, struct msqid_kernel *msqptr, struct msg *msgptr); void mac_sysvmsg_label_init(struct msg *msgptr); @@ -445,6 +450,10 @@ void mac_sysvshm_label_associate(kauth_cred_t cred, void mac_sysvshm_label_destroy(struct shmid_kernel *shmsegptr); void mac_sysvshm_label_init(struct shmid_kernel* shmsegptr); void mac_sysvshm_label_recycle(struct shmid_kernel *shmsegptr); +struct label * mac_thread_label_alloc(void); +void mac_thread_label_destroy(struct uthread *uthread); +void mac_thread_label_free(struct label *label); +void mac_thread_label_init(struct uthread *uthread); int mac_vnode_check_access(vfs_context_t ctx, struct vnode *vp, int acc_mode); int mac_vnode_check_chdir(vfs_context_t ctx, struct vnode *dvp); diff --git a/security/mac_inet.c b/security/mac_inet.c index 
2b823bab9..1f57fdc79 100644 --- a/security/mac_inet.c +++ b/security/mac_inet.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. + * Copyright (c) 2007-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -71,8 +71,6 @@ #include #include -#include -#include #include #include diff --git a/security/mac_internal.h b/security/mac_internal.h index 6e8ae3d2a..6ca8b699e 100644 --- a/security/mac_internal.h +++ b/security/mac_internal.h @@ -184,8 +184,6 @@ extern struct mac_policy_list mac_policy_list; * at all in the system. */ extern unsigned int mac_device_enforce; -extern unsigned int mac_file_enforce; -extern unsigned int mac_iokit_enforce; extern unsigned int mac_pipe_enforce; extern unsigned int mac_posixsem_enforce; extern unsigned int mac_posixshm_enforce; @@ -198,11 +196,6 @@ extern unsigned int mac_sysvshm_enforce; extern unsigned int mac_vm_enforce; extern unsigned int mac_vnode_enforce; -#if CONFIG_MACF_MACH -extern unsigned int mac_port_enforce; -extern unsigned int mac_task_enforce; -#endif - #if CONFIG_MACF_NET extern unsigned int mac_label_mbufs; #endif diff --git a/security/mac_mach_internal.h b/security/mac_mach_internal.h index 799f0fac0..5393c750d 100644 --- a/security/mac_mach_internal.h +++ b/security/mac_mach_internal.h @@ -60,6 +60,15 @@ int mac_port_label_compute(struct label *subj, struct label *obj, const char *serv, struct label *out); int mac_port_check_method(task_t task, struct label *sub, struct label *obj, int msgid); +/* mac_do_machexc() flags */ +#define MAC_DOEXCF_TRACED 0x01 /* Only do mach exception if + being ptrace()'ed */ +struct uthread; +int mac_do_machexc(int64_t code, int64_t subcode, uint32_t flags __unused); +int mac_schedule_userret(void); +struct label *mac_thread_get_threadlabel(struct thread *thread); +struct label *mac_thread_get_uthreadlabel(struct uthread *uthread); + #if CONFIG_MACF void mac_policy_init(void); void mac_policy_initmach(void); @@ -106,6 +115,10 @@ int mac_port_label_internalize(struct label *label, char *string); void mac_task_label_update(struct label *cred, struct label *task); int mac_port_check_service(struct label *subj, struct label *obj, const char *serv, const char *perm); + +/* threads */ +void act_set_astmacf(struct thread *); +void mac_thread_userret(struct thread *); #endif /* MAC */ #endif /* !_SECURITY_MAC_MACH_INTERNAL_H_ */ diff --git a/security/mac_policy.h b/security/mac_policy.h index 836be3cc0..914393813 100644 --- a/security/mac_policy.h +++ b/security/mac_policy.h @@ -2881,6 +2881,7 @@ typedef int mpo_posixshm_check_mmap_t( @param cred Subject credential @param ps Pointer to shared memory information structure @param shmlabel Label associated with the shared memory region + @param fflags shm_open(2) open flags ('fflags' encoded) Determine whether the subject identified by the credential can open the POSIX shared memory region. @@ -2891,7 +2892,8 @@ typedef int mpo_posixshm_check_mmap_t( kauth_cred_t cred, struct pshminfo *ps, - struct label *shmlabel + struct label *shmlabel, + int fflags ); /** @brief Access control check for POSIX shared memory stat @@ -3123,6 +3125,25 @@ typedef int mpo_proc_check_getlcid_t( struct proc *p, pid_t pid ); +/** + @brief Access control check for retrieving ledger information + @param cred Subject credential + @param target Object process + @param op ledger operation + + Determine if the ledger(2) system call is permitted. 
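(A minimal conforming handler is sketched here.) */

/*
 * Editor's sketch, not part of the patch: a trivial mpo_proc_check_ledger_t
 * implementation. ledger_check_example() and its root-only rule are
 * hypothetical; a real policy would apply its own access model.
 */
static int
ledger_check_example(kauth_cred_t cred, struct proc *target __unused,
    int op __unused)
{
	/* Permit the operation for root, deny it for everyone else. */
	return (kauth_cred_getuid(cred) == 0 ? 0 : EPERM);
}

/*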
+ + Information returned by this system call is similar to that returned via + process listings etc. + + @return Return 0 if access is granted, otherwise an appropriate value for + errno should be returned. +*/ +typedef int mpo_proc_check_ledger_t( + kauth_cred_t cred, + struct proc *target, + int op +); /** @brief Access control check for mmap MAP_ANON @param proc User process requesting the memory @@ -4082,6 +4103,22 @@ typedef int mpo_system_check_sysctl_t( user_addr_t newvalue, /* NULLOK */ size_t newlen ); +/** + @brief Access control check for kas_info + @param cred Subject credential + @param selector Category of information to return. See kas_info.h + + Determine whether the subject identified by the credential can perform + introspection of the kernel address space layout for + debugging/performance analysis. + + @return Return 0 if access is granted, otherwise an appropriate value for + errno should be returned. +*/ +typedef int mpo_system_check_kas_info_t( + kauth_cred_t cred, + int selector +); /** @brief Create a System V message label @param cred Subject credential @@ -4723,6 +4760,38 @@ typedef void mpo_task_label_update_t( struct label *cred, struct label *task ); +/** + @brief Perform MAC-related events when a thread returns to user space + @param thread Mach (not BSD) thread that is returning + + This entry point permits policy modules to perform MAC-related + events when a thread returns to user space, via a system call + return or trap return. +*/ +typedef void mpo_thread_userret_t( + struct thread *thread +); +/** + @brief Initialize per thread label + @param label New label to initialize + + Initialize the label for a newly instantiated thread. + Sleeping is permitted. +*/ +typedef void mpo_thread_label_init_t( + struct label *label +); +/** + @brief Destroy thread label + @param label The label to be destroyed + + Destroy a user thread label. Since the user thread + is going out of scope, policy modules should free any internal + storage associated with the label so that it may be destroyed. +*/ +typedef void mpo_thread_label_destroy_t( + struct label *label +); /** @brief Check vnode access @param cred Subject credential @@ -5967,7 +6036,7 @@ typedef void mpo_reserved_hook_t(void); /*! 
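(An illustrative registration of the new hooks precedes the ops table.) */

/*
 * Editor's sketch, not part of the patch: filling the slots that
 * MAC_POLICY_OPS_VERSION 13 adds. The *_example handlers are hypothetical
 * (two of them are sketched in the notes above).
 */
static struct mac_policy_ops example_policy_ops = {
	.mpo_thread_userret        = thread_userret_example,
	.mpo_thread_label_init     = thread_label_init_example,
	.mpo_thread_label_destroy  = thread_label_destroy_example,
	.mpo_proc_check_ledger     = ledger_check_example,
	.mpo_system_check_kas_info = kas_info_check_example,
};

/*!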
\struct mac_policy_ops */ -#define MAC_POLICY_OPS_VERSION 11 /* inc when new reserved slots are taken */ +#define MAC_POLICY_OPS_VERSION 13 /* inc when new reserved slots are taken */ struct mac_policy_ops { mpo_audit_check_postselect_t *mpo_audit_check_postselect; mpo_audit_check_preselect_t *mpo_audit_check_preselect; @@ -6278,7 +6347,7 @@ struct mac_policy_ops { mpo_vnode_check_uipc_connect_t *mpo_vnode_check_uipc_connect; mac_proc_check_run_cs_invalid_t *mpo_proc_check_run_cs_invalid; mpo_proc_check_suspend_resume_t *mpo_proc_check_suspend_resume; - mpo_reserved_hook_t *mpo_reserved12; + mpo_thread_userret_t *mpo_thread_userret; mpo_iokit_check_set_properties_t *mpo_iokit_check_set_properties; mpo_system_check_chud_t *mpo_system_check_chud; mpo_vnode_check_searchfs_t *mpo_vnode_check_searchfs; @@ -6287,11 +6356,11 @@ struct mac_policy_ops { mpo_proc_check_map_anon_t *mpo_proc_check_map_anon; mpo_vnode_check_fsgetpath_t *mpo_vnode_check_fsgetpath; mpo_iokit_check_open_t *mpo_iokit_check_open; + mpo_proc_check_ledger_t *mpo_proc_check_ledger; mpo_vnode_notify_rename_t *mpo_vnode_notify_rename; - mpo_reserved_hook_t *mpo_reserved14; - mpo_reserved_hook_t *mpo_reserved15; - mpo_reserved_hook_t *mpo_reserved16; - mpo_reserved_hook_t *mpo_reserved17; + mpo_thread_label_init_t *mpo_thread_label_init; + mpo_thread_label_destroy_t *mpo_thread_label_destroy; + mpo_system_check_kas_info_t *mpo_system_check_kas_info; mpo_reserved_hook_t *mpo_reserved18; mpo_reserved_hook_t *mpo_reserved19; mpo_reserved_hook_t *mpo_reserved20; diff --git a/security/mac_posix_shm.c b/security/mac_posix_shm.c index f6cc28e56..f2ffd9daf 100644 --- a/security/mac_posix_shm.c +++ b/security/mac_posix_shm.c @@ -136,14 +136,14 @@ mac_posixshm_check_create(kauth_cred_t cred, const char *name) } int -mac_posixshm_check_open(kauth_cred_t cred, struct pshminfo *shm) +mac_posixshm_check_open(kauth_cred_t cred, struct pshminfo *shm, int fflags) { int error = 0; if (!mac_posixshm_enforce) return 0; - MAC_CHECK(posixshm_check_open, cred, shm, shm->pshm_label); + MAC_CHECK(posixshm_check_open, cred, shm, shm->pshm_label, fflags); return (error); } diff --git a/security/mac_process.c b/security/mac_process.c index 631b468a9..18fbdecca 100644 --- a/security/mac_process.c +++ b/security/mac_process.c @@ -72,8 +72,10 @@ #include #include #include +#include #include +#include #include @@ -102,6 +104,12 @@ mac_cred_label_free(struct label *label) mac_labelzone_free(label); } +int +mac_cred_label_compare(struct label *a, struct label *b) +{ + return (bcmp(a, b, sizeof (*a)) == 0); +} + int mac_cred_label_externalize_audit(struct proc *p, struct mac *mac) { @@ -590,3 +598,75 @@ mac_proc_check_suspend_resume(proc_t curp, int sr) return (error); } + +int +mac_proc_check_ledger(proc_t curp, proc_t proc, int ledger_op) +{ + kauth_cred_t cred; + int error = 0; + + if (!mac_proc_enforce || + !mac_proc_check_enforce(curp, MAC_PROC_ENFORCE)) + return (0); + + cred = kauth_cred_proc_ref(curp); + MAC_CHECK(proc_check_ledger, cred, proc, ledger_op); + kauth_cred_unref(&cred); + + return (error); +} + +struct label * +mac_thread_label_alloc(void) +{ + struct label *label; + + label = mac_labelzone_alloc(MAC_WAITOK); + if (label == NULL) + return (NULL); + MAC_PERFORM(thread_label_init, label); + return (label); +} + +void +mac_thread_label_init(struct uthread *uthread) +{ + uthread->uu_label = mac_thread_label_alloc(); +} + +void +mac_thread_label_free(struct label *label) +{ + MAC_PERFORM(thread_label_destroy, label); + mac_labelzone_free(label); +} 
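+
+/*
+ * Editor's note, not in the original patch: the alloc/free pair above
+ * manages the label object itself, while the init/destroy pair below
+ * binds that label to a uthread, so uthread->uu_label remains valid for
+ * the lifetime of the thread.
+ */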
+ +void +mac_thread_label_destroy(struct uthread *uthread) +{ + + mac_thread_label_free(uthread->uu_label); + uthread->uu_label = NULL; +} + +void +mac_thread_userret(struct thread *td) +{ + + MAC_PERFORM(thread_userret, td); +} + +struct label * +mac_thread_get_uthreadlabel(struct uthread *uthread) +{ + + return (uthread->uu_label); +} + +struct label * +mac_thread_get_threadlabel(struct thread *thread) +{ + struct uthread *uthread = get_bsdthread_info(thread); + + return (mac_thread_get_uthreadlabel(uthread)); +} diff --git a/security/mac_socket.c b/security/mac_socket.c index 45c7daef6..32acf01f5 100644 --- a/security/mac_socket.c +++ b/security/mac_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. + * Copyright (c) 2007-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -82,12 +82,6 @@ #include #include -#include -#include - -#include -#include - #include #if CONFIG_MACF_SOCKET diff --git a/security/mac_system.c b/security/mac_system.c index 8089caac8..f3de4ca13 100644 --- a/security/mac_system.c +++ b/security/mac_system.c @@ -192,3 +192,16 @@ mac_system_check_sysctl(kauth_cred_t cred, int *name, u_int namelen, return (error); } + +int +mac_system_check_kas_info(kauth_cred_t cred, int selector) +{ + int error; + + if (!mac_system_enforce) + return (0); + + MAC_CHECK(system_check_kas_info, cred, selector); + + return (error); +} diff --git a/security/mac_vfs.c b/security/mac_vfs.c index 7cc5561a2..ba8e50fce 100644 --- a/security/mac_vfs.c +++ b/security/mac_vfs.c @@ -1221,7 +1221,7 @@ mac_mount_label_associate(vfs_context_t ctx, struct mount *mp) } MAC_PERFORM(mount_label_associate, cred, mp, mp->mnt_mntlabel); -#if MAC_DEBUG +#if DEBUG printf("MAC Framework enabling %s support: %s -> %s (%s)\n", mp->mnt_flag & MNT_MULTILABEL ? 
"multilabel" : "singlelabel", mp->mnt_vfsstat.f_mntfromname, diff --git a/tools/tests/MPMMTest/KQMPMMtest.c b/tools/tests/MPMMTest/KQMPMMtest.c index 5b659a833..4ee81c427 100644 --- a/tools/tests/MPMMTest/KQMPMMtest.c +++ b/tools/tests/MPMMTest/KQMPMMtest.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -419,7 +420,7 @@ server(void *serverarg) } #else if (kev[0].data != args.port) - printf("kevent64(MACH_PORT_NULL) port name (0x%x) != expected (0x%x)\n", kev[0].data, args.port); + printf("kevent64(MACH_PORT_NULL) port name (%lld) != expected (0x%x)\n", kev[0].data, args.port); args.req_msg->msgh_bits = 0; args.req_msg->msgh_size = args.req_size; @@ -470,6 +471,7 @@ server(void *serverarg) } } } + return NULL; } static inline void @@ -535,6 +537,7 @@ calibrate_client_work(void) printf("calibration_count=%d calibration_usec=%d\n", calibration_count, calibration_usec); } + return NULL; } static void * @@ -549,6 +552,7 @@ client_work(void) if (client_delay) { usleep(client_delay); } + return NULL; } void *client(void *threadarg) @@ -558,7 +562,7 @@ void *client(void *threadarg) mach_msg_header_t *req, *reply; mach_port_t bsport, servport; kern_return_t ret; - long server_num = (long) threadarg; + int server_num = (int) threadarg; void *ints = malloc(sizeof(u_int32_t) * num_ints); if (verbose) @@ -655,7 +659,7 @@ void *client(void *threadarg) } free(ints); - return; + return NULL; } static void @@ -670,12 +674,12 @@ thread_spawn(thread_id_t *thread, void *(fn)(void *), void *arg) { if (ret != 0) err(1, "pthread_create()"); if (verbose) - printf("created pthread 0x%x\n", thread->tid); + printf("created pthread %p\n", thread->tid); } else { thread->pid = fork(); if (thread->pid == 0) { if (verbose) - printf("calling 0x%x(0x%x)\n", fn, arg); + printf("calling %p(%p)\n", fn, arg); fn(arg); exit(0); } @@ -689,10 +693,10 @@ thread_join(thread_id_t *thread) { if (threaded) { kern_return_t ret; if (verbose) - printf("joining thread 0x%x\n", thread->tid); + printf("joining thread %p\n", thread->tid); ret = pthread_join(thread->tid, NULL); if (ret != KERN_SUCCESS) - err(1, "pthread_join(0x%x)", thread->tid); + err(1, "pthread_join(%p)", thread->tid); } else { int stat; if (verbose) @@ -820,8 +824,8 @@ int main(int argc, char *argv[]) double dsecs = (double) deltatv.tv_sec + 1.0E-6 * (double) deltatv.tv_usec; - printf(" in %u.%03u seconds\n", - deltatv.tv_sec, deltatv.tv_usec/1000); + printf(" in %ld.%03u seconds\n", + (long)deltatv.tv_sec, deltatv.tv_usec/1000); printf(" throughput in messages/sec: %g\n", (double)totalmsg / dsecs); printf(" average message latency (usec): %2.3g\n", diff --git a/tools/tests/MPMMTest/MPMMtest.c b/tools/tests/MPMMTest/MPMMtest.c index 44389b35d..590ac04b1 100644 --- a/tools/tests/MPMMTest/MPMMtest.c +++ b/tools/tests/MPMMTest/MPMMtest.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -417,6 +418,7 @@ server(void *serverarg) } } } + return NULL; } static inline void @@ -482,6 +484,7 @@ calibrate_client_work(void) printf("calibration_count=%d calibration_usec=%d\n", calibration_count, calibration_usec); } + return NULL; } static void * @@ -496,6 +499,7 @@ client_work(void) if (client_delay) { usleep(client_delay); } + return NULL; } void *client(void *threadarg) @@ -505,7 +509,7 @@ void *client(void *threadarg) mach_msg_header_t *req, *reply; mach_port_t bsport, servport; kern_return_t ret; - long server_num = (long) threadarg; + int server_num = (int) threadarg; void *ints = malloc(sizeof(u_int32_t) * num_ints); if 
(verbose) @@ -602,7 +606,7 @@ void *client(void *threadarg) } free(ints); - return; + return NULL; } static void @@ -617,12 +621,12 @@ thread_spawn(thread_id_t *thread, void *(fn)(void *), void *arg) { if (ret != 0) err(1, "pthread_create()"); if (verbose) - printf("created pthread 0x%x\n", thread->tid); + printf("created pthread %p\n", thread->tid); } else { thread->pid = fork(); if (thread->pid == 0) { if (verbose) - printf("calling 0x%x(0x%x)\n", fn, arg); + printf("calling %p(%p)\n", fn, arg); fn(arg); exit(0); } @@ -636,10 +640,10 @@ thread_join(thread_id_t *thread) { if (threaded) { kern_return_t ret; if (verbose) - printf("joining thread 0x%x\n", thread->tid); + printf("joining thread %p\n", thread->tid); ret = pthread_join(thread->tid, NULL); if (ret != KERN_SUCCESS) - err(1, "pthread_join(0x%x)", thread->tid); + err(1, "pthread_join(%p)", thread->tid); } else { int stat; if (verbose) diff --git a/tools/tests/execperf/exit.c b/tools/tests/execperf/exit.c index ded537881..1f6e025ad 100644 --- a/tools/tests/execperf/exit.c +++ b/tools/tests/execperf/exit.c @@ -1,3 +1,5 @@ +#include + void mystart(void) __asm__("mystart"); void mystart(void) { diff --git a/tools/tests/execperf/printexecinfo.c b/tools/tests/execperf/printexecinfo.c index 1acf0d493..5dfcd6bb7 100644 --- a/tools/tests/execperf/printexecinfo.c +++ b/tools/tests/execperf/printexecinfo.c @@ -6,6 +6,7 @@ #include #include #include +#include __attribute__((constructor)) void init(int argc, const char *argv[], const char *envp[], const char *appl[], void *vars __attribute__((unused))) { diff --git a/tools/tests/execperf/run.c b/tools/tests/execperf/run.c index d7d5f6a5b..79a2bf602 100644 --- a/tools/tests/execperf/run.c +++ b/tools/tests/execperf/run.c @@ -5,6 +5,7 @@ #include #include #include +#include extern char **environ; diff --git a/tools/tests/libMicro/AppleReadMe b/tools/tests/libMicro/AppleReadMe index de49c7daf..8b3834586 100755 --- a/tools/tests/libMicro/AppleReadMe +++ b/tools/tests/libMicro/AppleReadMe @@ -20,6 +20,7 @@ runs the libMicro test suite excluding the lmbench tests and gives you a text fi gives you a html file comparing two runs. *** To run libMicro testsuite with stepper disabled *** +*** For Desktop use coreos_bench script*** To get a more consistent result of libMicro benchmark run, we need to disable the stepper to prevent it from causing wide variations in results. See rdar://6243819 @@ -39,6 +40,10 @@ which provides '/usr/local/bin/pstates'. 2) 'coreos_bench' script is used exactly like the 'bench' script. All the usage examples for 'bench' script in this readme file also holds true for 'coreos_bench' script. + + + + *** Makefile *** The Makefile invokes Makefile.Darwin which invokes Makefile.com.Darwin. @@ -62,8 +67,11 @@ ARCH defaults to i386 the makefile will automatically build with ARCH_FLAG="-arch i386 -arch x86_64" and put the results in bin-fat to build for ARM architecture, - first set an environment variable 'SDKROOT' to point to iPhone sdk - make ARCH=ARM_ARCH where ARM_ARCH can be armv6 or armv7 + first set an environment variable 'SDKROOT' to point to iPhone internal sdk + For example: + $export SDKROOT="/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS5.0.Internal.sdk/" + to build use: + make ARCH=ARM_ARCH where ARM_ARCH can be for e.g. armv6, armv7 this will put the results in bin-ARM_ARCH to build with only two of the architectures see below @@ -99,6 +107,9 @@ system then needs to be rebooted. 
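
For example, a typical desktop comparison run looks like this (the output
file names are illustrative):

	./coreos_bench > baseline.txt
	./coreos_bench > after_change.txt
	./multiview baseline.txt after_change.txt > compare.html
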
The shell script "bench" will run all the benchmarks, or you can pass it a parameter to run a single benchmark, e.g. +*** To run libMicro on the embedded platform, use "embd_bench" script. 'embd_bench' script is used exactly like the 'bench' script. All the usage examples for +'bench' script in this readme file also holds true for 'embd_bench' script. *** + bench lmbench_bw_unix By default the script will run only the libMicro testsuite excluding the lmbench tests. diff --git a/tools/tests/libMicro/Makefile b/tools/tests/libMicro/Makefile index 877beb36d..d9dad443e 100644 --- a/tools/tests/libMicro/Makefile +++ b/tools/tests/libMicro/Makefile @@ -72,16 +72,21 @@ BINS= $(ALL:%=bin-$(ARCH)/%) bin-$(ARCH)/tattle wrapper.sh \ README +ifeq "$(Embedded)" "YES" +SEMOP_FLAG= +endif + default $(ALL) run cstyle lint tattle: $(BINS) @cp bench.sh bench @cp coreos_bench.sh coreos_bench + @cp embd_bench.sh embd_bench @cp multiview.sh multiview @cp wrapper.sh wrapper @cp create_stuff.sh create_stuff @cp benchDS.sh benchDS @cp od_account_create.sh od_account_create @cp od_account_delete.sh od_account_delete - @chmod +x bench coreos_bench create_stuff multiview wrapper benchDS od_account_create od_account_delete + @chmod +x bench coreos_bench embd_bench create_stuff multiview wrapper benchDS od_account_create od_account_delete @mkdir -p bin-$(ARCH); cd bin-$(ARCH); MACH=$(ARCH) $(MAKE) -f ../Makefile.`uname -s` ARCH=$(ARCH) UNAME_RELEASE=`uname -r | sed 's/\./_/g'` $@ @echo "code signing all the binaries under bin-$(ARCH) and apple/bin-$(ARCH)" @for file in $(abspath bin-$(ARCH)/*) $(abspath apple/bin-$(ARCH)/*);do \ @@ -94,7 +99,7 @@ default $(ALL) run cstyle lint tattle: $(BINS) .PHONY: clean clean_subdirs clean_$(SUBDIRS) clean: clean_subdirs - rm -rf bin bin-* wrapper multiview create_stuff bench tattle benchDS od_account_create od_account_delete coreos_bench + rm -rf bin bin-* wrapper multiview create_stuff bench tattle benchDS od_account_create od_account_delete coreos_bench embd_bench clean_subdirs: for dir in $(SUBDIRS); do $(MAKE) -C $$dir clean; done diff --git a/tools/tests/libMicro/Makefile.Darwin b/tools/tests/libMicro/Makefile.Darwin index d113fc4f2..9d4e00608 100644 --- a/tools/tests/libMicro/Makefile.Darwin +++ b/tools/tests/libMicro/Makefile.Darwin @@ -53,14 +53,23 @@ endif OPT_FLAG= -Os SEMOP_FLAG= -DUSE_SEMOP +ifeq "$(Embedded)" "YES" +SEMOP_FLAG= +endif + ### ###CFLAGS= -Os -DUSE_SEMOP -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall ###extra_CFLAGS= -Os -DUSE_SEMOP -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall ### CFLAGS+= $(OPT_FLAG) $(SEMOP_FLAG) -DUSE_GETHRTIME -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall + ifeq "$(Embedded)" "YES" -CFLAGS += -g -I $(SDKPATH)/System/Library/Frameworks/System.framework/Versions/B/PrivateHeaders/ -F/AppleInternal/Library/Frameworks/ $(MORECFLAGS) +#CFLAGS+= $(OPT_FLAG) -DUSE_GETHRTIME -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall +CFLAGS+= -g -I $(SDKPATH)/System/Library/Frameworks/System.framework/Versions/B/PrivateHeaders/ -F/AppleInternal/Library/Frameworks/ $(MORECFLAGS) endif + + + extra_CFLAGS= $(OPT_FLAG) $(SEMOP_FLAG) -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall CPPFLAGS= $(SEMOP_FLAG) -D_REENTRANT -Wall MATHLIB= -lm diff --git a/tools/tests/libMicro/apple/Makefile.Darwin b/tools/tests/libMicro/apple/Makefile.Darwin index fe5e573bf..9ef0e27cf 100644 --- a/tools/tests/libMicro/apple/Makefile.Darwin +++ b/tools/tests/libMicro/apple/Makefile.Darwin @@ -53,15 +53,21 @@ endif ### OPT_FLAG value was modified from '-g' to '-Os' as part of the fix for radar 7508837 OPT_FLAG= 
-Os SEMOP_FLAG= -DUSE_SEMOP +ifeq "$(Embedded)" "YES" +SEMOP_FLAG= +endif ### ###CFLAGS= -Os -DUSE_SEMOP -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall ###extra_CFLAGS= -Os -DUSE_SEMOP -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall -### Added -DUSE_GETHRTIME to CFLAGS and extra_CFLAGS as part of the fix for radar 7508837 +### CFLAGS+= $(OPT_FLAG) $(SEMOP_FLAG) -DUSE_GETHRTIME -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall + ifeq "$(Embedded)" "YES" -CFLAGS += -g -I $(SDKPATH)/System/Library/Frameworks/System.framework/Versions/B/PrivateHeaders/ -F/AppleInternal/Library/Frameworks/ $(MORECFLAGS) +#CFLAGS+= $(OPT_FLAG) -DUSE_GETHRTIME -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall +CFLAGS+= -g -I $(SDKPATH)/System/Library/Frameworks/System.framework/Versions/B/PrivateHeaders/ -F/AppleInternal/Library/Frameworks/ $(MORECFLAGS) endif + extra_CFLAGS= $(OPT_FLAG) $(SEMOP_FLAG) -DUSE_GETHRTIME -fno-builtin $(NOPIC) $(ARCH_FLAG) -Wall CPPFLAGS= $(SEMOP_FLAG) -D_REENTRANT -Wall MATHLIB= -lm diff --git a/tools/tests/libMicro/apple/Makefile.benchmarks b/tools/tests/libMicro/apple/Makefile.benchmarks index a26d12871..210cf37a5 100644 --- a/tools/tests/libMicro/apple/Makefile.benchmarks +++ b/tools/tests/libMicro/apple/Makefile.benchmarks @@ -26,6 +26,7 @@ # Use is subject to license terms. # +Embedded=$(shell tconf --test TARGET_OS_EMBEDDED) ALL = \ create_file \ @@ -52,7 +53,6 @@ ALL = \ posix_spawn \ trivial \ vm_allocate \ - od_query_create_with_node \ mbr_check_service_membership \ getpwnam \ mbr_check_membership \ @@ -63,3 +63,9 @@ ALL = \ getaddrinfo_host \ getaddrinfo_port \ getgrnam + +# Compile the following test on desktop platform only +ifeq "$(Embedded)" "NO" +ALL += od_query_create_with_node +endif + diff --git a/tools/tests/libMicro/coreos_bench.sh b/tools/tests/libMicro/coreos_bench.sh index a862cbd86..b153b43dd 100644 --- a/tools/tests/libMicro/coreos_bench.sh +++ b/tools/tests/libMicro/coreos_bench.sh @@ -621,8 +621,6 @@ pwrite $OPTS -N "pwrite_n1k" -s 1k -I 100 -f /dev/null pwrite $OPTS -N "pwrite_n10k" -s 10k -I 100 -f /dev/null pwrite $OPTS -N "pwrite_n100k" -s 100k -I 100 -f /dev/null -mmap $OPTS -N "mmap_z8k" -l 8k -I 1000 -B 50 -f /dev/zero -mmap $OPTS -N "mmap_z128k" -l 128k -I 2000 -B 100 -f /dev/zero mmap $OPTS -N "mmap_t8k" -l 8k -I 1000 -f $TFILE mmap $OPTS -N "mmap_t128k" -l 128k -I 1000 -f $TFILE mmap $OPTS -N "mmap_u8k" -l 8k -I 1000 -f $VFILE @@ -631,8 +629,7 @@ mmap $OPTS -N "mmap_a8k" -l 8k -I 200 -f MAP_ANON mmap $OPTS -N "mmap_a128k" -l 128k -I 200 -f MAP_ANON -mmap $OPTS -N "mmap_rz8k" -l 8k -I 2000 -r -f /dev/zero -mmap $OPTS -N "mmap_rz128k" -l 128k -I 2000 -r -f /dev/zero + mmap $OPTS -N "mmap_rt8k" -l 8k -I 2000 -r -f $TFILE mmap $OPTS -N "mmap_rt128k" -l 128k -I 20000 -r -f $TFILE mmap $OPTS -N "mmap_ru8k" -l 8k -I 2000 -r -f $VFILE @@ -640,8 +637,7 @@ mmap $OPTS -N "mmap_ru128k" -l 128k -I 20000 -r -f $VFILE mmap $OPTS -N "mmap_ra8k" -l 8k -I 2000 -r -f MAP_ANON mmap $OPTS -N "mmap_ra128k" -l 128k -I 20000 -r -f MAP_ANON -mmap $OPTS -N "mmap_wz8k" -l 8k -I 5000 -w -B 50 -f /dev/zero -mmap $OPTS -N "mmap_wz128k" -l 128k -I 50000 -w -B 50 -f /dev/zero + mmap $OPTS -N "mmap_wt8k" -l 8k -I 5000 -w -f $TFILE mmap $OPTS -N "mmap_wt128k" -l 128k -I 50000 -w -f $TFILE mmap $OPTS -N "mmap_wu8k" -l 8k -I 5000 -w -f $VFILE @@ -649,8 +645,7 @@ mmap $OPTS -N "mmap_wu128k" -l 128k -I 500000 -w -f $VFILE mmap $OPTS -N "mmap_wa8k" -l 8k -I 3000 -w -f MAP_ANON mmap $OPTS -N "mmap_wa128k" -l 128k -I 50000 -w -f MAP_ANON -munmap $OPTS -N "unmap_z8k" -l 8k -I 500 -f /dev/zero -munmap $OPTS -N 
"unmap_z128k" -l 128k -I 500 -B 100 -f /dev/zero + munmap $OPTS -N "unmap_t8k" -l 8k -I 500 -f $TFILE munmap $OPTS -N "unmap_t128k" -l 128k -I 500 -f $TFILE munmap $OPTS -N "unmap_u8k" -l 8k -I 500 -f $VFILE @@ -658,8 +653,7 @@ munmap $OPTS -N "unmap_u128k" -l 128k -I 500 -f $VFILE munmap $OPTS -N "unmap_a8k" -l 8k -I 500 -f MAP_ANON munmap $OPTS -N "unmap_a128k" -l 128k -I 500 -f MAP_ANON -munmap $OPTS -N "unmap_rz8k" -l 8k -I 1000 -r -f /dev/zero -munmap $OPTS -N "unmap_rz128k" -l 128k -I 2000 -r -B 100 -f /dev/zero + munmap $OPTS -N "unmap_rt8k" -l 8k -I 1000 -r -f $TFILE munmap $OPTS -N "unmap_rt128k" -l 128k -I 3000 -r -f $TFILE munmap $OPTS -N "unmap_ru8k" -l 8k -I 1000 -r -f $VFILE @@ -669,8 +663,7 @@ munmap $OPTS -N "unmap_ra128k" -l 128k -I 2000 -r -f MAP_ANON connection $OPTS -N "conn_connect" -B 256 -c -munmap $OPTS -N "unmap_wz8k" -l 8k -I 1000 -w -f /dev/zero -munmap $OPTS -N "unmap_wz128k" -l 128k -I 8000 -w -B 100 -f /dev/zero + munmap $OPTS -N "unmap_wt8k" -l 8k -I 1000 -w -f $TFILE munmap $OPTS -N "unmap_wt128k" -l 128k -I 10000 -w -f $TFILE munmap $OPTS -N "unmap_wu8k" -l 8k -I 1000 -w -f $VFILE @@ -678,7 +671,6 @@ munmap $OPTS -N "unmap_wu128k" -l 128k -I 50000 -w -B 10 -f $VFILE munmap $OPTS -N "unmap_wa8k" -l 8k -I 1000 -w -f MAP_ANON munmap $OPTS -N "unmap_wa128k" -l 128k -I 10000 -w -f MAP_ANON - mprotect $OPTS -N "mprot_z8k" -l 8k -I 300 -f /dev/zero mprotect $OPTS -N "mprot_z128k" -l 128k -I 500 -f /dev/zero mprotect $OPTS -N "mprot_wz8k" -l 8k -I 500 -w -f /dev/zero diff --git a/tools/tests/libMicro/embd_bench.sh b/tools/tests/libMicro/embd_bench.sh new file mode 100644 index 000000000..7b61d0f59 --- /dev/null +++ b/tools/tests/libMicro/embd_bench.sh @@ -0,0 +1,815 @@ +#!/bin/sh +# +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms +# of the Common Development and Distribution License +# (the "License"). You may not use this file except +# in compliance with the License. +# +# You can obtain a copy of the license at +# src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing +# permissions and limitations under the License. +# +# When distributing Covered Code, include this CDDL +# HEADER in each file and include the License file at +# usr/src/OPENSOLARIS.LICENSE. If applicable, +# add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your +# own identifying information: Portions Copyright [yyyy] +# [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + + +# usage function - defines all the options that can be given to this script. +function usage { + echo "Usage" + echo "$0 [-l] [-h] [name of test]" + echo "-l : This option runs the lmbench tests along with the default libmicro tests." + echo "-h : Help. This option displays information on how to run the script. " + echo "[name of test] : This option runs only the test that is specified" + echo "" + echo "Examples" + echo "$0 : This is the defualt execution. This will run only the default libmicro tests." + echo "$0 -l : This will run the lmbench tests too " + echo "$0 getppid : This will run only the getppid tests" + exit + +} + +if [ $# -eq 1 ] +then + lmbench=2 # to check if only a single test is to be run. e.g, ./bench.sh getppid +else + lmbench=0 # to run the default libMicro tests, without the lmbench tests. 
+fi + +while getopts "lh" OPT_LIST +do + case $OPT_LIST in + l) lmbench=1;; # to run the libmicro tests including the lmbench tests. + h) usage;; + *) usage;; + esac +done + +if [ -w / ]; then + #do nothing + echo "/ is mounted" +else + echo "ERROR: the test requires that the / directory be read/writable, please mount using the command: 'mount -uw /' " + exit 1 +fi + + +tattle="./tattle" + +bench_version=0.4.0 +libmicro_version=`$tattle -V` + +case $libmicro_version in +$bench_version) + ;; +*) + echo "ERROR: libMicro version doesn't match 'bench' script version" + exit 1 +esac + +TMPROOT=/private/tmp/libmicro.$$ +VARROOT=/private/var/tmp/libmicro.$$ +mkdir -p $TMPROOT +mkdir -p $VARROOT +trap "rm -rf $TMPROOT $VARROOT && exit" 0 2 + +TFILE=$TMPROOT/data +IFILE=$TMPROOT/ifile +TDIR1=$TMPROOT/0/1/2/3/4/5/6/7/8/9 +TDIR2=$TMPROOT/1/2/3/4/5/6/7/8/9/0 +VFILE=$VARROOT/data +VDIR1=$VARROOT/0/1/2/3/4/5/6/7/8/9 +VDIR2=$VARROOT/1/2/3/4/5/6/7/8/9/0 + + +OPTS="-E -C 200 -L -S -W" + +dd if=/dev/zero of=$TFILE bs=1024k count=10 2>/dev/null +dd if=/dev/zero of=$VFILE bs=1024k count=10 2>/dev/null +mkdir -p $TDIR1 $TDIR2 +mkdir -p $VDIR1 $VDIR2 + +touch $IFILE +/usr/bin/touch /private/var/tmp/lmbench + + +# produce benchmark header for easier comparisons + +hostname=`uname -n` + +if [ -f /usr/sbin/psrinfo ]; then + p_count=`psrinfo|wc -l` + p_mhz=`psrinfo -v | awk '/operates/{print $6 "MHz"; exit }'` + p_type=`psrinfo -vp 2>/dev/null | awk '{if (NR == 3) {print $0; exit}}'` + p_ipaddr=`getent hosts $hostname | awk '{print $1}'` +fi + +if [ -f /proc/cpuinfo ]; then + p_count=`egrep processor /proc/cpuinfo | wc -l` + p_mhz=`awk -F: '/cpu MHz/{printf("%5.0f00Mhz\n",$2/100); exit}' /proc/cpuinfo` + p_type=`awk -F: '/model name/{print $2; exit}' /proc/cpuinfo` + p_ipaddr=`getent hosts $hostname | awk '{print $1}'` +else +## Mac OS X specific stuff +# first, get ugly output, in case pretty output isn't available +# + p_count=`sysctl -n hw.physicalcpu` + p_mhz=`sysctl -n hw.cpufrequency` + p_type=`sysctl -n hw.model` + +if [ -x /usr/sbin/system_profiler ]; then + # requires this hunk of work-around + # grep the XML for the characteristic we need. The key appears twice, so grep for the useful key (with 'string') + # use sed to strip off the and the tabs in front of the string. So much work for so little result. + # + p_mhz=`system_profiler -xml -detailLevel mini SPHardwareDataType | \ + grep -A1 current_processor_speed | grep string | \ + sed -E 's/(.+)<\/string>/\1/' | sed 's- --g'` + p_type=`system_profiler -xml -detailLevel mini SPHardwareDataType | \ + grep -A1 cpu_type | grep string | \ + sed -E 's/(.+)<\/string>/\1/' | sed 's- --g'` +fi + +# look for en0 (usually ethernet) if that isn't there try en1 (usually wireless) else give up + p_ipaddr=`ipconfig getpacket en0 | grep yiaddr | tr "= " "\n" | grep [0-9]` + if [ ! $p_ipaddr ]; then + p_ipaddr=`ipconfig getpacket en1 | grep yiaddr | tr "= " "\n" | grep [0-9]` + elif [ ! 
$p_ipaddr ]; then + p_ipaddr="unknown" + fi +fi + +printf "\n\n!Libmicro_#: %30s\n" $libmicro_version +printf "!Options: %30s\n" "$OPTS" +printf "!Machine_name: %30s\n" "$hostname" +printf "!OS_name: %30s\n" `uname -s` +printf "!OS_release: %30s\n" `sw_vers -productVersion` +printf "!OS_build: %30.18s\n" "`sw_vers -buildVersion`" +printf "!Processor: %30s\n" `arch` +printf "!#CPUs: %30s\n" $p_count +printf "!CPU_MHz: %30s\n" "$p_mhz" +printf "!CPU_NAME: %30s\n" "$p_type" +printf "!IP_address: %30s\n" "$p_ipaddr" +printf "!Run_by: %30s\n" $LOGNAME +printf "!Date: %30s\n" "`date '+%D %R'`" +printf "!Compiler: %30s\n" `$tattle -c` +printf "!Compiler Ver.:%30s\n" "`$tattle -v`" +printf "!sizeof(long): %30s\n" `$tattle -s` +printf "!extra_CFLAGS: %30s\n" "`$tattle -f`" +printf "!TimerRes: %30s\n\n\n" "`$tattle -r`" + +bin_dir="$TMPROOT/bin" + +mkdir -p $bin_dir +cp bin-*/exec_bin $bin_dir/$A + +cp ./apple/bin-*/posix_spawn_bin $bin_dir/$A + +newline=0 + +# +# Everything below the while loop is input for the while loop +# if you have any tests which can't run in the while loop, put +# them above this comment +# +while read A B +do + # $A contains the command, $B contains the arguments + # we echo blank lines and comments + # we skip anything which fails to match *$1* (useful + # if we only want to test one case, but a nasty hack) + + case $A in + \#*) + echo "$A $B" + newline=1 + continue + ;; + + "") + if [ $newline -eq 1 ] + then + newline=0 + echo + echo + fi + + continue + ;; + + *$1*) + # Default execution without the lmbench tests. + # checks if there is no argument passed by the user. + if [ $lmbench -eq 0 ] + then + string=lmbench + if [ "${A:0:7}" == "$string" ] + then + continue + fi + fi + + ;; + + *) + if [ $lmbench -ne 1 ] + then + continue + fi + ;; + esac + + if [ ! -f $bin_dir/$A ] + then + cp bin-*/$A $bin_dir/$A + fi + + echo + + (cd $TMPROOT && eval "bin/$A $B") + + echo + echo +done <<. + +# +# Obligatory null system call: use very short time +# for default since SuSe implements this "syscall" in userland +# + +getpid $OPTS -N "getpid" -I 5 +getppid $OPTS -N "getppid" -I 5 + +getenv $OPTS -N "getenv" -s 100 -I 100 +getenv $OPTS -N "getenvT2" -s 100 -I 100 -T 2 + +gettimeofday $OPTS -N "gettimeofday" + +log $OPTS -N "log" -I 20 -B 300000 +exp $OPTS -N "exp" -I 20 -B 100000 +lrand48 $OPTS -N "lrand48" + +memset $OPTS -N "memset_10" -s 10 -I 10 +memset $OPTS -N "memset_256" -s 256 -I 20 +memset $OPTS -N "memset_256_u" -s 256 -a 1 -I 20 +memset $OPTS -N "memset_1k" -s 1k -I 100 -B 2000 +memset $OPTS -N "memset_4k" -s 4k -I 250 -B 500 +memset $OPTS -N "memset_4k_uc" -s 4k -u -I 400 + +memset $OPTS -N "memset_10k" -s 10k -I 600 -B 500 +memset $OPTS -N "memset_1m" -s 1m -I 200000 +memset $OPTS -N "memset_10m" -s 10m -I 2000000 +memset $OPTS -N "memsetP2_10m" -s 10m -P 2 -I 2000000 + +memrand $OPTS -N "memrand" -s 40m -B 10000 + +# This is an elided test and is not ported yet. 
+# Check Makefile.darwin for list of elided tests +# cachetocache $OPTS -N "cachetocache" -s 100k -T 2 -I 200 + +isatty $OPTS -N "isatty_yes" +isatty $OPTS -N "isatty_no" -f $IFILE + +malloc $OPTS -N "malloc_10" -s 10 -g 10 -I 50 +malloc $OPTS -N "malloc_100" -s 100 -g 10 -I 50 +malloc $OPTS -N "malloc_1k" -s 1k -g 10 -I 50 +malloc $OPTS -N "malloc_10k" -s 10k -g 10 -I 50 +malloc $OPTS -N "malloc_100k" -s 100k -g 10 -I 2000 + +malloc $OPTS -N "mallocT2_10" -s 10 -g 10 -T 2 -I 200 +malloc $OPTS -N "mallocT2_100" -s 100 -g 10 -T 2 -I 200 +malloc $OPTS -N "mallocT2_1k" -s 1k -g 10 -T 2 -I 200 +malloc $OPTS -N "mallocT2_10k" -s 10k -g 10 -T 2 -I 200 +malloc $OPTS -N "mallocT2_100k" -s 100k -g 10 -T 2 -I 10000 + +close $OPTS -N "close_bad" -B 96 -b +close $OPTS -N "close_tmp" -B 64 -f $TFILE +close $OPTS -N "close_usr" -B 64 -f $VFILE +close $OPTS -N "close_zero" -B 64 -f /dev/zero +close_tcp $OPTS -N "close_tcp" -B 32 + +memcpy $OPTS -N "memcpy_10" -s 10 -I 10 +memcpy $OPTS -N "memcpy_1k" -s 1k -I 50 +memcpy $OPTS -N "memcpy_10k" -s 10k -I 800 +memcpy $OPTS -N "memcpy_1m" -s 1m -I 500000 +memcpy $OPTS -N "memcpy_10m" -s 10m -I 5000000 + +strcpy $OPTS -N "strcpy_10" -s 10 -I 5 +strcpy $OPTS -N "strcpy_1k" -s 1k -I 100 + +strlen $OPTS -N "strlen_10" -s 10 -I 5 +strlen $OPTS -N "strlen_1k" -s 1k -I 100 + +strchr $OPTS -N "strchr_10" -s 10 -I 5 +strchr $OPTS -N "strchr_1k" -s 1k -I 200 +strcmp $OPTS -N "strcmp_10" -s 10 -I 10 +strcmp $OPTS -N "strcmp_1k" -s 1k -I 200 + +strcasecmp $OPTS -N "scasecmp_10" -s 10 -I 50 -B 2000 +strcasecmp $OPTS -N "scasecmp_1k" -s 1k -I 20000 -B 100 + +strtol $OPTS -N "strtol" -I 20 + +# This is an elided test and is not ported yet. +# Check Makefile.darwin for list of elided tests +# getcontext $OPTS -N "getcontext" -I 100 + +# This is an elided test and is not ported yet. 
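+# (Editor's note: an "elided" benchmark is one left out of the Darwin
+# build list, so no binary exists for the runner to copy and execute;
+# keeping the line commented out avoids a spurious failure.)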
+# Check Makefile.darwin for list of elided tests +# setcontext $OPTS -N "setcontext" -I 100 + +mutex $OPTS -N "mutex_st" -I 10 +mutex $OPTS -N "mutex_mt" -t -I 10 +mutex $OPTS -N "mutex_T2" -T 2 -I 100 + +longjmp $OPTS -N "longjmp" -I 10 +siglongjmp $OPTS -N "siglongjmp" -I 20 + +getrusage $OPTS -N "getrusage" -I 200 + +times $OPTS -N "times" -I 200 +time $OPTS -N "time" -I 50 +localtime_r $OPTS -N "localtime_r" -I 200 +strftime $OPTS -N "strftime" -I 10000 -B 100 + +mktime $OPTS -N "mktime" -I 500 +mktime $OPTS -N "mktimeT2" -T 2 -I 1000 + +cascade_mutex $OPTS -N "c_mutex_1" -I 50 +cascade_mutex $OPTS -N "c_mutex_10" -T 10 -I 5000 +cascade_mutex $OPTS -N "c_mutex_200" -T 200 -I 2000000 + +cascade_cond $OPTS -N "c_cond_1" -I 100 +cascade_cond $OPTS -N "c_cond_10" -T 10 -I 3000 +cascade_cond $OPTS -N "c_cond_200" -T 200 -I 2000000 + +cascade_lockf $OPTS -N "c_lockf_1" -I 1000 +cascade_lockf $OPTS -N "c_lockf_10" -P 10 -I 50000 +#cascade_lockf $OPTS -N "c_lockf_200" -P 200 -I 5000000 + + + +cascade_flock $OPTS -N "c_flock" -I 1000 +cascade_flock $OPTS -N "c_flock_10" -P 10 -I 50000 +#cascade_flock $OPTS -N "c_flock_200" -P 200 -I 5000000 + + + +cascade_fcntl $OPTS -N "c_fcntl_1" -I 2000 +cascade_fcntl $OPTS -N "c_fcntl_10" -P 10 -I 20000 +#cascade_fcntl $OPTS -N "c_fcntl_200" -P 200 -I 5000000 + + +file_lock $OPTS -N "file_lock" -I 1000 + +getsockname $OPTS -N "getsockname" -I 100 +getpeername $OPTS -N "getpeername" -I 100 + +chdir $OPTS -N "chdir_tmp" -I 2000 $TDIR1 $TDIR2 +chdir $OPTS -N "chdir_usr" -I 2000 $VDIR1 $VDIR2 + +chdir $OPTS -N "chgetwd_tmp" -I 3000 -g $TDIR1 $TDIR2 +chdir $OPTS -N "chgetwd_usr" -I 3000 -g $VDIR1 $VDIR2 + +realpath $OPTS -N "realpath_tmp" -I 3000 -f $TDIR1 +realpath $OPTS -N "realpath_usr" -I 3000 -f $VDIR1 + +stat $OPTS -N "stat_tmp" -I 1000 -f $TFILE +stat $OPTS -N "stat_usr" -I 1000 -f $VFILE + +lmbench_stat $OPTS -N "lmbench_stat_tmp" -I 1000 -f $TFILE +lmbench_stat $OPTS -N "lmbench_stat_usr" -I 10000 -B 100 -f /private/var/tmp/lmbench + +# +# lmbench uses a touched empty file in /private/var/tmp +# libMicro uses a 1M file in a directory off /private/var/tmp +# performance difference is ~ 0.2 usecs/call +# +# why? - walking the dir tree, empty file vs. non-empty file, non-empty dir +# in the case of libMicro, etc., etc. 
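+# (Editor's note: the "- Default" variants below appear to rerun the same
+# measurement against lmbench's usual empty target file so the numbers can
+# be compared one-to-one with stock lmbench results.)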
+# + +lmbench_stat $OPTS -N "lmbench_stat_usr - Default" -I 10000 -B 100 -f /private/var/tmp/lmbench + +lmbench_fstat $OPTS -N "lmbench_fstat_tmp" -I 1000 -f $TFILE +lmbench_fstat $OPTS -N "lmbench_fstat_usr" -I 10000 -B 100 -f /private/var/tmp/lmbench + +# see stat test to understand why we are using /private/var/tmp/lmbench + +lmbench_fstat $OPTS -N "lmbench_fstat_usr - Default" -I 10000 -B 100 -f /private/var/tmp/lmbench + +lmbench_openclose $OPTS -N "lmbench_openclose - Default" -I 10000 -B 100 -f /private/var/tmp/lmbench + +lmbench_select_file $OPTS -N "lmbench_select_file_10" -n 10 -B 100 +lmbench_select_file $OPTS -N "lmbench_select_file_100" -n 100 -B 100 +lmbench_select_file $OPTS -N "lmbench_select_file_250" -n 250 -B 100 +lmbench_select_file $OPTS -N "lmbench_select_file_500" -n 500 -B 100 + +lmbench_select_tcp $OPTS -N "lmbench_select_tcp_10" -n 10 -B 100 +lmbench_select_tcp $OPTS -N "lmbench_select_tcp_100" -n 100 -B 100 +lmbench_select_tcp $OPTS -N "lmbench_select_tcp_250" -n 250 -B 100 +lmbench_select_tcp $OPTS -N "lmbench_select_tcp_500" -n 500 -B 100 + +fcntl $OPTS -N "fcntl_tmp" -I 100 -f $TFILE +fcntl $OPTS -N "fcntl_usr" -I 100 -f $VFILE +fcntl_ndelay $OPTS -N "fcntl_ndelay" -I 100 + +lseek $OPTS -N "lseek_t8k" -s 8k -I 50 -f $TFILE +lseek $OPTS -N "lseek_u8k" -s 8k -I 50 -f $VFILE + +open $OPTS -N "open_tmp" -B 256 -f $TFILE +open $OPTS -N "open_usr" -B 256 -f $VFILE +open $OPTS -N "open_zero" -B 256 -f /dev/zero + +dup $OPTS -N "dup" -B 512 + +socket $OPTS -N "socket_u" -B 256 +socket $OPTS -N "socket_i" -B 256 -f PF_INET + +socketpair $OPTS -N "socketpair" -B 256 + +setsockopt $OPTS -N "setsockopt" -I 200 + +bind $OPTS -N "bind" -B 100 + +listen $OPTS -N "listen" -B 100 + +#connection $OPTS -N "connection" -B 256 + +poll $OPTS -N "poll_10" -n 10 -I 500 +poll $OPTS -N "poll_100" -n 100 -I 1000 +poll $OPTS -N "poll_1000" -n 1000 -I 5000 + +poll $OPTS -N "poll_w10" -n 10 -I 500 -w 1 +poll $OPTS -N "poll_w100" -n 100 -I 2000 -w 10 +poll $OPTS -N "poll_w1000" -n 1000 -I 40000 -w 100 + +select $OPTS -N "select_10" -n 10 -I 500 +select $OPTS -N "select_100" -n 100 -I 1000 +select $OPTS -N "select_1000" -n 1000 -I 5000 + +select $OPTS -N "select_w10" -n 10 -I 500 -w 1 +select $OPTS -N "select_w100" -n 100 -I 2000 -w 10 +select $OPTS -N "select_w1000" -n 1000 -I 40000 -w 100 + +semop $OPTS -N "semop" -I 200 + +sigaction $OPTS -N "sigaction" -I 100 +signal $OPTS -N "signal" -I 1000 +sigprocmask $OPTS -N "sigprocmask" -I 200 + +lmbench_lat_sig_install $OPTS -N "lmbench_siginstall" +# sigcatch and sigsend need to be evaluated together +# lmbench framework will allow multiple measurements within the same +# benchmark test which allow them to factor out the cost of sending +# a signal from catching one +# +# for our purposes sigcatch results - sigsend results yield +# lmbench sig handler overhead measurements +lmbench_lat_sig_catch $OPTS -N "lmbench_sigcatch" +lmbench_lat_sig_send $OPTS -N "lmbench_sigsend" + + +pthread_create $OPTS -N "pthread_8" -B 8 +pthread_create $OPTS -N "pthread_32" -B 32 +pthread_create $OPTS -N "pthread_128" -B 128 +pthread_create $OPTS -N "pthread_512" -B 512 + +fork $OPTS -N "fork_10" -B 10 +#fork $OPTS -N "fork_100" -B 100 -C 100 + +#fork $OPTS -N "fork_1000" -B 1000 -C 50 + +exit $OPTS -N "exit_10" -B 10 +##exit $OPTS -N "exit_100" -B 100 + +#exit $OPTS -N "exit_1000" -B 1000 -C 50 + +exit $OPTS -N "exit_10_nolibc" -e -B 10 + +exec $OPTS -N "exec" -B 10 + +posix_spawn $OPTS -N "posix_spawn" -B 10 + +system $OPTS -N "system" -I 1000000 + +recurse $OPTS 
-N "recurse" -B 512 + +read $OPTS -N "read_t1k" -s 1k -B 50 -f $TFILE +read $OPTS -N "read_t10k" -s 10k -B 16 -f $TFILE +read $OPTS -N "read_t100k" -s 100k -B 4 -f $TFILE + +read $OPTS -N "read_u1k" -s 1k -B 50 -f $VFILE +read $OPTS -N "read_u10k" -s 10k -B 16 -f $VFILE +read $OPTS -N "read_u100k" -s 100k -B 4 -f $VFILE + +read $OPTS -N "read_z1k" -s 1k -B 100 -f /dev/zero +read $OPTS -N "read_z10k" -s 10k -B 30 -f /dev/zero +read $OPTS -N "read_z100k" -s 100k -B 4 -f /dev/zero +read $OPTS -N "read_zw100k" -s 100k -B 4 -w -f /dev/zero + +lmbench_read $OPTS -N "read_t1b" -s 1 -B 50 -f $TFILE +lmbench_read $OPTS -N "read_t1k" -s 1k -B 50 -f $TFILE +lmbench_read $OPTS -N "read_t10k" -s 10k -B 16 -f $TFILE +lmbench_read $OPTS -N "read_t100k" -s 100k -B 4 -f $TFILE + +lmbench_read $OPTS -N "read_u1b" -s 1 -B 50 -f $VFILE +lmbench_read $OPTS -N "read_u1k" -s 1k -B 50 -f $VFILE +lmbench_read $OPTS -N "read_u10k" -s 10k -B 16 -f $VFILE +lmbench_read $OPTS -N "read_u100k" -s 100k -B 4 -f $VFILE + +lmbench_read $OPTS -N "read_z1b - Default" -s 1 -B 100 -f /dev/zero +lmbench_read $OPTS -N "read_z1k" -s 1k -B 100 -f /dev/zero +lmbench_read $OPTS -N "read_z10k" -s 10k -B 30 -f /dev/zero +lmbench_read $OPTS -N "read_z100k" -s 100k -B 4 -f /dev/zero +lmbench_read $OPTS -N "read_zw100k" -s 100k -B 4 -w -f /dev/zero + +write $OPTS -N "write_t1k" -s 1k -B 50 -f $TFILE +write $OPTS -N "write_t10k" -s 10k -B 25 -f $TFILE +write $OPTS -N "write_t100k" -s 100k -B 4 -f $TFILE + +write $OPTS -N "write_u1k" -s 1k -B 50 -f $VFILE +write $OPTS -N "write_u10k" -s 10k -B 25 -f $VFILE +write $OPTS -N "write_u100k" -s 100k -B 4 -f $VFILE + +write $OPTS -N "write_n1k" -s 1k -I 100 -B 0 -f /dev/null +write $OPTS -N "write_n10k" -s 10k -I 100 -B 0 -f /dev/null +write $OPTS -N "write_n100k" -s 100k -I 100 -B 0 -f /dev/null + +lmbench_write $OPTS -N "lmbench_write_t1b" -s 1 -B 50 -f $TFILE +lmbench_write $OPTS -N "lmbench_write_t1k" -s 1k -B 50 -f $TFILE +lmbench_write $OPTS -N "lmbench_write_t10k" -s 10k -B 25 -f $TFILE +lmbench_write $OPTS -N "lmbench_write_t100k" -s 100k -B 4 -f $TFILE + +lmbench_write $OPTS -N "lmbench_write_u1b" -s 1 -B 50 -f $VFILE +lmbench_write $OPTS -N "lmbench_write_u1k" -s 1k -B 50 -f $VFILE +lmbench_write $OPTS -N "lmbench_write_u10k" -s 10k -B 25 -f $VFILE +lmbench_write $OPTS -N "lmbench_write_u100k" -s 100k -B 4 -f $VFILE + +lmbench_write $OPTS -N "lmbench_write_n1b - Default" -s 1 -I 100 -B 0 -f /dev/null +lmbench_write $OPTS -N "lmbench_write_n1k" -s 1k -I 100 -B 0 -f /dev/null +lmbench_write $OPTS -N "lmbench_write_n10k" -s 10k -I 100 -B 0 -f /dev/null +lmbench_write $OPTS -N "lmbench_write_n100k" -s 100k -I 100 -B 0 -f /dev/null + +writev $OPTS -N "writev_t1k" -s 1k -B 20 -f $TFILE +writev $OPTS -N "writev_t10k" -s 10k -B 4 -f $TFILE +writev $OPTS -N "writev_t100k" -s 100k -f $TFILE + +writev $OPTS -N "writev_u1k" -s 1k -B 20 -f $VFILE +writev $OPTS -N "writev_u10k" -s 10k -B 4 -f $VFILE +writev $OPTS -N "writev_u100k" -s 100k -f $VFILE + +writev $OPTS -N "writev_n1k" -s 1k -I 100 -B 0 -f /dev/null +writev $OPTS -N "writev_n10k" -s 10k -I 100 -B 0 -f /dev/null +writev $OPTS -N "writev_n100k" -s 100k -I 100 -B 0 -f /dev/null + +pread $OPTS -N "pread_t1k" -s 1k -I 300 -f $TFILE +pread $OPTS -N "pread_t10k" -s 10k -I 1000 -f $TFILE +pread $OPTS -N "pread_t100k" -s 100k -I 10000 -f $TFILE + +pread $OPTS -N "pread_u1k" -s 1k -I 300 -f $VFILE +pread $OPTS -N "pread_u10k" -s 10k -I 1000 -f $VFILE +pread $OPTS -N "pread_u100k" -s 100k -I 10000 -f $VFILE + +pread $OPTS -N "pread_z1k" -s 1k -I 300 
-f /dev/zero +pread $OPTS -N "pread_z10k" -s 10k -I 1000 -f /dev/zero +pread $OPTS -N "pread_z100k" -s 100k -I 2000 -f /dev/zero +pread $OPTS -N "pread_zw100k" -s 100k -w -I 10000 -f /dev/zero + +pwrite $OPTS -N "pwrite_t1k" -s 1k -I 500 -f $TFILE +pwrite $OPTS -N "pwrite_t10k" -s 10k -I 1000 -f $TFILE +pwrite $OPTS -N "pwrite_t100k" -s 100k -I 10000 -f $TFILE + +pwrite $OPTS -N "pwrite_u1k" -s 1k -I 500 -f $VFILE +pwrite $OPTS -N "pwrite_u10k" -s 10k -I 1000 -f $VFILE +pwrite $OPTS -N "pwrite_u100k" -s 100k -I 20000 -f $VFILE + +pwrite $OPTS -N "pwrite_n1k" -s 1k -I 100 -f /dev/null +pwrite $OPTS -N "pwrite_n10k" -s 10k -I 100 -f /dev/null +pwrite $OPTS -N "pwrite_n100k" -s 100k -I 100 -f /dev/null + + +mmap $OPTS -N "mmap_t8k" -l 8k -I 1000 -f $TFILE +mmap $OPTS -N "mmap_t128k" -l 128k -I 1000 -f $TFILE +mmap $OPTS -N "mmap_u8k" -l 8k -I 1000 -f $VFILE +mmap $OPTS -N "mmap_u128k" -l 128k -I 1000 -f $VFILE +mmap $OPTS -N "mmap_a8k" -l 8k -I 200 -f MAP_ANON +mmap $OPTS -N "mmap_a128k" -l 128k -I 200 -f MAP_ANON + + + +mmap $OPTS -N "mmap_rt8k" -l 8k -I 2000 -r -f $TFILE +mmap $OPTS -N "mmap_rt128k" -l 128k -I 20000 -r -f $TFILE +mmap $OPTS -N "mmap_ru8k" -l 8k -I 2000 -r -f $VFILE +mmap $OPTS -N "mmap_ru128k" -l 128k -I 20000 -r -f $VFILE +mmap $OPTS -N "mmap_ra8k" -l 8k -I 2000 -r -f MAP_ANON +mmap $OPTS -N "mmap_ra128k" -l 128k -I 20000 -r -f MAP_ANON + + +mmap $OPTS -N "mmap_wt8k" -l 8k -I 5000 -w -f $TFILE +mmap $OPTS -N "mmap_wt128k" -l 128k -I 50000 -w -f $TFILE +mmap $OPTS -N "mmap_wu8k" -l 8k -I 5000 -w -f $VFILE +mmap $OPTS -N "mmap_wu128k" -l 128k -I 500000 -w -f $VFILE +mmap $OPTS -N "mmap_wa8k" -l 8k -I 3000 -w -f MAP_ANON +mmap $OPTS -N "mmap_wa128k" -l 128k -I 50000 -w -f MAP_ANON + + +munmap $OPTS -N "unmap_t8k" -l 8k -I 500 -f $TFILE +munmap $OPTS -N "unmap_t128k" -l 128k -I 500 -f $TFILE +munmap $OPTS -N "unmap_u8k" -l 8k -I 500 -f $VFILE +munmap $OPTS -N "unmap_u128k" -l 128k -I 500 -f $VFILE +munmap $OPTS -N "unmap_a8k" -l 8k -I 500 -f MAP_ANON +munmap $OPTS -N "unmap_a128k" -l 128k -I 500 -f MAP_ANON + + +munmap $OPTS -N "unmap_rt8k" -l 8k -I 1000 -r -f $TFILE +munmap $OPTS -N "unmap_rt128k" -l 128k -I 3000 -r -f $TFILE +munmap $OPTS -N "unmap_ru8k" -l 8k -I 1000 -r -f $VFILE +munmap $OPTS -N "unmap_ru128k" -l 128k -I 3000 -r -f $VFILE +munmap $OPTS -N "unmap_ra8k" -l 8k -I 1000 -r -f MAP_ANON +munmap $OPTS -N "unmap_ra128k" -l 128k -I 2000 -r -f MAP_ANON + +connection $OPTS -N "conn_connect" -B 256 -c + + +munmap $OPTS -N "unmap_wt8k" -l 8k -I 1000 -w -f $TFILE +munmap $OPTS -N "unmap_wt128k" -l 128k -I 10000 -w -f $TFILE +munmap $OPTS -N "unmap_wu8k" -l 8k -I 1000 -w -f $VFILE +munmap $OPTS -N "unmap_wu128k" -l 128k -I 50000 -w -B 10 -f $VFILE +munmap $OPTS -N "unmap_wa8k" -l 8k -I 1000 -w -f MAP_ANON +munmap $OPTS -N "unmap_wa128k" -l 128k -I 10000 -w -f MAP_ANON + +mprotect $OPTS -N "mprot_z8k" -l 8k -I 300 -f /dev/zero +mprotect $OPTS -N "mprot_z128k" -l 128k -I 500 -f /dev/zero +mprotect $OPTS -N "mprot_wz8k" -l 8k -I 500 -w -f /dev/zero +mprotect $OPTS -N "mprot_wz128k" -l 128k -I 1000 -w -f /dev/zero +mprotect $OPTS -N "mprot_twz8k" -l 8k -I 1000 -w -t -f /dev/zero +mprotect $OPTS -N "mprot_tw128k" -l 128k -I 2000 -w -t -f /dev/zero +mprotect $OPTS -N "mprot_tw4m" -l 4m -w -t -B 1 -f /dev/zero + +pipe $OPTS -N "pipe_pst1" -s 1 -I 1000 -x pipe -m st +pipe $OPTS -N "pipe_pmt1" -s 1 -I 8000 -x pipe -m mt +pipe $OPTS -N "pipe_pmp1" -s 1 -I 8000 -x pipe -m mp +pipe $OPTS -N "pipe_pst4k" -s 4k -I 1000 -x pipe -m st +pipe $OPTS -N "pipe_pmt4k" -s 4k -I 8000 -x pipe -m 
mt +pipe $OPTS -N "pipe_pmp4k" -s 4k -I 8000 -x pipe -m mp + +pipe $OPTS -N "pipe_sst1" -s 1 -I 1000 -x sock -m st +pipe $OPTS -N "pipe_smt1" -s 1 -I 8000 -x sock -m mt +pipe $OPTS -N "pipe_smp1" -s 1 -I 8000 -x sock -m mp +pipe $OPTS -N "pipe_sst4k" -s 4k -I 1000 -x sock -m st +pipe $OPTS -N "pipe_smt4k" -s 4k -I 8000 -x sock -m mt +pipe $OPTS -N "pipe_smp4k" -s 4k -I 8000 -x sock -m mp + +pipe $OPTS -N "pipe_tst1" -s 1 -I 1000 -x tcp -m st +pipe $OPTS -N "pipe_tmt1" -s 1 -I 8000 -x tcp -m mt +pipe $OPTS -N "pipe_tmp1" -s 1 -I 8000 -x tcp -m mp +pipe $OPTS -N "pipe_tst4k" -s 4k -I 1000 -x tcp -m st +pipe $OPTS -N "pipe_tmt4k" -s 4k -I 8000 -x tcp -m mt +pipe $OPTS -N "pipe_tmp4k" -s 4k -I 8000 -x tcp -m mp + +#connection $OPTS -N "conn_accept" -B 256 -a + +lmbench_bw_unix -B 11 -L -W + +lmbench_bw_mem $OPTS -N lmbench_bcopy_512 -s 512 -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_1k -s 1k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_2k -s 2k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_4k -s 4k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_8k -s 8k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_16k -s 16k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_32k -s 32k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_64k -s 64k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_128k -s 128k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_256k -s 256k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_512k -s 512k -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bcopy_1m -s 1m -x bcopy +lmbench_bw_mem $OPTS -N lmbench_bzero_512 -s 512 -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_1k -s 1k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_2k -s 2k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_4k -s 4k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_8k -s 8k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_16k -s 16k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_32k -s 32k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_64k -s 64k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_128k -s 128k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_256k -s 256k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_512k -s 512k -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_1m -s 1m -x bzero +lmbench_bw_mem $OPTS -N lmbench_bzero_512 -s 512 -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_1k -s 1k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_2k -s 2k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_4k -s 4k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_8k -s 8k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_16k -s 16k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_32k -s 32k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_64k -s 64k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_128k -s 128k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_256k -s 256k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_512k -s 512k -x fcp +lmbench_bw_mem $OPTS -N lmbench_bzero_1m -s 1m -x fcp +lmbench_bw_mem $OPTS -N lmbench_cp_512 -s 512 -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_1k -s 1k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_2k -s 2k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_4k -s 4k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_8k -s 8k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_16k -s 16k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_32k -s 32k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_64k -s 64k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_128k -s 128k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_256k -s 256k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_512k -s 512k -x cp +lmbench_bw_mem $OPTS -N lmbench_cp_1m -s 1m -x cp +lmbench_bw_mem $OPTS -N 
lmbench_frd_512 -s 512 -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_1k -s 1k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_2k -s 2k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_4k -s 4k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_8k -s 8k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_16k -s 16k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_32k -s 32k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_64k -s 64k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_128k -s 128k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_256k -s 256k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_512k -s 512k -x frd +lmbench_bw_mem $OPTS -N lmbench_frd_1m -s 1m -x frd +lmbench_bw_mem $OPTS -N lmbench_rd_512 -s 512 -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_1k -s 1k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_2k -s 2k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_4k -s 4k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_8k -s 8k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_16k -s 16k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_32k -s 32k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_64k -s 64k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_128k -s 128k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_256k -s 256k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_512k -s 512k -x rd +lmbench_bw_mem $OPTS -N lmbench_rd_1m -s 1m -x rd +lmbench_bw_mem $OPTS -N lmbench_fwr_512 -s 512 -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_1k -s 1k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_2k -s 2k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_4k -s 4k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_8k -s 8k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_16k -s 16k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_32k -s 32k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_64k -s 64k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_128k -s 128k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_256k -s 256k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_512k -s 512k -x fwr +lmbench_bw_mem $OPTS -N lmbench_fwr_1m -s 1m -x fwr +lmbench_bw_mem $OPTS -N lmbench_wr_512 -s 512 -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_1k -s 1k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_2k -s 2k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_4k -s 4k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_8k -s 8k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_16k -s 16k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_32k -s 32k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_64k -s 64k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_128k -s 128k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_256k -s 256k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_512k -s 512k -x wr +lmbench_bw_mem $OPTS -N lmbench_wr_1m -s 1m -x wr +lmbench_bw_mem $OPTS -N lmbench_rdwr_512 -s 512 -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_1k -s 1k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_2k -s 2k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_4k -s 4k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_8k -s 8k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_16k -s 16k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_32k -s 32k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_64k -s 64k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_128k -s 128k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_256k -s 256k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_512k -s 512k -x rdwr +lmbench_bw_mem $OPTS -N lmbench_rdwr_1m -s 1m -x rdwr + +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_512 -s 512 -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_1k -s 1k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_2k -s 2k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_4k -s 4k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_8k -s 8k -f $TFILE +lmbench_bw_mmap_rd 
$OPTS -N bw_mmap_rd_16k -s 16k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_32k -s 32k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_64k -s 64k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_128k -s 128k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_256k -s 256k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_512k -s 512k -f $TFILE +lmbench_bw_mmap_rd $OPTS -N bw_mmap_rd_1m -s 1m -f $TFILE + +. diff --git a/tools/tests/xnu_quick_test/32bit_inode_tests.c b/tools/tests/xnu_quick_test/32bit_inode_tests.c index b85bb8911..c6b1e6f48 100644 --- a/tools/tests/xnu_quick_test/32bit_inode_tests.c +++ b/tools/tests/xnu_quick_test/32bit_inode_tests.c @@ -42,7 +42,7 @@ int getdirentries_test( void * the_argp ) char * my_pathp = NULL; char * my_bufp = NULL; char * my_file_namep; - unsigned long my_base; + long my_base; unsigned long my_count; unsigned long my_new_state; fsobj_id_t my_obj_id; diff --git a/tools/tests/xnu_quick_test/commpage_tests.c b/tools/tests/xnu_quick_test/commpage_tests.c index 792e78f00..37e1ae621 100644 --- a/tools/tests/xnu_quick_test/commpage_tests.c +++ b/tools/tests/xnu_quick_test/commpage_tests.c @@ -156,7 +156,7 @@ int commpage_data_tests( void * the_argp ) #endif /* __i386__ || __x86_64__ */ /* These fields are not implemented for all architectures */ -#ifdef _COMM_PAGE_SCHED_GEN +#if defined(_COMM_PAGE_SCHED_GEN) && !TARGET_OS_EMBEDDED uint32_t preempt_count1, preempt_count2; uint64_t count; @@ -189,8 +189,11 @@ int commpage_data_tests( void * the_argp ) goto fail; } + /* We shouldn't be supporting userspace processor_start/processor_exit on embedded */ +#if !TARGET_OS_EMBEDDED ret = active_cpu_test(); if (ret) goto fail; +#endif /* !TARGET_OS_EMBEDDED */ #endif /* _COMM_PAGE_ACTIVE_CPUS */ #ifdef _COMM_PAGE_PHYSICAL_CPUS @@ -289,8 +292,8 @@ int active_cpu_test(void) processor_t *processor_list; host_name_port_t host; struct processor_basic_info processor_basic_info; - int cpu_count; - int data_count; + mach_msg_type_number_t cpu_count; + mach_msg_type_number_t data_count; int i; diff --git a/tools/tests/xnu_quick_test/content_protection_test.c b/tools/tests/xnu_quick_test/content_protection_test.c new file mode 100644 index 000000000..9f2cceb1d --- /dev/null +++ b/tools/tests/xnu_quick_test/content_protection_test.c @@ -0,0 +1,922 @@ +#include "tests.h" + +#include +#include +#include +#include +#include + +#include +#include +#include + +/* Note that this test (due to the need to lock/unlock the device on demand, and the + need to manipulate the passcode) has the unfortunate effect of linking xnu_quick_test + to the IOKit Framework. */ + +/* TODO: Change the test to use a single cleanup label. */ + +#define CPT_IO_SIZE 4096 +#define CPT_AKS_BUF_SIZE 256 +#define CPT_MAX_PASS_LEN 64 + +#define GET_PROT_CLASS(fd) fcntl((fd), F_GETPROTECTIONCLASS) +#define SET_PROT_CLASS(fd, prot_class) fcntl((fd), F_SETPROTECTIONCLASS, (prot_class)) + +#define PRINT_LOCK_FAIL printf("%s, line %d: failed to lock the device.\n", cpt_fail_header, __LINE__); +#define PRINT_UNLOCK_FAIL printf("%s, line %d: failed to unlock the device.\n", cpt_fail_header, __LINE__); + +extern char g_target_path[PATH_MAX]; + +char * cpt_fail_header = "Content protection test failed"; +char * keystorectl_path = "/usr/local/bin/keystorectl"; + +/* Shamelessly ripped from keystorectl routines; a wrapper for invoking the AKS user client. 
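It matches kAppleKeyStoreServiceName, opens a user client connection via kAppleKeyStoreUserClientOpen, and then forwards the requested selector along with its scalar and structure arguments through IOConnectCallMethod.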
*/ +int apple_key_store(uint32_t command, + uint64_t * inputs, + uint32_t input_count, + void * input_structs, + size_t input_struct_count, + uint64_t * outputs, + uint32_t * output_count) +{ + int result = -1; + io_connect_t connection = IO_OBJECT_NULL; + io_registry_entry_t apple_key_bag_service = IO_OBJECT_NULL; + kern_return_t k_result = KERN_FAILURE; + IOReturn io_result = IO_OBJECT_NULL; + + apple_key_bag_service = IOServiceGetMatchingService(kIOMasterPortDefault, IOServiceMatching(kAppleKeyStoreServiceName)); + + if (apple_key_bag_service == IO_OBJECT_NULL) + { + printf("FAILURE: failed to match kAppleKeyStoreServiceName.\n"); + goto end; + } + + k_result = IOServiceOpen(apple_key_bag_service, mach_task_self(), 0, &connection); + + if (k_result != KERN_SUCCESS) + { + printf("FAILURE: failed to open AppleKeyStore.\n"); + goto end; + } + + k_result = IOConnectCallMethod(connection, kAppleKeyStoreUserClientOpen, NULL, 0, NULL, 0, NULL, NULL, NULL, NULL); + + if (k_result != KERN_SUCCESS) + { + printf("FAILURE: call to AppleKeyStore method kAppleKeyStoreUserClientOpen failed.\n"); + goto close; + } + + io_result = IOConnectCallMethod(connection, command, inputs, input_count, input_structs, input_struct_count, outputs, output_count, NULL, NULL); + + if (io_result != kIOReturnSuccess) + { + printf("FAILURE: call to AppleKeyStore method %d failed.\n", command); + goto close; + } + + result = 0; + +close: + IOServiceClose(apple_key_bag_service); + +end: + return(result); +} + +#ifndef KEYBAG_ENTITLEMENTS +/* Just a wrapper around forking to exec keystorectl for commands requiring entitlements. */ +int keystorectl(char * const command[]) +{ + int child_result = -1; + int result = -1; + pid_t child = -1; + + child = fork(); + + if (child == -1) + { + printf("FAILURE: failed to fork.\n"); + goto end; + } + else if (child == 0) + { + /* TODO: This keeps keystorectl from bombarding us with key state changes, but + there must be a better way of doing this; killing stderr is a bit nasty, + and if keystorectl fails, we want all the information we can get. */ + fclose(stderr); + fclose(stdin); + execv(keystorectl_path, command); + printf("FAILURE: child failed to execv keystorectl, errno = %s.\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + if ((waitpid(child, &child_result, 0) != child) || WEXITSTATUS(child_result)) + { + printf("FAILURE: keystorectl failed.\n"); + result = -1; + } + else + { + result = 0; + } + +end: + return(result); +} +#endif /* KEYBAG_ENTITLEMENTS */ + +/* Code based on Mobile Key Bag; specifically MKBDeviceSupportsContentProtection + and MKBDeviceFormattedForContentProtection. */ +/* We want to verify that we support content protection, and that + we are formatted for it. */ +int supports_content_prot() +{ + int local_result = -1; + int result = -1; + uint32_t buffer_size = 1; + char buffer[buffer_size]; + io_registry_entry_t defaults = IO_OBJECT_NULL; + kern_return_t k_result = KERN_FAILURE; + struct statfs statfs_results; + + defaults = IORegistryEntryFromPath(kIOMasterPortDefault, kIODeviceTreePlane ":/defaults"); + + if (defaults == IO_OBJECT_NULL) + { + printf("FAILURE: failed to find defaults registry entry.\n"); + goto end; + } + + k_result = IORegistryEntryGetProperty(defaults, "content-protect", buffer, &buffer_size); + + if (k_result != KERN_SUCCESS) + { /* This isn't a failure; it means the entry doesn't exist, so we assume CP + is unsupported. */ + result = 0; + goto end; + } + + /* At this point, we SUPPORT content protection... but are we formatted for it? 
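The statfs() check below answers that: MNT_CPROTECT set in f_flags means the volume was formatted with content protection enabled.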
*/ + /* This is ugly; we should be testing the file system we'll be testing in, not + just /tmp/. */ + local_result = statfs(g_target_path, &statfs_results); + + if (local_result == -1) + { + printf("FAILURE: failed to statfs the test directory, errno = %s.\n", + strerror(errno)); + } + else if (statfs_results.f_flags & MNT_CPROTECT) + { + result = 1; + } + else + { /* This isn't a failure, it means the filesystem isn't formatted for CP. */ + result = 0; + } + +end: + return(result); +} + +#if 0 +int device_lock_state() +{ + /* TODO: Actually implement this. */ + /* We fail if a passcode already exists, and the methods being used to lock/unlock + the device in this test appear to be synchronous... do we need this function? */ + int result = -1; + + return(result); +} +#endif + +int lock_device() +{ + int result = -1; + +#ifdef KEYBAG_ENTITLEMENTS + /* If we're entitled, we can lock the device ourselves. */ + uint64_t inputs[] = {device_keybag_handle}; + uint32_t input_count = (sizeof(inputs) / sizeof(*inputs)); + result = apple_key_store(kAppleKeyStoreKeyBagLock, inputs, input_count, NULL, 0, NULL, NULL); +#else + /* If we aren't entitled, we'll need to use keystorectl to lock the device. */ + /* keystorectl seems to have a bus error (though it locks successfully) unless + lock is passed an argument, so we'll also pass it the empty string. */ + char * const keystorectl_args[] = {keystorectl_path, "lock", "", NULL}; + result = keystorectl(keystorectl_args); +#endif /* KEYBAG_ENTITLEMENTS */ + + return(result); +} + +int unlock_device(char * passcode) +{ + int result = -1; + +#ifdef KEYBAG_ENTITLEMENTS + /* If we're entitled, we can unlock the device ourselves. */ + uint64_t inputs[] = {device_keybag_handle}; + uint32_t input_count = (sizeof(inputs) / sizeof(*inputs)); + size_t input_struct_count = 0; + + if ((passcode == NULL) || ((input_struct_count = strnlen(passcode, CPT_MAX_PASS_LEN)) == CPT_MAX_PASS_LEN)) + { + passcode = ""; + input_struct_count = 0; + } + + result = apple_key_store(kAppleKeyStoreKeyBagUnlock, inputs, input_count, passcode, input_struct_count, NULL, NULL); +#else + /* If we aren't entitled, we'll need to use keystorectl to unlock the device. */ + if ((passcode == NULL) || (strnlen(passcode, CPT_MAX_PASS_LEN) == CPT_MAX_PASS_LEN)) + { + passcode = ""; + } + + char * const keystorectl_args[] = {keystorectl_path, "unlock", passcode, NULL}; + result = keystorectl(keystorectl_args); +#endif /* KEYBAG_ENTITLEMENTS */ + + return(result); +} + +int set_passcode(char * new_passcode, char * old_passcode) +{ + int result = -1; + +#ifdef KEYBAG_ENTITLEMENTS + /* If we're entitled, we can set the passcode ourselves. 
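The input structure below is a packed buffer: a 32-bit operation count (2), then each passcode encoded as a 32-bit length followed by its bytes, padded up to 4-byte alignment.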
*/ + uint64_t inputs[] = {device_keybag_handle}; + uint32_t input_count = (sizeof(inputs) / sizeof(*inputs)); + void * input_structs = NULL; + size_t input_struct_count = 0; + char buffer[CPT_AKS_BUF_SIZE]; + char * buffer_ptr = buffer; + uint32_t old_passcode_len = 0; + uint32_t new_passcode_len = 0; + + if ((old_passcode == NULL) || ((old_passcode_len = strnlen(old_passcode, CPT_MAX_PASS_LEN)) == CPT_MAX_PASS_LEN)) + { + old_passcode = ""; + old_passcode_len = 0; + } + + if ((new_passcode == NULL) || ((new_passcode_len = strnlen(new_passcode, CPT_MAX_PASS_LEN)) == CPT_MAX_PASS_LEN)) + { + new_passcode = ""; + new_passcode_len = 0; + } + + *((uint32_t *) buffer_ptr) = ((uint32_t) 2); + buffer_ptr += sizeof(uint32_t); + *((uint32_t *) buffer_ptr) = old_passcode_len; + buffer_ptr += sizeof(uint32_t); + memcpy(buffer_ptr, old_passcode, old_passcode_len); + buffer_ptr += ((old_passcode_len + sizeof(uint32_t) - 1) & ~(sizeof(uint32_t) - 1)); + *((uint32_t *) buffer_ptr) = new_passcode_len; + buffer_ptr += sizeof(uint32_t); + memcpy(buffer_ptr, new_passcode, new_passcode_len); + buffer_ptr += ((new_passcode_len + sizeof(uint32_t) - 1) & ~(sizeof(uint32_t) - 1)); + input_structs = buffer; + input_struct_count = (buffer_ptr - buffer); + + result = apple_key_store(kAppleKeyStoreKeyBagSetPasscode, inputs, input_count, input_structs, input_struct_count, NULL, NULL); +#else + /* If we aren't entitled, we'll need to use keystorectl to set the passcode. */ + if ((old_passcode == NULL) || (strnlen(old_passcode, CPT_MAX_PASS_LEN) == CPT_MAX_PASS_LEN)) + { + old_passcode = ""; + } + + if ((new_passcode == NULL) || (strnlen(new_passcode, CPT_MAX_PASS_LEN) == CPT_MAX_PASS_LEN)) + { + new_passcode = ""; + } + + char * const keystorectl_args[] = {keystorectl_path, "change-password", old_passcode, new_passcode, NULL}; + result = keystorectl(keystorectl_args); +#endif /* KEYBAG_ENTITLEMENTS */ + + return(result); +} + +int clear_passcode(char * passcode) +{ + /* For the moment, this will set the passcode to the empty string (a known value); + this will most likely need to change, or running this test may ruin everything(tm). */ + int result = -1; + + result = set_passcode(NULL, passcode); + + return(result); +} + +#if 0 +/* Determines if we will try to test class C semantics. */ +int unlocked_since_boot() +{ + /* TODO: Actually implement this. */ + /* The actual semantics for CP mean that even with this primitive, we would need + to set a passcode and then reboot the device in order to test this; this function + will probably be rather worthless as a result. */ + int result = 1; + + return(result); +} +#endif + +/* If the device has a passcode when we want to test it, things are going to go wrong. + As such, we'll assume the device never has a passcode. + No, not even then. + Or we could just try "" to ""; it works. */ +int has_passcode() +{ + int result = -1; + + result = set_passcode(NULL, NULL); + + return(result); +}
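For orientation, here is a minimal sketch of how the passcode helpers above compose; this driver function is hypothetical and not part of the patch, and it assumes only the helpers defined in this file:

    /* Hypothetical driver: exercise the passcode helpers end to end. */
    static int passcode_round_trip(void)
    {
        char * pass = "1234";           /* example passcode only */

        if (set_passcode(pass, NULL))   /* device starts with no passcode */
            return -1;
        if (lock_device())              /* class A/B files become inaccessible */
            return -1;
        if (unlock_device(pass))        /* restore access */
            return -1;
        return clear_passcode(pass);    /* leave the device passcode-free */
    }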
 + +int content_protection_test(void * argp) +{ + #pragma unused (argp) + int init_result = 0; + int local_result = -1; + int test_result = -1; + int fd = -1; + int dir_fd = -1; + int subdir_fd = -1; + int new_prot_class = -1; + int old_prot_class = -1; + int current_byte = 0; + char filepath[PATH_MAX]; + char dirpath[PATH_MAX]; + char subdirpath[PATH_MAX]; + char rd_buffer[CPT_IO_SIZE]; + char wr_buffer[CPT_IO_SIZE]; + char * passcode = "IAmASecurePassword"; + + /* Do some initial setup (names). */ + bzero(filepath, PATH_MAX); + bzero(dirpath, PATH_MAX); + bzero(subdirpath, PATH_MAX); + + /* This is just easier than checking each result individually. */ + init_result |= (strlcat(filepath, g_target_path, PATH_MAX) == PATH_MAX); + init_result |= (strlcat(filepath, "/", PATH_MAX) == PATH_MAX); + init_result |= (strlcpy(dirpath, filepath, PATH_MAX) == PATH_MAX); + init_result |= (strlcat(filepath, "cpt_test_file", PATH_MAX) == PATH_MAX); + init_result |= (strlcat(dirpath, "cpt_test_dir/", PATH_MAX) == PATH_MAX); + init_result |= (strlcpy(subdirpath, dirpath, PATH_MAX) == PATH_MAX); + init_result |= (strlcat(subdirpath, "cpt_test_subdir/", PATH_MAX) == PATH_MAX); + + if (init_result) + { /* If any of the initialization failed, we're just going to fail now. */ + printf("%s, line %d: failed to initialize test strings.\n", + cpt_fail_header, __LINE__); + goto end; + } + + local_result = supports_content_prot(); + + if (local_result == -1) + { + printf("%s, line %d: failed to determine if content protection is supported.\n", + cpt_fail_header, __LINE__); + goto end; + } + else if (local_result == 0) + { /* If we don't support content protection at the moment, pass the test. */ + printf("This device does not support or is not formatted for content protection.\n"); + test_result = 0; + goto end; + } + + /* If we support content protection, we'll need to be able to set the passcode. */ + local_result = has_passcode(); + + if (local_result == -1) + { + printf("%s, line %d: the device appears to have a passcode.\n", + cpt_fail_header, __LINE__); + goto end; + } + + if (set_passcode(passcode, NULL)) + { + printf("%s, line %d: failed to set a new passcode.\n", + cpt_fail_header, __LINE__); + goto end; + } + + fd = open(filepath, O_CREAT | O_EXCL | O_RDWR | O_CLOEXEC, 0666); + + if (fd == -1) + { + printf("%s, line %d: failed to create the test file, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto remove_passcode; + } + + /* Ensure we can freely read and change protection classes when unlocked. */ + for (new_prot_class = PROTECTION_CLASS_A; new_prot_class <= PROTECTION_CLASS_F; new_prot_class++) + { + old_prot_class = GET_PROT_CLASS(fd); + + if (old_prot_class == -1) + { + printf("%s, line %d: failed to get protection class when unlocked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_file; + } + + if (SET_PROT_CLASS(fd, new_prot_class)) + { + printf("%s, line %d: failed to change protection class from %d to %d when unlocked, errno = %s.\n", + cpt_fail_header, __LINE__, old_prot_class, new_prot_class, strerror(errno)); + goto cleanup_file; + } + } + + if (SET_PROT_CLASS(fd, PROTECTION_CLASS_D)) + { + printf("%s, line %d: failed to change protection class from F to D when unlocked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_file; + } + + /* Try making a class A file while locked. */ + if (lock_device()) + { + PRINT_LOCK_FAIL; + goto cleanup_file; + } + + if (!SET_PROT_CLASS(fd, PROTECTION_CLASS_A)) + { + printf("%s, line %d: was able to change protection class from D to A when locked.\n", + cpt_fail_header, __LINE__); + goto cleanup_file; + } + + if (unlock_device(passcode)) + { + PRINT_UNLOCK_FAIL; + goto cleanup_file; + } + + /* Attempt opening/IO to a class A file while unlocked. 
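While unlocked, a class A file should behave like any other file, so the open and the I/O loops below must all succeed.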
*/ + if (SET_PROT_CLASS(fd, PROTECTION_CLASS_A)) + { + printf("%s, line %d: failed to change protection class from D to A when unlocked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_file; + } + + close(fd); + fd = open(filepath, O_RDWR | O_CLOEXEC); + + if (fd == -1) + { + printf("%s, line %d: failed to open a class A file when unlocked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto remove_file; + } + + /* TODO: Write specific data we can check for. + If we're going to do that, the write scheme should be deliberately ugly. */ + current_byte = 0; + + while (current_byte < CPT_IO_SIZE) + { + local_result = pwrite(fd, &wr_buffer[current_byte], CPT_IO_SIZE - current_byte, current_byte); + + if (local_result == -1) + { + printf("%s, line %d: failed to write to class A file when unlocked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_file; + } + + current_byte += local_result; + } + + current_byte = 0; + + while (current_byte < CPT_IO_SIZE) + { + local_result = pread(fd, &rd_buffer[current_byte], CPT_IO_SIZE - current_byte, current_byte); + + if (local_result == -1) + { + printf("%s, line %d: failed to read from class A file when unlocked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_file; + } + + current_byte += local_result; + } + + /* Again, but now while locked; and try to change the file class as well. */ + if (lock_device()) + { + PRINT_LOCK_FAIL; + goto cleanup_file; + } + + if (pread(fd, rd_buffer, CPT_IO_SIZE, 0) > 0) + { + printf("%s, line %d: was able to read from a class A file when locked.\n", + cpt_fail_header, __LINE__); + goto cleanup_file; + } + + if (pwrite(fd, wr_buffer, CPT_IO_SIZE, 0) > 0) + { + printf("%s, line %d: was able to write to a class A file when locked.\n", + cpt_fail_header, __LINE__); + goto cleanup_file; + } + + if (!SET_PROT_CLASS(fd, PROTECTION_CLASS_D)) + { + printf("%s, line %d: was able to change protection class from A to D when locked.\n", + cpt_fail_header, __LINE__); + goto cleanup_file; + } + + /* Try to open and truncate the file. */ + close(fd); + fd = open(filepath, O_RDWR | O_TRUNC | O_CLOEXEC); + + if (fd != -1) + { + printf("%s, line %d: was able to open and truncate a class A file when locked.\n", + cpt_fail_header, __LINE__); + goto cleanup_file; + } + + /* Try to open the file */ + fd = open(filepath, O_RDWR | O_CLOEXEC); + + if (fd != -1) + { + printf("%s, line %d: was able to open a class A file when locked.\n", + cpt_fail_header, __LINE__); + goto cleanup_file; + } + + /* What about class B files? */ + if (unlock_device(passcode)) + { + PRINT_UNLOCK_FAIL; + goto cleanup_file; + } + + fd = open(filepath, O_RDWR | O_CLOEXEC); + + if (fd == -1) + { + printf("%s, line %d: was unable to open a class A file when unlocked.\n", + cpt_fail_header, __LINE__); + goto cleanup_file; + } + + if (SET_PROT_CLASS(fd, PROTECTION_CLASS_D)) + { + printf("%s, line %d: failed to change protection class from A to D when unlocked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_file; + } + + if (lock_device()) + { + PRINT_LOCK_FAIL; + goto cleanup_file; + } + + /* Can we create a class B file while locked? */ + if (SET_PROT_CLASS(fd, PROTECTION_CLASS_B)) + { + printf("%s, line %d: failed to change protection class from D to B when locked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_file; + } + + /* We should also be able to read/write to the file descriptor while it is open. 
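Class B only gates new opens while the device is locked; I/O through a descriptor that is already open is expected to keep working.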
*/ + current_byte = 0; + + while (current_byte < CPT_IO_SIZE) + { + local_result = pwrite(fd, &wr_buffer[current_byte], CPT_IO_SIZE - current_byte, current_byte); + + if (local_result == -1) + { + printf("%s, line %d: failed to write to new class B file when locked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_file; + } + + current_byte += local_result; + } + + current_byte = 0; + + while (current_byte < CPT_IO_SIZE) + { + local_result = pread(fd, &rd_buffer[current_byte], CPT_IO_SIZE - current_byte, current_byte); + + if (local_result == -1) + { + printf("%s, line %d: failed to read from new class B file when locked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_file; + } + + current_byte += local_result; + } + + /* We should not be able to open a class B file under lock. */ + close(fd); + fd = open(filepath, O_RDWR | O_CLOEXEC); + + if (fd != -1) + { + printf("%s, line %d: was able to open a class B file when locked.\n", + cpt_fail_header, __LINE__); + goto cleanup_file; + } + + unlink(filepath); + + /* We still need to test directory semantics. */ + if (mkdir(dirpath, 0777) == -1) + { + printf("%s, line %d: failed to create a new directory when locked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto remove_passcode; + } + + /* The newly created directory should not have a protection class. */ + dir_fd = open(dirpath, O_RDONLY | O_CLOEXEC); + + if (dir_fd == -1) + { + printf("%s, line %d: failed to open an unclassed directory when locked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto remove_dir; + } + + if (GET_PROT_CLASS(dir_fd) != PROTECTION_CLASS_D) + { + printf("%s, line %d: newly created directory had a non-D protection class.\n", + cpt_fail_header, __LINE__); + goto cleanup_dir; + } + + if (SET_PROT_CLASS(dir_fd, PROTECTION_CLASS_A)) + { + printf("%s, line %d: was unable to change a directory from class D to class A during lock.\n", + cpt_fail_header, __LINE__); + goto cleanup_dir; + } + + if (SET_PROT_CLASS(dir_fd, PROTECTION_CLASS_D)) + { + printf("%s, line %d: failed to change a directory from class A to class D during lock, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_dir; + }
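The GET_PROT_CLASS and SET_PROT_CLASS macros used throughout are thin wrappers over fcntl(2). A minimal standalone sketch (hypothetical helper; it assumes only the F_GETPROTECTIONCLASS and F_SETPROTECTIONCLASS commands already used above):

    #include <fcntl.h>

    /* Swap a file's protection class; returns the previous class,
     * or -1 with errno set on failure. */
    static int swap_protection_class(int fd, int new_class)
    {
        int old_class = fcntl(fd, F_GETPROTECTIONCLASS);

        if (old_class == -1)
            return -1;
        if (fcntl(fd, F_SETPROTECTIONCLASS, new_class) == -1)
            return -1;
        return old_class;
    }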
/* Do all files created in the directory properly inherit the directory's protection class? */ + if ((strlcpy(filepath, dirpath, PATH_MAX) == PATH_MAX) || (strlcat(filepath, "cpt_test_file", PATH_MAX) == PATH_MAX)) + { + printf("%s, line %d: failed to construct the path for a file in the directory.\n", + cpt_fail_header, __LINE__); + goto cleanup_dir; + } + + if (unlock_device(passcode)) + { + PRINT_UNLOCK_FAIL; + goto cleanup_dir; + } + + for (new_prot_class = PROTECTION_CLASS_A; new_prot_class <= PROTECTION_CLASS_E; new_prot_class++) + { + old_prot_class = GET_PROT_CLASS(dir_fd); + + if (old_prot_class == -1) + { + printf("%s, line %d: failed to get the protection class for the directory, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_dir; + } + + if (SET_PROT_CLASS(dir_fd, new_prot_class)) + { + printf("%s, line %d: failed to change the protection class for the directory from %d to %d, errno = %s.\n", + cpt_fail_header, __LINE__, old_prot_class, new_prot_class, strerror(errno)); + goto cleanup_dir; + } + + fd = open(filepath, O_CREAT | O_EXCL | O_CLOEXEC, 0666); + + if (fd == -1) + { + printf("%s, line %d: failed to create a file in a class %d directory when unlocked, errno = %s.\n", + cpt_fail_header, __LINE__, new_prot_class, strerror(errno)); + goto cleanup_dir; + } + + local_result = GET_PROT_CLASS(fd); + + if (local_result == -1) + { + printf("%s, line %d: failed to get the new file's protection class, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_file; + } + else if (local_result != new_prot_class) + { + printf("%s, line %d: new file did not inherit the directory's protection class.\n", + cpt_fail_header, __LINE__); + goto cleanup_file; + } + + close(fd); + unlink(filepath); + } + + /* Do we disallow creation of a class F directory? */ + if (!SET_PROT_CLASS(dir_fd, PROTECTION_CLASS_F)) + { + printf("%s, line %d: creation of a class F directory did not fail as expected.\n", + cpt_fail_header, __LINE__); + goto cleanup_dir; + }
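As a reader's summary of the lock-state semantics the remaining checks rely on (inferred from the assertions in this test, not a normative statement):

    /*
     * class A: no open, read or write while locked; full access when unlocked.
     * class B: new files can be created and already-open descriptors keep
     *          working while locked, but a fresh open() fails.
     * class D: no protection; accessible regardless of lock state.
     */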
/* And are class A and class B semantics followed when we create these files during lock? */ + if (SET_PROT_CLASS(dir_fd, PROTECTION_CLASS_A)) + { + printf("%s, line %d: failed to change directory class from F to A when unlocked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_dir; + } + + if (lock_device()) + { + PRINT_LOCK_FAIL; + goto cleanup_dir; + } + + fd = open(filepath, O_CREAT | O_EXCL | O_CLOEXEC, 0666); + + if (fd != -1) + { + printf("%s, line %d: was able to create a new file in a class A directory when locked.\n", + cpt_fail_header, __LINE__); + goto cleanup_file; + } + + if (unlock_device(passcode)) + { + PRINT_UNLOCK_FAIL; + goto cleanup_dir; + } + + if (SET_PROT_CLASS(dir_fd, PROTECTION_CLASS_B)) + { + printf("%s, line %d: failed to change directory class from A to B when unlocked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_dir; + } + + if (lock_device()) + { + PRINT_LOCK_FAIL; + goto cleanup_dir; + } + + fd = open(filepath, O_CREAT | O_EXCL | O_RDWR | O_CLOEXEC, 0666); + + if (fd == -1) + { + printf("%s, line %d: failed to create new file in class B directory when locked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_dir; + } + + local_result = GET_PROT_CLASS(fd); + + if (local_result == -1) + { + printf("%s, line %d: failed to get protection class for a new file when locked, errno = %s.\n", + cpt_fail_header, __LINE__, strerror(errno)); + goto cleanup_file; + } + else if (local_result != PROTECTION_CLASS_B) + { + printf("%s, line %d: new file in class B directory did not inherit protection class.\n", + cpt_fail_header, __LINE__); + goto cleanup_file; + } + + /* What happens when we try to create new subdirectories? */ + if (unlock_device(passcode)) + { + PRINT_UNLOCK_FAIL; + goto cleanup_file; + } + + for (new_prot_class = PROTECTION_CLASS_A; new_prot_class <= PROTECTION_CLASS_E; new_prot_class++) + { + if (SET_PROT_CLASS(dir_fd, new_prot_class)) + { + printf("%s, line %d: failed to change directory to class %d, errno = %s.\n", + cpt_fail_header, __LINE__, new_prot_class, strerror(errno)); + goto cleanup_file; + } + + local_result = mkdir(subdirpath, 0777); + + if (local_result == -1) + { + printf("%s, line %d: failed to create subdirectory in class %d directory, errno = %s.\n", + cpt_fail_header, __LINE__, new_prot_class, strerror(errno)); + goto cleanup_file; + } + + subdir_fd = open(subdirpath, O_RDONLY | O_CLOEXEC); + + if (subdir_fd == -1) + { + printf("%s, line %d: failed to open subdirectory in class %d directory, errno = %s.\n", + cpt_fail_header, __LINE__, new_prot_class, strerror(errno)); + goto remove_subdir; + } + + local_result = GET_PROT_CLASS(subdir_fd); + + if (local_result == -1) + { + printf("%s, line %d: failed to get class of new subdirectory of class %d directory, errno = %s.\n", + cpt_fail_header, __LINE__, new_prot_class, strerror(errno)); + goto cleanup_subdir; + } + else if (local_result != new_prot_class) + { + printf("%s, line %d: new subdirectory had different class than class %d parent.\n", + cpt_fail_header, __LINE__, new_prot_class); + goto cleanup_subdir; + } + + close(subdir_fd); + rmdir(subdirpath); + } + + /* If we've made it this far, the test was successful. 
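Control then falls through the cleanup labels below, which unwind in the reverse order the resources were acquired.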
*/ + test_result = 0; + +cleanup_subdir: + close(subdir_fd); + +remove_subdir: + rmdir(subdirpath); + +cleanup_file: + close(fd); + +remove_file: + unlink(filepath); + +cleanup_dir: + close(dir_fd); + +remove_dir: + rmdir(dirpath); + +remove_passcode: + /* Try to unlock the device (no ramifications if it isn't locked when we try) and remove the passcode. */ + if (unlock_device(passcode)) + { + printf("WARNING: failed to unlock the device.\n"); + } + + if (clear_passcode(passcode)) + { + printf("WARNING: failed to clear the passcode.\n"); + } + +end: + return(test_result); +} + diff --git a/tools/tests/xnu_quick_test/helpers/data_exec.c b/tools/tests/xnu_quick_test/helpers/data_exec.c index 8cd7c0316..e6c1229ab 100644 --- a/tools/tests/xnu_quick_test/helpers/data_exec.c +++ b/tools/tests/xnu_quick_test/helpers/data_exec.c @@ -63,7 +63,7 @@ int expected[4] = { }; -main(int argc, char *argv[]) +int main(int argc, char *argv[]) { int (*func)(); int result, test; diff --git a/tools/tests/xnu_quick_test/helpers/launch.c b/tools/tests/xnu_quick_test/helpers/launch.c index 206116042..c9b135789 100644 --- a/tools/tests/xnu_quick_test/helpers/launch.c +++ b/tools/tests/xnu_quick_test/helpers/launch.c @@ -18,6 +18,7 @@ */ #include +#include #include #include diff --git a/tools/tests/xnu_quick_test/main.c b/tools/tests/xnu_quick_test/main.c index d1ca1574d..5c526f246 100644 --- a/tools/tests/xnu_quick_test/main.c +++ b/tools/tests/xnu_quick_test/main.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -106,8 +107,8 @@ struct test_entry g_tests[] = {1, &directory_tests, NULL, "getattrlist, getdirentriesattr, setattrlist"}, #if !TARGET_OS_EMBEDDED {1, &getdirentries_test, NULL, "getdirentries"}, -#endif {1, &exchangedata_test, NULL, "exchangedata"}, +#endif {1, &searchfs_test, NULL, "searchfs"}, {1, &sema2_tests, NULL, "sem_close, sem_open, sem_post, sem_trywait, sem_unlink, sem_wait"}, {1, &sema_tests, NULL, "semctl, semget, semop"}, @@ -124,6 +125,11 @@ struct test_entry g_tests[] = {1, &atomic_fifo_queue_test, NULL, "OSAtomicFifoEnqueue, OSAtomicFifoDequeue"}, #endif {1, &sched_tests, NULL, "Scheduler tests"}, +#if TARGET_OS_EMBEDDED + {1, &content_protection_test, NULL, "Content protection tests"}, +#endif + {1, &pipes_test, NULL, "Pipes tests"}, + {1, &kaslr_test, NULL, "KASLR tests"}, {0, NULL, NULL, "last one"} }; @@ -132,7 +138,9 @@ static void list_all_tests( void ); static void mark_tests_to_run( long my_start, long my_end ); static int parse_tests_to_run( int argc, const char * argv[], int * indexp ); static void usage( void ); +#if !TARGET_OS_EMBEDDED static int setgroups_if_single_user(void); +#endif static const char *current_arch( void ); /* globals */ @@ -269,23 +277,23 @@ g_testbots_active = 1; #endif /* Code added to run xnu_quick_test under testbots */ if ( g_testbots_active == 1 ) { - printf("[TEST] xnu_quick_test \n"); /* Declare the beginning of test suite */ + printf("[TEST] xnu_quick_test \n"); /* Declare the beginning of test suite */ } - + +#if !TARGET_OS_EMBEDDED /* Populate groups list if we're in single user mode */ if (setgroups_if_single_user()) { return 1; } - +#endif if ( list_the_tests != 0 ) { list_all_tests( ); return 0; } #if !TARGET_OS_EMBEDDED if (g_xilog_active == 1) { - logRef = XILogOpenLogExtended( logPath, "xnu_quick_test", "com.apple.coreos", - config, xml, echo, NULL, "ResultOwner", - "com.apple.coreos", NULL ); + logRef = XILogOpenLogExtended( logPath, "xnu_quick_test", "com.apple.coreos", config, xml, + echo, NULL, 
"ResultOwner", "com.apple.coreos", NULL ); if( logRef == NULL ) { fprintf(stderr,"Couldn't create log: %s",logPath); exit(-1); @@ -304,9 +312,6 @@ g_testbots_active = 1; printf( "Current architecture is %s\n", current_arch() ); /* Code added to run xnu_quick_test under testbots */ - if ( g_testbots_active == 1 ) { - printf("[PASS] xnu_quick_test started\n"); - } /* run each test that is marked to run in our table until we complete all of them or * hit the maximum number of failures. @@ -325,6 +330,11 @@ g_testbots_active = 1; XILogMsg( "test #%d - %s \n", (i + 1), my_testp->test_infop ); } #endif + + if ( g_testbots_active == 1 ) { + printf("[BEGIN] %s \n", my_testp->test_infop); + } + printf( "test #%d - %s \n", (i + 1), my_testp->test_infop ); fflush(stdout); my_err = my_testp->test_routine( my_testp->test_input ); @@ -347,7 +357,7 @@ g_testbots_active = 1; printf( "\n Reached the maximum number of failures - Aborting xnu_quick_test. \n" ); /* Code added to run xnu_quick_test under testbots */ if ( g_testbots_active == 1 ) { - printf("[FAIL] %s \n", my_testp->test_infop); + printf("[FAIL] %s \n", my_testp->test_infop); } goto exit_this_routine; } @@ -369,7 +379,7 @@ g_testbots_active = 1; #endif /* Code added to run xnu_quick_test under testbots */ if ( g_testbots_active == 1 ) { - printf("[PASS] %s \n", my_testp->test_infop); + printf("[PASS] %s \n", my_testp->test_infop); } } @@ -573,6 +583,7 @@ static void usage( void ) } /* usage */ +#if !TARGET_OS_EMBEDDED /* This is a private API between Libinfo, Libc, and the DirectoryService daemon. * Since we are trying to determine if an external provider will back group * lookups, we can use this, without relying on additional APIs or tools @@ -629,6 +640,7 @@ setgroups_if_single_user(void) return retval; } +#endif static const char *current_arch( void ) { diff --git a/tools/tests/xnu_quick_test/makefile b/tools/tests/xnu_quick_test/makefile index 554416475..9dbf1631b 100644 --- a/tools/tests/xnu_quick_test/makefile +++ b/tools/tests/xnu_quick_test/makefile @@ -2,17 +2,21 @@ SDKROOT ?= / Product=$(shell tconf --product) Embedded=$(shell tconf --test TARGET_OS_EMBEDDED) +SDKVERSION=$(shell xcodebuild -sdk $(SDKROOT) -version SDKVersion | head -1) + ifeq "$(Embedded)" "YES" XILogFLAG = SDKPATH = $(shell xcodebuild -sdk $(SDKROOT) -version Path) -CFLAGS += -isysroot $(SDKPATH) -LIBFLAGS += -isysroot $(SDKPATH) +CFLAGS += -isysroot $(SDKPATH) -miphoneos-version-min=$(SDKVERSION) +LIBFLAGS += -isysroot $(SDKPATH) -miphoneos-version-min=$(SDKVERSION) else XILogFLAG = -framework XILog +CFLAGS += -mmacosx-version-min=$(SDKVERSION) +LIBFLAGS += -mmacosx-version-min=$(SDKVERSION) endif -HOSTCC = gcc -CC = xcrun -sdk $(SDKROOT) gcc +HOSTCC = cc +CC = xcrun -sdk $(SDKROOT) cc ifdef RC_BUILDIT DOING_BUILDIT=yes @@ -42,7 +46,7 @@ else # this hack should be removed once tconf gets # ifeq "$(Product)" "iPhone" - ARCH=armv6 + ARCH=armv7 endif ifeq "$(Product)" "AppleTV" ARCH=i386 @@ -57,18 +61,26 @@ else endif -CFLAGS += -g -I $(SDKPATH)/System/Library/Frameworks/System.framework/Versions/B/PrivateHeaders/ -F/AppleInternal/Library/Frameworks/ $(MORECFLAGS) +CFLAGS += -g -I $(SDKPATH)/System/Library/Frameworks/System.framework/Versions/B/PrivateHeaders/ -F/AppleInternal/Library/Frameworks/ $(MORECFLAGS) -Wno-deprecated-declarations LIBFLAGS += -I $(SDKPATH)/System/Library/Frameworks/System.framework/Versions/B/PrivateHeaders -F/AppleInternal/Library/Frameworks/ $(XILogFLAG) +# The current implementation of the content protection test requires IOKit. 
+ifeq "$(Product)" "iPhone" +LIBFLAGS += -framework IOKit +endif + MY_OBJECTS = $(OBJROOT)/main.o $(OBJROOT)/memory_tests.o $(OBJROOT)/misc.o \ $(OBJROOT)/sema_tests.o $(OBJROOT)/shared_memory_tests.o \ $(OBJROOT)/socket_tests.o $(OBJROOT)/tests.o \ $(OBJROOT)/xattr_tests.o $(OBJROOT)/kqueue_tests.o \ $(OBJROOT)/machvm_tests.o $(OBJROOT)/commpage_tests.o \ - $(OBJROOT)/atomic_fifo_queue_test.o $(OBJROOT)/sched_tests.o + $(OBJROOT)/atomic_fifo_queue_test.o $(OBJROOT)/sched_tests.o \ + $(OBJROOT)/pipes_tests.o ifneq "$(Product)" "iPhone" MY_OBJECTS += $(OBJROOT)/32bit_inode_tests.o +else +MY_OBJECTS += $(OBJROOT)/content_protection_test.o endif # In networked home directories, the chown will fail; we notice and print a helpful message @@ -112,9 +124,9 @@ ifeq "$(Product)" "MacOSX" endif ifeq "$(Product)" "iPhone" - $(CC) -arch armv6 -isysroot $(SDKROOT) $(CFLAGS) helpers/sleep.c -o $(DSTROOT)/helpers/sleep-arm - $(CC) $(LIBFLAGS) -arch armv6 -isysroot $(SDKROOT) $(OBJROOT)/misc.o helpers/launch.c -o $(DSTROOT)/helpers/launch-arm - $(CC) $(MY_ARCH) -isysroot $(SDKROOT) helpers/arch.c -o $(DSTROOT)/helpers/arch + $(CC) $(CFLAGS) helpers/sleep.c -o $(DSTROOT)/helpers/sleep-arm + $(CC) $(LIBFLAGS) $(CFLAGS) $(OBJROOT)/misc.o helpers/launch.c -o $(DSTROOT)/helpers/launch-arm + $(CC) $(MY_ARCH) $(CFLAGS) helpers/arch.c -o $(DSTROOT)/helpers/arch endif @@ -136,7 +148,7 @@ $(OBJROOT)/memory_tests.o : memory_tests.c tests.h # misc.o has to be built 3-way for the helpers to link $(OBJROOT)/misc.o : misc.c tests.h ifeq "$(Product)" "iPhone" - $(CC) -arch armv6 $(CFLAGS) -c misc.c -o $@ + $(CC) -arch armv7 $(CFLAGS) -c misc.c -o $@ else $(CC) -arch i386 -arch x86_64 $(CFLAGS) -c misc.c -o $@ endif @@ -174,6 +186,11 @@ $(OBJROOT)/commpage_tests.o : commpage_tests.c tests.h $(OBJROOT)/atomic_fifo_queue_test.o : atomic_fifo_queue_test.c tests.h $(CC) $(CFLAGS) -c atomic_fifo_queue_test.c -o $@ +$(OBJROOT)/content_protection_test.o : content_protection_test.c tests.h + $(CC) $(CFLAGS) -c content_protection_test.c -o $@ + +$(OBJROOT)/pipes_tests.o : pipes_tests.c tests.h + $(CC) $(CFLAGS) -c pipes_tests.c -o $@ ifndef DOING_BUILDIT .PHONY : clean diff --git a/tools/tests/xnu_quick_test/memory_tests.c b/tools/tests/xnu_quick_test/memory_tests.c index dc8675087..03e31a456 100644 --- a/tools/tests/xnu_quick_test/memory_tests.c +++ b/tools/tests/xnu_quick_test/memory_tests.c @@ -49,16 +49,14 @@ crashcount(char *namebuf1, char *namebuf2) char *crash_file_pfx = "xnu_quick_test"; int crash_file_pfxlen = strlen(crash_file_pfx); struct stat sb; - DIR *dirp1, *dirp2; + DIR *dirp1 = NULL, *dirp2 = NULL; struct dirent *dep1, *dep2; int count = 0; - /* If we can't open the directory, it hasn't been created */ - if ((dirp1 = opendir(crashdir1)) == NULL) { - return( 0 ); - } + /* If we can't open the directory, dirp1 will be NULL */ + dirp1 = opendir(crashdir1); - while((dep1 = readdir(dirp1)) != NULL) { + while(dirp1 != NULL && ((dep1 = readdir(dirp1)) != NULL)) { if (strncmp(crash_file_pfx, dep1->d_name, crash_file_pfxlen)) continue; /* record each one to get the last one */ @@ -70,14 +68,14 @@ crashcount(char *namebuf1, char *namebuf2) count++; } - closedir(dirp1); + if (dirp1 != NULL) + closedir(dirp1); - /* If we can't open the directory, it hasn't been created */ - if ((dirp2 = opendir(crashdir2)) == NULL) { - return( 0 ); - } +#if !TARGET_OS_EMBEDDED + /* If we can't open the directory, dirp2 will be NULL */ + dirp2 = opendir(crashdir2); - while((dep2 = readdir(dirp2)) != NULL) { + while(dirp2 != NULL && (dep2 = 
readdir(dirp2)) != NULL) { if (strncmp(crash_file_pfx, dep2->d_name, crash_file_pfxlen)) continue; /* record each one to get the last one */ @@ -88,10 +86,10 @@ crashcount(char *namebuf1, char *namebuf2) } count++; } - - closedir(dirp2); - - return( count/2 ); + if (dirp2 != NULL) + closedir(dirp2); +#endif + return( count ); } @@ -155,6 +153,8 @@ int memory_tests( void * the_argp ) * Find out how many crashes there have already been; if it's not * zero, then don't even attempt this test. */ + my_namebuf1[0] = '\0'; + my_namebuf2[0] = '\0'; if ((my_crashcount = crashcount(my_namebuf1, my_namebuf2)) != 0) { printf( "memtest aborted: can not distinguish our expected crash from \n"); printf( "%d existing crashes including %s \n", my_crashcount, my_namebuf2); @@ -406,23 +406,27 @@ exit_child: * Find out how many crashes there have already been; if it's not * one, then don't even attempt this test. */ - if ((my_crashcount = crashcount(my_namebuf1, my_namebuf2)) != 1) { + my_namebuf1[0] = '\0'; + my_namebuf2[0] = '\0'; + my_crashcount = crashcount(my_namebuf1, my_namebuf2); + if (!(my_crashcount == 1 || my_crashcount == 2)) { printf( "child did not crash as expected \n"); - printf( "saw %d crashes including %s \n", my_crashcount, my_namebuf2); + printf( "saw %d crashes including %s \n", my_crashcount, my_namebuf1); goto test_failed_exit; } /* post-remove the expected crash report */ - if (unlink(my_namebuf1)) { + if (unlink(my_namebuf1) && !(errno == ENOENT || errno == ENOTDIR)) { printf("unlink of expected crash report '%s' failed \n", my_namebuf1); goto test_failed_exit; } - - if (unlink(my_namebuf2)) { +#if !TARGET_OS_EMBEDDED + /* /Library/Logs/DiagnosticReports/ does not exist on embedded targets. */ + if (unlink(my_namebuf2) && !(errno == ENOENT || errno == ENOTDIR)) { printf("unlink of expected crash report '%s' failed \n", my_namebuf2); goto test_failed_exit; } - +#endif /* make sure shared page got modified in child */ if ( strcmp( my_test_page_p, "parent data child data" ) != 0 ) { printf( "minherit did not work correctly - shared page looks wrong \n" ); diff --git a/tools/tests/xnu_quick_test/misc.c b/tools/tests/xnu_quick_test/misc.c index 9545bf140..5e3706211 100644 --- a/tools/tests/xnu_quick_test/misc.c +++ b/tools/tests/xnu_quick_test/misc.c @@ -139,6 +139,7 @@ int create_file_with_name( char *the_target_dirp, char *the_namep, int remove_ex printf( "open failed with error %d - \"%s\" \n", errno, strerror( errno) ); goto failure_exit; } + fcntl( my_fd, F_FULLFSYNC ); close( my_fd ); } goto routine_exit; @@ -319,9 +320,9 @@ int get_architecture() char *errmsg = NULL; errmsg = "sysctlbyname() failed when getting hw.cputype"; - if (my_err = sysctlbyname("hw.cputype", NULL, &length, NULL, 0)) goto finished; /* get length of data */ + if ((my_err = sysctlbyname("hw.cputype", NULL, &length, NULL, 0))) goto finished; /* get length of data */ if (length != sizeof(buf)) goto finished; - if (my_err = sysctlbyname("hw.cputype", &buf, &length, NULL, 0)) goto finished; /* copy data */ + if ((my_err = sysctlbyname("hw.cputype", &buf, &length, NULL, 0))) goto finished; /* copy data */ switch (buf) { case CPU_TYPE_X86: case CPU_TYPE_X86_64: diff --git a/tools/tests/xnu_quick_test/pipes_tests.c b/tools/tests/xnu_quick_test/pipes_tests.c new file mode 100644 index 000000000..c87f94d18 --- /dev/null +++ b/tools/tests/xnu_quick_test/pipes_tests.c @@ -0,0 +1,880 @@ +/* Pipes buffer unit tests + * + * The main goal of this code is to facilitate the construction, + * running, result logging and clean up of a test suite, taking care + * of all the scaffolding. A test suite is a sequence of very targeted + * unit tests, each running as a separate process to isolate its + * address space. + * A unit test is abstracted as a unit_test_t structure, consisting of + * a test function and a logging identifier. A test suite is a suite_t + * structure, consisting of a unit_test_t array, a logging identifier, + * and fixture set up and tear down functions. + * Test suites are created dynamically. Each of its unit tests runs in + * its own fork()d process, with the fixture set up and tear down + * running before and after each test. The parent process will log a + * pass result if the child exits normally, and a fail result in any + * other case (non-zero exit status, abnormal signal). The suite + * results are then aggregated and logged, and finally the test suite + * is destroyed. + * Everything is logged to stdout in the standard Testbot format, which + * can be easily converted to Munin or SimonSays logging + * format. Logging is factored out as much as possible for future + * flexibility. In our particular case, a unit test is logged as a + * Testbot Test Case ([BEGIN]/[PASS]/[FAIL]), and a test suite is + * logged as a Testbot Test ([TEST]). This is confusing but + * unfortunately cannot be avoided for compatibility. Suite results + * are aggregated after the [SUMMARY] keyword. + * The included test suites cover the various pipe buffer operations + * with dynamic expansion. + * + * Vishal Patel (vishal_patel@apple.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/**************************/ +/**************************/ +/* Unit Testing Framework */ +/**************************/ +/**************************/ + +/*********************/ +/* Private interface */ +/*********************/ + +static const char frameworkname[] = "pipes_unitester"; + +/* Type for test, fixture set up and fixture tear down functions. */ +typedef void (*test_fn_t)(); + +/* Unit test structure. */ +typedef struct { + const char *name; + test_fn_t test; +} unit_test_t; + +/* Test suite structure. */ +typedef struct { + const char *name; + int numoftests; + test_fn_t set_up; + unit_test_t *tests; + test_fn_t tear_down; +} suite_t; + +int _quietness = 0; +unsigned int _timeout = 0; +int _expected_signal = 0; + +struct { + uintmax_t numoftests; + uintmax_t passed_tests; +} results = { 0, 0 }; + +void logr(char *format, ...) __printflike(1, 2); + +static void die(int condition, const char *culprit) +{ + if (condition) { + printf("%s: %s error: %s.\n", frameworkname, culprit, + strerror(errno)); + exit(1); + } +} + +static void die_on_stdout_error() +{ + die(ferror(stdout), "stdout"); +} + +/* Individual test result logging. */ +void logr(char *format, ...) 
+{ + if (_quietness <= 1) { + va_list ap; + + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + die_on_stdout_error(); + } +} + +static suite_t *create_suite(const char *name, int numoftests, + test_fn_t set_up, unit_test_t *tests, + test_fn_t tear_down) +{ + suite_t *suite = (suite_t *)malloc(sizeof(suite_t)); + die(suite == NULL, "malloc()"); + + suite->name = name; + suite->numoftests = numoftests; + suite->set_up = set_up; + suite->tests = tests; + suite->tear_down = tear_down; + return suite; +} + +static void destroy_suite(suite_t *suite) +{ + free(suite); +} + +static void log_suite_info(suite_t *suite) +{ + logr("[TEST] %s\n", suite->name); + logr("Number of tests: %d\n\n", suite->numoftests); +} + +static void log_suite_results(suite_t *suite, int passed_tests) +{ + results.numoftests += (uintmax_t)suite->numoftests; + results.passed_tests += (uintmax_t)passed_tests; +} + +static void log_test_info(unit_test_t *unit_test) +{ + logr("[BEGIN] %s\n", unit_test->name); +} + +static void log_test_result(unit_test_t *unit_test, + boolean_t test_passed) +{ + logr("[%s] %s\n\n", test_passed ? "PASS" : "FAIL", + unit_test->name); +} + +/* Handler for test time out. */ +static void alarm_handler(int signo) +{ + write(1,"Child process timed out.\n", + strlen("Child process timed out.\n")); + _Exit(6); +} + +/* Run a test with fixture set up and teardown, while enforcing the + * time out constraint. */ +static void run_test(suite_t *suite, unit_test_t *unit_test) +{ + struct sigaction alarm_act; + + log_test_info(unit_test); + alarm_act.sa_handler = alarm_handler; + sigemptyset(&alarm_act.sa_mask); + alarm_act.sa_flags = 0; + die(sigaction(SIGALRM, &alarm_act, NULL) != 0, "sigaction()"); + alarm(_timeout); + + suite->set_up(); + unit_test->test(); + suite->tear_down(); +} + +/* Check a child return status. */ +static boolean_t child_terminated_normally(int child_status) +{ + boolean_t normal_exit = FALSE; + + if (WIFEXITED(child_status)) { + int exit_status = WEXITSTATUS(child_status); + if (exit_status) { + printf("Child process unexpectedly exited with code " + "%d.\n", exit_status); + } else if (!_expected_signal) { + normal_exit = TRUE; + } + } else if (WIFSIGNALED(child_status)) { + int signal = WTERMSIG(child_status); + if (signal == _expected_signal) { + if (_quietness <= 0) { + printf("Child process died with expected signal " + "%d.\n", signal); + } + normal_exit = TRUE; + } else { + printf("Child process unexpectedly died with signal " + "%d.\n", signal); + } + } else { + printf("Child process unexpectedly did not exit nor " + "die.\n"); + } + die_on_stdout_error(); + return normal_exit; +} + +/* Run a test in its own process, and report the result. */ +static boolean_t child_test_passed(suite_t *suite, + unit_test_t *unit_test) +{ + int test_status; + + pid_t test_pid = fork(); + die(test_pid == -1, "fork()"); + if (!test_pid) { + run_test(suite, unit_test); + exit(0); + } + while (waitpid(test_pid, &test_status, 0) != test_pid) { + continue; + } + boolean_t test_result = child_terminated_normally(test_status); + log_test_result(unit_test, test_result); + return test_result; +} + +/* Run each test in a suite, and report the results. 
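The return value is the number of passing tests, which the caller folds into the aggregated suite results.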
+/********************/
+/* Public interface */
+/********************/
+
+#define DEFAULT_TIMEOUT 5U
+#define DEFAULT_QUIETNESS 1
+
+#define assert(condition, exit_status, ...)			\
+	do {							\
+		if (!(condition)) {				\
+			_fatal(__FILE__, __LINE__, __func__,	\
+			       (exit_status), __VA_ARGS__);	\
+		}						\
+	} while (0)
+
+/* Include in tests whose expected outcome is a specific signal. */
+#define expect_signal(signal)					\
+	struct sigaction _act;					\
+	_act.sa_handler = expected_signal_handler;		\
+	sigemptyset(&_act.sa_mask);				\
+	_act.sa_flags = 0;					\
+	assert(sigaction((signal), &_act, NULL) == 0, 1,	\
+	       "sigaction() error: %s.", strerror(errno));
+
+#define run_suite(set_up, tests, tear_down, ...)		\
+	_run_suite((sizeof(tests)/sizeof(tests[0])),		\
+		   (set_up), (tests), (tear_down), __VA_ARGS__)
+
+typedef unit_test_t UnitTests[];
+
+void _fatal(const char *file, int line, const char *function,
+	    int exit_status, const char *format, ...)
+	__printflike(5, 6);
+void _run_suite(int numoftests, test_fn_t set_up, UnitTests tests,
+		test_fn_t tear_down, const char *format, ...)
+	__printflike(5, 6);
+void logv(char *format, ...) __printflike(1, 2);
+
+void _fatal(const char *file, int line, const char *function,
+	    int exit_status, const char *format, ...)
+{
+	va_list ap;
+
+	va_start(ap, format);
+	vprintf(format, ap);
+	printf("\n");
+	printf("Assert failed in file %s, function %s(), line %d.\n",
+	       file, function, line);
+	va_end(ap);
+	exit(exit_status);
+}
+
+void _run_suite(int numoftests, test_fn_t set_up, UnitTests tests,
+		test_fn_t tear_down, const char *format, ...)
+{
+	va_list ap;
+	char *name;
+
+	va_start(ap, format);
+	die(vasprintf(&name, format, ap) == -1, "vasprintf()");
+	va_end(ap);
+	suite_t *suite = create_suite(name, numoftests, set_up, tests,
+				      tear_down);
+	log_suite_info(suite);
+	log_suite_results(suite, count_passed_suite_tests(suite));
+	free(name);
+	destroy_suite(suite);
+}
+
+/* Signal handler for tests expected to terminate with a specific
+ * signal. */
+void expected_signal_handler(int signo)
+{
+	write(1, "Child process received expected signal.\n",
+	      strlen("Child process received expected signal.\n"));
+	_Exit(0);
+}
+
+/* Setters and getters for various test framework global
+ * variables. Should only be used outside of the test, set-up and
+ * tear-down functions. */
+
+/* Time out constraint for running a single test. */
+void set_timeout(unsigned int time)
+{
+	_timeout = time;
+}
+
+unsigned int get_timeout()
+{
+	return _timeout;
+}
+
+/* Expected signal for a test, default is 0. */
+void set_expected_signal(int signal)
+{
+	_expected_signal = signal;
+}
+
+int get_expected_signal()
+{
+	return _expected_signal;
+}
+
+/* Logging verbosity. */
+void set_quietness(int value)
+{
+	_quietness = value;
+}
+
+int get_quietness()
+{
+	return _quietness;
+}
+
+/* For fixture set-up and tear-down functions, and unit tests. */
+void do_nothing() {
+}
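A test whose pass condition is the delivery of a signal can combine these pieces in two ways: with _expected_signal left at 0, the test installs the handler itself and exits 0 when the signal arrives; alternatively, the driver calls set_expected_signal() before run_suite(), in which case the child must actually die from that signal for the parent to log a pass. A sketch of the first style (hypothetical test, not in the original suites):

	/* Passes because the raised signal is caught by
	 * expected_signal_handler(), which calls _Exit(0). */
	void test_sigusr1_delivery()
	{
		expect_signal(SIGUSR1);
		assert(raise(SIGUSR1) == 0, 1, "raise() error: %s.",
		       strerror(errno));
	}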
+/* Verbose (default) logging. */
+void logv(char *format, ...)
+{
+	if (get_quietness() <= 0) {
+		va_list ap;
+
+		va_start(ap, format);
+		vprintf(format, ap);
+		va_end(ap);
+		die_on_stdout_error();
+	}
+}
+
+void log_aggregated_results()
+{
+	printf("[SUMMARY] Aggregated Test Results\n");
+	printf("Total: %ju\n", results.numoftests);
+	printf("Passed: %ju\n", results.passed_tests);
+	printf("Failed: %ju\n\n", results.numoftests
+	       - results.passed_tests);
+	die_on_stdout_error();
+}
+
+/*******************************/
+/*******************************/
+/* pipes buffer unit testing */
+/*******************************/
+/*******************************/
+
+static const char progname[] = "pipes_unitester";
+
+static void die_on_error(int condition, const char *culprit)
+{
+	assert(!condition, 1, "%s: %s error: %s.", progname, culprit,
+	       strerror(errno));
+}
+
+
+/*******************************/
+/* Usage and option processing */
+/*******************************/
+
+static void usage(int exit_status)
+{
+	printf("Usage: %s [-t timeout_in_seconds] [-q] [-v] [-h]\n",
+	       progname);
+	exit(exit_status);
+}
+
+static void die_on_invalid_value(int condition,
+				 const char *value_string)
+{
+	if (condition) {
+		printf("%s: invalid value: %s.\n", progname, value_string);
+		usage(1);
+	}
+}
+
+/* Convert a storage unit suffix into a power-of-two exponent,
+ * e.g. "K" -> 10 (2^10 bytes). */
+static int strtoexp(const char *string)
+{
+	if (string[0] == '\0') {
+		return 0;
+	}
+
+	char first_letter = toupper(string[0]);
+	char prefixes[] = "BKMGTPE";
+	const int numofprefixes = strlen(prefixes);
+	/* Overwrite the terminating NUL with the searched-for letter,
+	 * so the scan below always terminates (sentinel search). */
+	prefixes[numofprefixes] = first_letter;
+	int i = 0;
+
+	while (prefixes[i] != first_letter) {
+		i++;
+	}
+	die_on_invalid_value(i >= numofprefixes || (string[1] != '\0' &&
+			     (toupper(string[1]) != 'B' ||
+			      string[2] != '\0')), string);
+	return 10 * i;
+}
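strtoexp() is not called by the option processing below, but the intended mapping is suffix letter to exponent. A sketch of how a caller might turn a suffixed quantity into a byte count (hypothetical helper, not in the original file):

	/* "B" -> 0, "K" -> 10, "M" -> 20, ... so shifting by the
	 * exponent converts a suffixed quantity to bytes. */
	unsigned long long parse_size(unsigned long long value, const char *suffix)
	{
		return value << strtoexp(suffix);	/* 4, "K" -> 4096 */
	}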
+static void process_options(int argc, char *argv[])
+{
+	int opt;
+	char *endptr;
+
+	setvbuf(stdout, NULL, _IONBF, 0);
+
+	set_timeout(DEFAULT_TIMEOUT);
+	set_quietness(DEFAULT_QUIETNESS);
+
+	while ((opt = getopt(argc, argv, "t:vqh")) != -1) {
+		switch (opt) {
+		case 't':
+			errno = 0;
+			set_timeout(strtoul(optarg, &endptr, 0));
+			die_on_invalid_value(errno == ERANGE || *endptr != '\0'
+					     || endptr == optarg, optarg);
+			break;
+		case 'q':
+			set_quietness(get_quietness() + 1);
+			break;
+		case 'v':
+			set_quietness(0);
+			break;
+		case 'h':
+			usage(0);
+			break;
+		default:
+			usage(1);
+			break;
+		}
+	}
+}
+
+/*********************************/
+/* Various function declarations */
+/*********************************/
+
+void initialize_data(int *ptr, int len);
+
+int verify_data(int *base, int *target, int len);
+
+void clear_data(int *ptr, int len);
+
+/*******************************/
+/* Arrays for test suite loops */
+/*******************************/
+
+#define BUFMAX 20000
+#define BUFMAXLEN (BUFMAX * sizeof(int))
+
+const unsigned int pipesize_blocks[] = { 128, 256, 1024, 2048,
+					 PAGE_SIZE, PAGE_SIZE*2, PAGE_SIZE*4 };
+static const int bufsizes[] = { 128, 512, 1024, 2048, 4096, 16384 };
+
+int data[BUFMAX], readbuf[BUFMAX];
+int pipefd[2] = { 0, 0 };
+
+typedef int * pipe_t;
+
+struct thread_work_data {
+	pipe_t p;
+	unsigned int total_bytes;
+	unsigned int chunk_size;
+};
+
+void * reader_thread(void *ptr);
+void * writer_thread(void *ptr);
+
+dispatch_semaphore_t r_sem, w_sem;
+
+unsigned long current_buf_size = 0;
+
+/*************************************/
+/* Global variables set up functions */
+/*************************************/
+
+
+void initialize_data(int *ptr, int len)
+{
+	int i;
+
+	if (!ptr || len <= 0)
+		return;
+
+	for (i = 0; i < len; i++)
+		ptr[i] = i;
+}
+
+void clear_data(int *ptr, int len)
+{
+	int i;
+
+	if (!ptr)
+		return;
+	for (i = 0; i < len; i++)
+		ptr[i] = 0;
+}
+
+int verify_data(int *base, int *target, int len)
+{
+	int i = 0;
+
+	if (!base || !target)
+		return 0;
+
+	for (i = 0; i < len; i++) {
+		if (base[i] != target[i])
+			return 0;
+	}
+
+	return 1;
+}
+
+void initialize_data_buffer()
+{
+	initialize_data(data, BUFMAX);
+	initialize_data(readbuf, BUFMAX);
+}
+
+/***********************************/
+/* Core read/write helper functions */
+/***********************************/
+
+ssize_t read_whole_buffer(pipe_t p, void *scratch_buf, int size);
+ssize_t pipe_read_data(pipe_t p, void *dest_buf, int size);
+ssize_t pipe_write_data(pipe_t p, void *src_buf, int size);
+
+/* Note: pread(2) is not supported on pipes and fails with ESPIPE;
+ * this helper is not used by the suites below. */
+ssize_t read_whole_buffer(pipe_t p, void *scratch_buf, int size)
+{
+	int fd = p[0];
+	logv("reading whole buffer from fd %d, size %d", fd, size);
+	ssize_t retval = pread(fd, scratch_buf, size, 0);
+	if (retval == -1) {
+		logv("Error reading whole buffer. (%d) %s\n", errno,
+		     strerror(errno));
+	}
+	return retval;
+}
+
+ssize_t pipe_read_data(pipe_t p, void *dest_buf, int size)
+{
+	int fd = p[0];
+	//logv("reading from pipe %d, for size %d", fd, size);
+	ssize_t retval = read(fd, dest_buf, size);
+	if (retval == -1) {
+		logv("Error reading from buffer. (%d)", errno);
+	}
+	return retval;
+}
+
+ssize_t pipe_write_data(pipe_t p, void *src_buf, int size)
+{
+	int fd = p[1];
+	//logv("writing to pipe %d, for size %d", fd, size);
+	ssize_t retval = write(fd, src_buf, size);
+	if (retval == -1) {
+		logv("Error writing to buffer. (%d) %s", errno,
+		     strerror(errno));
+	}
+	return retval;
+}
+
+
+void * reader_thread(void *ptr)
+{
+	struct thread_work_data *m;
+	m = (struct thread_work_data *)ptr;
+	int i = m->total_bytes / m->chunk_size;
+	ssize_t retval;
+	int data_idx = 0;
+
+	while (i > 0) {
+		/* The deadline argument is a raw dispatch_time_t that
+		 * lies in the past, so this wait returns almost
+		 * immediately; the blocking pipe read below provides
+		 * the actual synchronization. */
+		dispatch_semaphore_wait(r_sem, 8000);
+		retval = pipe_read_data(m->p, &readbuf[data_idx], m->chunk_size);
+		assert(retval == (ssize_t)m->chunk_size, 1,
+		       "Pipe read returned a different number of bytes");
+		/* data_idx advances by chunk_size array slots; the
+		 * writer uses the same indexing, so both sides stay
+		 * in step. */
+		data_idx += m->chunk_size;
+		//logv("RD %d \n", m->chunk_size);
+		dispatch_semaphore_signal(w_sem);
+		i--;
+	}
+	return 0;
+}
+
+void * writer_thread(void *ptr)
+{
+	struct thread_work_data *m;
+	m = (struct thread_work_data *)ptr;
+	int i = m->total_bytes / m->chunk_size;
+	ssize_t retval;
+	int data_idx = 0;
+
+	while (i > 0) {
+		dispatch_semaphore_wait(w_sem, 8000);
+		//logv("WR %d \n", m->chunk_size);
+		retval = pipe_write_data(m->p, &data[data_idx], m->chunk_size);
+		assert(retval == (ssize_t)m->chunk_size, 1, "Pipe write failed");
+		data_idx += m->chunk_size;
+		dispatch_semaphore_signal(r_sem);
+		i--;
+	}
+	return 0;
+}
+
+
+void create_threads(struct thread_work_data *rdata, struct thread_work_data *wdata)
+{
+	pthread_t thread1, thread2;
+	r_sem = dispatch_semaphore_create(0);
+	w_sem = dispatch_semaphore_create(1);
+	int iret1, iret2;
+	void * thread_ret1 = 0;
+	void * thread_ret2 = 0;
+
+	/* Create independent threads, each of which executes its function. */
+	iret1 = pthread_create(&thread1, NULL, reader_thread, (void *)rdata);
+	iret2 = pthread_create(&thread2, NULL, writer_thread, (void *)wdata);
+	assert(iret1 == 0, 1, "Reader thread creation failed");
+	assert(iret2 == 0, 1, "Writer thread creation failed");
+
+	pthread_join(thread2, &thread_ret2);
+	pthread_join(thread1, &thread_ret1);
+	assert(thread_ret1 == 0, 1, "Reader Thread Failed");
+	assert(thread_ret2 == 0, 1, "Writer Thread Failed");
+}
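If the 8000 passed to dispatch_semaphore_wait() above was meant as "wait up to 8000 ns", the idiomatic libdispatch spelling computes a deadline relative to now. A sketch of that variant (not a change to the original code, whose tests pass either way because the pipe I/O blocks):

	/* Hypothetical variant of the wait in reader_thread()/writer_thread():
	 * wait at most 8 microseconds for the semaphore. */
	long timed_out = dispatch_semaphore_wait(w_sem,
	    dispatch_time(DISPATCH_TIME_NOW, 8000 /* ns */));
	(void)timed_out;	/* pacing only; the pipe still provides the real blocking */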
readbuf\n"); + initialize_data(data, BUFMAX); + initialize_data(readbuf, BUFMAX); + logv("verifying data for correctness\n"); + die_on_error(!verify_data(data, readbuf, BUFMAX), "data initialization"); + clear_data(readbuf, BUFMAX); +} + +void test_pipe_create(){ + int pipefds[2] = {0,0}; + pipe_t p = pipefds; + int err = pipe(p); + if ( err ){ + logv("error opening pipes (%d) %s", errno, strerror(errno)); + return; + } + + die_on_error(0 != close(pipefds[0]), "close()"); + die_on_error(0 != close(pipefds[1]), "close()"); +} + +void test_pipe_write_single_byte(){ + int pipefds[2] = { 0 , 0 }; + pipe_t p = pipefds; + die_on_error( 0 != pipe(p), "pipe()"); + initialize_data_buffer(); + int i = 0,retval; + for ( ; i < current_buf_size; i++){ + if ( i > 16384){ + logv("cannot fill continuously beyond 16K."); + break; + } + retval=pipe_write_data(p, &data[i], 1); + assert(retval == 1, 1, "Pipe write failed"); + } + + close(p[0]); + close(p[1]); +} + +void test_pipe_single_read_write(){ + int pipefds[2] = { 0 , 0 }; + pipe_t p = pipefds; + die_on_error( 0 != pipe(p), "pipe()"); + initialize_data_buffer(); + struct thread_work_data d = { p, current_buf_size, 1}; + create_threads(&d, &d); + verify_data(data, readbuf, current_buf_size); + close(p[0]); + close(p[1]); + +} + +void test_pipe_single_read_2write(){ + int pipefds[2] = { 0 , 0 }; + pipe_t p = pipefds; + die_on_error( 0 != pipe(p), "pipe()"); + initialize_data_buffer(); + struct thread_work_data rd = { p, current_buf_size, 1}; + struct thread_work_data wd = { p, current_buf_size, 2}; + create_threads(&rd, &wd); + verify_data(data, readbuf, current_buf_size); + close(p[0]); + close(p[1]); + +} + +void test_pipe_expansion_buffer(){ + int pipefds[2] = { 0 , 0 }; + int iter = 0; + pipe_t p = pipefds; + die_on_error( 0 != pipe(p), "pipe()"); + initialize_data_buffer(); + for ( iter=0; iter < sizeof(pipesize_blocks)/sizeof(unsigned int); iter++){ + assert(pipesize_blocks[iter] == pipe_write_data(p, &data[0], pipesize_blocks[iter] ), 1, "expansion write failed"); + assert(pipesize_blocks[iter] == pipe_read_data(p, &readbuf[0], pipesize_blocks[iter]+200), 1, "reading from expanded data failed"); + /* logv("finished round for size %u \n", pipesize_blocks[iter]); */ + } + verify_data(data, readbuf, current_buf_size); + close(p[0]); + close(p[1]); + +} + +void test_pipe_initial_big_allocation(){ + int pipefds[2] = { 0 , 0 }; + int iter = 0; + pipe_t p = pipefds; + die_on_error( 0 != pipe(p), "pipe()"); + initialize_data_buffer(); + assert(current_buf_size == pipe_write_data(p, &data[0], current_buf_size ), 1, "initial big allocation failed"); + assert(current_buf_size == pipe_read_data(p, &readbuf[0], current_buf_size+200), 1, "reading from initial big write failed"); + assert(verify_data(data, readbuf, current_buf_size), 1, "big pipe initial allocation -not able to verify data"); + close(p[0]); + close(p[1]); + +} + +void test_pipe_cycle_small_writes(){ + int pipefds[2] = { 0 , 0 }; + int iter = 0; + pipe_t p = pipefds; + die_on_error( 0 != pipe(p), "pipe()"); + initialize_data_buffer(); + int buf_size = current_buf_size / 2; + + assert(buf_size == pipe_write_data(p, &data[0], buf_size ), 1, "cycle write failed"); + assert(buf_size == pipe_read_data(p, &readbuf[0], buf_size+200), 1, "reading from cycle read failed"); + assert(verify_data(data, readbuf, buf_size), 1, "data verification failed"); + + assert(buf_size == pipe_write_data(p, &data[0], buf_size ), 1, "cycle write failed"); + assert(buf_size == pipe_read_data(p, &readbuf[0], buf_size+200), 
1, "reading from cycle read failed"); + assert(verify_data(data, readbuf, buf_size), 1, "data verification failed"); + + assert(buf_size == pipe_write_data(p, &data[0], buf_size ), 1, "cycle write failed"); + assert(buf_size == pipe_read_data(p, &readbuf[0], buf_size+200), 1, "reading from cycle read failed"); + assert(verify_data(data, readbuf, buf_size), 1, "data verification failed"); + + close(p[0]); + close(p[1]); + +} + +void test_pipe_moving_data(){ + int pipefds[2] = { 0 , 0 }; + int iter = 0; + pipe_t p = pipefds; + die_on_error( 0 != pipe(p), "pipe()"); + initialize_data_buffer(); + int buf_size = current_buf_size / 2; + if (buf_size > PAGE_SIZE) + buf_size = PAGE_SIZE; + + assert(buf_size == pipe_write_data(p, &data[0], buf_size ), 1, "cycle write failed"); + logv("write of size =%d\n", buf_size); + assert(buf_size == pipe_write_data(p, &data[buf_size/sizeof(int)], buf_size ), 1, "cycle write failed"); + logv("write of size =%d\n", buf_size*2); + assert(buf_size == pipe_write_data(p, &data[(buf_size*2)/sizeof(int)], buf_size ), 1, "cycle write failed"); + logv("write of size =%d\n", buf_size*3); + assert((3*buf_size) == pipe_read_data(p, &readbuf[0], (3*buf_size)+200), 1, "reading from cycle read failed"); + assert(verify_data(data, readbuf, (3*buf_size)/sizeof(int)), 1, "data verification failed"); + + close(p[0]); + close(p[1]); + +} + + +/*************/ +/* pipe Suites */ +/*************/ + +void run_pipe_basic_tests() +{ + int sizes_idx; + int numofsizes = sizeof(bufsizes)/sizeof(int); + + logv("running tests for %d different sizes \n", numofsizes); + + UnitTests pipe_basic_tests = { + { "1. create buffer and verify both reads/writes are valid", + test_pipebuffer_setup }, + { "2. open and close pipes", test_pipe_create }, + { "3. single byte write to full", test_pipe_write_single_byte}, + { "4. single byte read/write in sync", test_pipe_single_read_write}, + { "5. single byte read/2write in sync", test_pipe_single_read_2write}, + { "6. expansion from existing size", test_pipe_expansion_buffer}, + { "7. initial big allocation " , test_pipe_initial_big_allocation}, + { "8. cycle_small_writes " ,test_pipe_cycle_small_writes }, + { "9. test moving data " ,test_pipe_moving_data } + }; + for (sizes_idx = 0; sizes_idx < numofsizes; sizes_idx++) { + current_buf_size = bufsizes[sizes_idx]; + run_suite(do_nothing, + pipe_basic_tests, + do_nothing, "pipe create base test " + "Size: 0x%jx (%ju)", + (uintmax_t)bufsizes[sizes_idx], + (uintmax_t)bufsizes[sizes_idx]); + } +} + + +int pipes_test(void *the_argp) +{ + set_quietness(2); + run_pipe_basic_tests(); + //log_aggregated_results(); + return results.numoftests - results.passed_tests; +} + +/* + * retaining the old main function to debug issues with the tests and not the xnu_quick_test framework + * or the system + */ +int main_nonuse(int argc, char *argv[]) +{ + process_options(argc, argv); + + run_pipe_basic_tests(); + + log_aggregated_results(); + return 0; +} diff --git a/tools/tests/xnu_quick_test/socket_tests.c b/tools/tests/xnu_quick_test/socket_tests.c index 00433d5ba..e9a34380f 100644 --- a/tools/tests/xnu_quick_test/socket_tests.c +++ b/tools/tests/xnu_quick_test/socket_tests.c @@ -210,7 +210,7 @@ int socket_tests( void * the_argp ) } #endif -#if 1 +#if !TARGET_OS_EMBEDDED /* sendfile test. 
diff --git a/tools/tests/xnu_quick_test/socket_tests.c b/tools/tests/xnu_quick_test/socket_tests.c
index 00433d5ba..e9a34380f 100644
--- a/tools/tests/xnu_quick_test/socket_tests.c
+++ b/tools/tests/xnu_quick_test/socket_tests.c
@@ -210,7 +210,7 @@ int socket_tests( void * the_argp )
 	}
 #endif
 
-#if 1
+#if !TARGET_OS_EMBEDDED
 	/* sendfile test.  Open libsystem, set up some headers, and send it */
 	struct sf_hdtr	my_sf_hdtr;
 	int my_libsys_fd;
@@ -328,7 +328,7 @@ int socket_tests( void * the_argp )
 	}
 #endif
 
-#if 1
+#if !TARGET_OS_EMBEDDED
 	size_t neededBytes = 11;
 	/* Check for sendfile output */
diff --git a/tools/tests/xnu_quick_test/tests.c b/tools/tests/xnu_quick_test/tests.c
index 2d79c6be5..cf2867e8a 100644
--- a/tools/tests/xnu_quick_test/tests.c
+++ b/tools/tests/xnu_quick_test/tests.c
@@ -12,6 +12,7 @@
 #include <sys/msg.h>		/* for message queue tests */
 #include <sys/syscall.h>	/* for get / settid */
 #include <sys/sysctl.h>		/* for determining hw */
+#include <sys/kas_info.h>	/* for kas_info() */
 #include <AvailabilityMacros.h>	/* for determination of Mac OS X version (tiger, leopard, etc.) */
 #include <libkern/OSByteOrder.h>	/* for OSSwap32() */
 #include 
@@ -893,12 +894,19 @@ test_passed_exit:
  */
 int access_chmod_fchmod_test( void * the_argp )
 {
-	int		my_err;
-	int		my_fd = -1;
+	int		error_occurred;
+	int		my_err;
+	int		my_fd = -1;
+
 	char *	my_pathp = NULL;
-	uid_t	euid,ruid;
-	struct stat	my_sb;
-	kern_return_t	my_kr;
+
+	uid_t	euid,ruid;
+	struct stat	my_sb;
+
+	FILE *	file_handle;
+
+	kern_return_t	my_kr;
+
 	my_kr = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t*)&my_pathp, PATH_MAX, VM_FLAGS_ANYWHERE);
 	if(my_kr != KERN_SUCCESS){
\n"); + error_occurred = 1; } else { printf("Status: First attempt to delete '" FILE_NOTME "' failed with error %d - %s.\n", my_err, strerror( my_err )); - - if (true) { - my_err = access(FILE_ME, _DELETE_OK); - if (my_err < 0) { - my_err = errno; - } - //printf("Status: access('" FILE_ME "') = %d - %s.\n", my_err, strerror( my_err )); - fprintf(stderr, "Status: access('" FILE_ME "') = %d\n", my_err); - } + + /* Set _DELETE_OK on a file that the current user owns */ + access(FILE_ME, _DELETE_OK); + + /* Try to remove the file owned by root again (should give us: EPERM [13]) */ my_err = unlink(FILE_NOTME); - if (my_err < 0) { - my_err = errno; - } - if (my_err == 0) { + + if (my_err < 0) { + my_err = errno; + } + + if (my_err == 0) { printf("Failed: Second attempt deleted '" FILE_NOTME "'!\n"); - //fprintf(stderr, "Failed: Second attempt deleted '" FILE_NOTME "'!\n"); - goto test_failed_exit; - } else { + error_occurred = 1; + } else if (my_err == 13) { printf("Passed: Second attempt to delete '" FILE_NOTME "' failed with error %d - %s.\n", my_err, strerror( my_err )); - // fprintf(stderr, "Passed: Second attempt to delete '" FILE_NOTME "' failed with error %d\n", my_err); - - } + } else { + printf("Failed: Second attempt to delete '" FILE_NOTME "' failed with error %d - %s.\n", my_err, strerror( my_err )); + error_occurred = 1; + } } + + /* Reset to running as root */ setreuid(ruid, euid); - //printf("effective user id is %d: and real user id is %d ---1: \n", euid, ruid); + + if(error_occurred == 1) { + goto test_failed_exit; + } + +#endif + /* end of test*/ @@ -1052,6 +1088,26 @@ test_passed_exit: return( my_err ); } +#if !TARGET_OS_EMBEDDED +static bool _prime_groups(void) +{ + /* + * prime groups with a known list to ensure consistent test behavior + */ + + gid_t my_exp_groups[] = { getegid(), 20, 61, 12 }; + int my_err; + + my_err = setgroups( ( sizeof(my_exp_groups) / sizeof(*my_exp_groups) ), &my_exp_groups[0] ); + if ( my_err == -1 ) { + printf( "initial setgroups call failed. got errno %d - %s. \n", errno, strerror( errno ) ); + return false; + } + + return true; +} +#endif + /* ************************************************************************************************************** * Test chown, fchown, lchown, lstat, readlink, symlink system calls. * ************************************************************************************************************** @@ -1103,6 +1159,10 @@ int chown_fchown_lchown_lstat_symlink_test( void * the_argp ) goto test_failed_exit; } + if ( !_prime_groups() ) { + goto test_failed_exit; + } + /* set up by getting a list of groups */ my_group_count = getgroups( NGROUPS_MAX, &my_groups[0] ); @@ -2114,6 +2174,10 @@ int groups_test( void * the_argp ) my_real_gid = getgid( ); my_effective_gid = getegid( ); + if ( !_prime_groups() ) { + goto test_failed_exit; + } + /* start by getting list of groups the current user belongs to */ my_orig_group_count = getgroups( NGROUPS_MAX, &my_groups[0] ); @@ -3357,6 +3421,7 @@ int fcntl_test( void * the_argp ) close( my_newfd ); my_newfd = -1; +#if !TARGET_OS_EMBEDDED /* This section of the test is specific for the desktop platform, refer */ /* While we're here, dup it via an open of /dev/fd/ .. 
 /* **************************************************************************************************************
  *	Test chown, fchown, lchown, lstat, readlink, symlink system calls.
  * **************************************************************************************************************
  */
@@ -1103,6 +1159,10 @@ int chown_fchown_lchown_lstat_symlink_test( void * the_argp )
 		goto test_failed_exit;
 	}
 
+	if ( !_prime_groups() ) {
+		goto test_failed_exit;
+	}
+
 	/* set up by getting a list of groups */
 	my_group_count = getgroups( NGROUPS_MAX, &my_groups[0] );
 
@@ -2114,6 +2174,10 @@ int groups_test( void * the_argp )
 	my_real_gid = getgid( );
 	my_effective_gid = getegid( );
 
+	if ( !_prime_groups() ) {
+		goto test_failed_exit;
+	}
+
 	/* start by getting list of groups the current user belongs to */
 	my_orig_group_count = getgroups( NGROUPS_MAX, &my_groups[0] );
 
@@ -3357,6 +3421,7 @@ int fcntl_test( void * the_argp )
 	close( my_newfd );
 	my_newfd = -1;
 
+#if !TARGET_OS_EMBEDDED /* This section of the test is specific for the desktop platform, refer */
 	/* While we're here, dup it via an open of /dev/fd/<fd> .. */
 	{
@@ -3385,7 +3450,7 @@ int fcntl_test( void * the_argp )
 	}
 	close ( my_newfd );
 	my_newfd = -1;
-
+#endif
 	my_err = 0;
 	goto test_passed_exit;
@@ -4418,6 +4483,7 @@ typedef struct packed_result * packed_result_p;
 int searchfs_test( void * the_argp )
 {
+#if !TARGET_OS_EMBEDDED
 	int		my_err, my_items_found = 0, my_ebusy_count;
 	char *	my_pathp = NULL;
 	unsigned long	my_matches;
@@ -4612,6 +4678,10 @@ test_passed_exit:
 		vm_deallocate(mach_task_self(), (vm_address_t)my_pathp, PATH_MAX);
 	}
 	return( my_err );
+#else
+	printf( "\t--> Not supported on EMBEDDED TARGET\n" );
+	return 0;
+#endif
 }
@@ -4623,7 +4693,6 @@ test_passed_exit:
  */
 int aio_tests( void * the_argp )
 {
-#if !TARGET_OS_EMBEDDED
 	int		my_err, i;
 	char *	my_pathp;
 	struct aiocb *	my_aiocbp;
@@ -4888,10 +4957,6 @@ test_passed_exit:
 		}
 	}
 	return( my_err );
-#else
-	printf( "\t--> Not supported on EMBEDDED TARGET\n" );
-	return 0;
-#endif
 }
@@ -5072,6 +5137,80 @@ test_passed_exit:
 	return my_err;
 }
 
+/* **************************************************************************************************************
+ *	Test KASLR-related functionality
+ * **************************************************************************************************************
+ */
+int kaslr_test( void * the_argp )
+{
+	int result = 0;
+	uint64_t slide = 0;
+	size_t size;
+	int slide_enabled;
+
+	size = sizeof(slide_enabled);
+	result = sysctlbyname("kern.slide", &slide_enabled, &size, NULL, 0);
+	if (result != 0) {
+		printf("sysctlbyname(\"kern.slide\") failed with errno %d\n", errno);
+		goto test_failed_exit;
+	}
+
+	/* Test positive case first */
+	size = sizeof(slide);
+	result = kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, &slide, &size);
+	if (result == 0) {
+		/* syscall supported; the slide must be non-zero if running the latest xnu and KASLR is enabled */
+		if (slide_enabled && (slide == 0)) {
+			printf("kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, &slide, &size) reported slide of 0x%016llx\n", slide);
+			goto test_failed_exit;
+		}
+		if (size != sizeof(slide)) {
+			printf("kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, &slide, &size) reported size of %lu\n", size);
+			goto test_failed_exit;
+		}
+	} else {
+		/* Only ENOTSUP is allowed. If so, assume all calls will be unsupported */
+		if (errno == ENOTSUP) {
+			return 0;
+		} else {
+			printf("kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, &slide, &size) returned unexpected errno (errno %d)\n", errno);
+			goto test_failed_exit;
+		}
+	}
+
+	/* Negative cases for expected failures */
+	size = sizeof(slide);
+	result = kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, NULL /* EFAULT */, &size);
+	if ((result == 0) || (errno != EFAULT)) {
+		printf("kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, NULL, &size) returned unexpected success or errno (result %d errno %d)\n", result, errno);
+		goto test_failed_exit;
+	}
+
+	size = sizeof(slide) + 1; /* EINVAL */
+	result = kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, NULL, &size);
+	if ((result == 0) || (errno != EINVAL)) {
+		printf("kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, NULL, &size+1) returned unexpected success or errno (result %d errno %d)\n", result, errno);
+		goto test_failed_exit;
+	}
+
+	result = kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, NULL /* EFAULT */, NULL /* EFAULT */);
+	if ((result == 0) || (errno != EFAULT)) {
+		printf("kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, NULL, NULL) returned unexpected success or errno (result %d errno %d)\n", result, errno);
+		goto test_failed_exit;
+	}
+
+	size = sizeof(slide);
+	result = kas_info(KAS_INFO_MAX_SELECTOR /* EINVAL */, &slide, &size);
+	if ((result == 0) || (errno != EINVAL)) {
+		printf("kas_info(KAS_INFO_MAX_SELECTOR, &slide, &size) returned unexpected success or errno (result %d errno %d)\n", result, errno);
+		goto test_failed_exit;
+	}
+
+	return 0;
+
+test_failed_exit:
+	return -1;
+}
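From the positive path above, the user-space calling convention for the new kas_info() syscall is clear; a minimal client would look like this (a sketch based solely on the usage in kaslr_test(), with <sys/kas_info.h> assumed to provide the prototype and selectors):

	#include <stdio.h>
	#include <stdint.h>
	#include <sys/kas_info.h>

	int print_kernel_slide(void)
	{
		uint64_t slide = 0;
		size_t size = sizeof(slide);

		/* Returns 0 on success; ENOTSUP means the interface is
		 * compiled out (e.g. on embedded targets). */
		if (kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, &slide, &size) != 0)
			return -1;
		printf("kernel text slide: 0x%016llx\n", slide);
		return 0;
	}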
 
 #if TEST_SYSTEM_CALLS
 
diff --git a/tools/tests/xnu_quick_test/tests.h b/tools/tests/xnu_quick_test/tests.h
index 53b346804..6edbaa9b6 100644
--- a/tools/tests/xnu_quick_test/tests.h
+++ b/tools/tests/xnu_quick_test/tests.h
@@ -55,8 +55,9 @@
  * Random values used by execve tests to
  * determine architecture of machine.
  */
-#define FILE_NOTME "/tmp/notme"	/* file in /tm not owned by me */
-#define FILE_ME "/tmp/me"	/* file in /tmp owned by me */
+
+#define FILE_NOTME "/private/tmp/notme"	/* file in /private/tmp not owned by the current user */
+#define FILE_ME "/private/tmp/me"	/* file in /private/tmp owned by the current user */
 
 typedef int (*test_rtn_t)(void *);
 
@@ -72,6 +73,7 @@ int create_file_with_name( char *the_pathp, char *the_namep, int remove_existing
 int create_random_name( char *the_pathp, int do_open );
 int directory_tests( void * the_argp );
 int do_execve_test(char * path, char * argv[], void * envpi, int killwait);
+int do_spawn_test(int arch, int shouldfail);
 int dup_test( void * the_argp );
 int exchangedata_test( void * the_argp );
 int execve_kill_vfork_test( void * the_argp );
@@ -118,6 +120,9 @@ int statfs_32bit_inode_tests( void * the_argp );
 int commpage_data_tests( void * the_argp );
 int atomic_fifo_queue_test( void * the_argp );
 int sched_tests( void * the_argp );
+int content_protection_test( void * the_argp );
+int pipes_test( void * the_argp );
+int kaslr_test( void * the_argp );
 
 struct test_entry
-- 
2.45.2